    def construct_channel(self, *args, **kwargs):
        """
        Creates ChannelNode and build topic tree
        Args:
          - args: arguments passed in during upload_channel (currently None)
          - kwargs: extra argumens and options not handled by `uploadchannel`.
            For example, add the command line option   lang="fr"  and the string
            "fr" will be passed along to `construct_channel` as kwargs['lang'].
        Returns: ChannelNode
        """
        channel = self.get_channel(
            *args,
            **kwargs)  # Create ChannelNode from data in self.channel_info

        # Parse the index page to get the topics
        resp = downloader.make_request(
            "http://proyectodescartes.org/descartescms/")
        soup = BeautifulSoup(resp.content, "html.parser")
        topics = soup.find_all("a", "item")
        final_topics = self.parse_topics(topics, channel)

        for topic in final_topics:
            self.download_subject(topic[0], topic[1], topic[2])

        raise_for_invalid_channel(
            channel)  # Check for errors in channel construction

        return channel
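
    # Usage sketch (assumes this method lives on a ricecooker SushiChef
    # subclass; the chef class name below is hypothetical). Extra command
    # line options passed to the chef script arrive here via kwargs:
    #
    #   chef = DescartesChef()                        # hypothetical name
    #   channel = chef.construct_channel(lang="fr")   # kwargs == {"lang": "fr"}
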
    def download_subject(self, subject, link, parent):
        """
        Parse each subject page.
        """
        LOGGER.info("Processing subject: {}".format(subject.title))

        # No need to parse the content under the subject when the link is not valid
        if "javascript:void(0);" in link:
            parent.add_child(subject)
            return

        # Parse each subject's index page
        resp = downloader.make_request(link)
        soup = BeautifulSoup(resp.content, "html.parser")

        selected_category = soup.find("option", {
            "class": "level0",
            "selected": "selected"
        })
        if not selected_category:
            return

        parent.add_child(subject)

        for item in AGE_RANGE:
            params = OrderedDict([("category", selected_category["value"]),
                                  ("moduleId", "282"), ("format", "count")])
            for index, tag in enumerate(AGE_RANGE[item]):
                params["taga[{}]".format(index)] = tag

            # Query how many items exist for this age range under the subject
            resp = downloader.make_request("{}/itemlist/filter".format(link),
                                           params=params)
            count = int(resp.text.split('\n')[0])
            if count == 0:
                continue

            LOGGER.info("Processing topic: {}".format(item))
            age_topic = TopicNode(source_id=item, title=item)
            subject.add_child(age_topic)
            total_pages = ceil(count / 20)

            for i in range(total_pages):
                page_params = OrderedDict(params)
                LOGGER.info("Processing page: {}".format(i))
                self.download_content(age_topic, link, page_params,
                                      selected_category["value"], i * 20)
    def download_content(self, parent, link, params, selected_category, start):
        """
        Parse each content page.
        """
        params["start"] = start
        params.pop("format")

        # Parse each page of the result
        resp = downloader.make_request("{}/itemlist/filter".format(link),
                                       params=params)
        soup = BeautifulSoup(resp.content, "html.parser")

        # Find all the content items on the page
        tbody = soup.find("tbody")
        if not tbody:
            return
        for item in tbody.find_all("a"):
            content_url = "http://proyectodescartes.org{}".format(item["href"])
            title = item.text.strip()
            source_id = item["href"].split("/")[-1]

            # Parse each content's page
            response = downloader.make_request(content_url)
            page = BeautifulSoup(response.content, "html.parser")

            thumbnail_url = "http://proyectodescartes.org{}".format(
                page.find("div", class_="itemFullText").find("img")["src"])
            author = self.get_content_author(page)
            zip_path = self.get_content_zip(page)
            if not zip_path:
                LOGGER.info(
                    "No zip file URL found on this page: {}".format(
                        content_url))
                continue

            content_node = HTML5AppNode(
                source_id=source_id,
                title=title,
                license=CC_BY_NC_SALicense(
                    copyright_holder="Proyecto Descartes"),
                language=CHANNEL_LANGUAGE,
                files=[files.HTMLZipFile(zip_path)],
                author=author,
                thumbnail=thumbnail_url,
            )

            parent.add_child(content_node)
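
    # The content pages parsed above have roughly this shape (a sketch
    # reconstructed from the selectors used, not the authoritative markup):
    #
    #   <div class="itemFullText">
    #     <img src="/path/to/thumbnail.png"/>          <- thumbnail_url
    #     <a href="/path/to/unit/entry.html">...</a>   <- entry page name
    #     <a href="/path/to/unit.zip">...</a>          <- zip for get_content_zip
    #   </div>
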
def books_for_each_category(category):
    """
    Get all the books for every category
    Parameters:
    * category - The name of the category that is related to the books
    """
    LOGGER.info("\tCrawling books for {}......\n".format(category))

    # Get the json file of the page and parse it
    payload = {"page": 1, "per_page": 24, "categories[]": category}
    response = downloader.make_request(BOOK_SEARCH_URL,
                                       params=payload,
                                       clear_cookies=False)
    data = response.json()
    total_pages = data["metadata"]["totalPages"]
    LOGGER.info("\tThere are in total {} pages for {}......\n".format(
        total_pages, category))

    # List of books for the first page
    booklist = get_books_from_results(data["data"])

    # Get the books from the remaining pages
    for page in range(2, total_pages + 1):
        payload["page"] = page
        response = downloader.make_request(BOOK_SEARCH_URL,
                                           params=payload,
                                           clear_cookies=False)

        # Skip the page if there is an error (usually a 500 error)
        if response.status_code != 200:
            continue
        data = response.json()
        booklist += get_books_from_results(data["data"])

    LOGGER.info(
        "\tFinished getting all the books for {}\n\t================\n".format(
            category))
    return booklist
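

# A minimal pagination sketch, assuming the module-level BOOK_SEARCH_URL and
# downloader names used above; it is not called by the chef. It rewrites the
# page loop of books_for_each_category as a generator, so a caller could stop
# early instead of fetching every page up front.
def iter_book_pages(category):
    payload = {"page": 1, "per_page": 24, "categories[]": category}
    while True:
        response = downloader.make_request(BOOK_SEARCH_URL, params=payload,
                                           clear_cookies=False)
        if response.status_code != 200:
            break  # stop on errors (books_for_each_category skips the page instead)
        data = response.json()
        yield data["data"]  # one page of raw book records
        if payload["page"] >= data["metadata"]["totalPages"]:
            break
        payload["page"] += 1
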
def download_all():
    """
    Parse the json returned by StoryWeaver API and generate a dictionary that
    contains all the information regarding category, publisher, language, level and
    book.
    """
    resp = downloader.make_request(FILTERS_URL, clear_cookies=False).json()
    categories = [
        item["name"] for item in resp["data"]["category"]["queryValues"]
    ]

    channel_tree = {}
    for category in categories:
        channel_tree[category] = {}
        booklist = books_for_each_category(category)

        # Reset the StoryWeaver Community book count and folder index for each category
        storyweaver_community_num = 0
        index = 1
        for book in booklist:
            publisher = book["publisher"]
            language = book["language"]
            level = book["level"]

            if publisher == "StoryWeaver Community":
                storyweaver_community_num += 1
                # Cap each StoryWeaver Community folder at 20 books
                if storyweaver_community_num > 20:
                    index += 1
                    storyweaver_community_num = 1
                publisher = "{}-{}".format(publisher, index)

            channel_tree[category].setdefault(publisher, {}).setdefault(
                language, {}).setdefault(level, []).append(book)
    return channel_tree
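
# The resulting channel_tree has this nested shape (a sketch with made-up
# values; the StoryWeaver Community publisher is split into numbered folders
# of at most 20 books each):
#
#   {
#       "Animals": {
#           "Pratham Books": {
#               "English": {"Level 1": [<book>, ...], "Level 2": [...]},
#           },
#           "StoryWeaver Community-1": {...},
#           "StoryWeaver Community-2": {...},
#       },
#       ...
#   }
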
    def get_content_zip(self, page):
        """
        Get the zip path of the content.
        """
        # Find the zip url of the content and check if it's valid.
        zip_href = page.find("a", href=re.compile(r"\.zip"))
        if not zip_href:
            return None
        zip_url = "http://proyectodescartes.org{}".format(zip_href["href"])
        zip_resp = downloader.make_request(zip_url)

        if zip_resp.status_code != 200:
            return None

        filepath = "/tmp/{}".format(zip_url.split("/")[-1])
        with open(filepath, "wb") as f:
            f.write(zip_resp.content)

        dst = tempfile.mkdtemp()
        html_name = page.find(
            "div", class_="itemFullText").find("a")["href"].split("/")[-1]

        # Unzip the downloaded zip file and re-zip the folder. If index.html
        # does not exist at the top level of the extracted folder, rename the
        # entry page to index.html before zipping the folder again.
        with zipfile.ZipFile(filepath) as zf:
            extracted_src = unquote(filepath.split("/")[-1].split(".zip")[0])
            zf.extractall(dst)
            if html_name != "index.html":
                src_index = os.path.join(dst, extracted_src, html_name)
                dst_index = src_index.replace(html_name, "index.html")
                if os.path.exists(src_index):
                    os.rename(src_index, dst_index)
            zip_path = create_predictable_zip(os.path.join(dst, extracted_src))

        return zip_path
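
    # After extraction and the optional rename, the re-zipped folder looks
    # like this (a sketch):
    #
    #   <extracted_src>/index.html    <- entry page, renamed if necessary
    #   <extracted_src>/...assets...
    #
    # create_predictable_zip packs <extracted_src> itself, so index.html ends
    # up at the zip root, which is what HTMLZipFile expects for an HTML5 app.
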
def add_node_document(booklist, level_topic, as_booklist):
    """
    Add books as DocumentNodes under a specific reading level.
    Parameters:
    * booklist - The list of books to be added as DocumentNodes
    * level_topic - The TopicNode for the current level, to which the
                    DocumentNodes will be attached
    * as_booklist - The list of books from African Storybooks
    """
    for item in booklist:
        # Initialize the source domain and content_id
        domain = uuid.uuid5(uuid.NAMESPACE_DNS, "storyweaver.org.in")
        book_id = str(item["source_id"])

        # If the publisher is the African Storybook Initiative and the book is
        # found there, use the African Storybook source domain and content id
        if item["publisher"] == "African Storybook Initiative":
            check = check_if_story_in_AS(as_booklist, item["title"])
            if check[0]:
                domain = uuid.uuid5(uuid.NAMESPACE_DNS,
                                    "www.africanstorybook.org")
                book_id = check[1]

        # StoryWeaver provides a link to a zip file, so we download the zip
        # and extract the PDF file from it
        with tempfile.NamedTemporaryFile(suffix=".zip") as tempf:
            try:
                resp = downloader.make_request(item["link"],
                                               clear_cookies=False)
                resp.raise_for_status()
                tempf.write(resp.content)
                tempf.flush()  # make sure all bytes are on disk before reopening by name
            except Exception as e:
                # Do not create the node if download fails
                LOGGER.info("Error: {} when downloading {}".format(
                    e, item["link"]))
                continue

            filename = ""
            with zipfile.ZipFile(tempf.name, "r") as f:
                for zipped_file in f.namelist():
                    if os.path.splitext(zipped_file)[1] == ".pdf":
                        tempdir = os.path.dirname(tempf.name)
                        f.extract(zipped_file, path=tempdir)
                        filename = os.path.join(tempdir, zipped_file)
                        break

        # If no PDF file was found in the zip, do not create the node
        if not filename:
            continue

        # Create the document node with given information
        document_file = DocumentFile(path=filename)
        language_obj = getlang_by_name(item["language"])
        book = DocumentNode(
            title=item["title"],
            source_id=book_id,
            author=item["author"],
            provider=item["publisher"],
            files=[document_file],
            license=get_license(licenses.CC_BY,
                                copyright_holder="StoryWeaver"),
            thumbnail=item.get("thumbnail"),
            description=item["description"],
            domain_ns=domain,
            language=language_obj,
        )
        level_topic.add_child(book)
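
# Usage sketch (hypothetical wiring; books_by_level is a made-up name for one
# branch of the channel_tree built above):
#
#   level_topic = TopicNode(source_id="level-1", title="Level 1")
#   add_node_document(books_by_level["Level 1"], level_topic, as_booklist)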