def construct_channel(self, **kwargs):
    """Build the sample channel: one "Ebooks" topic holding a single ePub document."""
    channel = self.get_channel(**kwargs)

    # Topic that will contain the ebook
    ebooks_topic = TopicNode(
        source_id="ebooksfolder",
        title="Ebooks",
        language=languages.getlang('en').code,
    )
    channel.add_child(ebooks_topic)

    # Wrap the sample ePub file in a DocumentNode
    sample_epub = EPubFile(path='samplefiles/documents/laozi_tao-te-ching.epub')
    tao_te_ching = DocumentNode(
        source_id="<en_doc_id>",
        title='Tao Te Ching',
        author='Lao Zi',
        description='This is a sample epub document',
        license=licenses.PUBLIC_DOMAIN,
        language=languages.getlang('en').code,
        files=[sample_epub],
    )
    ebooks_topic.add_child(tao_te_ching)

    return channel
def parse_page(self, channel, page):
    """Parse the course-list page.

    Creates one TopicNode per category filter found in the page's filter
    nav, then extracts the course list from the embedded `__data__` script
    tag and attaches a TopicNode per course under its category, recursing
    into each course via `self.parse_course`.
    """
    categories = {}

    # One TopicNode per category filter link
    filters = page.find("nav", class_="course-list__filters").find_all("a")
    for category in filters:
        category_id = category["data-filterby"]
        # Exclude the pseudo-filter which contains all the courses
        if category_id == "all":
            continue
        category_node = TopicNode(source_id=category.text, title=category.text)
        categories[category_id] = category_node
        channel.add_child(category_node)

    # Course data is embedded as JS inside <script id="__data__">.
    data = page.find("script", {"id": "__data__"}).text
    # FIX: raw string literal -- "\[" in a non-raw string is an invalid
    # escape sequence (SyntaxWarning since Python 3.12).
    pattern = re.compile(r"courses: \[(.*?)}]")
    # The split on "}, " strips the closing brace of every object (the last
    # one lost its brace to the regex), so "}" is re-appended before parsing.
    # NOTE(review): this breaks if any course field contains "}, " -- fragile.
    courses = pattern.search(data).group(1).split("}, ")
    for item in courses:
        course = json.loads(item + "}")
        course_url = "{base}course/{slug}?enroll-success=1".format(
            base=BASE_URL, slug=course["slug"])
        course_node = TopicNode(
            source_id="{lang}-{course}".format(lang=CHANNEL_LANGUAGE,
                                               course=course["slug"]),
            # Titles appear to contain literal \uXXXX escapes; decode them.
            title=course["title"].encode("utf-8").decode("unicode_escape"),
            thumbnail=course["image"],
        )
        categories[course["category"]].add_child(course_node)
        self.parse_course(course_node, course_url)
def get_files(topic_name, directory, files):
    """Return a TopicNode for `topic_name` containing one node per file in `files`."""
    topic = TopicNode(title=topic_name, source_id="{}_id".format(topic_name))
    for filename in files:
        # H5P files get no explicit title; all others are titled after the
        # filename without its extension.
        title = None if filename.endswith("h5p") else os.path.splitext(filename)[0]
        child = get_file(topic_name, directory, filename, title)
        if child:
            topic.add_child(child)
    return topic
def construct_channel(*args, **kwargs):
    """Create the channel with two topics built from Wikipedia list pages."""
    channel = create_channel(*args, **kwargs)

    # (source_id, title, Wikipedia list URL) for each top-level topic, in order
    topic_specs = (
        ("List_of_citrus_fruits", "Citrus!",
         "https://en.wikipedia.org/wiki/List_of_citrus_fruits"),
        ("List_of_potato_cultivars", "Potatoes!",
         "https://en.wikipedia.org/wiki/List_of_potato_cultivars"),
    )
    for source_id, title, url in topic_specs:
        topic = TopicNode(source_id=source_id, title=title)
        channel.add_child(topic)
        add_subpages_from_wikipedia_list(topic, url)

    return channel
def construct_channel(self, **kwargs):
    """Build the channel as a two-level tree (title -> group) of HTML5 apps.

    Rows from `index.no_dl_index()` appear to arrive grouped, so a change in
    title or group starts a new parent node.
    """
    channel = self.get_channel(**kwargs)

    current_title = None
    current_group = None
    title_node = None
    group_node = None
    counter = 0
    for metadata, zfilename, (title, group) in index.no_dl_index():
        counter += 1
        # New title => start a new top-level topic; group context resets.
        if title != current_title:
            current_title = title
            title_node = TopicNode(source_id=title + str(counter),
                                   title=replace(title))
            channel.add_child(title_node)
            current_group = None
        # New group within the current title => new second-level topic.
        if group != current_group:
            current_group = group
            group_node = TopicNode(source_id=title + group + str(counter + 0.5),
                                   title=replace(group))
            title_node.add_child(group_node)
        app_node = HTML5AppNode(
            title=metadata.title,
            description=metadata.description,
            source_id=zfilename + str(counter + 0.9),
            license=LICENCE,
            language='es',
            files=[HTMLZipFile(path=zfilename)],
        )
        group_node.add_child(app_node)
    return channel
def build_pdf_topics(main_topic, sections, lang_code):
    """
    Adds the documents from the sections tree to the `main_topic`.
    - CASE A = no children => add as DocumentNode
    - CASE B = has children => add as TopicNode and add all children as DocumentNode
    """
    LICENSE = get_license("CC BY-NC-SA", copyright_holder=POINTB)

    def _doc_node(entry, description):
        # One DocumentNode per PDF; source_id combines filename and language.
        abspath = entry['path']
        filename = os.path.basename(abspath)
        return DocumentNode(
            title=entry['title'],
            description=description,
            source_id='%s-%s' % (filename, lang_code),
            license=LICENSE,
            aggregator=LE,
            language=lang_code,
            role=roles.COACH,
            files=[DocumentFile(path=abspath, language=lang_code)])

    for i, section in enumerate(sections):
        if 'children' not in section:
            # CASE A: leaf section -> document directly under main topic
            main_topic.add_child(_doc_node(
                section,
                'Chapter from A GUIDE TO BECOMING A 21ST CENTURY TEACHER'))
        else:
            # CASE B: section with children -> intermediate TopicNode
            section_topic = TopicNode(title=section['title'],
                                      source_id="pointb_section_" + str(i))
            main_topic.add_child(section_topic)
            for subsection in section['children']:
                section_topic.add_child(_doc_node(subsection, ''))
    return main_topic
def construct_channel(self, **kwargs):
    """Build a one-topic channel containing a single potato-growing PDF."""
    channel = self.get_channel(**kwargs)

    topic = TopicNode(title="Potatoes!", source_id="<potatos_id>")
    channel.add_child(topic)

    pdf = DocumentFile(
        path='https://www.gov.mb.ca/inr/pdf/pubs/mafri-potatoe.pdf',
        language='en')
    article = DocumentNode(
        title='Growing potatoes',
        description='An article about growing potatoes on your rooftop.',
        source_id='pubs/mafri-potatoe',
        license=get_license('CC BY', copyright_holder='University of Alberta'),
        language='en',
        files=[pdf],
    )
    topic.add_child(article)

    return channel
def addAssignment(a_info):
    """Create a TopicNode for an assignment dict.

    `a_info` must contain 'title' and 'id'; 'description' is optional and is
    simply omitted from the node when absent.
    """
    try:
        assignment = TopicNode(title=a_info['title'],
                               source_id=a_info['id'],
                               description=a_info['description'],
                               language='en',
                               derive_thumbnail=True,
                               thumbnail=None)
    except KeyError:
        # 'description' key missing -- build the node without it.
        # (Was a bare `except:`, which silently swallowed every error,
        # including KeyboardInterrupt and real TopicNode failures.)
        assignment = TopicNode(title=a_info['title'],
                               source_id=a_info['id'],
                               language='en',
                               derive_thumbnail=True,
                               thumbnail=None)
    return assignment
def test_generate_tiled_thumbnail(self, document, html, video, audio):
    """A topic's thumbnail is tiled from its processed children's thumbnails."""
    topic = TopicNode('test-topic', 'Topic')
    for fixture in (document, html, video, audio):
        topic.add_child(fixture)

    config.THUMBNAILS = True
    # Children must be processed before the topic so their thumbnails exist.
    for child in topic.children:
        child.process_files()
    filenames = topic.process_files()

    assert len(filenames) == 1, 'expected one filename'
    self.check_has_thumbnail(topic)
def parse_topics(self, topics, channel):
    """ Parse the topics on the site. """
    final_topics = []
    main_topics = []
    for topic in topics:
        href = topic["href"].split(MAIN_PAGE_HREF)
        subject = href[-1].split("/")[0]
        if subject in SUBJECT_BLACKLIST:
            continue
        # Build the subject TopicNode from the link's text and target
        subject_link = BASE_URL.format(href[-1])
        subject_title = topic.text.strip()
        subject_topic = TopicNode(source_id=subject_title, title=subject_title)
        # An "l2" list item marks a subtopic of the most recent main topic;
        # anything else hangs directly off the channel. Parent is resolved
        # before this topic is appended, so [-1] is the previous main topic.
        is_subtopic = topic.parent.parent.attrs["class"][0] == "l2"
        parent = main_topics[-1] if is_subtopic else channel
        main_topics.append(subject_topic)
        final_topics.append((subject_topic, subject_link, parent))
    return final_topics
def hier(medium, curriculum_tags):
    """Attach curriculum tags for `medium` into the global `nodes` tree.

    Returns the TopicNodes for tags that are not an ancestor of any other
    tag (the "leaf" tags). Children whose ancestor node was not created yet
    on the first pass are re-attached in a second pass.
    """
    out_tags = []
    all_ancestors = []
    for tag in curriculum_tags:
        all_ancestors.extend(tag['ancestor_ids'])

    # Failed attachments grouped by (medium, ancestor). FIX: the previous
    # plain dict kept only the LAST failed child per ancestor, silently
    # dropping siblings that failed under the same missing ancestor.
    retry = {}
    for tag in sorted(curriculum_tags, key=lambda x: x['id']):
        slug = tag['slug']
        _id = tag['id']
        name = tag['name']
        if not tag['ancestor_ids']:
            ancestor = "ROOT"
        else:
            ancestor = tag['ancestor_ids'][-1]
        # attach to tree
        if _id not in nodes[medium]:
            nodes[medium][_id] = TopicNode(source_id=slug, title=name)
        try:
            add_child_replacement(nodes[medium][ancestor],
                                  nodes[medium][_id],
                                  before=True)
        except Exception:
            # Ancestor node doesn't exist yet -- queue for the retry pass.
            retry.setdefault((medium, ancestor), []).append(nodes[medium][_id])
        if _id not in all_ancestors:
            out_tags.append(nodes[medium][_id])

    for (retry_medium, retry_ancestor), children in retry.items():
        for child in children:
            add_child_replacement(nodes[retry_medium][retry_ancestor],
                                  child, before=True)

    assert out_tags
    return out_tags
def parse_through_tree(tree, parent_topic, as_booklist):
    """
    Recursively parsing through the tree and adding TopicNodes and DocumentNodes.

    Parameters:
    * tree - The tree that contains information about category, publisher,
      language, level, and book and is going to be parsed
    * parent_topic - The parent node that will be attached with Nodes created later
    * as_booklist - the list of books from African Storybook
    """
    for topic_name in sorted(tree):
        subtree = tree[topic_name]
        # Integer-like keys are reading levels; other keys keep their name.
        try:
            title = "Level {}".format(int(topic_name))
        except ValueError:
            title = topic_name
        node = TopicNode(
            source_id="{}_{}".format(parent_topic.source_id,
                                     topic_name.replace(" ", "_")),
            title=title,
        )
        # Lists hold books directly; anything else is a nested sub-tree.
        if type(subtree) is list:
            add_node_document(subtree, node, as_booklist)
        else:
            parse_through_tree(subtree, node, as_booklist)
        # Only add the topic when it actually ended up with children.
        if node.children:
            parent_topic.add_child(node)
def construct_channel(*args, **kwargs):
    """Create the channel and add one topic built from a Wikipedia list page."""
    channel = create_channel(*args, **kwargs)
    cities = TopicNode(source_id="List_of_largest_cities", title="Cities!")
    channel.add_child(cities)
    add_subpages_from_wikipedia_list(
        cities, "https://en.wikipedia.org/wiki/List_of_largest_cities")
    return channel
def make_topic_for_settings(title, ffmpeg_settings):
    """ Assumes global VIDEO_URLS available. """
    topic = TopicNode(
        source_id=title,
        title=title,
        description='',
        author=None,
        language=getlang('en').id,
        thumbnail=None,
    )
    # One child per URL, titled "Video 1", "Video 2", ...
    for vid_number, video_url in enumerate(VIDEO_URLS, start=1):
        video_node = make_video_node('Video {}'.format(vid_number),
                                     video_url,
                                     ffmpeg_settings=ffmpeg_settings)
        topic.add_child(video_node)
    return topic
def construct_channel(self, **kwargs):
    """Build a channel holding one "Potatoes!" topic with a single PDF document."""
    channel = self.get_channel(**kwargs)

    topic = TopicNode(title="Potatoes!", source_id="<potatoes_id>")
    channel.add_child(topic)

    pdf = DocumentFile(
        path="https://www.gov.mb.ca/inr/pdf/pubs/mafri-potatoe.pdf",
        language="en",
    )
    topic.add_child(DocumentNode(
        title="Growing potatoes",
        description="An article about growing potatoes on your rooftop.",
        source_id="pubs/mafri-potatoe",
        license=get_license("CC BY", copyright_holder="University of Alberta"),
        language="en",
        files=[pdf],
    ))

    return channel
def get_node(_path, rootparent, nodes):
    """Return (creating if needed) the TopicNode for the topic path `_path`.

    `nodes` maps path tuples to already-created TopicNodes; missing parent
    nodes are created recursively up to `rootparent`. Titles are cleaned up
    via the `badlist`/`arabic` lookup tables before a node is created.
    Mutates the module-level `badcount` counter.
    """
    global badcount
    path = tuple(_path)
    # Fast path: node already exists for this exact path.
    if path in nodes:
        return nodes[path]
    # Resolve (and create if necessary) the parent node first.
    if len(path) > 1:
        parent = get_node(path[:-1], rootparent, nodes)
        assert parent
    else:
        parent = rootparent
    if path not in nodes:
        title = path[-1]
        # Apply manual renames from the badlist table.
        if title in badlist.rename:
            badcount = badcount + 1
            print("badlist: RENAME", title, badcount)
            title = badlist.rename[title]
        # mangle title:
        if arabic.grade in title:
            print("grade: SKIP GRADE", title)
        else:
            print("grade: NO GRADE", title)
            # Look for a drop-word; presumably its presence means the title
            # should be replaced by a canonical subject name -- TODO confirm.
            drop = False
            for drop_word in arabic.drop_words:
                if drop_word in title:
                    drop = drop_word
                    print("grade: ", drop)
            if drop:
                newtitle = False
                for subject in arabic.subjects:
                    if subject in title:
                        title = arabic.subjects[subject]
                        newtitle = True
                assert newtitle, title
                print("grade: CHANGE TO", title)
                # check if new title present!
                path = list(path)
                path[-1] = title
                path = tuple(path)
                # The rewritten title may map onto an existing node.
                if path in nodes:
                    print("NEW PATH")
                    return nodes[path]
        nodes[path] = TopicNode(source_id="topic" + title, title=title)
        # Blacklisted titles get a node but are NOT attached to the tree.
        if title in badlist.badlist:
            badcount = badcount + 1
            print("badlist: BAD: ", title, badcount)
            return nodes[path]  # unconnected!
        parent.add_child(nodes[path])
    return nodes[path]
def construct_channel(self, **kwargs):
    """Build the channel: one topic per category, one HTML5 app per crawled link."""
    channel = self.get_channel(**kwargs)
    for name, _id in catnum.items():
        cat_node = TopicNode(source_id=str(_id), title=name)
        channel.add_child(cat_node)
        for link in list(crawl.get_all_links(_id)):
            zipfilename, title = localise.zip_from_url(link)
            appzip = HTMLZipFile(zipfilename)
            # A "<zip>_2.jpg" next to the zip file serves as the thumbnail.
            thumb_path = zipfilename + "_2.jpg"
            thumb = ThumbnailFile(thumb_path) if os.path.exists(thumb_path) else None
            zipnode = HTML5AppNode(source_id=link,
                                   title=title,
                                   license=licenses.CC_BY,
                                   copyright_holder=CHANNEL_NAME,
                                   files=[appzip],
                                   author=crawl.author_lookup[link],
                                   thumbnail=thumb)
            zipnode.validate()
            cat_node.add_child(zipnode)
        cat_node.validate()
    print("DONE")
    return channel
def construct_channel(self, **kwargs):
    """Scrape goalkicker.com: one topic per book page, each holding its PDF."""
    channel = self.get_channel(**kwargs)

    # Soupify goalkicker main page
    gk_url = 'https://' + self.channel_info['CHANNEL_SOURCE_DOMAIN'] + '/'
    gk_soup = get_soup(gk_url)

    # Every bookContainer element links to one book page
    book_page_urls = [gk_url + container.find('a')['href']
                      for container in gk_soup.find_all(class_='bookContainer')]

    for page_url in book_page_urls:
        # Soupify the book page and extract its metadata
        page_soup = get_soup(page_url)
        book_info = parse_book_info(page_soup)
        book_info['absolute_url'] = page_url + book_info['relative_url']

        # One topic per book subject, with the PDF document underneath
        subject_topic = TopicNode(title=book_info['subject'],
                                  source_id='topic/' + book_info['subject'])
        channel.add_child(subject_topic)
        subject_topic.add_child(DocumentNode(
            title=book_info['title'],
            description=book_info['description'],
            source_id=book_info['source_id'],
            license=get_license('CC BY-SA', copyright_holder='Creative Commons'),
            language='en',
            files=[DocumentFile(path=book_info['absolute_url'], language='en')],
        ))

    return channel
def construct_channel(self, *args, **kwargs):
    """Build a channel mirroring Wikipedia's "Articles containing video clips" category."""
    channel = self.get_channel(**kwargs)

    videos_topic = TopicNode(
        source_id="/wiki/Category:Articles_containing_video_clips",
        title="Articles containing video clips")
    channel.add_child(videos_topic)

    # HTML5 app node for the category page itself
    thumbnail_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/e/ee/A_Is_for_Atom_1953.webm/220px--A_Is_for_Atom_1953.webm.jpg'
    page = download_wikipedia_page('/wiki/Category:Articles_containing_video_clips', thumbnail_url, 'A Is for Atom')
    videos_topic.add_child(page)

    # Video node plus one subtitle file per language
    video_url = 'https://upload.wikimedia.org/wikipedia/commons/e/ee/A_Is_for_Atom_1953.webm'
    video_node = VideoNode(title='A Is for Atom 1953',
                           source_id='A_Is_for_Atom_1953.webm',
                           files=[VideoFile(path=video_url)],
                           license=licenses.PublicDomainLicense())
    subtitle_url = 'https://commons.wikimedia.org/w/api.php?action=timedtext&title=File%3AA_Is_for_Atom_1953.webm&lang={}&trackformat=srt'
    for lang in ('en', 'es'):
        video_node.add_file(SubtitleFile(path=subtitle_url.format(lang),
                                         language=lang,
                                         subtitlesformat='srt'))
    videos_topic.add_child(video_node)

    return channel
def get_things(all_things, parent_node, new_node=True):
    """Attach a video node and an HTML5 app node for every thing.

    When `new_node` is true each thing gets its own TopicNode under
    `parent_node`; otherwise both children hang off `parent_node` directly.
    """
    for thing in all_things:
        _id = thing.url.strip('/').split('/')[-1]  # TODO hash
        if new_node:
            container = TopicNode(source_id=thing.url, title=thing.title)
        else:
            container = parent_node

        video_node = make_youtube_video(thing.youtube,
                                        "Video: {}".format(thing.title),
                                        "video__{}".format(thing.url))  # TODO hash
        if video_node is not None:
            container.add_child(video_node)

        # Dump the app payload to html/<id>.zip and wrap it as an HTML5 app
        try:
            os.mkdir('html')
        except Exception:
            pass  # best-effort: directory presumably exists already
        fn = "html/{}.zip".format(_id)
        with open(fn, "wb") as f:
            f.write(thing.app)
        app_zip = HTMLZipFile(fn)
        # Pick the grammatically correct article for the title
        an = "an" if thing.title[0] in "AEIOUaeiou" else "a"
        container.add_child(HTML5AppNode(
            source_id="app_{}".format(thing.url),
            title="Being {} {}".format(an, thing.title),
            license=LICENCE,
            files=[app_zip]))

        if new_node:
            parent_node.add_child(container)
def construct_channel(self, *args, **kwargs):
    """ Create ChannelNode and build topic tree. """
    # ChannelNode is created from the data in self.channel_info
    channel = self.get_channel(*args, **kwargs)
    cities = TopicNode(source_id="List_of_largest_cities", title="Cities!")
    channel.add_child(cities)
    add_subpages_from_wikipedia_list(
        cities, "https://en.wikipedia.org/wiki/List_of_largest_cities")
    return channel
def construct_channel(self, **kwargs):
    """Build the channel: one topic per category, filled recursively with books."""
    channel = self.get_channel(**kwargs)
    # add topics and corresponding books to the channel
    channel_tree = download_all()
    as_booklist = get_AS_booklist_dict()
    for category in sorted(channel_tree):
        topic = TopicNode(source_id=category.replace(" ", "_"), title=category)
        channel.add_child(topic)
        parse_through_tree(channel_tree[category], topic, as_booklist)
    return channel
def construct_channel(self, **kwargs):
    """Download, crop and split the source PDFs, then build the English and
    Burmese document/video topic trees."""
    # these 2 methods output to the downloads folder
    download_documents()
    crop_documents()
    en_chapters = split_pdfs('English')
    my_chapters = split_pdfs('Burmese')

    channel = self.get_channel(**kwargs)

    doc_thumb = DOWNLOADS_FOLDER + '/thumbnail.png'
    video_thumb = DOWNLOADS_FOLDER + '/videothumbnail.png'
    main_topic_en = TopicNode(title="21ST CENTURY GUIDE English Topic",
                              source_id="main_en",
                              thumbnail=doc_thumb)
    topic_videos_en = TopicNode(title="Videos",
                                source_id="pointb_en_videos",
                                thumbnail=video_thumb)
    main_topic_my = TopicNode(title="21ST CENTURY GUIDE Burmese Topic",
                              source_id="main_my",
                              thumbnail=doc_thumb)
    topic_videos_my = TopicNode(title="Videos",
                                source_id="pointb_my_videos",
                                thumbnail=video_thumb)

    add_documents(main_topic_en, en_chapters, 'en')
    add_documents(main_topic_my, my_chapters, 'my')
    channel.add_child(main_topic_en)
    channel.add_child(main_topic_my)

    # download_videos fills each video topic and returns it
    topic_videos_en = download_videos(topic_videos_en, 'en')
    topic_videos_my = download_videos(topic_videos_my, 'my')
    channel.add_child(topic_videos_en)
    channel.add_child(topic_videos_my)
    return channel
def generate_child_topics(arvind_contents, main_topic, lang_obj, topic_type):
    """Attach video topics for one language to `main_topic`.

    STANDARD_TOPIC creates a child TopicNode per topic name; SINGLE_TOPIC
    downloads everything directly into `main_topic`.
    """
    # Create a topic for each languages
    data = arvind_contents[lang_obj.name]
    for topic_name in data:
        if topic_type == STANDARD_TOPIC:
            child = TopicNode(title=topic_name,
                              source_id='arvind-child-topic-{0}'.format(topic_name))
            download_video_topics(data, topic_name, child, lang_obj)
            main_topic.add_child(child)
        if topic_type == SINGLE_TOPIC:
            download_video_topics(data, topic_name, main_topic, lang_obj)
    return main_topic
def add_documents(topic, chapters, language):
    """Attach chapter PDFs to `topic`; chapters with children become sub-topics."""
    thumbnail = DOWNLOADS_FOLDER + '/thumbnail.png'
    # NOTE(review): copyright_holder='NC-SA 4.0' looks like a fragment of the
    # license name rather than an actual holder -- confirm with the source.
    for idx, chapter in enumerate(chapters):
        doc_title = chapter['title']
        if 'children' in chapter:
            # Chapter with sub-chapters: wrap them in a child TopicNode
            child_topic_node = TopicNode(title=doc_title,
                                         source_id=language + doc_title,
                                         thumbnail=thumbnail)
            for child in chapter['children']:
                child_doc_title = child['title']
                child_topic_node.add_child(DocumentNode(
                    title=child_doc_title,
                    description=f'Chapter {idx} from {doc_title}',
                    source_id=language + child_doc_title,
                    license=get_license('CC BY', copyright_holder='NC-SA 4.0'),
                    language=language,
                    thumbnail=thumbnail,
                    files=[DocumentFile(path=child['path'], language=language)],
                ))
            topic.add_child(child_topic_node)
        else:
            # Plain chapter: document goes directly under the topic
            topic.add_child(DocumentNode(
                title=doc_title,
                description=f'Chapter {idx} from 21ST CENTURY GUIDE',
                source_id=language + doc_title,
                license=get_license('CC BY', copyright_holder='NC-SA 4.0'),
                language=language,
                thumbnail=thumbnail,
                files=[DocumentFile(path=chapter['path'], language=language)],
            ))
def generate_child_topics(arvind_contents, main_topic, lang_obj, topic_type):
    """Attach per-topic video nodes for one language to `main_topic`.

    STANDARD_TOPIC creates a child TopicNode per topic name; SINGLE_TOPIC
    downloads everything directly into `main_topic`.
    """
    # Create a topic for each languages
    # (removed unused local: pp = pprint.PrettyPrinter() was never referenced)
    data = arvind_contents[lang_obj.name]
    for topic_index in data:
        if topic_type == STANDARD_TOPIC:
            topic_node = TopicNode(title=topic_index,
                                   source_id=lang_obj.code + '_' + topic_index)
            download_video_topics(data, topic_index, topic_node, lang_obj)
            main_topic.add_child(topic_node)
        if topic_type == SINGLE_TOPIC:
            download_video_topics(data, topic_index, main_topic, lang_obj)
    return main_topic
def download_subject(self, subject, link, parent):
    """ Parse each subject page.

    Attaches `subject` under `parent` and fills it with one age-range topic
    per AGE_RANGE bucket that has content, paging through results 20 at a
    time via self.download_content.
    """
    LOGGER.info("Processing subject: {}".format(subject.title))
    # No need to parse the content under the subject when link is not valid
    if "javascript:void(0);" in link:
        parent.add_child(subject)
        return
    # Parse each subject's index page
    resp = downloader.make_request(link)
    soup = BeautifulSoup(resp.content, "html.parser")
    selected_category = soup.find("option", {
        "class": "level0",
        "selected": "selected"
    })
    # Bail out entirely if the page has no selected category option.
    if not selected_category:
        return
    parent.add_child(subject)
    for item in AGE_RANGE.keys():
        # Base query params; taga[i] entries select the age-range tags.
        params = OrderedDict([("category", selected_category["value"]),
                              ("moduleId", "282"), ("format", "count")])
        for index in range(len(AGE_RANGE[item])):
            params["taga[{}]".format(index)] = AGE_RANGE[item][index]
        # Parse the topics of age range under each subject
        resp = downloader.make_request("{}/itemlist/filter".format(link),
                                       params=params)
        # First line of the response is the result count for this filter.
        count = int(resp.text.split('\n')[0])
        if count == 0:
            continue
        LOGGER.info("Processing topic: {}".format(item))
        age_topic = TopicNode(source_id=item, title=item)
        subject.add_child(age_topic)
        # Results are paged 20 per request.
        total_pages = ceil(count / 20)
        for i in range(total_pages):
            page_params = OrderedDict(params)
            LOGGER.info("Processing page: {}".format(i))
            self.download_content(age_topic, link, page_params,
                                  selected_category["value"], i * 20)
def parse_through_tree(tree, parent_topic, as_booklist):
    """Recursively mirror `tree` under `parent_topic` as topics and documents."""
    for topic_name in sorted(tree):
        subtree = tree[topic_name]
        # Integer-like keys are reading levels; other keys keep their name.
        try:
            title = 'Level {}'.format(int(topic_name))
        except ValueError:
            title = topic_name
        node = TopicNode(
            source_id='{}_{}'.format(parent_topic.source_id,
                                     topic_name.replace(' ', '_')),
            title=title,
        )
        parent_topic.add_child(node)
        # Lists hold books directly; anything else is a nested sub-tree.
        if type(subtree) is list:
            add_node_document(subtree, node, as_booklist)
        else:
            parse_through_tree(subtree, node, as_booklist)
def get_subtopics(parent, path):
    """Add a TopicNode (with its lessons) under `parent` for every sidebar link."""
    doc = get_page(path)
    try:
        # The subtopic menu lives in the col-md-2 column of the body row.
        menu_row = doc.find('div', {'id': 'body-row'}).find('div', {'class': 'col-md-2'})
    except Exception as e:
        LOGGER.error('get_subtopics: %s : %s' % (e, doc))
        return
    for subtopic in menu_row.find_all('a'):
        try:
            title = subtopic.get_text().strip()
            source_id = get_source_id(subtopic['href'])
            LOGGER.info('  subtopic: %s: %s' % (source_id, title))
            node = TopicNode(title=title, source_id=source_id)
            parent.add_child(node)
            get_lessons(node, subtopic['href'])
        except Exception as e:
            # One broken link shouldn't abort the whole menu.
            LOGGER.error('get_subtopics: %s : %s' % (e, subtopic))
def download_category(self, parent, cat_id, categories, sims, keywords, language):
    """ Process a category, and add all its sub-categories, and its simulations/videos. """
    print("Processing category:", cat_id)
    cat = categories[str(cat_id)]

    blacklist = ID_BLACKLIST_BY_LANG.get(language, ID_BLACKLIST_BY_LANG['en'])
    # loop through all subtopics and recursively add them
    # (reverse order seems to give most rational results)
    for child_id in reversed(cat["childrenIds"]):
        # look up the child category by ID, skipping blacklisted ones
        subcat = categories[str(child_id)]
        if subcat["name"] in blacklist:
            continue
        # make the title human-readable, and clean it up
        title = (subcat["name"].replace("-", " ").title()
                 .replace(" And ", " and ")
                 .replace("Mathconcepts", "Concepts")
                 .replace("Mathapplications", "Applications"))
        if language == "ar":
            title = ARABIC_NAME_CATEGORY[title]
        # create the topic node, and add it to the parent
        subtopic = TopicNode(
            source_id=subcat["name"],
            title=title,
        )
        parent.add_child(subtopic)
        # recursively download the contents of the topic
        self.download_category(subtopic, child_id, categories, sims,
                               keywords, language)

    # loop through all sims in this topic and add them, but only if we're at a leaf topic
    if len(parent.children) == 0:
        for sim_id in list(set(cat["simulationIds"])):
            # skip ones that aren't found (probably as they aren't HTML5)
            if sim_id not in sims:
                continue
            self.download_sim(parent, sims[sim_id], keywords, language)