Пример #1
0
    def content_tree_to_channel(self, channel):
        """Build the channel's topic tree from ``self.content_tree``.

        Every key of ``self.content_tree`` becomes a TopicNode added to
        ``channel``. Each module listed under that key is handled as follows:

        * a module with a ``'file'`` key is passed to ``create_leaf_node``
          directly under the subject topic;
        * a module with a ``'children'`` key becomes a sub-topic, and each of
          its children is passed to ``create_leaf_node`` under that sub-topic;
        * any other module is ignored.

        Args:
            channel: node that receives the subject topics via ``add_child``.
        """
        source_id = self.channel_info['CHANNEL_SOURCE_DOMAIN']
        for subject in self.content_tree:
            subject_id = '{}-{}'.format(source_id, subject)

            # Subjects ending in "Route" have translated titles; fall back to
            # English when there is no entry for the current language.
            title = subject
            if subject.endswith('Route'):
                translations = TRANSLATIONS[subject]
                lang = self.lang_id if self.lang_id in translations else 'en'
                title = translations[lang]

            subject_node = nodes.TopicNode(source_id=subject_id, title=title)
            channel.add_child(subject_node)

            for module in self.content_tree[subject]:
                if 'file' in module:
                    self.create_leaf_node(module, subject_node, subject_id)
                elif 'children' in module:
                    subtopic_id = '{}-{}'.format(subject_id, module['id'])
                    subtopic_node = nodes.TopicNode(
                        source_id=subtopic_id,
                        title=module['title'],
                        description=module['description'],
                        # The thumbnail is optional; None when absent.
                        thumbnail=module.get('thumbnail'),
                    )
                    subject_node.add_child(subtopic_node)
                    for child in module['children']:
                        self.create_leaf_node(child, subtopic_node, subtopic_id)
def scrape_channel(channel):
    """Populate ``channel`` from the site's "Categorias" dropdown menu.

    Each category found in the dropdown becomes a topic under ``channel``, each
    of its list entries (except those carrying the ``go-back`` class) becomes a
    sub-topic, and ``scrape_subcategory`` is called to fill that sub-topic.

    Args:
        channel: node that receives the category topics via ``add_child``.
    """
    # The category tree is the <ul> that follows the "btn-categorias" link
    soup = BeautifulSoup(downloader.read(BASE_URL), 'html5lib')
    menu = soup.find('a', {'id': 'btn-categorias'}).find_next_sibling('ul')

    for group in menu.find_all('li', {'class': 'has-children'}):
        for category in group.find_all('li', {'class': 'has-children'}):
            # One topic per category
            category_name = category.find('a').text
            topic = nodes.TopicNode(title=category_name, source_id=get_source_id(category_name))
            channel.add_child(topic)
            LOGGER.info(topic.title)

            for entry in category.find_all('li'):
                # Skip entries carrying the "go-back" class
                if 'go-back' in (entry.attrs.get('class') or []):
                    continue

                anchor = entry.find('a')
                subcategory_name = anchor.text
                subcategory_link = anchor['href']
                LOGGER.info('  {}'.format(subcategory_name))
                subtopic = nodes.TopicNode(title=subcategory_name, source_id=get_source_id(subcategory_link))
                topic.add_child(subtopic)

                # Fill the sub-topic with its resources
                scrape_subcategory(subcategory_link, subtopic)
def scrape_video_menu(url):
    """ Build the "Videos" topic from the subject listing at url
        Args:
            url (str): page listing the video subjects
                (e.g. https://www.exploratorium.edu/video/subjects)
        Returns TopicNode holding one sub-topic per subject found on the page
    """
    LOGGER.info("SCRAPING VIDEOS...")
    root = nodes.TopicNode(title="Videos", source_id="main-topic-videos")
    page = BeautifulSoup(read(url), 'html5lib')

    for subject_div in page.find_all('div', {'class': 'subject'}):
        # Normalise the typographic apostrophe to a plain one
        name_div = subject_div.find('div', {'class': 'name'})
        subject_title = name_div.text.strip().replace("’", "'")
        LOGGER.info("    {}".format(subject_title))

        thumbnail = get_thumbnail_url(subject_div.find('img')['src'])
        subject_topic = nodes.TopicNode(
            title=subject_title,
            source_id="videos-{}".format(subject_title),
            thumbnail=thumbnail,
        )
        root.add_child(subject_topic)

        # Fill the subject topic from the page its link points to
        subject_href = subject_div.find('a')['href']
        scrape_video_subject(subject_href, subject_topic)

    return root
Пример #4
0
def scrape_english_collection(channel):
    """Add an "English" topic to ``channel`` and fill it from the collection page.

    The collection page embeds its section list as JSON in the
    ``data-react-props`` attribute of the first ``div`` inside the
    ``asset-list`` container. Every section whose id is not in
    ``EXCLUDED_TOPIC_IDS`` becomes a sub-topic, which is then filled by
    ``scrape_collection_files``.

    Args:
        channel: node that receives the "English" topic via ``add_child``.
    """
    LOGGER.info('Scraping English collection...')
    english_topic = nodes.TopicNode(source_id=ENGLISH_COLLECTION_URL,
                                    title="English")
    channel.add_child(english_topic)

    contents = BeautifulSoup(downloader.read(ENGLISH_COLLECTION_URL),
                             'html5lib')
    collection_key = get_collection_key(contents)

    topic_selection = contents.find('div', {'class': 'asset-list'}).find('div')
    sections = json.loads(topic_selection['data-react-props'])['sections']
    topic_list = [t for t in sections if t['id'] not in EXCLUDED_TOPIC_IDS]

    for topic in topic_list:
        # Log the name as text: formatting the UTF-8 encoded bytes would log
        # their repr (b'...') on Python 3 instead of the readable name.
        LOGGER.info('    {}'.format(topic['name']))
        topic_node = nodes.TopicNode(source_id=topic['section_key'],
                                     title=topic['name'])
        english_topic.add_child(topic_node)

        # Scrape items in the topic
        url = ENGLISH_ASSETS_URL.format(collection=collection_key,
                                        section=topic['section_key'])
        scrape_collection_files(topic_node, url)
Пример #5
0
    def add_content_to_tree(self, channel):
        """Attach the English class/subject/video hierarchy to ``channel``.

        Reads ``self.channel_tree['English']``: every class becomes a topic,
        every subject inside it a sub-topic, and every entry of the subject's
        ``'items'`` a VideoNode whose fields are read with ``get_column``.

        Args:
            channel: node that receives the class topics via ``add_child``.
        """
        lang = 'English'
        lang_obj = getlang("en")

        for class_name, class_obj in self.channel_tree[lang].items():
            # class_id only prefixes the ids below; the class node itself is
            # identified by the bare class name.
            class_id = "{}-{}".format(lang, class_name)
            class_node = nodes.TopicNode(source_id=class_name, title=class_name)

            for subject_name, subject_obj in class_obj.items():
                subject_id = "{}-{}".format(class_id, subject_name)
                subject_node = nodes.TopicNode(source_id=subject_id, title=subject_name)

                for item in subject_obj['items']:
                    item_id = "{}-{}".format(subject_id, get_column(item, 'id'))
                    subject_node.add_child(nodes.VideoNode(
                        source_id=item_id,
                        title=get_column(item, 'name'),
                        description=get_column(item, 'description'),
                        files=[files.VideoFile(path=get_column(item, 'file'))],
                        language=lang_obj,
                        # FIXME: Use the column's license field instead of hardcoding.
                        license=licenses.get_license(le_licenses.CC_BY, copyright_holder=get_column(item, "copyright")),
                        # thumbnail=get_column(item, "thumbnail")
                    ))

                class_node.add_child(subject_node)

            channel.add_child(class_node)
Пример #6
0
def download_all_writing_topics():
    """Build and return the "Writing Topics" topic.

    Fetches the writing-topics listing page, creates one sub-topic per
    grouping found on it and lets ``download_writing_topic_level`` fill each
    sub-topic from the grouping's content element.

    Returns:
        The "Writing Topics" TopicNode with one child per grouping.
    """
    description = (
        "Do you want to inspire your students to write great"
        " narratives, essays, and reports? Check out these grade-specific"
        " writing topics organized by mode (explanatory, creative, and so on)."
    )
    root = nodes.TopicNode(
        source_id="writing-topic",
        title="Writing Topics",
        language="en",
        thumbnail=writing_topic_thumbnail,
        description=description,
    )

    page = get_parsed_html_from_url(
        'https://k12.thoughtfullearning.com/resources/writingtopics')
    for grouping in page.select('.view-content .view-grouping'):
        # The title is the first content node of the grouping header
        header = grouping.select_one('.view-grouping-header')
        title = header.contents[0].strip()
        level_node = nodes.TopicNode(source_id=title, title=title, language="en")

        print("Downloading writing topic level: %s" % title)
        level_content = grouping.select_one('.view-grouping-content')
        download_writing_topic_level(level_node, level_content)
        root.add_child(level_node)

    return root
Пример #7
0
    def construct_channel(self, *args, **kwargs):
        """
        Creates ChannelNode and builds the topic tree.

        Channel structure: language --> subject --> experiments. One language
        topic is created per entry of XLS_SHEETS and, inside it, one subject
        topic per entry of SUBJECTS; `add_experiments` fills each subject.

        Args:
          - args: arguments passed in on the command line
          - kwargs: extra options passed in as key="value" pairs on the command line
            For example, add the command line option   lang="fr"  and the value
            "fr" will be passed along to `construct_channel` as kwargs['lang'].
        Returns: ChannelNode
        """
        # Create ChannelNode from data in self.channel_info
        channel = self.get_channel(*args, **kwargs)

        # Display names for the known language codes. Any other code is
        # labelled 'Português', which matches the previous if/elif/else chain.
        language_names = {'en': 'English', 'es': 'Español'}

        # XLS_SHEETS maps a language code to the link of its xls sheet
        for lang_code, sheet_link in XLS_SHEETS.items():
            # read the xlsx file using pandas
            xls_file = pandas.read_excel(sheet_link)
            print(lang_code)
            print(sheet_link)

            language = language_names.get(lang_code, 'Português')
            topic_node = nodes.TopicNode(
                title=language,
                source_id='sciensation_{}'.format(language),
                author='Sciensation',
                provider='Sciensation',
                description='{} experiments'.format(language),
                language=lang_code)

            # add subject nodes
            for subject in SUBJECTS:
                subject_node = nodes.TopicNode(
                    title=subject,
                    source_id='sciensation_{0}_{1}'.format(language, subject),
                    author='Sciensation',
                    provider='Sciensation',
                    description='',
                    language=lang_code)

                # Add experiments to the subject node.
                # NOTE(review): the dict is rebuilt for every subject although
                # it only depends on the sheet; hoist it out of this loop once
                # it is confirmed that add_experiments does not mutate it.
                experiment_dict = buildDict(xls_file)
                subject_node = add_experiments(subject, lang_code,
                                               subject_node, experiment_dict)

                topic_node.add_child(subject_node)

            channel.add_child(topic_node)
        return channel
def fetch_youtube_playlists(parent_node):
    """Create one topic per playlist of the YouTube channel under ``parent_node``.

    Each playlist entry returned by ``ydl.extract_info`` becomes a topic, and
    each non-empty video entry in it is converted with ``fetch_video`` and
    added to that topic.

    Return a map of YouTube playlist title to the topic node.
    """
    youtube_channel_url = 'https://www.youtube.com/channel/UCNI0qOojpkhsUtaQ4_2NUhQ/playlists'
    print("--- Fetching videos from YouTube channel (%s) ---" %
          youtube_channel_url)
    print()

    channel_info = ydl.extract_info(youtube_channel_url, download=False)

    topics_by_title = {}
    for playlist in channel_info['entries']:
        playlist_title = playlist['title']
        playlist_url = playlist['webpage_url']
        print("  Downloading playlist %s (%s)" % (playlist_title, playlist_url))

        topic = nodes.TopicNode(source_id=playlist['id'],
                                title=playlist['title'],
                                language="en")
        topics_by_title[playlist_title] = topic
        parent_node.add_child(topic)

        # Falsy entries are skipped
        for video in playlist['entries']:
            if not video:
                continue
            topic.add_child(fetch_video(video))

    return topics_by_title
Пример #9
0
def scrape_category(title, category_url, language):
    """
    Build the topic node for one category page.

    title: Culture
    category_url: http://www.touchableearth.org/china/culture/
        ... redirects to: http://www.touchableearth.org/china-culture-boys-clothing/
    language: value placed in the ``lang`` query parameter of each content URL

    Returns the category TopicNode holding every content node that
    ``scrape_content`` produced for the page's sidebar items.
    """
    print("  Scraping category node: %s (%s)" % (title, category_url))

    category_node = nodes.TopicNode(source_id=category_url, title=title)

    # Each entry of the "subway" sidebar menu on the left is one content item.
    page = get_parsed_html_from_url(category_url)
    seen_slugs = set()

    for item in page.select(".post_title_sub .current_post"):
        slug = item.select_one(".get_post_title")["value"]

        # The Touchable Earth website lists some items twice; keep the first.
        if slug in seen_slugs:
            continue
        seen_slugs.add(slug)

        item_title = item.select_one(".get_post_title2")["value"]
        site_url = item.select_one(".site_url")["value"]
        item_url = "%s/%s?lang=%s" % (site_url, slug, language)

        content_node = scrape_content(item_title, item_url)
        if content_node:
            category_node.add_child(content_node)

    return category_node
Пример #10
0
def scrape_directory(topic, directory, indent=1):
    """Mirror the contents of ``directory`` as children of ``topic``.

    Only the top level of ``directory`` is read here; sub-folders are handled
    by recursing into them. Every sub-folder becomes a sub-topic, every
    ``.mp4`` file a VideoNode, and every ``.pdf`` file is handed to
    ``generate_pdf_nodes``. Other files are ignored.

    Args:
        topic: node that receives the created children.
        directory: path of the folder to scan.
        indent: nesting depth, used only to indent the printed folder names.
    """
    # The first item yielded by os.walk describes `directory` itself; nothing
    # is yielded when the directory cannot be listed.
    top_level = next(os.walk(directory), None)
    if top_level is None:
        return
    current_dir, folder_names, file_names = top_level

    # Sub-folders: one sub-topic each, filled recursively
    for folder_name in folder_names:
        print('{}{}'.format('    ' * indent, folder_name))
        subtopic = nodes.TopicNode(source_id=folder_name, title=folder_name)
        topic.add_child(subtopic)
        scrape_directory(subtopic,
                         os.sep.join([current_dir, folder_name]),
                         indent=indent + 1)

    # Files: videos and PDFs only
    for file_name in file_names:
        name, ext = os.path.splitext(file_name)
        if ext == '.mp4':
            video = nodes.VideoNode(source_id=current_dir + file_name,
                                    title=name,
                                    license=LICENSE,
                                    copyright_holder=COPYRIGHT_HOLDER)
            video.add_file(
                files.VideoFile(os.sep.join([current_dir, file_name])))
            topic.add_child(video)
        elif ext == '.pdf':
            pdf_path = os.path.sep.join([current_dir, file_name])
            with PDFParser(pdf_path) as parser:
                chapters = parser.get_data_file()
                generate_pdf_nodes(chapters,
                                   topic,
                                   source=os.path.basename(file_name))
Пример #11
0
def add_file_node(target_node,
                  url,
                  title,
                  split=False,
                  contents=None,
                  source_id=None,
                  **details):
    """ Creates file nodes at target topic node

    Without ``split`` a single document node for ``url`` is created under
    ``target_node``. With ``split`` a topic node for the whole book is added
    to ``target_node`` and the PDF is split into chapters, each of which
    becomes a document node under that book topic.

    Args:
        target_node: topic node that receives the new node.
        url: location of the PDF.
        title: title of the document (or of the book topic when splitting).
        split: when true, split the PDF into one document node per chapter.
        contents: optional sequence of dicts with an ``'id'`` key; entry *i*
            supplies the source id of chapter *i*. Chapters beyond its length
            (or all chapters when it is omitted) get
            ``"<book source id>-<index>"``.
        source_id: source id for the document/book node; defaults to one
            derived from ``target_node.source_id``.
        **details: extra keyword arguments forwarded to
            ``create_document_node``; ``description`` and ``thumbnail`` are
            also used for the book topic.
    """
    if not split:
        create_document_node(url, title, target_node, source_id
                             or target_node.source_id, **details)
        return

    book_node = nodes.TopicNode(
        source_id=source_id or target_node.source_id + "-main",
        title=title,
        description=details.get('description'),
        thumbnail=details.get('thumbnail'),
    )
    target_node.add_child(book_node)

    # The book description belongs to the book topic only. It is optional, so
    # drop it without failing when the caller did not supply one.
    chapter_details = copy.deepcopy(details)
    chapter_details.pop('description', None)

    # `contents` defaults to None; treat that as "no predefined chapter ids".
    chapter_ids = contents or []

    with PDFParser(url, directory=DOWNLOAD_DIRECTORY) as parser:
        chapters = parser.split_chapters(
            jsondata=JSONDATA.get(book_node.source_id))
        for index, chapter in enumerate(chapters):
            if index < len(chapter_ids):
                chapter_source_id = chapter_ids[index]['id']
            else:
                chapter_source_id = "{}-{}".format(book_node.source_id, index)
            create_document_node(chapter['path'], chapter['title'],
                                 book_node, chapter_source_id,
                                 **chapter_details)
Пример #12
0
    def get_ricecooker_node(self):
        """Parse the page saved at ``self.file_on_disk`` and return its topic node.

        The page is scanned for ``div.section-heading`` elements. A heading
        containing an ``h1`` supplies the topic title. For every other
        heading the element following it is inspected: a ``content-block``
        is passed to ``node_for_text_section``, while a run of consecutive
        ``row`` siblings is collected and passed to ``node_for_rows``.

        Returns:
            A TopicNode whose source id is ``self.url``.
        """
        # Close the file as soon as it has been read
        with open(self.file_on_disk) as page_file:
            soup = BeautifulSoup(page_file.read())

        print("opening {}".format(self.file_on_disk))

        # We'll add the title later when we iterate through the sections
        topic_node = nodes.TopicNode(source_id=self.url, title='')
        sections = soup.find_all('div', attrs={'class': 'section-heading'})
        for section in sections:
            title = section.text.strip()
            # This is the top-level header, meaning it's the page title
            if section.find('h1'):
                print("Page title = {}".format(title))
                topic_node.title = title
                continue

            print("Section = {}".format(title))

            content = section.find_next_sibling()
            if content is None:
                # Heading is the last element of its parent: nothing to add
                continue

            # Not every tag carries a class attribute
            content_classes = content.attrs.get('class', [])
            if "content-block" in content_classes:
                self.node_for_text_section(content)
            elif "row" in content_classes:
                # the section rows are siblings in the tree.
                rows = [content]
                sibling = content.find_next_sibling()
                # Stop at the first non-row sibling or when siblings run out
                while sibling is not None and "row" in sibling.attrs.get('class', []):
                    rows.append(sibling)
                    sibling = sibling.find_next_sibling()

                self.node_for_rows(rows)

        return topic_node
Пример #13
0
def _build_tree(node, sourcetree):
    """Recursively convert ``sourcetree`` into child nodes of ``node``.

    Each entry of ``sourcetree`` is a dict. Its kind is determined with
    ``guess_content_kind`` from its first file (if any) and its questions:

    * topics become TopicNodes and their ``'children'`` are built recursively;
    * exercises become ExerciseNodes with their files and questions attached;
    * entries of any other kind, or of an unknown kind, are skipped.

    Args:
        node: parent node that receives the created children.
        sourcetree: iterable of source-node dicts.

    Returns:
        ``node``, for convenience.
    """
    for child_source_node in sourcetree:
        # Entries without files, or with an empty file list, have no main file.
        source_files = child_source_node.get('files') or []
        main_file = source_files[0] if source_files else {}
        try:
            kind = guess_content_kind(
                path=main_file.get('path'),
                web_video_data=main_file.get('youtube_id') or main_file.get('web_url'),
                questions=child_source_node.get("questions"),
            )
        except UnknownContentKindError:
            continue

        if kind == content_kinds.TOPIC:
            child_node = nodes.TopicNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                thumbnail=child_source_node.get("thumbnail"),
            )
            node.add_child(child_node)
            _build_tree(child_node, child_source_node.get("children", []))

        elif kind == content_kinds.EXERCISE:
            questions = child_source_node['questions']
            # Short exercises must be completed in full; longer ones are
            # mastered with 4 correct answers out of 5.
            if len(questions) < 5:
                exercise_data = {
                    'mastery_model': exercises.DO_ALL,
                    'randomize': True,
                }
            else:
                exercise_data = {
                    'mastery_model': exercises.M_OF_N,
                    'randomize': True,
                    'm': 4,
                    'n': 5,
                }
            child_node = nodes.ExerciseNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=child_source_node.get("license"),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                exercise_data=exercise_data,
                copyright_holder='GreyKite Technologies Pvt. Ltd.',
                thumbnail=child_source_node.get("thumbnail"),
            )

            add_files(child_node, source_files)
            for q in questions:
                child_node.add_question(create_question(q))
            node.add_child(child_node)

        # Any other content kind is not handled here and is skipped.

    return node
Пример #14
0
    def construct_channel(self, *args, **kwargs):
        """Create the ChannelNode with one topic and one HTML5 app per language.

        For every language in ``self.data`` (processed in sorted name order)
        the language's page is downloaded into a zip directory, its
        translation list is removed, the modified page is saved as
        ``index.html``, and the directory is zipped and attached to an
        HTML5AppNode inside a topic named after the language.

        Returns: ChannelNode
        """
        # Create ChannelNode from data in self.channel_info
        channel = self.get_channel(*args, **kwargs)

        for lang_name in sorted(self.data.keys()):
            lang_data = self.data[lang_name]
            LOGGER.info("Creating app for language: {}".format(lang_name))
            lang = languages.getlang_by_native_name(lang_name)

            page_url = lang_data['url']
            zip_dir = self.client.create_zip_dir_for_page(page_url)
            soup = self.client.get_page_soup(page_url)

            # Remove the translation list if found
            translations = soup.find('div', {'id': 'translations'})
            if translations:
                translations.extract()

            # Grab the localized title
            title = soup.find('span', {'id': 'share_title'}).text

            # Use the first resource whose name contains 'dp3t.png' as thumbnail
            thumbnail = next(
                (os.path.join(zip_dir, resource)
                 for resource in lang_data['resources']
                 if 'dp3t.png' in resource),
                None)

            # Save the modified index.html page
            index_path = os.path.join(zip_dir, 'index.html')
            with open(index_path, 'wb') as f:
                f.write(soup.prettify(encoding='utf-8'))

            # create_predictable_zip ensures that the ZIP file does not change each time it's created. This
            # ensures that the zip doesn't get re-uploaded just because zip metadata changed.
            zip_file = zip.create_predictable_zip(zip_dir)

            # Name the zip after the language code when the language is known
            zip_name = lang.primary_code if lang else lang_name
            zip_filename = os.path.join(self.ZIP_DIR,
                                        "{}.zip".format(zip_name))
            os.makedirs(os.path.dirname(zip_filename), exist_ok=True)
            os.rename(zip_file, zip_filename)

            topic = nodes.TopicNode(source_id=lang_name, title=lang_name)
            topic.add_child(nodes.HTML5AppNode(
                source_id="covid19-sim-{}".format(lang_name),
                title=title,
                files=[files.HTMLZipFile(zip_filename)],
                license=licenses.PublicDomainLicense(
                    "Marcel Salathé & Nicky Case"),
                language=lang,
                thumbnail=thumbnail))
            channel.add_child(topic)

        return channel
Пример #15
0
    def construct_channel(self, *args, **kwargs):
        """
        Creates ChannelNode and builds the topic tree: one topic per book
        listed in JSON_FILE, holding one document node per chapter of the
        book's PDF.

        Args:
          - args: arguments passed in during upload_channel (currently None)
          - kwargs: extra arguments and options not handled by `uploadchannel`.
            For example, add the command line option   lang="fr"  and the string
            "fr" will be passed along to `construct_channel` as kwargs['lang'].
        Returns: ChannelNode
        """
        # Create ChannelNode from data in self.channel_info
        channel = self.get_channel(*args, **kwargs)

        # Tags shared by every book topic and chapter document; each node
        # receives its own copy of the list.
        shared_tags = [
            "Teacher facing",
            "Professional development",
            "Life skills",
            "Intercultural skills",
            "Mentorship",
            "Formal contexts",
        ]

        for book in load_json_from_file(JSON_FILE):
            book_title = book['book_title']
            book_source_id = book_title.replace(" ", "_")
            url = book['path_or_url']
            topic_node = nodes.TopicNode(source_id=book_source_id,
                                         title=book_title,
                                         tags=list(shared_tags))
            channel.add_child(topic_node)

            # NOTE(review): the parser is opened but never closed here -
            # confirm whether it needs an explicit close.
            parser = pdf.PDFParser(url, toc=book['chapters'])
            parser.open()
            for chapter in parser.split_chapters():
                chapter_title = chapter['title']
                chapter_file = files.DocumentFile(chapter['path'])
                topic_node.add_child(nodes.DocumentNode(
                    source_id="{} {}".format(book_title, chapter_title),
                    title=chapter_title,
                    author="INTO",
                    tags=list(shared_tags),
                    files=[chapter_file],
                    license=licenses.get_license(CHANNEL_LICENSE, "INTO",
                                                 LICENSE_DESCRIPTION),
                    copyright_holder="INTO"))

        # Check for errors in channel construction
        raise_for_invalid_channel(channel)

        return channel
    def construct_channel(self, *args, **kwargs):
        """
        Creates ChannelNode and builds the topic tree
        Args:
          - args: arguments passed in during upload_channel (currently None)
          - kwargs: extra arguments and options not handled by `uploadchannel`.
            For example, add the command line option   lang="fr"  and the string
            "fr" will be passed along to `construct_channel` as kwargs['lang'].
        Returns: ChannelNode

        Healing Classrooms is organized with the following hierarchy:
            Playlist (TopicNode)
            |   Youtube Video (VideoNode)
            |   Youtube Video (VideoNode)

        """
        channel = self.get_channel(*args, **kwargs)  # Create ChannelNode from data in self.channel_info

        # Download the playlist/video information
        with youtube_dl.YoutubeDL({'skip_download': True}) as ydl:
            info_dict = ydl.extract_info(PLAYLISTS_URL, download=False)

            # Generate topics based off playlist entries in dict
            for playlist in info_dict['entries']:

                # Get language of playlist (hack): guessed from the playlist
                # title, defaulting to French
                language = "fr"
                if "English" in playlist['title']:
                    language = "en"
                elif "Arabic" in playlist['title']:
                    language = "ar"

                playlist_topic = nodes.TopicNode(title=playlist['title'], source_id=playlist['id'], language=language)
                channel.add_child(playlist_topic)

                # Generate videos based off video entries in dict
                for video in playlist['entries']:
                    # Use the first listed thumbnail when there is one. With a
                    # missing or empty list the thumbnail is None (previously
                    # the integer 0 was passed) and one is derived instead.
                    thumbnails = video.get('thumbnails') or []
                    thumbnail_url = thumbnails[0]['url'] if thumbnails else None

                    playlist_topic.add_child(nodes.VideoNode(
                        title=video['title'],
                        source_id=video['id'],
                        license=licenses.PublicDomainLicense(),
                        description=video['description'],
                        derive_thumbnail=not thumbnail_url,
                        files=[files.WebVideoFile(video['webpage_url'])],
                        thumbnail=thumbnail_url,
                        author=AUTHOR,
                        # tags = video['categories'] + video['tags'], # TODO: uncomment this when added
                    ))

        raise_for_invalid_channel(channel)  # Check for errors in channel construction

        return channel
def scrape_subcategory(link, topic):
    """Add one sub-topic per resource filter of a subcategory page to ``topic``.

    Args:
        link: subcategory path, appended to BASE_URL after stripping its
            leading slashes.
        topic: node that receives one child per filter link.
    """
    url = "{}{}".format(BASE_URL, link.lstrip("/"))
    soup = BeautifulSoup(downloader.read(url), 'html5lib')
    filter_links = soup.find('div', {'class': 'menu-filtro'}).find_all('a')

    # The first link is the "All" category, which is skipped
    for anchor in filter_links[1:]:
        label = anchor.text
        LOGGER.info('    {}'.format(label))
        filter_topic = nodes.TopicNode(
            title=label,
            source_id=get_source_id('{}/{}'.format(topic.title, label)),
        )
        scrape_resource_list(url + anchor['href'], filter_topic)
        topic.add_child(filter_topic)
def scrape_snack_menu(url):
    """ Build the "Activities" topic from the snack (activity) menu at url
        Args:
            url (str): page listing the snack subjects
                (e.g. https://www.exploratorium.edu/snacks/snacks-by-subject)
        Returns TopicNode holding one sub-topic per top-level menu entry
    """
    LOGGER.info("SCRAPING ACTIVITIES...")
    root = nodes.TopicNode(title="Activities",
                           source_id="main-topic-activities")
    page = BeautifulSoup(read(url), 'html5lib')

    # Narrow down to #main-content-container .field-items
    container = page.find('div', {'id': 'main-content-container'})
    field_items = container.find('div', {'class': 'field-items'})

    for menu in field_items.find_all('ul', {'class': 'menu'}):
        # Nested .menu lists are reached through their parent entry below
        if menu.parent.name == 'li':
            continue

        # Top-level entries only
        for item in menu.find_all('li', recursive=False):
            link = item.find('a')
            LOGGER.info("    {}".format(link['title']))
            topic = nodes.TopicNode(title=link['title'].replace("’", "'"),
                                    source_id=link['href'])
            root.add_child(topic)

            submenu = item.find('ul')
            if submenu:
                # One sub-topic per link of the nested list
                for sublink in submenu.find_all('a'):
                    LOGGER.info("    > {}".format(sublink['title']))
                    subtopic = nodes.TopicNode(
                        title=sublink['title'].replace("’", "'"),
                        source_id=sublink['href'])
                    topic.add_child(subtopic)
                    scrape_snack_subject(sublink['href'], subtopic)
            else:
                # No subcategories: scrape the entry's own page
                scrape_snack_subject(link['href'], topic)

    return root
Пример #19
0
def download_writing_assessment_grade(grade_node, grade_doc):
    """Add one category topic per ``.item-list`` element of ``grade_doc``.

    Each category is titled after its ``h3`` heading, filled by
    ``download_writing_assessment_category`` and then added to ``grade_node``.
    """
    for item_list in grade_doc.select('.item-list'):
        heading = item_list.select_one('h3')
        title = heading.text.strip()

        # The source id is the grade's source id joined with the title
        category_source_id = "%s|%s" % (grade_node.source_id, title)
        category_node = nodes.TopicNode(
            source_id=category_source_id,
            title=title,
            language="en",
            thumbnail=writing_assessment_thumbnail,
        )

        print("    Downloading writing assessment category: %s" % title)
        download_writing_assessment_category(category_node, item_list)
        grade_node.add_child(category_node)
def get_or_create_level_topic(level_id, language_id, language_topic):
    """Return the level topic under ``language_topic``, creating it if needed.

    An existing child is recognised by its source id, as computed by
    ``get_level_source_id``. A new topic is titled with
    ``LEVELS_NAMES[level_id]`` and added to ``language_topic``.
    """
    level_title = LEVELS_NAMES[level_id]
    level_source_id = get_level_source_id(language_id, level_id)

    # Reuse the first child that already carries this source id
    existing = next(
        (child for child in language_topic.children
         if child.source_id == level_source_id),
        None,
    )
    if existing is not None:
        return existing

    level_topic = nodes.TopicNode(source_id=level_source_id, title=level_title)
    language_topic.add_child(level_topic)
    return level_topic
Пример #21
0
def download_student_model_level(level_node, level_doc):
    """Add one category topic per ``.item-list`` element of ``level_doc``.

    Each category is titled after its ``h3`` heading, filled by
    ``download_student_model_category`` and then added to ``level_node``.
    """
    for item_list in level_doc.select('.item-list'):
        heading = item_list.select_one('h3')
        title = heading.text.strip()

        # The source id is the level's source id joined with the title
        category_source_id = "%s|%s" % (level_node.source_id, title)
        category_node = nodes.TopicNode(
            source_id=category_source_id,
            title=title,
            language="en",
            thumbnail=student_model_thumbnail,
        )

        print("    Downloading student model category: %s" % title)
        download_student_model_category(category_node, item_list)
        level_node.add_child(category_node)
def process_folder(channel, raw_path, subfolders, filenames):
    """
    Create a topic for the folder `raw_path`, attach it to `channel` under its
    parent path, and add one content node to it for each file in `filenames`.

    The topic's title and description are read from the `metadata.ini` of the
    parent folder; per-file metadata is read from the `metadata.ini` inside
    `raw_path`. `metadata.ini` is removed from `filenames` in place, and
    `subfolders` is not used.
    """
    path_as_list = get_path_as_list(raw_path)

    # A. TOPIC
    # The last path component names the topic; the rest locates its parent
    topic_title = path_as_list.pop()
    parent_node = get_node_for_path(channel, path_as_list)

    # The parent folder's metadata.ini holds this topic's title and description
    parent_path = os.path.dirname(raw_path)
    parent_config = configparser.ConfigParser()
    parent_config.read(os.path.join(parent_path, 'metadata.ini'))

    topic = nodes.TopicNode(
        source_id=raw_path,
        title=parent_config.get(topic_title, 'title'),
        description=parent_config.get(topic_title, 'description', fallback=None),
    )
    parent_node.add_child(topic)

    # metadata.ini describes the folder and is not content itself
    assert 'metadata.ini' in filenames
    filenames.remove('metadata.ini')

    # B. PROCESS FILES
    files_config = configparser.ConfigParser()
    files_config.read(os.path.join(raw_path, 'metadata.ini'))

    for filename in filenames:
        if filename in IGNORABLE_FILENAMES:
            continue

        # The file name without extension is the section name in metadata.ini
        file_key, file_ext = os.path.splitext(filename)
        ext = file_ext[1:]
        kind = content_kinds.MAPPING[ext] if ext in content_kinds.MAPPING else None

        # prepare node data
        filepath = os.path.abspath(os.path.join(raw_path, filename))
        source_id = os.path.join(raw_path, filename)
        license = files_config.get(file_key, 'license')
        title = files_config.get(file_key, 'title')
        optionals = {
            'author': files_config.get(file_key, 'author', fallback=None),
            'description': files_config.get(file_key, 'description', fallback=None),
        }

        # attach to containing topic
        topic.add_child(
            make_content_node(kind, source_id, title, license, filepath, optionals))
def get_or_create_language_topic(language, channel):
    """
    Return the child topic of `channel` for `language`, creating it if needed.

    `language` is a mapping with "id" and "name" keys. Children are matched on
    the source id produced by `get_language_source_id`; when none matches, a
    new `TopicNode` titled with the language name is added to `channel`.
    """
    lang_id, lang_name = language["id"], language["name"]
    wanted_source_id = get_language_source_id(lang_id)

    existing = next(
        (node for node in channel.children if node.source_id == wanted_source_id),
        None,
    )
    if existing is not None:
        return existing

    new_topic = nodes.TopicNode(source_id=wanted_source_id, title=lang_name)
    channel.add_child(new_topic)
    return new_topic
def get_or_create_tag_topic(tag, language_id, level_id, level_topic):
    """
    Return the child topic of `level_topic` for `tag`, creating it if needed.

    Children are matched on the source id produced by `get_tag_source_id` from
    the language id, level id and `tag["id"]`. When none matches, a new
    `TopicNode` titled with `get_tag_name(tag, language_id)` is added to
    `level_topic`.
    """
    tag_id = tag["id"]
    title = get_tag_name(tag, language_id)
    wanted_source_id = get_tag_source_id(language_id, level_id, tag_id)

    existing = next(
        (node for node in level_topic.children if node.source_id == wanted_source_id),
        None,
    )
    if existing is not None:
        return existing

    new_topic = nodes.TopicNode(source_id=wanted_source_id, title=title)
    level_topic.add_child(new_topic)
    return new_topic
Пример #25
0
def download_all_minilessons():
    """
    Build and return the "Minilessons" topic.

    The minilessons listing page is fetched and every `.pane-views-panes`
    element becomes a category topic named after the pane's `.view-header`
    text. Each category is filled by `download_minilesson_category` before it
    is attached to the returned topic.
    """
    listing_url = 'https://k12.thoughtfullearning.com/resources/minilessons'
    minilessons_topic = nodes.TopicNode(
        source_id="minilesson",
        title="Minilessons",
        language="en",
        thumbnail=minilesson_thumbnail,
        description=
        "Do you want quick lessons that teach concepts or skills? Each 10-15 minute minilesson presents a concept and engages students in an activity.",
    )

    listing = get_parsed_html_from_url(listing_url)
    for category_pane in listing.select('.pane-views-panes'):
        header = category_pane.select_one('.view-header')
        category_title = header.text.strip()
        # the header text doubles as the category's source id
        category_topic = nodes.TopicNode(
            source_id=category_title,
            title=category_title,
            language="en",
        )
        print("Downloading minilesson category %s" % (category_title,))
        download_minilesson_category(category_topic, category_pane)
        minilessons_topic.add_child(category_topic)

    return minilessons_topic
def scrape_iversity(channel):
    """
    Scrape the iversity course's lesson units into topics under `channel`.

    Each `div.chapter-units-wrapper` on the lesson-units page becomes a topic.
    Within a chapter, every `a.unit-wrapper` containing an `i.unit_video`
    element is fetched and turned into a video node built from the page's
    `<source res="480">` file and its subtitles `<track>`; other units are only
    logged as unsupported.
    """
    url = "{}/en/my/courses/rethinking-us-them-integration-and-diversity-in-europe/lesson_units".format(
        BASE_URL)
    LOGGER.info("   Scraping Migration Matters at {}".format(url))
    course_page = read_source(url)

    for chapter in course_page.find_all('div', {'class': 'chapter-units-wrapper'}):
        title = str(chapter.find('div', {'class': 'chapter-title'}).string)
        # the source id is the stripped title with spaces replaced
        topic = nodes.TopicNode(
            source_id=title.strip().replace(" ", "_"),
            title=title,
        )

        for lesson in chapter.find_all('a', {'class': 'unit-wrapper'}):
            has_video = lesson.find('i', {'class': 'unit_video'})
            title_span = lesson.find('span', {'class': 'unit-title'})
            video_title = str(title_span.string).strip()

            if not has_video:
                LOGGER.info(
                    "Format of the file is not supported by the sushi chef : {}"
                    .format(video_title))
                continue

            # the lesson link is relative to BASE_URL; the video lives on that page
            lesson_page = read_source("{}{}".format(BASE_URL, lesson.attrs["href"]))
            video_tag = lesson_page.find('video')

            subtitle_track = video_tag.find('track', {'kind': 'subtitles'})
            subtitle_file = files.SubtitleFile(
                path=subtitle_track.attrs["src"],
                language=languages.getlang('en').code)

            source_480 = video_tag.find('source', {'res': '480'})
            video_file = files.VideoFile(
                path=source_480.attrs["src"],
                language=languages.getlang('en').code)

            video_node = nodes.VideoNode(
                source_id=video_title.replace(" ", "_"),
                title=video_title,
                files=[video_file, subtitle_file],
                license=CHANNEL_LICENSE,
                copyright_holder=COPYRIGHT_HOLDER)
            LOGGER.info("   Uploading video - {}".format(
                video_title.strip()))
            topic.add_child(video_node)

        channel.add_child(topic)
Пример #27
0
def parse_resources(resource_name, resource_data, book_node, **auth_info):
    """
    Create a resource topic under `book_node` and add its PDF documents.

    The topic's source id is the book's source id joined by a dash with the
    lower-cased resource name (spaces replaced by dashes). Only resources whose
    'link_document_url' ends in ".pdf" are added, via `add_file_node`, which
    also receives `auth_info`. A falsy `resource_data` is treated as empty.
    """
    book_source_id = book_node.source_id
    slug = resource_name.replace(' ', '-').lower()

    # Create resource topic
    resource_node = nodes.TopicNode(
        source_id="{}-{}".format(book_source_id, slug),
        title=resource_name,
    )
    book_node.add_child(resource_node)

    # Add resource documents
    for resource in (resource_data or []):
        document_url = resource.get('link_document_url')
        if not document_url or not document_url.endswith(".pdf"):
            continue
        description = parse_description(resource.get('resource_description'))
        add_file_node(
            resource_node,
            document_url,
            resource.get('resource_heading'),
            description=description,
            **auth_info
        )
Пример #28
0
def download_all_student_models():
    """
    Build and return the "Student Models" topic.

    The student-models listing page is fetched and every
    `.view-content .view-grouping` element becomes a level topic named after
    the first content item of its `.view-grouping-header`. Each level is filled
    by `download_student_model_level` from the grouping's
    `.view-grouping-content` before it is attached to the returned topic.
    """
    listing_url = 'https://k12.thoughtfullearning.com/resources/studentmodels'
    student_models_topic = nodes.TopicNode(
        source_id="student-models",
        title="Student Models",
        language="en",
        thumbnail=student_model_thumbnail,
        description=
        "When you need an example written by a student, check out our vast collection of free student models.",
    )

    listing = get_parsed_html_from_url(listing_url)
    for grouping in listing.select('.view-content .view-grouping'):
        header = grouping.select_one('.view-grouping-header')
        level_title = header.contents[0].strip()
        # the header text doubles as the level's source id
        level_topic = nodes.TopicNode(
            source_id=level_title,
            title=level_title,
            language="en",
        )
        print("Downloading student model level: %s" % (level_title,))
        level_content = grouping.select_one('.view-grouping-content')
        download_student_model_level(level_topic, level_content)
        student_models_topic.add_child(level_topic)

    return student_models_topic
Пример #29
0
def _build_tree(node, sourcetree):
    """
    Parse nodes given in `sourcetree` and add as children of `node`.
    """
    for child_source_node in sourcetree:
        try:
            main_file = child_source_node['files'][
                0] if 'files' in child_source_node else {}
            kind = guess_content_kind(
                path=main_file.get('path'),
                web_video_data=main_file.get('youtube_id')
                or main_file.get('web_url'),
                questions=child_source_node.get("questions"))
        except UnknownContentKindError:
            continue

        if kind == content_kinds.TOPIC:
            child_node = nodes.TopicNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                thumbnail=child_source_node.get("thumbnail"),
            )
            node.add_child(child_node)

            source_tree_children = child_source_node.get("children", [])

            _build_tree(child_node, source_tree_children)

        elif kind == content_kinds.VIDEO:
            child_node = nodes.VideoNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=get_license(child_source_node.get("license"),
                                    description="Description of license",
                                    copyright_holder=child_source_node.get(
                                        'copyright_holder')),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                derive_thumbnail=True,  # video-specific data
                thumbnail=child_source_node.get('thumbnail'),
            )
            add_files(child_node, child_source_node.get("files") or [])
            node.add_child(child_node)

        else:  # unknown content file format
            continue

    return node
Пример #30
0
def download_all_writing_assessments():
    """
    Build and return the "Writing Assessments" topic.

    The writing-assessment listing page is fetched and every
    `.view-writing-assessment-silo` element becomes a grade topic named after
    the first content item of its `.view-grouping-header`. Each grade is filled
    by `download_writing_assessment_grade` from the element's `.view-content`
    before it is attached to the returned topic.
    """
    listing_url = 'https://k12.thoughtfullearning.com/resources/writingassessment'
    assessments_topic = nodes.TopicNode(
        source_id="writing-assessment",
        title="Writing Assessments",
        language="en",
        thumbnail=writing_assessment_thumbnail,
        description=
        "When you want students to understand how writing is graded, turn to our vast selection of assessment examples. You'll find elementary and middle school models in all of the major modes of writing, along with rubrics that assess each example as \"Strong,\" \"Good,\" \"Okay,\" or \"Poor.\"",
    )

    listing = get_parsed_html_from_url(listing_url)
    for silo in listing.select('.view-writing-assessment-silo'):
        header = silo.select_one('.view-grouping-header')
        grade_title = header.contents[0].strip()
        # the header text doubles as the grade's source id
        grade_topic = nodes.TopicNode(
            source_id=grade_title,
            title=grade_title,
            language="en",
        )
        print("Downloading writing assessment grade: %s" % (grade_title,))
        grade_content = silo.select_one('.view-content')
        download_writing_assessment_grade(grade_topic, grade_content)
        assessments_topic.add_child(grade_topic)

    return assessments_topic