def content_tree_to_channel(self, channel):
    """Convert ``self.content_tree`` into a topic tree attached to ``channel``.

    Each top-level key of the content tree becomes a subject TopicNode.
    Modules carrying a ``'file'`` entry become leaf nodes directly under the
    subject; modules carrying ``'children'`` become sub-topics whose children
    are leaf nodes.

    Args:
        channel: the ChannelNode (or TopicNode) to attach subjects to.
    """
    source_id = self.channel_info['CHANNEL_SOURCE_DOMAIN']
    for subject in self.content_tree:
        subject_id = '{}-{}'.format(source_id, subject)
        title = subject
        if subject.endswith('Route'):
            # Route subjects have translated display titles; fall back to
            # English when the channel language has no translation.
            # (idiom fix: `x not in d` instead of `not x in d`)
            lang = self.lang_id if self.lang_id in TRANSLATIONS[subject] else 'en'
            title = TRANSLATIONS[subject][lang]
        subject_node = nodes.TopicNode(source_id=subject_id, title=title)
        channel.add_child(subject_node)

        for module in self.content_tree[subject]:
            if 'file' in module:
                self.create_leaf_node(module, subject_node, subject_id)
            elif 'children' in module:
                subtopic_id = '{}-{}'.format(subject_id, module['id'])
                subtopic_node = nodes.TopicNode(
                    source_id=subtopic_id,
                    title=module['title'],
                    description=module['description'],
                    thumbnail=module.get('thumbnail'),  # None when absent
                )
                subject_node.add_child(subtopic_node)
                for child in module['children']:
                    self.create_leaf_node(child, subtopic_node, subtopic_id)
    # NOTE: the original accumulated module titles in an unused local list
    # (`child_topics`); it was never read, so it has been removed.
def scrape_channel(channel):
    """Scrape the site's category dropdown into topic/subtopic nodes on `channel`."""
    # Read from Categorias dropdown menu
    page = BeautifulSoup(downloader.read(BASE_URL), 'html5lib')
    dropdown = page.find('a', {'id': 'btn-categorias'}).find_next_sibling('ul')

    # Go through dropdown and generate topics and subtopics
    for category_list in dropdown.find_all('li', {'class': 'has-children'}):
        # Parse categories
        for category in category_list.find_all('li', {'class': 'has-children'}):
            # Add this topic to channel when scraping entire channel
            category_name = category.find('a').text
            topic = nodes.TopicNode(title=category_name, source_id=get_source_id(category_name))
            channel.add_child(topic)
            LOGGER.info(topic.title)

            # Parse subcategories, skipping the "go-back" navigation item
            for subcategory in category.find_all('li'):
                if not subcategory.attrs.get('class') or 'go-back' not in subcategory.attrs['class']:
                    # Get rid of this check to scrape entire site
                    subcategory_name = subcategory.find('a').text
                    subcategory_link = subcategory.find('a')['href']
                    LOGGER.info('    {}'.format(subcategory_name))
                    subtopic = nodes.TopicNode(title=subcategory_name, source_id=get_source_id(subcategory_link))
                    topic.add_child(subtopic)

                    # Parse resources
                    scrape_subcategory(subcategory_link, subtopic)
def scrape_video_menu(url):
    """Build the "Videos" topic tree from the video subjects page.

    Args:
        url (str): page listing video subjects
            (e.g. https://www.exploratorium.edu/video/subjects)

    Returns:
        TopicNode with one child topic per subject, each populated by
        scrape_video_subject.
    """
    LOGGER.info("SCRAPING VIDEOS...")
    video_topic = nodes.TopicNode(title="Videos", source_id="main-topic-videos")
    page = BeautifulSoup(read(url), 'html5lib')

    for subject in page.find_all('div', {'class': 'subject'}):
        name_div = subject.find('div', {'class': 'name'})
        # Normalize curly apostrophes to straight ones in titles.
        subject_title = name_div.text.strip().replace("’", "'")
        LOGGER.info("    {}".format(subject_title))
        subject_topic = nodes.TopicNode(
            title=subject_title,
            source_id="videos-{}".format(subject_title),
            thumbnail=get_thumbnail_url(subject.find('img')['src']),
        )
        video_topic.add_child(subject_topic)
        scrape_video_subject(subject.find('a')['href'], subject_topic)

    return video_topic
def scrape_english_collection(channel):
    """Scrape the English collection page and attach its topic tree to `channel`.

    Section metadata is embedded in the page as JSON inside a React props
    attribute; sections listed in EXCLUDED_TOPIC_IDS are skipped.
    """
    LOGGER.info('Scraping English collection...')
    english_topic = nodes.TopicNode(source_id=ENGLISH_COLLECTION_URL, title="English")
    channel.add_child(english_topic)

    contents = BeautifulSoup(downloader.read(ENGLISH_COLLECTION_URL), 'html5lib')
    collection_key = get_collection_key(contents)

    topic_selection = contents.find('div', {'class': 'asset-list'}).find('div')
    topic_list = [
        t for t in json.loads(topic_selection['data-react-props'])['sections']
        if t['id'] not in EXCLUDED_TOPIC_IDS
    ]

    for topic in topic_list:
        # FIX: dropped the Python 2-era .encode('utf-8') — on Python 3 it
        # made the log line show a bytes repr (b'...') instead of the name.
        LOGGER.info('    {}'.format(topic['name']))
        topic_node = nodes.TopicNode(source_id=topic['section_key'], title=topic['name'])
        english_topic.add_child(topic_node)

        # Scrape items in the topic
        url = ENGLISH_ASSETS_URL.format(collection=collection_key, section=topic['section_key'])
        scrape_collection_files(topic_node, url)
def add_content_to_tree(self, channel):
    """Build the class -> subject -> video hierarchy from self.channel_tree
    and attach it to `channel`. Only the 'English' branch is processed.
    """
    tree = self.channel_tree
    lang = 'English'
    lang_obj = getlang("en")
    for class_name in tree[lang]:
        class_obj = tree[lang][class_name]
        class_id = "{}-{}".format(lang, class_name)
        # NOTE(review): class_id is computed but source_id uses the bare
        # class_name — possibly intended to be class_id; confirm.
        class_node = nodes.TopicNode(source_id=class_name, title=class_name)
        for subject_name in class_obj:
            subject_id = "{}-{}".format(class_id, subject_name)
            subject_node = nodes.TopicNode(source_id=subject_id, title=subject_name)
            subject_obj = class_obj[subject_name]
            for item in subject_obj['items']:
                item_id = "{}-{}".format(subject_id, get_column(item, 'id'))
                video = nodes.VideoNode(
                    source_id=item_id,
                    title=get_column(item, 'name'),
                    description=get_column(item, 'description'),
                    files=[
                        files.VideoFile(path=get_column(item, 'file'))
                    ],
                    language=lang_obj,
                    # FIXME: Use the column's license field instead of hardcoding.
                    license=licenses.get_license(le_licenses.CC_BY, copyright_holder=get_column(item, "copyright")),
                    # thumbnail=get_column(item, "thumbnail")
                )
                subject_node.add_child(video)
            class_node.add_child(subject_node)
        channel.add_child(class_node)
def download_all_writing_topics():
    """Scrape the writing-topics page into a "Writing Topics" topic tree.

    Returns:
        TopicNode with one child node per grade level, each populated by
        download_writing_topic_level.
    """
    root = nodes.TopicNode(
        source_id="writing-topic",
        title="Writing Topics",
        language="en",
        thumbnail=writing_topic_thumbnail,
        description=(
            "Do you want to inspire your students to write great"
            " narratives, essays, and reports? Check out these grade-specific"
            " writing topics organized by mode (explanatory, creative, and so on)."
        ),
    )
    doc = get_parsed_html_from_url(
        'https://k12.thoughtfullearning.com/resources/writingtopics')

    for grouping in doc.select('.view-content .view-grouping'):
        level_title = grouping.select_one('.view-grouping-header').contents[0].strip()
        level_node = nodes.TopicNode(source_id=level_title, title=level_title, language="en")
        print("Downloading writing topic level: %s" % level_title)
        download_writing_topic_level(level_node, grouping.select_one('.view-grouping-content'))
        root.add_child(level_node)

    return root
def construct_channel(self, *args, **kwargs):
    """ Creates ChannelNode and build topic tree
    Args:
      - args: arguments passed in on the command line
      - kwargs: extra options passed in as key="value" pairs on the command line
        For example, add the command line option   lang="fr"  and the value
        "fr" will be passed along to `construct_channel` as kwargs['lang'].
    Returns: ChannelNode

    Channel structure: language --> subject --> experiments
    """
    channel = self.get_channel(*args, **kwargs)  # Create ChannelNode from data in self.channel_info

    # Display names for the supported spreadsheet languages; anything else
    # (i.e. 'pt') falls back to Português, matching the original if/elif chain.
    language_names = {'en': 'English', 'es': 'Español'}

    for lang_code, sheet_url in XLS_SHEETS.items():
        # Read the language's xlsx sheet using pandas.
        # (FIX: removed leftover debug print() calls of lang_code/url.)
        xls_file = pandas.read_excel(sheet_url)
        language = language_names.get(lang_code, 'Português')

        topic_node = nodes.TopicNode(
            title=language,
            source_id='sciensation_{}'.format(language),
            author='Sciensation',
            provider='Sciensation',
            # TODO: 'experiements' typo kept for channel-content stability.
            description='{} experiements'.format(language),
            language=lang_code)

        # The experiment dict depends only on the sheet, so build it once
        # per language instead of once per subject (hoisted out of the loop).
        experiment_dict = buildDict(xls_file)

        # add subject nodes
        for subject in SUBJECTS:
            subject_node = nodes.TopicNode(
                title=subject,
                source_id='sciensation_{0}_{1}'.format(language, subject),
                author='Sciensation',
                provider='Sciensation',
                description='',
                language=lang_code)
            # Add exercises to subject nodes
            subject_node = add_experiments(subject, lang_code, subject_node, experiment_dict)
            topic_node.add_child(subject_node)

        channel.add_child(topic_node)

    return channel
def fetch_youtube_playlists(parent_node):
    """Fetch all of the YouTube playlists from the YouTube channel.

    Each playlist becomes a TopicNode under `parent_node`, with one child
    per video (via fetch_video).

    Return a map of YouTube playlist title to the topic node.
    """
    youtube_channel_url = 'https://www.youtube.com/channel/UCNI0qOojpkhsUtaQ4_2NUhQ/playlists'
    print("--- Fetching videos from YouTube channel (%s) ---" % youtube_channel_url)
    print()

    topics_map = {}
    info = ydl.extract_info(youtube_channel_url, download=False)
    # FIX: dropped the unused enumerate() indices (i, j) from both loops.
    for playlist in info['entries']:
        title = playlist['title']
        youtube_url = playlist['webpage_url']
        print("  Downloading playlist %s (%s)" % (title, youtube_url))
        playlist_topic = nodes.TopicNode(source_id=playlist['id'],
                                         title=playlist['title'],
                                         language="en")
        topics_map[title] = playlist_topic
        parent_node.add_child(playlist_topic)
        for video in playlist['entries']:
            if video:  # extract_info can yield None entries; skip them
                playlist_topic.add_child(fetch_video(video))

    return topics_map
def scrape_category(title, category_url, language):
    """Scrape one Touchable Earth category into a TopicNode.

    title: Culture
    category_url: http://www.touchableearth.org/china/culture/
        ... redirects to: http://www.touchableearth.org/china-culture-boys-clothing/
    """
    print("  Scraping category node: %s (%s)" % (title, category_url))
    category_node = nodes.TopicNode(source_id=category_url, title=title)

    # Iterate over each item in the "subway" sidebar menu on the left.
    doc = get_parsed_html_from_url(category_url)
    seen_slugs = set()
    for content in doc.select(".post_title_sub .current_post"):
        slug = content.select_one(".get_post_title")["value"]
        # The Touchable Earth website seems to contain duplicate entries —
        # skip any slug we've already processed.
        if slug in seen_slugs:
            continue
        seen_slugs.add(slug)

        item_title = content.select_one(".get_post_title2")["value"]
        site_url = content.select_one(".site_url")["value"]
        url = "%s/%s?lang=%s" % (site_url, slug, language)
        content_node = scrape_content(item_title, url)
        if content_node:
            category_node.add_child(content_node)

    return category_node
def scrape_directory(topic, directory, indent=1):
    """Recursively mirror `directory` into `topic`.

    Sub-folders become TopicNodes (recursing one level at a time), .mp4 files
    become VideoNodes, and .pdf files are split into document nodes via
    PDFParser / generate_pdf_nodes.

    Args:
        topic: node to attach children to.
        directory: filesystem path to walk.
        indent: indentation depth for progress printing.
    """
    for subdirectory, folders, myfiles in os.walk(directory):
        # Go through all of the folders under directory
        for folder in folders:
            print('{}{}'.format('    ' * indent, folder))
            subtopic = nodes.TopicNode(source_id=folder, title=folder)
            topic.add_child(subtopic)
            # Recurse into the sub-folder.
            # (FIX: use idiomatic os.path.join instead of os.sep.join.)
            scrape_directory(subtopic, os.path.join(subdirectory, folder), indent=indent + 1)

        for file in myfiles:
            name, ext = os.path.splitext(file)
            if ext == '.mp4':
                # NOTE: source_id concatenates dir + filename with no
                # separator; kept as-is so existing channel IDs stay stable.
                video = nodes.VideoNode(source_id=subdirectory + file,
                                        title=name,
                                        license=LICENSE,
                                        copyright_holder=COPYRIGHT_HOLDER)
                video.add_file(files.VideoFile(os.path.join(subdirectory, file)))
                topic.add_child(video)
            elif ext == '.pdf':
                with PDFParser(os.path.join(subdirectory, file)) as parser:
                    chapters = parser.get_data_file()
                    generate_pdf_nodes(chapters, topic, source=os.path.basename(file))

        # Only process the top level of os.walk; deeper levels are handled
        # by the recursive calls above.
        break
def add_file_node(target_node, url, title, split=False, contents=None, source_id=None, **details):
    """ Creates file nodes at target topic node.

    When `split` is True, the PDF at `url` is split into chapters, each added
    as a document node under a new book topic; otherwise a single document
    node is created directly under `target_node`.

    Args:
        target_node: topic node to attach to.
        url: PDF location.
        title: node title.
        split: whether to split the PDF into per-chapter nodes.
        contents: optional list of dicts supplying per-chapter source ids.
        source_id: optional explicit source id.
        **details: extra node metadata (description, thumbnail, ...).
    """
    if not split:
        create_document_node(url, title, target_node,
                             source_id or target_node.source_id, **details)
        return

    book_node = nodes.TopicNode(
        source_id=source_id or target_node.source_id + "-main",
        title=title,
        description=details.get('description'),
        thumbnail=details.get('thumbnail'),
    )
    target_node.add_child(book_node)

    # Chapters inherit the book metadata except the description.
    chapter_details = copy.deepcopy(details)
    # FIX: was `del chapter_details['description']`, which raised KeyError
    # when no description was passed.
    chapter_details.pop('description', None)

    with PDFParser(url, directory=DOWNLOAD_DIRECTORY) as parser:
        chapters = parser.split_chapters(jsondata=JSONDATA.get(book_node.source_id))

    for index, chapter in enumerate(chapters):
        # BUG FIX: `contents` defaults to None; the original evaluated
        # len(contents) unconditionally and crashed when it was omitted.
        if contents and index < len(contents):
            chapter_source_id = contents[index]['id']
        else:
            chapter_source_id = "{}-{}".format(book_node.source_id, index)
        create_document_node(chapter['path'], chapter['title'], book_node,
                             chapter_source_id, **chapter_details)
def get_ricecooker_node(self):
    """Parse the downloaded HTML page (self.file_on_disk) into a TopicNode.

    The page's h1 section heading supplies the topic title; each following
    section heading introduces either a text block or a run of "row" divs.
    """
    soup = BeautifulSoup(open(self.file_on_disk).read())
    print("opening {}".format(self.file_on_disk))

    # We'll add the title later when we iterate through the sections
    topic_node = nodes.TopicNode(source_id=self.url, title='')

    for section in soup.find_all('div', attrs={'class': 'section-heading'}):
        title = section.text.strip()
        if section.find('h1'):
            # This is the top-level header, meaning it's the page title
            print("Page title = {}".format(title))
            topic_node.title = title
            continue
        print("Section = {}".format(title))
        content = section.find_next_sibling()
        if "content-block" in content.attrs['class']:
            self.node_for_text_section(content)
        elif "row" in content.attrs['class']:
            # The section's rows are consecutive siblings in the tree.
            rows = [content]
            sibling = content.find_next_sibling()
            # BUG FIX: guard against running off the end of the document —
            # find_next_sibling() returns None, and the original then crashed
            # on None.attrs. Also renamed the variable so it no longer
            # shadows the builtin `next`.
            while sibling is not None and "row" in sibling.attrs['class']:
                rows.append(sibling)
                sibling = sibling.find_next_sibling()
            self.node_for_rows(rows)

    return topic_node
def _build_tree(node, sourcetree):
    """Recursively build topic/exercise nodes from `sourcetree` dicts.

    Entries whose content kind cannot be determined are skipped. Returns
    `node` with the new children attached.
    """
    for child_source_node in sourcetree:
        try:
            main_file = child_source_node['files'][0] if 'files' in child_source_node else {}
            kind = guess_content_kind(
                path=main_file.get('path'),
                web_video_data=main_file.get('youtube_id') or main_file.get('web_url'),
                questions=child_source_node.get("questions"))
        except UnknownContentKindError:
            continue

        if kind == content_kinds.TOPIC:
            child_node = nodes.TopicNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                thumbnail=child_source_node.get("thumbnail"),
            )
            node.add_child(child_node)
            _build_tree(child_node, child_source_node.get("children", []))

        elif kind == content_kinds.EXERCISE:
            # Mastery model: small exercises require every question, larger
            # ones require 4 of 5.  (FIX: dropped the redundant int() around
            # len(); len() already returns int.)
            if len(child_source_node['questions']) < 5:
                exercise_data = {
                    'mastery_model': exercises.DO_ALL,
                    'randomize': True,
                }
            else:
                exercise_data = {
                    'mastery_model': exercises.M_OF_N,
                    'randomize': True,
                    'm': 4,
                    'n': 5,
                }
            child_node = nodes.ExerciseNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=child_source_node.get("license"),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                exercise_data=exercise_data,
                copyright_holder='GreyKite Technologies Pvt. Ltd.',
                thumbnail=child_source_node.get("thumbnail"),
            )
            add_files(child_node, child_source_node.get("files") or [])
            # FIX: guard with `or []` so a missing "questions" key doesn't
            # crash the iteration.
            for q in child_source_node.get("questions") or []:
                child_node.add_question(create_question(q))
            node.add_child(child_node)

        else:  # unknown content file format
            continue

    return node
def construct_channel(self, *args, **kwargs): channel = self.get_channel( *args, **kwargs) # Create ChannelNode from data in self.channel_info lang_names = list(self.data.keys()) lang_names.sort() for lang_name in lang_names: lang_data = self.data[lang_name] LOGGER.info("Creating app for language: {}".format(lang_name)) lang = languages.getlang_by_native_name(lang_name) zip_dir = self.client.create_zip_dir_for_page(lang_data['url']) soup = self.client.get_page_soup(lang_data['url']) # Remove the translation list if found translations = soup.find('div', {'id': 'translations'}) if translations: translations.extract() # Grab the localized title title = soup.find('span', {'id': 'share_title'}).text # Save the modified index.html page thumbnail = None for resource in lang_data['resources']: if 'dp3t.png' in resource: thumbnail = os.path.join(zip_dir, resource) break with open(os.path.join(zip_dir, 'index.html'), 'wb') as f: f.write(soup.prettify(encoding='utf-8')) # create_predictable_zip ensures that the ZIP file does not change each time it's created. This # ensures that the zip doesn't get re-uploaded just because zip metadata changed. zip_file = zip.create_predictable_zip(zip_dir) zip_name = lang.primary_code if lang else lang_name zip_filename = os.path.join(self.ZIP_DIR, "{}.zip".format(zip_name)) os.makedirs(os.path.dirname(zip_filename), exist_ok=True) os.rename(zip_file, zip_filename) topic = nodes.TopicNode(source_id=lang_name, title=lang_name) zip_node = nodes.HTML5AppNode( source_id="covid19-sim-{}".format(lang_name), title=title, files=[files.HTMLZipFile(zip_filename)], license=licenses.PublicDomainLicense( "Marcel Salathé & Nicky Case"), language=lang, thumbnail=thumbnail) topic.add_child(zip_node) channel.add_child(topic) return channel
def construct_channel(self, *args, **kwargs):
    """ Creates ChannelNode and build topic tree
    Args:
      - args: arguments passed in during upload_channel (currently None)
      - kwargs: extra argumens and options not handled by `uploadchannel`.
        For example, add the command line option   lang="fr"  and the string
        "fr" will be passed along to `construct_channel` as kwargs['lang'].
    Returns: ChannelNode

    One topic per book from JSON_FILE, with one document node per chapter
    split out of the book's PDF.
    """
    channel = self.get_channel(*args, **kwargs)  # Create ChannelNode from data in self.channel_info

    # Every node in this channel carries the same tag set.
    shared_tags = ["Teacher facing", "Professional development",
                   "Life skills", "Intercultural skills", "Mentorship",
                   "Formal contexts"]

    for topic in load_json_from_file(JSON_FILE):
        book_title = topic['book_title']
        topic_node = nodes.TopicNode(
            source_id=book_title.replace(" ", "_"),
            title=book_title,
            tags=list(shared_tags))
        channel.add_child(topic_node)

        parser = pdf.PDFParser(topic['path_or_url'], toc=topic['chapters'])
        parser.open()
        for chapter in parser.split_chapters():
            chapter_title = chapter['title']
            pdf_node = nodes.DocumentNode(
                source_id="{} {}".format(book_title, chapter_title),
                title=chapter_title,
                author="INTO",
                tags=list(shared_tags),
                files=[files.DocumentFile(chapter['path'])],
                license=licenses.get_license(CHANNEL_LICENSE, "INTO",
                                             LICENSE_DESCRIPTION),
                copyright_holder="INTO")
            topic_node.add_child(pdf_node)

    raise_for_invalid_channel(channel)  # Check for errors in channel construction
    return channel
def construct_channel(self, *args, **kwargs):
    """ Creates ChannelNode and build topic tree
    Args:
      - args: arguments passed in during upload_channel (currently None)
      - kwargs: extra argumens and options not handled by `uploadchannel`.
        For example, add the command line option   lang="fr"  and the string
        "fr" will be passed along to `construct_channel` as kwargs['lang'].
    Returns: ChannelNode

    Healing Classrooms is organized with the following hierarchy:
        Playlist (TopicNode)
        |   Youtube Video (VideoNode)
        |   Youtube Video (VideoNode)
    """
    channel = self.get_channel(*args, **kwargs)  # Create ChannelNode from data in self.channel_info

    # Download the playlist/video information
    with youtube_dl.YoutubeDL({'skip_download': True}) as ydl:
        info_dict = ydl.extract_info(PLAYLISTS_URL, download=False)

    # Generate topics based off playlist entries in dict
    for playlist in info_dict['entries']:
        # Get language of playlist (hack)
        language = "fr"
        if "English" in playlist['title']:
            language = "en"
        elif "Arabic" in playlist['title']:
            language = "ar"

        playlist_topic = nodes.TopicNode(title=playlist['title'], source_id=playlist['id'], language=language)
        channel.add_child(playlist_topic)

        # Generate videos based off video entries in dict
        for video in playlist['entries']:
            # thumbnail_url is 0 (falsy) when the thumbnail list is empty,
            # otherwise the first thumbnail's URL; derive_thumbnail is then
            # enabled exactly when no URL was found.
            thumbnail_url = len(video['thumbnails']) and video['thumbnails'][0]['url']
            playlist_topic.add_child(nodes.VideoNode(
                title=video['title'],
                source_id=video['id'],
                license=licenses.PublicDomainLicense(),
                description=video['description'],
                derive_thumbnail=not thumbnail_url,
                files=[files.WebVideoFile(video['webpage_url'])],
                thumbnail=thumbnail_url,
                author=AUTHOR,
                # tags = video['categories'] + video['tags'],  # TODO: uncomment this when added
            ))

    raise_for_invalid_channel(channel)  # Check for errors in channel construction
    return channel
def scrape_subcategory(link, topic):
    """Scrape one subcategory page; attach a child topic per resource filter.

    The first entry of the filter menu ("All") is skipped so each resource is
    collected only once, under its specific filter.
    """
    url = "{}{}".format(BASE_URL, link.lstrip("/"))
    resource_page = BeautifulSoup(downloader.read(url), 'html5lib')

    filter_links = resource_page.find('div', {'class': 'menu-filtro'}).find_all('a')
    for resource_filter in filter_links[1:]:  # Skip "All" category
        filter_title = resource_filter.text
        LOGGER.info('    {}'.format(filter_title))
        filter_topic = nodes.TopicNode(
            title=filter_title,
            source_id=get_source_id('{}/{}'.format(topic.title, filter_title)))
        scrape_resource_list(url + resource_filter['href'], filter_topic)
        topic.add_child(filter_topic)
def scrape_snack_menu(url):
    """ Scrape snacks (activities) from url

    Args:
        url (str): url to scrape from
            (e.g. https://www.exploratorium.edu/snacks/snacks-by-subject)
    Returns TopicNode containing all snacks
    """
    LOGGER.info("SCRAPING ACTIVITIES...")
    snack_topic = nodes.TopicNode(title="Activities", source_id="main-topic-activities")
    page = BeautifulSoup(read(url), 'html5lib')

    # Get #main-content-container .field-items
    container = page.find('div', {'id': 'main-content-container'})\
        .find('div', {'class': 'field-items'})

    for column in container.find_all('ul', {'class': 'menu'}):
        # Nested .menu lists are handled while processing their parent <li>,
        # so skip them here.
        if column.parent.name == 'li':
            continue

        # Go through top-level li elements
        for li in column.find_all('li', recursive=False):
            link = li.find('a')
            LOGGER.info("    {}".format(link['title']))
            topic = nodes.TopicNode(title=link['title'].replace("’", "'"),
                                    source_id=link['href'])
            snack_topic.add_child(topic)

            sublist = li.find('ul')
            if sublist:
                # Scrape subcategories (if any)
                for sublink in sublist.find_all('a'):
                    LOGGER.info("    > {}".format(sublink['title']))
                    subtopic = nodes.TopicNode(
                        title=sublink['title'].replace("’", "'"),
                        source_id=sublink['href'])
                    topic.add_child(subtopic)
                    scrape_snack_subject(sublink['href'], subtopic)
            else:
                scrape_snack_subject(link['href'], topic)

    return snack_topic
def download_writing_assessment_grade(grade_node, grade_doc):
    """Download every assessment category under one grade and attach each
    as a child topic of `grade_node`."""
    for category in grade_doc.select('.item-list'):
        category_title = category.select_one('h3').text.strip()
        category_node = nodes.TopicNode(
            source_id="%s|%s" % (grade_node.source_id, category_title),
            title=category_title,
            language="en",
            thumbnail=writing_assessment_thumbnail,
        )
        print("  Downloading writing assessment category: %s" % category_title)
        download_writing_assessment_category(category_node, category)
        grade_node.add_child(category_node)
def get_or_create_level_topic(level_id, language_id, language_topic):
    """Return the level topic under `language_topic`, creating it if absent.

    Lookup is by the level's derived source id, so repeated calls for the
    same (language, level) pair reuse the existing node.
    """
    level_title = LEVELS_NAMES[level_id]
    level_source_id = get_level_source_id(language_id, level_id)

    existing = next(
        (child for child in language_topic.children
         if child.source_id == level_source_id),
        None)
    if existing is not None:
        return existing

    new_topic = nodes.TopicNode(source_id=level_source_id, title=level_title)
    language_topic.add_child(new_topic)
    return new_topic
def download_student_model_level(level_node, level_doc):
    """Download every student-model category under one level and attach each
    as a child topic of `level_node`."""
    for category in level_doc.select('.item-list'):
        category_title = category.select_one('h3').text.strip()
        category_node = nodes.TopicNode(
            source_id="%s|%s" % (level_node.source_id, category_title),
            title=category_title,
            language="en",
            thumbnail=student_model_thumbnail,
        )
        print("  Downloading student model category: %s" % category_title)
        download_student_model_category(category_node, category)
        level_node.add_child(category_node)
def process_folder(channel, raw_path, subfolders, filenames):
    """
    Create `ContentNode`s from each file in this folder and the node to
    `channel` under the path `raw_path`.
    """
    path_as_list = get_path_as_list(raw_path)

    # A. TOPIC
    topic_title = path_as_list.pop()
    parent_node = get_node_for_path(channel, path_as_list)

    # read parent metadata to get title and description
    parent_path, _ = os.path.split(raw_path)
    ini_filepath = os.path.join(parent_path, 'metadata.ini')
    parent_config = configparser.ConfigParser()
    parent_config.read(ini_filepath)

    # create topic
    topic = nodes.TopicNode(
        source_id=raw_path,
        title=parent_config.get(topic_title, 'title'),
        description=parent_config.get(topic_title, 'description', fallback=None),
    )
    parent_node.add_child(topic)

    # remove metadata.ini from filenames list
    assert 'metadata.ini' in filenames
    filenames.remove('metadata.ini')

    # B. PROCESS FILES
    # This folder's own metadata.ini carries per-file metadata, keyed by the
    # filename without extension.
    files_config = configparser.ConfigParser()
    folder_ini = os.path.join(raw_path, 'metadata.ini')
    files_config.read(folder_ini)
    for filename in filenames:
        if filename in IGNORABLE_FILENAMES:
            continue
        file_key, file_ext = os.path.splitext(filename)
        ext = file_ext[1:]
        kind = None
        if ext in content_kinds.MAPPING:
            kind = content_kinds.MAPPING[ext]
        # NOTE(review): `kind` stays None for unmapped extensions;
        # make_content_node is assumed to handle that — confirm.

        # prepare node data
        filepath = os.path.abspath(os.path.join(raw_path, filename))
        source_id = os.path.join(raw_path, filename)
        license = files_config.get(file_key, 'license')
        title = files_config.get(file_key, 'title')
        optionals = {}
        optionals['author'] = files_config.get(file_key, 'author', fallback=None)
        optionals['description'] = files_config.get(file_key, 'description', fallback=None)

        node = make_content_node(kind, source_id, title, license, filepath, optionals)
        # attach to containing topic
        topic.add_child(node)
def get_or_create_language_topic(language, channel):
    """Return the channel-level topic for `language`, creating it if missing.

    Lookup is by the language's derived source id, so repeated calls for the
    same language reuse the existing node.
    """
    language_title = language["name"]
    language_source_id = get_language_source_id(language["id"])

    existing = next(
        (child for child in channel.children
         if child.source_id == language_source_id),
        None)
    if existing is not None:
        return existing

    new_topic = nodes.TopicNode(source_id=language_source_id, title=language_title)
    channel.add_child(new_topic)
    return new_topic
def get_or_create_tag_topic(tag, language_id, level_id, level_topic):
    """Return the tag topic under `level_topic`, creating it if missing.

    Lookup is by the tag's derived source id, so repeated calls for the same
    (language, level, tag) triple reuse the existing node.
    """
    tag_title = get_tag_name(tag, language_id)
    tag_source_id = get_tag_source_id(language_id, level_id, tag["id"])

    existing = next(
        (child for child in level_topic.children
         if child.source_id == tag_source_id),
        None)
    if existing is not None:
        return existing

    new_topic = nodes.TopicNode(source_id=tag_source_id, title=tag_title)
    level_topic.add_child(new_topic)
    return new_topic
def download_all_minilessons():
    """Scrape the minilessons page into a "Minilessons" topic tree.

    Returns:
        TopicNode with one child node per category pane, each populated by
        download_minilesson_category.
    """
    root = nodes.TopicNode(
        source_id="minilesson",
        title="Minilessons",
        language="en",
        thumbnail=minilesson_thumbnail,
        description="Do you want quick lessons that teach concepts or skills? Each 10-15 minute minilesson presents a concept and engages students in an activity.",
    )
    doc = get_parsed_html_from_url(
        'https://k12.thoughtfullearning.com/resources/minilessons')

    for pane in doc.select('.pane-views-panes'):
        category_title = pane.select_one('.view-header').text.strip()
        category_node = nodes.TopicNode(source_id=category_title,
                                        title=category_title,
                                        language="en")
        print("Downloading minilesson category %s" % category_title)
        download_minilesson_category(category_node, pane)
        root.add_child(category_node)

    return root
def scrape_iversity(channel):
    """Scrape the iversity course chapter by chapter: each chapter becomes a
    TopicNode and each unit that has a video becomes a VideoNode (480p file
    plus English subtitles). Units without a video are logged and skipped.
    """
    url = "{}/en/my/courses/rethinking-us-them-integration-and-diversity-in-europe/lesson_units".format(
        BASE_URL)
    LOGGER.info("   Scraping Migration Matters at {}".format(url))
    source = read_source(url)
    chapters = source.find_all('div', {'class': 'chapter-units-wrapper'})

    for chapter in chapters:
        title = str(chapter.find('div', {'class': 'chapter-title'}).string)
        source_id = title.strip().replace(" ", "_")
        topic = nodes.TopicNode(source_id=source_id, title=title)
        lessons = chapter.find_all('a', {'class': 'unit-wrapper'})
        for lesson in lessons:
            # The presence of the unit_video icon marks video units.
            video_exists = lesson.find('i', {'class': 'unit_video'})
            video_title = str(
                lesson.find('span', {
                    'class': 'unit-title'
                }).string).strip()
            if video_exists:
                video_source_id = video_title.replace(" ", "_")
                video_url = "{}{}".format(BASE_URL, lesson.attrs["href"])
                video_source = read_source(video_url)
                video_info = video_source.find('video')
                # Subtitle track and the 480p rendition of the video.
                video_subtitle_path = video_info.find('track', {
                    'kind': 'subtitles'
                }).attrs["src"]
                video_subtitle = files.SubtitleFile(
                    path=video_subtitle_path,
                    language=languages.getlang('en').code)
                video_link = video_info.find('source', {
                    'res': '480'
                }).attrs["src"]
                video_file = files.VideoFile(
                    path=video_link,
                    language=languages.getlang('en').code)
                video_node = nodes.VideoNode(
                    source_id=video_source_id,
                    title=video_title,
                    files=[video_file, video_subtitle],
                    license=CHANNEL_LICENSE,
                    copyright_holder=COPYRIGHT_HOLDER)
                LOGGER.info("   Uploading video - {}".format(
                    video_title.strip()))
                topic.add_child(video_node)
            else:
                LOGGER.info(
                    "Format of the file is not supported by the sushi chef : {}"
                    .format(video_title))
        channel.add_child(topic)
def parse_resources(resource_name, resource_data, book_node, **auth_info):
    """ Creates resource topics

    Adds a topic for `resource_name` under `book_node`, then attaches one
    document node per PDF resource in `resource_data`.
    """
    # Create resource topic
    resource_slug = resource_name.replace(' ', '-').lower()
    resource_node = nodes.TopicNode(
        source_id="{}-{}".format(book_node.source_id, resource_slug),
        title=resource_name)
    book_node.add_child(resource_node)

    # Add resource documents (PDF links only)
    for resource in (resource_data or []):
        doc_url = resource.get('link_document_url')
        if doc_url and doc_url.endswith(".pdf"):
            add_file_node(
                resource_node,
                resource.get("link_document_url"),
                resource.get('resource_heading'),
                description=parse_description(resource.get('resource_description')),
                **auth_info)
def download_all_student_models():
    """Scrape the student-models page into a "Student Models" topic tree.

    Returns:
        TopicNode with one child node per grade level, each populated by
        download_student_model_level.
    """
    root = nodes.TopicNode(
        source_id="student-models",
        title="Student Models",
        language="en",
        thumbnail=student_model_thumbnail,
        description="When you need an example written by a student, check out our vast collection of free student models.",
    )
    doc = get_parsed_html_from_url(
        'https://k12.thoughtfullearning.com/resources/studentmodels')

    for grouping in doc.select('.view-content .view-grouping'):
        level_title = grouping.select_one('.view-grouping-header').contents[0].strip()
        level_node = nodes.TopicNode(source_id=level_title,
                                     title=level_title,
                                     language="en")
        print("Downloading student model level: %s" % level_title)
        download_student_model_level(level_node,
                                     grouping.select_one('.view-grouping-content'))
        root.add_child(level_node)

    return root
def _build_tree(node, sourcetree):
    """ Parse nodes given in `sourcetree` and add as children of `node`. """
    for child_source_node in sourcetree:
        try:
            # First listed file decides the node's content kind.
            main_file = child_source_node['files'][
                0] if 'files' in child_source_node else {}
            kind = guess_content_kind(
                path=main_file.get('path'),
                web_video_data=main_file.get('youtube_id')
                or main_file.get('web_url'),
                questions=child_source_node.get("questions"))
        except UnknownContentKindError:
            # Skip entries whose kind cannot be determined.
            continue

        if kind == content_kinds.TOPIC:
            child_node = nodes.TopicNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                thumbnail=child_source_node.get("thumbnail"),
            )
            node.add_child(child_node)

            # Recurse into this topic's children.
            source_tree_children = child_source_node.get("children", [])

            _build_tree(child_node, source_tree_children)

        elif kind == content_kinds.VIDEO:
            child_node = nodes.VideoNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=get_license(child_source_node.get("license"),
                                    description="Description of license",
                                    copyright_holder=child_source_node.get(
                                        'copyright_holder')),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                derive_thumbnail=True,  # video-specific data
                thumbnail=child_source_node.get('thumbnail'),
            )
            add_files(child_node, child_source_node.get("files") or [])
            node.add_child(child_node)

        else:  # unknown content file format
            continue

    return node
def download_all_writing_assessments():
    """Scrape the writing-assessments page into a "Writing Assessments" topic tree.

    Returns:
        TopicNode with one child node per grade silo, each populated by
        download_writing_assessment_grade.
    """
    root = nodes.TopicNode(
        source_id="writing-assessment",
        title="Writing Assessments",
        language="en",
        thumbnail=writing_assessment_thumbnail,
        description="When you want students to understand how writing is graded, turn to our vast selection of assessment examples. You'll find elementary and middle school models in all of the major modes of writing, along with rubrics that assess each example as \"Strong,\" \"Good,\" \"Okay,\" or \"Poor.\"",
    )
    doc = get_parsed_html_from_url(
        'https://k12.thoughtfullearning.com/resources/writingassessment')

    for grade in doc.select('.view-writing-assessment-silo'):
        grade_title = grade.select_one('.view-grouping-header').contents[0].strip()
        grade_node = nodes.TopicNode(source_id=grade_title,
                                     title=grade_title,
                                     language="en")
        print("Downloading writing assessment grade: %s" % grade_title)
        download_writing_assessment_grade(grade_node,
                                          grade.select_one('.view-content'))
        root.add_child(grade_node)

    return root