def get_nodes_by_ids_complete(self, studio_id):
    # Get the complete JSON representation of a content node from the Studio API
    headers = {"Authorization": "Token {0}".format(self.token)}
    url = NODES_ENDPOINT + studio_id
    LOGGER.info('  GET ' + url)
    response = requests.get(url, headers=headers)
    studio_node = response.json()[0]
    return studio_node
def add_subpages_from_wikipedia_list(topic, list_url):
    """ add_subpages_from_wikipedia_list: Parses wiki pages and creates corresponding files.
    To understand how the following parsing works, look at:
        1. the source of the page (e.g. https://en.wikipedia.org/wiki/List_of_citrus_fruits), or inspect in chrome dev tools
        2. the documentation for BeautifulSoup version 4: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
    """
    page = read_source(list_url)        # Parse the page into BeautifulSoup format, so we can loop through and manipulate it
    table = page.find("table")          # Extract the main table from the page

    # Loop through all the rows in the table
    for row in table.find_all("tr"):
        columns = row.find_all("td")    # Extract the columns (cells, really) within the current row
        if not columns:                 # Some rows are empty, so just skip
            continue

        link = columns[0].find("a")     # Get the link to the subpage
        if not link:                    # Some rows don't have links, so skip
            continue

        # Extract the URL and title for the subpage
        url = make_fully_qualified_url(link["href"])
        title = link.text
        LOGGER.info("    Writing {}...".format(title))

        # Attempt to extract a thumbnail for the subpage, from the second column in the table
        image = columns[1].find("img")
        thumbnail_url = make_fully_qualified_url(image["src"]) if image else None
        if thumbnail_url and not (thumbnail_url.endswith("jpg") or thumbnail_url.endswith("png")):
            thumbnail_url = None

        # Download the wikipedia page into an HTML5 app node
        html5app = download_wikipedia_page(url, thumbnail=thumbnail_url, title=title)

        # Add the downloaded HTML5 app node into the topic
        topic.add_child(html5app)
def construct_channel(self, *args, **kwargs):
    """ construct_channel: Creates ChannelNode and builds the topic tree.
    Wikipedia is organized with the following hierarchy:
        Citrus (Folder)
        |   Citrus Page HTML Zip (File)
        Potatoes (Folder)
        |   Potatoes Page HTML Zip (File)
    Returns: ChannelNode
    """
    LOGGER.info("Constructing channel from {}...".format(BASE_URL))
    channel = self.get_channel(*args, **kwargs)  # Creates ChannelNode from data in self.channel_info

    create_topic(channel, "Citrus!", "List_of_citrus_fruits")       # Add Citrus folder
    create_topic(channel, "Potatoes!", "List_of_potato_cultivars")  # Add Potatoes folder

    raise_for_invalid_channel(channel)  # Check for errors in channel construction
    return channel
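# A minimal sketch of what the `create_topic` helper used above could look like; the real
# helper is defined elsewhere in the chef, so the TopicNode metadata and the '/wiki/' URL
# scheme below are assumptions for illustration only.
def create_topic_sketch(channel, title, wikipedia_slug):
    """Hypothetical helper: add a TopicNode for `title` and fill it with wiki subpages."""
    topic = nodes.TopicNode(source_id=wikipedia_slug, title=title)
    channel.add_child(topic)
    list_url = make_fully_qualified_url('/wiki/' + wikipedia_slug)  # assumed URL scheme
    add_subpages_from_wikipedia_list(topic, list_url)
    return topic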
def add_content_nodes(self, channel):
    """
    Build the hierarchy of topic nodes and content nodes.
    """
    LOGGER.info('Creating channel content nodes...')

    course_list = json.load(open(os.path.join(COURSES_DIR, 'course_list.json')))
    for course in course_list['courses']:  # [1:2]:
        basedir = os.path.join(COURSES_DIR, course['name'])
        coursedir = os.path.join(basedir, 'course')
        course_data = extract_course_tree(coursedir)
        course_id = course_data['course']

        write_tree_to_json_tree(os.path.join(ORIGINAL_TREES_DIR, course_id + '.json'), course_data)
        # print_course(course_data, translate_from='ar')

        clean_subtree(course_data, coursedir)
        print('Cleaned course', course_data['course'], '#' * 80)
        write_tree_to_json_tree(os.path.join(CLEAN_TREES_DIR, course_id + '.json'), course_data)

        transformed_tree = transform_tree(course_data, coursedir)
        write_tree_to_json_tree(os.path.join(TRANSFORMED_TREES_DIR, course_id + '.json'), transformed_tree)
        print_transfomed_tree(transformed_tree, translate_from='ar')
        channel['children'].append(transformed_tree)
        print('\n\n')
def get_subtitle_languages(youtube_id):
    """
    Returns a list of the subtitle language codes available for a given video.
    We'll try to get the list using two approaches:
      1. The YouTube API (works for public videos when YOUTUBE_API_KEY is defined)
      2. The slow way, using YouTubeResource, which in turn calls youtube_dl
    """
    # Check if we already have the lang_codes list for this youtube_id cached...
    cache_filename = '{}__lang_codes.json'.format(youtube_id)
    cache_filepath = os.path.join(SUBTITLE_LANGUAGES_CACHE_DIR, cache_filename)
    if os.path.exists(cache_filepath):  # Cache hit!
        with open(cache_filepath) as jsonf:
            cache_data = json.load(jsonf)
            return cache_data['lang_codes']

    if YOUTUBE_API_KEY:
        try:
            lang_codes = get_subtitles_using_api(youtube_id)
            return lang_codes
        except HttpError as e:
            LOGGER.info("Can't access API for video {} ...".format(youtube_id))
    lang_codes = get_subtitles_using_youtube_dl(youtube_id)

    # Cache the results in chefdata/sublangscache/{youtube_id}__lang_codes.json
    cache_data = {"lang_codes": lang_codes}
    with open(cache_filepath, 'w') as jsonf:
        json.dump(cache_data, jsonf, ensure_ascii=True)

    return lang_codes
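# A minimal sketch of how `get_subtitles_using_api` could be implemented with the
# YouTube Data API v3 via google-api-python-client. The real helper is defined elsewhere
# in the chef; this illustrates the captions.list call, not the chef's actual code.
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

def get_subtitles_using_api_sketch(youtube_id):
    youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
    response = youtube.captions().list(part='snippet', videoId=youtube_id).execute()
    # Each caption track reports the language code of its subtitles
    return [item['snippet']['language'] for item in response.get('items', [])]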
def pre_run(self, args, options):
    """
    Build the ricecooker json tree for the channel.
    The code here is similar to the code in `ricecooker_channel/chef.py`, but the
    channel hierarchy is built using dictionary objects instead of classes.
    """
    LOGGER.info('In pre_run...')

    # 1. Create the channel tree
    ricecooker_json_tree = dict(
        title='Sample JSON channel',
        source_domain='source.org',
        source_id='sample-json-channel',
        description='This channel was created from the files in the content/ ' \
                    + 'directory and the metadata in sample_ricecooker_json_tree.json',
        thumbnail='./content/sample-json-channel-files/channel_thumbnail.jpg',
        language='en',
        children=[],
    )
    # The root object of the ricecooker json tree contains the channel info;
    # add topic and content nodes to the children list to build the tree.

    # 2. Add topic nodes and content nodes to the tree
    self.create_content_nodes(ricecooker_json_tree)
    self.create_exercise_nodes(ricecooker_json_tree)

    # 3. Save the tree to chefdata/trees/sample_ricecooker_json_tree.json
    json_tree_path = self.get_json_tree_path()
    write_tree_to_json_tree(json_tree_path, ricecooker_json_tree)
    LOGGER.info('Finished writing ricecooker json tree.')
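# For illustration only: a node appended by `create_content_nodes` is a plain dict,
# roughly of the form below. The field names approximate the ricecooker json tree schema
# (consult the ricecooker docs for the authoritative keys); the id and path are hypothetical.
sample_document_node = dict(
    kind='document',
    source_id='sample-doc-1',                                          # hypothetical id
    title='Sample PDF document',
    license=dict(license_id='CC BY', copyright_holder='Source Org'),
    files=[dict(file_type='document', path='./content/sample.pdf')],   # hypothetical path
)
# ricecooker_json_tree['children'].append(sample_document_node)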
def _get_youtube_info(self, use_proxy=True, use_cache=True, options=None):
    youtube_info = None
    # 1. Try to get from cache if allowed:
    if os.path.exists(self.cache_path) and use_cache:
        LOGGER.info("==> [%s] Retrieving cached information...", self.__str__())
        youtube_info = json.load(open(self.cache_path))
    # 2. Fetch info from youtube_dl
    if not youtube_info:
        LOGGER.info("==> [%s] Requesting info from youtube...", self.__str__())
        os.makedirs(self.cache_dir, exist_ok=True)
        youtube_resource = None  # stays None if extraction below fails
        try:
            youtube_resource = YouTubeResource(self.url, useproxy=use_proxy)
        except youtube_dl.utils.ExtractorError as e:
            if "unavailable" in str(e):
                LOGGER.error("==> [%s] Resource unavailable for URL: %s", self.__str__(), self.url)
                return None
        if youtube_resource:
            try:
                # Save YouTube info to JSON cache file
                youtube_info = youtube_resource.get_resource_info(options)
                if youtube_info:
                    json.dump(youtube_info,
                              open(self.cache_path, 'w'),
                              indent=4,
                              ensure_ascii=False,
                              sort_keys=True)
                else:
                    LOGGER.error("==> [%s] Failed to extract YouTube info", self.__str__())
            except Exception as e:
                LOGGER.error("==> [%s] Failed to get YouTube info: %s", self.__str__(), e)
                return None
    return youtube_info
def crawling_part():
    """
    Visit all the urls on engageny.org/resource/ and engageny.org/content,
    and extract content structure.
    """
    # crawl website to build web_resource_tree
    ela_hierarchy, math_hierarchy = crawl(ENGAGENY_CC_START_URL)
    web_resource_tree = dict(
        kind="EngageNYWebResourceTree",
        title="Engage NY Web Resource Tree (ELS and CCSSM)",
        language='en',
        children={
            'math': {
                'grades': math_hierarchy,
            },
            'ela': {
                'grades': ela_hierarchy,
            },
        },
    )
    json_file_name = os.path.join(TREES_DATA_DIR, CRAWLING_STAGE_OUTPUT)
    with open(json_file_name, 'w') as json_file:
        json.dump(web_resource_tree, json_file, indent=2)
    LOGGER.info('Crawling results stored in ' + json_file_name)
    return web_resource_tree
def recursive_extract_website_games(subtree):
    """
    Processes all child nodes of the subtree then calls itself on any folder-like
    child nodes. Weird, I know, but it works so I'm not touching it.
    """
    if 'children' in subtree:
        # do processing
        new_children = []
        for child in subtree['children']:
            child_url = child['url']
            if child['kind'] == 'PrathamZipResource':
                if is_website_game(child_url):
                    # extract all game names referenced in manual curation Excel file to process separately...
                    child_url = child_url.replace('https://www.prathamopenschool.org/CourseContent/Games/', '')
                    child_url = child_url.replace('http://www.prathamopenschool.org/CourseContent/Games/', '')
                    child['title_en'] = child_url.replace('.zip', '')
                    print('EXTRACTED game name', child['title_en'], 'from url', child['url'])
                    website_games.append(child)
                else:
                    # leave other games where they are
                    LOGGER.info('Undocumented game-like web resource ' + child['url'])
                    new_children.append(child)
            else:
                # leave other content as is
                new_children.append(child)
        #
        # recurse
        for child in subtree['children']:
            recursive_extract_website_games(child)
def get_nodes_by_ids_bulk(self, studio_ids):
    """
    A more efficient version of `get_nodes_by_ids_complete` that GETs tree content
    node data in chunks of `CHUNK_SIZE` ids from the Studio API.
    """
    CHUNK_SIZE = 25
    NODES_ENDPOINT = self.studio_url + '/api/get_nodes_by_ids_complete/'
    headers = {"Authorization": "Token {0}".format(self.token)}
    studio_nodes = []
    studio_ids_chunks = [
        studio_ids[i:i + CHUNK_SIZE] for i in range(0, len(studio_ids), CHUNK_SIZE)
    ]
    for studio_ids_chunk in studio_ids_chunks:
        studio_ids_csv = ','.join(studio_ids_chunk)
        url = NODES_ENDPOINT + studio_ids_csv
        LOGGER.info('  GET ' + url)
        response = requests.get(url, headers=headers)
        chunk_nodes = response.json()
        for chunk_node in chunk_nodes:
            if 'children' in chunk_node:
                child_nodes = self.get_nodes_by_ids_bulk(chunk_node['children'])
                chunk_node['children'] = child_nodes
        studio_nodes.extend(chunk_nodes)
    return studio_nodes
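# A quick illustration of the chunking pattern used above (a CHUNK_SIZE of 3 for brevity):
#   ids = ['a', 'b', 'c', 'd', 'e']
#   [ids[i:i + 3] for i in range(0, len(ids), 3)]   ->   [['a', 'b', 'c'], ['d', 'e']]
# Each inner list becomes one comma-separated GET request against the Studio endpoint.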
def construct_channel(self, *args, **kwargs):
    """ construct_channel: Creates ChannelNode and builds the topic tree.
    Solar Spell is organized with the following hierarchy (sample):
        Creative Arts (source_id = dir-creative-arts)
        |--- Culinary Arts (source_id = dir-culinary-arts)
        |--- |--- Real Pasifik 2 introducing Chef Alexis Tahiapuhe of Tahiti (source_id = file-real pasifik 2 introducing chef lela bolobolo of fiji.mp4)
        |--- Pacific Islands Arts and Culture (source_id = dir_pacific_islands_arts_and_culture)
        |--- |--- Cook Islands National Cultural Policy 10 July 2017_final english (File)
        |--- Teaching Resources and Classroom Activities
        Environment (source_id = dir-environment)
        |--- Adapting to Climate Change
        |--- |--- Action Against Climate Change Tuvalu Water and climate change
        |--- Climate Change Info
        |--- |--- Animated Pacific Island Climate Change Videos
        ...
    Returns: ChannelNode
    """
    LOGGER.info("Constructing channel from {}...".format(BASE_URL))
    channel = self.get_channel(*args, **kwargs)  # Creates ChannelNode from data in self.channel_info
    LOGGER.info('   Writing {} Folder...'.format(CHANNEL_NAME))

    endpoint = BASE_URL + "content/"
    scrape_content(endpoint, channel)

    raise_for_invalid_channel(channel)  # Check for errors in channel construction
    return channel
def scrape_multilanguage_slideshows(channel):
    LOGGER.info('Scraping multi-language content...')
    contents = BeautifulSoup(downloader.read(SLIDESHOWS_URL), 'html5lib')
    collection_key = get_collection_key(contents)

    languages_selection = contents.find('div', {'class': 'asset-list'}).find('div')
    language_list = json.loads(languages_selection['data-react-props'])['sections']

    for language in language_list:
        asset_url = SLIDESHOW_ASSETS_URL.format(collection='qac6i4-foozd4-68u325', section=language['section_key'])
        slide_data = json.loads(downloader.read(asset_url))['data']
        translated_name = languages.getlang(LANGUAGE_MAP[language['name']]).native_name \
            if LANGUAGE_MAP[language['name']] else language['name']
        LOGGER.info('    {}'.format(translated_name.encode('utf-8')))

        slides = [
            {'url': slide['attributes']['thumbnail_url'].replace('element.png', '*****@*****.**')}
            for slide in slide_data
        ]
        if len(slides):
            channel.add_child(create_slideshow(slides, asset_url, translated_name, language['name']))
def scrape_english_collection(channel):
    LOGGER.info('Scraping English collection...')
    english_topic = nodes.TopicNode(source_id=ENGLISH_COLLECTION_URL, title="English")
    channel.add_child(english_topic)

    contents = BeautifulSoup(downloader.read(ENGLISH_COLLECTION_URL), 'html5lib')
    collection_key = get_collection_key(contents)

    topic_selection = contents.find('div', {'class': 'asset-list'}).find('div')
    topic_list = [
        t for t in json.loads(topic_selection['data-react-props'])['sections']
        if t['id'] not in EXCLUDED_TOPIC_IDS
    ]

    for topic in topic_list:
        LOGGER.info('    {}'.format(topic['name'].encode('utf-8')))
        topic_node = nodes.TopicNode(source_id=topic['section_key'], title=topic['name'])
        english_topic.add_child(topic_node)

        # Scrape items in the topic
        url = ENGLISH_ASSETS_URL.format(collection=collection_key, section=topic['section_key'])
        scrape_collection_files(topic_node, url)
def scrape_channel(channel):
    # Read from Categorias dropdown menu
    page = BeautifulSoup(downloader.read(BASE_URL), 'html5lib')
    dropdown = page.find('a', {'id': 'btn-categorias'}).find_next_sibling('ul')

    # Go through dropdown and generate topics and subtopics
    for category_list in dropdown.find_all('li', {'class': 'has-children'}):

        # Parse categories
        for category in category_list.find_all('li', {'class': 'has-children'}):

            # Add this topic to channel when scraping entire channel
            category_name = category.find('a').text
            topic = nodes.TopicNode(title=category_name, source_id=get_source_id(category_name))
            channel.add_child(topic)
            LOGGER.info(topic.title)

            # Parse subcategories
            for subcategory in category.find_all('li'):
                if not subcategory.attrs.get('class') or 'go-back' not in subcategory.attrs['class']:

                    # Get rid of this check to scrape entire site
                    subcategory_name = subcategory.find('a').text
                    subcategory_link = subcategory.find('a')['href']
                    LOGGER.info('  {}'.format(subcategory_name))
                    subtopic = nodes.TopicNode(title=subcategory_name, source_id=get_source_id(subcategory_link))
                    topic.add_child(subtopic)

                    # Parse resources
                    scrape_subcategory(subcategory_link, subtopic)
def scrape_resource(url, topic):
    resource = BeautifulSoup(downloader.read(url), 'html5lib')
    LOGGER.info('    {}'.format(resource.find('h2').text))

    filepath = download_resource(resource.find('div', {'class': 'decargas'}).find('a')['href'])
    license = None
    author = ''
    for data_section in resource.find('div', {'class': 'datos_generales'}).find_all('h4'):
        if 'Licencia' in data_section.text:
            try:
                license = LICENSE_MAP[data_section.find_next_sibling('p').text](copyright_holder="Ceibal")
            except KeyError as e:
                LOGGER.error(str(e))
                license = licenses.CC_BYLicense(copyright_holder="Ceibal")
        elif 'Autor' in data_section.text:
            author = data_section.find_next_sibling('p').text

    if filepath:
        thumbnail = resource.find('div', {'class': 'img-recurso'}).find('img')['src']
        if thumbnail.endswith('.gif'):
            thumbnail = os.path.sep.join([DOWNLOAD_DIRECTORY, thumbnail.split('/')[-1].replace('.gif', '.png')])
            with open(thumbnail, 'wb') as fobj:
                fobj.write(downloader.read(resource.find('div', {'class': 'img-recurso'}).find('img')['src']))

        topic.add_child(nodes.HTML5AppNode(
            title=resource.find('h2').text,
            source_id=url,
            license=license,
            author=author,
            description=resource.find('form').find_all('p')[1].text,
            thumbnail=thumbnail,
            tags=[tag.text[:30] for tag in resource.find_all('a', {'class': 'tags'})],
            files=[files.HTMLZipFile(path=filepath)],
        ))
def scrape_snack_subject(slug, topic):
    """ Scrape snack subject page
        Args:
            slug (str): url slug to scrape from (e.g. /subject/arts)
            topic (TopicNode): topic to add html nodes to
    """
    contents = BeautifulSoup(read(slug), 'html5lib')

    for activity in contents.find_all('div', {'class': 'activity'}):
        LOGGER.info("    {}".format(activity.find('h5').text.strip()))

        # Scrape snack pages into zips
        write_to_path, tags = scrape_snack_page(activity.find('a')['href'])
        if not write_to_path:
            continue

        # Create html node
        description = activity.find('div', {'class': 'pod-description'})
        topic.add_child(nodes.HTML5AppNode(
            source_id=activity.find('a')['href'],
            title=activity.find('h5').text.strip().replace("’", "'"),
            description=description.text.strip() if description else "",
            license=LICENSE,
            copyright_holder=COPYRIGHT_HOLDER,
            files=[files.HTMLZipFile(path=write_to_path)],
            thumbnail=get_thumbnail_url(activity.find('img')['src']),
            tags=tags,
        ))

    # Scrape next page (if any)
    next_page_url = get_next_page_url(contents)
    if next_page_url:
        scrape_snack_subject(next_page_url, topic)
def get_html5_app_zip_path(slug):
    resp = session.get(READ_URL.format(slug))
    if resp.status_code == 200:
        resp = resp.json()
    else:
        LOGGER.info('The story {} is not available.\n'.format(slug))
        return None

    content = ""
    for page in resp['data']['pages']:
        soup = BeautifulSoup(page['html'], 'html.parser')
        if page.get('coverImage', None):
            img_src = page['coverImage']['sizes'][-1]['url']
            soup.img['src'] = img_src
        content = content + "\n" + str(soup)

    context = {'content': content}

    handle, destination = tempfile.mkstemp(suffix=".zip")
    os.close(handle)
    htmlwriter = HTMLWriter(destination)
    with htmlwriter as f:
        index_html = TEMPLATE_ENVIRONMENT.get_template('indexfile').render(context)
        f.write_index_contents(index_html)

    LOGGER.info(destination)
    return destination
def scrape_video_menu(url):
    """ Scrape videos from url
        Args:
            url (str): url to scrape from (e.g. https://www.exploratorium.edu/video/subjects)
        Returns TopicNode containing all videos
    """
    LOGGER.info("SCRAPING VIDEOS...")
    video_topic = nodes.TopicNode(title="Videos", source_id="main-topic-videos")

    contents = BeautifulSoup(read(url), 'html5lib')
    for subject in contents.find_all('div', {'class': 'subject'}):
        title = subject.find('div', {'class': 'name'}).text.strip().replace("’", "'")
        LOGGER.info("    {}".format(title))
        topic = nodes.TopicNode(
            title=title,
            source_id="videos-{}".format(title),
            thumbnail=get_thumbnail_url(subject.find('img')['src']),
        )
        video_topic.add_child(topic)
        scrape_video_subject(subject.find('a')['href'], topic)

    return video_topic
def write_question_row_from_question_dict(self, source_id, question_dict):
    file_path = get_metadata_file_path(self.channeldir, self.questionsinfo)
    with open(file_path, 'a') as csv_file:
        csvwriter = csv.DictWriter(csv_file, EXERCISE_QUESTIONS_INFO_HEADER)

        def _safe_list_get(l, idx, default):
            try:
                return l[idx]
            except IndexError:
                return default

        # change image links to local
        question_dict = self._make_local_question_images(question_dict)

        type_lookup = {
            'single_selection': exercises.SINGLE_SELECTION,
            'true_false': exercises.SINGLE_SELECTION,
            'multiple_selection': exercises.MULTIPLE_SELECTION,
            'input_question': exercises.INPUT_QUESTION,
        }

        # ANSWERS
        answers = json.loads(question_dict['answers'])
        options = []  # all options
        correct = []  # correct answers
        for ans in answers:
            options.append(ans['answer'])
            if ans['correct']:
                correct.append(ans['answer'])
        extra_options = DEFAULT_EXTRA_ITEMS_SEPARATOR.join(options[5:])

        # HINTS
        hints_raw = json.loads(question_dict['hints'])
        if hints_raw:
            raise ValueError('Found hints but not handled..')

        LOGGER.info('    - writing question with studio_id=' + question_dict['assessment_id'])
        question_row = {}
        question_row[EXERCISE_SOURCEID_KEY] = source_id
        question_row[EXERCISE_QUESTIONS_QUESTIONID_KEY] = question_dict['assessment_id']
        question_row[EXERCISE_QUESTIONS_TYPE_KEY] = type_lookup[question_dict['type']]
        question_row[EXERCISE_QUESTIONS_QUESTION_KEY] = question_dict['question']
        question_row[EXERCISE_QUESTIONS_OPTION_A_KEY] = _safe_list_get(options, 0, None)
        question_row[EXERCISE_QUESTIONS_OPTION_B_KEY] = _safe_list_get(options, 1, None)
        question_row[EXERCISE_QUESTIONS_OPTION_C_KEY] = _safe_list_get(options, 2, None)
        question_row[EXERCISE_QUESTIONS_OPTION_D_KEY] = _safe_list_get(options, 3, None)
        question_row[EXERCISE_QUESTIONS_OPTION_E_KEY] = _safe_list_get(options, 4, None)
        question_row[EXERCISE_QUESTIONS_OPTION_FGHI_KEY] = extra_options
        question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER_KEY] = _safe_list_get(correct, 0, None)
        question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER2_KEY] = _safe_list_get(correct, 1, None)
        question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER3_KEY] = _safe_list_get(correct, 2, None)
        question_row[EXERCISE_QUESTIONS_HINT_1_KEY] = None  # TODO
        question_row[EXERCISE_QUESTIONS_HINT_2_KEY] = None  # TODO
        question_row[EXERCISE_QUESTIONS_HINT_3_KEY] = None  # TODO
        question_row[EXERCISE_QUESTIONS_HINT_4_KEY] = None  # TODO
        question_row[EXERCISE_QUESTIONS_HINT_5_KEY] = None  # TODO
        question_row[EXERCISE_QUESTIONS_HINT_6789_KEY] = None  # TODO

        # WRITE QUESTION ROW
        csvwriter.writerow(question_row)
def construct_channel(self, *args, **kwargs):
    channel = self.get_channel(*args, **kwargs)  # Create ChannelNode from data in self.channel_info

    lang_names = list(self.data.keys())
    lang_names.sort()

    for lang_name in lang_names:
        lang_data = self.data[lang_name]
        LOGGER.info("Creating app for language: {}".format(lang_name))
        lang = languages.getlang_by_native_name(lang_name)

        zip_dir = self.client.create_zip_dir_for_page(lang_data['url'])

        soup = self.client.get_page_soup(lang_data['url'])

        # Remove the translation list if found
        translations = soup.find('div', {'id': 'translations'})
        if translations:
            translations.extract()

        # Grab the localized title
        title = soup.find('span', {'id': 'share_title'}).text

        # Save the modified index.html page
        thumbnail = None
        for resource in lang_data['resources']:
            if 'dp3t.png' in resource:
                thumbnail = os.path.join(zip_dir, resource)
                break

        with open(os.path.join(zip_dir, 'index.html'), 'wb') as f:
            f.write(soup.prettify(encoding='utf-8'))

        # create_predictable_zip ensures that the ZIP file does not change each time it's created. This
        # ensures that the zip doesn't get re-uploaded just because zip metadata changed.
        zip_file = zip.create_predictable_zip(zip_dir)
        zip_name = lang.primary_code if lang else lang_name
        zip_filename = os.path.join(self.ZIP_DIR, "{}.zip".format(zip_name))
        os.makedirs(os.path.dirname(zip_filename), exist_ok=True)
        os.rename(zip_file, zip_filename)

        topic = nodes.TopicNode(source_id=lang_name, title=lang_name)
        zip_node = nodes.HTML5AppNode(
            source_id="covid19-sim-{}".format(lang_name),
            title=title,
            files=[files.HTMLZipFile(zip_filename)],
            license=licenses.PublicDomainLicense("Marcel Salathé & Nicky Case"),
            language=lang,
            thumbnail=thumbnail
        )
        topic.add_child(zip_node)
        channel.add_child(topic)

    return channel
def scrape_subcategory(link, topic):
    url = "{}{}".format(BASE_URL, link.lstrip("/"))
    resource_page = BeautifulSoup(downloader.read(url), 'html5lib')

    # Skip "All" category
    for resource_filter in resource_page.find('div', {'class': 'menu-filtro'}).find_all('a')[1:]:
        LOGGER.info('    {}'.format(resource_filter.text))
        source_id = get_source_id('{}/{}'.format(topic.title, resource_filter.text))
        filter_topic = nodes.TopicNode(title=resource_filter.text, source_id=source_id)
        scrape_resource_list(url + resource_filter['href'], filter_topic)
        topic.add_child(filter_topic)
def get_nodes_by_ids_complete(self, studio_id):
    """
    Get the complete JSON representation of a content node from the Studio API.
    """
    NODES_ENDPOINT = self.studio_url + '/api/get_nodes_by_ids_complete/'
    headers = {"Authorization": "Token {0}".format(self.token)}
    url = NODES_ENDPOINT + studio_id
    LOGGER.info('  GET ' + url)
    response = requests.get(url, headers=headers)
    studio_node = response.json()[0]
    return studio_node
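# Usage sketch tying the two Studio API helpers together. The studio id is a placeholder,
# and `client` is assumed to be an instance of the class that defines both methods.
#   root = client.get_nodes_by_ids_complete('<root_studio_id>')
#   subtree = client.get_nodes_by_ids_bulk(root['children'])   # fetched CHUNK_SIZE ids per request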
def _recusive_visit_rm_global_nav_children(subtree):
    # Drop leaf children whose url is one of the global navigation links; recurse into the rest.
    newchildren = []
    for child in subtree['children']:
        child_url = child['url']
        if len(child['children']) == 0 and child_url in global_nav_urls:
            LOGGER.info('Removing global nav url = ' + child_url)
        else:
            clean_child = _recusive_visit_rm_global_nav_children(child)
            newchildren.append(clean_child)
    subtree['children'] = newchildren
    return subtree
def pre_run(self, args, options):
    """
    This is where all the work happens for this chef:
    - Load the source tree from the Khan Academy API
    - Convert the tree of Khan objects into ricecooker json dict objects
    - Write the ricecooker json tree to the appropriate file
    """
    lang, variant = self.parse_lang_and_variant_from_kwargs(options)

    if lang == "en" and variant != "in-in":
        # Load the CCSSM tags for the KA en channel (but not the in-in variant)
        global CC_MAPPING
        CC_MAPPING = generate_common_core_mapping()

    channel_node = self.get_channel_dict(options)
    channel_node["children"] = []

    # Handle special case of building Kolibri channel from youtube playlists
    if options.get("youtube_channel_id"):
        youtube_channel_id = options.get("youtube_channel_id")
        LOGGER.info("Found YouTube channel {}".format(youtube_channel_id))
        root_node = youtube_playlist_scraper(youtube_channel_id, channel_node)
        json_tree_path = self.get_json_tree_path(**options)
        LOGGER.info("Writing youtube ricecooker tree to " + json_tree_path)
        write_tree_to_json_tree(json_tree_path, root_node)
        return None

    LOGGER.info("Downloading KA topic tree")
    # Obtain the complete topic tree for lang=lang from the KA API
    ka_root_topic, topics_by_slug = get_khan_topic_tree(lang=lang)
    # TODO: discuss w @kollivier introducing "archive" step here (for source diffs)
    self.topics_by_slug = topics_by_slug  # to be used for topic replacements
    self.slug_blacklist = get_slug_blacklist(lang=lang, variant=variant)
    self.topic_replacements = get_topic_tree_replacements(lang=lang, variant=variant)

    if options.get("english_subtitles"):
        # we will include english videos with target language subtitles
        duplicate_videos(ka_root_topic)

    LOGGER.info("Converting KA nodes to ricecooker json nodes")
    root_topic = self.convert_ka_node_to_ricecooker_node(ka_root_topic, target_lang=lang)
    for topic in root_topic["children"]:
        channel_node["children"].append(topic)

    # Write the ricecooker tree to a json file
    json_tree_path = self.get_json_tree_path(**options)
    LOGGER.info("Writing ricecooker json tree to " + json_tree_path)
    write_tree_to_json_tree(json_tree_path, channel_node)
def download_structure_csv(which=None):
    if which == 'English':
        response = requests.get(PRADIGI_ENGLISH_SHEET_CSV_URL)
        csv_data = response.content.decode('utf-8')
        with open(PRADIGI_ENGLISH_SHEET_CSV_PATH, 'w') as csvfile:
            csvfile.write(csv_data)
        LOGGER.info('Successfully saved ' + PRADIGI_ENGLISH_SHEET_CSV_PATH)
        return PRADIGI_ENGLISH_SHEET_CSV_PATH
    else:
        response = requests.get(PRADIGI_SHEET_CSV_URL)
        csv_data = response.content.decode('utf-8')
        with open(PRADIGI_SHEET_CSV_PATH, 'w') as csvfile:
            csvfile.write(csv_data)
        LOGGER.info('Successfully saved ' + PRADIGI_SHEET_CSV_PATH)
        return PRADIGI_SHEET_CSV_PATH
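# Usage sketch: download the structure sheet and iterate over its rows. The sheet's column
# names are not shown in this file, so the DictReader keys are left generic; this helper
# name is hypothetical and only illustrates how the saved CSV path can be consumed.
import csv

def load_structure_rows_sketch(which=None):
    csv_path = download_structure_csv(which=which)
    with open(csv_path) as csvfile:
        return list(csv.DictReader(csvfile))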
def scrape_video_page(self, url, title):
    """ Creates a video topic with all the videos on the page """
    IGNORED_VIDEOS = ['google', 'facebook']
    VIDEO_SCRAPERS = [who.WHOWebVideoScraper, who.WHOVideoScraper]

    video_topic = nodes.TopicNode(source_id=url, title=title)
    contents = BeautifulSoup(downloader.read(url), 'html.parser')

    # Scrape youtube embeds
    # e.g. https://www.who.int/emergencies/diseases/novel-coronavirus-2019/advice-for-public/videos
    for iframe in contents.findAll('iframe'):
        if not any([test for test in IGNORED_VIDEOS if test in iframe['src']]):
            header = iframe.find_parent('div', {'class': 'sf_colsIn'}).find('div', {'class': 'section-heading'}).text.strip()
            LOGGER.info('    - Downloading {}'.format(header.encode('utf-8')))
            scraper = guess_scraper(iframe['src'], scrapers=VIDEO_SCRAPERS)  # Might be native or youtube video
            video_node = scraper.to_contentnode(header, license=LICENSE, directory="videos")
            video_topic.add_child(video_node)

    # Scrape native videos
    # e.g. https://www.who.int/zh/emergencies/diseases/novel-coronavirus-2019/advice-for-public/videos
    for video in contents.findAll('div', {'class': 'sf-multimedia-item__video'}):
        header = video.find('h3').text.strip()
        LOGGER.info('    - Downloading {}'.format(header.encode('utf-8')))
        video_matches = re.search(r"\(\s*\"(.+)\"\,\s*\"(.+)\"\)", video.find('a')['onclick'])

        # Embedded youtube videos here refer to playlists, so skip them
        if 'YoutubeVideo' == video_matches.group(1):
            continue

        scraper = who.WHOVideoScraper(video_matches.group(2))
        video_node = scraper.to_contentnode(header, license=LICENSE, directory="videos")
        video_topic.add_child(video_node)

    return video_topic
def scrape_video_collection(url, topic):
    """ Scrape videos under video collection and add to the topic node
        Args:
            url (str): url to video page (e.g. https://www.exploratorium.edu/video/inflatable-jimmy-kuehnle)
            topic (TopicNode): topic to add video nodes to
    """
    try:
        collection_contents = BeautifulSoup(read(url), 'html5lib')
        for result in collection_contents.find_all('div', {'class': 'search-result'}):
            header = result.find('div', {'class': 'views-field-field-html-title'})
            LOGGER.info("    {}".format(header.text.strip()))

            # Get video from given url
            description = result.find('div', {'class': 'search-description'})
            video_contents = BeautifulSoup(read(header.find('a')['href']), 'html.parser')
            for k, v in get_brightcove_mapping(video_contents).items():
                video_node = nodes.VideoNode(
                    source_id=k,
                    title=header.text.strip().replace("’", "'"),
                    description=description.text.strip() if description else "",
                    license=LICENSE,
                    copyright_holder=COPYRIGHT_HOLDER,
                    author=v.get('author') or "",
                    files=[files.WebVideoFile(v['url'], high_resolution=False)],
                    thumbnail=get_thumbnail_url(result.find('img')['src']),
                )

                # If video doesn't already exist here, add to topic
                if not next((c for c in topic.children if c.source_id == video_node.source_id), None):
                    topic.add_child(video_node)

        # Scrape next page (if any)
        next_page_url = get_next_page_url(collection_contents)
        if next_page_url:
            scrape_video_collection(next_page_url, topic)

    except requests.exceptions.HTTPError:
        LOGGER.error("Could not read collection at {}".format(url))
def get_phet_zip_file(zip_file_url, main_file_and_query):
    """
    Phet simulations are provided in the zip file `phet.zip`, and the entry point
    is passed as a GET parameter in `main_file_and_query`. To make these compatible
    with Kolibri's default behaviour of loading index.html, we will:
      - Rename index.html to phetindex.html
      - Add a custom index.html that uses a javascript redirect to phetindex.html?{sim_id}
    """
    u = urlparse(main_file_and_query)
    idk, sim_id = u.query.split('=')
    assert idk == 'id', 'unknown query string format found ' + main_file_and_query
    main_file = u.scheme + '://' + u.netloc + u.path  # skip querystring

    destpath = tempfile.mkdtemp()
    LOGGER.info('saving phet zip file in dir ' + destpath)
    try:
        download_file(zip_file_url, destpath, request_fn=make_request)

        zip_filename = zip_file_url.split('/')[-1]
        zip_basename = zip_filename.rsplit('.', 1)[0]
        zip_folder = os.path.join(destpath, zip_basename)

        # Extract zip file contents.
        local_zip_file = os.path.join(destpath, zip_filename)
        with zipfile.ZipFile(local_zip_file) as zf:
            zf.extractall(destpath)

        # Rename main_file to phetindex.html
        main_file = main_file.split('/')[-1]
        src = os.path.join(zip_folder, main_file)
        dest = os.path.join(zip_folder, 'phetindex.html')
        os.rename(src, dest)

        # Create the index.html that redirects to the renamed entry point
        index_html = PHET_INDEX_HTML_TEMPLATE.format(sim_id=sim_id)
        with open(os.path.join(zip_folder, 'index.html'), 'w') as indexf:
            indexf.write(index_html)

        # Always be zipping!
        return create_predictable_zip(zip_folder)

    except Exception as e:
        LOGGER.error("get_phet_zip_file: %s, %s, %s, %s" % (zip_file_url, main_file_and_query, destpath, e))
        return None
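# The actual PHET_INDEX_HTML_TEMPLATE is defined elsewhere in the chef; an illustrative
# version of the javascript-redirect index.html described in the docstring above could
# look like this (the markup is an assumption, not the chef's real template):
PHET_INDEX_HTML_TEMPLATE_SKETCH = """<!DOCTYPE html>
<html>
  <head>
    <script type="text/javascript">
      // Redirect to the renamed simulation entry point, preserving the sim id
      window.location.href = 'phetindex.html?id={sim_id}';
    </script>
  </head>
  <body></body>
</html>
"""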
def pre_run(self, args, options):
    """
    Build the ricecooker json tree for the entire channel.
    """
    LOGGER.info('in pre_run...')

    # delete .zip files in temporary dir when running using update
    if args['update']:
        LOGGER.info('Deleting all zips in cache dir {}'.format(HTML5APP_ZIPS_LOCAL_DIR))
        for rel_path in os.listdir(HTML5APP_ZIPS_LOCAL_DIR):
            abs_path = os.path.join(HTML5APP_ZIPS_LOCAL_DIR, rel_path)
            if os.path.isdir(abs_path):
                shutil.rmtree(abs_path)

    # option to skip crawling stage
    if 'nocrawl' not in options:
        self.crawl(args, options)

    # Conditionally determine `source_id` depending on variant specified
    if 'variant' in options and options['variant'].upper() == 'LE':
        # Official PraDigi channel
        channel_name = 'PraDigi'
        channel_source_id = PRADIGI_SOURCE_ID__VARIANT_LE
        DEBUG_MODE = False
    else:
        # Pratham ETL (used to import content from website into Pratham app)
        # channel_id = f9da12749d995fa197f8b4c0192e7b2c
        channel_name = 'PraDigi Pratham'
        channel_source_id = PRADIGI_SOURCE_ID__VARIANT_PRATHAM

    ricecooker_json_tree = dict(
        title=channel_name,
        source_domain=PRADIGI_DOMAIN,
        source_id=channel_source_id,
        description=PRADIGI_DESCRIPTION,
        thumbnail='chefdata/prathamlogo_b01-v1.jpg',
        language='mul',
        children=[],
    )
    for lang in PRADIGI_WEBSITE_LANGUAGES:
        lang_subtree = self.build_subtree_for_lang(lang)
        ricecooker_json_tree['children'].append(lang_subtree)

    json_tree_path = self.get_json_tree_path()
    write_tree_to_json_tree(json_tree_path, ricecooker_json_tree)
def get_subtopics(parent, path):
    doc = get_page(path)
    try:
        menu_row = doc.find('div', {'id': 'body-row'})
        menu_row = menu_row.find('div', {'class': 'col-md-2'})
    except Exception as e:
        LOGGER.error('get_subtopics: %s : %s' % (e, doc))
        return
    for subtopic in menu_row.find_all('a'):
        try:
            title = subtopic.get_text().strip()
            source_id = get_source_id(subtopic['href'])
            LOGGER.info('  subtopic: %s: %s' % (source_id, title))
            node = TopicNode(title=title, source_id=source_id)
            parent.add_child(node)
            get_lessons(node, subtopic['href'])
        except Exception as e:
            LOGGER.error('get_subtopics: %s : %s' % (e, subtopic))