def pre_run(self, args, options):
    """
    Build the ricecooker json tree for the channel in the language `lang`.

    Loads `course_list.json` from the `lang` subdirectory of `COURSES_DIR`,
    builds a subtree for every listed course, and writes the resulting tree
    to the path returned by `self.get_json_tree_path(lang=lang)`.

    Args:
        args: chef command line arguments; forwarded to
            `build_subtree_from_course` as `chefargs`.
        options: extra options dict; must contain the key `lang`.

    Raises:
        ValueError: if `lang` is missing from `options` or is not one of
            `HPLIFE_LANGS`.
    """
    if 'lang' not in options:
        raise ValueError('Must specify lang option in ' + str(HPLIFE_LANGS))
    lang = options['lang']
    if lang not in HPLIFE_LANGS:
        # Explicit check rather than `assert`, which is stripped under `python -O`
        raise ValueError(
            'Unsupported lang {!r}; must be one of {}'.format(lang, HPLIFE_LANGS))

    # Make sure the output directory for json trees exists
    os.makedirs(self.TREES_DATA_DIR, exist_ok=True)

    # The root of the tree holds the channel info
    ricecooker_json_tree = dict(
        title=CHANNEL_TITLE_LOOKUP[lang],
        source_domain='life-global.org',
        source_id='hp-life-courses-{}'.format(lang),
        description=CHANNEL_DESCRIPTION_LOOKUP[lang],
        thumbnail='chefdata/thumbnails/new_channel_thumbnail.png',
        language=lang,
        children=[],
    )
    print('in pre_run; channel info = ', ricecooker_json_tree)

    containerdir = os.path.join(COURSES_DIR, lang)
    # Use a context manager so the file handle is closed deterministically
    with open(os.path.join(containerdir, 'course_list.json')) as course_list_file:
        course_list = json.load(course_list_file)

    for course in course_list['courses']:
        course_dict = build_subtree_from_course(course, containerdir, chefargs=args)
        if course_dict:
            ricecooker_json_tree['children'].append(course_dict)
        else:
            # A falsy subtree is treated as a course that failed pre-validation
            print('WARNING: Skipping course', course['name'],
                  'because it failed to pre-validate')

    json_tree_path = self.get_json_tree_path(lang=lang)
    write_tree_to_json_tree(json_tree_path, ricecooker_json_tree)
def pre_run(self, args, options):
    """
    Build the Khan Academy channel tree and write it as a ricecooker json tree.

    The language is taken from `options["lang"]` (defaulting to "en").
    When `options["youtube_channel_id"]` is set, the channel is built by
    `youtube_playlist_scraper`; otherwise it is built from the topic tree
    returned by `get_khan_topic_tree`. In both cases the result is written
    to the path returned by `self.get_json_tree_path(*args, **options)`.
    """
    # Default to en if no language is specified on the command line
    language_code = options.get("lang", "en")
    lang = getlang(language_code) or getlang_by_name(language_code)

    channel_node = {
        "source_id": "KA ({0})".format(language_code),
        "source_domain": "khanacademy.org",
        "title": "Khan Academy ({0})".format(lang.native_name),
        "description": CHANNEL_DESCRIPTION_LOOKUP.get(
            language_code, "Khan Academy content for {}.".format(lang.name)
        ),
        "thumbnail": os.path.join("chefdata", "khan-academy-logo.png"),
        "language": lang.code,
        "children": [],
    }

    # Special case: build the studio channel out of a youtube playlist
    youtube_id = options.get("youtube_channel_id")
    if youtube_id:
        logger.info(
            "Downloading youtube playlist {} for {} language".format(
                youtube_id, lang.name
            )
        )
        playlist_root = youtube_playlist_scraper(youtube_id, channel_node)
        logger.info("writing ricecooker json to a file")
        write_tree_to_json_tree(
            self.get_json_tree_path(*args, **options), playlist_root
        )
        return

    # Regular case: build the channel through the KA API
    logger.info("downloading KA tree")
    ka_root_topic = get_khan_topic_tree(lang=language_code)

    if options.get("english_subtitles"):
        # Include english videos with target language subtitles
        duplicate_videos(ka_root_topic)

    # Target language is the primary code plus the subcode when there is one
    target_lang = lang.primary_code
    if lang.subcode:
        target_lang = target_lang + "-" + lang.subcode

    logger.info("converting KA nodes to ricecooker json nodes")
    converted_root = convert_ka_node_to_ricecooker_node(
        ka_root_topic, target_lang=target_lang
    )
    channel_node["children"].extend(converted_root["children"])

    logger.info("writing ricecooker json to a file")
    write_tree_to_json_tree(
        self.get_json_tree_path(*args, **options), channel_node
    )
def scrape(self, args, options):
    """
    Turn the crawling stage output into a ricecooker json tree.

    Loads the web resource tree written by the crawling stage, scrapes each
    of its children with `self.scrape_youtube_channel`, writes the resulting
    tree to the scraping stage output file, and returns it.
    """
    crawl_output_path = os.path.join(
        UbongoKidsChef.TREES_DATA_DIR, UbongoKidsChef.CRAWLING_STAGE_OUTPUT)
    with open(crawl_output_path, 'r') as crawl_file:
        web_resource_tree = json.load(crawl_file)
    assert web_resource_tree['kind'] == 'UbongoKidsWebResourceTree'

    full_description = """Ubongo is a Tanzanian social enterprise that creates fun, localized edutainment for learners in Africa. "Ubongo" means brain in Kiswahili, and we're all about finding fun ways to stimulate kids (and kids at heart) to use their brains. Our entertaining media help learners understand concepts, rather than memorizing them. And we use catchy songs and captivating imagery to make sure they never forget!"""

    ricecooker_json_tree = {
        'source_domain': UbongoKidsChef.HOSTNAME,
        'source_id': 'ubongokids',
        'title': 'UbongoKids',
        # Only the first 400 characters of the description are kept
        'description': full_description[:400],
        'thumbnail': 'http://www.ubongokids.com/wp-content/uploads/2016/06/logo_ubongo_kids-150x100.png',
        'language': 'en',
        'children': [
            self.scrape_youtube_channel(resource)
            for resource in web_resource_tree['children']
        ],
        # TODO: license=UbongoKidsChef.LICENSE,
    }

    scrape_output_path = os.path.join(
        UbongoKidsChef.TREES_DATA_DIR, UbongoKidsChef.SCRAPING_STAGE_OUTPUT)
    write_tree_to_json_tree(scrape_output_path, ricecooker_json_tree)
    return ricecooker_json_tree
def pre_run(self, args, options):
    """
    Build the ricecooker json tree for the channel.

    The code here is similar to the code in `ricecooker_channel/chef.py`,
    but the channel hierarchy is built using dictionary objects instead of
    classes.
    """
    LOGGER.info('In pre_run...')

    # 1. Create the channel tree. The root object holds the channel info;
    #    topic and content nodes get added to its `children` list.
    channel_description = (
        'This channel was created from the files in the content/ '
        + 'directory and the metadata in sample_ricecooker_json_tree.json'
    )
    ricecooker_json_tree = {
        'title': 'Sample JSON channel',
        'source_domain': 'source.org',
        'source_id': 'sample-json-channel',
        'description': channel_description,
        'thumbnail': './content/sample-json-channel-files/channel_thumbnail.jpg',
        'language': 'en',
        'children': [],
    }

    # 2. Add topic nodes and content nodes to the tree
    self.create_content_nodes(ricecooker_json_tree)
    self.create_exercise_nodes(ricecooker_json_tree)

    # 3. Save the tree to chefdata/trees/sample_ricecooker_json_tree.json
    write_tree_to_json_tree(self.get_json_tree_path(), ricecooker_json_tree)
    LOGGER.info('Finished writing ricecooker json tree.')
def pre_run(self, args, options):
    """
    This is where all the work happens for this chef:
      - Load the source tree from the Khan Academy API
      - Convert the tree of Khan objects into ricecooker json dicts
      - Write the ricecooker json tree to the appropriate file

    When `options["youtube_channel_id"]` is set, the tree is instead built
    by `youtube_playlist_scraper` and written out, and the method returns
    early with None.
    """
    global CC_MAPPING

    lang, variant = self.parse_lang_and_variant_from_kwargs(options)

    # Load the CCSSM tags for the KA en channel (but not the in-in variant)
    if lang == "en" and variant != "in-in":
        CC_MAPPING = generate_common_core_mapping()

    channel_node = self.get_channel_dict(options)
    channel_node["children"] = []

    # Special case: build the Kolibri channel from youtube playlists
    youtube_channel_id = options.get("youtube_channel_id")
    if youtube_channel_id:
        LOGGER.info("Found YouTube channel {}".format(youtube_channel_id))
        playlist_root = youtube_playlist_scraper(youtube_channel_id, channel_node)
        youtube_tree_path = self.get_json_tree_path(**options)
        LOGGER.info("Writing youtube ricecooker tree to " + youtube_tree_path)
        write_tree_to_json_tree(youtube_tree_path, playlist_root)
        return None

    # Obtain the complete topic tree for lang=lang from the KA API
    LOGGER.info("Downloading KA topic tree")
    ka_root_topic, topics_by_slug = get_khan_topic_tree(lang=lang)
    # TODO: discuss w @kollivier introducing "archive" step here (for source diffs)

    # State stored on the chef instance: slug lookup (to be used for topic
    # replacements), slug blacklist, and topic tree replacements
    self.topics_by_slug = topics_by_slug
    self.slug_blacklist = get_slug_blacklist(lang=lang, variant=variant)
    self.topic_replacements = get_topic_tree_replacements(lang=lang, variant=variant)

    if options.get("english_subtitles"):
        # Include english videos with target language subtitles
        duplicate_videos(ka_root_topic)

    LOGGER.info("Converting KA nodes to ricecooker json nodes")
    converted_root = self.convert_ka_node_to_ricecooker_node(
        ka_root_topic, target_lang=lang
    )
    channel_node["children"].extend(converted_root["children"])

    # Write the ricecooker tree to the json file
    json_tree_path = self.get_json_tree_path(**options)
    LOGGER.info("Writing ricecooker json tree to " + json_tree_path)
    write_tree_to_json_tree(json_tree_path, channel_node)
def pre_run(self, args, options):
    """
    Generate the NROER channel tree and write it to the json tree file.

    The root dict holds the channel info; `self.create_theme_nodes` is
    called with it before the tree is written to the path returned by
    `self.get_json_tree_path()`.
    """
    nroer_channel_tree = {
        'description': 'This channel was created from the files in the contentdirectory and the metadata in nroer_json_tree.json',
        'language': 'en',
        'source_domain': 'nroer.gov.in',
        'source_id': 'nroer-json-channel_25_6',
        'thumbnail': 'https://nroer.gov.in/static/ndf/css/themes/nroer/logo.png',
        'title': 'NROER-INDIA',
        'children': [],
    }
    self.create_theme_nodes(nroer_channel_tree)

    write_tree_to_json_tree(self.get_json_tree_path(), nroer_channel_tree)
def pre_run(self, args, options):
    """
    Build the ricecooker json tree for the entire channel.

    Steps:
      1. When `args['update']` is truthy, remove the directories found in
         `HTML5APP_ZIPS_LOCAL_DIR`.
      2. Run the crawling stage, unless `nocrawl` is present in `options`.
      3. Choose the channel name and source_id based on `options['variant']`.
      4. Build one subtree per language in `PRADIGI_WEBSITE_LANGUAGES` and
         write the tree to the path returned by `self.get_json_tree_path()`.

    Args:
        args: chef command line arguments; must contain the key `update`.
        options: extra options dict; `nocrawl` and `variant` are recognized.
    """
    LOGGER.info('in pre_run...')

    # Clear the temporary cache dir when running using update
    if args['update']:
        LOGGER.info('Deleting all zips in cache dir {}'.format(
            HTML5APP_ZIPS_LOCAL_DIR))
        # The cache dir may not exist yet; then there is nothing to clear
        if os.path.isdir(HTML5APP_ZIPS_LOCAL_DIR):
            for rel_path in os.listdir(HTML5APP_ZIPS_LOCAL_DIR):
                abs_path = os.path.join(HTML5APP_ZIPS_LOCAL_DIR, rel_path)
                # Only directories are removed; plain files are left in place
                if os.path.isdir(abs_path):
                    shutil.rmtree(abs_path)

    # Option to skip the crawling stage
    if 'nocrawl' not in options:
        self.crawl(args, options)

    # Conditionally determine `source_id` depending on the variant specified
    if 'variant' in options and options['variant'].upper() == 'LE':
        # Official PraDigi channel
        channel_name = 'PraDigi'
        channel_source_id = PRADIGI_SOURCE_ID__VARIANT_LE
        # NOTE(review): the original assigned `DEBUG_MODE = False` here as a
        # function-local name, which had no effect. If a module-level flag
        # was meant to be reset, that needs a `global DEBUG_MODE` declaration.
    else:
        # Pratham ETL (used to import content from website into Pratham app)
        # channel_id = f9da12749d995fa197f8b4c0192e7b2c
        channel_name = 'PraDigi Pratham'
        channel_source_id = PRADIGI_SOURCE_ID__VARIANT_PRATHAM

    ricecooker_json_tree = dict(
        title=channel_name,
        source_domain=PRADIGI_DOMAIN,
        source_id=channel_source_id,
        description=PRADIGI_DESCRIPTION,
        thumbnail='chefdata/prathamlogo_b01-v1.jpg',
        language='mul',
        children=[],
    )
    for lang in PRADIGI_WEBSITE_LANGUAGES:
        lang_subtree = self.build_subtree_for_lang(lang)
        ricecooker_json_tree['children'].append(lang_subtree)

    json_tree_path = self.get_json_tree_path()
    write_tree_to_json_tree(json_tree_path, ricecooker_json_tree)
def scrape(self, args, options):
    """
    Turn the crawling stage output into a ricecooker json tree.

    Loads the web resource tree written by the crawling stage, scrapes the
    five known story hierarchies (looked up by title) with their matching
    scraper methods, writes the resulting tree to the scraping stage output
    file, and returns it.

    Args:
        args: chef command line arguments (not used here).
        options: extra options dict (not used here).

    Returns:
        The ricecooker json tree as a dict.
    """
    crawl_output_path = os.path.join(
        NalibaliChef.TREES_DATA_DIR, NalibaliChef.CRAWLING_STAGE_OUTPUT)
    with open(crawl_output_path, 'r') as json_file:
        web_resource_tree = json.load(json_file)
    assert web_resource_tree['kind'] == 'NalibaliWebResourceTree'

    ricecooker_json_tree = dict(
        source_domain=NalibaliChef.HOSTNAME,
        source_id="nal'ibali",
        title=web_resource_tree['title'],
        description="""Nal'ibali (isiXhosa for "here's the story") is a national reading-for-enjoyment campaign to spark children's potential through storytelling and reading.""",
        language='en',
        thumbnail='http://nalibali.org/sites/default/files/nalibali_logo.png',
        children=[],
    )

    hierarchies_map = {
        h['title']: h for h in web_resource_tree['children']
    }

    # Channel children, in display order: (hierarchy title, scraper method).
    # The list is built from this fixed table rather than pre-sized from the
    # number of crawled hierarchies, which raised IndexError with fewer than
    # five hierarchies and left trailing `None` children with more than five.
    hierarchy_scrapers = (
        ('Multilingual stories', self._scrape_multilingual_story),
        ('Audio stories', self._scrape_audio_story),
        ('Story cards', self._scrape_story_card),
        ('Story seeds', self._scrape_story_seed),
        ('Your stories', self._scrape_your_story),
    )
    ricecooker_json_tree['children'] = [
        self._scrape_hierarchy(hierarchies_map.get(title), scraper)
        for title, scraper in hierarchy_scrapers
    ]

    write_tree_to_json_tree(
        os.path.join(NalibaliChef.TREES_DATA_DIR,
                     NalibaliChef.SCRAPING_STAGE_OUTPUT),
        ricecooker_json_tree)
    return ricecooker_json_tree
def scraping_part(json_tree_path):
    """
    Download all categories, subpages, modules, and resources from engageny
    and store them as a ricecooker json tree.

    NOTE: the `json_tree_path` argument is not used by this function; the
    tree is written to `SCRAPING_STAGE_OUTPUT` inside `TREES_DATA_DIR`.
    """
    # Read the web resource tree produced by the crawling stage
    crawl_output_path = os.path.join(TREES_DATA_DIR, CRAWLING_STAGE_OUTPUT)
    with open(crawl_output_path) as crawl_file:
        web_resource_tree = json.load(crawl_file)
        assert web_resource_tree['kind'] == 'EngageNYWebResourceTree'

    # Build a ricecooker tree from the scraping process
    scraped_tree = build_scraping_json_tree(web_resource_tree)
    LOGGER.info('Finished building ricecooker_json_tree')

    # Write out the ricecooker json tree
    scrape_output_path = os.path.join(TREES_DATA_DIR, SCRAPING_STAGE_OUTPUT)
    write_tree_to_json_tree(scrape_output_path, scraped_tree)
def pre_run(self, args, options):
    """
    Build the ricecooker json tree for the entire channel.

    The channel name and source_id depend on `options['variant']`. The
    channel children are converted with `wrt_to_ricecooker_tree` from the
    language subtrees in
    `chefdata/trees/pradigi_hindi_web_resource_tree.json`, and the result is
    written to the path returned by `self.get_json_tree_path()`.

    Args:
        args: chef command line arguments (not used here).
        options: extra options dict; `variant` is recognized.
    """
    LOGGER.info('in pre_run...')

    # Conditionally determine `source_id` depending on the variant specified
    if 'variant' in options and options['variant'].upper() == 'LE':
        # Official PraDigi channel
        channel_name = 'PraDigi'
        channel_source_id = PRADIGI_SOURCE_ID__VARIANT_LE
        # NOTE(review): the original assigned `DEBUG_MODE = False` here as a
        # function-local name, which had no effect. If a module-level flag
        # was meant to be reset, that needs a `global DEBUG_MODE` declaration.
    else:
        # Pratham ETL (used to import content from website into Pratham app)
        # channel_id = f9da12749d995fa197f8b4c0192e7b2c
        channel_name = 'Pratham PraDigi'
        # The '_testing' suffix keeps this apart from the plain Pratham source_id
        channel_source_id = PRADIGI_SOURCE_ID__VARIANT_PRATHAM + '_testing'

    ricecooker_json_tree = dict(
        title=channel_name,
        source_domain=PRADIGI_DOMAIN,
        source_id=channel_source_id,
        description=PRADIGI_DESCRIPTION,
        thumbnail='chefdata/plogo.jpg',
        language='mul',
        children=[],
    )

    # Once all the samples work you can try the full tree
    with open("chefdata/trees/pradigi_hindi_web_resource_tree.json", 'r',
              encoding='utf-8') as jtree:
        web_resource_tree = json.load(jtree)

    for lang_subtree in web_resource_tree['children']:
        ricecooker_subtree = wrt_to_ricecooker_tree(lang_subtree)
        pprint(ricecooker_subtree)  # debug output of each converted subtree
        ricecooker_json_tree['children'].append(ricecooker_subtree)
    pprint(ricecooker_json_tree)  # debug output of the complete tree

    json_tree_path = self.get_json_tree_path()
    write_tree_to_json_tree(json_tree_path, ricecooker_json_tree)
def pre_run(self, args, options):
    """
    Build the ricecooker json tree for the entire channel.

    The root dict holds the channel info; `self.add_content_nodes` is called
    with it before the tree is written to the path returned by
    `self.get_json_tree_path()`.
    """
    LOGGER.info('in pre_run...')

    channel_tree = {
        # a human-readable title
        'title': 'Edraak Courses (العربيّة)',
        # content provider's domain
        'source_domain': EDRAAK_COURSES_DOMAIN,
        # an alphanumeric channel ID
        'source_id': 'continuing-education-courses',
        'description': EDRAAK_COURSES_CHANNEL_DESCRIPTION,
        # logo created from SVG
        'thumbnail': './chefdata/edraak-logo.png',
        # language code of channel
        'language': getlang('ar').code,
        'children': [],
    }
    self.add_content_nodes(channel_tree)

    write_tree_to_json_tree(self.get_json_tree_path(), channel_tree)
def add_content_nodes(self, channel):
    """
    Build the hierarchy of topic nodes and content nodes.

    For every course listed in `course_list.json` under `COURSES_DIR`, the
    course tree is extracted, cleaned, and transformed. A json snapshot is
    written after each of the three stages (to `ORIGINAL_TREES_DIR`,
    `CLEAN_TREES_DIR`, and `TRANSFORMED_TREES_DIR`), and the transformed
    tree is appended to `channel['children']`.

    Args:
        channel: the channel dict; its `children` list is extended in place.
    """
    LOGGER.info('Creating channel content nodes...')

    # Use a context manager so the file handle is closed deterministically
    with open(os.path.join(COURSES_DIR, 'course_list.json')) as course_list_file:
        course_list = json.load(course_list_file)

    for course in course_list['courses']:  # [1:2]:
        basedir = os.path.join(COURSES_DIR, course['name'])
        coursedir = os.path.join(basedir, 'course')

        # Stage 1: extract the course tree and save a snapshot of it
        course_data = extract_course_tree(coursedir)
        course_id = course_data['course']
        write_tree_to_json_tree(
            os.path.join(ORIGINAL_TREES_DIR, course_id + '.json'),
            course_data)
        # print_course(course_data, translate_from='ar')

        # Stage 2: clean the tree; the return value of `clean_subtree` is not
        # used, and the same `course_data` object is saved afterwards
        clean_subtree(course_data, coursedir)
        print('Cleaned course', course_data['course'], '#' * 80)
        write_tree_to_json_tree(
            os.path.join(CLEAN_TREES_DIR, course_id + '.json'),
            course_data)

        # Stage 3: transform the tree and save it
        transformed_tree = transform_tree(course_data, coursedir)
        write_tree_to_json_tree(
            os.path.join(TRANSFORMED_TREES_DIR, course_id + '.json'),
            transformed_tree)
        print_transfomed_tree(transformed_tree, translate_from='ar')

        channel['children'].append(transformed_tree)
        print('\n\n')
def build_ricecooker_json_tree(args, options, json_tree_path):
    """
    Build the Aflatoun ricecooker json tree for one language and write it
    to `json_tree_path`.

    The language comes from `options['lang']`. Every folder below the
    language's content directory is handed to `process_folder` together
    with its sorted filenames.

    Raises ValueError when `lang` is missing from `options`.
    """
    LOGGER.info('Starting to build the ricecooker_json_tree')

    if 'lang' not in options:
        raise ValueError(
            'Must specify lang=?? on the command line. Supported languages are `en` and `fr`'
        )
    lang = options['lang']
    lang_dir = LANGUAGE_FOLDER_LOOKUP[lang]

    # Root of the ricecooker tree, holding the channel info
    ricecooker_json_tree = {
        'source_domain': 'aflatoun.org',
        'source_id': 'aflatoun-{}'.format(lang),
        'title': 'Aflatoun Academy ({})'.format(lang),
        'thumbnail': './content/images/aflatoun_logo.jpg',
        'description': 'Aflatoun International offers social and financial'
                       ' education to millions of children and young people'
                       ' worldwide, empowering them to make a positive change'
                       ' for a more equitable world.',
        'language': lang,
        'children': [],
    }

    channel_base_dir = os.path.join(AFLATOUN_CONTENT_BASE_DIR, lang_dir)
    walked_folders = sorted(os.walk(channel_base_dir))

    # MAIN PROCESSING OF os.walk OUTPUT
    ############################################################################
    # Drop the first entry: the channel folder itself is handled above
    walked_folders.pop(0)
    for folder_path, _subfolders, filenames in walked_folders:
        LOGGER.info('processing folder ' + str(folder_path))
        process_folder(
            ricecooker_json_tree, folder_path, sorted(filenames), lang)

    # Write out ricecooker_json_tree_{en/fr}.json
    write_tree_to_json_tree(json_tree_path, ricecooker_json_tree)
def _scraping_part(self, json_tree_path, options):
    """
    Download all categories, subpages, modules, and resources from engageny
    and store them as a ricecooker json tree in the file `json_tree_path`.

    The language is set up from `options` when `self._lang` is not set yet.
    """
    # Read the web resource tree produced by the crawling stage
    crawl_output_path = os.path.join(
        EngageNYChef.TREES_DATA_DIR, EngageNYChef.CRAWLING_STAGE_OUTPUT)
    with open(crawl_output_path) as crawl_file:
        web_resource_tree = json.load(crawl_file)
        assert web_resource_tree['kind'] == 'EngageNYWebResourceTree'

    if not self._lang:
        self._setup_language(options)

    # Build a ricecooker tree from the scraping process
    scraped_tree = self._build_scraping_json_tree(web_resource_tree)
    finished_message = 'Finished building {json_tree_path}'.format(
        json_tree_path=json_tree_path)
    self._logger.info(finished_message)

    # Write out ricecooker_json_tree_{lang_code}.json
    write_tree_to_json_tree(json_tree_path, scraped_tree)
def write_tree_to_json(self, channel_tree):
    """Write `channel_tree` as a json tree to the path in `self.scrape_stage`."""
    destination = self.scrape_stage
    write_tree_to_json_tree(destination, channel_tree)
def write_tree_to_json(self, channel_tree):
    """
    Write `channel_tree` as a json tree to the file named by
    `self.RICECOOKER_JSON_TREE` inside `KingKhaledChef.TREES_DATA_DIR`.
    """
    write_tree_to_json_tree(
        os.path.join(KingKhaledChef.TREES_DATA_DIR, self.RICECOOKER_JSON_TREE),
        channel_tree,
    )
def build_ricecooker_json_tree(args, options, json_tree_path):
    """
    Convert the OPDS feed into a Ricecooker JSON tree, with the following
    structure:
        Channel
          --> Language (TopicNode)
            --> readingLevel (from lrmi_educationalalignment)
              --> Book.pdf (DocumentNode)

    Args:
        args: chef command line arguments (not used here).
        options: extra options dict (not used here).
        json_tree_path: path of the file the finished tree is written to.
    """
    # The docstring must be the first statement of the function, so the
    # debug print comes after it
    print('json_tree_path=', json_tree_path)
    LOGGER.info('Starting to build the ricecooker_json_tree')

    # Ricecooker tree for the channel
    ricecooker_json_tree = dict(
        source_domain='digitallibrary.io',
        source_id='digitallibrary-testing',
        title='Global Digital Library - Book Catalog',
        thumbnail='./content/globaldigitallibrary_logo.png',
        description='The Global Digital Library (GDL) is being developed to '
                    'increase the availability of high quality reading resources '
                    'in languages children and youth speak and understand.',
        language='en',
        children=[],
    )

    OPDS_LANG_ROOTS = build_lang_lookup_table(FEED_ROOT_URL)
    print("{} languages found".format(len(OPDS_LANG_ROOTS)))

    for lang_code in sorted(OPDS_LANG_ROOTS.keys()):
        print("Processing lang_code", lang_code)
        lang_dict = OPDS_LANG_ROOTS[lang_code]
        start_url = lang_dict['href']
        feed_dict, all_entries = parse_entire_feed(start_url)
        if feed_dict is None:
            continue  # Skip over empty or broken feeds

        lang_topic = dict(
            kind=content_kinds.TOPIC,
            source_id=start_url,
            title=lang_dict['lang_title'],
            author='',
            description='',
            language=lang_code,
            thumbnail=None,
            children=[],
        )
        ricecooker_json_tree['children'].append(lang_topic)

        # Group entries by their lrmi_educationalalignment readingLevel value
        entries_by_readingLevel = defaultdict(list)
        for entry in all_entries:
            level = _get_reading_level(entry)
            entries_by_readingLevel[level].append(entry)

        # Make a subtopic from each level
        # NOTE(review): this assumes every level is a string (levels are
        # sorted and concatenated into the source_id) - confirm against
        # `_get_reading_level`
        levels = sorted(entries_by_readingLevel.keys())
        for level in levels:
            entries = entries_by_readingLevel[level]
            print("Processing level", level)
            level_topic = dict(
                kind=content_kinds.TOPIC,
                source_id='digitallibrary.io' + ':' + lang_code + ':' + level,
                title=level,
                author='',
                description='',
                language=lang_code,
                thumbnail=None,
                children=[],
            )
            lang_topic['children'].append(level_topic)

            # Make a content node from each entry of this level
            for entry in entries:
                content_node = content_node_from_entry(entry, lang_code)
                if content_node:
                    level_topic['children'].append(content_node)
                else:
                    print('content_node None for entry', entry)

    # Write out ricecooker_json_tree.json
    write_tree_to_json_tree(json_tree_path, ricecooker_json_tree)