def pre_run(self, args, options):
        """
        Build the HP LIFE ricecooker json tree for one language and save it.

        Reads `course_list.json` from the language folder under COURSES_DIR,
        converts every listed course with `build_subtree_from_course`, and
        writes the resulting channel dict to `self.get_json_tree_path(lang=lang)`.

        Args:
            args: chef command line arguments; forwarded as `chefargs`.
            options: extra command line options; must contain `lang`.

        Raises:
            ValueError: if `lang` is missing from `options`.
            AssertionError: if `lang` is not one of HPLIFE_LANGS.
        """
        if 'lang' not in options:
            raise ValueError('Must specify lang option in ' +
                             str(HPLIFE_LANGS))
        lang = options['lang']
        assert lang in HPLIFE_LANGS

        # exist_ok avoids the race between a separate exists() check and creation
        os.makedirs(self.TREES_DATA_DIR, exist_ok=True)

        ricecooker_json_tree = dict(
            title=CHANNEL_TITLE_LOOKUP[lang],
            source_domain='life-global.org',
            source_id='hp-life-courses-{}'.format(lang),
            description=CHANNEL_DESCRIPTION_LOOKUP[lang],
            thumbnail='chefdata/thumbnails/new_channel_thumbnail.png',
            language=lang,
            children=[],
        )
        print('in pre_run; channel info = ', ricecooker_json_tree)

        containerdir = os.path.join(COURSES_DIR, lang)
        course_list_path = os.path.join(containerdir, 'course_list.json')
        # Context manager closes the file handle as soon as parsing is done
        with open(course_list_path) as course_list_file:
            course_list = json.load(course_list_file)

        for course in course_list['courses']:
            course_dict = build_subtree_from_course(course,
                                                    containerdir,
                                                    chefargs=args)
            if course_dict:
                ricecooker_json_tree['children'].append(course_dict)
            else:
                # A falsy result is treated as a failed pre-validation
                print('WARNING: Skipping course', course['name'],
                      'because it failed to pre-validate')

        json_tree_path = self.get_json_tree_path(lang=lang)
        write_tree_to_json_tree(json_tree_path, ricecooker_json_tree)
    def pre_run(self, args, options):
        """
        Build the Khan Academy ricecooker json tree and write it to a json file.

        When the `youtube_channel_id` option is set, the tree is produced by
        `youtube_playlist_scraper`; otherwise the topic tree returned by
        `get_khan_topic_tree` is converted with
        `convert_ka_node_to_ricecooker_node`.

        Args:
            args: chef command line arguments; unpacked into
                `self.get_json_tree_path`.
            options: extra command line options. Keys read here are `lang`
                (defaults to "en"), `youtube_channel_id` and
                `english_subtitles`.

        Raises:
            ValueError: if neither `getlang` nor `getlang_by_name` resolves
                the requested language.
        """
        # default to en if no language specified on command line
        language_code = options.get("lang", "en")

        lang = getlang(language_code) or getlang_by_name(language_code)
        if not lang:
            # Fail with a clear message instead of an AttributeError further down
            raise ValueError(
                "Unknown language {!r}; could not find it by code or by name".format(
                    language_code
                )
            )

        channel_node = dict(
            source_id="KA ({0})".format(language_code),
            source_domain="khanacademy.org",
            title="Khan Academy ({0})".format(lang.native_name),
            description=CHANNEL_DESCRIPTION_LOOKUP.get(
                language_code, "Khan Academy content for {}.".format(lang.name)
            ),
            thumbnail=os.path.join("chefdata", "khan-academy-logo.png"),
            language=lang.code,
            children=[],
        )

        # build studio channel out of youtube playlist
        youtube_id = options.get("youtube_channel_id")
        if youtube_id:
            logger.info(
                "Downloading youtube playlist {} for {} language".format(
                    youtube_id, lang.name
                )
            )
            root_node = youtube_playlist_scraper(youtube_id, channel_node)
            # write to json file
            logger.info("writing ricecooker json to a file")
            json_tree_path = self.get_json_tree_path(*args, **options)
            write_tree_to_json_tree(json_tree_path, root_node)
            return

        logger.info("downloading KA tree")
        # build channel through KA API (uses the code as given on the command line)
        ka_root_topic = get_khan_topic_tree(lang=language_code)

        if options.get("english_subtitles"):
            # we will include english videos with target language subtitles
            duplicate_videos(ka_root_topic)

        # From here on use the code rebuilt from the language object:
        # primary code, plus "-<subcode>" when a subcode is present
        language_code = lang.primary_code
        if lang.subcode:
            language_code = language_code + "-" + lang.subcode

        logger.info("converting KA nodes to ricecooker json nodes")
        root_topic = convert_ka_node_to_ricecooker_node(
            ka_root_topic, target_lang=language_code
        )

        # The converted root is discarded; only its children join the channel
        for topic in root_topic["children"]:
            channel_node["children"].append(topic)

        # write to json file
        logger.info("writing ricecooker json to a file")
        json_tree_path = self.get_json_tree_path(*args, **options)
        write_tree_to_json_tree(json_tree_path, channel_node)
예제 #3
0
    def scrape(self, args, options):
        """
        Convert the crawled UbongoKids web resource tree into a ricecooker json tree.

        Loads the crawling-stage output, scrapes each of its children with
        `self.scrape_youtube_channel`, writes the channel dict to the
        scraping-stage output file, and returns that dict.
        """
        crawl_output_path = os.path.join(UbongoKidsChef.TREES_DATA_DIR,
                                         UbongoKidsChef.CRAWLING_STAGE_OUTPUT)
        with open(crawl_output_path, 'r') as crawl_file:
            web_resource_tree = json.load(crawl_file)
        assert web_resource_tree['kind'] == 'UbongoKidsWebResourceTree'

        full_description = """Ubongo is a Tanzanian social enterprise that creates fun, localized edutainment for learners in Africa. "Ubongo" means brain in Kiswahili, and we're all about finding fun ways to stimulate kids (and kids at heart) to use their brains. Our entertaining media help learners understand concepts, rather than memorizing them. And we use catchy songs and captivating imagery to make sure they never forget!"""

        ricecooker_json_tree = {
            'source_domain': UbongoKidsChef.HOSTNAME,
            'source_id': 'ubongokids',
            'title': 'UbongoKids',
            # Only the first 400 characters of the description are kept
            'description': full_description[:400],
            'thumbnail': 'http://www.ubongokids.com/wp-content/uploads/2016/06/logo_ubongo_kids-150x100.png',
            'language': 'en',
            'children': [
                self.scrape_youtube_channel(youtube_channel)
                for youtube_channel in web_resource_tree['children']
            ],
            # TODO:
            'license': UbongoKidsChef.LICENSE,
        }

        scrape_output_path = os.path.join(UbongoKidsChef.TREES_DATA_DIR,
                                          UbongoKidsChef.SCRAPING_STAGE_OUTPUT)
        write_tree_to_json_tree(scrape_output_path, ricecooker_json_tree)
        return ricecooker_json_tree
예제 #4
0
    def pre_run(self, args, options):
        """
        Assemble the sample channel as a ricecooker json tree and save it.

        The channel is described with plain dictionaries: the root dict holds
        the channel info, and topic/content nodes are added to its `children`
        list by `create_content_nodes` and `create_exercise_nodes`.
        """
        LOGGER.info('In pre_run...')

        # Step 1: root dict with the channel info and an empty children list
        channel_description = (
            'This channel was created from the files in the content/ '
            'directory and the metadata in sample_ricecooker_json_tree.json'
        )
        ricecooker_json_tree = {
            'title': 'Sample JSON channel',
            'source_domain': 'source.org',
            'source_id': 'sample-json-channel',
            'description': channel_description,
            'thumbnail': './content/sample-json-channel-files/channel_thumbnail.jpg',
            'language': 'en',
            'children': [],
        }

        # Step 2: let the helpers attach topic, content and exercise nodes
        self.create_content_nodes(ricecooker_json_tree)
        self.create_exercise_nodes(ricecooker_json_tree)

        # Step 3: persist the tree at the path given by get_json_tree_path()
        write_tree_to_json_tree(self.get_json_tree_path(), ricecooker_json_tree)
        LOGGER.info('Finished writing ricecooker json tree.')
    def pre_run(self, args, options):
        """
        Produce the ricecooker json tree for a Khan Academy channel.

        Either scrapes a YouTube channel (when the `youtube_channel_id` option
        is set) or loads the KA topic tree for the requested language, converts
        it to ricecooker json dicts, and writes the result to the path given by
        `self.get_json_tree_path(**options)`.

        Side effects: may rebind the module-level CC_MAPPING, and stores
        `topics_by_slug`, `slug_blacklist` and `topic_replacements` on `self`.
        """
        global CC_MAPPING

        lang, variant = self.parse_lang_and_variant_from_kwargs(options)

        # CCSSM tags are loaded for the KA en channel only, not its in-in variant
        if lang == "en" and variant != "in-in":
            CC_MAPPING = generate_common_core_mapping()

        channel_node = self.get_channel_dict(options)
        channel_node["children"] = []

        # Special case: build the Kolibri channel from youtube playlists
        youtube_channel_id = options.get("youtube_channel_id")
        if youtube_channel_id:
            LOGGER.info("Found YouTube channel {}".format(youtube_channel_id))
            youtube_tree = youtube_playlist_scraper(youtube_channel_id,
                                                    channel_node)
            youtube_tree_path = self.get_json_tree_path(**options)
            LOGGER.info("Writing youtube ricecooker tree to " + youtube_tree_path)
            write_tree_to_json_tree(youtube_tree_path, youtube_tree)
            return None

        # Regular case: fetch the complete topic tree for this language
        LOGGER.info("Downloading KA topic tree")
        ka_root_topic, topics_by_slug = get_khan_topic_tree(lang=lang)
        # TODO: discuss w @kollivier introducing "archive" step here (for source diffs)
        self.topics_by_slug = topics_by_slug  # to be used for topic replacements
        self.slug_blacklist = get_slug_blacklist(lang=lang, variant=variant)
        self.topic_replacements = get_topic_tree_replacements(lang=lang,
                                                              variant=variant)

        # Optionally include english videos with target language subtitles
        if options.get("english_subtitles"):
            duplicate_videos(ka_root_topic)

        LOGGER.info("Converting KA nodes to ricecooker json nodes")
        converted_root = self.convert_ka_node_to_ricecooker_node(
            ka_root_topic, target_lang=lang)
        # Only the children of the converted root are attached to the channel
        channel_node["children"].extend(converted_root["children"])

        output_path = self.get_json_tree_path(**options)
        LOGGER.info("Writing ricecooker json tree to " + output_path)
        write_tree_to_json_tree(output_path, channel_node)
예제 #6
0
	def pre_run(self,args,options):
		"""
		Assemble the NROER channel tree and save it as a ricecooker json tree.

		The root dict carries the channel info; `create_theme_nodes` receives
		it to add the theme nodes, and the result is written to the path
		returned by `get_json_tree_path`.
		"""
		nroer_channel_tree = {
			'description': 'This channel was created from the files in the contentdirectory and the metadata in nroer_json_tree.json',
			'language': 'en',
			'source_domain': 'nroer.gov.in',
			'source_id': 'nroer-json-channel_25_6',
			'thumbnail': 'https://nroer.gov.in/static/ndf/css/themes/nroer/logo.png',
			'title': 'NROER-INDIA',
			'children': [],
		}
		self.create_theme_nodes(nroer_channel_tree)

		write_tree_to_json_tree(self.get_json_tree_path(), nroer_channel_tree)
예제 #7
0
    def pre_run(self, args, options):
        """
        Build the ricecooker json tree for the entire channel.

        Steps:
          - when `args['update']` is set, remove every sub-directory of
            HTML5APP_ZIPS_LOCAL_DIR (plain files in it are left in place)
          - run the crawl stage unless the `nocrawl` option is present
          - pick the channel title and source_id from the `variant` option
          - append one subtree per language in PRADIGI_WEBSITE_LANGUAGES
          - write the tree to the path given by `self.get_json_tree_path()`

        Args:
            args: chef command line arguments; `args['update']` is read.
            options: extra command line options; `nocrawl` and `variant` are
                read.
        """
        LOGGER.info('in pre_run...')

        # Clear the cache dir when running using update
        if args['update']:
            LOGGER.info('Deleting all zips in cache dir {}'.format(
                HTML5APP_ZIPS_LOCAL_DIR))
            for rel_path in os.listdir(HTML5APP_ZIPS_LOCAL_DIR):
                abs_path = os.path.join(HTML5APP_ZIPS_LOCAL_DIR, rel_path)
                # Only directories are removed here
                if os.path.isdir(abs_path):
                    shutil.rmtree(abs_path)

        # option to skip crawling stage
        if 'nocrawl' not in options:
            self.crawl(args, options)

        # Conditionally determine `source_id` depending on variant specified
        if 'variant' in options and options['variant'].upper() == 'LE':
            # Official PraDigi channel
            # NOTE(review): if this variant is also meant to switch off a
            # module-level DEBUG_MODE flag, that needs a `global` declaration;
            # a plain local assignment in this method has no effect.
            channel_name = 'PraDigi'
            channel_source_id = PRADIGI_SOURCE_ID__VARIANT_LE
        else:
            # Pratham ETL (used to import content from website into Pratham app)
            # channel_id = f9da12749d995fa197f8b4c0192e7b2c
            channel_name = 'PraDigi Pratham'
            channel_source_id = PRADIGI_SOURCE_ID__VARIANT_PRATHAM

        ricecooker_json_tree = dict(
            title=channel_name,
            source_domain=PRADIGI_DOMAIN,
            source_id=channel_source_id,
            description=PRADIGI_DESCRIPTION,
            thumbnail='chefdata/prathamlogo_b01-v1.jpg',
            language='mul',
            children=[],
        )
        for lang in PRADIGI_WEBSITE_LANGUAGES:
            lang_subtree = self.build_subtree_for_lang(lang)
            ricecooker_json_tree['children'].append(lang_subtree)

        json_tree_path = self.get_json_tree_path()
        write_tree_to_json_tree(json_tree_path, ricecooker_json_tree)
    def scrape(self, args, options):
        """
        Convert the crawled Nal'ibali web resource tree into a ricecooker json tree.

        Loads the crawling-stage output, scrapes the five known story
        hierarchies in a fixed order, writes the channel dict to the
        scraping-stage output file, and returns that dict.

        Args:
            args: chef command line arguments (not read here).
            options: extra command line options (not read here).

        Returns:
            dict: the ricecooker json tree that was written.
        """
        crawl_output_path = os.path.join(NalibaliChef.TREES_DATA_DIR,
                                         NalibaliChef.CRAWLING_STAGE_OUTPUT)
        with open(crawl_output_path, 'r') as json_file:
            web_resource_tree = json.load(json_file)
            assert web_resource_tree['kind'] == 'NalibaliWebResourceTree'

        ricecooker_json_tree = dict(
            source_domain=NalibaliChef.HOSTNAME,
            source_id="nal'ibali",
            title=web_resource_tree['title'],
            description=
            """Nal'ibali (isiXhosa for "here's the story") is a national reading-for-enjoyment campaign to spark children's potential through storytelling and reading.""",
            language='en',
            thumbnail=
            'http://nalibali.org/sites/default/files/nalibali_logo.png',
            children=[],
        )

        # Crawled hierarchies, looked up by their title
        hierarchies_map = {
            h['title']: h
            for h in web_resource_tree['children']
        }

        # Fixed channel layout: (crawled hierarchy title, scraper for its items).
        # Building the list from this table keeps it at exactly one child per
        # section, however many hierarchies the crawl produced.
        section_scrapers = (
            ('Multilingual stories', self._scrape_multilingual_story),
            ('Audio stories', self._scrape_audio_story),
            ('Story cards', self._scrape_story_card),
            ('Story seeds', self._scrape_story_seed),
            ('Your stories', self._scrape_your_story),
        )
        ricecooker_json_tree['children'] = [
            self._scrape_hierarchy(hierarchies_map.get(title), scrape_item)
            for title, scrape_item in section_scrapers
        ]

        write_tree_to_json_tree(
            os.path.join(NalibaliChef.TREES_DATA_DIR,
                         NalibaliChef.SCRAPING_STAGE_OUTPUT),
            ricecooker_json_tree)
        return ricecooker_json_tree
예제 #9
0
def scraping_part(json_tree_path):
    """
    Turn the crawled EngageNY web resource tree into a ricecooker json tree.

    Loads the crawling-stage output from TREES_DATA_DIR, hands it to
    `build_scraping_json_tree`, and writes the result to the
    SCRAPING_STAGE_OUTPUT file in TREES_DATA_DIR.

    NOTE(review): `json_tree_path` is accepted but never read here; the output
    location is derived from TREES_DATA_DIR instead -- confirm this is intended.
    """
    # Load the crawl result and make sure it is the expected kind of tree
    crawl_output_path = os.path.join(TREES_DATA_DIR, CRAWLING_STAGE_OUTPUT)
    with open(crawl_output_path) as crawl_file:
        web_resource_tree = json.load(crawl_file)
    assert web_resource_tree['kind'] == 'EngageNYWebResourceTree'

    # Scrape everything reachable from the crawl result
    ricecooker_json_tree = build_scraping_json_tree(web_resource_tree)
    LOGGER.info('Finished building ricecooker_json_tree')

    # Persist the scraping-stage output
    scrape_output_path = os.path.join(TREES_DATA_DIR, SCRAPING_STAGE_OUTPUT)
    write_tree_to_json_tree(scrape_output_path, ricecooker_json_tree)
예제 #10
0
    def pre_run(self, args, options):
        """
        Build the ricecooker json tree for the entire channel.

        Picks the channel title and source_id from the `variant` option, loads
        the web resource tree from
        `chefdata/trees/pradigi_hindi_web_resource_tree.json`, converts each of
        its children with `wrt_to_ricecooker_tree`, and writes the resulting
        tree to the path given by `self.get_json_tree_path()`. The subtrees and
        the final tree are pretty-printed along the way.

        Args:
            args: chef command line arguments (not read here).
            options: extra command line options; `variant` is read.
        """
        LOGGER.info('in pre_run...')

        # Conditionally determine `source_id` depending on variant specified
        if 'variant' in options and options['variant'].upper() == 'LE':
            # Official PraDigi channel
            # NOTE(review): if this variant is also meant to switch off a
            # module-level DEBUG_MODE flag, that needs a `global` declaration;
            # a plain local assignment in this method has no effect.
            channel_name = 'PraDigi'
            channel_source_id = PRADIGI_SOURCE_ID__VARIANT_LE
        else:
            # Pratham ETL (used to import content from website into Pratham app)
            # channel_id = f9da12749d995fa197f8b4c0192e7b2c
            channel_name = 'Pratham PraDigi'
            # The '_testing' suffix keeps this separate from the
            # PRADIGI_SOURCE_ID__VARIANT_PRATHAM source_id
            channel_source_id = PRADIGI_SOURCE_ID__VARIANT_PRATHAM + '_testing'

        ricecooker_json_tree = dict(
            title=channel_name,
            source_domain=PRADIGI_DOMAIN,
            source_id=channel_source_id,
            description=PRADIGI_DESCRIPTION,
            thumbnail='chefdata/plogo.jpg',
            language='mul',
            children=[],
        )

        # once all the samples work you can try the full tree
        with open("chefdata/trees/pradigi_hindi_web_resource_tree.json",
                  'r',
                  encoding='utf-8') as jtree:
            web_resource_tree = json.load(jtree)
            web_resource_tree_children = web_resource_tree['children']
            for lang_subtree in web_resource_tree_children:
                ricecooker_subtree = wrt_to_ricecooker_tree(lang_subtree)
                pprint(ricecooker_subtree)
                ricecooker_json_tree['children'].append(ricecooker_subtree)
        pprint(ricecooker_json_tree)

        json_tree_path = self.get_json_tree_path()
        write_tree_to_json_tree(json_tree_path, ricecooker_json_tree)
예제 #11
0
    def pre_run(self, args, options):
        """
        Assemble the Edraak courses channel as a ricecooker json tree and save it.

        The root dict holds the channel info; `add_content_nodes` receives it
        to add the course nodes, and the tree is then written to the path
        returned by `get_json_tree_path`.
        """
        LOGGER.info('in pre_run...')

        ricecooker_json_tree = {
            # title shown for the channel
            'title': 'Edraak Courses (العربيّة)',
            # domain of the content provider
            'source_domain': EDRAAK_COURSES_DOMAIN,
            # channel identifier
            'source_id': 'continuing-education-courses',
            'description': EDRAAK_COURSES_CHANNEL_DESCRIPTION,
            # logo created from SVG
            'thumbnail': './chefdata/edraak-logo.png',
            # language code of the channel
            'language': getlang('ar').code,
            'children': [],
        }
        self.add_content_nodes(ricecooker_json_tree)

        write_tree_to_json_tree(self.get_json_tree_path(), ricecooker_json_tree)
예제 #12
0
    def add_content_nodes(self, channel):
        """
        Build the hierarchy of topic nodes and content nodes.

        For every course listed in `course_list.json` under COURSES_DIR, the
        course tree is extracted, cleaned and transformed. A json snapshot is
        written after each stage (to ORIGINAL_TREES_DIR, CLEAN_TREES_DIR and
        TRANSFORMED_TREES_DIR), and the transformed tree is appended to
        `channel['children']`.

        Args:
            channel: channel dict whose `children` list is extended in place.
        """
        LOGGER.info('Creating channel content nodes...')

        course_list_path = os.path.join(COURSES_DIR, 'course_list.json')
        # Context manager closes the file handle as soon as parsing is done
        with open(course_list_path) as course_list_file:
            course_list = json.load(course_list_file)

        for course in course_list['courses']:
            basedir = os.path.join(COURSES_DIR, course['name'])
            coursedir = os.path.join(basedir, 'course')

            # Stage 1: tree as extracted from the course directory
            course_data = extract_course_tree(coursedir)
            course_id = course_data['course']
            write_tree_to_json_tree(
                os.path.join(ORIGINAL_TREES_DIR, course_id + '.json'),
                course_data)

            # Stage 2: clean_subtree is called for its effect on course_data
            # (its return value is not used)
            clean_subtree(course_data, coursedir)
            print('Cleaned course', course_data['course'], '#' * 80)
            write_tree_to_json_tree(
                os.path.join(CLEAN_TREES_DIR, course_id + '.json'),
                course_data)

            # Stage 3: transformed tree, which is what joins the channel
            transformed_tree = transform_tree(course_data, coursedir)
            write_tree_to_json_tree(
                os.path.join(TRANSFORMED_TREES_DIR, course_id + '.json'),
                transformed_tree)
            print_transfomed_tree(transformed_tree, translate_from='ar')
            channel['children'].append(transformed_tree)
            print('\n\n')
예제 #13
0
def build_ricecooker_json_tree(args, options, json_tree_path):
    """
    Build the Aflatoun channel tree from the content folders of one language
    and write it to `json_tree_path`.

    The language comes from `options['lang']`; its folder name is looked up in
    LANGUAGE_FOLDER_LOOKUP. Every folder below the channel folder is handed to
    `process_folder` together with its sorted file names.

    Raises:
        ValueError: if `lang` is missing from `options`.
    """
    LOGGER.info('Starting to build the ricecooker_json_tree')
    if 'lang' not in options:
        raise ValueError(
            'Must specify lang=?? on the command line. Supported languages are `en` and `fr`'
        )
    lang = options['lang']
    lang_dir = LANGUAGE_FOLDER_LOOKUP[lang]

    # Root of the tree: channel info plus an empty children list
    channel_description = ('Aflatoun International offers social and financial'
                           ' education to millions of children and young people'
                           ' worldwide, empowering them to make a positive change'
                           ' for a more equitable world.')
    ricecooker_json_tree = {
        'source_domain': 'aflatoun.org',
        'source_id': 'aflatoun-{}'.format(lang),
        'title': 'Aflatoun Academy ({})'.format(lang),
        'thumbnail': './content/images/aflatoun_logo.jpg',
        'description': channel_description,
        'language': lang,
        'children': [],
    }

    # Walk the language folder in sorted order
    channel_base_dir = os.path.join(AFLATOUN_CONTENT_BASE_DIR, lang_dir)
    content_folders = sorted(os.walk(channel_base_dir))
    # The first entry is the channel folder itself, already covered by the root
    content_folders.pop(0)

    for folder_path, _subfolders, filenames in content_folders:
        LOGGER.info('processing folder ' + str(folder_path))
        process_folder(ricecooker_json_tree, folder_path, sorted(filenames),
                       lang)

    # Save the finished tree
    write_tree_to_json_tree(json_tree_path, ricecooker_json_tree)
    def _scraping_part(self, json_tree_path, options):
        """
        Convert the crawled EngageNY web resource tree into a ricecooker json
        tree and save it at `json_tree_path`.

        Args:
            json_tree_path: destination file for the ricecooker json tree.
            options: passed to `self._setup_language` when `self._lang` is
                not set yet.
        """
        # Load the crawling-stage output and check it is the expected kind
        crawl_output_path = os.path.join(EngageNYChef.TREES_DATA_DIR,
                                         EngageNYChef.CRAWLING_STAGE_OUTPUT)
        with open(crawl_output_path) as crawl_file:
            web_resource_tree = json.load(crawl_file)
        assert web_resource_tree['kind'] == 'EngageNYWebResourceTree'

        # Language setup happens only if it has not been done already
        if not self._lang:
            self._setup_language(options)

        # Scrape everything reachable from the crawl result
        ricecooker_json_tree = self._build_scraping_json_tree(web_resource_tree)
        finished_message = 'Finished building {json_tree_path}'.format(
            json_tree_path=json_tree_path)
        self._logger.info(finished_message)

        # Persist the tree
        write_tree_to_json_tree(json_tree_path, ricecooker_json_tree)
 def write_tree_to_json(self, channel_tree):
     """Save `channel_tree` as json at the path held in `self.scrape_stage`."""
     output_path = self.scrape_stage
     write_tree_to_json_tree(output_path, channel_tree)
예제 #16
0
 def write_tree_to_json(self, channel_tree):
     """Save `channel_tree` as json in the chef's TREES_DATA_DIR, under the
     file name given by `self.RICECOOKER_JSON_TREE`."""
     write_tree_to_json_tree(
         os.path.join(KingKhaledChef.TREES_DATA_DIR, self.RICECOOKER_JSON_TREE),
         channel_tree)
def build_ricecooker_json_tree(args, options, json_tree_path):
    """
    Convert the OPDS feed into a Ricecooker JSON tree, with the following structure:
        Channel
            --> Language (TopicNode)
                    --> readingLevel (from lrmi_educationalalignment)
                            --> Book.pdf  (DocumentNode)

    The language roots come from `build_lang_lookup_table(FEED_ROOT_URL)`.
    Languages whose feed cannot be parsed are skipped. The finished tree is
    written to `json_tree_path`.

    Args:
        args: chef command line arguments (not read here).
        options: extra command line options (not read here).
        json_tree_path: destination file for the ricecooker json tree.
    """
    print('json_tree_path=', json_tree_path)
    LOGGER.info('Starting to build the ricecooker_json_tree')
    # if 'lang' not in options:
    #     raise ValueError('Must specify lang=?? on the command line. Supported languages are `en` and `fr`')
    # lang = options['lang']

    # Ricecooker tree for the channel
    ricecooker_json_tree = dict(
        source_domain='digitallibrary.io',
        source_id='digitallibrary-testing',  # feed_dict['id'],
        title='Global Digital Library - Book Catalog',  # ({})'.format(lang),
        thumbnail='./content/globaldigitallibrary_logo.png',
        description='The Global Digital Library (GDL) is being developed to '
        'increase the availability of high quality reading resources '
        'in languages children and youth speak and understand.',
        language='en',  # lang,
        children=[],
    )

    OPDS_LANG_ROOTS = build_lang_lookup_table(FEED_ROOT_URL)

    print("{} languages found".format(len(OPDS_LANG_ROOTS)))
    # Sorted so languages are processed and added in a stable order
    for lang_code in sorted(OPDS_LANG_ROOTS.keys()):
        print("Processing lang_code", lang_code)
        lang_dict = OPDS_LANG_ROOTS[lang_code]
        start_url = lang_dict['href']
        feed_dict, all_entries = parse_entire_feed(start_url)
        if feed_dict is None:
            continue  # Skip over empty or broken feeds
        lang_topic = dict(
            kind=content_kinds.TOPIC,
            source_id=start_url,
            title=lang_dict['lang_title'],
            author='',
            description='',
            language=lang_code,
            thumbnail=None,
            children=[],
        )
        ricecooker_json_tree['children'].append(lang_topic)

        # Group entries by their lrmi_educationalalignment readingLevel value
        entries_by_readingLevel = defaultdict(list)
        for entry in all_entries:
            level = _get_reading_level(entry)
            entries_by_readingLevel[level].append(entry)

        # Make a subtopic from each level
        levels = sorted(entries_by_readingLevel.keys())
        for level in levels:
            entries = entries_by_readingLevel[level]
            print("Processing level", level)
            level_topic = dict(
                kind=content_kinds.TOPIC,
                source_id='digitallibrary.io' + ':' + lang_code + ':' + level,
                title=level,
                author='',
                description='',
                language=lang_code,
                thumbnail=None,
                children=[],
            )
            lang_topic['children'].append(level_topic)

            # Add a content node for each entry in this level
            for entry in entries:
                content_node = content_node_from_entry(entry, lang_code)
                if content_node:
                    level_topic['children'].append(content_node)
                else:
                    # Entries that produce no node are reported and left out
                    print('content_node None for entry', entry)

    # Write out ricecooker_json_tree.json
    write_tree_to_json_tree(json_tree_path, ricecooker_json_tree)