def test_known_names():
    lang_obj = languages.getlang_by_name('English')
    assert lang_obj is not None, 'English not found'
    assert lang_obj.code == "en", 'Wrong code'
    assert lang_obj.name == "English", 'Wrong name'
    assert lang_obj.native_name == "English", 'Wrong native_name'

    lang_obj = languages.getlang_by_name('Zulu')
    assert lang_obj is not None, 'Zulu not found'
    assert lang_obj.code == "zul", 'Wrong internal repr. code'
    assert lang_obj.name == "Zulu", 'Wrong name'
    assert lang_obj.native_name == "isiZulu", 'Wrong native_name'

    # NOTE: Currently we only support full-name matching, so one would have to
    # look up by "name, country" to get the local language version
    lang_obj = languages.getlang_by_name('Portuguese, Brazil')
    assert lang_obj is not None, 'Brazilian Portuguese not found'
    assert lang_obj.code == "pt-BR", 'Wrong internal repr. code'
    assert lang_obj.name == "Portuguese, Brazil", 'Wrong name'
    assert lang_obj.native_name == "Português (Brasil)", 'Wrong native_name'

    # NOTE: Currently we only support full-match lookups where multiple languages
    # are specified separated by semicolons, e.g. "Scottish Gaelic; Gaelic"
    lang_obj = languages.getlang_by_name('Scottish Gaelic; Gaelic')
    assert lang_obj is not None, 'Scottish Gaelic; Gaelic not found'
    assert lang_obj.code == "gd", 'Wrong internal repr. code'
    assert lang_obj.name == "Scottish Gaelic; Gaelic", 'Wrong name'
    assert lang_obj.native_name == "Gàidhlig", 'Wrong native_name'
def get_khan_topic_tree(lang="en", curr_key=None):
    if lang == "sw":
        response = make_request(V2_API_URL.format(lang="swa", projection=PROJECTION_KEYS), timeout=120)
    else:
        response = make_request(V2_API_URL.format(lang=lang, projection=PROJECTION_KEYS), timeout=120)

    topic_tree = ujson.loads(response.content)

    # if name of lang is passed in, get language code
    if getlang_by_name(lang):
        lang = getlang_by_name(lang).primary_code

    if lang not in SUPPORTED_LANGS:
        global translations
        translations = retrieve_translations(lang_code=lang)

    # Flatten node_data
    flattened_tree = [node for node_list in topic_tree.values() for node in node_list]

    # convert to dict with ids as keys
    tree_dict = {node["id"]: node for node in flattened_tree}

    return _recurse_create(tree_dict["x00000000"], tree_dict, lang=lang)
def get_khan_topic_tree(lang="en", update=False):
    """
    Build the complete topic tree based on the results obtained from the KA API.
    Note this topic tree contains a combined topic structure that includes all
    curriculum variants and curation pages, and child data may be in the wrong order.
    Returns: tuple (root_node, topics_by_slug) for further processing based on
    SLUG_BLACKLIST and TOPIC_TREE_REPLACMENTS specified in curation.py.
    """
    if lang == "sw":  # for backward compatibility in case the old Swahili code is used
        lang = "swa"

    # Get the fresh data from the KA API (do not try to re-use cached data)
    topic_tree = get_khan_api_json(lang, update=update)

    # if the name of a language is passed in, get its language code
    if getlang(lang) is None and getlang_by_name(lang):
        lang = getlang_by_name(lang).primary_code

    if lang not in SUPPORTED_LANGS:
        global translations
        translations = retrieve_translations(lang)

    # Flatten node_data (combine topics, videos, and exercises in a single list)
    flattened_tree = [node for node_list in topic_tree.values() for node in node_list]

    # Convert to dict with ids as keys (for fast lookups by id)
    tree_dict = {node["id"]: node for node in flattened_tree}

    # Build a lookup table {slug --> KhanTopic} to be used for replacement logic
    topics_by_slug = {}
    root_node = tree_dict["x00000000"]
    root = _recurse_create(root_node, tree_dict, topics_by_slug, lang=lang)
    return root, topics_by_slug
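A minimal usage sketch for the two-value version above (the `lang` value and the printed attributes are assumptions for illustration, not from the original):

# Hypothetical driver; assumes get_khan_topic_tree is importable from the chef module it lives in.
root, topics_by_slug = get_khan_topic_tree(lang="pt-BR", update=False)
print("topics indexed by slug:", len(topics_by_slug))
print("top-level children:", len(root.children))  # assumes the returned node type exposes .children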
def test_list_like_language_names():
    lang_obj = languages.getlang_by_name('Scottish Gaelic')
    assert lang_obj is not None, 'Scottish Gaelic; Gaelic not found'
    assert lang_obj.code == "gd", 'Wrong internal repr. code'
    assert lang_obj.name == "Scottish Gaelic; Gaelic", 'Wrong name'
    assert lang_obj.native_name == "Gàidhlig", 'Wrong native_name'

    lang_obj = languages.getlang_by_name('Gaelic')
    assert lang_obj is not None, 'Scottish Gaelic; Gaelic not found'
    assert lang_obj.code == "gd", 'Wrong internal repr. code'
    assert lang_obj.name == "Scottish Gaelic; Gaelic", 'Wrong name'
    assert lang_obj.native_name == "Gàidhlig", 'Wrong native_name'
def test_language_names_with_modifier_in_bracket():
    # try to match based on language name (stuff before subcode in brackets)
    lang_obj = languages.getlang_by_name('Swahili (macrolanguage)')
    assert lang_obj is not None, 'Swahili not found'
    assert lang_obj.code == "sw", 'Wrong internal repr. code'
    assert lang_obj.name == "Swahili", 'Wrong name'
    assert lang_obj.native_name == "Kiswahili", 'Wrong native_name'
    #
    lang_obj = languages.getlang_by_name('Sanskrit (Saṁskṛta)')
    assert lang_obj is not None, 'Sanskrit not found'
    assert lang_obj.code == "sa", 'Wrong internal repr. code'
    assert lang_obj.name == "Sanskrit (Saṁskṛta)", 'Wrong name'
    assert lang_obj.native_name == "संस्कृतम्", 'Wrong native_name'
def test_language_names_with_modifier_in_bracket():
    # try to match based on language name (stuff before subcode in brackets)
    lang_obj = languages.getlang_by_name("Swahili (macrolanguage)")
    assert lang_obj is not None, "Swahili not found"
    # Not deterministic which Swahili code will be returned
    assert lang_obj.code == "sw" or lang_obj.code == "swa", "Wrong internal repr. code"
    assert lang_obj.name == "Swahili", "Wrong name"
    assert lang_obj.native_name == "Kiswahili", "Wrong native_name"
    #
    lang_obj = languages.getlang_by_name("Sanskrit (Saṁskṛta)")
    assert lang_obj is not None, "Sanskrit not found"
    assert lang_obj.code == "sa", "Wrong internal repr. code"
    assert lang_obj.name == "Sanskrit (Saṁskṛta)", "Wrong name"
    assert lang_obj.native_name == "संस्कृतम्", "Wrong native_name"
def pre_run(self, args, options):
    if "lang" in options:
        language_code = options["lang"]
    else:
        language_code = "en"  # default to en if no language specified on command line
    lang = getlang(language_code) or getlang_by_name(language_code)
    channel_node = dict(
        source_id="KA ({0})".format(language_code),
        source_domain="khanacademy.org",
        title="Khan Academy ({0})".format(lang.native_name),
        description=CHANNEL_DESCRIPTION_LOOKUP.get(
            language_code, "Khan Academy content for {}.".format(lang.name)
        ),
        thumbnail=os.path.join("chefdata", "khan-academy-logo.png"),
        language=lang.code,
        children=[],
    )

    # build Studio channel out of a YouTube playlist if requested
    if options.get("youtube_channel_id"):
        youtube_id = options.get("youtube_channel_id")
        logger.info(
            "Downloading youtube playlist {} for {} language".format(
                youtube_id, lang.name
            )
        )
        root_node = youtube_playlist_scraper(youtube_id, channel_node)

        # write to json file
        logger.info("writing ricecooker json to a file")
        json_tree_path = self.get_json_tree_path(*args, **options)
        write_tree_to_json_tree(json_tree_path, root_node)
        return

    logger.info("downloading KA tree")
    # build channel through KA API
    ka_root_topic = get_khan_topic_tree(lang=language_code)

    if options.get("english_subtitles"):
        # we will include english videos with target language subtitles
        duplicate_videos(ka_root_topic)

    language_code = lang.primary_code
    if lang.subcode:
        language_code = language_code + "-" + lang.subcode

    logger.info("converting KA nodes to ricecooker json nodes")
    root_topic = convert_ka_node_to_ricecooker_node(
        ka_root_topic, target_lang=language_code
    )
    for topic in root_topic["children"]:
        channel_node["children"].append(topic)

    # write to json file
    logger.info("writing ricecooker json to a file")
    json_tree_path = self.get_json_tree_path(*args, **options)
    write_tree_to_json_tree(json_tree_path, channel_node)
def test_invalid_format(self):
    expected_language = languages.getlang_by_name('English')
    converter = build_subtitle_converter_from_file(os.path.join(test_files_dir, 'not.txt'))

    with self.assertRaises(InvalidSubtitleFormatError):
        converter.convert(expected_language.code)
def test_invalid_format__empty(self):
    expected_language = languages.getlang_by_name('English')
    converter = build_subtitle_converter_from_file(os.path.join(test_files_dir, 'empty.ttml'))

    with self.assertRaises(InvalidSubtitleFormatError, msg='Caption file is empty'):
        converter.convert(expected_language.code)
def fetch_all_languages(channel):
    doc = get_parsed_html_from_url('http://migranthealth.eu/etraining/')
    for link in doc.select('.category.essentialcats a'):
        url = link['href']
        language_name = link.text.strip()[len('MEET '):]
        language = languages.getlang_by_name(language_name)
        language_node = fetch_language(url, language)
        channel.add_child(language_node)
def __get_language_code(self, language_str):
    language = getlang_by_name(language_str) or getlang_by_native_name(language_str)
    if language:
        return language.code
    else:
        print('Unknown language:', language_str)
        return NalibaliChef.ENGLISH_LANGUAGE_CODE
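The name/native-name fallback chain above can be exercised directly against le_utils; a standalone sketch (the sample names are illustrative):

from le_utils.constants.languages import getlang_by_name, getlang_by_native_name

for name in ['Sesotho', 'isiZulu', 'not-a-language']:
    lang = getlang_by_name(name) or getlang_by_native_name(name)
    # mirrors the English fallback used in __get_language_code above
    print(name, '->', lang.code if lang else 'en (fallback)')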
def test_invalid_language(self):
    expected_language = languages.getlang_by_name('Spanish')
    converter = build_subtitle_converter_from_file(os.path.join(test_files_dir, 'encapsulated.sami'))

    with self.assertRaises(InvalidSubtitleLanguageError):
        converter.convert(expected_language.code)
def test_not_expected_type(self):
    expected_format = file_formats.SCC
    expected_language = languages.getlang_by_name('Arabic')
    converter = build_subtitle_converter_from_file(
        os.path.join(test_files_dir, 'basic.srt'), in_format=expected_format)

    with self.assertRaises(InvalidSubtitleFormatError):
        converter.convert(expected_language.code)
def test_srt_conversion(self):
    expected_file = os.path.join(test_files_dir, 'basic.vtt')
    expected_language = languages.getlang_by_name('Arabic')
    converter = build_subtitle_converter_from_file(os.path.join(test_files_dir, 'basic.srt'))
    converter.replace_unknown_language(expected_language.code)

    with tempfile.NamedTemporaryFile() as actual_file:
        converter.write(actual_file.name, expected_language.code)
        self.assertFileHashesEqual(expected_file, actual_file.name)
def test_replace_unknown_language(self):
    expected_language = languages.getlang_by_name('Arabic')
    converter = build_subtitle_converter_from_file(os.path.join(test_files_dir, 'basic.srt'))

    self.assertTrue(converter.has_language(LANGUAGE_CODE_UNKNOWN))
    converter.replace_unknown_language(expected_language.code)
    self.assertTrue(converter.has_language(expected_language.code))
    self.assertFalse(converter.has_language(LANGUAGE_CODE_UNKNOWN))
def test_valid_language(self):
    expected_file = os.path.join(test_files_dir, 'encapsulated.vtt')
    expected_language = languages.getlang_by_name('English')
    converter = build_subtitle_converter_from_file(os.path.join(test_files_dir, 'encapsulated.sami'))

    self.assertTrue(converter.has_language(expected_language.code))

    with tempfile.NamedTemporaryFile() as actual_file:
        converter.write(actual_file.name, expected_language.code)
        self.assertFileHashesEqual(expected_file, actual_file.name)
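Outside the unittest harness, the converter workflow these tests exercise looks roughly like this (the `pressurecooker.subtitles` import path is an assumption; file paths are illustrative):

from pressurecooker.subtitles import (  # assumed module path for the names used in the tests
    build_subtitle_converter_from_file,
    LANGUAGE_CODE_UNKNOWN,
)

converter = build_subtitle_converter_from_file('basic.srt')
# SRT files carry no language metadata, so the single track starts out as "unknown"
if converter.has_language(LANGUAGE_CODE_UNKNOWN):
    converter.replace_unknown_language('ar')
converter.write('basic.vtt', 'ar')  # writes the WebVTT conversion for the given language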
def getlang_by_language_en(language_en):
    """
    Convert language names used on PraDigi websites to le_utils language object.
    """
    # Normalize misspelled language names used on the source website
    if language_en == 'Odiya' or language_en == 'Odisa':
        language_en = 'Oriya'
    elif language_en == 'Bangali':
        language_en = 'Bengali'
    elif language_en == 'Telagu':
        language_en = 'Telugu'
    lang_obj = getlang_by_name(language_en)
    return lang_obj
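A quick illustrative check of the normalization above (sample names only; relies on the function just defined):

for raw_name in ['Odiya', 'Bangali', 'Telagu', 'Hindi']:
    lang_obj = getlang_by_language_en(raw_name)
    print(raw_name, '->', lang_obj.code if lang_obj else None)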
def get_lang_obj(self):
    if self.name != "":
        lang_name = self.name
        language_obj = getlang_by_name(lang_name)
        if not language_obj:
            # NOTE: assumes self.name is a key in UND_LANG; a missing key would raise KeyError
            if UND_LANG[self.name]:
                self.set_value(
                    UND_LANG[self.name]["name"],
                    UND_LANG[self.name]["code"],
                    UND_LANG[self.name]["native_name"])
                return True
        else:
            self.set_value(language_obj.name, language_obj.code, language_obj.native_name)
            return True
    return False
def get_json_tree_path(self, *args, **kwargs):
    """
    Return path to ricecooker json tree file. Override this method to use a
    custom filename, e.g., for a channel with multiple languages.
    """
    # Channel language
    if "lang" in kwargs:
        language_code = kwargs["lang"]
    else:
        language_code = "en"  # default to en if no language specified on command line
    lang_obj = getlang(language_code) or getlang_by_name(language_code)
    json_filename = self.RICECOOKER_JSON_TREE_TPL.format(lang_obj.code)
    json_tree_path = os.path.join(self.TREES_DATA_DIR, json_filename)
    return json_tree_path
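A sketch of how a chef subclass might rely on this default; the class name and filename template below are hypothetical, not from the original:

import os
from ricecooker.chefs import JsonTreeChef

class MultiLangChef(JsonTreeChef):
    # Hypothetical template: one JSON tree file per language code
    RICECOOKER_JSON_TREE_TPL = 'ricecooker_json_tree_{}.json'
    TREES_DATA_DIR = os.path.join('chefdata', 'trees')

# With the method above, get_json_tree_path(lang='zul') would yield
# chefdata/trees/ricecooker_json_tree_zul.json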
def add_node_document(booklist, level_topic, as_booklist):
    """
    Add books as DocumentNodes under a specific reading level.
    Parameters:
    * booklist - The list of books to be added as DocumentNodes
    * level_topic - The TopicNode for the current level that the DocumentNodes
      will be attached to
    * as_booklist - The list of books from African Storybooks
    """
    for item in booklist:
        # Initialize the source domain and content_id
        domain = uuid.uuid5(uuid.NAMESPACE_DNS, "storyweaver.org.in")
        book_id = str(item["source_id"])

        # If the publisher is AS and the book is found,
        # then change the source_domain and content_id
        if item["publisher"] == "African Storybook Initiative":
            check = check_if_story_in_AS(as_booklist, item["title"])
            if check[0]:
                domain = uuid.uuid5(uuid.NAMESPACE_DNS, "www.africanstorybook.org")
                book_id = check[1]

        # Given that StoryWeaver provides the link to a zip file,
        # we will download the zip file and extract the pdf file from it
        with tempfile.NamedTemporaryFile(suffix=".zip") as tempf:
            try:
                resp = downloader.make_request(item["link"], clear_cookies=False)
                resp.raise_for_status()
                tempf.write(resp.content)
            except Exception as e:
                # Do not create the node if the download fails
                LOGGER.info("Error: {} when downloading {}".format(e, item["link"]))
                continue

            filename = ""
            with zipfile.ZipFile(tempf.name, "r") as f:
                for zipped_file in f.namelist():
                    if os.path.splitext(zipped_file)[1][1:] == "pdf":
                        tempdir = os.path.dirname(tempf.name)
                        f.extract(zipped_file, path=tempdir)
                        filename = os.path.join(tempdir, zipped_file)
                        break

        # If no pdf file has been found in the zip, do not create the node
        if not filename:
            continue

        # Create the document node with the given information
        document_file = DocumentFile(path=filename)
        language_obj = getlang_by_name(item["language"])
        book = DocumentNode(
            title=item["title"],
            source_id=book_id,
            author=item["author"],
            provider=item["publisher"],
            files=[document_file],
            license=get_license(licenses.CC_BY, copyright_holder="StoryWeaver"),
            thumbnail=item.get("thumbnail"),
            description=item["description"],
            domain_ns=domain,
            language=language_obj,
        )
        level_topic.add_child(book)
def build_lang_lookup_table(FEED_ROOT_URL):
    """
    Extracts all the root URLs of the languages, based on the links with facet
    `Languages` in FEED_ROOT_URL.
    """
    OPDS_LANG_ROOTS = {}
    # Check for languages we don't yet support in Kolibri.
    langs_not_found = []

    feed = feedparser.parse(FEED_ROOT_URL)
    lang_links = []
    for link in feed.feed.links:
        if 'opds:facetgroup' in link:
            fg = link['opds:facetgroup']
            if fg == 'Languages':
                lang_links.append(link)

    # Build lookup table lang_code --> dict with info about content in that language,
    # where lang_code is one of the Learning Equality internal language codes defined
    # in le_utils. Assume the chef script will be run on the command line using
    # lang=lang_code. E.g. lang_code for Zulu is `zul`, for Amharic it's `am`, and
    # for Nepali it's `ne-NP`.
    for link in lang_links:
        href = link['href']
        m = _LANG_CODE_RE.search(href)
        if not m:
            raise ValueError('Cannot find language code in href: ' + str(href))
        gdl_lang_code = m.groupdict()['gdl_lang_code']
        lang_title = link['title']
        if lang_title == "isiNdebele seSewula":
            lang_title = "isiNdebele"
        elif lang_title == 'বাঙালি':
            lang_title = 'বাংলা'
        print('Processing lang_title', lang_title)
        #
        # ATTEMPT 1: look up by name, then by native name ##############
        lang_obj = getlang_by_name(lang_title)
        if not lang_obj:
            lang_obj = getlang_by_native_name(lang_title)
        #
        # ATTEMPT 2: look up via pycountry #########
        if not lang_obj:
            pyc_lang = pycountry.languages.lookup(gdl_lang_code)
            code = pyc_lang.alpha_3
            if hasattr(pyc_lang, 'alpha_2'):
                #
                # ATTEMPT 3: prefer the two-letter code if one exists ##############
                code = pyc_lang.alpha_2
            # getlang_by_alpha2 is a misnomer, codes can be alpha2, alpha3, or lang+locale.
            lang_obj = getlang_by_alpha2(code)
            if not lang_obj:
                langs_not_found.append((pyc_lang, lang_title))
                print('ERROR could not find Kolibri lang info for ', pyc_lang)
                continue

        lang_code = lang_obj.code
        OPDS_LANG_ROOTS[lang_code] = dict(
            alpha_3=gdl_lang_code,
            lang_title=lang_title,
            href=href,
            name=lang_obj.name,
            native_name=lang_obj.native_name,
        )

    # For now, make missing languages a hard error so we can evaluate new language
    # support case-by-case.
    if len(langs_not_found) > 0:
        lang_codes = []
        for pyc_lang, lang_title in langs_not_found:
            lang_codes.append(pyc_lang.alpha_3)
        message = "The following languages are not yet supported in Kolibri: {}".format(
            ",".join(lang_codes))
        assert len(langs_not_found) == 0, message

    return OPDS_LANG_ROOTS
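Hypothetical invocation of the lookup-table builder (the feed URL is a placeholder, not the real GDL endpoint):

FEED_ROOT_URL = 'https://example.org/opds/root.atom'  # illustrative URL only
opds_lang_roots = build_lang_lookup_table(FEED_ROOT_URL)
for lang_code, info in opds_lang_roots.items():
    print(lang_code, '-->', info['lang_title'], info['href'])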
def test_unknown_name():
    lang_obj = languages.getlang_by_name('UnknoenLanguage')
    assert lang_obj is None, 'UnknoenLanguage name returned non-None'
class NalibaliChef(JsonTreeChef):

    #region Constants
    HOSTNAME = 'nalibali.org'
    ROOT_URL = f'http://{HOSTNAME}/story-library'
    DATA_DIR = 'chefdata'
    TREES_DATA_DIR = os.path.join(DATA_DIR, 'trees')
    CRAWLING_STAGE_OUTPUT = 'web_resource_tree.json'
    SCRAPING_STAGE_OUTPUT = 'ricecooker_json_tree.json'
    ZIP_FILES_TMP_DIR = os.path.join(DATA_DIR, 'zipfiles')
    LICENSE = get_license(licenses.CC_BY_NC_ND, copyright_holder="Nal'ibali").as_dict()
    ENGLISH_LANGUAGE_CODE = getlang_by_name('English').code
    #endregion Constants

    #region Regexes
    STORY_PAGE_LINK_RE = compile(r'^.+page=(?P<page>\d+)$')
    SUPPORTED_THUMBNAIL_EXTENSIONS = compile(r'\.(png|jpg|jpeg)')
    AUTHOR_RE = compile(r'author:', IgnoreCase)
    AUDIO_STORIES_RE = compile(r'Audio Stories', IgnoreCase)
    AUDIO_STORY_ANCHOR_RE = compile(r'story-library/audio-stories')
    IONO_FM_RE = compile(r'iono.fm')
    RSS_FEED_RE = compile(r'/rss/chan')
    #endregion Regexes

    def __init__(self, html, logger):
        super(NalibaliChef, self).__init__(None, None)
        self._html = html
        self._logger = logger

    #region Helper functions
    def __absolute_url(self, url):
        if url.startswith("//"):
            return "https:" + url
        elif url.startswith("/"):
            return f'http://{NalibaliChef.HOSTNAME}{url}'
        return url

    def __get_text(self, elem):
        return "" if elem is None else elem.get_text().replace('\r', '').replace('\n', ' ').strip()

    def __sanitize_author(self, text):
        if not text:
            return text
        new_text, _ = NalibaliChef.AUTHOR_RE.subn('', text)
        return new_text.strip()

    def __process_language(self, language):
        lang = language.lower()
        if lang == 'sotho':
            return 'Sesotho'
        elif lang == 'ndebele':
            return 'North Ndebele'
        elif lang == 'tsivenda':
            return 'Tshivenda'
        elif lang == 'seswati':
            return 'Siswati'
        elif lang == 'tsw':
            return 'Setswana'
        elif lang == 'continue reading':
            return 'English'
        return language

    def __get_language_code(self, language_str):
        language = getlang_by_name(language_str) or getlang_by_native_name(language_str)
        if language:
            return language.code
        else:
            print('Unknown language:', language_str)
            return NalibaliChef.ENGLISH_LANGUAGE_CODE
    #endregion Helper functions

    #region Crawling
    def crawl(self, args, options):
        root_page = self._html.get(NalibaliChef.ROOT_URL)
        story_hierarchies = self._crawl_story_hierarchies(root_page)
        web_resource_tree = dict(
            kind='NalibaliWebResourceTree',
            title="Nal'ibali Web Resource Tree",
            language='en',
            children=story_hierarchies,
        )
        json_file_name = os.path.join(NalibaliChef.TREES_DATA_DIR, NalibaliChef.CRAWLING_STAGE_OUTPUT)
        with open(json_file_name, 'w') as json_file:
            json.dump(web_resource_tree, json_file, indent=2)
        self._logger.info('Crawling results stored in ' + json_file_name)
        return story_hierarchies

    def _crawl_story_hierarchies(self, page):
        content_div = page.find('div', class_='region-content')
        vocabulary_div = content_div.find('div', class_='view-vocabulary')
        stories_divs = vocabulary_div.find_all('div', 'views-row')
        story_hierarchies = [h for h in map(self._crawl_to_story_hierarchy, stories_divs)]
        stories_dict = dict(map(self._crawl_story_hierarchy, story_hierarchies))
        for h in story_hierarchies:
            stories = stories_dict.get(h['url'], {})
            h['children'] = stories
        return story_hierarchies

    def _crawl_to_story_hierarchy(self, div):
        title = self.__get_text(div.find('h2'))
        image_url = div.find('img', class_='img-responsive')['src']
        body_text = self.__get_text(div.find('div', class_='body'))
        stories_url = self.__absolute_url(
            div.find('div', class_='views-field').find('a', class_='btn link')['href'])
        return dict(
            kind='NalibaliHierarchy',
            title=title,
            thumbnail=image_url,
            description=body_text,
            url=stories_url,
        )

    def _crawl_story_hierarchy(self, hierarchy):
        if NalibaliChef.AUDIO_STORIES_RE.search(hierarchy['title']):
            return self._crawl_audio_stories_hierarchy(hierarchy)
        stories_url = hierarchy['url']
        paginations = self._crawl_pagination(stories_url)
        paginations.insert(0, dict(
            kind='NalibaliPagination',
            url=stories_url,
            page=0,
            name='1',
        ))
        all_stories_by_bucket = list(map(self._crawl_pagination_stories, paginations))
        stories_by_language = {}
        for stories_bucket in all_stories_by_bucket:
            for story in stories_bucket:
                for lang, story in story['supported_languages'].items():
                    by_language = stories_by_language.get(lang)
                    if not by_language:
                        by_language = (set(), [])
                        stories_by_language[lang] = by_language
                    uniques, stories = by_language
                    url = story['url']
                    if url not in uniques:
                        stories.append(story)
                        uniques.add(url)
        for lang, (uniques, stories) in stories_by_language.items():
            stories_by_language[lang] = stories
        return stories_url, stories_by_language

    def _crawl_pagination(self, url):
        page = self._html.get(url)
        pagination_ul = page.find('ul', class_='pagination')
        if not pagination_ul:
            return []
        anchors = pagination_ul.find_all('a', attrs={'href': NalibaliChef.STORY_PAGE_LINK_RE})
        paginations = list(map(self._crawl_to_pagination, anchors))
        paginations_dict = {p['name']: p for p in paginations}
        actual_paginations = [
            p for p in paginations
            if ('next' not in p['name'] and 'last' not in p['name']
                and 'first' not in p['name'] and 'previous' not in p['name']
                and '>' not in p['name'] and '‹' not in p['name']
                and p['name'] != '')
        ]
        last = paginations_dict.get('last')
        # This is to handle the Story Cards hierarchy, since it does not have a <<last>> pagination item
        if not last:
            return actual_paginations
        current_last = actual_paginations[-1]
        if current_last['page'] == last['page']:
            return actual_paginations
        else:
            seen = set()
            return [
                x for x in actual_paginations + self._crawl_pagination(current_last['url'])
                if x['page'] not in seen and not seen.add(x['page'])
            ]

    def _crawl_to_pagination(self, anchor):
        href = anchor['href']
        m = NalibaliChef.STORY_PAGE_LINK_RE.match(href)
        if not m:
            raise Exception('STORY_PAGE_LINK_RE could not match')
        groups = m.groupdict()
        text = self.__get_text(anchor)
        parts = text.split()
        name = parts[0] if len(parts) > 0 else text
        return dict(
            kind='NalibaliPagination',
            url=self.__absolute_url(href),
            page=groups['page'],
            name=name,
        )

    def _crawl_pagination_stories(self, pagination):
        url = pagination['url']
        page = self._html.get(url)
        content_views = page.find_all('div', class_='view-content')
        stories = []
        for content in content_views:
            stories.extend([
                story for story in map(self._crawl_to_story, content.find_all('div', class_='views-row'))
                if story
            ])
        return stories

    def _crawl_to_story(self, div):
        title_elem = div.find('span', property='dc:title')
        title = ''
        if title_elem:
            title = title_elem['content']
        else:
            title_elem = div.find('div', class_='content')
            if not title_elem:
                return None
            title = self.__get_text(title_elem.find('h3'))
        if not title:
            return None
        posted_date = self.__get_text(div.find('div', class_='field-date'))
        author = self.__sanitize_author(self.__get_text(div.find('div', class_='field-author')))
        description = self.__get_text(div.find('div', class_='field-body'))
        links = div.find('div', class_='links')
        anchors = links.find_all('a') if links else []
        image = div.find('img', class_='img-responsive') or div.find('img')
        image_src = image['src'] if image else ''
        thumbnail = image_src.split('?')[0] if NalibaliChef.SUPPORTED_THUMBNAIL_EXTENSIONS.search(image_src) else None
        language_and_hrefs = [
            (self.__process_language(self.__get_text(anchor)), anchor['href'])
            for anchor in anchors
        ]
        story_by_language = {
            language: dict(
                kind='NalibaliLocalizedStory',
                title=title,
                description=description,
                posted_date=posted_date,
                author=author,
                language=language,
                url=self.__absolute_url(href),
                thumbnail=thumbnail,
            )
            for language, href in language_and_hrefs
        }
        return dict(
            kind='NalibaliStory',
            title=title,
            posted_date=posted_date,
            author=author,
            supported_languages=story_by_language,
        )

    def _crawl_audio_stories_hierarchy(self, hierarchy):
        stories_url = hierarchy['url']
        page = self._html.get(stories_url)
        content = page.find('section', id='section-main').find('div', class_='region-content')
        language_info = [
            (self.__process_language(self.__get_text(anchor)), anchor['href'])
            for anchor in content.find_all('a', attrs={'href': NalibaliChef.AUDIO_STORY_ANCHOR_RE})
            if not anchor.get('class') and len(self.__get_text(anchor)) > 2
        ]
        stories_by_language = {}
        for lang, url in language_info:
            language_page = self._html.get(self.__absolute_url(url))
            language_iono_fm_url = language_page.find('a', attrs={'href': NalibaliChef.IONO_FM_RE})['href']
            language_iono_fm_page = self._html.get(language_iono_fm_url)
            rss_url = language_iono_fm_page.find('link', attrs={'href': NalibaliChef.RSS_FEED_RE})['href']
            rss_page = self._html.get_xml(rss_url)
            items = rss_page.find_all('item')
            stories = [None] * len(items)
            for i, item in enumerate(items):
                url = item.enclosure['url'].split('?')[0]
                filename = os.path.basename(url)
                filename_posix = PurePosixPath(filename)
                filename_no_extension = filename_posix.stem
                mp3_url = os.path.join(os.path.dirname(url), filename_no_extension) + '.mp3'
                mp3_version_exists = self._html.head(mp3_url).status_code == 200
                if not mp3_version_exists:
                    raise Exception(f'No mp3 version available for {url}')
                audio_node_url = mp3_url if mp3_version_exists else url
                parsed_url = urlparse(audio_node_url)
                stories[i] = dict(
                    title=self.__get_text(item.title),
                    source_id=parsed_url.path,
                    url=audio_node_url,
                    content_type=item.enclosure['type'],
                    description=self.__get_text(item.summary),
                    pub_date=self.__get_text(item.pubDate),
                    author=self.__get_text(item.author),
                    language=lang,
                    thumbnail=item.thumbnail['href'],
                )
            stories_by_language[lang] = stories
        return stories_url, stories_by_language
    #endregion Crawling

    #region Scraping
    def scrape(self, args, options):
        kwargs = {}  # combined dictionary of argparse args and extra options
        kwargs.update(args)
        kwargs.update(options)
        with open(os.path.join(NalibaliChef.TREES_DATA_DIR, NalibaliChef.CRAWLING_STAGE_OUTPUT), 'r') as json_file:
            web_resource_tree = json.load(json_file)
            assert web_resource_tree['kind'] == 'NalibaliWebResourceTree'
        ricecooker_json_tree = dict(
            source_domain=NalibaliChef.HOSTNAME,
            source_id="nal'ibali",
            title=web_resource_tree['title'],
            description="""Nal'ibali (isiXhosa for "here's the story") is a national reading-for-enjoyment campaign to spark children's potential through storytelling and reading.""",
            language='en',
            thumbnail='http://nalibali.org/sites/default/files/nalibali_logo.png',
            children=[],
        )
        hierarchies_map = {h['title']: h for h in web_resource_tree['children']}
        children = [None] * len(hierarchies_map.keys())
        children[0] = self._scrape_hierarchy(hierarchies_map.get('Multilingual stories'), self._scrape_multilingual_story)
        children[1] = self._scrape_hierarchy(hierarchies_map.get('Audio stories'), self._scrape_audio_story)
        children[2] = self._scrape_hierarchy(hierarchies_map.get('Story cards'), self._scrape_story_card)
        children[3] = self._scrape_hierarchy(hierarchies_map.get('Story seeds'), self._scrape_story_seed)
        children[4] = self._scrape_hierarchy(hierarchies_map.get('Your stories'), self._scrape_your_story)
        ricecooker_json_tree['children'] = children
        write_tree_to_json_tree(
            os.path.join(NalibaliChef.TREES_DATA_DIR, NalibaliChef.SCRAPING_STAGE_OUTPUT),
            ricecooker_json_tree)
        return ricecooker_json_tree

    def _scrape_hierarchy(self, hierarchy, story_scraping_func):
        assert hierarchy['kind'] == 'NalibaliHierarchy'
        items = hierarchy.get('children', {}).items()
        hierarchy_name = hierarchy['title'].replace(' ', '_')
        hierarchy_by_language = [None] * len(items)
        for i, (language, stories) in enumerate(items):
            stories_nodes = [story for story in map(story_scraping_func, stories) if story]
            topic_node = dict(
                kind=content_kinds.TOPIC,
                source_id=f'{hierarchy_name}_{language}',
                title=language,
                description=f'Stories in {language}',
                children=stories_nodes,
            )
            hierarchy_by_language[i] = topic_node
        hierarchy_title = hierarchy['title']
        return dict(
            kind=content_kinds.TOPIC,
            source_id=hierarchy_title,
            title=hierarchy_title,
            description=hierarchy['description'],
            children=hierarchy_by_language,
            thumbnail=hierarchy['thumbnail'],
        )

    def _scrape_multilingual_story(self, story):
        return self._scrape_story_html5(story)

    def _scrape_audio_story(self, story):
        return dict(
            kind=content_kinds.AUDIO,
            source_id=story['source_id'],
            title=story['title'],
            license=NalibaliChef.LICENSE,
            author=story['author'],
            description=story['description'],
            domain_ns=NalibaliChef.HOSTNAME,
            thumbnail=story['thumbnail'],
            files=[dict(
                file_type=content_kinds.AUDIO,
                path=story['url'],
                language=self.__get_language_code(story['language']),
            )])

    def _scrape_story_card(self, story):
        url = story['url']
        language_str = story['language']
        lang_code = self.__get_language_code(language_str)
        if url and url.endswith('.pdf'):
            parsed_url = urlparse(url)
            return dict(
                source_id=parsed_url.path,
                kind=content_kinds.DOCUMENT,
                title=story['title'],
                description=story['description'],
                license=NalibaliChef.LICENSE,
                author=story['author'],
                thumbnail=story['thumbnail'],
                language=lang_code,
                files=[dict(
                    file_type=content_kinds.DOCUMENT,
                    path=url,
                )])
        raise Exception('Non-PDF version not implemented')

    def _scrape_story_seed(self, story):
        return self._scrape_story_html5(story)

    def _scrape_your_story(self, story):
        return self._scrape_story_html5(story)

    def _scrape_download_image(self, base_path, img):
        url = img['src']
        if not url:
            return
        if url.startswith('http') or url.startswith('https'):
            absolute_url = url
            parsed_url = urlparse(url)
            relative_url = parsed_url.path
        else:
            absolute_url = self.__absolute_url(url)
            relative_url = url
        self._scrape_download_image_helper(base_path, img, absolute_url, relative_url)

    def _scrape_download_image_helper(self, base_path, img, absolute_url, relative_url):
        image_response = self._html.get_image(absolute_url)
        if image_response.status_code != 200:
            return
        filename = os.path.basename(relative_url)
        subdirs = os.path.dirname(relative_url).split('/')
        image_dir = os.path.join(base_path, *subdirs)
        pathlib.Path(image_dir).mkdir(parents=True, exist_ok=True)
        image_path = os.path.join(image_dir, filename)
        with open(image_path, 'wb') as f:
            image_response.raw.decode_content = True
            shutil.copyfileobj(image_response.raw, f)
        img['src'] = relative_url[1:] if relative_url[0] == '/' else relative_url

    def _scrape_story_html5(self, story):
        url = story['url']
        page = self._html.get(url)
        story_section = page.find('section', id='section-main')
        links_section = story_section.find('div', class_='languages-links')
        # Is there a way to cross link HTML5AppNode?
        if links_section:
            links_section.extract()
        title = self.__get_text(story_section.find('h1', class_='page-header'))
        language_code = self.__get_language_code(story['language'])
        dest_path = tempfile.mkdtemp(dir=NalibaliChef.ZIP_FILES_TMP_DIR)
        for img in story_section.find_all('img'):
            self._scrape_download_image(dest_path, img)
        basic_page_str = """
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title></title>
</head>
<body>
</body>
</html>"""
        basic_page = BeautifulSoup(basic_page_str, "html.parser")
        body = basic_page.find('body')
        body.append(story_section)
        with open(os.path.join(dest_path, 'index.html'), 'w', encoding="utf8") as index_html:
            index_html.write(str(basic_page))
        zip_path = create_predictable_zip(dest_path)
        parsed_story_url = urlparse(url)
        return dict(
            kind=content_kinds.HTML5,
            source_id=parsed_story_url.path if parsed_story_url else url,
            title=title,
            language=language_code,
            description=story['description'],
            license=NalibaliChef.LICENSE,
            thumbnail=story['thumbnail'],
            files=[dict(
                file_type=content_kinds.HTML5,
                path=zip_path,
                language=language_code,
            )],
        )
    #endregion Scraping

    def pre_run(self, args, options):
        self.crawl(args, options)
        self.scrape(args, options)
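A rough driver sketch for the chef above. The `HtmlSession` helper is a reconstruction inferred from the methods the chef calls (get, get_xml, get_image, head); the real project presumably supplies its own implementation:

import logging

import requests
from bs4 import BeautifulSoup


class HtmlSession:
    """Hypothetical helper exposing the interface NalibaliChef expects."""

    def __init__(self):
        self._session = requests.Session()

    def get(self, url):
        return BeautifulSoup(self._session.get(url).content, 'html.parser')

    def get_xml(self, url):
        return BeautifulSoup(self._session.get(url).content, 'xml')

    def get_image(self, url):
        return self._session.get(url, stream=True)

    def head(self, url):
        return self._session.head(url)


if __name__ == '__main__':
    chef = NalibaliChef(HtmlSession(), logging.getLogger('nalibali'))
    chef.main()  # standard ricecooker chef entry point; runs pre_run (crawl + scrape) before upload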
def test_language_names_with_modifier_after_comma():
    # try to match based on language name (stuff before the comma)
    lang_obj = languages.getlang_by_name('Arabic, Tunisian')
    assert lang_obj is not None, 'Arabic fallback not found'
    assert lang_obj.code == "ar", 'Wrong internal repr. code'
    assert lang_obj.name == "Arabic", 'Wrong name'
def upload_content(self, data, access_token, channel):
    for language, language_value in data.items():
        # apply title case to node titles
        language = language.title()
        language_node = nodes.TopicNode(
            title=language,
            source_id=language,
            author="TicTacLearn",
            description='',
            thumbnail=TTL_MAIN_LOGO,
            language=getlang_by_name(language))
        for grade, grade_value in language_value.items():
            grade_node = nodes.TopicNode(
                title='Grade {}'.format(grade),
                source_id="{}-{}".format(language, grade),
                author="TicTacLearn",
                description='',
                thumbnail=TTL_MAIN_LOGO,
                language=getlang_by_name(language))
            for subject, subject_value in grade_value.items():
                subject = subject.title()
                subject_node = nodes.TopicNode(
                    title=subject,
                    source_id="{}-{}-{}".format(language, grade, subject),
                    author="TicTacLearn",
                    description='',
                    thumbnail=TTL_MAIN_LOGO,
                    language=getlang_by_name(language))
                for chapter, chapter_value in subject_value.items():
                    chapter = chapter.title()
                    chapter_node = nodes.TopicNode(
                        title=chapter,
                        source_id="{}-{}-{}-{}".format(language, grade, subject, chapter),
                        author="TicTacLearn",
                        description='',
                        thumbnail=TTL_MAIN_LOGO,
                        language=getlang_by_name(language))
                    for topic, topic_value in chapter_value.items():
                        topic = topic.title()
                        if topic == "Chapter Assessment":
                            questions = self.create_question(topic_value.items())
                            exercise_node = nodes.ExerciseNode(
                                source_id="{}-{}-{}-{}-{}".format(language, grade, subject, chapter, topic),
                                title=topic,
                                author="TicTacLearn",
                                description="Chapter Assessment",
                                language=getlang_by_name(language),
                                license=licenses.CC_BYLicense("TicTacLearn"),
                                thumbnail=TTL_MAIN_LOGO,
                                exercise_data={
                                    "mastery_model": exercises.M_OF_N,
                                    "m": len(questions),
                                    "n": len(questions),
                                    "randomize": True
                                },
                                questions=questions)
                            chapter_node.add_child(exercise_node)
                        else:
                            topic_node = nodes.TopicNode(
                                title=topic,
                                source_id="{}-{}-{}-{}-{}".format(language, grade, subject, chapter, topic),
                                author="TicTacLearn",
                                description='',
                                thumbnail=TTL_MAIN_LOGO,
                                language=getlang_by_name(language))
                            for content_type, content in topic_value.items():
                                if content_type == "video":
                                    for link, details in content.items():
                                        try:
                                            video_node = self.video_node_from_dropbox(details, link, access_token)
                                            topic_node.add_child(video_node)
                                        except Exception as e:
                                            print(e)
                                            print("Error getting video from dropbox with link: {}".format(link))
                                            self.add_to_failed(link, details, content_type)
                                            continue
                                else:  # content type is assessment
                                    questions = self.create_question(content.items())
                                    exercise_node = nodes.ExerciseNode(
                                        source_id="{}-{}-{}-{}-{}-Assessment".format(language, grade, subject, chapter, topic),
                                        title="{} Assessment".format(topic),
                                        author="TicTacLearn",
                                        description="{} Assessment".format(topic),
                                        license=licenses.CC_BYLicense("TicTacLearn"),
                                        thumbnail=TTL_MAIN_LOGO,
                                        exercise_data={
                                            "mastery_model": exercises.M_OF_N,
                                            "m": len(questions),
                                            "n": len(questions),
                                            "randomize": True
                                        },
                                        questions=questions)
                                    topic_node.add_child(exercise_node)
                            chapter_node.add_child(topic_node)
                    subject_node.add_child(chapter_node)
                grade_node.add_child(subject_node)
            language_node.add_child(grade_node)
        channel.add_child(language_node)
    return channel
lang_tag_to_lang_dict = {}

for le_code, lang_obj in _LANGLOOKUP.items():
    lang_tag_to_lang_dict[le_code] = dict(
        lang_tag=le_code,
        name=lang_obj.name.split(';')[0],
        le_code=le_code,
    )

for item in LANGS_LOOKUP:
    lang_tag = item['lang']
    name = repr(item['pattern']).replace("re.compile('.*", "").replace(".*')", "")
    lang_obj = getlang(lang_tag)
    if lang_obj is None:
        lang_obj = getlang_by_name(name)
    lang_tag_to_lang_dict[lang_tag] = dict(
        lang_tag=lang_tag,
        name=name,
        le_code=lang_obj.code if lang_obj else None,
    )


# Export as JSON
################################################################################

projects_tree = {}
for project_name, project_langs in projects.items():