def test_youtube_edgecases_alpha2_codes(simplified_chinese_codes, traditional_chinese_codes, taiwan_chinese_codes):
    """Check alpha-2 edge cases YouTube emits: legacy Hebrew `iw` and Chinese variants."""
    # The obsolete YouTube code `iw` must still resolve to modern Hebrew.
    hebrew = languages.getlang_by_alpha2('iw')
    assert hebrew is not None, 'Hebrew not found'
    assert hebrew.code == "he", 'Wrong code'
    assert hebrew.name == "Hebrew (modern)", 'Wrong name'
    assert hebrew.native_name == "עברית", 'Wrong native_name'

    # Each group of external codes must collapse onto a single internal language entry.
    groups = [
        (simplified_chinese_codes, 'Simplified Chinese', "zh-CN", "Chinese, Simplified", "中国大陆"),
        (traditional_chinese_codes, 'Traditional Chinese', "zh-Hant", "Chinese, Traditional", "漢語 (繁體字)"),
        (taiwan_chinese_codes, 'Taiwan Chinese', "zh-TW", "Chinese, Taiwan", "漢語 (臺灣)"),
    ]
    for codes, label, expected_code, expected_name, expected_native in groups:
        for alpha2 in codes:
            found = languages.getlang_by_alpha2(alpha2)
            assert found is not None, label + ' not found'
            assert found.code == expected_code, 'Wrong internal repr. code'
            assert found.name == expected_name, 'Wrong name'
            assert found.native_name == expected_native, 'Wrong native_name'
def fetch_video(video):
    """Build a VideoNode (plus available subtitle files) from a youtube-dl info dict."""
    youtube_id = video['id']
    print(" Fetching video data: %s (%s)" % (video['title'], video['webpage_url']))

    video_node = nodes.VideoNode(
        source_id=youtube_id,
        title=truncate_metadata(video['title']),
        license=LICENSE,
        description=truncate_description(video['description']),
        derive_thumbnail=True,
        language="en",
        files=[files.YouTubeVideoFile(youtube_id=youtube_id)],
    )

    # Add subtitles in whichever languages are available.
    for language in video['subtitles'].keys():
        # TODO(david): Should catch exception thrown by
        # files.YouTubeSubtitleFile rather than breaking abstraction.
        recognized = languages.getlang(language) or languages.getlang_by_alpha2(language)
        if recognized:
            subtitle_file = files.YouTubeSubtitleFile(youtube_id=youtube_id, language=language)
            video_node.add_file(subtitle_file)
        else:
            print("WARNING: Subtitle language %s not found in languages file" % language)

    return video_node
def test_known_alpha2_codes():
    """getlang_by_alpha2 resolves common two-letter codes to the expected entries."""
    # (alpha2, expected code, name, native_name, not-found msg, wrong-code msg)
    cases = [
        ('en', "en", "English", "English", 'English not found', 'Wrong code'),
        ('zu', "zul", "Zulu", "isiZulu", 'Zulu not found', 'Wrong internal repr. code'),
        ('pt', "pt", "Portuguese", "Português", 'Portuguese not found', 'Wrong code'),
    ]
    for alpha2, code, name, native, missing_msg, code_msg in cases:
        found = languages.getlang_by_alpha2(alpha2)
        assert found is not None, missing_msg
        assert found.code == code, code_msg
        assert found.name == name, 'Wrong name'
        assert found.native_name == native, 'Wrong native_name'
def test_known_alpha2_codes():
    """Sanity-check alpha-2 lookups for English, Zulu, and Portuguese."""
    english = languages.getlang_by_alpha2("en")
    assert english is not None, "English not found"
    assert english.code == "en", "Wrong code"
    assert english.name == "English", "Wrong name"
    assert english.native_name == "English", "Wrong native_name"

    # Zulu maps to the three-letter internal code `zul`.
    zulu = languages.getlang_by_alpha2("zu")
    assert zulu is not None, "Zulu not found"
    assert zulu.code == "zul", "Wrong internal repr. code"
    assert zulu.name == "Zulu", "Wrong name"
    assert zulu.native_name == "isiZulu", "Wrong native_name"

    portuguese = languages.getlang_by_alpha2("pt")
    assert portuguese is not None, "Portuguese not found"
    assert portuguese.code == "pt", "Wrong code"
    assert portuguese.name == "Portuguese", "Wrong name"
    assert portuguese.native_name == "Português", "Wrong native_name"
def __init__(self, youtube_id, language=None, **kwargs):
    """
    Create a subtitle file for the YouTube video `youtube_id` in `language`.

    Args:
        youtube_id (str): YouTube video id the subtitles belong to.
        language (str or languages.Language): YouTube language code (may differ
            from the internal le-utils representation); a Language object is
            accepted for backward compatibility.

    Raises:
        ValueError: if `language` cannot be resolved via either the internal
            language codes or the alpha-2 fallback lookup.
    """
    self.youtube_url = 'http://www.youtube.com/watch?v={}'.format(youtube_id)
    if isinstance(language, languages.Language):
        # for backward compatibility: accept a Language object as well as a code
        language = language.code
    self.youtube_language = language  # youtube language code (can differ from internal repr.)
    # 1. try to look up `language` in the internal representation
    language_obj = languages.getlang(language)
    if language_obj is None:
        # 2. fall back to matching by two-letter ISO code
        language_obj = languages.getlang_by_alpha2(language)
    if language_obj is None:
        # Previously this fell through to `None.code` and died with an opaque
        # AttributeError; fail loudly with the offending code instead.
        raise ValueError("Unrecognized language code: {}".format(language))
    language = language_obj.code  # update `language` argument from internal repr. language_id
    super(YouTubeSubtitleFile, self).__init__(language=language, **kwargs)
    assert self.language, "Subtitles must have a language"
def download_and_transform_file(self, path):
    """
    Download subtitles file at `path` and transform it to `.vtt` if necessary.
    Args:
        path (URL or local path)
    Returns: filename of final .vtt file
    """
    # Cache key for this download; skip work entirely on a warm, fresh cache.
    key = "DOWNLOAD:{}".format(path)
    cache_file = get_cache_filename(key)
    if not config.UPDATE and not cache_is_outdated(path, cache_file):
        return cache_file
    config.LOGGER.info("\tDownloading {}".format(path))

    # Two scratch files: the raw download, and the converted .vtt output.
    with tempfile.NamedTemporaryFile() as temp_in_file,\
            tempfile.NamedTemporaryFile() as temp_out_file:
        write_and_get_hash(path, temp_in_file)
        temp_in_file.seek(0)  # rewind so the converter reads from the start

        converter = build_subtitle_converter_from_file(
            temp_in_file.name, self.subtitlesformat)
        # We'll assume the provided file is in the passed language in this case
        if len(converter.get_language_codes()) == 1 \
                and converter.has_language(LANGUAGE_CODE_UNKNOWN):
            converter.replace_unknown_language(self.language)

        convert_lang_code = self.language
        # Language is not present, let's try different codes
        if not converter.has_language(self.language):
            for lang_code in converter.get_language_codes():
                # Map each embedded code to internal repr. and compare to ours.
                language = languages.getlang_by_alpha2(lang_code)
                if language and language.code == self.language:
                    convert_lang_code = lang_code
                    break
            else:
                # for/else: no embedded code mapped to self.language
                raise InvalidSubtitleLanguageError(
                    "Missing language '{}' in subtitle file".format(
                        self.language))

        converter.write(temp_out_file.name, convert_lang_code)
        temp_out_file.seek(0)
        file_hash = get_hash(temp_out_file.name)
        # Content-addressed filename: <hash>.vtt
        filename = '{0}.{ext}'.format(file_hash, ext=file_formats.VTT)
        temp_out_file.seek(0)
        copy_file_to_storage(filename, temp_out_file)
        # Record the result so subsequent calls hit the cache-check above.
        FILECACHE.set(key, bytes(filename, "utf-8"))
        return filename
def _get_language_with_alpha2_fallback(language_code):
    """
    Resolve `language_code` (string) to a le-utils Language object.

    Tries the internal language codes first; if that fails, falls back to
    matching by two-letter ISO code via the `getlang_by_alpha2` helper.
    Returns a le-utils Language object, or None if both lookups fail.
    """
    # 1. lookup using the internal representation
    lang_obj = languages.getlang(language_code)
    if lang_obj is not None:
        return lang_obj
    # 2. fall back to matching by two-letter ISO code
    return languages.getlang_by_alpha2(language_code)
def test_youtube_edgecases_alpha2_codes(simplified_chinese_codes, traditional_chinese_codes):
    """Legacy Hebrew `iw` plus Simplified/Traditional Chinese alpha-2 lookups."""

    def check(alpha2, missing_msg, code, code_msg, name, native_name):
        # Shared assertion bundle for one alpha-2 lookup.
        obj = languages.getlang_by_alpha2(alpha2)
        assert obj is not None, missing_msg
        assert obj.code == code, code_msg
        assert obj.name == name, 'Wrong name'
        assert obj.native_name == native_name, 'Wrong native_name'

    # check old language code for Hebrew works `iw`
    check('iw', 'Hebrew not found', "he", 'Wrong code', "Hebrew (modern)", "עברית")

    # Check all Simplified Chinese codes are resolved correctly to zh-CN
    for lang_code in simplified_chinese_codes:
        check(lang_code, 'Simplified Chinese not found', "zh-CN",
              'Wrong internal repr. code', "Chinese, Simplified", "中国大陆")

    # Check all Traditional Chinese codes are resolved correctly to zh-TW
    for lang_code in traditional_chinese_codes:
        check(lang_code, 'Traditional Chinese not found', "zh-TW",
              'Wrong internal repr. code', "Chinese, Traditional", "正體字/繁體字")
def __init__(self, lang, *args, **kwargs):
    """Record the target language and build channel metadata before base init."""
    self.lang_id = lang
    self.lang_data = languages.getlang_by_alpha2(self.lang_id)
    # Code in __init__ may call get_channel, which expects channel_info to be
    # defined, so we do the initialization here.
    native = self.lang_data.native_name
    self.channel_info = {
        # FIXME: Translate language titles
        'CHANNEL_TITLE': 'ProFuturo ({})'.format(native),
        'CHANNEL_SOURCE_DOMAIN': 'profuturo.education',   # where you got the content
        'CHANNEL_SOURCE_ID': 'profuturo-' + self.lang_id, # channel's unique id
        'CHANNEL_LANGUAGE': self.lang_id,                 # le_utils language code
        'CHANNEL_DESCRIPTION': '',                        # (optional)
    }
    super(ProFuturoChef, self).__init__(*args, **kwargs)
def test_unknown_alpha2_code():
    """getlang_by_alpha2 returns None for an unrecognized alpha-2 code."""
    lang_obj = languages.getlang_by_alpha2('zz')
    # Fixed typo in the failure message ('Uknown' -> 'Unknown').
    assert lang_obj is None, 'Unknown code zz returned non-None'
def subtitle_upload(request):
    """
    Django view: accept an uploaded subtitle file, convert it to VTT, and save
    a File record for the requesting user.

    NOTE(review): uses `unicode(...)` and subscripts `request.FILES.values()[0]`,
    so this is Python 2 code — verify before porting to Python 3.
    Expects HTTP_LANGUAGE and HTTP_PRESET headers and one uploaded file.
    Returns JSON with the serialized File on success, or 400 on conversion /
    language-mismatch errors.
    """
    # File will be converted to VTT format
    ext = file_formats.VTT
    language_id = request.META.get('HTTP_LANGUAGE')
    content_file = request.FILES.values()[0]

    with NamedTemporaryFile() as temp_file:
        try:
            converter = build_subtitle_converter(
                unicode(content_file.read(), 'utf-8'))
            convert_language_code = language_id

            # We're making the assumption here that language the user selected is truly the caption
            # file's language if it's unknown
            if len(converter.get_language_codes()) == 1 \
                    and converter.has_language(LANGUAGE_CODE_UNKNOWN):
                converter.replace_unknown_language(language_id)

            # determine if the request language exists by another code, otherwise we can't continue
            if not converter.has_language(convert_language_code):
                for language_code in converter.get_language_codes():
                    # Map each embedded code to internal repr. and compare to the request language.
                    language = getlang_by_alpha2(language_code)
                    if language and language.code == language_id:
                        convert_language_code = language_code
                        break
                else:
                    # for/else: no embedded code matched the requested language
                    return HttpResponseBadRequest(
                        "Language '{}' not present in subtitle file".format(
                            language_id))

            converter.write(temp_file.name, convert_language_code)
        except InvalidSubtitleFormatError as ex:
            return HttpResponseBadRequest(
                "Subtitle conversion failed: {}".format(ex))

        temp_file.seek(0)  # rewind so the converted output can be read back
        converted_file = DjFile(temp_file)
        checksum = get_hash(converted_file)
        size = converted_file.size
        # May raise if the user is over quota — enforced before saving.
        request.user.check_space(size, checksum)

        file_object = File(
            file_size=size,
            file_on_disk=converted_file,
            checksum=checksum,
            file_format_id=ext,
            # NOTE(review): re-reads request.FILES.values()[0] — same file as
            # `content_file` above, accessed a second time for its name.
            original_filename=request.FILES.values()[0]._name,
            preset_id=request.META.get('HTTP_PRESET'),
            language_id=language_id,
            uploaded_by=request.user,
        )
        file_object.save()

        return HttpResponse(
            json.dumps({
                "success": True,
                "filename": str(file_object),
                "file": JSONRenderer().render(FileSerializer(file_object).data)
            }))
def build_lang_lookup_table(FEED_ROOT_URL):
    """
    Extracts all the root URLs of the languages, based on the links with
    facet `Languages` in FEED_ROOT_URL.

    Returns a dict mapping internal (le-utils) language codes to a dict with
    info about the content in that language (alpha_3, lang_title, href, name,
    native_name). Raises ValueError if a language link has no parsable code,
    and asserts (hard error) if any language is not supported in Kolibri.
    """
    OPDS_LANG_ROOTS = {}
    # Check for languages we don't yet support in Kolibri.
    langs_not_found = []
    feed = feedparser.parse(FEED_ROOT_URL)
    # Collect only the links belonging to the `Languages` facet group.
    lang_links = []
    for link in feed.feed.links:
        if 'opds:facetgroup' in link:
            fg = link['opds:facetgroup']
            if fg == 'Languages':
                lang_links.append(link)

    # Build lookup table lang_code --> dict with info about content in that language
    # where lang_code is the Learning Equality internal language codes defined in le_utils
    # Assume the chef script will be run on the command line using lang=lang_code
    # E.g. lang_code for Zulu is `zul`, for Amharic it's `am`, and for Nepali it's `ne-NP`
    for link in lang_links:
        href = link['href']
        m = _LANG_CODE_RE.search(href)
        if not m:
            raise ValueError('Cannot find language code in href: ' + str(href))
        gdl_lang_code = m.groupdict()['gdl_lang_code']
        lang_title = link['title']
        # Normalize a couple of feed titles that don't match le-utils names.
        if lang_title == "isiNdebele seSewula":
            lang_title = "isiNdebele"
        elif lang_title == 'বাঙালি':
            lang_title = 'বাংলা'
        print('Processig lang_title', lang_title)
        #
        # ATTEMPT 1: look up by English name, then by native name ##############
        lang_obj = getlang_by_name(lang_title)
        if not lang_obj:
            lang_obj = getlang_by_native_name(lang_title)
        #
        # ATTEMPT 2: fall back to pycountry lookup on the feed's code #########
        if not lang_obj:
            pyc_lang = pycountry.languages.lookup(gdl_lang_code)
            code = pyc_lang.alpha_3
            if hasattr(pyc_lang, 'alpha_2'):
                #
                # ATTEMPT 3: prefer the two-letter code when one exists ########
                code = pyc_lang.alpha_2
            # getlang_by_alpha2 is a misnomer, codes can be alpha2, alpha3, or lang+locale.
            lang_obj = getlang_by_alpha2(code)
            if not lang_obj:
                # All lookups failed: record it and skip this language link.
                langs_not_found.append((pyc_lang, lang_title))
                print('ERROR could not find Kolibri lang info for ', pyc_lang)
                continue
        lang_code = lang_obj.code
        OPDS_LANG_ROOTS[lang_code] = dict(
            alpha_3=gdl_lang_code,
            lang_title=lang_title,
            href=href,
            name=lang_obj.name,
            native_name=lang_obj.native_name,
        )

    # For now, make missing languages a hard error so we can evaluate new language support case-by-case.
    if len(langs_not_found) > 0:
        lang_codes = []
        for pyc_lang, lang_title in langs_not_found:
            lang_codes.append(pyc_lang.alpha_3)
        message = "The following languages are not yet supported in Kolibri: {}".format(
            ",".join(lang_codes))
        assert len(langs_not_found) == 0, message
    return OPDS_LANG_ROOTS