def to_posix_string(lang_code):
    if not lang_code:
        return None
    # Normalize fra-ca to fr_CA
    lang_code = use_underscore(lang_code)
    if '_' in lang_code:
        # ISO format, must convert to POSIX format
        lang, country = lang_code.split('_')[:2]
    else:
        lang, country = lang_code, None
    if is_valid639_1(lang):
        posix_lang = lang
    elif is_valid639_2(lang):
        temp = to_iso639_1(lang)
        posix_lang = temp if temp else lang
    else:
        # Last resort: treat the input as a full language name, e.g. "french"
        full_name = lang.lower().capitalize()
        if is_valid639_2(full_name):
            posix_lang = to_iso639_1(full_name)
        else:
            raise ValueError(
                "The input %s is not a valid code to convert to POSIX format"
                % (full_name,))
    if country:
        return '_'.join([posix_lang.lower(), country.upper()])
    else:
        return posix_lang.lower()
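# Usage sketch for to_posix_string. Assumes the iso-639 package helpers
# (is_valid639_1, is_valid639_2, to_iso639_1) behave as documented and that
# use_underscore, a helper from the same module, maps 'fra-ca' to 'fra_ca'.
assert to_posix_string('fra-ca') == 'fr_CA'  # 639-2 code plus country
assert to_posix_string('en') == 'en'         # already a valid 639-1 code
assert to_posix_string(None) is None         # empty input short-circuits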
def createTranslation(text, language):
    translate_client = translate.Client()
    try:
        # Normalize e.g. 'spa' to 'es'; keep the original code if the
        # conversion fails rather than retrying the same call.
        language = iso.to_iso639_1(language)
    except iso.NonExistentLanguageError:
        pass
    try:
        translation = translate_client.translate(text, target_language=language)
    except Exception:
        return 'Unfortunately, that language is not supported.'
    return translation['translatedText']
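# Hypothetical usage; requires google-cloud-translate credentials in the
# environment. 'spa' is normalized to 'es' before the API call, and an
# unsupported target language yields the fallback message instead of raising.
print(createTranslation('Hello, world!', 'spa'))  # prints a Spanish translation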
def convert_review_language_dictionary_to_iso(language_dict):
    language_iso_dict = dict()
    languages = set(r['tag'] for r in language_dict.values())
    for language in languages:
        try:
            language_iso = iso639.to_iso639_1(language)
        except iso639.NonExistentLanguageError:
            # Steam-specific tags that are not ISO 639 codes
            if language == 'schinese' or language == 'tchinese':
                language_iso = 'zh-cn'
            elif language == 'brazilian':
                language_iso = 'pt'
            elif language == 'koreana':
                language_iso = 'ko'
            else:
                print('Missing language:' + language)
                detected_languages = [
                    r['detected'] for r in language_dict.values()
                    if r['tag'] == language
                ]
                print(detected_languages)
                language_iso = most_common(detected_languages)
                print('Most common match among detected languages: ' + language_iso)
        language_iso_dict[language] = language_iso
    return language_iso_dict
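# A minimal sketch with Steam-style review tags (illustrative data).
# 'schinese' hits the hard-coded mapping; 'english' is either resolved by the
# iso-639 package directly or, failing that, via most_common over the
# detected languages -- both paths yield 'en' here.
reviews = {
    'r1': {'tag': 'schinese', 'detected': 'zh-cn'},
    'r2': {'tag': 'english', 'detected': 'en'},
}
assert convert_review_language_dictionary_to_iso(reviews) == {
    'schinese': 'zh-cn',
    'english': 'en',
}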
def to_2_letter_lang(lang):
    if len(lang) == 2:
        if iso639.is_valid639_1(lang):
            return lang
    if len(lang) == 3:
        if iso639.is_valid639_2(lang):
            return iso639.to_iso639_1(lang)
    return False
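# Expected behavior, assuming the iso-639 package:
assert to_2_letter_lang('en') == 'en'    # valid 639-1 passes through
assert to_2_letter_lang('deu') == 'de'   # valid 639-2 converts down
assert to_2_letter_lang('zz') is False   # unknown codes yield False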
def make_ebook(self) -> None:
    """
    Combines everything to make an ePub book.
    """
    book = EpubBook()
    book.set_identifier(str(uuid4()))
    book.set_title(self.metadata.title)
    book.set_language(to_iso639_1(self.metadata.language))
    book.add_author(self.metadata.author.name)
    nav = EpubNav()
    ncx = EpubNcx()
    book.add_item(ncx)
    book.add_item(nav)
    current_chapters = (
        [
            x
            for x in self.book.get_items_of_type(9)
            if x.is_chapter() and x.file_name.startswith("chapter")
        ]
        if self.book
        else []
    )
    book.toc = [x for x in self.step_through_chapters(current_chapters)]
    cover = self.get_cover()
    book.set_cover("cover.jpg", cover)
    template = Template(filename=str(self.datasource / "title.mako"))
    title_page = EpubHtml(
        title=self.metadata.title,
        file_name="title.xhtml",
        uid="title",
        content=template.render(story=self.metadata),
    )
    for s in self.styles:
        title_page.add_item(s)
        book.add_item(s)
    book.add_item(title_page)
    book.spine = ["cover", title_page]
    for c in book.toc:
        book.add_item(c)
        book.spine.append(c)
    book.spine.append(nav)
    self._write(book)
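# Side note, offered as a sketch: the magic number 9 passed to
# get_items_of_type above is ebooklib's ITEM_DOCUMENT constant, so the
# chapter filter can be written against the named constant instead.
import ebooklib

def document_chapters(book):
    """Return chapter documents from an ebooklib EpubBook."""
    return [
        item for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
        if item.is_chapter() and item.file_name.startswith("chapter")
    ]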
def to_posix_string(locale_code):
    if not locale_code:
        return None
    # Normalize fra-ca to fr_CA
    locale_code = use_underscore(locale_code)
    locale_parts = locale_code.split("_")
    # Normalize first component
    lang = locale_parts[0]
    if is_valid639_1(lang):
        posix_lang = lang
    elif is_valid639_2(lang):
        temp = to_iso639_1(lang)
        posix_lang = temp if temp else lang
    else:
        # Last resort: treat the input as a full language name, e.g. "french"
        full_name = lang.lower().capitalize()
        if is_valid639_2(full_name):
            posix_lang = to_iso639_1(full_name)
        else:
            raise ValueError(
                "The input %s is not a valid code to convert to POSIX format"
                % (locale_code,))
    locale_parts[0] = posix_lang
    if len(locale_parts) > 4:
        raise ValueError("This locale has too many parts: " + locale_code)
    elif len(locale_parts) == 4:
        # Drop dialect. Sorry.
        locale_parts.pop()
    if len(locale_parts) > 1:
        # Normalize country
        if len(locale_parts[-1]) == 2:
            locale_parts[-1] = locale_parts[-1].upper()
        elif len(locale_parts[-1]) != 4:
            raise ValueError(
                "The last part is not a script or country: " + locale_code)
        # Normalize script
        if len(locale_parts[1]) == 4:
            locale_parts[1] = locale_parts[1].capitalize()
    return "_".join(locale_parts)
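# Usage sketch covering script and country normalization. Assumes
# use_underscore maps hyphens to underscores, as above.
assert to_posix_string('zh-hans-cn') == 'zh_Hans_CN'  # language + script + country
assert to_posix_string('fr_ca') == 'fr_CA'            # country upper-cased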
def language_iso639_2to1(lang):
    """Convert a bibliographic language to alpha2.

    :param lang: bibliographic language code
    :returns: language (alpha2)
    """
    default_ln = current_i18n.babel.default_locale.language
    try:
        ln = iso639.to_iso639_1(lang)
    except iso639.NonExistentLanguageError:
        return default_ln
    supported_languages = [v[0] for v in current_i18n.get_languages()]
    return ln if ln in supported_languages else default_ln
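# Hypothetical call; only meaningful inside a Flask/Invenio application
# context where current_i18n is configured. 'fre' is the bibliographic
# ISO 639-2 code for French; unknown codes fall back to the default locale.
# Not runnable standalone:
#     language_iso639_2to1('fre')  # -> 'fr' when French is a supported language
#     language_iso639_2to1('xyz')  # -> the default locale's language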
def get_language_details(iso_639_3):
    """ Dict containing the ISO 639-1 code, name and native name for an ISO 639-3 code """
    non_iso_langs = {
        "zh-Hans": {
            "code": "zh-Hans",
            "iso-639-1": "zh",
            "english": "Simplified Chinese",
            "native": "简化字",
        },
        "zh-Hant": {
            "code": "zh-Hant",
            "iso-639-1": "zh",
            "english": "Traditional Chinese",
            "native": "正體字",
        },
        "iw": {
            "code": "iw",
            "iso-639-1": "he",
            "english": "Hebrew",
            "native": "עברית",
        },
        "es-419": {
            "code": "es-419",
            "iso-639-1": "es-419",
            "english": "Spanish",
            "native": "Español",
        },
        "multi": {
            "code": "mul",
            "iso-639-1": "en",
            "english": "Multiple Languages",
            "native": "Multiple Languages",
        },
    }
    try:
        return (
            non_iso_langs.get(iso_639_3)
            if iso_639_3 in non_iso_langs
            else {
                "code": iso_639_3,
                "iso-639-1": iso639.to_iso639_1(iso_639_3),
                "english": iso639.to_name(iso_639_3),
                "native": iso639.to_native(iso_639_3),
            }
        )
    except iso639.NonExistentLanguageError:
        # Unknown code: echo it back under every key of the same schema
        return {
            "code": iso_639_3,
            "iso-639-1": iso_639_3,
            "english": iso_639_3,
            "native": iso_639_3,
        }
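# Usage sketch, assuming the iso-639 package's to_name/to_native lookups:
details = get_language_details('fra')
assert details['iso-639-1'] == 'fr'
assert details['english'] == 'French'
assert get_language_details('zh-Hans')['native'] == '简化字'  # hard-coded entry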
def to_posix_format(lang_code):
    if not lang_code:
        return None
    # Normalize fra-ca to fr_CA
    lang_code = use_underscore(lang_code)
    if '_' in lang_code:
        # ISO format, must convert to POSIX format
        lang, country = lang_code.split('_')[:2]
    else:
        lang, country = lang_code, None
    if is_valid639_1(lang):
        posix_lang = lang
    elif is_valid639_2(lang):
        posix_lang = to_iso639_1(lang)
    else:
        # Last resort: treat the input as a full language name, e.g. "french"
        full_name = lang.lower().capitalize()
        if is_valid639_2(full_name):
            posix_lang = to_iso639_1(full_name)
        else:
            return None
    if country:
        return '_'.join([posix_lang.lower(), country.upper()])
    else:
        return posix_lang.lower()
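# Unlike to_posix_string above, this variant swallows bad input:
assert to_posix_format('fra-ca') == 'fr_CA'
assert to_posix_format('xx-yy') is None  # invalid code returns None, no raise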
def normalize_language_code(lang):
    try:
        lang_iso639_1 = to_iso639_1(lang)
        return lang_iso639_1 or lang
    except NonExistentLanguageError:
        return lang
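# Expected behavior, assuming the iso-639 package: valid codes normalize,
# unknown ones pass through unchanged.
assert normalize_language_code('deu') == 'de'
assert normalize_language_code('not-a-code') == 'not-a-code'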
def findSubtitlesNoneIso639(scanfolder, isoMode, disablelangdetect):
    """
    Detect subtitles that do not comply with ISO-639.

    TODO: Add more subtitle extensions (and read/parse them correctly for language detection)
    TODO: Separate language detection better in different functions
    TODO: Add percentage of certainty and possible other languages when low certainty
    TODO: Handle unicode better to detect languages like German and Dutch better
    TODO: Use table
    """
    subtitleExts = ['.srt', '.sub', '.ass']
    total = 0
    incorrect = 0
    detectedlang = 0
    for subdir, dirnames, filenames in os.walk(scanfolder):
        for filename in filenames:
            incorrectSubtitle = False
            extension = os.path.splitext(filename)[1].lower()
            # subdirName = os.path.basename(os.path.normpath(subdir))
            if extension in subtitleExts:
                total = total + 1
                langcodeFromFilename = getIsoLanguageCodeFromFilename(filename)
                detectedLanguage = ""
                detectedIsoMode = False
                if is_valid639_1(langcodeFromFilename):
                    detectedIsoMode = "1"
                    detectedLanguage = iso639_to_name(langcodeFromFilename)
                if is_valid639_2(langcodeFromFilename):
                    detectedIsoMode = "2"
                    detectedLanguage = iso639_to_name(langcodeFromFilename)
                # Compare strings with !=, not identity ("is not")
                if detectedIsoMode != isoMode:
                    isoShouldBe = ""
                    if isoMode == "1" and detectedIsoMode == "2":
                        isoShouldBe = to_iso639_1(langcodeFromFilename)
                    if isoMode == "2" and detectedIsoMode == "1":
                        isoShouldBe = to_iso639_2(langcodeFromFilename)
                    filepath = subdir + os.sep + filename
                    incorrectSubtitle = True
                    incorrect = incorrect + 1
                    warning = "Incorrectly named subtitle found at "
                    warning += bold(filepath)
                    printNotificationWarning(warning)
                    if detectedIsoMode is not False:
                        info = "\t\tLang code " + bold(langcodeFromFilename)
                        info += " (ISO 639-" + str(detectedIsoMode) + ") "
                        info += "detected. The ISO 639-" + isoMode + " code"
                        info += " for " + detectedLanguage + " is "
                        info += bold(isoShouldBe) + "."
                        printNotificationInfo(info)
                if incorrectSubtitle and not disablelangdetect:
                    filepath = subdir + os.sep + filename
                    try:
                        with io.open(filepath, "r", encoding="utf-8") as mfile:
                            my_unicode_string = mfile.read()
                        possibleLanguage = "\tDetected language is likely to "
                        possibleLanguage += "be \"" + detect(my_unicode_string)
                        possibleLanguage += "\"\n"
                        detectedlang = detectedlang + 1
                    except Exception:
                        possibleLanguage = "\tLanguage detection failed\n"
                    printNotificationInfo(possibleLanguage)
    info = "Found subtitle files " + bold(str(total)) + " of which "
    info += bold(str(incorrect)) + " are incorrectly named!"
    printNotificationInfo(info)
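# Hypothetical invocation: audit a media library against ISO 639-1 naming,
# with content-based detection (langdetect's detect()) enabled. The path is
# illustrative only.
# findSubtitlesNoneIso639('/media/movies', isoMode="1", disablelangdetect=False)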