Пример #1
0
def to_posix_string(lang_code):
    """Convert a language code to POSIX form (``ll`` or ``ll_CC``).

    The language part is resolved in this order: a valid ISO 639-1 code is
    kept as is; a valid ISO 639-2 code is converted with ``to_iso639_1``
    (keeping the original code when the conversion result is falsy);
    anything else is retried in capitalized form (``"french"`` ->
    ``"French"``) before giving up.

    :param lang_code: language or locale code; only the first two
        ``_``-separated parts (language, country) are used.
    :returns: lower-cased language, followed by ``_`` and the upper-cased
        country when one was given; ``None`` when the normalized input is
        empty.
    :raises ValueError: when the language part cannot be recognised.
    """
    # Normalize fra-ca to fr_CA
    lang_code = use_underscore(lang_code)
    if not lang_code:
        return None
    if '_' in lang_code:
        # ISO format, must convert to POSIX format
        lang, country = lang_code.split('_')[:2]
    else:
        lang, country = lang_code, None
    if is_valid639_1(lang):
        posix_lang = lang
    elif is_valid639_2(lang):
        temp = to_iso639_1(lang)
        # Keep the original code when there is no 639-1 equivalent.
        posix_lang = temp if temp else lang
    else:
        full_name = lang.lower().capitalize()
        if is_valid639_2(full_name):
            posix_lang = to_iso639_1(full_name)
        else:
            # The % operator must be applied to the literal, not written
            # inside it, otherwise the placeholder is never filled in.
            raise ValueError(
                "The input %s is not a valid code to convert to posix format"
                % (full_name,))
    if country:
        return '_'.join([posix_lang.lower(), country.upper()])
    else:
        return posix_lang.lower()
Пример #2
0
def createTranslation(text, language):
    """Translate ``text`` into ``language`` and return the translated string.

    ``language`` is first converted with ``iso.to_iso639_1`` and the result
    is passed to the translate client as ``target_language``.

    :param text: text to translate.
    :param language: target language, in any form ``iso.to_iso639_1``
        accepts.
    :returns: the ``'translatedText'`` entry of the client's response, or a
        fixed "not supported" message when either the language conversion
        or the translation call fails.
    """
    unsupported = 'Unfortunately, that language is not supported.'
    translate_client = translate.Client()

    # The previous handler simply repeated the failing call, so a bad
    # language escaped as an exception; report it like any other
    # unsupported language instead.
    try:
        language = iso.to_iso639_1(language)
    except Exception:
        return unsupported

    try:
        translation = translate_client.translate(text,
                                                 target_language=language)
    except Exception:
        return unsupported
    return translation['translatedText']
Пример #3
0
def convert_review_language_dictionary_to_iso(language_dict):
    """Build a mapping from each review language tag to a language code.

    :param language_dict: mapping whose values are dicts with a ``'tag'``
        entry (the declared language) and a ``'detected'`` entry (the
        detected language).
    :returns: dict keyed by every distinct tag. The value is the result of
        ``iso639.to_iso639_1`` when the tag is known to it, a hard-coded
        alias for a few known tags, and otherwise the most common detected
        language among the entries carrying that tag.
    """
    # Tags rejected by iso639 whose code is known in advance.
    known_aliases = {
        'schinese': 'zh-cn',
        'tchinese': 'zh-cn',
        'brazilian': 'pt',
        'koreana': 'ko',
    }

    iso_by_tag = {}
    distinct_tags = {entry['tag'] for entry in language_dict.values()}

    for tag in distinct_tags:
        try:
            code = iso639.to_iso639_1(tag)
        except iso639.NonExistentLanguageError:
            if tag in known_aliases:
                code = known_aliases[tag]
            else:
                # Unknown tag: fall back on what was detected for it.
                print('Missing language:' + tag)

                candidates = [
                    entry['detected']
                    for entry in language_dict.values()
                    if entry['tag'] == tag
                ]
                print(candidates)

                code = most_common(candidates)
                print('Most common match among detected languages: ' + code)

        iso_by_tag[tag] = code

    return iso_by_tag
Пример #4
0
def to_2_letter_lang(lang):
    """Return the ISO 639-1 form of ``lang``, or ``False`` if unavailable.

    A two-character input is returned unchanged when ``iso639`` accepts it
    as a 639-1 code; a three-character input accepted as a 639-2 code is
    converted with ``iso639.to_iso639_1``. Every other input yields
    ``False``.
    """
    length = len(lang)
    if length == 2 and iso639.is_valid639_1(lang):
        return lang
    if length == 3 and iso639.is_valid639_2(lang):
        return iso639.to_iso639_1(lang)
    return False
Пример #5
0
    def make_ebook(self) -> None:
        """
        Assemble a new ePub book from the story metadata and write it out.

        The book gets a random UUID identifier, the metadata's title, author
        and language (converted with ``to_iso639_1``), a cover, a title page
        rendered from the ``title.mako`` template in ``self.datasource``,
        the stylesheets in ``self.styles`` and the chapters produced by
        ``self.step_through_chapters``. The result is handed to
        ``self._write``.
        """
        book = EpubBook()
        book.set_identifier(str(uuid4()))
        book.set_title(self.metadata.title)
        book.set_language(to_iso639_1(self.metadata.language))
        book.add_author(self.metadata.author.name)

        nav = EpubNav()
        ncx = EpubNcx()
        for navigation_item in (ncx, nav):
            book.add_item(navigation_item)

        # Chapters already present in a previously loaded book, if any.
        # NOTE(review): 9 is presumably the library's "document" item type
        # constant -- confirm and replace with the named constant.
        if self.book:
            existing_chapters = [
                item
                for item in self.book.get_items_of_type(9)
                if item.is_chapter() and item.file_name.startswith("chapter")
            ]
        else:
            existing_chapters = []

        book.toc = list(self.step_through_chapters(existing_chapters))

        book.set_cover("cover.jpg", self.get_cover())

        template_path = str(self.datasource / "title.mako")
        template = Template(filename=template_path)
        rendered_title = template.render(story=self.metadata)

        title_page = EpubHtml(
            title=self.metadata.title,
            file_name="title.xhtml",
            uid="title",
            content=rendered_title,
        )

        # Each stylesheet is attached to the title page and to the book.
        for stylesheet in self.styles:
            title_page.add_item(stylesheet)
            book.add_item(stylesheet)
        book.add_item(title_page)

        # Reading order: cover, title page, chapters, then navigation.
        book.spine = ["cover", title_page]
        for chapter in book.toc:
            book.add_item(chapter)
            book.spine.append(chapter)
        book.spine.append(nav)

        self._write(book)
Пример #6
0
def to_posix_string(locale_code):
    """Normalize a locale code to POSIX-style casing and separators.

    The first ``_``-separated part is replaced by its ISO 639-1 code when
    one can be found. A two-character last part is upper-cased (country),
    a four-character second part is capitalized (script), and the fourth
    part of a four-part locale is dropped.

    :param locale_code: locale code such as ``fra-ca``; falsy input gives
        ``None``.
    :returns: the normalized parts joined with ``_``.
    :raises ValueError: when the language is not recognised, the locale has
        more than four parts, or the last part is neither two nor four
        characters long.
    """
    if not locale_code:
        return None
    # Normalize fra-ca to fr_CA
    locale_code = use_underscore(locale_code)
    parts = locale_code.split("_")

    # Language: prefer the two-letter code.
    lang = parts[0]
    if is_valid639_1(lang):
        parts[0] = lang
    elif is_valid639_2(lang):
        # Keep the original code when there is no 639-1 equivalent.
        parts[0] = to_iso639_1(lang) or lang
    else:
        # Last attempt: retry with the capitalized form of the input.
        full_name = lang.lower().capitalize()
        if not is_valid639_2(full_name):
            raise ValueError(
                "The input %s in not a valid code to convert to posix format" %
                (locale_code, ))
        parts[0] = to_iso639_1(full_name)

    part_count = len(parts)
    if part_count > 4:
        raise ValueError("This locale has too many parts: " + locale_code)
    if part_count == 4:
        # Drop dialect. Sorry.
        parts.pop()

    if len(parts) > 1:
        # Country is two characters; a four-character tail is a script.
        tail = parts[-1]
        if len(tail) == 2:
            parts[-1] = tail.upper()
        elif len(tail) != 4:
            raise ValueError("The last part is not a script or country: " +
                             locale_code)
        # Script, when present, is the second part.
        if len(parts[1]) == 4:
            parts[1] = parts[1].capitalize()
    return "_".join(parts)
Пример #7
0
def language_iso639_2to1(lang):
    """Convert a bibliographic language to alpha2.

    Falls back to the default locale's language when ``lang`` is unknown
    to ``iso639`` or when the converted code is not among the languages
    returned by ``current_i18n.get_languages()``.

    :param lang: bibliographic language code
    :returns: language (alpha2)
    """
    fallback = current_i18n.babel.default_locale.language
    try:
        alpha2 = iso639.to_iso639_1(lang)
    except iso639.NonExistentLanguageError:
        return fallback

    # Each entry's first element is the language code.
    supported = [entry[0] for entry in current_i18n.get_languages()]
    if alpha2 in supported:
        return alpha2
    return fallback
Пример #8
0
def to_posix_string(locale_code):
    """Return ``locale_code`` rewritten in POSIX form.

    Steps, in order: underscore normalization via ``use_underscore``;
    replacement of the language part by its ISO 639-1 code where
    available; removal of the fourth part of a four-part locale;
    upper-casing of a two-character last part; capitalization of a
    four-character second part.

    :param locale_code: locale code to normalize; falsy input gives
        ``None``.
    :returns: the normalized locale string.
    :raises ValueError: for an unrecognised language, more than four parts,
        or a last part that is neither two nor four characters long.
    """
    if not locale_code:
        return None

    # Normalize fra-ca to fr_CA
    locale_code = use_underscore(locale_code)
    pieces = locale_code.split("_")
    language = pieces[0]

    if is_valid639_1(language):
        resolved = language
    elif is_valid639_2(language):
        two_letter = to_iso639_1(language)
        # No 639-1 equivalent: keep the code that was given.
        resolved = two_letter or language
    else:
        # Retry with the capitalized spelling before rejecting the input.
        capitalized = language.lower().capitalize()
        if not is_valid639_2(capitalized):
            raise ValueError(
                "The input %s in not a valid code to convert to posix format" %
                (locale_code,))
        resolved = to_iso639_1(capitalized)
    pieces[0] = resolved

    if len(pieces) > 4:
        raise ValueError("This locale has too many parts: "+locale_code)
    if len(pieces) == 4:
        # Drop dialect. Sorry.
        del pieces[-1]

    if len(pieces) > 1:
        last_length = len(pieces[-1])
        if last_length == 2:
            # Two characters: treat as country.
            pieces[-1] = pieces[-1].upper()
        elif last_length != 4:
            raise ValueError(
                "The last part is not a script or country: "+locale_code)
        # Four characters in second position: treat as script.
        if len(pieces[1]) == 4:
            pieces[1] = pieces[1].capitalize()
    return "_".join(pieces)
Пример #9
0
def get_language_details(iso_639_3):
    """Return a dict describing the language identified by ``iso_639_3``.

    The result always has the keys ``"code"``, ``"iso-639-1"``,
    ``"english"`` and ``"native"``.

    * A handful of codes that are not plain ISO codes (``zh-Hans``,
      ``zh-Hant``, ``iw``, ``es-419``, ``multi``) are answered from a
      built-in table.
    * Other codes are looked up through ``iso639``.
    * Codes unknown to ``iso639`` yield a dict in which every value is the
      input code itself. That dict additionally carries the legacy
      ``"iso_639_3"`` key for callers that relied on it.

    :param iso_639_3: language code to describe.
    :returns: dict with the keys listed above.
    """
    non_iso_langs = {
        "zh-Hans": {
            "code": "zh-Hans",
            "iso-639-1": "zh",
            "english": "Simplified Chinese",
            "native": "简化字",
        },
        "zh-Hant": {
            "code": "zh-Hant",
            "iso-639-1": "zh",
            "english": "Traditional Chinese",
            "native": "正體字",
        },
        "iw": {
            "code": "iw",
            "iso-639-1": "he",
            "english": "Hebrew",
            "native": "עברית"
        },
        "es-419": {
            "code": "es-419",
            "iso-639-1": "es-419",
            "english": "Spanish",
            "native": "Español",
        },
        "multi": {
            "code": "mul",
            "iso-639-1": "en",
            "english": "Multiple Languages",
            "native": "Multiple Languages",
        },
    }

    if iso_639_3 in non_iso_langs:
        return non_iso_langs[iso_639_3]

    try:
        return {
            "code": iso_639_3,
            "iso-639-1": iso639.to_iso639_1(iso_639_3),
            "english": iso639.to_name(iso_639_3),
            "native": iso639.to_native(iso_639_3),
        }
    except iso639.NonExistentLanguageError:
        return {
            "code": iso_639_3,
            # Same key as every other return path, so callers can index
            # "iso-639-1" without a KeyError on unknown codes.
            "iso-639-1": iso_639_3,
            # Legacy key, kept for backward compatibility.
            "iso_639_3": iso_639_3,
            "english": iso_639_3,
            "native": iso_639_3,
        }
Пример #10
0
def to_posix_format(lang_code):
    """Convert a language code to POSIX form (``ll`` or ``ll_CC``).

    :param lang_code: language or locale code; only the first two
        ``_``-separated parts (language, country) are used.
    :returns: lower-cased language, followed by ``_`` and the upper-cased
        country when one was given; ``None`` when the normalized input is
        empty or the language is not recognised.
    """
    # Normalize fra-ca to fr_CA
    lang_code = use_underscore(lang_code)
    if not lang_code:
        return None
    if '_' in lang_code:
        # ISO format, must convert to POSIX format
        lang, country = lang_code.split('_')[:2]
    else:
        lang, country = lang_code, None
    if is_valid639_1(lang):
        posix_lang = lang
    elif is_valid639_2(lang):
        # A valid 639-2 code may have no 639-1 equivalent; keep the
        # original code then instead of producing an empty language.
        posix_lang = to_iso639_1(lang) or lang
    else:
        # Retry with the capitalized form of the input.
        full_name = lang.lower().capitalize()
        if not is_valid639_2(full_name):
            return None
        posix_lang = to_iso639_1(full_name)
    if country:
        return '_'.join([posix_lang.lower(), country.upper()])
    return posix_lang.lower()
Пример #11
0
def to_posix_format(lang_code):
    """Convert a language code to POSIX form (``ll`` or ``ll_CC``).

    :param lang_code: language or locale code; only the first two
        ``_``-separated parts (language, country) are used.
    :returns: lower-cased language, followed by ``_`` and the upper-cased
        country when one was given; ``None`` when the normalized input is
        empty or the language is not recognised.
    """
    # Normalize fra-ca to fr_CA
    lang_code = use_underscore(lang_code)
    if not lang_code:
        return None
    if '_' in lang_code:
        # ISO format, must convert to POSIX format
        lang, country = lang_code.split('_')[:2]
    else:
        lang, country = lang_code, None
    if is_valid639_1(lang):
        posix_lang = lang
    elif is_valid639_2(lang):
        # A valid 639-2 code may have no 639-1 equivalent; keep the
        # original code then instead of producing an empty language.
        posix_lang = to_iso639_1(lang) or lang
    else:
        # Retry with the capitalized form of the input.
        full_name = lang.lower().capitalize()
        if not is_valid639_2(full_name):
            return None
        posix_lang = to_iso639_1(full_name)
    if country:
        return '_'.join([posix_lang.lower(), country.upper()])
    return posix_lang.lower()
Пример #12
0
def normalize_language_code(lang):
    """Return the ISO 639-1 code for ``lang``, or ``lang`` itself.

    The input is returned unchanged when ``to_iso639_1`` does not know it
    or returns a falsy result.
    """
    try:
        converted = to_iso639_1(lang)
    except NonExistentLanguageError:
        return lang
    if converted:
        return converted
    return lang
def findSubtitlesNoneIso639(scanfolder, isoMode, disablelangdetect):
    """
    Detect subtitles that do not comply with ISO-639.

    Walks ``scanfolder`` recursively. For every subtitle file, the language
    code taken from the filename is classified as ISO 639-1, ISO 639-2 or
    neither, and compared against ``isoMode``. Each mismatch is reported
    with a warning; when the code is valid in the other variant, the code
    expected for ``isoMode`` is reported too. A summary is printed at the
    end.

    :param scanfolder: root directory to scan.
    :param isoMode: expected ISO 639 variant as a string, ``"1"`` or
        ``"2"``.
    :param disablelangdetect: when true, skip content-based language
        detection for incorrectly named files.

    TODO: Add more subtitle extensions (and read/parse them correctly for
          language detection)
    TODO: Separate language detection better in different functions
    TODO: Add percentage of certainty and possible other languages when
          low certainty
    TODO: Handle unicode better to detect languages like German and Dutch
          better
    TODO: Use table
    """
    subtitleExts = ['.srt', '.sub', '.ass']
    total = 0
    incorrect = 0
    detectedlang = 0
    for subdir, _dirnames, filenames in os.walk(scanfolder):
        for filename in filenames:
            extension = os.path.splitext(filename)[1].lower()
            if extension not in subtitleExts:
                continue
            total += 1

            langcodeFromFilename = getIsoLanguageCodeFromFilename(filename)
            detectedLanguage = ""
            detectedIsoMode = False
            if is_valid639_1(langcodeFromFilename):
                detectedIsoMode = "1"
                detectedLanguage = iso639_to_name(langcodeFromFilename)
            if is_valid639_2(langcodeFromFilename):
                detectedIsoMode = "2"
                detectedLanguage = iso639_to_name(langcodeFromFilename)

            # Compare by value: identity (`is`) on strings only works by
            # accident of interning and is not guaranteed.
            if detectedIsoMode == isoMode:
                continue

            isoShouldBe = ""
            if isoMode == "1" and detectedIsoMode == "2":
                isoShouldBe = to_iso639_1(langcodeFromFilename)
            if isoMode == "2" and detectedIsoMode == "1":
                isoShouldBe = to_iso639_2(langcodeFromFilename)

            filepath = subdir + os.sep + filename
            incorrect += 1
            warning = "Incorrectly named subtitle found at "
            warning += bold(filepath)
            printNotificationWarning(warning)
            if detectedIsoMode is not False:
                info = "\t\tLang code " + bold(langcodeFromFilename)
                info += " (ISO 639-" + str(detectedIsoMode) + ") "
                info += "detected. The ISO 639-" + isoMode + " code"
                info += " for " + detectedLanguage + " is "
                info += bold(isoShouldBe) + "."
                printNotificationInfo(info)

            if disablelangdetect:
                continue
            # NOTE(review): the detection message and the `detectedlang`
            # counter are built but never printed or returned -- confirm
            # whether they were meant to be reported.
            try:
                with io.open(filepath, "r", encoding="utf-8") as mfile:
                    my_unicode_string = mfile.read()
                possibleLanguage = "\tDetected language is likely to "
                possibleLanguage += "be \"" + detect(my_unicode_string)
                possibleLanguage += "\"\n"
                detectedlang += 1
            except Exception:
                possibleLanguage = "\tLanguage detection failed\n"
    info = "Found subtitle files " + bold(str(total)) + " of which "
    info += bold(str(incorrect)) + " are incorrectly named!"
    printNotificationInfo(info)