Exemplo n.º 1
0
    def reverse(self, name):
        with_country = (GuessitConverter._with_country_regexp.match(name) or
                        GuessitConverter._with_country_regexp2.match(name))

        name  = u(name.lower())
        if with_country:
            lang = Language.fromguessit(with_country.group(1).strip())
            lang.country = babelfish.Country.fromguessit(with_country.group(2).strip())
            return (lang.alpha3, lang.country.alpha2 if lang.country else None, lang.script or None)

        # exceptions come first, as they need to override a potential match
        # with any of the other guessers
        try:
            return self.guessit_exceptions[name]
        except KeyError:
            pass

        for conv in [babelfish.Language,
                     babelfish.Language.fromalpha3b,
                     babelfish.Language.fromalpha2,
                     babelfish.Language.fromname,
                     babelfish.Language.fromopensubtitles]:
            try:
                c = conv(name)
                return c.alpha3, c.country, c.script
            except (ValueError, babelfish.LanguageReverseError):
                pass

        raise babelfish.LanguageReverseError(name)
Exemplo n.º 2
0
    def reverse(self, name):
        with_country = (GuessitConverter._with_country_regexp.match(name)
                        or GuessitConverter._with_country_regexp2.match(name))

        name = u(name.lower())
        if with_country:
            lang = Language.fromguessit(with_country.group(1).strip())
            lang.country = babelfish.Country.fromguessit(
                with_country.group(2).strip())
            return lang.alpha3, lang.country.alpha2 if lang.country else None, lang.script or None

        # exceptions come first, as they need to override a potential match
        # with any of the other guessers
        try:
            return self.guessit_exceptions[name]
        except KeyError:
            pass

        for conv in [
                babelfish.Language, babelfish.Language.fromalpha3b,
                babelfish.Language.fromalpha2, babelfish.Language.fromname,
                babelfish.Language.fromopensubtitles
        ]:
            try:
                c = conv(name)
                return c.alpha3, c.country, c.script
            except (ValueError, babelfish.LanguageReverseError):
                pass

        raise babelfish.LanguageReverseError(name)
Exemplo n.º 3
0
def find_possible_languages(string):
    """Find possible languages in the string

    :return: list of tuple (property, Language, lang_word, word)
    """
    words = find_words(string)

    valid_words = []
    for word in words:
        lang_word = word.lower()
        key = 'language'
        for prefix in subtitle_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
                key = 'subtitleLanguage'
        for suffix in subtitle_suffixes:
            if lang_word.endswith(suffix):
                lang_word = lang_word[:len(suffix)]
                key = 'subtitleLanguage'
        for prefix in lang_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
        if not lang_word in LNG_COMMON_WORDS:
            try:
                lang = Language.fromguessit(lang_word)
                # Keep language with alpha2 equivalent. Others are probably
                # uncommon languages.
                if lang == 'mul' or hasattr(lang, 'alpha2'):
                    valid_words.append((key, lang, lang_word, word))
            except babelfish.Error:
                pass
    return valid_words
Exemplo n.º 4
0
def find_possible_languages(string):
    """Find possible languages in the string

    :return: list of tuple (property, Language, lang_word, word)
    """
    words = find_words(string)

    valid_words = []
    for word in words:
        lang_word = word.lower()
        key = 'language'
        for prefix in subtitle_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
                key = 'subtitleLanguage'
        for suffix in subtitle_suffixes:
            if lang_word.endswith(suffix):
                lang_word = lang_word[:len(suffix)]
                key = 'subtitleLanguage'
        for prefix in lang_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
        if not lang_word in LNG_COMMON_WORDS:
            try:
                lang = Language.fromguessit(lang_word)
                # Keep language with alpha2 equivalent. Others are probably
                # uncommon languages.
                if lang == 'mul' or hasattr(lang, 'alpha2'):
                    valid_words.append((key, lang, lang_word, word))
            except babelfish.Error:
                pass
    return valid_words
Exemplo n.º 5
0
def guess_language(text):  # pragma: no cover
    """Guess the language in which a body of text is written.

    This uses the external guess-language python module, and will fail and return
    Language(Undetermined) if it is not installed.
    """
    try:
        from guess_language import guessLanguage
        return Language.fromguessit(guessLanguage(text))

    except ImportError:
        log.error('Cannot detect the language of the given text body, missing dependency: guess-language')
        log.error('Please install it from PyPI, by doing eg: pip install guess-language')
        return UNDETERMINED
Exemplo n.º 6
0
def search_language(string, lang_filter=None):
    """Looks for language patterns, and if found return the language object,
    its group span and an associated confidence.

    you can specify a list of allowed languages using the lang_filter argument,
    as in lang_filter = [ 'fr', 'eng', 'spanish' ]

    >>> search_language('movie [en].avi')['language']
    <Language [en]>

    >>> search_language('the zen fat cat and the gay mad men got a new fan', lang_filter = ['en', 'fr', 'es'])

    """

    if lang_filter:
        lang_filter = set(Language.fromguessit(lang) for lang in lang_filter)

    confidence = 1.0  # for all of them

    for prop, language, lang, word in find_possible_languages(string):
        pos = string.find(word)
        end = pos + len(word)

        if lang_filter and language not in lang_filter:
            continue

        # only allow those languages that have a 2-letter code, those that
        # don't are too esoteric and probably false matches
        #if language.lang not in lng3_to_lng2:
        #    continue

        # confidence depends on alpha2, alpha3, english name, ...
        if len(lang) == 2:
            confidence = 0.8
        elif len(lang) == 3:
            confidence = 0.9
        elif prop == 'subtitleLanguage':
            confidence = 0.6  # Subtitle prefix found with language
        else:
            # Note: we could either be really confident that we found a
            #       language or assume that full language names are too
            #       common words and lower their confidence accordingly
            confidence = 0.3  # going with the low-confidence route here

        return Guess({prop: language},
                     confidence=confidence,
                     input=string,
                     span=(pos, end))

    return None
Exemplo n.º 7
0
def guess_language(text):  # pragma: no cover
    """Guess the language in which a body of text is written.

    This uses the external guess-language python module, and will fail and return
    Language(Undetermined) if it is not installed.
    """
    try:
        from guess_language import guessLanguage
        return Language.fromguessit(guessLanguage(text))

    except ImportError:
        log.error('Cannot detect the language of the given text body, missing dependency: guess-language')
        log.error('Please install it from PyPI, by doing eg: pip install guess-language')
        return UNDETERMINED
Exemplo n.º 8
0
def search_language(string, lang_filter=None):
    """Looks for language patterns, and if found return the language object,
    its group span and an associated confidence.

    you can specify a list of allowed languages using the lang_filter argument,
    as in lang_filter = [ 'fr', 'eng', 'spanish' ]

    >>> search_language('movie [en].avi')['language']
    <Language [en]>

    >>> search_language('the zen fat cat and the gay mad men got a new fan', lang_filter = ['en', 'fr', 'es'])

    """

    if lang_filter:
        lang_filter = set(Language.fromguessit(lang) for lang in lang_filter)

    confidence = 1.0  # for all of them

    for prop, language, lang, word in find_possible_languages(string):
        pos = string.find(word)
        end = pos + len(word)

        if lang_filter and language not in lang_filter:
            continue

        # only allow those languages that have a 2-letter code, those that
        # don't are too esoteric and probably false matches
        #if language.lang not in lng3_to_lng2:
        #    continue

        # confidence depends on alpha2, alpha3, english name, ...
        if len(lang) == 2:
            confidence = 0.8
        elif len(lang) == 3:
            confidence = 0.9
        elif prop == 'subtitleLanguage':
            confidence = 0.6  # Subtitle prefix found with language
        else:
            # Note: we could either be really confident that we found a
            #       language or assume that full language names are too
            #       common words and lower their confidence accordingly
            confidence = 0.3  # going with the low-confidence route here

        return Guess({prop: language}, confidence=confidence, input=string, span=(pos, end))

    return None
Exemplo n.º 9
0
def find_possible_languages(string, allowed_languages=None):
    """Find possible languages in the string

    :return: list of tuple (property, Language, lang_word, word)
    """

    common_words = None
    if allowed_languages:
        common_words = LNG_COMMON_WORDS_STRICT
    else:
        common_words = LNG_COMMON_WORDS

    words = find_words(string)

    valid_words = []
    for word in words:
        lang_word = word.lower()
        key = 'language'
        for prefix in subtitle_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
                key = 'subtitleLanguage'
        for suffix in subtitle_suffixes:
            if lang_word.endswith(suffix):
                lang_word = lang_word[:len(suffix)]
                key = 'subtitleLanguage'
        for prefix in lang_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
        if lang_word not in common_words and word.lower() not in common_words:
            try:
                lang = Language.fromguessit(lang_word)
                if allowed_languages:
                    if lang.name.lower(
                    ) in allowed_languages or lang.alpha2.lower(
                    ) in allowed_languages or lang.alpha3.lower(
                    ) in allowed_languages:
                        valid_words.append((key, lang, lang_word, word))
                # Keep language with alpha2 equivalent. Others are probably
                # uncommon languages.
                elif lang == 'mul' or hasattr(lang, 'alpha2'):
                    valid_words.append((key, lang, lang_word, word))
            except babelfish.Error:
                pass
    return valid_words
Exemplo n.º 10
0
def find_possible_languages(string, allowed_languages=None):
    """Find possible languages in the string

    :return: list of tuple (property, Language, lang_word, word)
    """

    common_words = None
    if allowed_languages:
        common_words = LNG_COMMON_WORDS_STRICT
    else:
        common_words = LNG_COMMON_WORDS

    words = find_words(string)

    valid_words = []
    for word in words:
        lang_word = word.lower()
        key = 'language'
        for prefix in subtitle_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
                key = 'subtitleLanguage'
        for suffix in subtitle_suffixes:
            if lang_word.endswith(suffix):
                lang_word = lang_word[:len(suffix)]
                key = 'subtitleLanguage'
        for prefix in lang_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
        if lang_word not in common_words and word.lower() not in common_words:
            try:
                lang = Language.fromguessit(lang_word)
                if allowed_languages:
                    if lang.name.lower() in allowed_languages or lang.alpha2.lower() in allowed_languages or lang.alpha3.lower() in allowed_languages:
                        valid_words.append((key, lang, lang_word, word))
                # Keep language with alpha2 equivalent. Others are probably
                # uncommon languages.
                elif lang == 'mul' or hasattr(lang, 'alpha2'):
                    valid_words.append((key, lang, lang_word, word))
            except babelfish.Error:
                pass
    return valid_words