Пример #1
0
def get_lang(s: str, proportion: float = 0.8) -> Tuple[bool, str]:
    """Return the language cld3 reports first for ``s``.

    :param s: String to be detected.
    :param proportion: Threshold that the first result's ``proportion`` must
        strictly exceed for the detection to be flagged as trustworthy.
    :return: ``(trustworthy, language_code)``, e.g. ``(True, 'en')``.
        ``(False, 'und')`` when cld3 yields no result at all.
    """
    # get_language() is not so reliable: mixed languages content is not well detected
    candidates = cld3.get_frequent_languages(s, 3)
    if not candidates:
        # cld3 gave no prediction (e.g. empty input): report the language as
        # undetermined ("und") instead of failing with IndexError on [0].
        return False, "und"
    first = candidates[0]
    return first.is_reliable and first.proportion > proportion, first.language
Пример #2
0
    def test_get_frequent_languages(self):
        """Check get_frequent_languages() on empty, bilingual and French text."""
        # Empty and missing input must both produce a falsy result.
        for blank in ("", None):
            self.assertFalse(cld3.get_frequent_languages(blank, 1))

        # This is an especially important case where we want to make sure
        # that "und" is not included in the results;
        # see bottom of https://github.com/google/cld3/issues/15.
        bilingual_text = "This piece of text is in English. Този текст е на Български."
        bilingual = cld3.get_frequent_languages(bilingual_text, num_langs=3)
        self.assertEqual(len(bilingual), 2)
        detected_codes = [guess.language for guess in bilingual]
        detected_codes.sort()
        self.assertEqual(detected_codes, ["bg", "en"])

        # Monolingual text: asking for up to five languages still gives one.
        french_text = "Derrière ce sujet des retraites, il y a beaucoup de questions autour de ce que sera le travail de demain. Nous ne sommes pas au bout de ce chantier. Jusqu’à présent nous avons ajusté, il est temps de refonder. On le fera en transparence, et tous ensemble."  # noqa
        french = cld3.get_frequent_languages(french_text, num_langs=5)
        self.assertEqual(len(french), 1)
        self.assertEqual(french[0].language, "fr")
Пример #3
0
def detect_lang_neural(text,
                       return_multiple=False,
                       return_dict=False,
                       hint_language=None,
                       filter_unreliable=False):
    """Detect the language(s) of ``text`` using cld3.

    :param text: Text to analyse.
    :param return_multiple: When true, return a list of results instead of
        a single one.
    :param return_dict: When true, each result is a dict with the keys
        ``lang_code``, ``lang`` and ``conf``; otherwise it is the bare
        language code.
    :param hint_language: Language code to prefer. If it shows up among the
        candidates, it becomes the only result.
    :param filter_unreliable: When true, drop predictions whose
        ``is_reliable`` flag is false.
    :return: A list of results when ``return_multiple`` is true; otherwise
        the first result, or ``None`` when nothing was detected.
    :raises ImportError: If the module-level ``cld3`` is ``None``.
    """
    if cld3 is None:
        LOG.debug("run pip install pycld3")
        raise ImportError("pycld3 not installed")

    def _as_result(pred):
        # Render one prediction in the shape the caller asked for.
        if return_dict:
            return {
                "lang_code": pred.language,
                "lang": code_to_name(pred.language),
                "conf": pred.probability
            }
        return pred.language

    languages = []
    if return_multiple or hint_language:
        # Most probable candidates first.
        preds = sorted(cld3.get_frequent_languages(text, num_langs=5),
                       key=lambda i: i.probability,
                       reverse=True)
        for pred in preds:
            if filter_unreliable and not pred.is_reliable:
                continue
            if hint_language and hint_language == pred.language:
                # The hinted language was found: it wins over everything
                # collected so far.
                languages = [_as_result(pred)]
                break
            languages.append(_as_result(pred))
    else:
        pred = cld3.get_language(text)
        # get_language() can hand back None instead of a prediction
        # (pycld3 does so for empty text); treat that as "nothing detected"
        # rather than crashing on attribute access.
        if pred is not None and (pred.is_reliable or not filter_unreliable):
            languages = [_as_result(pred)]

    if return_multiple:
        return languages
    # return top language only
    return languages[0] if languages else None
Пример #4
0
def get_item_languages(item_id, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
    """Return the cld3 predictions for an item that pass all thresholds.

    The item content is fetched via ``get_item_content_html2text`` (links
    ignored), passed through ``remove_all_urls_from_content`` and has its
    whitespace runs collapsed to single spaces before detection.

    :param item_id: Identifier of the item to analyse.
    :param min_len: Minimum length of the cleaned content; shorter content
        yields an empty list.
    :param num_langs: Maximum number of languages requested from cld3.
    :param min_proportion: Minimum ``proportion`` a prediction must have.
    :param min_probability: Minimum ``probability`` a prediction must have.
    :return: List of cld3 predictions that are reliable and meet both
        thresholds.
    """
    raw_text = get_item_content_html2text(item_id, ignore_links=True)
    without_urls = remove_all_urls_from_content(item_id, item_content=raw_text)
    # Collapse every run of whitespace into a single space.
    cleaned = ' '.join(without_urls.split())

    # Too little text to attempt detection.
    if len(cleaned) < min_len:
        return []

    return [
        guess
        for guess in cld3.get_frequent_languages(cleaned, num_langs=num_langs)
        if guess.proportion >= min_proportion
        and guess.probability >= min_probability
        and guess.is_reliable
    ]
Пример #5
0
 def detect_freq_lang(self, text, n = 3):
     """Return ``(language, probability * 100)`` pairs for up to ``n`` languages.

     :param text: Text to analyse.
     :param n: Value passed to cld3 as ``num_langs``.
     :return: List of ``(language_code, percentage)`` tuples, in the order
         cld3 returned them.
     """
     import cld3  # requires protobuf
     scored = []
     for guess in cld3.get_frequent_languages(text, num_langs=n):
         scored.append((guess.language, 100*guess.probability))
     return scored