def get_text_language(text):
    """Guess the language of *text* by counting stopword hits.

    The text is tokenized with ``utils.get_tokens``. For every language
    reported by ``language_helper.get_languages()`` the number of tokens
    that appear in that language's stopword list is counted, and the
    language with the highest count is returned. On a tie (including the
    case where no token matched any stopword list) the language that comes
    first when iterating the collected scores wins.

    :param text: the text to classify; passed unchanged to
        ``utils.get_tokens``.
    :returns: the best-scoring entry of ``language_helper.get_languages()``.
    :raises IndexError: if ``language_helper.get_languages()`` yields no
        languages at all.
    """
    tokens = utils.get_tokens(text)
    language_rank = {}
    for language in language_helper.get_languages():
        # A frozenset makes each membership test O(1) regardless of the
        # container type the helper returns.
        c_stopwords = frozenset(language_helper.get_language_stopwords(language))
        language_rank[language] = sum(1 for token in tokens if token in c_stopwords)
    if not language_rank:
        # The previous implementation failed with IndexError on
        # ``sorted(...)[0]``; keep the exception type for existing callers.
        raise IndexError('no languages available for language detection')
    # max() returns the first maximal key in iteration order, which matches
    # the old stable sort on the negated count.
    return max(language_rank, key=language_rank.get)
# Example #2
# 0
def __get_stopwords_regex(language):
    """Return the compiled, case-insensitive stopword regex for *language*.

    The compiled pattern is stored in the module-level
    ``language_stopwords_re`` dict, so each language's pattern is built only
    once. Every stopword becomes one alternative of the form
    ``\\b<stopword>(?![\\w-])``: it must start at a word boundary and must
    not be followed by a word character or a hyphen.

    :param language: key used both for the cache and for
        ``language_helper.get_language_stopwords``.
    :returns: a compiled ``re`` pattern (``re.IGNORECASE``). If the language
        has no non-empty stopwords the pattern never matches.
    """
    if language in language_stopwords_re:
        return language_stopwords_re[language]
    l_stopwords = language_helper.get_language_stopwords(language)
    re_stopwords = [
        # re.escape keeps regex metacharacters inside a stopword literal
        # instead of letting them alter (or break) the pattern.
        r'\b' + re.escape(stopword) + r'(?![\w-])'
        for stopword in l_stopwords
        if stopword  # an empty entry would add a zero-width alternative
    ]
    # Joining nothing yields '', which matches at every position; use an
    # always-failing lookahead so "no stopwords" means "no matches".
    pattern = '|'.join(re_stopwords) if re_stopwords else r'(?!)'
    language_stopwords_re[language] = re.compile(pattern, re.IGNORECASE)
    return language_stopwords_re[language]
# Example #3
# 0
def _filter_tagged_tokens(tagged_tokens, language):
    """Return the lowercased tokens that are not stopwords and have a valid tag.

    :param tagged_tokens: iterable of indexable items whose element 0 is the
        token string and element 1 is its tag.
    :param language: language passed to
        ``language_helper.get_language_stopwords`` to obtain the stopwords.
    :returns: a list with the lowercased token of every item whose token is
        not a stopword and whose tag is in the module-level
        ``valid_pos_tags``, in input order.
    """
    stop_words = frozenset(language_helper.get_language_stopwords(language))
    filtered_tokens = []
    for tagged in tagged_tokens:
        token = tagged[0]
        lowered = token.lower()
        # Check the lowercased form too: the result is lowercased, so a
        # capitalized stopword ("The") must not slip through as "the".
        if token in stop_words or lowered in stop_words:
            continue
        if tagged[1] in valid_pos_tags:
            filtered_tokens.append(lowered)
    # Always a real list, on Python 2 and 3 alike (map() is lazy on 3).
    return filtered_tokens