import re
from functools import reduce


def get_text_language(text):
    """Guess the language of `text` by counting stopword hits per candidate language."""
    language_rank = {}
    tokens = utils.get_tokens(text)
    for language in language_helper.get_languages():
        c_stopwords = language_helper.get_language_stopwords(language)
        # Count how many of the text's tokens are stopwords of this language.
        language_rank[language] = reduce(
            lambda carry, curr: carry + 1 if curr in c_stopwords else carry,
            tokens,
            0,
        )
    # The language whose stopwords appear most often wins.
    sorted_languages = sorted(language_rank.items(), key=lambda x: -x[1])
    return sorted_languages[0][0]
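

# A minimal, self-contained sketch of the same stopword-counting approach used
# by get_text_language. The tiny hardcoded stopword sets and the _example_*
# name are illustrative stand-ins for language_helper, not part of the real
# module API.
def _example_stopword_language_guess(tokens):
    example_stopwords = {
        'english': {'the', 'and', 'is', 'of', 'to'},
        'spanish': {'el', 'la', 'y', 'de', 'que'},
    }
    # Count stopword hits per language, then pick the language with the most.
    ranks = {
        lang: sum(1 for token in tokens if token in stopwords)
        for lang, stopwords in example_stopwords.items()
    }
    return max(ranks, key=ranks.get)


# Example: _example_stopword_language_guess(['the', 'cat', 'is', 'black']) -> 'english'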


def __get_stopwords_regex(language):
    """Return a compiled regex matching any stopword of `language`, cached per language."""
    if language in language_stopwords_re:
        return language_stopwords_re[language]
    l_stopwords = language_helper.get_language_stopwords(language)
    re_stopwords = []
    for stopword in l_stopwords:
        # Match on a word boundary, but only when the stopword is not followed
        # by another word character or a hyphen (so "no" does not match "no-op").
        # re.escape guards against regex metacharacters inside a stopword.
        stopword_re = r'\b' + re.escape(stopword) + r'(?![\w-])'
        re_stopwords.append(stopword_re)
    language_stopwords_re[language] = re.compile('|'.join(re_stopwords), re.IGNORECASE)
    return language_stopwords_re[language]
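

# A minimal, self-contained sketch showing how a regex built with the same
# \b...(?![\w-]) pattern behaves. The hardcoded stopword list and the
# _example_* name are illustrative stand-ins, not part of the real module API.
def _example_strip_stopwords(text):
    example_stopwords = ['the', 'and', 'of']
    pattern = re.compile(
        '|'.join(r'\b' + re.escape(word) + r'(?![\w-])' for word in example_stopwords),
        re.IGNORECASE,
    )
    # "The" and "of" are removed; "theory" survives because the negative
    # lookahead rejects matches that continue into a longer word.
    return pattern.sub('', text)


# Example: _example_strip_stopwords('The theory of relativity') -> ' theory  relativity'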


def _filter_tagged_tokens(tagged_tokens, language):
    """Keep lowercased tokens that are not stopwords and carry an allowed POS tag."""
    stop_words = language_helper.get_language_stopwords(language)
    return [
        token.lower()
        for token, pos_tag in tagged_tokens
        if token not in stop_words and pos_tag in valid_pos_tags
    ]
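

# A minimal, self-contained sketch of the same filtering idea applied to
# (token, POS tag) pairs such as an NLTK-style tagger produces. The stopword
# set and the allowed tags ('NN'/'NNS' nouns, 'JJ' adjectives) are hardcoded
# assumptions here, standing in for language_helper and valid_pos_tags.
def _example_filter_tagged(tagged_tokens):
    example_stopwords = {'the', 'is', 'a'}
    example_valid_tags = {'NN', 'NNS', 'JJ'}
    return [
        token.lower()
        for token, tag in tagged_tokens
        if token not in example_stopwords and tag in example_valid_tags
    ]


# Example: _example_filter_tagged([('The', 'DT'), ('quick', 'JJ'), ('fox', 'NN')]) -> ['quick', 'fox']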