def load_stopwords_processor(stopwords_file):
    """Build a KeywordProcessor that replaces stopwords with a space.

    Stopwords come from two sources: the lines of ``stopwords_file`` (one
    entry per line, surrounding whitespace stripped) and NLTK's
    ``'portuguese'`` stopword list. Each one is registered with a single
    space as its replacement.

    Args:
        stopwords_file: Path of a text file holding one stopword per line.

    Returns:
        The configured KeywordProcessor. A ``transform(txt)`` callable is
        attached to it; it replaces the registered stopwords in ``txt`` and
        collapses every run of whitespace into a single space.
    """
    pt_chars = set('áãâéêíóõôúç')
    kp = KeywordProcessor()
    # Extend the processor's non-word-boundary characters with the accented
    # Portuguese letters, keeping whatever characters were already there.
    kp.non_word_boundaries = kp.non_word_boundaries | pt_chars

    # Read inside a context manager so the file handle is closed promptly
    # instead of being left for the garbage collector.
    # NOTE(review): the file is still opened with the platform default
    # encoding, as before — confirm whether it should be pinned to UTF-8.
    with open(stopwords_file) as handle:
        stopwords = [line.strip() for line in handle]

    # File entries are registered first, then the NLTK list (same order as
    # the two separate loops this replaces).
    for word in stopwords + list(nltk.corpus.stopwords.words('portuguese')):
        kp.add_keyword(word, ' ')

    def transform(txt):
        """Return ``txt`` without stopwords and with whitespace normalized."""
        # split()/join squeezes the gaps left behind by the replacements.
        return " ".join(kp.replace_keywords(txt).split())

    kp.transform = transform
    return kp
def load_thesaurus(thesaurus_file):
    """Build a KeywordProcessor that extracts thesaurus terms from text.

    The CSV at ``thesaurus_file`` is read with pandas and missing values are
    replaced by empty strings. Every value of the ``name`` column is
    registered as a keyword. The ``USE`` column supplies the term to return
    in place of a matched name; when it is empty for that name, the name
    itself is returned.

    Args:
        thesaurus_file: Path (or anything ``pandas.read_csv`` accepts) of a
            CSV containing ``name`` and ``USE`` columns.

    Returns:
        The configured KeywordProcessor. A ``transform(txt)`` callable is
        attached to it; it returns the list of terms found in ``txt``, each
        mapped through its ``USE`` entry.

    Raises:
        KeyError: If the CSV lacks a ``name`` or ``USE`` column.
    """
    df = pd.read_csv(thesaurus_file)
    df.fillna('', inplace=True)

    thesaurus = KeywordProcessor()
    thesaurus.add_keywords_from_list(list(df['name'].values))

    # Build the name -> USE lookup once, so resolving a term is a dict hit
    # instead of a full DataFrame scan per extracted term. setdefault keeps
    # the first row for a duplicated name, matching the previous
    # "first matching row" behaviour.
    preferred = {}
    for name, target in zip(df['name'].values, df['USE'].values):
        preferred.setdefault(name, target)

    def use(term):
        """Return the ``USE`` entry for ``term``, or ``term`` if none is set."""
        target = preferred.get(term, '')
        # An empty string means "no replacement" (unknown term or blank USE).
        return term if target == '' else target

    def transform(txt):
        """Extract thesaurus terms from ``txt`` and map each through ``use``."""
        return [use(term) for term in thesaurus.extract_keywords(txt)]

    thesaurus.transform = transform
    return thesaurus