Exemplo n.º 1
0
def load_stopwords_processor(stopwords_file):
  """Build a KeywordProcessor that replaces Portuguese stopwords with a space.

  Stopwords come from two sources: one entry per line of ``stopwords_file``
  (surrounding whitespace stripped) and NLTK's ``'portuguese'`` stopword
  list. Each stopword is registered with ``' '`` as its replacement.

  Args:
    stopwords_file: Path to a text file holding one stopword per line.

  Returns:
    The KeywordProcessor, with an extra ``transform(txt)`` attribute that
    applies the replacements to ``txt`` and collapses every run of
    whitespace into a single space (leading/trailing whitespace removed).
  """
  # Extend the processor's non-word-boundary set with accented lowercase
  # letters so they are treated like ordinary word characters.
  # NOTE(review): 'à' and uppercase accented letters are not in this set --
  # confirm whether that is intentional.
  pt_chars = set('áãâéêíóõôúç')
  kp = KeywordProcessor()
  kp.non_word_boundaries = kp.non_word_boundaries | pt_chars

  # Use a context manager so the file handle is closed as soon as the list
  # is read instead of lingering until garbage collection.
  with open(stopwords_file) as fh:
    stopwords = [line.strip() for line in fh]

  for word in stopwords:
    kp.add_keyword(word, ' ')
  for word in nltk.corpus.stopwords.words('portuguese'):
    kp.add_keyword(word, ' ')

  def transform(txt):
    # split()/join() squeezes the blanks left behind by removed stopwords.
    return " ".join(kp.replace_keywords(txt).split())

  kp.transform = transform
  return kp
Exemplo n.º 2
0
def load_thesaurus(thesaurus_file):
  """Build a KeywordProcessor that extracts thesaurus terms from text.

  The CSV at ``thesaurus_file`` is read with pandas and must provide a
  ``name`` column (the terms to detect) and a ``USE`` column (the value to
  return instead of the term; an empty/missing value means "keep the term").

  Args:
    thesaurus_file: Path (or anything ``pd.read_csv`` accepts) of the CSV.

  Returns:
    The KeywordProcessor, with an extra ``transform(txt)`` attribute that
    returns the list of terms found in ``txt``, each one swapped for its
    ``USE`` value when that value is non-empty.
  """
  df = pd.read_csv(thesaurus_file)
  # Missing cells become '' so that "no USE value" can be tested with == ''.
  df.fillna('', inplace=True)
  thesaurus = KeywordProcessor()
  thesaurus.add_keywords_from_list(list(df['name'].values))

  # Index name -> USE once, instead of scanning the whole DataFrame with a
  # boolean mask for every extracted term. setdefault keeps the first row
  # for a duplicated name, matching the previous ``u.values[0]`` lookup.
  use_by_name = {}
  for name, preferred in zip(df['name'].values, df['USE'].values):
    use_by_name.setdefault(name, preferred)

  def use(term):
    # Unknown terms and terms with an empty USE value are returned as-is.
    preferred = use_by_name.get(term, '')
    return term if preferred == '' else preferred

  def transform(txt):
    return [use(term) for term in thesaurus.extract_keywords(txt)]

  thesaurus.transform = transform
  return thesaurus