def replace(text):
    """Replace every emoji in *text* with a space-padded keyword and strip.

    Keys of UNICODE_EMOJI are treated as the emoji characters to find and
    their values as the replacement tokens (presumably alias names like
    ``:smile:`` — TODO confirm against the emoji package version in use).

    The KeywordProcessor is built lazily on first call and cached on the
    function object: the original rebuilt it, plus the whole emoji dict,
    on every invocation, which is pure setup overhead.

    :param text: input string possibly containing emoji.
    :return: ``text`` with each emoji replaced by `` value ``, then stripped.
    """
    kwp = getattr(replace, "_kwp", None)
    if kwp is None:
        kwp = KeywordProcessor()
        # Empty non-word-boundaries so emoji match even when glued to words.
        kwp.non_word_boundaries = set()
        kwp.add_keywords_from_dict(
            {" {} ".format(v): [k] for k, v in UNICODE_EMOJI.items()})
        replace._kwp = kwp  # cache for subsequent calls
    return kwp.replace_keywords(text).strip()
def load_stopwords_processor(stopwords_file):
    """Build a KeywordProcessor that maps stopwords to a single space.

    Stopwords come from two sources: one word per line in *stopwords_file*
    and NLTK's Portuguese stopword list. Portuguese accented characters are
    added to the processor's word boundaries so accented words match whole.

    The returned processor carries a ``transform(txt)`` convenience function
    that replaces stopwords and collapses the resulting whitespace runs.

    :param stopwords_file: path to a UTF-8 text file, one stopword per line.
    :return: configured KeywordProcessor with a ``transform`` attribute.

    Fixes over the original: the stopword file is now closed via a context
    manager (it was left open), and it is read as UTF-8 explicitly — the
    accented Portuguese content would be mangled under a cp1252 default.
    """
    pt_chars = set('áãâéêíóõôúç')
    kp = KeywordProcessor()
    kp.non_word_boundaries = kp.non_word_boundaries | pt_chars
    with open(stopwords_file, encoding='utf-8') as fh:
        stopwords = [line.strip() for line in fh]
    for word in stopwords:
        kp.add_keyword(word, ' ')
    for word in nltk.corpus.stopwords.words('portuguese'):
        kp.add_keyword(word, ' ')

    def transform(txt):
        # Replace stopwords, then normalize any whitespace runs to one space.
        return " ".join(kp.replace_keywords(txt).split())

    kp.transform = transform
    return kp