def stem_and_tokenize(text):
    """Tokenize *text* with NLTK's TweetTokenizer and stem every token.

    Tokenization strips @-handles and reduces elongated character runs
    (e.g. "soooo" -> "sooo"); each token is then passed through the
    Porter stemmer. At most one '!' token is dropped from the result —
    only the first occurrence, matching the original behavior.
    """
    tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(tok) for tok in tokenizer.tokenize(text)]
    try:
        # Drop only the first '!' token, if present.
        stemmed.remove('!')
    except ValueError:
        pass
    return stemmed
def tokenizer_sin_stemming(text):
    """Tokenize *text* without stemming and drop noise tokens.

    Pipeline: strip trailing whitespace, transliterate to ASCII with
    unidecode, tokenize with TweetTokenizer (handles stripped, elongated
    runs reduced), then apply the project's stopword filter and
    abbreviation replacement. Finally discard noise tokens: anything
    containing the escape remnant 'u00', single characters, and pure
    digit strings.

    Bug fixed: the original called ``tokens.remove(token)`` while
    iterating over ``tokens``, which skips the element that follows each
    removal (and ``list.remove`` deletes the *first* equal element, not
    the current one), so consecutive or duplicated noise tokens survived.
    Filtering into a new list examines every token exactly once.
    """
    # unidecode output is already ASCII, so a second pass is the identity;
    # one call suffices (original applied it twice).
    ascii_text = unidecode.unidecode(text.rstrip())
    tokens = TweetTokenizer(strip_handles=True, reduce_len=True).tokenize(ascii_text)
    tokens = replace_abb(filter_stopwords(tokens))
    return [
        tok for tok in tokens
        if 'u00' not in tok and len(tok) > 1 and not tok.isdigit()
    ]