Example #1
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


def stem_and_tokenize(text):
    # Tokenize as a tweet: drop @handles and shorten elongated words.
    words = TweetTokenizer(reduce_len=True, strip_handles=True).tokenize(text)
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    # list.remove drops only the first '!' token, if one is present.
    if '!' in words:
        words.remove('!')
    return words
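A quick illustration of how the helper above behaves; the sample tweet is invented for this sketch.

tokens = stem_and_tokenize("@remy: This is waaaaayyyy too much for you!!!!!!")
# strip_handles removes the "@remy" mention, reduce_len shortens the
# elongated word to "waaayyy", each token is Porter-stemmed, and
# words.remove('!') drops only the first of the '!' tokens.
print(tokens)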
Example #2
import unidecode
from nltk.tokenize import TweetTokenizer


def tokenizer_sin_stemming(text):
    # ASCII-fold the text (one unidecode call is enough), then tokenize,
    # dropping @handles and shortening elongated words.
    tokens = TweetTokenizer(strip_handles=True, reduce_len=True).tokenize(
        unidecode.unidecode(text.rstrip()))
    # replace_abb and filter_stopwords are project-specific helpers defined elsewhere.
    tokens = replace_abb(filter_stopwords(tokens))
    # Drop escape artifacts, single characters and pure numbers; a list
    # comprehension avoids removing items from the list while iterating over it.
    return [t for t in tokens
            if 'u00' not in t and len(t) > 1 and not t.isdigit()]
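The function depends on filter_stopwords and replace_abb, which the original project defines elsewhere; a minimal usage sketch with placeholder stubs (assumptions, not the project's real helpers) might look like this.

def filter_stopwords(tokens):
    # placeholder: the real helper presumably removes stopwords
    return [t for t in tokens if t.lower() not in {'the', 'is', 'a'}]

def replace_abb(tokens):
    # placeholder: the real helper presumably expands abbreviations
    return tokens

print(tokenizer_sin_stemming("Thé cats r sooooo cute!! 123"))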