def normalize_list(self, text):
    """
    Get a list of word stems that appear in the text. Stopwords and an
    initial 'to' will be stripped, unless this leaves nothing in the stem.
    """
    pieces = [self.snowball_stem(word) for word in tokenize_list(text)
              if self.good_lemma(word)]
    if not pieces:
        # Everything was filtered out; return the raw text wrapped in a
        # list so callers always get a list back.
        return [text]
    if pieces[0] == 'to':
        # Strip the infinitive marker from phrases like 'to run'.
        pieces = pieces[1:]
    return pieces
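# The method above leans on a few helpers that are not shown. A minimal
# sketch of what they might look like, built on NLTK's SnowballStemmer and
# stopword list; the names, bodies, and module-level placement (the
# originals are methods) are assumptions, not the project's actual code:

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

STOPWORDS = set(stopwords.words('english'))  # assumed stopword source
_stemmer = SnowballStemmer('english')

def tokenize_list(text):
    # Assumed tokenizer: lowercase, then NLTK's standard word tokenizer.
    return nltk.word_tokenize(text.lower())

def snowball_stem(word):
    # Reduce a word to its Snowball stem, e.g. 'dogs' -> 'dog'.
    return _stemmer.stem(word)

def good_lemma(word):
    # Keep alphabetic tokens that are not stopwords.
    return word.isalpha() and word not in STOPWORDS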
def normalize_list(text):
    """
    Get a list of word stems that appear in the text. Stopwords and an
    initial 'to' will be stripped, unless this leaves nothing in the stem.
    """
    pieces = [morphy_stem(word) for word in tokenize_list(text)]
    pieces = [piece for piece in pieces if good_lemma(piece)]
    if not pieces:
        # Everything was filtered out; return the raw text wrapped in a
        # list so callers always get a list back.
        return [text]
    if pieces[0] == 'to':
        pieces = pieces[1:]
    return pieces
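# morphy_stem, used here and in tag_and_stem below, is assumed to wrap
# WordNet's morphy lemmatizer. A plausible sketch: map a Penn Treebank tag
# to a WordNet part of speech, ask morphy for the uninflected form, and
# fall back to the original word when WordNet doesn't know it. The tag
# mapping is an assumption, not the project's actual table:

from nltk.corpus import wordnet

def morphy_stem(word, pos=None):
    word = word.lower()
    wn_pos = None
    if pos is not None:
        # Penn tags: NN* -> noun, VB* -> verb, JJ* -> adjective, RB* -> adverb.
        wn_pos = {'N': wordnet.NOUN, 'V': wordnet.VERB,
                  'J': wordnet.ADJ, 'R': wordnet.ADV}.get(pos[0])
    return wordnet.morphy(word, wn_pos) or word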
def tag_and_stem(text):
    """
    Returns a list of (stem, tag, token) triples:

    - stem: the word's uninflected form
    - tag: the word's part of speech
    - token: the original word, so we can reconstruct it later
    """
    tokens = tokenize_list(preprocess_text(text))
    tagged = nltk.pos_tag(tokens)
    out = []
    for token, tag in tagged:
        if token in BRACKET_DIC:
            # Bracket tokens get a fixed tag from BRACKET_DIC and pass
            # through unstemmed.
            out.append((token, BRACKET_DIC[token], token))
        else:
            stem = morphy_stem(token, tag)
            out.append((stem, tag, token))
    return out
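# BRACKET_DIC is not shown above; from how it is used, it maps bracket
# tokens to the tags they should carry. A plausible stand-in using the
# Penn Treebank bracket tags (the real table may differ):

BRACKET_DIC = {
    '(': '-LRB-', ')': '-RRB-',
    '[': '-LRB-', ']': '-RRB-',
    '{': '-LRB-', '}': '-RRB-',
}

# Illustrative call, assuming NLTK's tagger data is installed
# (nltk.download('averaged_perceptron_tagger')); exact tags depend on the
# tagger model:
#
#     >>> tag_and_stem('the dogs barked')
#     [('the', 'DT', 'the'), ('dog', 'NNS', 'dogs'), ('bark', 'VBD', 'barked')]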
def normalize_list(text):
    """
    Get a list of word stems that appear in the text. Stopwords and an
    initial 'to' will be stripped, unless this leaves nothing in the stem.

    >>> normalize_list('the dog')
    [u'dog']
    >>> normalize_list('big dogs')
    [u'big', u'dog']
    >>> normalize_list('the')
    [u'the']
    """
    text = preprocess_text(text)
    pieces = [morphy_stem(word) for word in tokenize_list(text)]
    pieces = [piece for piece in pieces if good_lemma(piece)]
    if not pieces:
        return [text]
    if pieces[0] == 'to':
        pieces = pieces[1:]
    return pieces
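# preprocess_text, used by tag_and_stem and the variant above, is the one
# remaining helper not shown. A minimal stand-in that only applies unicode
# normalization, purely an assumption so the snippets run end to end:

import unicodedata

def preprocess_text(text):
    # Collapse unicode compatibility forms (ligatures, fullwidth chars).
    return unicodedata.normalize('NFKC', text)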