def _get_document_tokens(document):
    """Return the set of punctuation-stripped word tokens in *document*.

    *document* may be a raw string (tokenized first, punctuation tokens
    excluded) or an already-tokenized iterable of words.
    """
    if isinstance(document, basestring):
        words = word_tokenize(document, include_punc=False)
    else:
        words = document
    # Trailing/leading punctuation is stripped from each token (all=False
    # keeps interior punctuation such as apostrophes).
    return set(strip_punc(w, all=False) for w in words)
def words(self):
    """Return a list of word tokens. This excludes punctuation characters.
    If you want to include punctuation characters, access the ``tokens``
    property.

    :returns: A :class:`WordList <WordList>` of word tokens.
    """
    # Tokenize the raw text with this object's configured tokenizer,
    # dropping punctuation-only tokens, then wrap in a WordList.
    tokens = word_tokenize(self.raw, self.tokenizer, include_punc=False)
    return WordList(tokens)
def tokenize(words):
    """Return word tokens for *words*.

    Strings are tokenized (punctuation excluded); any other iterable is
    assumed to be pre-tokenized and returned unchanged.
    """
    # Guard clause: pass pre-tokenized input straight through.
    if not isinstance(words, basestring):
        return words
    return word_tokenize(words, include_punc=False)
def test_word_tokenize(self):
    """word_tokenize should be lazy yet yield the tokenizer's tokens."""
    result = word_tokenize(self.text)
    # The convenience function must return a generator, not a list...
    assert_true(is_generator(result))
    # ...and, once consumed, match the underlying tokenizer's output.
    assert_equal(list(result), self.tokenizer.tokenize(self.text))