Example No. 1
def _get_document_tokens(document):
    # Accept either a raw string or an already-tokenized iterable of words
    # and return the set of punctuation-stripped tokens it contains.
    if isinstance(document, basestring):
        tokens = set(strip_punc(w, all=False)
                     for w in word_tokenize(document, include_punc=False))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    return tokens
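A minimal usage sketch of the same idea, written for Python 3 (str in place of basestring). The import paths assume these helpers come from TextBlob's tokenizers and utils modules, and the sample sentence is made up for illustration.

from textblob.tokenizers import word_tokenize
from textblob.utils import strip_punc

def get_document_tokens(document):
    # Same logic as the example above: a raw string is tokenized with
    # punctuation dropped; anything else is treated as an iterable of words.
    if isinstance(document, str):
        return set(strip_punc(w, all=False)
                   for w in word_tokenize(document, include_punc=False))
    return set(strip_punc(w, all=False) for w in document)

print(get_document_tokens("Simple is better than complex."))
# e.g. {'Simple', 'is', 'better', 'than', 'complex'}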
Example No. 3
    def words(self):
        """Return a list of word tokens. This excludes punctuation characters.
        If you want to include punctuation characters, access the ``tokens``
        property.

        :returns: A :class:`WordList <WordList>` of word tokens.

        """
        return WordList(
            word_tokenize(self.raw, self.tokenizer, include_punc=False))
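If this property belongs to a TextBlob-style blob class (an assumption based on the identifiers), its effect is easiest to see by comparing words with tokens; the sample text below is illustrative only.

from textblob import TextBlob

blob = TextBlob("Hello, world! Readability counts.")
print(blob.words)   # punctuation stripped, e.g. ['Hello', 'world', 'Readability', 'counts']
print(blob.tokens)  # punctuation kept, e.g. ['Hello', ',', 'world', '!', 'Readability', 'counts', '.']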
Example No. 5
def tokenize(words):
    # Tokenize a raw string (dropping punctuation); an input that is already
    # a sequence of words is passed through unchanged.
    if isinstance(words, basestring):
        return word_tokenize(words, include_punc=False)
    else:
        return words
Example No. 6
    def test_word_tokenize(self):
        # By default word_tokenize returns a lazy generator whose output
        # matches the underlying tokenizer.
        tokens = word_tokenize(self.text)
        assert_true(is_generator(tokens))
        assert_equal(list(tokens), self.tokenizer.tokenize(self.text))
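The test above implies that word_tokenize yields tokens lazily; a small sketch of that behavior, assuming the TextBlob-style import path and an arbitrary sample sentence.

from textblob.tokenizers import word_tokenize

tokens = word_tokenize("Beautiful is better than ugly.")  # punctuation kept by default
print(list(tokens))  # e.g. ['Beautiful', 'is', 'better', 'than', 'ugly', '.']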