class TestWordTokenizer(unittest.TestCase):

    '''Unit tests for the WordTokenizer.'''

    def setUp(self):
        self.tokenizer = WordTokenizer()
        self.text = "Python is a high-level programming language."

    def tearDown(self):
        pass

    def test_tokenize(self):
        expected = ['Python', 'is', 'a', 'high-level', 'programming',
                    'language', '.']
        assert_equal(self.tokenizer.tokenize(self.text), expected)

    def test_exclude_punc(self):
        expected = ['Python', 'is', 'a', 'high-level', 'programming',
                    'language']
        assert_equal(self.tokenizer.tokenize(self.text, include_punc=False),
                     expected)

    def test_itokenize(self):
        # itokenize yields tokens lazily, one at a time
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Python")
        assert_equal(next(gen), "is")
def contains_extractor(document):
    '''A basic document feature extractor that returns a dict of words that
    the document contains.

    :param document: The text to extract features from. Can be a string or
        an iterable of tokens.
    '''
    tokenizer = WordTokenizer()
    if isinstance(document, basestring):
        # Tokenize strings, dropping punctuation and lowercasing each token.
        # Set comprehension avoids building a throwaway intermediate list.
        tokens = {w.lower()
                  for w in tokenizer.itokenize(document, include_punc=False)}
    else:
        # Assume an iterable of tokens; normalize each one.
        tokens = {lowerstrip(w, all=False) for w in document}
    features = dict((u'contains({0})'.format(w), True) for w in tokens)
    return features
def analyze(self, text):
    """Return the sentiment as a tuple of the form:
    ``(classification, pos_probability, neg_probability)``
    """
    # Lazily train the classifier on first use
    super(NaiveBayesAnalyzer, self).analyze(text)
    word_list = WordTokenizer().tokenize(text, include_punc=False)
    # Lowercase tokens and drop very short words before feature extraction
    filtered = [w.lower() for w in word_list if len(w) >= 3]
    dist = self._classifier.prob_classify(self._extract_feats(filtered))
    # classification, p_pos, p_neg
    return dist.max(), dist.prob('pos'), dist.prob("neg")
def _get_words_from_dataset(dataset):
    '''Return a set of all words in a dataset.

    :param dataset: A list of tuples of the form ``(words, label)`` where
        ``words`` is either a string or a list of tokens.
    '''
    tokenizer = WordTokenizer()
    all_words = []
    for words, _classification in dataset:
        # Words may either be a string or an iterable of tokens
        if isinstance(words, basestring):
            all_words.extend(tokenizer.itokenize(words, include_punc=False))
        else:
            all_words.extend(words)
    return set(all_words)
class TestWordTokenizer(unittest.TestCase):

    '''An example unit test case.'''

    def setUp(self):
        self.tokenizer = WordTokenizer()
        self.text = "Python is a high-level programming language."

    def tearDown(self):
        pass

    def test_tokenize(self):
        result = self.tokenizer.tokenize(self.text)
        assert_equal(result, ['Python', 'is', 'a', 'high-level',
                              'programming', 'language', '.'])

    def test_exclude_punc(self):
        result = self.tokenizer.tokenize(self.text, include_punc=False)
        assert_equal(result, ['Python', 'is', 'a', 'high-level',
                              'programming', 'language'])
def correct(self):
    '''Attempt to correct the spelling of a blob.

    .. versionadded:: 0.6.0

    :rtype: BaseBlob
    '''
    tokens = WordTokenizer().tokenize(self.raw, include_punc=True)
    corrected = (Word(t).correct() for t in tokens)
    pieces = []
    for idx, word in enumerate(corrected):
        # Join tokens with a single space, except before punctuation and
        # before the very first token (no leading space).
        if idx == 0 or word in pystring.punctuation:
            pieces.append(word)
        else:
            pieces.append(' ')
            pieces.append(word)
    return self.__class__(''.join(pieces))
def basic_extractor(document, train_set):
    '''A basic document feature extractor that returns a dict indicating
    what words in ``train_set`` are contained in ``document``.

    :param document: The text to extract features from. Can be a string or
        an iterable.
    :param train_set: Training data set, a list of tuples of the form
        ``(words, label)``.
    '''
    tokenizer = WordTokenizer()
    word_features = _get_words_from_dataset(train_set)
    if isinstance(document, basestring):
        # Tokenize strings, dropping punctuation and lowercasing each token.
        # Set comprehension avoids building a throwaway intermediate list.
        tokens = {w.lower()
                  for w in tokenizer.itokenize(document, include_punc=False)}
    else:
        tokens = {lowerstrip(w, all=False) for w in document}
    # One boolean feature per training-set word: is it present in document?
    features = dict((u'contains({0})'.format(word), (word in tokens))
                    for word in word_features)
    return features
def setUp(self):
    # Shared fixtures: a sample sentence and the tokenizer under test
    self.text = "Python is a high-level programming language."
    self.tokenizer = WordTokenizer()
def test_tokens_property(self):
    '''blob.tokens should equal the word-tokenized text.'''
    # BUG FIX: assert_true(a, b) only checks the truthiness of its first
    # argument (the second is a failure message), so the original never
    # compared the two values. Use assert_equal to actually compare them.
    assert_equal(self.blob.tokens,
                 tb.WordList(WordTokenizer().tokenize(self.text)))