class TestWordTokenizer(unittest.TestCase):

    '''An example unit test case.'''

    def setUp(self):
        self.tokenizer = WordTokenizer()
        self.text = "Python is a high-level programming language."

    def tearDown(self):
        pass

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text),
            ['Python', 'is', 'a', 'high-level', 'programming', 'language', '.'])

    def test_exclude_punc(self):
        assert_equal(self.tokenizer.tokenize(self.text, include_punc=False),
            ['Python', 'is', 'a', 'high-level', 'programming', 'language'])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Python")
        assert_equal(next(gen), "is")

    def test_word_tokenize(self):
        tokens = word_tokenize(self.text)
        assert_true(is_generator(tokens))
        assert_equal(list(tokens), self.tokenizer.tokenize(self.text))

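# Minimal usage sketch for WordTokenizer and word_tokenize as exercised by
# the tests above. The import path follows textblob's module layout and is
# an assumption; adjust it to match your install.
from textblob.tokenizers import WordTokenizer, word_tokenize

tokenizer = WordTokenizer()
text = "Python is a high-level programming language."
tokenizer.tokenize(text)
# ['Python', 'is', 'a', 'high-level', 'programming', 'language', '.']
tokenizer.tokenize(text, include_punc=False)  # drops the trailing '.'
gen = word_tokenize(text)  # a generator, per test_word_tokenize above
next(gen)  # 'Python'
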
def words(self):
    '''Return a list of word tokens. This excludes punctuation characters.
    If you want to include punctuation characters, access the ``tokens``
    property.
    '''
    # NLTK's word tokenizer expects sentences as input, so tokenize the
    # blob into sentences before tokenizing to words
    tok = WordTokenizer()
    words = chain.from_iterable(
        tok.itokenize(sent.raw, include_punc=False)
        for sent in self.sentences)
    return WordList(words)

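# Hedged sketch of the ``words`` property in use, assuming it is defined on
# ``TextBlob`` as in the textblob library.
from textblob import TextBlob

blob = TextBlob("Python is a high-level programming language. It is popular.")
blob.words   # WordList of tokens across all sentences, punctuation excluded
blob.tokens  # the ``tokens`` property keeps punctuation such as '.'
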
def contains_extractor(document):
    '''A basic document feature extractor that returns a dict of words
    that the document contains.
    '''
    tokenizer = WordTokenizer()
    if isinstance(document, basestring):
        tokens = set([strip_punc(w, all=False)
                      for w in tokenizer.itokenize(document, include_punc=False)])
    else:
        tokens = set((strip_punc(w, all=False) for w in document))
    features = dict((u'contains({0})'.format(w), True) for w in tokens)
    return features

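# Illustration of ``contains_extractor``. Only words present in the document
# produce a feature; absent words are omitted entirely, unlike
# ``basic_extractor`` below, which emits an explicit True/False for every
# word in the training vocabulary.
features = contains_extractor("Python is a high-level programming language.")
# {u'contains(Python)': True, u'contains(is)': True, ...}
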
def analyze(self, text):
    """Return the sentiment as a named tuple of the form:
    ``Sentiment(classification, p_pos, p_neg)``
    """
    # Lazily train the classifier
    super(NaiveBayesAnalyzer, self).analyze(text)
    tokenizer = WordTokenizer()
    tokens = tokenizer.itokenize(text, include_punc=False)
    filtered = (t.lower() for t in tokens if len(t) >= 3)
    feats = self._extract_feats(filtered)
    prob_dist = self._classifier.prob_classify(feats)
    return self.RETURN_TYPE(
        classification=prob_dist.max(),
        p_pos=prob_dist.prob("pos"),
        p_neg=prob_dist.prob("neg")
    )

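# Hedged usage sketch for ``analyze``, assuming it is the method of
# ``textblob.sentiments.NaiveBayesAnalyzer`` (the import path is an
# assumption). The return shape matches the docstring above.
from textblob.sentiments import NaiveBayesAnalyzer

analyzer = NaiveBayesAnalyzer()
result = analyzer.analyze("I love this library!")
result.classification       # 'pos' or 'neg'
result.p_pos, result.p_neg  # class probabilities from the classifier
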
def _get_words_from_dataset(dataset):
    '''Return a set of all words in a dataset.

    :param dataset: A list of tuples of the form ``(words, label)`` where
        ``words`` is either a string or a list of tokens.
    '''
    tokenizer = WordTokenizer()
    all_words = []
    for words, classification in dataset:
        # Words may either be a string or an iterable
        if isinstance(words, basestring):
            all_words.extend(tokenizer.itokenize(words, include_punc=False))
        else:
            all_words.extend(words)
    return set(all_words)

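# Toy illustration of ``_get_words_from_dataset``; the data is made up.
# String documents are tokenized (punctuation excluded), while pre-tokenized
# lists are extended as-is.
toy_train = [
    ("I love this library", 'pos'),
    (["terrible", "documentation"], 'neg'),  # already tokenized
]
vocab = _get_words_from_dataset(toy_train)
# a set containing every distinct token across both documents
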
def basic_extractor(document, train_set):
    '''A basic document feature extractor that returns a dict indicating
    what words in ``train_set`` are contained in ``document``.

    :param document: The text to extract features from. Can be a string or
        an iterable.
    :param train_set: Training data set, a list of tuples of the form
        ``(words, label)``.
    '''
    tokenizer = WordTokenizer()
    word_features = _get_words_from_dataset(train_set)
    if isinstance(document, basestring):
        tokens = set((strip_punc(w, all=False)
                      for w in tokenizer.itokenize(document, include_punc=False)))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    features = dict(((u'contains({0})'.format(word), (word in tokens))
                     for word in word_features))
    return features

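# Sketch pairing ``basic_extractor`` with a small training set (illustrative
# data). Every word from the training vocabulary appears in the feature dict,
# mapped to whether the document contains it.
toy_train = [("I love this library", 'pos'), ("I hate bugs", 'neg')]
feats = basic_extractor("I love Python", toy_train)
# {u'contains(love)': True, u'contains(hate)': False, ...}
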
def test_get_words_from_dataset():
    tok = WordTokenizer()
    all_words = []
    for words, _ in train_set:
        all_words.extend(tok.itokenize(words, include_punc=False))
    assert_equal(_get_words_from_dataset(train_set), set(all_words))