Example #1
import unittest

from nose.tools import assert_equal  # assumed import, matching TextBlob's nose-based tests
from textblob.tokenizers import WordTokenizer


class TestWordTokenizer(unittest.TestCase):
    '''An example unit test case.'''

    def setUp(self):
        self.tokenizer = WordTokenizer()
        self.text = "Python is a high-level programming language."

    def tearDown(self):
        pass

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text),
            ['Python', 'is', 'a', 'high-level', 'programming',
            'language', '.'])

    def test_exclude_punc(self):
        assert_equal(self.tokenizer.tokenize(self.text, include_punc=False),
            ['Python', 'is', 'a', 'high-level', 'programming',
            'language'])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Python")
        assert_equal(next(gen), "is")
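
To run this case directly, the standard unittest entry point works (a minimal sketch, assuming the imports added above):

if __name__ == '__main__':
    unittest.main()
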
Example #2
from textblob.compat import basestring  # assumed: py2/py3 string-type shim
from textblob.tokenizers import WordTokenizer
from textblob.utils import lowerstrip


def contains_extractor(document):
    '''A basic document feature extractor that returns a dict of words that
    the document contains.
    '''
    tokenizer = WordTokenizer()
    if isinstance(document, basestring):
        tokens = set([w.lower()
                    for w in tokenizer.itokenize(document, include_punc=False)])
    else:
        tokens = set((lowerstrip(w, all=False) for w in document))
    features = dict((u'contains({0})'.format(w), True) for w in tokens)
    return features
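
A minimal usage sketch with hypothetical input; the expected keys assume WordTokenizer's default splitting and the lowercasing above:

features = contains_extractor("Python is great")
# {u'contains(python)': True, u'contains(is)': True, u'contains(great)': True}
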
Example #3
# Method of NaiveBayesAnalyzer (textblob.en.sentiments).
def analyze(self, text):
    """Return the sentiment as a tuple of the form:
    ``(classification, pos_probability, neg_probability)``
    """
    # Lazily train the classifier
    super(NaiveBayesAnalyzer, self).analyze(text)
    tokenizer = WordTokenizer()
    tokens = tokenizer.tokenize(text, include_punc=False)
    filtered = [t.lower() for t in tokens if len(t) >= 3]
    feats = self._extract_feats(filtered)
    prob_dist = self._classifier.prob_classify(feats)
    # classification, p_pos, p_neg
    return prob_dist.max(), prob_dist.prob('pos'), prob_dist.prob('neg')
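
A usage sketch, assuming this method belongs to textblob.en.sentiments.NaiveBayesAnalyzer; the first call triggers the lazy training step, so it can take a while:

from textblob.en.sentiments import NaiveBayesAnalyzer

analyzer = NaiveBayesAnalyzer()
# First call trains the underlying classifier (slow).
classification, p_pos, p_neg = analyzer.analyze("I love this library")
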
Example #4
from textblob.compat import basestring  # assumed: py2/py3 string-type shim
from textblob.tokenizers import WordTokenizer


def _get_words_from_dataset(dataset):
    '''Return a set of all words in a dataset.

    :param dataset: A list of tuples of the form ``(words, label)`` where
        ``words`` is either a string or a list of tokens.
    '''
    tokenizer = WordTokenizer()
    all_words = []
    for words, classification in dataset:
        # Words may either be a string or an iterable
        if isinstance(words, basestring):
            all_words.extend(tokenizer.itokenize(words, include_punc=False))
        else:
            all_words.extend(words)
    return set(all_words)
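
A quick sketch of the two input shapes the helper accepts (hypothetical data; a string is tokenized, a list is used as-is):

train = [("I love this", 'pos'), (['hate', 'it'], 'neg')]
_get_words_from_dataset(train)
# {'I', 'love', 'this', 'hate', 'it'}
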
Example #5
# Same fixture as Example #1; assumes the same imports.
class TestWordTokenizer(unittest.TestCase):
    '''An example unit test case.'''
    def setUp(self):
        self.tokenizer = WordTokenizer()
        self.text = "Python is a high-level programming language."

    def tearDown(self):
        pass

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text), [
            'Python', 'is', 'a', 'high-level', 'programming', 'language', '.'
        ])

    def test_exclude_punc(self):
        assert_equal(
            self.tokenizer.tokenize(self.text, include_punc=False),
            ['Python', 'is', 'a', 'high-level', 'programming', 'language'])
Example #6
# Method of BaseBlob (textblob.blob). Assumes ``pystring`` is the stdlib
# ``string`` module, i.e. ``import string as pystring``.
def correct(self):
    '''Attempt to correct the spelling of a blob.

    .. versionadded:: 0.6.0

    :rtype: BaseBlob
    '''
    tok = WordTokenizer()
    corrected = (Word(w).correct() for w in tok.tokenize(self.raw, include_punc=True))
    # Separate each token with a space unless the token is punctuation
    ret = ''
    for i, word in enumerate(corrected):
        # Avoid an extra space at the beginning
        if word in pystring.punctuation or i == 0:
            ret = ''.join([ret, word])
        else:
            ret = ' '.join([ret, word])
    return self.__class__(ret)
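
Through TextBlob's public API the same logic is reachable as TextBlob.correct(); this example is from the library's quickstart:

from textblob import TextBlob

print(TextBlob("I havv goood speling!").correct())
# I have good spelling!
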
Example #7
from textblob.compat import basestring  # assumed: py2/py3 string-type shim
from textblob.tokenizers import WordTokenizer
from textblob.utils import lowerstrip

# ``_get_words_from_dataset`` is the helper shown in Example #4.


def basic_extractor(document, train_set):
    '''A basic document feature extractor that returns a dict indicating
    which words in ``train_set`` are contained in ``document``.

    :param document: The text to extract features from. Can be a string or an iterable.
    :param train_set: Training data set, a list of tuples of the form
        ``(words, label)``.
    '''
    tokenizer = WordTokenizer()
    word_features = _get_words_from_dataset(train_set)
    if isinstance(document, basestring):
        tokens = set([w.lower()
                    for w in tokenizer.itokenize(document, include_punc=False)])
    else:
        tokens = set((lowerstrip(w, all=False) for w in document))
    features = dict([(u'contains({0})'.format(word), (word in tokens))
                     for word in word_features])
    return features
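
A usage sketch with hypothetical training data (key order in the resulting dict is arbitrary):

train = [("I love this", 'pos'), ("I hate this", 'neg')]
basic_extractor("love it", train)
# {u'contains(I)': False, u'contains(love)': True,
#  u'contains(this)': False, u'contains(hate)': False}
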
Example #8
# setUp fixture extracted from the TestWordTokenizer case in Example #1.
def setUp(self):
    self.tokenizer = WordTokenizer()
    self.text = "Python is a high-level programming language."
Example #9
# Test method from a TextBlob test case; assumes ``tb`` is the textblob
# module (``import textblob as tb``).
def test_tokens_property(self):
    assert_equal(self.blob.tokens,
                 tb.WordList(WordTokenizer().tokenize(self.text)))