Example #1
0
 def word_counts(self):
     '''Dictionary of word frequencies in this text.

     Keys are the ``lowerstrip``-normalized forms of ``self.words``;
     values are occurrence counts. Returned as a ``defaultdict(int)``,
     so missing words look up as 0.
     '''
     counts = defaultdict(int)
     for raw_word in self.words:
         counts[lowerstrip(raw_word)] += 1
     return counts
Example #2
0
def contains_extractor(document):
    '''A basic document feature extractor that returns a dict of words that
    the document contains.

    :param document: The text to extract features from. Can be a string or
        an iterable of already-tokenized words.
    :rtype: dict mapping ``u'contains(<word>)'`` to ``True``
    '''
    tokenizer = WordTokenizer()
    if isinstance(document, basestring):
        # Raw string: tokenize, lower-case, and drop punctuation tokens.
        tokens = {w.lower()
                  for w in tokenizer.itokenize(document, include_punc=False)}
    else:
        # Pre-tokenized input: normalize each token
        # (``all=False`` — partial strip; lowerstrip defined elsewhere).
        tokens = {lowerstrip(w, all=False) for w in document}
    # One feature per distinct token.
    return {u'contains({0})'.format(w): True for w in tokens}
Example #3
0
 def __init__(self, text, tokenizer=None,
              pos_tagger=None, np_extractor=None, analyzer=None,
              parser=None, classifier=None, clean_html=False):
     '''Initialize the blob from a string of text.

     :raises TypeError: if ``text`` is not a string.
     :raises NotImplementedError: if ``clean_html`` is passed (removed
         feature; use BeautifulSoup's ``get_text()`` instead).
     '''
     # ``text`` must be a (byte or unicode) string.
     if not isinstance(text, basestring):
         raise TypeError('The `text` argument passed to `__init__(text)` '
                         'must be a string, not {0}'.format(type(text)))
     # ``clean_html`` is no longer supported; fail loudly.
     if clean_html:
         raise NotImplementedError("clean_html has been deprecated. "
                                   "To remove HTML markup, use BeautifulSoup's "
                                   "get_text() function")
     self.raw = text
     self.string = text
     # Fully lower-cased/stripped copy of the raw text (``all=True``).
     self.stripped = lowerstrip(self.raw, all=True)
     _initialize_models(self, tokenizer, pos_tagger, np_extractor, analyzer,
                        parser, classifier)
Example #4
0
def basic_extractor(document, train_set):
    '''A basic document feature extractor that returns a dict indicating
    what words in ``train_set`` are contained in ``document``.

    :param document: The text to extract features from. Can be a string or an iterable.
    :param train_set: Training data set, a list of tuples of the form
        ``(words, label)``.
    :rtype: dict mapping ``u'contains(<word>)'`` to a bool
    '''
    tokenizer = WordTokenizer()
    word_features = _get_words_from_dataset(train_set)
    if isinstance(document, basestring):
        # Raw string: tokenize, lower-case, and drop punctuation tokens.
        tokens = {w.lower()
                  for w in tokenizer.itokenize(document, include_punc=False)}
    else:
        # Pre-tokenized input: normalize each token
        # (``all=False`` — partial strip; lowerstrip defined elsewhere).
        tokens = {lowerstrip(w, all=False) for w in document}
    # One boolean feature per word known from the training set.
    return {u'contains({0})'.format(word): word in tokens
            for word in word_features}
Example #5
0
 def test_lowerstrip(self):
     '''lowerstrip(self.text) yields the expected normalized string.'''
     expected = 'this has punctuation'
     assert_equal(lowerstrip(self.text), expected)
Example #6
0
 def test_lowerstrip(self):
     '''lowerstrip(self.text) yields the expected normalized string.'''
     result = lowerstrip(self.text)
     assert_equal(result, 'this. has. punctuation')