Example #1
import unittest

from nose.tools import assert_equal  # assumed import, matching TextBlob's nose-based tests
from textblob.tokenizers import WordTokenizer


class TestWordTokenizer(unittest.TestCase):
    '''An example unit test case.'''

    def setUp(self):
        self.tokenizer = WordTokenizer()
        self.text = "Python is a high-level programming language."

    def tearDown(self):
        pass

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text),
            ['Python', 'is', 'a', 'high-level', 'programming',
            'language', '.'])

    def test_exclude_punc(self):
        assert_equal(self.tokenizer.tokenize(self.text, include_punc=False),
            ['Python', 'is', 'a', 'high-level', 'programming',
            'language'])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Python")
        assert_equal(next(gen), "is")
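
To run this case directly, the standard unittest entry point works (a minimal sketch, assuming the imports added above):

if __name__ == '__main__':
    unittest.main()
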
Example #2
from textblob.compat import basestring  # assumed: py2/py3 string-type shim
from textblob.tokenizers import WordTokenizer
from textblob.utils import lowerstrip


def contains_extractor(document):
    '''A basic document feature extractor that returns a dict of words that
    the document contains.
    '''
    tokenizer = WordTokenizer()
    if isinstance(document, basestring):
        tokens = set([w.lower()
                    for w in tokenizer.itokenize(document, include_punc=False)])
    else:
        tokens = set((lowerstrip(w, all=False) for w in document))
    features = dict((u'contains({0})'.format(w), True) for w in tokens)
    return features
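
A minimal usage sketch with hypothetical input; the expected keys assume WordTokenizer's default splitting and the lowercasing above:

features = contains_extractor("Python is great")
# {u'contains(python)': True, u'contains(is)': True, u'contains(great)': True}
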
Example #3
# Method of NaiveBayesAnalyzer (textblob.en.sentiments).
def analyze(self, text):
    """Return the sentiment as a tuple of the form:
    ``(classification, pos_probability, neg_probability)``
    """
    # Lazily train the classifier
    super(NaiveBayesAnalyzer, self).analyze(text)
    tokenizer = WordTokenizer()
    tokens = tokenizer.tokenize(text, include_punc=False)
    filtered = [t.lower() for t in tokens if len(t) >= 3]
    feats = self._extract_feats(filtered)
    prob_dist = self._classifier.prob_classify(feats)
    # classification, p_pos, p_neg
    return prob_dist.max(), prob_dist.prob('pos'), prob_dist.prob('neg')
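
A usage sketch, assuming this method belongs to textblob.en.sentiments.NaiveBayesAnalyzer; the first call triggers the lazy training step, so it can take a while:

from textblob.en.sentiments import NaiveBayesAnalyzer

analyzer = NaiveBayesAnalyzer()
# First call trains the underlying classifier (slow).
classification, p_pos, p_neg = analyzer.analyze("I love this library")
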
Example #4
from textblob.compat import basestring  # assumed: py2/py3 string-type shim
from textblob.tokenizers import WordTokenizer


def _get_words_from_dataset(dataset):
    '''Return a set of all words in a dataset.

    :param dataset: A list of tuples of the form ``(words, label)`` where
        ``words`` is either a string or a list of tokens.
    '''
    tokenizer = WordTokenizer()
    all_words = []
    for words, classification in dataset:
        # Words may either be a string or an iterable
        if isinstance(words, basestring):
            all_words.extend(tokenizer.itokenize(words, include_punc=False))
        else:
            all_words.extend(words)
    return set(all_words)
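
A quick sketch of the two input shapes the helper accepts (hypothetical data; a string is tokenized, a list is used as-is):

train = [("I love this", 'pos'), (['hate', 'it'], 'neg')]
_get_words_from_dataset(train)
# {'I', 'love', 'this', 'hate', 'it'}
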
Example #5
# Same fixture as Example #1; assumes the same imports.
class TestWordTokenizer(unittest.TestCase):
    '''An example unit test case.'''
    def setUp(self):
        self.tokenizer = WordTokenizer()
        self.text = "Python is a high-level programming language."

    def tearDown(self):
        pass

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text), [
            'Python', 'is', 'a', 'high-level', 'programming', 'language', '.'
        ])

    def test_exclude_punc(self):
        assert_equal(
            self.tokenizer.tokenize(self.text, include_punc=False),
            ['Python', 'is', 'a', 'high-level', 'programming', 'language'])
Example #6
# Method of BaseBlob (textblob.blob). Assumes ``pystring`` is the stdlib
# ``string`` module, i.e. ``import string as pystring``.
def correct(self):
    '''Attempt to correct the spelling of a blob.

    .. versionadded:: 0.6.0

    :rtype: BaseBlob
    '''
    tok = WordTokenizer()
    corrected = (Word(w).correct() for w in tok.tokenize(self.raw, include_punc=True))
    # Separate each token with a space unless the token is punctuation
    ret = ''
    for i, word in enumerate(corrected):
        # Avoid an extra space at the beginning
        if word in pystring.punctuation or i == 0:
            ret = ''.join([ret, word])
        else:
            ret = ' '.join([ret, word])
    return self.__class__(ret)
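
Through TextBlob's public API the same logic is reachable as TextBlob.correct(); this example is from the library's quickstart:

from textblob import TextBlob

print(TextBlob("I havv goood speling!").correct())
# I have good spelling!
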
Example #7
from textblob.compat import basestring  # assumed: py2/py3 string-type shim
from textblob.tokenizers import WordTokenizer
from textblob.utils import lowerstrip

# ``_get_words_from_dataset`` is the helper shown in Example #4.


def basic_extractor(document, train_set):
    '''A basic document feature extractor that returns a dict indicating
    which words in ``train_set`` are contained in ``document``.

    :param document: The text to extract features from. Can be a string or an iterable.
    :param train_set: Training data set, a list of tuples of the form
        ``(words, label)``.
    '''
    tokenizer = WordTokenizer()
    word_features = _get_words_from_dataset(train_set)
    if isinstance(document, basestring):
        tokens = set([w.lower()
                    for w in tokenizer.itokenize(document, include_punc=False)])
    else:
        tokens = set((lowerstrip(w, all=False) for w in document))
    features = dict([(u'contains({0})'.format(word), (word in tokens))
                     for word in word_features])
    return features
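
A usage sketch with hypothetical training data (key order in the resulting dict is arbitrary):

train = [("I love this", 'pos'), ("I hate this", 'neg')]
basic_extractor("love it", train)
# {u'contains(I)': False, u'contains(love)': True,
#  u'contains(this)': False, u'contains(hate)': False}
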
Example #8
# setUp fixture extracted from the TestWordTokenizer case in Example #1.
def setUp(self):
    self.tokenizer = WordTokenizer()
    self.text = "Python is a high-level programming language."
Example #9
# Test method from a TextBlob test case; assumes ``tb`` is the textblob
# module (``import textblob as tb``).
def test_tokens_property(self):
    assert_equal(self.blob.tokens,
                 tb.WordList(WordTokenizer().tokenize(self.text)))