from itertools import chain
from textblob.tokenizers import word_tokenize

def _get_words_from_dataset(dataset):
    """Return the set of all words in ``dataset``, a list of ``(words, label)`` tuples."""
    def tokenize(words):
        if isinstance(words, basestring):
            return word_tokenize(words, include_punc=False)
        else:
            return words
    all_words = chain.from_iterable(tokenize(words) for words, _ in dataset)
    return set(all_words)
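
A quick usage sketch for the helper above; the toy training data is made up for illustration:

train = [("I love this sandwich", "pos"),
         ("This is an amazing place", "pos"),
         ("I do not like this restaurant", "neg")]
vocabulary = _get_words_from_dataset(train)
# vocabulary is a set of word tokens, e.g. {'I', 'love', 'sandwich', ...}
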
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize  # assuming NLTK's tokenizer, consistent with the stopwords usage

def clean_tweets(tweet):
    stop_words = set(stopwords.words('english'))
    stop_words.add('\\n')

    # after tweepy preprocessing, a stray colon is left behind where mentions were removed
    tweet = re.sub(r':', '', tweet)
    tweet = re.sub(r'…', '', tweet)
    # replace consecutive non-ASCII characters with a space
    tweet = re.sub(r'[^\x00-\x7F]+', ' ', tweet)
    # remove emojis (emoji_pattern is a compiled regex defined elsewhere in the project)
    tweet = emoji_pattern.sub(r'', tweet)
    # remove URLs at the start of a line
    tweet = re.sub(r'^https?:\/\/.*[\r\n]*', '', tweet, flags=re.MULTILINE)

    # tokenize *after* cleaning so the substitutions above actually affect the tokens,
    # then drop NLTK stop words
    word_tokens = word_tokenize(tweet)
    filtered_tweet = [w for w in word_tokens if w not in stop_words]

    # a stricter variant would also drop emoticons and punctuation:
    #     if w not in stop_words and w not in emoticons and w not in string.punctuation
    return filtered_tweet
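
A minimal usage sketch; the ``emoji_pattern`` below is only a stand-in for whichever compiled pattern the project actually defines:

emoji_pattern = re.compile(u'[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF]+', flags=re.UNICODE)  # stand-in pattern

sample = u"Finally finished the NLP assignment: so happy 😀"
print(clean_tweets(sample))
# -> a token list with stop words, the stray colon and the emoji stripped out
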
Example No. 3
def _get_document_tokens(document):
    if isinstance(document, basestring):
        tokens = set((strip_punc(w, all=False)
                      for w in word_tokenize(document, include_punc=False)))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    return tokens
Example No. 5
    def words(self):
        """Return a list of word tokens. This excludes punctuation characters.
        If you want to include punctuation characters, access the ``tokens``
        property.

        :returns: A :class:`WordList <WordList>` of word tokens.
        """
        return WordList(word_tokenize(self.raw, include_punc=False))
Example No. 7
def contains_extractor(document):
    '''A basic document feature extractor that returns a dict of words that
    the document contains.
    '''
    if isinstance(document, basestring):
        tokens = set([strip_punc(w, all=False)
                    for w in word_tokenize(document, include_punc=False)])
    else:
        tokens = set((strip_punc(w, all=False) for w in document))
    features = dict((u'contains({0})'.format(w), True) for w in tokens)
    return features
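
This matches the feature extractor shipped with TextBlob's classifiers module; a call looks roughly like:

contains_extractor("The quick brown fox")
# {u'contains(The)': True, u'contains(quick)': True,
#  u'contains(brown)': True, u'contains(fox)': True}
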
def _get_document_tokens(document):
    if isinstance(document, basestring):
        tokens = set((strip_punc(w, all=False)
                      for w in word_tokenize(document, include_punc=False)))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    return tokens


def basic_extractor(document, train_set):
    word_features = _get_words_from_dataset(train_set)
    tokens = _get_document_tokens(document)
    features = dict(((u'contains({0})'.format(word), (word in tokens)) for word in word_features))
    return features
Example No. 9
def contains_extractor(document):
    '''A basic document feature extractor that returns a dict of words that
    the document contains.
    '''
    if isinstance(document, basestring):
        tokens = set([
            strip_punc(w, all=False)
            for w in word_tokenize(document, include_punc=False)
        ])
    else:
        tokens = set((strip_punc(w, all=False) for w in document))
    features = dict((u'contains({0})'.format(w), True) for w in tokens)
    return features
 def analyze(self, text):
     """Return the sentiment as a named tuple of the form:
     ``Sentiment(classification, p_pos, p_neg)``
     """
     # Lazily train the classifier
     super(NaiveBayesAnalyzer, self).analyze(text)
     tokens = word_tokenize(text, include_punc=False)
     filtered = (t.lower() for t in tokens if len(t) >= 3)
     feats = self.feature_extractor(filtered)
     prob_dist = self._classifier.prob_classify(feats)
     return self.RETURN_TYPE(classification=prob_dist.max(),
                             p_pos=prob_dist.prob('pos'),
                             p_neg=prob_dist.prob("neg"))
Example No. 12
def basic_extractor(document, train_set):
    '''A basic document feature extractor that returns a dict indicating
    what words in ``train_set`` are contained in ``document``.

    :param document: The text to extract features from. Can be a string or an iterable.
    :param train_set: Training data set, a list of tuples of the form
        ``(words, label)``.
    '''
    word_features = _get_words_from_dataset(train_set)
    if isinstance(document, basestring):
        tokens = set((strip_punc(w, all=False)
                      for w in word_tokenize(document, include_punc=False)))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    features = dict(((u'contains({0})'.format(word), (word in tokens))
                     for word in word_features))
    return features
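
A small usage sketch with made-up training data (this mirrors the extractor in textblob.classifiers):

train = [("I love this sandwich", "pos"),
         ("I do not like this restaurant", "neg")]
basic_extractor("I love this restaurant", train)
# {u'contains(love)': True, u'contains(restaurant)': True,
#  u'contains(sandwich)': False, ...}
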
 def classify(self, intweet):
     """
     intweet : the content of the tweet as a string
     returns : "pos", "neg", or "neutral" when the two class probabilities are close
     """
     tweet = preprocess_tweet(intweet)
     tokens = word_tokenize(tweet, include_punc=False)
     filtered = (t.lower() for t in tokens if len(t) >= 3)
     feats = feature_extractor(filtered)
     prob_dist = self.classifier.prob_classify(feats)
     print "For text: %s" % tweet.encode('utf-8')
     print prob_dist.prob('pos'),
     print prob_dist.prob('neg'),
     # treat a near-tie between the two class probabilities as neutral
     if abs(prob_dist.prob('pos') - prob_dist.prob('neg')) < 0.25:
         print 'neutral'
         print ""
         return 'neutral'
     print prob_dist.max()
     print ""
     return prob_dist.max()
Example No. 15
 def tokenize(words):
     if isinstance(words, basestring):
         return word_tokenize(words, include_punc=False)
     else:
         return words
Example No. 16
 def __tokenize_text(self, text):
     self.__tokenized_text = list(tt.word_tokenize(text))
Example No. 17
 def test_word_tokenize(self):
     tokens = word_tokenize(self.text)
     assert_true(is_generator(tokens))
     assert_equal(list(tokens), self.tokenizer.tokenize(self.text))
Example No. 18
 def tokenize(words):
     if isinstance(words, basestring):
         return word_tokenize(words, include_punc=False)
     else:
         return (w for w in words)
Example No. 19
 def __tokenizeText(self, text):
     ### tt.word_tokenize(text) returns a generator, which is exhausted after a single
     ### pass, so materialize it into a list here
     self.__tokenizedText = list(tt.word_tokenize(text))
Example No. 21
 def words(self):
     '''Return a list of word tokens. This excludes punctuation characters.
     If you want to include punctuation characters, access the ``tokens``
     property.
     '''
     return WordList(word_tokenize(self.raw, include_punc=False))
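
This property matches TextBlob's ``BaseBlob.words``; typical usage:

from textblob import TextBlob

TextBlob("Beautiful is better than ugly!").words
# WordList(['Beautiful', 'is', 'better', 'than', 'ugly'])
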
Example No. 22
from queneau import WordAssembler
import nltk
from nltk.tokenize import WordPunctTokenizer
from textblob.tokenizers import word_tokenize

poem = 'data/poems/book01.txt'

with open(poem,'rb') as f:
    raw = f.read()

### tokenizer = WordPunctTokenizer()
### tokens = tokenizer.tokenize(raw)
### tokens = nltk.word_tokenize(raw)
### text = nltk.Text(tokens)

tokens = list(word_tokenize(raw))

words = [w.lower() for w in tokens]

vocab = sorted(set(words))

vocab = vocab[15:] # cut out the punctuation

corpus = WordAssembler(vocab)

for i in range(1000):
    print corpus.assemble_word(min_length=5) 

Example No. 23
 def tag(self, text, tokenize=True):
     '''Tag a string `text`.'''
     if tokenize:
         text = list(word_tokenize(text))
     tagged = nltk.tag.pos_tag(text)
     return tagged
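
Assuming this method belongs to an NLTK-backed tagger class (as in older TextBlob versions), a hypothetical call looks like:

tagger = NLTKTagger()  # hypothetical: whatever class defines the tag() method above
tagger.tag("The quick brown fox jumps over the lazy dog")
# [('The', 'DT'), ('quick', 'JJ'), ...]
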