from itertools import chain
from textblob.tokenizers import word_tokenize

def _get_words_from_dataset(dataset):
    """Return the set of all word tokens in ``dataset``, a list of
    ``(words, label)`` pairs where ``words`` is a string or a list of tokens."""
    def tokenize(words):
        if isinstance(words, basestring):
            return word_tokenize(words, include_punc=False)
        else:
            return words
    all_words = chain.from_iterable(tokenize(words) for words, _ in dataset)
    return set(all_words)
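A quick hedged sketch of how this helper might be called; the toy training pairs below are made up for illustration, and the snippet targets Python 2 (it relies on basestring):

# Hypothetical (words, label) training pairs; strings and token lists both work.
train = [
    ('I love this sandwich.', 'pos'),
    (['not', 'a', 'fan', 'of', 'this', 'place'], 'neg'),
]
print(_get_words_from_dataset(train))
# prints the set of distinct word tokens across both training documents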
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def clean_tweets(tweet):
    stop_words = set(stopwords.words('english'))
    stop_words.add('\n')
    # after tweepy preprocessing, a stray colon is left behind once mentions are removed
    tweet = re.sub(r':', '', tweet)
    tweet = re.sub(r'…', '', tweet)
    # replace consecutive non-ASCII characters with a space
    tweet = re.sub(r'[^\x00-\x7F]+', ' ', tweet)
    # remove emojis (emoji_pattern is a precompiled regex defined elsewhere in the script)
    tweet = emoji_pattern.sub(r'', tweet)
    # remove URLs at the start of a line
    tweet = re.sub(r'^https?:\/\/.*[\r\n]*', '', tweet, flags=re.MULTILINE)
    # tokenize the cleaned text, then filter out stop words
    # (could also be extended to drop emoticon and punctuation tokens)
    word_tokens = word_tokenize(tweet)
    filtered_tweet = [w for w in word_tokens if w not in stop_words]
    return filtered_tweet
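A minimal usage sketch for clean_tweets; the sample tweet and the small emoji_pattern below are illustrative stand-ins (real scripts usually compile a much fuller emoji range):

import re

# Illustrative stand-in for the script's own emoji regex.
emoji_pattern = re.compile(u'[\U0001F600-\U0001F64F]+', flags=re.UNICODE)

sample = "RT : loving the new update 😀 so much better than before\nhttps://example.com/changelog"
print(clean_tweets(sample))
# prints the remaining word tokens with stop words, emojis and the URL line removed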
Example #3
def _get_document_tokens(document):
    if isinstance(document, basestring):
        tokens = set((strip_punc(w, all=False)
                      for w in word_tokenize(document, include_punc=False)))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    return tokens
Example #4
def _get_document_tokens(document):
    if isinstance(document, basestring):
        tokens = set((strip_punc(w, all=False)
                      for w in word_tokenize(document, include_punc=False)))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    return tokens
Example #5
File: blob.py  Project: sloria/TextBlob
    def words(self):
        """Return a list of word tokens. This excludes punctuation characters.
        If you want to include punctuation characters, access the ``tokens``
        property.

        :returns: A :class:`WordList <WordList>` of word tokens.
        """
        return WordList(word_tokenize(self.raw, include_punc=False))
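For reference, this property is what backs TextBlob's public ``words`` attribute, so the usual way to reach it is through a TextBlob instance:

from textblob import TextBlob

blob = TextBlob("Simple is better than complex. Really!")
print(blob.words)   # word tokens only, punctuation excluded
print(blob.tokens)  # tokens including the punctuation characters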
Example #6
    def words(self):
        """Return a list of word tokens. This excludes punctuation characters.
        If you want to include punctuation characters, access the ``tokens``
        property.

        :returns: A :class:`WordList <WordList>` of word tokens.
        """
        return WordList(word_tokenize(self.raw, include_punc=False))
Example #7
def contains_extractor(document):
    '''A basic document feature extractor that returns a dict of words that
    the document contains.
    '''
    if isinstance(document, basestring):
        tokens = set([strip_punc(w, all=False)
                    for w in word_tokenize(document, include_punc=False)])
    else:
        tokens = set((strip_punc(w, all=False) for w in document))
    features = dict((u'contains({0})'.format(w), True) for w in tokens)
    return features
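A small hedged example of calling contains_extractor; it assumes strip_punc and word_tokenize are imported as in the original module (and, as written, Python 2's basestring):

features = contains_extractor("The quick brown fox")
# -> {u'contains(The)': True, u'contains(quick)': True,
#     u'contains(brown)': True, u'contains(fox)': True}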
Example #8
def _get_document_tokens(document):
    if isinstance(document, basestring):
        tokens = set((strip_punc(w, all=False)
                      for w in word_tokenize(document, include_punc=False)))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    return tokens


def basic_extractor(document, train_set):
    word_features = _get_words_from_dataset(train_set)
    tokens = _get_document_tokens(document)
    features = dict(((u'contains({0})'.format(word), (word in tokens))
                     for word in word_features))
    return features
Example #9
def contains_extractor(document):
    '''A basic document feature extractor that returns a dict of words that
    the document contains.
    '''
    if isinstance(document, basestring):
        tokens = set([
            strip_punc(w, all=False)
            for w in word_tokenize(document, include_punc=False)
        ])
    else:
        tokens = set((strip_punc(w, all=False) for w in document))
    features = dict((u'contains({0})'.format(w), True) for w in tokens)
    return features
Example #10
 def analyze(self, text):
     """Return the sentiment as a named tuple of the form:
     ``Sentiment(classification, p_pos, p_neg)``
     """
     # Lazily train the classifier
     super(NaiveBayesAnalyzer, self).analyze(text)
     tokens = word_tokenize(text, include_punc=False)
     filtered = (t.lower() for t in tokens if len(t) >= 3)
     feats = self.feature_extractor(filtered)
     prob_dist = self._classifier.prob_classify(feats)
     return self.RETURN_TYPE(classification=prob_dist.max(),
                             p_pos=prob_dist.prob('pos'),
                             p_neg=prob_dist.prob("neg"))
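This method belongs to TextBlob's NaiveBayesAnalyzer; a short usage example (the first call is slow because the movie-reviews classifier is trained lazily, and the NLTK corpora must already be downloaded):

from textblob.sentiments import NaiveBayesAnalyzer

analyzer = NaiveBayesAnalyzer()
result = analyzer.analyze("What a wonderful, well-acted film")
print(result.classification, result.p_pos, result.p_neg)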
Example #11
 def analyze(self, text):
     """Return the sentiment as a named tuple of the form:
     ``Sentiment(classification, p_pos, p_neg)``
     """
     # Lazily train the classifier
     super(NaiveBayesAnalyzer, self).analyze(text)
     tokens = word_tokenize(text, include_punc=False)
     filtered = (t.lower() for t in tokens if len(t) >= 3)
     feats = self.feature_extractor(filtered)
     prob_dist = self._classifier.prob_classify(feats)
     return self.RETURN_TYPE(
         classification=prob_dist.max(),
         p_pos=prob_dist.prob('pos'),
         p_neg=prob_dist.prob("neg")
     )
Example #12
def basic_extractor(document, train_set):
    '''A basic document feature extractor that returns a dict indicating
    what words in ``train_set`` are contained in ``document``.

    :param document: The text to extract features from. Can be a string or an iterable.
    :param train_set: Training data set, a list of tuples of the form
        ``(words, label)``.
    '''
    word_features = _get_words_from_dataset(train_set)
    if isinstance(document, basestring):
        tokens = set((strip_punc(w, all=False)
                      for w in word_tokenize(document, include_punc=False)))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    features = dict(((u'contains({0})'.format(word), (word in tokens))
                     for word in word_features))
    return features
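A hedged sketch of calling basic_extractor with a toy training set; it relies on _get_words_from_dataset from the first snippet (and on Python 2's basestring):

train = [
    ('I love this sandwich.', 'pos'),
    ('I do not like this restaurant', 'neg'),
]
features = basic_extractor('I love this place', train)
# e.g. features[u'contains(love)'] is True, features[u'contains(restaurant)'] is False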
Example #13
def basic_extractor(document, train_set):
    '''A basic document feature extractor that returns a dict indicating
    what words in ``train_set`` are contained in ``document``.

    :param document: The text to extract features from. Can be a string or an iterable.
    :param train_set: Training data set, a list of tuples of the form
        ``(words, label)``.
    '''
    word_features = _get_words_from_dataset(train_set)
    if isinstance(document, basestring):
        tokens = set((strip_punc(w, all=False)
                      for w in word_tokenize(document, include_punc=False)))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    features = dict(((u'contains({0})'.format(word), (word in tokens))
                     for word in word_features))
    return features
Example #14
 def classify(self, intweet):
     """
     intweet : the content of the tweet as a string
     returns : 'pos', 'neg', or 'neutral'
     """
     tweet = preprocess_tweet(intweet)
     tokens = word_tokenize(tweet, include_punc=False)
     filtered = (t.lower() for t in tokens if len(t) >= 3)
     feats = feature_extractor(filtered)
     prob_dist = self.classifier.prob_classify(feats)
     print("For text: %s" % tweet)
     print(prob_dist.prob('pos'), prob_dist.prob('neg'))
     # treat near-equal probabilities as a neutral result
     if abs(prob_dist.prob('pos') - prob_dist.prob('neg')) < 0.25:
         print('neutral')
         return 'neutral'
     print(prob_dist.max())
     return prob_dist.max()
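The neutral cut-off in classify is just a comparison of the two probabilities; a tiny self-contained illustration of that decision rule (the probabilities here are made up):

def decide(p_pos, p_neg, threshold=0.25):
    # mirrors the rule above: near-equal probabilities count as neutral
    if abs(p_pos - p_neg) < threshold:
        return 'neutral'
    return 'pos' if p_pos > p_neg else 'neg'

print(decide(0.58, 0.42))  # 'neutral' (difference 0.16 is below the 0.25 threshold)
print(decide(0.81, 0.19))  # 'pos'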
Example #15
 def tokenize(words):
     if isinstance(words, basestring):
         return word_tokenize(words, include_punc=False)
     else:
         return words
Example #16
 def __tokenize_text(self, text):
     self.__tokenized_text = list(tt.word_tokenize(text))
Example #17
 def test_word_tokenize(self):
     tokens = word_tokenize(self.text)
     assert_true(is_generator(tokens))
     assert_equal(list(tokens), self.tokenizer.tokenize(self.text))
Example #18
 def tokenize(words):
     if isinstance(words, basestring):
         return word_tokenize(words, include_punc=False)
     else:
         return (w for w in words)
Example #19
 def __tokenizeText(self, text):
     ### tt.word_tokenize(text) returns a generator, which is exhausted after a
     ### single pass, so materialize it into a list here
     self.__tokenizedText = list(tt.word_tokenize(text))
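The generator behaviour this comment describes is easy to see directly with textblob's word_tokenize:

from textblob.tokenizers import word_tokenize

tokens = word_tokenize("One pass only.")
print(list(tokens))  # ['One', 'pass', 'only', '.']
print(list(tokens))  # [] -- the generator was exhausted by the first list()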
Example #20
 def test_word_tokenize(self):
     tokens = word_tokenize(self.text)
     assert_true(is_generator(tokens))
     assert_equal(list(tokens), self.tokenizer.tokenize(self.text))
Example #21
 def words(self):
     '''Return a list of word tokens. This excludes punctuation characters.
     If you want to include punctuation characters, access the ``tokens``
     property.
     '''
     return WordList(word_tokenize(self.raw, include_punc=False))
Example #22
from queneau import WordAssembler
import nltk
from nltk.tokenize import WordPunctTokenizer
from textblob.tokenizers import word_tokenize

poem = 'data/poems/book01.txt'

with open(poem) as f:
    raw = f.read()

### alternative tokenizers that were tried:
### tokenizer = WordPunctTokenizer()
### tokens = tokenizer.tokenize(raw)
### tokens = nltk.word_tokenize(raw)
### text = nltk.Text(tokens)

tokens = list(word_tokenize(raw))

words = [w.lower() for w in tokens]

vocab = sorted(set(words))

vocab = vocab[15:]  # cut out the punctuation tokens, which sort first

corpus = WordAssembler(vocab)

for i in range(1000):
    print(corpus.assemble_word(min_length=5))

Example #23
File: taggers.py  Project: DCSGInterns/NLP
 def tag(self, text, tokenize=True):
     '''Tag a string `text`.'''
     if tokenize:
         text = list(word_tokenize(text))
     tagged = nltk.tag.pos_tag(text)
     return tagged
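The body of tag boils down to tokenizing the text and handing the tokens to NLTK's pos_tag; a standalone sketch of that pipeline (requires NLTK's tagger model to be downloaded):

import nltk
from textblob.tokenizers import word_tokenize

text = "The quick brown fox jumps over the lazy dog"
tokens = list(word_tokenize(text))
print(nltk.tag.pos_tag(tokens))
# e.g. [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ...]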
Example #24
File: blob.py  Project: Arttii/TextBlob
 def words(self):
     '''Return a list of word tokens. This excludes punctuation characters.
     If you want to include punctuation characters, access the ``tokens``
     property.
     '''
     return WordList(word_tokenize(self.raw, include_punc=False))
Example #25
 def tag(self, text, tokenize=True):
     '''Tag a string `text`.'''
     if tokenize:
         text = list(word_tokenize(text))
     tagged = nltk.tag.pos_tag(text)
     return tagged