def _get_words_from_dataset(dataset):
    # ``dataset`` is a list of ``(words, label)`` tuples, where ``words`` may be
    # either a raw string or an already-tokenized list.
    def tokenize(words):
        if isinstance(words, basestring):
            return word_tokenize(words, include_punc=False)
        else:
            return words
    all_words = chain.from_iterable(tokenize(words) for words, _ in dataset)
    return set(all_words)
def clean_tweets(tweet):
    stop_words = set(stopwords.words('english'))
    stop_words.add('\n')
    # after tweepy preprocessing, a stray colon remains where mentions were removed
    tweet = re.sub(r':', '', tweet)
    tweet = re.sub(r'…', '', tweet)
    # replace consecutive non-ASCII characters with a space
    tweet = re.sub(r'[^\x00-\x7F]+', ' ', tweet)
    # remove emojis from the tweet
    tweet = emoji_pattern.sub(r'', tweet)
    # remove URL pattern
    tweet = re.sub(r'^https?:\/\/.*[\r\n]*', '', tweet, flags=re.MULTILINE)
    # tweet.decode('ascii', 'ignore')
    # tokenize only after the cleanup above, so the filters actually affect the tokens
    word_tokens = word_tokenize(tweet)
    # filter out stop words using the NLTK stop word list
    filtered_tweet = [w for w in word_tokens if w not in stop_words]
    # filtered_tweet = []
    # # looping through conditions
    # for w in word_tokens:
    #     # check tokens against stop words, emoticons and punctuation
    #     if w not in stop_words and w not in emoticons and w not in string.punctuation:
    #         filtered_tweet.append(w)
    return filtered_tweet
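# A minimal usage sketch for clean_tweets (not from the original source). It
# assumes the NLTK 'stopwords' and 'punkt' data are downloaded and supplies a
# stand-in emoji_pattern, since clean_tweets expects one at module level.
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

emoji_pattern = re.compile(u'[\u2600-\u27BF]+', flags=re.UNICODE)  # assumption: a small BMP emoji/symbol range

sample = u"Awesome news today : check it out … \u2600"
print clean_tweets(sample)  # e.g. [u'Awesome', u'news', u'today', u'check']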
def _get_document_tokens(document):
    if isinstance(document, basestring):
        tokens = set((strip_punc(w, all=False)
                      for w in word_tokenize(document, include_punc=False)))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    return tokens
def words(self): """Return a list of word tokens. This excludes punctuation characters. If you want to include punctuation characters, access the ``tokens`` property. :returns: A :class:`WordList <WordList>` of word tokens. """ return WordList(word_tokenize(self.raw, include_punc=False))
def contains_extractor(document):
    '''A basic document feature extractor that returns a dict of words that
    the document contains.
    '''
    if isinstance(document, basestring):
        tokens = set([strip_punc(w, all=False)
                      for w in word_tokenize(document, include_punc=False)])
    else:
        tokens = set((strip_punc(w, all=False) for w in document))
    features = dict((u'contains({0})'.format(w), True) for w in tokens)
    return features
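# Illustrative call of contains_extractor (a sketch, not from the original
# source); strip_punc and word_tokenize are assumed to be the TextBlob helpers
# imported as below.
from textblob.tokenizers import word_tokenize
from textblob.utils import strip_punc

contains_extractor(u"I love this library")
# -> {u'contains(I)': True, u'contains(love)': True,
#     u'contains(this)': True, u'contains(library)': True}  (key order arbitrary)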
def basic_extractor(document, train_set):
    word_features = _get_words_from_dataset(train_set)
    tokens = _get_document_tokens(document)
    features = dict(((u'contains({0})'.format(word), (word in tokens))
                     for word in word_features))
    return features
def analyze(self, text):
    """Return the sentiment as a named tuple of the form:
    ``Sentiment(classification, p_pos, p_neg)``
    """
    # Lazily train the classifier
    super(NaiveBayesAnalyzer, self).analyze(text)
    tokens = word_tokenize(text, include_punc=False)
    filtered = (t.lower() for t in tokens if len(t) >= 3)
    feats = self.feature_extractor(filtered)
    prob_dist = self._classifier.prob_classify(feats)
    return self.RETURN_TYPE(classification=prob_dist.max(),
                            p_pos=prob_dist.prob('pos'),
                            p_neg=prob_dist.prob('neg'))
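# How this analyzer is normally reached through TextBlob's public API (a usage
# sketch; training data comes from NLTK's movie_reviews corpus, which must be
# downloaded, so the first call is slow).
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

blob = TextBlob("I love this library", analyzer=NaiveBayesAnalyzer())
print blob.sentiment  # e.g. Sentiment(classification='pos', p_pos=0.9..., p_neg=0.0...)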
def basic_extractor(document, train_set):
    '''A basic document feature extractor that returns a dict indicating
    what words in ``train_set`` are contained in ``document``.

    :param document: The text to extract features from. Can be a string or an iterable.
    :param train_set: Training data set, a list of tuples of the form
        ``(words, label)``.
    '''
    word_features = _get_words_from_dataset(train_set)
    if isinstance(document, basestring):
        tokens = set((strip_punc(w, all=False)
                      for w in word_tokenize(document, include_punc=False)))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    features = dict(((u'contains({0})'.format(word), (word in tokens))
                     for word in word_features))
    return features
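# Small sketch of how basic_extractor is driven: the training set supplies the
# vocabulary, the document supplies the membership flags. The train_set below
# is made up purely for illustration.
train_set = [("I love this sandwich", 'pos'),
             ("I do not like this restaurant", 'neg')]
basic_extractor("I love restaurants", train_set)
# -> {u'contains(love)': True, u'contains(sandwich)': False, ...}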
def classify(self, intweet):
    """
    intweet : the content of the tweet as a string
    returns : "pos", "neg", or "neutral"
    """
    tweet = preprocess_tweet(intweet)
    tokens = word_tokenize(tweet, include_punc=False)
    filtered = (t.lower() for t in tokens if len(t) >= 3)
    feats = feature_extractor(filtered)
    prob_dist = self.classifier.prob_classify(feats)
    print "For text: %s" % tweet.encode('utf-8')
    print prob_dist.prob('pos'),
    print prob_dist.prob('neg'),
    # treat a near-tie between the two classes as neutral
    if abs(prob_dist.prob('pos') - prob_dist.prob('neg')) < 0.25:
        print 'neutral'
        print ""
        return 'neutral'
    print prob_dist.max()
    print ""
    return prob_dist.max()
def tokenize(words):
    if isinstance(words, basestring):
        return word_tokenize(words, include_punc=False)
    else:
        return words
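# Both branches normalize input for downstream feature extraction: a raw string
# is tokenized, an already-tokenized iterable passes through. Sketch (using
# TextBlob's word_tokenize, as in the snippets above):
tokenize("A short sentence.")        # -> iterator over ['A', 'short', 'sentence']
tokenize(['already', 'tokenized'])   # -> the list itself, unchanged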
def __tokenize_text(self, text):
    self.__tokenized_text = list(tt.word_tokenize(text))
def test_word_tokenize(self):
    tokens = word_tokenize(self.text)
    assert_true(is_generator(tokens))
    assert_equal(list(tokens), self.tokenizer.tokenize(self.text))
def tokenize(words):
    if isinstance(words, basestring):
        return word_tokenize(words, include_punc=False)
    else:
        return (w for w in words)
def __tokenizeText(self, text):
    # tt.word_tokenize(text) returns a generator, which is exhausted after a
    # single pass, so materialize it into a list here.
    self.__tokenizedText = list(tt.word_tokenize(text))
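# Quick illustration of why the list() call above matters (standard library only):
gen = (c for c in "abc")
list(gen)  # ['a', 'b', 'c']
list(gen)  # [] -- the generator was already exhausted by the first pass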
def words(self):
    '''Return a list of word tokens. This excludes punctuation characters.
    If you want to include punctuation characters, access the ``tokens``
    property.
    '''
    return WordList(word_tokenize(self.raw, include_punc=False))
from queneau import WordAssembler
import nltk
from nltk.tokenize import WordPunctTokenizer
from textblob.tokenizers import word_tokenize

poem = 'data/poems/book01.txt'
with open(poem, 'rb') as f:
    raw = f.read()

### tokenizer = WordPunctTokenizer()
### tokens = tokenizer.tokenize(raw)
### tokens = nltk.word_tokenize(raw)
### text = nltk.Text(tokens)
tokens = list(word_tokenize(raw))
words = [w.lower() for w in tokens]
vocab = sorted(set(words))
vocab = vocab[15:]  # cut out the punctuation
corpus = WordAssembler(vocab)
for i in range(1000):
    print corpus.assemble_word(min_length=5)
def tag(self, text, tokenize=True):
    '''Tag a string `text`.'''
    if tokenize:
        text = list(word_tokenize(text))
    tagged = nltk.tag.pos_tag(text)
    return tagged
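# Sketch of the underlying call this method wraps (plain NLTK, no class needed;
# requires an NLTK POS-tagger model to be downloaded).
import nltk
from textblob.tokenizers import word_tokenize

tokens = list(word_tokenize("TextBlob builds on NLTK"))
nltk.tag.pos_tag(tokens)
# -> e.g. [('TextBlob', 'NNP'), ('builds', 'VBZ'), ('on', 'IN'), ('NLTK', 'NNP')]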