def guess(text, classifier=DEFAULT_CLASSIFIER, feat_ex=best_word_feats):
    """
    Takes a blob of text and returns its sentiment score.

    The score is prob('positive') - prob('negative') as reported by the
    classifier for the features extracted from the text. It is expected to
    lie within -1.0 .. 1.0 (assuming prob() returns real probabilities) and
    is returned exactly as computed, without clamping. When the feature
    extractor yields nothing for the text the score is 0.0.

    Keyword Arguments:
    classifier -- the classifier to use; must be truthy and provide
                  prob_classify() (Note: for now we only have a naivebayes
                  classifier)
    feat_ex    -- the feature extractor to use i.e bigram_word_feats,
                  stopword_feats, found in extractors

    Raises AssertionError when no classifier is supplied.
    """
    assert classifier, "Needs a classifier."

    bag_of_words = feat_ex(sanitize_text(text))
    if not bag_of_words:
        # Nothing to classify, so report a neutral score.
        return 0.0

    prob = classifier.prob_classify(bag_of_words)

    # A range check used to follow this computation, but its body was a bare
    # `pass`, so out-of-range values were never replaced by 0.0 despite what
    # the old comment said. The no-op was dropped; the value is unchanged.
    return prob.prob('positive') - prob.prob('negative')
def store_word_counts(self, wordcount_samples=300000):
    """
    Stores word:count histograms for samples in Redis with the ability to increment.

    One sorted set is written per sample label, named '<label>_wordcounts',
    where each member is a token and its score is the number of times that
    token has been seen.

    Nothing is done when both 'positive_wordcounts' and
    'negative_wordcounts' already exist in Redis.

    Keyword Arguments:
    wordcount_samples -- the amount of samples to build the histograms from

    Raises AssertionError when get_samples() returns nothing.
    """
    # Both histograms must be present to skip the rebuild. The previous test,
    # `'positive_wordcounts' and 'negative_wordcounts' in keys`, only ever
    # looked at the negative key because a non-empty string literal is truthy.
    existing_keys = self.r.keys()
    if ('positive_wordcounts' in existing_keys
            and 'negative_wordcounts' in existing_keys):
        return

    # Imported here rather than at module level, as in the original.
    from synt.utils.db import get_samples

    samples = get_samples(wordcount_samples)
    assert samples, "Samples must be provided."

    for text, label in samples:
        key = label + '_wordcounts'
        tokens = sanitize_text(text)
        if not tokens:
            continue

        for word in tokens:
            # zscore gives a falsy result for a word not stored yet; start
            # such words at 1, otherwise bump the stored count.
            prev_score = self.r.zscore(key, word)
            self.r.zadd(key, word, 1 if not prev_score else prev_score + 1)
def test(test_samples=200000, feat_ex=best_word_feats):
    """
    This first computes the NLTK accuracy of the stored classifier, then
    tests every sample against its known sentiment with guess() and
    produces a 'manual accuracy score'. Per-sample results, the most
    informative features and both accuracy figures are printed.

    A sample counts as accurate when its known sentiment starts with 'pos'
    and the guessed score is above zero, or starts with 'neg' and the
    guessed score is below zero. A score of exactly 0 is never accurate.

    Returns None. Prints a message and returns early when no classifier is
    stored in Redis or when no samples could be fetched.

    Keyword Arguments:
    test_samples -- the amount of samples to test against
    feat_ex      -- the feature extractor to use (utils/extractors)
    """
    classifier = RedisManager().load_classifier()
    if not classifier:
        print("There is not classifier in Redis yet, have you trained?")
        return

    print("Preparing %s Testing Samples" % test_samples)
    samples = get_samples(test_samples)
    if not samples:
        # Without samples the manual accuracy below would divide by zero.
        print("No testing samples available.")
        return

    # (features, sentiment) pairs in the shape nltk's accuracy() expects.
    nltk_testing_dicts = []
    for sample in samples:
        text, sentiment = sample[0], sample[1]  # (text, sentiment)
        tokens = sanitize_text(text)
        if tokens:
            nltk_testing_dicts.append((feat_ex(tokens), sentiment))

    nltk_accuracy = nltk.classify.util.accuracy(classifier, nltk_testing_dicts) * 100  # percentify

    results = []
    for sample in samples:
        text, sentiment = sample[0], sample[1]  # (text, sentiment)

        # Score with the classifier loaded above and the caller's extractor,
        # so both accuracy figures measure the same configuration. guess()
        # was previously called with its defaults, ignoring feat_ex.
        guessed = guess(text, classifier=classifier, feat_ex=feat_ex)

        accurate = ((sentiment.startswith('pos') and guessed > 0)
                    or (sentiment.startswith('neg') and guessed < 0))
        results.append((accurate, sentiment, guessed, text))

    accurate_samples = 0
    for accurate, sentiment, guessed, text in results:
        print("Text: %s" % (text))
        print("Accuracy: %s | Known Sentiment: %s | Guessed Sentiment: %s " % (accurate, sentiment, guessed))
        print("------------------------------------------------------------------------------------------------------------------------------------------")
        if accurate:
            accurate_samples += 1

    # Samples without tokens still count in the denominator, as before.
    total_accuracy = (accurate_samples * 100.00 / len(samples))

    classifier.show_most_informative_features(30)
    print("\n\rManual classifier accuracy result: %s%%" % total_accuracy)
    print("\n\rNLTK classifier accuracy result: %.2f%%" % nltk_accuracy)
def train(
    feat_ex=best_word_feats,
    train_samples=400000,
    wordcount_samples=300000,
    wordcount_range=150000,
    force_update=False,
    verbose=True,
):
    """
    Trains a Naive Bayes classifier with samples from database and stores
    the resulting classifier in Redis.

    Returns None. Returns early, without training, when a 'classifier' key
    already exists in Redis or when features could not be built for both
    halves of the samples.

    Keyword arguments:
    feat_ex           -- the feature extractor to use, found in utils/extractors.py

    train_samples     -- the amount of samples to train half this number will be negative the other positive

    wordcount_samples -- the amount of samples to build wordcounts, this produces a word:count histogram in Redis

    wordcount_range   -- the amount of 'up-to' words to use for the FreqDist will pick out the most
                         'popular' words up to this amount. i.e top 150000 tokens

    force_update      -- if True will drop the Redis DB and assume a new train

    verbose           -- if True will output to console
    """
    logger = create_logger(__file__)
    if not verbose:
        # Intended as "no output".
        # NOTE(review): if create_logger returns a stdlib logging.Logger,
        # level 0 is NOTSET, which defers to the parent logger's level rather
        # than silencing this one. Left unchanged; confirm against
        # create_logger.
        logger.setLevel(0)

    man = RedisManager(force_update=force_update)

    if "classifier" in man.r.keys():
        logger.info("Trained classifier exists in Redis.")
        return

    logger.info("Storing %d word counts." % wordcount_samples)
    man.store_word_counts(wordcount_samples)

    logger.info("Build frequency distributions with %d words." % wordcount_range)
    man.build_freqdists(wordcount_range)

    logger.info("Storing word scores.")
    man.store_word_scores()

    logger.info("Storing best words.")
    man.store_best_words()

    samples = get_samples(train_samples)

    # Floor division keeps the slice index an int on Python 2 and Python 3.
    half = len(samples) // 2
    pos_samples = samples[:half]
    neg_samples = samples[half:]

    def _build_feats(labelled_samples):
        # Turn (text, sentiment) pairs into (features, sentiment) pairs,
        # skipping texts for which the extractor produces nothing.
        feats = []
        for text, sent in labelled_samples:
            tokens = feat_ex(sanitize_text(text))
            if tokens:
                feats.append((tokens, sent))
        return feats

    logger.info("Build negfeats and posfeats")
    negfeats = _build_feats(neg_samples)
    posfeats = _build_feats(pos_samples)

    # Both halves are required. The previous check only bailed out when both
    # lists were empty, which let a single-sided training set through.
    if not negfeats or not posfeats:
        logger.error("Could not build positive and negative features.")
        return

    negcutoff = len(negfeats) * 3 // 4  # 3/4 training set
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    logger.info("Train on %d instances, test on %d instances" % (len(trainfeats), len(testfeats)))

    classifier = NaiveBayesClassifier.train(trainfeats)
    logger.info("Done training")

    man.store_classifier(classifier)
    logger.info("Stored to Redis")

    # A per-label precision / recall / F-measure report over testfeats
    # (nltk.metrics) used to be sketched here as commented-out code. It was
    # never executed, so only the overall accuracy is reported.
    logger.info("Classifier Accuracy: %s" % util.accuracy(classifier, testfeats))