def store_word_counts(self, wordcount_samples=300000):
    """
    Stores word:count histograms for samples in Redis with the ability to increment.

    Nothing is done when both the 'positive_wordcounts' and the
    'negative_wordcounts' keys are already present in Redis.

    Keyword Arguments:
    wordcount_samples (int) -- Amount of samples requested from get_samples.

    Raises:
    AssertionError -- if get_samples returns nothing.
    """
    existing_keys = self.r.keys()

    # Both histograms have to exist before the work can be skipped. The former
    # test `'positive_wordcounts' and 'negative_wordcounts' in keys` only looked
    # at the negative key, since a non-empty string literal is always truthy.
    if 'positive_wordcounts' in existing_keys and 'negative_wordcounts' in existing_keys:
        return

    from synt.utils.db import get_samples

    samples = get_samples(wordcount_samples)
    assert samples, "Samples must be provided."

    for text, label in samples:
        # One sorted set per label, e.g. '<label>_wordcounts'.
        label = label + '_wordcounts'
        tokens = sanitize_text(text)

        if not tokens:
            continue

        for word in tokens:
            # zscore is falsy when the word has not been counted yet.
            prev_score = self.r.zscore(label, word)
            self.r.zadd(label, word, 1 if not prev_score else prev_score + 1)
def test_accuracy(db_name='', test_samples=0, neutral_range=0, offset=0, redis_db=5):
    """
    Measures the stored classifier against labelled samples.

    Returns a tuple (nltk_accuracy, manual_accuracy, classifier):
    nltk_accuracy is what nltk.classify.util.accuracy reports, as a percentage.
    manual_accuracy is the percentage of Guesser results whose sign agrees with
    the known sample label.
    Returns None (after printing a hint) when no trained classifier is found in Redis.

    Keyword Arguments:
    db_name (str) -- Samples database to use. When empty the value stored
        under 'trained_db' in Redis is used.

    test_samples (int) -- Amount of samples to use. When not positive, 25% of
        the 'trained_to' value stored in Redis is used.

    neutral_range (float) -- Guesses whose absolute value is below this are
        skipped for the manual accuracy. With 0 nothing is skipped and every
        guess counts as either positive or negative.

    offset (int) -- Offset passed to get_samples. When falsy the 'trained_to'
        value stored in Redis is used so testing starts after the trained samples.

    redis_db (int) -- The redis database to use.

    Raises:
    AssertionError -- if no guess survived the neutral_range filter.
    """
    manager = RedisManager(db=redis_db)

    classifier_key = manager.r.get('trained_classifier')
    if not classifier_key:
        print("Accuracy needs a classifier, have you trained?")
        return

    classifier = manager.pickle_load(classifier_key)

    # Defaults are derived from how far training went so that, unless the
    # caller overrides them, the samples used here come after the trained ones.
    trained_to = int(manager.r.get('trained_to'))
    offset = offset or trained_to
    sample_count = int(trained_to * .25) if test_samples <= 0 else test_samples
    db_name = db_name or manager.r.get('trained_db')

    held_out = get_samples(db_name, sample_count, offset=offset, redis_db=redis_db)

    trained_ext = manager.r.get('trained_extractor')
    feat_ex = get_extractor(trained_ext)()

    # Normalize and extract; samples that yield no features are left out.
    extracted = ((feat_ex.extract(normalize_text(text)), label) for text, label in held_out)
    testfeats = [(bag_of_words, label) for bag_of_words, label in extracted if bag_of_words]

    nltk_accuracy = nltk.classify.util.accuracy(classifier, gold=testfeats) * 100  # percentify

    guesser = Guesser(extractor_type=trained_ext)

    # Compare the sign of every guess with the known label.
    guessed_count = 0
    correct_count = 0
    for text, label in held_out:
        score = guesser.guess(text)

        if abs(score) < neutral_range:
            continue  # too weak to count as a guess

        if (score > 0) == label.startswith('pos'):
            correct_count += 1
        guessed_count += 1

    assert guessed_count, "There were no guesses, make sure you've trained on the same database you're testing."

    manual_accuracy = correct_count * 100.0 / guessed_count

    #TODO: precision and recall

    return (nltk_accuracy, manual_accuracy, classifier)
def test_accuracy(db_name='', test_samples=0, neutral_range=0, offset=0, redis_db=5):
    """
    Measures the stored classifier against labelled samples.

    Returns a tuple (nltk_accuracy, manual_accuracy, classifier):
    nltk_accuracy is what nltk.classify.util.accuracy reports, as a percentage.
    manual_accuracy is the percentage of Guesser results whose sign agrees with
    the known sample label.
    Returns None (after printing a hint) when no trained classifier is found in Redis.

    Keyword Arguments:
    db_name (str) -- Samples database to use. When empty the value stored
        under 'trained_db' in Redis is used.

    test_samples (int) -- Amount of samples to use. When not positive, 25% of
        the 'trained_to' value stored in Redis is used.

    neutral_range (float) -- Guesses whose absolute value is below this are
        skipped for the manual accuracy. With 0 nothing is skipped and every
        guess counts as either positive or negative.

    offset (int) -- Offset passed to get_samples. When falsy the 'trained_to'
        value stored in Redis is used so testing starts after the trained samples.

    redis_db (int) -- The redis database to use.

    Raises:
    AssertionError -- if no guess survived the neutral_range filter.
    """
    manager = RedisManager(db=redis_db)

    classifier_key = manager.r.get('trained_classifier')
    if not classifier_key:
        print("Accuracy needs a classifier, have you trained?")
        return

    classifier = manager.pickle_load(classifier_key)

    # Defaults are derived from how far training went so that, unless the
    # caller overrides them, the samples used here come after the trained ones.
    trained_to = int(manager.r.get('trained_to'))
    offset = offset or trained_to
    sample_count = int(trained_to * .25) if test_samples <= 0 else test_samples
    db_name = db_name or manager.r.get('trained_db')

    held_out = get_samples(db_name, sample_count, offset=offset, redis_db=redis_db)

    trained_ext = manager.r.get('trained_extractor')
    feat_ex = get_extractor(trained_ext)()

    # Normalize and extract; samples that yield no features are left out.
    extracted = ((feat_ex.extract(normalize_text(text)), label) for text, label in held_out)
    testfeats = [(bag_of_words, label) for bag_of_words, label in extracted if bag_of_words]

    nltk_accuracy = nltk.classify.util.accuracy(classifier, gold=testfeats) * 100  # percentify

    guesser = Guesser(extractor_type=trained_ext)

    # Compare the sign of every guess with the known label.
    guessed_count = 0
    correct_count = 0
    for text, label in held_out:
        score = guesser.guess(text)

        if abs(score) < neutral_range:
            continue  # too weak to count as a guess

        if (score > 0) == label.startswith('pos'):
            correct_count += 1
        guessed_count += 1

    assert guessed_count, "There were no guesses, make sure you've trained on the same database you're testing."

    manual_accuracy = correct_count * 100.0 / guessed_count

    #TODO: precision and recall

    return (nltk_accuracy, manual_accuracy, classifier)
def test(test_samples=200000, feat_ex=best_word_feats):
    """
    Prints a per-sample report followed by two accuracy figures for the
    classifier loaded from Redis: a 'manual accuracy score' obtained by
    comparing guess() against the known sentiments, and the accuracy
    reported by NLTK.

    Prints a hint and returns early when no classifier can be loaded.

    Keyword Arguments:
    test_samples -- the amount of samples to test against
    feat_ex -- the feature extractor to use (utils/extractors)
    """
    classifier = RedisManager().load_classifier()
    if not classifier:
        print("There is not classifier in Redis yet, have you trained?")
        return

    print("Preparing %s Testing Samples" % test_samples)
    samples = get_samples(test_samples)

    # Feature dicts for NLTK's own accuracy; samples without tokens are skipped.
    nltk_testing_dicts = []
    for row in samples:
        text = row[0]
        sentiment = row[1]
        tokens = sanitize_text(text)
        if not tokens:
            continue
        nltk_testing_dicts.append((feat_ex(tokens), sentiment))

    nltk_accuracy = nltk.classify.util.accuracy(classifier, nltk_testing_dicts) * 100  # percentify

    # A guess is accurate only when its sign matches the known sentiment;
    # a guess of exactly 0 never counts as accurate.
    results = []
    for row in samples:
        text = row[0]
        sentiment = row[1]
        guessed = guess(text)
        hit = bool(
            (sentiment.startswith('pos') and guessed > 0)
            or (sentiment.startswith('neg') and guessed < 0)
        )
        results.append((hit, sentiment, guessed, text))

    accurate_samples = 0
    for hit, sentiment, guessed, text in results:
        print("Text: %s" % text)
        print("Accuracy: %s | Known Sentiment: %s | Guessed Sentiment: %s " % (hit, sentiment, guessed))
        print("------------------------------------------------------------------------------------------------------------------------------------------")
        if hit:
            accurate_samples += 1

    total_accuracy = accurate_samples * 100.00 / len(samples)

    classifier.show_most_informative_features(30)
    print("\n\rManual classifier accuracy result: %s%%" % total_accuracy)
    print("\n\rNLTK classifier accuracy result: %.2f%%" % nltk_accuracy)
def train(db_name, samples=200000, classifier_type='naivebayes', extractor_type='words',
          best_features=10000, processes=8, purge=False):
    """
    Train with samples from sqlite database and stores the resulting classifier in Redis.

    Besides the pickled classifier (stored under the classifier_type key) the
    keys 'trained_to', 'trained_db', 'trained_classifier' and
    'trained_extractor' are written so a later accuracy test knows what was
    trained. Prints a message and returns early when a key named
    classifier_type already exists in Redis.

    Arguments:
    db_name (str) -- Name of the training database to use stored in ~/.synt

    Keyword arguments:
    samples (int) -- Amount of samples to train on.
    classifier_type (str) -- Type of classifier to use. Available classifiers are 'naivebayes'.
    extractor_type (str) -- Type of extractor to use. Available extractors are 'words', 'stopwords', 'bestwords'.
    best_features (int) -- Amount of highly informative features to store.
    processes (int) -- The amount of processes to be used for counting features in parallel.
    purge (bool) -- If true will flush the redis database.

    Raises:
    ValueError -- if the database does not exist or the classifier type is not supported.
    """
    m = RedisManager(purge=purge)

    extractor = get_extractor(extractor_type)

    if not db_exists(db_name):
        raise ValueError("Database '%s' does not exist." % db_name)

    if classifier_type in m.r.keys():
        print("Classifier exists in Redis. Purge to re-train.")
        return

    classifier = config.CLASSIFIERS.get(classifier_type)
    if not classifier:  # classifier not supported
        raise ValueError("Classifier '%s' not supported." % classifier_type)

    # Retrieve training samples from database.
    train_samples = get_samples(db_name, samples)

    m.store_feature_counts(train_samples, processes=processes)
    m.store_feature_scores()

    if best_features and best_features > 1:
        m.store_best_features(best_features)

    label_freqdist = FreqDist()
    feature_freqdist = defaultdict(FreqDist)

    # Retrieve the actual samples processed for each label.
    neg_processed, pos_processed = m.r.get('negative_processed'), m.r.get('positive_processed')
    label_freqdist.inc('negative', int(neg_processed))
    label_freqdist.inc('positive', int(pos_processed))

    labeled_feature_freqs = m.pickle_load('labeled_feature_freqs')
    labels = labeled_feature_freqs.keys()

    # Feature extraction.
    # NOTE(review): only the features extracted for the first label are kept
    # ([0]); confirm that is intended rather than a union over all labels.
    feat_ex = extractor()
    extracted_set = set([feat_ex.extract(labeled_feature_freqs[label].keys(), as_list=True) for label in labels][0])

    # Increment the amount of times a given feature occurred for a label and
    # fill in the missing occurrences with Falses.
    for label in labels:
        # Deliberately NOT named `samples`: rebinding the parameter here made
        # 'trained_to' below record the last label's processed count instead
        # of the amount of samples trained on.
        label_total = label_freqdist[label]
        for fname in extracted_set:
            trues = labeled_feature_freqs[label].get(fname, 0)
            falses = label_total - trues
            feature_freqdist[label, fname].inc(True, trues)
            feature_freqdist[label, fname].inc(False, falses)

    # Create the P(label) distribution.
    estimator = ELEProbDist
    label_probdist = estimator(label_freqdist)

    # Create the P(fval|label, fname) distribution.
    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = estimator(freqdist, bins=2)
        feature_probdist[label, fname] = probdist

    #TODO: naivebayes supports this prototype, future classifiers will most likely not
    trained_classifier = classifier(label_probdist, feature_probdist)

    m.pickle_store(classifier_type, trained_classifier)
    m.r.set('trained_to', samples)
    m.r.set('trained_db', db_name)
    m.r.set('trained_classifier', classifier_type)
    m.r.set('trained_extractor', extractor_type)
def train(db_name, samples=200000, classifier_type='naivebayes', extractor_type='words',
          best_features=10000, processes=8, purge=False):
    """
    Train with samples from sqlite database and stores the resulting classifier in Redis.

    Besides the pickled classifier (stored under the classifier_type key) the
    keys 'trained_to', 'trained_db', 'trained_classifier' and
    'trained_extractor' are written so a later accuracy test knows what was
    trained. Prints a message and returns early when a key named
    classifier_type already exists in Redis.

    Arguments:
    db_name (str) -- Name of the training database to use stored in ~/.synt

    Keyword arguments:
    samples (int) -- Amount of samples to train on.
    classifier_type (str) -- Type of classifier to use. Available classifiers are 'naivebayes'.
    extractor_type (str) -- Type of extractor to use. Available extractors are 'words', 'stopwords', 'bestwords'.
    best_features (int) -- Amount of highly informative features to store.
    processes (int) -- The amount of processes to be used for counting features in parallel.
    purge (bool) -- If true will flush the redis database.

    Raises:
    ValueError -- if the database does not exist or the classifier type is not supported.
    """
    m = RedisManager(purge=purge)

    extractor = get_extractor(extractor_type)

    if not db_exists(db_name):
        raise ValueError("Database '%s' does not exist." % db_name)

    if classifier_type in m.r.keys():
        print("Classifier exists in Redis. Purge to re-train.")
        return

    classifier = config.CLASSIFIERS.get(classifier_type)
    if not classifier:  # classifier not supported
        raise ValueError("Classifier '%s' not supported." % classifier_type)

    # Retrieve training samples from database.
    train_samples = get_samples(db_name, samples)

    m.store_feature_counts(train_samples, processes=processes)
    m.store_feature_scores()

    if best_features and best_features > 1:
        m.store_best_features(best_features)

    label_freqdist = FreqDist()
    feature_freqdist = defaultdict(FreqDist)

    # Retrieve the actual samples processed for each label.
    neg_processed, pos_processed = m.r.get('negative_processed'), m.r.get('positive_processed')
    label_freqdist.inc('negative', int(neg_processed))
    label_freqdist.inc('positive', int(pos_processed))

    labeled_feature_freqs = m.pickle_load('labeled_feature_freqs')
    labels = labeled_feature_freqs.keys()

    # Feature extraction.
    # NOTE(review): only the features extracted for the first label are kept
    # ([0]); confirm that is intended rather than a union over all labels.
    feat_ex = extractor()
    extracted_set = set([feat_ex.extract(labeled_feature_freqs[label].keys(), as_list=True) for label in labels][0])

    # Increment the amount of times a given feature occurred for a label and
    # fill in the missing occurrences with Falses.
    for label in labels:
        # Deliberately NOT named `samples`: rebinding the parameter here made
        # 'trained_to' below record the last label's processed count instead
        # of the amount of samples trained on.
        label_total = label_freqdist[label]
        for fname in extracted_set:
            trues = labeled_feature_freqs[label].get(fname, 0)
            falses = label_total - trues
            feature_freqdist[label, fname].inc(True, trues)
            feature_freqdist[label, fname].inc(False, falses)

    # Create the P(label) distribution.
    estimator = ELEProbDist
    label_probdist = estimator(label_freqdist)

    # Create the P(fval|label, fname) distribution.
    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = estimator(freqdist, bins=2)
        feature_probdist[label, fname] = probdist

    #TODO: naivebayes supports this prototype, future classifiers will most likely not
    trained_classifier = classifier(label_probdist, feature_probdist)

    m.pickle_store(classifier_type, trained_classifier)
    m.r.set('trained_to', samples)
    m.r.set('trained_db', db_name)
    m.r.set('trained_classifier', classifier_type)
    m.r.set('trained_extractor', extractor_type)
def train(
    feat_ex=best_word_feats,
    train_samples=400000,
    wordcount_samples=300000,
    wordcount_range=150000,
    force_update=False,
    verbose=True,
):
    """
    Trains a Naive Bayes classifier with samples from database and stores the
    resulting classifier in Redis.

    Logs and returns early when a "classifier" key already exists in Redis,
    or when no features at all could be built from the samples.

    Keyword arguments:
    feat_ex             -- the feature extractor to use, found in utils/extractors.py

    train_samples       -- the amount of samples to train; the first half of the
                           returned samples is treated as positive, the second
                           half as negative

    wordcount_samples   -- the amount of samples to build wordcounts, this produces
                           a word:count histogram in Redis

    wordcount_range     -- the amount of 'up-to' words to use for the FreqDist will
                           pick out the most 'popular' words up to this amount.
                           i.e top 150000 tokens

    force_update        -- passed to RedisManager; if True will drop the Redis DB
                           and assume a new train

    verbose             -- if True will output to console
    """
    logger = create_logger(__file__)
    if not verbose:
        # NOTE(review): level 0 is logging.NOTSET, which defers to the parent
        # logger rather than silencing output - confirm against create_logger.
        logger.setLevel(0)

    man = RedisManager(force_update=force_update)

    if "classifier" in man.r.keys():
        logger.info("Trained classifier exists in Redis.")
        return

    logger.info("Storing %d word counts." % wordcount_samples)
    man.store_word_counts(wordcount_samples)
    logger.info("Build frequency distributions with %d words." % wordcount_range)
    man.build_freqdists(wordcount_range)
    logger.info("Storing word scores.")
    man.store_word_scores()
    logger.info("Storing best words.")
    man.store_best_words()

    samples = get_samples(train_samples)

    # Floor division: the result is used as a slice index, and `/` yields a
    # float (TypeError when slicing) on Python 3. Same value as before on Python 2.
    half = len(samples) // 2

    pos_samples = samples[:half]
    neg_samples = samples[half:]

    logger.info("Build negfeats and posfeats")
    negfeats, posfeats = [], []

    for text, sent in neg_samples:
        s_text = sanitize_text(text)
        tokens = feat_ex(s_text)

        if tokens:
            negfeats.append((tokens, sent))

    for text, sent in pos_samples:
        s_text = sanitize_text(text)
        tokens = feat_ex(s_text)

        if tokens:
            posfeats.append((tokens, sent))

    # NOTE(review): this only bails out when BOTH lists are empty; training
    # still proceeds with a single label present - confirm that is intended.
    if not (negfeats or posfeats):
        logger.error("Could not build positive and negative features.")
        return

    # 3/4 of each label trains, the remaining 1/4 is held back for testing.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    logger.info("Train on %d instances, test on %d instances" % (len(trainfeats), len(testfeats)))

    classifier = NaiveBayesClassifier.train(trainfeats)
    logger.info("Done training")

    man.store_classifier(classifier)
    logger.info("Stored to Redis")

    # TODO: report per-label precision, recall and F-measure on testfeats
    # (nltk.metrics.precision / recall / f_measure over reference sets built
    # from the known labels and test sets built from classifier.classify).

    logger.info("Classifier Accuracy: %s" % util.accuracy(classifier, testfeats))