# -*- coding: utf-8 -*-
import time
import os
import bz2
import urllib2
from cStringIO import StringIO

from synt.utils.collect import twitter_feed
from synt.utils.db import db_init
from synt.logger import create_logger
from synt import settings

logger = create_logger(__file__)


def collect():
    """
    Continuously populates the sample database, creating a new one
    first if it does not exist.
    """

    neg_lastid, pos_lastid = None, None

    if not os.path.exists(settings.DB_FILE):
        pos_lastid = twitter_feed('positive', pos_lastid, new=True)
        neg_lastid = twitter_feed('negative', neg_lastid, new=True)

    while True:
        time.sleep(1)
        try:
            pos_lastid = twitter_feed('positive', pos_lastid)
            neg_lastid = twitter_feed('negative', neg_lastid)
        except Exception as e:
            # Assumed error handling: log the failure and keep polling
            # the feed so collection stays continuous.
            logger.error(e)
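# Minimal usage sketch (illustrative, not part of the original module):
# running this file as a script starts collection and loops until interrupted.
if __name__ == '__main__':
    try:
        collect()
    except KeyboardInterrupt:
        logger.info("Collection stopped.")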
import logging

from nltk.classify import NaiveBayesClassifier, util

# NOTE: the synt module paths below are assumptions based on the package
# layout visible elsewhere in the project (utils/extractors.py is referenced
# in the docstring); adjust them to wherever these helpers actually live.
from synt.utils.extractors import best_word_feats
from synt.utils.redis_manager import RedisManager
from synt.utils.db import get_samples
from synt.utils.text import sanitize_text
from synt.logger import create_logger


def train(
    feat_ex=best_word_feats,
    train_samples=400000,
    wordcount_samples=300000,
    wordcount_range=150000,
    force_update=False,
    verbose=True,
):
    """
    Trains a Naive Bayes classifier with samples from the database and
    stores the resulting classifier in Redis.

    Args:
        feat_ex -- the feature extractor to use, found in utils/extractors.py

    Keyword arguments:
        train_samples     -- the number of samples to train on; half of this
                             number will be negative, the other half positive
        wordcount_samples -- the number of samples used to build word counts;
                             this produces a word:count histogram in Redis
        wordcount_range   -- the number of top words to keep for the FreqDist;
                             picks out the most frequent tokens up to this
                             amount, e.g. the top 150000
        force_update      -- if True, drops the Redis DB and assumes a fresh
                             train
        verbose           -- if True, outputs progress to the console
    """

    logger = create_logger(__file__)
    if not verbose:
        # Suppress output. setLevel(0) (NOTSET), as in the original, does not
        # reliably silence a logger; raising the threshold does.
        logger.setLevel(logging.CRITICAL)

    man = RedisManager(force_update=force_update)

    if 'classifier' in man.r.keys():
        logger.info("Trained classifier exists in Redis.")
        return

    logger.info("Storing %d word counts." % wordcount_samples)
    man.store_word_counts(wordcount_samples)

    logger.info("Building frequency distributions with %d words." % wordcount_range)
    man.build_freqdists(wordcount_range)

    logger.info("Storing word scores.")
    man.store_word_scores()

    logger.info("Storing best words.")
    man.store_best_words()

    samples = get_samples(train_samples)

    half = len(samples) / 2
    pos_samples = samples[:half]
    neg_samples = samples[half:]

    logger.info("Building negative and positive feature sets.")

    negfeats, posfeats = [], []

    for text, sent in neg_samples:
        s_text = sanitize_text(text)
        tokens = feat_ex(s_text)
        if tokens:
            negfeats.append((tokens, sent))

    for text, sent in pos_samples:
        s_text = sanitize_text(text)
        tokens = feat_ex(s_text)
        if tokens:
            posfeats.append((tokens, sent))

    # Both classes are required; a classifier trained on a single class is
    # useless, so fail if either feature set is empty.
    if not (negfeats and posfeats):
        logger.error("Could not build positive and negative features.")
        return

    negcutoff = len(negfeats) * 3 / 4  # hold out 1/4 of each class for testing
    poscutoff = len(posfeats) * 3 / 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    logger.info("Train on %d instances, test on %d instances."
                % (len(trainfeats), len(testfeats)))

    classifier = NaiveBayesClassifier.train(trainfeats)
    logger.info("Done training.")

    man.store_classifier(classifier)
    logger.info("Stored classifier to Redis.")

    # Optional per-class metrics (uncomment to use; requires
    # `import collections` and `import nltk`):
    # refsets = collections.defaultdict(set)
    # testsets = collections.defaultdict(set)
    # for i, (feats, label) in enumerate(testfeats):
    #     if feats:
    #         refsets[label].add(i)
    #         observed = classifier.classify(feats)
    #         testsets[observed].add(i)
    #
    # print '#### POSITIVE ####'
    # print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
    # print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
    # print 'pos F-measure:', nltk.metrics.f_measure(refsets['pos'], testsets['pos'])
    # print
    # print '#### NEGATIVE ####'
    # print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
    # print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
    # print 'neg F-measure:', nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
    # print '--------------------'

    logger.info("Classifier Accuracy: %s" % util.accuracy(classifier, testfeats))
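# Illustrative invocation (an assumption, not the project's documented entry
# point): retrain with the default feature extractor. With force_update=True
# the stored word counts and classifier are dropped and rebuilt from scratch.
if __name__ == '__main__':
    train(
        train_samples=400000,
        wordcount_samples=300000,
        force_update=True,
        verbose=True,
    )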