def main():
    """Train a NaiveBayes classifier on tweets and print its test accuracy.

    Steps, each timed and reported on stdout:

    1. Load tweets via ``extract()``.
    2. Shuffle them with a fixed seed and split off the first
       ``TEST_SET_PROPORTION`` share as the test set; the remainder is the
       training set.
    3. Train on the tokenized text of each training tweet, labelled with
       ``tweet.get_majority_vote()``.
    4. Classify every test tweet, build an ``nltk.ConfusionMatrix`` of
       reference vs. predicted labels, and print the matrix together with
       the number and percentage of correct predictions.
    """
    # Fixed seed so the shuffle below yields the same split on every run.
    random.seed(7)

    # AAA: Use a random shuffle to select test/training sets
    print('Extracting twitter data from the database...')
    tm1 = time.time()
    tweets = extract()
    tm2 = time.time()
    print(' time=%0.3fs' % (tm2 - tm1))

    test_set_size = int(TEST_SET_PROPORTION * len(tweets))
    print('Training on %d tweets' % (len(tweets) - test_set_size))
    tm1 = time.time()
    random.shuffle(tweets)
    test_set = tweets[:test_set_size]
    training_set = tweets[test_set_size:]
    nb = NaiveBayes()
    for tweet in training_set:
        toks = ttokenize.tokenize(tweet.text)
        nb.train(toks, tweet.get_majority_vote())
    tm2 = time.time()
    print(' time=%0.3fs' % (tm2 - tm1))

    print('Testing accuracy on %d tweets' % test_set_size)
    tm1 = time.time()
    predictions = []
    references = []
    for tweet in test_set:
        references.append(tweet.get_majority_vote())
        toks = ttokenize.tokenize(tweet.text)
        predictions.append(nb.classify(toks))
    mat = nltk.ConfusionMatrix(references, predictions)
    tm2 = time.time()

    # Single-argument call form prints identically under the Python 2 print
    # statement and the Python 3 print function.
    # NOTE(review): newer NLTK releases appear to expose this method as
    # pretty_format() rather than pp() -- confirm against the installed version.
    print(mat.pp(show_percents=True))

    # Scale the fraction to a real percentage (the value is followed by '%'),
    # and avoid dividing by zero when the test set is empty.
    total = mat._total
    accuracy = 100.0 * mat._correct / total if total else 0.0
    print('%d of %d correct ==> %f%%' % (mat._correct, total, accuracy))
    print(' time=%0.3fs' % (tm2 - tm1))
def get_instance(tweet):
    """Build a (feature_dictionary, label) pair for one tweet.

    The feature dictionary maps every token produced by
    ``ttokenize.tokenize(tweet.text)`` to ``True``; the label is the value
    returned by ``tweet.get_majority_vote()``.
    """
    tokens = ttokenize.tokenize(tweet.text)
    # Presence-only features: each distinct token is simply flagged True.
    features = dict.fromkeys(tokens, True)
    label = tweet.get_majority_vote()
    return features, label