import datetime


def main():
    # Setup: parse trec06p-cs280/labels into two dictionaries, train_data and test_data.
    # Each dictionary is a {file : spam_or_ham} where file is the path of the message,
    # e.g. 'data/001/001', and spam_or_ham is the label, e.g. 'spam' or 'ham'.
    train_data, test_data = parse_labels()
    print "There are %d documents for training, and %d documents for testing" % (len(train_data), len(test_data))

    print datetime.datetime.now().strftime("%I:%M:%S") + " : " + " Generating Counts"
    # Create a word_dict (i.e. {word : (spam_count, ham_count)}), and get the total number
    # of documents, the number of spam documents, and the number of ham documents.
    word_dict, total_doc_count, spam_docs_count, ham_docs_count = create_word_counts(train_data)
    print datetime.datetime.now().strftime("%I:%M:%S") + " : " + " Done."

    print "There are a total of %d documents of which %d are spam and %d are ham." % (total_doc_count, spam_docs_count, ham_docs_count)
    print "Prior probability for spam is: %f" % (spam_docs_count / float(total_doc_count))
    print "Prior probability for ham is: %f" % (ham_docs_count / float(total_doc_count))
    print "The vocabulary extracted from training totals %d words" % (len(word_dict))

    data_set = test_data

    # Iterate through possible values of lambda for lambda smoothing.
    # la_set = [0.00]
    la_set = [0.00, 0.005, 0.1, 0.5, 1.0, 2.0]
    for la in la_set:
        print datetime.datetime.now().strftime("%I:%M:%S") + " : " + " Computing Probs for lambda={0}".format(la)
        word_probs = compute_probs(word_dict, spam_docs_count, ham_docs_count, la)
        spam_prior_prob = float(spam_docs_count) / total_doc_count
        ham_prior_prob = float(ham_docs_count) / total_doc_count
        print datetime.datetime.now().strftime("%I:%M:%S") + " : " + " Done"

        TP, FP, TN, FN = classify(data_set, word_probs, spam_prior_prob, ham_prior_prob)
        summarize_findings(TP, FP, TN, FN)
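# A minimal sketch of what compute_probs could look like under a Bernoulli
# (per-document) event model, assuming word_dict maps word -> (spam_count, ham_count)
# where each count is the number of spam/ham documents containing the word, and la is
# the lambda-smoothing parameter swept above. The actual helper may smooth differently.
def compute_probs_sketch(word_dict, spam_docs_count, ham_docs_count, la):
    word_probs = {}
    for word, (spam_count, ham_count) in word_dict.items():
        # Lambda smoothing: add la to the count and 2 * la to the denominator
        # (one la for each outcome: word present / word absent).
        p_word_given_spam = (spam_count + la) / float(spam_docs_count + 2 * la)
        p_word_given_ham = (ham_count + la) / float(ham_docs_count + 2 * la)
        word_probs[word] = (p_word_given_spam, p_word_given_ham)
    return word_probs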
def main():
    train_data, test_data = parse_labels()

    print datetime.datetime.now().strftime("%I:%M:%S") + " : " + " Generating Counts"
    word_dict, total_doc_count, spam_doc_count, ham_doc_count = create_word_counts(train_data)
    print datetime.datetime.now().strftime("%I:%M:%S") + " : " + " Done."

    la = 2.000
    print datetime.datetime.now().strftime("%I:%M:%S") + " : " + " Computing Probs for lambda={0}".format(la)
    word_probs = compute_probs(word_dict, spam_doc_count, ham_doc_count, la)

    spam_prior_prob = float(spam_doc_count) / (spam_doc_count + ham_doc_count)
    ham_prior_prob = float(ham_doc_count) / (spam_doc_count + ham_doc_count)
    print datetime.datetime.now().strftime("%I:%M:%S") + " : " + " Spam Prior Prob : {0}".format(spam_prior_prob)
    print datetime.datetime.now().strftime("%I:%M:%S") + " : " + " Ham Prior Prob : {0}".format(ham_prior_prob)

    # Rank the vocabulary by mutual information with the class label and keep the
    # 200 most informative words.
    best_mi = compute_mi(word_probs, spam_prior_prob, ham_prior_prob)
    top_200 = best_mi[0:200]
    print [word[0] for word in top_200]
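import math


# A minimal sketch of what compute_mi could compute, assuming word_probs maps
# word -> (P(word|spam), P(word|ham)) under a Bernoulli event model and that the
# result is a list of (word, mi) pairs sorted best-first, matching the top-200
# slice above. The actual helper may score or sort differently.
def compute_mi_sketch(word_probs, spam_prior_prob, ham_prior_prob):
    scored = []
    priors = {'spam': spam_prior_prob, 'ham': ham_prior_prob}
    for word, (p_w_spam, p_w_ham) in word_probs.items():
        # Joint probabilities P(word present/absent, class) for the four combinations.
        joints = {
            (1, 'spam'): p_w_spam * spam_prior_prob,
            (1, 'ham'): p_w_ham * ham_prior_prob,
            (0, 'spam'): (1.0 - p_w_spam) * spam_prior_prob,
            (0, 'ham'): (1.0 - p_w_ham) * ham_prior_prob,
        }
        # Marginal probability of the word being present / absent.
        p_w = {
            1: joints[(1, 'spam')] + joints[(1, 'ham')],
            0: joints[(0, 'spam')] + joints[(0, 'ham')],
        }
        # MI(W; C) = sum over states of P(w, c) * log(P(w, c) / (P(w) * P(c))).
        mi = 0.0
        for (w_state, c), joint in joints.items():
            if joint > 0.0:
                mi += joint * math.log(joint / (p_w[w_state] * priors[c]))
        scored.append((word, mi))
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored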
def main():
    # train_data = {
    #     'test/spam1' : 'spam',
    #     'test/spam2' : 'spam',
    #     'test/spam3' : 'spam',
    #     'test/spam4' : 'spam',
    #     'test/ham1' : 'ham',
    #     'test/ham2' : 'ham',
    #     'test/ham3' : 'ham',
    #     'test/ham4' : 'ham',
    # }

    # Setup
    train_data, test_data = parse_labels()
    stats(test_data)
    # Override the parsed test set with a single document while debugging.
    test_data = {
        'test/test0' : 'spam',
    }

    print datetime.datetime.now().strftime("%I:%M:%S") + " : " + " Generating Counts"
    word_dict, spam_count, ham_count = create_word_counts(train_data)
    print datetime.datetime.now().strftime("%I:%M:%S") + " : " + " Done."

    # Iterate through possible values of lambda for lambda smoothing.
    # la_set = [0.005, 0.1, 0.5, 1.0, 2.0]
    la_set = [0]
    for la in la_set:
        print datetime.datetime.now().strftime("%I:%M:%S") + " : " + " Computing Probs for lambda={0}".format(la)
        word_probs = compute_probs(word_dict, spam_count, ham_count, la)
        spam_prior_prob = float(spam_count) / (spam_count + ham_count)
        ham_prior_prob = float(ham_count) / (spam_count + ham_count)
        print datetime.datetime.now().strftime("%I:%M:%S") + " : " + " Done"

        TP, FP, TN, FN, problems = classify(test_data, word_probs, spam_prior_prob, ham_prior_prob)
        summarize_findings(la, TP, FP, TN, FN, problems)
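import math


# A minimal sketch of the per-document decision rule a classify() like the ones above
# could apply, assuming word_probs maps word -> (P(word|spam), P(word|ham)). The
# tokenizer below is a hypothetical stand-in; the real classify() also tallies
# TP/FP/TN/FN over the whole data set and collects unreadable files into "problems".
def tokenize_sketch(path):
    # Hypothetical whitespace tokenizer; the real code may lowercase, strip
    # headers, or split on punctuation instead.
    with open(path) as f:
        return f.read().split()


def classify_one_sketch(path, word_probs, spam_prior_prob, ham_prior_prob):
    log_spam = math.log(spam_prior_prob)
    log_ham = math.log(ham_prior_prob)
    for word in tokenize_sketch(path):
        if word not in word_probs:
            continue  # ignore words never seen in training
        p_spam, p_ham = word_probs[word]
        if p_spam == 0.0 or p_ham == 0.0:
            continue  # with la = 0 some probabilities are zero; skip to keep log() defined
        # Summing log-probabilities avoids underflow from multiplying many small numbers.
        log_spam += math.log(p_spam)
        log_ham += math.log(p_ham)
    return 'spam' if log_spam > log_ham else 'ham'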