def main(): data_sets_dir = "C:\\Users\\Alex\\Downloads\\Data Sets" set_dirs = ["DictionarySets-1.1", "DictionarySets-1.2", "DictionarySets-2.1", "DictionarySets-2.2", "DictionarySets-3.1", "Mislabeled-Big", "Mislabeled-Both-1.1", "Mislabeled-Both-1.2", "Mislabeled-Both-2.1", "Mislabeled-Both-2.2", "Mislabeled-Both-3.1", "Mislabeled-HtoS-1.1", "Mislabeled-HtoS-1.2", "Mislabeled-HtoS-1.3", "Mislabeled-HtoS-1.4", "Mislabeled-HtoS-1.5", "Mislabeled-StoH-1.1", "Mislabeled-StoH-1.2", "Mislabeled-StoH-1.3", "Mislabeled-StoH-2.1", "Mislabeled-StoH-2.2"] hams = [seterize(data_sets_dir, set_dir, False, 3) for set_dir in set_dirs] spams = [seterize(data_sets_dir, set_dir, True, 3) for set_dir in set_dirs] assert(len(hams) == len(spams)) sets = [0] for i in sets: ham = hams[i] spam = spams[i] au = ActiveUnlearnDriver.ActiveUnlearner([msgs.HamStream(ham[1], [ham[1]]), msgs.HamStream(ham[2], [ham[2]])], # Training Ham [msgs.SpamStream(spam[1], [spam[1]]), msgs.SpamStream(spam[2], [spam[2]])], # Training Spam msgs.HamStream(ham[0], [ham[0]]), # Testing Ham msgs.SpamStream(spam[0], [spam[0]]), # Testing Spam ) print "Cluster list:\n" outfile = open("C:\\Users\\Alex\\Desktop\\cluster_au.txt", 'w') cluster_list = ActiveUnlearnDriver.cluster_au(au, gold=True, test=True) print cluster_list outfile.write(cluster_list) outfile.close()
def main():
    """Unlearn the whole polluted training partition in one shot (wrapped in a
    ProxyCluster) and re-measure the classification rate.

    NOTE(review): depends on module-level ``hams`` / ``spams`` built elsewhere
    in this file; this function is not self-contained.
    """
    num_data_sets = len(hams)
    assert(len(hams) == len(spams))
    sets = [0]  # only the first data set is processed
    for i in sets:
        ham = hams[i]
        spam = spams[i]
        # Index convention (presumably): 0 = test dir, 1 = train dir,
        # 2 = polluted dir -- TODO confirm against seterize().
        ham_test = ham[0]
        spam_test = spam[0]
        ham_train = ham[1]
        spam_train = spam[1]
        ham_p = ham[2]
        spam_p = spam[2]
        try:
            au = ActiveUnlearnDriver.ActiveUnlearner(
                [msgs.HamStream(ham_train, [ham_train]),
                 msgs.HamStream(ham_p, [ham_p])],          # Training Ham
                [msgs.SpamStream(spam_train, [spam_train]),
                 msgs.SpamStream(spam_p, [spam_p])],       # Training Spam
                msgs.HamStream(ham_test, [ham_test]),      # Testing Ham
                msgs.SpamStream(spam_test, [spam_test]),   # Testing Spam
            )
            print "Unlearning..."
            # train_examples[2] is wrapped so the entire partition can be
            # unlearned as if it were a single cluster.
            cluster = ProxyCluster(au.driver.tester.train_examples[2])
            au.unlearn(cluster)
            # Disabled timing experiment kept for reference (string literal,
            # never executed):
            """
            time_1 = time.time()
            for i in range(10):
                au.init_ground(update=False)
            time_2 = time.time()
            avg_no_update = float(time_2 - time_1) / 10
            no_update_rate = au.driver.tester.correct_classification_rate()
            time_3 = time.time()
            for i in range(10):
                au.init_ground(update=True)
            time_4 = time.time()
            avg_update = float(time_4 - time_3) / 10
            update_rate = au.driver.tester.correct_classification_rate()
            print "Average test without update: " + str(avg_no_update)
            print "Average test with update: " + str(avg_update)
            print "Detection rate without update: " + str(no_update_rate)
            print "Detection rate with update: " + str(update_rate)
            """
            au.init_ground(update=False)
            au.init_ground(update=True)
            au.driver.tester.correct_classification_rate()
        except KeyboardInterrupt:
            sys.exit()
def main():
    """Run the unlearn-statistics experiment with cluster-feature output for
    data set index 10, comparing a polluted learner against a vanilla one.

    NOTE(review): depends on module-level ``hams`` / ``spams`` / ``set_dirs``
    plus helpers ``dir_enumerate``, ``seconds_to_english``, ``stats`` and
    ``print_cluster_pollution`` defined elsewhere in this file.
    """
    sets = [10]
    dest = "C:/Users/bzpru/Desktop/spambayes-1.1a6/unpollute_stats/Yang_Data_Sets (cluster features)/"
    for i in sets:
        ham = hams[i]
        spam = spams[i]
        data_set = set_dirs[i]
        # Indices > 10 swap the roles of Set1/Set2 (mislabeled data sets);
        # for sets = [10] this branch is dead but kept for parity with the
        # other drivers in this file.
        if i > 10:
            ham_test = ham[1]
            spam_test = spam[1]
            ham_train = ham[0]
            spam_train = spam[0]
        else:
            ham_test = ham[0]
            spam_test = spam[0]
            ham_train = ham[1]
            spam_train = spam[1]
        ham_p = ham[2]
        spam_p = spam[2]
        # Email counts for reporting.
        ham_polluted = dir_enumerate(ham_p)
        spam_polluted = dir_enumerate(spam_p)
        train_ham = dir_enumerate(ham_train)
        train_spam = dir_enumerate(spam_train)
        test_ham = dir_enumerate(ham_test)
        test_spam = dir_enumerate(spam_test)
        total_polluted = ham_polluted + spam_polluted
        total_unpolluted = train_ham + train_spam
        time_1 = time.time()
        # Polluted learner: trained on clean + polluted streams.
        p_au = ActiveUnlearnDriver.ActiveUnlearner(
            [msgs.HamStream(ham_train, [ham_train]),
             msgs.HamStream(ham_p, [ham_p])],            # Training Ham
            [msgs.SpamStream(spam_train, [spam_train]),
             msgs.SpamStream(spam_p, [spam_p])],         # Training Spam
            msgs.HamStream(ham_test, [ham_test]),        # Testing Ham
            msgs.SpamStream(spam_test, [spam_test]),     # Testing Spam
            distance_opt="inv-match", all_opt=True,
            update_opt="hybrid", greedy_opt=False)
        # Vanilla learner: same clean training data, no pollution stream.
        v_au = ActiveUnlearnDriver.ActiveUnlearner(
            [msgs.HamStream(ham_train, [ham_train]), []],
            [msgs.SpamStream(spam_train, [spam_train]), []],
            msgs.HamStream(ham_test, [ham_test]),
            msgs.SpamStream(spam_test, [spam_test]))
        vanilla_detection_rate = v_au.current_detection_rate
        time_2 = time.time()
        train_time = seconds_to_english(time_2 - time_1)
        print "Train time:", train_time, "\n"
        with open(dest + data_set + " (unlearn_stats).txt", 'w') as outfile:
            cluster_list = stats(p_au, outfile, data_set,
                                 [train_ham, train_spam],
                                 [test_ham, test_spam],
                                 [ham_polluted, spam_polluted],
                                 total_polluted, total_unpolluted, train_time,
                                 vanilla=[vanilla_detection_rate, v_au],
                                 clusters=True)
        with open(dest + data_set + " (Separate Features).txt", 'w') as outfile:
            outfile.write("---------------------------\n")
            outfile.write("Data Set: " + data_set + "\n")
            outfile.write("Vanilla Training: " + str(train_ham) + " ham and " + str(train_spam) + " spam.\n")
            outfile.write("Testing: " + str(test_ham) + " ham and " + str(test_spam) + " spam.\n")
            outfile.write("Pollution Training: " + str(ham_polluted) + " ham and " + str(spam_polluted) + " spam.\n")
            outfile.write("---------------------------\n")
            outfile.write("\n\n")
            print_cluster_pollution(outfile, cluster_list)
        # In the hopes of keeping RAM down between iterations
        del p_au
        del v_au
def main():
    """Run the greatest-impact active-unlearning experiment on the
    "Mislabeled-Big" data set and write per-set statistics to disk.
    """
    data_sets_dir = "C:\\Users\\Alex\\Downloads\\Data Sets"
    set_dirs = ["Mislabeled-Big"]
    hams = [seterize(data_sets_dir, set_dir, False, 3) for set_dir in set_dirs]
    spams = [seterize(data_sets_dir, set_dir, True, 3) for set_dir in set_dirs]
    num_data_sets = len(hams)
    assert (len(hams) == len(spams))
    for i in range(num_data_sets):
        ham = hams[i]
        spam = spams[i]
        # Email counts; index convention is (0=test, 1=train, 2=polluted).
        ham_polluted = dir_enumerate(ham[2])
        spam_polluted = dir_enumerate(spam[2])
        train_ham = dir_enumerate(ham[1])
        train_spam = dir_enumerate(spam[1])
        test_ham = dir_enumerate(ham[0])
        test_spam = dir_enumerate(spam[0])
        total_polluted = ham_polluted + spam_polluted
        try:
            time_1 = time.time()
            au = ActiveUnlearnDriver.ActiveUnlearner(
                [
                    msgs.HamStream(ham[1], [ham[1]]),
                    msgs.HamStream(ham[2], [ham[2]])
                ],  # Training Ham
                [
                    msgs.SpamStream(spam[1], [spam[1]]),
                    msgs.SpamStream(spam[2], [spam[2]])
                ],  # Training Spam
                msgs.HamStream(ham[0], [ham[0]]),      # Testing Ham
                msgs.SpamStream(spam[0], [spam[0]]),   # Testing Spam
            )
            time_2 = time.time()
            train_time = time_2 - time_1
            print "Train time:", train_time, "\n"
            with open("C:\\Users\\Alex\\Desktop\\unpollute_stats\\big_yang_" + str(i + 1) + ".txt", 'w') \
                    as outfile:
                try:
                    outfile.write("---------------------------\n")
                    outfile.write("Data Set: " + set_dirs[i] + "\n")
                    outfile.write("Vanilla Training: " + str(train_ham) + " ham and " + str(train_spam) + " spam.\n")
                    outfile.write("Testing: " + str(test_ham) + " ham and " + str(test_spam) + " spam.\n")
                    outfile.write("Pollution Training: " + str(ham_polluted) + " ham and " + str(spam_polluted) + " spam.\n")
                    outfile.write("---------------------------\n")
                    outfile.write("\n\n")
                    outfile.write("CLUSTER AND RATE COUNTS:\n")
                    outfile.write("---------------------------\n")
                    original_detection_rate = au.driver.tester.correct_classification_rate()
                    outfile.write("0: " + str(original_detection_rate) + "\n")
                    time_start = time.time()
                    cluster_list = au.greatest_impact_active_unlearn(
                        outfile, test=True, pollution_set3=True, gold=True)
                    time_end = time.time()
                    unlearn_time = time_end - time_start
                    total_polluted_unlearned = 0
                    total_unlearned = 0
                    total_unpolluted_unlearned = 0
                    final_detection_rate = au.current_detection_rate
                    print "\nTallying up final counts...\n"
                    # greatest_impact_active_unlearn returns (something, cluster)
                    # pairs here -- the cluster itself is element [1].
                    for cluster in cluster_list:
                        cluster = cluster[1]
                        total_unlearned += cluster.size
                        total_polluted_unlearned += cluster.target_set3()
                        total_unpolluted_unlearned += (cluster.size - cluster.target_set3())
                    outfile.write("\nSTATS\n")
                    outfile.write("---------------------------\n")
                    outfile.write("Initial Detection Rate: " + str(original_detection_rate) + "\n")
                    outfile.write("Final Detection Rate: " + str(final_detection_rate) + "\n")
                    outfile.write("Total Unlearned:\n")
                    outfile.write(str(total_unlearned) + "\n")
                    outfile.write("Polluted Percentage of Unlearned:\n")
                    outfile.write(str(float(total_polluted_unlearned) / float(total_unlearned)) + "\n")
                    outfile.write("Unpolluted Percentage of Unlearned:\n")
                    outfile.write(str(float(total_unpolluted_unlearned) / float(total_unlearned)) + "\n")
                    outfile.write("Percentage of Polluted Unlearned:\n")
                    outfile.write(str(float(total_polluted_unlearned) / float(total_polluted)) + "\n")
                    outfile.write("Time for training:\n")
                    outfile.write(str(train_time) + "\n")
                    outfile.write("Time for unlearning:\n")
                    outfile.write(str(unlearn_time))
                except KeyboardInterrupt:
                    # Persist whatever was written before aborting.
                    outfile.flush()
                    os.fsync(outfile)
                    sys.exit()
        except KeyboardInterrupt:
            sys.exit()
def main():
    """Interactively measure detection rates while growing a cluster around a
    randomly chosen training ham, writing one stats file per trial.
    """
    import os
    import sys
    from random import choice
    # Make the spambayes package importable from the current / parent dir.
    sys.path.insert(-1, os.getcwd())
    sys.path.insert(-1, os.path.dirname(os.getcwd()))
    from spambayes import ActiveUnlearnDriver
    from spambayes.Options import get_pathname_option
    from spambayes import msgs
    """ from dictionarywriter import DictionaryWriter """
    ham = [
        get_pathname_option("TestDriver", "ham_directories") % i
        for i in range(1, 5)
    ]
    spam = [
        get_pathname_option("TestDriver", "spam_directories") % i
        for i in range(1, 5)
    ]
    """ DictionaryWriter(600).write() """
    keep_going = True
    trial_number = 1
    au_v = ActiveUnlearnDriver.ActiveUnlearner(
        [msgs.HamStream(ham[1], [ham[1]]), msgs.HamStream(ham[2], [ham[2]])],
        [
            msgs.SpamStream(spam[1], [spam[1]]),
            msgs.SpamStream(spam[3], [spam[3]])
        ],
        msgs.HamStream(ham[0], [ham[0]]),
        msgs.SpamStream(spam[0], [spam[0]]),
    )
    while keep_going:
        msg = choice(au_v.driver.tester.train_examples[0])
        try:
            test_cl, counter = au_v.determine_cluster(msg)
            test_size = test_cl.size
            au_v.learn(test_cl)
        except TypeError:
            # determine_cluster returned a non-unpackable value; fall back to
            # a single growth step and record the failure in the stats file.
            counter = 1
            test_size = "100, but fail"
        cluster_detection_rates_v = []
        cluster_spam_rates_v = []
        cluster_sizes = []
        au_v.init_ground()
        original_rate_v = au_v.driver.tester.correct_classification_rate()
        cluster_size = 100
        cluster_sizes.append(100)
        print "Clustering with size", cluster_size, "..."
        cl_v = ActiveUnlearnDriver.Cluster(msg, cluster_size, au_v, "extreme")
        cluster_spam_rates_v.append(
            float(cl_v.target_spam()) / float(cluster_size))
        cluster_detection_rates_v.append(au_v.start_detect_rate(cl_v))
        # Grow the cluster by 100 messages per step and record rates.
        for i in range(1, counter + 2):
            cluster_size += 100
            cluster_sizes.append(cluster_size)
            print "Clustering with size", cluster_size, "..."
            cluster_detection_rates_v.append(
                au_v.continue_detect_rate(cl_v, 100))
            cluster_spam_rates_v.append(
                float(cl_v.target_spam()) / float(cluster_size))
        with open(
                "C:\Users\Alex\Desktop\det_cluster_stats_v" +
                str(trial_number) + ".txt", 'w') as outfile:
            outfile.write("VANILLA MACHINE\n")
            outfile.write("--------------------------\n")
            outfile.write("Clustered around: " + msg.tag + "\n")
            outfile.write("--------------------------\n")
            outfile.write("Detection Rates:\n")
            outfile.write(str(original_rate_v) + "\n")
            for item in cluster_detection_rates_v:
                outfile.write(str(item) + "\n")
            outfile.write("--------------------------\n")
            outfile.write("Spam Rate:\n")
            for item in cluster_spam_rates_v:
                outfile.write(str(item) + "\n")
            outfile.write("Test Cluster Size:\n")
            outfile.write(str(test_size))
        answer = raw_input("Keep going (y/n)? You have performed " +
                           str(trial_number) + " trials so far. ")
        if answer == "n":
            keep_going = False
        else:
            # Re-learn the cluster and reset ground truth for the next trial.
            au_v.learn(cl_v)
            au_v.init_ground()
            trial_number += 1
def main():
    """Repeatedly run brute-force active unlearning, writing one stats file
    per trial, until the operator answers "n".
    """
    ham = [
        get_pathname_option("TestDriver", "ham_directories") % i
        for i in range(1, 5)
    ]
    spam = [
        get_pathname_option("TestDriver", "spam_directories") % i
        for i in range(1, 5)
    ]
    keep_going = True
    trial_number = 1
    try:
        time_1 = time.time()
        au = ActiveUnlearnDriver.ActiveUnlearner(
            [
                msgs.HamStream(ham[1], [ham[1]]),
                msgs.HamStream(ham[2], [ham[2]])
            ],  # Training Ham
            [
                msgs.SpamStream(spam[1], [spam[1]]),
                msgs.SpamStream(spam[2], [spam[2]])
            ],  # Training Spam
            msgs.HamStream(ham[0], [ham[0]]),      # Testing Ham
            msgs.SpamStream(spam[0], [spam[0]]),   # Testing Spam
        )
        time_2 = time.time()
        train_time = time_2 - time_1
        print "Train time:", train_time, "\n"
        while keep_going:
            with open("C:\\Users\\Alex\\Desktop\\unpollute_stats\\unlearn_stats" + str(trial_number) + ".txt", 'w') \
                    as outfile:
                try:
                    outfile.write("CLUSTER AND RATE COUNTS:\n")
                    outfile.write("---------------------------\n")
                    original_detection_rate = au.driver.tester.correct_classification_rate()
                    outfile.write("0: " + str(original_detection_rate) + "\n")
                    time_start = time.time()
                    cluster_list = au.brute_force_active_unlearn(
                        outfile, test=True, center_iteration=False,
                        pollution_set3=True, gold=True)
                    time_end = time.time()
                    unlearn_time = time_end - time_start
                    total_polluted_unlearned = 0
                    total_unlearned = 0
                    total_unpolluted_unlearned = 0
                    final_detection_rate = au.current_detection_rate
                    print "\nTallying up final counts...\n"
                    for cluster in cluster_list:
                        total_unlearned += cluster.size
                        total_polluted_unlearned += cluster.target_set3()
                        total_unpolluted_unlearned += (cluster.size - cluster.target_set3())
                    outfile.write("\nSTATS\n")
                    outfile.write("---------------------------\n")
                    outfile.write("Initial Detection Rate: " + str(original_detection_rate) + "\n")
                    outfile.write("Final Detection Rate: " + str(final_detection_rate) + "\n")
                    outfile.write("Total Unlearned:\n")
                    outfile.write(str(total_unlearned) + "\n")
                    outfile.write("Polluted Percentage of Unlearned:\n")
                    outfile.write(str(float(total_polluted_unlearned) / float(total_unlearned)) + "\n")
                    outfile.write("Unpolluted Percentage of Unlearned:\n")
                    outfile.write(str(float(total_unpolluted_unlearned) / float(total_unlearned)) + "\n")
                    outfile.write("Percentage of Polluted Unlearned:\n")
                    # NOTE(review): 1200 is a hard-coded total-polluted count
                    # for this corpus; other drivers compute it from the dirs.
                    outfile.write(str(float(total_polluted_unlearned) / 1200) + "\n")
                    outfile.write("Time for training:\n")
                    outfile.write(str(train_time) + "\n")
                    outfile.write("Time for unlearning:\n")
                    outfile.write(str(unlearn_time))
                except KeyboardInterrupt:
                    outfile.flush()
                    os.fsync(outfile)
                    """ m.reset() """
                    sys.exit()
            answer = raw_input("\nKeep going (y/n)? You have performed " +
                               str(trial_number) + " trial(s) so far. ")
            valid_input = False
            while not valid_input:
                if answer == "n":
                    keep_going = False
                    valid_input = True
                elif answer == "y":
                    # Re-learn everything that was unlearned, then reset for
                    # the next trial.
                    for cluster in cluster_list:
                        au.learn(cluster)
                    au.init_ground()
                    trial_number += 1
                    valid_input = True
                else:
                    answer = raw_input("Please enter either y or n. ")
    except KeyboardInterrupt:
        """ m.reset() """
        sys.exit()
def main(): ham = [ get_pathname_option("TestDriver", "ham_directories") % i for i in range(1, 4) ] spam = [ get_pathname_option("TestDriver", "spam_directories") % i for i in range(1, 4) ] injected = get_pathname_option("TestDriver", "spam_directories") % 3 au = ActiveUnlearnDriver.ActiveUnlearner( [msgs.HamStream(ham[0], [ham[0]]), msgs.HamStream(ham[2], [ham[2]])], [ msgs.SpamStream(spam[0], [spam[0]]), msgs.SpamStream(spam[2], [spam[2]]) ], msgs.HamStream(ham[1], [ham[1]]), msgs.SpamStream(spam[1], [spam[1]])) msg = choice( au.driver.tester.train_examples[2]) # Randomly chosen from Ham Set3 original_rate = au.driver.tester.correct_classification_rate() cluster_sizes = [] detection_rates = [] target_cluster_rates = [] sizes = [] for i in range(150, 1050, 50): sizes.append(i) for i in range(1000, 15000, 1000): sizes.append(i) for size in sizes: cluster = ActiveUnlearnDriver.Cluster(msg, size, au, "extreme") print "Clustering with size " + str(cluster.size) + "..." cluster_sizes.append(size) detection_rates.append(au.detect_rate(cluster)) target_cluster_rates.append( float(cluster.target_set3()) / float(cluster.size)) file = open("/Users/AlexYang/Desktop/clues.txt", 'w') features = au.driver.classifier._getclues(msg) i = 1 for feature in features: file.write(str(i) + ") ") file.write(str(feature) + "\n") i += 1 with open("/Users/AlexYang/Desktop/clusterstats.txt", 'w') as outfile: outfile.write("Clustered around: " + msg.tag) outfile.write("\nOriginal Rate: " + str(original_rate) + "\n") outfile.write( tabulate( { "Cluster Sizes": cluster_sizes, "Detection Rates": detection_rates, "% of Targets Clustered": target_cluster_rates }, headers="keys", tablefmt="plain"))
def main():
    """Command-line driver for cross-validated unlearn-statistics runs.

    Supports partitioning the test set into T1/T2 (random / by feature /
    by mislabeling), custom distance methods, and ham/spam cutoffs.
    NOTE(review): depends on module-level ``hams`` / ``spams`` / ``set_dirs``
    / ``dest`` / ``options`` and helpers (``dir_enumerate``, ``partitioner``,
    ``seconds_to_english``, ``unlearn_stats``) defined elsewhere in the file.
    """
    # sets = [11,12,13,14,15]  # mislabeled_both_small
    sets = [17, 20]
    # sets = [16,17,18,19,20,21]  # mislabeled_both_big
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-cv', '--cross', type=str,
        help="partition test set into T1 and T2 for cross-validation",
        choices=['random', 'features', 'mislabeled'], default=None)
    parser.add_argument('-f', '--features', nargs='*',
                        help="what features to split into T2", default=None)
    parser.add_argument('-d', '--dest', type=str,
                        help="choose alternate destination for output file")
    parser.add_argument('-dist', '--distance', type=str, default='frequency5',
                        choices=['frequency5', 'frequency3'],
                        help="choose a distance method")
    parser.add_argument('-hc', '--ham_cutoff', type=float, default=.2,
                        help="choose a ham cutoff probability")
    parser.add_argument('-sc', '--spam_cutoff', type=float, default=.8,
                        help="choose a spam cutoff probability")
    parser.add_argument('-cp', '--copies', type=int, default=1,
                        help="number of times to copy T1")
    parser.add_argument(
        '-mc', '--misclassified', dest='misclassified', action='store_true',
        help="When partitioning T1, do we include only misclassified emails?")
    parser.set_defaults(misclassified=False)
    args = parser.parse_args()
    print args
    if args.dest:
        # Append the user-supplied suffix to the module-level output path.
        global dest
        dest += args.dest
        print "path selected: ", dest
    options['Categorization', 'ham_cutoff'] = args.ham_cutoff
    options['Categorization', 'spam_cutoff'] = args.spam_cutoff
    for i in sets:
        ham = hams[i]
        spam = spams[i]
        data_set = set_dirs[i]
        print "beginning tests on ", data_set
        if i > 10:
            # Set2 is test and Set1 is training for all mislabeled datasets
            ham_test = ham[1]  # approx 20,000 test and 12,000 train
            spam_test = spam[1]
            ham_train = ham[0]
            spam_train = spam[0]
        else:
            ham_test = ham[0]  # approx 12,000 test and 20,000 train
            spam_test = spam[0]
            ham_train = ham[1]
            spam_train = spam[1]
        # the polluted data sets
        ham_p = ham[2]
        spam_p = spam[2]
        # Calculate the number of emails for polluted, train, test, and total data sets
        ham_polluted = dir_enumerate(ham_p)
        spam_polluted = dir_enumerate(spam_p)
        train_ham = dir_enumerate(ham_train)
        train_spam = dir_enumerate(spam_train)
        test_ham = dir_enumerate(ham_test)
        test_spam = dir_enumerate(spam_test)
        total_polluted = ham_polluted + spam_polluted
        total_unpolluted = train_ham + train_spam
        try:
            time_1 = time.time()  # begin timer
            # Instantiate ActiveUnlearner object
            if args.cross is not None:
                au_temp = None
                if args.cross == 'mislabeled' or args.misclassified:
                    # find mislabeled emails
                    print '------Gathering Mislabeled Emails------'
                    au_temp = ActiveUnlearnDriver.ActiveUnlearner(
                        [
                            msgs.HamStream(ham_train, [ham_train]),
                            msgs.HamStream(ham_p, [ham_p])
                        ],  # Training Ham
                        [
                            msgs.SpamStream(spam_train, [spam_train]),
                            msgs.SpamStream(spam_p, [spam_p])
                        ],  # Training Spam
                        msgs.HamStream(ham_test, [ham_test]),      # Testing Ham
                        msgs.SpamStream(spam_test, [spam_test]),   # Testing Spam
                        distance_opt=args.distance, all_opt=True,
                        update_opt="hybrid", greedy_opt=True,
                        include_unsures=False)  # Don't unclude unsure emails
                    print '------Mislabeled Emails Gathered------'
                t1_ham, t1_spam, t2_ham, t2_spam = partitioner.partition(
                    test_ham, ham_test, test_spam, spam_test, args.cross,
                    args.features, args.copies,
                    mis_only=args.misclassified, au=au_temp)
                au = ActiveUnlearnDriver.ActiveUnlearner(
                    [
                        msgs.HamStream(ham_train, [ham_train]),
                        msgs.HamStream(ham_p, [ham_p])
                    ],  # Training Ham
                    [
                        msgs.SpamStream(spam_train, [spam_train]),
                        msgs.SpamStream(spam_p, [spam_p])
                    ],  # Training Spam
                    msgs.HamStream(ham_test, [ham_test], indices=t1_ham),      # Testing Ham
                    msgs.SpamStream(spam_test, [spam_test], indices=t1_spam),  # Testing Spam
                    cv_ham=msgs.HamStream(ham_test, [ham_test], indices=t2_ham),   # T2 testing Ham
                    cv_spam=msgs.SpamStream(spam_test, [spam_test], indices=t2_spam),  # T2 testing Spam
                    distance_opt=args.distance, all_opt=True,
                    update_opt="hybrid", greedy_opt=True,
                    include_unsures=False,
                    partition_method=args.cross)  # Don't unclude unsure emails
            else:
                au = ActiveUnlearnDriver.ActiveUnlearner(
                    [
                        msgs.HamStream(ham_train, [ham_train]),
                        msgs.HamStream(ham_p, [ham_p])
                    ],  # Training Ham
                    [
                        msgs.SpamStream(spam_train, [spam_train]),
                        msgs.SpamStream(spam_p, [spam_p])
                    ],  # Training Spam
                    msgs.HamStream(ham_test, [ham_test]),      # Testing Ham
                    msgs.SpamStream(spam_test, [spam_test]),   # Testing Spam
                    distance_opt=args.distance, all_opt=True,
                    update_opt="hybrid", greedy_opt=True,
                    include_unsures=False)  # Don't unclude unsure emails
            # vanilla active unlearner
            v_au = ActiveUnlearnDriver.ActiveUnlearner(
                [msgs.HamStream(ham_train, [ham_train]), []],
                [msgs.SpamStream(spam_train, [spam_train]), []],
                msgs.HamStream(ham_test, [ham_test]),
                msgs.SpamStream(spam_test, [spam_test]))
            vanilla_detection_rate = v_au.current_detection_rate
            time_2 = time.time()
            train_time = seconds_to_english(time_2 - time_1)
            print "Train time:", train_time, "\n"
            with open(dest + data_set + " (unlearn_stats).txt", 'w+') as outfile:
                try:
                    if args.cross == 'features' or args.cross == 'mislabeled':
                        t1_total = len(t1_ham) + len(t1_spam)
                        t2_total = len(t2_ham) + len(t2_spam)
                        print '----------------------T1/T2 TOTALS----------------------'
                        print 'Size of T1 Ham: ' + str(len(t1_ham))
                        print 'Size of T1 Spam: ' + str(len(t1_spam))
                        print 'Size of T2 Ham: ' + str(len(t2_ham))
                        print 'Size of T2 Spam: ' + str(len(t2_spam))
                        if args.cross == 'features':
                            outfile.write('Features used to distinguish T2: ' +
                                          ', '.join(args.features) + "\n")
                        if args.cross == 'mislabeled':
                            outfile.write('Ham cutoff : ' + str(args.ham_cutoff) + "\n")
                            outfile.write('Spam cutoff : ' + str(args.spam_cutoff) + "\n")
                        outfile.write('Size of T1 Ham: ' + str(len(t1_ham)) + "\n")
                        outfile.write('Size of T1 Spam: ' + str(len(t1_spam)) + "\n")
                        outfile.write('Size of T2 Ham: ' + str(len(t2_ham)) + "\n")
                        outfile.write('Size of T2 Spam: ' + str(len(t2_spam)) + "\n")
                        outfile.flush()
                        os.fsync(outfile)
                    unlearn_stats(au, args, outfile, data_set,
                                  [train_ham, train_spam],
                                  [test_ham, test_spam],
                                  [ham_polluted, spam_polluted],
                                  total_polluted, total_unpolluted,
                                  train_time, [ham_p, spam_p],
                                  vanilla=[vanilla_detection_rate, v_au],
                                  noisy_clusters=True)
                    # unlearn_stats(au, outfile, data_set, [train_ham, train_spam], [test_ham, test_spam],
                    #               [ham_polluted, spam_polluted], total_polluted, total_unpolluted,
                    #               train_time, vanilla=None, noisy_clusters=True)
                except KeyboardInterrupt:
                    outfile.flush()
                    sys.exit()
            # In the hopes of keeping RAM down between iterations
            del au
            del v_au
        except KeyboardInterrupt:
            sys.exit()
def main():
    """Run inverse-distance, hybrid-update, non-greedy unlearn statistics with
    noisy clusters over the mislabeled-small data sets (indices 11-15).

    NOTE(review): depends on module-level ``hams`` / ``spams`` / ``set_dirs``
    and helpers defined elsewhere in this file.
    """
    sets = [11, 12, 13, 14, 15]
    for i in sets:
        ham = hams[i]
        spam = spams[i]
        data_set = set_dirs[i]
        # Mislabeled data sets (index > 10) swap train/test roles.
        if i > 10:
            ham_test = ham[1]
            spam_test = spam[1]
            ham_train = ham[0]
            spam_train = spam[0]
        else:
            ham_test = ham[0]
            spam_test = spam[0]
            ham_train = ham[1]
            spam_train = spam[1]
        ham_p = ham[2]
        spam_p = spam[2]
        # Email counts for reporting.
        ham_polluted = dir_enumerate(ham_p)
        spam_polluted = dir_enumerate(spam_p)
        train_ham = dir_enumerate(ham_train)
        train_spam = dir_enumerate(spam_train)
        test_ham = dir_enumerate(ham_test)
        test_spam = dir_enumerate(spam_test)
        total_polluted = ham_polluted + spam_polluted
        total_unpolluted = train_ham + train_spam
        try:
            time_1 = time.time()
            au = ActiveUnlearnDriver.ActiveUnlearner(
                [
                    msgs.HamStream(ham_train, [ham_train]),
                    msgs.HamStream(ham_p, [ham_p])
                ],  # Training Ham
                [
                    msgs.SpamStream(spam_train, [spam_train]),
                    msgs.SpamStream(spam_p, [spam_p])
                ],  # Training Spam
                msgs.HamStream(ham_test, [ham_test]),      # Testing Ham
                msgs.SpamStream(spam_test, [spam_test]),   # Testing Spam
                distance_opt="inverse", all_opt=True,
                update_opt="hybrid", greedy_opt=False)
            # Vanilla learner: no pollution streams.
            v_au = ActiveUnlearnDriver.ActiveUnlearner(
                [msgs.HamStream(ham_train, [ham_train]), []],
                [msgs.SpamStream(spam_train, [spam_train]), []],
                msgs.HamStream(ham_test, [ham_test]),
                msgs.SpamStream(spam_test, [spam_test]))
            vanilla_detection_rate = v_au.current_detection_rate
            time_2 = time.time()
            train_time = seconds_to_english(time_2 - time_1)
            print "Train time:", train_time, "\n"
            dest = "C:/Users/bzpru/Desktop/spambayes-1.1a6/unpollute_stats/Yang_Data_Sets (inverse)/" \
                   "Hybrid Update - Nongreedy/Noisy/"
            with open(dest + data_set + " (unlearn_stats).txt", 'w') as outfile:
                try:
                    unlearn_stats(au, outfile, data_set,
                                  [train_ham, train_spam],
                                  [test_ham, test_spam],
                                  [ham_polluted, spam_polluted],
                                  total_polluted, total_unpolluted, train_time,
                                  vanilla=[vanilla_detection_rate, v_au],
                                  noisy_clusters=True)
                except KeyboardInterrupt:
                    outfile.flush()
                    sys.exit()
            # In the hopes of keeping RAM down between iterations
            del au
            del v_au
        except KeyboardInterrupt:
            sys.exit()
def drive():
    """Interactively measure the overlap (v_correlation) between clusters and
    the injected dictionary messages, one trial per operator confirmation.

    NOTE(review): depends on module-level ``options``, ``dictionarywriter``
    and ``v_correlation`` defined/imported elsewhere in this file.
    """
    print options.display()
    spam = [
        get_pathname_option("TestDriver", "spam_directories") % i
        for i in range(1, 5)
    ]
    ham = [
        get_pathname_option("TestDriver", "ham_directories") % i
        for i in range(1, 5)
    ]
    # Inject 150 dictionary messages into set 4 before training.
    d = dictionarywriter.DictionaryWriter(150, 4)
    d.write()
    keep_going = True
    trial_number = 1
    au = ActiveUnlearnDriver.ActiveUnlearner(
        [msgs.HamStream(ham[1], [ham[1]]), msgs.HamStream(ham[2], [ham[2]])],
        [
            msgs.SpamStream(spam[1], [spam[1]]),
            msgs.SpamStream(spam[3], [spam[3]])
        ],
        msgs.HamStream(ham[0], [ham[0]]),
        msgs.SpamStream(spam[0], [spam[0]]),
    )
    with open("C:\Users\Alex\Desktop\dict_correlation_stats.txt", 'w') as outfile:
        while keep_going:
            chosen = set()
            current = au.select_initial()
            cluster = au.determine_cluster(current)
            chosen.add(current)
            au.driver.test(au.testing_ham, au.testing_spam)
            # Keep picking new seeds until a non-empty cluster is found.
            while not cluster:
                current = au.select_initial(chosen)
                cluster = au.determine_cluster(current)
                chosen.add(current)
                au.driver.test(au.testing_ham, au.testing_spam)
            cluster_list = list(cluster.cluster_set)
            # train_examples[2] holds the injected dictionary messages.
            dicts = au.driver.tester.train_examples[2]
            data = v_correlation(cluster_list, dicts)
            outfile.write("Trial " + str(trial_number) +
                          " Percentage Overlap (Correlation): " + str(data))
            answer = raw_input("Keep going (y/n)? You have performed " +
                               str(trial_number) + " trial(s) so far. ")
            valid_input = False
            while not valid_input:
                if answer == "n":
                    keep_going = False
                    valid_input = True
                elif answer == "y":
                    # Re-learn the cluster and reset for the next trial.
                    au.learn(cluster)
                    au.init_ground()
                    trial_number += 1
                    valid_input = True
                else:
                    print "Please enter either y or n."
def main(): sets = [1, 2, 3, 4] dest = "C:/Users/bzpru/Desktop/spambayes-1.1a6/unpollute_stats/Yang_Data_Sets (inverse)/Hybrid Update - Nongreedy/" for i in sets: ham = hams[i] spam = spams[i] data_set = set_dirs[i] if i > 10: ham_test = ham[1] spam_test = spam[1] ham_train = ham[0] spam_train = spam[0] else: ham_test = ham[0] spam_test = spam[0] ham_train = ham[1] spam_train = spam[1] ham_p = ham[2] spam_p = spam[2] ham_polluted = dir_enumerate(ham_p) spam_polluted = dir_enumerate(spam_p) train_ham = dir_enumerate(ham_train) train_spam = dir_enumerate(spam_train) test_ham = dir_enumerate(ham_test) test_spam = dir_enumerate(spam_test) total_polluted = ham_polluted + spam_polluted total_unpolluted = train_ham + train_spam time_1 = time.time() p_au = ActiveUnlearnDriver.ActiveUnlearner([msgs.HamStream(ham_train, [ham_train]), msgs.HamStream(ham_p, [ham_p])], # Training Ham [msgs.SpamStream(spam_train, [spam_train]), msgs.SpamStream(spam_p, [spam_p])], # Training Spam msgs.HamStream(ham_test, [ham_test]), # Testing Ham msgs.SpamStream(spam_test, [spam_test]), # Testing Spam distance_opt="inv-match", all_opt=True, update_opt="hybrid", greedy_opt=False) time_2 = time.time() train_time = seconds_to_english(time_2 - time_1) print "Train time:", train_time, "\n" v_au = ActiveUnlearnDriver.ActiveUnlearner([msgs.HamStream(ham_train, [ham_train]), []], [msgs.SpamStream(spam_train, [spam_train]), []], msgs.HamStream(ham_test, [ham_test]), msgs.SpamStream(spam_test, [spam_test])) p_c = p_au.driver.tester.classifier v_c = p_au.driver.tester.classifier words = set().union(set(p_c.wordinfo.keys()), set(v_c.wordinfo.keys())) p_pair = au_sig_words(p_au, words) v_pair = au_sig_words(v_au, words) with open(dest + data_set + " (unlearn_stats).txt", 'w') as outfile: stats(p_au, outfile, data_set, [train_ham, train_spam], [test_ham, test_spam], [ham_polluted, spam_polluted], total_polluted, total_unpolluted, train_time) words = words.union(set(p_c.wordinfo.keys())) u_pair = 
au_sig_words(p_au, words) features, sigs = extract_features([v_pair, p_pair, u_pair]) feature_matrix = feature_lists(sigs, 1) combined_matrix = [["", "Unpolluted", "Polluted", "Unlearned 1"]] + [[str(column) for column in feature] for feature in features] feature_col_width = max(len(row[1]) for row in feature_matrix) + 2 combined_col_width = max(len(item) for row in combined_matrix for item in row) + 2 feature_num_col_width = max(len(row[0]) for row in feature_matrix) + 2 with open(dest + data_set + " (Separate Features).txt", 'w') as outfile: outfile.write("---------------------------\n") outfile.write("Data Set: " + data_set + "\n") outfile.write("Vanilla Training: " + str(train_ham) + " ham and " + str(train_spam) + " spam.\n") outfile.write("Testing: " + str(test_ham) + " ham and " + str(test_spam) + " spam.\n") outfile.write("Pollution Training: " + str(ham_polluted) + " ham and " + str(spam_polluted) + " spam.\n") outfile.write("---------------------------\n") outfile.write("\n\n") outfile.write("Unpolluted and Polluted Most Significant Features:\n") outfile.write("---------------------------\n") for row in feature_matrix: justify = [row[0].ljust(feature_num_col_width)] for j in range(1, len(row)): justify.append(row[j].strip().ljust(feature_col_width)) outfile.write("".join(justify) + "\n") with open(dest + data_set + " (Combined Features).txt", 'w') as outfile: outfile.write("---------------------------\n") outfile.write("Data Set: " + data_set + "\n") outfile.write("Vanilla Training: " + str(train_ham) + " ham and " + str(train_spam) + " spam.\n") outfile.write("Testing: " + str(test_ham) + " ham and " + str(test_spam) + " spam.\n") outfile.write("Pollution Training: " + str(ham_polluted) + " ham and " + str(spam_polluted) + " spam.\n") outfile.write("---------------------------\n") outfile.write("\n\n") outfile.write("Feature Comparison:\n") outfile.write("---------------------------\n") for row in combined_matrix: 
outfile.write("".join(word.strip().ljust(combined_col_width) for word in row) + "\n")
def main():
    """Cluster ten messages around one test example after untraining sets 3
    and 4, copy the seed message to disk, and dump the cluster contents with
    HAM/SPAM markers plus a spam tally.
    """
    import os
    import sys
    import shutil
    # Make the spambayes package importable from the current / parent dir.
    sys.path.insert(-1, os.getcwd())
    sys.path.insert(-1, os.path.dirname(os.getcwd()))
    from spambayes import ActiveUnlearnDriver
    from spambayes.Options import get_pathname_option
    from spambayes import msgs
    import time
    ham = [
        get_pathname_option("TestDriver", "ham_directories") % i
        for i in range(1, 5)
    ]
    spam = [
        get_pathname_option("TestDriver", "spam_directories") % i
        for i in range(1, 5)
    ]
    for i in range(1):
        # NOTE(review): every other driver in this file instantiates
        # ActiveUnlearnDriver.ActiveUnlearner; this older-looking class name
        # may be stale -- confirm it still exists in the module.
        au = ActiveUnlearnDriver.ActiveUnlearnDriver([
            msgs.HamStream(ham[0], [ham[0]]),
            msgs.HamStream(ham[2], [ham[2]]),
            msgs.HamStream(ham[3], [ham[3]])
        ], [
            msgs.SpamStream(spam[0], [spam[0]]),
            msgs.SpamStream(spam[2], [spam[2]]),
            msgs.SpamStream(spam[3], [spam[3]])
        ], msgs.HamStream(ham[2], [ham[2]]), msgs.SpamStream(
            spam[2], [spam[2]]), "ac-extreme")
        au.driver.test(msgs.HamStream(ham[0], [ham[0]]),
                       msgs.SpamStream(spam[0], [spam[0]]))
        # Untrain sets 3 and 4, then re-test on set 1.
        au.driver.untrain(msgs.HamStream(ham[2], [ham[2]]),
                          msgs.SpamStream(spam[2], [spam[2]]))
        au.driver.untrain(msgs.HamStream(ham[3], [ham[3]]),
                          msgs.SpamStream(spam[3], [spam[3]]))
        au.driver.test(msgs.HamStream(ham[0], [ham[0]]),
                       msgs.SpamStream(spam[0], [spam[0]]))
        msg = au.driver.tester.test_examples[5]
        shutil.copy(msg.tag, "C:\Users\Alex\Desktop\clustera")
        print msg.prob
        start_time = time.time()
        cluster = (au.cluster(msg, 10))
        end_time = time.time()
        print cluster
        clueslist = []
        for clue in msg.clues:
            clueslist.append((clue[0], clue[1]))
        print clueslist
        with open("C:\Users\Alex\Desktop\clustera\cluster7.txt", 'w') as outfile:
            spamcounter = 0
            for sim in cluster:
                with open(sim.tag) as infile:
                    # Mark each copied message by its label (taken from the
                    # file-name suffix) before appending its raw contents.
                    if sim.tag.endswith(".spam.txt"):
                        outfile.write("SPAMSPAMSPAMSPAMSPAM" + "\n\n")
                    if sim.tag.endswith(".ham.txt"):
                        outfile.write("HAMHAMHAMHAMHAM" + "\n\n")
                    outfile.write(infile.read())
                    outfile.write("\n\n" + "----------------------------------------" + "\n\n")
                if sim.tag.endswith(".spam.txt"):
                    spamcounter += 1
            print spamcounter
            print end_time - start_time