def drive(nsets):
    """Run a cross-validation pass over ``nsets`` ham/spam directory pairs.

    Directory names are built by filling the "TestDriver" ham/spam pathname
    options with the set numbers 1..nsets.  The driver is first trained on
    every pair except the first; each pair is then tested against a
    classifier that has not been trained on it.  How that classifier is
    obtained depends on the
    ("CV Driver", "build_each_classifier_from_scratch") option: either a new
    classifier is trained on all other pairs, or the pair is untrained
    before the test and trained back in afterwards.

    The initial training indexes the second directory, so ``nsets`` must be
    at least 2.
    """
    print(options.display())

    hamdirs = [
        get_pathname_option("TestDriver", "ham_directories") % set_no
        for set_no in range(1, nsets + 1)
    ]
    spamdirs = [
        get_pathname_option("TestDriver", "spam_directories") % set_no
        for set_no in range(1, nsets + 1)
    ]

    driver = TestDriver.Driver()

    # Initial training: everything except the first pair, so the first pair
    # can be tested without any untraining step.
    initial_ham = msgs.HamStream("%s-%d" % (hamdirs[1], nsets),
                                 hamdirs[1:], train=1)
    initial_spam = msgs.SpamStream("%s-%d" % (spamdirs[1], nsets),
                                   spamdirs[1:], train=1)
    driver.train(initial_ham, initial_spam)

    last_index = nsets - 1
    for idx, (ham_dir, spam_dir) in enumerate(zip(hamdirs, spamdirs)):
        held_out_ham = msgs.HamStream(ham_dir, [ham_dir], train=0)
        held_out_spam = msgs.SpamStream(spam_dir, [spam_dir], train=0)

        if idx > 0:
            if options["CV Driver", "build_each_classifier_from_scratch"]:
                # Start over and train on every pair but the held-out one.
                driver.new_classifier()
                ham_label = "%s-%d, except %d" % (hamdirs[0], nsets, idx + 1)
                other_ham = hamdirs[:idx] + hamdirs[idx + 1:]
                spam_label = "%s-%d, except %d" % (spamdirs[0], nsets, idx + 1)
                other_spam = spamdirs[:idx] + spamdirs[idx + 1:]
                driver.train(msgs.HamStream(ham_label, other_ham, train=1),
                             msgs.SpamStream(spam_label, other_spam, train=1))
            else:
                # Incremental mode: remove the held-out pair from the model.
                driver.untrain(held_out_ham, held_out_spam)

        driver.test(held_out_ham, held_out_spam)
        driver.finishtest()

        if idx < last_index and not options[
                "CV Driver", "build_each_classifier_from_scratch"]:
            # Incremental mode: restore the pair before the next round.
            driver.train(held_out_ham, held_out_spam)

    driver.alldone()
def drive(nsets):
    """Train on each ham/spam pair alone and test against every other pair.

    Directory names are built by filling the "TestDriver" spam/ham pathname
    options with the set numbers 1..nsets.  For every pair a new classifier
    is created and trained on that pair only, then tested on each of the
    remaining pairs; ``finishtest`` is called once per training pair and
    ``alldone`` once at the end.
    """
    print(options.display())

    spamdirs = [
        get_pathname_option("TestDriver", "spam_directories") % set_no
        for set_no in range(1, nsets + 1)
    ]
    hamdirs = [
        get_pathname_option("TestDriver", "ham_directories") % set_no
        for set_no in range(1, nsets + 1)
    ]
    # Materialised because the pairs are walked once per training pair.
    pairs = list(zip(spamdirs, hamdirs))

    driver = TestDriver.Driver()
    for train_pair in pairs:
        train_spam, train_ham = train_pair
        driver.new_classifier()
        driver.train(msgs.HamStream(train_ham, [train_ham]),
                     msgs.SpamStream(train_spam, [train_spam]))

        for test_pair in pairs:
            # Never test on the pair the classifier was just trained on.
            if test_pair != train_pair:
                test_spam, test_ham = test_pair
                driver.test(msgs.HamStream(test_ham, [test_ham]),
                            msgs.SpamStream(test_spam, [test_spam]))

        driver.finishtest()

    driver.alldone()
def unlearn_compare(nsets, unsets):
    """Train on sets 1 and 2, test on set 3, then hand off to unlearn_driver.

    Two lists of (spam_dir, ham_dir) pairs are built from the "TestDriver"
    pathname options: one for set numbers 1..nsets and one for 1..unsets.
    A new classifier is trained on the first two pairs and tested on the
    third (so ``nsets`` must be at least 3).  After the test is closed out,
    the driver and both pair lists are passed to ``unlearn_driver``.
    """
    print(options.display())

    spamdirs = [
        get_pathname_option("TestDriver", "spam_directories") % set_no
        for set_no in range(1, nsets + 1)
    ]
    hamdirs = [
        get_pathname_option("TestDriver", "ham_directories") % set_no
        for set_no in range(1, nsets + 1)
    ]
    spamhamdirs = list(zip(spamdirs, hamdirs))

    unspamdirs = [
        get_pathname_option("TestDriver", "spam_directories") % set_no
        for set_no in range(1, unsets + 1)
    ]
    unhamdirs = [
        get_pathname_option("TestDriver", "ham_directories") % set_no
        for set_no in range(1, unsets + 1)
    ]
    unspamhamdirs = list(zip(unspamdirs, unhamdirs))

    driver = TestDriver.Driver()
    driver.new_classifier()

    # Disabled alternative kept from the original: train on every pair.
    #     for spamdir, hamdir in spamhamdirs:
    #         d.train(msgs.HamStream(hamdir, [hamdir]),
    #                 msgs.SpamStream(spamdir, [spamdir]))

    # Train on the first two pairs only.
    for pos in (0, 1):
        driver.train(msgs.HamStream(hamdirs[pos], [hamdirs[pos]]),
                     msgs.SpamStream(spamdirs[pos], [spamdirs[pos]]))

    # Test on the third pair.
    driver.test(msgs.HamStream(hamdirs[2], [hamdirs[2]]),
                msgs.SpamStream(spamdirs[2], [spamdirs[2]]))

    driver.finishtest()
    driver.alldone()

    unlearn_driver(driver, spamhamdirs, unspamhamdirs)
def drive(stats_path=r"C:\Users\Alex\Desktop\dict_correlation_stats.txt"):
    """Interactively run cluster-correlation trials and record each result.

    Builds ham/spam directory names for sets 1..4 from the "TestDriver"
    pathname options, runs ``DictionaryWriter(150, 4).write()``, and creates
    an ``ActiveUnlearner`` from those sets.  Each trial selects an initial
    message, determines its cluster (retrying with a new initial message
    until the cluster is truthy), computes ``v_correlation`` between the
    cluster members and ``au.driver.tester.train_examples[2]``, writes the
    result to the stats file, and asks the user whether to continue.

    Args:
        stats_path: File the per-trial results are written to.  It is opened
            in 'w' mode, so an existing file is truncated.  The default is
            the location that was previously hard-coded.
    """
    print(options.display())

    spam = [get_pathname_option("TestDriver", "spam_directories") % i
            for i in range(1, 5)]
    ham = [get_pathname_option("TestDriver", "ham_directories") % i
           for i in range(1, 5)]

    d = dictionarywriter.DictionaryWriter(150, 4)
    d.write()

    trial_number = 1

    # NOTE(review): ham uses sets 2 and 3 while spam uses sets 2 and 4 for
    # the first two arguments -- confirm the asymmetry is intentional.
    au = ActiveUnlearnDriver.ActiveUnlearner(
        [msgs.HamStream(ham[1], [ham[1]]),
         msgs.HamStream(ham[2], [ham[2]])],
        [msgs.SpamStream(spam[1], [spam[1]]),
         msgs.SpamStream(spam[3], [spam[3]])],
        msgs.HamStream(ham[0], [ham[0]]),
        msgs.SpamStream(spam[0], [spam[0]]),
    )

    with open(stats_path, 'w') as outfile:
        keep_going = True
        while keep_going:
            # Initial messages already tried during this trial.
            chosen = set()
            current = au.select_initial()
            cluster = au.determine_cluster(current)
            chosen.add(current)
            au.driver.test(au.testing_ham, au.testing_spam)

            # Keep picking new starting messages until one yields a cluster.
            while not cluster:
                current = au.select_initial(chosen)
                cluster = au.determine_cluster(current)
                chosen.add(current)
                au.driver.test(au.testing_ham, au.testing_spam)

            cluster_list = list(cluster.cluster_set)
            dicts = au.driver.tester.train_examples[2]
            data = v_correlation(cluster_list, dicts)

            # NOTE(review): no newline is written, so successive trial
            # records run together in the file -- confirm this is intended.
            outfile.write("Trial " + str(trial_number) +
                          " Percentage Overlap (Correlation): " + str(data))

            prompt = ("Keep going (y/n)? You have performed " +
                      str(trial_number) + " trial(s) so far. ")
            answer = raw_input(prompt)
            # Re-read the answer on invalid input; previously the stale
            # answer was re-checked forever, printing the reminder endlessly.
            while answer not in ("y", "n"):
                print("Please enter either y or n.")
                answer = raw_input(prompt)

            if answer == "n":
                keep_going = False
            else:
                au.learn(cluster)
                au.init_ground()
                trial_number += 1
def main(args):
    """Run a cross-validation test over one ham mbox and one spam mbox.

    Options parsed from ``args`` with getopt:
        -f FMT   mailbox format, stored in the module-level ``FMT``
                 (default "unix")
        -n N     number of sets (default 10)
        -s SEED  seed passed to ``random.seed`` (default 101)
        -m MAX   cap applied to both message counts (default: no cap)

    Exactly two positional arguments must remain: the ham and the spam
    mailbox.  Message indices for each mailbox are split into sets with
    ``randindices``.  The driver is trained on every set except the first,
    then each set is untrained (except the first), tested, and trained back
    in (except the last).
    """
    global FMT

    print(options.display())

    FMT = "unix"
    nsets = 10
    seed = 101
    max_msgs = None

    opts, args = getopt.getopt(args, "f:n:s:m:")
    for flag, value in opts:
        if flag == '-f':
            FMT = value
        elif flag == '-n':
            nsets = int(value)
        elif flag == '-s':
            seed = int(value)
        elif flag == '-m':
            max_msgs = int(value)

    ham, spam = args

    random.seed(seed)

    nham = len(list(mbox(ham)))
    nspam = len(list(mbox(spam)))

    if max_msgs:
        nham = min(nham, max_msgs)
        nspam = min(nspam, max_msgs)

    print(" ".join(("ham", str(ham), str(nham))))
    print(" ".join(("spam", str(spam), str(nspam))))

    ihams = [tuple(indices) for indices in randindices(nham, nsets)]
    ispams = [tuple(indices) for indices in randindices(nspam, nsets)]

    driver = Driver()

    # Initial training covers every set except the first.
    for fold in range(1, nsets):
        driver.train(mbox(ham, ihams[fold]), mbox(spam, ispams[fold]))

    final_fold = nsets - 1
    for fold, (iham, ispam) in enumerate(zip(ihams, ispams)):
        hams = mbox(ham, iham)
        spams = mbox(spam, ispam)

        # The first set was never trained, so there is nothing to remove.
        if fold > 0:
            driver.untrain(hams, spams)

        driver.test(hams, spams)
        driver.finishtest()

        # Put the set back for the following rounds.
        if fold < final_fold:
            driver.train(hams, spams)

    driver.alldone()
def drive(nsets, decision):
    """Score every message, training only when ``decision`` says so.

    All files found in the spam and ham directories for sets 1..nsets are
    collected as (directory, filename, is_spam) keys of a dict and visited
    in that dict's key order.  Each message is scored with the database
    opened from 'weaktest.db'; unless ``decision.tooearly()`` is true, the
    score is recorded in a ``CostCounter.nodelay()`` counter.  The value of
    ``decision(score, is_spam)`` then selects training as spam, training as
    ham, or no training.  Progress is printed every 100 messages and a
    summary at the end.

    Args:
        nsets: Number of ham/spam directory pairs to read.
        decision: Callable taking (score, is_spam) that also provides a
            ``tooearly()`` method.
    """
    print(options.display())

    spamdirs = [
        get_pathname_option("TestDriver", "spam_directories") % set_no
        for set_no in range(1, nsets + 1)
    ]
    hamdirs = [
        get_pathname_option("TestDriver", "ham_directories") % set_no
        for set_no in range(1, nsets + 1)
    ]

    spamfns = [(folder, fname, 1)
               for folder in spamdirs for fname in os.listdir(folder)]
    hamfns = [(folder, fname, 0)
              for folder in hamdirs for fname in os.listdir(folder)]

    cc = CostCounter.nodelay()

    # The dict is only used for its keys; iterating it below determines the
    # order in which messages are processed.
    allfns = {}
    for entry in spamfns + hamfns:
        allfns[entry] = None

    d = hammie.open('weaktest.db', False)

    hamtrain = 0
    spamtrain = 0
    n = 0
    for folder, fname, is_spam in allfns.iterkeys():
        n += 1
        m = msgs.Msg(folder, fname).guts
        if debug > 1:
            print("trained:%dH+%dS" % (hamtrain, spamtrain))
        scr = d.score(m)
        if debug > 1:
            print("score:%.3f" % scr)

        if not decision.tooearly():
            if debug > 0:
                template = ("Spam with score %.2f" if is_spam
                            else "Ham with score %.2f")
                print(template % scr)
            record = cc.spam if is_spam else cc.ham
            record(scr)

        de = decision(scr, is_spam)
        if de == TRAIN_AS_SPAM:
            d.train_spam(m)
            spamtrain += 1
        elif de == TRAIN_AS_HAM:
            d.train_ham(m)
            hamtrain += 1

        if n % 100 == 0:
            print("%5d trained:%dH+%dS wrds:%d" % (
                n, hamtrain, spamtrain, len(d.bayes.wordinfo)))
            print(cc)

    print("=" * 70)
    print("%5d trained:%dH+%dS wrds:%d" % (
        n, hamtrain, spamtrain, len(d.bayes.wordinfo)))
    print(cc)
def drive(nsets):
    """Train a new classifier on the first ham/spam pair, test on the second.

    Directory names for sets 1..nsets are built from the "TestDriver"
    pathname options.  Only the first two pairs are used, so ``nsets`` must
    be at least 2.
    """
    print(options.display())

    spamdirs = [
        get_pathname_option("TestDriver", "spam_directories") % set_no
        for set_no in range(1, nsets + 1)
    ]
    hamdirs = [
        get_pathname_option("TestDriver", "ham_directories") % set_no
        for set_no in range(1, nsets + 1)
    ]

    driver = TestDriver.Driver()
    driver.new_classifier()

    # Training pair: set 1.
    train_ham, train_spam = hamdirs[0], spamdirs[0]
    driver.train(msgs.HamStream(train_ham, [train_ham]),
                 msgs.SpamStream(train_spam, [train_spam]))

    # Test pair: set 2.
    test_ham, test_spam = hamdirs[1], spamdirs[1]
    driver.test(msgs.HamStream(test_ham, [test_ham]),
                msgs.SpamStream(test_spam, [test_spam]))

    driver.finishtest()
    driver.alldone()
def drive(nsets):
    """Cross-validate over ``nsets`` ham/spam directory pairs.

    Directory names come from the "TestDriver" ham/spam pathname options,
    filled with the set numbers 1..nsets.  After an initial training on all
    pairs but the first, every pair is tested in turn.  When the
    ("CV Driver", "build_each_classifier_from_scratch") option is set, each
    later round trains a new classifier on the other pairs; otherwise the
    pair is untrained before its test and trained again afterwards.

    ``nsets`` must be at least 2 because the initial training indexes the
    second directory.
    """
    print(options.display())

    hamdirs = [get_pathname_option("TestDriver", "ham_directories") % num
               for num in range(1, nsets + 1)]
    spamdirs = [get_pathname_option("TestDriver", "spam_directories") % num
                for num in range(1, nsets + 1)]

    def all_but(dirs, skip):
        # Copy of ``dirs`` without the entry at index ``skip``.
        return dirs[:skip] + dirs[skip + 1:]

    d = TestDriver.Driver()

    # Train it on all sets except the first.
    d.train(msgs.HamStream("%s-%d" % (hamdirs[1], nsets),
                           hamdirs[1:], train=1),
            msgs.SpamStream("%s-%d" % (spamdirs[1], nsets),
                            spamdirs[1:], train=1))

    # Now run nsets times, predicting pair i against all except pair i.
    for i in range(nsets):
        ham_dir, spam_dir = hamdirs[i], spamdirs[i]
        hamstream = msgs.HamStream(ham_dir, [ham_dir], train=0)
        spamstream = msgs.SpamStream(spam_dir, [spam_dir], train=0)

        if i > 0 and options["CV Driver",
                             "build_each_classifier_from_scratch"]:
            # Build a new classifier from the other sets.
            d.new_classifier()
            hname = "%s-%d, except %d" % (hamdirs[0], nsets, i + 1)
            sname = "%s-%d, except %d" % (spamdirs[0], nsets, i + 1)
            d.train(msgs.HamStream(hname, all_but(hamdirs, i), train=1),
                    msgs.SpamStream(sname, all_but(spamdirs, i), train=1))
        elif i > 0:
            # Forget this set.
            d.untrain(hamstream, spamstream)

        # Predict this set.
        d.test(hamstream, spamstream)
        d.finishtest()

        if i < nsets - 1 and not options[
                "CV Driver", "build_each_classifier_from_scratch"]:
            # Add this set back in.
            d.train(hamstream, spamstream)

    d.alldone()
def drive(nsets, decision):
    """Score each message and train selectively according to ``decision``.

    Files from the spam and ham directories of sets 1..nsets become
    (directory, filename, is_spam) keys of a dict, and messages are handled
    in the order the dict yields its keys.  Every message is scored against
    the database opened from "weaktest.db".  While ``decision.tooearly()``
    is false the score is fed to a ``CostCounter.nodelay()`` counter.
    ``decision(score, is_spam)`` then chooses between training as spam,
    training as ham, or skipping.  A progress line and the counter are
    printed every 100 messages and once more at the end.

    Args:
        nsets: Number of ham/spam directory pairs to read.
        decision: Callable taking (score, is_spam) that also provides a
            ``tooearly()`` method.
    """
    print(options.display())

    spamdirs = [get_pathname_option("TestDriver", "spam_directories") % idx
                for idx in range(1, nsets + 1)]
    hamdirs = [get_pathname_option("TestDriver", "ham_directories") % idx
               for idx in range(1, nsets + 1)]

    spamfns = [(path, entry, 1)
               for path in spamdirs for entry in os.listdir(path)]
    hamfns = [(path, entry, 0)
              for path in hamdirs for entry in os.listdir(path)]

    cc = CostCounter.nodelay()

    # Only the keys matter; the dict's own key order drives the loop below.
    allfns = {}
    for key in spamfns + hamfns:
        allfns[key] = None

    d = hammie.open("weaktest.db", False)

    trained = {"ham": 0, "spam": 0}
    seen = 0

    def report():
        # Progress line followed by the current cost counter.
        print("%5d trained:%dH+%dS wrds:%d" % (
            seen, trained["ham"], trained["spam"], len(d.bayes.wordinfo)))
        print(cc)

    for path, entry, is_spam in allfns.iterkeys():
        seen += 1
        m = msgs.Msg(path, entry).guts
        if debug > 1:
            print("trained:%dH+%dS" % (trained["ham"], trained["spam"]))
        scr = d.score(m)
        if debug > 1:
            print("score:%.3f" % scr)

        if not decision.tooearly():
            if is_spam:
                if debug > 0:
                    print("Spam with score %.2f" % scr)
                cc.spam(scr)
            else:
                if debug > 0:
                    print("Ham with score %.2f" % scr)
                cc.ham(scr)

        verdict = decision(scr, is_spam)
        if verdict == TRAIN_AS_SPAM:
            d.train_spam(m)
            trained["spam"] += 1
        elif verdict == TRAIN_AS_HAM:
            d.train_ham(m)
            trained["ham"] += 1

        if seen % 100 == 0:
            report()

    print("=" * 70)
    report()
def drive(nsets):
    """Drive a cross-validation run over ``nsets`` ham/spam directory pairs.

    The "TestDriver" ham/spam pathname options are filled with the set
    numbers 1..nsets to obtain the directories.  The driver is trained on
    all pairs except the first, then each pair is predicted by a classifier
    that excludes it.  With the
    ("CV Driver", "build_each_classifier_from_scratch") option set, a new
    classifier is trained on the other pairs each round; without it the pair
    is untrained, tested, and trained back in.

    ``nsets`` must be at least 2 because the initial training indexes the
    second directory.
    """
    # options[a, b] is the same lookup as options[(a, b)].
    scratch_key = ("CV Driver", "build_each_classifier_from_scratch")

    print(options.display())

    hamdirs = []
    for number in range(1, nsets + 1):
        hamdirs.append(
            get_pathname_option("TestDriver", "ham_directories") % number)
    spamdirs = []
    for number in range(1, nsets + 1):
        spamdirs.append(
            get_pathname_option("TestDriver", "spam_directories") % number)

    d = TestDriver.Driver()

    # Train it on all sets except the first.
    d.train(
        msgs.HamStream("%s-%d" % (hamdirs[1], nsets), hamdirs[1:], train=1),
        msgs.SpamStream("%s-%d" % (spamdirs[1], nsets), spamdirs[1:],
                        train=1))

    # Now run nsets times, predicting pair i against all except pair i.
    for i in range(nsets):
        this_ham = hamdirs[i]
        this_spam = spamdirs[i]
        hamstream = msgs.HamStream(this_ham, [this_ham], train=0)
        spamstream = msgs.SpamStream(this_spam, [this_spam], train=0)

        if i > 0:
            if not options[scratch_key]:
                # Forget this set.
                d.untrain(hamstream, spamstream)
            else:
                # Build a new classifier from the other sets.
                d.new_classifier()

                hname = "%s-%d, except %d" % (hamdirs[0], nsets, i + 1)
                rest_ham = list(hamdirs)
                rest_ham.pop(i)

                sname = "%s-%d, except %d" % (spamdirs[0], nsets, i + 1)
                rest_spam = list(spamdirs)
                rest_spam.pop(i)

                d.train(msgs.HamStream(hname, rest_ham, train=1),
                        msgs.SpamStream(sname, rest_spam, train=1))

        # Predict this set.
        d.test(hamstream, spamstream)
        d.finishtest()

        # Add this set back in, unless this was the last round or each
        # classifier is rebuilt from scratch anyway.
        if not (i >= nsets - 1 or options[scratch_key]):
            d.train(hamstream, spamstream)

    d.alldone()
def drive(num):
    """Print the correlation between set-3 messages and test misjudgements.

    Steps:
      1. Build directory names for sets 1..3 from the "TestDriver" options.
      2. Run ``MislabeledFileMover(num).random_move_file()`` (presumably
         relocates ``num`` files to create mislabeled examples -- confirm
         against that module).
      3. Train a new classifier on sets 1 and 3 and test it on set 2.
      4. Collect every message of set 3 (ham, then spam) with its
         ``spamprob`` stored in ``msg.prob``.
      5. Collect the test's false positives, false negatives and unsures.
      6. Print ``v_correlation`` of the two collections.
    """
    print(options.display())

    spamdirs = [
        get_pathname_option("TestDriver", "spam_directories") % set_no
        for set_no in range(1, 4)
    ]
    hamdirs = [
        get_pathname_option("TestDriver", "ham_directories") % set_no
        for set_no in range(1, 4)
    ]

    mover = mislabeledfilemover.MislabeledFileMover(num)
    mover.random_move_file()

    d = TestDriver.Driver()
    d.new_classifier()

    # Train on sets 1 and 3; set 2 is held back for testing.
    for pos in (0, 2):
        d.train(msgs.HamStream(hamdirs[pos], [hamdirs[pos]]),
                msgs.SpamStream(spamdirs[pos], [spamdirs[pos]]))
    d.test(msgs.HamStream(hamdirs[1], [hamdirs[1]]),
           msgs.SpamStream(spamdirs[1], [spamdirs[1]]))

    guess = d.classifier.spamprob

    # Score every message of set 3, ham first and then spam.  Each stream is
    # only constructed when its turn comes.
    polluted = []
    set3_sources = ((msgs.HamStream, hamdirs[2]),
                    (msgs.SpamStream, spamdirs[2]))
    for make_stream, folder in set3_sources:
        for msg in make_stream(folder, [folder]):
            msg.prob = guess(msg)
            polluted.append(msg)

    mislabeled = []
    mislabeled.extend(d.tester.false_positives())
    mislabeled.extend(d.tester.false_negatives())
    mislabeled.extend(d.unsure)

    d.finishtest()
    d.alldone()

    data = v_correlation(polluted, mislabeled)
    print("Percentage Overlap (Correlation): " + str(data))
def drive(num):
    """Report how set-3 messages overlap with the misjudged test messages.

    Directory names for sets 1..3 come from the "TestDriver" options.
    ``MislabeledFileMover(num).random_move_file()`` is run first (presumably
    to plant ``num`` mislabeled files -- confirm against that module).  A
    new classifier is trained on sets 1 and 3 and tested on set 2.  Every
    message in set 3 gets its ``spamprob`` stored in ``msg.prob``; those
    messages and the test's false positives, false negatives and unsures are
    passed to ``v_correlation`` and the result is printed.
    """
    print(options.display())

    spamdirs = [get_pathname_option("TestDriver", "spam_directories") % idx
                for idx in range(1, 4)]
    hamdirs = [get_pathname_option("TestDriver", "ham_directories") % idx
               for idx in range(1, 4)]

    relabeler = mislabeledfilemover.MislabeledFileMover(num)
    relabeler.random_move_file()

    d = TestDriver.Driver()
    d.new_classifier()

    first_ham, second_ham, third_ham = hamdirs
    first_spam, second_spam, third_spam = spamdirs

    # Sets 1 and 3 are used for training, set 2 for testing.
    d.train(msgs.HamStream(first_ham, [first_ham]),
            msgs.SpamStream(first_spam, [first_spam]))
    d.train(msgs.HamStream(third_ham, [third_ham]),
            msgs.SpamStream(third_spam, [third_spam]))
    d.test(msgs.HamStream(second_ham, [second_ham]),
           msgs.SpamStream(second_spam, [second_spam]))

    guess = d.classifier.spamprob

    def scored(stream):
        # Store each message's spam probability on it and collect them.
        collected = []
        for msg in stream:
            msg.prob = guess(msg)
            collected.append(msg)
        return collected

    polluted = scored(msgs.HamStream(third_ham, [third_ham]))
    polluted += scored(msgs.SpamStream(third_spam, [third_spam]))

    mislabeled = list(d.tester.false_positives())
    mislabeled += list(d.tester.false_negatives())
    mislabeled += list(d.unsure)

    d.finishtest()
    d.alldone()

    data = v_correlation(polluted, mislabeled)
    print("Percentage Overlap (Correlation): " + str(data))
def drive(stats_path=r"C:\Users\Alex\Desktop\dict_correlation_stats.txt"):
    """Run interactive cluster-correlation trials, logging each to a file.

    Ham/spam directory names for sets 1..4 are built from the "TestDriver"
    pathname options, ``DictionaryWriter(150, 4).write()`` is run, and an
    ``ActiveUnlearner`` is created from those sets.  In each trial an
    initial message is selected and its cluster determined; selection is
    repeated with the already-tried messages passed in until the cluster is
    truthy.  ``v_correlation`` of the cluster members and
    ``au.driver.tester.train_examples[2]`` is written to the stats file,
    then the user is asked whether to run another trial.

    Args:
        stats_path: File the per-trial results are written to.  It is opened
            in 'w' mode, so an existing file is truncated.  The default is
            the location that was previously hard-coded.
    """
    print(options.display())

    spam = [
        get_pathname_option("TestDriver", "spam_directories") % i
        for i in range(1, 5)
    ]
    ham = [
        get_pathname_option("TestDriver", "ham_directories") % i
        for i in range(1, 5)
    ]

    d = dictionarywriter.DictionaryWriter(150, 4)
    d.write()

    trial_number = 1

    # NOTE(review): ham uses sets 2 and 3 while spam uses sets 2 and 4 for
    # the first two arguments -- confirm the asymmetry is intentional.
    au = ActiveUnlearnDriver.ActiveUnlearner(
        [msgs.HamStream(ham[1], [ham[1]]),
         msgs.HamStream(ham[2], [ham[2]])],
        [msgs.SpamStream(spam[1], [spam[1]]),
         msgs.SpamStream(spam[3], [spam[3]])],
        msgs.HamStream(ham[0], [ham[0]]),
        msgs.SpamStream(spam[0], [spam[0]]),
    )

    with open(stats_path, 'w') as outfile:
        while True:
            # Initial messages already tried during this trial.
            chosen = set()
            current = au.select_initial()
            cluster = au.determine_cluster(current)
            chosen.add(current)
            au.driver.test(au.testing_ham, au.testing_spam)

            # Keep picking new starting messages until one yields a cluster.
            while not cluster:
                current = au.select_initial(chosen)
                cluster = au.determine_cluster(current)
                chosen.add(current)
                au.driver.test(au.testing_ham, au.testing_spam)

            cluster_list = list(cluster.cluster_set)
            dicts = au.driver.tester.train_examples[2]
            data = v_correlation(cluster_list, dicts)

            # NOTE(review): no newline is written, so successive trial
            # records run together in the file -- confirm this is intended.
            outfile.write("Trial " + str(trial_number) +
                          " Percentage Overlap (Correlation): " + str(data))

            prompt = ("Keep going (y/n)? You have performed " +
                      str(trial_number) + " trial(s) so far. ")
            answer = raw_input(prompt)
            # Ask again on anything but "y"/"n"; the old loop re-checked the
            # same stale answer and therefore never terminated.
            while answer not in ("y", "n"):
                print("Please enter either y or n.")
                answer = raw_input(prompt)

            if answer == "n":
                break

            au.learn(cluster)
            au.init_ground()
            trial_number += 1