def drive(nsets):
    print options.display()
    spamdirs = [get_pathname_option("TestDriver", "spam_directories") % \
                i for i in range(1, nsets+1)]
    hamdirs = [get_pathname_option("TestDriver", "ham_directories") % \
               i for i in range(1, nsets+1)]
    spamhamdirs = zip(spamdirs, hamdirs)
    d = TestDriver.Driver()
    for spamdir, hamdir in spamhamdirs:
        d.new_classifier()
        d.train(msgs.HamStream(hamdir, [hamdir]),
                msgs.SpamStream(spamdir, [spamdir]))
        for sd2, hd2 in spamhamdirs:
            if (sd2, hd2) == (spamdir, hamdir):
                continue
            d.test(msgs.HamStream(hd2, [hd2]),
                   msgs.SpamStream(sd2, [sd2]))
        d.finishtest()
    d.alldone()
def unlearn_compare(nsets, unsets):
    print options.display()
    spamdirs = [get_pathname_option("TestDriver", "spam_directories") % i
                for i in range(1, nsets+1)]
    hamdirs = [get_pathname_option("TestDriver", "ham_directories") % i
               for i in range(1, nsets+1)]
    spamhamdirs = zip(spamdirs, hamdirs)
    unspamdirs = [get_pathname_option("TestDriver", "spam_directories") % i
                  for i in range(1, unsets+1)]
    unhamdirs = [get_pathname_option("TestDriver", "ham_directories") % i
                 for i in range(1, unsets+1)]
    unspamhamdirs = zip(unspamdirs, unhamdirs)
    d = TestDriver.Driver()
    d.new_classifier()
    # for spamdir, hamdir in spamhamdirs:
    #     d.train(msgs.HamStream(hamdir, [hamdir]),
    #             msgs.SpamStream(spamdir, [spamdir]))
    d.train(msgs.HamStream(hamdirs[0], [hamdirs[0]]),
            msgs.SpamStream(spamdirs[0], [spamdirs[0]]))
    d.train(msgs.HamStream(hamdirs[1], [hamdirs[1]]),
            msgs.SpamStream(spamdirs[1], [spamdirs[1]]))
    d.test(msgs.HamStream(hamdirs[2], [hamdirs[2]]),
           msgs.SpamStream(spamdirs[2], [spamdirs[2]]))
    d.finishtest()
    d.alldone()
    unlearn_driver(d, spamhamdirs, unspamhamdirs)
def createWorkers(self):
    """Using the options that were initialised in __init__ and then
    possibly overridden by the driver code, create the Bayes object,
    the Corpuses, the Trainers and so on."""
    print "Loading database...",
    if self.isTest:
        self.useDB = "pickle"
        self.DBName = '_pop3proxy_test.pickle'   # This is never saved.
    if not hasattr(self, "DBName"):
        self.DBName, self.useDB = storage.database_type([])
    self.bayes = storage.open_storage(self.DBName, self.useDB)
    self.buildStatusStrings()

    # Don't set up the caches and training objects when running the
    # self-test, so as not to clutter the filesystem.
    if not self.isTest:
        def ensureDir(dirname):
            try:
                os.mkdir(dirname)
            except OSError, e:
                if e.errno != errno.EEXIST:
                    raise

        # Create/open the Corpuses.  Use small cache sizes to avoid
        # hogging lots of memory.
        sc = get_pathname_option("Storage", "spam_cache")
        hc = get_pathname_option("Storage", "ham_cache")
        uc = get_pathname_option("Storage", "unknown_cache")
        map(ensureDir, [sc, hc, uc])
        if self.gzipCache:
            factory = GzipFileMessageFactory()
        else:
            factory = FileMessageFactory()
        age = options["Storage", "cache_expiry_days"]*24*60*60
        self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                           '[0123456789\-]*', cacheSize=20)
        self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                          '[0123456789\-]*', cacheSize=20)
        self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                              '[0123456789\-]*', cacheSize=20)

        # Given that (hopefully) users will get to the stage where they
        # do not need to do any more regular training to be satisfied
        # with spambayes' performance, we expire old messages from not
        # only the trained corpora, but the unknown as well.
        self.spamCorpus.removeExpiredMessages()
        self.hamCorpus.removeExpiredMessages()
        self.unknownCorpus.removeExpiredMessages()

        # Create the Trainers.
        self.spamTrainer = storage.SpamTrainer(self.bayes)
        self.hamTrainer = storage.HamTrainer(self.bayes)
        self.spamCorpus.addObserver(self.spamTrainer)
        self.hamCorpus.addObserver(self.hamTrainer)
def drive():
    print options.display()
    spam = [get_pathname_option("TestDriver", "spam_directories") % i
            for i in range(1, 5)]
    ham = [get_pathname_option("TestDriver", "ham_directories") % i
           for i in range(1, 5)]
    d = dictionarywriter.DictionaryWriter(150, 4)
    d.write()
    keep_going = True
    trial_number = 1
    au = ActiveUnlearnDriver.ActiveUnlearner([msgs.HamStream(ham[1], [ham[1]]),
                                              msgs.HamStream(ham[2], [ham[2]])],
                                             [msgs.SpamStream(spam[1], [spam[1]]),
                                              msgs.SpamStream(spam[3], [spam[3]])],
                                             msgs.HamStream(ham[0], [ham[0]]),
                                             msgs.SpamStream(spam[0], [spam[0]]),
                                             )

    with open("C:\Users\Alex\Desktop\dict_correlation_stats.txt", 'w') as outfile:
        while keep_going:
            chosen = set()
            current = au.select_initial()
            cluster = au.determine_cluster(current)
            chosen.add(current)
            au.driver.test(au.testing_ham, au.testing_spam)

            while not cluster:
                current = au.select_initial(chosen)
                cluster = au.determine_cluster(current)
                chosen.add(current)
                au.driver.test(au.testing_ham, au.testing_spam)

            cluster_list = list(cluster.cluster_set)
            dicts = au.driver.tester.train_examples[2]
            data = v_correlation(cluster_list, dicts)
            outfile.write("Trial " + str(trial_number) +
                          " Percentage Overlap (Correlation): " + str(data))

            answer = raw_input("Keep going (y/n)? You have performed " +
                               str(trial_number) + " trial(s) so far. ")
            valid_input = False
            while not valid_input:
                if answer == "n":
                    keep_going = False
                    valid_input = True
                elif answer == "y":
                    au.learn(cluster)
                    au.init_ground()
                    trial_number += 1
                    valid_input = True
                else:
                    print "Please enter either y or n."
def __init__(self, spam_feature=None, ham_feature=None, inject_type=0):
    self.h_injected = get_pathname_option("TestDriver", "ham_directories") % 3 + "/"
    self.s_injected = get_pathname_option("TestDriver", "spam_directories") % 3 + "/"
    # Use equality, not identity, when testing the integer flag.
    if inject_type == 0:
        self.feature = spam_feature
    elif inject_type == 1:
        self.feature = ham_feature
def main():
    ham = [get_pathname_option("TestDriver", "ham_directories") % i
           for i in range(1, 5)]
    spam = [get_pathname_option("TestDriver", "spam_directories") % i
            for i in range(1, 5)]

    t = TestDriver.Driver()
    t.train(msgs.HamStream(ham[0], [ham[0]]), msgs.SpamStream(spam[0], [spam[0]]))
    t.dict_test(msgs.HamStream(ham[2], [ham[2]]), msgs.SpamStream(spam[3], [spam[3]]))

    print "Test sizes: ", len(t.tester.truth_examples[0]), ", ", \
        len(t.tester.truth_examples[1]), "\n"
    print "Detection rate:", t.tester.correct_classification_rate(), "\n"
def main():
    ham = [get_pathname_option("TestDriver", "ham_directories") % i
           for i in range(1, 4)]
    spam = [get_pathname_option("TestDriver", "spam_directories") % i
            for i in range(1, 4)]

    sizes = [0, 60, 120, 240, 480, 840, 1200, 2400, 3600, 4800, 6000]

    d = TestDriver.Driver()
    d.new_classifier()

    detection_rates = []
    target_rates = []
    false_positives = []
    false_negatives = []
    unsures = []

    for size in sizes:
        mislabeler = MislabeledFileMover(size)
        mislabeler.random_move_file()

        d.train(msgs.HamStream(ham[0], [ham[0]]), msgs.SpamStream(spam[0], [spam[0]]))
        d.test(msgs.HamStream(ham[1], [ham[1]]), msgs.SpamStream(spam[1], [spam[1]]))
        target_rate = d.tester.correct_classification_rate()
        target_rates.append(target_rate)

        d.train(msgs.HamStream(ham[2], [ham[2]]), msgs.SpamStream(spam[2], [spam[2]]))
        d.test(msgs.HamStream(ham[1], [ham[1]]), msgs.SpamStream(spam[1], [spam[1]]))
        detection_rate = d.tester.correct_classification_rate()
        detection_rates.append(detection_rate)

        fp = d.tester.nham_wrong
        false_positives.append(fp)
        fn = d.tester.nspam_wrong
        false_negatives.append(fn)
        unsure = d.tester.nham_unsure + d.tester.nspam_unsure
        unsures.append(unsure)

        d.untrain(msgs.HamStream(ham[0], [ham[0]]), msgs.SpamStream(spam[0], [spam[0]]))
        d.untrain(msgs.HamStream(ham[2], [ham[2]]), msgs.SpamStream(spam[2], [spam[2]]))
        mislabeler.reset()

    with open("/Users/AlexYang/Desktop/hamasspam.txt", 'w') as outfile:
        outfile.write(tabulate({"# of Mislabeled Words": sizes,
                                "Detection Rates": detection_rates,
                                "Target Rates": target_rates},
                               headers="keys", tablefmt="plain"))
def create_workers(self):
    """Using the options that were initialised in __init__ and then
    possibly overridden by the driver code, create the Bayes object,
    the Corpuses, the Trainers and so on."""
    if self.is_test:
        self.use_db = "pickle"
        self.db_name = '_core_server.pickle'   # This is never saved.
    if not hasattr(self, "db_name"):
        self.db_name, self.use_db = storage.database_type([])
    self.bayes = storage.open_storage(self.db_name, self.use_db)

    # Load stats manager.
    self.stats = Stats.Stats(options,
                             spambayes.message.Message().message_info_db)
    self.build_status_strings()

    # Don't set up the caches and training objects when running the
    # self-test, so as not to clutter the filesystem.
    if not self.is_test:
        # Create/open the Corpuses.  Use small cache sizes to avoid
        # hogging lots of memory.
        sc = get_pathname_option("Storage", "core_spam_cache")
        hc = get_pathname_option("Storage", "core_ham_cache")
        uc = get_pathname_option("Storage", "core_unknown_cache")
        for d in [sc, hc, uc]:
            storage.ensureDir(d)
        if self.gzip_cache:
            factory = GzipFileMessageFactory()
        else:
            factory = FileMessageFactory()
        age = options["Storage", "cache_expiry_days"]*24*60*60
        self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                           '[0123456789\-]*', cacheSize=20)
        self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                          '[0123456789\-]*', cacheSize=20)
        self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                              '[0123456789\-]*', cacheSize=20)

        # Given that (hopefully) users will get to the stage where they
        # do not need to do any more regular training to be satisfied
        # with spambayes' performance, we expire old messages from not
        # only the trained corpora, but the unknown as well.
        self.spamCorpus.removeExpiredMessages()
        self.hamCorpus.removeExpiredMessages()
        self.unknownCorpus.removeExpiredMessages()

        # Create the Trainers.
        self.spam_trainer = storage.SpamTrainer(self.bayes)
        self.ham_trainer = storage.HamTrainer(self.bayes)
        self.spamCorpus.addObserver(self.spam_trainer)
        self.hamCorpus.addObserver(self.ham_trainer)
def drive(nsets, decision):
    print options.display()
    spamdirs = [get_pathname_option("TestDriver", "spam_directories") % \
                i for i in range(1, nsets+1)]
    hamdirs = [get_pathname_option("TestDriver", "ham_directories") % \
               i for i in range(1, nsets+1)]
    spamfns = [(x, y, 1) for x in spamdirs for y in os.listdir(x)]
    hamfns = [(x, y, 0) for x in hamdirs for y in os.listdir(x)]
    nham = len(hamfns)
    nspam = len(spamfns)

    cc = CostCounter.nodelay()

    allfns = {}
    for fn in spamfns + hamfns:
        allfns[fn] = None

    d = hammie.open('weaktest.db', False)

    hamtrain = 0
    spamtrain = 0
    n = 0
    for dir, name, is_spam in allfns.iterkeys():
        n += 1
        m = msgs.Msg(dir, name).guts
        if debug > 1:
            print "trained:%dH+%dS" % (hamtrain, spamtrain)
        scr = d.score(m)
        if debug > 1:
            print "score:%.3f" % scr
        if not decision.tooearly():
            if is_spam:
                if debug > 0:
                    print "Spam with score %.2f" % scr
                cc.spam(scr)
            else:
                if debug > 0:
                    print "Ham with score %.2f" % scr
                cc.ham(scr)
        de = decision(scr, is_spam)
        if de == TRAIN_AS_SPAM:
            d.train_spam(m)
            spamtrain += 1
        elif de == TRAIN_AS_HAM:
            d.train_ham(m)
            hamtrain += 1
        if n % 100 == 0:
            print "%5d trained:%dH+%dS wrds:%d" % (
                n, hamtrain, spamtrain, len(d.bayes.wordinfo))
            print cc
    print "=" * 70
    print "%5d trained:%dH+%dS wrds:%d" % (
        n, hamtrain, spamtrain, len(d.bayes.wordinfo))
    print cc
def createWorkers(self):
    """Using the options that were initialised in __init__ and then
    possibly overridden by the driver code, create the Bayes object,
    the Corpuses, the Trainers and so on."""
    print "Loading database...",
    if self.isTest:
        self.useDB = "pickle"
        self.DBName = '_pop3proxy_test.pickle'   # This is never saved.
    if not hasattr(self, "DBName"):
        self.DBName, self.useDB = storage.database_type([])
    self.bayes = storage.open_storage(self.DBName, self.useDB)
    self.mdb = spambayes.message.Message().message_info_db

    # Load stats manager.
    self.stats = Stats.Stats(options, self.mdb)

    self.buildStatusStrings()

    # Don't set up the caches and training objects when running the
    # self-test, so as not to clutter the filesystem.
    if not self.isTest:
        # Create/open the Corpuses.  Use small cache sizes to avoid
        # hogging lots of memory.
        sc = get_pathname_option("Storage", "spam_cache")
        hc = get_pathname_option("Storage", "ham_cache")
        uc = get_pathname_option("Storage", "unknown_cache")
        map(storage.ensureDir, [sc, hc, uc])
        if self.gzipCache:
            factory = GzipFileMessageFactory()
        else:
            factory = FileMessageFactory()
        age = options["Storage", "cache_expiry_days"]*24*60*60
        self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                           '[0123456789\-]*', cacheSize=20)
        self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                          '[0123456789\-]*', cacheSize=20)
        self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                              '[0123456789\-]*', cacheSize=20)

        # Given that (hopefully) users will get to the stage where they
        # do not need to do any more regular training to be satisfied
        # with spambayes' performance, we expire old messages from not
        # only the trained corpora, but the unknown as well.
        self.spamCorpus.removeExpiredMessages()
        self.hamCorpus.removeExpiredMessages()
        self.unknownCorpus.removeExpiredMessages()

        # Create the Trainers.
        self.spamTrainer = storage.SpamTrainer(self.bayes)
        self.hamTrainer = storage.HamTrainer(self.bayes)
        self.spamCorpus.addObserver(self.spamTrainer)
        self.hamCorpus.addObserver(self.hamTrainer)
def drive(nsets):
    print options.display()
    spamdirs = [get_pathname_option("TestDriver", "spam_directories") % i
                for i in range(1, nsets+1)]
    hamdirs = [get_pathname_option("TestDriver", "ham_directories") % i
               for i in range(1, nsets+1)]
    d = TestDriver.Driver()
    d.new_classifier()
    d.train(msgs.HamStream(hamdirs[0], [hamdirs[0]]),
            msgs.SpamStream(spamdirs[0], [spamdirs[0]]))
    d.test(msgs.HamStream(hamdirs[1], [hamdirs[1]]),
           msgs.SpamStream(spamdirs[1], [spamdirs[1]]))
    d.finishtest()
    d.alldone()
def drive(nsets):
    print options.display()
    hamdirs = [get_pathname_option("TestDriver", "ham_directories") % \
               i for i in range(1, nsets+1)]
    spamdirs = [get_pathname_option("TestDriver", "spam_directories") % \
                i for i in range(1, nsets+1)]
    d = TestDriver.Driver()

    # Train it on all sets except the first.
    d.train(msgs.HamStream("%s-%d" % (hamdirs[1], nsets),
                           hamdirs[1:], train=1),
            msgs.SpamStream("%s-%d" % (spamdirs[1], nsets),
                            spamdirs[1:], train=1))

    # Now run nsets times, predicting pair i against all except pair i.
    for i in range(nsets):
        h = hamdirs[i]
        s = spamdirs[i]
        hamstream = msgs.HamStream(h, [h], train=0)
        spamstream = msgs.SpamStream(s, [s], train=0)

        if i > 0:
            if options["CV Driver", "build_each_classifier_from_scratch"]:
                # Build a new classifier from the other sets.
                d.new_classifier()
                hname = "%s-%d, except %d" % (hamdirs[0], nsets, i+1)
                h2 = hamdirs[:]
                del h2[i]
                sname = "%s-%d, except %d" % (spamdirs[0], nsets, i+1)
                s2 = spamdirs[:]
                del s2[i]
                d.train(msgs.HamStream(hname, h2, train=1),
                        msgs.SpamStream(sname, s2, train=1))
            else:
                # Forget this set.
                d.untrain(hamstream, spamstream)

        # Predict this set.
        d.test(hamstream, spamstream)
        d.finishtest()

        if i < nsets - 1 and not options["CV Driver",
                                         "build_each_classifier_from_scratch"]:
            # Add this set back in.
            d.train(hamstream, spamstream)

    d.alldone()
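# A minimal sketch of how this cross-validation driver might be invoked;
# it assumes the module-level imports used above, and the command-line
# handling is illustrative rather than the project's actual CLI:
if __name__ == "__main__":
    import sys
    # e.g. run a 10-fold cross-validation over the Ham/Set1..SetN and
    # Spam/Set1..SetN directories named by the TestDriver options.
    nsets = int(sys.argv[1]) if len(sys.argv) > 1 else 10
    drive(nsets)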
def drive(num):
    print options.display()
    spamdirs = [get_pathname_option("TestDriver", "spam_directories") % i
                for i in range(1, 4)]
    hamdirs = [get_pathname_option("TestDriver", "ham_directories") % i
               for i in range(1, 4)]
    r = mislabeledfilemover.MislabeledFileMover(num)
    r.random_move_file()
    d = TestDriver.Driver()
    d.new_classifier()
    d.train(msgs.HamStream(hamdirs[0], [hamdirs[0]]),
            msgs.SpamStream(spamdirs[0], [spamdirs[0]]))
    d.train(msgs.HamStream(hamdirs[2], [hamdirs[2]]),
            msgs.SpamStream(spamdirs[2], [spamdirs[2]]))
    d.test(msgs.HamStream(hamdirs[1], [hamdirs[1]]),
           msgs.SpamStream(spamdirs[1], [spamdirs[1]]))

    guess = d.classifier.spamprob
    polluted = []
    for msg in msgs.HamStream(hamdirs[2], [hamdirs[2]]):
        msg.prob = guess(msg)
        polluted.append(msg)
    for msg in msgs.SpamStream(spamdirs[2], [spamdirs[2]]):
        msg.prob = guess(msg)
        polluted.append(msg)

    mislabeled = []
    for fp in d.tester.false_positives():
        mislabeled.append(fp)
    for fn in d.tester.false_negatives():
        mislabeled.append(fn)
    for unsure in d.unsure:
        mislabeled.append(unsure)

    d.finishtest()
    d.alldone()
    data = v_correlation(polluted, mislabeled)
    print "Percentage Overlap (Correlation): " + str(data)
def main():
    ham = [get_pathname_option("TestDriver", "ham_directories") % i
           for i in range(1, 4)]
    spam = [get_pathname_option("TestDriver", "spam_directories") % i
            for i in range(1, 4)]
    injected = get_pathname_option("TestDriver", "spam_directories") % 3

    au = ActiveUnlearnDriver.ActiveUnlearner([msgs.HamStream(ham[0], [ham[0]]),
                                              msgs.HamStream(ham[2], [ham[2]])],
                                             [msgs.SpamStream(spam[0], [spam[0]]),
                                              msgs.SpamStream(spam[2], [spam[2]])],
                                             msgs.HamStream(ham[1], [ham[1]]),
                                             msgs.SpamStream(spam[1], [spam[1]]))

    msg = choice(au.driver.tester.train_examples[2])    # Randomly chosen from Ham Set3
    original_rate = au.driver.tester.correct_classification_rate()

    cluster_sizes = []
    detection_rates = []
    target_cluster_rates = []
    sizes = []
    for i in range(150, 1050, 50):
        sizes.append(i)
    for i in range(1000, 15000, 1000):
        sizes.append(i)

    for size in sizes:
        cluster = ActiveUnlearnDriver.Cluster(msg, size, au, "extreme")
        print "Clustering with size " + str(cluster.size) + "..."
        cluster_sizes.append(size)
        detection_rates.append(au.detect_rate(cluster))
        target_cluster_rates.append(float(cluster.target_set3()) / float(cluster.size))

    file = open("/Users/AlexYang/Desktop/clues.txt", 'w')
    features = au.driver.classifier._getclues(msg)
    i = 1
    for feature in features:
        file.write(str(i) + ") ")
        file.write(str(feature) + "\n")
        i += 1

    with open("/Users/AlexYang/Desktop/clusterstats.txt", 'w') as outfile:
        outfile.write("Clustered around: " + msg.tag)
        outfile.write("\nOriginal Rate: " + str(original_rate) + "\n")
        outfile.write(tabulate({"Cluster Sizes": cluster_sizes,
                                "Detection Rates": detection_rates,
                                "% of Targets Clustered": target_cluster_rates},
                               headers="keys", tablefmt="plain"))
def database_type(opts):
    """Return the name of the database and the type to use.

    The output of this function can be used as the db_type parameter
    for the open_storage function, for example:

        [standard getopts code]
        db_name, db_type = database_type(opts)
        storage = open_storage(db_name, db_type)

    The selection is made based on the options passed, or, if the
    appropriate options are not present, the options in the global
    options object.

    Currently supports:
       -p   : pickle
       -d   : dbm
    """
    nm, typ = None, None
    for opt, arg in opts:
        if _storage_options.has_key(opt):
            if nm is None and typ is None:
                nm, typ = arg, _storage_options[opt]
            else:
                raise MutuallyExclusiveError()
    if nm is None and typ is None:
        typ = options["Storage", "persistent_use_database"]
        if typ is True or typ == "True":
            typ = "dbm"
        elif typ is False or typ == "False":
            typ = "pickle"
        nm = get_pathname_option("Storage", "persistent_storage_file")
    return nm, typ
def database_type(opts, default_type=("Storage", "persistent_use_database"),
                  default_name=("Storage", "persistent_storage_file")):
    """Return the name of the database and the type to use.

    The output of this function can be used as the db_type parameter
    for the open_storage function, for example:

        [standard getopts code]
        db_name, db_type = database_type(opts)
        storage = open_storage(db_name, db_type)

    The selection is made based on the options passed, or, if the
    appropriate options are not present, the options in the global
    options object.

    Currently supports:
       -p   : pickle
       -d   : dbm
    """
    nm, typ = None, None
    for opt, arg in opts:
        if opt in _storage_options:
            if nm is None and typ is None:
                nm, typ = arg, _storage_options[opt]
            else:
                raise MutuallyExclusiveError()
    if nm is None and typ is None:
        typ = options[default_type]
        try:
            unused, unused, is_path = _storage_types[typ]
        except KeyError:
            raise NoSuchClassifierError(typ)
        if is_path:
            nm = get_pathname_option(*default_name)
        else:
            nm = options[default_name]
    return nm, typ
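# A minimal usage sketch of the pattern described in the docstring above;
# the getopt spec and option letters here are illustrative assumptions,
# not the project's actual command-line handling:
#
#     import getopt, sys
#     from spambayes import storage
#
#     opts, args = getopt.getopt(sys.argv[1:], "d:p:")   # hypothetical flags
#     db_name, db_type = storage.database_type(opts)
#     bayes = storage.open_storage(db_name, db_type)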
def main(): print "Pickle is available." db = dumbdbm.open("dumbdb", "c") db["1"] = "1" db.close() dbstr = whichdb.whichdb("dumbdb") if dbstr: print "Dumbdbm is available." else: print "Dumbdbm is not available." db = dbhash.open("dbhash", "c") db["1"] = "1" db.close() dbstr = whichdb.whichdb("dbhash") if dbstr == "dbhash": print "Dbhash is available." else: print "Dbhash is not available." if bsddb is None: dbstr = "" else: db = bsddb.hashopen("bsddb3", "c") db["1"] = "1" db.close() dbstr = whichdb.whichdb("bsddb3") if dbstr == "dbhash": print "Bsddb[3] is available." else: print "Bsddb[3] is not available." print hammie = get_pathname_option("Storage", "persistent_storage_file") use_dbm = options["Storage", "persistent_use_database"] if not use_dbm: print "Your storage %s is a: pickle" % (hammie,) return if not os.path.exists(hammie): print "Your storage file does not exist yet." return db_type = whichdb.whichdb(hammie) if db_type == "dbhash": # could be dbhash or bsddb3 # only bsddb3 has a __version__ attribute - old bsddb module does not if hasattr(bsddb, '__version__'): try: db = bsddb.hashopen(hammie, "r") except bsddb.error: pass else: db.close() print "Your storage", hammie, "is a: bsddb[3]" return elif db_type is None: print "Your storage %s is unreadable." % (hammie,) print "Your storage %s is a: %s" % (hammie, db_type)
def splice_set(n, dir_num=3):
    destination = get_pathname_option("TestDriver", "spam_directories") % dir_num + "/"
    dict_c = 1
    for dictionary in listdir(destination):
        print "Slicing dictionary", dict_c, "into", n, "parts"
        splice(destination + dictionary, n)
        remove(destination + dictionary)
        dict_c += 1
def main():
    ham = [get_pathname_option("TestDriver", "ham_directories") % i
           for i in range(1, 5)]
    spam = [get_pathname_option("TestDriver", "spam_directories") % i
            for i in range(1, 5)]

    t = TestDriver.Driver()
    t.train(msgs.HamStream(ham[1], [ham[1]]), msgs.SpamStream(spam[1], [spam[1]]))

    keep_going = True
    trial_number = 0

    while keep_going:
        start_time = time.time()
        if trial_number == 0:
            t.test(msgs.HamStream(ham[0], [ham[0]]),
                   msgs.SpamStream(spam[0], [spam[0]]), True)
        else:
            t.test(t.tester.truth_examples[1], t.tester.truth_examples[0])
        end_time = time.time()
        seconds = end_time - start_time
        trial_number += 1

        print "Test sizes: ", len(t.tester.truth_examples[0]), ", ", \
            len(t.tester.truth_examples[1]), "\n"
        print "Detection rate:", t.tester.correct_classification_rate(), "\n"
        print "\nTime elapsed:", seconds, "seconds.\n"

        answer = raw_input("Keep trying (y/n)? You have performed " +
                           str(trial_number) + " trial(s) so far. ")
        valid_input = False
        while not valid_input:
            if answer == "y":
                valid_input = True
            elif answer == "n":
                sys.exit()
            else:
                answer = raw_input("Please enter either y or n. ")
def createWorkers(self):
    """Using the options that were initialised in __init__ and then
    possibly overridden by the driver code, create the Bayes object,
    the Corpuses, the Trainers and so on."""
    print("Loading database...", end=' ')
    if self.isTest:
        self.useDB = "pickle"
        self.DBName = '_pop3proxy_test.pickle'   # This is never saved.
    if not hasattr(self, "DBName"):
        self.DBName, self.useDB = storage.database_type([])
    self.bayes = storage.open_storage(self.DBName, self.useDB)
    self.mdb = spambayes.message.Message().message_info_db
    self.stats = Stats.Stats(options, self.mdb)
    self.buildStatusStrings()

    if not self.isTest:
        sc = get_pathname_option("Storage", "spam_cache")
        hc = get_pathname_option("Storage", "ham_cache")
        uc = get_pathname_option("Storage", "unknown_cache")
        for d in [sc, hc, uc]:
            storage.ensureDir(d)
        if self.gzipCache:
            factory = GzipFileMessageFactory()
        else:
            factory = FileMessageFactory()
        age = options["Storage", "cache_expiry_days"]*24*60*60
        self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                           '[0123456789\-]*', cacheSize=20)
        self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                          '[0123456789\-]*', cacheSize=20)
        self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                              '[0123456789\-]*', cacheSize=20)
        self.spamCorpus.removeExpiredMessages()
        self.hamCorpus.removeExpiredMessages()
        self.unknownCorpus.removeExpiredMessages()
        self.spamTrainer = storage.SpamTrainer(self.bayes)
        self.hamTrainer = storage.HamTrainer(self.bayes)
        self.spamCorpus.addObserver(self.spamTrainer)
        self.hamCorpus.addObserver(self.hamTrainer)
def main(): """Main program; parse options and go.""" global loud try: opts, args = getopt.getopt(sys.argv[1:], 'hfqnrd:p:g:s:o:') except getopt.error as msg: usage(2, msg) if not opts: usage(2, "No options given") force = False trainnew = False removetrained = False good = [] spam = [] for opt, arg in opts: if opt == '-h': usage(0) elif opt == "-f": force = True elif opt == "-n": trainnew = True elif opt == "-q": loud = False elif opt == '-g': good.append(arg) elif opt == '-s': spam.append(arg) elif opt == "-r": removetrained = True elif opt == '-o': options.set_from_cmdline(arg, sys.stderr) pck, usedb = storage.database_type(opts) if args: usage(2, "Positional arguments not allowed") if usedb == None: usedb = options["Storage", "persistent_use_database"] pck = get_pathname_option("Storage", "persistent_storage_file") h = hammie.open(pck, usedb, "c") for g in good: if loud: print("Training ham (%s):" % g) train(h, g, False, force, trainnew, removetrained) sys.stdout.flush() save = True for s in spam: if loud: print("Training spam (%s):" % s) train(h, s, True, force, trainnew, removetrained) sys.stdout.flush() save = True if save: h.store()
def main(): print("Pickle is available.") db = dbm.dumb.open("dumbdb", "c") db["1"] = "1" db.close() dbstr = dbm.whichdb("dumbdb") if dbstr: print("Dumbdbm is available.") else: print("Dumbdbm is not available.") db = dbm.bsd.open("dbhash", "c") db["1"] = "1" db.close() dbstr = dbm.whichdb("dbhash") if dbstr == "dbhash": print("Dbhash is available.") else: print("Dbhash is not available.") if bsddb is None: dbstr = "" else: db = bsddb.hashopen("bsddb3", "c") db["1"] = "1" db.close() dbstr = dbm.whichdb("bsddb3") if dbstr == "dbhash": print("Bsddb[3] is available.") else: print("Bsddb[3] is not available.") print() hammie = get_pathname_option("Storage", "persistent_storage_file") use_dbm = options["Storage", "persistent_use_database"] if not use_dbm: print("Your storage %s is a: pickle" % (hammie,)) return if not os.path.exists(hammie): print("Your storage file does not exist yet.") return db_type = dbm.whichdb(hammie) if db_type == "dbhash": if hasattr(bsddb, '__version__'): try: db = bsddb.hashopen(hammie, "r") except bsddb.error: pass else: db.close() print("Your storage", hammie, "is a: bsddb[3]") return elif db_type is None: print("Your storage %s is unreadable." % (hammie,)) print("Your storage %s is a: %s" % (hammie, db_type))
def __init__(self, number):
    self.NUMBER = number
    self.ham_num = self.NUMBER
    self.ham_source = get_pathname_option("TestDriver", "ham_directories") % 1 + "/"
    self.ham_test = get_pathname_option("TestDriver", "ham_directories") % 2 + "/"
    self.ham_destination = get_pathname_option("TestDriver", "ham_directories") % 3 + "/"
    self.ham_source_files = listdir(self.ham_source)
    self.ham_destination_files = listdir(self.ham_destination)

    self.spam_num = 0
    self.spam_source = get_pathname_option("TestDriver", "spam_directories") % 1 + "/"
    self.spam_test = get_pathname_option("TestDriver", "spam_directories") % 2 + "/"
    self.spam_destination = get_pathname_option("TestDriver", "spam_directories") % 3 + "/"
    self.spam_source_files = listdir(self.spam_source)
    self.spam_destination_files = listdir(self.spam_destination)
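# A hedged usage sketch of the mover initialised above; random_move_file()
# and reset() are the calls the mislabeling experiments elsewhere in this
# collection make, and the per-call comments are assumptions about intent:
#
#     mover = MislabeledFileMover(60)
#     mover.random_move_file()   # move randomly chosen files into the Set3 destinations
#     ...                        # train/test against Set2
#     mover.reset()              # put the moved files back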
def main(): import os import sys import shutil sys.path.insert(-1, os.getcwd()) sys.path.insert(-1, os.path.dirname(os.getcwd())) from spambayes import ActiveUnlearnDriver from spambayes.Options import get_pathname_option from spambayes import msgs import time ham = [get_pathname_option("TestDriver", "ham_directories") % i for i in range(1, 5)] spam = [get_pathname_option("TestDriver", "spam_directories") % i for i in range(1, 5)] for i in range(1): au = ActiveUnlearnDriver.ActiveUnlearnDriver([msgs.HamStream(ham[0], [ham[0]]), msgs.HamStream(ham[2], [ham[2]]), msgs.HamStream(ham[3], [ham[3]])], [msgs.SpamStream(spam[0], [spam[0]]), msgs.SpamStream(spam[2], [spam[2]]), msgs.SpamStream(spam[3], [spam[3]])], msgs.HamStream(ham[2], [ham[2]]), msgs.SpamStream(spam[2], [spam[2]]), "ac-extreme") au.driver.test(msgs.HamStream(ham[0], [ham[0]]), msgs.SpamStream(spam[0], [spam[0]])) au.driver.untrain(msgs.HamStream(ham[2], [ham[2]]), msgs.SpamStream(spam[2], [spam[2]])) au.driver.untrain(msgs.HamStream(ham[3], [ham[3]]), msgs.SpamStream(spam[3], [spam[3]])) au.driver.test(msgs.HamStream(ham[0], [ham[0]]), msgs.SpamStream(spam[0], [spam[0]])) msg = au.driver.tester.test_examples[5] shutil.copy(msg.tag, "C:\Users\Alex\Desktop\clustera") print msg.prob start_time = time.time() cluster = (au.cluster(msg, 10)) end_time = time.time() print cluster clueslist = [] for clue in msg.clues: clueslist.append((clue[0], clue[1])) print clueslist with open("C:\Users\Alex\Desktop\clustera\cluster7.txt", 'w') as outfile: spamcounter = 0 for sim in cluster: with open(sim.tag) as infile: if sim.tag.endswith(".spam.txt"): outfile.write("SPAMSPAMSPAMSPAMSPAM" + "\n\n") if sim.tag.endswith(".ham.txt"): outfile.write("HAMHAMHAMHAMHAM" + "\n\n") outfile.write(infile.read()) outfile.write("\n\n" + "----------------------------------------" + "\n\n") if sim.tag.endswith(".spam.txt"): spamcounter += 1 print spamcounter print end_time - start_time
def test(): y = [0, 60, 120, 240, 480] hamdirs = [get_pathname_option("TestDriver", "ham_directories") % i for i in range(1, 4)] spamdirs = [get_pathname_option("TestDriver", "spam_directories") % i for i in range(1, 4)] d = TestDriver.Driver() d.new_classifier() d.train(msgs.HamStream(hamdirs[0], [hamdirs[0]]), msgs.SpamStream(spamdirs[0], [spamdirs[0]])) mislabeled = [[], [], []] prev_detection_rate = None detection_rates = [] detection_rates_on_mislabeled = [] correct_results = [] results_from_mislabeled = [] for y_val in y: dw = DictionaryWriter(y_val) dw.reset() dw.write() d.train(msgs.HamStream(hamdirs[2], [hamdirs[2]]), msgs.SpamStream(spamdirs[2], [spamdirs[2]])) if y_val is 0: # Initial Test d.test(msgs.HamStream(hamdirs[1], [hamdirs[1]]), msgs.SpamStream(spamdirs[1], [spamdirs[1]])) rate = d.tester.correct_classification_rate() mislabeled[0] = d.tester.ham_wrong_examples # Ham mislabeled as Spam mislabeled[1] = d.tester.spam_wrong_examples # Spam mislabeled as Ham mislabeled[2] = d.tester.unsure_examples # Unsure ham = [] spam = [] ham += mislabeled[0] spam += mislabeled[1] for msg in mislabeled[2]: if msg.tag.endswith(".ham.txt"): ham.append(msg) elif msg.tag.endswith(".spam.txt"): spam.append(msg) else: print "What" exit() d.test(ham, spam) m_rate = d.tester.correct_classification_rate() detection_rates.append(rate) prev_detection_rate = rate correct_results.append("") results_from_mislabeled.append("") detection_rates_on_mislabeled.append(m_rate) d.untrain(msgs.HamStream(hamdirs[2], [hamdirs[2]]), msgs.SpamStream(spamdirs[2], [spamdirs[2]])) dw.reset() else: d.test(msgs.HamStream(hamdirs[1], [hamdirs[1]]), msgs.SpamStream(spamdirs[1], [spamdirs[1]])) rate = d.tester.correct_classification_rate() detection_rates.append(rate) if rate > prev_detection_rate: correct_results.append("Improved") elif rate < prev_detection_rate: correct_results.append("Worsened") else: correct_results.append("Unchanged") prev_detection_rate = rate ham = [] spam = [] ham += mislabeled[0] spam += mislabeled[1] # for msg in mislabeled[2]: # if msg.tag.endswith(".ham.txt"): # ham.append(msg) # elif msg.tag.endswith(".spam.txt"): # spam.append(msg) # else: # print "What" # exit() d.test(ham, spam) rate = d.tester.correct_classification_rate() detection_rates_on_mislabeled.append(rate) dw.reset() outfile = open("mislabeled_rates.txt", "w") outfile.write( tabulate( { "# of Dictionaries": y, "Detection Rate": detection_rates, "True Change": correct_results, "Detection Rate from Mislabeled": detection_rates_on_mislabeled, "Interpreted Change": results_from_mislabeled, }, headers="keys", ) )
def main(): import os import sys from random import choice sys.path.insert(-1, os.getcwd()) sys.path.insert(-1, os.path.dirname(os.getcwd())) from spambayes import ActiveUnlearnDriver from spambayes.Options import get_pathname_option from spambayes import msgs """ from dictionarywriter import DictionaryWriter """ ham = [ get_pathname_option("TestDriver", "ham_directories") % i for i in range(1, 5) ] spam = [ get_pathname_option("TestDriver", "spam_directories") % i for i in range(1, 5) ] """ DictionaryWriter(600).write() """ keep_going = True trial_number = 1 au_v = ActiveUnlearnDriver.ActiveUnlearner( [msgs.HamStream(ham[1], [ham[1]]), msgs.HamStream(ham[2], [ham[2]])], [ msgs.SpamStream(spam[1], [spam[1]]), msgs.SpamStream(spam[3], [spam[3]]) ], msgs.HamStream(ham[0], [ham[0]]), msgs.SpamStream(spam[0], [spam[0]]), ) while keep_going: msg = choice(au_v.driver.tester.train_examples[0]) try: test_cl, counter = au_v.determine_cluster(msg) test_size = test_cl.size au_v.learn(test_cl) except TypeError: counter = 1 test_size = "100, but fail" cluster_detection_rates_v = [] cluster_spam_rates_v = [] cluster_sizes = [] au_v.init_ground() original_rate_v = au_v.driver.tester.correct_classification_rate() cluster_size = 100 cluster_sizes.append(100) print "Clustering with size", cluster_size, "..." cl_v = ActiveUnlearnDriver.Cluster(msg, cluster_size, au_v, "extreme") cluster_spam_rates_v.append( float(cl_v.target_spam()) / float(cluster_size)) cluster_detection_rates_v.append(au_v.start_detect_rate(cl_v)) for i in range(1, counter + 2): cluster_size += 100 cluster_sizes.append(cluster_size) print "Clustering with size", cluster_size, "..." cluster_detection_rates_v.append( au_v.continue_detect_rate(cl_v, 100)) cluster_spam_rates_v.append( float(cl_v.target_spam()) / float(cluster_size)) with open( "C:\Users\Alex\Desktop\det_cluster_stats_v" + str(trial_number) + ".txt", 'w') as outfile: outfile.write("VANILLA MACHINE\n") outfile.write("--------------------------\n") outfile.write("Clustered around: " + msg.tag + "\n") outfile.write("--------------------------\n") outfile.write("Detection Rates:\n") outfile.write(str(original_rate_v) + "\n") for item in cluster_detection_rates_v: outfile.write(str(item) + "\n") outfile.write("--------------------------\n") outfile.write("Spam Rate:\n") for item in cluster_spam_rates_v: outfile.write(str(item) + "\n") outfile.write("Test Cluster Size:\n") outfile.write(str(test_size)) answer = raw_input("Keep going (y/n)? You have performed " + str(trial_number) + " trials so far. ") if answer == "n": keep_going = False else: au_v.learn(cl_v) au_v.init_ground() trial_number += 1
def __init__(self, num_files, dir_num=3):
    self.NUMFILES = num_files
    self.destination = get_pathname_option("TestDriver", "spam_directories") % dir_num + "/"
    self.destination_files = listdir(self.destination)
def main(): ham = [ get_pathname_option("TestDriver", "ham_directories") % i for i in range(1, 5) ] spam = [ get_pathname_option("TestDriver", "spam_directories") % i for i in range(1, 5) ] keep_going = True trial_number = 1 try: time_1 = time.time() au = ActiveUnlearnDriver.ActiveUnlearner( [ msgs.HamStream(ham[1], [ham[1]]), msgs.HamStream(ham[2], [ham[2]]) ], # Training Ham [ msgs.SpamStream(spam[1], [spam[1]]), msgs.SpamStream(spam[2], [spam[2]]) ], # Training Spam msgs.HamStream(ham[0], [ham[0]]), # Testing Ham msgs.SpamStream(spam[0], [spam[0]]), # Testing Spam ) time_2 = time.time() train_time = time_2 - time_1 print "Train time:", train_time, "\n" while keep_going: with open("C:\\Users\\Alex\\Desktop\\unpollute_stats\\unlearn_stats" + str(trial_number) + ".txt", 'w') \ as outfile: try: outfile.write("CLUSTER AND RATE COUNTS:\n") outfile.write("---------------------------\n") original_detection_rate = au.driver.tester.correct_classification_rate( ) outfile.write("0: " + str(original_detection_rate) + "\n") time_start = time.time() cluster_list = au.brute_force_active_unlearn( outfile, test=True, center_iteration=False, pollution_set3=True, gold=True) time_end = time.time() unlearn_time = time_end - time_start total_polluted_unlearned = 0 total_unlearned = 0 total_unpolluted_unlearned = 0 final_detection_rate = au.current_detection_rate print "\nTallying up final counts...\n" for cluster in cluster_list: total_unlearned += cluster.size total_polluted_unlearned += cluster.target_set3() total_unpolluted_unlearned += (cluster.size - cluster.target_set3()) outfile.write("\nSTATS\n") outfile.write("---------------------------\n") outfile.write("Initial Detection Rate: " + str(original_detection_rate) + "\n") outfile.write("Final Detection Rate: " + str(final_detection_rate) + "\n") outfile.write("Total Unlearned:\n") outfile.write(str(total_unlearned) + "\n") outfile.write("Polluted Percentage of Unlearned:\n") outfile.write( str( float(total_polluted_unlearned) / float(total_unlearned)) + "\n") outfile.write("Unpolluted Percentage of Unlearned:\n") outfile.write( str( float(total_unpolluted_unlearned) / float(total_unlearned)) + "\n") outfile.write("Percentage of Polluted Unlearned:\n") outfile.write( str(float(total_polluted_unlearned) / 1200) + "\n") outfile.write("Time for training:\n") outfile.write(str(train_time) + "\n") outfile.write("Time for unlearning:\n") outfile.write(str(unlearn_time)) except KeyboardInterrupt: outfile.flush() os.fsync(outfile) """ m.reset() """ sys.exit() answer = raw_input("\nKeep going (y/n)? You have performed " + str(trial_number) + " trial(s) so far. ") valid_input = False while not valid_input: if answer == "n": keep_going = False valid_input = True elif answer == "y": for cluster in cluster_list: au.learn(cluster) au.init_ground() trial_number += 1 valid_input = True else: answer = raw_input("Please enter either y or n. ") except KeyboardInterrupt: """ m.reset() """ sys.exit()
from string import ascii_lowercase
from os import listdir, remove
from random import choice, sample
from spambayes.Options import get_pathname_option

default_dest = get_pathname_option("TestDriver", "spam_directories") % 3


def write_dictionary_sets(number_clusters=26, x=0.5, y=200, destination=default_dest):
    destination += "/"
    letterset = {}  # A dictionary of words: Key = Letter, Value = Words beginning with that letter
    for letter in ascii_lowercase:
        letterset[letter] = []

    with open("dictionary.txt", 'r') as dictionary:
        for line in dictionary:
            letter = line[0]
            letterset[letter].append(line.strip())

    keys = sample(letterset.keys(), number_clusters)
    for letter in keys:
        print "Writing sets for letter " + letter + " ..."
        x = x                        # Percentage overlap of words between sets
        y = y                        # Number of sets per letter
        o_set = letterset[letter]    # Size of original set of words beginning with letter
        b = len(o_set)               # Size of set being pulled from
        a = int(b * x)               # Size of resultant sets
        for i in range(y):
def close(self):
    # Close our underlying database.  Better not assume all databases
    # have close functions!
    def noop():
        pass
    getattr(self.db, "close", noop)()
    getattr(self.dbm, "close", noop)()

def store(self):
    if self.db is not None:
        self.db.sync()

# This should come from a Mark Hammond idea of a master db.
# For the moment, we get the name of another file from the options,
# so that these files don't litter lots of working directories.
# Once there is a master db, this option can be removed.
message_info_db_name = get_pathname_option("Storage",
                                           "messageinfo_storage_file")
if options["Storage", "persistent_use_database"] is True or \
   options["Storage", "persistent_use_database"] == "dbm":
    msginfoDB = MessageInfoDB(message_info_db_name)
elif options["Storage", "persistent_use_database"] is False or \
     options["Storage", "persistent_use_database"] == "pickle":
    msginfoDB = MessageInfoPickle(message_info_db_name)
else:
    # Ah - now, what?  Maybe the user has mysql or pgsql or zeo,
    # or some other newfangled thing!  We don't know what to do
    # in that case, so just use a pickle, since it's the safest option.
    msginfoDB = MessageInfoPickle(message_info_db_name)


class Message(email.Message.Message):
    '''An email.Message.Message extended for Spambayes'''