def main(args):
    """Print a checksum for every message in the given mailboxes.

    ``args`` is the argument list to parse.  No options are defined; each
    positional argument is handed to ``getmbox`` and ``"-"`` is used when
    there are none.
    """
    # getopt is called with an empty option string, so only positional
    # arguments come back.
    _, sources = getopt.getopt(args, "")
    if not sources:
        sources = ["-"]

    # Every mailbox is opened before the first checksum is printed.
    mailboxes = [getmbox(source) for source in sources]
    for mailbox in mailboxes:
        for message in mailbox:
            print(generate_checksum(message))
def main(args):
    """Write one checksum line per message found in the named mailboxes.

    ``args`` is parsed with getopt (no options are defined).  The remaining
    positional arguments are mailbox names for ``getmbox``; with none given
    the single name ``"-"`` is used.
    """
    _unused_opts, names = getopt.getopt(args, "")
    names = names or ["-"]

    # Open all mailboxes first, then walk them in order.
    opened = []
    for name in names:
        opened.append(getmbox(name))

    for mbox in opened:
        for msg in mbox:
            # Single parenthesised value: same output under the Python 2
            # print statement and the Python 3 print function.
            print(generate_checksum(msg))
def score(unsure, h, cls, scores, msgids=None, skipspam=False):
    """See what effect on others each msg in unsure has.

    Each selected message of the ``unsure`` mailbox is trained as spam,
    every message that did not already score as spam is re-scored, and the
    message is untrained again.  One report line is printed per selected
    message.

    Args:
        unsure: mailbox specification handed to ``getmbox`` (read repeatedly).
        h: object providing ``train(msg, True)`` and ``untrain(msg, True)``.
        cls: classifier providing ``spamprob(tokens)``.
        scores: dict updated in place; maps the mean probability rounded to
            three places to the list of message-ids that produced it.
        msgids: optional container of message-ids; when given, only those
            messages are tried.
        skipspam: when true, messages whose own score is already at or
            above the spam cutoff are not tried.
    """
    spam_cutoff = options["Categorization", "spam_cutoff"]

    # First pass: remember which messages already score as spam and total
    # the probabilities of the others.
    n = 0
    total = 0.0
    okalready = set()
    for msg in getmbox(unsure):
        prob = cls.spamprob(tokenize(msg))
        n += 1
        if prob >= spam_cutoff:
            okalready.add(msg['message-id'])
        else:
            total += prob

    # An empty mailbox used to raise ZeroDivisionError here.
    # NOTE(review): total skips the already-spam messages while n counts
    # every message; kept as in the original -- confirm this is intended.
    first_mean = total / n if n else 0.0

    print("%d out of %d messages already score as spam" % (len(okalready), n))
    print("initial mean spam prob: %.3f" % first_mean)
    print("%5s %3s %5s %5s %s" % ("prob", "new", "mean", "sdev", "msgid"))

    for msg in getmbox(unsure):
        msgid = msg['message-id']
        if msgids is not None and msgid not in msgids:
            continue
        msgprob = cls.spamprob(tokenize(msg))
        if skipspam and msgprob >= spam_cutoff:
            continue

        n = j = 0
        total = 0.0
        probs = []
        h.train(msg, True)
        try:
            for trial in getmbox(unsure):
                if trial['message-id'] in okalready:
                    continue
                n += 1
                if n % 10 == 0:
                    counter("", n)
                prob = cls.spamprob(tokenize(trial))
                probs.append(prob)
                total += prob
                if prob >= spam_cutoff:
                    j += 1
            counter("", n)
        finally:
            # Always undo the trial training, even if re-scoring failed.
            h.untrain(msg, True)

        # n is 0 when every message already scores as spam; report zeros
        # instead of dividing by zero.
        if n:
            mean = total / n
            sdev = math.sqrt(sum([(mean - prob) ** 2 for prob in probs]) / n)
        else:
            mean = sdev = 0.0
        meankey = round(mean, 3)
        scores.setdefault(meankey, []).append(msgid)
        print("\r%.3f %3d %.3f %.3f %s" % (msgprob, j, mean, sdev, msgid))
def cull(mbox_name, cullext, designation, tdict):
    """Keep only the messages of ``mbox_name`` whose message-id is in ``tdict``.

    Args:
        mbox_name: mailbox specification handed to ``mboxutils.getmbox``.
        cullext: when truthy, kept messages are written to the file
            ``mbox_name + cullext``.  When falsy, nothing is written and
            every message *not* in ``tdict`` is flagged deleted and seen
            through its ``imap_server`` attribute instead.
        designation: label used in the progress banner.
        tdict: container of message-ids to keep.
    """
    print("writing new %s mbox..." % designation)
    n = m = 0
    # open() replaces the Python-2-only file() builtin.
    culled_mbox = open(mbox_name + cullext, "w") if cullext else None
    try:
        for msg in mboxutils.getmbox(mbox_name):
            m += 1
            if msg["message-id"] in tdict:
                if culled_mbox is not None:
                    culled_mbox.write(str(msg))
                n += 1
            elif culled_mbox is None:
                response = msg.imap_server.uid("STORE", msg.uid,
                                               "+FLAGS.SILENT",
                                               "(\\Deleted \\Seen)")
                command = "set %s to be deleted and seen" % (msg.uid, )
                msg.imap_server.check_response(command, response)

            sys.stdout.write("\r%5d of %5d" % (n, m))
            sys.stdout.flush()

        sys.stdout.write("\n")
    finally:
        # Close the output file even when reading or writing fails.
        if culled_mbox is not None:
            culled_mbox.close()
def score(h, msgs, reverse=0):
    """Score (judge) all messages from a mailbox.

    Spam is reported unless ``reverse`` is set, ham is reported only when
    ``reverse`` is set, and unsure messages are always reported.  Returns
    the tuple ``(spams, hams, unsures)``.
    """
    # XXX The reporting needs work!

    def report(msgno, prob, flag, clues):
        # One output line: number, probability, flag, formatted clues.
        print("%6s %4.2f %1s %s" % (msgno, prob, flag, h.formatclues(clues)))

    spams = hams = unsures = 0
    for position, msg in enumerate(mboxutils.getmbox(msgs), 1):
        prob, clues = h.score(msg, True)
        # Fall back to the running position when the message carries no
        # '_mh_msgno' attribute.
        msgno = getattr(msg, '_mh_msgno', position)

        if prob >= SPAM_THRESHOLD:
            spams += 1
            if not reverse:
                report(msgno, prob, "S", clues)
        elif prob <= HAM_THRESHOLD:
            hams += 1
            if reverse:
                # The original flags reported ham with "S" as well.
                report(msgno, prob, "S", clues)
        else:
            unsures += 1
            report(msgno, prob, "U", clues)
    return (spams, hams, unsures)
def main(args):
    """Extract the messages that contain the requested features.

    Options: ``-d/--database`` mapfile (required), ``-S/--spamfile`` and
    ``-H/--hamfile`` output files (at least one required), and
    ``-f/--feature`` (repeatable).  When no feature is given, features are
    taken from the ``X-Spambayes-Evidence`` header of the messages in the
    positional mailbox arguments.

    Returns 0 after ``--help``, 1 after a usage error, and ``None`` once
    ``extractmessages`` has run.
    """
    try:
        opts, args = getopt.getopt(args, "hd:S:H:f:",
                                   ["help", "database=", "spamfile=",
                                    "hamfile=", "feature="])
    except getopt.GetoptError as msg:
        usage(msg)
        return 1

    charset = locale.getdefaultlocale()[1]
    if not charset:
        charset = 'us-ascii'

    mapfile = spamfile = hamfile = None
    features = set()
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        elif opt in ("-d", "--database"):
            mapfile = arg
        elif opt in ("-H", "--hamfile"):
            hamfile = arg
        elif opt in ("-S", "--spamfile"):
            spamfile = arg
        elif opt in ("-f", "--feature"):
            # str(arg, charset) raises TypeError for text arguments, so
            # only decode when the argument really is bytes.
            if isinstance(arg, bytes):
                arg = arg.decode(charset)
            features.add(arg)

    if hamfile is None and spamfile is None:
        usage("At least one of -S or -H are required")
        return 1

    if mapfile is None:
        usage("'-d mapfile' is required")
        return 1

    try:
        mapd = pickle_read(mapfile)
    except IOError:
        usage("Mapfile %s does not exist" % mapfile)
        return 1

    if not features and not args:
        usage("Require at least one feature (-f) arg or one message file")
        return 1

    if not features:
        # Pull the significant tokens out of each message's evidence header.
        for f in args:
            for msg in getmbox(f):
                evidence = msg.get("X-Spambayes-Evidence", "")
                evidence = re.sub(r"\s+", " ", evidence)
                l = [e.rsplit(": ", 1)[0]
                     for e in evidence.split("; ")[2:]]
                for s in l:
                    try:
                        # Header has no __unicode__ on Python 3; str() is
                        # the supported way to get the decoded text.
                        s = str(make_header(decode_header(s)))
                    except Exception:
                        # Best effort: keep the raw token, decoding it
                        # leniently if it happens to be bytes.
                        if isinstance(s, bytes):
                            s = s.decode('us-ascii', 'replace')
                    features.add(s)
        if not features:
            usage("No X-Spambayes-Evidence headers found")
            return 1

    # file() does not exist on Python 3; the outputs are also closed now,
    # even when extraction fails.
    opened = []
    try:
        if spamfile is not None:
            spamfile = open(spamfile, "w")
            opened.append(spamfile)
        if hamfile is not None:
            hamfile = open(hamfile, "w")
            opened.append(hamfile)

        extractmessages(features, mapd, hamfile, spamfile)
    finally:
        for fp in opened:
            fp.close()
def main(profiling=False):
    """Command-line entry point: run the selected actions over each message.

    Options are read from ``sys.argv``.  Every message of every named
    mailbox (``"-"`` when none is given) is passed to each selected action
    and then written to stdout.  ``profiling`` is true only for the nested
    call made under ``-P``, so the profiler is not started twice.
    """
    h = HammieFilter()
    opts, args = getopt.getopt(sys.argv[1:], 'hvxd:p:nfgstGSo:P',
                               ['help', 'version', 'examples', 'option='])

    # Flags that simply queue a filter method, in command-line order.
    queued_by_flag = {
        '-f': h.filter,
        '-g': h.train_ham,
        '-s': h.train_spam,
        '-t': h.filter_train,
        '-G': h.untrain_ham,
        '-S': h.untrain_spam,
    }

    actions = []
    create_newdb = False
    for opt, arg in opts:
        if opt in queued_by_flag:
            actions.append(queued_by_flag[opt])
        elif opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-v', '--version'):
            version()
        elif opt in ('-x', '--examples'):
            examples()
        elif opt in ('-o', '--option'):
            Options.options.set_from_cmdline(arg, sys.stderr)
        elif opt == '-P':
            if profiling:
                continue
            try:
                import cProfile
            except ImportError:
                # No profiler available: carry on unprofiled.
                continue
            return cProfile.run("main(True)")
        elif opt == "-n":
            create_newdb = True

    h.dbname, h.usedb = storage.database_type(opts)

    if create_newdb or not os.path.exists(h.dbname):
        h.newdb()
        print("Created new database in", h.dbname, file=sys.stderr)
        if create_newdb:
            sys.exit(0)

    if not actions:
        actions = [h.filter]
    if not args:
        args = ["-"]

    reading_stdin = args == ["-"]
    for fname in args:
        for msg in mboxutils.getmbox(fname):
            for action in actions:
                action(msg)
            # On stdin, keep the "From " line only if the message had one.
            if reading_stdin:
                unixfrom = msg.get_unixfrom() is not None
            else:
                unixfrom = True
            sys.stdout.write(mboxutils.as_string(msg, unixfrom=unixfrom))
def untrain(h, msgs, is_spam):
    """Untrain bayes with all messages from a mailbox.

    A running count is written to stdout every tenth message and once more
    at the end, followed by a newline.
    """

    def show(count):
        sys.stdout.write("\r%6d" % count)
        sys.stdout.flush()

    done = 0
    for done, msg in enumerate(mboxutils.getmbox(msgs), 1):
        if done % 10 == 0:
            show(done)
        h.untrain(msg, is_spam)
    show(done)
    print()
def mapmessages(f, mboxtype, mapdb):
    """Record which messages of mailbox ``f`` contain which tokens.

    ``mapdb`` maps each token to a ``(ham, spam)`` pair of dicts, each
    mapping a mailbox name to a set of message-ids.  ``mboxtype`` equal to
    ``"ham"`` selects the first dict, anything else the second.  Messages
    without a message-id are skipped.  Progress goes to stdout.
    """

    def remember(tokens, msgid):
        # File msgid under every token in the dict selected by mboxtype.
        for token in tokens:
            ham, spam = mapdb.get(token, ({}, {}))
            by_mbox = ham if mboxtype == "ham" else spam
            by_mbox.setdefault(f, set()).add(msgid)
            mapdb[token] = (ham, spam)

    for count, msg in enumerate(getmbox(f), 1):
        sys.stdout.write('\r%s: %d' % (f, count))
        sys.stdout.flush()

        msgid = msg.get("message-id")
        if msgid is None:
            continue

        remember(tokenize(msg), msgid)
        if options["Classifier", "x-use_bigrams"]:
            remember(Classifier()._enhance_wordstream(tokenize(msg)), msgid)

    sys.stdout.write("\n")
def cull(mbox_name, cullext, designation, tdict):
    """Keep only the messages of ``mbox_name`` whose message-id is in ``tdict``.

    Args:
        mbox_name: mailbox specification handed to ``mboxutils.getmbox``.
        cullext: when truthy, kept messages are written to the file
            ``mbox_name + cullext``.  When falsy, nothing is written and
            every message *not* in ``tdict`` is flagged deleted and seen
            through its ``imap_server`` attribute instead.
        designation: label used in the progress banner.
        tdict: container of message-ids to keep.
    """
    print("writing new %s mbox..." % designation)
    n = m = 0
    # open() replaces the Python-2-only file() builtin.
    culled_mbox = open(mbox_name + cullext, "w") if cullext else None
    try:
        for msg in mboxutils.getmbox(mbox_name):
            m += 1
            if msg["message-id"] in tdict:
                if culled_mbox is not None:
                    culled_mbox.write(str(msg))
                n += 1
            elif culled_mbox is None:
                response = msg.imap_server.uid(
                    "STORE", msg.uid, "+FLAGS.SILENT", "(\\Deleted \\Seen)")
                command = "set %s to be deleted and seen" % (msg.uid,)
                msg.imap_server.check_response(command, response)

            sys.stdout.write("\r%5d of %5d" % (n, m))
            sys.stdout.flush()

        sys.stdout.write("\n")
    finally:
        # Close the output file even when reading or writing fails.
        if culled_mbox is not None:
            culled_mbox.close()
def learn(mbox, h, is_spam):
    """Train ``h`` on every message of ``mbox`` as spam or ham.

    ``counter`` is called with the tag ("Spam" or "Ham") and the number of
    messages already trained before each message is handled.
    """
    if is_spam:
        tag = "Spam"
    else:
        tag = "Ham"

    for already_done, msg in enumerate(getmbox(mbox)):
        counter(tag, already_done)
        h.train(msg, is_spam)
    # Terminates the progress line; same output as a bare Python 2 print.
    print("")
def train(bayes, msgs, is_spam):
    """Train bayes with all messages from a mailbox.

    Each message is tokenized and the tokens are passed to ``bayes.learn``
    together with ``is_spam``.
    """
    for message in mboxutils.getmbox(msgs):
        tokens = tokenize(message)
        bayes.learn(tokens, is_spam)
def extractmessages(features, mapdb, hamfile, spamfile):
    """extract messages which contain given features

    ``mapdb`` maps a feature to a ``(ham, spam)`` pair of mappings from
    mailbox name to message-ids.  Matching messages are written to
    ``hamfile`` / ``spamfile``; a side whose file is ``None`` is skipped.
    """

    def collect(wanted, per_mbox):
        # Merge one feature's message-ids into the per-mailbox sets.
        for mbox in per_mbox:
            wanted.setdefault(mbox, set()).update(per_mbox.get(mbox, set()))

    def dump(wanted, outfile):
        # Copy the wanted messages of every mailbox to outfile, showing a
        # running count on stdout.
        for mailfile in wanted:
            msgids = wanted[mailfile]
            written = 0
            for msg in getmbox(mailfile):
                if msg.get("message-id") not in msgids:
                    continue
                written += 1
                sys.stdout.write('\r%s: %5d' % (mailfile, written))
                sys.stdout.flush()
                outfile.write(str(msg) + "\n")
        sys.stdout.write("\n")

    hamids = {}
    spamids = {}
    for feature in features:
        ham, spam = mapdb.get(feature, ([], []))
        if hamfile is not None:
            collect(hamids, ham)
        if spamfile is not None:
            collect(spamids, spam)

    # now run through each mailbox in hamids and spamids and print
    # matching messages to relevant ham or spam files
    dump(hamids, hamfile)
    dump(spamids, spamfile)
def untrain(h, msgs, is_spam):
    """Untrain bayes with all messages from a mailbox.

    The message count is shown on stdout every tenth message and again
    after the last one, then the progress line is terminated.
    """
    out = sys.stdout
    seen = 0
    for msg in mboxutils.getmbox(msgs):
        seen += 1
        if not seen % 10:
            out.write("\r%6d" % seen)
            out.flush()
        h.untrain(msg, is_spam)

    out.write("\r%6d" % seen)
    out.flush()
    # Same output as the bare Python 2 print statement: one newline.
    out.write("\n")
def main():
    """Command-line entry point: run the selected actions over each message.

    Options are read from ``sys.argv``.  With ``-n`` a new database is
    created and the program exits.  Otherwise every message of every named
    mailbox (``"-"`` when none is given) is passed to each selected action
    (``h.filter`` by default) and then written to stdout.
    """
    h = HammieFilter()
    opts, args = getopt.getopt(sys.argv[1:], 'hxd:p:nfgstGSo:',
                               ['help', 'examples', 'option='])

    # Flags that simply queue a filter method, in command-line order.
    queued_by_flag = {
        '-f': h.filter,
        '-g': h.train_ham,
        '-s': h.train_spam,
        '-t': h.filter_train,
        '-G': h.untrain_ham,
        '-S': h.untrain_spam,
    }

    actions = []
    create_newdb = False
    for opt, arg in opts:
        if opt in queued_by_flag:
            actions.append(queued_by_flag[opt])
        elif opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-x', '--examples'):
            examples()
        elif opt in ('-o', '--option'):
            Options.options.set_from_cmdline(arg, sys.stderr)
        elif opt == "-n":
            create_newdb = True

    h.dbname, h.usedb = storage.database_type(opts)

    if create_newdb:
        h.newdb()
        sys.exit(0)

    if not actions:
        actions = [h.filter]
    if not args:
        args = ["-"]

    reading_stdin = args == ["-"]
    for fname in args:
        for msg in mboxutils.getmbox(fname):
            for action in actions:
                action(msg)
            # On stdin, keep the "From " line only if the message had one.
            if reading_stdin:
                unixfrom = msg.get_unixfrom() is not None
            else:
                unixfrom = True
            sys.stdout.write(mboxutils.as_string(msg, unixfrom=unixfrom))
def extractmessages(features, mapdb, hamfile, spamfile):
    """extract messages which contain given features

    ``mapdb`` maps a feature to a ``(ham, spam)`` pair of mappings from
    mailbox name to message-ids.  Matching messages are printed to
    ``hamfile`` / ``spamfile``; a side whose file is ``None`` is skipped.
    """

    def collect(wanted, per_mbox):
        # Merge one feature's message-ids into the per-mailbox sets.
        for mbox in per_mbox:
            wanted.setdefault(mbox, set()).update(per_mbox.get(mbox, set()))

    def dump(wanted, outfile):
        # Copy the wanted messages of every mailbox to outfile, showing a
        # running count on stdout.
        for mailfile, msgids in wanted.items():
            written = 0
            for msg in getmbox(mailfile):
                if msg.get("message-id") not in msgids:
                    continue
                written += 1
                sys.stdout.write('\r%s: %5d' % (mailfile, written))
                sys.stdout.flush()
                print(msg, file=outfile)
        print()

    hamids = {}
    spamids = {}
    for feature in features:
        ham, spam = mapdb.get(feature, ([], []))
        if hamfile is not None:
            collect(hamids, ham)
        if spamfile is not None:
            collect(spamids, spam)

    dump(hamids, hamfile)
    dump(spamids, spamfile)
def read_emails_from_disk(data_folder, categories=None, email_charset="latin1"):
    """Read emails from files in folders.

    Default email_charset="latin1", for CSDMC2010_SPAM
    email_charset="iso-8859-1"

    Every sub-directory of ``data_folder`` (restricted to ``categories``
    when given) is opened as a mailbox.  Messages from a folder whose name
    starts with 'h' or 'H' get target 0 (ham), all others target 1 (spam).
    Returns a ``Bunch`` with ``data``, ``filenames`` (always empty here),
    ``categories``, ``targets`` and ``DESCR``.
    """
    # read data from emails
    folders = []
    for entry in sorted(listdir(data_folder)):
        if isdir(join(data_folder, entry)):
            folders.append(entry)

    if categories is None:
        categories = list(folders)
    else:
        folders = [entry for entry in folders if entry in categories]

    # encoding mess {I know nothing about the magic it does.}
    from cStringIO import StringIO
    from email.generator import Generator
    sink = StringIO()
    flattener = Generator(sink, mangle_from_=False, maxheaderlen=60)

    filenames = []
    targets = []
    raw_messages = []
    for folder in folders:
        folder_path = join(data_folder, folder)
        mbox = mboxutils.getmbox(folder_path)
        print("Reading emails from folder %s" % folder_path)
        # set targets here: 0 is ham, 1 is spam
        label = 0 if folder[0].lower() == 'h' else 1
        for msg in mbox:
            # The flattened text written to sink is never read back.
            flattener.flatten(msg)
            raw_messages.append(msg.as_string())
            targets.append(label)

    data = [text.decode(email_charset, 'strict') for text in raw_messages]

    return Bunch(data=data,
                 filenames=filenames,
                 categories=categories,
                 targets=targets,
                 DESCR='Data from E-mails')
def score(h, msgs, reverse=0):
    """Score (judge) all messages from a mailbox.

    A message whose probability from ``h.score`` is at or below
    ``HAM_THRESHOLD`` counts as ham.  Every other message is converted to
    a string and handed to the module-level ``doc_clf``; a true ``predict``
    result counts it as spam, otherwise as ham.  Returns
    ``(spams, hams, unsures)``; ``unsures`` is never incremented here and
    ``reverse`` is not consulted.
    """
    global doc_clf
    global charset
    from cStringIO import StringIO
    from email.generator import Generator

    sink = StringIO()
    flattener = Generator(sink, mangle_from_=False, maxheaderlen=60)

    # XXX The reporting needs work!
    mbox = mboxutils.getmbox(msgs)
    spams = hams = unsures = 0
    sys.stdout.write("Scoring now: ")
    for i, msg in enumerate(mbox, 1):
        prob, clues = h.score(msg, True)
        if prob <= HAM_THRESHOLD:
            hams += 1
        else:
            # The flattened text written to sink is never read back.
            flattener.flatten(msg)
            msg = msg.as_string()
            if doc_clf.predict(msg):
                spams += 1
            else:
                hams += 1
        sys.stdout.write("\r${0}".format(i))
        sys.stdout.flush()
    return (spams, hams, unsures)
options.set_from_cmdline(arg, sys.stderr) dbname, usedb = storage.database_type(opts) bayes = storage.open_storage(dbname, usedb) bayes.load() if not args: args = ["-"] for fname in args: mbox = mboxutils.getmbox(fname) for msg in mbox: print ShowClues(bayes, msg, markup) if __name__ == "__main__": opts, args = getopt.getopt(sys.argv[1:], 'hmd:p:o:', ['help', 'option=', 'markup']) markup = False for opt, arg in opts: if opt in ('-m', '--markup'):
def main():
    """Distribute the messages of the input mailboxes over ``n`` directories.

    Options (from ``sys.argv``): ``-n N`` number of directories (required,
    greater than 1), ``-s SEED`` random seed, ``-g`` glob-expand the input
    paths, ``-d`` skip messages whose MD5 checksum was already seen, ``-v``
    progress dots and a summary, ``-h/--help`` usage.  The last positional
    argument is the output base path; directories ``<base>1`` .. ``<base>N``
    are created as needed and each message is written to a randomly chosen
    one under a running counter as its file name.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'dhgn:s:v', ['help'])
    except getopt.error as msg:
        usage(1, msg)

    doglob = False
    n = None
    verbose = False
    delete_dups = False
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt == '-g':
            doglob = True
        elif opt == '-s':
            random.seed(int(arg))
        elif opt == '-n':
            n = int(arg)
        elif opt == '-v':
            verbose = True
        elif opt == '-d':
            delete_dups = True

    if n is None or n <= 1:
        usage(1, "an -n value > 1 is required")

    if len(args) < 2:
        usage(1, "input mbox name and output base path are required")

    inputpaths, outputbasepath = args[:-1], args[-1]

    outdirs = [outputbasepath + ("%d" % i) for i in range(1, n+1)]
    for outdir in outdirs:
        if not os.path.isdir(outdir):
            os.makedirs(outdir)

    counter = 0
    cksums = set()
    skipped = 0
    for inputpath in inputpaths:
        if doglob:
            inpaths = glob.glob(inputpath)
        else:
            inpaths = [inputpath]

        for inpath in inpaths:
            mbox = mboxutils.getmbox(inpath)
            for msg in mbox:
                # md5() and the binary output file both need bytes; passing
                # the text form raised TypeError.  surrogateescape turns
                # escaped undecodable bytes back into the original bytes.
                rawbytes = str(msg).encode('utf-8', 'surrogateescape')
                cksum = md5(rawbytes).hexdigest()
                if delete_dups and cksum in cksums:
                    skipped += 1
                    continue
                cksums.add(cksum)
                i = random.randrange(n)
                counter += 1
                # The with-block closes the file even if the write fails.
                with open('%s/%d' % (outdirs[i], counter), 'wb') as msgfile:
                    msgfile.write(rawbytes)
                if verbose:
                    if counter % 100 == 0:
                        sys.stdout.write('.')
                        sys.stdout.flush()

    if verbose:
        print()
        print(counter, "messages split into", n, "directories")
        if skipped:
            print("skipped", skipped, "duplicate messages")
def train(store, ham, spam, maxmsgs, maxrounds, tdict, reverse, verbose):
    """Train ``store`` on misclassified messages until a round has no misses.

    Each round reads the ``ham`` and ``spam`` mailboxes in lock step (one
    ham, one spam), scores each message, and trains only on those that land
    on the wrong side of the configured cutoff.  Rounds repeat until one
    produces no misses or ``maxrounds`` is reached.

    Args:
        store: classifier providing ``spamprob(tokens)`` and
            ``learn(tokens, is_spam)``.
        ham, spam: mailbox specifications for ``mboxutils.getmbox``.
        maxmsgs: stop a round after this many messages (0 means no limit).
        maxrounds: maximum number of rounds.
        tdict: dict updated in place; message-ids that were trained on or
            left unread in the final round are set to True.
        reverse: when true, each mailbox is read back to front.
        verbose: when true, round banners and misses go to stderr.
    """
    smisses = hmisses = round = 0
    ham_cutoff = Options.options["Categorization", "ham_cutoff"]
    spam_cutoff = Options.options["Categorization", "spam_cutoff"]

    while round < maxrounds and (hmisses or smisses or round == 0):
        hambone = iter(mboxutils.getmbox(ham))
        spamcan = iter(mboxutils.getmbox(spam))
        if reverse:
            hambone = reversed(list(hambone))
            spamcan = reversed(list(spamcan))
        round += 1

        if verbose:
            sys.stderr.write("*** round %s ***\n" % round)

        hmisses = smisses = nmsgs = 0
        start = datetime.datetime.now()
        try:
            while not maxmsgs or nmsgs < maxmsgs:
                hammsg = next(hambone)
                spammsg = next(spamcan)

                nmsgs += 2
                sys.stdout.write("\r%5d" % nmsgs)
                sys.stdout.flush()

                score = store.spamprob(tokenize(hammsg))
                if score > ham_cutoff:
                    if verbose:
                        sys.stderr.write("miss ham: %.6f %s\n" %
                                         (score, hammsg["message-id"]))
                    hmisses += 1
                    tdict[hammsg["message-id"]] = True
                    store.learn(tokenize(hammsg), False)

                score = store.spamprob(tokenize(spammsg))
                if score < spam_cutoff:
                    if verbose:
                        sys.stderr.write("miss spam: %.6f %s\n" %
                                         (score, spammsg["message-id"]))
                    smisses += 1
                    tdict[spammsg["message-id"]] = True
                    store.learn(tokenize(spammsg), True)
        except StopIteration:
            # One of the mailboxes ran out: the round is over.
            pass

        delta = datetime.datetime.now() - start
        # Float divisor: integer division on Python 2 always dropped the
        # fractional part of the elapsed time.
        seconds = delta.seconds + delta.microseconds / 1000000.0

        print("\rround: %2d, msgs: %4d, ham misses: %3d, spam misses: %3d, %.1fs" %
              (round, nmsgs, hmisses, smisses, seconds))

    # We count all untrained messages so the user knows what was skipped.
    # We also tag them for saving so we don't lose messages which might have
    # value in a future run
    nhamleft = 0
    for msg in hambone:
        tdict[msg["message-id"]] = True
        nhamleft += 1
    if nhamleft:
        print("%d untrained hams" % nhamleft)

    nspamleft = 0
    for msg in spamcan:
        tdict[msg["message-id"]] = True
        nspamleft += 1
    if nspamleft:
        print("%d untrained spams" % nspamleft)
from six.moves import xrange


def test_spambayes(loops, messages, ham_classifier):
    """Score every message ``loops`` times and return the elapsed time.

    The messages are scored once, untimed, beforehand.  The result is the
    difference between two ``perf.perf_counter()`` readings taken around
    the timed loops.
    """
    # Prime the pump. This still leaves some hot functions uncompiled; these
    # will be noticed as hot during the timed loops below.
    for warmup_msg in messages:
        ham_classifier.score(warmup_msg)

    # Build the loop range before the first clock reading so that it is
    # not part of the measurement.
    repetitions = xrange(loops)
    started = perf.perf_counter()

    for _ in repetitions:
        for msg in messages:
            ham_classifier.score(msg)

    return perf.perf_counter() - started


if __name__ == "__main__":
    runner = perf.text_runner.TextRunner(name='spambayes')
    runner.metadata['description'] = "Run the SpamBayes benchmark."

    # The fixtures live in the "data" directory next to this script.
    data_dir = os.path.join(os.path.dirname(__file__), "data")
    mailbox = os.path.join(data_dir, "spambayes_mailbox")
    ham_data = os.path.join(data_dir, "spambayes_hammie.pkl")

    msgs = list(mboxutils.getmbox(mailbox))
    ham_classifier = hammie.open(ham_data, "pickle", "r")

    runner.bench_sample_func(test_spambayes, msgs, ham_classifier)
def train(store, hambox, spambox, maxmsgs, maxrounds, tdict, reverse, verbose,
          ratio):
    """Train ``store`` on misclassified messages until a round has no misses.

    Ham and spam are interleaved according to a spam:ham ratio.  Each
    message is scored and trained on only if it lands on the wrong side of
    the configured cutoff.  Rounds repeat until one produces no misses or
    ``maxrounds`` is reached.

    Args:
        store: classifier providing ``spamprob(tokens)`` and
            ``learn(tokens, is_spam)``.
        hambox, spambox: mailbox specifications for ``mboxutils.getmbox``.
        maxmsgs: stop a round after this many messages (0 means no limit).
        maxrounds: maximum number of rounds.
        tdict: dict updated in place; message-ids that were trained on, or
            that were left unread in the final round and are misclassified,
            are set to True.
        reverse: when true, each mailbox is processed back to front.
        verbose: when true, round banners and misses go to stderr.
        ratio: ``(rspam, rham)`` pair, or a false value to use the actual
            message counts.
    """
    round_no = 0
    ham_cutoff = Options.options["Categorization", "ham_cutoff"]
    spam_cutoff = Options.options["Categorization", "spam_cutoff"]

    # list-ify ham and spam iterators immediately.  We don't really want to
    # fetch the messages multiple times, and this is no worse than what happened
    # before when -R was passed.
    hambone_ = list(mboxutils.getmbox(hambox))
    spamcan_ = list(mboxutils.getmbox(spambox))

    if reverse:
        hambone_.reverse()
        spamcan_.reverse()

    nspam, nham = len(spamcan_), len(hambone_)
    if ratio:
        rspam, rham = ratio
        # If the actual ratio of spam to ham in the database is better than
        # what was asked for, use that better ratio.
        if (rspam > rham) == (rspam * nham > rham * nspam):
            rspam, rham = nspam, nham
    else:
        # Without a requested ratio rspam/rham used to be unbound (NameError
        # in the round loop); fall back to the actual counts.
        rspam, rham = nspam, nham

    # define some indexing constants
    ham = 0
    spam = 1
    name = ('ham', 'spam')
    misses = [0, 0]

    def misclassified(is_spam, score):
        # Spam scoring below the spam cutoff, or ham above the ham cutoff.
        if is_spam:
            return score < spam_cutoff
        return score > ham_cutoff

    # Bound before the loop so the leftover accounting below also works
    # when no round runs at all (maxrounds <= 0).
    hambone = iter(hambone_)
    spamcan = iter(spamcan_)

    while round_no < maxrounds and (misses[ham] or misses[spam]
                                    or round_no == 0):
        round_no += 1
        if verbose:
            sys.stderr.write("*** round %s ***\n" % round_no)

        start = datetime.datetime.now()
        hambone = iter(hambone_)
        spamcan = iter(spamcan_)
        i = [0, 0]
        msgs_processed = 0
        misses = [0, 0]
        training_sets = [hambone, spamcan]

        while not maxmsgs or msgs_processed < maxmsgs:
            # should the next message come from hambone or spamcan?
            train_spam = i[ham] * rspam > i[spam] * rham

            try:
                train_msg = next(training_sets[train_spam])
            except StopIteration:
                break

            i[train_spam] += 1
            msgs_processed += 1

            sys.stdout.write("\r%5d" % msgs_processed)
            sys.stdout.flush()

            tokens = list(tokenize(train_msg))
            score = store.spamprob(tokens)
            selector = train_msg["message-id"] or train_msg["subject"]

            if misclassified(train_spam, score) and selector is not None:
                if verbose:
                    sys.stderr.write("\tmiss %s: %.6f %s\n" % (
                        name[train_spam], score, selector))

                misses[train_spam] += 1
                tdict[train_msg["message-id"]] = True
                store.learn(tokens, train_spam)

        delta = datetime.datetime.now() - start
        # Float divisor: integer division on Python 2 always dropped the
        # fractional part of the elapsed time.
        seconds = delta.seconds + delta.microseconds / 1000000.0

        print("\rround: %2d, msgs: %4d, ham misses: %3d, spam misses: %3d, %.1fs" %
              (round_no, msgs_processed, misses[0], misses[1], seconds))

    training_sets = [hambone, spamcan]

    # We count all untrained messages so the user knows what was skipped.
    # We also tag them for saving so we don't lose messages which might have
    # value in a future run
    for is_spam in ham, spam:
        nleft = 0
        for msg in training_sets[is_spam]:
            score = store.spamprob(tokenize(msg))
            if misclassified(is_spam, score):
                tdict[msg["message-id"]] = True
                nleft += 1
        if nleft:
            print("%d untrained %ss" % (nleft, name[is_spam]))
try: mapd = pickle_read(mapfile) except IOError: usage("Mapfile %s does not exist" % mapfile) return 1 if not features and not args: usage("Require at least one feature (-f) arg or one message file") return 1 if not features: # extract significant tokens from each message and identify # where they came from for f in args: for msg in getmbox(f): evidence = msg.get("X-Spambayes-Evidence", "") evidence = re.sub(r"\s+", " ", evidence) l = [e.rsplit(": ", 1)[0] for e in evidence.split("; ")[2:]] for s in l: try: s = make_header(decode_header(s)).__unicode__() except: s = unicode(s, 'us-ascii', 'replace') features.add(s) if not features: usage("No X-Spambayes-Evidence headers found") return 1 if spamfile is not None:
outdirs = [outputbasepath + ("%d" % i) for i in range(1, n + 1)] for dir in outdirs: if not os.path.isdir(dir): os.makedirs(dir) counter = 0 cksums = set() skipped = 0 for inputpath in inputpaths: if doglob: inpaths = glob.glob(inputpath) else: inpaths = [inputpath] for inpath in inpaths: mbox = mboxutils.getmbox(inpath) for msg in mbox: astext = str(msg) cksum = md5(astext).hexdigest() if delete_dups and cksum in cksums: skipped += 1 continue cksums.add(cksum) i = random.randrange(n) #assert astext.endswith('\n') counter += 1 msgfile = open('%s/%d' % (outdirs[i], counter), 'wb') msgfile.write(astext) msgfile.close() if verbose: if counter % 100 == 0:
def test_spambayes(iterations, timer, messages, ham_classifier):
    """Time ``iterations`` passes of scoring every message.

    The messages are scored once, untimed, beforehand.  Returns a list
    with one entry per iteration: the difference between the ``timer()``
    readings taken after and before that pass.
    """
    # Prime the pump. This still leaves some hot functions uncompiled; these
    # will be noticed as hot during the timed loops below.
    for warmup_msg in messages:
        ham_classifier.score(warmup_msg)

    def timed_pass():
        # One full pass over the messages, bracketed by two clock readings.
        began = timer()
        for msg in messages:
            ham_classifier.score(msg)
        ended = timer()
        return ended - began

    return [timed_pass() for _ in xrange(iterations)]


if __name__ == "__main__":
    parser = optparse.OptionParser(
        usage="%prog [options]",
        description=("Run the SpamBayes benchmark."))
    util.add_standard_options_to(parser)
    options, args = parser.parse_args()

    # The fixtures live in the "data" directory next to this script.
    data_dir = os.path.join(os.path.dirname(__file__), "data")
    mailbox = os.path.join(data_dir, "spambayes_mailbox")
    ham_data = os.path.join(data_dir, "spambayes_hammie.pkl")

    msgs = list(mboxutils.getmbox(mailbox))
    ham_classifier = hammie.open(ham_data, "pickle", "r")

    util.run_benchmark(options, options.num_runs, test_spambayes,
                       msgs, ham_classifier)
os.unlink(dbname) except OSError: pass store = storage.open_storage(dbname, usedb) tdict = {} train(store, ham, spam, maxmsgs, maxrounds, tdict, reverse, verbose) store.store() if cullext is not None: print "writing new ham mbox..." n = m = 0 newham = file(ham + cullext, "w") for msg in mboxutils.getmbox(ham): m += 1 if msg["message-id"] in tdict: newham.write(str(msg)) n += 1 sys.stdout.write("\r%5d of %5d" % (n, m)) sys.stdout.flush() sys.stdout.write("\n") newham.close() print "writing new spam mbox..." n = m = 0 newspam = file(spam + cullext, "w") for msg in mboxutils.getmbox(spam): m += 1 if msg["message-id"] in tdict:
def train(store, hambox, spambox, maxmsgs, maxrounds, tdict, reverse, verbose,
          ratio):
    """Train ``store`` on misclassified messages until a round has no misses.

    Ham and spam are interleaved according to a spam:ham ratio.  Each
    message is scored and trained on only if it lands on the wrong side of
    the configured cutoff.  Rounds repeat until one produces no misses or
    ``maxrounds`` is reached.

    Args:
        store: classifier providing ``spamprob(tokens)`` and
            ``learn(tokens, is_spam)``.
        hambox, spambox: mailbox specifications for ``mboxutils.getmbox``.
        maxmsgs: stop a round after this many messages (0 means no limit).
        maxrounds: maximum number of rounds.
        tdict: dict updated in place; message-ids that were trained on, or
            that were left unread in the final round and are misclassified,
            are set to True.
        reverse: when true, each mailbox is processed back to front.
        verbose: when true, round banners and misses go to stderr.
        ratio: ``(rspam, rham)`` pair, or a false value to use the actual
            message counts.
    """
    round_no = 0
    ham_cutoff = Options.options["Categorization", "ham_cutoff"]
    spam_cutoff = Options.options["Categorization", "spam_cutoff"]

    # Read both mailboxes once; every round iterates over these lists.
    hambone_ = list(mboxutils.getmbox(hambox))
    spamcan_ = list(mboxutils.getmbox(spambox))

    if reverse:
        hambone_.reverse()
        spamcan_.reverse()

    if ratio:
        rspam, rham = ratio
    else:
        rspam, rham = len(spamcan_), len(hambone_)

    # Index constants for the two-element lists below.
    ham = 0
    spam = 1
    name = ('ham', 'spam')
    misses = [0, 0]

    def misclassified(is_spam, score):
        # Spam scoring below the spam cutoff, or ham above the ham cutoff.
        if is_spam:
            return score < spam_cutoff
        return score > ham_cutoff

    # Bound before the loop so the leftover accounting below also works
    # when no round runs at all (maxrounds <= 0).
    hambone = iter(hambone_)
    spamcan = iter(spamcan_)

    while round_no < maxrounds and (misses[ham] or misses[spam]
                                    or round_no == 0):
        round_no += 1
        if verbose:
            sys.stderr.write("*** round %s ***\n" % round_no)

        start = datetime.datetime.now()
        hambone = iter(hambone_)
        spamcan = iter(spamcan_)
        i = [0, 0]
        msgs_processed = 0
        misses = [0, 0]
        training_sets = [hambone, spamcan]

        while not maxmsgs or msgs_processed < maxmsgs:
            # Pick the side that is behind with respect to the ratio.
            train_spam = i[ham] * rspam > i[spam] * rham

            try:
                train_msg = next(training_sets[train_spam])
            except StopIteration:
                break

            i[train_spam] += 1
            msgs_processed += 1

            sys.stdout.write("\r%5d" % msgs_processed)
            sys.stdout.flush()

            tokens = list(tokenize(train_msg))
            score = store.spamprob(tokens)
            selector = train_msg["message-id"] or train_msg["subject"]

            if misclassified(train_spam, score) and selector is not None:
                if verbose:
                    sys.stderr.write("\tmiss %s: %.6f %s\n" % (
                        name[train_spam], score, selector))

                misses[train_spam] += 1
                tdict[train_msg["message-id"]] = True
                store.learn(tokens, train_spam)

        delta = datetime.datetime.now() - start
        # Float divisor: integer division on Python 2 always dropped the
        # fractional part of the elapsed time.
        seconds = delta.seconds + delta.microseconds / 1000000.0

        print("\rround: %2d, msgs: %4d, ham misses: %3d, spam misses: %3d, %.1fs" %
              (round_no, msgs_processed, misses[0], misses[1], seconds))

    training_sets = [hambone, spamcan]

    # Tag and count the misclassified messages the final round never
    # reached.
    for is_spam in ham, spam:
        nleft = 0
        for msg in training_sets[is_spam]:
            score = store.spamprob(tokenize(msg))
            if misclassified(is_spam, score):
                tdict[msg["message-id"]] = True
                nleft += 1
        if nleft:
            print("%d untrained %ss" % (nleft, name[is_spam]))
outdirs = [outputbasepath + ("%d" % i) for i in range(1, n+1)] for dir in outdirs: if not os.path.isdir(dir): os.makedirs(dir) counter = 0 cksums = set() skipped = 0 for inputpath in inputpaths: if doglob: inpaths = glob.glob(inputpath) else: inpaths = [inputpath] for inpath in inpaths: mbox = mboxutils.getmbox(inpath) for msg in mbox: astext = str(msg) cksum = md5(astext).hexdigest() if delete_dups and cksum in cksums: skipped += 1 continue cksums.add(cksum) i = random.randrange(n) #assert astext.endswith('\n') counter += 1 msgfile = open('%s/%d' % (outdirs[i], counter), 'wb') msgfile.write(astext) msgfile.close() if verbose: if counter % 100 == 0:
def main(profiling=False):
    """Command-line entry point: run the selected actions over each message.

    Options are read from ``sys.argv``.  Every message of every named
    mailbox (``"-"`` when none is given) is passed to each selected action
    and then written to stdout.  ``profiling`` is true only for the nested
    call made under ``-P``, so the profiler is not started twice.
    """
    h = HammieFilter()
    opts, args = getopt.getopt(sys.argv[1:], 'hvxd:p:nfgstGSo:P',
                               ['help', 'version', 'examples', 'option='])

    # Flags that simply queue a filter method, in command-line order.
    queued_by_flag = {
        '-f': h.filter,
        '-g': h.train_ham,
        '-s': h.train_spam,
        '-t': h.filter_train,
        '-G': h.untrain_ham,
        '-S': h.untrain_spam,
    }

    actions = []
    create_newdb = False
    for opt, arg in opts:
        if opt in queued_by_flag:
            actions.append(queued_by_flag[opt])
        elif opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-v', '--version'):
            version()
        elif opt in ('-x', '--examples'):
            examples()
        elif opt in ('-o', '--option'):
            Options.options.set_from_cmdline(arg, sys.stderr)
        elif opt == '-P':
            if profiling:
                continue
            try:
                import cProfile
            except ImportError:
                # No profiler available: carry on unprofiled.
                continue
            return cProfile.run("main(True)")
        elif opt == "-n":
            create_newdb = True

    h.dbname, h.usedb = storage.database_type(opts)

    if create_newdb or not os.path.exists(h.dbname):
        h.newdb()
        sys.stderr.write("Created new database in %s\n" % (h.dbname,))
        if create_newdb:
            sys.exit(0)

    if not actions:
        actions = [h.filter]
    if not args:
        args = ["-"]

    reading_stdin = args == ["-"]
    for fname in args:
        for msg in mboxutils.getmbox(fname):
            for action in actions:
                action(msg)
            # On stdin, keep the "From " line only if the message had one.
            if reading_stdin:
                unixfrom = msg.get_unixfrom() is not None
            else:
                unixfrom = True
            sys.stdout.write(mboxutils.as_string(msg, unixfrom=unixfrom))
Run a canned mailbox through a SpamBayes ham/spam classifier. """ import os.path import perf from spambayes import hammie, mboxutils __author__ = "[email protected] (Skip Montanaro)" __contact__ = "[email protected] (Collin Winter)" def bench_spambayes(ham_classifier, messages): for msg in messages: ham_classifier.score(msg) if __name__ == "__main__": runner = perf.Runner() runner.metadata['description'] = "Run the SpamBayes benchmark." data_dir = os.path.join(os.path.dirname(__file__), "data") mailbox = os.path.join(data_dir, "spambayes_mailbox") ham_data = os.path.join(data_dir, "spambayes_hammie.pkl") messages = list(mboxutils.getmbox(mailbox)) ham_classifier = hammie.open(ham_data, "pickle", "r") runner.bench_func('spambayes', bench_spambayes, ham_classifier, messages)
#!/bin/python

"""Wrapper script for testing the performance of SpamBayes.

Run a canned mailbox through a SpamBayes ham/spam classifier.
"""

import os.path

from spambayes import hammie, mboxutils


__author__ = "[email protected] (Skip Montanaro)"
__contact__ = "[email protected] (Collin Winter)"


def bench_spambayes(ham_classifier, messages):
    """Score every message in ``messages`` with ``ham_classifier``.

    The scores are discarded.
    """
    for message in messages:
        ham_classifier.score(message)


# The fixtures sit next to this script (not in a "data" sub-directory).
# "small_mailbox" was used as an alternative mailbox at some point.
data_dir = os.path.dirname(__file__)
mailbox = os.path.join(data_dir, "spambayes_mailbox")
ham_data = os.path.join(data_dir, "spambayes_hammie.pkl")

# Everything below runs at import time: load the fixtures, then make one
# scoring pass over the whole mailbox.
messages = list(mboxutils.getmbox(mailbox))
ham_classifier = hammie.open(ham_data, "pickle", "r")
bench_spambayes(ham_classifier, messages)
def train(store, hambox, spambox, maxmsgs, maxrounds, tdict, reverse, verbose,
          ratio):
    """Train ``store`` on misclassified messages until a round has no misses.

    Ham and spam are interleaved according to a spam:ham ratio.  Each
    message is scored and trained on only if it lands on the wrong side of
    the configured cutoff.  Rounds repeat until one produces no misses or
    ``maxrounds`` is reached.

    Args:
        store: classifier providing ``spamprob(tokens)`` and
            ``learn(tokens, is_spam)``.
        hambox, spambox: mailbox specifications for ``mboxutils.getmbox``.
        maxmsgs: stop a round after this many messages (0 means no limit).
        maxrounds: maximum number of rounds.
        tdict: dict updated in place; message-ids that were trained on, or
            that were left unread in the final round and are misclassified,
            are set to True.
        reverse: when true, each mailbox is processed back to front.
        verbose: when true, round banners and misses go to stderr.
        ratio: ``(rspam, rham)`` pair, or a false value to use the actual
            message counts.
    """
    round_no = 0
    ham_cutoff = Options.options["Categorization", "ham_cutoff"]
    spam_cutoff = Options.options["Categorization", "spam_cutoff"]

    # list-ify ham and spam iterators immediately.  We don't really want to
    # fetch the messages multiple times, and this is no worse than what happened
    # before when -R was passed.
    hambone_ = list(mboxutils.getmbox(hambox))
    spamcan_ = list(mboxutils.getmbox(spambox))

    if reverse:
        hambone_.reverse()
        spamcan_.reverse()

    nspam, nham = len(spamcan_), len(hambone_)
    if ratio:
        rspam, rham = ratio
        # If the actual ratio of spam to ham in the database is better than
        # what was asked for, use that better ratio.
        if (rspam > rham) == (rspam * nham > rham * nspam):
            rspam, rham = nspam, nham
    else:
        # Without a requested ratio rspam/rham used to be unbound (NameError
        # in the round loop); fall back to the actual counts.
        rspam, rham = nspam, nham

    # define some indexing constants
    ham = 0
    spam = 1
    name = ('ham', 'spam')
    misses = [0, 0]

    def misclassified(is_spam, score):
        # Spam scoring below the spam cutoff, or ham above the ham cutoff.
        if is_spam:
            return score < spam_cutoff
        return score > ham_cutoff

    # Bound before the loop so the leftover accounting below also works
    # when no round runs at all (maxrounds <= 0).
    hambone = iter(hambone_)
    spamcan = iter(spamcan_)

    while round_no < maxrounds and (misses[ham] or misses[spam]
                                    or round_no == 0):
        round_no += 1
        if verbose:
            sys.stderr.write("*** round %s ***\n" % round_no)

        start = datetime.datetime.now()
        hambone = iter(hambone_)
        spamcan = iter(spamcan_)
        i = [0, 0]
        msgs_processed = 0
        misses = [0, 0]
        training_sets = [hambone, spamcan]

        while not maxmsgs or msgs_processed < maxmsgs:
            # should the next message come from hambone or spamcan?
            train_spam = i[ham] * rspam > i[spam] * rham

            try:
                train_msg = next(training_sets[train_spam])
            except StopIteration:
                break

            i[train_spam] += 1
            msgs_processed += 1

            sys.stdout.write("\r%5d" % msgs_processed)
            sys.stdout.flush()

            tokens = list(tokenize(train_msg))
            score = store.spamprob(tokens)
            selector = train_msg["message-id"] or train_msg["subject"]

            if misclassified(train_spam, score) and selector is not None:
                if verbose:
                    sys.stderr.write("\tmiss %s: %.6f %s\n" % (
                        name[train_spam], score, selector))

                misses[train_spam] += 1
                tdict[train_msg["message-id"]] = True
                store.learn(tokens, train_spam)

        delta = datetime.datetime.now() - start
        # Float divisor: integer division on Python 2 always dropped the
        # fractional part of the elapsed time.
        seconds = delta.seconds + delta.microseconds / 1000000.0

        print("\rround: %2d, msgs: %4d, ham misses: %3d, spam misses: %3d, %.1fs" %
              (round_no, msgs_processed, misses[0], misses[1], seconds))

    training_sets = [hambone, spamcan]

    # We count all untrained messages so the user knows what was skipped.
    # We also tag them for saving so we don't lose messages which might have
    # value in a future run
    for is_spam in ham, spam:
        nleft = 0
        for msg in training_sets[is_spam]:
            score = store.spamprob(tokenize(msg))
            if misclassified(is_spam, score):
                tdict[msg["message-id"]] = True
                nleft += 1
        if nleft:
            print("%d untrained %ss" % (nleft, name[is_spam]))