def test_newdb(self):
    """newdb() must discard any existing data and leave the handler closed."""
    # Seed an existing dbm classifier with one spam and one ham message.
    b = open_storage(TEMP_DBM_NAME, "dbm")
    b.learn(tokenize(spam1), True)
    b.learn(tokenize(good1), False)
    b.store()
    b.close()
    # Create the fresh database over the old one.
    self.h.newdb()
    # The handler's classifier should not be left open.
    self.assertEqual(self.h.h, None)
    # Reopening must show an empty (overwritten) database.
    b = open_storage(TEMP_DBM_NAME, "dbm")
    self.assertEqual(b.nham, 0)
    self.assertEqual(b.nspam, 0)
    b.close()
def open(filename, useDB="dbm", mode="r"):
    """Open a file, returning a Hammie instance.

    mode is used as the flag to open DBDict objects.
    'c' for read-write (create if needed), 'r' for read-only,
    'w' for read-write.
    """
    store = storage.open_storage(filename, useDB, mode)
    return Hammie(store, mode)
def test_dbm_export(self):
    """The exported CSV must mirror the dbm classifier's data exactly."""
    # Create a dbm classifier to export.
    bayes = DBDictClassifier(TEMP_DBM_NAME)
    # Stuff some messages in it so it's not empty.
    bayes.learn(tokenize(spam1), True)
    bayes.learn(tokenize(good1), False)
    # Save & Close.
    bayes.store()
    bayes.close()
    # Export.
    sb_dbexpimp.runExport(TEMP_DBM_NAME, "dbm", TEMP_CSV_NAME)
    # Reopen the original.
    bayes = open_storage(TEMP_DBM_NAME, "dbm")
    # Verify that the CSV holds all the original data (and, by using
    # the CSV module to open it, that it is valid CSV data).
    fp = open(TEMP_CSV_NAME, "rb")
    reader = sb_dbexpimp.csv.reader(fp)
    # First row holds the global ham/spam message counts.
    (nham, nspam) = reader.next()
    self.assertEqual(int(nham), bayes.nham)
    self.assertEqual(int(nspam), bayes.nspam)
    for (word, hamcount, spamcount) in reader:
        word = sb_dbexpimp.uunquote(word)
        self.assert_(word in bayes._wordinfokeys())
        wi = bayes._wordinfoget(word)
        self.assertEqual(int(hamcount), wi.hamcount)
        self.assertEqual(int(spamcount), wi.spamcount)
def main(args):
    """Dump a spambayes token database into a cdb of word probabilities.

    args: command-line arguments (after the program name); exactly one
    positional argument, the output cdb filename.  Returns a small-int
    exit status (0 ok, 1 usage error).
    """
    try:
        opts, args = getopt.getopt(args, "hd:p:", ["help", "database=", "pickle="])
    except getopt.GetoptError as msg:
        usage(msg)
        return 1
    if len(args) != 1:
        usage()
        return 1
    cdbname = args[0]
    dbname = usedb = None
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
    # -d/-p are interpreted by database_type(), not in the loop above.
    dbname, usedb = storage.database_type(opts)
    store = storage.open_storage(dbname, usedb)
    bayes = CdbClassifier()
    # Collect (word, probability-as-string) pairs for the cdb writer.
    items = []
    for word in store._wordinfokeys():
        record = store._wordinfoget(word)
        prob = store.probability(record)
        items.append((word, str(prob)))
    cdbfile = open(cdbname, "wb")
    cdb.cdb_make(cdbfile, items)
    cdbfile.close()
def test_import_to_dbm(self):
    """Importing a CSV with replace=True must produce a matching dbm DB."""
    # Create a CSV file to import.
    temp = open(TEMP_CSV_NAME, "wb")
    temp.write("3,4\n")
    csv_data = { "this": (2, 1), "is": (0, 1), "a": (3, 4), 'test': (1, 1), "of": (1, 0), "the": (1, 2), "import": (3, 1) }
    for word, (ham, spam) in csv_data.items():
        temp.write("%s,%s,%s\n" % (word, ham, spam))
    temp.close()
    # True => replace (not merge) any existing database.
    sb_dbexpimp.runImport(TEMP_DBM_NAME, "dbm", True, TEMP_CSV_NAME)
    # Open the converted file and verify that it has all the data from
    # the CSV file (and by opening it, that it is a valid dbm file).
    bayes = open_storage(TEMP_DBM_NAME, "dbm")
    self.assertEqual(bayes.nham, 3)
    self.assertEqual(bayes.nspam, 4)
    for word, (ham, spam) in csv_data.items():
        word = sb_dbexpimp.uquote(word)
        self.assert_(word in bayes._wordinfokeys())
        wi = bayes._wordinfoget(word)
        self.assertEqual(wi.hamcount, ham)
        self.assertEqual(wi.spamcount, spam)
def open(filename, useDB="dbm", mode='r'):
    """Open a file, returning a Hammie instance.

    mode is used as the flag to open DBDict objects.
    'c' for read-write (create if needed), 'r' for read-only,
    'w' for read-write.
    """
    return Hammie(storage.open_storage(filename, useDB, mode), mode)
def main(args):
    """Print spam counts for tokens taken from the command line or stdin.

    Returns 0 on success, 1 on a usage error.
    """
    try:
        opts, args = getopt.getopt(args, "hrto:", ["help", "re", "tokenize", "option="])
    except getopt.GetoptError as msg:
        usage(msg)
        return 1
    usere = False
    tokenizestdin = False
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        elif opt in ("-r", "--re"):
            usere = True
        elif opt in ("-t", "--tokenize"):
            tokenizestdin = True
        elif opt in ("-o", "--option"):
            options.set_from_cmdline(arg, sys.stderr)
    # -r treats args as regexes; -t tokenizes a message from stdin.
    # The two modes are mutually exclusive.
    if usere and tokenizestdin:
        usage("-r and -t may not be used at the same time")
        return 1
    dbname, usedb = database_type(opts)
    db = open_storage(dbname, usedb)
    if tokenizestdin:
        args = tokenize(sys.stdin)
    if args:
        print_spamcounts(args, db, usere)
        return 0
    else:
        usage("need tokens on cmd line or -t w/ msg on stdin")
        return 1
def test_merge_to_pickle(self):
    """Merging CSV data into an existing pickle must sum all the counts."""
    bayes = PickledClassifier(TEMP_PICKLE_NAME)
    bayes.learn(tokenize(spam1), True)
    bayes.learn(tokenize(good1), False)
    bayes.store()
    # Build the CSV to merge in.
    nham, nspam = 3,4
    temp = open(TEMP_CSV_NAME, "wb")
    temp.write("%d,%d\n" % (nham, nspam))
    csv_data = {"this":(2,1), "is":(0,1), "a":(3,4), 'test':(1,1), "of":(1,0), "the":(1,2), "import":(3,1)}
    for word, (ham, spam) in csv_data.items():
        temp.write("%s,%s,%s\n" % (word, ham, spam))
    temp.close()
    # False => merge into the existing database rather than replace it.
    sb_dbexpimp.runImport(TEMP_PICKLE_NAME, "pickle", False, TEMP_CSV_NAME)
    bayes2 = open_storage(TEMP_PICKLE_NAME, "pickle")
    # Message counts add.
    self.assertEqual(bayes2.nham, nham + bayes.nham)
    self.assertEqual(bayes2.nspam, nspam + bayes.nspam)
    # Per-word counts add across the union of both word sets.
    words = bayes._wordinfokeys()
    words.extend(csv_data.keys())
    for word in words:
        word = sb_dbexpimp.uquote(word)
        self.assert_(word in bayes2._wordinfokeys())
        h, s = csv_data.get(word, (0,0))
        wi = bayes._wordinfoget(word)
        if wi:
            h += wi.hamcount
            s += wi.spamcount
        wi2 = bayes2._wordinfoget(word)
        self.assertEqual(h, wi2.hamcount)
        self.assertEqual(s, wi2.spamcount)
def createWorkers(self):
    """Using the options that were initialised in __init__ and then
    possibly overridden by the driver code, create the Bayes object,
    the Corpuses, the Trainers and so on."""
    print "Loading database...",
    if self.isTest:
        self.useDB = "pickle"
        self.DBName = '_pop3proxy_test.pickle'   # This is never saved.
    if not hasattr(self, "DBName"):
        self.DBName, self.useDB = storage.database_type([])
    self.bayes = storage.open_storage(self.DBName, self.useDB)
    self.buildStatusStrings()
    # Don't set up the caches and training objects when running the self-test,
    # so as not to clutter the filesystem.
    if not self.isTest:
        def ensureDir(dirname):
            # Create the directory if missing; "already exists" is fine.
            try:
                os.mkdir(dirname)
            except OSError, e:
                if e.errno != errno.EEXIST:
                    raise
        # Create/open the Corpuses.  Use small cache sizes to avoid hogging
        # lots of memory.
        sc = get_pathname_option("Storage", "spam_cache")
        hc = get_pathname_option("Storage", "ham_cache")
        uc = get_pathname_option("Storage", "unknown_cache")
        map(ensureDir, [sc, hc, uc])
        if self.gzipCache:
            factory = GzipFileMessageFactory()
        else:
            factory = FileMessageFactory()
        age = options["Storage", "cache_expiry_days"]*24*60*60
        self.spamCorpus = ExpiryFileCorpus(age, factory, sc, '[0123456789\-]*', cacheSize=20)
        self.hamCorpus = ExpiryFileCorpus(age, factory, hc, '[0123456789\-]*', cacheSize=20)
        self.unknownCorpus = ExpiryFileCorpus(age, factory, uc, '[0123456789\-]*', cacheSize=20)
        # Given that (hopefully) users will get to the stage
        # where they do not need to do any more regular training to
        # be satisfied with spambayes' performance, we expire old
        # messages from not only the trained corpora, but the unknown
        # as well.
        self.spamCorpus.removeExpiredMessages()
        self.hamCorpus.removeExpiredMessages()
        self.unknownCorpus.removeExpiredMessages()
        # Create the Trainers.
        self.spamTrainer = storage.SpamTrainer(self.bayes)
        self.hamTrainer = storage.HamTrainer(self.bayes)
        self.spamCorpus.addObserver(self.spamTrainer)
        self.hamCorpus.addObserver(self.hamTrainer)
def open(filename, useDB=True, mode="r"):
    """Open a file, returning a Hammie instance.

    If usedb is False, open as a pickle instead of a DBDict.

    mode is used as the flag to open DBDict objects.
    'c' for read-write (create if needed), 'r' for read-only,
    'w' for read-write.
    """
    # NOTE(review): useDB is a bool here, while later variants of this
    # function pass a string ("dbm"/"pickle") -- presumably open_storage
    # accepts both forms; confirm against the storage module.
    return Hammie(storage.open_storage(filename, useDB, mode))
def createWorkers(self):
    """Using the options that were initialised in __init__ and then
    possibly overridden by the driver code, create the Bayes object,
    the Corpuses, the Trainers and so on."""
    print "Loading database...",
    if self.isTest:
        self.useDB = "pickle"
        self.DBName = '_pop3proxy_test.pickle'   # This is never saved.
    if not hasattr(self, "DBName"):
        self.DBName, self.useDB = storage.database_type([])
    self.bayes = storage.open_storage(self.DBName, self.useDB)
    self.mdb = spambayes.message.Message().message_info_db
    # Load stats manager.
    self.stats = Stats.Stats(options, self.mdb)
    self.buildStatusStrings()
    # Don't set up the caches and training objects when running the self-test,
    # so as not to clutter the filesystem.
    if not self.isTest:
        # Create/open the Corpuses.  Use small cache sizes to avoid hogging
        # lots of memory.
        sc = get_pathname_option("Storage", "spam_cache")
        hc = get_pathname_option("Storage", "ham_cache")
        uc = get_pathname_option("Storage", "unknown_cache")
        map(storage.ensureDir, [sc, hc, uc])
        if self.gzipCache:
            factory = GzipFileMessageFactory()
        else:
            factory = FileMessageFactory()
        age = options["Storage", "cache_expiry_days"]*24*60*60
        self.spamCorpus = ExpiryFileCorpus(age, factory, sc, '[0123456789\-]*', cacheSize=20)
        self.hamCorpus = ExpiryFileCorpus(age, factory, hc, '[0123456789\-]*', cacheSize=20)
        self.unknownCorpus = ExpiryFileCorpus(age, factory, uc, '[0123456789\-]*', cacheSize=20)
        # Given that (hopefully) users will get to the stage
        # where they do not need to do any more regular training to
        # be satisfied with spambayes' performance, we expire old
        # messages from not only the trained corpora, but the unknown
        # as well.
        self.spamCorpus.removeExpiredMessages()
        self.hamCorpus.removeExpiredMessages()
        self.unknownCorpus.removeExpiredMessages()
        # Create the Trainers.
        self.spamTrainer = storage.SpamTrainer(self.bayes)
        self.hamTrainer = storage.HamTrainer(self.bayes)
        self.spamCorpus.addObserver(self.spamTrainer)
        self.hamCorpus.addObserver(self.hamTrainer)
def create_workers(self):
    """Using the options that were initialised in __init__ and then
    possibly overridden by the driver code, create the Bayes object,
    the Corpuses, the Trainers and so on."""
    if self.is_test:
        self.use_db = "pickle"
        self.db_name = '_core_server.pickle'   # This is never saved.
    if not hasattr(self, "db_name"):
        self.db_name, self.use_db = storage.database_type([])
    self.bayes = storage.open_storage(self.db_name, self.use_db)
    # Load stats manager.
    self.stats = Stats.Stats(options, spambayes.message.Message().message_info_db)
    self.build_status_strings()
    # Don't set up the caches and training objects when running the
    # self-test, so as not to clutter the filesystem.
    if not self.is_test:
        # Create/open the Corpuses.  Use small cache sizes to avoid
        # hogging lots of memory.
        sc = get_pathname_option("Storage", "core_spam_cache")
        hc = get_pathname_option("Storage", "core_ham_cache")
        uc = get_pathname_option("Storage", "core_unknown_cache")
        for d in [sc, hc, uc]:
            storage.ensureDir(d)
        if self.gzip_cache:
            factory = GzipFileMessageFactory()
        else:
            factory = FileMessageFactory()
        age = options["Storage", "cache_expiry_days"]*24*60*60
        self.spamCorpus = ExpiryFileCorpus(age, factory, sc, '[0123456789\-]*', cacheSize=20)
        self.hamCorpus = ExpiryFileCorpus(age, factory, hc, '[0123456789\-]*', cacheSize=20)
        self.unknownCorpus = ExpiryFileCorpus(age, factory, uc, '[0123456789\-]*', cacheSize=20)
        # Given that (hopefully) users will get to the stage
        # where they do not need to do any more regular training to
        # be satisfied with spambayes' performance, we expire old
        # messages from not only the trained corpora, but the unknown
        # as well.
        self.spamCorpus.removeExpiredMessages()
        self.hamCorpus.removeExpiredMessages()
        self.unknownCorpus.removeExpiredMessages()
        # Create the Trainers.
        self.spam_trainer = storage.SpamTrainer(self.bayes)
        self.ham_trainer = storage.HamTrainer(self.bayes)
        self.spamCorpus.addObserver(self.spam_trainer)
        self.hamCorpus.addObserver(self.ham_trainer)
def test_newdb(self):
    """newdb() must overwrite any existing database with an empty one."""
    # Create an existing classifier.
    b = open_storage(TEMP_DBM_NAME, "dbm")
    b.learn(tokenize(spam1), True)
    b.learn(tokenize(good1), False)
    b.store()
    b.close()
    # Create the fresh classifier.
    self.h.newdb()
    # Verify that the classifier isn't open.
    self.assertEqual(self.h.h, None)
    # Verify that any existing classifier with the same name
    # is overwritten.
    b = open_storage(TEMP_DBM_NAME, "dbm")
    self.assertEqual(b.nham, 0)
    self.assertEqual(b.nspam, 0)
    b.close()
def run(bdbname, useDBM, ldbname, rdbname, foldname, doTrain, doClassify, pwd, idxname, logname): bayes = storage.open_storage(bdbname, useDBM) try: fp = open(idxname, 'rb') except IOError, e: if e.errno != errno.ENOENT: raise notesindex = {} print "%s file not found, this is a first time run" % (idxname,) print "No classification will be performed"
def createWorkers(self):
    """There aren't many workers in an IMAP State - most of the work
    is done elsewhere.  We do need to load the classifier, though,
    and build the status strings."""
    if not hasattr(self, "DBName"):
        self.DBName, self.useDB = storage.database_type([])
    self.bayes = storage.open_storage(self.DBName, self.useDB)
    # BUG FIX: the original tested hasattr(self, "MBDName") -- a typo for
    # "MDBName" -- so a pre-set message-info DB name was always clobbered.
    if not hasattr(self, "MDBName"):
        self.MDBName, self.useMDB = message.database_type()
    self.mdb = message.open_storage(self.MDBName, self.useMDB)
    self.stats = Stats(options, self.mdb)
    self.buildStatusStrings()
def run(bdbname, useDBM, ldbname, rdbname, foldname, doTrain, doClassify, pwd, idxname, logname):
    """Train and/or classify Lotus Notes mail.

    bdbname/useDBM name the bayes database; idxname is the pickled index
    of already-processed notes, created empty on a first run.
    """
    bayes = storage.open_storage(bdbname, useDBM)
    try:
        notesindex = pickle_read(idxname)
    except IOError, e:
        # Anything other than "file missing" is a real error.
        if e.errno != errno.ENOENT:
            raise
        # First run: start with an empty index and skip classification.
        notesindex = {}
        print "%s file not found, this is a first time run" % (idxname,)
        print "No classification will be performed"
def test_merge_to_dbm(self):
    """Merging CSV data into an existing dbm DB must sum all the counts."""
    # Create a dbm classifier to merge with.
    bayes = DBDictClassifier(TEMP_DBM_NAME)
    # Stuff some messages in it so it's not empty.
    bayes.learn(tokenize(spam1), True)
    bayes.learn(tokenize(good1), False)
    # Save data to check against.
    original_nham = bayes.nham
    original_nspam = bayes.nspam
    original_data = {}
    for key in bayes._wordinfokeys():
        original_data[key] = bayes._wordinfoget(key)
    # Save & Close.
    bayes.store()
    bayes.close()
    # Create a CSV file to import.
    nham, nspam = 3, 4
    temp = open(TEMP_CSV_NAME, "wb")
    temp.write("%d,%d\n" % (nham, nspam))
    csv_data = { "this": (2, 1), "is": (0, 1), "a": (3, 4), 'test': (1, 1), "of": (1, 0), "the": (1, 2), "import": (3, 1) }
    for word, (ham, spam) in csv_data.items():
        temp.write("%s,%s,%s\n" % (word, ham, spam))
    temp.close()
    # False => merge rather than replace.
    sb_dbexpimp.runImport(TEMP_DBM_NAME, "dbm", False, TEMP_CSV_NAME)
    # Open the converted file and verify that it has all the data from
    # the CSV file (and by opening it, that it is a valid dbm file),
    # and the data from the original dbm database.
    bayes2 = open_storage(TEMP_DBM_NAME, "dbm")
    self.assertEqual(bayes2.nham, nham + original_nham)
    self.assertEqual(bayes2.nspam, nspam + original_nspam)
    words = original_data.keys()[:]
    words.extend(csv_data.keys())
    for word in words:
        word = sb_dbexpimp.uquote(word)
        self.assert_(word in bayes2._wordinfokeys())
        h, s = csv_data.get(word, (0, 0))
        wi = original_data.get(word, None)
        if wi:
            h += wi.hamcount
            s += wi.spamcount
        wi2 = bayes2._wordinfoget(word)
        self.assertEqual(h, wi2.hamcount)
        self.assertEqual(s, wi2.spamcount)
def testNoDBMAvailable(self):
    """open_storage() must sys.exit() when no dbm implementation loads."""
    import tempfile
    from spambayes.storage import open_storage
    db_name = tempfile.mktemp("nodbmtest")
    # Force the dbm open to fail.
    DBDictClassifier_load = DBDictClassifier.load
    DBDictClassifier.load = self._fail_open_best
    # Redirect sys.stderr, as open_storage() prints a msg to stderr.
    # Then it does sys.exit(), which we catch.
    sys_stderr = sys.stderr
    sys.stderr = StringIO.StringIO()
    try:
        try:
            open_storage(db_name, "dbm")
        except SystemExit:
            pass
        else:
            self.fail("expected SystemExit from open_storage() call")
    finally:
        # Always restore the patched load method and stderr.
        DBDictClassifier.load = DBDictClassifier_load
        sys.stderr = sys_stderr
    if os.path.isfile(db_name):
        os.remove(db_name)
def testNoDBMAvailable(self):
    """Older variant: patch sys.exit and check that it was called."""
    import tempfile
    from spambayes.storage import open_storage
    # Force the dbm open to fail, and intercept the resulting sys.exit().
    DBDictClassifier_load = DBDictClassifier.load
    DBDictClassifier.load = self.fail_open_best
    # NOTE(review): patching sys.exit means open_storage() keeps running
    # after the "exit" -- fragile; the SystemExit-catching variant of this
    # test is the safer pattern.
    sys_exit = sys.exit
    sys.exit = self.success
    self.succeeded = False
    db_name = tempfile.mktemp("nodbmtest")
    s = open_storage(db_name, True)
    DBDictClassifier.load = DBDictClassifier_load
    sys.exit = sys_exit
    if not self.succeeded:
        self.fail()
    # Clean up any file the aborted open left behind.
    if os.path.isfile(db_name):
        os.remove(db_name)
def createWorkers(self):
    """There aren't many workers in an IMAP State - most of the work
    is done elsewhere.  We do need to load the classifier, though,
    and build the status strings."""
    # Load token and message databases.
    if not hasattr(self, "DBName"):
        self.DBName, self.useDB = storage.database_type([])
    self.bayes = storage.open_storage(self.DBName, self.useDB)
    # BUG FIX: was hasattr(self, "MBDName") -- a typo for "MDBName" -- so an
    # existing message-info DB name was always overwritten.
    if not hasattr(self, "MDBName"):
        self.MDBName, self.useMDB = message.database_type()
    self.mdb = message.open_storage(self.MDBName, self.useMDB)
    # Load stats manager.
    self.stats = Stats(options, self.mdb)
    # Build status strings.
    self.buildStatusStrings()
def main():
    """Main program; parse options and go."""
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hd:p:o:')
    except getopt.error as msg:
        usage(2, msg)
    options = Options.options
    for opt, arg in opts:
        if opt == '-h':
            usage(0)
        elif opt == '-o':
            options.set_from_cmdline(arg, sys.stderr)
    # -d/-p select the database; interpreted by database_type().
    dbname, usedb = storage.database_type(opts)
    if len(args) != 1:
        usage(2, "IP:PORT not specified")
    ip, port = args[0].split(":")
    port = int(port)
    bayes = storage.open_storage(dbname, usedb)
    # Serve the classifier over XML-RPC; the server reuses its address
    # so restarts don't fail with EADDRINUSE.
    h = XMLHammie(bayes)
    server = ReusableSimpleXMLRPCServer(
        (ip, port), xmlrpc.server.SimpleXMLRPCRequestHandler)
    server.register_instance(h)
    server.serve_forever()
def test_import_to_dbm(self):
    """Importing a CSV with replace=True must produce a matching dbm DB."""
    # Build the CSV to import: first row is "nham,nspam".
    temp = open(TEMP_CSV_NAME, "wb")
    temp.write("3,4\n")
    csv_data = {"this":(2,1), "is":(0,1), "a":(3,4), 'test':(1,1), "of":(1,0), "the":(1,2), "import":(3,1)}
    for word, (ham, spam) in csv_data.items():
        temp.write("%s,%s,%s\n" % (word, ham, spam))
    temp.close()
    # True => replace (not merge) any existing database.
    sb_dbexpimp.runImport(TEMP_DBM_NAME, "dbm", True, TEMP_CSV_NAME)
    # Reopen and verify every count survived the round trip.
    bayes = open_storage(TEMP_DBM_NAME, "dbm")
    self.assertEqual(bayes.nham, 3)
    self.assertEqual(bayes.nspam, 4)
    for word, (ham, spam) in csv_data.items():
        word = sb_dbexpimp.uquote(word)
        self.assert_(word in bayes._wordinfokeys())
        wi = bayes._wordinfoget(word)
        self.assertEqual(wi.hamcount, ham)
        self.assertEqual(wi.spamcount, spam)
def test_merge_to_dbm(self):
    """Merging CSV data into an existing dbm DB must sum all the counts."""
    # Create a dbm classifier to merge with.
    bayes = DBDictClassifier(TEMP_DBM_NAME)
    # Stuff some messages in it so it's not empty.
    bayes.learn(tokenize(spam1), True)
    bayes.learn(tokenize(good1), False)
    # Save data to check against.
    original_nham = bayes.nham
    original_nspam = bayes.nspam
    original_data = {}
    for key in bayes._wordinfokeys():
        original_data[key] = bayes._wordinfoget(key)
    # Save & Close.
    bayes.store()
    bayes.close()
    # Create a CSV file to import.
    nham, nspam = 3,4
    temp = open(TEMP_CSV_NAME, "wb")
    temp.write("%d,%d\n" % (nham, nspam))
    csv_data = {"this":(2,1), "is":(0,1), "a":(3,4), 'test':(1,1), "of":(1,0), "the":(1,2), "import":(3,1)}
    for word, (ham, spam) in csv_data.items():
        temp.write("%s,%s,%s\n" % (word, ham, spam))
    temp.close()
    # False => merge rather than replace.
    sb_dbexpimp.runImport(TEMP_DBM_NAME, "dbm", False, TEMP_CSV_NAME)
    # Open the converted file and verify that it has all the data from
    # the CSV file (and by opening it, that it is a valid dbm file),
    # and the data from the original dbm database.
    bayes2 = open_storage(TEMP_DBM_NAME, "dbm")
    self.assertEqual(bayes2.nham, nham + original_nham)
    self.assertEqual(bayes2.nspam, nspam + original_nspam)
    words = original_data.keys()[:]
    words.extend(csv_data.keys())
    for word in words:
        word = sb_dbexpimp.uquote(word)
        self.assert_(word in bayes2._wordinfokeys())
        h, s = csv_data.get(word, (0,0))
        wi = original_data.get(word, None)
        if wi:
            h += wi.hamcount
            s += wi.spamcount
        wi2 = bayes2._wordinfoget(word)
        self.assertEqual(h, wi2.hamcount)
        self.assertEqual(s, wi2.spamcount)
def test_dbm_export(self):
    """The exported CSV must mirror the dbm classifier's data exactly."""
    # Train a non-empty dbm classifier, then save and close it.
    bayes = DBDictClassifier(TEMP_DBM_NAME)
    bayes.learn(tokenize(spam1), True)
    bayes.learn(tokenize(good1), False)
    bayes.store()
    bayes.close()
    # Export it to CSV.
    sb_dbexpimp.runExport(TEMP_DBM_NAME, "dbm", TEMP_CSV_NAME)
    # Reopen the original and compare against the CSV contents.
    bayes = open_storage(TEMP_DBM_NAME, "dbm")
    fp = open(TEMP_CSV_NAME, "rb")
    reader = sb_dbexpimp.csv.reader(fp)
    # First row holds the global ham/spam message counts.
    (nham, nspam) = reader.next()
    self.assertEqual(int(nham), bayes.nham)
    self.assertEqual(int(nspam), bayes.nspam)
    for (word, hamcount, spamcount) in reader:
        word = sb_dbexpimp.uunquote(word)
        self.assert_(word in bayes._wordinfokeys())
        wi = bayes._wordinfoget(word)
        self.assertEqual(int(hamcount), wi.hamcount)
        self.assertEqual(int(spamcount), wi.spamcount)
def hammer():
    """Trains and classifies repeatedly."""
    global bayes
    wellFlushed = False
    for i in range(1, 1000000):
        # Train on a random ham/spam message.
        isSpam = random.choice([True, False])
        train(makeMessage(isSpam), isSpam)
        # Roughly every thousand iterations, flush the DB to disk.
        if random.randrange(1000) == 1:
            print "Flushing."
            bayes.store()
            if i > 500:
                wellFlushed = True
        # Classify a fresh random message.
        isSpam = random.choice([True, False])
        prob = classify(makeMessage(isSpam))
        if i < 10 or i % 100 == 0:
            print "%6.6d: %d, %.4f" % (i, isSpam, prob)
        # Occasionally re-open the DB without closing the old instance,
        # but only after it has been flushed at least once.
        if wellFlushed and random.randrange(1000) == 1:
            print "Re-opening."
            bayes = storage.open_storage(FILENAME, True)
def createWorkers(self):
    """Using the options that were initialised in __init__ and then
    possibly overridden by the driver code, create the Bayes object,
    the Corpuses, the Trainers and so on."""
    print("Loading database...", end=' ')
    if self.isTest:
        self.useDB = "pickle"
        self.DBName = '_pop3proxy_test.pickle'   # This is never saved.
    if not hasattr(self, "DBName"):
        self.DBName, self.useDB = storage.database_type([])
    self.bayes = storage.open_storage(self.DBName, self.useDB)
    # Message-info DB and statistics tracker.
    self.mdb = spambayes.message.Message().message_info_db
    self.stats = Stats.Stats(options, self.mdb)
    self.buildStatusStrings()
    # Skip cache/training setup during the self-test to avoid touching disk.
    if not self.isTest:
        sc = get_pathname_option("Storage", "spam_cache")
        hc = get_pathname_option("Storage", "ham_cache")
        uc = get_pathname_option("Storage", "unknown_cache")
        for d in [sc, hc, uc]:
            storage.ensureDir(d)
        if self.gzipCache:
            factory = GzipFileMessageFactory()
        else:
            factory = FileMessageFactory()
        age = options["Storage", "cache_expiry_days"]*24*60*60
        self.spamCorpus = ExpiryFileCorpus(age, factory, sc, '[0123456789\-]*', cacheSize=20)
        self.hamCorpus = ExpiryFileCorpus(age, factory, hc, '[0123456789\-]*', cacheSize=20)
        self.unknownCorpus = ExpiryFileCorpus(age, factory, uc, '[0123456789\-]*', cacheSize=20)
        # Expire stale messages from all three corpora.
        self.spamCorpus.removeExpiredMessages()
        self.hamCorpus.removeExpiredMessages()
        self.unknownCorpus.removeExpiredMessages()
        # Hook the trainers up to the ham/spam corpora.
        self.spamTrainer = storage.SpamTrainer(self.bayes)
        self.hamTrainer = storage.HamTrainer(self.bayes)
        self.spamCorpus.addObserver(self.spamTrainer)
        self.hamCorpus.addObserver(self.hamTrainer)
def test_import_to_dbm(self):
    """Importing a CSV with replace=True must produce a matching dbm DB."""
    # Create a CSV file to import.
    temp = open(TEMP_CSV_NAME, "wb")
    temp.write("3,4\n")
    csv_data = {"this":(2,1), "is":(0,1), "a":(3,4), 'test':(1,1), "of":(1,0), "the":(1,2), "import":(3,1)}
    for word, (ham, spam) in csv_data.items():
        temp.write("%s,%s,%s\n" % (word, ham, spam))
    temp.close()
    # True => replace (not merge) any existing database.
    sb_dbexpimp.runImport(TEMP_DBM_NAME, "dbm", True, TEMP_CSV_NAME)
    # Open the converted file and verify that it has all the data from
    # the CSV file (and by opening it, that it is a valid dbm file).
    bayes = open_storage(TEMP_DBM_NAME, "dbm")
    self.assertEqual(bayes.nham, 3)
    self.assertEqual(bayes.nspam, 4)
    for word, (ham, spam) in csv_data.items():
        word = sb_dbexpimp.uquote(word)
        self.assert_(word in bayes._wordinfokeys())
        wi = bayes._wordinfoget(word)
        self.assertEqual(wi.hamcount, ham)
        self.assertEqual(wi.spamcount, spam)
def hammer():
    """Trains and classifies repeatedly."""
    global bayes
    wellFlushed = False
    for i in range(1, 1000000):
        # Train.
        isSpam = random.choice([True, False])
        train(makeMessage(isSpam), isSpam)
        # Every thousand messages or so, flush the DB to disk.
        if random.randrange(1000) == 1:
            print "Flushing."
            bayes.store()
            if i > 500:
                wellFlushed = True
        # Classify.
        isSpam = random.choice([True, False])
        prob = classify(makeMessage(isSpam))
        if i < 10 or i % 100 == 0:
            print "%6.6d: %d, %.4f" % (i, isSpam, prob)
        # Every thousand messages or so, reopen the DB without closing it.
        # The way this works will open the new instance before the existing
        # one goes away, which can cause a DBRunRecoveryError.  Versions up
        # to 1.0a5 had a bug in that did this, but people were still
        # reporting DBRunRecoveryErrors in 1.0a6, so I don't think we can
        # call it fixed.
        # We don't do this within the first few hundred messages, or before
        # the DB has been flushed, because that can give a "hamcount > nham"
        # error.  Despite this, you still see those errors.  Either I've got
        # something badly wrong, or they're the result of corrupt databases
        # that aren't caught by bsddb and turned into DBRunRecoveryErrors.
        if wellFlushed and random.randrange(1000) == 1:
            print "Re-opening."
            bayes = storage.open_storage(FILENAME, True)
__author__ = "Richie Hindle <*****@*****.**>"

# %-template for the headers of generated test messages.
headerTemplate = """To: %(To)s
From: %(From)s
Subject: %(Subject)s
Date: %(Date)s

"""

# Create a fresh bayes object to train and classify.
FILENAME = "__hammer.db"
try:
    os.remove(FILENAME)
except OSError:
    pass
bayes = storage.open_storage(FILENAME, True)

def train(text, isSpam):
    """Trains the classifier on the given text."""
    tokens = tokenizer.tokenize(text)
    bayes.learn(tokens, isSpam)

def classify(text):
    """Classifies the given text, returning the spamprob."""
    tokens = tokenizer.tokenize(text)
    return bayes.spamprob(tokens)

def makeMessage(isSpam):
def open_spamdb(self, request):
    """Lazily open the spam database stored next to the wiki's event-log."""
    if self.sbayes is None:
        event_log = request.rootpage.getPagePath('event-log', isfile=1)
        spam_db = os.path.join(os.path.dirname(event_log), self.spam_db)
        # 'c' => create the pickle if it doesn't exist yet.
        self.sbayes = Hammie(storage.open_storage(spam_db, "pickle", 'c'))
        # Make sure the database is flushed/closed at interpreter exit.
        atexit.register(self.close_spamdb)
def run(bdbname, useDBM, ldbname, rdbname, foldname, doTrain, doClassify, pwd, idxname, logname):
    """Train and/or classify mail in a Lotus Notes database."""
    bayes = storage.open_storage(bdbname, useDBM)
    # Load the index of already-processed notes; absent on a first run.
    try:
        notesindex = pickle_read(idxname)
    except IOError as e:
        if e.errno != errno.ENOENT:
            raise
        notesindex = {}
        print("%s file not found, this is a first time run" % (idxname,))
        print("No classification will be performed")
    need_replicate = False
    # Open a Notes session (optionally password-protected).
    sess = win32com.client.Dispatch("Lotus.NotesSession")
    try:
        if pwd:
            sess.initialize(pwd)
        else:
            sess.initialize()
    except pywintypes.com_error:
        print("Session aborted")
        sys.exit()
    # Prefer the remote database; fall back to the local replica and
    # remember to replicate the changes back afterwards.
    try:
        db = sess.GetDatabase(rdbname, ldbname)
    except pywintypes.com_error:
        if rdbname:
            print("Could not open database remotely, trying locally")
            try:
                db = sess.GetDatabase("", ldbname)
                need_replicate = True
            except pywintypes.com_error:
                print("Could not open database")
                sys.exit()
        else:
            raise
    # Logging is best-effort; carry on without it if the log won't open.
    log = sess.CreateLog("SpambayesAgentLog")
    try:
        log.OpenNotesLog("", logname)
    except pywintypes.com_error:
        print("Could not open log")
        log = None
    if log:
        log.LogAction("Running spambayes")
    # Views for the inbox and the spambayes training/result folders.
    vinbox = db.getView('($Inbox)')
    vspam = db.getView("%s\Spam" % (foldname,))
    vham = db.getView("%s\Ham" % (foldname,))
    vtrainspam = db.getView("%s\Train as Spam" % (foldname,))
    vtrainham = db.getView("%s\Train as Ham" % (foldname,))
    if doTrain:
        processAndTrain(vtrainspam, vspam, bayes, True, notesindex, log)
        processAndTrain(vtrainham, vham, bayes, False, notesindex, log)
    if need_replicate:
        try:
            print("Replicating...")
            db.Replicate(rdbname)
            print("Done")
        except pywintypes.com_error:
            print("Could not replicate")
    if doClassify:
        classifyInbox(vinbox, vtrainspam, bayes, ldbname, notesindex, log)
    print("The Spambayes database currently has %s Spam and %s Ham" \
          % (bayes.nspam, bayes.nham))
    # Persist both the classifier and the processed-notes index.
    bayes.store()
    pickle_write(idxname, notesindex)
    if log:
        log.LogAction("Finished running spambayes")
# NOTE(review): fragment -- tail of a training script's main(); the option
# loop's opening and the variable definitions precede this chunk.
elif opt == '--ratio':
    # Ham:spam training ratio, given on the command line as "H:S".
    arg = arg.split(":")
    sh_ratio = (int(arg[0]), int(arg[1]))
if ham is None or spam is None:
    usage("require both ham and spam piles")
    return 1
dbname, usedb = storage.database_type(opts)
# Start from a fresh database file; ignore "no such file".
try:
    os.unlink(dbname)
except OSError:
    pass
store = storage.open_storage(dbname, usedb)
tdict = {}
train(store, ham, spam, maxmsgs, maxrounds, tdict, reverse, verbose, sh_ratio)
store.store()
store.close()
# Optionally move trained-on messages aside using the cull extension.
if cullext is not None:
    cull(ham, cullext, 'ham', tdict)
    cull(spam, cullext, 'spam', tdict)
return 0
# NOTE(review): fragment -- body of main() for the XML-RPC Hammie server
# (Python 2 variant); the opening "try:" lies before this chunk.
    opts, args = getopt.getopt(sys.argv[1:], 'hd:p:o:')
except getopt.error, msg:
    usage(2, msg)
options = Options.options
for opt, arg in opts:
    if opt == '-h':
        usage(0)
    elif opt == '-o':
        options.set_from_cmdline(arg, sys.stderr)
# -d/-p select the database; interpreted by database_type().
dbname, usedb = storage.database_type(opts)
if len(args) != 1:
    usage(2, "IP:PORT not specified")
ip, port = args[0].split(":")
port = int(port)
bayes = storage.open_storage(dbname, usedb)
# Serve the classifier over XML-RPC, reusing the address on restart.
h = XMLHammie(bayes)
server = ReusableSimpleXMLRPCServer(
    (ip, port), SimpleXMLRPCServer.SimpleXMLRPCRequestHandler)
server.register_instance(h)
server.serve_forever()


if __name__ == "__main__":
    main()
# NOTE(review): fragment -- tail of main() for the spam-counts tool
# (Python 2 variant); the getopt call precedes this chunk.
except getopt.GetoptError, msg:
    usage(msg)
    return 1
usere = False
tokenizestdin = False
for opt, arg in opts:
    if opt in ("-h", "--help"):
        usage()
        return 0
    elif opt in ("-r", "--re"):
        usere = True
    elif opt in ("-t", "--tokenize"):
        tokenizestdin = True
    elif opt in ('-o', '--option'):
        options.set_from_cmdline(arg, sys.stderr)
# -r (regex tokens) and -t (tokenize stdin) are mutually exclusive.
if usere and tokenizestdin:
    usage("-r and -t may not be used at the same time")
    return 1
dbname, usedb = database_type(opts)
db = open_storage(dbname, usedb)
if tokenizestdin:
    args = tokenize(sys.stdin)
if args:
    print_spamcounts(args, db, usere)
    return 0
else:
    usage("need tokens on cmd line or -t w/ msg on stdin")
    return 1


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
import os
import os.path
import sys
import syslog
import tempfile
import types
import traceback
import zipfile
import zlib
from email import Errors
from email import Message
import email
from threading import Lock

from spambayes import storage
from spambayes import hammie

# Open the shared Hammie scorer once at import time; all milter threads
# score through this single bound method.
bayes = storage.open_storage('/home/georg/hammie.db', 'dbm')
scoremaster = hammie.Hammie(bayes)
score = scoremaster.score    # function!!

import Milter
from posix import getloadavg
import cfg

# Tune this?
configfile = "/etc/mail/wumi.cf"

# TODO - find out the truth about loadconfig ...
def change_db():
    """Re-open the classifier and rebuild the IMAP filter around it.

    NOTE(review): 'opts', 'message_db' and 'imap_filter' resolve in an
    enclosing/module scope not visible in this chunk -- confirm they exist
    (and whether imap_filter should be declared global) where this lives.
    """
    classifier = storage.open_storage(*storage.database_type(opts))
    message.Message.message_info_db = message_db
    imap_filter = IMAPFilter(classifier, message_db)
# NOTE(review): fragment -- middle of the IMAP filter's main() (Python 2);
# the option loop's opening and the code consuming 'imaps' continue
# outside this chunk.  Indentation below is relative to the fragment.
        if arg == 'y':
            doExpunge = True
        else:
            doExpunge = False
    elif opt == '-i':
        imapDebug = int(arg)
    elif opt == '-l':
        # Sleep interval is given in minutes; store seconds.
        sleepTime = int(arg) * 60
    elif opt == '-o':
        options.set_from_cmdline(arg, sys.stderr)
bdbname, useDBM = storage.database_type(opts)
v = get_current_version();
print "%s.\n" % (v.get_long_version("SpamBayes IMAP Filter"),)
if options["globals", "verbose"]:
    print "Loading database %s..." % (bdbname),
classifier = storage.open_storage(bdbname, useDBM)
message_db = message.Message().message_info_db
if options["globals", "verbose"]:
    print "Done."
# Without a UI we must know which server to talk to.
if not ( launchUI or force_UI or options["imap", "server"] ):
    print "You need to specify both a server and a username."
    sys.exit()
servers_data = servers(promptForPass)
stats = Stats.Stats(options, message_db)
imap_filter = IMAPFilter(classifier, stats)
# Daemon mode (or no explicit action): open one IMAP session per server.
if sleepTime or not (doClassify or doTrain):
    imaps = []
    for server, username, password in servers_data:
        if server == "":
            imaps.append(None)
        else:
# NOTE(review): fragment -- tail of a main() that dumps word probabilities
# into a cdb file; the getopt parsing precedes this chunk.
cdbname = args[0]
dbname = usedb = None
for opt, arg in opts:
    if opt in ("-h", "--help"):
        usage()
        return 0
# -d/-p are interpreted by database_type(), not in the loop above.
dbname, usedb = storage.database_type(opts)
store = storage.open_storage(dbname, usedb)
bayes = CdbClassifier()
# Collect (word, probability-as-string) pairs for the cdb writer.
items = []
for word in store._wordinfokeys():
    record = store._wordinfoget(word)
    prob = store.probability(record)
    items.append((word, str(prob)))
cdbfile = open(cdbname, "wb")
# NOTE(review): fragment -- body of main() for the XML-RPC Hammie server
# (Python 2 variant); the enclosing "def main():" precedes this chunk.
try:
    opts, args = getopt.getopt(sys.argv[1:], "hd:p:o:")
except getopt.error, msg:
    usage(2, msg)
options = Options.options
for opt, arg in opts:
    if opt == "-h":
        usage(0)
    elif opt == "-o":
        options.set_from_cmdline(arg, sys.stderr)
# -d/-p select the database; interpreted by database_type().
dbname, usedb = storage.database_type(opts)
if len(args) != 1:
    usage(2, "IP:PORT not specified")
ip, port = args[0].split(":")
port = int(port)
bayes = storage.open_storage(dbname, usedb)
# Serve the classifier over XML-RPC, reusing the address on restart.
h = XMLHammie(bayes)
server = ReusableSimpleXMLRPCServer((ip, port),
    SimpleXMLRPCServer.SimpleXMLRPCRequestHandler)
server.register_instance(h)
server.serve_forever()


if __name__ == "__main__":
    main()
# NOTE(review): fragment -- option loop and tail of the spam-counts tool's
# main(); the loop header precedes this chunk.  Indentation is relative.
    if opt in ("-h", "--help"):
        usage()
        return 0
    elif opt in ("-r", "--re"):
        usere = True
    elif opt in ("-t", "--tokenize"):
        tokenizestdin = True
    elif opt in ('-o', '--option'):
        options.set_from_cmdline(arg, sys.stderr)
# -r (regex tokens) and -t (tokenize stdin) are mutually exclusive.
if usere and tokenizestdin:
    usage("-r and -t may not be used at the same time")
    return 1
dbname, usedb = database_type(opts)
db = open_storage(dbname, usedb)
if tokenizestdin:
    args = tokenize(sys.stdin)
if args:
    print_spamcounts(args, db, usere)
    return 0
else:
    usage("need tokens on cmd line or -t w/ msg on stdin")
    return 1


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))