def setUp(self):
    """Build the fixture used by the corpus tests.

    Runs the base-class setUp, records the fixture parameters on the
    instance, calls self.stuff_corpus() and finally creates a FileCorpus
    over self.directory with the filter '?'.
    """
    _FileCorpusBaseTest.setUp(self)

    # Fixture parameters read by the tests.
    self.cache_size = 100
    self.directory = 'fctesthamcorpus'
    self.factory = FileMessageFactory()

    # Populate the directory before building the corpus over it.
    self.stuff_corpus()
    corpus = FileCorpus(self.factory, self.directory, '?', self.cache_size)
    self.corpus = corpus
class FileCorpusTest(_FileCorpusBaseTest):
    """Tests for FileCorpus over the 'fctesthamcorpus' directory."""

    def setUp(self):
        """Store the sample messages, then open a corpus with filter '?'."""
        _FileCorpusBaseTest.setUp(self)
        self.directory = 'fctesthamcorpus'
        self.cache_size = 100
        self.factory = FileMessageFactory()
        self.stuff_corpus()
        self.corpus = FileCorpus(self.factory, self.directory,
                                 '?', self.cache_size)

    def stuff_corpus(self):
        """Put messages in the corpus"""
        # Keys are "0", "1" and "2"; self.msg is left pointing at the last
        # message stored, which test_removeMessage relies on.
        for i, content in enumerate([good1, spam1, malformed1]):
            self.msg = self.factory.create(str(i), self.directory, content)
            self.msg.store()

        # Key "10" is the extra message that test_filter expects the '?'
        # filter to exclude and the '*' filter to include.
        msg = self.factory.create("10", self.directory, good1)
        msg.store()

    def test___init__(self):
        """The constructor arguments are exposed as attributes."""
        self.assertEqual(self.corpus.directory, self.directory)
        self.assertEqual(self.corpus.filter, '?')
        self.assertEqual(self.corpus.cacheSize, self.cache_size)

    def test_filter(self):
        """'?' selects three of the stored messages, '*' all four."""
        self.assertEqual(len(self.corpus.msgs), 3)
        self.corpus = FileCorpus(self.factory, self.directory,
                                 '*', self.cache_size)
        self.assertEqual(len(self.corpus.msgs), 4)

    def test_makeMessage_no_content(self):
        """makeMessage accepts a key without content (must not raise)."""
        key = "testmake"
        self.corpus.makeMessage(key)

    def test_makeMessage_with_content(self):
        """makeMessage keeps the key and yields CRLF line endings."""
        key = "testmake"
        content = spam1
        msg = self.corpus.makeMessage(key, content)
        self.assertEqual(msg.key(), key)
        self.assertEqual(msg.as_string(), content.replace("\n", "\r\n"))

    def test_addMessage_invalid(self):
        """An object that only provides key() is rejected with ValueError."""
        class msg(object):
            def key(self):
                return 'aa'
        self.assertRaises(ValueError, self.corpus.addMessage, msg())

    def test_addMessage(self):
        """addMessage re-homes the message and writes it to the corpus dir."""
        # The message starts out belonging to a different directory.
        msg = self.factory.create("9", 'fctestspamcorpus', good1)
        self.corpus.addMessage(msg)
        self.assertEqual(msg.directory, self.directory)
        fn = os.path.join(self.directory, "9")
        f = open(fn, "rU")
        try:
            # try/finally so the handle is released even if read() fails.
            content = f.read()
        finally:
            f.close()
        self.assertEqual(content, good1)

    def test_removeMessage(self):
        """removeMessage deletes the message's file from disk."""
        fn = self.msg.pathname()
        self.assertEqual(os.path.exists(fn), True)
        self.corpus.removeMessage(self.msg)
        self.assertEqual(os.path.exists(fn), False)
def createWorkers(self): """Using the options that were initialised in __init__ and then possibly overridden by the driver code, create the Bayes object, the Corpuses, the Trainers and so on.""" print "Loading database...", if self.isTest: self.useDB = "pickle" self.DBName = '_pop3proxy_test.pickle' # This is never saved. if not hasattr(self, "DBName"): self.DBName, self.useDB = storage.database_type([]) self.bayes = storage.open_storage(self.DBName, self.useDB) self.mdb = spambayes.message.Message().message_info_db # Load stats manager. self.stats = Stats.Stats(options, self.mdb) self.buildStatusStrings() # Don't set up the caches and training objects when running the self-test, # so as not to clutter the filesystem. if not self.isTest: # Create/open the Corpuses. Use small cache sizes to avoid hogging # lots of memory. sc = get_pathname_option("Storage", "spam_cache") hc = get_pathname_option("Storage", "ham_cache") uc = get_pathname_option("Storage", "unknown_cache") map(storage.ensureDir, [sc, hc, uc]) if self.gzipCache: factory = GzipFileMessageFactory() else: factory = FileMessageFactory() age = options["Storage", "cache_expiry_days"]*24*60*60 self.spamCorpus = ExpiryFileCorpus(age, factory, sc, '[0123456789\-]*', cacheSize=20) self.hamCorpus = ExpiryFileCorpus(age, factory, hc, '[0123456789\-]*', cacheSize=20) self.unknownCorpus = ExpiryFileCorpus(age, factory, uc, '[0123456789\-]*', cacheSize=20) # Given that (hopefully) users will get to the stage # where they do not need to do any more regular training to # be satisfied with spambayes' performance, we expire old # messages from not only the trained corpora, but the unknown # as well. self.spamCorpus.removeExpiredMessages() self.hamCorpus.removeExpiredMessages() self.unknownCorpus.removeExpiredMessages() # Create the Trainers. 
self.spamTrainer = storage.SpamTrainer(self.bayes) self.hamTrainer = storage.HamTrainer(self.bayes) self.spamCorpus.addObserver(self.spamTrainer) self.hamCorpus.addObserver(self.hamTrainer)
def main(argv):
    """Print a text graph of the classifier's success over time.

    argv is the option list handed to getopt; "-h" / "--help" calls
    usage() and returns.  Otherwise every message in the spam and ham
    caches is read, tagged with the disposition implied by the cache it
    was found in ('Yes' for spam, 'No' for ham) and compared with the
    value of its classification header.  The cumulative number of
    matches is then plotted against the number of messages.
    """
    opts, args = getopt.getopt(argv, "h", ["help"])
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return

    # Create the corpuses and the factory that reads the messages.
    if options["pop3proxy", "cache_use_gzip"]:
        messageFactory = GzipFileMessageFactory()
    else:
        messageFactory = FileMessageFactory()
    sc = get_pathname_option("Storage", "spam_cache")
    hc = get_pathname_option("Storage", "ham_cache")
    spamCorpus = FileCorpus(messageFactory, sc)
    hamCorpus = FileCorpus(messageFactory, hc)

    # Read in all the trained messages.
    allTrained = {}
    for corpus, disposition in [(spamCorpus, 'Yes'), (hamCorpus, 'No')]:
        for m in corpus:
            message = mboxutils.get_message(m.getSubstance())
            message._pop3CacheDisposition = disposition
            allTrained[m.key()] = message

    # Sort the messages into the order they arrived, then work out a scaling
    # factor for the graph - 'limit' is the widest it can be in characters.
    keys = allTrained.keys()
    keys.sort()
    limit = 70
    if len(keys) < limit:
        scale = 1
    else:
        scale = len(keys) // (limit // 2)

    # Build the data - an array of cumulative success indexed by count.
    count = successful = 0
    successByCount = []
    header_name = options["Headers", "classification_header_name"]
    for key in keys:
        message = allTrained[key]
        disposition = message[header_name]
        if message._pop3CacheDisposition == disposition:
            successful += 1
        count += 1
        if count % scale == (scale - 1):
            successByCount.append(successful // scale)

    # Build the graph, as a list of rows of characters.
    size = count // scale
    graph = [[" " for i in range(size + 3)] for j in range(size)]
    for c in range(size):
        graph[c][1] = "|"
        graph[c][c + 3] = "."
        # With scale == 1 a perfect run gives successByCount[c] == c + 1,
        # which for the last column is one row past the top of the graph.
        # Clamp so that case no longer raises IndexError.
        star_row = min(successByCount[c], size - 1)
        graph[star_row][c + 3] = "*"
    graph.reverse()

    # Print the graph.
    print("\n Success of the classifier over time:\n")
    print(" . - Number of messages over time")
    print(" * - Number of correctly classified messages over time\n\n")
    for row in range(size):
        line = ''.join(graph[row])
        if row == 0:
            # Top row: label with the total number of messages.
            print(line + " %d" % count)
        elif row == (count - successful) // scale:
            print(line + " %d" % successful)
        else:
            print(line)
    print(" " + "_" * (size + 2))
class FileCorpusTest(_FileCorpusBaseTest):
    """Tests for FileCorpus over the 'fctesthamcorpus' directory."""

    def setUp(self):
        """Store the sample messages, then open a corpus with filter '?'."""
        _FileCorpusBaseTest.setUp(self)
        self.directory = 'fctesthamcorpus'
        self.cache_size = 100
        self.factory = FileMessageFactory()
        self.stuff_corpus()
        self.corpus = FileCorpus(self.factory, self.directory,
                                 '?', self.cache_size)

    def stuff_corpus(self):
        """Put messages in the corpus"""
        # self.msg is left pointing at the last message stored, which
        # test_removeMessage relies on.
        for i, content in enumerate([good1, spam1, malformed1]):
            self.msg = self.factory.create(str(i), self.directory, content)
            self.msg.store()

        # Put in a message that won't match the filter.
        msg = self.factory.create("10", self.directory, good1)
        msg.store()

    def test___init__(self):
        """The constructor arguments are exposed as attributes."""
        self.assertEqual(self.corpus.directory, self.directory)
        self.assertEqual(self.corpus.filter, '?')
        self.assertEqual(self.corpus.cacheSize, self.cache_size)

    def test_filter(self):
        """'?' selects three of the stored messages, '*' all four."""
        self.assertEqual(len(self.corpus.msgs), 3)
        # Try again, with all messages.
        self.corpus = FileCorpus(self.factory, self.directory,
                                 '*', self.cache_size)
        self.assertEqual(len(self.corpus.msgs), 4)

    def test_makeMessage_no_content(self):
        """makeMessage accepts a key without content (must not raise)."""
        key = "testmake"
        self.corpus.makeMessage(key)

    def test_makeMessage_with_content(self):
        """makeMessage keeps the key and yields CRLF line endings."""
        key = "testmake"
        content = spam1
        msg = self.corpus.makeMessage(key, content)
        self.assertEqual(msg.key(), key)
        self.assertEqual(msg.as_string(), content.replace("\n", "\r\n"))

    def test_addMessage_invalid(self):
        """An object that only provides key() is rejected with ValueError."""
        class msg(object):
            def key(self):
                return 'aa'
        self.assertRaises(ValueError, self.corpus.addMessage, msg())

    def test_addMessage(self):
        """addMessage re-homes the message and writes it to the corpus dir."""
        # The message starts out belonging to a different directory.
        msg = self.factory.create("9", 'fctestspamcorpus', good1)
        self.corpus.addMessage(msg)
        self.assertEqual(msg.directory, self.directory)
        fn = os.path.join(self.directory, "9")
        f = open(fn, "rU")
        try:
            # try/finally so the handle is released even if read() fails.
            content = f.read()
        finally:
            f.close()
        self.assertEqual(content, good1)

    def test_removeMessage(self):
        """removeMessage deletes the message's file from disk."""
        fn = self.msg.pathname()
        self.assertEqual(os.path.exists(fn), True)
        self.corpus.removeMessage(self.msg)
        self.assertEqual(os.path.exists(fn), False)
def setup(self): # Can't import this at the top because it's circular. # XXX Someone smarter than me, please figure out the right # XXX way to do this. from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory username = options["globals", "proxy_username"] password = options["globals", "proxy_password"] server = options["globals", "proxy_server"] if server.find(":") != -1: server, port = server.split(':', 1) else: port = 8080 if server: # Build a new opener that uses a proxy requiring authorization proxy_support = urllib2.ProxyHandler({"http" : \ "http://%s:%s@%s:%d" % \ (username, password, server, port)}) opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler) else: # Build a new opener without any proxy information. opener = urllib2.build_opener(urllib2.HTTPHandler) # Install it urllib2.install_opener(opener) # Setup the cache for retrieved urls age = options["URLRetriever", "x-cache_expiry_days"] * 24 * 60 * 60 dir = options["URLRetriever", "x-cache_directory"] if not os.path.exists(dir): # Create the directory. if options["globals", "verbose"]: print >> sys.stderr, "Creating URL cache directory" os.makedirs(dir) self.urlCorpus = ExpiryFileCorpus(age, FileMessageFactory(), dir, cacheSize=20) # Kill any old information in the cache self.urlCorpus.removeExpiredMessages() # Setup caches for unretrievable urls self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck") self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck") if os.path.exists(self.bad_url_cache_name): try: self.bad_urls = pickle_read(self.bad_url_cache_name) except (IOError, ValueError): # Something went wrong loading it (bad pickle, # probably). Start afresh. if options["globals", "verbose"]: print >> sys.stderr, "Bad URL pickle, using new." 
self.bad_urls = { "url:non_resolving": (), "url:non_html": (), "url:unknown_error": () } else: if options["globals", "verbose"]: print "URL caches don't exist: creating" self.bad_urls = { "url:non_resolving": (), "url:non_html": (), "url:unknown_error": () } if os.path.exists(self.http_error_cache_name): try: self.http_error_urls = pickle_read(self.http_error_cache_name) except IOError, ValueError: # Something went wrong loading it (bad pickle, # probably). Start afresh. if options["globals", "verbose"]: print >> sys.stderr, "Bad HHTP error pickle, using new." self.http_error_urls = {}
class FileCorpusTest(_FileCorpusBaseTest):
    """Tests for FileCorpus over the 'fctesthamcorpus' directory."""

    def setUp(self):
        """Store the sample messages, then open a corpus with filter '?'."""
        _FileCorpusBaseTest.setUp(self)
        self.directory = 'fctesthamcorpus'
        self.cache_size = 100
        self.factory = FileMessageFactory()
        self.stuff_corpus()
        self.corpus = FileCorpus(self.factory, self.directory,
                                 '?', self.cache_size)

    def stuff_corpus(self):
        """Put messages in the corpus"""
        # Keys are "0", "1" and "2"; self.msg is left pointing at the last
        # message stored, which test_removeMessage relies on.
        for i, content in enumerate([good1, spam1, malformed1]):
            self.msg = self.factory.create(str(i), self.directory, content)
            self.msg.store()

        # Key "10" is the extra message that test_filter expects the '?'
        # filter to exclude and the '*' filter to include.
        msg = self.factory.create("10", self.directory, good1)
        msg.store()

    def test___init__(self):
        """The constructor arguments are exposed as attributes."""
        self.assertEqual(self.corpus.directory, self.directory)
        self.assertEqual(self.corpus.filter, '?')
        self.assertEqual(self.corpus.cacheSize, self.cache_size)

    def test_filter(self):
        """'?' selects three of the stored messages, '*' all four."""
        self.assertEqual(len(self.corpus.msgs), 3)
        self.corpus = FileCorpus(self.factory, self.directory,
                                 '*', self.cache_size)
        self.assertEqual(len(self.corpus.msgs), 4)

    def test_makeMessage_no_content(self):
        """makeMessage accepts a key without content (must not raise)."""
        key = "testmake"
        self.corpus.makeMessage(key)

    def test_makeMessage_with_content(self):
        """makeMessage keeps the key and yields CRLF line endings."""
        key = "testmake"
        content = spam1
        msg = self.corpus.makeMessage(key, content)
        self.assertEqual(msg.key(), key)
        self.assertEqual(msg.as_string(), content.replace("\n", "\r\n"))

    def test_addMessage_invalid(self):
        """An object that only provides key() is rejected with ValueError."""
        class msg(object):
            def key(self):
                return 'aa'
        self.assertRaises(ValueError, self.corpus.addMessage, msg())

    def test_addMessage(self):
        """addMessage re-homes the message and writes it to the corpus dir."""
        # The message starts out belonging to a different directory.
        msg = self.factory.create("9", 'fctestspamcorpus', good1)
        self.corpus.addMessage(msg)
        self.assertEqual(msg.directory, self.directory)
        fn = os.path.join(self.directory, "9")
        f = open(fn)
        try:
            # try/finally so the handle is released even if read() fails.
            content = f.read()
        finally:
            f.close()
        self.assertEqual(content, good1)

    def test_removeMessage(self):
        """removeMessage deletes the message's file from disk."""
        fn = self.msg.pathname()
        self.assertEqual(os.path.exists(fn), True)
        self.corpus.removeMessage(self.msg)
        self.assertEqual(os.path.exists(fn), False)


class ExpiryFileCorpusTest(FileCorpusTest):
    """Run the inherited FileCorpusTest tests against an ExpiryFileCorpus."""

    def setUp(self):
        """Same fixture as the parent, but self.corpus is an ExpiryFileCorpus.

        Calls _FileCorpusBaseTest.setUp directly rather than the parent's
        setUp, so the plain FileCorpus is never created.
        """
        _FileCorpusBaseTest.setUp(self)
        self.cache_size = 100
        self.directory = 'fctesthamcorpus'
        self.factory = FileMessageFactory()
        self.stuff_corpus()
        # NOTE(review): 1.0 is presumably the expiry age - confirm against
        # the ExpiryFileCorpus signature.
        self.corpus = ExpiryFileCorpus(1.0, self.factory, self.directory,
                                       '?', self.cache_size)


def suite():
    """Return a TestSuite holding every test class listed below."""
    tests = unittest.TestSuite()
    clses = (FileMessageFactoryTest,
             GzipFileMessageFactoryTest,
             FileMessageTest,
             GzipFileMessageTest,
             FileCorpusTest,
             ExpiryFileCorpusTest,
             )
    for cls in clses:
        tests.addTest(unittest.makeSuite(cls))
    return tests


# Single entry point; the guard used to be repeated verbatim.
if __name__ == '__main__':
    sb_test_support.unittest_main(argv=sys.argv + ['suite'])