def test_merge_to_pickle(self):
    """Import a CSV into an existing pickle and check the counts add up.

    The pickle is first trained on the ``spam1`` and ``good1`` fixtures,
    then a hand-written CSV is imported on top of it.  Afterwards every
    word from either source must be present, with ham/spam counts equal
    to the sum of what the CSV and the original classifier held.
    """
    # Seed the pickle so the import has existing data to combine with.
    original = PickledClassifier(TEMP_PICKLE_NAME)
    original.learn(tokenize(spam1), True)
    original.learn(tokenize(good1), False)
    original.store()

    # CSV fixture: a "nham,nspam" totals row, then "word,ham,spam" rows.
    nham, nspam = 3, 4
    csv_data = {
        "this": (2, 1),
        "is": (0, 1),
        "a": (3, 4),
        'test': (1, 1),
        "of": (1, 0),
        "the": (1, 2),
        "import": (3, 1),
    }
    temp = open(TEMP_CSV_NAME, "wb")
    temp.write("%d,%d\n" % (nham, nspam))
    for word, (ham, spam) in csv_data.items():
        temp.write("%s,%s,%s\n" % (word, ham, spam))
    temp.close()

    sb_dbexpimp.runImport(TEMP_PICKLE_NAME, "pickle", False, TEMP_CSV_NAME)

    # Re-open the pickle and compare it against both sources.
    merged = open_storage(TEMP_PICKLE_NAME, "pickle")
    self.assertEqual(merged.nham, nham + original.nham)
    self.assertEqual(merged.nspam, nspam + original.nspam)

    all_words = original._wordinfokeys()
    all_words.extend(csv_data.keys())
    for raw_word in all_words:
        token = sb_dbexpimp.uquote(raw_word)
        self.assert_(token in merged._wordinfokeys())
        # Start from the CSV counts (zero if the word only came from the
        # original classifier) and add whatever the original held.
        expected_ham, expected_spam = csv_data.get(token, (0, 0))
        prior = original._wordinfoget(token)
        if prior:
            expected_ham += prior.hamcount
            expected_spam += prior.spamcount
        combined = merged._wordinfoget(token)
        self.assertEqual(expected_ham, combined.hamcount)
        self.assertEqual(expected_spam, combined.spamcount)
def looks_like_spam(message, config, section):
    """Return True when the pickled classifier scores *message* as spam.

    Parameters:
        message: passed unchanged to ``chi2_spamprob`` (and logged at
            debug level).
        config: object providing ``get(section, option)`` and
            ``getfloat(section, option)``.
        section: name of the config section holding the options
            ``spam.pickle_file`` and ``spam.min_spam_prob``.

    Returns:
        True if the computed spam probability is greater than or equal
        to the configured threshold, otherwise False.
    """
    log.info("Checking message for spam...")
    log.debug(message)

    pickle_filename = config.get(section, 'spam.pickle_file')
    # NOTE(review): the ``or`` only substitutes 0.90 when getfloat()
    # returns a falsy value (e.g. 0.0); whether a missing option raises
    # instead depends on the config object -- confirm against caller.
    min_spam_prob = config.getfloat(section, 'spam.min_spam_prob') or 0.90

    log.debug("Loading pickle from %s", pickle_filename)
    bayes = PickledClassifier(pickle_filename)
    spamprob = bayes.chi2_spamprob(message)

    if spamprob >= min_spam_prob:
        log.debug("spamprob %s >= %s, probably spam", spamprob,
                  min_spam_prob)
        return True

    # Reaching this point means spamprob is strictly below the threshold
    # (equality is handled above), so the message reports "<", not "<=".
    log.debug("spamprob %s < %s, probably not spam", spamprob,
              min_spam_prob)
    return False
def test_merge_to_pickle(self):
    """Check that importing a CSV into an existing pickle sums the counts."""
    # Build the classifier that the import will be combined with, give
    # it some training data so it is not empty, and write it to disk.
    seeded = PickledClassifier(TEMP_PICKLE_NAME)
    seeded.learn(tokenize(spam1), True)
    seeded.learn(tokenize(good1), False)
    seeded.store()

    # Produce the CSV file to import: the message totals on the first
    # line, followed by one "word,ham,spam" line per entry.
    nham, nspam = 3, 4
    csv_data = {
        "this": (2, 1),
        "is": (0, 1),
        "a": (3, 4),
        'test': (1, 1),
        "of": (1, 0),
        "the": (1, 2),
        "import": (3, 1),
    }
    temp = open(TEMP_CSV_NAME, "wb")
    temp.write("%d,%d\n" % (nham, nspam))
    temp.writelines(["%s,%s,%s\n" % (word, ham, spam)
                     for word, (ham, spam) in csv_data.items()])
    temp.close()

    sb_dbexpimp.runImport(TEMP_PICKLE_NAME, "pickle", False, TEMP_CSV_NAME)

    # Opening the result proves it is a valid pickle; the assertions
    # prove it holds the CSV data plus the data from the original pickle.
    result = open_storage(TEMP_PICKLE_NAME, "pickle")
    self.assertEqual(result.nham, nham + seeded.nham)
    self.assertEqual(result.nspam, nspam + seeded.nspam)

    candidates = seeded._wordinfokeys()
    candidates.extend(csv_data.keys())
    for candidate in candidates:
        key = sb_dbexpimp.uquote(candidate)
        self.assert_(key in result._wordinfokeys())
        csv_ham, csv_spam = csv_data.get(key, (0, 0))
        before = seeded._wordinfoget(key)
        if before:
            want_ham = csv_ham + before.hamcount
            want_spam = csv_spam + before.spamcount
        else:
            want_ham, want_spam = csv_ham, csv_spam
        after = result._wordinfoget(key)
        self.assertEqual(want_ham, after.hamcount)
        self.assertEqual(want_spam, after.spamcount)
def test_pickle_export(self):
    """Export a pickled classifier to CSV and check every count survives.

    The first CSV row is read as ``(nham, nspam)`` and compared with the
    classifier's totals; each following row is read as
    ``(word, hamcount, spamcount)`` and compared with the classifier's
    record for that word.
    """
    # Build and persist a small, non-empty classifier to export.
    bayes = PickledClassifier(TEMP_PICKLE_NAME)
    bayes.learn(tokenize(spam1), True)
    bayes.learn(tokenize(good1), False)
    bayes.store()

    sb_dbexpimp.runExport(TEMP_PICKLE_NAME, "pickle", TEMP_CSV_NAME)

    fp = open(TEMP_CSV_NAME, "rb")
    # Close the handle even when an assertion fails, so the temporary
    # CSV is not left open after the test.
    try:
        # Parsing with the csv module also checks the file is valid CSV.
        reader = sb_dbexpimp.csv.reader(fp)
        (nham, nspam) = reader.next()
        self.assertEqual(int(nham), bayes.nham)
        self.assertEqual(int(nspam), bayes.nspam)
        for (word, hamcount, spamcount) in reader:
            word = sb_dbexpimp.uunquote(word)
            self.assert_(word in bayes._wordinfokeys())
            wi = bayes._wordinfoget(word)
            self.assertEqual(int(hamcount), wi.hamcount)
            self.assertEqual(int(spamcount), wi.spamcount)
    finally:
        fp.close()
def test_pickle_export(self):
    """Verify that exporting a pickle to CSV preserves all of its data."""
    # Create a pickled classifier to export.
    bayes = PickledClassifier(TEMP_PICKLE_NAME)
    # Stuff some messages in it so it's not empty.
    bayes.learn(tokenize(spam1), True)
    bayes.learn(tokenize(good1), False)
    # Save.
    bayes.store()
    # Export.
    sb_dbexpimp.runExport(TEMP_PICKLE_NAME, "pickle", TEMP_CSV_NAME)
    # Verify that the CSV holds all the original data (and, by using
    # the CSV module to open it, that it is valid CSV data).
    fp = open(TEMP_CSV_NAME, "rb")
    try:
        reader = sb_dbexpimp.csv.reader(fp)
        # The first row carries the ham and spam message totals.
        (nham, nspam) = reader.next()
        self.assertEqual(int(nham), bayes.nham)
        self.assertEqual(int(nspam), bayes.nspam)
        # Every remaining row is one word with its ham and spam counts.
        for (word, hamcount, spamcount) in reader:
            word = sb_dbexpimp.uunquote(word)
            self.assert_(word in bayes._wordinfokeys())
            wi = bayes._wordinfoget(word)
            self.assertEqual(int(hamcount), wi.hamcount)
            self.assertEqual(int(spamcount), wi.spamcount)
    finally:
        # Release the file handle whether or not the checks passed.
        fp.close()
def test_merge_to_pickle(self):
    """Importing a CSV on top of a stored pickle must add the counts."""
    # Prepare a non-empty pickled classifier as the import target.
    base = PickledClassifier(TEMP_PICKLE_NAME)
    base.learn(tokenize(spam1), True)
    base.learn(tokenize(good1), False)
    base.store()

    # Describe the CSV content: totals first, then per-word counts.
    nham, nspam = 3, 4
    csv_data = {"this": (2, 1), "is": (0, 1), "a": (3, 4),
                'test': (1, 1), "of": (1, 0), "the": (1, 2),
                "import": (3, 1)}

    # Write that content out as the file to import.
    temp = open(TEMP_CSV_NAME, "wb")
    temp.write("%d,%d\n" % (nham, nspam))
    for word, (ham, spam) in csv_data.items():
        line = "%s,%s,%s\n" % (word, ham, spam)
        temp.write(line)
    temp.close()

    sb_dbexpimp.runImport(TEMP_PICKLE_NAME, "pickle", False, TEMP_CSV_NAME)

    # Load the updated file (which also shows it is a valid pickle) and
    # confirm it contains the CSV data as well as the original data.
    updated = open_storage(TEMP_PICKLE_NAME, "pickle")
    self.assertEqual(updated.nham, nham + base.nham)
    self.assertEqual(updated.nspam, nspam + base.nspam)

    expected_words = base._wordinfokeys()
    expected_words.extend(csv_data.keys())
    for entry in expected_words:
        quoted = sb_dbexpimp.uquote(entry)
        self.assert_(quoted in updated._wordinfokeys())

        ham_total, spam_total = csv_data.get(quoted, (0, 0))
        base_info = base._wordinfoget(quoted)
        if base_info:
            ham_total += base_info.hamcount
            spam_total += base_info.spamcount

        updated_info = updated._wordinfoget(quoted)
        self.assertEqual(ham_total, updated_info.hamcount)
        self.assertEqual(spam_total, updated_info.spamcount)
def main():
    """Train the pickled classifier named on the command line from stdin.

    The last command-line argument is used as the pickle file name.  All
    lines read from standard input are passed to ``learn`` with the flag
    ``True`` and the classifier is then stored.

    Raises:
        SystemExit: if no command-line argument was supplied.
    """
    # With no argument, sys.argv[-1] would be the script's own path and
    # that path would be handed to PickledClassifier and then store().
    if len(sys.argv) < 2:
        sys.exit("usage: pass the classifier pickle file as the last "
                 "command-line argument and the message on stdin")

    pickle_filename = sys.argv[-1]
    bayes = PickledClassifier(pickle_filename)
    message = sys.stdin.readlines()
    # NOTE(review): True presumably marks the message as spam -- confirm
    # against PickledClassifier.learn.
    bayes.learn(message, True)
    bayes.store()