Example #1
0
 def test_newdb(self):
     """newdb() should discard trained data and leave a fresh, empty store."""
     # Seed an existing dbm store with one spam and one ham message.
     classifier = open_storage(TEMP_DBM_NAME, "dbm")
     classifier.learn(tokenize(spam1), True)
     classifier.learn(tokenize(good1), False)
     classifier.store()
     classifier.close()
     # Ask the Hammie wrapper for a brand-new database.
     self.h.newdb()
     # The wrapper must not be left holding an open classifier.
     self.assertEqual(self.h.h, None)
     # Reopening the same file must show an empty (untrained) store.
     classifier = open_storage(TEMP_DBM_NAME, "dbm")
     self.assertEqual(classifier.nham, 0)
     self.assertEqual(classifier.nspam, 0)
     classifier.close()
def open(filename, useDB="dbm", mode="r"):
    """Return a Hammie instance backed by the given storage file.

    mode is used as the flag to open DBDict objects.  'c' for read-write
    (create if needed), 'r' for read-only, 'w' for read-write.
    """
    # NOTE: this helper shadows the builtin open() within this module.
    backing_store = storage.open_storage(filename, useDB, mode)
    return Hammie(backing_store, mode)
示例#3
0
 def test_dbm_export(self):
     """Exporting a dbm store must produce valid CSV holding every count."""
     # Create a dbm classifier to export.
     bayes = DBDictClassifier(TEMP_DBM_NAME)
     # Stuff some messages in it so it's not empty.
     bayes.learn(tokenize(spam1), True)
     bayes.learn(tokenize(good1), False)
     # Save & Close.
     bayes.store()
     bayes.close()
     # Export.
     sb_dbexpimp.runExport(TEMP_DBM_NAME, "dbm", TEMP_CSV_NAME)
     # Reopen the original.
     bayes = open_storage(TEMP_DBM_NAME, "dbm")
     # Verify that the CSV holds all the original data (and, by using
     # the CSV module to open it, that it is valid CSV data).
     fp = open(TEMP_CSV_NAME, "rb")
     reader = sb_dbexpimp.csv.reader(fp)
     # First row is the "nham,nspam" totals header.
     # NOTE: reader.next() is the Python 2 iterator protocol.
     (nham, nspam) = reader.next()
     self.assertEqual(int(nham), bayes.nham)
     self.assertEqual(int(nspam), bayes.nspam)
     # Remaining rows are word,hamcount,spamcount triples; each word
     # must exist in the store with matching counts.
     for (word, hamcount, spamcount) in reader:
         word = sb_dbexpimp.uunquote(word)
         self.assert_(word in bayes._wordinfokeys())
         wi = bayes._wordinfoget(word)
         self.assertEqual(int(hamcount), wi.hamcount)
         self.assertEqual(int(spamcount), wi.spamcount)
示例#4
0
def main(args):
    """Command-line driver: dump a spambayes database to a cdb file.

    args is the argument list (excluding the program name); the single
    positional argument is the output cdb filename.  Returns 0 on
    success and 1 on a usage error.
    """
    try:
        opts, args = getopt.getopt(args, "hd:p:", ["help", "database=", "pickle="])
    except getopt.GetoptError as msg:
        usage(msg)
        return 1
    if len(args) != 1:
        usage()
        return 1
    cdbname = args[0]
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
    # database_type() inspects the -d/-p options to pick the store
    # name and type.  (The original pre-initialised dbname/usedb to
    # None and built an unused CdbClassifier; both were dead code.)
    dbname, usedb = storage.database_type(opts)
    store = storage.open_storage(dbname, usedb)
    # Collect (word, probability-as-string) pairs for every known word.
    items = []
    for word in store._wordinfokeys():
        record = store._wordinfoget(word)
        prob = store.probability(record)
        items.append((word, str(prob)))
    # Context manager guarantees the cdb file is closed even on error.
    with open(cdbname, "wb") as cdbfile:
        cdb.cdb_make(cdbfile, items)
    return 0
示例#5
0
 def test_import_to_dbm(self):
     """Importing a CSV into a brand-new dbm store must copy every count."""
     # The CSV fixture: a "nham,nspam" header line followed by one
     # "word,hamcount,spamcount" row per token.
     csv_data = {
         "this": (2, 1),
         "is": (0, 1),
         "a": (3, 4),
         'test': (1, 1),
         "of": (1, 0),
         "the": (1, 2),
         "import": (3, 1)
     }
     csv_file = open(TEMP_CSV_NAME, "wb")
     csv_file.write("3,4\n")
     for word, (ham, spam) in csv_data.items():
         csv_file.write("%s,%s,%s\n" % (word, ham, spam))
     csv_file.close()
     # newdbm=True: replace (rather than merge into) any existing store.
     sb_dbexpimp.runImport(TEMP_DBM_NAME, "dbm", True, TEMP_CSV_NAME)
     # Reopening proves the result is a valid dbm file; check the totals.
     bayes = open_storage(TEMP_DBM_NAME, "dbm")
     self.assertEqual(bayes.nham, 3)
     self.assertEqual(bayes.nspam, 4)
     # Every imported word must be present with identical counts.
     for word, (ham, spam) in csv_data.items():
         word = sb_dbexpimp.uquote(word)
         self.assert_(word in bayes._wordinfokeys())
         wi = bayes._wordinfoget(word)
         self.assertEqual(wi.hamcount, ham)
         self.assertEqual(wi.spamcount, spam)
示例#6
0
def open(filename, useDB="dbm", mode='r'):
    """Construct a Hammie wrapper over an opened storage database.

    mode is used as the flag to open DBDict objects.  'c' for read-write
    (create if needed), 'r' for read-only, 'w' for read-write.
    """
    # Open the underlying store first, then hand it to Hammie together
    # with the open mode.
    db = storage.open_storage(filename, useDB, mode)
    return Hammie(db, mode)
示例#7
0
def main(args):
    """Parse options and print spam counts for the requested tokens.

    Returns 0 on success and 1 on a usage error.
    """
    try:
        opts, args = getopt.getopt(args, "hrto:", ["help", "re", "tokenize", "option="])
    except getopt.GetoptError as msg:
        usage(msg)
        return 1
    use_regex = False
    tokenize_stdin = False
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        if opt in ("-r", "--re"):
            use_regex = True
        if opt in ("-t", "--tokenize"):
            tokenize_stdin = True
        if opt in ("-o", "--option"):
            options.set_from_cmdline(arg, sys.stderr)
    # -r treats the arguments as regexes; -t tokenizes a message read
    # from stdin -- the two modes are mutually exclusive.
    if use_regex and tokenize_stdin:
        usage("-r and -t may not be used at the same time")
        return 1
    dbname, usedb = database_type(opts)
    db = open_storage(dbname, usedb)
    if tokenize_stdin:
        args = tokenize(sys.stdin)
    if not args:
        usage("need tokens on cmd line or -t w/ msg on stdin")
        return 1
    print_spamcounts(args, db, use_regex)
    return 0
示例#8
0
 def test_merge_to_pickle(self):
     """Importing a CSV into an existing pickle store must merge counts."""
     # Train a pickle classifier that the import will merge into.
     bayes = PickledClassifier(TEMP_PICKLE_NAME)
     bayes.learn(tokenize(spam1), True)
     bayes.learn(tokenize(good1), False)
     bayes.store()
     # Write the CSV fixture: totals header, then word,ham,spam rows.
     nham, nspam = 3,4
     temp = open(TEMP_CSV_NAME, "wb")
     temp.write("%d,%d\n" % (nham, nspam))
     csv_data = {"this":(2,1), "is":(0,1), "a":(3,4), 'test':(1,1),
                 "of":(1,0), "the":(1,2), "import":(3,1)}
     for word, (ham, spam) in csv_data.items():
         temp.write("%s,%s,%s\n" % (word, ham, spam))
     temp.close()
     # newdbm=False requests a merge rather than a replacement.
     sb_dbexpimp.runImport(TEMP_PICKLE_NAME, "pickle", False,
                           TEMP_CSV_NAME)
     # Totals must be the sum of trained and imported message counts.
     bayes2 = open_storage(TEMP_PICKLE_NAME, "pickle")
     self.assertEqual(bayes2.nham, nham + bayes.nham)
     self.assertEqual(bayes2.nspam, nspam + bayes.nspam)
     # Every word from either source must appear with summed counts.
     # NOTE(review): assumes _wordinfokeys() returns a list (Python 2
     # behaviour) since it is extended in place -- confirm.
     words = bayes._wordinfokeys()
     words.extend(csv_data.keys())
     for word in words:
         word = sb_dbexpimp.uquote(word)
         self.assert_(word in bayes2._wordinfokeys())
         # Expected counts: CSV counts plus any original counts.
         h, s = csv_data.get(word, (0,0))
         wi = bayes._wordinfoget(word)
         if wi:
             h += wi.hamcount
             s += wi.spamcount
         wi2 = bayes2._wordinfoget(word)
         self.assertEqual(h, wi2.hamcount)
         self.assertEqual(s, wi2.spamcount)
 def test_dbm_export(self):
     """Exported CSV must be valid and contain all counts from the store."""
     # Create a dbm classifier to export.
     bayes = DBDictClassifier(TEMP_DBM_NAME)
     # Stuff some messages in it so it's not empty.
     bayes.learn(tokenize(spam1), True)
     bayes.learn(tokenize(good1), False)
     # Save & Close.
     bayes.store()
     bayes.close()
     # Export.
     sb_dbexpimp.runExport(TEMP_DBM_NAME, "dbm", TEMP_CSV_NAME)
     # Reopen the original.
     bayes = open_storage(TEMP_DBM_NAME, "dbm")
     # Verify that the CSV holds all the original data (and, by using
     # the CSV module to open it, that it is valid CSV data).
     fp = open(TEMP_CSV_NAME, "rb")
     reader = sb_dbexpimp.csv.reader(fp)
     # First row is the "nham,nspam" totals header (Python 2 .next()).
     (nham, nspam) = reader.next()
     self.assertEqual(int(nham), bayes.nham)
     self.assertEqual(int(nspam), bayes.nspam)
     # Each remaining row must match the stored per-word counts.
     for (word, hamcount, spamcount) in reader:
         word = sb_dbexpimp.uunquote(word)
         self.assert_(word in bayes._wordinfokeys())
         wi = bayes._wordinfoget(word)
         self.assertEqual(int(hamcount), wi.hamcount)
         self.assertEqual(int(spamcount), wi.spamcount)
示例#10
0
File: sb_server.py  Project: Xodarap/Eipi
    def createWorkers(self):
        """Using the options that were initialised in __init__ and then
        possibly overridden by the driver code, create the Bayes object,
        the Corpuses, the Trainers and so on."""
        print "Loading database...",
        if self.isTest:
            self.useDB = "pickle"
            self.DBName = '_pop3proxy_test.pickle'   # This is never saved.
        # Fall back to the configured database name/type when the
        # driver code didn't choose one.
        if not hasattr(self, "DBName"):
            self.DBName, self.useDB = storage.database_type([])
        self.bayes = storage.open_storage(self.DBName, self.useDB)

        self.buildStatusStrings()

        # Don't set up the caches and training objects when running the self-test,
        # so as not to clutter the filesystem.
        if not self.isTest:
            # Create the directory if it doesn't already exist.
            def ensureDir(dirname):
                try:
                    os.mkdir(dirname)
                except OSError, e:
                    # EEXIST just means the directory is already there;
                    # anything else is a real error.
                    if e.errno != errno.EEXIST:
                        raise

            # Create/open the Corpuses.  Use small cache sizes to avoid hogging
            # lots of memory.
            sc = get_pathname_option("Storage", "spam_cache")
            hc = get_pathname_option("Storage", "ham_cache")
            uc = get_pathname_option("Storage", "unknown_cache")
            map(ensureDir, [sc, hc, uc])
            if self.gzipCache:
                factory = GzipFileMessageFactory()
            else:
                factory = FileMessageFactory()
            age = options["Storage", "cache_expiry_days"]*24*60*60
            self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                               '[0123456789\-]*',
                                               cacheSize=20)
            self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                              '[0123456789\-]*',
                                              cacheSize=20)
            self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                                  '[0123456789\-]*',
                                                  cacheSize=20)

            # Given that (hopefully) users will get to the stage
            # where they do not need to do any more regular training to
            # be satisfied with spambayes' performance, we expire old
            # messages from not only the trained corpora, but the unknown
            # as well.
            self.spamCorpus.removeExpiredMessages()
            self.hamCorpus.removeExpiredMessages()
            self.unknownCorpus.removeExpiredMessages()

            # Create the Trainers.
            self.spamTrainer = storage.SpamTrainer(self.bayes)
            self.hamTrainer = storage.HamTrainer(self.bayes)
            self.spamCorpus.addObserver(self.spamTrainer)
            self.hamCorpus.addObserver(self.hamTrainer)
示例#11
0
File: hammie.py  Project: ArildF/rogie
def open(filename, useDB=True, mode="r"):
    """Return a Hammie instance for the given database file.

    If useDB is False, open as a pickle instead of a DBDict.  mode is
    used as the flag to open DBDict objects: 'c' for read-write
    (create if needed), 'r' for read-only, 'w' for read-write.
    """
    # Unlike other variants of this helper, the mode is not forwarded
    # to Hammie here -- only used when opening the storage.
    store = storage.open_storage(filename, useDB, mode)
    return Hammie(store)
示例#12
0
    def createWorkers(self):
        """Using the options that were initialised in __init__ and then
        possibly overridden by the driver code, create the Bayes object,
        the Corpuses, the Trainers and so on."""
        print "Loading database...",
        if self.isTest:
            self.useDB = "pickle"
            self.DBName = '_pop3proxy_test.pickle'   # This is never saved.
        # Fall back to the configured database name/type when the
        # driver code didn't choose one.
        if not hasattr(self, "DBName"):
            self.DBName, self.useDB = storage.database_type([])
        self.bayes = storage.open_storage(self.DBName, self.useDB)
        # Per-message metadata database shared with the Message class.
        self.mdb = spambayes.message.Message().message_info_db

        # Load stats manager.
        self.stats = Stats.Stats(options, self.mdb)

        self.buildStatusStrings()

        # Don't set up the caches and training objects when running the self-test,
        # so as not to clutter the filesystem.
        if not self.isTest:
            # Create/open the Corpuses.  Use small cache sizes to avoid hogging
            # lots of memory.
            sc = get_pathname_option("Storage", "spam_cache")
            hc = get_pathname_option("Storage", "ham_cache")
            uc = get_pathname_option("Storage", "unknown_cache")
            # Python 2 map() used for its side effect of creating dirs.
            map(storage.ensureDir, [sc, hc, uc])
            if self.gzipCache:
                factory = GzipFileMessageFactory()
            else:
                factory = FileMessageFactory()
            age = options["Storage", "cache_expiry_days"]*24*60*60
            self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                               '[0123456789\-]*',
                                               cacheSize=20)
            self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                              '[0123456789\-]*',
                                              cacheSize=20)
            self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                                  '[0123456789\-]*',
                                                  cacheSize=20)

            # Given that (hopefully) users will get to the stage
            # where they do not need to do any more regular training to
            # be satisfied with spambayes' performance, we expire old
            # messages from not only the trained corpora, but the unknown
            # as well.
            self.spamCorpus.removeExpiredMessages()
            self.hamCorpus.removeExpiredMessages()
            self.unknownCorpus.removeExpiredMessages()

            # Create the Trainers.
            self.spamTrainer = storage.SpamTrainer(self.bayes)
            self.hamTrainer = storage.HamTrainer(self.bayes)
            self.spamCorpus.addObserver(self.spamTrainer)
            self.hamCorpus.addObserver(self.hamTrainer)
示例#13
0
    def create_workers(self):
        """Build the Bayes object, stats manager, corpora and trainers
        from the options set up in __init__ (possibly overridden by the
        driver code)."""
        if self.is_test:
            self.use_db = "pickle"
            self.db_name = '_core_server.pickle'   # This is never saved.
        if not hasattr(self, "db_name"):
            self.db_name, self.use_db = storage.database_type([])
        self.bayes = storage.open_storage(self.db_name, self.use_db)

        # Load stats manager.
        self.stats = Stats.Stats(options,
                                 spambayes.message.Message().message_info_db)

        self.build_status_strings()

        # During the self-test, skip the caches and training objects so
        # we don't clutter the filesystem.
        if self.is_test:
            return

        # Create/open the Corpuses.  Small cache sizes keep memory
        # usage down.
        spam_dir = get_pathname_option("Storage", "core_spam_cache")
        ham_dir = get_pathname_option("Storage", "core_ham_cache")
        unknown_dir = get_pathname_option("Storage", "core_unknown_cache")
        for cache_dir in (spam_dir, ham_dir, unknown_dir):
            storage.ensureDir(cache_dir)
        factory = GzipFileMessageFactory() if self.gzip_cache else FileMessageFactory()
        age = options["Storage", "cache_expiry_days"]*24*60*60
        self.spamCorpus = ExpiryFileCorpus(age, factory, spam_dir,
                                           '[0123456789\-]*',
                                           cacheSize=20)
        self.hamCorpus = ExpiryFileCorpus(age, factory, ham_dir,
                                          '[0123456789\-]*',
                                          cacheSize=20)
        self.unknownCorpus = ExpiryFileCorpus(age, factory, unknown_dir,
                                              '[0123456789\-]*',
                                              cacheSize=20)

        # Users will (hopefully) reach a point where no further regular
        # training is needed, so expire old messages from the unknown
        # corpus as well as the trained ones.
        for corpus in (self.spamCorpus, self.hamCorpus, self.unknownCorpus):
            corpus.removeExpiredMessages()

        # Create the Trainers and attach them to their corpora.
        self.spam_trainer = storage.SpamTrainer(self.bayes)
        self.ham_trainer = storage.HamTrainer(self.bayes)
        self.spamCorpus.addObserver(self.spam_trainer)
        self.hamCorpus.addObserver(self.ham_trainer)
示例#14
0
    def test_newdb(self):
        """newdb() must wipe any existing training data of the same name."""
        # Train and persist a classifier under the target name.
        store = open_storage(TEMP_DBM_NAME, "dbm")
        store.learn(tokenize(spam1), True)
        store.learn(tokenize(good1), False)
        store.store()
        store.close()

        # Request a brand-new database.
        self.h.newdb()

        # The handler should not be left holding an open classifier.
        self.assertEqual(self.h.h, None)

        # The previous data must have been replaced by an empty store.
        store = open_storage(TEMP_DBM_NAME, "dbm")
        self.assertEqual(store.nham, 0)
        self.assertEqual(store.nspam, 0)
        store.close()
示例#15
0
    def test_newdb(self):
        """newdb() must close the handler and overwrite existing data."""
        # Create an existing classifier.
        b = open_storage(TEMP_DBM_NAME, "dbm")
        b.learn(tokenize(spam1), True)
        b.learn(tokenize(good1), False)
        b.store()
        b.close()

        # Create the fresh classifier.
        self.h.newdb()

        # Verify that the classifier isn't open.
        self.assertEqual(self.h.h, None)

        # Verify that any existing classifier with the same name
        # is overwritten.
        b = open_storage(TEMP_DBM_NAME, "dbm")
        self.assertEqual(b.nham, 0)
        self.assertEqual(b.nspam, 0)
        b.close()
示例#16
0
def run(bdbname, useDBM, ldbname, rdbname, foldname, doTrain, doClassify,
        pwd, idxname, logname):
    """Open the bayes store and the notes index for a train/classify run.

    NOTE(review): only the opening of this routine is visible here; the
    remaining parameters (ldbname, rdbname, foldname, ...) are
    presumably used further on -- confirm against the full source.
    """
    bayes = storage.open_storage(bdbname, useDBM)
    try:
        fp = open(idxname, 'rb')
    # Python 2 except syntax; a missing file is a first-time run.
    except IOError, e:
        if e.errno != errno.ENOENT:
            raise
        notesindex = {}
        print "%s file not found, this is a first time run" % (idxname,)
        print "No classification will be performed"
示例#17
0
 def createWorkers(self):
     """There aren't many workers in an IMAP State - most of the
     work is done elsewhere.  We do need to load the classifier,
     though, and build the status strings."""
     # Token (bayes) database: fall back to configured defaults.
     if not hasattr(self, "DBName"):
         self.DBName, self.useDB = storage.database_type([])
     self.bayes = storage.open_storage(self.DBName, self.useDB)
     # Message-info database.  BUGFIX: the original tested for the
     # misspelled attribute "MBDName", so any preset MDBName was
     # always clobbered by the default; test the attribute that is
     # actually assigned and used.
     if not hasattr(self, "MDBName"):
         self.MDBName, self.useMDB = message.database_type()
     self.mdb = message.open_storage(self.MDBName, self.useMDB)
     # Statistics tracker over the message-info database.
     self.stats = Stats(options, self.mdb)
     self.buildStatusStrings()
示例#18
0
def run(bdbname, useDBM, ldbname, rdbname, foldname, doTrain, doClassify,
        pwd, idxname, logname):
    """Open the bayes store and load the pickled notes index.

    NOTE(review): only the opening of this routine is visible here; the
    remaining parameters are presumably used further on -- confirm
    against the full source.
    """
    bayes = storage.open_storage(bdbname, useDBM)

    try:
        notesindex = pickle_read(idxname)
    # Python 2 except syntax; a missing file is a first-time run.
    except IOError, e:
        if e.errno != errno.ENOENT:
            raise
        notesindex = {}
        print "%s file not found, this is a first time run" % (idxname,)
        print "No classification will be performed"
示例#19
0
 def test_merge_to_dbm(self):
     """Importing a CSV into an existing dbm store must merge counts."""
     # Create a dbm classifier to merge with.
     bayes = DBDictClassifier(TEMP_DBM_NAME)
     # Stuff some messages in it so it's not empty.
     bayes.learn(tokenize(spam1), True)
     bayes.learn(tokenize(good1), False)
     # Save data to check against.
     original_nham = bayes.nham
     original_nspam = bayes.nspam
     original_data = {}
     for key in bayes._wordinfokeys():
         original_data[key] = bayes._wordinfoget(key)
     # Save & Close.
     bayes.store()
     bayes.close()
     # Create a CSV file to import.
     nham, nspam = 3, 4
     temp = open(TEMP_CSV_NAME, "wb")
     temp.write("%d,%d\n" % (nham, nspam))
     csv_data = {
         "this": (2, 1),
         "is": (0, 1),
         "a": (3, 4),
         'test': (1, 1),
         "of": (1, 0),
         "the": (1, 2),
         "import": (3, 1)
     }
     for word, (ham, spam) in csv_data.items():
         temp.write("%s,%s,%s\n" % (word, ham, spam))
     temp.close()
     # newdbm=False requests a merge rather than a replacement.
     sb_dbexpimp.runImport(TEMP_DBM_NAME, "dbm", False, TEMP_CSV_NAME)
     # Open the converted file and verify that it has all the data from
     # the CSV file (and by opening it, that it is a valid dbm file),
     # and the data from the original dbm database.
     bayes2 = open_storage(TEMP_DBM_NAME, "dbm")
     self.assertEqual(bayes2.nham, nham + original_nham)
     self.assertEqual(bayes2.nspam, nspam + original_nspam)
     # NOTE: keys()[:] relies on Python 2's list-returning keys().
     words = original_data.keys()[:]
     words.extend(csv_data.keys())
     for word in words:
         word = sb_dbexpimp.uquote(word)
         self.assert_(word in bayes2._wordinfokeys())
         # Expected counts: CSV counts plus any original counts.
         h, s = csv_data.get(word, (0, 0))
         wi = original_data.get(word, None)
         if wi:
             h += wi.hamcount
             s += wi.spamcount
         wi2 = bayes2._wordinfoget(word)
         self.assertEqual(h, wi2.hamcount)
         self.assertEqual(s, wi2.spamcount)
示例#20
0
    def testNoDBMAvailable(self):
        """open_storage() should exit (SystemExit) when no dbm can load."""
        import tempfile
        from spambayes.storage import open_storage

        db_name = tempfile.mktemp("nodbmtest")
        # Patch the classifier's load() so it fails as if no dbm module
        # were available; keep the original for restoration.
        DBDictClassifier_load = DBDictClassifier.load
        DBDictClassifier.load = self._fail_open_best
        # Redirect sys.stderr, as open_storage() prints a msg to stderr.
        # Then it does sys.exit(), which we catch.
        sys_stderr = sys.stderr
        sys.stderr = StringIO.StringIO()
        try:
            try:
                open_storage(db_name, "dbm")
            except SystemExit:
                pass
            else:
                self.fail("expected SystemExit from open_storage() call")
        finally:
            # Always restore the patched method and stderr.
            DBDictClassifier.load = DBDictClassifier_load
            sys.stderr = sys_stderr

        # Remove any file the aborted open may have left behind.
        if os.path.isfile(db_name):
            os.remove(db_name)
示例#21
0
 def testNoDBMAvailable(self):
     """open_storage() should call sys.exit() when no dbm can load."""
     import tempfile
     from spambayes.storage import open_storage
     # Patch the classifier's load() to fail, and sys.exit to record
     # that it was called; keep originals for restoration.
     DBDictClassifier_load = DBDictClassifier.load
     DBDictClassifier.load = self.fail_open_best
     sys_exit = sys.exit
     sys.exit = self.success
     self.succeeded = False
     db_name = tempfile.mktemp("nodbmtest")
     # Opening should trigger the patched sys.exit via self.success.
     s = open_storage(db_name, True)
     # Restore the patched attributes.
     DBDictClassifier.load = DBDictClassifier_load
     sys.exit = sys_exit
     if not self.succeeded:
         self.fail()
     # Remove any file the aborted open may have left behind.
     if os.path.isfile(db_name):
         os.remove(db_name)
示例#22
0
    def createWorkers(self):
        """There aren't many workers in an IMAP State - most of the
        work is done elsewhere.  We do need to load the classifier,
        though, and build the status strings."""
        # Load token and message databases.
        if not hasattr(self, "DBName"):
            self.DBName, self.useDB = storage.database_type([])
        self.bayes = storage.open_storage(self.DBName, self.useDB)
        # BUGFIX: the original tested for the misspelled attribute
        # "MBDName", so any preset MDBName was always clobbered by the
        # default; test the attribute that is actually assigned/used.
        if not hasattr(self, "MDBName"):
            self.MDBName, self.useMDB = message.database_type()
        self.mdb = message.open_storage(self.MDBName, self.useMDB)

        # Load stats manager.
        self.stats = Stats(options, self.mdb)

        # Build status strings.
        self.buildStatusStrings()
示例#23
0
 def main():
    """Main program; parse options and start the XML-RPC Hammie server."""
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hd:p:o:')
    except getopt.error as msg:
        usage(2, msg)
    options = Options.options
    for opt, arg in opts:
        if opt == '-h':
            usage(0)
        elif opt == '-o':
            options.set_from_cmdline(arg, sys.stderr)
    # -d/-p (consumed by database_type) select the database file/type.
    dbname, usedb = storage.database_type(opts)
    if len(args) != 1:
        usage(2, "IP:PORT not specified")
    ip, port = args[0].split(":")
    port = int(port)
    # Wrap the classifier in an XML-RPC-capable Hammie and serve forever.
    bayes = storage.open_storage(dbname, usedb)
    h = XMLHammie(bayes)
    server = ReusableSimpleXMLRPCServer(
        (ip, port),
        xmlrpc.server.SimpleXMLRPCRequestHandler)
    server.register_instance(h)
    server.serve_forever()
示例#24
0
 def test_import_to_dbm(self):
     """Importing a CSV into a new dbm store must copy every count."""
     # Write the CSV fixture: totals header then word,ham,spam rows.
     temp = open(TEMP_CSV_NAME, "wb")
     temp.write("3,4\n")
     csv_data = {"this":(2,1), "is":(0,1), "a":(3,4), 'test':(1,1),
                 "of":(1,0), "the":(1,2), "import":(3,1)}
     for word, (ham, spam) in csv_data.items():
         temp.write("%s,%s,%s\n" % (word, ham, spam))
     temp.close()
     # newdbm=True replaces any existing store.
     sb_dbexpimp.runImport(TEMP_DBM_NAME, "dbm", True, TEMP_CSV_NAME)
     # Reopening proves the result is a valid dbm file.
     bayes = open_storage(TEMP_DBM_NAME, "dbm")
     self.assertEqual(bayes.nham, 3)
     self.assertEqual(bayes.nspam, 4)
     # Every imported word must be present with identical counts.
     for word, (ham, spam) in csv_data.items():
         word = sb_dbexpimp.uquote(word)
         self.assert_(word in bayes._wordinfokeys())
         wi = bayes._wordinfoget(word)
         self.assertEqual(wi.hamcount, ham)
         self.assertEqual(wi.spamcount, spam)
示例#25
0
 def test_merge_to_dbm(self):
     """Importing a CSV into an existing dbm store must merge counts."""
     # Create a dbm classifier to merge with.
     bayes = DBDictClassifier(TEMP_DBM_NAME)
     # Stuff some messages in it so it's not empty.
     bayes.learn(tokenize(spam1), True)
     bayes.learn(tokenize(good1), False)
     # Save data to check against.
     original_nham = bayes.nham
     original_nspam = bayes.nspam
     original_data = {}
     for key in bayes._wordinfokeys():
         original_data[key] = bayes._wordinfoget(key)
     # Save & Close.
     bayes.store()
     bayes.close()
     # Create a CSV file to import.
     nham, nspam = 3,4
     temp = open(TEMP_CSV_NAME, "wb")
     temp.write("%d,%d\n" % (nham, nspam))
     csv_data = {"this":(2,1), "is":(0,1), "a":(3,4), 'test':(1,1),
                 "of":(1,0), "the":(1,2), "import":(3,1)}
     for word, (ham, spam) in csv_data.items():
         temp.write("%s,%s,%s\n" % (word, ham, spam))
     temp.close()
     # newdbm=False requests a merge rather than a replacement.
     sb_dbexpimp.runImport(TEMP_DBM_NAME, "dbm", False, TEMP_CSV_NAME)
     # Open the converted file and verify that it has all the data from
     # the CSV file (and by opening it, that it is a valid dbm file),
     # and the data from the original dbm database.
     bayes2 = open_storage(TEMP_DBM_NAME, "dbm")
     self.assertEqual(bayes2.nham, nham + original_nham)
     self.assertEqual(bayes2.nspam, nspam + original_nspam)
     # NOTE: keys()[:] relies on Python 2's list-returning keys().
     words = original_data.keys()[:]
     words.extend(csv_data.keys())
     for word in words:
         word = sb_dbexpimp.uquote(word)
         self.assert_(word in bayes2._wordinfokeys())
         # Expected counts: CSV counts plus any original counts.
         h, s = csv_data.get(word, (0,0))
         wi = original_data.get(word, None)
         if wi:
             h += wi.hamcount
             s += wi.spamcount
         wi2 = bayes2._wordinfoget(word)
         self.assertEqual(h, wi2.hamcount)
         self.assertEqual(s, wi2.spamcount)
示例#26
0
 def test_dbm_export(self):
     """Exported CSV must be valid and contain all counts from the store."""
     # Train a dbm classifier so there is something to export.
     bayes = DBDictClassifier(TEMP_DBM_NAME)
     bayes.learn(tokenize(spam1), True)
     bayes.learn(tokenize(good1), False)
     bayes.store()
     bayes.close()
     # Export to CSV, then reopen the original store for comparison.
     sb_dbexpimp.runExport(TEMP_DBM_NAME, "dbm", TEMP_CSV_NAME)
     bayes = open_storage(TEMP_DBM_NAME, "dbm")
     fp = open(TEMP_CSV_NAME, "rb")
     reader = sb_dbexpimp.csv.reader(fp)
     # First row is the "nham,nspam" totals header (Python 2 .next()).
     (nham, nspam) = reader.next()
     self.assertEqual(int(nham), bayes.nham)
     self.assertEqual(int(nspam), bayes.nspam)
     # Each remaining row must match the stored per-word counts.
     for (word, hamcount, spamcount) in reader:
         word = sb_dbexpimp.uunquote(word)
         self.assert_(word in bayes._wordinfokeys())
         wi = bayes._wordinfoget(word)
         self.assertEqual(int(hamcount), wi.hamcount)
         self.assertEqual(int(spamcount), wi.spamcount)
示例#27
0
def hammer():
    """Trains and classifies repeatedly."""
    # Rebinds the module-level bayes when re-opening the database.
    global bayes
    wellFlushed = False
    for i in range(1, 1000000):
        # Train on a random ham/spam message.
        isSpam = random.choice([True, False])
        train(makeMessage(isSpam), isSpam)
        # Roughly every thousand iterations, flush the store to disk.
        if random.randrange(1000) == 1:
            print "Flushing."
            bayes.store()
            if i > 500:
                wellFlushed = True
        # Classify a fresh random message.
        isSpam = random.choice([True, False])
        prob = classify(makeMessage(isSpam))
        if i < 10 or i % 100 == 0:
            print "%6.6d: %d, %.4f" % (i, isSpam, prob)
        # Occasionally re-open the database without closing it first,
        # but only after it has been flushed at least once.
        if wellFlushed and random.randrange(1000) == 1:
            print "Re-opening."
            bayes = storage.open_storage(FILENAME, True)
示例#28
0
 def createWorkers(self):
     """Using the options that were initialised in __init__ and then
     possibly overridden by the driver code, create the Bayes object,
     the Corpuses, the Trainers and so on."""
     print("Loading database...", end=' ')
     if self.isTest:
         self.useDB = "pickle"
         self.DBName = '_pop3proxy_test.pickle'   # This is never saved.
     # Fall back to the configured database name/type when unset.
     if not hasattr(self, "DBName"):
         self.DBName, self.useDB = storage.database_type([])
     self.bayes = storage.open_storage(self.DBName, self.useDB)
     # Message-info database and the stats manager built on top of it.
     self.mdb = spambayes.message.Message().message_info_db
     self.stats = Stats.Stats(options, self.mdb)
     self.buildStatusStrings()
     # Skip cache/training setup during the self-test so as not to
     # clutter the filesystem.
     if not self.isTest:
         # Create/open the corpora; small cache sizes limit memory use.
         sc = get_pathname_option("Storage", "spam_cache")
         hc = get_pathname_option("Storage", "ham_cache")
         uc = get_pathname_option("Storage", "unknown_cache")
         for d in [sc, hc, uc]:
             storage.ensureDir(d)
         if self.gzipCache:
             factory = GzipFileMessageFactory()
         else:
             factory = FileMessageFactory()
         age = options["Storage", "cache_expiry_days"]*24*60*60
         self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                            '[0123456789\-]*',
                                            cacheSize=20)
         self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                           '[0123456789\-]*',
                                           cacheSize=20)
         self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                               '[0123456789\-]*',
                                               cacheSize=20)
         # Expire old messages from all three corpora, including the
         # unknown one.
         self.spamCorpus.removeExpiredMessages()
         self.hamCorpus.removeExpiredMessages()
         self.unknownCorpus.removeExpiredMessages()
         # Create the trainers and attach them to their corpora.
         self.spamTrainer = storage.SpamTrainer(self.bayes)
         self.hamTrainer = storage.HamTrainer(self.bayes)
         self.spamCorpus.addObserver(self.spamTrainer)
         self.hamCorpus.addObserver(self.hamTrainer)
示例#29
0
 def test_import_to_dbm(self):
     """Importing a CSV into a new dbm store must copy every count."""
     # Create a CSV file to import: totals header then word,ham,spam rows.
     temp = open(TEMP_CSV_NAME, "wb")
     temp.write("3,4\n")
     csv_data = {"this":(2,1), "is":(0,1), "a":(3,4), 'test':(1,1),
                 "of":(1,0), "the":(1,2), "import":(3,1)}
     for word, (ham, spam) in csv_data.items():
         temp.write("%s,%s,%s\n" % (word, ham, spam))
     temp.close()
     # newdbm=True replaces any existing store.
     sb_dbexpimp.runImport(TEMP_DBM_NAME, "dbm", True, TEMP_CSV_NAME)
     # Open the converted file and verify that it has all the data from
     # the CSV file (and by opening it, that it is a valid dbm file).
     bayes = open_storage(TEMP_DBM_NAME, "dbm")
     self.assertEqual(bayes.nham, 3)
     self.assertEqual(bayes.nspam, 4)
     # Every imported word must be present with identical counts.
     for word, (ham, spam) in csv_data.items():
         word = sb_dbexpimp.uquote(word)
         self.assert_(word in bayes._wordinfokeys())
         wi = bayes._wordinfoget(word)
         self.assertEqual(wi.hamcount, ham)
         self.assertEqual(wi.spamcount, spam)
示例#30
0
File: hammer.py  Project: Xodarap/Eipi
def hammer():
    """Trains and classifies repeatedly."""
    global bayes
    wellFlushed = False
    for i in range(1, 1000000):
        # Train.
        isSpam = random.choice([True, False])
        train(makeMessage(isSpam), isSpam)

        # Every thousand messages or so, flush the DB to disk.
        if random.randrange(1000) == 1:
            print "Flushing."
            bayes.store()
            if i > 500:
                wellFlushed = True

        # Classify.
        isSpam = random.choice([True, False])
        prob = classify(makeMessage(isSpam))
        if i < 10 or i % 100 == 0:
            print "%6.6d: %d, %.4f" % (i, isSpam, prob)

        # Every thousand messages or so, reopen the DB without closing it.
        # The way this works will open the new instance before the existing
        # one goes away, which can cause a DBRunRecoveryError.  Versions up
        # to 1.0a5 had a bug in that did this, but people were still
        # reporting DBRunRecoveryErrors in 1.0a6, so I don't think we can
        # call it fixed.

        # We don't do this within the first few hundred messages, or before
        # the DB has been flushed, because that can give a "hamcount > nham"
        # error.  Despite this, you still see those errors.  Either I've got
        # something badly wrong, or they're the result of corrupt databases
        # that aren't caught by bsddb and turned into DBRunRecoveryErrors.
        if wellFlushed and random.randrange(1000) == 1:
            print "Re-opening."
            bayes = storage.open_storage(FILENAME, True)
Example #31
0
File: hammer.py  Project: Xodarap/Eipi
__author__ = "Richie Hindle <*****@*****.**>"

# Skeleton for the headers of generated test messages; filled in via
# %-formatting with To/From/Subject/Date values.
headerTemplate = """To: %(To)s
From: %(From)s
Subject: %(Subject)s
Date: %(Date)s

"""

# Create a fresh bayes object to train and classify.  Remove any
# database left over from a previous run so each run starts from an
# empty classifier.
FILENAME = "__hammer.db"
try:
    os.remove(FILENAME)
except OSError:
    # File did not exist (or could not be removed); proceed anyway.
    pass
bayes = storage.open_storage(FILENAME, True)


def train(text, isSpam):
    """Trains the classifier on the given text."""
    bayes.learn(tokenizer.tokenize(text), isSpam)


def classify(text):
    """Classifies the given text, returning the spamprob."""
    return bayes.spamprob(tokenizer.tokenize(text))


def makeMessage(isSpam):
 def open_spamdb(self, request):
     """Open the spam classifier on first use.

     Locates the pickle next to the wiki's event-log page file, wraps
     it in a Hammie instance, and registers a shutdown hook to close
     it.  Does nothing if the database is already open.
     """
     if self.sbayes is not None:
         return
     event_log = request.rootpage.getPagePath('event-log', isfile=1)
     spam_db = os.path.join(os.path.dirname(event_log), self.spam_db)
     self.sbayes = Hammie(storage.open_storage(spam_db, "pickle", 'c'))
     atexit.register(self.close_spamdb)
Example #33
0
def run(bdbname, useDBM, ldbname, rdbname, foldname, doTrain, doClassify,
        pwd, idxname, logname):
    """Train and/or classify mail in a Lotus Notes database.

    bdbname/useDBM identify the SpamBayes store; ldbname/rdbname are
    the local and remote (server) Notes databases; foldname is the
    folder holding the Spam/Ham/training views; doTrain/doClassify
    select the work to do; pwd is the Notes session password; idxname
    is the pickle of the notes index; logname is the Notes log file.
    """
    bayes = storage.open_storage(bdbname, useDBM)
    # A missing index pickle just means a first-time run: start with an
    # empty index and warn that no classification will happen.
    try:
        notesindex = pickle_read(idxname)
    except IOError as e:
        if e.errno != errno.ENOENT:
            raise
        notesindex = {}
        print("%s file not found, this is a first time run" % (idxname,))
        print("No classification will be performed")
    need_replicate = False
    sess = win32com.client.Dispatch("Lotus.NotesSession")
    try:
        if pwd:
            sess.initialize(pwd)
        else:
            sess.initialize()
    except pywintypes.com_error:
        print("Session aborted")
        sys.exit()
    # Prefer the remote (server) database; fall back to the local
    # replica and remember to replicate our changes back afterwards.
    try:
        db = sess.GetDatabase(rdbname, ldbname)
    except pywintypes.com_error:
        if rdbname:
            print("Could not open database remotely, trying locally")
            try:
                db = sess.GetDatabase("", ldbname)
                need_replicate = True
            except pywintypes.com_error:
                print("Could not open database")
                sys.exit()
        else:
            raise
    # Logging to Notes is best-effort: continue without it on failure.
    log = sess.CreateLog("SpambayesAgentLog")
    try:
        log.OpenNotesLog("", logname)
    except pywintypes.com_error:
        print("Could not open log")
        log = None
    if log:
        log.LogAction("Running spambayes")
    # Views used for training and classification inside foldname.
    vinbox = db.getView('($Inbox)')
    vspam = db.getView("%s\Spam" % (foldname,))
    vham = db.getView("%s\Ham" % (foldname,))
    vtrainspam = db.getView("%s\Train as Spam" % (foldname,))
    vtrainham = db.getView("%s\Train as Ham" % (foldname,))
    if doTrain:
        processAndTrain(vtrainspam, vspam, bayes, True, notesindex, log)
        processAndTrain(vtrainham, vham, bayes, False, notesindex, log)
    if need_replicate:
        try:
            print("Replicating...")
            db.Replicate(rdbname)
            print("Done")
        except pywintypes.com_error:
            print("Could not replicate")
    if doClassify:
        classifyInbox(vinbox, vtrainspam, bayes, ldbname, notesindex, log)
    print("The Spambayes database currently has %s Spam and %s Ham" \
          % (bayes.nspam, bayes.nham))
    # Persist both the classifier and the processed-notes index.
    bayes.store()
    pickle_write(idxname, notesindex)
    if log:
        log.LogAction("Finished running spambayes")
Example #34
0
        elif opt == '--ratio':
            arg = arg.split(":")
            sh_ratio = (int(arg[0]), int(arg[1]))

    if ham is None or spam is None:
        usage("require both ham and spam piles")
        return 1

    dbname, usedb = storage.database_type(opts)

    try:
        os.unlink(dbname)
    except OSError:
        pass

    store = storage.open_storage(dbname, usedb)

    tdict = {}
    train(store, ham, spam, maxmsgs, maxrounds, tdict, reverse, verbose,
          sh_ratio)

    store.store()
    store.close()

    if cullext is not None:
        cull(ham, cullext, 'ham', tdict)
        cull(spam, cullext, 'spam', tdict)

    return 0

        opts, args = getopt.getopt(sys.argv[1:], 'hd:p:o:')
    except getopt.error, msg:
        usage(2, msg)

    options = Options.options

    for opt, arg in opts:
        if opt == '-h':
            usage(0)
        elif opt == '-o':
            options.set_from_cmdline(arg, sys.stderr)
    dbname, usedb = storage.database_type(opts)

    if len(args) != 1:
        usage(2, "IP:PORT not specified")

    ip, port = args[0].split(":")
    port = int(port)

    bayes = storage.open_storage(dbname, usedb)
    h = XMLHammie(bayes)

    server = ReusableSimpleXMLRPCServer(
        (ip, port), SimpleXMLRPCServer.SimpleXMLRPCRequestHandler)
    server.register_instance(h)
    server.serve_forever()


if __name__ == "__main__":
    main()
Example #36
0
 def open_spamdb(self, request):
     """Lazily open the spam classifier database.

     On first call, locates the pickle next to the wiki's event-log
     page file, wraps it in a Hammie instance, and registers a hook
     to close it at interpreter exit.  Later calls are no-ops.
     """
     if self.sbayes is None:
         event_log = request.rootpage.getPagePath('event-log', isfile=1)
         spam_db = os.path.join(os.path.dirname(event_log), self.spam_db)
         # 'c' mode: open read-write, creating the pickle if needed.
         self.sbayes = Hammie(storage.open_storage(spam_db, "pickle", 'c'))
         atexit.register(self.close_spamdb)
Example #37
0
    except getopt.GetoptError, msg:
        usage(msg)
        return 1
    usere = False
    tokenizestdin = False
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        elif opt in ("-r", "--re"):
            usere = True
        elif opt in ("-t", "--tokenize"):
            tokenizestdin = True
        elif opt in ('-o', '--option'):
            options.set_from_cmdline(arg, sys.stderr)
    if usere and tokenizestdin:
        usage("-r and -t may not be used at the same time")
        return 1
    dbname, usedb = database_type(opts)
    db = open_storage(dbname, usedb)
    if tokenizestdin:
        args = tokenize(sys.stdin)
    if args:
        print_spamcounts(args, db, usere)
        return 0
    else:
        usage("need tokens on cmd line or -t w/ msg on stdin")
        return 1
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Example #38
0
File: wumi.py  Project: gonter/wumi
import os
import os.path
import sys
import syslog
import tempfile
import types
import traceback
import zipfile
import zlib
from email import Errors
from email import Message
import email
from threading import Lock
from spambayes import storage
from spambayes import hammie
# Open the shared SpamBayes classifier once at import time.
# NOTE(review): the database path is hard-coded — presumably specific
# to this deployment; confirm before reuse.
bayes = storage.open_storage('/home/georg/hammie.db', 'dbm')
scoremaster = hammie.Hammie(bayes)
score = scoremaster.score  # bound method exposed as a plain function

import Milter

from posix import getloadavg

import cfg

# Tune this?
configfile = "/etc/mail/wumi.cf"

# TODO - find out the truth about loadconfig ...

Example #39
0
 def change_db():
     """Reopen the classifier and rebuild the IMAP filter around it.

     NOTE(review): relies on names from the enclosing scope (`opts`,
     `message_db`, `message`, `IMAPFilter`) — confirm they are in
     scope at the call site.  `imap_filter` is rebound locally here,
     so the enclosing binding is presumably updated elsewhere; verify.
     """
     classifier = storage.open_storage(*storage.database_type(opts))
     message.Message.message_info_db = message_db
     imap_filter = IMAPFilter(classifier, message_db)
Example #40
0
         if arg == 'y':
             doExpunge = True
         else:
             doExpunge = False
     elif opt == '-i':
         imapDebug = int(arg)
     elif opt == '-l':
         sleepTime = int(arg) * 60
     elif opt == '-o':
         options.set_from_cmdline(arg, sys.stderr)
 bdbname, useDBM = storage.database_type(opts)
 v = get_current_version();
 print "%s.\n" % (v.get_long_version("SpamBayes IMAP Filter"),)
 if options["globals", "verbose"]:
     print "Loading database %s..." % (bdbname),
 classifier = storage.open_storage(bdbname, useDBM)
 message_db = message.Message().message_info_db
 if options["globals", "verbose"]:
     print "Done."
 if not ( launchUI or force_UI or options["imap", "server"] ):
     print "You need to specify both a server and a username."
     sys.exit()
 servers_data = servers(promptForPass)
 stats = Stats.Stats(options, message_db)
 imap_filter = IMAPFilter(classifier, stats)
 if sleepTime or not (doClassify or doTrain):
     imaps = []
     for server, username, password in servers_data:
         if server == "":
             imaps.append(None)
         else:
Example #41
0
    cdbname = args[0]

    dbname = usedb = None

    for opt, arg in opts:

        if opt in ("-h", "--help"):

            usage()

            return 0

    dbname, usedb = storage.database_type(opts)

    store = storage.open_storage(dbname, usedb)

    bayes = CdbClassifier()

    items = []

    for word in store._wordinfokeys():

        record = store._wordinfoget(word)

        prob = store.probability(record)

        items.append((word, str(prob)))

    cdbfile = open(cdbname, "wb")
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hd:p:o:")
    except getopt.error, msg:
        usage(2, msg)

    options = Options.options

    for opt, arg in opts:
        if opt == "-h":
            usage(0)
        elif opt == "-o":
            options.set_from_cmdline(arg, sys.stderr)
    dbname, usedb = storage.database_type(opts)

    if len(args) != 1:
        usage(2, "IP:PORT not specified")

    ip, port = args[0].split(":")
    port = int(port)

    bayes = storage.open_storage(dbname, usedb)
    h = XMLHammie(bayes)

    server = ReusableSimpleXMLRPCServer((ip, port), SimpleXMLRPCServer.SimpleXMLRPCRequestHandler)
    server.register_instance(h)
    server.serve_forever()


if __name__ == "__main__":
    main()
Example #43
0
        if opt in ("-h", "--help"):
            usage()
            return 0
        elif opt in ("-r", "--re"):
            usere = True
        elif opt in ("-t", "--tokenize"):
            tokenizestdin = True
        elif opt in ('-o', '--option'):
            options.set_from_cmdline(arg, sys.stderr)

    if usere and tokenizestdin:
        usage("-r and -t may not be used at the same time")
        return 1

    dbname, usedb = database_type(opts)
    db = open_storage(dbname, usedb)

    if tokenizestdin:
        args = tokenize(sys.stdin)

    if args:
        print_spamcounts(args, db, usere)
        return 0
    else:
        usage("need tokens on cmd line or -t w/ msg on stdin")
        return 1


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))