Пример #1
0
 def drive(nsets):

    print options.display()

    spamdirs = [get_pathname_option("TestDriver", "spam_directories") % \
                i for i in range(1, nsets+1)]

    hamdirs  = [get_pathname_option("TestDriver", "ham_directories") % \
                i for i in range(1, nsets+1)]

    spamhamdirs = zip(spamdirs, hamdirs)

    d = TestDriver.Driver()

    for spamdir, hamdir in spamhamdirs:

        d.new_classifier()

        d.train(msgs.HamStream(hamdir, [hamdir]),
                msgs.SpamStream(spamdir, [spamdir]))

        for sd2, hd2 in spamhamdirs:

            if (sd2, hd2) == (spamdir, hamdir):

                continue

            d.test(msgs.HamStream(hd2, [hd2]),
                   msgs.SpamStream(sd2, [sd2]))

        d.finishtest()

    d.alldone()
Пример #2
0
def drive(nsets):
    print options.display()
    hamdirs  = [get_pathname_option("TestDriver", "ham_directories") % \
                i for i in range(1, nsets+1)]
    spamdirs = [get_pathname_option("TestDriver", "spam_directories") % \
                i for i in range(1, nsets+1)]
    d = TestDriver.Driver()
    d.train(msgs.HamStream("%s-%d" % (hamdirs[1], nsets),
                            hamdirs[1:], train=1),
            msgs.SpamStream("%s-%d" % (spamdirs[1], nsets),
                            spamdirs[1:], train=1))
    for i in range(nsets):
        h = hamdirs[i]
        s = spamdirs[i]
        hamstream = msgs.HamStream(h, [h], train=0)
        spamstream = msgs.SpamStream(s, [s], train=0)
        if i > 0:
            if options["CV Driver", "build_each_classifier_from_scratch"]:
                d.new_classifier()
                hname = "%s-%d, except %d" % (hamdirs[0], nsets, i+1)
                h2 = hamdirs[:]
                del h2[i]
                sname = "%s-%d, except %d" % (spamdirs[0], nsets, i+1)
                s2 = spamdirs[:]
                del s2[i]
                d.train(msgs.HamStream(hname, h2, train=1),
                        msgs.SpamStream(sname, s2, train=1))
            else:
                d.untrain(hamstream, spamstream)
        d.test(hamstream, spamstream)
        d.finishtest()
        if i < nsets - 1 and not options["CV Driver",
                                         "build_each_classifier_from_scratch"]:
            d.train(hamstream, spamstream)
    d.alldone()
Пример #3
0
def unlearn_compare(nsets, unsets):
    print options.display()

    spamdirs = [get_pathname_option("TestDriver", "spam_directories") % i for i in range(1, nsets+1)]
    hamdirs = [get_pathname_option("TestDriver", "ham_directories") % i for i in range(1, nsets+1)]
    spamhamdirs = zip(spamdirs, hamdirs)
    unspamdirs = [get_pathname_option("TestDriver", "spam_directories") % i for i in range(1, unsets+1)]
    unhamdirs = [get_pathname_option("TestDriver", "ham_directories") % i for i in range(1, unsets+1)]
    unspamhamdirs = zip(unspamdirs, unhamdirs)

    d = TestDriver.Driver()
    d.new_classifier()
    """
    for spamdir, hamdir in spamhamdirs:
        d.train(msgs.HamStream(hamdir, [hamdir]),
                msgs.SpamStream(spamdir, [spamdir]))
    """
    d.train(msgs.HamStream(hamdirs[0], [hamdirs[0]]),
            msgs.SpamStream(spamdirs[0], [spamdirs[0]]))
    d.train(msgs.HamStream(hamdirs[1], [hamdirs[1]]),
            msgs.SpamStream(spamdirs[1], [spamdirs[1]]))
    d.test(msgs.HamStream(hamdirs[2], [hamdirs[2]]),
           msgs.SpamStream(spamdirs[2], [spamdirs[2]]))
    d.finishtest()
    d.alldone()

    unlearn_driver(d, spamhamdirs, unspamhamdirs)
Пример #4
0
    def createWorkers(self):
        """Using the options that were initialised in __init__ and then
        possibly overridden by the driver code, create the Bayes object,
        the Corpuses, the Trainers and so on."""
        print "Loading database...",
        if self.isTest:
            self.useDB = "pickle"
            self.DBName = '_pop3proxy_test.pickle'   # This is never saved.
        if not hasattr(self, "DBName"):
            self.DBName, self.useDB = storage.database_type([])
        self.bayes = storage.open_storage(self.DBName, self.useDB)
        
        self.buildStatusStrings()

        # Don't set up the caches and training objects when running the self-test,
        # so as not to clutter the filesystem.
        if not self.isTest:
            def ensureDir(dirname):
                try:
                    os.mkdir(dirname)
                except OSError, e:
                    if e.errno != errno.EEXIST:
                        raise

            # Create/open the Corpuses.  Use small cache sizes to avoid hogging
            # lots of memory.
            sc = get_pathname_option("Storage", "spam_cache")
            hc = get_pathname_option("Storage", "ham_cache")
            uc = get_pathname_option("Storage", "unknown_cache")
            map(ensureDir, [sc, hc, uc])
            if self.gzipCache:
                factory = GzipFileMessageFactory()
            else:
                factory = FileMessageFactory()
            age = options["Storage", "cache_expiry_days"]*24*60*60
            self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                               '[0123456789\-]*',
                                               cacheSize=20)
            self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                              '[0123456789\-]*',
                                              cacheSize=20)
            self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                                  '[0123456789\-]*',
                                                  cacheSize=20)

            # Given that (hopefully) users will get to the stage
            # where they do not need to do any more regular training to
            # be satisfied with spambayes' performance, we expire old
            # messages from not only the trained corpora, but the unknown
            # as well.
            self.spamCorpus.removeExpiredMessages()
            self.hamCorpus.removeExpiredMessages()
            self.unknownCorpus.removeExpiredMessages()

            # Create the Trainers.
            self.spamTrainer = storage.SpamTrainer(self.bayes)
            self.hamTrainer = storage.HamTrainer(self.bayes)
            self.spamCorpus.addObserver(self.spamTrainer)
            self.hamCorpus.addObserver(self.hamTrainer)
def drive():
    print options.display()

    spam = [get_pathname_option("TestDriver", "spam_directories") % i for i in range(1, 5)]
    ham = [get_pathname_option("TestDriver", "ham_directories") % i for i in range(1, 5)]

    d = dictionarywriter.DictionaryWriter(150, 4)
    d.write()

    keep_going = True
    trial_number = 1

    au = ActiveUnlearnDriver.ActiveUnlearner([msgs.HamStream(ham[1], [ham[1]]),
                                              msgs.HamStream(ham[2], [ham[2]])],
                                             [msgs.SpamStream(spam[1], [spam[1]]),
                                              msgs.SpamStream(spam[3], [spam[3]])],
                                             msgs.HamStream(ham[0], [ham[0]]),
                                             msgs.SpamStream(spam[0], [spam[0]]),
                                             )
    with open("C:\Users\Alex\Desktop\dict_correlation_stats.txt", 'w') as outfile:

        while keep_going:
            chosen = set()
            current = au.select_initial()
            cluster = au.determine_cluster(current)
            chosen.add(current)
            au.driver.test(au.testing_ham, au.testing_spam)

            while not cluster:
                current = au.select_initial(chosen)
                cluster = au.determine_cluster(current)
                chosen.add(current)
                au.driver.test(au.testing_ham, au.testing_spam)

            cluster_list = list(cluster.cluster_set)

            dicts = au.driver.tester.train_examples[2]

            data = v_correlation(cluster_list, dicts)

            outfile.write("Trial " + str(trial_number) + " Percentage Overlap (Correlation): " + str(data))
            answer = raw_input("Keep going (y/n)? You have performed " + str(trial_number) + " trial(s) so far. ")

            valid_input = False

            while not valid_input:
                if answer == "n":
                    keep_going = False
                    valid_input = True

                elif answer == "y":
                    au.learn(cluster)
                    au.init_ground()
                    trial_number += 1
                    valid_input = True

                else:
                    print "Please enter either y or n."
    def __init__(self, spam_feature=None, ham_feature=None, inject_type=0):
        """Record the set-3 injection directories and choose the feature.

        spam_feature / ham_feature are the candidate features; inject_type
        selects which one is stored on self.feature (0 -> spam_feature,
        1 -> ham_feature).  For any other inject_type, self.feature is left
        unset, exactly as before.
        """
        # Set 3 of each directory option, with a trailing slash appended.
        self.h_injected = get_pathname_option("TestDriver", "ham_directories") % 3 + "/"
        self.s_injected = get_pathname_option("TestDriver", "spam_directories") % 3 + "/"

        # Compare by value: "is" against an int literal tests object
        # identity and only works by accident of small-int caching.
        if inject_type == 0:
            self.feature = spam_feature
        elif inject_type == 1:
            self.feature = ham_feature
def main():
    ham = [get_pathname_option("TestDriver", "ham_directories") % i for i in range(1, 5)]
    spam = [get_pathname_option("TestDriver", "spam_directories") % i for i in range(1, 5)]

    t = TestDriver.Driver()
    t.train(msgs.HamStream(ham[0], [ham[0]]), msgs.SpamStream(spam[0], [spam[0]]))
    t.dict_test(msgs.HamStream(ham[2], [ham[2]]), msgs.SpamStream(spam[3], [spam[3]]))
    print "Test sizes: ", len(t.tester.truth_examples[0]), ", ", len(t.tester.truth_examples[1]), "\n"
    print "Detection rate:", t.tester.correct_classification_rate(), "\n"
def main():
    """Measure classification rates for increasing MislabeledFileMover sizes.

    For each size: the mislabeler moves files, the driver is trained on
    set 1 and tested on set 2 (the "target" rate), then additionally trained
    on set 3 and tested on set 2 again (the "detection" rate).  Both training
    sets are then untrained and the mislabeler is reset.  The sizes and the
    two rate columns are written as a table to a hard-coded output file.
    """
    ham = [get_pathname_option("TestDriver", "ham_directories") % i for i in range(1, 4)]
    spam = [get_pathname_option("TestDriver", "spam_directories") % i for i in range(1, 4)]

    sizes = [0, 60, 120, 240, 480, 840, 1200, 2400, 3600, 4800, 6000]

    def streams(index):
        # Fresh (ham, spam) stream pair for the set at the given index.
        return (msgs.HamStream(ham[index], [ham[index]]),
                msgs.SpamStream(spam[index], [spam[index]]))

    driver = TestDriver.Driver()
    driver.new_classifier()

    detection_rates, target_rates = [], []
    # Collected per size but not included in the table written below.
    false_positives, false_negatives, unsures = [], [], []

    for size in sizes:
        mover = MislabeledFileMover(size)
        mover.random_move_file()

        driver.train(*streams(0))
        driver.test(*streams(1))
        target_rates.append(driver.tester.correct_classification_rate())

        driver.train(*streams(2))
        driver.test(*streams(1))
        detection_rates.append(driver.tester.correct_classification_rate())

        tester = driver.tester
        false_positives.append(tester.nham_wrong)
        false_negatives.append(tester.nspam_wrong)
        unsures.append(tester.nham_unsure + tester.nspam_unsure)

        # Undo both rounds of training before trying the next size.
        driver.untrain(*streams(0))
        driver.untrain(*streams(2))

        mover.reset()

    table = tabulate({"# of Mislabeled Words": sizes,
                      "Detection Rates": detection_rates,
                      "Target Rates": target_rates},
                     headers="keys", tablefmt="plain")
    with open("/Users/AlexYang/Desktop/hamasspam.txt", 'w') as outfile:
        outfile.write(table)
Пример #9
0
    def create_workers(self):
        """Open the classifier storage and stats manager and, unless
        self-testing, build the message corpora and their trainers.

        Relies on attributes set up earlier (is_test, gzip_cache and
        optionally db_name/use_db); when db_name is absent the database name
        and type are taken from storage.database_type([])."""
        if self.is_test:
            self.use_db = "pickle"
            self.db_name = '_core_server.pickle'   # This is never saved.
        if not hasattr(self, "db_name"):
            self.db_name, self.use_db = storage.database_type([])
        self.bayes = storage.open_storage(self.db_name, self.use_db)

        # Load stats manager.
        message_info_db = spambayes.message.Message().message_info_db
        self.stats = Stats.Stats(options, message_info_db)

        self.build_status_strings()

        # The self-test skips the caches and training objects so that it
        # does not clutter the filesystem.
        if self.is_test:
            return

        # Create/open the corpora.  Cache sizes are kept small to avoid
        # hogging lots of memory.
        spam_dir = get_pathname_option("Storage", "core_spam_cache")
        ham_dir = get_pathname_option("Storage", "core_ham_cache")
        unknown_dir = get_pathname_option("Storage", "core_unknown_cache")
        for cache_dir in (spam_dir, ham_dir, unknown_dir):
            storage.ensureDir(cache_dir)

        if self.gzip_cache:
            factory_class = GzipFileMessageFactory
        else:
            factory_class = FileMessageFactory
        factory = factory_class()

        expiry_seconds = options["Storage", "cache_expiry_days"]*24*60*60
        name_pattern = '[0123456789\-]*'
        self.spamCorpus = ExpiryFileCorpus(expiry_seconds, factory, spam_dir,
                                           name_pattern, cacheSize=20)
        self.hamCorpus = ExpiryFileCorpus(expiry_seconds, factory, ham_dir,
                                          name_pattern, cacheSize=20)
        self.unknownCorpus = ExpiryFileCorpus(expiry_seconds, factory,
                                              unknown_dir, name_pattern,
                                              cacheSize=20)

        # Users will hopefully reach the point where regular training is no
        # longer needed, so old messages are expired from the unknown corpus
        # as well as from the trained ones.
        for corpus in (self.spamCorpus, self.hamCorpus, self.unknownCorpus):
            corpus.removeExpiredMessages()

        # Hook a trainer up to each trained corpus.
        self.spam_trainer = storage.SpamTrainer(self.bayes)
        self.ham_trainer = storage.HamTrainer(self.bayes)
        self.spamCorpus.addObserver(self.spam_trainer)
        self.hamCorpus.addObserver(self.ham_trainer)
Пример #10
0
def drive(nsets,decision):
    print options.display()

    spamdirs = [get_pathname_option("TestDriver", "spam_directories") % \
                i for i in range(1, nsets+1)]
    hamdirs  = [get_pathname_option("TestDriver", "ham_directories") % \
                i for i in range(1, nsets+1)]

    spamfns = [(x,y,1) for x in spamdirs for y in os.listdir(x)]
    hamfns = [(x,y,0) for x in hamdirs for y in os.listdir(x)]

    nham = len(hamfns)
    nspam = len(spamfns)
    cc = CostCounter.nodelay()

    allfns = {}
    for fn in spamfns+hamfns:
        allfns[fn] = None

    d = hammie.open('weaktest.db', False)

    hamtrain = 0
    spamtrain = 0
    n = 0
    for dir,name, is_spam in allfns.iterkeys():
        n += 1
        m=msgs.Msg(dir, name).guts
        if debug > 1:
            print "trained:%dH+%dS"%(hamtrain,spamtrain)
        scr=d.score(m)
        if debug > 1:
            print "score:%.3f"%scr
        if not decision.tooearly():
            if is_spam:
                if debug > 0:
                    print "Spam with score %.2f"%scr
                cc.spam(scr)
            else:
                if debug > 0:
                    print "Ham with score %.2f"%scr
                cc.ham(scr)
        de = decision(scr,is_spam)
        if de == TRAIN_AS_SPAM:
            d.train_spam(m)
            spamtrain += 1
        elif de == TRAIN_AS_HAM:
            d.train_ham(m)
            hamtrain += 1
        if n % 100 == 0:
            print "%5d trained:%dH+%dS wrds:%d"%(
                n, hamtrain, spamtrain, len(d.bayes.wordinfo))
            print cc
    print "="*70
    print "%5d trained:%dH+%dS wrds:%d"%(
        n, hamtrain, spamtrain, len(d.bayes.wordinfo))
    print cc
Пример #11
0
    def createWorkers(self):
        """Using the options that were initialised in __init__ and then
        possibly overridden by the driver code, create the Bayes object,
        the Corpuses, the Trainers and so on."""
        print "Loading database...",
        if self.isTest:
            self.useDB = "pickle"
            self.DBName = '_pop3proxy_test.pickle'   # This is never saved.
        if not hasattr(self, "DBName"):
            self.DBName, self.useDB = storage.database_type([])
        self.bayes = storage.open_storage(self.DBName, self.useDB)
        self.mdb = spambayes.message.Message().message_info_db

        # Load stats manager.
        self.stats = Stats.Stats(options, self.mdb)

        self.buildStatusStrings()

        # Don't set up the caches and training objects when running the self-test,
        # so as not to clutter the filesystem.
        if not self.isTest:
            # Create/open the Corpuses.  Use small cache sizes to avoid hogging
            # lots of memory.
            sc = get_pathname_option("Storage", "spam_cache")
            hc = get_pathname_option("Storage", "ham_cache")
            uc = get_pathname_option("Storage", "unknown_cache")
            map(storage.ensureDir, [sc, hc, uc])
            if self.gzipCache:
                factory = GzipFileMessageFactory()
            else:
                factory = FileMessageFactory()
            age = options["Storage", "cache_expiry_days"]*24*60*60
            self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                               '[0123456789\-]*',
                                               cacheSize=20)
            self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                              '[0123456789\-]*',
                                              cacheSize=20)
            self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                                  '[0123456789\-]*',
                                                  cacheSize=20)

            # Given that (hopefully) users will get to the stage
            # where they do not need to do any more regular training to
            # be satisfied with spambayes' performance, we expire old
            # messages from not only the trained corpora, but the unknown
            # as well.
            self.spamCorpus.removeExpiredMessages()
            self.hamCorpus.removeExpiredMessages()
            self.unknownCorpus.removeExpiredMessages()

            # Create the Trainers.
            self.spamTrainer = storage.SpamTrainer(self.bayes)
            self.hamTrainer = storage.HamTrainer(self.bayes)
            self.spamCorpus.addObserver(self.spamTrainer)
            self.hamCorpus.addObserver(self.hamTrainer)
Пример #12
0
def drive(nsets):
    print options.display()

    spamdirs = [get_pathname_option("TestDriver", "spam_directories") % i for i in range(1, nsets+1)]
    hamdirs = [get_pathname_option("TestDriver", "ham_directories") % i for i in range(1, nsets+1)]

    d = TestDriver.Driver()
    d.new_classifier()
    d.train(msgs.HamStream(hamdirs[0], [hamdirs[0]]), msgs.SpamStream(spamdirs[0], [spamdirs[0]]))
    d.test(msgs.HamStream(hamdirs[1], [hamdirs[1]]), msgs.SpamStream(spamdirs[1], [spamdirs[1]]))
    d.finishtest()
    d.alldone()
Пример #13
0
def drive(nsets):
    print options.display()

    hamdirs  = [get_pathname_option("TestDriver", "ham_directories") % \
                i for i in range(1, nsets+1)]
    spamdirs = [get_pathname_option("TestDriver", "spam_directories") % \
                i for i in range(1, nsets+1)]

    d = TestDriver.Driver()
    # Train it on all sets except the first.
    d.train(msgs.HamStream("%s-%d" % (hamdirs[1], nsets),
                            hamdirs[1:], train=1),
            msgs.SpamStream("%s-%d" % (spamdirs[1], nsets),
                            spamdirs[1:], train=1))

    # Now run nsets times, predicting pair i against all except pair i.
    for i in range(nsets):
        h = hamdirs[i]
        s = spamdirs[i]
        hamstream = msgs.HamStream(h, [h], train=0)
        spamstream = msgs.SpamStream(s, [s], train=0)

        if i > 0:
            if options["CV Driver", "build_each_classifier_from_scratch"]:
                # Build a new classifier from the other sets.
                d.new_classifier()

                hname = "%s-%d, except %d" % (hamdirs[0], nsets, i+1)
                h2 = hamdirs[:]
                del h2[i]

                sname = "%s-%d, except %d" % (spamdirs[0], nsets, i+1)
                s2 = spamdirs[:]
                del s2[i]

                d.train(msgs.HamStream(hname, h2, train=1),
                        msgs.SpamStream(sname, s2, train=1))

            else:
                # Forget this set.
                d.untrain(hamstream, spamstream)

        # Predict this set.
        d.test(hamstream, spamstream)
        d.finishtest()

        if i < nsets - 1 and not options["CV Driver",
                                         "build_each_classifier_from_scratch"]:
            # Add this set back in.
            d.train(hamstream, spamstream)

    d.alldone()
Пример #14
0
def drive(nsets, decision):
    print options.display()

    spamdirs = [get_pathname_option("TestDriver", "spam_directories") % i for i in range(1, nsets + 1)]
    hamdirs = [get_pathname_option("TestDriver", "ham_directories") % i for i in range(1, nsets + 1)]

    spamfns = [(x, y, 1) for x in spamdirs for y in os.listdir(x)]
    hamfns = [(x, y, 0) for x in hamdirs for y in os.listdir(x)]

    nham = len(hamfns)
    nspam = len(spamfns)
    cc = CostCounter.nodelay()

    allfns = {}
    for fn in spamfns + hamfns:
        allfns[fn] = None

    d = hammie.open("weaktest.db", False)

    hamtrain = 0
    spamtrain = 0
    n = 0
    for dir, name, is_spam in allfns.iterkeys():
        n += 1
        m = msgs.Msg(dir, name).guts
        if debug > 1:
            print "trained:%dH+%dS" % (hamtrain, spamtrain)
        scr = d.score(m)
        if debug > 1:
            print "score:%.3f" % scr
        if not decision.tooearly():
            if is_spam:
                if debug > 0:
                    print "Spam with score %.2f" % scr
                cc.spam(scr)
            else:
                if debug > 0:
                    print "Ham with score %.2f" % scr
                cc.ham(scr)
        de = decision(scr, is_spam)
        if de == TRAIN_AS_SPAM:
            d.train_spam(m)
            spamtrain += 1
        elif de == TRAIN_AS_HAM:
            d.train_ham(m)
            hamtrain += 1
        if n % 100 == 0:
            print "%5d trained:%dH+%dS wrds:%d" % (n, hamtrain, spamtrain, len(d.bayes.wordinfo))
            print cc
    print "=" * 70
    print "%5d trained:%dH+%dS wrds:%d" % (n, hamtrain, spamtrain, len(d.bayes.wordinfo))
    print cc
Пример #15
0
def drive(nsets):
    print options.display()

    hamdirs  = [get_pathname_option("TestDriver", "ham_directories") % \
                i for i in range(1, nsets+1)]
    spamdirs = [get_pathname_option("TestDriver", "spam_directories") % \
                i for i in range(1, nsets+1)]

    d = TestDriver.Driver()
    # Train it on all sets except the first.
    d.train(
        msgs.HamStream("%s-%d" % (hamdirs[1], nsets), hamdirs[1:], train=1),
        msgs.SpamStream("%s-%d" % (spamdirs[1], nsets), spamdirs[1:], train=1))

    # Now run nsets times, predicting pair i against all except pair i.
    for i in range(nsets):
        h = hamdirs[i]
        s = spamdirs[i]
        hamstream = msgs.HamStream(h, [h], train=0)
        spamstream = msgs.SpamStream(s, [s], train=0)

        if i > 0:
            if options["CV Driver", "build_each_classifier_from_scratch"]:
                # Build a new classifier from the other sets.
                d.new_classifier()

                hname = "%s-%d, except %d" % (hamdirs[0], nsets, i + 1)
                h2 = hamdirs[:]
                del h2[i]

                sname = "%s-%d, except %d" % (spamdirs[0], nsets, i + 1)
                s2 = spamdirs[:]
                del s2[i]

                d.train(msgs.HamStream(hname, h2, train=1),
                        msgs.SpamStream(sname, s2, train=1))

            else:
                # Forget this set.
                d.untrain(hamstream, spamstream)

        # Predict this set.
        d.test(hamstream, spamstream)
        d.finishtest()

        if i < nsets - 1 and not options["CV Driver",
                                         "build_each_classifier_from_scratch"]:
            # Add this set back in.
            d.train(hamstream, spamstream)

    d.alldone()
Пример #16
0
def drive(num):
    print options.display()

    spamdirs = [
        get_pathname_option("TestDriver", "spam_directories") % i
        for i in range(1, 4)
    ]
    hamdirs = [
        get_pathname_option("TestDriver", "ham_directories") % i
        for i in range(1, 4)
    ]

    r = mislabeledfilemover.MislabeledFileMover(num)
    r.random_move_file()

    d = TestDriver.Driver()
    d.new_classifier()
    d.train(msgs.HamStream(hamdirs[0], [hamdirs[0]]),
            msgs.SpamStream(spamdirs[0], [spamdirs[0]]))
    d.train(msgs.HamStream(hamdirs[2], [hamdirs[2]]),
            msgs.SpamStream(spamdirs[2], [spamdirs[2]]))
    d.test(msgs.HamStream(hamdirs[1], [hamdirs[1]]),
           msgs.SpamStream(spamdirs[1], [spamdirs[1]]))

    guess = d.classifier.spamprob
    polluted = []
    for msg in msgs.HamStream(hamdirs[2], [hamdirs[2]]):
        msg.prob = guess(msg)
        polluted.append(msg)

    for msg in msgs.SpamStream(spamdirs[2], [spamdirs[2]]):
        msg.prob = guess(msg)
        polluted.append(msg)

    mislabeled = []
    for fp in d.tester.false_positives():
        mislabeled.append(fp)

    for fn in d.tester.false_negatives():
        mislabeled.append(fn)

    for unsure in d.unsure:
        mislabeled.append(unsure)

    d.finishtest()
    d.alldone()

    data = v_correlation(polluted, mislabeled)

    print "Percentage Overlap (Correlation): " + str(data)
Пример #17
0
def main():

    ham = [get_pathname_option("TestDriver", "ham_directories") % i for i in range(1, 4)]
    spam = [get_pathname_option("TestDriver", "spam_directories") % i for i in range(1, 4)]
    injected = get_pathname_option("TestDriver", "spam_directories") % 3

    au = ActiveUnlearnDriver.ActiveUnlearner([msgs.HamStream(ham[0], [ham[0]]), msgs.HamStream(ham[2], [ham[2]])],
                                             [msgs.SpamStream(spam[0], [spam[0]]), msgs.SpamStream(spam[2], [spam[2]])],
                                             msgs.HamStream(ham[1], [ham[1]]), msgs.SpamStream(spam[1], [spam[1]]))

    msg = choice(au.driver.tester.train_examples[2])    # Randomly chosen from Ham Set3

    original_rate = au.driver.tester.correct_classification_rate()
    cluster_sizes = []
    detection_rates = []
    target_cluster_rates = []

    sizes = []
    for i in range(150, 1050, 50):
        sizes.append(i)
    for i in range(1000, 15000, 1000):
        sizes.append(i)

    for size in sizes:
        cluster = ActiveUnlearnDriver.Cluster(msg, size, au, "extreme")
        print "Clustering with size " + str(cluster.size) + "..."
        cluster_sizes.append(size)
        detection_rates.append(au.detect_rate(cluster))
        target_cluster_rates.append(float(cluster.target_set3()) / float(cluster.size))

    file = open("/Users/AlexYang/Desktop/clues.txt", 'w')

    features = au.driver.classifier._getclues(msg)
    i = 1
    for feature in features:
        file.write(str(i) + ") ")
        file.write(str(feature) + "\n")
        i += 1

    with open("/Users/AlexYang/Desktop/clusterstats.txt", 'w') as outfile:

        outfile.write("Clustered around: " + msg.tag)
        outfile.write("\nOriginal Rate: " + str(original_rate) + "\n")

        outfile.write(tabulate({"Cluster Sizes": cluster_sizes,
                                "Detection Rates": detection_rates,
                                "% of Targets Clustered": target_cluster_rates},
                               headers="keys", tablefmt="plain"))
Пример #18
0
def database_type(opts):
    """Return the name of the database and the type to use.  The output of
    this function can be used as the db_type parameter for the open_storage
    function, for example:

        [standard getopts code]
        db_name, db_type = database_type(opts)
        storage = open_storage(db_name, db_type)

    The selection is made based on the options passed, or, if the
    appropriate options are not present, the options in the global
    options object.

    opts is an iterable of (option, argument) pairs.  Raises
    MutuallyExclusiveError if more than one storage option is present.

    Currently supports:
       -p  :  pickle
       -d  :  dbm
    """
    nm, typ = None, None
    for opt, arg in opts:
        # "in" replaces the deprecated dict.has_key().
        if opt in _storage_options:
            if nm is None and typ is None:
                nm, typ = arg, _storage_options[opt]
            else:
                raise MutuallyExclusiveError()
    if nm is None and typ is None:
        # Nothing on the command line: fall back to the global options.
        # The option may be a real boolean or its string form; any other
        # value is passed through unchanged as the type.
        typ = options["Storage", "persistent_use_database"]
        if typ is True or typ == "True":
            typ = "dbm"
        elif typ is False or typ == "False":
            typ = "pickle"
        nm = get_pathname_option("Storage", "persistent_storage_file")
    return nm, typ
Пример #19
0
def database_type(
    opts, default_type=("Storage", "persistent_use_database"), default_name=("Storage", "persistent_storage_file")
):
    """Work out which database name and type should be used.

    The returned (name, type) pair is suitable for open_storage:
        [standard getopts code]
        db_name, db_type = database_type(opts)
        storage = open_storage(db_name, db_type)
    A storage option found in opts wins; without one, the type is read from
    the global options object under default_type and the name under
    default_name.  Raises MutuallyExclusiveError when opts holds more than
    one storage option and NoSuchClassifierError when the configured type is
    not a known storage type.
    Currently supports:
       -p  :  pickle
       -d  :  dbm
    """
    chosen_name, chosen_type = None, None
    for flag, value in opts:
        if flag not in _storage_options:
            continue
        if chosen_name is not None or chosen_type is not None:
            # A storage option was already seen.
            raise MutuallyExclusiveError()
        chosen_name, chosen_type = value, _storage_options[flag]

    if chosen_name is not None or chosen_type is not None:
        return chosen_name, chosen_type

    # Nothing usable in opts: fall back to the global options.
    chosen_type = options[default_type]
    try:
        _first, _second, is_path = _storage_types[chosen_type]
    except KeyError:
        raise NoSuchClassifierError(chosen_type)
    # Path-like names go through get_pathname_option; others are read as-is.
    if is_path:
        chosen_name = get_pathname_option(*default_name)
    else:
        chosen_name = options[default_name]
    return chosen_name, chosen_type
Пример #20
0
def main():
    print "Pickle is available."
    db = dumbdbm.open("dumbdb", "c")
    db["1"] = "1"
    db.close()
    dbstr = whichdb.whichdb("dumbdb")
    if dbstr:
        print "Dumbdbm is available."
    else:
        print "Dumbdbm is not available."

    db = dbhash.open("dbhash", "c")
    db["1"] = "1"
    db.close()
    dbstr = whichdb.whichdb("dbhash")
    if dbstr == "dbhash":
        print "Dbhash is available."
    else:
        print "Dbhash is not available."

    if bsddb is None:
        dbstr = ""
    else:
        db = bsddb.hashopen("bsddb3", "c")
        db["1"] = "1"
        db.close()
        dbstr = whichdb.whichdb("bsddb3")
    if dbstr == "dbhash":
        print "Bsddb[3] is available."
    else:
        print "Bsddb[3] is not available."

    print

    hammie = get_pathname_option("Storage", "persistent_storage_file")
    use_dbm = options["Storage", "persistent_use_database"]
    if not use_dbm:
        print "Your storage %s is a: pickle" % (hammie,)
        return

    if not os.path.exists(hammie):
        print "Your storage file does not exist yet."
        return
    db_type = whichdb.whichdb(hammie)
    if db_type == "dbhash":
        # could be dbhash or bsddb3
        # only bsddb3 has a __version__ attribute - old bsddb module does not
        if hasattr(bsddb, '__version__'):
            try:
                db = bsddb.hashopen(hammie, "r")
            except bsddb.error:
                pass
            else:
                db.close()
                print "Your storage", hammie, "is a: bsddb[3]"
                return
    elif db_type is None:
        print "Your storage %s is unreadable." % (hammie,)
    print "Your storage %s is a: %s" % (hammie, db_type)
Пример #21
0
def main():
    print "Pickle is available."
    db = dumbdbm.open("dumbdb", "c")
    db["1"] = "1"
    db.close()
    dbstr = whichdb.whichdb("dumbdb")
    if dbstr:
        print "Dumbdbm is available."
    else:
        print "Dumbdbm is not available."

    db = dbhash.open("dbhash", "c")
    db["1"] = "1"
    db.close()
    dbstr = whichdb.whichdb("dbhash")
    if dbstr == "dbhash":
        print "Dbhash is available."
    else:
        print "Dbhash is not available."

    if bsddb is None:
        dbstr = ""
    else:
        db = bsddb.hashopen("bsddb3", "c")
        db["1"] = "1"
        db.close()
        dbstr = whichdb.whichdb("bsddb3")
    if dbstr == "dbhash":
        print "Bsddb[3] is available."
    else:
        print "Bsddb[3] is not available."

    print

    hammie = get_pathname_option("Storage", "persistent_storage_file")
    use_dbm = options["Storage", "persistent_use_database"]
    if not use_dbm:
        print "Your storage %s is a: pickle" % (hammie,)
        return

    if not os.path.exists(hammie):
        print "Your storage file does not exist yet."
        return
    db_type = whichdb.whichdb(hammie)
    if db_type == "dbhash":
        # could be dbhash or bsddb3
        # only bsddb3 has a __version__ attribute - old bsddb module does not
        if hasattr(bsddb, '__version__'):
            try:
                db = bsddb.hashopen(hammie, "r")
            except bsddb.error:
                pass
            else:
                db.close()
                print "Your storage", hammie, "is a: bsddb[3]"
                return
    elif db_type is None:
        print "Your storage %s is unreadable." % (hammie,)
    print "Your storage %s is a: %s" % (hammie, db_type)
def splice_set(n, dir_num=3):
    destination = get_pathname_option("TestDriver", "spam_directories") % dir_num + "/"
    dict_c = 1
    for dictionary in listdir(destination):
        print "Slicing dictionary", dict_c, "into", n, "parts"
        splice(destination + dictionary, n)
        remove(destination + dictionary)
        dict_c += 1
def drive(num):
    print options.display()

    spamdirs = [get_pathname_option("TestDriver", "spam_directories") %
                i for i in range(1, 4)]
    hamdirs = [get_pathname_option("TestDriver", "ham_directories") %
               i for i in range(1, 4)]

    r = mislabeledfilemover.MislabeledFileMover(num)
    r.random_move_file()

    d = TestDriver.Driver()
    d.new_classifier()
    d.train(msgs.HamStream(hamdirs[0], [hamdirs[0]]),
            msgs.SpamStream(spamdirs[0], [spamdirs[0]]))
    d.train(msgs.HamStream(hamdirs[2], [hamdirs[2]]),
            msgs.SpamStream(spamdirs[2], [spamdirs[2]]))
    d.test(msgs.HamStream(hamdirs[1], [hamdirs[1]]),
           msgs.SpamStream(spamdirs[1], [spamdirs[1]]))

    guess = d.classifier.spamprob
    polluted = []
    for msg in msgs.HamStream(hamdirs[2], [hamdirs[2]]):
        msg.prob = guess(msg)
        polluted.append(msg)

    for msg in msgs.SpamStream(spamdirs[2], [spamdirs[2]]):
        msg.prob = guess(msg)
        polluted.append(msg)

    mislabeled = []
    for fp in d.tester.false_positives():
        mislabeled.append(fp)

    for fn in d.tester.false_negatives():
        mislabeled.append(fn)

    for unsure in d.unsure:
        mislabeled.append(unsure)

    d.finishtest()
    d.alldone()

    data = v_correlation(polluted, mislabeled)

    print "Percentage Overlap (Correlation): " + str(data)
Пример #24
0
def main():
    ham = [
        get_pathname_option("TestDriver", "ham_directories") % i
        for i in range(1, 5)
    ]
    spam = [
        get_pathname_option("TestDriver", "spam_directories") % i
        for i in range(1, 5)
    ]

    t = TestDriver.Driver()
    t.train(msgs.HamStream(ham[1], [ham[1]]),
            msgs.SpamStream(spam[1], [spam[1]]))

    keep_going = True
    trial_number = 0

    while keep_going:
        start_time = time.time()
        if trial_number == 0:
            t.test(msgs.HamStream(ham[0], [ham[0]]),
                   msgs.SpamStream(spam[0], [spam[0]]), True)

        else:
            t.test(t.tester.truth_examples[1], t.tester.truth_examples[0])
        end_time = time.time()
        seconds = end_time - start_time

        trial_number += 1
        print "Test sizes: ", len(t.tester.truth_examples[0]), ", ", len(
            t.tester.truth_examples[1]), "\n"
        print "Detection rate:", t.tester.correct_classification_rate(), "\n"
        print "\nTime elapsed:", seconds, "seconds.\n"
        answer = raw_input("Keep trying (y/n)? You have performed " +
                           str(trial_number) + " trial(s) so far. ")

        valid_input = False
        while not valid_input:
            if answer == "y":
                valid_input = True

            elif answer == "n":
                sys.exit()

            else:
                answer = raw_input("Please enter either y or n. ")
Пример #25
0
def splice_set(n, dir_num=3):
    destination = get_pathname_option("TestDriver",
                                      "spam_directories") % dir_num + "/"
    dict_c = 1
    for dictionary in listdir(destination):
        print "Slicing dictionary", dict_c, "into", n, "parts"
        splice(destination + dictionary, n)
        remove(destination + dictionary)
        dict_c += 1
Пример #26
0
def main():
    ham = [
        get_pathname_option("TestDriver", "ham_directories") % i
        for i in range(1, 5)
    ]
    spam = [
        get_pathname_option("TestDriver", "spam_directories") % i
        for i in range(1, 5)
    ]

    t = TestDriver.Driver()
    t.train(msgs.HamStream(ham[0], [ham[0]]),
            msgs.SpamStream(spam[0], [spam[0]]))
    t.dict_test(msgs.HamStream(ham[2], [ham[2]]),
                msgs.SpamStream(spam[3], [spam[3]]))
    print "Test sizes: ", len(t.tester.truth_examples[0]), ", ", len(
        t.tester.truth_examples[1]), "\n"
    print "Detection rate:", t.tester.correct_classification_rate(), "\n"
Пример #27
0
 def createWorkers(self):
     """Using the options that were initialised in __init__ and then
     possibly overridden by the driver code, create the Bayes object,
     the Corpuses, the Trainers and so on.

     In test mode (self.isTest) a pickle named '_pop3proxy_test.pickle'
     is selected and no cache directories, corpuses or trainers are set
     up.
     """
     print("Loading database...", end=' ')
     if self.isTest:
         self.useDB = "pickle"
         self.DBName = '_pop3proxy_test.pickle'   # This is never saved.
     if not hasattr(self, "DBName"):
         self.DBName, self.useDB = storage.database_type([])
     self.bayes = storage.open_storage(self.DBName, self.useDB)
     self.mdb = spambayes.message.Message().message_info_db
     self.stats = Stats.Stats(options, self.mdb)
     self.buildStatusStrings()

     if self.isTest:
         # Everything below touches the on-disk caches.
         return

     sc = get_pathname_option("Storage", "spam_cache")
     hc = get_pathname_option("Storage", "ham_cache")
     uc = get_pathname_option("Storage", "unknown_cache")
     for d in [sc, hc, uc]:
         storage.ensureDir(d)
     if self.gzipCache:
         factory = GzipFileMessageFactory()
     else:
         factory = FileMessageFactory()
     # Expiry is configured in days; the corpuses get it in seconds.
     age = options["Storage", "cache_expiry_days"]*24*60*60
     # Raw string: "\-" is not a valid escape in a normal string literal.
     name_pattern = r'[0123456789\-]*'
     self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                        name_pattern,
                                        cacheSize=20)
     self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                       name_pattern,
                                       cacheSize=20)
     self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                           name_pattern,
                                           cacheSize=20)
     self.spamCorpus.removeExpiredMessages()
     self.hamCorpus.removeExpiredMessages()
     self.unknownCorpus.removeExpiredMessages()
     # Only the spam and ham corpuses get a trainer attached as observer.
     self.spamTrainer = storage.SpamTrainer(self.bayes)
     self.hamTrainer = storage.HamTrainer(self.bayes)
     self.spamCorpus.addObserver(self.spamTrainer)
     self.hamCorpus.addObserver(self.hamTrainer)
Пример #28
0
def drive(nsets):
    print options.display()

    spamdirs = [get_pathname_option("TestDriver", "spam_directories") % \
                i for i in range(1, nsets+1)]
    hamdirs  = [get_pathname_option("TestDriver", "ham_directories") % \
                i for i in range(1, nsets+1)]
    spamhamdirs = zip(spamdirs, hamdirs)

    d = TestDriver.Driver()
    for spamdir, hamdir in spamhamdirs:
        d.new_classifier()
        d.train(msgs.HamStream(hamdir, [hamdir]),
                msgs.SpamStream(spamdir, [spamdir]))
        for sd2, hd2 in spamhamdirs:
            if (sd2, hd2) == (spamdir, hamdir):
                continue
            d.test(msgs.HamStream(hd2, [hd2]),
                   msgs.SpamStream(sd2, [sd2]))
        d.finishtest()
    d.alldone()
Пример #29
0
def main():
    """Main program; parse options and go.

    Parses the command line, opens the hammie database and trains it on
    every -g (ham) and -s (spam) argument.  The database is stored only
    if at least one training source was processed.
    """
    global loud
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hfqnrd:p:g:s:o:')
    except getopt.error as msg:
        usage(2, msg)
    if not opts:
        usage(2, "No options given")

    force = False
    trainnew = False
    removetrained = False
    good = []
    spam = []
    for opt, arg in opts:
        if opt == '-h':
            usage(0)
        elif opt == "-f":
            force = True
        elif opt == "-n":
            trainnew = True
        elif opt == "-q":
            loud = False
        elif opt == '-g':
            good.append(arg)
        elif opt == '-s':
            spam.append(arg)
        elif opt == "-r":
            removetrained = True
        elif opt == '-o':
            options.set_from_cmdline(arg, sys.stderr)
    pck, usedb = storage.database_type(opts)
    if args:
        usage(2, "Positional arguments not allowed")

    if usedb is None:
        # Use settings in configuration file.
        usedb = options["Storage", "persistent_use_database"]
        pck = get_pathname_option("Storage",
                                  "persistent_storage_file")

    h = hammie.open(pck, usedb, "c")

    # Must be initialised here: with no -g/-s arguments neither loop
    # body runs, and the check at the end would hit an unbound name.
    save = False

    for g in good:
        if loud:
            print("Training ham (%s):" % g)
        train(h, g, False, force, trainnew, removetrained)
        sys.stdout.flush()
        save = True

    for s in spam:
        if loud:
            print("Training spam (%s):" % s)
        train(h, s, True, force, trainnew, removetrained)
        sys.stdout.flush()
        save = True

    if save:
        h.store()
Пример #30
0
def main():
    """Report which database back ends work and what the storage file is.

    Creates probe databases named "dumbdb", "dbhash" and (when bsddb is
    not None) "bsddb3" in the current directory, asks dbm.whichdb to
    identify each one and prints whether that back end is available.  It
    then inspects the configured persistent storage file and prints its
    type.
    """
    print("Pickle is available.")

    # dbm.dumb: any truthy answer from whichdb counts as available.
    db = dbm.dumb.open("dumbdb", "c")
    db["1"] = "1"
    db.close()
    dbstr = dbm.whichdb("dumbdb")
    if dbstr:
        print("Dumbdbm is available.")
    else:
        print("Dumbdbm is not available.")

    # NOTE(review): dbm.bsd does not look like a submodule of the Python 3
    # standard-library dbm package - confirm what provides it here.
    db = dbm.bsd.open("dbhash", "c")
    db["1"] = "1"
    db.close()
    dbstr = dbm.whichdb("dbhash")
    if dbstr == "dbhash":
        print("Dbhash is available.")
    else:
        print("Dbhash is not available.")

    if bsddb is None:
        # Nothing to probe; the empty string selects "not available".
        dbstr = ""
    else:
        db = bsddb.hashopen("bsddb3", "c")
        db["1"] = "1"
        db.close()
        dbstr = dbm.whichdb("bsddb3")
    if dbstr == "dbhash":
        print("Bsddb[3] is available.")
    else:
        print("Bsddb[3] is not available.")

    print()

    hammie = get_pathname_option("Storage", "persistent_storage_file")
    use_dbm = options["Storage", "persistent_use_database"]
    if not use_dbm:
        print("Your storage %s is a: pickle" % (hammie,))
        return
    if not os.path.exists(hammie):
        print("Your storage file does not exist yet.")
        return

    db_type = dbm.whichdb(hammie)
    if db_type is None:
        # There is no type to report, so stop here rather than going on
        # to print a second, contradictory "is a: None" line.
        print("Your storage %s is unreadable." % (hammie,))
        return
    if db_type == "dbhash":
        # Could be dbhash or bsddb3; this code treats a __version__
        # attribute on bsddb as the sign that it is bsddb3.
        if hasattr(bsddb, '__version__'):
            try:
                db = bsddb.hashopen(hammie, "r")
            except bsddb.error:
                # Not openable as bsddb3; report it as plain dbhash below.
                pass
            else:
                db.close()
                print("Your storage", hammie, "is a: bsddb[3]")
                return
    print("Your storage %s is a: %s" % (hammie, db_type))
Пример #31
0
def main():
    ham = [get_pathname_option("TestDriver", "ham_directories") % i for i in range(1, 5)]
    spam = [get_pathname_option("TestDriver", "spam_directories") % i for i in range(1, 5)]

    t = TestDriver.Driver()
    t.train(msgs.HamStream(ham[1], [ham[1]]), msgs.SpamStream(spam[1], [spam[1]]))

    keep_going = True
    trial_number = 0

    while keep_going:
        start_time = time.time()
        if trial_number == 0:
            t.test(msgs.HamStream(ham[0], [ham[0]]), msgs.SpamStream(spam[0], [spam[0]]), True)

        else:
            t.test(t.tester.truth_examples[1], t.tester.truth_examples[0])
        end_time = time.time()
        seconds = end_time - start_time

        trial_number += 1
        print "Test sizes: ", len(t.tester.truth_examples[0]), ", ", len(t.tester.truth_examples[1]), "\n"
        print "Detection rate:", t.tester.correct_classification_rate(), "\n"
        print "\nTime elapsed:", seconds, "seconds.\n"
        answer = raw_input("Keep trying (y/n)? You have performed " + str(trial_number) + " trial(s) so far. ")

        valid_input = False
        while not valid_input:
            if answer == "y":
                valid_input = True

            elif answer == "n":
                sys.exit()

            else:
                answer = raw_input("Please enter either y or n. ")
    def __init__(self, number):
        """Store number and resolve the ham and spam directories for sets 1-3.

        Sets 1, 2 and 3 become the source, test and destination
        directories respectively (each with a trailing "/"), and the
        current listings of the source and destination directories are
        recorded.  ham_num is set to number; spam_num is set to 0.
        """
        self.NUMBER = number

        def ham_dir(set_number):
            # Path of one ham set directory, with trailing slash.
            return get_pathname_option("TestDriver", "ham_directories") % set_number + "/"

        def spam_dir(set_number):
            # Path of one spam set directory, with trailing slash.
            return get_pathname_option("TestDriver", "spam_directories") % set_number + "/"

        self.ham_num = self.NUMBER
        self.ham_source = ham_dir(1)
        self.ham_test = ham_dir(2)
        self.ham_destination = ham_dir(3)
        self.ham_source_files = listdir(self.ham_source)
        self.ham_destination_files = listdir(self.ham_destination)

        self.spam_num = 0
        self.spam_source = spam_dir(1)
        self.spam_test = spam_dir(2)
        self.spam_destination = spam_dir(3)
        self.spam_source_files = listdir(self.spam_source)
        self.spam_destination_files = listdir(self.spam_destination)
Пример #33
0
    def __init__(self, number):
        """Remember number and look up the ham/spam directories of sets 1-3.

        For both ham and spam, set 1 is stored as the source, set 2 as
        the test and set 3 as the destination directory, each ending in
        "/".  The source and destination directories are listed once and
        the listings kept.  ham_num mirrors number, spam_num starts at 0.
        """
        section = "TestDriver"
        ham_option = "ham_directories"
        spam_option = "spam_directories"

        self.NUMBER = number

        self.ham_num = self.NUMBER
        self.ham_source = get_pathname_option(section, ham_option) % 1 + "/"
        self.ham_test = get_pathname_option(section, ham_option) % 2 + "/"
        self.ham_destination = get_pathname_option(section, ham_option) % 3 + "/"
        self.ham_source_files = listdir(self.ham_source)
        self.ham_destination_files = listdir(self.ham_destination)

        self.spam_num = 0
        self.spam_source = get_pathname_option(section, spam_option) % 1 + "/"
        self.spam_test = get_pathname_option(section, spam_option) % 2 + "/"
        self.spam_destination = get_pathname_option(section, spam_option) % 3 + "/"
        self.spam_source_files = listdir(self.spam_source)
        self.spam_destination_files = listdir(self.spam_destination)
Пример #34
0
def main():
    import os
    import sys
    import shutil

    sys.path.insert(-1, os.getcwd())
    sys.path.insert(-1, os.path.dirname(os.getcwd()))

    from spambayes import ActiveUnlearnDriver
    from spambayes.Options import get_pathname_option
    from spambayes import msgs
    import time

    ham = [get_pathname_option("TestDriver", "ham_directories") % i for i in range(1, 5)]
    spam = [get_pathname_option("TestDriver", "spam_directories") % i for i in range(1, 5)]

    for i in range(1):
        au = ActiveUnlearnDriver.ActiveUnlearnDriver([msgs.HamStream(ham[0], [ham[0]]),
                                                      msgs.HamStream(ham[2], [ham[2]]),
                                                      msgs.HamStream(ham[3], [ham[3]])],
                                                     [msgs.SpamStream(spam[0], [spam[0]]),
                                                      msgs.SpamStream(spam[2], [spam[2]]),
                                                      msgs.SpamStream(spam[3], [spam[3]])],
                                                     msgs.HamStream(ham[2], [ham[2]]),
                                                     msgs.SpamStream(spam[2], [spam[2]]),
                                                     "ac-extreme")


        au.driver.test(msgs.HamStream(ham[0], [ham[0]]), msgs.SpamStream(spam[0], [spam[0]]))
        au.driver.untrain(msgs.HamStream(ham[2], [ham[2]]), msgs.SpamStream(spam[2], [spam[2]]))
        au.driver.untrain(msgs.HamStream(ham[3], [ham[3]]), msgs.SpamStream(spam[3], [spam[3]]))
        au.driver.test(msgs.HamStream(ham[0], [ham[0]]), msgs.SpamStream(spam[0], [spam[0]]))
        msg = au.driver.tester.test_examples[5]

        shutil.copy(msg.tag, "C:\Users\Alex\Desktop\clustera")
        print msg.prob

        start_time = time.time()
        cluster = (au.cluster(msg, 10))
        end_time = time.time()
        print cluster

        clueslist = []
        for clue in msg.clues:
            clueslist.append((clue[0], clue[1]))
        print clueslist

        with open("C:\Users\Alex\Desktop\clustera\cluster7.txt", 'w') as outfile:
            spamcounter = 0
            for sim in cluster:
                with open(sim.tag) as infile:
                    if sim.tag.endswith(".spam.txt"):
                        outfile.write("SPAMSPAMSPAMSPAMSPAM" + "\n\n")
                    if sim.tag.endswith(".ham.txt"):
                        outfile.write("HAMHAMHAMHAMHAM" + "\n\n")

                    outfile.write(infile.read())
                    outfile.write("\n\n" + "----------------------------------------" + "\n\n")

                if sim.tag.endswith(".spam.txt"):
                    spamcounter += 1

            print spamcounter

        print end_time - start_time
Пример #35
0
        elif opt == '-g':
            good.append(arg)
        elif opt == '-s':
            spam.append(arg)
        elif opt == "-r":
            removetrained = True
        elif opt == '-o':
            options.set_from_cmdline(arg, sys.stderr)
    pck, usedb = storage.database_type(opts)
    if args:
        usage(2, "Positional arguments not allowed")

    if usedb == None:
        # Use settings in configuration file.
        usedb = options["Storage", "persistent_use_database"]
        pck = get_pathname_option("Storage", "persistent_storage_file")

    h = hammie.open(pck, usedb, "c")

    for g in good:
        if loud:
            print "Training ham (%s):" % g
        train(h, g, False, force, trainnew, removetrained)
        sys.stdout.flush()
        save = True

    for s in spam:
        if loud:
            print "Training spam (%s):" % s
        train(h, s, True, force, trainnew, removetrained)
        sys.stdout.flush()
Пример #36
0
def drive():
    print options.display()

    spam = [
        get_pathname_option("TestDriver", "spam_directories") % i
        for i in range(1, 5)
    ]
    ham = [
        get_pathname_option("TestDriver", "ham_directories") % i
        for i in range(1, 5)
    ]

    d = dictionarywriter.DictionaryWriter(150, 4)
    d.write()

    keep_going = True
    trial_number = 1

    au = ActiveUnlearnDriver.ActiveUnlearner(
        [msgs.HamStream(ham[1], [ham[1]]),
         msgs.HamStream(ham[2], [ham[2]])],
        [
            msgs.SpamStream(spam[1], [spam[1]]),
            msgs.SpamStream(spam[3], [spam[3]])
        ],
        msgs.HamStream(ham[0], [ham[0]]),
        msgs.SpamStream(spam[0], [spam[0]]),
    )
    with open("C:\Users\Alex\Desktop\dict_correlation_stats.txt",
              'w') as outfile:

        while keep_going:
            chosen = set()
            current = au.select_initial()
            cluster = au.determine_cluster(current)
            chosen.add(current)
            au.driver.test(au.testing_ham, au.testing_spam)

            while not cluster:
                current = au.select_initial(chosen)
                cluster = au.determine_cluster(current)
                chosen.add(current)
                au.driver.test(au.testing_ham, au.testing_spam)

            cluster_list = list(cluster.cluster_set)

            dicts = au.driver.tester.train_examples[2]

            data = v_correlation(cluster_list, dicts)

            outfile.write("Trial " + str(trial_number) +
                          " Percentage Overlap (Correlation): " + str(data))
            answer = raw_input("Keep going (y/n)? You have performed " +
                               str(trial_number) + " trial(s) so far. ")

            valid_input = False

            while not valid_input:
                if answer == "n":
                    keep_going = False
                    valid_input = True

                elif answer == "y":
                    au.learn(cluster)
                    au.init_ground()
                    trial_number += 1
                    valid_input = True

                else:
                    print "Please enter either y or n."
Пример #37
0
def main():
    """Sweep over mislabel sizes and tabulate detection vs. target rates.

    For every size a MislabeledFileMover moves files, the driver is
    trained on set 1 and tested on set 2 (the "target" rate), then
    additionally trained on set 3 and tested on set 2 again (the
    "detection" rate).  Both training sets are untrained and the mover is
    reset before the next size.  The sizes and both rate columns are
    written as a plain table to a hard-coded file; the false positive,
    false negative and unsure counts are collected but not written.
    """
    set_numbers = range(1, 4)
    ham = [get_pathname_option("TestDriver", "ham_directories") % k
           for k in set_numbers]
    spam = [get_pathname_option("TestDriver", "spam_directories") % k
            for k in set_numbers]

    def ham_stream(index):
        # Fresh stream over a single ham directory.
        return msgs.HamStream(ham[index], [ham[index]])

    def spam_stream(index):
        # Fresh stream over a single spam directory.
        return msgs.SpamStream(spam[index], [spam[index]])

    sizes = [0, 60, 120, 240, 480, 840, 1200, 2400, 3600, 4800, 6000]

    d = TestDriver.Driver()
    d.new_classifier()

    detection_rates = []
    target_rates = []
    false_positives = []
    false_negatives = []
    unsures = []

    for size in sizes:
        mislabeler = MislabeledFileMover(size)
        mislabeler.random_move_file()

        # Baseline: set 1 only.
        d.train(ham_stream(0), spam_stream(0))
        d.test(ham_stream(1), spam_stream(1))
        target_rates.append(d.tester.correct_classification_rate())

        # Add set 3 on top and measure again.
        d.train(ham_stream(2), spam_stream(2))
        d.test(ham_stream(1), spam_stream(1))
        detection_rates.append(d.tester.correct_classification_rate())

        tester = d.tester
        false_positives.append(tester.nham_wrong)
        false_negatives.append(tester.nspam_wrong)
        unsures.append(tester.nham_unsure + tester.nspam_unsure)

        # Undo both training rounds so the next size starts clean.
        d.untrain(ham_stream(0), spam_stream(0))
        d.untrain(ham_stream(2), spam_stream(2))

        mislabeler.reset()

    columns = {
        "# of Mislabeled Words": sizes,
        "Detection Rates": detection_rates,
        "Target Rates": target_rates
    }
    with open("/Users/AlexYang/Desktop/hamasspam.txt", 'w') as outfile:
        table = tabulate(columns, headers="keys", tablefmt="plain")
        outfile.write(table)
def test():

    y = [0, 60, 120, 240, 480]

    hamdirs = [get_pathname_option("TestDriver", "ham_directories") % i for i in range(1, 4)]
    spamdirs = [get_pathname_option("TestDriver", "spam_directories") % i for i in range(1, 4)]

    d = TestDriver.Driver()
    d.new_classifier()
    d.train(msgs.HamStream(hamdirs[0], [hamdirs[0]]), msgs.SpamStream(spamdirs[0], [spamdirs[0]]))

    mislabeled = [[], [], []]
    prev_detection_rate = None

    detection_rates = []
    detection_rates_on_mislabeled = []
    correct_results = []
    results_from_mislabeled = []
    for y_val in y:
        dw = DictionaryWriter(y_val)
        dw.reset()
        dw.write()

        d.train(msgs.HamStream(hamdirs[2], [hamdirs[2]]), msgs.SpamStream(spamdirs[2], [spamdirs[2]]))

        if y_val is 0:  # Initial Test
            d.test(msgs.HamStream(hamdirs[1], [hamdirs[1]]), msgs.SpamStream(spamdirs[1], [spamdirs[1]]))

            rate = d.tester.correct_classification_rate()
            mislabeled[0] = d.tester.ham_wrong_examples  # Ham mislabeled as Spam
            mislabeled[1] = d.tester.spam_wrong_examples  # Spam mislabeled as Ham
            mislabeled[2] = d.tester.unsure_examples  # Unsure

            ham = []
            spam = []

            ham += mislabeled[0]
            spam += mislabeled[1]

            for msg in mislabeled[2]:
                if msg.tag.endswith(".ham.txt"):
                    ham.append(msg)
                elif msg.tag.endswith(".spam.txt"):
                    spam.append(msg)
                else:
                    print "What"
                    exit()

            d.test(ham, spam)
            m_rate = d.tester.correct_classification_rate()

            detection_rates.append(rate)
            prev_detection_rate = rate
            correct_results.append("")
            results_from_mislabeled.append("")
            detection_rates_on_mislabeled.append(m_rate)

            d.untrain(msgs.HamStream(hamdirs[2], [hamdirs[2]]), msgs.SpamStream(spamdirs[2], [spamdirs[2]]))
            dw.reset()
        else:
            d.test(msgs.HamStream(hamdirs[1], [hamdirs[1]]), msgs.SpamStream(spamdirs[1], [spamdirs[1]]))

            rate = d.tester.correct_classification_rate()
            detection_rates.append(rate)

            if rate > prev_detection_rate:
                correct_results.append("Improved")
            elif rate < prev_detection_rate:
                correct_results.append("Worsened")
            else:
                correct_results.append("Unchanged")

            prev_detection_rate = rate

            ham = []
            spam = []

            ham += mislabeled[0]
            spam += mislabeled[1]

            # for msg in mislabeled[2]:
            #    if msg.tag.endswith(".ham.txt"):
            #        ham.append(msg)
            #    elif msg.tag.endswith(".spam.txt"):
            #        spam.append(msg)
            #    else:
            #        print "What"
            #        exit()

            d.test(ham, spam)
            rate = d.tester.correct_classification_rate()
            detection_rates_on_mislabeled.append(rate)

            dw.reset()

    outfile = open("mislabeled_rates.txt", "w")
    outfile.write(
        tabulate(
            {
                "# of Dictionaries": y,
                "Detection Rate": detection_rates,
                "True Change": correct_results,
                "Detection Rate from Mislabeled": detection_rates_on_mislabeled,
                "Interpreted Change": results_from_mislabeled,
            },
            headers="keys",
        )
    )
        elif opt == '-g':
            good.append(arg)
        elif opt == '-s':
            spam.append(arg)
        elif opt == "-r":
            removetrained = True
        elif opt == '-o':
            options.set_from_cmdline(arg, sys.stderr)
    pck, usedb = storage.database_type(opts)
    if args:
        usage(2, "Positional arguments not allowed")

    if usedb == None:
        # Use settings in configuration file.
        usedb = options["Storage", "persistent_use_database"]
        pck = get_pathname_option("Storage",
                                          "persistent_storage_file")

    h = hammie.open(pck, usedb, "c")

    for g in good:
        if loud:
            print "Training ham (%s):" % g
        train(h, g, False, force, trainnew, removetrained)
        sys.stdout.flush()
        save = True

    for s in spam:
        if loud:
            print "Training spam (%s):" % s
        train(h, s, True, force, trainnew, removetrained)
        sys.stdout.flush()
Пример #40
0
def test():

    y = [0, 60, 120, 240, 480]

    hamdirs = [get_pathname_option("TestDriver", "ham_directories") % i for i in range(1, 4)]
    spamdirs = [get_pathname_option("TestDriver", "spam_directories") % i for i in range(1, 4)]

    d = TestDriver.Driver()
    d.new_classifier()
    d.train(msgs.HamStream(hamdirs[0], [hamdirs[0]]),
            msgs.SpamStream(spamdirs[0], [spamdirs[0]]))

    mislabeled = [[], [], []]
    prev_detection_rate = None

    detection_rates = []
    detection_rates_on_mislabeled = []
    correct_results = []
    results_from_mislabeled = []
    for y_val in y:
        dw = DictionaryWriter(y_val)
        dw.reset()
        dw.write()

        d.train(msgs.HamStream(hamdirs[2], [hamdirs[2]]),
                msgs.SpamStream(spamdirs[2], [spamdirs[2]]))

        if y_val is 0:  # Initial Test
            d.test(msgs.HamStream(hamdirs[1], [hamdirs[1]]),
                   msgs.SpamStream(spamdirs[1], [spamdirs[1]]))

            rate = d.tester.correct_classification_rate()
            mislabeled[0] = d.tester.ham_wrong_examples    # Ham mislabeled as Spam
            mislabeled[1] = d.tester.spam_wrong_examples   # Spam mislabeled as Ham
            mislabeled[2] = d.tester.unsure_examples       # Unsure

            ham = []
            spam = []

            ham += mislabeled[0]
            spam += mislabeled[1]

            for msg in mislabeled[2]:
                if msg.tag.endswith(".ham.txt"):
                    ham.append(msg)
                elif msg.tag.endswith(".spam.txt"):
                    spam.append(msg)
                else:
                    print "What"
                    exit()

            d.test(ham, spam)
            m_rate = d.tester.correct_classification_rate()

            detection_rates.append(rate)
            prev_detection_rate = rate
            correct_results.append("")
            results_from_mislabeled.append("")
            detection_rates_on_mislabeled.append(m_rate)

            d.untrain(msgs.HamStream(hamdirs[2], [hamdirs[2]]),
                      msgs.SpamStream(spamdirs[2], [spamdirs[2]]))
            dw.reset()
        else:
            d.test(msgs.HamStream(hamdirs[1], [hamdirs[1]]),
                   msgs.SpamStream(spamdirs[1], [spamdirs[1]]))

            rate = d.tester.correct_classification_rate()
            detection_rates.append(rate)

            if rate > prev_detection_rate:
                correct_results.append("Improved")
            elif rate < prev_detection_rate:
                correct_results.append("Worsened")
            else:
                correct_results.append("Unchanged")

            prev_detection_rate = rate

            ham = []
            spam = []

            ham += mislabeled[0]
            spam += mislabeled[1]

            #for msg in mislabeled[2]:
            #    if msg.tag.endswith(".ham.txt"):
            #        ham.append(msg)
            #    elif msg.tag.endswith(".spam.txt"):
            #        spam.append(msg)
            #    else:
            #        print "What"
            #        exit()

            d.test(ham, spam)
            rate = d.tester.correct_classification_rate()
            detection_rates_on_mislabeled.append(rate)

            dw.reset()

    outfile = open("mislabeled_rates.txt", 'w')
    outfile.write(tabulate({"# of Dictionaries": y, "Detection Rate": detection_rates, "True Change": correct_results,
                            "Detection Rate from Mislabeled": detection_rates_on_mislabeled,
                            "Interpreted Change": results_from_mislabeled},
                           headers="keys"))
Пример #41
0
def main():
    """Interactively record detection/spam rates for a growing cluster.

    An ActiveUnlearner is built once.  Each trial then:

    1. picks a random message from ``train_examples[0]``;
    2. asks ``determine_cluster`` for a cluster around it and re-learns
       that cluster;
    3. builds a fresh ``"extreme"`` cluster of size 100 around the message
       and grows it in steps of 100, recording the detection rate and the
       fraction of spam in the cluster after every step;
    4. writes the collected numbers to a per-trial text file;
    5. asks on stdin whether to run another trial.
    """
    import os
    import sys
    from random import choice

    # NOTE: insert(-1, ...) places the entry just *before* the last element
    # of sys.path, it does not append.
    sys.path.insert(-1, os.getcwd())
    sys.path.insert(-1, os.path.dirname(os.getcwd()))

    from spambayes import ActiveUnlearnDriver
    from spambayes.Options import get_pathname_option
    from spambayes import msgs

    # NOTE: a DictionaryWriter(600).write() step used to run before the
    # unlearner was built; it is disabled.

    ham = [
        get_pathname_option("TestDriver", "ham_directories") % i
        for i in range(1, 5)
    ]
    spam = [
        get_pathname_option("TestDriver", "spam_directories") % i
        for i in range(1, 5)
    ]

    trial_number = 1
    step = 100  # initial cluster size and growth increment
    separator = "--------------------------\n"

    au_v = ActiveUnlearnDriver.ActiveUnlearner(
        [msgs.HamStream(ham[1], [ham[1]]),
         msgs.HamStream(ham[2], [ham[2]])],
        [msgs.SpamStream(spam[1], [spam[1]]),
         msgs.SpamStream(spam[3], [spam[3]])],
        msgs.HamStream(ham[0], [ham[0]]),
        msgs.SpamStream(spam[0], [spam[0]]),
    )

    while True:
        msg = choice(au_v.driver.tester.train_examples[0])
        try:
            test_cl, counter = au_v.determine_cluster(msg)
            test_size = test_cl.size
            au_v.learn(test_cl)
        except TypeError:
            # Presumably determine_cluster returned something that cannot
            # be unpacked (e.g. None) -- TODO confirm.  Fall back to a
            # single growth round and flag the failure in the report.
            counter = 1
            test_size = "100, but fail"

        cluster_detection_rates_v = []
        cluster_spam_rates_v = []

        au_v.init_ground()
        original_rate_v = au_v.driver.tester.correct_classification_rate()
        cluster_size = step

        print("Clustering with size %s ..." % (cluster_size,))

        cl_v = ActiveUnlearnDriver.Cluster(msg, cluster_size, au_v, "extreme")
        cluster_spam_rates_v.append(
            float(cl_v.target_spam()) / float(cluster_size))
        cluster_detection_rates_v.append(au_v.start_detect_rate(cl_v))

        # counter + 1 growth rounds, each adding `step` messages.
        for _ in range(counter + 1):
            cluster_size += step

            print("Clustering with size %s ..." % (cluster_size,))

            # Grow first, then measure the spam share of the grown cluster.
            cluster_detection_rates_v.append(
                au_v.continue_detect_rate(cl_v, step))
            cluster_spam_rates_v.append(
                float(cl_v.target_spam()) / float(cluster_size))

        # Backslashes are escaped explicitly so the literal does not depend
        # on "\U", "\A", "\D" and "\d" happening not to be escape sequences.
        report_path = ("C:\\Users\\Alex\\Desktop\\det_cluster_stats_v" +
                       str(trial_number) + ".txt")
        with open(report_path, 'w') as outfile:
            outfile.write("VANILLA MACHINE\n")
            outfile.write(separator)
            outfile.write("Clustered around: " + msg.tag + "\n")
            outfile.write(separator)

            outfile.write("Detection Rates:\n")
            outfile.write(str(original_rate_v) + "\n")
            for rate in cluster_detection_rates_v:
                outfile.write(str(rate) + "\n")
            outfile.write(separator)

            outfile.write("Spam Rate:\n")
            for rate in cluster_spam_rates_v:
                outfile.write(str(rate) + "\n")

            outfile.write("Test Cluster Size:\n")
            outfile.write(str(test_size))

        answer = raw_input("Keep going (y/n)? You have performed " +
                           str(trial_number) + " trials so far. ")
        if answer == "n":
            break

        # Any other answer continues: re-learn the cluster that was just
        # examined before starting the next trial.
        au_v.learn(cl_v)
        au_v.init_ground()
        trial_number += 1
Пример #42
0
 def __init__(self, num_files, dir_num=3):
     """Remember the file count and list the target spam directory.

     num_files is stored as NUMFILES.  dir_num (default 3) is substituted
     into the "spam_directories" pattern from the TestDriver options to
     form the destination directory, whose current listing is cached in
     destination_files.
     """
     self.NUMFILES = num_files
     spam_dir_pattern = get_pathname_option("TestDriver", "spam_directories")
     target_dir = (spam_dir_pattern % dir_num) + "/"
     self.destination = target_dir
     self.destination_files = listdir(target_dir)
Пример #43
0
def main():
    """Train an ActiveUnlearner, then run interactive unlearning trials.

    The unlearner is trained once and the training time is printed.  Each
    trial runs ``brute_force_active_unlearn``, tallies how many of the
    unlearned messages were polluted (``target_set3``) and writes the
    statistics to a per-trial file.  The user is then asked whether to
    continue; on "y" every unlearned cluster is learned again before the
    next trial.  Ctrl-C at any point exits the program.
    """
    ham = [
        get_pathname_option("TestDriver", "ham_directories") % i
        for i in range(1, 5)
    ]
    spam = [
        get_pathname_option("TestDriver", "spam_directories") % i
        for i in range(1, 5)
    ]
    keep_going = True
    trial_number = 1
    separator = "---------------------------\n"
    # Divisor for "Percentage of Polluted Unlearned"; presumably the total
    # number of polluted messages in the training data -- TODO confirm.
    total_polluted = 1200

    def fraction(part, whole):
        # Share of `part` in `whole`; 0.0 when `whole` is zero so a trial
        # that unlearned nothing still yields a complete stats file
        # instead of dying with ZeroDivisionError.
        if not whole:
            return 0.0
        return float(part) / float(whole)

    try:
        time_1 = time.time()
        au = ActiveUnlearnDriver.ActiveUnlearner(
            [
                msgs.HamStream(ham[1], [ham[1]]),
                msgs.HamStream(ham[2], [ham[2]])
            ],  # Training Ham
            [
                msgs.SpamStream(spam[1], [spam[1]]),
                msgs.SpamStream(spam[2], [spam[2]])
            ],  # Training Spam
            msgs.HamStream(ham[0], [ham[0]]),  # Testing Ham
            msgs.SpamStream(spam[0], [spam[0]]),  # Testing Spam
        )
        time_2 = time.time()
        train_time = time_2 - time_1
        print("Train time: %s \n" % (train_time,))

        while keep_going:
            stats_path = ("C:\\Users\\Alex\\Desktop\\unpollute_stats\\"
                          "unlearn_stats" + str(trial_number) + ".txt")
            with open(stats_path, 'w') as outfile:
                try:
                    outfile.write("CLUSTER AND RATE COUNTS:\n")
                    outfile.write(separator)

                    original_detection_rate = (
                        au.driver.tester.correct_classification_rate())
                    outfile.write("0: " + str(original_detection_rate) + "\n")

                    time_start = time.time()
                    cluster_list = au.brute_force_active_unlearn(
                        outfile,
                        test=True,
                        center_iteration=False,
                        pollution_set3=True,
                        gold=True)
                    time_end = time.time()
                    unlearn_time = time_end - time_start
                    final_detection_rate = au.current_detection_rate

                    print("\nTallying up final counts...\n")
                    total_unlearned = 0
                    total_polluted_unlearned = 0
                    total_unpolluted_unlearned = 0
                    for cluster in cluster_list:
                        polluted = cluster.target_set3()
                        total_unlearned += cluster.size
                        total_polluted_unlearned += polluted
                        total_unpolluted_unlearned += cluster.size - polluted

                    outfile.write("\nSTATS\n")
                    outfile.write(separator)
                    outfile.write("Initial Detection Rate: " +
                                  str(original_detection_rate) + "\n")
                    outfile.write("Final Detection Rate: " +
                                  str(final_detection_rate) + "\n")
                    outfile.write("Total Unlearned:\n")
                    outfile.write(str(total_unlearned) + "\n")
                    outfile.write("Polluted Percentage of Unlearned:\n")
                    outfile.write(
                        str(fraction(total_polluted_unlearned,
                                     total_unlearned)) + "\n")
                    outfile.write("Unpolluted Percentage of Unlearned:\n")
                    outfile.write(
                        str(fraction(total_unpolluted_unlearned,
                                     total_unlearned)) + "\n")
                    outfile.write("Percentage of Polluted Unlearned:\n")
                    outfile.write(
                        str(float(total_polluted_unlearned) / total_polluted)
                        + "\n")
                    outfile.write("Time for training:\n")
                    outfile.write(str(train_time) + "\n")
                    outfile.write("Time for unlearning:\n")
                    outfile.write(str(unlearn_time))

                except KeyboardInterrupt:
                    # Push whatever was written so far to disk before
                    # quitting, so an interrupted trial leaves usable data.
                    outfile.flush()
                    os.fsync(outfile.fileno())
                    sys.exit()

            answer = raw_input("\nKeep going (y/n)? You have performed " +
                               str(trial_number) + " trial(s) so far. ")
            # Re-prompt until the answer is exactly "y" or "n".
            while answer not in ("y", "n"):
                answer = raw_input("Please enter either y or n. ")

            if answer == "n":
                keep_going = False
            else:
                # Restore the unlearned clusters before the next trial.
                for cluster in cluster_list:
                    au.learn(cluster)
                au.init_ground()
                trial_number += 1

    except KeyboardInterrupt:
        sys.exit()
Пример #44
0
def main():
    """Cluster around one random training message at many sizes.

    Builds an ActiveUnlearner, picks a random message from
    ``train_examples[2]``, and for each cluster size records the detection
    rate and the fraction of the cluster reported by ``target_set3``.  The
    message's classifier clues are dumped to ``clues.txt`` and the
    per-size statistics are written as a table to ``clusterstats.txt``.
    """
    ham = [
        get_pathname_option("TestDriver", "ham_directories") % i
        for i in range(1, 4)
    ]
    spam = [
        get_pathname_option("TestDriver", "spam_directories") % i
        for i in range(1, 4)
    ]

    au = ActiveUnlearnDriver.ActiveUnlearner(
        [msgs.HamStream(ham[0], [ham[0]]),
         msgs.HamStream(ham[2], [ham[2]])],
        [msgs.SpamStream(spam[0], [spam[0]]),
         msgs.SpamStream(spam[2], [spam[2]])],
        msgs.HamStream(ham[1], [ham[1]]),
        msgs.SpamStream(spam[1], [spam[1]]))

    # Per the original author's note: randomly chosen from Ham Set3.
    msg = choice(au.driver.tester.train_examples[2])

    original_rate = au.driver.tester.correct_classification_rate()
    cluster_sizes = []
    detection_rates = []
    target_cluster_rates = []

    # 150..1000 in steps of 50, then 1000..14000 in steps of 1000.
    # Size 1000 is therefore measured twice, as it always has been.
    sizes = list(range(150, 1050, 50)) + list(range(1000, 15000, 1000))

    for size in sizes:
        cluster = ActiveUnlearnDriver.Cluster(msg, size, au, "extreme")
        print("Clustering with size " + str(cluster.size) + "...")
        cluster_sizes.append(size)
        detection_rates.append(au.detect_rate(cluster))
        target_cluster_rates.append(
            float(cluster.target_set3()) / float(cluster.size))

    # A context manager guarantees the clues file is flushed and closed;
    # the handle used to be left open for the rest of the run.
    with open("/Users/AlexYang/Desktop/clues.txt", 'w') as clues_file:
        features = au.driver.classifier._getclues(msg)
        for number, feature in enumerate(features, 1):
            clues_file.write(str(number) + ") ")
            clues_file.write(str(feature) + "\n")

    with open("/Users/AlexYang/Desktop/clusterstats.txt", 'w') as outfile:
        outfile.write("Clustered around: " + msg.tag)
        outfile.write("\nOriginal Rate: " + str(original_rate) + "\n")

        outfile.write(
            tabulate(
                {
                    "Cluster Sizes": cluster_sizes,
                    "Detection Rates": detection_rates,
                    "% of Targets Clustered": target_cluster_rates
                },
                headers="keys",
                tablefmt="plain"))
def main():
    """Run interactive trials that grow a cluster and log its rates.

    After building one ActiveUnlearner, every trial chooses a random
    message from ``train_examples[0]``, lets ``determine_cluster`` pick a
    cluster for it (re-learning that cluster), then builds an ``"extreme"``
    cluster of 100 messages around the same message and enlarges it by 100
    at a time.  Detection rate and spam share are recorded after each step
    and written to a per-trial report file, after which the user decides
    on stdin whether another trial should run.
    """
    import os
    import sys
    from random import choice

    # NOTE: insert(-1, ...) puts the path before the last sys.path entry
    # rather than at the very end.
    sys.path.insert(-1, os.getcwd())
    sys.path.insert(-1, os.path.dirname(os.getcwd()))

    from spambayes import ActiveUnlearnDriver
    from spambayes.Options import get_pathname_option
    from spambayes import msgs

    # NOTE: the DictionaryWriter(600).write() step that once preceded the
    # unlearner construction is disabled.

    ham = [get_pathname_option("TestDriver", "ham_directories") % i
           for i in range(1, 5)]
    spam = [get_pathname_option("TestDriver", "spam_directories") % i
            for i in range(1, 5)]

    increment = 100  # starting cluster size and per-round growth
    rule = "--------------------------\n"
    trial_number = 1

    au_v = ActiveUnlearnDriver.ActiveUnlearner(
        [msgs.HamStream(ham[1], [ham[1]]),
         msgs.HamStream(ham[2], [ham[2]])],
        [msgs.SpamStream(spam[1], [spam[1]]),
         msgs.SpamStream(spam[3], [spam[3]])],
        msgs.HamStream(ham[0], [ham[0]]),
        msgs.SpamStream(spam[0], [spam[0]]),
    )

    while True:
        msg = choice(au_v.driver.tester.train_examples[0])
        try:
            test_cl, counter = au_v.determine_cluster(msg)
            test_size = test_cl.size
            au_v.learn(test_cl)
        except TypeError:
            # Presumably raised when determine_cluster yields a value that
            # cannot be unpacked into two names -- TODO confirm.  Use one
            # growth round and mark the failure in the report.
            counter = 1
            test_size = "100, but fail"

        cluster_detection_rates_v = []
        cluster_spam_rates_v = []

        au_v.init_ground()
        original_rate_v = au_v.driver.tester.correct_classification_rate()
        cluster_size = increment

        print("Clustering with size %s ..." % (cluster_size,))

        cl_v = ActiveUnlearnDriver.Cluster(msg, cluster_size, au_v, "extreme")
        cluster_spam_rates_v.append(
            float(cl_v.target_spam()) / float(cluster_size))
        cluster_detection_rates_v.append(au_v.start_detect_rate(cl_v))

        # counter + 1 rounds of growth.
        for _ in range(counter + 1):
            cluster_size += increment

            print("Clustering with size %s ..." % (cluster_size,))

            # Order matters: the cluster is grown by continue_detect_rate
            # before its spam share is sampled.
            cluster_detection_rates_v.append(
                au_v.continue_detect_rate(cl_v, increment))
            cluster_spam_rates_v.append(
                float(cl_v.target_spam()) / float(cluster_size))

        # Escaped backslashes: the unescaped form only worked because none
        # of "\U", "\A", "\D", "\d" is an escape in a Python 2 byte string.
        stats_file = ("C:\\Users\\Alex\\Desktop\\det_cluster_stats_v" +
                      str(trial_number) + ".txt")
        with open(stats_file, 'w') as outfile:
            outfile.write("VANILLA MACHINE\n")
            outfile.write(rule)
            outfile.write("Clustered around: " + msg.tag + "\n")
            outfile.write(rule)

            outfile.write("Detection Rates:\n")
            outfile.write(str(original_rate_v) + "\n")
            for value in cluster_detection_rates_v:
                outfile.write(str(value) + "\n")
            outfile.write(rule)

            outfile.write("Spam Rate:\n")
            for value in cluster_spam_rates_v:
                outfile.write(str(value) + "\n")

            outfile.write("Test Cluster Size:\n")
            outfile.write(str(test_size))

        answer = raw_input("Keep going (y/n)? You have performed " +
                           str(trial_number) + " trials so far. ")
        if answer == "n":
            break

        # Anything but "n" continues; put the examined cluster back first.
        au_v.learn(cl_v)
        au_v.init_ground()
        trial_number += 1
Пример #46
0
def main():
    """Cluster around one test message and dump the cluster to a file.

    Builds an ActiveUnlearnDriver, tests, untrains two ham/spam set pairs
    and tests again, then takes ``test_examples[5]``, copies that message
    to a scratch directory, clusters 10 messages around it and writes the
    text of every clustered message (labelled SPAM/HAM by file suffix) to
    ``cluster7.txt``.  Prints the message probability, the cluster, the
    message clues, the number of spam files in the cluster and the time
    the clustering took.
    """
    import os
    import sys
    import shutil

    # NOTE: insert(-1, ...) places the entry before the last sys.path item.
    sys.path.insert(-1, os.getcwd())
    sys.path.insert(-1, os.path.dirname(os.getcwd()))

    from spambayes import ActiveUnlearnDriver
    from spambayes.Options import get_pathname_option
    from spambayes import msgs
    import time

    ham = [
        get_pathname_option("TestDriver", "ham_directories") % i
        for i in range(1, 5)
    ]
    spam = [
        get_pathname_option("TestDriver", "spam_directories") % i
        for i in range(1, 5)
    ]

    def ham_stream(index):
        # Fresh HamStream over the ham directory at `index`.
        return msgs.HamStream(ham[index], [ham[index]])

    def spam_stream(index):
        # Fresh SpamStream over the spam directory at `index`.
        return msgs.SpamStream(spam[index], [spam[index]])

    # Backslashes are escaped explicitly; the unescaped literals relied on
    # "\U", "\A", "\D" and "\c" not being escapes in Python 2 byte strings.
    scratch_dir = "C:\\Users\\Alex\\Desktop\\clustera"
    dump_path = "C:\\Users\\Alex\\Desktop\\clustera\\cluster7.txt"
    divider = "\n\n" + "----------------------------------------" + "\n\n"

    # Single pass; the loop is kept so the run count stays easy to change.
    for _ in range(1):
        au = ActiveUnlearnDriver.ActiveUnlearnDriver(
            [ham_stream(0), ham_stream(2), ham_stream(3)],
            [spam_stream(0), spam_stream(2), spam_stream(3)],
            ham_stream(2),
            spam_stream(2),
            "ac-extreme")

        au.driver.test(ham_stream(0), spam_stream(0))
        au.driver.untrain(ham_stream(2), spam_stream(2))
        au.driver.untrain(ham_stream(3), spam_stream(3))
        au.driver.test(ham_stream(0), spam_stream(0))
        msg = au.driver.tester.test_examples[5]

        shutil.copy(msg.tag, scratch_dir)
        print(msg.prob)

        start_time = time.time()
        cluster = au.cluster(msg, 10)
        end_time = time.time()
        print(cluster)

        clueslist = [(clue[0], clue[1]) for clue in msg.clues]
        print(clueslist)

        with open(dump_path, 'w') as outfile:
            spamcounter = 0
            for sim in cluster:
                is_spam = sim.tag.endswith(".spam.txt")
                with open(sim.tag) as infile:
                    if is_spam:
                        outfile.write("SPAMSPAMSPAMSPAMSPAM" + "\n\n")
                    if sim.tag.endswith(".ham.txt"):
                        outfile.write("HAMHAMHAMHAMHAM" + "\n\n")

                    outfile.write(infile.read())
                    outfile.write(divider)

                if is_spam:
                    spamcounter += 1

            print(spamcounter)

        print(end_time - start_time)
Пример #47
0
from string import ascii_lowercase
from os import listdir, remove
from random import choice, sample
from spambayes.Options import get_pathname_option

# Default output directory: the TestDriver "spam_directories" path pattern
# filled in with set number 3.
default_dest = get_pathname_option("TestDriver", "spam_directories") % 3


def write_dictionary_sets(number_clusters=26, x=0.5, y=200, destination=default_dest):
    destination += "/"

    letterset = {}  # A dictionary of words: Key = Letter, Value = Words beginning with that letter

    for letter in ascii_lowercase:
        letterset[letter] = []

    with open("dictionary.txt", 'r') as dictionary:
        for line in dictionary:
            letter = line[0]
            letterset[letter].append(line.strip())

    keys = sample(letterset.keys(), number_clusters)
    for letter in keys:
        print "Writing sets for letter " + letter + " ..."
        x = x                       # Percentage overlap of words between sets
        y = y                       # Number of sets per letter
        o_set = letterset[letter]   # Size of original set of words beginning with letter
        b = len(o_set)              # Size of set being pulled from
        a = int(b * x)              # Size of resultant sets

        for i in range(y):
Пример #48
0
    def close(self):
        """Close self.db and then self.dbm, if they know how to close.

        Not every database object offers a close() method, so a missing
        one is replaced by a do-nothing callable rather than raising.
        """
        do_nothing = lambda: None

        close_db = getattr(self.db, "close", do_nothing)
        close_db()

        close_dbm = getattr(self.dbm, "close", do_nothing)
        close_dbm()

    def store(self):
        """Sync the underlying database; do nothing when self.db is None."""
        if self.db is None:
            return
        self.db.sync()

# Module-level setup of the shared message-info store (msginfoDB).
#
# Ideally this would come from a single master database (an idea credited
# to Mark Hammond).  Until one exists, the name of a separate file is read
# from the options so that these files do not litter lots of working
# directories.  Once there is a master db, this option can be removed.
message_info_db_name = get_pathname_option("Storage", "messageinfo_storage_file")
# The "persistent_use_database" option may be a boolean or a string:
# True / "dbm" selects MessageInfoDB, False / "pickle" selects
# MessageInfoPickle.
if options["Storage", "persistent_use_database"] is True or \
   options["Storage", "persistent_use_database"] == "dbm":
    msginfoDB = MessageInfoDB(message_info_db_name)
elif options["Storage", "persistent_use_database"] is False or \
     options["Storage", "persistent_use_database"] == "pickle":
    msginfoDB = MessageInfoPickle(message_info_db_name)
else:
    # Any other value: maybe the user has mysql or pgsql or zeo, or some
    # other storage backend.  We don't know what to do in that case, so
    # fall back to a pickle, since it's the safest option.
    msginfoDB = MessageInfoPickle(message_info_db_name)

class Message(email.Message.Message):
    '''An email.Message.Message extended for Spambayes'''