示例#1
0
def drive(nsets):
    """Cross-validate over ``nsets`` numbered ham/spam directory pairs.

    Directory names are built from the "TestDriver" ``ham_directories`` and
    ``spam_directories`` path options, formatted with 1..nsets.  The driver
    is first trained on every set except the first.  Each set is then tested
    in turn; every set after the first is taken out of the training data
    beforehand, either by untraining it or -- when the "CV Driver"
    ``build_each_classifier_from_scratch`` option is true -- by building a
    new classifier from all the other sets.
    """
    print(options.display())

    def numbered(option_name):
        # One path per set; the option value is formatted with the set number.
        return [get_pathname_option("TestDriver", option_name) % number
                for number in range(1, nsets + 1)]

    ham_dirs = numbered("ham_directories")
    spam_dirs = numbered("spam_directories")

    driver = TestDriver.Driver()
    # Initial training covers every set except the first.
    ham_rest = msgs.HamStream("%s-%d" % (ham_dirs[1], nsets),
                              ham_dirs[1:], train=1)
    spam_rest = msgs.SpamStream("%s-%d" % (spam_dirs[1], nsets),
                                spam_dirs[1:], train=1)
    driver.train(ham_rest, spam_rest)

    final_index = nsets - 1
    for index, (ham_dir, spam_dir) in enumerate(zip(ham_dirs, spam_dirs)):
        ham_stream = msgs.HamStream(ham_dir, [ham_dir], train=0)
        spam_stream = msgs.SpamStream(spam_dir, [spam_dir], train=0)

        if index > 0:
            if options["CV Driver", "build_each_classifier_from_scratch"]:
                # Start over and train on every set except this one.
                driver.new_classifier()
                ham_label = "%s-%d, except %d" % (ham_dirs[0], nsets, index + 1)
                spam_label = "%s-%d, except %d" % (spam_dirs[0], nsets, index + 1)
                other_ham = ham_dirs[:index] + ham_dirs[index + 1:]
                other_spam = spam_dirs[:index] + spam_dirs[index + 1:]
                driver.train(msgs.HamStream(ham_label, other_ham, train=1),
                             msgs.SpamStream(spam_label, other_spam, train=1))
            else:
                # Remove this set from the existing classifier.
                driver.untrain(ham_stream, spam_stream)

        driver.test(ham_stream, spam_stream)
        driver.finishtest()

        if index < final_index:
            if not options["CV Driver", "build_each_classifier_from_scratch"]:
                # Put this set back before moving on to the next one.
                driver.train(ham_stream, spam_stream)

    driver.alldone()
示例#2
0
 def drive(nsets):
     """Train on each spam/ham directory pair alone and test on the others.

     Directory names are built from the "TestDriver" ``spam_directories``
     and ``ham_directories`` path options, formatted with 1..nsets.  For
     every pair a new classifier is trained on that pair only, then tested
     against each pair that does not compare equal to it.
     """
     print(options.display())

     def numbered(option_name):
         # One path per set; the option value is formatted with the set number.
         return [get_pathname_option("TestDriver", option_name) % number
                 for number in range(1, nsets + 1)]

     spam_dirs = numbered("spam_directories")
     ham_dirs = numbered("ham_directories")
     pairs = zip(spam_dirs, ham_dirs)

     driver = TestDriver.Driver()
     for train_pair in pairs:
         train_spam, train_ham = train_pair
         driver.new_classifier()
         driver.train(msgs.HamStream(train_ham, [train_ham]),
                      msgs.SpamStream(train_spam, [train_spam]))

         # Every other pair is test data for this classifier.
         for test_pair in pairs:
             if test_pair != train_pair:
                 test_spam, test_ham = test_pair
                 driver.test(msgs.HamStream(test_ham, [test_ham]),
                             msgs.SpamStream(test_spam, [test_spam]))

         driver.finishtest()

     driver.alldone()
示例#3
0
def unlearn_compare(nsets, unsets):
    """Train on sets 1 and 2, test on set 3, then hand off to unlearn_driver.

    Two lists of (spam dir, ham dir) pairs are built from the "TestDriver"
    path options: one for sets 1..nsets and one for sets 1..unsets.  After
    the fixed train/test run has been finished, the driver and both pair
    lists are passed to ``unlearn_driver``.
    """
    print(options.display())

    def numbered(option_name, count):
        # One path per set; the option value is formatted with the set number.
        return [get_pathname_option("TestDriver", option_name) % number
                for number in range(1, count + 1)]

    spam_dirs = numbered("spam_directories", nsets)
    ham_dirs = numbered("ham_directories", nsets)
    all_pairs = zip(spam_dirs, ham_dirs)
    un_spam_dirs = numbered("spam_directories", unsets)
    un_ham_dirs = numbered("ham_directories", unsets)
    un_pairs = zip(un_spam_dirs, un_ham_dirs)

    def streams(index):
        # Ham and spam streams over the single directory pair at ``index``.
        ham_dir = ham_dirs[index]
        spam_dir = spam_dirs[index]
        return (msgs.HamStream(ham_dir, [ham_dir]),
                msgs.SpamStream(spam_dir, [spam_dir]))

    driver = TestDriver.Driver()
    driver.new_classifier()
    # A loop that trained on every pair in all_pairs was disabled here;
    # only the first two sets are used for training.
    driver.train(*streams(0))
    driver.train(*streams(1))
    driver.test(*streams(2))
    driver.finishtest()
    driver.alldone()

    unlearn_driver(driver, all_pairs, un_pairs)
def drive():
    """Run interactive active-unlearning trials and log each cluster's overlap.

    Builds four spam and four ham directory names from the "TestDriver" path
    options, writes a dictionary set with ``DictionaryWriter(150, 4)`` and
    creates an ``ActiveUnlearner`` from streams over those directories.
    Each trial selects initial messages until ``determine_cluster`` yields a
    non-empty cluster, compares the cluster's members with
    ``au.driver.tester.train_examples[2]`` via ``v_correlation`` and appends
    the result to the stats file.  The user is then asked whether to go on;
    on "y" the cluster is learned and another trial starts, on "n" the
    function returns.
    """
    print(options.display())

    spam = [get_pathname_option("TestDriver", "spam_directories") % i for i in range(1, 5)]
    ham = [get_pathname_option("TestDriver", "ham_directories") % i for i in range(1, 5)]

    d = dictionarywriter.DictionaryWriter(150, 4)
    d.write()

    trial_number = 1

    # NOTE(review): argument roles (training vs. testing streams) are defined
    # by ActiveUnlearner, which is not visible here -- confirm before changing.
    au = ActiveUnlearnDriver.ActiveUnlearner(
        [msgs.HamStream(ham[1], [ham[1]]),
         msgs.HamStream(ham[2], [ham[2]])],
        [msgs.SpamStream(spam[1], [spam[1]]),
         msgs.SpamStream(spam[3], [spam[3]])],
        msgs.HamStream(ham[0], [ham[0]]),
        msgs.SpamStream(spam[0], [spam[0]]),
    )

    # Raw string: the backslashes of the Windows path must stay literal and
    # must not depend on which escape sequences the interpreter recognises.
    with open(r"C:\Users\Alex\Desktop\dict_correlation_stats.txt", 'w') as outfile:
        keep_going = True
        while keep_going:
            chosen = set()
            current = au.select_initial()
            cluster = au.determine_cluster(current)
            chosen.add(current)
            au.driver.test(au.testing_ham, au.testing_spam)

            # Retry with the already-chosen messages passed along until a
            # truthy cluster comes back.
            while not cluster:
                current = au.select_initial(chosen)
                cluster = au.determine_cluster(current)
                chosen.add(current)
                au.driver.test(au.testing_ham, au.testing_spam)

            cluster_list = list(cluster.cluster_set)

            dicts = au.driver.tester.train_examples[2]

            data = v_correlation(cluster_list, dicts)

            # One record per line; without the terminator successive trials
            # would run together in the file.
            outfile.write("Trial " + str(trial_number) +
                          " Percentage Overlap (Correlation): " + str(data) + "\n")

            prompt = ("Keep going (y/n)? You have performed " +
                      str(trial_number) + " trial(s) so far. ")
            answer = raw_input(prompt)
            # Ask again on anything but y/n.  Reading the reply only once
            # made an invalid answer repeat the reminder forever.
            while answer not in ("y", "n"):
                print("Please enter either y or n.")
                answer = raw_input(prompt)

            if answer == "n":
                keep_going = False
            else:
                au.learn(cluster)
                au.init_ground()
                trial_number += 1
示例#5
0
def main(args):
    global FMT

    print options.display()

    FMT = "unix"
    NSETS = 10
    SEED = 101
    MAXMSGS = None
    opts, args = getopt.getopt(args, "f:n:s:m:")
    for k, v in opts:
        if k == '-f':
            FMT = v
        if k == '-n':
            NSETS = int(v)
        if k == '-s':
            SEED = int(v)
        if k == '-m':
            MAXMSGS = int(v)

    ham, spam = args

    random.seed(SEED)

    nham = len(list(mbox(ham)))
    nspam = len(list(mbox(spam)))

    if MAXMSGS:
        nham = min(nham, MAXMSGS)
        nspam = min(nspam, MAXMSGS)

    print "ham", ham, nham
    print "spam", spam, nspam

    ihams = map(tuple, randindices(nham, NSETS))
    ispams = map(tuple, randindices(nspam, NSETS))

    driver = Driver()

    for i in range(1, NSETS):
        driver.train(mbox(ham, ihams[i]), mbox(spam, ispams[i]))

    i = 0
    for iham, ispam in zip(ihams, ispams):
        hams = mbox(ham, iham)
        spams = mbox(spam, ispam)

        if i > 0:
            driver.untrain(hams, spams)

        driver.test(hams, spams)
        driver.finishtest()

        if i < NSETS - 1:
            driver.train(hams, spams)

        i += 1
    driver.alldone()
示例#6
0
文件: mboxtest.py 项目: Xodarap/Eipi
def main(args):
    global FMT

    print options.display()

    FMT = "unix"
    NSETS = 10
    SEED = 101
    MAXMSGS = None
    opts, args = getopt.getopt(args, "f:n:s:m:")
    for k, v in opts:
        if k == '-f':
            FMT = v
        if k == '-n':
            NSETS = int(v)
        if k == '-s':
            SEED = int(v)
        if k == '-m':
            MAXMSGS = int(v)

    ham, spam = args

    random.seed(SEED)

    nham = len(list(mbox(ham)))
    nspam = len(list(mbox(spam)))

    if MAXMSGS:
        nham = min(nham, MAXMSGS)
        nspam = min(nspam, MAXMSGS)

    print "ham", ham, nham
    print "spam", spam, nspam

    ihams = map(tuple, randindices(nham, NSETS))
    ispams = map(tuple, randindices(nspam, NSETS))

    driver = Driver()

    for i in range(1, NSETS):
        driver.train(mbox(ham, ihams[i]), mbox(spam, ispams[i]))

    i = 0
    for iham, ispam in zip(ihams, ispams):
        hams = mbox(ham, iham)
        spams = mbox(spam, ispam)

        if i > 0:
            driver.untrain(hams, spams)

        driver.test(hams, spams)
        driver.finishtest()

        if i < NSETS - 1:
            driver.train(hams, spams)

        i += 1
    driver.alldone()
示例#7
0
def drive(nsets,decision):
    """Score each message once and train only on those ``decision`` selects.

    Messages are listed from the "TestDriver" spam and ham directories for
    sets 1..nsets.  Every message is scored by the object returned from
    ``hammie.open('weaktest.db', False)``.  Unless ``decision.tooearly()``
    is true, the score is recorded in a ``CostCounter.nodelay()`` counter
    under the message's true class.  ``decision(score, is_spam)`` then
    decides whether the message is trained as spam, as ham, or not at all.
    A progress line and the counter are printed every 100 messages and once
    more at the end.
    """
    print(options.display())

    spamdirs = [get_pathname_option("TestDriver", "spam_directories") % \
                i for i in range(1, nsets+1)]
    hamdirs  = [get_pathname_option("TestDriver", "ham_directories") % \
                i for i in range(1, nsets+1)]

    # Each entry is (directory, file name, is_spam flag).
    spamfns = [(x, y, 1) for x in spamdirs for y in os.listdir(x)]
    hamfns = [(x, y, 0) for x in hamdirs for y in os.listdir(x)]
    cc = CostCounter.nodelay()

    # The entries become dict keys and are iterated from the dict below, so
    # messages are visited in dict order rather than list order.
    # NOTE(review): presumably a deliberate way to interleave spam and ham
    # -- confirm before replacing this with a plain list.
    allfns = {}
    for fn in spamfns + hamfns:
        allfns[fn] = None

    d = hammie.open('weaktest.db', False)

    progress = "%5d trained:%dH+%dS wrds:%d"
    hamtrain = 0
    spamtrain = 0
    n = 0
    # ``msgdir`` rather than ``dir`` so the builtin is not shadowed.
    for msgdir, name, is_spam in allfns.iterkeys():
        n += 1
        m = msgs.Msg(msgdir, name).guts
        if debug > 1:
            print("trained:%dH+%dS" % (hamtrain, spamtrain))
        scr = d.score(m)
        if debug > 1:
            print("score:%.3f" % scr)
        if not decision.tooearly():
            if is_spam:
                if debug > 0:
                    print("Spam with score %.2f" % scr)
                cc.spam(scr)
            else:
                if debug > 0:
                    print("Ham with score %.2f" % scr)
                cc.ham(scr)
        de = decision(scr, is_spam)
        if de == TRAIN_AS_SPAM:
            d.train_spam(m)
            spamtrain += 1
        elif de == TRAIN_AS_HAM:
            d.train_ham(m)
            hamtrain += 1
        if n % 100 == 0:
            print(progress % (n, hamtrain, spamtrain, len(d.bayes.wordinfo)))
            print(cc)
    print("=" * 70)
    print(progress % (n, hamtrain, spamtrain, len(d.bayes.wordinfo)))
    print(cc)
示例#8
0
def drive(nsets):
    """Train a new classifier on the first set and test it on the second.

    Directory names are built from the "TestDriver" ``spam_directories`` and
    ``ham_directories`` path options, formatted with 1..nsets; only the
    first two entries of each list are used.
    """
    print(options.display())

    def numbered(option_name):
        # One path per set; the option value is formatted with the set number.
        return [get_pathname_option("TestDriver", option_name) % number
                for number in range(1, nsets + 1)]

    spam_dirs = numbered("spam_directories")
    ham_dirs = numbered("ham_directories")

    def streams(index):
        # Looked up lazily so a missing set only fails when it is needed.
        ham_dir = ham_dirs[index]
        spam_dir = spam_dirs[index]
        return (msgs.HamStream(ham_dir, [ham_dir]),
                msgs.SpamStream(spam_dir, [spam_dir]))

    driver = TestDriver.Driver()
    driver.new_classifier()
    driver.train(*streams(0))
    driver.test(*streams(1))
    driver.finishtest()
    driver.alldone()
示例#9
0
def drive(nsets):
    """Cross-validate over ``nsets`` numbered ham/spam directory pairs.

    Directory names are built from the "TestDriver" ``ham_directories`` and
    ``spam_directories`` path options, formatted with 1..nsets.  The driver
    is first trained on every set except the first.  Each set is then tested
    in turn; every set after the first is taken out of the training data
    beforehand, either by untraining it or -- when the "CV Driver"
    ``build_each_classifier_from_scratch`` option is true -- by building a
    new classifier from all the other sets.
    """
    print(options.display())

    set_numbers = range(1, nsets + 1)
    ham_dirs = [get_pathname_option("TestDriver", "ham_directories") % number
                for number in set_numbers]
    spam_dirs = [get_pathname_option("TestDriver", "spam_directories") % number
                 for number in set_numbers]

    # Subscripting options with this tuple is the same lookup as
    # options["CV Driver", "build_each_classifier_from_scratch"].
    scratch_key = ("CV Driver", "build_each_classifier_from_scratch")

    driver = TestDriver.Driver()
    # Train it on all sets except the first.
    ham_rest = msgs.HamStream("%s-%d" % (ham_dirs[1], nsets),
                              ham_dirs[1:], train=1)
    spam_rest = msgs.SpamStream("%s-%d" % (spam_dirs[1], nsets),
                                spam_dirs[1:], train=1)
    driver.train(ham_rest, spam_rest)

    # Now run nsets times, predicting pair i against all except pair i.
    for index, (ham_dir, spam_dir) in enumerate(zip(ham_dirs, spam_dirs)):
        ham_stream = msgs.HamStream(ham_dir, [ham_dir], train=0)
        spam_stream = msgs.SpamStream(spam_dir, [spam_dir], train=0)

        if index > 0:
            if options[scratch_key]:
                # Build a new classifier from the other sets.
                driver.new_classifier()

                ham_label = "%s-%d, except %d" % (ham_dirs[0], nsets, index + 1)
                other_ham = ham_dirs[:index] + ham_dirs[index + 1:]

                spam_label = "%s-%d, except %d" % (spam_dirs[0], nsets, index + 1)
                other_spam = spam_dirs[:index] + spam_dirs[index + 1:]

                driver.train(msgs.HamStream(ham_label, other_ham, train=1),
                             msgs.SpamStream(spam_label, other_spam, train=1))
            else:
                # Forget this set.
                driver.untrain(ham_stream, spam_stream)

        # Predict this set.
        driver.test(ham_stream, spam_stream)
        driver.finishtest()

        if index < nsets - 1:
            if not options[scratch_key]:
                # Add this set back in.
                driver.train(ham_stream, spam_stream)

    driver.alldone()
def drive(nsets, decision):
    """Score each message once and train only on those ``decision`` selects.

    Messages are listed from the "TestDriver" spam and ham directories for
    sets 1..nsets.  Every message is scored by the object returned from
    ``hammie.open("weaktest.db", False)``.  Unless ``decision.tooearly()``
    is true, the score is recorded in a ``CostCounter.nodelay()`` counter
    under the message's true class.  ``decision(score, is_spam)`` then
    decides whether the message is trained as spam, as ham, or not at all.
    A progress line and the counter are printed every 100 messages and once
    more at the end.
    """
    print(options.display())

    spamdirs = [get_pathname_option("TestDriver", "spam_directories") % i for i in range(1, nsets + 1)]
    hamdirs = [get_pathname_option("TestDriver", "ham_directories") % i for i in range(1, nsets + 1)]

    # Each entry is (directory, file name, is_spam flag).
    spamfns = [(x, y, 1) for x in spamdirs for y in os.listdir(x)]
    hamfns = [(x, y, 0) for x in hamdirs for y in os.listdir(x)]
    cc = CostCounter.nodelay()

    # The entries become dict keys and are iterated from the dict below, so
    # messages are visited in dict order rather than list order.
    # NOTE(review): presumably a deliberate way to interleave spam and ham
    # -- confirm before replacing this with a plain list.
    allfns = {}
    for fn in spamfns + hamfns:
        allfns[fn] = None

    d = hammie.open("weaktest.db", False)

    progress = "%5d trained:%dH+%dS wrds:%d"
    hamtrain = 0
    spamtrain = 0
    n = 0
    # ``msgdir`` rather than ``dir`` so the builtin is not shadowed.
    for msgdir, name, is_spam in allfns.iterkeys():
        n += 1
        m = msgs.Msg(msgdir, name).guts
        if debug > 1:
            print("trained:%dH+%dS" % (hamtrain, spamtrain))
        scr = d.score(m)
        if debug > 1:
            print("score:%.3f" % scr)
        if not decision.tooearly():
            if is_spam:
                if debug > 0:
                    print("Spam with score %.2f" % scr)
                cc.spam(scr)
            else:
                if debug > 0:
                    print("Ham with score %.2f" % scr)
                cc.ham(scr)
        de = decision(scr, is_spam)
        if de == TRAIN_AS_SPAM:
            d.train_spam(m)
            spamtrain += 1
        elif de == TRAIN_AS_HAM:
            d.train_ham(m)
            hamtrain += 1
        if n % 100 == 0:
            print(progress % (n, hamtrain, spamtrain, len(d.bayes.wordinfo)))
            print(cc)
    print("=" * 70)
    print(progress % (n, hamtrain, spamtrain, len(d.bayes.wordinfo)))
    print(cc)
示例#11
0
def drive(nsets):
    """Cross-validate over ``nsets`` numbered ham/spam directory pairs.

    Directory names are built from the "TestDriver" ``ham_directories`` and
    ``spam_directories`` path options, formatted with 1..nsets.  The driver
    is first trained on every set except the first.  Each set is then tested
    in turn; every set after the first is taken out of the training data
    beforehand, either by untraining it or -- when the "CV Driver"
    ``build_each_classifier_from_scratch`` option is true -- by building a
    new classifier from all the other sets.
    """
    print(options.display())

    def numbered(option_name):
        # One path per set; the option value is formatted with the set number.
        return [get_pathname_option("TestDriver", option_name) % number
                for number in range(1, nsets + 1)]

    def without(items, position):
        # Copy of ``items`` with the element at ``position`` left out.
        return items[:position] + items[position + 1:]

    ham_dirs = numbered("ham_directories")
    spam_dirs = numbered("spam_directories")

    driver = TestDriver.Driver()
    # Train it on all sets except the first.
    ham_rest = msgs.HamStream("%s-%d" % (ham_dirs[1], nsets),
                              ham_dirs[1:], train=1)
    spam_rest = msgs.SpamStream("%s-%d" % (spam_dirs[1], nsets),
                                spam_dirs[1:], train=1)
    driver.train(ham_rest, spam_rest)

    # Now run nsets times, predicting pair i against all except pair i.
    for index, (ham_dir, spam_dir) in enumerate(zip(ham_dirs, spam_dirs)):
        ham_stream = msgs.HamStream(ham_dir, [ham_dir], train=0)
        spam_stream = msgs.SpamStream(spam_dir, [spam_dir], train=0)

        if index > 0:
            if options["CV Driver", "build_each_classifier_from_scratch"]:
                # Build a new classifier from the other sets.
                driver.new_classifier()
                ham_label = "%s-%d, except %d" % (ham_dirs[0], nsets, index + 1)
                spam_label = "%s-%d, except %d" % (spam_dirs[0], nsets, index + 1)
                driver.train(
                    msgs.HamStream(ham_label, without(ham_dirs, index), train=1),
                    msgs.SpamStream(spam_label, without(spam_dirs, index), train=1))
            else:
                # Forget this set.
                driver.untrain(ham_stream, spam_stream)

        # Predict this set.
        driver.test(ham_stream, spam_stream)
        driver.finishtest()

        if index < nsets - 1:
            if not options["CV Driver", "build_each_classifier_from_scratch"]:
                # Add this set back in.
                driver.train(ham_stream, spam_stream)

    driver.alldone()
示例#12
0
def drive(num):
    """Print the v_correlation of set-3 messages with the test's problem cases.

    Three spam and three ham directory names are built from the "TestDriver"
    path options.  ``MislabeledFileMover(num).random_move_file()`` runs
    first (presumably to plant mislabeled files -- confirm).  A new
    classifier is trained on sets 1 and 3 and tested on set 2.  Every
    message of set 3 gets its ``prob`` attribute set from the classifier's
    ``spamprob`` and is collected as "polluted"; the tester's false
    positives, false negatives and the driver's unsure messages are
    collected as "mislabeled".  Both lists go to ``v_correlation`` and the
    result is printed.
    """
    print(options.display())

    spam_dirs = [get_pathname_option("TestDriver", "spam_directories") % number
                 for number in range(1, 4)]
    ham_dirs = [get_pathname_option("TestDriver", "ham_directories") % number
                for number in range(1, 4)]

    mover = mislabeledfilemover.MislabeledFileMover(num)
    mover.random_move_file()

    def streams(index):
        # Ham and spam streams over the single directory pair at ``index``.
        return (msgs.HamStream(ham_dirs[index], [ham_dirs[index]]),
                msgs.SpamStream(spam_dirs[index], [spam_dirs[index]]))

    driver = TestDriver.Driver()
    driver.new_classifier()
    driver.train(*streams(0))
    driver.train(*streams(2))
    driver.test(*streams(1))

    spamprob = driver.classifier.spamprob
    polluted = []
    # Ham first, then spam; each stream is only created when its turn comes.
    for make_stream, directory in ((msgs.HamStream, ham_dirs[2]),
                                   (msgs.SpamStream, spam_dirs[2])):
        for msg in make_stream(directory, [directory]):
            msg.prob = spamprob(msg)
            polluted.append(msg)

    mislabeled = list(driver.tester.false_positives())
    mislabeled.extend(driver.tester.false_negatives())
    mislabeled.extend(driver.unsure)

    driver.finishtest()
    driver.alldone()

    data = v_correlation(polluted, mislabeled)

    print("Percentage Overlap (Correlation): " + str(data))
def drive(num):
    """Print the v_correlation of set-3 messages with the test's problem cases.

    Three spam and three ham directory names are built from the "TestDriver"
    path options.  ``MislabeledFileMover(num).random_move_file()`` runs
    first (presumably to plant mislabeled files -- confirm).  A new
    classifier is trained on sets 1 and 3 and tested on set 2.  Every
    message of set 3 gets its ``prob`` attribute set from the classifier's
    ``spamprob`` and is collected as "polluted"; the tester's false
    positives, false negatives and the driver's unsure messages are
    collected as "mislabeled".  Both lists go to ``v_correlation`` and the
    result is printed.
    """
    print(options.display())

    def numbered(option_name):
        # Sets 1..3; the option value is formatted with the set number.
        return [get_pathname_option("TestDriver", option_name) % number
                for number in range(1, 4)]

    spam_dirs = numbered("spam_directories")
    ham_dirs = numbered("ham_directories")

    mover = mislabeledfilemover.MislabeledFileMover(num)
    mover.random_move_file()

    driver = TestDriver.Driver()
    driver.new_classifier()
    # Train on the first and third sets.
    for index in (0, 2):
        driver.train(msgs.HamStream(ham_dirs[index], [ham_dirs[index]]),
                     msgs.SpamStream(spam_dirs[index], [spam_dirs[index]]))
    # Test on the second set.
    driver.test(msgs.HamStream(ham_dirs[1], [ham_dirs[1]]),
                msgs.SpamStream(spam_dirs[1], [spam_dirs[1]]))

    spamprob = driver.classifier.spamprob
    polluted = []

    def collect(stream):
        # Attach a probability to each message and remember the message.
        for msg in stream:
            msg.prob = spamprob(msg)
            polluted.append(msg)

    collect(msgs.HamStream(ham_dirs[2], [ham_dirs[2]]))
    collect(msgs.SpamStream(spam_dirs[2], [spam_dirs[2]]))

    mislabeled = []
    mislabeled.extend(driver.tester.false_positives())
    mislabeled.extend(driver.tester.false_negatives())
    mislabeled.extend(driver.unsure)

    driver.finishtest()
    driver.alldone()

    data = v_correlation(polluted, mislabeled)

    print("Percentage Overlap (Correlation): " + str(data))
示例#14
0
def drive(nsets):
    """Train on each spam/ham directory pair alone and test on the others.

    Directory names are built from the "TestDriver" ``spam_directories``
    and ``ham_directories`` path options, formatted with 1..nsets.  For
    every pair a new classifier is trained on that pair only, then tested
    against each pair that does not compare equal to it.
    """
    print(options.display())

    set_numbers = range(1, nsets + 1)
    spam_dirs = [get_pathname_option("TestDriver", "spam_directories") % number
                 for number in set_numbers]
    ham_dirs = [get_pathname_option("TestDriver", "ham_directories") % number
                for number in set_numbers]
    pairs = zip(spam_dirs, ham_dirs)

    driver = TestDriver.Driver()
    for train_pair in pairs:
        train_spam, train_ham = train_pair
        driver.new_classifier()
        driver.train(msgs.HamStream(train_ham, [train_ham]),
                     msgs.SpamStream(train_spam, [train_spam]))

        # Every other pair is test data for this classifier.
        for test_pair in pairs:
            if test_pair != train_pair:
                test_spam, test_ham = test_pair
                driver.test(msgs.HamStream(test_ham, [test_ham]),
                            msgs.SpamStream(test_spam, [test_spam]))

        driver.finishtest()

    driver.alldone()
示例#15
0
def drive():
    """Run interactive active-unlearning trials and log each cluster's overlap.

    Builds four spam and four ham directory names from the "TestDriver" path
    options, writes a dictionary set with ``DictionaryWriter(150, 4)`` and
    creates an ``ActiveUnlearner`` from streams over those directories.
    Each trial selects initial messages until ``determine_cluster`` yields a
    non-empty cluster, compares the cluster's members with
    ``au.driver.tester.train_examples[2]`` via ``v_correlation`` and appends
    the result to the stats file.  The user is then asked whether to go on;
    on "y" the cluster is learned and another trial starts, on "n" the
    function returns.
    """
    print(options.display())

    spam = [
        get_pathname_option("TestDriver", "spam_directories") % i
        for i in range(1, 5)
    ]
    ham = [
        get_pathname_option("TestDriver", "ham_directories") % i
        for i in range(1, 5)
    ]

    d = dictionarywriter.DictionaryWriter(150, 4)
    d.write()

    trial_number = 1

    # NOTE(review): argument roles (training vs. testing streams) are defined
    # by ActiveUnlearner, which is not visible here -- confirm before changing.
    au = ActiveUnlearnDriver.ActiveUnlearner(
        [msgs.HamStream(ham[1], [ham[1]]),
         msgs.HamStream(ham[2], [ham[2]])],
        [
            msgs.SpamStream(spam[1], [spam[1]]),
            msgs.SpamStream(spam[3], [spam[3]])
        ],
        msgs.HamStream(ham[0], [ham[0]]),
        msgs.SpamStream(spam[0], [spam[0]]),
    )

    # Raw string: the backslashes of the Windows path must stay literal and
    # must not depend on which escape sequences the interpreter recognises.
    with open(r"C:\Users\Alex\Desktop\dict_correlation_stats.txt",
              'w') as outfile:
        keep_going = True
        while keep_going:
            chosen = set()
            current = au.select_initial()
            cluster = au.determine_cluster(current)
            chosen.add(current)
            au.driver.test(au.testing_ham, au.testing_spam)

            # Retry with the already-chosen messages passed along until a
            # truthy cluster comes back.
            while not cluster:
                current = au.select_initial(chosen)
                cluster = au.determine_cluster(current)
                chosen.add(current)
                au.driver.test(au.testing_ham, au.testing_spam)

            cluster_list = list(cluster.cluster_set)

            dicts = au.driver.tester.train_examples[2]

            data = v_correlation(cluster_list, dicts)

            # One record per line; without the terminator successive trials
            # would run together in the file.
            outfile.write("Trial " + str(trial_number) +
                          " Percentage Overlap (Correlation): " + str(data) +
                          "\n")

            prompt = ("Keep going (y/n)? You have performed " +
                      str(trial_number) + " trial(s) so far. ")
            answer = raw_input(prompt)
            # Ask again on anything but y/n.  Reading the reply only once
            # made an invalid answer repeat the reminder forever.
            while answer not in ("y", "n"):
                print("Please enter either y or n.")
                answer = raw_input(prompt)

            if answer == "n":
                keep_going = False
            else:
                au.learn(cluster)
                au.init_ground()
                trial_number += 1