示例#1
0
def main(args):
    """Print the checksum of every message in the named mailboxes.

    ``args`` is an argv-style list.  The getopt specification is empty, so
    no options are accepted; the remaining positional arguments are mailbox
    names handed to ``getmbox``.  With no names, the single name "-" is
    used instead.
    """
    # With an empty option spec the parsed option list is always empty.
    _, names = getopt.getopt(args, "")
    # Every mailbox is opened before the first message is read.
    mailboxes = [getmbox(name) for name in (names or ["-"])]
    for mailbox in mailboxes:
        for message in mailbox:
            print(generate_checksum(message))
示例#2
0
def main(args):
    """Print a checksum for each message in the given mailboxes.

    Args:
        args: argv-style list.  The getopt spec is empty, so any option
            raises ``getopt.GetoptError``; the positional arguments are
            mailbox names passed to ``getmbox``.  When none are given the
            single name "-" is used.
    """
    # No options are defined, so the parsed option list is always empty.
    _opts, args = getopt.getopt(args, "")
    mboxes = [getmbox(name) for name in (args or ["-"])]

    for mbox in mboxes:
        for msg in mbox:
            # Parenthesised single-argument form: same output under the
            # Python 2 print statement and the Python 3 print function.
            print(generate_checksum(msg))
示例#3
0
def score(unsure, h, cls, scores, msgids=None, skipspam=False):
    """See what effect on others each msg in unsure has.

    Each candidate message in the ``unsure`` mailbox is temporarily trained
    as spam through ``h``; every message that did not already reach the
    spam cutoff is then rescored with ``cls`` and the candidate is
    untrained again.  One report line is printed per candidate.

    Args:
        unsure: mailbox name handed to ``getmbox`` (it is read repeatedly).
        h: object providing ``train(msg, True)`` and ``untrain(msg, True)``.
        cls: classifier providing ``spamprob(tokens)``.
        scores: dict updated in place; maps the mean probability rounded to
            three places to the list of candidate message-ids producing it.
        msgids: optional container; when given, only candidates whose
            message-id is in it are tried.
        skipspam: when true, candidates that already score at or above the
            spam cutoff are not tried.
    """
    spam_cutoff = options["Categorization", "spam_cutoff"]

    # Pass 1: find the messages that already score as spam.
    n = 0
    total = 0.0
    okalready = set()
    for msg in getmbox(unsure):
        prob = cls.spamprob(tokenize(msg))
        n += 1
        if prob >= spam_cutoff:
            okalready.add(msg['message-id'])
        else:
            total += prob
    # An empty mailbox used to raise ZeroDivisionError here.
    # NOTE(review): the sum covers only the non-spam messages but is divided
    # by the count of all messages -- confirm that this is intended.
    first_mean = total / n if n else 0.0
    print("%d out of %d messages already score as spam" % (len(okalready), n))
    print("initial mean spam prob: %.3f" % first_mean)
    print("%5s %3s %5s %5s %s" % ("prob", "new", "mean", "sdev", "msgid"))

    # Pass 2: try each candidate in turn.
    for msg in getmbox(unsure):
        msgid = msg['message-id']
        if msgids is not None and msgid not in msgids:
            continue
        msgprob = cls.spamprob(tokenize(msg))
        if skipspam and msgprob >= spam_cutoff:
            continue

        n = j = 0
        total = 0.0
        probs = []
        h.train(msg, True)
        try:
            for trial in getmbox(unsure):
                if trial['message-id'] in okalready:
                    continue
                n += 1
                if n % 10 == 0:
                    counter("", n)
                prob = cls.spamprob(tokenize(trial))
                probs.append(prob)
                total += prob
                if prob >= spam_cutoff:
                    j += 1
            counter("", n)
        finally:
            # Always undo the temporary training, even if scoring failed.
            h.untrain(msg, True)

        if not n:
            # Every message already scores as spam, so there is nothing to
            # measure (this used to raise ZeroDivisionError).
            continue
        mean = total / n
        scores.setdefault(round(mean, 3), []).append(msgid)
        sdev = math.sqrt(sum((mean - prob) ** 2 for prob in probs) / n)
        print("\r%.3f %3d %.3f %.3f %s" % (msgprob, j, mean, sdev, msgid))
示例#4
0
def cull(mbox_name, cullext, designation, tdict):
    """Keep only the messages of a mailbox whose message-id is in ``tdict``.

    Args:
        mbox_name: mailbox name handed to ``mboxutils.getmbox``.
        cullext: when truthy, kept messages are written to the file
            ``mbox_name + cullext``.  When falsy nothing is written;
            instead every message *not* in ``tdict`` gets an IMAP
            ``STORE +FLAGS.SILENT (\\Deleted \\Seen)`` issued through its
            ``imap_server``.
        designation: label used in the progress message.
        tdict: container of message-ids to keep.
    """
    print("writing new %s mbox..." % designation)
    n = m = 0
    # open() replaces the Python 2-only file() builtin.
    culled_mbox = open(mbox_name + cullext, "w") if cullext else None
    try:
        for msg in mboxutils.getmbox(mbox_name):
            m += 1
            if msg["message-id"] in tdict:
                if culled_mbox is not None:
                    culled_mbox.write(str(msg))
                n += 1
            elif culled_mbox is None:
                response = msg.imap_server.uid("STORE", msg.uid,
                                               "+FLAGS.SILENT",
                                               "(\\Deleted \\Seen)")
                command = "set %s to be deleted and seen" % (msg.uid, )
                msg.imap_server.check_response(command, response)

            # Progress: kept so far / examined so far.
            sys.stdout.write("\r%5d of %5d" % (n, m))
            sys.stdout.flush()

        sys.stdout.write("\n")
    finally:
        # Close the output file even when reading a message fails.
        if culled_mbox is not None:
            culled_mbox.close()
示例#5
0
def score(h, msgs, reverse=0):
    """Score (judge) all messages from a mailbox.

    Args:
        h: object providing ``score(msg, True)`` returning ``(prob, clues)``
            and ``formatclues(clues)``.
        msgs: mailbox name handed to ``mboxutils.getmbox``.
        reverse: when false, a line is printed for each spam; when true, a
            line is printed for each ham instead.  Unsure messages are
            always printed.

    Returns:
        The tuple ``(spams, hams, unsures)`` of message counts.
    """
    # XXX The reporting needs work!
    spams = hams = unsures = 0
    for i, msg in enumerate(mboxutils.getmbox(msgs), 1):
        prob, clues = h.score(msg, True)
        # Use the message's own number when it carries one, else its position.
        msgno = getattr(msg, '_mh_msgno', i)
        if prob >= SPAM_THRESHOLD:
            spams += 1
            flag = None if reverse else "S"
        elif prob <= HAM_THRESHOLD:
            hams += 1
            # NOTE(review): reported hams are also flagged "S", exactly as
            # the original expression did -- confirm that this is intended.
            flag = "S" if reverse else None
        else:
            unsures += 1
            flag = "U"
        if flag is not None:
            # One print call with a single argument produces the same line
            # under both the Python 2 statement and the Python 3 function.
            print("%6s %4.2f %1s %s"
                  % (msgno, prob, flag, h.formatclues(clues)))
    return (spams, hams, unsures)
示例#6
0
def main(args):
    """Extract the messages that contain the requested features.

    Options parsed from ``args``:
        -h/--help       show usage and return 0
        -d/--database   pickled map file (required)
        -S/--spamfile   output file for matching spam
        -H/--hamfile    output file for matching ham
        -f/--feature    feature to look for (may be repeated)

    At least one of -S / -H is required.  When no -f is given, features are
    taken from the ``X-Spambayes-Evidence`` header of every message in the
    positional mailbox arguments.

    Returns:
        0 after showing help, 1 on a usage error, otherwise None once
        ``extractmessages`` has run.
    """
    def decode_feature(raw):
        # Best effort: anything that cannot be decoded as an RFC 2047
        # header is used as-is rather than aborting the run.
        try:
            return str(make_header(decode_header(raw)))
        except Exception:
            if isinstance(raw, bytes):
                return raw.decode('us-ascii', 'replace')
            return raw

    try:
        opts, args = getopt.getopt(args, "hd:S:H:f:",
                                   ["help", "database=", "spamfile=",
                                    "hamfile=", "feature="])
    except getopt.GetoptError as msg:
        usage(msg)
        return 1

    charset = locale.getdefaultlocale()[1] or 'us-ascii'

    mapfile = spamfile = hamfile = None
    features = set()
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        elif opt in ("-d", "--database"):
            mapfile = arg
        elif opt in ("-H", "--hamfile"):
            hamfile = arg
        elif opt in ("-S", "--spamfile"):
            spamfile = arg
        elif opt in ("-f", "--feature"):
            # str(arg, charset) raised TypeError for text arguments; only
            # byte strings need decoding.
            if isinstance(arg, bytes):
                arg = arg.decode(charset)
            features.add(arg)

    if hamfile is None and spamfile is None:
        usage("At least one of -S or -H are required")
        return 1
    if mapfile is None:
        usage("'-d mapfile' is required")
        return 1
    try:
        mapd = pickle_read(mapfile)
    except IOError:
        usage("Mapfile %s does not exist" % mapfile)
        return 1
    if not features and not args:
        usage("Require at least one feature (-f) arg or one message file")
        return 1

    if not features:
        # Pull the significant tokens out of each message's evidence header.
        for f in args:
            for msg in getmbox(f):
                evidence = msg.get("X-Spambayes-Evidence", "")
                evidence = re.sub(r"\s+", " ", evidence)
                # The first two "; "-separated fields are skipped; each
                # remaining field is "<feature>: <value>".
                for clue in evidence.split("; ")[2:]:
                    features.add(decode_feature(clue.rsplit(": ", 1)[0]))
        if not features:
            usage("No X-Spambayes-Evidence headers found")
            return 1

    opened = []
    try:
        if spamfile is not None:
            spamfile = open(spamfile, "w")
            opened.append(spamfile)
        if hamfile is not None:
            hamfile = open(hamfile, "w")
            opened.append(hamfile)
        extractmessages(features, mapd, hamfile, spamfile)
    finally:
        # The output files used to be left open.
        for fp in opened:
            fp.close()
示例#7
0
def score(h, msgs, reverse=0):
    """Score (judge) all messages from a mailbox.

    Args:
        h: object providing ``score(msg, True)`` returning ``(prob, clues)``
            and ``formatclues(clues)``.
        msgs: mailbox name handed to ``mboxutils.getmbox``.
        reverse: when false, spam messages are reported; when true, ham
            messages are reported instead.  Unsure messages are always
            reported.

    Returns:
        The tuple ``(spams, hams, unsures)`` of message counts.
    """
    # XXX The reporting needs work!
    spams = hams = unsures = 0
    position = 0
    for msg in mboxutils.getmbox(msgs):
        position += 1
        prob, clues = h.score(msg, True)
        # Prefer the number stored on the message, else its position.
        msgno = getattr(msg, '_mh_msgno', position)

        marker = None
        if prob >= SPAM_THRESHOLD:
            spams += 1
            if not reverse:
                marker = "S"
        elif prob <= HAM_THRESHOLD:
            hams += 1
            if reverse:
                # NOTE(review): the original also printed "S" for reported
                # hams; kept as-is -- confirm that this is intended.
                marker = "S"
        else:
            unsures += 1
            marker = "U"

        if marker is not None:
            # A single one-argument print is valid, and prints the same
            # line, on both Python 2 and Python 3.
            report = "%6s %4.2f %1s %s" % (msgno, prob, marker,
                                           h.formatclues(clues))
            print(report)
    return (spams, hams, unsures)
示例#8
0
def main(profiling=False):
    """Run the filter/train actions selected on the command line.

    Options are read from ``sys.argv``.  The single-letter options
    -f/-g/-s/-t/-G/-S each queue one ``HammieFilter`` method; with none
    given, ``filter`` is used.  -n creates a new database and exits; a new
    database is also created when the configured one does not exist.  -P
    re-runs ``main(True)`` under cProfile when that module is importable
    and ``profiling`` is false.  Every message of every named mailbox
    ("-" when none is named) is passed to each action and then written to
    stdout.
    """
    method_for_option = {
        '-f': 'filter',
        '-g': 'train_ham',
        '-s': 'train_spam',
        '-t': 'filter_train',
        '-G': 'untrain_ham',
        '-S': 'untrain_spam',
    }
    h = HammieFilter()
    opts, args = getopt.getopt(sys.argv[1:], 'hvxd:p:nfgstGSo:P',
                               ['help', 'version', 'examples', 'option='])
    actions = []
    create_newdb = False
    for opt, arg in opts:
        if opt in method_for_option:
            actions.append(getattr(h, method_for_option[opt]))
        elif opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-v', '--version'):
            version()
        elif opt in ('-x', '--examples'):
            examples()
        elif opt in ('-o', '--option'):
            Options.options.set_from_cmdline(arg, sys.stderr)
        elif opt == '-P':
            if not profiling:
                try:
                    import cProfile
                except ImportError:
                    pass
                else:
                    return cProfile.run("main(True)")
        elif opt == "-n":
            create_newdb = True

    h.dbname, h.usedb = storage.database_type(opts)
    if create_newdb or not os.path.exists(h.dbname):
        h.newdb()
        print("Created new database in", h.dbname, file=sys.stderr)
        if create_newdb:
            sys.exit(0)

    if not actions:
        actions = [h.filter]
    if not args:
        args = ["-"]
    reading_stdin = args == ["-"]

    for fname in args:
        for msg in mboxutils.getmbox(fname):
            for action in actions:
                action(msg)
            # For the "-" input the envelope line is emitted only when the
            # message has one; named mailboxes always get one.
            unixfrom = (not reading_stdin) or msg.get_unixfrom() is not None
            sys.stdout.write(mboxutils.as_string(msg, unixfrom=unixfrom))
            sys.stdout.write(result)
示例#9
0
 def untrain(h, msgs, is_spam):
     """Untrain bayes with all messages from a mailbox.

     ``msgs`` is handed to ``mboxutils.getmbox``; each message is passed to
     ``h.untrain`` together with ``is_spam``.  A running count is rewritten
     on stdout every tenth message and once more at the end.
     """
     done = 0
     for done, message in enumerate(mboxutils.getmbox(msgs), 1):
         if done % 10 == 0:
             sys.stdout.write("\r%6d" % done)
             sys.stdout.flush()
         h.untrain(message, is_spam)
     # Final count, then finish the progress line.
     sys.stdout.write("\r%6d" % done)
     sys.stdout.flush()
     print()
示例#10
0
def mapmessages(f, mboxtype, mapdb):
    """Record, for every token of every message in ``f``, where it occurs.

    ``mapdb`` maps a token to a ``(ham, spam)`` pair of dicts, each mapping
    a mailbox name to the set of message-ids containing the token.  The
    ham dict is updated when ``mboxtype`` is "ham", the spam dict
    otherwise.  Messages without a message-id are skipped.  When the
    "x-use_bigrams" classifier option is set, the tokens produced by
    ``Classifier()._enhance_wordstream`` are recorded as well.
    """
    is_ham = mboxtype == "ham"

    def record(token, msgid):
        # Fetch, update and store back the entry for one token.
        ham, spam = mapdb.get(token, ({}, {}))
        target = ham if is_ham else spam
        ids = target.get(f, set())
        ids.add(msgid)
        target[f] = ids
        mapdb[token] = (ham, spam)

    for count, msg in enumerate(getmbox(f), 1):
        sys.stdout.write('\r%s: %d' % (f, count))
        sys.stdout.flush()
        msgid = msg.get("message-id")
        if msgid is None:
            continue
        for token in tokenize(msg):
            record(token, msgid)
        if options["Classifier", "x-use_bigrams"]:
            for token in Classifier()._enhance_wordstream(tokenize(msg)):
                record(token, msgid)
    sys.stdout.write("\n")
示例#11
0
def cull(mbox_name, cullext, designation, tdict):
    """Reduce a mailbox to the messages whose message-id is in ``tdict``.

    Args:
        mbox_name: mailbox name handed to ``mboxutils.getmbox``.
        cullext: when truthy, the kept messages are written to the file
            ``mbox_name + cullext``.  When falsy no file is written and
            each message that is *not* in ``tdict`` is instead flagged via
            an IMAP ``STORE +FLAGS.SILENT (\\Deleted \\Seen)`` on its
            ``imap_server``.
        designation: label shown in the progress message.
        tdict: container of the message-ids to keep.
    """
    print("writing new %s mbox..." % designation)
    kept = seen = 0
    culled_mbox = None
    if cullext:
        # open() instead of the Python 2-only file() builtin.
        culled_mbox = open(mbox_name + cullext, "w")

    try:
        for msg in mboxutils.getmbox(mbox_name):
            seen += 1
            if msg["message-id"] in tdict:
                if culled_mbox is not None:
                    culled_mbox.write(str(msg))
                kept += 1
            elif culled_mbox is None:
                response = msg.imap_server.uid(
                    "STORE", msg.uid, "+FLAGS.SILENT", "(\\Deleted \\Seen)")
                command = "set %s to be deleted and seen" % (msg.uid,)
                msg.imap_server.check_response(command, response)

            sys.stdout.write("\r%5d of %5d" % (kept, seen))
            sys.stdout.flush()

        sys.stdout.write("\n")
    finally:
        # Do not leak the output file when a message cannot be processed.
        if culled_mbox is not None:
            culled_mbox.close()
示例#12
0
def mapmessages(f, mboxtype, mapdb):
    """Index the tokens of every message in mailbox ``f`` into ``mapdb``.

    Each ``mapdb`` value is a ``(ham, spam)`` pair of dicts keyed by
    mailbox name, holding sets of message-ids.  ``mboxtype == "ham"``
    selects the ham dict, anything else the spam dict.  Messages lacking a
    message-id are counted in the progress display but not indexed.  With
    the "x-use_bigrams" classifier option on, the output of
    ``Classifier()._enhance_wordstream`` is indexed too.
    """
    side = 0 if mboxtype == "ham" else 1

    def remember(token, msgid):
        # pair[0] is the ham dict, pair[1] the spam dict.
        pair = mapdb.get(token, ({}, {}))
        per_mbox = pair[side]
        seen_ids = per_mbox.get(f, set())
        seen_ids.add(msgid)
        per_mbox[f] = seen_ids
        mapdb[token] = (pair[0], pair[1])

    position = 0
    for msg in getmbox(f):
        position += 1
        sys.stdout.write('\r%s: %d' % (f, position))
        sys.stdout.flush()
        msgid = msg.get("message-id")
        if msgid is not None:
            for token in tokenize(msg):
                remember(token, msgid)
            if options["Classifier", "x-use_bigrams"]:
                bigrams = Classifier()._enhance_wordstream(tokenize(msg))
                for token in bigrams:
                    remember(token, msgid)
    sys.stdout.write("\n")
示例#13
0
def learn(mbox, h, is_spam):
    """Train ``h`` on every message of a mailbox.

    Args:
        mbox: mailbox name handed to ``getmbox``.
        h: object providing ``train(msg, is_spam)``.
        is_spam: passed through to ``h.train``; also selects the "Spam" or
            "Ham" tag given to ``counter`` before each message.
    """
    tag = "Spam" if is_spam else "Ham"
    for i, msg in enumerate(getmbox(mbox)):
        counter(tag, i)
        h.train(msg, is_spam)
    # A bare ``print`` is a silent no-op on Python 3; printing an empty
    # string emits the same newline on Python 2 and 3.
    print("")
示例#14
0
 def train(bayes, msgs, is_spam):
     """Train bayes with all messages from a mailbox.

     ``msgs`` is handed to ``mboxutils.getmbox``; the tokens of each
     message are passed to ``bayes.learn`` together with ``is_spam``.
     """
     for message in mboxutils.getmbox(msgs):
         tokens = tokenize(message)
         bayes.learn(tokens, is_spam)
示例#15
0
def extractmessages(features, mapdb, hamfile, spamfile):
    """extract messages which contain given features

    Args:
        features: iterable of features to look up in ``mapdb``.
        mapdb: maps a feature to a ``(ham, spam)`` pair; each element maps
            a mailbox name to the set of message-ids containing it.
        hamfile: writable file for matching ham messages, or None to skip
            ham entirely.
        spamfile: writable file for matching spam messages, or None to skip
            spam entirely.
    """
    def collect(per_mbox, wanted):
        # Merge one feature's message-ids into the per-mailbox totals.
        for mbox in per_mbox:
            msgids = wanted.get(mbox, set())
            msgids.update(per_mbox.get(mbox, set()))
            wanted[mbox] = msgids

    def dump(wanted, outfile):
        # Re-read each mailbox and write out the wanted messages.
        for mailfile in wanted:
            i = 0
            msgids = wanted[mailfile]
            for msg in getmbox(mailfile):
                if msg.get("message-id") in msgids:
                    i += 1
                    sys.stdout.write('\r%s: %5d' % (mailfile, i))
                    sys.stdout.flush()
                    # Same bytes as ``print >> outfile, msg`` but valid on
                    # Python 2 and Python 3.
                    outfile.write("%s\n" % (msg,))
        # Replaces a bare ``print``, which does nothing on Python 3.
        sys.stdout.write("\n")

    hamids = {}
    spamids = {}

    for feature in features:
        ham, spam = mapdb.get(feature, ([], []))
        if hamfile is not None:
            collect(ham, hamids)
        if spamfile is not None:
            collect(spam, spamids)

    # now run through each mailbox in hamids and spamids and write
    # matching messages to relevant ham or spam files
    dump(hamids, hamfile)
    dump(spamids, spamfile)
示例#16
0
def untrain(h, msgs, is_spam):
    """Untrain bayes with all messages from a mailbox.

    Args:
        h: object providing ``untrain(msg, is_spam)``.
        msgs: mailbox name handed to ``mboxutils.getmbox``.
        is_spam: passed through to ``h.untrain``.

    A running count is rewritten on stdout every tenth message and once
    more after the last one.
    """
    i = 0
    for i, msg in enumerate(mboxutils.getmbox(msgs), 1):
        if i % 10 == 0:
            sys.stdout.write("\r%6d" % i)
            sys.stdout.flush()
        h.untrain(msg, is_spam)
    sys.stdout.write("\r%6d" % i)
    sys.stdout.flush()
    # Replaces a bare ``print``, which is a silent no-op on Python 3.
    sys.stdout.write("\n")
示例#17
0
文件: sb_filter.py 项目: Xodarap/Eipi
def main():
    """Run the filter/train actions selected on the command line.

    Options are read from ``sys.argv``.  Each of -f/-g/-s/-t/-G/-S queues
    one ``HammieFilter`` method; ``filter`` is used when none is given.
    -n creates a new database and exits.  Every message of every named
    mailbox ("-" when none is named) is passed to each action and then
    written to stdout.
    """
    method_for_option = {
        '-f': 'filter',
        '-g': 'train_ham',
        '-s': 'train_spam',
        '-t': 'filter_train',
        '-G': 'untrain_ham',
        '-S': 'untrain_spam',
    }
    h = HammieFilter()
    opts, args = getopt.getopt(sys.argv[1:], 'hxd:p:nfgstGSo:',
                               ['help', 'examples', 'option='])
    actions = []
    create_newdb = False
    for opt, arg in opts:
        if opt in method_for_option:
            actions.append(getattr(h, method_for_option[opt]))
        elif opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-x', '--examples'):
            examples()
        elif opt in ('-o', '--option'):
            Options.options.set_from_cmdline(arg, sys.stderr)
        elif opt == "-n":
            create_newdb = True
    h.dbname, h.usedb = storage.database_type(opts)

    if create_newdb:
        h.newdb()
        sys.exit(0)

    if not actions:
        actions = [h.filter]
    if not args:
        args = ["-"]
    reading_stdin = args == ["-"]

    for fname in args:
        for msg in mboxutils.getmbox(fname):
            for action in actions:
                action(msg)
            # For the "-" input the envelope line is emitted only when the
            # message has one; named mailboxes always get one.
            unixfrom = (not reading_stdin) or msg.get_unixfrom() is not None
            sys.stdout.write(mboxutils.as_string(msg, unixfrom=unixfrom))
示例#18
0
def extractmessages(features, mapdb, hamfile, spamfile):
    """extract messages which contain given features

    ``mapdb`` maps a feature to a ``(ham, spam)`` pair whose elements map a
    mailbox name to a set of message-ids.  Matching ham messages are
    printed to ``hamfile`` and matching spam messages to ``spamfile``; a
    side whose file is None is skipped.
    """
    def merge(per_mbox, wanted):
        # Fold one feature's message-ids into the per-mailbox totals.
        for mbox in per_mbox:
            ids = wanted.get(mbox, set())
            ids.update(per_mbox.get(mbox, set()))
            wanted[mbox] = ids

    def emit(wanted, outfile):
        # Re-read every mailbox and print the wanted messages.
        for mailfile, ids in ((name, wanted[name]) for name in wanted):
            found = 0
            for msg in getmbox(mailfile):
                if msg.get("message-id") not in ids:
                    continue
                found += 1
                sys.stdout.write('\r%s: %5d' % (mailfile, found))
                sys.stdout.flush()
                print(msg, file=outfile)
        print()

    hamids, spamids = {}, {}
    want_ham = hamfile is not None
    want_spam = spamfile is not None
    for feature in features:
        ham, spam = mapdb.get(feature, ([], []))
        if want_ham:
            merge(ham, hamids)
        if want_spam:
            merge(spam, spamids)

    emit(hamids, hamfile)
    emit(spamids, spamfile)
示例#19
0
def read_emails_from_disk(data_folder,
                          categories=None,
                          email_charset="latin1"):
    """Read emails from files in folders.

    Default email_charset="latin1", for CSDMC2010_SPAM email_charset="iso-8859-1"

    Args:
        data_folder: directory whose sub-directories are read (in sorted
            order) through ``mboxutils.getmbox``.
        categories: optional collection of sub-directory names to keep;
            when None every sub-directory is read and the list of their
            names is reported as the categories.
        email_charset: charset used to decode message text that is
            returned as bytes.

    Returns:
        A ``Bunch`` with ``data`` (message texts), ``targets`` (0 for
        folders whose name starts with "h"/"H", 1 otherwise),
        ``categories``, ``filenames`` and ``DESCR``.
    """
    try:
        from cStringIO import StringIO
    except ImportError:  # Python 3 has no cStringIO
        from io import StringIO
    from email.generator import Generator

    # NOTE(review): nothing ever fills this list; it is returned empty.
    filenames = []
    targets = []
    data = []

    folders = [
        f for f in sorted(listdir(data_folder)) if isdir(join(data_folder, f))
    ]
    if categories is not None:
        folders = [f for f in folders if f in categories]
    else:
        categories = list(folders)

    for folder in folders:
        folder_path = join(data_folder, folder)
        # Folders starting with "h" hold ham (0); everything else is spam (1).
        target = 0 if folder[0].lower() == 'h' else 1
        print("Reading emails from folder %s" % folder_path)
        for msg in mboxutils.getmbox(folder_path):
            # The original flattened every message before as_string() and
            # never read the result.  The call is kept (it presumably
            # normalises the message -- TODO confirm), but into a throwaway
            # buffer so the unused output no longer accumulates.
            Generator(StringIO(), mangle_from_=False,
                      maxheaderlen=60).flatten(msg)
            data.append(msg.as_string())
            targets.append(target)

    # Only byte strings need decoding; text has no .decode() on Python 3.
    data = [d.decode(email_charset, 'strict') if isinstance(d, bytes) else d
            for d in data]

    return Bunch(data=data,
                 filenames=filenames,
                 categories=categories,
                 targets=targets,
                 DESCR='Data from E-mails')
示例#20
0
def score(h, msgs, reverse=0):
    """Score (judge) all messages from a mailbox.

    A message whose probability from ``h.score`` is at or below
    ``HAM_THRESHOLD`` counts as ham.  Every other message is flattened to
    text and handed to the module-level ``doc_clf``; a truthy prediction
    counts it as spam, otherwise as ham.

    Args:
        h: object providing ``score(msg, True)`` returning ``(prob, clues)``.
        msgs: mailbox name handed to ``mboxutils.getmbox``.
        reverse: accepted for interface compatibility; not used here.

    Returns:
        ``(spams, hams, unsures)``; ``unsures`` is always 0 because every
        message is decided one way or the other.
    """
    try:
        from cStringIO import StringIO
    except ImportError:  # Python 3 has no cStringIO
        from io import StringIO
    from email.generator import Generator

    # The flattened output is never read; flatten() is kept because the
    # original ran it before as_string() (purpose unclear -- TODO confirm).
    g = Generator(StringIO(), mangle_from_=False, maxheaderlen=60)

    # XXX The reporting needs work!
    spams = hams = unsures = 0
    i = 0

    sys.stdout.write("Scoring now: ")
    for i, msg in enumerate(mboxutils.getmbox(msgs), 1):
        prob, _clues = h.score(msg, True)
        if prob <= HAM_THRESHOLD:
            hams += 1
            continue

        g.flatten(msg)
        if doc_clf.predict(msg.as_string()):
            spams += 1
        else:
            hams += 1

    # NOTE(review): the "$" is printed literally ("\r$<count>"); kept as-is.
    sys.stdout.write("\r${0}".format(i))
    sys.stdout.flush()
    return (spams, hams, unsures)
示例#21
0
            options.set_from_cmdline(arg, sys.stderr)

    dbname, usedb = storage.database_type(opts)

    bayes = storage.open_storage(dbname, usedb)

    bayes.load()

    if not args:

        args = ["-"]

    for fname in args:

        mbox = mboxutils.getmbox(fname)

        for msg in mbox:

            print ShowClues(bayes, msg, markup)

 if __name__ == "__main__":

    opts, args = getopt.getopt(sys.argv[1:], 'hmd:p:o:',
                               ['help', 'option=', 'markup'])

    markup = False

    for opt, arg in opts:

        if opt in ('-m', '--markup'):
示例#22
0
def train(bayes, msgs, is_spam):
    """Train bayes with all messages from a mailbox.

    ``msgs`` is handed to ``mboxutils.getmbox``; the tokens of each message
    are passed to ``bayes.learn`` together with ``is_spam``.
    """
    for message in mboxutils.getmbox(msgs):
        tokens = tokenize(message)
        bayes.learn(tokens, is_spam)
示例#23
0
def main():
    """Split mailboxes into ``n`` directories of one-message files.

    Options read from ``sys.argv``:
        -h/--help  show usage
        -g         glob-expand each input path
        -s SEED    seed the random number generator
        -n N       number of output directories (required, > 1)
        -v         verbose: progress dots and a final summary
        -d         skip messages whose MD5 checksum was already seen

    The last positional argument is the output base path; directory names
    are that path with 1..N appended.  All earlier positional arguments are
    input mailboxes.  Each message goes to a randomly chosen directory in a
    file named after a running counter.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'dhgn:s:v', ['help'])
    except getopt.error as msg:
        usage(1, msg)

    doglob = False
    n = None
    verbose = False
    delete_dups = False
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt == '-g':
            doglob = True
        elif opt == '-s':
            random.seed(int(arg))
        elif opt == '-n':
            n = int(arg)
        elif opt == '-v':
            verbose = True
        elif opt == '-d':
            delete_dups = True

    if n is None or n <= 1:
        usage(1, "an -n value > 1 is required")
    if len(args) < 2:
        usage(1, "input mbox name and output base path are required")

    inputpaths, outputbasepath = args[:-1], args[-1]
    outdirs = [outputbasepath + ("%d" % i) for i in range(1, n+1)]
    for outdir in outdirs:
        if not os.path.isdir(outdir):
            os.makedirs(outdir)

    counter = 0
    cksums = set()
    skipped = 0
    for inputpath in inputpaths:
        inpaths = glob.glob(inputpath) if doglob else [inputpath]
        for inpath in inpaths:
            for msg in mboxutils.getmbox(inpath):
                payload = str(msg)
                # md5() and a file opened with 'wb' both need bytes; on
                # Python 3 str(msg) is text and used to raise TypeError.
                if not isinstance(payload, bytes):
                    payload = payload.encode("utf-8", "surrogateescape")
                cksum = md5(payload).hexdigest()
                if delete_dups and cksum in cksums:
                    skipped += 1
                    continue
                cksums.add(cksum)
                i = random.randrange(n)
                counter += 1
                outpath = os.path.join(outdirs[i], "%d" % counter)
                with open(outpath, 'wb') as msgfile:
                    msgfile.write(payload)
                if verbose and counter % 100 == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()

    if verbose:
        print()
        print(counter, "messages split into", n, "directories")
        if skipped:
            print("skipped", skipped, "duplicate messages")
示例#24
0
文件: tte.py 项目: Xodarap/Eipi
def train(store, ham, spam, maxmsgs, maxrounds, tdict, reverse, verbose):
    """Train on misclassified messages until a round has no misses.

    In each round, ham and spam messages are read in pairs.  A ham scoring
    above the ham cutoff, or a spam scoring below the spam cutoff, counts
    as a miss: it is learned by ``store`` and its message-id is recorded in
    ``tdict``.  Rounds repeat until one finishes without misses or
    ``maxrounds`` is reached.  Messages left unread in the final round are
    recorded in ``tdict`` as well and counted on stdout.

    Args:
        store: classifier providing ``spamprob(tokens)`` and
            ``learn(tokens, is_spam)``.
        ham, spam: mailbox names handed to ``mboxutils.getmbox``.
        maxmsgs: per-round message limit; a falsy value means no limit.
        maxrounds: maximum number of rounds.
        tdict: dict updated in place with ``message-id -> True``.
        reverse: when true, both mailboxes are read back to front.
        verbose: when true, round headers and misses go to stderr.
    """
    smisses = hmisses = round_no = 0
    ham_cutoff = Options.options["Categorization", "ham_cutoff"]
    spam_cutoff = Options.options["Categorization", "spam_cutoff"]
    # Bound up front so the leftover count at the end also works when
    # maxrounds is 0 (previously a NameError).
    hambone = spamcan = iter(())

    while round_no < maxrounds and (hmisses or smisses or round_no == 0):
        hambone = iter(mboxutils.getmbox(ham))
        spamcan = iter(mboxutils.getmbox(spam))
        if reverse:
            hambone = reversed(list(hambone))
            spamcan = reversed(list(spamcan))
        round_no += 1

        if verbose:
            sys.stderr.write("*** round %d ***\n" % round_no)

        hmisses = smisses = nmsgs = 0
        start = datetime.datetime.now()
        try:
            while not maxmsgs or nmsgs < maxmsgs:
                # next() works on Python 2 and 3 iterators alike.
                # NOTE(review): if the spam box runs out first, the ham
                # message just read is neither scored nor counted later.
                hammsg = next(hambone)
                spammsg = next(spamcan)

                nmsgs += 2
                sys.stdout.write("\r%5d" % nmsgs)
                sys.stdout.flush()

                score = store.spamprob(tokenize(hammsg))
                if score > ham_cutoff:
                    if verbose:
                        sys.stderr.write("miss ham:  %.6f %s\n"
                                         % (score, hammsg["message-id"]))
                    hmisses += 1
                    tdict[hammsg["message-id"]] = True
                    store.learn(tokenize(hammsg), False)

                score = store.spamprob(tokenize(spammsg))
                if score < spam_cutoff:
                    if verbose:
                        sys.stderr.write("miss spam: %.6f %s\n"
                                         % (score, spammsg["message-id"]))
                    smisses += 1
                    tdict[spammsg["message-id"]] = True
                    store.learn(tokenize(spammsg), True)

        except StopIteration:
            pass

        delta = datetime.datetime.now() - start
        # Float arithmetic: integer division used to drop the fraction on
        # Python 2, so "%.1fs" always showed ".0".
        seconds = (delta.days * 86400 + delta.seconds
                   + delta.microseconds / 1000000.0)

        print("\rround: %2d, msgs: %4d, ham misses: %3d, spam misses: %3d, %.1fs"
              % (round_no, nmsgs, hmisses, smisses, seconds))

    # We count all untrained messages so the user knows what was skipped.
    # We also tag them for saving so we don't lose messages which might have
    # value in a future run
    for leftover, label in ((hambone, "hams"), (spamcan, "spams")):
        nleft = 0
        for msg in leftover:
            tdict[msg["message-id"]] = True
            nleft += 1
        if nleft:
            print("%d untrained %s" % (nleft, label))
示例#25
0
from six.moves import xrange


def test_spambayes(loops, messages, ham_classifier):
    """Time ``loops`` scoring passes over ``messages``.

    Every message is scored once, untimed, before the measurement starts.

    Returns:
        The elapsed ``perf.perf_counter()`` time of the timed passes.
    """
    # Prime the pump. This still leaves some hot functions uncompiled; these
    # will be noticed as hot during the timed loops below.
    for warmup_msg in messages:
        ham_classifier.score(warmup_msg)

    passes = xrange(loops)
    started = perf.perf_counter()

    for _pass in passes:
        for message in messages:
            ham_classifier.score(message)

    elapsed = perf.perf_counter() - started
    return elapsed


# Script entry point: load the benchmark data and run the benchmark.
if __name__ == "__main__":
    runner = perf.text_runner.TextRunner(name='spambayes')
    runner.metadata['description'] = "Run the SpamBayes benchmark."

    # The mailbox and the pickled classifier data live in a "data"
    # directory next to this file.
    data_dir = os.path.join(os.path.dirname(__file__), "data")
    mailbox = os.path.join(data_dir, "spambayes_mailbox")
    ham_data = os.path.join(data_dir, "spambayes_hammie.pkl")
    # Read the whole mailbox into a list before benchmarking.
    msgs = list(mboxutils.getmbox(mailbox))
    ham_classifier = hammie.open(ham_data, "pickle", "r")

    runner.bench_sample_func(test_spambayes, msgs, ham_classifier)
示例#26
0
def train(store, hambox, spambox, maxmsgs, maxrounds, tdict, reverse, verbose,
          ratio):
    """Train on misclassified messages until a round has no misses.

    Ham and spam messages are interleaved according to ``ratio``.  A
    misclassified message (spam scoring below the spam cutoff, or ham
    scoring above the ham cutoff) that has a message-id or subject is
    learned by ``store`` and recorded in ``tdict``.  Rounds repeat until
    one finishes without misses or ``maxrounds`` is reached.  Messages left
    unread in the final round are scored, and the misclassified ones are
    recorded in ``tdict`` and counted on stdout.

    Args:
        store: classifier providing ``spamprob(tokens)`` and
            ``learn(tokens, is_spam)``.
        hambox, spambox: mailbox names handed to ``mboxutils.getmbox``.
        maxmsgs: per-round message limit; a falsy value means no limit.
        maxrounds: maximum number of rounds.
        tdict: dict updated in place with ``message-id -> True``.
        reverse: when true, both mailboxes are processed back to front.
        verbose: when true, round headers and misses go to stderr.
        ratio: ``(spam, ham)`` pair steering the interleaving; a falsy
            value means 1:1.
    """
    round_no = 0
    ham_cutoff = Options.options["Categorization", "ham_cutoff"]
    spam_cutoff = Options.options["Categorization", "spam_cutoff"]

    # list-ify ham and spam iterators immediately.  We don't really want to
    # fetch the messages multiple times, and this is no worse than what
    # happened before when -R was passed.
    hambone_ = list(mboxutils.getmbox(hambox))
    spamcan_ = list(mboxutils.getmbox(spambox))

    if reverse:
        hambone_.reverse()
        spamcan_.reverse()

    nspam, nham = len(spamcan_), len(hambone_)
    if ratio:
        rspam, rham = ratio
        # If the actual ratio of spam to ham in the database is better than
        # what was asked for, use that better ratio.
        if (rspam > rham) == (rspam * nham > rham * nspam):
            rspam, rham = nspam, nham
    else:
        # rspam/rham used to stay unbound here and raise NameError below;
        # alternate ham and spam when no ratio was requested.
        rspam, rham = 1, 1

    # define some indexing constants
    ham = 0
    spam = 1
    name = ('ham', 'spam')
    misses = [0, 0]

    def misclassified(is_spam, score):
        # Spam must reach the spam cutoff; ham must not exceed the ham cutoff.
        if is_spam:
            return score < spam_cutoff
        return score > ham_cutoff

    # Bound up front so the leftover pass below also works when maxrounds
    # is 0 (previously a NameError); every message is then a leftover.
    hambone = iter(hambone_)
    spamcan = iter(spamcan_)

    while round_no < maxrounds and (misses[ham] or misses[spam]
                                    or round_no == 0):
        round_no += 1
        if verbose:
            sys.stderr.write("*** round %d ***\n" % round_no)

        start = datetime.datetime.now()
        hambone = iter(hambone_)
        spamcan = iter(spamcan_)

        i = [0, 0]
        msgs_processed = 0
        misses = [0, 0]
        training_sets = [hambone, spamcan]

        while not maxmsgs or msgs_processed < maxmsgs:

            # should the next message come from hambone or spamcan?
            # (the boolean doubles as the ham/spam index: False == 0 == ham)
            train_spam = i[ham] * rspam > i[spam] * rham

            try:
                train_msg = next(training_sets[train_spam])
            except StopIteration:
                break

            i[train_spam] += 1
            msgs_processed += 1
            sys.stdout.write("\r%5d" % msgs_processed)
            sys.stdout.flush()

            tokens = list(tokenize(train_msg))
            score = store.spamprob(tokens)
            selector = train_msg["message-id"] or train_msg["subject"]

            if misclassified(train_spam, score) and selector is not None:
                if verbose:
                    sys.stderr.write("\tmiss %s: %.6f %s\n" % (
                        name[train_spam], score, selector))

                misses[train_spam] += 1
                # NOTE(review): keyed by message-id even when only the
                # subject was available (key None) -- confirm intent.
                tdict[train_msg["message-id"]] = True
                store.learn(tokens, train_spam)

        delta = datetime.datetime.now() - start
        # Float arithmetic: integer division used to drop the fraction on
        # Python 2, so "%.1fs" always showed ".0".
        seconds = (delta.days * 86400 + delta.seconds
                   + delta.microseconds / 1000000.0)

        print("\rround: %2d, msgs: %4d, ham misses: %3d, spam misses: %3d, %.1fs"
              % (round_no, msgs_processed, misses[0], misses[1], seconds))

    # We count all untrained messages so the user knows what was skipped.
    # We also tag them for saving so we don't lose messages which might have
    # value in a future run
    for is_spam, remaining in ((ham, hambone), (spam, spamcan)):
        nleft = 0
        for msg in remaining:
            if misclassified(is_spam, store.spamprob(tokenize(msg))):
                tdict[msg["message-id"]] = True
                nleft += 1
        if nleft:
            print("%d untrained %ss" % (nleft, name[is_spam]))
示例#27
0
    try:
        mapd = pickle_read(mapfile)
    except IOError:
        usage("Mapfile %s does not exist" % mapfile)
        return 1

    if not features and not args:
        usage("Require at least one feature (-f) arg or one message file")
        return 1

    if not features:
        # extract significant tokens from each message and identify
        # where they came from
        for f in args:
            for msg in getmbox(f):
                evidence = msg.get("X-Spambayes-Evidence", "")
                evidence = re.sub(r"\s+", " ", evidence)
                l = [e.rsplit(": ", 1)[0]
                     for e in evidence.split("; ")[2:]]
                for s in l:
                    try:
                        s = make_header(decode_header(s)).__unicode__()
                    except:
                        s = unicode(s, 'us-ascii', 'replace')
                    features.add(s)
        if not features:
            usage("No X-Spambayes-Evidence headers found")
            return 1

    if spamfile is not None:
示例#28
0
    outdirs = [outputbasepath + ("%d" % i) for i in range(1, n + 1)]
    for dir in outdirs:
        if not os.path.isdir(dir):
            os.makedirs(dir)

    counter = 0
    cksums = set()
    skipped = 0
    for inputpath in inputpaths:
        if doglob:
            inpaths = glob.glob(inputpath)
        else:
            inpaths = [inputpath]

        for inpath in inpaths:
            mbox = mboxutils.getmbox(inpath)
            for msg in mbox:
                astext = str(msg)
                cksum = md5(astext).hexdigest()
                if delete_dups and cksum in cksums:
                    skipped += 1
                    continue
                cksums.add(cksum)
                i = random.randrange(n)
                #assert astext.endswith('\n')
                counter += 1
                msgfile = open('%s/%d' % (outdirs[i], counter), 'wb')
                msgfile.write(astext)
                msgfile.close()
                if verbose:
                    if counter % 100 == 0:
示例#29
0
def test_spambayes(iterations, timer, messages, ham_classifier):
    # Prime the pump. This still leaves some hot functions uncompiled; these
    # will be noticed as hot during the timed loops below.
    for msg in messages:
        ham_classifier.score(msg)

    times = []
    for _ in xrange(iterations):
        t0 = timer()
        for msg in messages:
            ham_classifier.score(msg)
        t1 = timer()
        times.append(t1 - t0)
    return times


if __name__ == "__main__":
    # Command line: only the standard benchmark options are supported.
    parser = optparse.OptionParser(usage="%prog [options]",
                                   description="Run the SpamBayes benchmark.")
    util.add_standard_options_to(parser)
    options, args = parser.parse_args()

    # Both fixtures live in the "data" directory next to this script.
    data_dir = os.path.join(os.path.dirname(__file__), "data")
    mailbox, ham_data = (os.path.join(data_dir, leaf)
                         for leaf in ("spambayes_mailbox",
                                      "spambayes_hammie.pkl"))

    # Load the messages first, then the pickled classifier, and hand both to
    # the benchmark driver.
    msgs = list(mboxutils.getmbox(mailbox))
    ham_classifier = hammie.open(ham_data, "pickle", "r")
    util.run_benchmark(options, options.num_runs, test_spambayes,
                       msgs, ham_classifier)
示例#30
0
文件: tte.py 项目: Xodarap/Eipi
        # Remove any existing database file first.  The "try:" this belongs
        # to is outside this excerpt; an OSError from unlink is ignored.
        os.unlink(dbname)
    except OSError:
        pass

    store = storage.open_storage(dbname, usedb)

    # train() fills tdict; its keys are compared against message-id headers
    # below to decide which messages to keep.
    tdict = {}
    train(store, ham, spam, maxmsgs, maxrounds, tdict, reverse, verbose)

    # NOTE(review): presumably persists the trained database - confirm
    # against the storage module.
    store.store()

    # When a cull extension was given, write new mailboxes named
    # <original name><cullext> holding only the messages whose message-id
    # is a key of tdict.
    if cullext is not None:
        print "writing new ham mbox..."
        # n = messages kept so far, m = messages examined so far.
        n = m = 0
        newham = file(ham + cullext, "w")
        for msg in mboxutils.getmbox(ham):
            m += 1
            if msg["message-id"] in tdict:
                newham.write(str(msg))
                n += 1
            # "\r" rewrites the same terminal line as a running progress count.
            sys.stdout.write("\r%5d of %5d" % (n, m))
            sys.stdout.flush()
        sys.stdout.write("\n")
        newham.close()

        # Same procedure for the spam mailbox; the loop body is cut off in
        # this excerpt.
        print "writing new spam mbox..."
        n = m = 0
        newspam = file(spam + cullext, "w")
        for msg in mboxutils.getmbox(spam):
            m += 1
            if msg["message-id"] in tdict:
示例#31
0
def train(store, hambox, spambox, maxmsgs, maxrounds, tdict, reverse, verbose,
          ratio):
    smisses = hmisses = round = 0
    ham_cutoff = Options.options["Categorization", "ham_cutoff"]
    spam_cutoff = Options.options["Categorization", "spam_cutoff"]
    hambone_ = list(mboxutils.getmbox(hambox))
    spamcan_ = list(mboxutils.getmbox(spambox))
    if reverse:
        hambone_ = list(reversed(hambone_))
        spamcan_ = list(reversed(spamcan_))
    if ratio:
        rspam,rham = ratio
    else:
        rspam,rham = len(spamcan_),len(hambone_)
    ham = 0
    spam = 1
    name = ('ham','spam')
    misses = [0,0]
    misclassified = lambda is_spam, score: (
        is_spam and score < spam_cutoff or not is_spam and score > ham_cutoff)
    while round < maxrounds and (misses[ham] or misses[spam] or round == 0):
        round += 1
        if verbose:
            print >> sys.stderr, "*** round", round, "***"
        start = datetime.datetime.now()
        hambone = iter(hambone_)
        spamcan = iter(spamcan_)
        i = [0,0]
        msgs_processed = 0
        misses = [0,0]
        training_sets = [hambone, spamcan]
        while not maxmsgs or msgs_processed < maxmsgs:
            train_spam = i[ham] * rspam > i[spam] * rham
            try:
                train_msg = training_sets[train_spam].next()
            except StopIteration:
                break;
            i[train_spam] += 1
            msgs_processed += 1
            sys.stdout.write("\r%5d" % msgs_processed)
            sys.stdout.flush()
            tokens = list(tokenize(train_msg))
            score = store.spamprob(tokens)
            selector = train_msg["message-id"] or train_msg["subject"]
            if misclassified(train_spam,score) and selector is not None:
                if verbose:
                    print >> sys.stderr, "\tmiss %s: %.6f %s" % (
                        name[train_spam], score, selector)
                misses[train_spam] += 1
                tdict[train_msg["message-id"]] = True
                store.learn(tokens, train_spam)
        delta = datetime.datetime.now()-start
        seconds = delta.seconds + delta.microseconds/1000000
        print "\rround: %2d, msgs: %4d, ham misses: %3d, spam misses: %3d, %.1fs" % \
              (round, msgs_processed, misses[0], misses[1], seconds)
    training_sets = [hambone,spamcan]
    for is_spam in ham,spam:
        nleft = 0
        try:
            while True:
                msg = training_sets[is_spam].next()
                score = store.spamprob(tokenize(msg))
                if misclassified(is_spam,score):
                    tdict[msg["message-id"]] = True
                    nleft += 1
        except StopIteration:
            if nleft: print nleft, "untrained %ss" % name[is_spam]
示例#32
0
    # Excerpt: distribute every message from the input mailboxes at random
    # over n output directories, one file per message.

    # One output directory per bucket: <outputbasepath>1 .. <outputbasepath>n,
    # created on demand.
    outdirs = [outputbasepath + ("%d" % i) for i in range(1, n+1)]
    for dir in outdirs:
        if not os.path.isdir(dir):
            os.makedirs(dir)

    counter = 0       # messages written so far; also used as the output file name
    cksums = set()    # MD5 hex digests of the message texts written so far
    skipped = 0       # duplicates dropped (only happens when delete_dups is true)
    for inputpath in inputpaths:
        # With doglob set, each input path is treated as a glob pattern.
        if doglob:
            inpaths = glob.glob(inputpath)
        else:
            inpaths = [inputpath]

        for inpath in inpaths:
            mbox = mboxutils.getmbox(inpath)
            for msg in mbox:
                astext = str(msg)
                # Duplicate detection is by checksum of the full message text.
                cksum = md5(astext).hexdigest()
                if delete_dups and cksum in cksums:
                    skipped += 1
                    continue
                cksums.add(cksum)
                # Pick the destination bucket uniformly from 0 .. n-1.
                i = random.randrange(n)
                #assert astext.endswith('\n')
                counter += 1
                # The file is named after the running counter, so names are
                # unique across all buckets.
                msgfile = open('%s/%d' % (outdirs[i], counter), 'wb')
                msgfile.write(astext)
                msgfile.close()
                # Progress check every 100 messages when verbose; the body of
                # the inner "if" is cut off in this excerpt.
                if verbose:
                    if counter % 100 == 0:
示例#33
0
def main(profiling=False):
    """Command-line entry point: apply the requested actions to each message.

    Options are read from ``sys.argv``.  The flags -f, -g, -s, -t, -G and -S
    each queue one HammieFilter operation, in command-line order; with none
    given, the default is to filter.  Every message of every mailbox
    argument ("-" when there are none) is passed through all queued
    operations and then written to stdout.

    Args:
        profiling: true when this call is already running under cProfile
            (the re-entrant call made for -P), so -P is not acted on again.

    Returns:
        The result of ``cProfile.run`` when -P starts a profiled run,
        otherwise None.
    """
    h = HammieFilter()
    opts, args = getopt.getopt(sys.argv[1:], 'hvxd:p:nfgstGSo:P',
                               ['help', 'version', 'examples', 'option='])

    # Flags that simply queue one filter operation.
    flag_actions = {
        '-f': h.filter,
        '-g': h.train_ham,
        '-s': h.train_spam,
        '-t': h.filter_train,
        '-G': h.untrain_ham,
        '-S': h.untrain_spam,
    }

    actions = []
    create_newdb = False
    for opt, arg in opts:
        if opt in flag_actions:
            actions.append(flag_actions[opt])
        elif opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-v', '--version'):
            version()
        elif opt in ('-x', '--examples'):
            examples()
        elif opt in ('-o', '--option'):
            Options.options.set_from_cmdline(arg, sys.stderr)
        elif opt == '-P':
            # Re-run main() under the profiler once; if cProfile is not
            # available, carry on unprofiled.
            if not profiling:
                try:
                    import cProfile
                except ImportError:
                    pass
                else:
                    return cProfile.run("main(True)")
        elif opt == "-n":
            create_newdb = True
    # -d / -p are interpreted by the storage module.
    h.dbname, h.usedb = storage.database_type(opts)

    if create_newdb or not os.path.exists(h.dbname):
        h.newdb()
        print >> sys.stderr, "Created new database in", h.dbname
        # An explicit -n only creates the database and stops.
        if create_newdb:
            sys.exit(0)

    if not actions:
        actions = [h.filter]

    if not args:
        args = ["-"]
    reading_stdin = args == ["-"]

    for fname in args:
        for msg in mboxutils.getmbox(fname):
            for action in actions:
                action(msg)
            # Decided once per message, after all actions have run: input
            # from "-" keeps a Unix-From line only if the message had one,
            # named mailboxes always get one.
            if reading_stdin:
                unixfrom = msg.get_unixfrom() is not None
            else:
                unixfrom = True
            sys.stdout.write(mboxutils.as_string(msg, unixfrom=unixfrom))
示例#34
0
Run a canned mailbox through a SpamBayes ham/spam classifier.
"""

import os.path

import perf

from spambayes import hammie, mboxutils


__author__ = "[email protected] (Skip Montanaro)"
__contact__ = "[email protected] (Collin Winter)"


def bench_spambayes(ham_classifier, messages):
    """Score each message in ``messages`` once with ``ham_classifier``.

    The scores are discarded; only the work of computing them matters to
    the benchmark.
    """
    for message in messages:
        ham_classifier.score(message)


if __name__ == "__main__":
    runner = perf.Runner()
    runner.metadata['description'] = "Run the SpamBayes benchmark."

    # Both fixtures live in the "data" directory next to this script.
    data_dir = os.path.join(os.path.dirname(__file__), "data")
    mailbox, ham_data = (os.path.join(data_dir, leaf)
                         for leaf in ("spambayes_mailbox",
                                      "spambayes_hammie.pkl"))

    # Load the messages first, then the pickled classifier, so the timed
    # function only does scoring.
    messages = list(mboxutils.getmbox(mailbox))
    ham_classifier = hammie.open(ham_data, "pickle", "r")

    runner.bench_func('spambayes', bench_spambayes, ham_classifier, messages)
示例#35
0
#!/bin/python
"""Wrapper script for testing the performance of SpamBayes.

Run a canned mailbox through a SpamBayes ham/spam classifier.
"""

import os.path
from spambayes import hammie, mboxutils

__author__ = "[email protected] (Skip Montanaro)"
__contact__ = "[email protected] (Collin Winter)"


def bench_spambayes(ham_classifier, messages):
    """Run ``ham_classifier.score`` over every message, ignoring the results."""
    for message in messages:
        ham_classifier.score(message)


# Fixtures are read from the directory containing this script.  Alternatives
# the author left commented out: a "data" subdirectory as data_dir, and
# "small_mailbox" as the mailbox file.
data_dir = os.path.dirname(__file__)
mailbox, ham_data = (os.path.join(data_dir, leaf)
                     for leaf in ("spambayes_mailbox", "spambayes_hammie.pkl"))

# Runs at import time: load the messages, then the pickled classifier, and
# score every message once.
messages = list(mboxutils.getmbox(mailbox))
ham_classifier = hammie.open(ham_data, "pickle", "r")
bench_spambayes(ham_classifier, messages)
示例#36
0
def train(store, hambox, spambox, maxmsgs, maxrounds, tdict, reverse, verbose,
          ratio):
    round = 0
    ham_cutoff = Options.options["Categorization", "ham_cutoff"]
    spam_cutoff = Options.options["Categorization", "spam_cutoff"]

    # list-ify ham and spam iterators immediately.  We don't really want to
    # fetch the messages multiple times, and this is no worse than what happened
    # before when -R was passed.
    hambone_ = list(mboxutils.getmbox(hambox))
    spamcan_ = list(mboxutils.getmbox(spambox))

    if reverse:
        hambone_ = list(reversed(hambone_))
        spamcan_ = list(reversed(spamcan_))

    nspam, nham = len(spamcan_), len(hambone_)
    if ratio:
        rspam, rham = ratio
        # If the actual ratio of spam to ham in the database is better than
        # what was asked for, use that better ratio.
        if (rspam > rham) == (rspam * nham > rham * nspam):
            rspam, rham = nspam, nham

    # define some indexing constants
    ham = 0
    spam = 1
    name = ('ham', 'spam')
    misses = [0, 0]

    misclassified = lambda is_spam, score: (is_spam and score < spam_cutoff or
                                            not is_spam and score > ham_cutoff)

    while round < maxrounds and (misses[ham] or misses[spam] or round == 0):
        round += 1
        if verbose:
            print >> sys.stderr, "*** round", round, "***"

        start = datetime.datetime.now()
        hambone = iter(hambone_)
        spamcan = iter(spamcan_)

        i = [0, 0]
        msgs_processed = 0
        misses = [0, 0]
        training_sets = [hambone, spamcan]

        while not maxmsgs or msgs_processed < maxmsgs:

            # should the next message come from hambone or spamcan?
            train_spam = i[ham] * rspam > i[spam] * rham

            try:
                train_msg = training_sets[train_spam].next()
            except StopIteration:
                break

            i[train_spam] += 1
            msgs_processed += 1
            sys.stdout.write("\r%5d" % msgs_processed)
            sys.stdout.flush()

            tokens = list(tokenize(train_msg))
            score = store.spamprob(tokens)
            selector = train_msg["message-id"] or train_msg["subject"]

            if misclassified(train_spam, score) and selector is not None:
                if verbose:
                    print >> sys.stderr, "\tmiss %s: %.6f %s" % (
                        name[train_spam], score, selector)

                misses[train_spam] += 1
                tdict[train_msg["message-id"]] = True
                store.learn(tokens, train_spam)

        delta = datetime.datetime.now() - start
        seconds = delta.seconds + delta.microseconds / 1000000

        print "\rround: %2d, msgs: %4d, ham misses: %3d, spam misses: %3d, %.1fs" % \
              (round, msgs_processed, misses[0], misses[1], seconds)

    training_sets = [hambone, spamcan]

    # We count all untrained messages so the user knows what was skipped.
    # We also tag them for saving so we don't lose messages which might have
    # value in a future run
    for is_spam in ham, spam:
        nleft = 0
        try:
            while True:
                msg = training_sets[is_spam].next()
                score = store.spamprob(tokenize(msg))

                if misclassified(is_spam, score):
                    tdict[msg["message-id"]] = True
                    nleft += 1

        except StopIteration:
            if nleft:
                print nleft, "untrained %ss" % name[is_spam]