예제 #1
0
 def setUp(self):
     """Build the corpus fixture used by the tests in this case.

     Runs the base-class setUp, records the message factory, cache size
     and corpus directory on the instance, calls stuff_corpus(), and then
     opens a FileCorpus over that directory as self.corpus.
     """
     _FileCorpusBaseTest.setUp(self)
     self.factory = FileMessageFactory()
     self.cache_size = 100
     self.directory = 'fctesthamcorpus'
     # stuff_corpus() is called before the corpus object is created, as in
     # the original ordering.
     self.stuff_corpus()
     # NOTE(review): '?' is presumably a filename filter pattern -- confirm
     # against FileCorpus.
     corpus_args = (self.factory, self.directory, '?', self.cache_size)
     self.corpus = FileCorpus(*corpus_args)
예제 #2
0
def main(argv):
    """Print an ASCII graph of classifier success over time.

    Loads every message from the spam and ham cache corpuses, tags each one
    with the disposition implied by the corpus it came from ('Yes' for the
    spam corpus, 'No' for the ham corpus), and compares that tag with the
    value of the message's classification header.  Two cumulative curves
    are then printed: '.' for the number of messages seen and '*' for the
    number whose header matched the corpus they were found in.

    argv -- option list handed straight to getopt.getopt(); '-h' or
            '--help' calls usage() and returns without graphing.

    Returns None.  Errors raised by getopt.getopt() are not caught here.
    """
    opts, _args = getopt.getopt(argv, "h", ["help"])
    for opt, _ in opts:
        if opt in ("-h", "--help"):
            usage()
            return

    # Create the corpuses and the factory that reads the messages.
    if options["pop3proxy", "cache_use_gzip"]:
        messageFactory = GzipFileMessageFactory()
    else:
        messageFactory = FileMessageFactory()
    sc = get_pathname_option("Storage", "spam_cache")
    hc = get_pathname_option("Storage", "ham_cache")
    spamCorpus = FileCorpus(messageFactory, sc)
    hamCorpus = FileCorpus(messageFactory, hc)

    # Read in all the trained messages.
    allTrained = {}
    for corpus, disposition in [(spamCorpus, 'Yes'), (hamCorpus, 'No')]:
        for m in corpus:
            message = mboxutils.get_message(m.getSubstance())
            message._pop3CacheDisposition = disposition
            allTrained[m.key()] = message

    # Sort the messages into the order they arrived, then work out a scaling
    # factor for the graph - 'limit' is the widest it can be in characters.
    # list() keeps this working where dict.keys() is not a sortable list.
    keys = list(allTrained.keys())
    keys.sort()
    limit = 70
    if len(keys) < limit:
        scale = 1
    else:
        scale = len(keys) // (limit // 2)

    # Build the data - an array of cumulative success indexed by count.
    # The header name does not change between messages, so look it up once.
    headerName = options["Headers", "classification_header_name"]
    count = successful = 0
    successByCount = []
    for key in keys:
        message = allTrained[key]
        if message._pop3CacheDisposition == message[headerName]:
            successful += 1
        count += 1
        if count % scale == (scale - 1):
            successByCount.append(successful // scale)

    # Build the graph, as a list of rows of characters.
    size = count // scale
    graph = [[" "] * (size + 3) for _ in range(size)]
    for c in range(size):
        graph[c][1] = "|"
        graph[c][c + 3] = "."
        # With scale == 1 and every message classified correctly, the
        # cumulative success for the last column equals 'size', which is one
        # past the last row.  Clamp it so the point lands on the top row
        # instead of raising IndexError.
        graph[min(successByCount[c], size - 1)][c + 3] = "*"
    graph.reverse()

    # Print the graph.  Each print takes a single parenthesised string, so
    # the output is the same whether print is a statement or a function.
    print("\n   Success of the classifier over time:\n")
    print("   . - Number of messages over time")
    print("   * - Number of correctly classified messages over time\n\n")
    for row in range(size):
        line = ''.join(graph[row])
        if row == 0:
            # Top row: label with the total number of messages.
            print(line + " %d" % count)
        elif row == (count - successful) // scale:
            # Label the row for the final success total.
            print(line + " %d" % successful)
        else:
            print(line)
    print(" " + "_" * (size + 2))
예제 #3
0
 def test_filter(self):
     """Check the message count with the fixture's filter and with '*'.

     The corpus built in setUp is expected to hold 3 messages; a corpus
     rebuilt over the same directory with '*' is expected to hold 4.
     """
     filtered_count = len(self.corpus.msgs)
     self.assertEqual(filtered_count, 3)
     # Try again, with all messages.
     everything = FileCorpus(self.factory, self.directory, '*',
                             self.cache_size)
     self.corpus = everything
     self.assertEqual(len(everything.msgs), 4)