Exemplo n.º 1
0
 def setUp(self):
     """Create the fixture state used by the corpus tests.

     Runs the base-class setUp, records the corpus directory, cache size
     and message factory on the instance, calls stuff_corpus(), and only
     then opens a FileCorpus over the directory with the '?' filter.
     """
     _FileCorpusBaseTest.setUp(self)
     self.factory = FileMessageFactory()
     self.directory = 'fctesthamcorpus'
     self.cache_size = 100
     # stuff_corpus() runs before the corpus object is constructed.
     self.stuff_corpus()
     corpus = FileCorpus(self.factory, self.directory, '?', self.cache_size)
     self.corpus = corpus
Exemplo n.º 2
0
class FileCorpusTest(_FileCorpusBaseTest):
    """Tests for FileCorpus, run against messages stored on disk by setUp."""

    def setUp(self):
        """Store the fixture messages, then open a corpus with filter '?'."""
        _FileCorpusBaseTest.setUp(self)
        self.factory = FileMessageFactory()
        self.cache_size = 100
        self.directory = 'fctesthamcorpus'
        self.stuff_corpus()
        self.corpus = FileCorpus(self.factory, self.directory, '?',
                                 self.cache_size)

    def stuff_corpus(self):
        """Put messages in the corpus"""
        # Keys are "0", "1", "2"; self.msg is left pointing at the last one,
        # which test_removeMessage relies on.
        for index, content in enumerate([good1, spam1, malformed1]):
            self.msg = self.factory.create(str(index), self.directory,
                                           content)
            self.msg.store()

        # A fourth message, keyed "10": test_filter expects the '?' filter
        # to leave it out and the '*' filter to include it.
        extra = self.factory.create("10", self.directory, good1)
        extra.store()

    def test___init__(self):
        # The constructor arguments should be exposed as attributes.
        corpus = self.corpus
        self.assertEqual(corpus.directory, self.directory)
        self.assertEqual(corpus.filter, '?')
        self.assertEqual(corpus.cacheSize, self.cache_size)

    def test_filter(self):
        # With the '?' filter only three of the four stored messages show up.
        self.assertEqual(len(self.corpus.msgs), 3)

        # Re-open with '*' and all four should be present.
        self.corpus = FileCorpus(self.factory, self.directory, '*',
                                 self.cache_size)
        self.assertEqual(len(self.corpus.msgs), 4)

    def test_makeMessage_no_content(self):
        # Smoke test: only checks that the call does not raise.
        self.corpus.makeMessage("testmake")

    def test_makeMessage_with_content(self):
        wanted_key = "testmake"
        body = spam1
        made = self.corpus.makeMessage(wanted_key, body)
        self.assertEqual(made.key(), wanted_key)
        # as_string() is expected to come back with CRLF line endings.
        self.assertEqual(made.as_string(), body.replace("\n", "\r\n"))

    def test_addMessage_invalid(self):
        # An object that offers nothing but key() must be rejected.
        class KeyOnly(object):
            def key(self):
                return 'aa'

        self.assertRaises(ValueError, self.corpus.addMessage, KeyOnly())

    def test_addMessage(self):
        # The message is created for another directory; adding it to the
        # corpus should re-home it and write it to the corpus directory.
        incoming = self.factory.create("9", 'fctestspamcorpus', good1)
        self.corpus.addMessage(incoming)
        self.assertEqual(incoming.directory, self.directory)

        stored_path = os.path.join(self.directory, "9")
        handle = open(stored_path, "rU")
        on_disk = handle.read()
        handle.close()
        self.assertEqual(on_disk, good1)

    def test_removeMessage(self):
        # self.msg is the last message stored by stuff_corpus().
        path = self.msg.pathname()
        self.assertEqual(os.path.exists(path), True)
        self.corpus.removeMessage(self.msg)
        self.assertEqual(os.path.exists(path), False)
Exemplo n.º 3
0
 def setUp(self):
     """Set up the directory, factory and corpus that the tests use."""
     _FileCorpusBaseTest.setUp(self)

     # Plain configuration values first ...
     self.cache_size = 100
     self.directory = 'fctesthamcorpus'
     self.factory = FileMessageFactory()

     # ... then populate the directory, and finally open a corpus over it
     # using the '?' filter.
     self.stuff_corpus()
     corpus_args = (self.factory, self.directory, '?', self.cache_size)
     self.corpus = FileCorpus(*corpus_args)
Exemplo n.º 4
0
    def createWorkers(self):
        """Using the options that were initialised in __init__ and then
        possibly overridden by the driver code, create the Bayes object,
        the Corpuses, the Trainers and so on."""
        print "Loading database...",
        if self.isTest:
            self.useDB = "pickle"
            self.DBName = '_pop3proxy_test.pickle'   # This is never saved.
        if not hasattr(self, "DBName"):
            self.DBName, self.useDB = storage.database_type([])
        self.bayes = storage.open_storage(self.DBName, self.useDB)
        self.mdb = spambayes.message.Message().message_info_db

        # Load stats manager.
        self.stats = Stats.Stats(options, self.mdb)

        self.buildStatusStrings()

        # Don't set up the caches and training objects when running the self-test,
        # so as not to clutter the filesystem.
        if not self.isTest:
            # Create/open the Corpuses.  Use small cache sizes to avoid hogging
            # lots of memory.
            sc = get_pathname_option("Storage", "spam_cache")
            hc = get_pathname_option("Storage", "ham_cache")
            uc = get_pathname_option("Storage", "unknown_cache")
            map(storage.ensureDir, [sc, hc, uc])
            if self.gzipCache:
                factory = GzipFileMessageFactory()
            else:
                factory = FileMessageFactory()
            age = options["Storage", "cache_expiry_days"]*24*60*60
            self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                               '[0123456789\-]*',
                                               cacheSize=20)
            self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                              '[0123456789\-]*',
                                              cacheSize=20)
            self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                                  '[0123456789\-]*',
                                                  cacheSize=20)

            # Given that (hopefully) users will get to the stage
            # where they do not need to do any more regular training to
            # be satisfied with spambayes' performance, we expire old
            # messages from not only the trained corpora, but the unknown
            # as well.
            self.spamCorpus.removeExpiredMessages()
            self.hamCorpus.removeExpiredMessages()
            self.unknownCorpus.removeExpiredMessages()

            # Create the Trainers.
            self.spamTrainer = storage.SpamTrainer(self.bayes)
            self.hamTrainer = storage.HamTrainer(self.bayes)
            self.spamCorpus.addObserver(self.spamTrainer)
            self.hamCorpus.addObserver(self.hamTrainer)
Exemplo n.º 5
0
def main(argv):
    opts, args = getopt.getopt(argv, "h", ["help"])
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return

    # Create the corpuses and the factory that reads the messages.
    if options["pop3proxy", "cache_use_gzip"]:
        messageFactory = GzipFileMessageFactory()
    else:
        messageFactory = FileMessageFactory()
    sc = get_pathname_option("Storage", "spam_cache")
    hc = get_pathname_option("Storage", "ham_cache")
    spamCorpus = FileCorpus(messageFactory, sc)
    hamCorpus = FileCorpus(messageFactory, hc)

    # Read in all the trained messages.
    allTrained = {}
    for corpus, disposition in [(spamCorpus, 'Yes'), (hamCorpus, 'No')]:
        for m in corpus:
            message = mboxutils.get_message(m.getSubstance())
            message._pop3CacheDisposition = disposition
            allTrained[m.key()] = message

    # Sort the messages into the order they arrived, then work out a scaling
    # factor for the graph - 'limit' is the widest it can be in characters.
    keys = allTrained.keys()
    keys.sort()
    limit = 70
    if len(keys) < limit:
        scale = 1
    else:
        scale = len(keys) // (limit // 2)

    # Build the data - an array of cumulative success indexed by count.
    count = successful = 0
    successByCount = []
    for key in keys:
        message = allTrained[key]
        disposition = message[options["Headers", "classification_header_name"]]
        if (message._pop3CacheDisposition == disposition):
            successful += 1
        count += 1
        if count % scale == (scale - 1):
            successByCount.append(successful // scale)

    # Build the graph, as a list of rows of characters.
    size = count // scale
    graph = [[" " for i in range(size + 3)] for j in range(size)]
    for c in range(size):
        graph[c][1] = "|"
        graph[c][c + 3] = "."
        graph[successByCount[c]][c + 3] = "*"
    graph.reverse()

    # Print the graph.
    print "\n   Success of the classifier over time:\n"
    print "   . - Number of messages over time"
    print "   * - Number of correctly classified messages over time\n\n"
    for row in range(size):
        line = ''.join(graph[row])
        if row == 0:
            print line + " %d" % count
        elif row == (count - successful) // scale:
            print line + " %d" % successful
        else:
            print line
    print " " + "_" * (size + 2)
Exemplo n.º 6
0
class FileCorpusTest(_FileCorpusBaseTest):
    """Test case for FileCorpus using a directory of stored messages."""

    def setUp(self):
        """Populate the corpus directory and open a '?'-filtered corpus."""
        _FileCorpusBaseTest.setUp(self)
        self.cache_size = 100
        self.directory = 'fctesthamcorpus'
        self.factory = FileMessageFactory()
        self.stuff_corpus()
        corpus_args = (self.factory, self.directory, '?', self.cache_size)
        self.corpus = FileCorpus(*corpus_args)

    def stuff_corpus(self):
        """Put messages in the corpus"""
        keyed_contents = (
            ("0", good1),
            ("1", spam1),
            ("2", malformed1),
        )
        for key, content in keyed_contents:
            # self.msg ends up as the last message stored here.
            self.msg = self.factory.create(key, self.directory, content)
            self.msg.store()

        # Put in a message that won't match the filter.
        unmatched = self.factory.create("10", self.directory, good1)
        unmatched.store()

    def test___init__(self):
        # Each constructor argument should be readable back off the corpus.
        self.assertEqual(self.corpus.directory, self.directory)
        self.assertEqual(self.corpus.filter, '?')
        self.assertEqual(self.corpus.cacheSize, self.cache_size)

    def test_filter(self):
        filtered_count = len(self.corpus.msgs)
        self.assertEqual(filtered_count, 3)

        # Try again, with all messages.
        self.corpus = FileCorpus(self.factory, self.directory, '*',
                                 self.cache_size)
        unfiltered_count = len(self.corpus.msgs)
        self.assertEqual(unfiltered_count, 4)

    def test_makeMessage_no_content(self):
        # No assertion: the test passes as long as makeMessage() returns.
        name = "testmake"
        self.corpus.makeMessage(name)

    def test_makeMessage_with_content(self):
        name = "testmake"
        text = spam1
        created = self.corpus.makeMessage(name, text)
        self.assertEqual(created.key(), name)
        # The message text is compared with its newlines turned into CRLF.
        crlf_text = text.replace("\n", "\r\n")
        self.assertEqual(created.as_string(), crlf_text)

    def test_addMessage_invalid(self):
        # A stand-in that provides key() and nothing else.
        class NotAMessage(object):
            def key(self):
                return 'aa'

        bogus = NotAMessage()
        self.assertRaises(ValueError, self.corpus.addMessage, bogus)

    def test_addMessage(self):
        # Start with a message that belongs to a different directory.
        newcomer = self.factory.create("9", 'fctestspamcorpus', good1)
        self.corpus.addMessage(newcomer)
        self.assertEqual(newcomer.directory, self.directory)

        # The content should now be readable from the corpus directory.
        target = os.path.join(self.directory, "9")
        stored = open(target, "rU")
        text = stored.read()
        stored.close()
        self.assertEqual(text, good1)

    def test_removeMessage(self):
        location = self.msg.pathname()
        self.assertEqual(os.path.exists(location), True)
        self.corpus.removeMessage(self.msg)
        self.assertEqual(os.path.exists(location), False)
Exemplo n.º 7
0
    def setup(self):
        # Can't import this at the top because it's circular.
        # XXX Someone smarter than me, please figure out the right
        # XXX way to do this.
        from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory

        username = options["globals", "proxy_username"]
        password = options["globals", "proxy_password"]
        server = options["globals", "proxy_server"]
        if server.find(":") != -1:
            server, port = server.split(':', 1)
        else:
            port = 8080
        if server:
            # Build a new opener that uses a proxy requiring authorization
            proxy_support = urllib2.ProxyHandler({"http" : \
                                                  "http://%s:%s@%s:%d" % \
                                                  (username, password,
                                                   server, port)})
            opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        else:
            # Build a new opener without any proxy information.
            opener = urllib2.build_opener(urllib2.HTTPHandler)

        # Install it
        urllib2.install_opener(opener)

        # Setup the cache for retrieved urls
        age = options["URLRetriever", "x-cache_expiry_days"] * 24 * 60 * 60
        dir = options["URLRetriever", "x-cache_directory"]
        if not os.path.exists(dir):
            # Create the directory.
            if options["globals", "verbose"]:
                print >> sys.stderr, "Creating URL cache directory"
            os.makedirs(dir)

        self.urlCorpus = ExpiryFileCorpus(age,
                                          FileMessageFactory(),
                                          dir,
                                          cacheSize=20)
        # Kill any old information in the cache
        self.urlCorpus.removeExpiredMessages()

        # Setup caches for unretrievable urls
        self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck")
        self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck")
        if os.path.exists(self.bad_url_cache_name):
            try:
                self.bad_urls = pickle_read(self.bad_url_cache_name)
            except (IOError, ValueError):
                # Something went wrong loading it (bad pickle,
                # probably).  Start afresh.
                if options["globals", "verbose"]:
                    print >> sys.stderr, "Bad URL pickle, using new."
                self.bad_urls = {
                    "url:non_resolving": (),
                    "url:non_html": (),
                    "url:unknown_error": ()
                }
        else:
            if options["globals", "verbose"]:
                print "URL caches don't exist: creating"
            self.bad_urls = {
                "url:non_resolving": (),
                "url:non_html": (),
                "url:unknown_error": ()
            }
        if os.path.exists(self.http_error_cache_name):
            try:
                self.http_error_urls = pickle_read(self.http_error_cache_name)
            except IOError, ValueError:
                # Something went wrong loading it (bad pickle,
                # probably).  Start afresh.
                if options["globals", "verbose"]:
                    print >> sys.stderr, "Bad HHTP error pickle, using new."
                self.http_error_urls = {}
Exemplo n.º 8
0
class FileCorpusTest(_FileCorpusBaseTest):
    """Tests for FileCorpus, run against a directory of stored messages."""

    def setUp(self):
        """Store the fixture messages, then open a corpus with filter '?'."""
        _FileCorpusBaseTest.setUp(self)
        self.directory = 'fctesthamcorpus'
        self.cache_size = 100
        self.factory = FileMessageFactory()
        self.stuff_corpus()
        self.corpus = FileCorpus(self.factory, self.directory,
                                 '?', self.cache_size)

    def stuff_corpus(self):
        """Put messages in the corpus"""
        # Keys are "0", "1", "2"; self.msg is left as the last one stored,
        # which test_removeMessage uses.
        for i, content in enumerate([good1, spam1, malformed1]):
            self.msg = self.factory.create(str(i), self.directory, content)
            self.msg.store()

        # A fourth message, keyed "10": test_filter expects the '?' filter
        # to leave it out and the '*' filter to include it.
        msg = self.factory.create("10", self.directory, good1)
        msg.store()

    def test___init__(self):
        # The constructor arguments should be exposed as attributes.
        self.assertEqual(self.corpus.directory, self.directory)
        self.assertEqual(self.corpus.filter, '?')
        self.assertEqual(self.corpus.cacheSize, self.cache_size)

    def test_filter(self):
        # Three of the four stored messages are visible through '?'.
        self.assertEqual(len(self.corpus.msgs), 3)
        # All four are visible through '*'.
        self.corpus = FileCorpus(self.factory, self.directory,
                                 '*', self.cache_size)
        self.assertEqual(len(self.corpus.msgs), 4)

    def test_makeMessage_no_content(self):
        # Smoke test: only checks that the call does not raise.
        key = "testmake"
        self.corpus.makeMessage(key)

    def test_makeMessage_with_content(self):
        key = "testmake"
        content = spam1
        msg = self.corpus.makeMessage(key, content)
        self.assertEqual(msg.key(), key)
        # as_string() is expected to come back with CRLF line endings.
        self.assertEqual(msg.as_string(), content.replace("\n", "\r\n"))

    def test_addMessage_invalid(self):
        # An object that offers nothing but key() must be rejected.
        class msg(object):
            def key(self):
                return 'aa'

        self.assertRaises(ValueError, self.corpus.addMessage, msg())

    def test_addMessage(self):
        # The message starts out belonging to another directory; adding it
        # should re-home it and write it into the corpus directory.
        msg = self.factory.create("9", 'fctestspamcorpus', good1)
        self.corpus.addMessage(msg)
        self.assertEqual(msg.directory, self.directory)
        fn = os.path.join(self.directory, "9")
        f = open(fn)
        try:
            content = f.read()
        finally:
            # Close the handle even if the read fails.
            f.close()
        self.assertEqual(content, good1)

    def test_removeMessage(self):
        fn = self.msg.pathname()
        self.assertEqual(os.path.exists(fn), True)
        self.corpus.removeMessage(self.msg)
        self.assertEqual(os.path.exists(fn), False)

class ExpiryFileCorpusTest(FileCorpusTest):
    """Run the inherited FileCorpusTest cases against an ExpiryFileCorpus."""

    def setUp(self):
        """Same fixture as the parent class, but with an expiring corpus.

        The parent's setUp is deliberately bypassed (the base-class setUp
        is called directly) so that only one corpus object is created.
        """
        _FileCorpusBaseTest.setUp(self)
        self.cache_size = 100
        self.directory = 'fctesthamcorpus'
        self.factory = FileMessageFactory()
        self.stuff_corpus()
        # First argument is the expiry age (presumably in seconds - confirm
        # against ExpiryFileCorpus).
        self.corpus = ExpiryFileCorpus(1.0, self.factory, self.directory,
                                       '?', self.cache_size)

def suite():
    """Return a unittest.TestSuite built from the test classes listed below."""
    # Named 'tests' rather than 'suite' so the local does not shadow this
    # function's own name.
    tests = unittest.TestSuite()
    clses = (FileMessageFactoryTest,
             GzipFileMessageFactoryTest,
             FileMessageTest,
             GzipFileMessageTest,
             FileCorpusTest,
             ExpiryFileCorpusTest,
             )
    for cls in clses:
        tests.addTest(unittest.makeSuite(cls))
    return tests


# A single entry-point guard at module level.
if __name__ == '__main__':
    sb_test_support.unittest_main(argv=sys.argv + ['suite'])