Exemplo n.º 1
0
 def onTrain(self, file, text, which):
     """Train on an uploaded or pasted message.

     `file` is the uploaded content and `text` the pasted content; when
     both are supplied the upload is used.  `which` is the label of the
     submit button that was pressed: the translated 'Train as Spam'
     label selects spam training, any other value trains as ham.

     Every message extracted from the content is added to the matching
     corpus, marked as trained and reported to `self.stats`.  The
     results are then saved via `_doSave()` and a new training form is
     written to the page.
     """
     self._writePreamble(_("Train"))
     content = file or text
     isSpam = (which == _('Train as Spam'))
     if file:
         # Only uploads go through the mbox conversion; pasted text is
         # used as it is.
         content = self._convertToMbox(content)
     # Normalise CRLF and bare CR line endings to LF.
     content = content.replace('\r\n', '\n').replace('\r', '\n')
     messages = self._convertUploadToMessageList(content)

     if isSpam:
         desired_corpus = "spamCorpus"
     else:
         desired_corpus = "hamCorpus"

     if hasattr(self, desired_corpus):
         corpus = getattr(self, desired_corpus)
     elif hasattr(self, "state"):
         # Take the corpus and the message-naming function from the
         # state object and remember the corpus on self for later calls.
         corpus = getattr(self.state, desired_corpus)
         setattr(self, desired_corpus, corpus)
         self.msg_name_func = self.state.getNewMessageName
     else:
         # No state object available: open a file-backed corpus here.
         if isSpam:
             cache_option = "spam_cache"
         else:
             cache_option = "ham_cache"
         fn = storage.get_pathname_option("Storage", cache_option)
         storage.ensureDir(fn)
         if options["Storage", "cache_use_gzip"]:
             factory = FileCorpus.GzipFileMessageFactory()
         else:
             factory = FileCorpus.FileMessageFactory()
         # Convert the expiry option from days to seconds.
         age = options["Storage", "cache_expiry_days"]*24*60*60
         # Raw string so that the backslash reaches the pattern as
         # written instead of relying on an unrecognised string escape.
         corpus = FileCorpus.ExpiryFileCorpus(age, factory, fn,
                                              r'[0123456789\-]*',
                                              cacheSize=20)
         setattr(self, desired_corpus, corpus)
         class UniqueNamer(object):
             """Produce message keys of the form <timestamp>-<counter>."""
             count = -1
             def generate_name(self):
                 self.count += 1
                 return "%10.10d-%d" % (long(time.time()), self.count)
         Namer = UniqueNamer()
         self.msg_name_func = Namer.generate_name

     self.write("<b>" + _("Training") + "...</b>\n")
     self.flush()
     for message in messages:
         key = self.msg_name_func()
         msg = corpus.makeMessage(key, message)
         msg.setId(key)
         corpus.addMessage(msg)
         msg.RememberTrained(isSpam)
         # RecordTraining receives the inverted flag (true for ham).
         self.stats.RecordTraining(not isSpam)
     self._doSave()
     # The third substitution must be a complete "</a>" end tag so the
     # "Home" link is closed properly in the generated page.
     self.write(_("%sOK. Return %sHome%s or train again:%s") %
                ("<p>", "<a href='home'>", "</a>", "</p>"))
     self.write(self._buildTrainBox())
     self._writePostamble()
Exemplo n.º 2
0
    def train_mime(self, msg_text, encoding, is_spam):
        """Add a single MIME message to the spam or ham corpus as trained.

        `msg_text` is the message source, either a byte string encoded
        in `encoding` or a unicode object; `encoding` is only consulted
        for byte strings.  The text is normalised to UTF-8 before it is
        parsed.  A true `is_spam` selects the spam corpus, otherwise the
        ham corpus is used.
        """
        if self.state.bayes is None:
            self.state.create_workers()
        # Get msg_text into canonical string representation.
        # Make sure we have a unicode object...
        if isinstance(msg_text, str):
            msg_text = unicode(msg_text, encoding)
        # ... then encode it as utf-8.
        if isinstance(msg_text, unicode):
            msg_text = msg_text.encode("utf-8")
        parsed = message_from_string(msg_text,
                                     _class=spambayes.message.SBHeaderMessage)

        if is_spam:
            desired_corpus = "spamCorpus"
        else:
            desired_corpus = "hamCorpus"

        if hasattr(self, desired_corpus):
            corpus = getattr(self, desired_corpus)
        elif hasattr(self, "state"):
            # Take the corpus and the message-naming function from the
            # state object and remember the corpus on self.
            corpus = getattr(self.state, desired_corpus)
            setattr(self, desired_corpus, corpus)
            self.msg_name_func = self.state.getNewMessageName
        else:
            # NOTE(review): self.state is dereferenced unconditionally at
            # the top of this method, so this fallback looks unreachable
            # from here -- confirm before relying on it.
            if is_spam:
                cache_option = "spam_cache"
            else:
                cache_option = "ham_cache"
            fn = storage.get_pathname_option("Storage", cache_option)
            storage.ensureDir(fn)
            if options["Storage", "cache_use_gzip"]:
                factory = FileCorpus.GzipFileMessageFactory()
            else:
                factory = FileCorpus.FileMessageFactory()
            # Convert the expiry option from days to seconds.
            age = options["Storage", "cache_expiry_days"] * 24 * 60 * 60
            # Raw string so that the backslash reaches the pattern as
            # written instead of relying on an unrecognised string escape.
            corpus = FileCorpus.ExpiryFileCorpus(age,
                                                 factory,
                                                 fn,
                                                 r'[0123456789\-]*',
                                                 cacheSize=20)
            setattr(self, desired_corpus, corpus)

            class UniqueNamer(object):
                """Produce message keys of the form <timestamp>-<counter>."""
                count = -1

                def generate_name(self):
                    self.count += 1
                    return "%10.10d-%d" % (long(time.time()), self.count)

            Namer = UniqueNamer()
            self.msg_name_func = Namer.generate_name

        key = self.msg_name_func()
        # Round-trip the serialised message through unicode so that the
        # stored text is UTF-8.
        mime_message = unicode(parsed.as_string(), "utf-8").encode("utf-8")
        msg = corpus.makeMessage(key, mime_message)
        msg.setId(key)
        corpus.addMessage(msg)
        msg.RememberTrained(is_spam)
Exemplo n.º 3
0
    def create_workers(self):
        """Using the options that were initialised in __init__ and then
        possibly overridden by the driver code, create the Bayes object,
        the Corpuses, the Trainers and so on.

        Sets `self.bayes` and `self.stats` and calls
        `build_status_strings()`.  Unless `self.is_test` is true it also
        sets `self.spamCorpus`, `self.hamCorpus`, `self.unknownCorpus`,
        `self.spam_trainer` and `self.ham_trainer`, and registers the
        trainers as observers of the spam and ham corpora.
        """
        if self.is_test:
            self.use_db = "pickle"
            self.db_name = '_core_server.pickle'   # This is never saved.
        if not hasattr(self, "db_name"):
            self.db_name, self.use_db = storage.database_type([])
        self.bayes = storage.open_storage(self.db_name, self.use_db)

        # Load stats manager.
        self.stats = Stats.Stats(options,
                                 spambayes.message.Message().message_info_db)

        self.build_status_strings()

        # Don't set up the caches and training objects when running the
        # self-test, so as not to clutter the filesystem.
        if self.is_test:
            return

        # Create/open the Corpuses.  Use small cache sizes to avoid
        # hogging lots of memory.
        sc = get_pathname_option("Storage", "core_spam_cache")
        hc = get_pathname_option("Storage", "core_ham_cache")
        uc = get_pathname_option("Storage", "core_unknown_cache")
        for d in [sc, hc, uc]:
            storage.ensureDir(d)
        if self.gzip_cache:
            factory = GzipFileMessageFactory()
        else:
            factory = FileMessageFactory()
        # Convert the expiry option from days to seconds.
        age = options["Storage", "cache_expiry_days"]*24*60*60

        def open_corpus(directory):
            # All three corpora share the same expiry, factory, name
            # pattern and cache size; only the directory differs.  The
            # pattern is a raw string so the backslash is kept verbatim.
            return ExpiryFileCorpus(age, factory, directory,
                                    r'[0123456789\-]*',
                                    cacheSize=20)

        self.spamCorpus = open_corpus(sc)
        self.hamCorpus = open_corpus(hc)
        self.unknownCorpus = open_corpus(uc)

        # Given that (hopefully) users will get to the stage
        # where they do not need to do any more regular training to
        # be satisfied with spambayes' performance, we expire old
        # messages from not only the trained corpora, but the unknown
        # as well.
        self.spamCorpus.removeExpiredMessages()
        self.hamCorpus.removeExpiredMessages()
        self.unknownCorpus.removeExpiredMessages()

        # Create the Trainers.
        self.spam_trainer = storage.SpamTrainer(self.bayes)
        self.ham_trainer = storage.HamTrainer(self.bayes)
        self.spamCorpus.addObserver(self.spam_trainer)
        self.hamCorpus.addObserver(self.ham_trainer)
Exemplo n.º 4
0
 def train_mime(self, msg_text, encoding, is_spam):
     """Add a single MIME message to the spam or ham corpus as trained.

     `msg_text` is the message source, either a byte string encoded in
     `encoding` or a unicode object; `encoding` is only consulted for
     byte strings.  The text is normalised to UTF-8 before it is parsed.
     A true `is_spam` selects the spam corpus, otherwise the ham corpus
     is used.
     """
     if self.state.bayes is None:
         self.state.create_workers()
     # Get msg_text into canonical string representation.
     # Make sure we have a unicode object...
     if isinstance(msg_text, str):
         msg_text = unicode(msg_text, encoding)
     # ... then encode it as utf-8.
     if isinstance(msg_text, unicode):
         msg_text = msg_text.encode("utf-8")
     parsed = message_from_string(msg_text,
                                  _class=spambayes.message.SBHeaderMessage)

     if is_spam:
         desired_corpus = "spamCorpus"
     else:
         desired_corpus = "hamCorpus"

     if hasattr(self, desired_corpus):
         corpus = getattr(self, desired_corpus)
     elif hasattr(self, "state"):
         # Take the corpus and the message-naming function from the
         # state object and remember the corpus on self.
         corpus = getattr(self.state, desired_corpus)
         setattr(self, desired_corpus, corpus)
         self.msg_name_func = self.state.getNewMessageName
     else:
         # NOTE(review): self.state is dereferenced unconditionally at
         # the top of this method, so this fallback looks unreachable
         # from here -- confirm before relying on it.
         if is_spam:
             cache_option = "spam_cache"
         else:
             cache_option = "ham_cache"
         fn = storage.get_pathname_option("Storage", cache_option)
         storage.ensureDir(fn)
         if options["Storage", "cache_use_gzip"]:
             factory = FileCorpus.GzipFileMessageFactory()
         else:
             factory = FileCorpus.FileMessageFactory()
         # Convert the expiry option from days to seconds.
         age = options["Storage", "cache_expiry_days"]*24*60*60
         # Raw string so that the backslash reaches the pattern as
         # written instead of relying on an unrecognised string escape.
         corpus = FileCorpus.ExpiryFileCorpus(age, factory, fn,
                                              r'[0123456789\-]*',
                                              cacheSize=20)
         setattr(self, desired_corpus, corpus)
         class UniqueNamer(object):
             """Produce message keys of the form <timestamp>-<counter>."""
             count = -1
             def generate_name(self):
                 self.count += 1
                 return "%10.10d-%d" % (long(time.time()), self.count)
         Namer = UniqueNamer()
         self.msg_name_func = Namer.generate_name

     key = self.msg_name_func()
     # Round-trip the serialised message through unicode so that the
     # stored text is UTF-8.
     mime_message = unicode(parsed.as_string(), "utf-8").encode("utf-8")
     msg = corpus.makeMessage(key, mime_message)
     msg.setId(key)
     corpus.addMessage(msg)
     msg.RememberTrained(is_spam)
Exemplo n.º 5
0
 def createWorkers(self):
     """Using the options that were initialised in __init__ and then
     possibly overridden by the driver code, create the Bayes object,
     the Corpuses, the Trainers and so on.

     Sets `self.bayes`, `self.mdb` and `self.stats` and calls
     `buildStatusStrings()`.  Unless `self.isTest` is true it also sets
     `self.spamCorpus`, `self.hamCorpus`, `self.unknownCorpus`,
     `self.spamTrainer` and `self.hamTrainer`, and registers the
     trainers as observers of the spam and ham corpora.
     """
     # No newline here: the progress message is left open on the line.
     print("Loading database...", end=' ')
     if self.isTest:
         self.useDB = "pickle"
         self.DBName = '_pop3proxy_test.pickle'   # This is never saved.
     if not hasattr(self, "DBName"):
         self.DBName, self.useDB = storage.database_type([])
     self.bayes = storage.open_storage(self.DBName, self.useDB)
     self.mdb = spambayes.message.Message().message_info_db
     self.stats = Stats.Stats(options, self.mdb)
     self.buildStatusStrings()

     # The caches and training objects are only set up outside test mode.
     if self.isTest:
         return

     sc = get_pathname_option("Storage", "spam_cache")
     hc = get_pathname_option("Storage", "ham_cache")
     uc = get_pathname_option("Storage", "unknown_cache")
     for d in [sc, hc, uc]:
         storage.ensureDir(d)
     if self.gzipCache:
         factory = GzipFileMessageFactory()
     else:
         factory = FileMessageFactory()
     # Convert the expiry option from days to seconds.
     age = options["Storage", "cache_expiry_days"]*24*60*60

     def open_corpus(directory):
         # All three corpora share the same expiry, factory, name pattern
         # and cache size; only the directory differs.  The pattern is a
         # raw string so the backslash is kept verbatim.
         return ExpiryFileCorpus(age, factory, directory,
                                 r'[0123456789\-]*',
                                 cacheSize=20)

     self.spamCorpus = open_corpus(sc)
     self.hamCorpus = open_corpus(hc)
     self.unknownCorpus = open_corpus(uc)

     # Expire old messages from the trained corpora and the unknown one.
     self.spamCorpus.removeExpiredMessages()
     self.hamCorpus.removeExpiredMessages()
     self.unknownCorpus.removeExpiredMessages()

     # Create the trainers and attach each to its corpus.
     self.spamTrainer = storage.SpamTrainer(self.bayes)
     self.hamTrainer = storage.HamTrainer(self.bayes)
     self.spamCorpus.addObserver(self.spamTrainer)
     self.hamCorpus.addObserver(self.hamTrainer)
Exemplo n.º 6
0
    def create_workers(self):
        """Using the options that were initialised in __init__ and then
        possibly overridden by the driver code, create the Bayes object,
        the Corpuses, the Trainers and so on.

        Sets `self.bayes` and `self.stats` and calls
        `build_status_strings()`.  Unless `self.is_test` is true it also
        sets `self.spamCorpus`, `self.hamCorpus`, `self.unknownCorpus`,
        `self.spam_trainer` and `self.ham_trainer`, and registers the
        trainers as observers of the spam and ham corpora.
        """
        if self.is_test:
            self.use_db = "pickle"
            self.db_name = '_core_server.pickle'  # This is never saved.
        if not hasattr(self, "db_name"):
            self.db_name, self.use_db = storage.database_type([])
        self.bayes = storage.open_storage(self.db_name, self.use_db)

        # Load stats manager.
        self.stats = Stats.Stats(options,
                                 spambayes.message.Message().message_info_db)

        self.build_status_strings()

        # Don't set up the caches and training objects when running the
        # self-test, so as not to clutter the filesystem.
        if self.is_test:
            return

        # Create/open the Corpuses.  Use small cache sizes to avoid
        # hogging lots of memory.
        sc = get_pathname_option("Storage", "core_spam_cache")
        hc = get_pathname_option("Storage", "core_ham_cache")
        uc = get_pathname_option("Storage", "core_unknown_cache")
        for d in [sc, hc, uc]:
            storage.ensureDir(d)
        if self.gzip_cache:
            factory = GzipFileMessageFactory()
        else:
            factory = FileMessageFactory()
        # Convert the expiry option from days to seconds.
        age = options["Storage", "cache_expiry_days"] * 24 * 60 * 60

        def open_corpus(directory):
            # All three corpora share the same expiry, factory, name
            # pattern and cache size; only the directory differs.  The
            # pattern is a raw string so the backslash is kept verbatim.
            return ExpiryFileCorpus(age,
                                    factory,
                                    directory,
                                    r'[0123456789\-]*',
                                    cacheSize=20)

        self.spamCorpus = open_corpus(sc)
        self.hamCorpus = open_corpus(hc)
        self.unknownCorpus = open_corpus(uc)

        # Given that (hopefully) users will get to the stage
        # where they do not need to do any more regular training to
        # be satisfied with spambayes' performance, we expire old
        # messages from not only the trained corpora, but the unknown
        # as well.
        self.spamCorpus.removeExpiredMessages()
        self.hamCorpus.removeExpiredMessages()
        self.unknownCorpus.removeExpiredMessages()

        # Create the Trainers.
        self.spam_trainer = storage.SpamTrainer(self.bayes)
        self.ham_trainer = storage.HamTrainer(self.bayes)
        self.spamCorpus.addObserver(self.spam_trainer)
        self.hamCorpus.addObserver(self.ham_trainer)