def train(self, msg, isSpam):
    """Train the classifier with a message as spam or ham.

    If the ("smtpproxy", "use_cached_message") option is set (or is
    missing, in which case it defaults to True), the SpamBayes id is
    extracted from `msg` and training is delegated to
    `self.train_cached_message`.  Otherwise the forwarded/bounced
    message text itself is parsed and trained on.

    msg    -- the raw message text (it is passed to
              `extractSpambayesID` / `sbheadermessage_from_string`).
    isSpam -- true to train as spam, false to train as ham.

    Returns None in all cases.
    """
    try:
        use_cached = options["smtpproxy", "use_cached_message"]
    except KeyError:
        # Option not present: fall back to using the cached copy.
        use_cached = True

    if use_cached:
        sb_id = self.extractSpambayesID(msg)
        if sb_id is None:
            print("Could not extract id")
            return
        self.train_cached_message(sb_id, isSpam)
        # The cached copy has been handled; do not fall through and
        # additionally train on the forwarded/bounced wrapper below.
        return

    # Otherwise, train on the forwarded/bounced message.
    msg = sbheadermessage_from_string(msg)
    sb_id = msg.setIdFromPayload()
    msg.delSBHeaders()

    if sb_id is None:
        # No id, so we don't have any reliable method of remembering
        # information about this message, so we just assume that it
        # hasn't been trained before.  We could generate some sort of
        # checksum for the message and use that as an id (this would
        # mean that we didn't need to store the id with the message)
        # but that might be a little unreliable.
        self.classifier.learn(msg.asTokens(), isSpam)
        return

    # Previously trained the opposite way: undo that training first.
    if msg.GetTrained() == (not isSpam):
        self.classifier.unlearn(msg.asTokens(), not isSpam)
        msg.RememberTrained(None)

    # Only learn if the message is not currently recorded as trained
    # (this also covers the case where it was just untrained above).
    if msg.GetTrained() is None:
        self.classifier.learn(msg.asTokens(), isSpam)
        msg.RememberTrained(isSpam)
def extractSpambayesID(self, data):
    """Return the SpamBayes message id found in `data`, or None.

    `data` is the raw message text.  The id is looked for, in order:

      1. in the header named by the ("Headers", "mailid_header_name")
         option of the parsed message;
      2. anywhere in the flattened message text;
      3. in each text part of the message, after decoding its payload.

    The first id found is returned; None is returned if no location
    yields one.
    """
    msg = sbheadermessage_from_string(data)

    # The nicest MUA is one that forwards the header intact.
    sb_id = msg.get(options["Headers", "mailid_header_name"])
    if sb_id is not None:
        return sb_id

    # Some MUAs will put it in the body somewhere, while others will
    # put it in an attached MIME message.
    sb_id = self._find_id_in_text(msg.as_string())
    if sb_id is not None:
        return sb_id

    # The message might be encoded, so look inside each text part.
    for part in textparts(msg):
        # Decode, or take it as-is if decoding fails.
        try:
            text = part.get_payload(decode=True)
        except Exception:
            # Best effort: fall back to the undecoded payload and try
            # to salvage it.
            text = part.get_payload(decode=False)
            if text is not None:
                text = try_to_repair_damaged_base64(text)
        if text is None:
            continue
        sb_id = self._find_id_in_text(text)
        # Keep searching the remaining parts when this one has no id,
        # rather than giving up after the first text part.
        if sb_id is not None:
            return sb_id
    return None