class Hammie(hammie.Hammie):
    """A Hammie subclass that routes all tokenization through a single
    shared Tokenizer instance instead of the module-level default."""

    def __init__(self, bayes):
        hammie.Hammie.__init__(self, bayes)
        # One Tokenizer per Hammie, reused for both scoring and training.
        self.tokenizer = Tokenizer()

    def _scoremsg(self, msg, evidence=False):
        """Return the spam probability of msg; if evidence is true,
        the classifier also returns its supporting clues."""
        tokens = self.tokenizer.tokenize(msg)
        return self.bayes.spamprob(tokens, evidence)

    def train(self, msg, is_spam, add_header=False):
        """Teach the classifier that msg is spam (is_spam true) or ham.

        add_header is accepted for interface compatibility with the
        base class but is not used here.
        """
        tokens = self.tokenizer.tokenize(msg)
        self.bayes.learn(tokens, is_spam)
message = self.urlCorpus.makeMessage(url_key, fake_message_string) self.urlCorpus.addMessage(message) else: fake_message_string = cached_message.as_string() msg = message_from_string(fake_message_string) # We don't want to do full header tokenising, as this is # optimised for messages, not webpages, so we just do the # basic stuff. bht = options["Tokenizer", "basic_header_tokenize"] bhto = options["Tokenizer", "basic_header_tokenize_only"] options["Tokenizer", "basic_header_tokenize"] = True options["Tokenizer", "basic_header_tokenize_only"] = True tokens = Tokenizer().tokenize(msg) pf = options["URLRetriever", "x-web_prefix"] tokens = ["%s%s" % (pf, tok) for tok in tokens] # Undo the changes options["Tokenizer", "basic_header_tokenize"] = bht options["Tokenizer", "basic_header_tokenize_only"] = bhto return tokens def _base_url(self, url): # To try and speed things up, and to avoid following # unique URLS, we convert the URL to as basic a form # as we can - so http://www.massey.ac.nz/~tameyer/index.html?you=me # would become http://massey.ac.nz and http://id.example.com # would become http://example.com url += '/'
def __init__(self, bayes):
    # Initialize the base Hammie with the given classifier, then attach
    # a dedicated Tokenizer instance for this object's own use.
    hammie.Hammie.__init__(self, bayes)
    self.tokenizer = Tokenizer()