def setUp(self):
    """Prepare a SpamTrainer over three labelled fixtures plus one sample email.

    Sets ``self.training`` (list of ``[category, path]`` pairs),
    ``self.trainer`` (a SpamTrainer built from that list) and ``self.email``
    (an EmailObject built from the plain fixture).
    """
    plain_path = './tests/fixtures/plain.eml'
    small_path = './tests/fixtures/small.eml'

    # The plain fixture is deliberately listed twice, under two categories.
    self.training = [
        ['spam', plain_path],
        ['ham', small_path],
        ['scram', plain_path],
    ]
    self.trainer = SpamTrainer(self.training)

    with io.open(plain_path, 'rb') as handle:
        self.email = EmailObject(handle)
def parse_emails(keyfile):
    """Build one EmailObject per entry listed in ``keyfile``.

    Every non-blank line of the key file must contain a label and a file
    path separated by a single space. Blank or whitespace-only lines are
    skipped.

    :param keyfile: path of the key file to read.
    :return: list of EmailObject instances, in key-file order.
    :raises ValueError: if a non-blank line does not split into exactly two
        space-separated fields.
    """
    emails = []
    # Parenthesised single-argument print is valid on Python 2 and Python 3.
    print("Parsing emails for " + keyfile)
    # Text mode: on Python 3 a bytes line cannot be split with a str separator.
    with io.open(keyfile, 'r') as key_handle:
        for line in key_handle:
            entry = line.rstrip()
            if not entry:
                # A stray empty line would otherwise break the unpacking below.
                continue
            label, path = entry.split(' ')
            # NOTE(review): assumes EmailObject consumes the handle while it
            # is being constructed -- confirm against EmailObject.
            with io.open(path, 'rb') as eml_file:
                emails.append(EmailObject(eml_file, category=label))
    print("Done parsing files for " + keyfile)
    return emails
def parse_emails(keyfile):
    """Read a key file and return an EmailObject for each listed email.

    The key file holds one ``<label> <path>`` pair per line, separated by a
    single space. Lines that are empty after stripping trailing whitespace
    are ignored.

    :param keyfile: path of the key file.
    :return: list of EmailObject instances in the order they were listed.
    :raises ValueError: if a non-blank line does not contain exactly two
        space-separated fields.
    """
    emails = []
    print("Parsing emails for " + keyfile)
    # Open the key file in a ``with`` block so its handle is closed
    # deterministically instead of being left to the garbage collector.
    with io.open(keyfile, 'r') as listing:
        for raw_line in listing:
            record = raw_line.rstrip()
            if not record:
                # Blank lines carry no entry; skipping avoids an unpack error.
                continue
            label, eml_path = record.split(' ')
            with io.open(eml_path, 'rb') as eml_file:
                emails.append(EmailObject(eml_file, category=label))
    print("Done parsing files for " + keyfile)
    return emails
def train(self):
    """Fold every pending ``(category, path)`` pair into the training counts.

    For each pending file the category is recorded in ``self.categories``
    and, for every token yielded by ``Tokenizer.unique_tokenizer`` on the
    email body, the per-category token count and the ``'_all'`` / category
    totals are each incremented by one. ``self.to_train`` is reset to an
    empty dict afterwards.

    :return: None
    """
    for category, path in self.to_train:
        # The handle used to be opened inline and never closed. It is kept
        # open for the whole tokenising step in case EmailObject reads from
        # it lazily, then closed when the ``with`` block exits.
        with io.open(path, 'rb') as eml_file:
            email = EmailObject(eml_file)
            self.categories.add(category)
            for token in Tokenizer.unique_tokenizer(email.body()):
                self.training[category][token] += 1
                self.totals['_all'] += 1
                self.totals[category] += 1
    self.to_train = {}
def train(self):
    """Consume the pending training files and refresh the model statistics.

    Each ``(category, path)`` pair in ``self.to_train`` is loaded as an
    EmailObject; its category is added to ``self.categories`` and every
    token from ``Tokenizer.unique_tokenizer`` bumps the per-category token
    count and the ``"_all"`` / category totals.

    When there was anything to train on, the log priors are recomputed from
    the labels (``"spam"`` is 1, every other category is 0) and ``self.B``
    is set to the number of distinct tokens across the ``"spam"`` and
    ``"ham"`` tables. ``self.to_train`` is reset to an empty dict at the end.
    """
    labels = []
    for category, path in self.to_train:
        with io.open(path, "rb") as handle:
            email = EmailObject(handle)

        self.categories.add(category)
        is_spam = category == "spam"
        labels.append(1 if is_spam else 0)

        for token in Tokenizer.unique_tokenizer(email.body()):
            self.training[category][token] += 1
            self.totals["_all"] += 1
            self.totals[category] += 1

    if self.to_train:
        labels = np.array(labels)
        sample_count = labels.shape[0]
        self.class_log_prior["spam"] = math.log(sum(labels == 1) / sample_count)
        self.class_log_prior["ham"] = math.log(sum(labels == 0) / sample_count)

        # Vocabulary size: distinct tokens seen in either table.
        spam_vocab = set(self.training["spam"].keys())
        ham_vocab = set(self.training["ham"].keys())
        self.B = len(spam_vocab.union(ham_vocab))

    self.to_train = {}
def setUp(self):
    """Load the HTML fixture as decoded text and as an EmailObject.

    Sets ``self.html`` to the UTF-8 decoded file contents and
    ``self.html_email`` to an EmailObject built from the same open handle.
    """
    with io.open('fixtures/html.eml', 'rb') as handle:
        raw_bytes = handle.read()
        self.html = raw_bytes.decode('utf-8')
        # read() left the position at EOF; rewind before handing the same
        # handle to EmailObject.
        handle.seek(0)
        self.html_email = EmailObject(handle)
def setUp(self):
    """Read the multipart fixture once as text and once as an EmailObject.

    Sets ``self.multipart_file`` (fixture path), ``self.text`` (UTF-8
    decoded contents) and ``self.multipart_email`` (EmailObject built from
    the rewound handle).
    """
    fixture_path = './tests/fixtures/multipart.eml'
    self.multipart_file = fixture_path
    with io.open(fixture_path, 'rb') as handle:
        contents = handle.read()
        self.text = contents.decode('utf-8')
        # Rewind so the handle passed to EmailObject starts at the beginning.
        handle.seek(0)
        self.multipart_email = EmailObject(handle)
def setUp(self):
    """Expose the plain-text fixture as a path, decoded text and EmailObject.

    Sets ``self.plain_file`` (fixture path), ``self.text`` (UTF-8 decoded
    contents) and ``self.plain_email`` (EmailObject built from the same
    handle after it has been rewound).
    """
    source_path = 'fixtures/plain.eml'
    self.plain_file = source_path
    with io.open(source_path, 'rb') as handle:
        payload = handle.read()
        self.text = payload.decode('utf-8')
        # The read consumed the stream; reset the position for EmailObject.
        handle.seek(0)
        self.plain_email = EmailObject(handle)