def train(self):
    """Fold every queued ``(category, file)`` pair into the token counts.

    For each entry in ``self.to_train`` the file is opened in binary mode
    and wrapped in an ``EmailObject``; the category is added to
    ``self.categories``.  Each token yielded by
    ``Tokenizer.unique_tokenizer`` for the email body increments
    ``self.training[category][token]`` together with the ``'_all'`` and
    per-category entries of ``self.totals``.  Finally the queue is reset
    to an empty dict so the same files are not counted again.
    """
    # NOTE(review): the original formatting was collapsed onto one line; the
    # three counter updates are assumed to all live inside the token loop.
    for category, file in self.to_train:
        # Close the handle deterministically instead of leaving it to the
        # garbage collector (one leaked descriptor per training file).
        # NOTE(review): assumes EmailObject consumes the stream in its
        # constructor, so it stays usable after the file is closed — confirm.
        with io.open(file, 'rb') as eml_file:
            email = EmailObject(eml_file)

        self.categories.add(category)

        for token in Tokenizer.unique_tokenizer(email.body()):
            self.training[category][token] += 1
            self.totals['_all'] += 1
            self.totals[category] += 1

    self.to_train = {}
class TestHTMLEmail(unittest.TestCase):
    """Checks EmailObject against the HTML fixture message."""

    def setUp(self):
        """Read the fixture as UTF-8 text and also parse it as an EmailObject."""
        with io.open('fixtures/html.eml', 'rb') as fixture:
            raw_bytes = fixture.read()
            self.html = raw_bytes.decode('utf-8')
            # Rewind so EmailObject receives the stream from its first byte.
            fixture.seek(0)
            self.html_email = EmailObject(fixture)

    def test_parses_stores_inner_text_html(self):
        """body() equals the BeautifulSoup text of everything after the first blank line."""
        # Everything following the first blank line is treated as the body.
        _, _, markup = self.html.partition("\n\n")
        expected = BeautifulSoup(markup, 'html.parser').text
        self.assertEqual(self.html_email.body(), expected)

    def test_stores_subject(self):
        """subject() equals the text captured after 'Subject: ' in the raw message."""
        subject_match = re.search("Subject: (.*)", self.html)
        expected_subject = subject_match.group(1)
        self.assertEqual(self.html_email.subject(), expected_subject)
class TestHTMLEmail(unittest.TestCase):
    """Exercises EmailObject with the HTML message stored under ./tests/fixtures."""

    def setUp(self):
        """Keep both the decoded raw text and the EmailObject built from the same file."""
        fixture_path = './tests/fixtures/html.eml'
        with io.open(fixture_path, 'rb') as stream:
            payload = stream.read()
            self.html = payload.decode('utf-8')
            # The read above consumed the stream; go back to the start
            # before handing it to EmailObject.
            stream.seek(0)
            self.html_email = EmailObject(stream)

    def test_parses_stores_inner_text_html(self):
        """The parsed body matches the inner text of the HTML after the first blank line."""
        raw_body = self.html.partition("\n\n")[2]
        soup = BeautifulSoup(raw_body, 'html.parser')
        expected = soup.text
        self.assertEqual(self.html_email.body(), expected)

    def test_stores_subject(self):
        """The parsed subject matches the 'Subject: ' line of the raw text."""
        found = re.search("Subject: (.*)", self.html)
        wanted = found.group(1)
        self.assertEqual(self.html_email.subject(), wanted)
class TestPlaintextEmailObject(unittest.TestCase):
    """Exercises EmailObject with the plain-text message under ./tests/fixtures."""

    # Separator used to split the raw message at its first blank line.
    CLRF = "\n\n"

    def setUp(self):
        """Store the fixture path, its decoded text and the parsed EmailObject."""
        self.plain_file = './tests/fixtures/plain.eml'
        with io.open(self.plain_file, 'rb') as stream:
            raw_bytes = stream.read()
            self.text = raw_bytes.decode('utf-8')
            # Rewind: the read above left the cursor at end-of-file.
            stream.seek(0)
            self.plain_email = EmailObject(stream)

    def test_parse_plain_body(self):
        """body() returns everything that follows the first blank line."""
        expected_body = self.text.partition(self.CLRF)[2]
        self.assertEqual(self.plain_email.body(), expected_body)

    def test_parses_the_subject(self):
        """subject() returns the text captured after 'Subject: '."""
        found = re.search("Subject: (.*)", self.text)
        expected_subject = found.group(1)
        self.assertEqual(self.plain_email.subject(), expected_subject)
class TestPlaintextEmailObject(unittest.TestCase):
    """Checks EmailObject against the plain-text fixture message."""

    # Blank-line separator between the first block of the message and the rest.
    CLRF = "\n\n"

    def setUp(self):
        """Read the fixture as UTF-8 text and also parse it as an EmailObject."""
        self.plain_file = 'fixtures/plain.eml'
        with io.open(self.plain_file, 'rb') as fixture:
            contents = fixture.read()
            self.text = contents.decode('utf-8')
            # Start over from the first byte for the parser.
            fixture.seek(0)
            self.plain_email = EmailObject(fixture)

    def test_parse_plain_body(self):
        """The parsed body equals the raw text after the first separator."""
        _, _, remainder = self.text.partition(self.CLRF)
        self.assertEqual(self.plain_email.body(), remainder)

    def test_parses_the_subject(self):
        """The parsed subject equals the 'Subject: ' line of the raw text."""
        subject_match = re.search("Subject: (.*)", self.text)
        wanted = subject_match.group(1)
        self.assertEqual(self.plain_email.subject(), wanted)
class TestMultipartEmailObject(unittest.TestCase):
    """Exercises EmailObject with the multipart fixture message."""

    def setUp(self):
        """Store the fixture path, its decoded text and the parsed EmailObject."""
        self.multipart_file = './tests/fixtures/multipart.eml'
        with io.open(self.multipart_file, 'rb') as multipart:
            self.text = multipart.read().decode('utf-8')
            # Rewind so EmailObject receives the stream from its first byte.
            multipart.seek(0)
            self.multipart_email = EmailObject(multipart)

    def test_parse_concatenated_body_of_text(self):
        """body() equals the decoded text/plain and text/html payloads, in walk order."""
        internal_mail = self.multipart_email.mail
        # Use the unittest assertion rather than a bare ``assert`` so the
        # precondition is still checked when Python runs with -O.
        self.assertTrue(internal_mail.is_multipart())

        # Both content types were handled identically, so test them together;
        # a prefix test is what re.match on these literal patterns performed.
        text_types = ("text/plain", "text/html")
        chunks = [
            part.get_payload(decode=True)
            for part in internal_mail.walk()
            if part.get_content_type().startswith(text_types)
        ]
        body = b''.join(chunks).decode()
        self.assertEqual(self.multipart_email.body(), body)

    def test_stores_subject(self):
        """subject() returns the text captured after 'Subject: ' in the raw message."""
        subject = re.search("Subject: (.*)", self.text).group(1)
        self.assertEqual(self.multipart_email.subject(), subject)
def train(self):
    """Fold queued emails into the counts and refresh the class log-priors.

    For each ``(category, file)`` pair in ``self.to_train`` the file is
    opened in binary mode and wrapped in an ``EmailObject``; the category is
    added to ``self.categories`` and labelled 1 when it is ``"spam"`` and 0
    otherwise.  Each token yielded by ``Tokenizer.unique_tokenizer`` for the
    email body increments ``self.training[category][token]`` together with
    the ``"_all"`` and per-category entries of ``self.totals``.

    When at least one email was processed, ``self.class_log_prior`` is set
    for ``"spam"`` and ``"ham"`` to the log of each label's share of this
    batch, and ``self.B`` to the number of distinct tokens seen across the
    spam and ham tables.  The queue is then reset to an empty dict.
    """
    # NOTE(review): the original formatting was collapsed onto one line; the
    # three counter updates are assumed to all live inside the token loop.
    labels = []
    for category, file in self.to_train:
        with io.open(file, "rb") as eml_file:
            email = EmailObject(eml_file)

        self.categories.add(category)
        labels.append(1 if category == "spam" else 0)

        for token in Tokenizer.unique_tokenizer(email.body()):
            self.training[category][token] += 1
            self.totals["_all"] += 1
            self.totals[category] += 1

    # Guard on what was actually processed, so a truthy-but-empty iterable
    # cannot lead to a division by zero.
    if labels:
        n_total = len(labels)
        n_spam = sum(labels)
        # NOTE(review): priors reflect only the emails in this batch.
        for name, count in (("spam", n_spam), ("ham", n_total - n_spam)):
            # log(0) raises ValueError; a class absent from the batch has
            # probability zero, i.e. a log-prior of negative infinity.
            self.class_log_prior[name] = (
                math.log(float(count) / n_total) if count else float("-inf")
            )
        self.B = len(set(self.training["spam"]) | set(self.training["ham"]))

    self.to_train = {}