Пример #1
0
  def train(self):
    """Fold every pending training message into the token counters.

    Iterates the (category, path) pairs in ``self.to_train``. For each
    message the category is recorded in ``self.categories`` and, for every
    token yielded by ``Tokenizer.unique_tokenizer`` over the message body,
    ``self.training[category][token]``, ``self.totals['_all']`` and
    ``self.totals[category]`` are each incremented by one.

    ``self.to_train`` is replaced with an empty dict once all pairs have
    been processed.
    """
    for category, path in self.to_train:
      # The context manager guarantees the handle is closed even when
      # parsing fails; a bare io.open() would leak one descriptor per message.
      with io.open(path, 'rb') as eml_file:
        email = EmailObject(eml_file)

      self.categories.add(category)

      for token in Tokenizer.unique_tokenizer(email.body()):
        self.training[category][token] += 1
        self.totals['_all'] += 1
        self.totals[category] += 1

    # NOTE(review): the queue is iterated as pairs but reset to a dict;
    # kept as-is because other code may rely on that type.
    self.to_train = {}
Пример #2
0
class TestHTMLEmail(unittest.TestCase):
    """Exercises EmailObject against the HTML fixture message."""

    def setUp(self):
        """Keep the fixture both as decoded text and as an EmailObject."""
        with io.open('fixtures/html.eml', 'rb') as handle:
            raw_bytes = handle.read()
            self.html = raw_bytes.decode('utf-8')
            # read() left the position at end of file; rewind before
            # handing the same stream to EmailObject.
            handle.seek(0)
            self.html_email = EmailObject(handle)

    def test_parses_stores_inner_text_html(self):
        """body() equals the text BeautifulSoup extracts from the raw body."""
        # Everything after the first "\n\n" in the raw message
        # (presumably the end of the header block).
        markup = self.html.partition("\n\n")[2]
        soup = BeautifulSoup(markup, 'html.parser')
        inner_text = soup.text
        self.assertEqual(self.html_email.body(), inner_text)

    def test_stores_subject(self):
        """subject() equals the value on the raw "Subject: " line."""
        subject_line = re.search("Subject: (.*)", self.html)
        wanted = subject_line.group(1)
        self.assertEqual(self.html_email.subject(), wanted)
Пример #3
0
class TestHTMLEmail(unittest.TestCase):
  """Exercises EmailObject against the HTML fixture message."""

  def setUp(self):
    """Keep the fixture both as decoded text and as an EmailObject."""
    with io.open('./tests/fixtures/html.eml', 'rb') as handle:
      raw_bytes = handle.read()
      self.html = raw_bytes.decode('utf-8')
      # read() left the position at end of file; rewind before
      # handing the same stream to EmailObject.
      handle.seek(0)
      self.html_email = EmailObject(handle)

  def test_parses_stores_inner_text_html(self):
    """body() equals the text BeautifulSoup extracts from the raw body."""
    # Everything after the first "\n\n" in the raw message
    # (presumably the end of the header block).
    markup = self.html.partition("\n\n")[2]
    soup = BeautifulSoup(markup, 'html.parser')
    inner_text = soup.text
    self.assertEqual(self.html_email.body(), inner_text)

  def test_stores_subject(self):
    """subject() equals the value on the raw "Subject: " line."""
    subject_line = re.search("Subject: (.*)", self.html)
    wanted = subject_line.group(1)
    self.assertEqual(self.html_email.subject(), wanted)
class TestPlaintextEmailObject(unittest.TestCase):
  """Exercises EmailObject against the plain-text fixture message."""

  # Separator used to split the raw message; the text after its first
  # occurrence is treated as the body by the test below.
  CLRF = "\n\n"

  def setUp(self):
    """Keep the fixture both as decoded text and as an EmailObject."""
    self.plain_file = './tests/fixtures/plain.eml'
    with io.open(self.plain_file, 'rb') as handle:
      raw_bytes = handle.read()
      self.text = raw_bytes.decode('utf-8')
      # read() left the position at end of file; rewind before
      # handing the same stream to EmailObject.
      handle.seek(0)
      self.plain_email = EmailObject(handle)

  def test_parse_plain_body(self):
    """body() equals everything after the first separator in the raw text."""
    expected_body = self.text.partition(self.CLRF)[2]
    self.assertEqual(self.plain_email.body(), expected_body)

  def test_parses_the_subject(self):
    """subject() equals the value on the raw "Subject: " line."""
    subject_line = re.search("Subject: (.*)", self.text)
    wanted = subject_line.group(1)
    self.assertEqual(self.plain_email.subject(), wanted)
Пример #5
0
class TestPlaintextEmailObject(unittest.TestCase):
    """Exercises EmailObject against the plain-text fixture message."""

    # Separator used to split the raw message; the text after its first
    # occurrence is treated as the body by the test below.
    CLRF = "\n\n"

    def setUp(self):
        """Keep the fixture both as decoded text and as an EmailObject."""
        self.plain_file = 'fixtures/plain.eml'
        with io.open(self.plain_file, 'rb') as handle:
            raw_bytes = handle.read()
            self.text = raw_bytes.decode('utf-8')
            # read() left the position at end of file; rewind before
            # handing the same stream to EmailObject.
            handle.seek(0)
            self.plain_email = EmailObject(handle)

    def test_parse_plain_body(self):
        """body() equals everything after the first separator in the raw text."""
        expected_body = self.text.partition(self.CLRF)[2]
        self.assertEqual(self.plain_email.body(), expected_body)

    def test_parses_the_subject(self):
        """subject() equals the value on the raw "Subject: " line."""
        subject_line = re.search("Subject: (.*)", self.text)
        wanted = subject_line.group(1)
        self.assertEqual(self.plain_email.subject(), wanted)
class TestMultipartEmailObject(unittest.TestCase):
    """Exercises EmailObject against the multipart fixture message."""

    def setUp(self):
        """Keep the fixture both as decoded text and as an EmailObject."""
        self.multipart_file = './tests/fixtures/multipart.eml'
        with io.open(self.multipart_file, 'rb') as multipart:
            self.text = multipart.read().decode('utf-8')
            # read() left the position at end of file; rewind before
            # handing the same stream to EmailObject.
            multipart.seek(0)
            self.multipart_email = EmailObject(multipart)

    def test_parse_concatenated_body_of_text(self):
        """body() equals the text/plain and text/html payloads in walk order."""
        internal_mail = self.multipart_email.mail
        # A unittest assertion instead of a bare ``assert``: it is not
        # stripped under ``python -O`` and is reported like any other check.
        self.assertTrue(internal_mail.is_multipart())

        # Both text flavours were handled identically, so a single prefix
        # test replaces the duplicated branches; join avoids repeated
        # bytes concatenation.
        body = b''.join(
            part.get_payload(decode=True)
            for part in internal_mail.walk()
            if part.get_content_type().startswith(("text/plain", "text/html"))
        )
        self.assertEqual(self.multipart_email.body(), body.decode())

    def test_stores_subject(self):
        """subject() equals the value on the raw "Subject: " line."""
        subject = re.search("Subject: (.*)", self.text).group(1)
        self.assertEqual(self.multipart_email.subject(), subject)
Пример #7
0
    def train(self):
        """Fold pending training messages into the counters and priors.

        Iterates the (category, path) pairs in ``self.to_train``. For each
        message the category is recorded in ``self.categories`` and, for
        every token yielded by ``Tokenizer.unique_tokenizer`` over the
        message body, ``self.training[category][token]``,
        ``self.totals["_all"]`` and ``self.totals[category]`` are each
        incremented by one.

        When ``self.to_train`` is non-empty the method afterwards sets:

        * ``self.class_log_prior["spam"]`` / ``["ham"]`` to the natural log
          of the fraction of messages in this batch whose category is /
          is not ``"spam"``. A class with no message in the batch gets
          ``-inf`` (the limit of log at 0) instead of raising ``ValueError``
          after the counters were already updated.
        * ``self.B`` to the number of distinct tokens across the "spam" and
          "ham" training tables.
        * ``self.to_train`` to an empty dict.

        NOTE(review): the priors reflect only the current batch, while the
        token counters accumulate across calls - confirm this is intended.
        """
        spam_flags = []
        for category, file in self.to_train:
            with io.open(file, "rb") as eml_file:
                email = EmailObject(eml_file)

            self.categories.add(category)
            # Any category other than "spam" is counted as ham.
            spam_flags.append(category == "spam")

            for token in Tokenizer.unique_tokenizer(email.body()):
                self.training[category][token] += 1
                self.totals["_all"] += 1
                self.totals[category] += 1

        if not self.to_train:
            return

        batch_size = len(spam_flags)
        spam_count = sum(spam_flags)
        class_counts = (("spam", spam_count), ("ham", batch_size - spam_count))
        for label, count in class_counts:
            # math.log(0) raises ValueError; store the mathematical limit.
            self.class_log_prior[label] = (
                math.log(count / batch_size) if count else float("-inf")
            )

        self.B = len(set(self.training["spam"]) | set(self.training["ham"]))
        self.to_train = {}
class TestMultipartEmailObject(unittest.TestCase):
  """Exercises EmailObject against the multipart fixture message."""

  def setUp(self):
    """Keep the fixture both as decoded text and as an EmailObject."""
    self.multipart_file = './tests/fixtures/multipart.eml'
    with io.open(self.multipart_file, 'rb') as multipart:
      self.text = multipart.read().decode('utf-8')
      # read() left the position at end of file; rewind before
      # handing the same stream to EmailObject.
      multipart.seek(0)
      self.multipart_email = EmailObject(multipart)

  def test_parse_concatenated_body_of_text(self):
    """body() equals the text/plain and text/html payloads in walk order."""
    internal_mail = self.multipart_email.mail
    # A unittest assertion instead of a bare ``assert``: it is not
    # stripped under ``python -O`` and is reported like any other check.
    self.assertTrue(internal_mail.is_multipart())

    # Both text flavours were handled identically, so a single prefix
    # test replaces the duplicated branches; join avoids repeated
    # bytes concatenation.
    body = b''.join(
      part.get_payload(decode=True)
      for part in internal_mail.walk()
      if part.get_content_type().startswith(("text/plain", "text/html"))
    )
    self.assertEqual(self.multipart_email.body(), body.decode())

  def test_stores_subject(self):
    """subject() equals the value on the raw "Subject: " line."""
    subject = re.search("Subject: (.*)", self.text).group(1)
    self.assertEqual(self.multipart_email.subject(), subject)