Exemplo n.º 1
0
 def setUp(self):
     """Create the analyzer and the placeholder fixtures shared by the tests."""
     self.subject = "dummySubject"
     self.body = "dummyBody"
     self.analyzer = EmailAnalyzer()

     # Token lists intended as the mocked "return_value" of "clean_text".
     self.clean_subject = ["best", "quick", "netco"]
     self.clean_body = ["prescription", "drug", "overview", "operations"]

     # Placeholder pairs intended as the mocked "return_value" of
     # "spam_ham_body_prob" and "subject_spam_ham_prob".
     self.spam_ham_body_prob_true = (0, 0)
     self.subject_spam_ham_prob_true = (0, 0)
     self.spam_ham_body_prob_false = (0, 0)
     self.subject_spam_ham_prob_false = (0, 0)

     # Vocabulary with per-word probabilities, intended as the mocked
     # "return_value" of "load_dict". Every word of a section shares the
     # same probability, hence dict.fromkeys.
     self.vocab = {
         "spam_sub": dict.fromkeys(
             ("best", "online", "medicine", "here"), 1 / 4),
         "ham_sub": dict.fromkeys(("netco", "due", "diligence"), 1 / 3),
         "spam_body": dict.fromkeys(
             ("prescription", "drug", "simple", "quick", "affordable"),
             1 / 5),
         "ham_body": dict.fromkeys(
             ("big", "pig", "met", "today", "overview", "operations"),
             1 / 6),
     }

     # Expected probability values (still placeholders).
     self.spam_ham_body_prob_expected = (0, 0)
     self.subject_spam_ham_prob_expected = (0, 0)
Exemplo n.º 2
0
 def test_body_spam_ham_prob_Returns_expected_probability(
         self, mock_load_vocab):
     """
     The "load_dict" function must be mocked.
     Checks that the probability is computed correctly for the given body
     (the probabilities are to be computed as described in the TP1 handout).
     """
     analyzer = EmailAnalyzer()
     # Configured after the analyzer is built, exactly as in the original order.
     mock_load_vocab.return_value = self.vocab
     computed = analyzer.body_spam_ham_prob(self.body_true)
     expected = self.body_spam_ham_prob_expected
     self.assertEqual(computed, expected)
Exemplo n.º 3
0
    def test_is_spam_Returns_True_if_spam_prob_is_higher(
            self, mock_subject_spam_ham_prob, mock_body_spam_ham_prob,
            mock_clean_text):
        """
        "spam_ham_body_prob" and "subject_spam_ham_prob" must be mocked.
        is_spam must return True when the spam probability is greater than
        the ham probability (the probabilities are to be computed as
        described in the TP1 handout).
        """
        # NOTE(review): "clean_text" is configured twice below; a mock keeps
        # only the most recent return_value, so the subject tokens are
        # overwritten by the body tokens before is_spam runs.
        mock_clean_text.return_value = self.clean_subject_true
        mock_subject_spam_ham_prob.return_value = self.subject_spam_ham_prob_true

        mock_clean_text.return_value = self.clean_body_true
        mock_body_spam_ham_prob.return_value = self.body_spam_ham_prob_true

        analyzer = EmailAnalyzer()
        verdict = analyzer.is_spam(self.subject_true, self.body_true)
        self.assertEqual(verdict, True)
Exemplo n.º 4
0
def evaluate():
    """Classify every e-mail of "200-mails.json" and print the metrics.

    Each entry of the "dataset" list is classified once with
    EmailAnalyzer.is_spam and compared with its "Spam" label ("true" or
    "false"); entries with any other label are not counted. Accuracy,
    precision and recall are then printed; a metric whose denominator is
    zero is printed as 0 instead of raising ZeroDivisionError.

    Returns:
        True once the metrics have been printed.
    """
    tp = tn = fp = fn = 0
    analyzer = EmailAnalyzer()
    with open("200-mails.json") as email_file:
        new_emails = json.load(email_file)

    for counter, e_mail in enumerate(new_emails["dataset"], start=1):
        print(counter)
        new_email = e_mail["mail"]
        subject = new_email["Subject"]
        body = new_email["Body"]
        spam = new_email["Spam"]

        # Classify once per e-mail; the verdict is reused for all counters
        # instead of calling is_spam four times.
        predicted_spam = analyzer.is_spam(subject, body)
        if spam == "true":
            if predicted_spam:
                tp += 1
            else:
                fn += 1
        elif spam == "false":
            if predicted_spam:
                fp += 1
            else:
                tn += 1

    classified = tp + tn + fp + fn
    print("Accuracy: ", (tp + tn) / classified if classified else 0)
    print("Precision: ", tp / (tp + fp) if tp + fp else 0)
    print("Recall: ", tp / (tp + fn) if tp + fn else 0)
    return True
Exemplo n.º 5
0
def evaluate(is_log_estimation, is_log_combination, clean_text_mode, k):
    """Classify every e-mail of "test_set.json" and print the metrics.

    Args:
        is_log_estimation: forwarded unchanged to EmailAnalyzer.is_spam.
        is_log_combination: forwarded unchanged to EmailAnalyzer.is_spam.
        clean_text_mode: forwarded unchanged to EmailAnalyzer.is_spam.
        k: forwarded unchanged to EmailAnalyzer.is_spam.

    Each entry of the "dataset" list is classified once and compared with
    its "Spam" label ("true" or "false"); entries with any other label are
    not counted. Accuracy, precision and recall are printed rounded to two
    decimals; a metric whose denominator is zero is printed as 0.

    Returns:
        True once the metrics have been printed.
    """
    tp = tn = fp = fn = 0
    analyzer = EmailAnalyzer()
    with open("test_set.json") as email_file:
        new_emails = json.load(email_file)

    email_count = len(new_emails["dataset"])

    print("Evaluating emails ")
    for i, e_mail in enumerate(new_emails["dataset"], start=1):
        # "\r" rewrites the same console line to act as a progress counter.
        print(f"\rEmail {i}/{email_count}", end="")

        new_email = e_mail["mail"]
        subject = new_email["Subject"]
        body = new_email["Body"]
        spam = new_email["Spam"]

        # Classify once per e-mail; the verdict is reused for all counters
        # instead of calling is_spam four times.
        predicted_spam = analyzer.is_spam(subject, body, is_log_estimation,
                                          is_log_combination,
                                          clean_text_mode, k)
        if spam == "true":
            if predicted_spam:
                tp += 1
            else:
                fn += 1
        elif spam == "false":
            if predicted_spam:
                fp += 1
            else:
                tn += 1

    classified = tp + tn + fp + fn
    print("")
    print("\nAccuracy: ",
          round((tp + tn) / classified, 2) if classified else 0)
    print("Precision: ", round(tp / (tp + fp), 2) if tp + fp else 0)
    print("Recall: ", round(tp / (tp + fn), 2) if tp + fn else 0)
    return True
Exemplo n.º 6
0
def evaluate(is_log_estimation, is_log_combo, calculation_mode):
    """Classify every e-mail of "200-mails.json" and report the metrics.

    Args:
        is_log_estimation: forwarded to EmailAnalyzer.is_spam.
        is_log_combo: forwarded to EmailAnalyzer.is_spam.
        calculation_mode: forwarded to EmailAnalyzer.is_spam.

    Each entry of the "dataset" list is classified once and compared with
    its "Spam" label ("true" or "false"); entries with any other label are
    not counted. A metric whose denominator is zero is reported as 0.

    Returns:
        The tuple (accuracy, precision, recall).
    """
    tp = tn = fp = fn = 0
    analyzer = EmailAnalyzer()
    with open("200-mails.json") as email_file:
        new_emails = json.load(email_file)

    for counter, e_mail in enumerate(new_emails["dataset"], start=1):
        print(counter)
        new_email = e_mail["mail"]
        subject = new_email["Subject"]
        body = new_email["Body"]
        spam = new_email["Spam"]

        # One classification per e-mail, with the caller's settings, so all
        # four counters are based on the same verdict.
        # NOTE(review): previously only the true-positive check received
        # is_log_estimation and every call hard-coded (False, 0) for the
        # last two arguments; the positional order (is_log_combo,
        # calculation_mode) is assumed from those literals - confirm
        # against EmailAnalyzer.is_spam.
        predicted_spam = analyzer.is_spam(subject, body, is_log_estimation,
                                          is_log_combo, calculation_mode)
        if spam == "true":
            if predicted_spam:
                tp += 1
            else:
                fn += 1
        elif spam == "false":
            if predicted_spam:
                fp += 1
            else:
                tn += 1

    classified = tp + tn + fp + fn
    accuracy = (tp + tn) / classified if classified else 0
    precision = tp / (tp + fp) if tp + fp else 0
    recall = tp / (tp + fn) if tp + fn else 0

    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)

    return accuracy, precision, recall
Exemplo n.º 7
0
def evaluate(estimation_option, combination_option):
    """Classify every e-mail of "test-emails.json" and print the metrics.

    Args:
        estimation_option: forwarded to EmailAnalyzer.is_spam_with_params.
        combination_option: forwarded to EmailAnalyzer.is_spam_with_params.

    Each entry of the "dataset" list is classified once and compared with
    its "Spam" label ("true" or "false"); entries with any other label are
    not counted. A metric whose denominator is zero is printed as 0 instead
    of raising ZeroDivisionError.

    Returns:
        True once the metrics have been printed.
    """
    tp = tn = fp = fn = 0
    analyzer = EmailAnalyzer()
    with open("test-emails.json") as email_file:
        new_emails = json.load(email_file)

    for e_mail in new_emails["dataset"]:
        new_email = e_mail["mail"]
        subject = new_email["Subject"]
        body = new_email["Body"]
        spam = new_email["Spam"]

        # Classify once per e-mail; the verdict is reused for all counters.
        # NOTE(review): "cleaning_mode" is not a parameter of this function,
        # so it must be defined at module level - confirm. The trailing 0.3
        # is passed through exactly as before.
        predicted_spam = analyzer.is_spam_with_params(
            subject, body, estimation_option, combination_option,
            cleaning_mode, 0.3)
        if spam == "true":
            if predicted_spam:
                tp += 1
            else:
                fn += 1
        elif spam == "false":
            if predicted_spam:
                fp += 1
            else:
                tn += 1

    classified = tp + tn + fp + fn
    print("Accuracy: ", (tp + tn) / classified if classified else 0)
    print("Precision: ", tp / (tp + fp) if tp + fp else 0)
    print("Recall: ", tp / (tp + fn) if tp + fn else 0)
    return True
Exemplo n.º 8
0
 def test_is_spam_function_two_returns_true_vns_test_four(self):
     """VNS case four: is_spam_function_two(False, 20, 65) must be truthy."""
     self.assertTrue(EmailAnalyzer.is_spam_function_two(False, 20, 65))
Exemplo n.º 9
0
 def test_is_spam_function_two_returns_true_vns_test_three(self):
     """VNS case three: is_spam_function_two(True, 20, 80) must be truthy."""
     self.assertTrue(EmailAnalyzer.is_spam_function_two(True, 20, 80))
Exemplo n.º 10
0
 def test_is_spam_function_two_returns_false_pic_test_one(self):
     """PIC case one: is_spam_function_two(False, 80, 65) must be falsy."""
     self.assertFalse(EmailAnalyzer.is_spam_function_two(False, 80, 65))
Exemplo n.º 11
0
 def test_is_spam_function_one_returns_false_icc_test_twenty(self):
     """ICC case twenty: is_spam_function_one(True, 35, 80, 60) must be truthy.

     NOTE(review): the method name says "returns_false" while the assertion
     expects a truthy result - confirm which of the two is intended.
     """
     self.assertTrue(EmailAnalyzer.is_spam_function_one(True, 35, 80, 60))
Exemplo n.º 12
0
 def test_is_spam_function_one_returns_true_icc_test_nineteen(self):
     """ICC case nineteen: is_spam_function_one(True, 35, 65, 60) must be truthy."""
     self.assertTrue(EmailAnalyzer.is_spam_function_one(True, 35, 65, 60))
Exemplo n.º 13
0
 def test_is_spam_function_one_returns_false_icc_test_seventeen(self):
     """ICC case seventeen: is_spam_function_one(False, 35, 65, 80) must be falsy."""
     self.assertFalse(EmailAnalyzer.is_spam_function_one(False, 35, 65, 80))
Exemplo n.º 14
0
class TestEmailAnalyzer(unittest.TestCase):
    def setUp(self):
        """Create the analyzer plus one spam-like and one ham-like fixture set."""
        self.analyzer = EmailAnalyzer()

        # --- e-mail used for the "spam" scenario ---
        self.subject_true = " no more outdated software ! upgrade !"
        self.body_true = "we get you the best deal ! skip the retail box and save !\namazing special # 1 :\nadobe - photoshop 7 premiere 7 illustrator 10 = only $ 120\namazing special # 2 :\nwindows xp professional + microsoft office xp professional = only $ 80\namazing special # 3 :\nadobe photoshop cs + adobe illustrator cs + adobe indesign cs\namazing special # 4 :\n"
        # Token lists intended as the mocked "return_value" of "clean_text".
        self.clean_subject_true = ['more', 'oudat', 'software', 'upgrade']
        self.clean_body_true = [
            'get', 'best', 'deal', 'skip', 'retail', 'box', 'sav', 'amaz',
            'special', 'adobe', 'photoshop', 'premiere', 'illustrator',
            'only', 'windows', 'xp', 'professional', 'microsoft', 'office',
            'cs', 'indesign',
        ]

        # --- e-mail used for the "ham" scenario ---
        self.subject_false = "re :"
        self.body_false = "we are using it for other things . mary joyce and robert have discussed with mcmahon and bowen .\n- - - - - original message - - - - -\nfrom : kitchen louise\nsent : monday december 10 2001 8 : 26 am\nto : oxley david\nsubject :\nwhat happens to the money in wachovia ?\nlouise kitchen\nchief operating \n"
        # Token lists intended as the mocked "return_value" of "clean_text".
        self.clean_subject_false = ['re']
        self.clean_body_false = [
            'us', 'other', 'thing', 'mary', 'joyce', 'robert', 'discuss',
            'mcmahon', 'bowen', 'original', 'message', 'kitchen', 'louise',
            'sent', 'monday', "december", 'oxley', 'david', 'subject',
            'happen', 'money', 'wachovia',
        ]

        # Pairs used as the mocked "return_value" of "spam_ham_body_prob"
        # and "subject_spam_ham_prob".
        self.spam_ham_body_prob_true = (1, 1 / 6)
        self.subject_spam_ham_prob_true = (2 / 3, 1 / 6)
        self.spam_ham_body_prob_false = (1 / 4, 2 / 6)
        self.subject_spam_ham_prob_false = (0, 1 / 2)

        # Vocabulary with per-word probabilities, used as the mocked
        # "return_value" of "load_dict".
        self.vocab = {
            "p_sub_spam": dict.fromkeys(("upgrade", "software"), 1 / 3),
            "p_sub_ham": {"re": 1 / 2, "annoucement": 1 / 6, "more": 1 / 6},
            "p_body_spam": dict.fromkeys(
                ("best", "deal", "skip", "special", "money"), 1 / 4),
            "p_body_ham": dict.fromkeys(
                ("today", "professional", "meet", "discuss", "sent"), 1 / 6),
        }

        # Expected probability values:
        # (0.5925*1/(256*pow(6,17))), (0.4075*1/pow(6,21))
        self.spam_ham_body_prob_expected = (1.3673419333309543e-16,
                                            1.8575963755415577e-17)
        # Expected probability values:
        # (0.5925*1/81, 0.4075*1/6*1/4*1/4*1/4)
        self.subject_spam_ham_prob_expected = (0.007314814814814815,
                                               0.0010611979166666665)

    def tearDown(self):
        """Nothing to release: the fixtures are rebuilt by setUp for every test."""

    @patch("email_analyzer.EmailAnalyzer.clean_text")
    @patch("email_analyzer.EmailAnalyzer.spam_ham_body_prob")
    @patch("email_analyzer.EmailAnalyzer.spam_ham_subject_prob")
    def test_is_spam_Returns_True_if_spam_prob_is_higher(
            self, mock_subject_spam_ham_prob, mock_spam_ham_body_prob,
            mock_clean_text):
        mock_subject_spam_ham_prob.return_value = self.subject_spam_ham_prob_true
        mock_spam_ham_body_prob.return_value = self.spam_ham_body_prob_true
        return_val = self.analyzer.is_spam(self.subject_true, self.body_true)
        self.assertTrue(return_val)
        """
        Il faut mocker les fonctions "spam_ham_body_prob" et "subject_spam_ham_prob".
        La sortie de la fonction doit être True si probabilité spam > probabilité ham
        (ces probabilites devron etre calcule selon l'enonce dans le TP1 )
        """

    @patch("email_analyzer.EmailAnalyzer.clean_text")
    @patch("email_analyzer.EmailAnalyzer.spam_ham_body_prob")
    @patch("email_analyzer.EmailAnalyzer.spam_ham_subject_prob")
    def test_is_spam_Returns_False_if_spam_prob_is_lower(
            self, mock_subject_spam_ham_prob, mock_spam_ham_body_prob,
            mock_clean_text):
        mock_subject_spam_ham_prob.return_value = self.subject_spam_ham_prob_false
        mock_spam_ham_body_prob.return_value = self.spam_ham_body_prob_false
        return_val = self.analyzer.is_spam(self.subject_false, self.body_false)
        self.assertFalse(return_val)
        """
        Il faut mocker les fonctions "spam_ham_body_prob" et "subject_spam_ham_prob".
        La sortie de la fonction doit être False si probabilité spam  probabilité ham
        (ces probabilites devron etre calcule selon l'enonce dans le TP1 )
        """

    @patch("email_analyzer.EmailAnalyzer.load_dict")
    def test_spam_ham_body_prob_Returns_expected_probability(
            self, mock_load_dict):
        mock_load_dict.return_value = self.vocab
        self.assertEqual(
            self.analyzer.spam_ham_body_prob(self.clean_body_true),
            self.spam_ham_body_prob_expected)
        """
        Il faut mocker la fonction "load_dict"
        Il faut vérifier que probabilité est calculée correctement donné le "body" à l'entrée
        (ces probabilites devron etre calcule selon l'enonce dans le TP1 )
        """

    @patch("email_analyzer.EmailAnalyzer.load_dict")
    def test_subject_spam_ham_prob_Returns_expected_probability(
            self, mock_load_dict):
        mock_load_dict.return_value = self.vocab
        self.assertEqual(
            self.analyzer.spam_ham_subject_prob(self.clean_subject_true),
            self.subject_spam_ham_prob_expected)
        """
Exemplo n.º 15
0
 def test_is_spam_function_one_returns_false_acc_test_eight(self):
     """ACC case eight: is_spam_function_one(False, 20, 70, 80) must be falsy."""
     self.assertFalse(EmailAnalyzer.is_spam_function_one(False, 20, 70, 80))
Exemplo n.º 16
0
 def test_is_spam_function_one_returns_true_acc_test_five(self):
     """ACC case five: is_spam_function_one(True, 20, 50, 80) must be truthy."""
     self.assertTrue(EmailAnalyzer.is_spam_function_one(True, 20, 50, 80))
Exemplo n.º 17
0
class TestEmailAnalyzer(unittest.TestCase):
    def setUp(self):
        self.subject = "dummySubject"
        self.body = "dummyBody"
        self.analyzer = EmailAnalyzer()
        self.clean_subject = ["best", "quick", "netco"]  # données pour mocker "return_value" du "clean_text"
        self.clean_body = ["prescription", "drug", "overview",
                           "operations"]  # données pour mocker "return_value" du "clean_text"
        self.spam_ham_body_prob_true = (
            0,
            0,
        )  # données pour mocker "return_value" du "spam_ham_body_prob"
        self.subject_spam_ham_prob_true = (
            0,
            0,
        )  # données pour mocker "return_value" du "subject_spam_ham_prob"
        self.spam_ham_body_prob_false = (
            0,
            0,
        )  # données pour mocker "return_value" du "spam_ham_body_prob"
        self.subject_spam_ham_prob_false = (
            0,
            0,
        )  # données pour mocker "return_value" du "subject_spam_ham_prob"
        self.vocab = (
            {
                "spam_sub": {
                    "best": 1 / 4,
                    "online": 1 / 4,
                    "medicine": 1 / 4,
                    "here": 1 / 4,
                },
                "ham_sub": {
                    "netco": 1 / 3,
                    "due": 1 / 3,
                    "diligence": 1 / 3
                },
                "spam_body": {
                    "prescription": 1 / 5,
                    "drug": 1 / 5,
                    "simple": 1 / 5,
                    "quick": 1 / 5,
                    "affordable": 1 / 5
                },
                "ham_body": {
                    "big": 1 / 6,
                    "pig": 1 / 6,
                    "met": 1 / 6,
                    "today": 1 / 6,
                    "overview": 1 / 6,
                    "operations": 1 / 6
                }
            }
        )  # vocabulaire avec les valeurs de la probabilité pour mocker "return_value" du "load_dict"
        self.spam_ham_body_prob_expected = 0, 0  # valeurs de la probabilité attendus
        self.subject_spam_ham_prob_expected = 0, 0  # valeurs de la probabilité attendus

    def tearDown(self):
        pass

    ### Tests pour l'Active clause coverage

    def test_is_spam_function_one_returns_true_acc_test_one(self):
        return_val = EmailAnalyzer.is_spam_function_one(True, 35, 65, 60)
        self.assertTrue(return_val)

    def test_is_spam_function_one_returns_false_acc_test_two(self):
        return_val = EmailAnalyzer.is_spam_function_one(True, 35, 65, 80)
        self.assertFalse(return_val)

    def test_is_spam_function_one_returns_true_acc_test_three(self):
        return_val = EmailAnalyzer.is_spam_function_one(False, 20, 50, 50)
        self.assertTrue(return_val)

    def test_is_spam_function_one_returns_false_acc_test_four(self):
        return_val = EmailAnalyzer.is_spam_function_one(False, 35, 50, 50)
        self.assertFalse(return_val)

    def test_is_spam_function_one_returns_true_acc_test_five(self):
        return_val = EmailAnalyzer.is_spam_function_one(True, 20, 50, 80)
        self.assertTrue(return_val)

    def test_is_spam_function_one_returns_false_acc_test_six(self):
        return_val = EmailAnalyzer.is_spam_function_one(True, 20, 70, 80)
        self.assertFalse(return_val)

    def test_is_spam_function_one_returns_true_acc_test_seven(self):
        return_val = EmailAnalyzer.is_spam_function_one(False, 20, 70, 50)
        self.assertTrue(return_val)

    def test_is_spam_function_one_returns_false_acc_test_eight(self):
        return_val = EmailAnalyzer.is_spam_function_one(False, 20, 70, 80)
        self.assertFalse(return_val)

    def test_is_spam_function_one_returns_false_acc_test_nine(self):
        return_val = EmailAnalyzer.is_spam_function_one(False, 20, 76, 50)
        self.assertFalse(return_val)

    # Tests pour l'Inactive Clause Coverage
    # P est clause majeure:
    def test_is_spam_function_one_returns_false_icc_test_one(self):
        return_val = EmailAnalyzer.is_spam_function_one(True, 35, 65, 80)
        self.assertFalse(return_val)

    def test_is_spam_function_one_returns_false_icc_test_two(self):
        return_val = EmailAnalyzer.is_spam_function_one(False, 35, 65, 80)
        self.assertFalse(return_val)

    def test_is_spam_function_one_returns_true_icc_test_three(self):
        return_val = EmailAnalyzer.is_spam_function_one(False, 15, 65, 50)
        self.assertTrue(return_val)

    def test_is_spam_function_one_returns_false_icc_test_four(self):
        return_val = EmailAnalyzer.is_spam_function_one(True, 15, 65, 50)
        self.assertTrue(return_val)

    # H est clause majeure
    def test_is_spam_function_one_returns_true_icc_test_five(self):
        return_val = EmailAnalyzer.is_spam_function_one(True, 20, 65, 60)
        self.assertTrue(return_val)

    def test_is_spam_function_one_returns_true_icc_test_six(self):
        return_val = EmailAnalyzer.is_spam_function_one(True, 15, 65, 60)
        self.assertTrue(return_val)

    def test_is_spam_function_one_returns_false_icc_test_seven(self):
        return_val = EmailAnalyzer.is_spam_function_one(False, 40, 65, 80)
        self.assertFalse(return_val)

    def test_is_spam_function_one_returns_false_icc_test_eight(self):
        return_val = EmailAnalyzer.is_spam_function_one(False, 15, 65, 80)
        self.assertFalse(return_val)

    # T1 clause majeure
    def test_is_spam_function_one_returns_false_icc_test_nine(self):
        return_val = EmailAnalyzer.is_spam_function_one(False, 35, 65, 80)
        self.assertFalse(return_val)

    def test_is_spam_function_one_returns_false_icc_test_ten(self):
        return_val = EmailAnalyzer.is_spam_function_one(False, 35, 50, 80)
        self.assertFalse(return_val)

    def test_is_spam_function_one_returns_true_icc_test_eleven(self):
        return_val = EmailAnalyzer.is_spam_function_one(False, 20, 50, 60)
        self.assertTrue(return_val)

    def test_is_spam_function_one_returns_false_icc_test_twelve(self):
        return_val = EmailAnalyzer.is_spam_function_one(False, 20, 65, 60)
        self.assertTrue(return_val)

    # T2 clause majeure
    def test_is_spam_function_one_returns_false_icc_test_thirteen(self):
        return_val = EmailAnalyzer.is_spam_function_one(False, 35, 65, 80)
        self.assertFalse(return_val)

    def test_is_spam_function_one_returns_false_icc_test_fourteen(self):
        return_val = EmailAnalyzer.is_spam_function_one(False, 35, 65, 50)
        self.assertFalse(return_val)

    def test_is_spam_function_one_returns_true_icc_test_fifteen(self):
        return_val = EmailAnalyzer.is_spam_function_one(True, 20, 50, 80)
        self.assertTrue(return_val)

    def test_is_spam_function_one_returns_false_icc_test_sixteen(self):
        return_val = EmailAnalyzer.is_spam_function_one(True, 20, 50, 50)
        self.assertTrue(return_val)

    # T3 clause majeure
    def test_is_spam_function_one_returns_false_icc_test_seventeen(self):
        return_val = EmailAnalyzer.is_spam_function_one(False, 35, 65, 80)
        self.assertFalse(return_val)

    def test_is_spam_function_one_returns_false_icc_test_eighteen(self):
        return_val = EmailAnalyzer.is_spam_function_one(False, 35, 80, 80)
        self.assertFalse(return_val)

    def test_is_spam_function_one_returns_true_icc_test_nineteen(self):
        return_val = EmailAnalyzer.is_spam_function_one(True, 35, 65, 60)
        self.assertTrue(return_val)

    def test_is_spam_function_one_returns_false_icc_test_twenty(self):
        return_val = EmailAnalyzer.is_spam_function_one(True, 35, 80, 60)
        self.assertTrue(return_val)

    # Critère IC
    def test_is_spam_function_two_returns_false_ic_test_one(self):
        return_val = EmailAnalyzer.is_spam_function_two(False, 80, 80)
        self.assertFalse(return_val)

    def test_is_spam_function_two_returns_true_ic_test_two(self):
        return_val = EmailAnalyzer.is_spam_function_two(True, 20, 65)
        self.assertTrue(return_val)

    # Critère PIC
    def test_is_spam_function_two_returns_false_pic_test_one(self):
        return_val = EmailAnalyzer.is_spam_function_two(False, 80, 65)
        self.assertFalse(return_val)

    def test_is_spam_function_two_returns_false_pic_test_two(self):
        return_val = EmailAnalyzer.is_spam_function_two(False, 20, 80)
        self.assertFalse(return_val)

    def test_is_spam_function_two_returns_true_pic_test_three(self):
        return_val = EmailAnalyzer.is_spam_function_two(True, 80, 65)
        self.assertTrue(return_val)

    def test_is_spam_function_two_returns_true_pic_test_four(self):
        return_val = EmailAnalyzer.is_spam_function_two(False, 20, 50)
        self.assertTrue(return_val)

    # Critère VNS
    def test_is_spam_function_two_returns_true_vns_test_one(self):
        return_val = EmailAnalyzer.is_spam_function_two(True, 80, 80)
        self.assertTrue(return_val)

    def test_is_spam_function_two_returns_true_vns_test_two(self):
        return_val = EmailAnalyzer.is_spam_function_two(True, 80, 65)
        self.assertTrue(return_val)

    def test_is_spam_function_two_returns_true_vns_test_three(self):
        return_val = EmailAnalyzer.is_spam_function_two(True, 20, 80)
        self.assertTrue(return_val)

    def test_is_spam_function_two_returns_true_vns_test_four(self):
        return_val = EmailAnalyzer.is_spam_function_two(False, 20, 65)
        self.assertTrue(return_val)

    def test_is_spam_function_two_returns_false_vns_test_five(self):
        return_val = EmailAnalyzer.is_spam_function_two(False, 20, 80)
        self.assertFalse(return_val)

    def test_is_spam_function_two_returns_false_vns_test_six(self):
        return_val = EmailAnalyzer.is_spam_function_two(False, 80, 60)
        self.assertFalse(return_val)

    def test_is_spam_function_two_returns_false_vns_test_seven(self):
        return_val = EmailAnalyzer.is_spam_function_two(False, 80, 80)
        self.assertFalse(return_val)

    @patch("email_analyzer.EmailAnalyzer.subject_spam_ham_prob")
    @patch("email_analyzer.EmailAnalyzer.spam_ham_body_prob")
    def test_is_spam_Returns_True_if_spam_prob_is_higher(
            self, mock_subject_spam_ham_prob, mock_spam_ham_body_prob
    ):
        """
        Il faut mocker les fonctions "spam_ham_body_prob" et "subject_spam_ham_prob".
        La sortie de la fonction doit être True si probabilité spam > probabilité ham
        (ces probabilites devron etre calcule selon l'enonce dans le TP1 )
        """
        mock_subject_spam_ham_prob.return_value = (10, 0)
        mock_spam_ham_body_prob.return_value = (10, 0)
        is_spam_return_val = self.analyzer.is_spam("dummySubject", "dummyBody")
        self.assertTrue(is_spam_return_val)

    @patch("email_analyzer.EmailAnalyzer.spam_ham_body_prob")
    @patch("email_analyzer.EmailAnalyzer.subject_spam_ham_prob")
    def test_is_spam_Returns_False_if_spam_prob_is_lower(
            self, mock_subject_spam_ham_prob, mock_spam_ham_body_prob
    ):
        """
        Il faut mocker les fonctions "spam_ham_body_prob" et "subject_spam_ham_prob".
        La sortie de la fonction doit être False si probabilité spam  probabilité ham
        (ces probabilites devron etre calcule selon l'enonce dans le TP1 )
        """
        mock_subject_spam_ham_prob.return_value = (0, 10)
        mock_spam_ham_body_prob.return_value = (0, 10)
        is_spam_return_val = self.analyzer.is_spam("dummySubject", "dummyBody")
        self.assertFalse(is_spam_return_val)

    @patch("email_analyzer.EmailAnalyzer.clean_text")
    @patch("email_analyzer.EmailAnalyzer.calculate_ham_divided_by_email")
    @patch("email_analyzer.EmailAnalyzer.calculate_spam_divided_by_email")
    @patch("email_analyzer.EmailAnalyzer.load_dict")
    def test_spam_ham_body_prob_Returns_expected_probability(self, mock_load_dict, mock_calculate_spam_divided_by_email,
                                                             mock_calculate_ham_divided_by_email, mock_clean_text):
        """
        Il faut mocker la fonction "load_dict"
        Il faut vérifier que probabilité est calculée correctement donné le "body" à l'entrée
        (ces probabilites devront etre calcule selon l'enonce dans le TP1 )
        """
        mock_load_dict.return_value = self.vocab
        mock_calculate_ham_divided_by_email.return_value = 1 / 2
        mock_calculate_spam_divided_by_email.return_value = 1 / 2
        mock_clean_text.return_value = self.clean_body
        expected_return_value = ((0.5 * 0.2 * 0.2), (1 / 2 * 1 / 6 * 1 / 6))
        self.assertEqual(self.analyzer.spam_ham_body_prob(self.body), expected_return_value)

    @patch("email_analyzer.EmailAnalyzer.clean_text")
    @patch("email_analyzer.EmailAnalyzer.calculate_ham_divided_by_email")
    @patch("email_analyzer.EmailAnalyzer.calculate_spam_divided_by_email")
    @patch("email_analyzer.EmailAnalyzer.load_dict")
    def test_subject_spam_ham_prob_Returns_expected_probability(self, mock_load_dict,
                                                                mock_calculate_spam_divided_by_email,
                                                                mock_calculate_ham_divided_by_email, mock_clean_text):
        """
        Il faut mocker la fonction "load_dict"
        il faut vérifier que probabilité est calculée correctement donné le "sujet" a l'entrée
        (ces probabilites devron etre calcule selon l'enonce dans le TP1 )
        """
        mock_load_dict.return_value = self.vocab
        mock_calculate_ham_divided_by_email.return_value = 1 / 2
        mock_calculate_spam_divided_by_email.return_value = 1 / 2
        mock_clean_text.return_value = self.clean_subject
        expected_return_value = ((0.5 * 0.25), (0.5*1/3))
        self.assertEqual(self.analyzer.subject_spam_ham_prob(self.subject), expected_return_value)
Exemplo n.º 18
0
 def test_is_spam_function_two_returns_false_vns_test_seven(self):
     """VNS case seven: is_spam_function_two(False, 80, 80) must be falsy."""
     self.assertFalse(EmailAnalyzer.is_spam_function_two(False, 80, 80))
Exemplo n.º 19
0
 def __init__(self):
     """Remember the training-set file name and create the CRUD and analyzer helpers."""
     # Name of the JSON file this object works on.
     self.email_file = "train_set.json"
     # Collaborators, instantiated in the same order as before.
     self.crud = CRUD()
     self.e_mail = EmailAnalyzer()
Exemplo n.º 20
0
# Collect the path of every non-hidden file found under rootdir.
for dirpath, subdirs, files in os.walk(rootdir):
    for file in files:
        if not file.startswith('.'):  # ignore hidden files
            email_filenames.append(os.path.join(dirpath, file))

# Parse each collected file and print the normalized result, to check that
# the number of elements is the same.
for filename in email_filenames:
    with open(filename, "r") as f:
        data = f.read()

    email_df = json_normalize(json.loads(EmailAnalyzer().parse(data)))
    print(email_df)

# Iterate through the files again and populate one larger mapping of
# file path -> parsed content.
dic_tmp = {}

for key_name in email_filenames:
    with open(key_name, "r") as f:
        data2 = f.read()
    data2_parsed = EmailAnalyzer().parse(data2)
    dic_tmp[key_name] = data2_parsed
Exemplo n.º 21
0
 def setUp(self):
     """Create the analyzer plus one spam-like and one ham-like fixture set."""
     self.analyzer = EmailAnalyzer()

     # --- e-mail used for the "spam" scenario ---
     self.subject_true = " no more outdated software ! upgrade !"
     self.body_true = "we get you the best deal ! skip the retail box and save !\namazing special # 1 :\nadobe - photoshop 7 premiere 7 illustrator 10 = only $ 120\namazing special # 2 :\nwindows xp professional + microsoft office xp professional = only $ 80\namazing special # 3 :\nadobe photoshop cs + adobe illustrator cs + adobe indesign cs\namazing special # 4 :\n"
     # Token lists intended as the mocked "return_value" of "clean_text".
     self.clean_subject_true = ['more', 'oudat', 'software', 'upgrade']
     self.clean_body_true = [
         'get', 'best', 'deal', 'skip', 'retail', 'box', 'sav', 'amaz',
         'special', 'adobe', 'photoshop', 'premiere', 'illustrator',
         'only', 'windows', 'xp', 'professional', 'microsoft', 'office',
         'cs', 'indesign',
     ]

     # --- e-mail used for the "ham" scenario ---
     self.subject_false = "re :"
     self.body_false = "we are using it for other things . mary joyce and robert have discussed with mcmahon and bowen .\n- - - - - original message - - - - -\nfrom : kitchen louise\nsent : monday december 10 2001 8 : 26 am\nto : oxley david\nsubject :\nwhat happens to the money in wachovia ?\nlouise kitchen\nchief operating \n"
     # Token lists intended as the mocked "return_value" of "clean_text".
     self.clean_subject_false = ['re']
     self.clean_body_false = [
         'us', 'other', 'thing', 'mary', 'joyce', 'robert', 'discuss',
         'mcmahon', 'bowen', 'original', 'message', 'kitchen', 'louise',
         'sent', 'monday', "december", 'oxley', 'david', 'subject',
         'happen', 'money', 'wachovia',
     ]

     # Pairs intended as the mocked "return_value" of "spam_ham_body_prob"
     # and "subject_spam_ham_prob".
     self.spam_ham_body_prob_true = (1, 1 / 6)
     self.subject_spam_ham_prob_true = (2 / 3, 1 / 6)
     self.spam_ham_body_prob_false = (1 / 4, 2 / 6)
     self.subject_spam_ham_prob_false = (0, 1 / 2)

     # Vocabulary with per-word probabilities, intended as the mocked
     # "return_value" of "load_dict".
     self.vocab = {
         "p_sub_spam": dict.fromkeys(("upgrade", "software"), 1 / 3),
         "p_sub_ham": {"re": 1 / 2, "annoucement": 1 / 6, "more": 1 / 6},
         "p_body_spam": dict.fromkeys(
             ("best", "deal", "skip", "special", "money"), 1 / 4),
         "p_body_ham": dict.fromkeys(
             ("today", "professional", "meet", "discuss", "sent"), 1 / 6),
     }

     # Expected probability values:
     # (0.5925*1/(256*pow(6,17))), (0.4075*1/pow(6,21))
     self.spam_ham_body_prob_expected = (1.3673419333309543e-16,
                                         1.8575963755415577e-17)
     # Expected probability values:
     # (0.5925*1/81, 0.4075*1/6*1/4*1/4*1/4)
     self.subject_spam_ham_prob_expected = (0.007314814814814815,
                                            0.0010611979166666665)
Exemplo n.º 22
0
class RENEGE:
    """Spam filter built on the vocabulary JSON file and on the CRUD and
    EmailAnalyzer classes."""

    def __init__(self):
        """Set the e-mail file name and create the CRUD / EmailAnalyzer helpers."""
        self.email_file = "800-mails.json"
        self.crud = CRUD()
        self.e_mail = EmailAnalyzer()

    def calculate_user_trust(self, user_id):
        """Compute the trust score of a user, clamped to the range [0, 100].

        Two partial scores are combined:

        * ``trust1`` = (last_seen * HamN) / (first_seen * (HamN + SpamN))
        * ``trust2`` = mean ``Trust`` value of the groups the user belongs to

        The result is the average of both, except that ``trust2`` alone is
        used when it is below 50, and the result is forced to 100 when
        ``trust1`` exceeds 100 (the latter rule is applied last and wins).

        Args:
            user_id: identifier passed to ``CRUD.get_user_data``.

        Returns:
            The trust value, never below 0 nor above 100.

        Raises:
            ZeroDivisionError: if the user has no groups, if ``HamN + SpamN``
                is zero, or if the first-seen date is zero.
        """
        # Read everything needed from the user record.
        date_of_first_seen_message = self.crud.get_user_data(user_id, "Date_of_first_seen_message")
        date_of_last_seen_message = self.crud.get_user_data(user_id, "Date_of_last_seen_message")
        n_ham = self.crud.get_user_data(user_id, "HamN")
        n_spam = self.crud.get_user_data(user_id, "SpamN")
        groups = self.crud.get_user_data(user_id, "Groups")

        # Sum of the trust values of all the groups of the user.
        sum_trust = sum(
            self.crud.get_group_data(self.crud.get_group_id(group), 'Trust')
            for group in groups
        )

        trust1 = (date_of_last_seen_message * n_ham) / (date_of_first_seen_message * (n_ham + n_spam))
        trust2 = sum_trust / len(groups)

        trust = (trust1 + trust2) / 2

        # Overrides; the second one is checked last, so it takes precedence.
        if trust2 < 50:
            trust = trust2
        if trust1 > 100:
            trust = 100

        # Before returning, make sure the value lies between 0 and 100.
        if trust < 0:
            trust = 0
        elif trust > 100:
            trust = 100

        return trust

    def classify_emails(self, calculation_mode, is_log_est, is_log_combo):
        """Start the analysis of the e-mails returned by ``self.get_email()``.

        Args:
            calculation_mode: forwarded to ``process_email``.
            is_log_est: forwarded to ``process_email``.
            is_log_combo: forwarded to ``process_email``.

        Returns:
            True when every e-mail was processed.

        Raises:
            Exception: when loading or processing fails; the original error
                is attached as ``__cause__`` so the root cause is not lost.
        """
        try:
            self.process_email(self.get_email(), calculation_mode, is_log_est, is_log_combo)
        except Exception as err:
            raise Exception("e-mail classification failed") from err
        return True

    def process_email(self, new_emails, calculation_mode, is_log_est, is_log_combo):
        """Analyse every e-mail of ``new_emails["dataset"]``.

        For each message the sender is added (or updated) in the user store,
        the message is classified according to ``calculation_mode`` and the
        sender is updated again with the resulting spam flag.

        Args:
            new_emails: mapping with a ``"dataset"`` entry; each item holds a
                ``"mail"`` mapping with the keys ``From``, ``Date``, ``Spam``,
                ``Subject`` and ``Body``.
            calculation_mode: 1 -> ``is_spam_function_one``,
                2 -> ``is_spam_function_two``, 0 -> ``is_spam``. Any other
                value keeps the ``Spam`` flag found in the message.
            is_log_est: forwarded to ``EmailAnalyzer.is_spam`` (mode 0 only).
            is_log_combo: forwarded to ``EmailAnalyzer.is_spam`` (mode 0 only).

        Returns:
            True once all e-mails have been handled; errors propagate as
            exceptions.
        """
        for email in new_emails["dataset"]:
            mail = email['mail']
            email_adr = mail['From']
            # Named msg_date so it does not shadow the datetime ``date`` class.
            msg_date = mail['Date']
            spam = mail['Spam'] == 'true'
            subject = mail['Subject']
            body = mail['Body']

            user_id = self.crud.get_user_id(email_adr)
            if user_id:
                self.update_user_info(email_adr, msg_date, spam)
            else:
                self.add_user_info(email_adr, msg_date)
                # The user was just created: fetch its id so that the lookups
                # below do not run with the stale "not found" value.
                user_id = self.crud.get_user_id(email_adr)

            # Trust of the user.
            trust = self.crud.get_user_data(user_id, "Trust")

            # Average group trust.
            # NOTE(review): this compares a group's member list with the
            # user's group list and divides by the number of *all* groups;
            # confirm this is the intended rule.
            user_group = self.crud.get_user_data(user_id, "Groups")
            sum_trust = 0
            groups = self.crud.read_groups_file()
            for group in groups:
                if group['List_of_members'] == user_group:
                    sum_trust += group['Trust']
            avg_group_trust = sum_trust / len(groups)

            # Number of days between the first and the last seen message.
            user_activity = self.substract_dates(
                self.crud.get_user_data(user_id, "Date_of_last_seen_message"),
                self.crud.get_user_data(user_id, "Date_of_first_seen_message"),
            )

            if calculation_mode == 1:
                spam = self.e_mail.is_spam_function_one(spam, user_activity, trust, avg_group_trust)
            elif calculation_mode == 2:
                spam = self.e_mail.is_spam_function_two(spam, trust, avg_group_trust)
            elif calculation_mode == 0:
                spam = self.e_mail.is_spam(subject, body, is_log_est, is_log_combo, 0)

            # Update the user with the new spam value.
            self.update_user_info(email_adr, msg_date, spam)

        return True

    def substract_dates(self, last_seen_msg, first_seen_msg):
        """Return the number of days from ``first_seen_msg`` to ``last_seen_msg``.

        Both arguments are converted with ``str()`` and parsed as
        ``year-month-day`` (fields separated by ``-``).

        Args:
            last_seen_msg: later date, e.g. ``"2021-03-10"``.
            first_seen_msg: earlier date, e.g. ``"2021-03-01"``.

        Returns:
            The signed difference in whole days (0 when both dates are equal).
        """
        # Local import keeps this method independent of names rebound elsewhere.
        from datetime import date

        def _to_date(value):
            # Parse "Y-M-D" into a date object.
            parts = str(value).split("-")
            return date(int(parts[0]), int(parts[1]), int(parts[2]))

        # timedelta.days is used instead of parsing str(timedelta), which has
        # no "N days" prefix when the difference is zero.
        return (_to_date(last_seen_msg) - _to_date(first_seen_msg)).days
Exemplo n.º 23
0
 def test_is_spam_function_one_returns_false_acc_test_two(self):
     """is_spam_function_one(True, 35, 65, 80) must give a falsy result."""
     self.assertFalse(EmailAnalyzer.is_spam_function_one(True, 35, 65, 80))
Exemplo n.º 24
0
 def test_is_spam_function_one_returns_true_icc_test_three(self):
     """is_spam_function_one(False, 15, 65, 50) must give a truthy result."""
     self.assertTrue(EmailAnalyzer.is_spam_function_one(False, 15, 65, 50))
Exemplo n.º 25
0
 def test_is_spam_function_one_returns_false_acc_test_four(self):
     """is_spam_function_one(False, 35, 50, 50) must give a falsy result."""
     self.assertFalse(EmailAnalyzer.is_spam_function_one(False, 35, 50, 50))
Exemplo n.º 26
0
 def test_is_spam_function_one_returns_false_icc_test_four(self):
     """is_spam_function_one(True, 15, 65, 50) must give a truthy result."""
     # NOTE(review): the method name says "returns_false" but the assertion
     # expects a truthy value - confirm which of the two is intended.
     self.assertTrue(EmailAnalyzer.is_spam_function_one(True, 15, 65, 50))
Exemplo n.º 27
0
 def test_is_spam_function_one_returns_true_acc_test_seven(self):
     """is_spam_function_one(False, 20, 70, 50) must give a truthy result."""
     self.assertTrue(EmailAnalyzer.is_spam_function_one(False, 20, 70, 50))
Exemplo n.º 28
0
 def test_is_spam_function_one_returns_false_icc_test_twelve(self):
     """is_spam_function_one(False, 20, 65, 60) must give a truthy result."""
     # NOTE(review): the method name says "returns_false" but the assertion
     # expects a truthy value - confirm which of the two is intended.
     self.assertTrue(EmailAnalyzer.is_spam_function_one(False, 20, 65, 60))
Exemplo n.º 29
0
 def __init__(self):
     """Store the e-mail file name and create the CRUD and EmailAnalyzer helpers."""
     # Name of the JSON file holding the e-mails
     # (presumably resolved by the e-mail loading code - confirm).
     self.email_file = "800-mails.json"
     # Helper objects kept on the instance for later use.
     self.crud = CRUD()
     self.e_mail = EmailAnalyzer()
Exemplo n.º 30
0
 def test_is_spam_function_one_returns_false_icc_test_sixteen(self):
     """is_spam_function_one(True, 20, 50, 50) must give a truthy result."""
     # NOTE(review): the method name says "returns_false" but the assertion
     # expects a truthy value - confirm which of the two is intended.
     self.assertTrue(EmailAnalyzer.is_spam_function_one(True, 20, 50, 50))