class EmailAnalyzer:
    """Classify e-mails as spam or non-spam (ham)."""

    def __init__(self):
        # Path of the JSON vocabulary file read by load_dict().
        self.vocab = "vocabulary.json"
        self.cleaning = TextCleaning()
        self.words = VocabularyCreator()

    def is_spam(self, subject_orig, body_orig):
        """Decide whether an e-mail is spam.

        The spam and ham scores of the subject and of the body are averaged
        and the two averages are compared.

        Args:
            subject_orig: raw subject of the e-mail.
            body_orig: raw body of the e-mail.

        Returns:
            True if the averaged spam score is strictly greater than the
            averaged ham score, False otherwise.
        """
        spam_subject, ham_subject = self.subject_spam_ham_prob(subject_orig)
        spam_body, ham_body = self.spam_ham_body_prob(body_orig)

        # Combine subject and body by taking the mean of both scores.
        spam_score = (spam_subject + spam_body) / 2
        ham_score = (ham_subject + ham_body) / 2
        return spam_score > ham_score

    def spam_ham_body_prob(self, body):
        """Return the (spam, ham) scores of an e-mail body.

        Uses the 'spam_body' and 'ham_body' sections of the vocabulary.
        """
        return self._spam_ham_prob(body, 'spam_body', 'ham_body')

    def subject_spam_ham_prob(self, subject):
        """Return the (spam, ham) scores of an e-mail subject.

        Uses the 'spam_sub' and 'ham_sub' sections of the vocabulary.
        """
        return self._spam_ham_prob(subject, 'spam_sub', 'ham_sub')

    def _spam_ham_prob(self, text, spam_key, ham_key):
        """Compute the (spam, ham) scores of a text against two vocabulary sections.

        Each score starts at the class prior and is multiplied by the
        vocabulary probability of every cleaned word found in the
        corresponding section.

        Args:
            text: raw text (subject or body) to score.
            spam_key: vocabulary key holding the spam word probabilities.
            ham_key: vocabulary key holding the ham word probabilities.

        Returns:
            A (spam_score, ham_score) tuple.
        """
        vocabulary = self.load_dict()

        # Priors: number of spam (resp. ham) e-mails over the total count.
        prior_spam = self.calculate_spam_divided_by_email()
        prior_ham = self.calculate_ham_divided_by_email()

        words = self.clean_text(text)

        # Build the lookup tables once instead of once per word.
        spam_probs = dict(vocabulary[spam_key])
        ham_probs = dict(vocabulary[ham_key])

        spam_score = prior_spam
        ham_score = prior_ham
        for word in words:
            if word in spam_probs:
                spam_score *= spam_probs[word]
            if word in ham_probs:
                ham_score *= ham_probs[word]

        # A score still equal to its prior (the case when no word matched)
        # is reset to 0.
        # NOTE(review): because of the `elif`, the ham score is only examined
        # when the spam score was not reset - confirm this asymmetry is
        # intended.
        if prior_spam == spam_score:
            spam_score = 0
        elif prior_ham == ham_score:
            ham_score = 0

        return spam_score, ham_score

    def calculate_spam_divided_by_email(self):  # pragma: no cover
        """Return count_spam() / count_emails() from the vocabulary creator."""
        return self.words.count_spam() / self.words.count_emails()

    def calculate_ham_divided_by_email(self):  # pragma: no cover
        """Return count_ham() / count_emails() from the vocabulary creator."""
        return self.words.count_ham() / self.words.count_emails()

    def load_dict(self):  # pragma: no cover
        """Load and return the vocabulary stored in the JSON file self.vocab."""
        # Explicit encoding so the result does not depend on the platform
        # default.
        with open(self.vocab, encoding="utf-8") as file:
            return json.load(file)

    def clean_text(self, text):  # pragma: no cover
        """Delegate text cleaning to the TextCleaning helper."""
        return self.cleaning.clean_text(text)
class EmailAnalyzer:
    """Classify e-mails as spam or non-spam (ham)."""

    def __init__(self):
        # Path of the JSON vocabulary file read by load_dict().
        self.vocab = "vocabulary.json"
        self.cleaning = TextCleaning()
        self.words = VocabularyCreator()

    @staticmethod
    def is_spam_function_one(is_msg_spam, user_historic_in_days, user_trust, user_group_trust):
        """Combine the message verdict with user history and trust levels.

        Evaluates ``P and (H and T1 or T2) or H and T2 and not T3`` where
        P is ``is_msg_spam``, H is ``user_historic_in_days < 30``,
        T1 is ``user_trust < 60``, T2 is ``user_group_trust < 70`` and
        T3 is ``user_trust > 75``.
        """
        p = is_msg_spam
        h = user_historic_in_days < 30
        t1 = user_trust < 60
        t2 = user_group_trust < 70
        t3 = user_trust > 75
        # Parentheses only make the operator precedence explicit.
        return (p and ((h and t1) or t2)) or (h and t2 and not t3)

    @staticmethod
    def is_spam_function_two(is_msg_spam, user_trust, user_group_trust):
        """Combine the message verdict with the trust levels only.

        Evaluates ``P or not T3 and T2`` where P is ``is_msg_spam``,
        T2 is ``user_group_trust < 70`` and T3 is ``user_trust > 75``.
        """
        p = is_msg_spam
        t2 = user_group_trust < 70
        t3 = user_trust > 75
        return p or ((not t3) and t2)

    def is_spam(self, subject_orig, body_orig, isLogEstimation, isLogCombination, k):
        """Decide whether an e-mail is spam from its subject and body.

        Args:
            subject_orig: raw subject of the e-mail.
            body_orig: raw body of the e-mail.
            isLogEstimation: when truthy, the estimates are built from the
                log10-based scores; otherwise from the product-based scores.
            isLogCombination: when truthy, log10 is applied to every
                strictly positive estimate before the combination.
            k: weight of the subject in the combination (the body gets
                ``1 - k``).

        Returns:
            True if the combined spam value is strictly greater than the
            combined ham value, False otherwise.
        """
        # Priors: number of spam (resp. ham) e-mails over the total count.
        pSpam = self.calculate_spam_divided_by_email()
        pHam = self.calculate_ham_divided_by_email()

        if isLogEstimation:
            spam_subject, ham_subject = self.subject_spam_ham_log_prob(subject_orig, pSpam, pHam)
            # The body must be scored with the body vocabulary, not with the
            # subject function.
            spam_body, ham_body = self.spam_ham_body_log_prob(body_orig, pSpam, pHam)
            log_prior_spam = math.log10(pSpam)
            log_prior_ham = math.log10(pHam)
            est_spam_subject = log_prior_spam + spam_subject
            est_ham_subject = log_prior_ham + ham_subject
            est_spam_body = log_prior_spam + spam_body
            est_ham_body = log_prior_ham + ham_body
        else:
            spam_subject, ham_subject = self.subject_spam_ham_prob(subject_orig)
            spam_body, ham_body = self.spam_ham_body_prob(body_orig)
            # NOTE(review): the product-based scores already start from the
            # prior, so the prior is applied twice here - confirm intended.
            est_spam_subject = pSpam * spam_subject
            est_ham_subject = pHam * ham_subject
            est_spam_body = pSpam * spam_body
            est_ham_body = pHam * ham_body

        if isLogCombination:
            # log10 is only defined for strictly positive values, so any
            # estimate <= 0 is left untouched.
            # NOTE(review): an estimate left at 0 then compares greater than
            # the log10 of any probability below 1 - confirm intended.
            if est_spam_subject > 0:
                est_spam_subject = math.log10(est_spam_subject)
            if est_ham_subject > 0:
                est_ham_subject = math.log10(est_ham_subject)
            if est_spam_body > 0:
                est_spam_body = math.log10(est_spam_body)
            if est_ham_body > 0:
                est_ham_body = math.log10(est_ham_body)

        # Bring k back towards [0, 1]: a value above 1 is divided by
        # 10 ** len(str(k)); a negative value is clamped to 0.
        # NOTE(review): len(str(k)) also counts the decimal point and the
        # fractional digits of a float - confirm this scaling is intended.
        if k > 1:
            k = k / math.pow(10, len(str(k)))
        elif k < 0:
            k = 0

        # The combination formula is the same for both options; only the
        # estimates differ (log10 applied or not).
        combined_spam = k * est_spam_subject + (1 - k) * est_spam_body
        combined_ham = k * est_ham_subject + (1 - k) * est_ham_body
        return combined_spam > combined_ham

    def subject_spam_ham_log_prob(self, subject, pSpam, pHam):
        """Return the log10-based (spam, ham) scores of an e-mail subject.

        Uses the 'spam_sub' and 'ham_sub' sections of the vocabulary.
        """
        return self._spam_ham_log_prob(subject, 'spam_sub', 'ham_sub', pSpam, pHam)

    def spam_ham_body_log_prob(self, body, pSpam, pHam):
        """Return the log10-based (spam, ham) scores of an e-mail body.

        Uses the 'spam_body' and 'ham_body' sections of the vocabulary.
        """
        return self._spam_ham_log_prob(body, 'spam_body', 'ham_body', pSpam, pHam)

    def spam_ham_body_prob(self, body):
        """Return the product-based (spam, ham) scores of an e-mail body.

        Uses the 'spam_body' and 'ham_body' sections of the vocabulary.
        """
        return self._spam_ham_prob(body, 'spam_body', 'ham_body')

    def subject_spam_ham_prob(self, subject):
        """Return the product-based (spam, ham) scores of an e-mail subject.

        Uses the 'spam_sub' and 'ham_sub' sections of the vocabulary.
        """
        return self._spam_ham_prob(subject, 'spam_sub', 'ham_sub')

    def _spam_ham_log_prob(self, text, spam_key, ham_key, pSpam, pHam):
        """Compute log10-based (spam, ham) scores against two vocabulary sections.

        Each accumulator starts at the given prior, the vocabulary
        probability of every matching cleaned word is added to it, and
        log10 of the total is returned.

        NOTE(review): this is the log of a sum, not a sum of logs; the
        formula is kept as written - confirm against the intended model.
        The former "reset to 0 when equal to the prior" test ran after
        log10 and could never be true, so it has been dropped.

        Raises:
            ValueError: if a total is not strictly positive (math.log10).
        """
        vocabulary = self.load_dict()
        words = self.clean_text(text)

        # Build the lookup tables once instead of once per word.
        spam_probs = dict(vocabulary[spam_key])
        ham_probs = dict(vocabulary[ham_key])

        spam_total = pSpam
        ham_total = pHam
        for word in words:
            if word in spam_probs:
                spam_total += spam_probs[word]
            if word in ham_probs:
                ham_total += ham_probs[word]

        return math.log10(spam_total), math.log10(ham_total)

    def _spam_ham_prob(self, text, spam_key, ham_key):
        """Compute product-based (spam, ham) scores against two vocabulary sections.

        Each score starts at the class prior and is multiplied by the
        vocabulary probability of every cleaned word found in the
        corresponding section.

        Returns:
            A (spam_score, ham_score) tuple.
        """
        vocabulary = self.load_dict()

        # Priors: number of spam (resp. ham) e-mails over the total count.
        prior_spam = self.calculate_spam_divided_by_email()
        prior_ham = self.calculate_ham_divided_by_email()

        words = self.clean_text(text)

        # Build the lookup tables once instead of once per word.
        spam_probs = dict(vocabulary[spam_key])
        ham_probs = dict(vocabulary[ham_key])

        spam_score = prior_spam
        ham_score = prior_ham
        for word in words:
            if word in spam_probs:
                spam_score *= spam_probs[word]
            if word in ham_probs:
                ham_score *= ham_probs[word]

        # A score still equal to its prior (the case when no word matched)
        # is reset to 0.
        # NOTE(review): because of the `elif`, the ham score is only examined
        # when the spam score was not reset - confirm this asymmetry is
        # intended.
        if prior_spam == spam_score:
            spam_score = 0
        elif prior_ham == ham_score:
            ham_score = 0

        return spam_score, ham_score

    def calculate_spam_divided_by_email(self):  # pragma: no cover
        """Return count_spam() / count_emails() from the vocabulary creator."""
        return self.words.count_spam() / self.words.count_emails()

    def calculate_ham_divided_by_email(self):  # pragma: no cover
        """Return count_ham() / count_emails() from the vocabulary creator."""
        return self.words.count_ham() / self.words.count_emails()

    def load_dict(self):  # pragma: no cover
        """Load and return the vocabulary stored in the JSON file self.vocab."""
        # Explicit encoding so the result does not depend on the platform
        # default.
        with open(self.vocab, encoding="utf-8") as file:
            return json.load(file)

    def clean_text(self, text):  # pragma: no cover
        """Delegate text cleaning to the TextCleaning helper (second argument 0)."""
        return self.cleaning.clean_text(text, 0)
class TestVocabularyCreator(unittest.TestCase):
    """Unit tests for VocabularyCreator, with its I/O helpers mocked."""

    def setUp(self):
        # Data set used as the mocked return value of "load_dict":
        # one spam e-mail followed by one ham e-mail.
        spam_mail = {
            "Subject": " best online medicine here",
            "From": "*****@*****.**",
            "Date": "2004-11-18",
            "Body": "get any prescription drug you want !\nsimple quick and affordable !",
            "Spam": "true",
            "File": "enronds//enron3/spam/1429.2004-11-18.BG.spam.txt",
        }
        ham_mail = {
            "Subject": " netco due diligence",
            "From": "*****@*****.**",
            "Date": "2002-01-02",
            "Body": "big pig :\nmet with them today and gave an overview of operations .\n",
            "Spam": "false",
            "File": "enronds//enron3/ham/4774.2002-01-02.kitchen.ham.txt",
        }
        self.mails = {"dataset": [{"mail": spam_mail}, {"mail": ham_mail}]}

        # Word lists used as the successive mocked return values of
        # "clean_text".
        self.clean_subject_spam = ["best", "online", "medicine", "here"]
        self.clean_body_spam = ["prescription", "drug", "simple", "quick", "affordable"]
        self.clean_subject_ham = ["netco", "due", "diligence"]
        self.clean_body_ham = ["big", "pig", "met", "today", "overview", "operations"]

        # Expected vocabulary: every word of a section carries the same
        # probability, 1 / (number of words in that section).
        self.vocab_expected = {
            "spam_sub": dict.fromkeys(self.clean_subject_spam, 1 / 4),
            "ham_sub": dict.fromkeys(self.clean_subject_ham, 1 / 3),
            "spam_body": dict.fromkeys(self.clean_body_spam, 1 / 5),
            "ham_body": dict.fromkeys(self.clean_body_ham, 1 / 6),
        }

        self.vocabulary_creator = VocabularyCreator()

    def tearDown(self):
        """Nothing to clean up after a test."""

    @patch("vocabulary_creator.VocabularyCreator.load_dict")
    @patch("vocabulary_creator.VocabularyCreator.clean_text")
    @patch("vocabulary_creator.VocabularyCreator.write_data_to_vocab_file")
    def test_create_vocab_spam_Returns_vocabulary_with_correct_values(
            self, mock_write_data_to_vocab_file, mock_clean_text, mock_load_dict):
        """Check that create_vocab returns the expected probabilities.

        "load_dict" is mocked to return self.mails. "clean_text" is mocked
        with a side_effect list so that each successive call returns the
        next cleaned word list (spam subject, spam body, ham subject, ham
        body). "write_data_to_vocab_file" is mocked to return True instead
        of writing "vocabulary.json".
        """
        mock_load_dict.return_value = self.mails
        mock_clean_text.side_effect = [
            self.clean_subject_spam,
            self.clean_body_spam,
            self.clean_subject_ham,
            self.clean_body_ham,
        ]
        mock_write_data_to_vocab_file.return_value = True

        produced = self.vocabulary_creator.create_vocab()

        self.assertEqual(produced, self.vocab_expected)

    @patch("vocabulary_creator.VocabularyCreator.load_dict")
    def test_count_spam_should_return_correct_number_of_spam(self, mock_load_dict):
        """count_spam must report the single spam e-mail of the data set."""
        mock_load_dict.return_value = self.mails
        spam_total = self.vocabulary_creator.count_spam()
        self.assertEqual(spam_total, 1)

    @patch("vocabulary_creator.VocabularyCreator.load_dict")
    def test_count_emails_should_return_correct_number_of_emails(self, mock_load_dict):
        """count_emails must report both e-mails of the data set."""
        mock_load_dict.return_value = self.mails
        mail_total = self.vocabulary_creator.count_emails()
        self.assertEqual(mail_total, 2)

    @patch("vocabulary_creator.VocabularyCreator.load_dict")
    def test_count_ham_should_return_correct_number_of_hams(self, mock_load_dict):
        """count_ham must report the single ham e-mail of the data set."""
        mock_load_dict.return_value = self.mails
        ham_total = self.vocabulary_creator.count_ham()
        self.assertEqual(ham_total, 1)