示例#1
0
class EmailAnalyzer:
    """Classify e-mails as spam or non-spam (ham)."""

    def __init__(self):
        # Path of the JSON vocabulary file read by load_dict().
        self.vocab = "vocabulary.json"
        self.cleaning = TextCleaning()
        self.words = VocabularyCreator()

    def is_spam(self, subject_orig, body_orig):
        """Decide whether an e-mail is spam.

        The spam and ham scores of the subject and of the body are computed
        separately and then averaged.

        Args:
            subject_orig: raw subject text of the e-mail.
            body_orig: raw body text of the e-mail.

        Returns:
            True when the averaged spam score is strictly greater than the
            averaged ham score, False otherwise.
        """
        pSpamSubject, pHamSubject = self.subject_spam_ham_prob(subject_orig)
        pSpamBody, pHamBody = self.spam_ham_body_prob(body_orig)

        # Combine subject and body by taking the mean of both scores.
        pSpam = (pSpamSubject + pSpamBody) / 2
        pHam = (pHamSubject + pHamBody) / 2

        return pSpam > pHam

    def spam_ham_body_prob(self, body):
        """Compute the spam and ham scores of an e-mail body.

        Args:
            body: raw body text of the e-mail.

        Returns:
            A ``(spam_score, ham_score)`` tuple computed from the
            ``'spam_body'`` and ``'ham_body'`` vocabulary sections.
        """
        return self._section_spam_ham_prob(body, 'spam_body', 'ham_body')

    def subject_spam_ham_prob(self, subject):
        """Compute the spam and ham scores of an e-mail subject.

        Args:
            subject: raw subject text of the e-mail.

        Returns:
            A ``(spam_score, ham_score)`` tuple computed from the
            ``'spam_sub'`` and ``'ham_sub'`` vocabulary sections.
        """
        return self._section_spam_ham_prob(subject, 'spam_sub', 'ham_sub')

    def _section_spam_ham_prob(self, text, spam_key, ham_key):
        """Score one part of an e-mail against two vocabulary sections.

        Each score starts at the class prior (spam or ham e-mails divided by
        the total number of e-mails) and is multiplied by the vocabulary
        value of every cleaned word found in the matching section.

        Args:
            text: raw text to score.
            spam_key: vocabulary key holding the spam word values.
            ham_key: vocabulary key holding the ham word values.

        Returns:
            A ``(spam_score, ham_score)`` tuple.
        """
        vocabulary = self.load_dict()

        prior_spam = self.calculate_spam_divided_by_email()
        prior_ham = self.calculate_ham_divided_by_email()

        words = self.clean_text(text)

        # Build the lookup tables once instead of once per word.
        spam_words = dict(vocabulary[spam_key])
        ham_words = dict(vocabulary[ham_key])

        spam_score = prior_spam
        ham_score = prior_ham
        for word in words:
            if word in spam_words:
                spam_score *= spam_words[word]
            if word in ham_words:
                ham_score *= ham_words[word]

        # A score still equal to its prior is reset to 0 (presumably meaning
        # that no vocabulary word matched).
        # NOTE(review): because of the ``elif``, when both scores are
        # unchanged only the spam score is reset; kept as-is to preserve the
        # existing results - confirm whether both should be reset.
        if prior_spam == spam_score:
            spam_score = 0
        elif prior_ham == ham_score:
            ham_score = 0

        return spam_score, ham_score

    def calculate_spam_divided_by_email(self):  # pragma: no cover
        """Return the number of spam e-mails divided by the number of e-mails."""
        return self.words.count_spam() / self.words.count_emails()

    def calculate_ham_divided_by_email(self):  # pragma: no cover
        """Return the number of ham e-mails divided by the number of e-mails."""
        return self.words.count_ham() / self.words.count_emails()

    def load_dict(self):  # pragma: no cover
        """Load and return the vocabulary stored in the JSON file ``self.vocab``."""
        with open(self.vocab) as file:
            vocabulary = json.load(file)

        return vocabulary

    def clean_text(self, text):  # pragma: no cover
        """Delegate text cleaning to ``self.cleaning.clean_text``."""
        return self.cleaning.clean_text(text)
示例#2
0
class EmailAnalyzer:
    """Classify e-mails as spam or non-spam (ham)."""

    def __init__(self):
        # Path of the JSON vocabulary file read by load_dict().
        self.vocab = "vocabulary.json"
        self.cleaning = TextCleaning()
        self.words = VocabularyCreator()

    @staticmethod
    def is_spam_function_one(is_msg_spam, user_historic_in_days, user_trust, user_group_trust):
        """Combine the message verdict with user history and trust levels.

        Args:
            is_msg_spam: verdict of the message classifier.
            user_historic_in_days: user history length, compared against 30.
            user_trust: user trust level, compared against 60 and 75.
            user_group_trust: group trust level, compared against 70.

        Returns:
            The value of ``P and (H and T1 or T2) or H and T2 and not T3``.
        """
        p = is_msg_spam
        h = user_historic_in_days < 30
        t1 = user_trust < 60
        t2 = user_group_trust < 70
        t3 = user_trust > 75
        # Parentheses only spell out the default and/or precedence.
        return (p and ((h and t1) or t2)) or (h and t2 and not t3)

    @staticmethod
    def is_spam_function_two(is_msg_spam, user_trust, user_group_trust):
        """Combine the message verdict with the user and group trust levels.

        Args:
            is_msg_spam: verdict of the message classifier.
            user_trust: user trust level, compared against 75.
            user_group_trust: group trust level, compared against 70.

        Returns:
            The value of ``P or not T3 and T2``.
        """
        p = is_msg_spam
        t2 = user_group_trust < 70
        t3 = user_trust > 75
        # Parentheses only spell out the default and/or precedence.
        return p or ((not t3) and t2)

    def is_spam(self, subject_orig, body_orig, isLogEstimation, isLogCombination, k):
        """Decide whether an e-mail is spam.

        Args:
            subject_orig: raw subject text of the e-mail.
            body_orig: raw body text of the e-mail.
            isLogEstimation: when truthy, use the logarithmic estimators and
                add ``log10`` of the priors; otherwise use the product
                estimators and multiply by the priors.
            isLogCombination: when truthy, replace every strictly positive
                estimate by its ``log10`` before combining.
            k: weight of the subject estimate (the body gets ``1 - k``).
                A value above 1 is divided by ``10 ** len(str(k))`` and a
                negative value is replaced by 0.

        Returns:
            True when the combined spam estimate is strictly greater than
            the combined ham estimate, False otherwise.

        Raises:
            ValueError: from ``math.log10`` when ``isLogEstimation`` is set
                and a prior (or a summed score) is not strictly positive.
        """
        # Priors: spam (or ham) e-mails divided by the total e-mail count.
        pSpam = self.calculate_spam_divided_by_email()
        pHam = self.calculate_ham_divided_by_email()

        if isLogEstimation:
            pSpamSubject, pHamSubject = self.subject_spam_ham_log_prob(subject_orig, pSpam, pHam)
            # The body must go through the body estimator, not the subject one.
            pSpamBody, pHamBody = self.spam_ham_body_log_prob(body_orig, pSpam, pHam)
            log_prior_spam = math.log10(pSpam)
            log_prior_ham = math.log10(pHam)
            estimationpSpamSubject = log_prior_spam + pSpamSubject
            estimationpHamSubject = log_prior_ham + pHamSubject
            estimationpSpamBody = log_prior_spam + pSpamBody
            estimationpHamBody = log_prior_ham + pHamBody
        else:
            pSpamSubject, pHamSubject = self.subject_spam_ham_prob(subject_orig)
            pSpamBody, pHamBody = self.spam_ham_body_prob(body_orig)
            estimationpSpamSubject = pSpam * pSpamSubject
            estimationpHamSubject = pHam * pHamSubject
            estimationpSpamBody = pSpam * pSpamBody
            estimationpHamBody = pHam * pHamBody

        if isLogCombination:
            # log10 is only defined for strictly positive values, so any
            # other estimate is left untouched.
            if estimationpSpamSubject > 0:
                estimationpSpamSubject = math.log10(estimationpSpamSubject)
            if estimationpHamSubject > 0:
                estimationpHamSubject = math.log10(estimationpHamSubject)
            if estimationpSpamBody > 0:
                estimationpSpamBody = math.log10(estimationpSpamBody)
            if estimationpHamBody > 0:
                estimationpHamBody = math.log10(estimationpHamBody)

        # Bring k back into [0, 1]: scale down values above 1, clamp
        # negative values to 0.
        if k > 1:
            k = k / math.pow(10, len(str(k)))
        elif k < 0:
            k = 0

        # The combination formula is the same for both options; only the
        # estimates differ (log10 already applied where requested).
        combinationpSpam = k * estimationpSpamSubject + (1 - k) * estimationpSpamBody
        combinationpHam = k * estimationpHamSubject + (1 - k) * estimationpHamBody

        return combinationpSpam > combinationpHam

    def subject_spam_ham_log_prob(self, subject, pSpam, pHam):
        """Compute the logarithmic spam and ham scores of an e-mail subject.

        Args:
            subject: raw subject text of the e-mail.
            pSpam: spam prior used as the starting value of the spam sum.
            pHam: ham prior used as the starting value of the ham sum.

        Returns:
            A ``(spam_score, ham_score)`` tuple computed from the
            ``'spam_sub'`` and ``'ham_sub'`` vocabulary sections.

        Raises:
            ValueError: from ``math.log10`` when a summed score is not
                strictly positive.
        """
        return self._log_sum_scores(subject, pSpam, pHam, 'spam_sub', 'ham_sub')

    def spam_ham_body_log_prob(self, body, pSpam, pHam):
        """Compute the logarithmic spam and ham scores of an e-mail body.

        Args:
            body: raw body text of the e-mail.
            pSpam: spam prior used as the starting value of the spam sum.
            pHam: ham prior used as the starting value of the ham sum.

        Returns:
            A ``(spam_score, ham_score)`` tuple computed from the
            ``'spam_body'`` and ``'ham_body'`` vocabulary sections.

        Raises:
            ValueError: from ``math.log10`` when a summed score is not
                strictly positive.
        """
        return self._log_sum_scores(body, pSpam, pHam, 'spam_body', 'ham_body')

    def spam_ham_body_prob(self, body):
        """Compute the spam and ham scores of an e-mail body.

        Args:
            body: raw body text of the e-mail.

        Returns:
            A ``(spam_score, ham_score)`` tuple computed from the
            ``'spam_body'`` and ``'ham_body'`` vocabulary sections.
        """
        return self._product_scores(body, 'spam_body', 'ham_body')

    def subject_spam_ham_prob(self, subject):
        """Compute the spam and ham scores of an e-mail subject.

        Args:
            subject: raw subject text of the e-mail.

        Returns:
            A ``(spam_score, ham_score)`` tuple computed from the
            ``'spam_sub'`` and ``'ham_sub'`` vocabulary sections.
        """
        return self._product_scores(subject, 'spam_sub', 'ham_sub')

    def _log_sum_scores(self, text, pSpam, pHam, spam_key, ham_key):
        """Return log10 of (prior + sum of matched word values) per class."""
        vocabulary = self.load_dict()
        words = self.clean_text(text)

        # Build the lookup tables once instead of once per word.
        spam_words = dict(vocabulary[spam_key])
        ham_words = dict(vocabulary[ham_key])

        spam_total = pSpam
        ham_total = pHam
        for word in words:
            if word in spam_words:
                spam_total += spam_words[word]
            if word in ham_words:
                ham_total += ham_words[word]

        ham_score = math.log10(ham_total)
        spam_score = math.log10(spam_total)

        # NOTE(review): this compares a prior with a log10 value, so the
        # reset below is expected to fire only by numerical coincidence;
        # kept to match the existing body estimator - confirm the intent.
        if pSpam == spam_score:
            spam_score = 0
        elif pHam == ham_score:
            ham_score = 0

        return spam_score, ham_score

    def _product_scores(self, text, spam_key, ham_key):
        """Return prior * product of matched word values per class."""
        vocabulary = self.load_dict()

        prior_spam = self.calculate_spam_divided_by_email()
        prior_ham = self.calculate_ham_divided_by_email()

        words = self.clean_text(text)

        # Build the lookup tables once instead of once per word.
        spam_words = dict(vocabulary[spam_key])
        ham_words = dict(vocabulary[ham_key])

        spam_score = prior_spam
        ham_score = prior_ham
        for word in words:
            if word in spam_words:
                spam_score *= spam_words[word]
            if word in ham_words:
                ham_score *= ham_words[word]

        # A score still equal to its prior is reset to 0 (presumably meaning
        # that no vocabulary word matched).
        # NOTE(review): because of the ``elif``, when both scores are
        # unchanged only the spam score is reset; kept as-is to preserve the
        # existing results - confirm whether both should be reset.
        if prior_spam == spam_score:
            spam_score = 0
        elif prior_ham == ham_score:
            ham_score = 0

        return spam_score, ham_score

    def calculate_spam_divided_by_email(self):  # pragma: no cover
        """Return the number of spam e-mails divided by the number of e-mails."""
        return self.words.count_spam() / self.words.count_emails()

    def calculate_ham_divided_by_email(self):  # pragma: no cover
        """Return the number of ham e-mails divided by the number of e-mails."""
        return self.words.count_ham() / self.words.count_emails()

    def load_dict(self):  # pragma: no cover
        """Load and return the vocabulary stored in the JSON file ``self.vocab``."""
        with open(self.vocab) as file:
            vocabulary = json.load(file)

        return vocabulary

    def clean_text(self, text):  # pragma: no cover
        """Delegate text cleaning to ``self.cleaning.clean_text(text, 0)``."""
        return self.cleaning.clean_text(text, 0)
示例#3
0
class TestVocabularyCreator(unittest.TestCase):
    """Unit tests for VocabularyCreator with its file and cleaning helpers patched."""

    def setUp(self):
        """Prepare a two-mail dataset, the cleaned word lists and the expected vocabulary."""
        spam_mail = {
            "Subject": " best online medicine here",
            "From": "*****@*****.**",
            "Date": "2004-11-18",
            "Body": "get any prescription drug you want !\nsimple quick and affordable !",
            "Spam": "true",
            "File": "enronds//enron3/spam/1429.2004-11-18.BG.spam.txt",
        }
        ham_mail = {
            "Subject": " netco due diligence",
            "From": "*****@*****.**",
            "Date": "2002-01-02",
            "Body": "big pig :\nmet with them today and gave an overview of operations .\n",
            "Spam": "false",
            "File": "enronds//enron3/ham/4774.2002-01-02.kitchen.ham.txt",
        }
        # Data used as the mocked return value of "load_dict".
        self.mails = {"dataset": [{"mail": spam_mail}, {"mail": ham_mail}]}

        # Word lists used as the mocked return values of "clean_text".
        self.clean_subject_spam = ["best", "online", "medicine", "here"]
        self.clean_body_spam = ["prescription", "drug", "simple", "quick", "affordable"]
        self.clean_subject_ham = ["netco", "due", "diligence"]
        self.clean_body_ham = ["big", "pig", "met", "today", "overview", "operations"]

        # Expected vocabulary: every word of a section carries the same
        # probability, 1 divided by the number of words in that section.
        self.vocab_expected = {
            "spam_sub": dict.fromkeys(self.clean_subject_spam, 1 / 4),
            "ham_sub": dict.fromkeys(self.clean_subject_ham, 1 / 3),
            "spam_body": dict.fromkeys(self.clean_body_spam, 1 / 5),
            "ham_body": dict.fromkeys(self.clean_body_ham, 1 / 6),
        }

        self.vocabulary_creator = VocabularyCreator()

    def tearDown(self):
        """Nothing to clean up after a test."""

    @patch("vocabulary_creator.VocabularyCreator.load_dict")
    @patch("vocabulary_creator.VocabularyCreator.clean_text")
    @patch("vocabulary_creator.VocabularyCreator.write_data_to_vocab_file")
    def test_create_vocab_spam_Returns_vocabulary_with_correct_values(
            self, mock_write_data_to_vocab_file, mock_clean_text,
            mock_load_dict):
        """Check that create_vocab returns the correctly computed probabilities.

        "load_dict" is mocked to return self.mails. "clean_text" is called
        several times, so side_effect supplies a different return value on
        each successive call. "write_data_to_vocab_file" is mocked to return
        True instead of writing the "vocabulary.json" file.
        """
        mock_write_data_to_vocab_file.return_value = True
        mock_load_dict.return_value = self.mails
        # One cleaned word list per successive "clean_text" call.
        mock_clean_text.side_effect = [
            self.clean_subject_spam,
            self.clean_body_spam,
            self.clean_subject_ham,
            self.clean_body_ham,
        ]

        produced_vocab = self.vocabulary_creator.create_vocab()

        self.assertEqual(produced_vocab, self.vocab_expected)

    @patch("vocabulary_creator.VocabularyCreator.load_dict")
    def test_count_spam_should_return_correct_number_of_spam(
            self, mock_load_dict):
        """count_spam returns 1 for the two-mail dataset."""
        mock_load_dict.return_value = self.mails
        spam_total = self.vocabulary_creator.count_spam()
        self.assertEqual(spam_total, 1)

    @patch("vocabulary_creator.VocabularyCreator.load_dict")
    def test_count_emails_should_return_correct_number_of_emails(
            self, mock_load_dict):
        """count_emails returns 2 for the two-mail dataset."""
        mock_load_dict.return_value = self.mails
        email_total = self.vocabulary_creator.count_emails()
        self.assertEqual(email_total, 2)

    @patch("vocabulary_creator.VocabularyCreator.load_dict")
    def test_count_ham_should_return_correct_number_of_hams(
            self, mock_load_dict):
        """count_ham returns 1 for the two-mail dataset."""
        mock_load_dict.return_value = self.mails
        ham_total = self.vocabulary_creator.count_ham()
        self.assertEqual(ham_total, 1)