Exemplo n.º 1
    def train(self, path):
        """Collect the words that most strongly indicate spam.

        Reads every ham and spam body from ``TrainingCorpus(path)``, counts
        their tokens, and records as "bad words" the frequent spam tokens
        that are absent from, or comparatively rare in, the ham emails.

        A body is tokenized by replacing ``.`` with a space, lower-casing it
        and splitting on single spaces.  Only the ``MOST_COMMON_H`` most
        frequent ham tokens and the ``MOST_COMMON_S`` most frequent spam
        tokens are considered.

        A frequent spam token is recorded when either

        * it does not occur anywhere (as a substring) in the concatenated,
          unmodified ham bodies, or
        * it is longer than ``CHECKED_WORD_LEN`` characters, is also one of
          the frequent ham tokens, and its spam count exceeds ``FACTOR``
          times its ham count.

        :param path: location handed to ``TrainingCorpus``.
        """
        # These constants worked the best.
        MOST_COMMON_S = 600
        MOST_COMMON_H = 5000
        CHECKED_WORD_LEN = 12
        FACTOR = 20

        # Build the translation table once instead of once per email.
        dots_to_spaces = str.maketrans('.', ' ')

        def tokenize(body):
            # Splitting on a single space (not arbitrary whitespace) is kept
            # deliberately so the token set is the same as before.
            return body.translate(dots_to_spaces).lower().split(' ')

        tc = TrainingCorpus(path)

        # Hams: keep the raw bodies for the substring test below and count
        # token frequencies incrementally (no quadratic list/str growth).
        ham_bodies = []
        ham_counter = Counter()
        for _, body in tc.hams():
            ham_bodies.append(body)
            ham_counter.update(tokenize(body))
        ham_string = ''.join(ham_bodies)
        ham_words_dict = dict(ham_counter.most_common(MOST_COMMON_H))

        # Spams: only the token frequencies are needed.
        spam_counter = Counter()
        for _, body in tc.spams():
            spam_counter.update(tokenize(body))
        spam_words_dict = dict(spam_counter.most_common(MOST_COMMON_S))

        # NOTE(review): the statements that store a bad word were missing
        # from this listing; appending to ``self.bad_words`` is reconstructed
        # from the original docstring ("creates list of strings
        # (bad_words)") -- confirm the attribute name and that it is
        # initialised before ``train`` is called.
        for word, spam_count in spam_words_dict.items():
            if word not in ham_string:
                # Never seen in any ham text (checked against the raw,
                # case-sensitive ham bodies).
                self.bad_words.append(word)
            elif (len(word) > CHECKED_WORD_LEN
                  and word in ham_words_dict
                  and spam_count > ham_words_dict[word] * FACTOR):
                # Long word that is far more frequent in spam than in ham.
                self.bad_words.append(word)

class TrainingCorpusTest(unittest.TestCase):
    """Tests of TrainingCorpus run against a fake corpus.

    Every access to the corpus goes through ``replaced_open()``; the
    assertions themselves run outside of that context.
    """

    def setUp(self):
        """Prepare fake corpus with !truth.txt file."""
        self.email_dict = create_corpus_dictionary()
        self.true_class = create_classification_for(self.email_dict.keys())
        truth_filepath = os.path.join(CORPUS_DIR, TRUTH_FILENAME)
        save_classification_to_file(self.true_class, fname=truth_filepath)
        with replaced_open():
            self.tc = TrainingCorpus(CORPUS_DIR)

    def tearDown(self):
        """Clean up after each test.

        NOTE(review): the original body is missing from this listing;
        whatever setUp stores under CORPUS_DIR presumably has to be
        removed here -- restore the project's cleanup call.
        """

    def test_getClass(self):
        """Test the get_class method."""
        for key, exp_class in self.true_class.items():
            with replaced_open():
                obs_class = self.tc.get_class(key)
            self.assertEqual(
                exp_class, obs_class,
                'The expected class of email {} is {}, but {} was observed'.
                format(key, exp_class, obs_class))

    def test_isSpam(self):
        """Test the is_spam method."""
        for key, exp_class in self.true_class.items():
            exp_spam = (exp_class == SPAM_TAG)
            with replaced_open():
                obs_spam = self.tc.is_spam(key)
            self.assertEqual(
                exp_spam, obs_spam,
                'The email {} spamminess: expected {}, observed {}.'.format(
                    key, str(exp_spam), str(obs_spam)))

    def test_isHam(self):
        """Test the is_ham method."""
        for key, exp_class in self.true_class.items():
            exp_ham = (exp_class == HAM_TAG)
            with replaced_open():
                obs_ham = self.tc.is_ham(key)
            self.assertEqual(
                exp_ham, obs_ham,
                'The email {} hamminess: expected {}, observed {}.'.format(
                    key, str(exp_ham), str(obs_ham)))

    def test_spams(self):
        """Test spams() method."""
        obs_num_spams = 0
        with replaced_open():
            for fname, contents in self.tc.spams():
                obs_num_spams += 1
                # Validate results
                self.assertEqual(self.true_class[fname], SPAM_TAG,
                                 'Non-spam email returned by spams() method.')
                self.assertEqual(
                    self.email_dict[fname], contents,
                    'The read file contents are not equal to the expected contents.'
                )
        # The number of yielded emails must match the number of spam tags.
        c = Counter(self.true_class.values())
        exp_num_spams = c[SPAM_TAG]
        self.assertEqual(
            exp_num_spams, obs_num_spams,
            'The spams() method did not return the right number of spams.')

    def test_hams(self):
        """Test hams() method."""
        obs_num_hams = 0
        with replaced_open():
            for fname, contents in self.tc.hams():
                obs_num_hams += 1
                # Validate results
                self.assertEqual(self.true_class[fname], HAM_TAG,
                                 'Spam email returned by hams() method.')
                self.assertEqual(
                    self.email_dict[fname], contents,
                    'The read file contents are not equal to the expected contents.'
                )
        # The number of yielded emails must match the number of ham tags.
        c = Counter(self.true_class.values())
        exp_num_hams = c[HAM_TAG]
        self.assertEqual(
            exp_num_hams, obs_num_hams,
            'The hams() method did not return the right number of hams.')
class TrainingCorpusTest(unittest.TestCase):
    """Tests of TrainingCorpus run against a fake corpus under CORPUS_DIR."""

    def setUp(self):
        """Prepare fake corpus with !truth.txt file."""
        self.email_dict = create_corpus_dictionary()
        self.true_class = create_classification_for(self.email_dict.keys())
        truth_filepath = os.path.join(CORPUS_DIR, TRUTH_FILENAME)
        save_classification_to_file(self.true_class, fname=truth_filepath)
        self.tc = TrainingCorpus(CORPUS_DIR)

    def tearDown(self):
        """Clean up after each test.

        NOTE(review): the original body is missing from this listing;
        whatever setUp stores under CORPUS_DIR presumably has to be
        removed here -- restore the project's cleanup call.
        """

    def test_getClass(self):
        """Test the get_class method."""
        for key, exp_class in self.true_class.items():
            obs_class = self.tc.get_class(key)
            self.assertEqual(
                exp_class,
                obs_class,
                "The expected class of email {} is {}, but {} was observed".format(key, exp_class, obs_class),
            )

    def test_isSpam(self):
        """Test the is_spam method."""
        for key, exp_class in self.true_class.items():
            exp_spam = exp_class == SPAM_TAG
            obs_spam = self.tc.is_spam(key)
            self.assertEqual(
                exp_spam,
                obs_spam,
                "The email {} spamminess: expected {}, observed {}.".format(key, str(exp_spam), str(obs_spam)),
            )

    def test_isHam(self):
        """Test the is_ham method."""
        for key, exp_class in self.true_class.items():
            exp_ham = exp_class == HAM_TAG
            obs_ham = self.tc.is_ham(key)
            self.assertEqual(
                exp_ham,
                obs_ham,
                "The email {} hamminess: expected {}, observed {}.".format(key, str(exp_ham), str(obs_ham)),
            )

    def test_spams(self):
        """Test spams() method."""
        obs_num_spams = 0
        for fname, contents in self.tc.spams():
            obs_num_spams += 1
            # Validate results
            self.assertEqual(self.true_class[fname], SPAM_TAG, "Non-spam email returned by spams() method.")
            self.assertEqual(
                self.email_dict[fname], contents, "The read file contents are not equal to the expected contents."
            )
        # The number of yielded emails must match the number of spam tags.
        c = Counter(self.true_class.values())
        exp_num_spams = c[SPAM_TAG]
        self.assertEqual(exp_num_spams, obs_num_spams, "The spams() method did not return the right number of spams.")

    def test_hams(self):
        """Test hams() method."""
        obs_num_hams = 0
        for fname, contents in self.tc.hams():
            obs_num_hams += 1
            # Validate results
            self.assertEqual(self.true_class[fname], HAM_TAG, "Spam email returned by hams() method.")
            self.assertEqual(
                self.email_dict[fname], contents, "The read file contents are not equal to the expected contents."
            )
        # The number of yielded emails must match the number of ham tags.
        c = Counter(self.true_class.values())
        exp_num_hams = c[HAM_TAG]
        self.assertEqual(exp_num_hams, obs_num_hams, "The hams() method did not return the right number of hams.")