def test(self, test_corpus_dir):
    """Classify every email in *test_corpus_dir* as SPAM or HAM.

    :param test_corpus_dir: directory containing the emails to classify
    """
    corpus = Corpus(test_corpus_dir)
    # Load previously persisted statistics; prior training only refines them.
    self.load_from_memory()
    for name, msg in corpus.emails():
        # Per-header spam scores — values closer to 1.0 indicate SPAM.
        scores = [self.get_stat(key, msg)
                  for key in (k.lower() for k in msg.keys())
                  if self.classification.get(key)]
        # The body/payload always contributes one score.
        scores.append(self.get_stat_payload(msg))
        verdict = sum(scores) / len(scores)
        self.predictions[name] = 'SPAM' if verdict > 0.5 else 'OK'
    # Persist the classification next to the corpus and save learned state.
    utils.write_classification_to_file(test_corpus_dir, self.predictions)
    self.save_to_memory()
def test(self, dir_path):
    """Classify every email found in *dir_path* and write the predictions.

    :param dir_path: directory with the emails to classify
    """
    predictions = {}
    mail_map = self.get_email_files(dir_path)
    for name in mail_map:
        predictions[name] = self.analyse_email(mail_map[name])
    write_classification_to_file(
        predictions, os.path.join(dir_path, "!prediction.txt"))
def test(self, directory):
    """Classify the corpus in *directory* and write '!prediction.txt'.

    Trained mode compares each email against the known spam/ham sets;
    otherwise a simple blacklist word count is used.

    :param directory: path to the corpus to classify
    """
    corpus = Corpus(directory)
    verdicts = {}
    if self.trained:
        # Trained mode: label by the higher average similarity to the
        # known spam versus known ham emails.
        for fname, body in corpus.emails():
            candidate = Email(fname, body)
            spam_total = sum(Email.compare_emails(candidate, s)
                             for s in self.spams)
            ham_total = sum(Email.compare_emails(candidate, h)
                            for h in self.hams)
            if spam_total / len(self.spams) > ham_total / len(self.hams):
                verdicts[fname] = 'SPAM'
            else:
                verdicts[fname] = 'OK'
    else:
        # Fallback mode: more than three blacklist hits means SPAM.
        for fname, body in corpus.emails():
            hits = sum(1 for word in self.blacklist if word in body)
            verdicts[fname] = 'SPAM' if hits > 3 else 'OK'
    write_classification_to_file(
        directory + os.path.sep + '!prediction.txt', verdicts)
def test(self, dir):
    """Classify every email file in *dir* using per-token spamicity.

    For each email the SLICING tokens whose spamicity lies farthest from
    the neutral 0.5 are combined, and the message is labelled "SPAM" when
    the combined score reaches the threshold.  Results are written to
    '!prediction.txt' inside *dir*.

    :param dir: directory containing the email files to classify
    """
    EASING = 0.095
    SLICING = 38
    cls_dict = {}
    file_name_with_data = {}
    for filename in os.listdir(dir):
        if filename[0] == "!":
            continue  # skip special files such as '!truth.txt'
        # BUGFIX: the file handle was previously opened and never closed;
        # a context manager releases it even if read() raises.
        with open(dir + '/' + filename, 'r', encoding="utf8") as f:
            file_name_with_data[filename] = f.read()
    for file_name, email_content in file_name_with_data.items():
        a, b = 1.0, 1.0
        # Unknown tokens fall back to the neutral spamicity 0.5.
        # (BUGFIX: compare to None with `is`, not `==`.)
        scored = [(w, 0.5 if self.spamicity.get(w) is None
                   else self.spamicity[w])
                  for w in self.get_tokens(email_content)]
        # Keep only the SLICING tokens farthest from 0.5 (most decisive).
        scored.sort(key=lambda x: 0.5 - math.fabs(0.5 - x[1]))
        for word, spamicity in scored[:SLICING]:
            a *= math.fabs(spamicity - EASING)
            b *= 1.0 - spamicity + EASING
        # NOTE(review): a / (a + b) >= 1.0 can only hold when b == 0.0,
        # so nearly everything is labelled "OK" — confirm the threshold.
        cls_dict[file_name] = "SPAM" if (a / (a + b)) >= 1.0 else "OK"
    utils.write_classification_to_file(cls_dict, dir + "/!prediction.txt")
def test(self, path_to_corpus_to_evaluate):
    """Randomly label every file in the corpus directory as SPAM or OK.

    Baseline classifier: each file gets one of the two labels at random;
    the result is written to '!prediction.txt' inside the corpus.

    :param path_to_corpus_to_evaluate: directory with the emails to label
    """
    files = os.listdir(path_to_corpus_to_evaluate)
    cls_dict = dict()
    type_list = ['SPAM', 'OK']
    for name in files:
        cls_dict[str(name)] = type_list[random.randint(0, 1)]
    # BUGFIX: the output path was built with a hard-coded Windows
    # separator ('\\'); os.path.join keeps the classifier portable.
    utils.write_classification_to_file(
        cls_dict,
        os.path.join(path_to_corpus_to_evaluate, '!prediction.txt'))
def test(dir_path):
    """Label every email in *dir_path* as "OK" (trivial baseline).

    :param dir_path: directory with the emails to label
    """
    # Special files (names starting with '!') are not emails — skip them.
    result_dict = {name: "OK"
                   for name in os.listdir(dir_path)
                   if name[0] != '!'}
    utils.write_classification_to_file(
        os.path.join(dir_path, '!prediction.txt'), result_dict)
def test(dir_path):
    """Label every email in *dir_path* randomly as "SPAM" or "OK".

    :param dir_path: directory with the emails to label
    """
    labels = ["SPAM", "OK"]
    result_dict = {}
    for name in os.listdir(dir_path):
        if name[0] == '!':
            continue  # special files are not emails
        result_dict[name] = labels[random.randint(0, 1)]
    # NOTE(review): writes '!prediction3.txt' while sibling classifiers
    # write '!prediction.txt' — confirm the file name is intentional.
    utils.write_classification_to_file(
        os.path.join(dir_path, '!prediction3.txt'), result_dict)
def test_correctlyFormattedDict(self):
    """write_classification_to_file must match the reference writer's output."""
    # BUGFIX: renamed from 'input', which shadowed the builtin of the
    # same name.
    classification = create_classification()
    save_classification_to_file(classification, REFERENCENAME)
    with replaced_open():
        write_classification_to_file(classification, FILENAME)
    # Validate results: both writers must produce identical lines.
    self.assertListEqual(list(io.open(REFERENCENAME)),
                         list(io.open(FILENAME)),
                         'Items in written files are not equal.')
def test_returnEmptyFile_forEmptyDict(self):
    """An empty classification dict must produce an empty output file."""
    # BUGFIX: renamed from 'input', which shadowed the builtin of the
    # same name.
    classification = dict()
    save_classification_to_file(classification, REFERENCENAME)
    with replaced_open():
        write_classification_to_file(classification, FILENAME)
    # Validate results: both writers must produce identical (empty) files.
    self.assertListEqual(list(io.open(REFERENCENAME)),
                         list(io.open(FILENAME)),
                         'Items in written files are not equal.')
def test(self, test_corpus_dir):
    """Classify the test corpus and write '!prediction.txt' into it.

    :param test_corpus_dir: path to the directory with the test emails
    :return: None
    """
    corpus = Corpus(test_corpus_dir)
    class_dict = {}
    for name, body in corpus.emails():
        # Word-frequency table and total word count for this email.
        word_stats = self.get_word_count_for_mail(body)
        word_freq = word_stats[0]
        word_count = word_stats[1]
        # Spaminess of every known word occurring in the email.
        spaminesses = []
        for word in word_freq:
            spaminess = self.get_spaminnes_of_word(word)
            if spaminess is not None:
                spaminesses.append(spaminess)
        # Combine the individual spaminesses (naive-Bayes style).
        product = self.prod(spaminesses)
        complement = self.one_without_spaminesses(spaminesses)
        denominator = product + complement
        # Guard against dividing by zero when both products vanish.
        overall_spaminess = product / denominator if denominator != 0 else 0
        class_dict[name] = "SPAM" if overall_spaminess >= 0.5 else "OK"
    # Creates the '!prediction.txt' file in the corpus directory.
    utils.write_classification_to_file(
        test_corpus_dir + "/!prediction.txt", class_dict)
def test(self, test_dir):
    """Classify each file in *test_dir* with Paul Graham's "A Plan for
    Spam" method and write the results to '!prediction.txt'.

    :param test_dir: directory containing the emails to classify
    """
    for file in os.listdir(test_dir):
        mail = self.get_email(test_dir + '/' + file)
        mail_words = self.get_email_message(mail)
        # Per-word spam ratings; unknown words get the initial likelihood.
        ratings = []
        for word in mail_words:
            if word in self.vocabulary:
                ratings.append(self.word_spaminess.get(
                    word, self.init_spam_likelihood))
            else:
                ratings.append(self.init_spam_likelihood)
        if len(ratings) == 0:
            # No rated words at all: fall back to decision_table[1].
            self.test_files_result_dict[file] = self.decision_table[1]
            continue
        elif len(ratings) >= 20:
            # Keep only the 10 lowest and 10 highest ratings so the
            # products below do not round to zero.
            ratings.sort()
            ratings = ratings[:10] + ratings[-10:]
        # The words in a message are treated as independent events, so
        # the individual probabilities combine by multiplication:
        # product of the word spaminesses ...
        spam_product = reduce(lambda x, y: x * y, ratings)
        # ... and product of the word haminesses.
        ham_product = reduce(lambda x, y: x * y,
                             map(lambda x: 1.0 - x, ratings))
        result = spam_product / (spam_product + ham_product)
        # Mark as spam only when the combined probability over all words
        # exceeds the threshold.
        if result >= 0.95:
            self.test_files_result_dict[file] = self.decision_table[1]
        else:
            self.test_files_result_dict[file] = self.decision_table[0]
    write_classification_to_file(self.test_files_result_dict,
                                 test_dir + '/!prediction.txt')
def test(self, dir_path):
    """Classify the corpus in *dir_path* and write '!prediction.txt'.

    Strong filters decide on their own; otherwise word-filter scores are
    averaged and compared against POSITIVITY_THRESHOLD.

    :param dir_path: directory with the emails to classify
    """
    no_tests_done = 0
    rather_positive = 0
    corpus = Corpus(dir_path)
    clasif = dict()
    for name, mail in corpus.emails():
        # Strong filters: a decisive answer short-circuits everything else.
        result = self.test_strong_filters(name, mail)
        if result != -1:
            clasif[name] = result
            continue  # Skip to the next iteration
        # BUGFIX: test_word_filters() was called twice in a row (the
        # first call was commented "normal filters" and was probably
        # meant for a different method).  The duplicate doubled both
        # score and tests_done, leaving every ratio and every zero-check
        # unchanged — so removing it preserves the classification while
        # halving the work.  TODO confirm whether a separate
        # "normal filters" method was intended here.
        result = self.test_word_filters(name, mail)
        score = result[0]
        tests_done = result[1]
        if tests_done == 0:
            no_tests_done += 1
            # print("No tests were done for " + name)
            clasif[name] = NEGATIVE
        elif score / tests_done > POSITIVITY_THRESHOLD:
            clasif[name] = POSITIVE
        else:
            if score / tests_done > 0.50:
                rather_positive += 1
            clasif[name] = NEGATIVE
    utils.write_classification_to_file(clasif, dir_path + "/!prediction.txt")
def test(self, path_to_corpus_to_evaluate):
    """Label every file in the corpus directory as 'OK' (trivial baseline).

    :param path_to_corpus_to_evaluate: directory with the emails to label
    """
    cls_dict = {str(name): 'OK'
                for name in os.listdir(path_to_corpus_to_evaluate)}
    # NOTE(review): the destination is the directory itself, not a file
    # such as '<dir>/!prediction.txt' — confirm that
    # write_classification_to_file really expects a directory here.
    utils.write_classification_to_file(cls_dict, path_to_corpus_to_evaluate)
def test(self, prediction_corpus_path):
    """Write the precomputed classification to '!prediction.txt'.

    :param prediction_corpus_path: directory that receives the file
    """
    target = prediction_corpus_path + '/!prediction.txt'
    write_classification_to_file(self.dictionary, target)