def test(self, directory):
    """Classify every email in ``directory`` and write ``!prediction.txt``.

    When ``self.trained`` is truthy, each email is compared with every stored
    spam and ham example via ``Email.compare_emails`` and labelled 'SPAM' if
    its mean score against the spam examples is strictly greater than its
    mean score against the ham examples.  Otherwise an email is labelled
    'SPAM' when more than three entries of ``self.blacklist`` occur in its
    body.  All other emails are labelled 'OK'.

    :param directory: corpus directory; the prediction file is written
        into it.
    """

    def mean_score(examined, examples):
        # An empty example set scores 0.0 instead of dividing by zero.
        if not examples:
            return 0.0
        total = sum(Email.compare_emails(examined, other) for other in examples)
        return total / len(examples)

    corp = Corpus(directory)
    result = {}
    if self.trained:
        for fname, body in corp.emails():
            examined = Email(fname, body)
            spam_mean = mean_score(examined, self.spams)
            ham_mean = mean_score(examined, self.hams)
            result[fname] = 'SPAM' if spam_mean > ham_mean else 'OK'
    else:
        for fname, body in corp.emails():
            hits = sum(1 for word in self.blacklist if word in body)
            result[fname] = 'SPAM' if hits > 3 else 'OK'

    # Both strategies persist their result the same way.
    write_classification_to_file(
        os.path.join(directory, '!prediction.txt'), result)
def test(self, test_corpus_dir):
    """
    Classify every email of a corpus as SPAM or OK.

    :param test_corpus_dir: directory with emails
    """
    corpus = Corpus(test_corpus_dir)
    # Load the persisted memory first so the filter also works when no
    # training was done in this run.
    self.load_from_memory()

    for name, msg in corpus.emails():
        # Score every header part that has an entry in self.classification;
        # values closer to 1 mean SPAM.
        lowered_keys = (raw_key.lower() for raw_key in msg.keys())
        scores = [
            self.get_stat(key, msg)
            for key in lowered_keys
            if self.classification.get(key)
        ]
        # The body/payload always contributes one score, so the list is
        # never empty when it is averaged.
        scores.append(self.get_stat_payload(msg))
        average = sum(scores) / len(scores)
        self.predictions[name] = 'SPAM' if average > 0.5 else 'OK'

    # Save the created classification.
    utils.write_classification_to_file(test_corpus_dir, self.predictions)
    # Persist the memory in case something new was learned.
    self.save_to_memory()
def test(self, path):
    """Tag every email under ``path`` as 'SPAM' or 'OK'.

    An email whose sender is in the sender blacklist is tagged 'SPAM'
    straight away.  Otherwise every distinct token of the email gets a
    spamicity from ``Bayesian.word_spamicity``; the 15 tokens whose
    spamicity lies farthest from 0.5 are combined by ``Bayesian.bayes_pred``
    and the email is tagged 'SPAM' when that prediction exceeds 0.9.

    :param path: directory with the emails; forwarded to ``self.tag_it``
        together with the file name and the tag.
    """
    corp = Corpus(path)
    bs = Bayesian()
    # NOTE(review): load_pickle presumably unpickles this file -- only load
    # blacklist files from a trusted source.
    sender_bl = load_pickle('sender_bl.pickle')

    for fname, body in corp.emails():
        if find_sender(body) in sender_bl:
            self.tag_it(path, fname, 'SPAM')
            continue

        # dict.fromkeys removes duplicate tokens while keeping first-seen
        # order, so ties in the (stable) sort below are resolved the same
        # way on every run and each token is scored only once.
        unique_tokens = dict.fromkeys(tokenize(body))
        spamicity_list = [
            [token, bs.word_spamicity(token)] for token in unique_tokens
        ]
        spamicity_list.sort(key=lambda pair: abs(0.5 - pair[1]), reverse=True)

        # Consider only the 15 most decisive 'words'.
        prediction = bs.bayes_pred(spamicity_list[:15])
        self.tag_it(path, fname, 'SPAM' if prediction > 0.9 else 'OK')
def test(self, path):
    '''Mark emails that contain at least one "bad word" as spam.

    Each email body has its '.' characters replaced by spaces and is then
    searched for every entry of ``self.bad_words`` (substring match).  A hit
    classifies the email as 'SPAM', otherwise 'OK'.  The classification is
    stored in ``self.final_dict`` and written to ``!prediction.txt`` inside
    ``path``, one "<name> <label>" line per directory entry whose name does
    not start with '!'.

    :param path: directory with the emails to classify.
    :raises KeyError: if a directory entry not starting with '!' has no
        classification in ``self.final_dict``.
    '''
    # The translation table does not depend on the email, so build it once.
    dots_to_spaces = str.maketrans('.', ' ')

    c = Corpus(path)
    for fname, body in c.emails():
        body = body.translate(dots_to_spaces)
        is_spam = any(word in body for word in self.bad_words)
        self.final_dict[fname] = 'SPAM' if is_spam else 'OK'

    names = os.listdir(path)
    # Address the file by its full path instead of changing the process-wide
    # working directory, and let the context manager close it on any error.
    with open(os.path.join(path, '!prediction.txt'), 'w',
              encoding='utf-8') as fd:
        for name in names:
            if name.startswith('!'):
                continue
            fd.write(name + ' ' + self.final_dict[name] + '\n')
def test_atom_filter(initialized_filter, train_dir, test_dir):
    """Train a filter on one corpus and score its predictions on another.

    The filter is trained on ``TrainingCorpus(train_dir)`` and then asked
    about every email of ``Corpus(test_dir)``.  A result of -1 leaves the
    email without a prediction; a result above ``POSITIVITY_THRESHOLD``
    predicts ``POSITIVE``, anything else ``NEGATIVE``.  The predictions are
    compared with the truth file of ``test_dir`` in a binary confusion
    matrix.

    :param initialized_filter: object providing ``train(corpus)`` and
        ``test(mail)``.
    :param train_dir: directory of the training corpus.
    :param test_dir: directory of the evaluation corpus; must contain the
        truth file named by ``TRUTHFILE``.
    :return: the value of ``quality_score`` for the tp/tn/fp/fn counts.
    """
    training_corpus = TrainingCorpus(train_dir)
    evaluation_corpus = Corpus(test_dir)
    atom_filter = initialized_filter
    atom_filter.train(training_corpus)

    predicted = {}
    for name, mail in evaluation_corpus.emails():
        verdict = atom_filter.test(mail)
        if verdict == -1:
            # -1 means this email gets no prediction entry at all.
            continue
        if verdict > POSITIVITY_THRESHOLD:
            predicted[name] = POSITIVE
        else:
            predicted[name] = NEGATIVE

    expected = read_classification_from_file(test_dir + '/' + TRUTHFILE)
    matrix = BinaryConfusionMatrix(POSITIVE, NEGATIVE)
    matrix.compute_from_dicts(expected, predicted)
    counts = matrix.as_dict()

    # For testing purposes
    print(counts)

    return quality_score(counts['tp'], counts['tn'], counts['fp'],
                         counts['fn'])
def test(self, path):
    """Tag every email under ``path`` as 'SPAM' or 'OK'.

    Emails from a blacklisted sender are tagged 'SPAM' immediately.  For all
    others, each distinct token is scored with ``Bayesian.word_spamicity``,
    the 15 tokens with spamicity farthest from 0.5 are fed to
    ``Bayesian.bayes_pred``, and the email is tagged 'SPAM' when the
    resulting prediction is greater than 0.9.

    :param path: directory with the emails; forwarded to ``self.tag_it``
        together with the file name and the tag.
    """
    corp = Corpus(path)
    bs = Bayesian()
    # NOTE(review): load_pickle presumably unpickles this file -- only load
    # blacklist files from a trusted source.
    sender_bl = load_pickle('sender_bl.pickle')

    for fname, body in corp.emails():
        sender = find_sender(body)
        if sender in sender_bl:
            self.tag_it(path, fname, 'SPAM')
            continue

        # De-duplicate in first-seen order (a set would shuffle equal-ranked
        # tokens between runs and change which ones make the top 15), and
        # score each distinct token exactly once.
        spamicity_list = [
            [token, bs.word_spamicity(token)]
            for token in dict.fromkeys(tokenize(body))
        ]
        # list.sort is stable, also with reverse=True.
        spamicity_list.sort(key=lambda pair: abs(0.5 - pair[1]), reverse=True)

        # Consider only 15 'words'.
        prediction = bs.bayes_pred(spamicity_list[:15])
        if prediction > 0.9:
            self.tag_it(path, fname, 'SPAM')
        else:
            self.tag_it(path, fname, 'OK')
def train(self, file_path, batch_size=10, learning_rate=0.1, lr_decay=0.05,
          epochs=1000, momentum=0.0, tuning=False):
    """Train with mini-batch gradient descent on the corpus in ``file_path``.

    Analogous to PLR_filter.  All emails are loaded once and grouped into
    batches; every epoch re-initialises the momentums, runs
    ``self.gradient_descent`` on each batch and then decays the learning
    rate.

    :param file_path: corpus directory; its ``!truth.txt`` supplies labels.
    :param batch_size: emails per batch (the last batch may be smaller);
        must be at least 1.
    :param learning_rate: initial learning rate.
    :param lr_decay: after 0-based epoch ``e`` the learning rate is
        multiplied by ``1 / (1 + lr_decay * e)``.
    :param epochs: number of passes over all batches.
    :param momentum: forwarded unchanged to ``self.gradient_descent``.
    :param tuning: when true, print the learning rate each epoch and plot
        the mean loss per epoch once training has finished.
    :raises ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A non-positive size can never fill a batch.
        raise ValueError("batch_size must be at least 1")

    corpus = Corpus(file_path)
    truth_dict = utils.read_classification_from_file(file_path + "/!truth.txt")

    # Group (mail, label) pairs into batches.  Only non-empty batches are
    # kept, so a corpus whose size is a multiple of batch_size does not end
    # with an empty batch being passed to gradient_descent.
    batches = []
    current = []
    for email in corpus.emails():
        label = 1 if truth_dict[email[0]] == self.pos_tag else 0
        current.append((email[1], label))
        if len(current) == batch_size:
            batches.append(current)
            current = []
    if current:
        batches.append(current)

    og_lr = learning_rate  # original learning rate, shown in the plot title
    x_plt = []  # x-axis (epochs)
    y_plt = []  # y-axis (mean loss)

    for e in range(epochs):
        if tuning:
            print(learning_rate)
        self.init_momentums()
        loss = 0
        for batch in batches:
            batch_vectors = [m[0].get_feature_vector_lr() for m in batch]
            y = [m[1] for m in batch]
            loss += self.gradient_descent(y, batch_vectors, learning_rate,
                                          momentum)
        print(f"trained on epoch #{e + 1}")
        learning_rate *= 1 / (1 + lr_decay * e)
        if tuning:
            # One gradient step per batch; guard the empty-corpus case.
            y_plt.append(loss / len(batches) if batches else 0.0)
            x_plt.append(e)

    if tuning:
        plt.plot(x_plt, y_plt)
        plt.title(
            f"lr:{og_lr} lrd:{lr_decay} bs:{batch_size} m: {momentum} e:{epochs}"
        )
        plt.xlabel("epochs")
        plt.ylabel("mean loss")
        plt.show()
def test(self, mails_path):
    """Classify every email in ``mails_path`` and write ``!prediction.txt``.

    Each output line is "<file name> <tag>", where the tag is
    ``self.pos_tag`` when ``self.evaluate_mail`` returns a truthy value for
    the email and ``self.neg_tag`` otherwise.

    :param mails_path: directory with the emails; the prediction file is
        created inside it.
    """
    corpus = Corpus(mails_path)
    # Mode 'w' truncates a prediction file left over from an earlier run,
    # so no separate delete step with a blanket exception handler is needed.
    prediction_path = os.path.join(mails_path, "!prediction.txt")
    with open(prediction_path, 'w', encoding='utf-8') as f:
        for mail in corpus.emails():
            res = self.evaluate_mail(mail[1])
            f.write(f"{mail[0]} {self.pos_tag if res else self.neg_tag}\n")
def __init__(self, folder):
    """Load every email of ``folder`` and sort it into hams or spams.

    An email goes to ``self.hams`` when ``self.is_ham`` accepts its file
    name, otherwise to ``self.spams``.

    :param folder: directory of the corpus to load.
    """
    self.folder = folder
    self.spams = []
    self.hams = []
    for fname, content in Corpus(folder).emails():
        bucket = self.hams if self.is_ham(fname) else self.spams
        bucket.append(Email(fname, content))
def test_corpusContainsOnlyEmails(self):
    """Test reading the corpus with email messages only."""
    # Exercise the SUT: collect every (file name, contents) pair.
    observed = dict(Corpus(CORPUS_DIR).emails())

    # Verify the results: first the number of files, then their contents.
    self.assertEqual(
        len(self.expected), len(observed),
        'The emails() method did not generate all the corpus files.')
    self.assertEqual(
        self.expected, observed,
        'The read file contents are not equal to the expected contents.')
def test(self, test_corpus_dir):
    """Classify the emails in ``test_corpus_dir`` and write ``!prediction.txt``.

    An email is 'SPAM' when ``self.bayesian_combination`` of its body is
    greater than 0.9 or when the address returned by
    ``self.get_email_adress`` is in ``self.black_list``; every other email
    is 'OK'.  One "<file name> <decision>" line is written per email.

    :param test_corpus_dir: directory with the emails; the prediction file
        is created inside it.
    """
    test_corpus = Corpus(test_corpus_dir)
    prediction_path = os.path.join(test_corpus_dir, "!prediction.txt")
    with open(prediction_path, "w", encoding="utf-8") as a_file:
        for filename, body in test_corpus.emails():
            # The sender lookup only runs when the Bayesian score alone is
            # not decisive (short-circuit `or`).
            is_spam = (self.bayesian_combination(body) > 0.9
                       or self.get_email_adress(body) in self.black_list)
            # NOTE(review): self.white_list is not consulted here, because a
            # non-spam email is "OK" whether or not its sender is
            # white-listed. Confirm whether a white-listed sender was meant
            # to override a high Bayesian score.
            decision = "SPAM" if is_spam else "OK"
            a_file.write(filename + " " + decision + "\n")
def test(self, test_corpus_dir):
    '''
    Creates dict of classification and writes it to the file

    For every email the spaminess of each of its words is looked up (words
    for which ``get_spaminnes_of_word`` returns None are ignored) and the
    values are combined as ``product / (product + complement)``.  A combined
    value of at least 0.5 classifies the email as "SPAM", otherwise "OK".

    :param test_corpus_dir: path to test dir
    :return: None
    '''
    c = Corpus(test_corpus_dir)
    class_dict = {}

    # Iterate over emails with generator in Corpus
    for email in c.emails():
        # First item of the word statistics is the word frequency; only the
        # words themselves are needed here.
        word_freq = self.get_word_count_for_mail(email[1])[0]

        # Spaminess of every word the filter has a value for
        spaminesses = [
            s for s in map(self.get_spaminnes_of_word, word_freq)
            if s is not None
        ]

        product = self.prod(spaminesses)
        # NOTE(review): presumably the product of (1 - s) over all
        # spaminesses -- confirm against one_without_spaminesses.
        complement = self.one_without_spaminesses(spaminesses)
        denominator = product + complement

        # We cannot divide by zero
        overall_spaminess = product / denominator if denominator != 0 else 0

        # Final decision
        class_dict[email[0]] = "SPAM" if overall_spaminess >= 0.5 else "OK"

    # Creates !prediction.txt file
    utils.write_classification_to_file(
        test_corpus_dir + "/!prediction.txt", class_dict)
def test_corpusContainsOnlyEmails(self):
    """Test reading the corpus with email messages only."""
    corpus = Corpus(CORPUS_DIR)

    # Exercise the SUT while replaced_open() is active.
    with replaced_open():
        observed = {fname: contents for fname, contents in corpus.emails()}

    # Verify the results: first the number of files, then their contents.
    expected_count = len(self.expected)
    self.assertEqual(
        expected_count, len(observed),
        'The emails() method did not generate all the corpus files.')
    self.assertEqual(
        self.expected, observed,
        'The read file contents are not equal to the expected contents.')
def test_corpusContainsAlsoSpecialFiles(self):
    """Test reading the corpus with special files."""
    # Set-up: place a special file into the corpus dir.
    save_file_to_corpus_dir(
        fname=SPECIAL_FILENAME, contents='fake', dirname=CORPUS_DIR)

    # Exercise the SUT: collect every (file name, contents) pair.
    observed = dict(Corpus(CORPUS_DIR).emails())

    # Verify the results against self.expected.
    self.assertEqual(
        len(self.expected), len(observed),
        'The emails() method did not generate all the corpus files.')
    self.assertEqual(
        self.expected, observed,
        'The read file contents are not equal to the expected contents.')
def train(self, file_path, batch_size=10, learning_rate=0.1, lr_decay=0.05,
          epochs=1000, momentum=0.0):
    """Train with mini-batch gradient descent on the corpus in ``file_path``.

    All emails are loaded once and grouped into batches.  Every epoch
    re-initialises the momentums and, for each batch, trains the weights of
    each of the ``self.subvector_count`` sub-vectors separately; afterwards
    the learning rate is decayed.

    :param file_path: corpus directory; its ``!truth.txt`` supplies labels.
    :param batch_size: emails per batch (the last batch may be smaller);
        must be at least 1.
    :param learning_rate: initial learning rate.
    :param lr_decay: after 0-based epoch ``e`` the learning rate is
        multiplied by ``1 / (1 + lr_decay * e)``.
    :param epochs: number of passes over all batches.
    :param momentum: forwarded unchanged to ``self.gradient_descent``.
    :raises ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A non-positive size can never fill a batch.
        raise ValueError("batch_size must be at least 1")

    corpus = Corpus(file_path)
    truth_dict = utils.read_classification_from_file(file_path + "/!truth.txt")

    # Load all data in batches of the given size.  Only non-empty batches
    # are kept, so a corpus whose size is a multiple of batch_size does not
    # end with an empty batch being passed to gradient_descent.
    batches = []
    current = []
    for email in corpus.emails():
        label = 1 if truth_dict[email[0]] == self.pos_tag else 0
        current.append((email[1], label))
        if len(current) == batch_size:
            batches.append(current)
            current = []
    if current:
        batches.append(current)

    for e in range(epochs):  # trains multiple times on all batches
        self.init_momentums()
        for batch in batches:
            # feature vectors and truth vector of the batch
            feature_vectors = [m[0].get_feature_vector_plr() for m in batch]
            y = [m[1] for m in batch]
            # weights for each subvector are trained separately
            for i in range(self.subvector_count):
                subvector_batch = [v[i] for v in feature_vectors]
                self.gradient_descent(i, y, subvector_batch, learning_rate,
                                      momentum)
        print(f"trained on epoch #{e + 1}")
        learning_rate *= 1 / (1 + lr_decay * e)
def test_corpusContainsAlsoSpecialFiles(self):
    """Test reading the corpus with special files."""
    # Set-up: place a special file into the corpus dir.
    save_file_to_corpus_dir(
        fname=SPECIAL_FILENAME, contents='fake', dirname=CORPUS_DIR)
    corpus = Corpus(CORPUS_DIR)

    # Exercise the SUT while replaced_open() is active.
    with replaced_open():
        observed = {fname: contents for fname, contents in corpus.emails()}

    # Verify the results against self.expected.
    expected_count = len(self.expected)
    self.assertEqual(
        expected_count, len(observed),
        'The emails() method did not generate all the corpus files.')
    self.assertEqual(
        self.expected, observed,
        'The read file contents are not equal to the expected contents.')
def train(self, file_path):
    """Estimate per-class word probabilities and class priors from a corpus.

    Every email's ``content_no_html`` is split into words with
    ``self.string_to_words``.  Words of emails labelled ``self.pos_tag`` in
    ``!truth.txt`` count towards spam, all others towards ham.  Word
    probabilities use add-one (Laplace) smoothing over the joint
    vocabulary::

        P(word | class) = (count(word, class) + 1) / (words(class) + V)

    so every estimate lies strictly between 0 and 1, and a class without
    any words does not cause a division by zero.

    Sets ``content_spam_dict``, ``content_ham_dict``, ``spam_probability``,
    ``ham_probability`` and ``trained`` on the instance.

    :param file_path: corpus directory containing ``!truth.txt``.
    :raises ZeroDivisionError: if the corpus yields no emails (the class
        priors are undefined).
    """
    self.content_spam_dict = {}
    self.content_ham_dict = {}
    class_dict = utils.read_classification_from_file(file_path + '/!truth.txt')
    corpus = Corpus(file_path)

    spam_words = Counter()
    ham_words = Counter()
    spam_count = 0
    ham_count = 0
    for mail in corpus.emails():
        content_words = self.string_to_words(mail[1].content_no_html)
        if class_dict[mail[0]] == self.pos_tag:
            spam_count += 1
            spam_words.update(content_words)
        else:
            ham_count += 1
            ham_words.update(content_words)

    vocabulary = spam_words.keys() | ham_words.keys()
    # Each vocabulary word receives one pseudo-count per class, so the
    # denominators grow by the vocabulary size as well.
    spam_total = sum(spam_words.values()) + len(vocabulary)
    ham_total = sum(ham_words.values()) + len(vocabulary)
    for word in vocabulary:
        self.content_spam_dict[word] = (spam_words[word] + 1) / spam_total
        self.content_ham_dict[word] = (ham_words[word] + 1) / ham_total

    mail_count = spam_count + ham_count
    self.spam_probability = spam_count / mail_count
    self.ham_probability = ham_count / mail_count
    self.trained = True
def test(self, path):
    """
    Classify every email in a directory as SPAM or OK.

    The verdict for each email is written to ``!prediction.txt`` inside
    ``path`` and recorded in ``self.pred_dict``.

    :param path: directory with emails
    """
    corpus = Corpus(path)
    prediction_file = path + "/!prediction.txt"
    with open(prediction_file, 'w', encoding="utf-8") as out:
        self.alpha = self.calculate_alpha(corpus)
        for filename, message in corpus.emails():
            words = raw_email_to_list_of_words(message)
            # Probability (plus its overflow companion) under the spam
            # model first, then under the ham model.
            spam_p, spam_overflow = self.calculate_email_probability(
                words, True)
            ham_p, ham_overflow = self.calculate_email_probability(
                words, False)
            if decision(spam_overflow, spam_p, ham_overflow, ham_p):
                label = "SPAM"
            else:
                label = "OK"
            out.write(filename + " " + label + "\n")
            self.pred_dict[filename] = label
def test(self, dir_path):
    """Classify every email in ``dir_path`` and write ``!prediction.txt``.

    The strong filters are consulted first; any result other than -1 is
    taken as the final classification.  Otherwise the word filters return a
    ``(score, tests_done)`` pair and the email is ``POSITIVE`` when
    ``score / tests_done`` exceeds ``POSITIVITY_THRESHOLD``.  Emails for
    which no test was done are ``NEGATIVE``.

    :param dir_path: directory with the emails; the prediction file is
        written to ``dir_path + "/!prediction.txt"``.
    """
    corpus = Corpus(dir_path)
    clasif = dict()

    for name, mail in corpus.emails():
        # Test strong filters
        result = self.test_strong_filters(name, mail)
        if result != -1:
            # Strong filters were decisive
            clasif[name] = result
            continue

        # Test word filters.  One call is enough: adding the same
        # (score, tests_done) pair twice only doubles numerator and
        # denominator and leaves the ratio unchanged.
        # NOTE(review): only the strong and the word filters are consulted;
        # if the class also has other ("normal") filters they are not run
        # here -- confirm whether that is intended.
        result = self.test_word_filters(name, mail)
        score = result[0]
        tests_done = result[1]

        if tests_done != 0 and score / tests_done > POSITIVITY_THRESHOLD:
            clasif[name] = POSITIVE
        else:
            clasif[name] = NEGATIVE

    utils.write_classification_to_file(clasif, dir_path + "/!prediction.txt")
def set_truth(path, corpus_dir='/Users/eygene/Desktop/spam-data-12-s75-h25/3'):
    """Write a ``!truth.txt`` that labels every email of a corpus as SPAM.

    :param path: directory in which ``!truth.txt`` is created (an existing
        file is overwritten).
    :param corpus_dir: directory whose emails are listed in the truth file.
        Defaults to the location that used to be hard-coded here, so
        existing one-argument calls behave as before.
    """
    # The context manager closes (and thereby flushes) the file even if
    # reading the corpus fails part-way through.
    with open(os.path.join(path, "!truth.txt"), 'wt') as f:
        a = Corpus(corpus_dir)
        for name, _body in a.emails():
            f.write(name + ' ' + 'SPAM' + '\n')