def test_atom_filter(initialized_filter, train_dir, test_dir):
    """Train the given filter on train_dir, classify test_dir, and return its score.

    :param initialized_filter: filter object exposing train() and test()
    :param train_dir: path to the training corpus directory
    :param test_dir: path to the testing corpus directory (contains the truth file)
    :return: quality score computed from the resulting confusion matrix
    """
    train_corp = TrainingCorpus(train_dir)
    test_corp = Corpus(test_dir)
    # Renamed from 'filter': that name shadows the Python builtin.
    atom_filter = initialized_filter
    atom_filter.train(train_corp)
    prediction = dict()
    for name, mail in test_corp.emails():
        result = atom_filter.test(mail)
        if result == -1:
            # -1 signals "no decision" for this mail; leave it out of the prediction.
            continue
        elif result > POSITIVITY_THRESHOLD:
            prediction[name] = POSITIVE
        else:
            prediction[name] = NEGATIVE
    truth = read_classification_from_file(test_dir + '/' + TRUTHFILE)
    conf_matrix = BinaryConfusionMatrix(POSITIVE, NEGATIVE)
    conf_matrix.compute_from_dicts(truth, prediction)
    matrix_dict = conf_matrix.as_dict()
    # For testing purposes
    print(matrix_dict)
    score = quality_score(matrix_dict['tp'],
                          matrix_dict['tn'],
                          matrix_dict['fp'],
                          matrix_dict['fn'])
    return score
def train(self, train_corpus_dir):
    """Trains my silly filter.

    :param train_corpus_dir: path to train dir
    :return: None
    """
    tc = TrainingCorpus(train_corpus_dir)
    # NOTE: the original called tc.return_spam_ham_count() twice; the first
    # call's result was discarded, so the redundant call is removed here.
    # Get word frequencies and average word counts for each class.
    spam = tc.get_spam_word_count_dict_and_avg()
    ham = tc.get_ham_word_count_dict_and_avg()
    self.spam_word_count_dict = spam[0]
    self.ham_word_count_dict = ham[0]
    self.spam_word_count_avg = spam[1]
    self.ham_word_count_avg = ham[1]
    # Total number of spam and ham emails in the training corpus.
    counts = tc.return_spam_ham_count()
    self.spam_count = counts[0]
    self.ham_count = counts[1]
def setUp(self):
    """Prepare fake corpus with !truth.txt file."""
    truth_path = os.path.join(CORPUS_DIR, TRUTH_FILENAME)
    self.email_dict = create_corpus_dictionary()
    self.true_class = create_classification_for(self.email_dict.keys())
    # Materialize the fake corpus on disk, then drop the truth file beside it.
    create_corpus_dir_from_dictionary(self.email_dict)
    save_classification_to_file(self.true_class, fname=truth_path)
    with replaced_open():
        self.tc = TrainingCorpus(CORPUS_DIR)
def train(self, path):
    """Build self.bad_words: a list of words most likely to indicate spam
    in the training dataset of emails.

    :param path: path to the training corpus directory
    :return: None
    """
    # These constants worked the best empirically.
    MOST_COMMON_S = 600
    MOST_COMMON_H = 5000
    CHECKED_WORD_LEN = 12
    FACTOR = 20

    dot_to_space = str.maketrans('.', ' ')
    tc = TrainingCorpus(path)

    # Concatenate all ham bodies into one big string (used later for raw
    # substring membership tests) and build a frequency dictionary of the
    # most common ham words (lower-cased, dots treated as spaces).
    # Bodies are collected and joined once — the original += loop was quadratic.
    ham_parts = []
    words = []
    for fname, body in tc.hams():
        ham_parts.append(body)  # raw, untranslated body, as in the original
        words.extend(body.translate(dot_to_space).lower().split(' '))
    ham_string = ''.join(ham_parts)
    ham_words_dict = dict(Counter(words).most_common(MOST_COMMON_H))

    # Same word collection for spams (no concatenated string needed).
    words = []
    for fname, body in tc.spams():
        words.extend(body.translate(dot_to_space).lower().split(' '))
    spam_words_dict = dict(Counter(words).most_common(MOST_COMMON_S))

    # A word is "bad" when it never occurs anywhere in the ham text, or when
    # it is long and occurs more than FACTOR times as often in spam as in ham.
    # (The original also tested `word in spam_words_list`, which is always
    # true here since we iterate over the spam words themselves.)
    for word in spam_words_dict:
        if word not in ham_string:
            self.bad_words.append(word)
        elif (len(word) > CHECKED_WORD_LEN and word in ham_words_dict
                and spam_words_dict[word] > ham_words_dict[word] * FACTOR):
            self.bad_words.append(word)
def train(self, email_adress):
    """Train the filter: compute global word counts and the spam prior.

    :param email_adress: path to the training corpus directory
    :return: None
    """
    global all_words, spam_words, probability_spam, count_spams, count_emails
    # Build the corpus ONCE — the original constructed a fresh TrainingCorpus
    # (re-reading the directory) for every single query below.
    tc = TrainingCorpus(email_adress)
    hemails_with_body = tc.hams()
    semails_with_body = tc.spams()
    hwords = tc.get_words(hemails_with_body)
    swords = tc.get_words(semails_with_body)
    all_words = tc.all_words(hwords, swords)    # all words with their count
    spam_words = tc.spam_words(swords)          # spam words with their count
    count_spams = TrainingCorpus.count_spams(email_adress)    # count of all spam's emails
    count_emails = TrainingCorpus.count_emails(email_adress)  # count of all emails
    probability_spam = count_spams / count_emails  # probability that email is spam
def train(self, path):
    """Trains the corpus on a given emails dataset.

    :param path: directory with emails
    """
    self.truth_dict = read_classification_from_file(path + "/!truth.txt")
    emails = TrainingCorpus(path)
    spam_words, spam_email_count = self.list_spam_ham_words(emails, True)
    ham_words, ham_email_count = self.list_spam_ham_words(emails, False)
    # Fraction of training emails that are spam (the class prior).
    total_emails = spam_email_count + ham_email_count
    self.portion_of_spam_emails = spam_email_count / total_emails
    combined = join_spam_and_ham_words(spam_words, ham_words)
    self.all_words = Counter(combined)
    self.num_of_spam_words = len(spam_words)
    self.num_of_ham_words = len(ham_words)
    self.num_of_all_words = len(self.all_words)
def train(self, dir_path):
    """Train every registered sub-filter on the corpus found at dir_path."""
    corpus = TrainingCorpus(dir_path)
    all_filters = self.strong_filters + self.normal_filters + self.word_filters
    for sub_filter in all_filters:
        # print("Training " + sub_filter.__class__.__name__)
        sub_filter.train(corpus)
def train(self, directory):
    """Store the corpus accessors and flag this filter as trained."""
    corpus = TrainingCorpus(directory)
    # NOTE(review): spams/hams are stored WITHOUT calling them — these look
    # like bound-method references; confirm that is intended downstream.
    self.spams = corpus.spams
    self.hams = corpus.hams
    self.trained = True
from basefilter import WordFilter
from trainingcorpus import TrainingCorpus
import inspect
import wordfilters

# Train every class defined directly in the wordfilters module on the corpus
# in ./1 and print each one's resulting bayes value.
corpus = TrainingCorpus('./1')
for member_name, member in inspect.getmembers(wordfilters, inspect.isclass):
    if member.__module__ == "wordfilters":
        instance = member()
        instance.train(corpus)
        print(member_name, instance.bayes_val)