def train_and_test_model(path):
    data = get_subject_data(path)

    random.seed(0)  # just so you get the same answers as me
    train_data, test_data = split_data(data, 0.7)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    counts = Counter((is_spam, spam_probability > 0.7)  # (actual, predicted)
                     for _, is_spam, spam_probability in classified)

    print(counts)

    classified.sort(key=lambda row: row[2])
    spammiest_hams = list(filter(lambda row: not row[1], classified))[-5:]
    hammiest_spams = list(filter(lambda row: row[1], classified))[:5]

    print("spammiest_hams", spammiest_hams)
    print("hammiest_spams", hammiest_spams)

    words = sorted(classifier.word_probs, key=p_spam_given_word)

    spammiest_words = words[-5:]
    hammiest_words = words[:5]

    print("spammiest_words", spammiest_words)
    print("hammiest_words", hammiest_words)
def train_and_test_model(path):
    random.seed(0)  # just so you get the same answers as me

    data = get_subject_data(path)
    print(f'data: {len(data)}')

    train_data, test_data = split_data(data, 0.75)
    print(f'Train data size: {len(train_data)}')
    print(f'Test data size: {len(test_data)}')

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    counts = Counter((is_spam, spam_probability > .8)  # (actual, predicted)
                     for _, is_spam, spam_probability in classified)

    print(counts)

    classified.sort(key=lambda row: row[2])
    spammiest_hams = list(filter(lambda row: not row[1], classified))[-5:]
    hammiest_spams = list(filter(lambda row: row[1], classified))[:5]

    print("\nspammiest_hams", spammiest_hams)
    print("\nhammiest_spams", hammiest_spams)

    spammiest_words = classifier.word_probs.sort_values(by='prob_is_spam').tail().index.to_list()
    hammiest_words = classifier.word_probs.sort_values(by='prob_not_spam').tail().index.to_list()

    print("\nspammiest_words", spammiest_words)
    print("\nhammiest_words", hammiest_words)
# NOTE: Python 2 code (print statements, list-returning filter)
def train_and_test_model(path):
    data = get_subject_data(path)

    random.seed(0)
    train_data, test_data = split_data(data, .75)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    counts = Counter((is_spam, spam_probability > .5)
                     for _, is_spam, spam_probability in classified)

    print counts

    classified.sort(key=lambda row: row[2])
    spammiest_hams = filter(lambda row: not row[1], classified)[-5:]
    hammiest_spams = filter(lambda row: row[1], classified)[:5]

    print "spammiest_hams", spammiest_hams
    print "hammiest_spams", hammiest_spams

    words = sorted(classifier.word_probs, key=p_spam_given_word)

    spammiest_words = words[-5:]
    hammiest_words = words[:5]

    print "spammiest_words", spammiest_words
    print "hammiest_words", hammiest_words
# NOTE: Python 2 code (print statements, list-returning filter)
def train_and_test_model(path):
    data = get_subject_data(path)

    random.seed(0)  # just so you get the same answers as me
    train_data, test_data = split_data(data, 0.75)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    counts = Counter((is_spam, spam_probability > 0.5)  # (actual, predicted)
                     for _, is_spam, spam_probability in classified)

    print counts

    classified.sort(key=lambda row: row[2])
    spammiest_hams = filter(lambda row: not row[1], classified)[-5:]
    hammiest_spams = filter(lambda row: row[1], classified)[:5]

    print "spammiest_hams", spammiest_hams
    print "hammiest_spams", hammiest_spams

    words = sorted(classifier.word_probs, key=p_spam_given_word)

    spammiest_words = words[-5:]
    hammiest_words = words[:5]

    print "spammiest_words", spammiest_words
    print "hammiest_words", hammiest_words
def main():
    path = "resources/spam_data/*/*"

    data: List[Message] = []

    for filename in glob.glob(path):
        is_spam = "ham" not in filename

        with open(filename, errors='ignore') as email_file:
            for line in email_file:
                if line.startswith("Subject: "):
                    subject = line.lstrip("Subject: ")
                    data.append(Message(subject, is_spam))
                    break

    random.seed(0)
    train_messages, test_messages = split_data(data, 0.75)

    model = NaiveBayesClassifier()
    model.train(train_messages)

    predictions = [(message, model.predict(message.text))
                   for message in test_messages]

    confusion_matrix = Counter((message.is_spam, spam_probability > 0.5)
                               for message, spam_probability in predictions)

    print(confusion_matrix)

    def p_spam_given_token(token: str, model: NaiveBayesClassifier) -> float:
        prob_if_spam, prob_if_ham = model._probabilities(token)
        return prob_if_spam / (prob_if_spam + prob_if_ham)

    words = sorted(model.tokens, key=lambda t: p_spam_given_token(t, model))

    print("spamiest_words", words[-10:])
    print("hamiest_words", words[:10])
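# main() above assumes a Message container with `text` and `is_spam` fields;
# a minimal sketch of such a type (the exact definition may differ from the
# original) is a NamedTuple:

from typing import NamedTuple

class Message(NamedTuple):
    text: str       # the subject line, in this example
    is_spam: bool   # True for spam, False for ham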
# NOTE: Python 2 code (print statements)
def train_and_test_model(path):
    data = get_subject_data(path)

    random.seed(0)  # just so you get the same answers as me
    train_data, test_data = split_data(data, 0.75)  # splits data: 75% training, 25% test (used on both solo_artist and band)
    # print train_data
    # print test_data

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    classified = [(tweets, is_solo_artist, classifier.classify(tweets))
                  for tweets, is_solo_artist in test_data]

    counts = Counter((is_solo_artist, solo_artist_probability > 0.5)  # (actual, predicted)
                     for _, is_solo_artist, solo_artist_probability in classified)

    # Prints stats...
    print counts

    words = sorted(classifier.word_probs, key=p_solo_artist_given_word)

    solo_artist_words = words[-5:]
    band_words = words[:5]

    print "most_probable_solo_artist_words", solo_artist_words
    print "most_probable_band_words", band_words
# NOTE: Python 2 code (print statements, list-returning filter)
def train_and_test_model(path):
    data = get_subject_data(path)

    random.seed(0)  # just so you get the same answers as me
    train_data, test_data = split_data(data, 0.75)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    counts = Counter((is_spam, spam_probability > 0.5)  # (actual, predicted)
                     for _, is_spam, spam_probability in classified)

    print "----------test begin part 1 is spam-------------"
    msg = "i like to play basketball"
    print msg
    print tokenize(msg)
    a = classifier.classify(msg)
    print a
    print "-----------test end----------"

    print "----------test begin part 2 is spam-------------"
    msg_two = "software"
    print msg_two
    print tokenize(msg_two)
    b = classifier.classify(msg_two)
    print b
    print "-----------test end----------"

    print counts

    classified.sort(key=lambda row: row[2])
    spammiest_hams = filter(lambda row: not row[1], classified)[-5:]
    hammiest_spams = filter(lambda row: row[1], classified)[:5]

    print "spammiest_hams", spammiest_hams
    print "hammiest_spams", hammiest_spams

    words = sorted(classifier.word_probs, key=p_spam_given_word)

    spammiest_words = words[-5:]
    hammiest_words = words[:5]

    print "spammiest_words", spammiest_words
    print "hammiest_words", hammiest_words
# NOTE: Python 2 code (print statements mixed with print() calls, list-returning filter)
def train_and_test_model(path):
    data = get_subject_data(path)

    random.seed(0)  # just so you get the same answers as me
    train_data, test_data = split_data(data, 0.75)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    counts = Counter((is_spam, spam_probability > 0.5)  # (actual, predicted)
                     for _, is_spam, spam_probability in classified)

    print counts

    print("---20150518 ex---")
    msg = "I like to play computer."
    msg2 = "I don't know how to do it."
    print("---test begin---")
    print(msg)
    print(tokenize(msg))
    a = classifier.classify(msg)
    print(a)
    print(msg2)
    print(tokenize(msg2))
    b = classifier.classify(msg2)
    print(b)
    print("---test end----")
    print("---20150518 ex---\n")

    classified.sort(key=lambda row: row[2])
    spammiest_hams = filter(lambda row: not row[1], classified)[-5:]
    hammiest_spams = filter(lambda row: row[1], classified)[:5]

    print "spammiest_hams", spammiest_hams
    print "hammiest_spams", hammiest_spams

    words = sorted(classifier.word_probs, key=p_spam_given_word)

    spammiest_words = words[-5:]
    hammiest_words = words[:5]

    print "spammiest_words", spammiest_words
    print "hammiest_words", hammiest_words
def train_and_test_model2(path):
    data = get_subject_data(path)

    random.seed(0)  # just so you get the same answers as me
    train_data, test_data = split_data(data, 0.75)

    nbc = NaiveBayesClassifier()
    nbc.train(train_data)

    classified = [(subject, is_spam, nbc.classify(subject))
                  for subject, is_spam in test_data]

    counts = Counter((is_spam, spam_probability > 0.5)  # (actual, predicted)
                     for _, is_spam, spam_probability in classified)

    print(counts)

    return np.array([spam_probability > 0.5 for _, _, spam_probability in classified]), \
           np.array([prob for _, _, prob in classified])
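# Possible use of train_and_test_model2's return values: predictions at the
# default 0.5 cutoff plus the raw probabilities, which can be re-thresholded
# without retraining. Sketch only; `path` is whatever glob pattern
# get_subject_data expects.
#
# preds, probs = train_and_test_model2(path)
# for threshold in (0.3, 0.5, 0.7, 0.9):
#     print(threshold, int((probs > threshold).sum()), "flagged as spam")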
# NOTE: Python 2 code (print statements mixed with print() calls, list-returning filter)
def train_and_test_model(path):
    data = get_subject_data(path)

    random.seed(0)  # just so you get the same answers as me
    train_data, test_data = split_data(data, 0.75)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    counts = Counter((is_spam, spam_probability > 0.5)  # (actual, predicted)
                     for _, is_spam, spam_probability in classified)

    print("-------test1 start------")
    msg1 = "What the f**k"
    print(tokenize(msg1))
    print(classifier.classify(msg1))
    print("-------test1 end------\n")

    print("-------test2 start------")
    msg2 = "Hello World"
    print(tokenize(msg2))
    print(classifier.classify(msg2))
    print("-------test2 end------\n")

    print counts

    classified.sort(key=lambda row: row[2])
    spammiest_hams = filter(lambda row: not row[1], classified)[-5:]
    hammiest_spams = filter(lambda row: row[1], classified)[:5]

    print "spammiest_hams", spammiest_hams
    print "hammiest_spams", hammiest_spams

    words = sorted(classifier.word_probs, key=p_spam_given_word)

    spammiest_words = words[-5:]
    hammiest_words = words[:5]

    print "spammiest_words", spammiest_words
    print "hammiest_words", hammiest_words
# NOTE: Python 2 code (print statements mixed with print() calls, list-returning filter)
def train_and_test_model(path):
    data = get_subject_data(path)

    random.seed(0)  # just so you get the same answers as me
    train_data, test_data = split_data(data, 0.75)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    counts = Counter((is_spam, spam_probability > 0.5)  # (actual, predicted)
                     for _, is_spam, spam_probability in classified)

    msg = "I Love you so much ha ha."
    msg2 = "What the F**k"

    print("----test begin----")
    print(msg)
    print(tokenize(msg))
    a = classifier.classify(msg)
    print(a)
    print("----2----")
    print(msg2)
    print(tokenize(msg2))
    b = classifier.classify(msg2)
    print(b)
    print("----test END----")

    print counts

    classified.sort(key=lambda row: row[2])
    spammiest_hams = filter(lambda row: not row[1], classified)[-5:]
    hammiest_spams = filter(lambda row: row[1], classified)[:5]

    print "spammiest_hams", spammiest_hams
    print "hammiest_spams", hammiest_spams

    words = sorted(classifier.word_probs, key=p_spam_given_word)

    spammiest_words = words[-5:]
    hammiest_words = words[:5]

    print "spammiest_words", spammiest_words
    print "hammiest_words", hammiest_words
def train_and_test_model(path, settings_options: dict = None):
    if settings_options:
        for key, value in settings_options.items():
            setattr(settings, key, value)

    data = get_subject_data(path)

    random.seed(0)  # just so you get the same answers as me
    train_data, test_data = split_data(
        data, 0.83
    )  # Change: use a 0.83 / 0.17 split between train and test data

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    counts = Counter((is_spam, spam_prob > 0.5)  # (actual, predicted)
                     for _, is_spam, spam_prob in classified)

    # Change: adds accuracy as a metric to compare results
    hits, misses = counts[(True, True)] + counts[(False, False)], counts[
        (True, False)] + counts[(False, True)]
    accuracy = hits / len(classified)

    print("Accuracy:", accuracy, "Counts:", counts, "Settings:", settings_options)

    # classified.sort(key=lambda row: row[2])
    # spammiest_hams = list(filter(lambda row: not row[1], classified))[-5:]
    # hammiest_spams = list(filter(lambda row: row[1], classified))[:5]
    #
    # print("spammiest_hams", spammiest_hams)
    # print("hammiest_spams", hammiest_spams)
    #
    # words = sorted(classifier.word_probs, key=p_spam_given_word)
    #
    # spammiest_words = words[-5:]
    # hammiest_words = words[:5]
    #
    # print("spammiest_words", spammiest_words)
    # print("hammiest_words", hammiest_words)

    return accuracy
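# A possible follow-up to the accuracy metric above: precision and recall can
# be read off the same (actual, predicted) counts. This is a sketch that
# assumes the Counter keys follow the convention used in train_and_test_model.

def precision_recall(counts):
    tp = counts[(True, True)]    # spam correctly flagged as spam
    fp = counts[(False, True)]   # ham incorrectly flagged as spam
    fn = counts[(True, False)]   # spam that slipped through as ham
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    return precision, recall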
# glob.glob returns every file name that matches the wildcard path
for fn in glob.glob(path):
    is_spam = "ham" not in fn

    with open(fn, 'r', encoding='ISO-8859-1') as file:
        for line in file:
            if line.startswith("Subject:"):
                # remove the leading "Subject: " and keep what's left
                subject = subject_regex.sub("", line).strip()
                data.append((subject, is_spam))

random.seed(0)
train_data, test_data = split_data(data, 0.75)

classifier = NaiveBayesClassifier()
classifier.train(train_data)

# triplets (subject, actual is_spam, predicted spam probability)
classified = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]

# assume that spam_probability > 0.5 corresponds to a spam prediction
# and count the combinations of (actual is_spam, predicted is_spam)
counts = Counter((is_spam, spam_probability > 0.5)
                 for _, is_spam, spam_probability in classified)

print(counts)
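# The fragment above relies on a subject_regex defined earlier in the file; a
# plausible definition (an assumption, not necessarily the original) would be:

import re

subject_regex = re.compile(r"^Subject:\s+")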
def test_split_data(self):
    with mock.patch.object(random, 'random', side_effect=[r / 10 for r in range(7)]):
        self.assertEqual(([0, 1, 2, 3], [4, 5, 6]),
                         machine_learning.split_data(range(7), 0.4))
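# For reference, a split_data consistent with the test above puts an item in
# the first (training) list whenever random.random() < prob. This is a sketch,
# not necessarily the exact machine_learning.split_data implementation:

import random

def split_data(data, prob):
    """Split data into fractions [prob, 1 - prob]."""
    results = [], []
    for row in data:
        results[0 if random.random() < prob else 1].append(row)
    return results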
                    # remove the "Subject: " part and keep the rest
                    subject = re.sub(r"^Subject: ", "", line).strip()
                    data.append((subject, is_spam))

    return data


def p_spam_given_word(word_prob):
    """uses Bayes's theorem to compute P(spam | message contains the word)"""
    word, prob_if_spam, prob_if_not_spam = word_prob
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)


if __name__ == "__main__":
    data = get_subject_data()

    random.seed(0)  # set so you get the same results as the example
    train_data, test_data = split_data(data, 0.75)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    counts = Counter((is_spam, spam_probability > 0.5)  # (actual, predicted)
                     for _, is_spam, spam_probability in classified)

    print(counts)

    # sort by spam probability in ascending order
    classified.sort(key=lambda row: row[2])

    # among the non-spam messages, the ones with the highest spam probability
    is_spam = "ham" not in filename

    # There are some garbage characters in the emails; the errors='ignore'
    # skips them instead of raising an exception.
    with open(filename, errors='ignore') as email_file:
        for line in email_file:
            if line.startswith("Subject:"):
                subject = line.lstrip("Subject: ")
                data.append(Message(subject, is_spam))
                break  # done with this file

import random
from machine_learning import split_data

random.seed(0)
train_messages, test_messages = split_data(data, 0.75)

model = NaiveBayesClassifier()
model.train(train_messages)

from collections import Counter

predictions = [(message, model.predict(message.text))
               for message in test_messages]

confusion_matrix = Counter((message.is_spam, spam_probability > 0.5)
                           for message, spam_probability in predictions)

print(confusion_matrix)

def p_spam_given_token(token: str, model: NaiveBayesClassifier) -> float:
    prob_if_spam, prob_if_ham = model._probabilities(token)
    return prob_if_spam / (prob_if_spam + prob_if_ham)
def get_train_test_data(path):
    data = get_subject_data(path)

    random.seed(0)  # just so you get the same answers as me
    train_data, test_data = split_data(data, 0.75)

    return train_data, test_data
        ax[row][col].set_xticks([])
        ax[row][col].set_yticks([])

        for mark, (species, points) in zip(marks, points_by_species.items()):
            xs = [point[i] for point in points]
            ys = [point[j] for point in points]
            ax[row][col].scatter(xs, ys, marker=mark, label=species)

ax[-1][-1].legend(loc='lower right', prop={'size': 6})
plt.show()

import random
from machine_learning import split_data
import math

random.seed(12)
iris_train, iris_test = split_data(iris_data, 0.70)
assert len(iris_train) == math.floor(0.7 * len(iris_data))
assert len(iris_test) == math.ceil(0.3 * len(iris_data))

from typing import Tuple

# track how many times we see (predicted, actual)
confusion_matrix: Dict[Tuple[str, str], int] = defaultdict(int)
num_correct = 0
k = 5

for iris in iris_test:
    predicted = knn_classify(k, iris_train, iris.point)
    actual = iris.label
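    # presumably the loop goes on to update the tallies; a sketch of that
    # completion, following the same (predicted, actual) counting pattern as
    # the spam confusion matrices above:
    if predicted == actual:
        num_correct += 1
    confusion_matrix[(predicted, actual)] += 1

pct_correct = num_correct / len(iris_test)
print(pct_correct, confusion_matrix)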