import glob
import random
import re
from collections import Counter


def main():
    data = []

    # read the first 500 files from each class directory and keep
    # (subject line, label) pairs
    for verdict in ['spam', 'not_spam']:
        for filename in glob.glob(PATH + verdict + "/*")[:500]:
            is_spam = (verdict == 'spam')
            with open(filename, "r", encoding='utf-8', errors='ignore') as f:
                for line in f:
                    if line.startswith("Subject:"):
                        subject = re.sub(r"^Subject: ", "", line).strip()
                        data.append((subject, is_spam))

    random.seed(0)
    train_data, test_data = split_data(data, 0.75)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    print("Spam" if classifier.classify("Get free laptops now!") > 0.5 else "Not Spam")

    # triplets of (subject, actual is_spam, predicted spam probability)
    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    # count the combinations of (actual is_spam, predicted is_spam)
    count = Counter((is_spam, spam_probability > 0.5)
                    for _, is_spam, spam_probability in classified)

    spammiest_hams, hammiest_spams = most_misclassified(classified)

    print("Accuracy: ", accuracy(count))
    print("Precision: ", precision(count))
    print("Recall: ", recall(count))
    print("\nTop 5 falsely classified as spam:\n\n", spammiest_hams)
    print("\nTop 5 falsely classified as not spam:\n\n", hammiest_spams)
    print("\nSpammiest words: ", spammiest_word(classifier))
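# main() calls accuracy(), precision(), and recall() on a Counter keyed by
# (actual is_spam, predicted is_spam) pairs but does not define them. A
# minimal sketch of those helpers, assuming that keying convention:

def accuracy(counts):
    # fraction of all predictions that were correct
    correct = counts[(True, True)] + counts[(False, False)]
    return correct / sum(counts.values())


def precision(counts):
    # fraction of predicted spams that actually are spam
    tp, fp = counts[(True, True)], counts[(False, True)]
    return tp / (tp + fp)


def recall(counts):
    # fraction of actual spams that were predicted as spam
    tp, fn = counts[(True, True)], counts[(True, False)]
    return tp / (tp + fn)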
import glob
import json
import random
import re


def spamFilterChecker():
    print('Received the JAVA Request!')

    # Get the text data from the JAVA program.
    req_data = abhishek_request.get_json()
    text_to_be_classified = req_data['text_to_be_classified']
    print(text_to_be_classified)

    # ----------------------------------------------------------------------
    # Train a Naive Bayes classifier on the local corpus instead of
    # POSTing to the plino Spam API.
    # ----------------------------------------------------------------------
    data = []
    for verdict in ['spam', 'not_spam']:
        for filename in glob.glob(PATH + verdict + "/*")[:500]:
            is_spam = (verdict == 'spam')
            with open(filename, "r", encoding='utf-8', errors='ignore') as f:
                for line in f:
                    if line.startswith("Subject:"):
                        subject = re.sub(r"^Subject: ", "", line).strip()
                        data.append((subject, is_spam))

    random.seed(0)
    train_data, test_data = split_data(data, 0.80)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    # classify() is treated here as the probability that the text is ham
    value = classifier.classify(text_to_be_classified)
    json_response = json.dumps(
        {'email_class': 'ham' if value >= 0.9 else 'spam'})

    print("====================================================")
    print("POSSIBILITY OF HAM : ", value)
    print(json_response)
    print("====================================================")
    return json_response
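# spamFilterChecker() reads a JSON body via abhishek_request.get_json(), which
# looks like a Flask-style request object. A minimal sketch of how such an
# endpoint might be registered, assuming Flask; the route path and port are
# illustrative, and `classifier` is assumed to be trained once at startup
# exactly as in the function above, so the corpus isn't re-read per request.

from flask import Flask, jsonify, request

app = Flask(__name__)


@app.route("/spam-filter", methods=["POST"])  # hypothetical route path
def spam_filter_endpoint():
    text = request.get_json()['text_to_be_classified']
    value = classifier.classify(text)  # probability that the text is ham
    return jsonify({'email_class': 'ham' if value >= 0.9 else 'spam'})


if __name__ == "__main__":
    app.run(port=5000)  # hypothetical port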
# df is assumed to be the loaded SMS spam dataset, e.g.:
#   df = pd.read_csv("spam.csv", encoding="latin-1")
# column v1 holds the label ('spam' or 'ham'), column v2 the message text
spam_data, legit_data = [], []
for _, row in df.iterrows():
    if row['v1'] == 'spam':
        spam_data.append(row['v2'])
    else:
        legit_data.append(row['v2'])

NB_classifier = NaiveBayesClassifier()

# train on the first two thirds of each class, test on the rest
spam_train = spam_data[:int(len(spam_data) * 2 / 3)]
spam_test = spam_data[int(len(spam_data) * 2 / 3):]
legit_train = legit_data[:int(len(legit_data) * 2 / 3)]
legit_test = legit_data[int(len(legit_data) * 2 / 3):]

NB_classifier.train(spam_train, legit_train)

# predict() returns a spam probability, so its average over the spam test
# set (and the average of 1 - probability over the legit test set) acts as
# a soft per-class accuracy
spam_accuracy = 0
for text in spam_test:
    prediction = NB_classifier.predict(text)
    spam_accuracy += prediction
spam_accuracy /= len(spam_test)

legit_accuracy = 0
for text in legit_test:
    prediction = NB_classifier.predict(text)
    legit_accuracy += 1 - prediction
legit_accuracy /= len(legit_test)

print("Spam Text prediction accuracy:", spam_accuracy)
print("Legit Text prediction accuracy:", legit_accuracy)
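# The train(spam_train, legit_train) / predict(text) interface above differs
# from the train(train_data) / classify(subject) one used earlier, and the
# class itself isn't shown. A minimal word-presence Naive Bayes sketch
# matching this interface, assuming predict() returns the probability of
# spam; the tokenizer and smoothing constant k are illustrative choices.

import math
import re
from collections import defaultdict


class NaiveBayesClassifier:
    def __init__(self, k=0.5):
        self.k = k  # additive smoothing constant

    def _tokenize(self, text):
        return set(re.findall(r"[a-z0-9']+", text.lower()))

    def train(self, spam_texts, legit_texts):
        self.n_spam, self.n_legit = len(spam_texts), len(legit_texts)
        self.spam_counts = defaultdict(int)
        self.legit_counts = defaultdict(int)
        for text in spam_texts:
            for word in self._tokenize(text):
                self.spam_counts[word] += 1
        for text in legit_texts:
            for word in self._tokenize(text):
                self.legit_counts[word] += 1

    def predict(self, text):
        # start from the (log) class priors
        log_spam = math.log(self.n_spam / (self.n_spam + self.n_legit))
        log_legit = math.log(self.n_legit / (self.n_spam + self.n_legit))
        for word in self._tokenize(text):
            # smoothed P(word appears | class)
            log_spam += math.log((self.spam_counts[word] + self.k)
                                 / (self.n_spam + 2 * self.k))
            log_legit += math.log((self.legit_counts[word] + self.k)
                                  / (self.n_legit + 2 * self.k))
        # convert the log-odds back into a spam probability
        return 1 / (1 + math.exp(log_legit - log_spam))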
import glob
import random
import re
from collections import Counter

data = []

# glob.glob returns every filename that matches the wildcarded path
for fn in glob.glob(path):
    is_spam = "ham" not in fn
    with open(fn, "r") as file:
        for line in file:
            if line.startswith("Subject:"):
                # remove the leading "Subject: " and keep what's left
                subject = re.sub(r"^Subject: ", "", line).strip()
                data.append((subject, is_spam))

random.seed(0)
train_data, test_data = split_data(data, 0.75)

classifier = NaiveBayesClassifier()
classifier.train(train_data)

# triplets (subject, actual is_spam, predicted spam probability)
classified = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]

# assume that spam_probability > 0.5 corresponds to a spam prediction
# and count the combinations of (actual is_spam, predicted is_spam)
counts = Counter((is_spam, spam_probability > 0.5)
                 for _, is_spam, spam_probability in classified)

# sort by spam_probability from smallest to largest
classified.sort(key=lambda row: row[2])

# the highest predicted spam probabilities among the non-spams
# (filter() returns an iterator in Python 3, so build a list instead)
spammiest_hams = [row for row in classified if not row[1]][-5:]

# the lowest predicted spam probabilities among the actual spams
hammiest_spams = [row for row in classified if row[1]][:5]
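# split_data() is used above but not defined in this section. One minimal
# sketch, assuming each row independently lands in the training set with
# probability prob (which is why random.seed(0) is called first, so the
# split is reproducible):

import random


def split_data(data, prob):
    """Split data into [train, test], putting roughly a prob
    fraction of the rows into the first list."""
    results = [], []
    for row in data:
        results[0 if random.random() < prob else 1].append(row)
    return results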