def main():
    """Train a Naive Bayes spam filter on email subject lines and print metrics.

    Reads up to 500 files per class from PATH/{spam,not_spam}, extracts each
    "Subject:" header, trains on 75% of the data, and reports accuracy,
    precision, recall, and the most-misclassified examples.
    """
    data = []
    for verdict in ['spam', 'not_spam']:
        # Hoisted out of the file loop: the label is constant for this verdict.
        # (Original used the redundant `True if ... else False` form.)
        is_spam = verdict == 'spam'
        for filename in glob.glob(PATH + verdict + "/*")[:500]:
            with open(filename, "r", encoding='utf-8', errors='ignore') as f:
                for line in f:
                    if line.startswith("Subject:"):
                        # Drop the leading "Subject: " prefix, keep the rest.
                        subject = re.sub(r"^Subject: ", "", line).strip()
                        data.append((subject, is_spam))

    random.seed(0)  # deterministic train/test split
    train_data, test_data = split_data(data, 0.75)
    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    # Smoke test on an obviously spammy subject.
    print("Spam" if classifier.classify("Get free laptops now!") > 0.5 else "Not Spam")

    # (subject, actual label, predicted spam probability) for each test example.
    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]
    # Confusion counts keyed by (actual, predicted-at-0.5-threshold).
    count = Counter((is_spam, spam_probability > 0.5)
                    for _, is_spam, spam_probability in classified)
    spammiest_hams, hammiest_spams = most_misclassified(classified)

    print("Accuracy: ", accuracy(count))
    print("Precision: ", precision(count))
    print("Recall: ", recall(count))
    print("\nTop 5 falsely classified as spam:\n\n", spammiest_hams)
    print("\nTop 5 falsely classified as not spam:\n\n", hammiest_spams)
    print("\nMost spammiest words: ", spammiest_word(classifier))
class TestClassifier(unittest.TestCase):
    """Tests for NaiveBayesClassifier against one 'university' example document."""

    def setUp(self):
        # A single labelled document; punctuation is pre-tokenized with spaces.
        document = '''Abbottabad Public School , also commonly referred to as APS and Railway Public School , is a private , all boys , boarding school for , 7th to 12th grade students , located in Abbottabad , Pakistan .'''
        self.examples = {'university': [document]}
        self.classifier = NaiveBayesClassifier(self.examples)

    def _university_docs(self):
        # Shared helper: the documents filed under the 'university' label.
        return self.classifier.get_documents_with_target_value('university')

    def test_create_vocabulary(self):
        self.classifier.vocabulary.should.contain('private')

    def test_vocabulary_size(self):
        self.classifier.vocabulary_size.should.eql(28)

    def test_subset_of_documents_with_target_value(self):
        len(self._university_docs()).should.eql(1)

    def test_text_of_documents(self):
        self.classifier.get_text(self._university_docs()).should.contain('private')

    def test_text_distinct_words(self):
        joined = self.classifier.get_text(self._university_docs())
        self.classifier.get_text_diff_words_count(joined).should.eql(28)

    def test_example_count(self):
        self.classifier.get_example_count().should.eql(1)

    def test_occurrences_of_word_count(self):
        joined = self.classifier.get_text(self._university_docs())
        self.classifier.occurrences_count(',', joined).should.eql(7)

    def test_learn(self):
        # Only checks that learning completes without raising.
        self.classifier.learn()

    def test_word_positions_in_doc(self):
        first_doc = self._university_docs()[0]
        len(self.classifier.word_positions(first_doc)).should.eql(38)

    def test_classify(self):
        self.classifier.learn()
        self.classifier.classify(self.examples['university'][0]).should.eql('university')
def spamFilterChecker():
    """Handle a classification request and return a JSON verdict string.

    Reads the text to classify from the incoming JSON payload, retrains the
    Naive Bayes classifier from the on-disk corpus under PATH, and returns
    a JSON string with email_class 'spam' or 'ham'.
    """
    print('Received the JAVA Request!')

    # Get the text data from the JAVA Program.
    req_data = abhishek_request.get_json()
    text_to_be_classified = req_data['text_to_be_classified']
    print(text_to_be_classified)

    # ------------------------------------------------------------------
    # Build (subject, is_spam) training pairs from the on-disk corpus.
    # ------------------------------------------------------------------
    data = []
    for verdict in ['spam', 'not_spam']:
        # Hoisted: constant per verdict (original used `True if ... else False`).
        is_spam = verdict == 'spam'
        for filename in glob.glob(PATH + verdict + "/*")[:500]:
            with open(filename, "r", encoding='utf-8', errors='ignore') as f:
                for line in f:
                    if line.startswith("Subject:"):
                        subject = re.sub(r"^Subject: ", "", line).strip()
                        data.append((subject, is_spam))

    random.seed(0)  # deterministic split
    train_data, test_data = split_data(data, 0.80)
    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    # Per the log line below, classify() yields the possibility of HAM here;
    # values under 0.9 are treated as spam. TODO(review): confirm threshold.
    value = classifier.classify(text_to_be_classified)
    # BUG FIX: the original response strings used single-quoted keys/values
    # ("{'email_class' : 'spam'}"), which is not valid JSON and breaks strict
    # parsers on the consuming (Java) side.
    if value < 0.9:
        json_response = '{"email_class": "spam"}'
    else:
        json_response = '{"email_class": "ham"}'

    print("====================================================")
    print("POSSIBILITY OF HAM : ", value)
    print(json_response)
    print("====================================================")
    return json_response
# Top-level driver: train/apply the tweet classifier and publish an HTML report.
dir = os.path.realpath('..')  # NOTE(review): shadows the built-in dir(); kept as-is in case later code references it
keyword = 'hillary'
trainingDataFile = '/home/cc/twitterSentiment/src/input/hillary.txt'

# BUG FIX: the original did `for tweet in tweets: tweets.append(tweet)`,
# iterating the empty list it was appending to, so `tweets` stayed empty.
# Copy the tokens read from the training file instead. Also use a context
# manager so the file handle is closed.
with open(trainingDataFile, "r") as inpfile:
    lines = inpfile.read().split()
tweets = list(lines)  # presumably one whitespace-separated token per entry — TODO confirm file format

time = 'daily'
classifierDumpFile = '/home/cc/twitterSentiment/src//input/naivebayes_model.pickle'
trainingRequired = 0

# instantiate the instance of classifier class
nb = NaiveBayesClassifier(tweets, keyword, time,
                          trainingDataFile, classifierDumpFile, trainingRequired)
# run the classifier model on tweets
nb.classify()

# Render and publish the HTML report (context manager replaces manual close()).
htmlcode = nb.getHTML()
with open('/var/www/html/index.html', 'w') as htmlfile:
    htmlfile.write(htmlcode)

# time = 'lastweek'
# twitterData = get_twitter_data.TwitterData()
# tweets = twitterData.getTwitterData(keyword, time)
if not os.path.exists('./input/trump.txt'):
    keyword = 'trump'
    # time = 'lastweek'
    # twitterData = get_twitter_data.TwitterData()
    # tweets = twitterData.getTwitterData(keyword, time)
# Collect "Subject:" lines from the current file (fn/is_spam/data are bound
# earlier, outside this excerpt).
with open(fn, "r") as file:
    for line in file:
        if line.startswith("Subject:"):
            # remove the leading "Subject: " and keep what's left
            subject = re.sub(r"^Subject: ", "", line).strip()
            data.append((subject, is_spam))

random.seed(0)  # deterministic split
train_data, test_data = split_data(data, 0.75)
classifier = NaiveBayesClassifier()
classifier.train(train_data)

# triplets (subject, actual is_spam, predicted spam probability)
classified = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]

# assume that spam_probability > 0.5 corresponds to spam prediction
# and count the combinations of (actual is_spam, predicted is_spam)
counts = Counter((is_spam, spam_probability > 0.5)
                 for _, is_spam, spam_probability in classified)

# sort by spam_probability from smallest to largest
classified.sort(key=lambda row: row[2])

# BUG FIX: in Python 3, filter() returns a lazy iterator that cannot be
# sliced — the original filter(...)[-5:] / filter(...)[:5] raise TypeError.
# List comprehensions make the slices work.
# the highest predicted spam probabilities among the non-spams
spammiest_hams = [row for row in classified if not row[1]][-5:]
# the lowest predicted spam probabilities among the actual spams
hammiest_spams = [row for row in classified if row[1]][:5]
# Smoke-test script for a trained `naive_bayes` model (bound elsewhere):
# asserts exact expected values for priors, conditionals, likelihoods, and
# the final classification.
print("Testing Probabilities")
assert naive_bayes.probability('scam', True) == 0.4
assert naive_bayes.probability('scam', False) == 0.6
print("passed")

print("Testing Conditional Probabilities")
# NOTE(review): exact float equality relies on the model reproducing these
# IEEE-754 results bit-for-bit; switch to round()/math.isclose if fragile.
assert naive_bayes.conditional_probability(('errors', True), given=('scam', True)) == 1.0
assert naive_bayes.conditional_probability(('links', False), given=('scam', True)) == 0.25
assert naive_bayes.conditional_probability(('errors', True), given=('scam', False)) == 0.16666666666666666
assert naive_bayes.conditional_probability(('links', False), given=('scam', False)) == 0.5
print("passed")

observed_features = {
    'errors': True,
    'links': False
}

print("Testing Likelihoods")  # BUG FIX: message was misspelled "Likeihoods"
assert naive_bayes.likelihood(('scam', True), observed_features) == 0.1
assert round(naive_bayes.likelihood(('scam', False), observed_features), 3) == 0.05
print("passed")

print("Testing Classification")
# Truthiness instead of `== True` (flake8 E712); accepts True or 1.
assert naive_bayes.classify(observed_features)
print('passed')

print('ALL TESTS PASSED')
# Collect "Subject:" lines from the current file (fn/is_spam/data are bound
# earlier, outside this excerpt).
with open(fn, 'r') as file:
    for line in file:
        if line.startswith("Subject:"):
            # remove the leading "Subject: " and keep what's left
            subject = re.sub(r"^Subject: ", "", line).strip()
            data.append((subject, is_spam))

random.seed(0)  # deterministic split
train_data, test_data = split_data(data, 0.75)
classifier = NaiveBayesClassifier()
classifier.train(train_data)

# triplets (subject, actual is_spam, predicted spam probability)
classified = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]

# assume that spam_probability > 0.5 corresponds to spam prediction
# and count the combinations of (actual is_spam, predicted is_spam)
counts = Counter((is_spam, spam_probability > 0.5)
                 for _, is_spam, spam_probability in classified)

# sort by spam_probability from smallest to largest
classified.sort(key=lambda row: row[2])

# BUG FIX: in Python 3, filter() returns a lazy iterator that cannot be
# sliced — the original filter(...)[-5:] / filter(...)[:5] raise TypeError.
# List comprehensions make the slices work.
# the highest predicted spam probabilities among the non-spams
spammiest_hams = [row for row in classified if not row[1]][-5:]
# the lowest predicted spam probabilities among the actual spams
hammiest_spams = [row for row in classified if row[1]][:5]