def test_extract_events_odds(self):
    # Labelled corpus: 'spam' holds 100 copies each of two "buy ..." messages
    # plus a single "meeting love"; 'genuine' holds 100 copies each of
    # "meeting tomorrow" and "buy milk".
    spam_samples = ["buy viagra", "buy cialis"] * 100
    spam_samples.append("meeting love")
    genuine_samples = ["meeting tomorrow", "buy milk"] * 100
    corpus = {'spam': spam_samples, 'genuine': genuine_samples}

    learned_odds = Bayes.extract_events_odds(corpus)

    # Start from a 0.9 / 0.1 spam / genuine prior and feed in the email's
    # whitespace-separated words as events.
    classifier = Bayes({'spam': 0.9, 'genuine': 0.1})
    email_words = 'buy coffee for meeting'.split()
    classifier.update_from_events(email_words, learned_odds)

    # Even with the spam-heavy prior, the test expects 'genuine' to be
    # returned by most_likely(0.8).
    self.assertEqual(classifier.most_likely(0.8), 'genuine')
def test_extract_events_odds(self):
    # Training data keyed by label. The spam list is two "buy ..." messages
    # repeated 100 times followed by one "meeting love"; the genuine list is
    # "meeting tomorrow" / "buy milk" repeated 100 times.
    training_set = {
        'spam': 100 * ["buy viagra", "buy cialis"] + ["meeting love"],
        'genuine': 100 * ["meeting tomorrow", "buy milk"],
    }
    event_odds = Bayes.extract_events_odds(training_set)

    # Prior of 0.9 for spam and 0.1 for genuine, then update with each word
    # of the email under test.
    beliefs = Bayes({'spam': 0.9, 'genuine': 0.1})
    tokens = 'buy coffee for meeting'.split()
    beliefs.update_from_events(tokens, event_odds)

    # The expected winner, queried with 0.8, is 'genuine'.
    winner = beliefs.most_likely(0.8)
    self.assertEqual(winner, 'genuine')
# emails.
# NOTE(review): each tuple presumably lines up with the ('genuine', 'spam')
# order used for the priors below -- confirm against Bayes.
words_odds = {
    'buy': (5, 100),
    'viagra': (1, 1000),
    'meeting': (15, 2),
}

# Sample emails to classify. The trailing notes list which of each email's
# words appear in words_odds.
emails = [
    "let's schedule a meeting for tomorrow",  # only 'meeting' is known
    "buy some viagra",  # 'buy' and 'viagra' are known
    "buy coffee for the meeting",  # 'buy' and 'meeting'; expected genuine
]

for email in emails:
    # Fresh priors for every email: weight 90 for genuine, 10 for spam.
    b = Bayes([('genuine', 90), ('spam', 10)])

    # Treat each whitespace-separated word as an event and update the
    # beliefs from the odds table above.
    email_words = email.split()
    b.update_from_events(email_words, words_odds)

    # Show the first 15 characters of the email next to the winning label.
    email_preview = email[:15] + '...'
    email_label = b.most_likely()
    print(email_preview, email_label)

print('')
print(' -- Spam Filter With Email Corpus -- ')

# Labelled corpus. Spam: two "buy ..." messages repeated 100 times, plus one
# message that mentions "meeting". Genuine: messages about a meeting and about
# buying milk, repeated 100 times.
instances = {
    'spam': ["buy viagra", "buy cialis"] * 100 + ["meeting love"],
    'genuine': ["meeting tomorrow", "buy milk"] * 100,
}
def test_update_from_events(self):
    # Two entries that both start at 1; the event 'a' carries odds (0.5, 2).
    beliefs = Bayes([1, 1])
    repeated_events = ['a'] * 3
    odds_by_event = {'a': (0.5, 2)}

    beliefs.update_from_events(repeated_events, odds_by_event)

    # After three occurrences of 'a', each entry is expected to equal its
    # odds value raised to the third power.
    expected = [0.5 ** 3, 2 ** 3]
    self.assertEqual(beliefs, expected)
def test_update_from_events(self):
    # The same event is supplied three times, so its odds pair (0.5, 2) is
    # expected to be applied three times to the initial [1, 1].
    occurrences = 3
    low_odds, high_odds = 0.5, 2

    state = Bayes([1, 1])
    state.update_from_events(
        ['a' for _ in range(occurrences)],
        {'a': (low_odds, high_odds)},
    )

    self.assertEqual(state, [low_odds ** occurrences, high_odds ** occurrences])
# emails.
# NOTE(review): the position of each number in a tuple presumably matches the
# ('genuine', 'spam') order of the priors used below -- confirm against Bayes.
words_odds = {'buy': (5, 100),
              'viagra': (1, 1000),
              'meeting': (15, 2)}

# Emails to run through the filter. Each note names the words of that email
# that are present in words_odds.
emails = [
    "let's schedule a meeting for tomorrow",  # 'meeting'
    "buy some viagra",  # 'buy', 'viagra'
    "buy coffee for the meeting",  # 'buy', 'meeting'; expected genuine
]

for email in emails:
    # Every email starts from the same priors: 90 for genuine, 10 for spam.
    prior_weights = [('genuine', 90), ('spam', 10)]
    b = Bayes(prior_weights)

    # The email's whitespace-separated words are the events; words_odds
    # supplies the per-word odds.
    b.update_from_events(email.split(), words_odds)

    # Print a 15-character preview of the email followed by the winning label.
    print(email[:15] + '...', b.most_likely())

print('')
print(' -- Spam Filter With Email Corpus -- ')

# Labelled corpus. The spam list repeats two "buy ..." messages 100 times and
# ends with one "meeting love"; the genuine list repeats "meeting tomorrow"
# and "buy milk" 100 times.
spam_corpus = ["buy viagra", "buy cialis"] * 100 + ["meeting love"]
genuine_corpus = ["meeting tomorrow", "buy milk"] * 100
instances = {'spam': spam_corpus, 'genuine': genuine_corpus}

# Use str.split to extract features/events/words from the corpus and build
# the model.