def test_naive_bayes(self):
    """ core - naive bayes can classify text based upon trained vocabulary """
    list_of_posts, list_classes = self.load_bayes_data_set()
    vocabulary = core.create_vocabulary(list_of_posts)
    training_matrix = []
    for post in list_of_posts:
        vector = core.bag_of_words_to_vector(vocabulary, post)
        training_matrix.append(vector)
    p_0_vector, p_1_vector, p_any_being_abusive = core.train_naive_bayes0(
        np.array(training_matrix), np.array(list_classes))

    test_entry = ['love', 'my', 'dalmation']
    vector = core.bag_of_words_to_vector(vocabulary, test_entry)
    this_document = np.array(vector)
    result = core.classify_naive_bayes(this_document, p_0_vector, p_1_vector,
                                       p_any_being_abusive)
    self.assertEqual(0, result)

    test_entry = ['stupid', 'garbage']
    vector = core.bag_of_words_to_vector(vocabulary, test_entry)
    this_document = np.array(vector)
    result = core.classify_naive_bayes(this_document, p_0_vector, p_1_vector,
                                       p_any_being_abusive)
    self.assertEqual(1, result)
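
# A minimal sketch of the log-space decision rule the assertions above
# exercise. core.classify_naive_bayes may be implemented differently; the
# helper name and the assumption that the trained vectors already hold log
# word probabilities are illustrative only.
def _classify_naive_bayes_sketch(vector, p_0_vector, p_1_vector, p_abusive):
    # Score each class as the sum of log-likelihoods plus the log prior,
    # then report the class with the larger score.
    p_1 = np.sum(vector * p_1_vector) + np.log(p_abusive)
    p_0 = np.sum(vector * p_0_vector) + np.log(1.0 - p_abusive)
    return 1 if p_1 > p_0 else 0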
def test_create_vocabulary(self):
    """core - create_vocabulary builds the list of unique tokens across all posts"""
    list_of_posts, list_classes = self.load_bayes_data_set()
    expected = ['cute', 'love', 'help', 'garbage', 'quit', 'food', 'problems',
                'is', 'park', 'stop', 'flea', 'dalmation', 'licks', 'not',
                'him', 'buying', 'posting', 'has', 'worthless', 'ate', 'to',
                'i', 'maybe', 'please', 'dog', 'how', 'stupid', 'so', 'take',
                'mr', 'steak', 'my']
    self.assertEqual(expected, core.create_vocabulary(list_of_posts))
def _bayes_trial(self):
    """Train on a random ham/spam split and return the holdout error rate."""
    doc_list = []
    class_list = []
    full_text = []
    for i in range(1, 26):
        filepath = "data/spam/{0}.txt".format(i)
        word_list = core.text_parse(open(filepath).read())
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(1)
        filepath = "data/ham/{0}.txt".format(i)
        word_list = core.text_parse(open(filepath).read())
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(0)
    vocabulary = core.create_vocabulary(doc_list)
    # hold out ten randomly chosen documents as the test set
    training_set = list(range(50))
    test_set = []
    for i in range(10):
        rand_index = int(random.uniform(0, len(training_set)))
        test_set.append(training_set[rand_index])
        del training_set[rand_index]
    training_matrix = []
    train_classes = []
    for doc_index in training_set:
        training_matrix.append(core.bag_of_words_to_vector(
            vocabulary, doc_list[doc_index]))
        train_classes.append(class_list[doc_index])
    p_0_v, p_1_v, p_spam = core.train_naive_bayes0(
        np.array(training_matrix), np.array(train_classes))
    error_count = 0
    for doc_index in test_set:
        word_vector = core.bag_of_words_to_vector(vocabulary,
                                                  doc_list[doc_index])
        result = core.classify_naive_bayes(np.array(word_vector),
                                           p_0_v, p_1_v, p_spam)
        if result != class_list[doc_index]:
            error_count += 1
    error_rate = float(error_count) / len(test_set)
    logging.info("errors: {0}\ttest_set_count: {1}".format(error_count,
                                                           len(test_set)))
    logging.info("the error rate is {0}".format(error_rate))
    return error_rate
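
# A sketch of how the trial above could back a test: average the holdout
# error rate over several random splits. The trial count and the 10%
# threshold are assumptions for illustration, not project requirements.
def test_spam_filter_error_rate(self):
    """core - spam filter keeps its average holdout error rate low"""
    trials = 10
    average_error = sum(self._bayes_trial() for _ in range(trials)) / trials
    self.assertLessEqual(average_error, 0.10)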
def test_calculate_most_frequent(self):
    """ core - calculate_most_frequent returns the top n most frequent words """
    list_of_posts, list_classes = self.load_bayes_data_set()
    vocabulary = core.create_vocabulary(list_of_posts)
    with open(self.fake_text) as fr:
        full_text = core.text_parse(" ".join(fr.readlines()))
    result = core.calculate_most_frequent(vocabulary, full_text, 5)
    expected = [('him', 3), ('dog', 3), ('stupid', 3), ('stop', 2),
                ('worthless', 2)]
    self.assertEqual(expected, result)
def local_words(feed_1, feed_0):
    """
    Parse two RSS feeds; remove stopwords and the most frequently occurring
    words, then train a naive Bayes classifier on the remaining vocabulary.
    """
    doc_list = []
    class_list = []
    full_text = []
    min_len = min(len(feed_1['entries']), len(feed_0['entries']))
    for i in range(min_len):
        word_list = core.text_parse(feed_1['entries'][i]['summary'])
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(1)
        word_list = core.text_parse(feed_0['entries'][i]['summary'])
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(0)
    vocabulary = core.create_vocabulary(doc_list)
    # filter out stopwords
    stopwords = core.get_stopwords(stopwords_file())
    vocabulary = [token for token in vocabulary
                  if not core.is_stopword(token, stopwords)]
    # drop the most frequent remaining words
    top_thirty_words = core.calculate_most_frequent(vocabulary, full_text)
    for pair_w in top_thirty_words:
        if pair_w[0] in vocabulary:
            vocabulary.remove(pair_w[0])
    # hold out twenty randomly chosen documents as the test set
    training_set = list(range(2 * min_len))
    test_set = []
    for i in range(20):
        random_i = int(random.uniform(0, len(training_set)))
        test_set.append(training_set[random_i])
        del training_set[random_i]
    training_matrix = []
    train_classes = []
    for doc_index in training_set:
        word_vector = core.bag_of_words_to_vector(vocabulary,
                                                  doc_list[doc_index])
        training_matrix.append(word_vector)
        train_classes.append(class_list[doc_index])
    p_0_v, p_1_v, p_spam = core.train_naive_bayes0(
        np.array(training_matrix), np.array(train_classes))
    error_count = 0
    for doc_index in test_set:
        word_vector = core.bag_of_words_to_vector(vocabulary,
                                                  doc_list[doc_index])
        classification = core.classify_naive_bayes(np.array(word_vector),
                                                   p_0_v, p_1_v, p_spam)
        if classification != class_list[doc_index]:
            error_count += 1
    error_rate = float(error_count) / len(test_set)
    logging.info("errors: {0}\terror rate: {1}".format(error_count,
                                                       error_rate))
    return vocabulary, p_0_v, p_1_v
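
# A minimal usage sketch: local_words expects two parsed feeds whose
# 'entries' carry a 'summary' field, which is what feedparser.parse()
# returns. The URLs below are placeholders, not feeds this project uses.
if __name__ == '__main__':
    import feedparser

    feed_1 = feedparser.parse('https://example.com/one.rss')
    feed_0 = feedparser.parse('https://example.com/zero.rss')
    vocabulary, p_0_v, p_1_v = local_words(feed_1, feed_0)
    logging.info("vocabulary size after filtering: {0}".format(len(vocabulary)))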