Example #1
    def test_naive_bayes(self):
        """
        core - naive Bayes can classify text based on a trained vocabulary
        """
        list_of_posts, list_classes = self.load_bayes_data_set()
        vocabulary = core.create_vocabulary(list_of_posts)
        training_matrix = []
        for post in list_of_posts:
            vector = core.bag_of_words_to_vector(vocabulary, post)
            training_matrix.append(vector)

        p_0_vector, p_1_vector, p_any_being_abusive = \
            core.train_naive_bayes0(np.array(training_matrix),
                                    np.array(list_classes))

        test_entry = ['love', 'my', 'dalmation']

        vector = core.bag_of_words_to_vector(vocabulary, test_entry)
        this_document = np.array(vector)
        result = core.classify_naive_bayes(this_document,
                                           p_0_vector,
                                           p_1_vector,
                                           p_any_being_abusive)
        self.assertEqual(0, result)

        test_entry = ['stupid', 'garbage']
        vector = core.bag_of_words_to_vector(vocabulary, test_entry)
        this_document = np.array(vector)
        result = core.classify_naive_bayes(this_document,
                                           p_0_vector,
                                           p_1_vector,
                                           p_any_being_abusive)
        self.assertEqual(1, result)
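The core module exercised by these tests is not shown. For orientation, a minimal sketch of what core.train_naive_bayes0 and core.classify_naive_bayes presumably implement is given below; the names and the (p_0_vector, p_1_vector, prior) return triple come from the calls above, while the Laplace smoothing and the log-probability arithmetic are assumptions based on the standard naive Bayes formulation.

import numpy as np


def train_naive_bayes0(training_matrix, training_categories):
    # one row per document, one column per vocabulary word
    num_docs = len(training_matrix)
    num_words = len(training_matrix[0])
    # prior probability that a document belongs to class 1 ("abusive"/"spam")
    p_class_1 = sum(training_categories) / float(num_docs)
    # Laplace-smoothed word counts for each class
    p_0_num = np.ones(num_words)
    p_1_num = np.ones(num_words)
    p_0_denom = 2.0
    p_1_denom = 2.0
    for i in range(num_docs):
        if training_categories[i] == 1:
            p_1_num += training_matrix[i]
            p_1_denom += sum(training_matrix[i])
        else:
            p_0_num += training_matrix[i]
            p_0_denom += sum(training_matrix[i])
    # log probabilities avoid underflow when many small terms are multiplied
    return np.log(p_0_num / p_0_denom), np.log(p_1_num / p_1_denom), p_class_1


def classify_naive_bayes(document_vector, p_0_vector, p_1_vector, p_class_1):
    # summing logs is equivalent to multiplying the per-word likelihoods
    p_1 = sum(document_vector * p_1_vector) + np.log(p_class_1)
    p_0 = sum(document_vector * p_0_vector) + np.log(1.0 - p_class_1)
    return 1 if p_1 > p_0 else 0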
Example #2
    def test_create_vocabulary(self):
        """core - create_vocabulary builds the vocabulary from a list of posts"""
        list_of_posts, list_classes = self.load_bayes_data_set()
        expected = ['cute', 'love', 'help', 'garbage',
                    'quit', 'food', 'problems', 'is',
                    'park', 'stop', 'flea', 'dalmation',
                    'licks', 'not', 'him', 'buying',
                    'posting', 'has', 'worthless', 'ate',
                    'to', 'i', 'maybe', 'please',
                    'dog', 'how', 'stupid', 'so',
                    'take', 'mr', 'steak', 'my']
        self.assertEqual(expected, core.create_vocabulary(list_of_posts))
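create_vocabulary and bag_of_words_to_vector are used throughout these examples but not defined here. A minimal sketch under the usual bag-of-words assumptions follows; note that building the vocabulary from a Python set makes its ordering run-dependent, so a test that compares against a fixed expected list, as above, can be fragile (sorting the result or using assertCountEqual avoids that).

def create_vocabulary(data_set):
    # union of all tokens seen in any document; the resulting order is
    # whatever set iteration yields, not the order of first appearance
    vocabulary_set = set()
    for document in data_set:
        vocabulary_set |= set(document)
    return list(vocabulary_set)


def bag_of_words_to_vector(vocabulary, input_set):
    # count how many times each vocabulary word occurs in the document
    result = [0] * len(vocabulary)
    for word in input_set:
        if word in vocabulary:
            result[vocabulary.index(word)] += 1
    return result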
Example #3
    def _bayes_trial(self):
        doc_list = []
        class_list = []
        full_text = []

        for i in range(1, 26):
            filepath = "data/spam/{0}.txt".format(i)
            with open(filepath) as handle:
                word_list = core.text_parse(handle.read())
            doc_list.append(word_list)
            full_text.extend(word_list)
            class_list.append(1)

            filepath = "data/ham/{0}.txt".format(i)
            with open(filepath) as handle:
                word_list = core.text_parse(handle.read())
            doc_list.append(word_list)
            full_text.extend(word_list)
            class_list.append(0)
        vocabulary = core.create_vocabulary(doc_list)
        training_set = list(range(50))  # a list so indices can be removed below

        test_set = []
        for i in range(10):
            rand_index = int(random.uniform(0, len(training_set)))
            test_set.append(training_set[rand_index])
            del training_set[rand_index]

        training_matrix = []
        train_classes = []
        for doc_index in training_set:
            training_matrix.append(core.bag_of_words_to_vector(
                                   vocabulary,
                                   doc_list[doc_index]))
            train_classes.append(class_list[doc_index])

        p_0_V, p_1_V, p_spam = core.train_naive_bayes0(array(
                                                       training_matrix),
                                                       array(train_classes))
        error_count = 0
        for doc_index in test_set:
            word_vector = core.bag_of_words_to_vector(vocabulary,
                                                      doc_list[doc_index])
            result = core.classify_naive_bayes(array(word_vector),
                                               p_0_V,
                                               p_1_V,
                                               p_spam)
            if result != class_list[doc_index]:
                error_count += 1
        error_rate = float(error_count)/len(test_set)
        logging.info("errors: {0}\ttest_set_count: {1}".format(error_count,
                                                               len(test_set)))
        logging.info("the error rate is {0}".format(error_rate))
        return error_rate
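Two details in the trial above are easy to miss: array is presumably numpy.array imported at module level (the first example spells it np.array), and core.text_parse is not shown. A plausible tokenizer consistent with how the spam/ham files are consumed here is sketched below; the regex and the minimum token length are assumptions.

import re


def text_parse(big_string):
    # split on runs of non-word characters, lowercase everything, and
    # drop very short tokens such as "a" or "of"
    tokens = re.split(r'\W+', big_string)
    return [token.lower() for token in tokens if len(token) > 2]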
Example #4
    def test_calculate_most_frequent(self):
        """
        core - calculate_most_frequent returns the n most frequent tokens
        """
        list_of_posts, list_classes = self.load_bayes_data_set()
        vocabulary = core.create_vocabulary(list_of_posts)

        with open(self.fake_text) as fr:
            full_text = core.text_parse(" ".join(fr.readlines()))

        result = core.calculate_most_frequent(vocabulary, full_text, 5)

        expected = [('him', 3), ('dog', 3), ('stupid', 3),
                    ('stop', 2), ('worthless', 2)]
        self.assertEqual(expected, result)
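calculate_most_frequent is likewise only exercised, never defined, in these examples. A minimal sketch consistent with this test, and with the argument-less call in the next example (which suggests a default of 30), could look like the following; the default value and the tie-breaking order are assumptions.

import operator


def calculate_most_frequent(vocabulary, full_text, top_n=30):
    # count how often each vocabulary token occurs in the parsed text,
    # then return the top_n (token, count) pairs, most frequent first
    frequencies = {}
    for token in vocabulary:
        frequencies[token] = full_text.count(token)
    return sorted(frequencies.items(),
                  key=operator.itemgetter(1),
                  reverse=True)[:top_n]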
Example #5
def local_words(feed_1, feed_0):
    """
    Parse two RSS feeds;
    remove the most frequently occurring words.
    """
    doc_list = []
    class_list = []
    full_text = []
    min_len = min(len(feed_1['entries']), len(feed_0['entries']))
    for i in range(min_len):
        word_list = core.text_parse(feed_1['entries'][i]['summary'])
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(1)

        word_list = core.text_parse(feed_0['entries'][i]['summary'])
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(0)

    vocabulary = core.create_vocabulary(doc_list)

    # filter out stopwords
    stopwords = core.get_stopwords(stopwords_file())
    vocabulary = [token for token in vocabulary
                  if not core.is_stopword(token, stopwords)]

    top_thirty_words = core.calculate_most_frequent(vocabulary, full_text)
    for pair_w in top_thirty_words:
        if pair_w[0] in vocabulary:
            vocabulary.remove(pair_w[0])
    training_set = list(range(2 * min_len))  # a list so indices can be removed below

    test_set = []
    for i in range(20):
        random_i = int(random.uniform(0, len(training_set)))
        test_set.append(training_set[random_i])
        del training_set[random_i]
    training_matrix = []
    train_classes = []
    for doc_index in training_set:
        word_vector = core.bag_of_words_to_vector(vocabulary,
                                                  doc_list[doc_index])
        training_matrix.append(word_vector)
        train_classes.append(class_list[doc_index])

    p_0_v, p_1_v, p_spam = core.train_naive_bayes0(array(training_matrix),
                                                   array(train_classes))
    error_count = 0
    for doc_index in test_set:
        word_vector = core.bag_of_words_to_vector(vocabulary,
                                                  doc_list[doc_index])
        classification = core.classify_naive_bayes(array(word_vector),
                                                   p_0_v,
                                                   p_1_v,
                                                   p_spam)
        if classification != class_list[doc_index]:
            error_count += 1
    error_rate = float(error_count)/len(test_set)
    logging.info("errors: {0}\terror rate: {1}".format(error_count,
                                                       error_rate))
    return vocabulary, p_0_v, p_1_v
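local_words expects two already-parsed feeds whose entries expose a summary field, which matches what feedparser.parse returns. A hypothetical invocation is shown below; the URLs are placeholders, and the get_stopwords / is_stopword helpers are sketched only to illustrate the assumed stopword-file format (one word per line), not the actual core implementation.

import feedparser


def get_stopwords(stopwords_path):
    # assumed format: one stopword per line in a plain-text file
    with open(stopwords_path) as handle:
        return {line.strip() for line in handle if line.strip()}


def is_stopword(token, stopwords):
    return token.lower() in stopwords


# placeholder feed URLs; any two RSS feeds with entry summaries will do
feed_1 = feedparser.parse("https://example.com/feed_one.rss")
feed_0 = feedparser.parse("https://example.com/feed_two.rss")
vocabulary, p_0_v, p_1_v = local_words(feed_1, feed_0)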