Exemplo n.º 1
0
def get_lambda(parts, alpha):
    """Print how many legit messages are flagged as spam for each lambda.

    For every candidate weight ``lambda_legit = 10**e`` with
    ``e = 0, 5, ..., 100`` the function runs a leave-one-fold-out pass over
    ``parts``: it trains on all folds except one, classifies the held-out
    fold and counts the messages predicted as ``SPAM`` whose true class is
    ``LEGIT``. The total over all folds is printed per candidate.

    Args:
        parts: sequence of folds; each fold is a list of
            ``(answer_class, message)`` pairs.
        alpha: value forwarded unchanged as the last argument of
            ``classify`` (presumably the smoothing parameter -- confirm
            against ``classify``).

    Returns:
        None. Results are only printed.
    """
    for exponent in range(0, 101, 5):
        lambda_legit = 10**exponent
        cnt = 0

        for test_index, test in enumerate(parts):
            # Exclude the held-out fold by position rather than by value:
            # comparing lists with != would also drop any other fold that
            # happens to have the same content as the test fold.
            d_train = []
            for part_index, part in enumerate(parts):
                if part_index != test_index:
                    d_train.extend(part)

            bayes = train(d_train)

            for answer_class, message in test:
                predict_class, _ = classify(bayes, message, 1, lambda_legit,
                                            alpha)
                if predict_class == SPAM and answer_class == LEGIT:
                    cnt += 1

        print("cnt:", cnt, "lambda:", lambda_legit)
Exemplo n.º 2
0
def draw_roc(parts, alpha):
    """Plot a ROC-style step curve for a model trained on all of ``parts``.

    The model is trained on the concatenation of every fold and then
    evaluated on that same data. Each message yields a flag (prediction
    matched the label or not) and the second value returned by
    ``classify``. Flags are ordered by that second value, ascending; the
    curve moves up by ``1 / matches`` for a match and right by
    ``1 / mismatches`` otherwise.

    Args:
        parts: sequence of folds; each fold is a list of
            ``(answer_class, message)`` pairs.
        alpha: value forwarded unchanged as the last argument of
            ``classify``.
    """
    d_train = [sample for part in parts for sample in part]
    bayes = train(d_train)

    hits = []
    scores = []
    for answer_class, message in d_train:
        predict_class, score = classify(bayes, message, 1, 1, alpha)
        hits.append(bool(predict_class == answer_class))
        scores.append(score)

    matched = sum(hits)
    mismatched = len(hits) - matched

    # Order the flags by classifier score; the scores themselves are not
    # needed afterwards.
    ordered = sorted(zip(hits, scores), key=lambda pair: pair[1])
    hits, _ = zip(*ordered)

    # A zero count is replaced by 1 so the step size stays defined.
    step_x = 1 / (mismatched or 1)
    step_y = 1 / (matched or 1)

    x = 0
    y = 0
    X = [x]
    Y = [y]
    for hit in hits:
        if hit:
            y += step_y
        else:
            x += step_x
        X.append(x)
        Y.append(y)

    plt.plot(X, Y)
    plt.show()
Exemplo n.º 3
0
def test():
    """Train on the dataset from ``bayes.loaddataset`` and classify two docs.

    Builds the vocabulary and the document vectors, trains the model and
    prints the class returned by ``bayes.classify`` for two hard-coded
    word lists.

    NOTE(review): the original note says a probability approaching 1
    represents "badness" (an abusive document) -- confirm against
    ``bayes.train`` / ``bayes.classify``.
    """
    listpost, listclass = bayes.loaddataset()
    myvocablist = bayes.createlist(listpost)
    tmatrix = [bayes.word2vec(myvocablist, doc) for doc in listpost]
    p0, p1, pa = bayes.train(tmatrix, listclass)

    for testdoc in (['love', 'my', 'dalmation'], ['stupid', 'love']):
        testvec = bayes.word2vec(myvocablist, testdoc)
        result = bayes.classify(testvec, p0, p1, pa)
        # A single formatted argument prints the same text under both the
        # Python 2 print statement and the Python 3 print function.
        print('%s classify as : %s' % (testdoc, result))
Exemplo n.º 4
0
def main():
    """Train the naive Bayes model on the training emails, then test it.

    All inputs are hard-coded: a 2500-word dictionary, 700 training emails
    and 260 testing emails, each described by a feature file and a label
    file under ``./data``.
    """
    # Size of the dictionary shared by the training and testing features.
    word_num = 2500

    # Training set: feature file, label file, number of emails.
    train_feature_filename = "./data/train-features.txt"
    train_label_filename = "./data/train-labels.txt"
    train_email_num = 700

    # Testing set: feature file, label file, number of emails.
    test_feature_filename = "./data/test-features.txt"
    test_label_filename = "./data/test-labels.txt"
    test_email_num = 260

    # Fit the model, then run it against the held-out test data.
    model = bayes.train(
        train_feature_filename,
        train_label_filename,
        train_email_num,
        word_num,
    )
    bayes.test(
        model,
        test_feature_filename,
        test_label_filename,
        test_email_num,
        word_num,
    )
Exemplo n.º 5
0
def get_best_alpha(parts):
    """Select the ``alpha`` with the highest cross-validated accuracy.

    Candidates are ``1 / 10**k`` for ``k = 0 .. 9``. For each candidate a
    leave-one-fold-out pass over ``parts`` is run: train on all other
    folds, classify the held-out fold, and score the spam/not-spam
    predictions with ``accuracy_score``. The per-fold scores are summed
    and divided by ``COUNT_PARTS``. Progress and the final result are
    printed.

    Args:
        parts: sequence of folds; each fold is a list of
            ``(answer_class, message)`` pairs.

    Returns:
        The candidate alpha with the strictly highest mean accuracy, or 0
        if no candidate scored above 0.
    """
    best_accuracy = 0
    best_alpha = 0

    for alpha_degree in range(0, 10):
        alpha = 1 / (10**alpha_degree)
        sum_accuracy_score = 0

        for test_index, d_test in enumerate(parts):
            # Exclude the held-out fold by position rather than by value:
            # comparing lists with != would also drop any other fold that
            # happens to have the same content as the test fold.
            d_train = []
            for part_index, part in enumerate(parts):
                if part_index != test_index:
                    d_train.extend(part)

            bayes = train(d_train)

            predict = []
            answer = []
            for answer_class, message in d_test:
                predict_class, _ = classify(bayes, message, 1, 1, alpha)
                predict.append(int(predict_class == SPAM))
                answer.append(int(answer_class == SPAM))

            sum_accuracy_score += accuracy_score(predict, answer)

        accuracy = sum_accuracy_score / COUNT_PARTS
        if best_accuracy < accuracy:
            best_accuracy = accuracy
            best_alpha = alpha

        print('alpha:', alpha, 'accuracy:', accuracy)

    print("__________________________________________")
    print("Best alpha:", best_alpha)
    print("Best accuracy:", best_accuracy)
    print("__________________________________________")

    return best_alpha
Exemplo n.º 6
0
def classify_bayes():
    """Report Bayes training time and test accuracy for growing samples.

    For the face dataset and then the digit dataset, a model is trained on
    a sample of the training split taken with ``take_sample`` at fractions
    0.1, 0.2, ..., 1.0. Each model is evaluated on the full test split and
    one line with the fraction, the training time in seconds and the
    accuracy is printed.

    Each dataset is a ``(train, test, validation)`` tuple; the validation
    split is loaded but not used here. Test items are indexed as
    ``item[0]`` (input passed to ``bayes.predict``) and ``item[1]``
    (expected class).
    """
    # Divide by a float so the fractions are not truncated to 0 under
    # Python 2 integer division (the results are identical on Python 3).
    fracs = [x / 10.0 for x in range(1, 11)]
    digitd = (read_digitdata('training'), read_digitdata('test'),
              read_digitdata('validation'))
    faced = (read_facedata('train'), read_facedata('test'),
             read_facedata('validation'))
    datasets = [faced, digitd]
    # Bayesian
    for dataset in datasets:
        train_split, test_split = dataset[0], dataset[1]
        for f in fracs:
            fs = take_sample(train_split, f)
            # Time only the training call.
            t1 = time.time()
            model = bayes.train(fs)
            t2 = time.time()
            total = len(test_split)
            cor = 0
            for item in test_split:
                # The second value returned by predict is not needed.
                cl, _ = bayes.predict(item[0], model)
                if cl == item[1]:
                    cor += 1
            acc = float(cor) / total
            print("Bayes class accuracy frac {} train_time {} accuracy {}".
                  format(f, t2 - t1, acc))
Exemplo n.º 7
0
def draw_accuracy_from_lambda(parts, alpha):
    """Plot cross-validated accuracy against the exponent of lambda_legit.

    For every ``lambda_legit = 10**e`` with ``e = 0, 5, ..., 100`` a
    leave-one-fold-out pass over ``parts`` is run: train on all other
    folds, classify the held-out fold and score the spam/not-spam
    predictions with ``accuracy_score``. The per-fold scores are summed
    and divided by ``COUNT_PARTS``; each result is printed and finally
    plotted with the exponent ``e`` on the x axis.

    Args:
        parts: sequence of folds; each fold is a list of
            ``(answer_class, message)`` pairs.
        alpha: value forwarded unchanged as the last argument of
            ``classify``.
    """
    all_lambda_legit = []
    all_accuracy = []

    for exponent in range(0, 101, 5):
        lambda_legit = 10**exponent
        sum_accuracy_score = 0

        for test_index, test in enumerate(parts):
            # Exclude the held-out fold by position rather than by value:
            # comparing lists with != would also drop any other fold that
            # happens to have the same content as the test fold.
            d_train = []
            for part_index, part in enumerate(parts):
                if part_index != test_index:
                    d_train.extend(part)

            bayes = train(d_train)

            predict = []
            answer = []
            for answer_class, message in test:
                predict_class, _ = classify(bayes, message, 1, lambda_legit,
                                            alpha)
                predict.append(int(predict_class == SPAM))
                answer.append(int(answer_class == SPAM))

            sum_accuracy_score += accuracy_score(predict, answer)

        accuracy = sum_accuracy_score / COUNT_PARTS
        # The x axis shows the exponent, not the (huge) lambda itself.
        all_lambda_legit.append(exponent)
        all_accuracy.append(accuracy)
        print("lambda:", lambda_legit, "accuracy:", accuracy)

    plt.plot(all_lambda_legit, all_accuracy)
    plt.xlabel('10^x lambda legit')
    plt.ylabel('Accuracy')
    plt.show()
Exemplo n.º 8
0
	cur = con.cursor(mdb.cursors.DictCursor)
	cur.execute("SELECT a.article_id, like_flag, article_text \
	from test_user_activity a, test_article b \
	where a.article_id = b.article_id \
	and a.like_flag is not null \
	order by a.article_id")
	
	rows = cur.fetchall()
	
	total = 0
	for row in rows:
		
		t = row["article_text"]
		t = unquote_plus(t)
		if row["like_flag"] == 1:
			bayes.train(t, False)
		else:
			bayes.train(t, True)
		total = total + 1
		print total
		if (total % 100) == 0:
			query = "SELECT a.article_id, a.prediction, a.like_flag, b.article_text from test_user_activity a, \
			test_article b where a.article_id = b.article_id and a.article_id between %s and %s and user_name ='brad'" % (row["article_id"] + 1, (row["article_id"] + 101))
			print query
			cur.execute(query) 
			rows = cur.fetchall()
	
			for row in rows:
				t = row["article_text"]
				t = unquote_plus(t)
				prediction = bayes.classify(t)
Exemplo n.º 9
0
import bayes

# Load the sample documents and their labels, and show both.
dataset, labels = bayes.load_dataset()
print(dataset)
print(labels)

# Build the vocabulary over all documents.
vocab_list = bayes.create_vocab_list(dataset)
print(vocab_list)

# Turn every document into a vector over the vocabulary.
matrix = [bayes.words_set_to_vec(vocab_list, array) for array in dataset]
print(matrix)

# Train and show the three values returned by bayes.train.
p_0_v, p_1_v, p_ab = bayes.train(matrix, labels)
for trained_value in (p_0_v, p_1_v, p_ab):
    print(trained_value)

# Classify two hand-written documents, printing the words, their vector
# and the predicted class for each.
for test in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
    print('<--->')
    vec = bayes.words_set_to_vec(vocab_list, test)
    classify = bayes.classify(vec, p_0_v, p_1_v, p_ab)
    print(test)
    print(vec)
    print(classify)
Exemplo n.º 10
0
import bayes
from sys import argv, exit

# Interactive trainer: shows every line of the file given on the command
# line and asks whether it is spam, feeding the answer to the classifier.

# raw_input was renamed to input in Python 3; pick whichever exists so the
# script runs on both major versions.
try:
    read_line = raw_input
except NameError:
    read_line = input

if len(argv) != 2:
    print("give me a file as an argument!")
    exit(1)

with open(argv[1]) as f:
    texts = [line.strip() for line in f]

storage = bayes.Storage("bayes.dat", 10)

try:
    storage.load()
except IOError:
    # Best effort: if the saved state cannot be read (for example on the
    # first run), continue with whatever state the storage starts with.
    pass

# Use a separate name so the imported ``bayes`` module is not shadowed.
classifier = bayes.Bayes(storage)

try:
    for text in texts:
        text = text.replace("\n", " ")
        # Single-argument print calls produce the same output on
        # Python 2 and Python 3.
        print("==> %s" % text)
        print("")
        resp = read_line("Spam? [y/n] ")
        classifier.train(text, resp == "y")

finally:
    # Always finish the storage, even if training is interrupted.
    storage.finish()
Exemplo n.º 11
0
import bayes
from data_helpers import *
from sklearn.externals import joblib

# Training script: builds the training data from a positive and a negative
# text file, trains an ``oldNB`` model and dumps it with joblib.

# Input files passed to loadTrainDataset (named "pos" and "neg" here).
posFile = "./data/train_food.txt"
negFile = "./data/train_notfood.txt"

# Progress message: "fetching the training matrix and its class vector".
print("正在获取训练矩阵及其分类向量")
trainList, classVec = loadTrainDataset(posFile, negFile)

# Progress message: "tokenizing the training matrix and building the
# vocabulary".
# NOTE(review): the meaning of the second argument (True) is not visible
# here -- confirm against jieba_cut_and_save_file.
print("正在将训练矩阵分词,并生成词表")
vectorized, vocabulary = jieba_cut_and_save_file(trainList, True)

# Initialize the model. This rebinds the name ``bayes`` from the imported
# module to the model instance.
bayes = bayes.oldNB(vocabulary)

# Progress message: "training the model".
print("正在训练模型")
# Train the model.
bayes.train(vectorized, classVec)

# Progress message: "saving the model".
print("保存模型")
joblib.dump(bayes, "./arguments/train_model.m")
# Load the two article collections.
ham = getArticles('data/email/ham.pickle')
spam = getArticles('data/email/spam.pickle')

# Shuffle each collection, then hold out the first third of each for
# testing and keep the remainder for training.
random.shuffle(ham)
random.shuffle(spam)

hamTestLen = len(ham) // 3
spamTestLen = len(spam) // 3

testData = ham[:hamTestLen] + spam[:spamTestLen]
testLabels = ['ham'] * hamTestLen + ['spam'] * spamTestLen

trainData = ham[hamTestLen:] + spam[spamTestLen:]
trainLabels = (['ham'] * (len(ham) - hamTestLen)
               + ['spam'] * (len(spam) - spamTestLen))

# Build the word bag from the training articles only, encode the training
# set with it and fit the model.
wordBag = articles.createWordBag(trainData)
trainData = articles.createDataSet(trainData, wordBag)
model = bayes.train(trainData, trainLabels)

# Encode the test articles with the same word bag and count how many
# predictions match their labels.
testData = articles.createDataSet(testData, wordBag)
correct = sum(
    1
    for sample, expected in zip(testData, testLabels)
    if bayes.classify(sample, model) == expected
)
print('Correctness: %d/%d' % (correct, len(testData)))
Exemplo n.º 13
0
    text = open('email/ham/%d.txt' % i).read()
    word_list = bayes.text_parse(text)
    doc_list.append(word_list)
    full_text.extend(word_list)
    class_list.append(0)

vocab_list = bayes.create_vocab_list(doc_list)

# Randomly hold out 10 of the 50 document indices for testing.
# train_set must be a real list: on Python 3 ``range(50)`` is an immutable
# range object and removing an item from it raises TypeError.
train_set = list(range(50))
test_set = []
for _ in range(10):
    rand_index = int(random.uniform(0, len(train_set)))
    # pop() moves the chosen index from the training set to the test set.
    test_set.append(train_set.pop(rand_index))

# Vectorize the remaining training documents and collect their classes.
train_matrix = [
    bayes.words_set_to_vec(vocab_list, doc_list[index]) for index in train_set
]
train_classes = [class_list[index] for index in train_set]

p_0_v, p_1_v, p_ab = bayes.train(train_matrix, train_classes)

# Classify the held-out documents and count the misclassified ones.
error_count = 0
for index in test_set:
    vec = bayes.words_set_to_vec(vocab_list, doc_list[index])
    classify = bayes.classify(vec, p_0_v, p_1_v, p_ab)
    if classify != class_list[index]:
        error_count += 1
print('the error rate is: ', float(error_count) / len(test_set))
Exemplo n.º 14
0
# -*- coding: utf-8 -*-
"""
Created on Mon May 13 10:49:16 2019

@author: cm
"""

import os
import sys
pwd = os.path.dirname(os.path.abspath(__file__))
sys.path.append(pwd)

import numpy as np
from bayes import train, read_vector

if __name__ == '__main__':
    # --- Training ---
    # Read the labels from data/types.txt, located relative to the
    # directory of this script.
    labels_path = os.path.join(pwd, 'data', 'types.txt')
    labels = np.loadtxt(labels_path)

    # Read the bag-of-words vectors ('vector_pearson_40000.txt' was the
    # previously used vector file).
    vectors = read_vector('vectors1000.txt')

    # Train the model parameters.
    p0Vec, p1Vec, pClass1 = train(vectors, labels)