def get_lambda(parts, alpha):
    """Cross-validate over a grid of lambda_legit values (10**0 .. 10**100),
    counting how often legitimate mail gets misclassified as spam."""
    for i in range(0, 101, 5):
        lambda_legit = 10**i
        sum_accuracy_score = 0
        cnt = 0  # legit messages misclassified as spam
        for test in parts:
            d_train = []
            predict = []
            answer = []
            for part in parts:
                if part != test:
                    d_train = d_train + part
            bayes = train(d_train)
            for answer_class, message in test:
                predict_class, _ = classify(bayes, message, 1, lambda_legit, alpha)
                predict.append(int(predict_class == SPAM))
                answer.append(int(answer_class == SPAM))
                if predict_class == SPAM and answer_class == LEGIT:
                    cnt += 1
            sum_accuracy_score += accuracy_score(predict, answer)
        print("cnt:", cnt, "lambda:", lambda_legit,
              "accuracy:", sum_accuracy_score / len(parts))
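# `train`, `classify`, `accuracy_score`, and the SPAM/LEGIT constants come
# from the surrounding project and are not shown here. A minimal sketch of
# what that interface could look like, inferred from the calls above: a
# multinomial model with Laplace smoothing `alpha` and per-class weights
# (`lambda_legit` penalizing spam verdicts on legitimate mail). Everything
# in this sketch beyond those names is an assumption.
import math
from collections import Counter

SPAM, LEGIT = 'spam', 'legit'

def train(dataset):
    """dataset is a list of (label, words) pairs; count words per class."""
    word_counts = {SPAM: Counter(), LEGIT: Counter()}
    doc_counts = Counter()
    for label, words in dataset:
        doc_counts[label] += 1
        word_counts[label].update(words)
    return word_counts, doc_counts

def classify(model, message, lambda_spam, lambda_legit, alpha):
    """Return (most probable class, its log-score).
    Assumes both classes appear in the training data."""
    word_counts, doc_counts = model
    total_docs = sum(doc_counts.values())
    vocab = set(word_counts[SPAM]) | set(word_counts[LEGIT])
    scores = {}
    for label, lam in ((SPAM, lambda_spam), (LEGIT, lambda_legit)):
        total_words = sum(word_counts[label].values())
        score = math.log(lam) + math.log(doc_counts[label] / total_docs)
        for word in message:
            # Laplace-smoothed word likelihood
            score += math.log((word_counts[label][word] + alpha)
                              / (total_words + alpha * len(vocab)))
        scores[label] = score
    best = max(scores, key=scores.get)
    return best, scores[best]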
def draw_roc(parts, alpha):
    d_train = []
    for part in parts:
        d_train = d_train + part
    bayes = train(d_train)
    roc_c = []     # True where the prediction was correct
    roc_pred = []  # classifier score, used to order the points
    count_y = 0
    count_x = 0
    for answer_class, message in d_train:
        predict_class, pred = classify(bayes, message, 1, 1, alpha)
        if predict_class == answer_class:
            count_y += 1
            roc_c.append(True)
        else:
            count_x += 1
            roc_c.append(False)
        roc_pred.append(pred)
    # walk the points in order of increasing score
    roc_c, roc_pred = zip(*sorted(zip(roc_c, roc_pred), key=lambda x: x[1]))
    if count_y == 0:  # avoid division by zero
        count_y = 1
    if count_x == 0:
        count_x = 1
    shag_x = 1 / count_x  # "shag" is Russian for "step"
    shag_y = 1 / count_y
    X = [0]
    Y = [0]
    x = 0
    y = 0
    for t in roc_c:
        if t:
            y += shag_y
        else:
            x += shag_x
        X.append(x)
        Y.append(y)
    plt.plot(X, Y)
    plt.show()
def test():
    """A posterior probability near 1 marks the document as abusive."""
    listpost, listclass = bayes.loaddataset()
    myvocablist = bayes.createlist(listpost)
    tmatrix = list()
    for doc in listpost:
        vec = bayes.word2vec(myvocablist, doc)
        tmatrix.append(vec)
    p0, p1, pa = bayes.train(tmatrix, listclass)
    testdoc1 = ['love', 'my', 'dalmation']
    testvec1 = bayes.word2vec(myvocablist, testdoc1)
    print(testdoc1, 'classified as:', bayes.classify(testvec1, p0, p1, pa))
    testdoc2 = ['stupid', 'love']
    testvec2 = bayes.word2vec(myvocablist, testdoc2)
    print(testdoc2, 'classified as:', bayes.classify(testvec2, p0, p1, pa))
def main():
    # number of words in the dictionary
    word_num = 2500
    # file containing features of the emails used for training
    train_feature_filename = "./data/train-features.txt"
    # file containing labels of the emails used for training
    train_label_filename = "./data/train-labels.txt"
    # number of training emails
    train_email_num = 700
    # file containing features of the emails used for testing
    test_feature_filename = "./data/test-features.txt"
    # file containing labels of the emails used for testing
    test_label_filename = "./data/test-labels.txt"
    # number of test emails
    test_email_num = 260

    # train the naive Bayes model
    model = bayes.train(train_feature_filename, train_label_filename,
                        train_email_num, word_num)
    # make predictions on the test data
    bayes.test(model, test_feature_filename, test_label_filename,
               test_email_num, word_num)
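# bayes.train / bayes.test above parse the feature files themselves. A
# hedged sketch of that loading step, assuming the common layout for this
# kind of dataset where each line is a "doc_id word_id count" triplet with
# 1-based ids; the actual file format may differ.
import numpy as np

def load_features(filename, email_num, word_num):
    matrix = np.zeros((email_num, word_num))
    with open(filename) as f:
        for line in f:
            doc_id, word_id, count = (int(tok) for tok in line.split())
            matrix[doc_id - 1, word_id - 1] = count
    return matrix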
def get_best_alpha(parts):
    best_accuracy = 0
    best_alpha = 0
    for alpha_degree in range(0, 10):
        alpha = 1 / (10**alpha_degree)
        sum_accuracy_score = 0
        for d_test in parts:
            d_train = []
            predict = []
            answer = []
            for part in parts:
                if part != d_test:
                    d_train = d_train + part
            bayes = train(d_train)
            for answer_class, message in d_test:
                predict_class, _ = classify(bayes, message, 1, 1, alpha)
                predict.append(int(predict_class == SPAM))
                answer.append(int(answer_class == SPAM))
            sum_accuracy_score += accuracy_score(predict, answer)
        accuracy = sum_accuracy_score / COUNT_PARTS
        if best_accuracy < accuracy:
            best_accuracy = accuracy
            best_alpha = alpha
        print('alpha:', alpha, 'accuracy:', accuracy)
    print("__________________________________________")
    print("Best alpha:", best_alpha)
    print("Best accuracy:", best_accuracy)
    print("__________________________________________")
    return best_alpha
def classify_bayes():
    fracs = [x / 10 for x in range(1, 11)]
    digitd = (read_digitdata('training'), read_digitdata('test'),
              read_digitdata('validation'))
    faced = (read_facedata('train'), read_facedata('test'),
             read_facedata('validation'))
    datasets = [faced, digitd]
    # Bayesian
    for dataset in datasets:
        for f in fracs:
            fs = take_sample(dataset[0], f)
            t1 = time.time()
            model = bayes.train(fs)
            t2 = time.time()
            total = len(dataset[1])
            cor = 0
            for item in dataset[1]:
                cl, logprob = bayes.predict(item[0], model)
                if cl == item[1]:
                    cor += 1
            acc = float(cor) / total
            print("Bayes class accuracy frac {} train_time {} accuracy {}"
                  .format(f, t2 - t1, acc))
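# `take_sample` above is assumed to draw a random fraction of the training
# data; a minimal sketch under that assumption:
import random

def take_sample(data, frac):
    k = max(1, int(len(data) * frac))
    return random.sample(data, k)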
def draw_accuracy_from_lambda(parts, alpha):
    all_lambda_legit = []
    all_accuracy = []
    for i in range(0, 101, 5):
        lambda_legit = 10**i
        sum_accuracy_score = 0
        for test in parts:
            d_train = []
            predict = []
            answer = []
            for part in parts:
                if part != test:
                    d_train = d_train + part
            bayes = train(d_train)
            for answer_class, message in test:
                predict_class, _ = classify(bayes, message, 1, lambda_legit, alpha)
                predict.append(int(predict_class == SPAM))
                answer.append(int(answer_class == SPAM))
            sum_accuracy_score += accuracy_score(predict, answer)
        accuracy = sum_accuracy_score / COUNT_PARTS
        all_lambda_legit.append(i)
        all_accuracy.append(accuracy)
        print("lambda:", lambda_legit, "accuracy:", accuracy)
    plt.plot(all_lambda_legit, all_accuracy)
    plt.xlabel('10^x lambda legit')
    plt.ylabel('Accuracy')
    plt.show()
cur = con.cursor(mdb.cursors.DictCursor)
cur.execute("SELECT a.article_id, like_flag, article_text "
            "FROM test_user_activity a, test_article b "
            "WHERE a.article_id = b.article_id "
            "AND a.like_flag IS NOT NULL "
            "ORDER BY a.article_id")
rows = cur.fetchall()
total = 0
for row in rows:
    t = unquote_plus(row["article_text"])
    # liked articles are trained as the negative (non-spam) class
    if row["like_flag"] == 1:
        bayes.train(t, False)
    else:
        bayes.train(t, True)
    total = total + 1
    print(total)
    if (total % 100) == 0:
        # score the next 100 articles with the model trained so far
        query = ("SELECT a.article_id, a.prediction, a.like_flag, b.article_text "
                 "FROM test_user_activity a, test_article b "
                 "WHERE a.article_id = b.article_id "
                 "AND a.article_id BETWEEN %s AND %s "
                 "AND user_name = 'brad'" % (row["article_id"] + 1,
                                             row["article_id"] + 101))
        print(query)
        cur.execute(query)
        pred_rows = cur.fetchall()  # don't clobber the rows being iterated
        for pred_row in pred_rows:
            t = unquote_plus(pred_row["article_text"])
            prediction = bayes.classify(t)
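# Interpolating ids into the SQL string with % works, but MySQLdb can bind
# the parameters itself, which sidesteps quoting and injection pitfalls;
# the periodic scoring query above could equivalently be written as:
cur.execute(
    "SELECT a.article_id, a.prediction, a.like_flag, b.article_text "
    "FROM test_user_activity a, test_article b "
    "WHERE a.article_id = b.article_id "
    "AND a.article_id BETWEEN %s AND %s AND user_name = 'brad'",
    (row["article_id"] + 1, row["article_id"] + 101))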
import bayes

dataset, labels = bayes.load_dataset()
print(dataset)
print(labels)

vocab_list = bayes.create_vocab_list(dataset)
print(vocab_list)

matrix = []
for array in dataset:
    vec = bayes.words_set_to_vec(vocab_list, array)
    matrix.append(vec)
print(matrix)

p_0_v, p_1_v, p_ab = bayes.train(matrix, labels)
print(p_0_v)
print(p_1_v)
print(p_ab)
print('<--->')

test = ['love', 'my', 'dalmation']
vec = bayes.words_set_to_vec(vocab_list, test)
classify = bayes.classify(vec, p_0_v, p_1_v, p_ab)
print(test)
print(vec)
print(classify)
print('<--->')

test = ['stupid', 'garbage']
vec = bayes.words_set_to_vec(vocab_list, test)
classify = bayes.classify(vec, p_0_v, p_1_v, p_ab)
print(test)
print(vec)
print(classify)
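# The bayes module exercised above follows the classic set-of-words design
# from "Machine Learning in Action": binary word-presence vectors and
# per-class log-likelihood vectors. A hedged sketch of the two helpers the
# demo leans on; the real module may differ in details.
import math

def words_set_to_vec(vocab_list, input_set):
    """Binary vector: 1 where a vocabulary word occurs in the document."""
    vec = [0] * len(vocab_list)
    for word in input_set:
        if word in vocab_list:
            vec[vocab_list.index(word)] = 1
    return vec

def classify(vec, p_0_v, p_1_v, p_ab):
    """Compare class log-posteriors; p_0_v/p_1_v hold log word likelihoods."""
    p1 = sum(w * p for w, p in zip(vec, p_1_v)) + math.log(p_ab)
    p0 = sum(w * p for w, p in zip(vec, p_0_v)) + math.log(1.0 - p_ab)
    return 1 if p1 > p0 else 0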
import bayes
from sys import argv, exit

if len(argv) == 2:
    with open(argv[1]) as f:
        texts = [l.strip() for l in f]
else:
    print("give me a file as an argument!")
    exit(1)

storage = bayes.Storage("bayes.dat", 10)
try:
    storage.load()
except IOError:
    pass

bayes = bayes.Bayes(storage)
try:
    for text in texts:
        text = text.replace("\n", " ")
        print("==> %s" % text)
        print("")
        resp = input("Spam? [y/n] ")
        bayes.train(text, resp == "y")
finally:
    storage.finish()
import bayes
from data_helpers import *
import joblib  # sklearn.externals.joblib is removed in recent scikit-learn

posFile = "./data/train_food.txt"
negFile = "./data/train_notfood.txt"

print("Loading the training matrix and its class vector")
trainList, classVec = loadTrainDataset(posFile, negFile)

print("Tokenizing the training matrix and building the vocabulary")
vectorized, vocabulary = jieba_cut_and_save_file(trainList, True)

bayes = bayes.oldNB(vocabulary)  # initialize the model

print("Training the model")
bayes.train(vectorized, classVec)  # train

print("Saving the model")
joblib.dump(bayes, "./arguments/train_model.m")
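# Loading the persisted model back for inference is the mirror call:
model = joblib.load("./arguments/train_model.m")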
ham = getArticles('data/email/ham.pickle')
spam = getArticles('data/email/spam.pickle')

# Generate training set and test set
random.shuffle(ham)
random.shuffle(spam)
hamTestLen = len(ham) // 3
spamTestLen = len(spam) // 3
testData = ham[:hamTestLen] + spam[:spamTestLen]
testLabels = ['ham'] * hamTestLen + ['spam'] * spamTestLen
trainData = ham[hamTestLen:] + spam[spamTestLen:]
trainLabels = ['ham'] * (len(ham) - hamTestLen) \
    + ['spam'] * (len(spam) - spamTestLen)

# Train model
wordBag = articles.createWordBag(trainData)
trainData = articles.createDataSet(trainData, wordBag)
model = bayes.train(trainData, trainLabels)

# Test model
correct = 0
testData = articles.createDataSet(testData, wordBag)
for i, data in enumerate(testData):
    res = bayes.classify(data, model)
    if res == testLabels[i]:
        correct += 1
print('Correctness: %d/%d' % (correct, len(testData)))
doc_list = []
class_list = []
full_text = []
for i in range(1, 26):
    # spam examples are labelled 1
    text = open('email/spam/%d.txt' % i).read()
    word_list = bayes.text_parse(text)
    doc_list.append(word_list)
    full_text.extend(word_list)
    class_list.append(1)
    # ham examples are labelled 0
    text = open('email/ham/%d.txt' % i).read()
    word_list = bayes.text_parse(text)
    doc_list.append(word_list)
    full_text.extend(word_list)
    class_list.append(0)

vocab_list = bayes.create_vocab_list(doc_list)
train_set = list(range(50))  # a range object is immutable; wrap it in a list
test_set = []
for i in range(10):
    rand_index = int(random.uniform(0, len(train_set)))
    test_set.append(train_set[rand_index])
    del train_set[rand_index]

train_matrix = []
train_classes = []
for index in train_set:
    vec = bayes.words_set_to_vec(vocab_list, doc_list[index])
    train_matrix.append(vec)
    train_classes.append(class_list[index])
p_0_v, p_1_v, p_ab = bayes.train(train_matrix, train_classes)

error_count = 0
for index in test_set:
    vec = bayes.words_set_to_vec(vocab_list, doc_list[index])
    classify = bayes.classify(vec, p_0_v, p_1_v, p_ab)
    if classify != class_list[index]:
        error_count += 1
print('the error rate is: ', float(error_count) / len(test_set))
# -*- coding: utf-8 -*-
"""
Created on Mon May 13 10:49:16 2019

@author: cm
"""

import os
import sys
pwd = os.path.dirname(os.path.abspath(__file__))
sys.path.append(pwd)
import numpy as np
from bayes import train, read_vector

if __name__ == '__main__':
    ### Training
    # load the class labels
    labels = np.loadtxt(os.path.join(pwd, 'data', 'types.txt'))
    # load the bag-of-words vectors
    #vectors = read_vector('vector_pearson_40000.txt')
    vectors = read_vector('vectors1000.txt')
    # fit the model parameters
    p0Vec, p1Vec, pClass1 = train(vectors, labels)
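    # How the fitted parameters might be applied, assuming the standard
    # two-class formulation where p0Vec/p1Vec hold log word likelihoods and
    # pClass1 is the class prior; `classify_vector` is illustrative, not
    # part of bayes.py.
    def classify_vector(vector, p0Vec, p1Vec, pClass1):
        p1 = np.dot(vector, p1Vec) + np.log(pClass1)
        p0 = np.dot(vector, p0Vec) + np.log(1.0 - pClass1)
        return 1 if p1 > p0 else 0

    predictions = [classify_vector(v, p0Vec, p1Vec, pClass1) for v in vectors]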