def evaluate(trainfile, testfile):
    """Train a NaiveBayes classifier on trainfile and report accuracy on testfile.

    Both files contain one document per line: the first whitespace-separated
    token is the category label, the remaining tokens are the document's words.
    """
    # Load the training data.  `with` guarantees the handle is closed even on
    # error (the original left files open if an exception was raised).
    with codecs.open(trainfile, "r", "utf-8") as fp:
        trainData = [line.rstrip().split() for line in fp]

    # Train the Naive Bayes model.
    nb = NaiveBayes()
    nb.train(trainData)
    print(nb)  # was a Py2 print statement; rest of this file is Py3

    # Evaluate on the test data.
    hit = 0
    numTest = 0
    with codecs.open(testfile, "r", "utf-8") as fp:
        for line in fp:
            temp = line.rstrip().split()
            correct = temp[0]             # gold category
            words = temp[1:]              # document: bag of words
            predict = nb.classify(words)  # predicted category
            if correct == predict:
                hit += 1
            numTest += 1
    # NOTE(review): raises ZeroDivisionError on an empty test file — confirm
    # inputs are always non-empty.
    print("accuracy:", float(hit) / float(numTest))
def crossValidation(data, N=10, randomize=False):
    """Return the mean accuracy of NaiveBayes over N-fold cross validation.

    data: list of records, each ``[label, word1, word2, ...]``.
    N: number of folds.  The original default was ``num``, an undefined name
       that raised NameError as soon as the ``def`` was evaluated.
    randomize: shuffle ``data`` in place before splitting into folds.
    """
    if randomize:
        from random import shuffle
        shuffle(data)  # NOTE: mutates the caller's list

    # Cross validation: fold n holds out every N-th record (offset n).
    accuracyList = []
    for n in range(N):
        trainData = [d for i, d in enumerate(data) if i % N != n]
        testData = [d for i, d in enumerate(data) if i % N == n]

        nb = NaiveBayes()
        nb.train(trainData)

        # Accuracy on the held-out fold.
        hit = 0
        numTest = 0
        for d in testData:
            correct = d[0]
            words = d[1:]
            # Fixed: was ``nb.classifier(words)`` — every other caller in
            # this file uses NaiveBayes.classify().
            predict = nb.classify(words)
            if correct == predict:
                hit += 1
            numTest += 1
        accuracyList.append(float(hit) / float(numTest))

    # (removed dead local ``average_f = round(average, 4)`` — it was unused)
    return sum(accuracyList) / float(N)
def spamHamtoyExample() -> None:
    """Train a naive Bayes spam/ham classifier and plot its accuracy.

    Trains on the emails under '../data/emails/train/', prints the most
    popular and most indicative words per class, then classifies the emails
    under 'test/' with several feature counts (1-50) and plots classification
    accuracy against the number of features used.
    """
    filedir = '../data/emails/'

    classifier = NaiveBayes()
    classifier.train(os.path.join(filedir, 'train/'))

    # Short summary of what the model learned.
    top_n = 4
    classifier.printMostPopularHamWords(top_n)
    classifier.printMostPopularSpamWords(top_n)
    classifier.printMostindicativeHamWords(top_n)
    classifier.printMostindicativeSpamWords(top_n)
    print('Model logPrior: {}'.format(classifier.logPrior))

    # Score the test folder once per candidate feature count.
    feature_counts = [1, 2, 5, 10, 20, 30, 40, 50]
    scores = []
    for n_features in feature_counts:
        score = classifier.classifyAndEvaluateAllInFolder(
            os.path.join(filedir, 'test/'), n_features)
        scores.append(score)
        print(n_features, "features, classification score:", score)

    # Accuracy as a function of feature count.
    plt.figure("Naive results: #features vs classification error rate")
    plt.plot(feature_counts, scores)
    plt.grid(True)
    plt.xlabel('Number of Features')
    plt.ylabel('Classification Score')
    plt.show()
def evaluate(trainfile, testfile):
    """Train a NaiveBayes classifier on trainfile and report accuracy on testfile.

    Both files contain one document per line: the first whitespace-separated
    token is the category label, the remaining tokens are the document's words.
    NOTE(review): this is an exact duplicate of the earlier ``evaluate`` in
    this file — consider keeping only one definition.
    """
    # Load the training data.  `with` guarantees the handle is closed even on
    # error (the original left files open if an exception was raised).
    with codecs.open(trainfile, "r", "utf-8") as fp:
        trainData = [line.rstrip().split() for line in fp]

    # Train the Naive Bayes model.
    nb = NaiveBayes()
    nb.train(trainData)
    print(nb)  # was a Py2 print statement; rest of this file is Py3

    # Evaluate on the test data.
    hit = 0
    numTest = 0
    with codecs.open(testfile, "r", "utf-8") as fp:
        for line in fp:
            temp = line.rstrip().split()
            correct = temp[0]             # gold category
            words = temp[1:]              # document: bag of words
            predict = nb.classify(words)  # predicted category
            if correct == predict:
                hit += 1
            numTest += 1
    # NOTE(review): raises ZeroDivisionError on an empty test file — confirm
    # inputs are always non-empty.
    print("accuracy:", float(hit) / float(numTest))
def test_naivebayes(traindata, testdata):
    """Train NaiveBayes on raw-pixel features at 10%..100% of the training
    data and print the test accuracy at each training-set size."""
    # Raw pixel feature: one discrete domain {0.0, 0.5, 1.0} per pixel.
    n_pixels = traindata.width * traindata.height
    pixel_domains = [list(np.arange(0, 1.1, 0.5)) for _ in range(n_pixels)]

    for pct in range(10, 101, 10):
        print("Training with %d" % int(pct * traindata.number * 0.01))
        model = NaiveBayes(pixel_domains, traindata.labeldomain, 1)
        images, labels = traindata.orderedout(pct)
        model.train(images, labels)
        predictions = model.classify(testdata.images)
        score = Accuracy(predictions, testdata.labels)
        print(score)
def train(self, train_set):
    """Teaches the classifier with labeled data instances."""
    # Add every training document to the shared corpus first; both branches
    # below read from it (directly or via the documents' features).
    for d in train_set:
        self.corpus.add_doc(d)
    print 'Training on %d documents...\n' % len(train_set)
    if isinstance(self.classifier, NaiveBayes):
        # The custom NaiveBayes trains directly on the corpus object.
        self.classifier.train(self.corpus)
        # For each class that itself contains multiple sub-classes, train a
        # dedicated NaiveBayes sub-classifier, keyed by the class label.
        # NOTE(review): assumes corpus classes expose get_classes() /
        # get_label() — confirm against the corpus implementation.
        for c in self.corpus.get_classes():
            if len(c.get_classes()) > 1:
                subclassifier = NaiveBayes()
                subclassifier.train(c)
                self.subclassifiers[c.get_label()] = subclassifier
    else:
        # for nltk classifiers: they expect (feature_dict, label) pairs and
        # only the first label of each document is used.
        labeled_feature_set = [(d.get_features(), d.get_labels()[0])
                               for d in train_set]
        self.classifier.train(labeled_feature_set)
        # Sklearn classifiers
def trainTrendClassifier():
    """Train a NaiveBayes model on the categorized trends data and persist it.

    :return: None

    Reads the training folder from ``config['training']['trends']``, trains a
    NaiveBayes model on the documents/labels found there (stop-word removal
    and stemming enabled), and stores the trained model in the database for
    later classification.
    """
    # Fixed: the debug message previously logged "trainTrendsClassifier()",
    # which does not match this function's actual name.
    logger.debug("trainTrendClassifier()")
    trainingFolder = config['training']['trends']
    trainingDocs, trainingLabels = getData(trainingFolder)
    logger.debug("documents: " + str(len(trainingDocs)) +
                 ", labels: " + str(len(trainingLabels)))
    model = NaiveBayes()
    model.train(trainingDocs, trainingLabels, stopWordsFlag=True, stem=True)
    model.saveToDB()
def main():
    """Train a NaiveBayes classifier on a 60/40 split of the dataset named on
    the command line, print overall and per-class metrics, and save the model.

    argv[1]: dataset file, argv[2]: output model path.
    """
    dataset_path, model_path = sys.argv[1], sys.argv[2]
    classes = load_classes(dataset_path)
    train, test = split_dataset(classes, 0.6)

    clf = NaiveBayes()
    clf.train(train)

    # Overall metrics.  ("perfomance" [sic] is the NaiveBayes method name.)
    accuracy, recall, f1 = clf.perfomance(test)
    print("Total perfomance")
    print("================")
    print("Accuracy: %f" % accuracy)
    print("Recall: %f" % recall)
    print("F1: %f" % f1)
    print("\n")

    # Per-class metrics, one section per metric.
    sections = zip(
        ("Class accuracy", "Class recall", "Class F1"),
        clf.class_perfomance(test),
    )
    for title, metric in sections:
        print(title)
        print("================")
        for klass in metric:
            print("%s: %f" % (klass, metric[klass]))
        print("\n")

    clf.save(model_path)
print("Running test for model: {}".format(args.m)) # Our input $x$ TEXT = torchtext.data.Field() # Our labels $y$ LABEL = torchtext.data.Field(sequential=False, unk_token=None) # Generate train/test splits from the SST dataset, filter out neutral examples train, val, test = torchtext.datasets.SST.splits( TEXT, LABEL, filter_pred=lambda ex: ex.label != 'neutral') TEXT.build_vocab(train) LABEL.build_vocab(train) train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits((train, val, test), batch_size=10, device=-1, repeat = False) if args.m == "NaiveBayes": alpha = 1 model = NaiveBayes(alpha, TEXT, LABEL) model.train(train_iter, val_iter) # Evaluate on training set train_acc = model.evaluate(train_iter) print('final train_acc (should be very high): ', train_acc) # Evaluate on testing set # test_code_NB(model, test_iter) test_acc = model.evaluate(test_iter) print('final test_acc: ', test_acc)
##pos_class = "play:yes" ##pos_class = "play:no" ##datafile = "haireyescolor.txt" ##pos_class = "Sex:Male" ##pos_class = "Sex:Female" ##datafile = "../data/cmc-full.txt" ##pos_class = "contraceptive-method:none" ##pos_class = "contraceptive-method:long-term" ##pos_class = "contraceptive-method:short-term" d = Data(datafile) prnb = NaiveBayes(d) ##prnb = MaxAPost(d) prnb.train() pos = 0.0 neg = 0.0 for (v, c_true) in d.test_set: if c_true == pos_class: pos += 1 else: neg += 1 result_pos = [] result_neg = [] result_dif = [] result_nor = [] for (v, c_true) in d.test_set:
# NOTE(review): this fragment begins mid-call — the arguments below continue
# a split (e.g. sklearn train_test_split producing X_train/X_test/y_train/
# y_test) whose opening line is not visible here.
    df['text'], df['is_spam'], test_size=0.2, random_state=191)

# Summarize the class balance of the full set and of both splits.
print('Data set:')
print('{} total'.format(df.shape[0]))
for t, t_name in zip(targets, target_names):
    print('{} {}'.format(len(df[df['is_spam'] == t]), t_name))

print('\nTraining set:')
print('{} total'.format(len(X_train)))
for t, t_name in zip(targets, target_names):
    print('{} {}'.format(sum([y == t for y in y_train]), t_name))

print('\nTest set:')
print('{} total'.format(len(X_test)))
for t, t_name in zip(targets, target_names):
    print('{} {}'.format(sum([y == t for y in y_test]), t_name))
print('')

# Build Classifier: one model on the full vocabulary, one restricted to the
# 200 most frequent features.
gvoc_model = NaiveBayes('General Vocabulary', X_train, y_train, targets,
                        target_names)
gvoc_model.train()
gvoc_model.evaluate(X_test, y_test, show_top_features=10)

rvoc_model = NaiveBayes('Reduced Vocabulary', X_train, y_train, targets,
                        target_names, max_features=200)
rvoc_model.train()
rvoc_model.evaluate(X_test, y_test, show_top_features=10)
def stdmean():
    """Plot the std and mean of test accuracy (%) for Perceptron, NaiveBayes
    and NeuralNetwork on the digit and face datasets, as a function of the
    fraction of training data used.

    Each classifier is retrained several times on a fresh random subset
    (``shuffleout``) so the spread across runs can be measured.
    (Removed dead locals ``limit`` and the unused ``il`` trial-index lists.)
    """
    ratio = 0.8  # training ratio for Perceptron / NeuralNetwork
    times = 5    # repetitions per data-size point (NaiveBayes uses 3)

    print("digit")
    traindata, testdata = dataloader_digit()

    # --- digit: Perceptron ---
    sal, mal, pal = [], [], []
    for p in range(10, 101, 10):
        al = []
        for i in range(times):
            images, labels = traindata.shuffleout(p)
            pc = Perceptron(traindata.width * traindata.height,
                            traindata.labeldomain)
            pc.train(images, labels, 3, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            print(a * 100)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
    plt.plot(pal, sal, label="digitdata Perceptron std")
    plt.plot(pal, mal, label="digitdata Perceptron mean")

    # --- digit: NaiveBayes ---
    # One discrete domain {0.0, 0.5, 1.0} per pixel.
    feature_domians = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    sal, mal, pal = [], [], []
    for p in range(10, 101, 10):
        al = []
        for i in range(3):
            images, labels = traindata.shuffleout(p)
            nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
            nb.train(images, labels)
            x = nb.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
        print(a)
    plt.plot(pal, sal, label="digitdata NaiveBayes std")
    plt.plot(pal, mal, label="digitdata NaiveBayes mean")

    # --- digit: NeuralNetwork ---
    sal, mal, pal = [], [], []
    for p in range(10, 101, 10):
        al = []
        for i in range(times):
            images, labels = traindata.shuffleout(p)
            pc = NeuralNetwork((traindata.width * traindata.height, 15, 15,
                                len(traindata.labeldomain)),
                               traindata.labeldomain)
            pc.train(images, labels, 50, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
        print(a)
    plt.plot(pal, sal, label="digitdata NeuralNetwork std")
    plt.plot(pal, mal, label="digitdata NeuralNetwork mean")

    print("face")
    traindata, testdata = dataloader_face()

    # --- face: Perceptron ---
    sal, mal, pal = [], [], []
    for p in range(10, 101, 10):
        al = []
        for i in range(times):
            images, labels = traindata.shuffleout(p)
            pc = Perceptron(traindata.width * traindata.height,
                            traindata.labeldomain)
            pc.train(images, labels, 3, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            print(a * 100)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
    plt.plot(pal, sal, label="facedata Perceptron std")
    plt.plot(pal, mal, label="facedata Perceptron mean")

    # --- face: NaiveBayes ---
    feature_domians = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    sal, mal, pal = [], [], []
    for p in range(10, 101, 10):
        al = []
        for i in range(3):
            images, labels = traindata.shuffleout(p)
            nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
            nb.train(images, labels)
            x = nb.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
        print(a)
    plt.plot(pal, sal, label="facedata NaiveBayes std")
    plt.plot(pal, mal, label="facedata NaiveBayes mean")

    # --- face: NeuralNetwork ---
    sal, mal, pal = [], [], []
    for p in range(10, 101, 10):
        al = []
        for i in range(times):
            images, labels = traindata.shuffleout(p)
            pc = NeuralNetwork((traindata.width * traindata.height, 15, 15,
                                len(traindata.labeldomain)),
                               traindata.labeldomain)
            pc.train(images, labels, 50, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
        print(a)
    plt.plot(pal, sal, label="facedata NeuralNetwork std")
    plt.plot(pal, mal, label="facedata NeuralNetwork mean")

    leg = plt.legend(ncol=1, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.xlabel("data size percentage")
    # Fixed: the y-axis previously said "time(in second)" (copy-pasted from
    # the timing experiment); this figure plots accuracy statistics.
    plt.ylabel("accuracy (%)")
    plt.show()
def timeana():
    """Plot training+evaluation wall-clock time vs. training-set size for
    Perceptron, NaiveBayes and NeuralNetwork on the digit and face datasets.

    Perceptron / NeuralNetwork are trained one epoch at a time until their
    test accuracy exceeds ``limit`` (or ``times`` epochs elapse); NaiveBayes
    is timed over a single train+classify pass.
    """
    import time
    limit = 0.7  # accuracy threshold that stops iterative training
    ratio = 1
    times = 200  # maximum training epochs for the iterative learners

    print("digit")
    traindata, testdata = dataloader_digit()

    # --- digit: Perceptron ---
    fal = []
    pal = []
    for p in range(20, 101, 10):
        images, labels = traindata.orderedout(p)
        start = time.time()
        pc = Perceptron(traindata.width * traindata.height,
                        traindata.labeldomain)
        for i in range(times):
            pc.train(images, labels, 1, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            print(a * 100)
            if a > limit:
                end = time.time()
                break
        else:
            # Fixed: if the threshold was never reached, ``end`` was left
            # undefined (NameError) or stale from a previous data-size
            # point, producing a bogus (even negative) elapsed time.
            end = time.time()
        fal.append(end - start)
        pal.append(p)
    plt.plot(pal, fal, label="digitdata Perceptron")

    # --- digit: NaiveBayes ---
    # One discrete domain {0.0, 0.5, 1.0} per pixel.
    feature_domians = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    fal = []
    pal = []
    for p in range(20, 101, 10):
        start = time.time()
        nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
        images, labels = traindata.orderedout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        end = time.time()
        fal.append(end - start)
        pal.append(p)
        print(a)
    plt.plot(pal, fal, label="digitdata NaiveBayes")

    # --- digit: NeuralNetwork ---
    fal = []
    pal = []
    for p in range(20, 101, 10):
        images, labels = traindata.orderedout(p)
        start = time.time()
        pc = NeuralNetwork((traindata.width * traindata.height, 15, 15,
                            len(traindata.labeldomain)),
                           traindata.labeldomain)
        for i in range(times):
            pc.train(images, labels, 1, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            print(a * 100)
            if a > limit:
                end = time.time()
                break
        else:
            end = time.time()  # threshold never reached (see fix above)
        fal.append(end - start)
        pal.append(p)
    plt.plot(pal, fal, label="digitdata NeuralNetwork")

    print("face")
    traindata, testdata = dataloader_face()

    # --- face: Perceptron ---
    fal = []
    pal = []
    for p in range(20, 101, 10):
        images, labels = traindata.orderedout(p)
        start = time.time()
        pc = Perceptron(traindata.width * traindata.height,
                        traindata.labeldomain)
        for i in range(times):
            pc.train(images, labels, 1, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            print(a * 100)
            if a > limit:
                end = time.time()
                break
        else:
            end = time.time()  # threshold never reached (see fix above)
        fal.append(end - start)
        pal.append(p)
    plt.plot(pal, fal, label="facedata Perceptron")

    # --- face: NaiveBayes ---
    feature_domians = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    fal = []
    pal = []
    for p in range(20, 101, 10):
        start = time.time()
        nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
        images, labels = traindata.orderedout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        end = time.time()
        fal.append(end - start)
        pal.append(p)
        print(a)
    plt.plot(pal, fal, label="facedata NaiveBayes")

    # --- face: NeuralNetwork ---
    fal = []
    pal = []
    for p in range(20, 101, 10):
        images, labels = traindata.orderedout(p)
        start = time.time()
        pc = NeuralNetwork((traindata.width * traindata.height, 15, 15,
                            len(traindata.labeldomain)),
                           traindata.labeldomain)
        for i in range(times):
            pc.train(images, labels, 1, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            print(a * 100)
            if a > limit:
                end = time.time()
                break
        else:
            end = time.time()  # threshold never reached (see fix above)
        fal.append(end - start)
        pal.append(p)
    plt.plot(pal, fal, label="facedata NeuralNetwork")

    leg = plt.legend(ncol=1, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.xlabel("data size percentage")
    plt.ylabel("time(in second)")
    plt.show()
def test_naivebayes_argmax_all():
    """Plot NaiveBayes test accuracy vs. training-set size (10%..100%) for
    the digit and face datasets, comparing ordered vs. shuffled subsets.

    The four copy-pasted sections of the original are factored into one
    local helper; behavior (prints, plot order, labels) is unchanged.
    """

    def run_curve(traindata, testdata, feature_domains, sampler, label):
        # One accuracy-vs-percentage curve; ``sampler`` is either
        # traindata.orderedout or traindata.shuffleout.
        fal = []
        pal = []
        for p in range(10, 101, 10):
            print("Training with %d" % int(p * traindata.number * 0.01))
            nb = NaiveBayes(feature_domains, traindata.labeldomain, 1)
            images, labels = sampler(p)
            nb.train(images, labels)
            x = nb.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            fal.append(a * 100)
            pal.append(p)
            print(a)
        plt.plot(pal, fal, label=label)

    traindata, testdata = dataloader_digit()
    # Digit pixels use the discrete domain {0.0, 0.5, 1.0}.
    digit_domains = [[i for i in np.arange(0, 1.1, 0.5)]
                     for _ in range(traindata.width * traindata.height)]
    run_curve(traindata, testdata, digit_domains,
              traindata.orderedout, "digitdata order")
    run_curve(traindata, testdata, digit_domains,
              traindata.shuffleout, "digitdata random")

    traindata, testdata = dataloader_face()
    # Face pixels are binary.
    face_domains = [[0, 1] for _ in range(traindata.width * traindata.height)]
    run_curve(traindata, testdata, face_domains,
              traindata.orderedout, "facedata order")
    run_curve(traindata, testdata, face_domains,
              traindata.shuffleout, "facedata random")

    leg = plt.legend(ncol=1, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.xlabel("data size percentage")  # fixed typo: was "precentage"
    plt.ylabel("accuracy")
    plt.show()
# Build a sentiment NaiveBayes model from two CSV datasets, then apply it to
# every table under ../Resources/CleanTables and report negative-tweet rates.
# NOTE(review): ``input`` here is a project module (it shadows the builtin).
(trainingData, trainingSentiment, testData, testSentiment) = input.extract(
    "../Resources/SentimentAnalysisDataset.csv", 2, 1)
(td, ts, ted, tes) = input.extract("../Resources/SenterTrainingData.csv", 5, 0)

# Collapse labels to binary: "0" stays "0", everything else becomes "1".
ts = ["0" if t == "0" else "1" for t in ts]
tes = ["0" if t == "0" else "1" for t in tes]
print(ts)
print("good")
print(tes)

# Merge the second dataset's test portion into the main test set.
# NOTE(review): its training portion (td/ts) is never used for training.
testData += ted
testSentiment += tes

bayes = NaiveBayes()
bayes.train(trainingData, trainingSentiment, ["0", "1"])
correct = 0
# NOTE(review): the accuracy evaluation below is disabled, so ``correct``
# is never updated.
#for i in range(len(testData)):
#    ans = bayes.classify(testData[i])
    #if(ans == (testSentiment[i] == "1")):
#    if(ans == testSentiment[i]):
#        correct += 1
#print(float(correct)/len(testData) * 100)

# Classify every cleaned table with the trained model, then print the share
# of negative tweets per table.
input.DoToAll("../Resources/CleanTables", input.writeTo, parameters=bayes)
print("Done")
input.DoToAll("../Resources/CleanTables", input.percent_negative_tweets)
#print(input.percent_negative_tweets("cleanTable/Accessbank.csv"))
#print(input.percent_negative_tweets("cleanTable/DiamondBank.csv"))
#print(input.percent_negative_tweets("cleanTable/EcoBank.csv"))
#print(input.percent_negative_tweets("cleanTable/FCMB.csv"))
# Interactive labeling loop for the tweets in ``r``: ask the user whether
# each tweet is "basic", retrain the NaiveBayes model on the labels, then do
# a second pass that also shows the model's prediction for each tweet.
# NOTE(review): ``r`` and ``training_set`` must be defined earlier in the
# file; this is Python 2 code (print statement, raw_input).
nb = NaiveBayes()
nb.load()  # start from the previously saved model state
cin = ''
for item in r:
    print item['text']
    cin = raw_input('Basic? Y/n/quit: ')
    if cin == 'n':
        training_set.append(('non-basic', item['text']))
    elif cin == 'quit':
        break
    else:
        # Any other answer (including plain Enter / 'Y') counts as basic.
        training_set.append(('basic', item['text']))
nb.train(training_set)
nb.save()
# Second pass: show the model's guess alongside each tweet while continuing
# to collect labels.
for item in r:
    print item['text']
    print nb.classify(item['text'])
    cin = raw_input('Basic? Y/n/quit: ')
    if cin == 'n':
        training_set.append(('non-basic', item['text']))
    elif cin == 'quit':
        break
    else:
        training_set.append(('basic', item['text']))
# NOTE(review): labels gathered in the second pass are saved without a
# retrain — confirm whether nb.train(training_set) was intended here too.
nb.save()
from data import Data
from naivebayes import NaiveBayes

# Pick exactly one dataset for this demo run.
filename = "datasets/weatherNominal.td"
## filename = "datasets/titanic.td"
## filename = "datasets/cmc.td"

dataset = Data(filename)
dataset.report()

model = NaiveBayes(dataset)
model.train()
model.show()

# Predict every held-out test instance and show the prediction next to the
# true class.
for sample, true_class in dataset.test_set:
    predicted = model.predict(sample)[0]
    print(sample, ":")
    print(" ", predicted, "( true class:", true_class, ")")

## print(model.predict(("Class:1st","Sex:Female","Age:Child")))
## print(model.predict(("Class:Crew","Sex:Female","Age:Child")))