def main(argv):
    training_set = read_csv(argv[1])
    test_set = read_csv(argv[2])
    algorithm = argv[3].upper()
    if algorithm == 'NB':
        nb = NaiveBayes()
        nb.calculate_nb(training_set, test_set)
    else:
        # Pull a leading integer out of strings such as "5NN".
        int_match = re.findall(r'\d*', algorithm)
        if int_match[0]:  # findall returns '' (not None) when there is no digit
            algorithm = algorithm.strip(int_match[0])
            k = int(int_match[0])
            if algorithm == 'NN':
                nn = KNearestNeighbor()
                nn.calculate_knn(training_set, test_set, k)
    exit()
def train(self, in_file, out_file):
    # If learned data (pkl) already exists, load it.
    if os.path.exists(out_file):
        with open(out_file, 'rb') as f:
            nb = pickle.load(f)
    else:
        # Otherwise read the training data and learn from scratch.
        nb = NaiveBayes()
        with codecs.open(in_file, 'r', 'utf-8') as fin:
            lines = fin.readlines()
        items = []
        for line in lines:
            words = line[:-2]  # drop the trailing line-ending characters
            train_words = words.split('@')
            items.append(train_words[1])
            nb.train(train_words[1], train_words[0])
        # Save the learned data to the pkl file.
        with open(out_file, 'wb') as f:
            pickle.dump(nb, f)
    return nb
def make_data_percentage_graph(input_tweets_file, input_labels_file):
    """
    Creates a graph showing the relationship between test-set size and accuracy.
    Testing is done on 20-85 percent of the dataset, in increments of 5 percent.
    Alpha is kept at its default value of 1.0.
    """
    plt.title("Accuracy of Naive Bayes with varied test data size and fixed alpha")
    plt.xlabel("Percentage of data used for testing")
    plt.ylabel("Accuracy Achieved")
    values = [20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0,
              55.0, 60.0, 65.0, 70.0, 75.0, 80.0, 85.0]
    accuracies = []
    for val in values:
        nb = NaiveBayes(input_tweets_file, input_labels_file, 1.0, val / 100.0)
        nb.update_model()
        accuracies.append(nb.evaluate_classifier_accuracy())
    plt.plot(values, accuracies)
    # Save the figure in the Figures directory.
    file_name = "Figures/accuracy_varied_test_ratio_" + input_tweets_file[8:10] + ".jpeg"
    plt.savefig(file_name)
    # Close to reset the figure for the next plot.
    plt.close()
def make_accuracy_graph(input_tweets_file, input_labels_file):
    """
    Creates a graph showing the relationship between alpha and accuracy.
    Alpha values from 1 to 50 are tried.
    The test-set size is kept at its default of 20 percent.
    """
    plt.title("Accuracy of Naive Bayes with varied alpha and fixed test set size")
    plt.xlabel("Alpha Value")
    plt.ylabel("Accuracy Achieved")
    accuracies = []
    for i in range(1, 51):
        nb = NaiveBayes(input_tweets_file, input_labels_file, i)
        nb.update_model()
        accuracies.append(nb.evaluate_classifier_accuracy())
    plt.plot(range(1, 51), accuracies)
    # Save the figure in the Figures directory.
    file_name = "Figures/accuracy_varied_alpha_" + input_tweets_file[8:10] + ".jpeg"
    plt.savefig(file_name)
    # Close to reset the figure for the next plot.
    plt.close()
def bindListenerVal(self, event):
    # Reconstruct what the text field will contain after this keystroke.
    if event.keycode == 8:  # Backspace
        self.word = str(self.bins_textField.get())[:-1]
    else:
        try:
            self.word = str(self.bins_textField.get() + str(event.char))
        except Exception:
            self.word = str(self.bins_textField.get())
    if (self.word.isdigit() and int(self.word) > 0) or len(self.word) == 0:
        self.inputBindAlret.configure(foreground="#3e5d93")
        if self.word.isdigit():
            self.bindValOk = True
            if self.directoryValOk:
                self.Build_button.configure(state='normal')
                self.NB = NaiveBayes()
        else:
            self.bindValOk = False
            self.Build_button.configure(state='disabled')
    else:
        self.inputBindAlret.configure(foreground="#ffffffffffff")
        self.bindValOk = False
        self.Build_button.configure(state='disabled')
def folderBrowseAction(self):
    dirWind = tk.Tk()
    dirWind.withdraw()
    path = askdirectory()
    if len(str(self.directory_textField.get())) != 0:
        self.directory_textField.delete(0, 'end')
    self.directory_textField.insert(0, str(path))
    dirWind.destroy()
    if not os.path.isdir(self.directory_textField.get()):
        self.directoryValOk = False
        if len(self.directory_textField.get()) != 0:
            messagebox.showerror('oops!', 'Please insert a valid directory path!')
            self.directory_textField.delete(0, 'end')
    else:
        base = self.directory_textField.get()
        if (not os.path.exists(base + "/train.csv")
                or not os.path.exists(base + "/test.csv")
                or not os.path.exists(base + "/Structure.txt")):
            self.directoryValOk = False
            if len(base) != 0:
                messagebox.showerror(
                    'oops!',
                    '~~ MISSING FILES ~~\n\nMake sure that the files:\n'
                    'train.csv,\ntest.csv\nStructure.txt\nexist in this path!')
        else:
            self.directoryValOk = True
            if self.bindValOk:
                self.Build_button.configure(state='normal')
                self.NB = NaiveBayes()
def valuesCheckButtonAbillity(self, event):
    if not os.path.isdir(self.directory_textField.get()):
        self.directoryValOk = False
        if len(self.directory_textField.get()) != 0:
            messagebox.showerror('oops!', 'Please insert a valid directory path!')
            self.directory_textField.delete(0, 'end')
    else:
        base = self.directory_textField.get()
        if (not os.path.exists(base + "/train.csv")
                or not os.path.exists(base + "/test.csv")
                or not os.path.exists(base + "/Structure.txt")):
            self.directoryValOk = False
            if len(base) != 0:
                messagebox.showerror(
                    'oops!',
                    '~~ MISSING FILES ~~\n\nMake sure that the files:\n'
                    'train.csv,\ntest.csv\nStructure.txt\nexist in this path!')
        else:
            self.directoryValOk = True
            if self.bindValOk:
                self.Build_button.configure(state='normal')
                self.NB = NaiveBayes()
def use_nb2(datas_train, datas_valid):
    nb = NaiveBayes()
    predicts_all = []
    correct_all = 0
    # for cur_part in range(1, partition + 1):
    nb.train(datas_train)
    predicts = [nb.predict(data['content'])[0] for data in datas_valid]
    predicts_all += predicts
    correct = 0
    for i in range(len(predicts)):
        if predicts[i] == datas_valid[i]['category']:
            correct += 1
    correct_all += correct
    print("Correct: ", correct, "out of ", len(datas_valid))
    print(datas_valid[-1]['date'])
    # Leftover from the partitioned variant; datas, plen and cur_part are not
    # defined here, so these lines stay commented out.
    # datas_train = datas[min(len(datas), plen*cur_part) : min(len(datas), plen*(cur_part+1))]
    # datas_valid = datas[min(len(datas), plen*(cur_part+1)) : min(len(datas), plen*(cur_part+2))]
    # for i in range(len(datas_train)):
    #     datas_train[i]['category'] = predicts[i]
    res = "Correct: %d out of %d" % (correct_all, len(predicts_all))
    print(res)
    return res
def __init__(self, *args, **kwargs):
    super(SmartMatch, self).__init__(*args, **kwargs)
    self.datapaths = {}
    self.monitor_thread = hub.spawn(self._monitor)
    self.naive_bayes = NaiveBayes()
    self.naive_bayes.init_classifier()
    self.logger.info('instantiated Naive Bayes classifier, its accuracy score is: %0.2f',
                     self.naive_bayes.get_accuracy_score())
    self.flow_container = {}
def run_nb():
    print("Importing Naive Bayes...")
    from NaiveBayes import NaiveBayes
    print("Successfully Imported Naive Bayes.")
    print("Running...")
    nb_obj = NaiveBayes(file_paths.us_tweets_path, file_paths.us_labels_path)
    nb_obj.update_model()
    print("Execution Complete. Accuracy: " +
          str(nb_obj.evaluate_classifier_accuracy()) + " %")
def train(self, dataSet, CHigher):
    self.assignSensitivity(dataSet)
    dsX = self.splitDataFrame(dataSet, self.Sx)
    dsY = self.splitDataFrame(dataSet, self.Sy)
    NaiveBayes.train(self, dsX, self.modelX)
    NaiveBayes.train(self, dsY, self.modelY)
    self.modify(dataSet, CHigher)
def __init__(self, paramlist):
    # File names in the validation corpus.
    self.validationcorpus = []
    # File names in the training corpus.
    self.trainingcorpus = []
    self.createvalandtrain()
    # Classifier objects.
    self.svm = SVM_classifier()
    self.nb = NaiveBayes()
    logging.basicConfig(filename="test.log", level=logging.DEBUG)
def naive_bayes(self):
    nb = NaiveBayes()
    accuracy_score = 0
    plot = False
    train_data, test_data = self.kfold_split(self.k)
    for i in range(self.k):
        classifier = nb.train(train_data[i])
        if i == 9:
            plot = True
        accuracy_score = accuracy_score + nb.test(classifier, test_data[i], plot)
    return accuracy_score / self.k
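# Note: kfold_split() is defined elsewhere in that project. Purely for orientation,
# here is a minimal sketch of what such a splitter might look like -- the shuffling,
# the numpy usage, and the return shape (parallel lists of per-fold train/test data)
# are assumptions, not the project's actual code.
import numpy as np

def kfold_split(self, k, seed=0):
    """Split self.data into k folds; fold i is the test fold, the rest is training."""
    rng = np.random.RandomState(seed)
    indices = rng.permutation(len(self.data))
    folds = np.array_split(indices, k)
    train_data, test_data = [], []
    for i in range(k):
        test_idx = folds[i]
        train_idx = np.concatenate([folds[j] for j in range(k) if j != i])
        train_data.append([self.data[idx] for idx in train_idx])
        test_data.append([self.data[idx] for idx in test_idx])
    return train_data, test_data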
def main():
    data, target = DataSetFileReader.read_dataset_file('data/SMSSpamCollection')
    classifier = NaiveBayes()
    classifier.fit(data, target)
    input_data = DataSetFileReader.read_input_data_file('data/inputdata')
    result = classifier.predict(input_data)
    for pred, msg in zip(result, input_data):
        print('{0} -> {1}'.format(pred.upper(), msg))
def main():
    attributes_train, data_train = read_from_file("train.txt")
    # Decision tree (DTL)
    dtl = DecisionTree()
    tree = dtl.build(data_train, attributes_train)
    with open("output_tree.txt", "w") as file:
        tree_string = dtl.write_tree_to_file(tree, attributes_train, 0)
        file.write(tree_string[:-1])
    # KNN
    knn = KNearestNeighbors(attributes_train, data_train)
    # Naive Bayes
    naive_bayes = NaiveBayes(attributes_train, data_train)
    attribute_text, data_test = read_from_file("test.txt")
    knn_result = []
    naive_bayes_result = []
    dtl_result = []
    real_classify = []
    for line in data_test:
        real_classify.append(line[-1])
        entry = line[:-1]
        knn_result.append(knn.predict(entry, 5))
        naive_bayes_result.append(naive_bayes.predict(entry))
        dtl_result.append(dtl.predict(tree, entry, attribute_text))
    acc_knn = 0
    acc_nb = 0
    acc_dtl = 0
    # Compute accuracies (loop variables renamed so they do not shadow the classifiers).
    for dtl_pred, knn_pred, nb_pred, real in zip(dtl_result, knn_result,
                                                 naive_bayes_result, real_classify):
        if dtl_pred == real:
            acc_dtl += 1
        if knn_pred == real:
            acc_knn += 1
        if nb_pred == real:
            acc_nb += 1
    acc_knn /= len(real_classify)
    acc_nb /= len(real_classify)
    acc_dtl /= len(real_classify)
    acc_knn = math.ceil(acc_knn * 100) / 100.0
    acc_nb = math.ceil(acc_nb * 100) / 100.0
    acc_dtl = math.ceil(acc_dtl * 100) / 100.0
    with open('output.txt', 'w') as output:
        output.write("Num\tDT\tKNN\tnaiveBase\n")
        for i, (a, b, c) in enumerate(zip(dtl_result, knn_result, naive_bayes_result)):
            output.write(str(i + 1) + "\t" + a + "\t" + b + "\t" + c + "\n")
        output.write("\t" + str(acc_dtl) + "\t" + str(acc_knn) + "\t" + str(acc_nb) + "\n")
def buildSplits(numFolds, args):
    """Builds the splits for training/testing."""
    splits = []
    trainDir = args[0]
    if len(args) == 1:
        print '[INFO]\tPerforming %d-fold cross-validation on data set:\t%s' % (numFolds, trainDir)
        posTrainFileNames = os.listdir('%s/pos/' % trainDir)
        negTrainFileNames = os.listdir('%s/neg/' % trainDir)
        for fold in range(0, numFolds):
            split = NaiveBayes.TrainSplit()
            for fileName in posTrainFileNames:
                example = NaiveBayes.Example()
                example.words = readFile('%s/pos/%s' % (trainDir, fileName))
                example.klass = 'pos'
                if fileName[2] == str(fold):
                    split.test.append(example)
                else:
                    split.train.append(example)
            for fileName in negTrainFileNames:
                example = NaiveBayes.Example()
                example.words = readFile('%s/neg/%s' % (trainDir, fileName))
                example.klass = 'neg'
                if fileName[2] == str(fold):
                    split.test.append(example)
                else:
                    split.train.append(example)
            splits.append(split)
    elif len(args) == 2:
        split = NaiveBayes.TrainSplit()
        testDir = args[1]
        print '[INFO]\tTraining on data set:\t%s testing on data set:\t%s' % (trainDir, testDir)
        posTrainFileNames = os.listdir('%s/pos/' % trainDir)
        negTrainFileNames = os.listdir('%s/neg/' % trainDir)
        for fileName in posTrainFileNames:
            example = NaiveBayes.Example()
            example.words = readFile('%s/pos/%s' % (trainDir, fileName))
            example.klass = 'pos'
            split.train.append(example)
        for fileName in negTrainFileNames:
            example = NaiveBayes.Example()
            example.words = readFile('%s/neg/%s' % (trainDir, fileName))
            example.klass = 'neg'
            split.train.append(example)
        posTestFileNames = os.listdir('%s/pos/' % testDir)
        negTestFileNames = os.listdir('%s/neg/' % testDir)
        for fileName in posTestFileNames:
            example = NaiveBayes.Example()
            example.words = readFile('%s/pos/%s' % (testDir, fileName))
            example.klass = 'pos'
            split.test.append(example)
        for fileName in negTestFileNames:
            example = NaiveBayes.Example()
            example.words = readFile('%s/neg/%s' % (testDir, fileName))
            example.klass = 'neg'
            split.test.append(example)
        splits.append(split)
    return splits
def compare(filename):
    # filename will be Tp1_data.csv
    showPlots = False
    Xs, Ys = get_data(filename)
    X_r, X_t, Y_r, Y_t = train_test_split(Xs, Ys, test_size=0.33, stratify=Ys)
    folds = 5
    Kf = StratifiedKFold(Y_r, n_folds=folds)
    KnnErr, bestN, KnnPred = Knn(Kf, X_r, Y_r, X_t, Y_t, showPlots)
    print("KnnErr, best_N:", KnnErr, bestN)
    LogScore, bestC, LogPred = Logistic(Kf, X_r, Y_r, X_t, Y_t, showPlots)
    print("LogisticScore, best_C:", LogScore, bestC)
    NBScore, bestBandwidth, NBPred = NaiveBayes(Kf, X_r, Y_r, X_t, Y_t, showPlots)
    print("NBScore, best_Bandwidth:", NBScore, bestBandwidth)
    # McNemar statistic: (|e01 - e10| - 1)^2 / (e01 + e10)
    MCNemarKnn_Log = MCNemar(KnnPred, LogPred, Y_t)
    MCNemarNB_Log = MCNemar(NBPred, LogPred, Y_t)
    MCNemarNB_Knn = MCNemar(KnnPred, NBPred, Y_t)
    print()
    print("McNemar:")
    print("MCNemarKnn_Log", MCNemarKnn_Log)
    print("MCNemarNB_Log", MCNemarNB_Log)
    print("MCNemarNB_Knn", MCNemarNB_Knn)
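# For reference, a minimal sketch of the MCNemar helper assumed above, implementing
# the statistic from the inline comment: (|e01 - e10| - 1)^2 / (e01 + e10).
# The name, signature and return convention are assumptions, not the original code.
import numpy as np

def MCNemar(pred_a, pred_b, y_true):
    """McNemar's test statistic for comparing two classifiers on the same test set."""
    pred_a, pred_b, y_true = np.asarray(pred_a), np.asarray(pred_b), np.asarray(y_true)
    e01 = np.sum((pred_a == y_true) & (pred_b != y_true))  # only the first classifier is right
    e10 = np.sum((pred_a != y_true) & (pred_b == y_true))  # only the second classifier is right
    if e01 + e10 == 0:
        return 0.0
    return (abs(e01 - e10) - 1) ** 2 / float(e01 + e10)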
def buildTestCorpus(ch_aux):
    """Takes doc1\n###\ndoc2\n###... and makes a list of documents.
    Build their NB, train on train, output pos\nneg\npos...
    """
    # Split on ###.
    testSplit = NaiveBayes.TrainSplit()
    documents = ch_aux.split('###')
    for document in documents:
        document = document.strip()       # remove leading/trailing newlines
        example = NaiveBayes.Example()    # example for this document
        example.klass = 'UNK'             # at test time we do not know the label
        example.words = []
        for word in document.split():     # for every token
            example.words.append(word)
        testSplit.test.append(example)
    return testSplit
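# A minimal usage sketch of the corpus format buildTestCorpus() expects; the document
# texts below are invented for illustration, while the '###' separator and the 'UNK'
# placeholder label come from the function above.
ch_aux = "a wonderful and moving film\n###\ndull plot and wooden acting"
split = buildTestCorpus(ch_aux)
# split.test now holds two Example objects, each with klass == 'UNK' and
# words == the whitespace-delimited tokens of one document.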
def __init__(self, ntrees, nbayes, pruneSplit=0.0):
    super().__init__()
    self.classifiers = []
    for i in range(ntrees):
        self.classifiers.append(DecisionTree(pruneSplit=pruneSplit))
    for i in range(nbayes):
        self.classifiers.append(NaiveBayes())
def processData(self, modelName, gender, pClass, siblings, embarked):
    # Load the dataset.
    df = pd.read_csv('train.csv', sep=',')
    # Drop the passenger id.
    df = df.drop('PassengerId', axis=1)
    # Map string columns to numeric values.
    df["Sex"].replace({"male": 0, "female": 1}, inplace=True)
    df["Embarked"].replace({"S": 0, "C": 1, "Q": 2}, inplace=True)
    # Fill in missing values.
    df["Embarked"].fillna(df["Embarked"].mean(), inplace=True)
    # Separate inputs and outputs.
    x = df.drop('Survived', axis=1)
    y = df['Survived']
    if modelName == 'Decision Tree':
        model = DecisionTree(df)
    elif modelName == 'Naive Bayes':
        model = NaiveBayes(df)
    elif modelName == 'Neural Network':
        model = NeuralNetwork(df)
    elif modelName == 'Random Forest':
        model = RandomForest(df)
    else:
        model = SupportVector(df)
    return model
def train(self):
    """Train the entity classifier."""
    for i in range(0, 6):
        ori_labels = self.label_sets[i]
        print("[Training Entity Classifier with VLSP 2018]")
        # Instantiate a NaiveBayes object for this label set.
        self.nb = NaiveBayes(np.unique(ori_labels))
        print("---------------- Training In Progress --------------------")
        # Start training via the cross-validation routine.
        self.nb.cross_validation(self.ori_data, ori_labels)
        self.classifiers.append(self.nb)
        print('----------------- Training Completed ---------------------')
def inspect(fname, dbname):
    classifier = NaiveBayes()
    ds_builder = DataSet(classifier)
    ds, labels, attributes = ds_builder.ReadDataSet(dbname)
    # purified_ds = ds_builder.PurifyDataSet(ds)
    trained_ds = classifier.TrainingDataSet(ds, labels, attributes)
    data_str = get_data(fname, dbname)
    data_set = data_str.split()
    for i in range(len(data_set)):
        data_set[i] = float(data_set[i])
    result = classifier.InspectData(ds, trained_ds, data_set)
    return result
def get_classifier_object(self):
    # Earlier variants (LogReg, DeciTree, svm) have been commented out of this dispatch.
    if self.classifier_name == 'RForest':
        self.clf = RandomForest(self.x_train, self.y_train, self.x_test, self.y_test)
        self.clf.train()
        self.y_pred = self.clf.predict()
    elif self.classifier_name == 'XGB':
        self.clf = XGBoost(self.x_train, self.y_train, self.x_test, self.y_test)
        self.clf.train()
        self.y_pred = self.clf.predict()
    elif self.classifier_name == 'NaiveBayes':
        self.clf = NaiveBayes(self.x_train, self.y_train, self.x_test, self.y_test)
        self.clf.train()
        self.y_pred = self.clf.predict()
    elif self.classifier_name == 'AdaBoost':
        self.clf = AdaBoost(self.x_train, self.y_train, self.x_test, self.y_test)
        self.clf.train()
        self.y_pred = self.clf.predict()
    return self.clf.get_classifier()
def load_from_model(self, model_name):
    """
    Loads the per-class dictionaries from the given model, starts the GUI,
    waits for the button press and then classifies the entered sentence.

    :param model_name: model from which the data should be loaded.
    """
    with open(model_name, "r") as read_file:
        json_load = json.load(read_file)
    if json_load["namepriz"] == "BagOfWords":
        self.priz_metoda = BagOfWords()
    elif json_load["namepriz"] == "TfIdf":
        self.priz_metoda = TfIdf()
    elif json_load["namepriz"] == "NGram":
        self.priz_metoda = NGram()
    self.priz_metoda.words = json_load["words"]
    self.priz_metoda.klas_tridy = json_load["klas_tridy"]
    self.priz_metoda.prior = json_load["prior"]
    if json_load["nameklas"] == "NaiveBayes":
        self.klasifikator = NaiveBayes(self.priz_metoda)
    elif json_load["nameklas"] == "NN":
        self.klasifikator = NN(self.priz_metoda)
    self.top.title("Classify")
    self.top.geometry('400x300')
    buttonCommit = Button(self.top, height=1, width=10, text="Commit",
                          command=lambda: self.retrieve_input())
    self.text1.pack()
    buttonCommit.pack()
    self.label.pack()
    self.top.mainloop()
def initiate_nb():
    conn = sqlite3.connect("FakeNews.sqlite")
    c = conn.cursor()
    c.execute("Select * from FakeNewsTbl")
    results = c.fetchall()
    x_train = [row[2] for row in results]
    y_train = [row[1] for row in results]
    conn.close()
    train_data, test_data, train_labels, test_labels = train_test_split(
        x_train, y_train, shuffle=True, test_size=0.25, random_state=42, stratify=y_train)
    classes = np.unique(train_labels)
    # Training phase.
    nb = NaiveBayes(classes)
    nb.train(train_data, train_labels)
    return nb
def leave_one_out(x, y, is_continuous):
    """
    Evaluate the model with leave-one-out cross-validation.
    :return: the model's accuracy under leave-one-out validation.
    """
    cnt = 0
    for i in range(len(x)):
        train_x = np.delete(x, i, 0)
        train_y = np.delete(y, i, 0)
        test_x = x[i]
        test_y = y[i]
        naive_bayes = NaiveBayes(train_x, train_y, is_continuous)
        naive_bayes.train()
        test_result = naive_bayes.inference(test_x)
        if test_result == test_y:
            cnt += 1
    return cnt / len(x)
def get_model(model):
    """
    Load a model from a JSON file; NaiveBayes.test() can then be called on it.
    """
    with open(model, encoding='utf-8') as json_file:
        data = json.load(json_file)
    classes = np.asarray(data["classes"])
    # JSON keys are strings, so convert them back to integer indices.
    cates_info = {int(k): v for k, v in data["cates_info"].items()}
    for cate_index, cate in enumerate(classes):
        cates_info[cate_index] = {int(k): v for k, v in cates_info[cate_index].items()}
    nb = NaiveBayes(classes)
    nb.cates_info = cates_info
    return nb
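# For orientation, the JSON layout get_model() appears to expect: a "classes" array
# plus a "cates_info" mapping keyed by stringified class indices, whose inner keys are
# also stringified integers. The concrete values below are invented for illustration.
example_model_json = {
    "classes": ["sports", "politics"],
    "cates_info": {
        "0": {"15": 0.002, "42": 0.013},   # per-class statistics keyed by feature id
        "1": {"15": 0.008, "42": 0.001},
    },
}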
def main():
    n = int(input('enter no of training data sentences: '))
    X, Y = [], []
    for i in range(n):
        words_i = input('enter ' + str(i) + 'th sentence: ').strip().split(' ')
        X.append(words_i)
        Y.append(input('enter the class of sentence: ').strip())
    clean_obj = Clean()
    clean_obj.feature_extract(X, n)
    clean_obj.print_features()
    X = clean_obj.transform_X(X, n)
    Y = clean_obj.transform_Y(Y)
    print('X is:', X)
    print('Y is:', Y)
    clf = NaiveBayes()
    clf.train_text(X, Y)
    clf.printdictionaries()
    m = int(input('enter number of testing entries: '))
    Xtest = []
    for i in range(m):
        test_words_i = input('enter ' + str(i + 1) + 'th sentence: ').strip().split(' ')
        Xtest.append(test_words_i)
    Xtest_trans = clean_obj.transform_X(Xtest, m)
    print(Xtest_trans)
    for i in range(m):
        print(clf.predict_text(Xtest_trans[i]))
def main(argv):
    trainfile = ''
    testfile = ''
    mode = ''
    # validate input
    if len(sys.argv) == 4:
        trainfile = sys.argv[1]
        testfile = sys.argv[2]
        mode = sys.argv[3]
    else:
        print("incorrect input supplied")
        sys.exit()
    # ingest data
    trainset = readFile(trainfile)
    testset = readFile(testfile)
    # train on random subsets of the data
    sizes = [25, 50, 100]
    accs = []
    for size in sizes:
        tmpAcc = []
        for j in range(4):
            tmpSet = random.sample(trainset.instances, size)
            bayes = NaiveBayes(trainset, testset)
            bayes.train(tmpSet)
            preds = bayes.classify(testset.instances)
            corCount = 0
            for i in range(len(preds)):
                # print preds[i][0], testset.instances[i][-1], preds[i][1]
                if preds[i][0] == testset.instances[i][-1]:
                    corCount += 1
            print size, j, corCount
            tmpAcc.append(corCount)
        meanAcc = float(sum(tmpAcc)) / len(tmpAcc)
        accs.append([size, meanAcc])
def init_process(self):
    # Read the smoothing parameter (default 1); input() returns a string,
    # so it must be converted before being used for smoothing arithmetic.
    smooth = float(input('Enter smoothing parameter: ') or 1)
    # Parse training and test data into sets.
    trainEmails = self.parseEmails("./DataSet/train")
    testEmails = self.parseEmails("./DataSet/test")
    # Train the classifier and then predict on the test set.
    nb = NaiveBayes(smooth)
    nb.train(trainEmails)
    correctPred = 0
    for e in testEmails:
        if e.getLabel() == nb.predict(e):
            correctPred = correctPred + 1
    # Print accuracy statistics.
    self.computeAccuracy(correctPred, len(testEmails))
def latihData(self):
    if self.Algorithm.get() == "Naive Bayes":
        try:
            print("NB")
            self.ImportedFile.setDataLatih()
            print(self.ImportedFile.getDataLatih())
            self.NaiveBayes = NaiveBayes(self.ImportedFile.getDataLatih())
            latihstart = time.perf_counter()  # time.clock() was removed in Python 3.8
            self.NaiveBayes.latih()
            waktu = round(time.perf_counter() - latihstart, 2)
            self.LabelWaktuLatih.configure(text="Waktu Latih: " + str(waktu) + " detik")
            self.LabelWaktuLatih.lift(self.Frame2)
            self.LabelModelLatih.lift(self.Frame2)
        except Exception:
            msg.showerror("Terjadi Kesalahan", "Pastikan dataset sudah diinput!")
    elif self.Algorithm.get() == "Gaussian NB":
        try:
            print("GNB")
            self.ImportedFile.setDataLatih()
            print(self.ImportedFile.getDataLatih())
            self.GNB = GNaiveBayes(self.ImportedFile.getDataLatih())
            latihstart = time.perf_counter()
            self.GNB.latih()
            waktu = round(time.perf_counter() - latihstart, 2)
            self.LabelWaktuLatih.configure(text="Waktu Latih: " + str(waktu) + " detik")
            self.LabelWaktuLatih.lift(self.Frame2)
            self.LabelModelLatih.lift(self.Frame2)
        except Exception:
            msg.showerror(
                "Terjadi Kesalahan",
                "Pastikan dataset sudah diinput, dan bersifat numerik atau kontinyu!")
    else:
        msg.showerror(
            "Terjadi Kesalahan",
            "Harap masukkan data atau pilih algoritma terlebih dahulu")
def use_nb(datas_train, datas_valid):
    nb = NaiveBayes()
    nb.train(datas_train)

    predicts = []
    for i in range(len(datas_valid)):
        if (i % 1000) == 0:
            print("Getting prediction of", i, "documents out of", len(datas_valid))
        predicts.append(nb.predict(datas_valid[i]['content']))

    pp = [p[0] for p in predicts]

    correct = 0
    for i in range(len(predicts)):
        if predicts[i][0] == datas_valid[i]['category']:
            correct += 1

    res = "Correct: %d out of %d" % (correct, len(datas_valid))
    print(res)
    return res
import sys
import re
import os
import json
from collections import OrderedDict

from Tokenizer import Tokenizer
from NaiveBayes import NaiveBayes

dir1 = sys.argv[1]       # dir for the top-level category classifier: prior, condprobs & config
dir2 = sys.argv[2]       # dir for the subcat classifier; contains a subdir (prior & condprobs) per top-level category
infile = open(sys.argv[3], 'r')  # input file, tab-separated
opformat = sys.argv[4]   # json or tsv
assert opformat == 'json' or opformat == 'tsv'

prior = json.load(open(os.path.join(dir1, 'prior.json'), 'rb'))
condprobs = json.load(open(os.path.join(dir1, 'probs.json'), 'rb'))
NB = NaiveBayes(prior, condprobs)
t = Tokenizer()

subcat_classifiers = {}
for k in prior.keys():
    p = json.load(open(os.path.join(dir2, re.sub('[ &]', '_', k), 'prior.json'), 'rb'))
    c = json.load(open(os.path.join(dir2, re.sub('[ &]', '_', k), 'probs.json'), 'rb'))
    subcat_classifiers[k] = NaiveBayes(p, c)

def unicodify(text):
    return text.encode('utf-8', 'ignore')

def print_line(d):
    if opformat == 'tsv':
        print "\t".join(d.values()).encode('utf-8', 'ignore')
    if opformat == 'json':
def main(argv):
    trainfile = ''
    testfile = ''
    mode = ''
    # validate input
    if len(sys.argv) == 4:
        trainfile = sys.argv[1]
        testfile = sys.argv[2]
        mode = sys.argv[3]
    else:
        print("incorrect input supplied")
        sys.exit()
    # ingest data
    trainset = readFile(trainfile)
    testset = readFile(testfile)
    # Count instances per class label; these counts are printed in "n" mode below.
    y1 = 0
    y2 = 0
    for instance in trainset.instances:
        if instance[-1] == trainset.labels[0]:
            y1 += 1
        else:
            y2 += 1
    if mode == "n":
        print trainset.attributeValues
        print trainset.labels[0], y1
        print trainset.labels[1], y2
        bayes = NaiveBayes(trainset, testset)
        bayes.train(trainset.instances)
        # print bayes.yCounts
        # print bayes.xGivenYCounts[trainset.labels[0]]['bl_of_lymph_c'].values()
        # print bayes.xGivenYCounts[trainset.labels[1]]['bl_of_lymph_c'].values()
        preds = bayes.classify(testset.instances)
        corCount = 0
        for i in range(len(preds)):
            print preds[i][0], testset.instances[i][-1], preds[i][1]
            if preds[i][0] == testset.instances[i][-1]:
                corCount += 1
        print corCount
    if mode == "t":
        tan = TAN(trainset, trainset)
        edges = tan.initializeGraph()
        prim = tan.growPrim(edges)
        tan.setParentList(prim[1])
        for attrib in trainset.attributes:
            if tan.parentList[attrib]:
                print attrib, tan.parentList[attrib][0], 'class'
            else:
                print attrib, 'class'
        preds = tan.classify(testset.instances)
        print ''
        corCount = 0
        for i in range(len(preds)):
            print preds[i][0], testset.instances[i][-1], preds[i][1]
            if preds[i][0] == testset.instances[i][-1]:
                corCount += 1
        print ''
        print corCount
are written to stdout.

@author David Greisler <*****@*****.**>
@author Paul Kitt <*****@*****.**>
"""
from DirectoryCrawler import DirectoryCrawler
from TrainingClass import TrainingClass
from NaiveBayes import NaiveBayes
from BagOfWords import BagOfWords
import os

root_path = "<INSERT PATH TO THE data DIRECTORY HERE!>"
crawler = DirectoryCrawler(root_path)
naive_bayes = NaiveBayes()
class_names = ["politik", "sport", "wirtschaft"]
training_classes = []
number_of_documents = 0
classes = []
vocabulary = BagOfWords("")

print "Root directory for test/training data: " + crawler.root_path
print "Class names: " + ', '.join(str(name) for name in class_names)

for document_class in class_names:
    training_class = TrainingClass(document_class, crawler.read_training_documents(document_class))
    training_classes.append(training_class)
    number_of_documents += len(training_class.training_documents)
# datapoints = []
for data in data2:
    datapoint = getFeatures(data)
    print data
    classes.append('non_confused')
    datapoints.append(datapoint)

# print datapoints
# print classes
print len(data2)
print len(datapoints)

nb = NaiveBayes()
tdatapoints = datapoints
tclasses = classes

if 'test' in sys.argv[1]:
    print sys.argv[1]
    try:
        datapoints = pickle.load(open("save.p", "rb"))
        classes = pickle.load(open("save.p1", "rb"))
    except:
        datapoints = tdatapoints
        classes = tclasses
        print "reverting to stateless mode"
else:
# from __future__ import division
from NaiveBayes import NaiveBayes
import Preprocessing

__author__ = 'undeed'

inputDataTrain = 'Data Train.xlsx'
inputDataTest = 'Data Test.xlsx'
preprocessedData = "dataset_preprocessing.xlsx"
model = "model_classification.xlsx"
outputResult = "RESULT CLASS.xlsx"

# print "preprocess file"
# Preprocessing.preprocessFile(inputDataTrain, preprocessedData)

# The classifier object is still needed for testing, even when the
# learning step below stays commented out.
nb = NaiveBayes(model)

# print "start learning"
# nb.learning(inputDataTrain, preprocessedData)
# print "stop learning"

print "start testing"
nb.testing(inputDataTest, outputResult)
class NaiveBayesUI:
    """
    Class that allows us to draw our own digits and let the classifier
    do its magic on them.
    """

    def init(self):
        """ Initialisation stuff """
        pygame.init()
        pygame.display.init()
        pygame.display.set_caption('Naive Bayes for digit recognition')
        screen_width = 330
        screen_height = 280
        self.screen = pygame.display.set_mode([screen_width, screen_height])  # get the screen
        # Create a 28x28 matrix; every entry holds the RGB value of a pixel
        # and its actual position on screen.
        self.pixel = [[[(10, 10, 10), [i * 10, j * 10]] for i in range(28)] for j in range(28)]
        self.predicted = '?'
        pygame.font.init()
        self.res = pygame.font.Font(None, 36)   # font for the predicted value
        self.text = pygame.font.Font(None, 18)  # font for the normal text
        self.prev_state = pygame.mouse.get_pressed()
        self.classifier = NaiveBayes()  # the classifier
        PATH = './trained.pickle'
        if os.path.isfile(PATH):
            # If we trained it already, we load the training values ...
            self.classifier.train([], [], True)
        else:
            # ... else we train it.
            training = True
            print 'Reading MNIST'
            # First read the training data.
            training_set, training_labels = self.classifier.read_MNIST(60000, training)
            print 'DONE!\n'
            print 'Training'
            t = 's'
            start_time = time.localtime()
            self.classifier.train(training_set, training_labels, False)  # train the classifier
            end_time = time.localtime()
            # Just stuff for the timing output.
            b = end_time[4] - start_time[4]
            if b < 0:
                b = 60 + b
            t = str(b) + 'min '
            a = end_time[5] - start_time[5]
            if a < 0:
                a = 60 + a
            t += str(a) + 'sec'
            print 'DONE IN ' + t + '!\n'
        self.initialized = True

    def draw(self):
        """ The draw method that gets called in the "Mainloop" """
        if self.initialized:
            self.screen.fill((0, 0, 0))
            for row in self.pixel:
                for pixel in row:
                    # Draw the pixel in the matrix.
                    pygame.draw.rect(self.screen, pixel[0], (pixel[1][0], pixel[1][1], 10, 10))
            pygame.draw.rect(self.screen, (255, 0, 0), (280, 0, 50, 50))      # the predict "button"
            pygame.draw.rect(self.screen, (255, 0, 0), (280, 230, 50, 50))    # the clear screen "button"
            pygame.draw.rect(self.screen, (255, 255, 255), (280, 0, 2, 280))  # border between picture and "buttons"
            # Draw the wanted texts at their positions.
            text = self.res.render('= ' + str(self.predicted), 1, (255, 255, 255))
            self.screen.blit(text, (285, 140))
            text = self.text.render('Clear', 1, (0, 0, 0))
            self.screen.blit(text, (285, 240))
            text = self.text.render('Screen', 1, (0, 0, 0))
            self.screen.blit(text, (285, 260))
            text = self.text.render('Predict', 1, (0, 0, 0))
            self.screen.blit(text, (285, 20))
            pygame.display.flip()

    def addtuples(self, x, y):
        """ Helper method to simply add two RGB tuples """
        a = []
        for i in range(len(x)):
            b = x[i] + y[i]
            if b < 0:
                b = 0
            if b > 255:
                b = 255
            a.append(b)
        return tuple(a)

    def update(self):
        """ The update function is called every time in the "Mainloop" """
        self.mouse_state = pygame.mouse.get_pressed()
        if self.mouse_state[0] == 1:
            # If the LMB is pressed we either want to draw some pixel on the screen, ...
            pos = pygame.mouse.get_pos()
            if pos[0] >= 0 and pos[1] >= 0 and pos[0] < 280 and pos[1] < 280:
                x = pos[0] / 10 % 28
                y = pos[1] / 10 % 28
                self.pixel[y][x][0] = (255, 255, 255)
                # Lightly shade the four neighbouring pixels.
                if y > 0:
                    self.pixel[y - 1][x][0] = self.addtuples(self.pixel[y - 1][x][0], (5, 5, 5))
                if y < 27:
                    self.pixel[y + 1][x][0] = self.addtuples(self.pixel[y + 1][x][0], (5, 5, 5))
                if x > 0:
                    self.pixel[y][x - 1][0] = self.addtuples(self.pixel[y][x - 1][0], (5, 5, 5))
                if x < 27:
                    self.pixel[y][x + 1][0] = self.addtuples(self.pixel[y][x + 1][0], (5, 5, 5))
            elif pos[0] >= 285 and pos[0] <= 330 and pos[1] >= 230 and pos[1] <= 280 \
                    and self.prev_state[0] != self.mouse_state[0]:
                # ... clicked the clear button, ...
                for i in range(28):
                    for j in range(28):
                        self.pixel[i][j][0] = (10, 10, 10)
                self.predicted = '?'
            elif pos[0] >= 285 and pos[0] <= 330 and pos[1] >= 0 and pos[1] <= 50 \
                    and self.prev_state[0] != self.mouse_state[0]:
                # ... or we want our picture to be predicted.
                image = []
                for i in range(28):
                    for j in range(28):
                        image.append(self.pixel[i][j][0][0])
                self.predicted = self.classifier.predict(image)
        self.prev_state = self.mouse_state

    def main(self):
        """ "Mainloop" """
        while 1:
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    return
            self.update()
            self.draw()
def output(self, partId, ch_aux):
    """Uses the student code to compute the output for test cases."""
    trainDir = '../data/imdb1/'
    classifier = NaiveBayes()

    def crossValidationAccuracy(**flags):
        """Run 10-fold cross-validation with the given NaiveBayes flags set."""
        splits = classifier.crossValidationSplits(trainDir)
        accuracy = 0.0
        for split in splits:
            nb = NaiveBayes()
            for name, value in flags.items():
                setattr(nb, name, value)
            nb.train(split)
            guesses = nb.test(split)
            numCorrect = 0.0
            for i in range(0, len(guesses)):
                if guesses[i] == split.test[i].klass:
                    numCorrect += 1
            accuracy += numCorrect / len(guesses)
        return accuracy / 10.0

    def testCorpusGuesses(partLabel):
        """Train on the full training split and label the test corpus from ch_aux."""
        trainSplit = classifier.trainSplit(trainDir)
        classifier.train(trainSplit)
        testSplit = buildTestCorpus(ch_aux)
        guesses = classifier.test(testSplit)
        guesses.insert(0, partLabel)  # put in the part id
        return '\n'.join(guesses)

    if partId == 1:    # development on all words
        return 'accuracy: 1 %f' % crossValidationAccuracy()
    elif partId == 2:  # testing on all words
        return testCorpusGuesses('2')
    elif partId == 3:  # development without stopwords
        return 'accuracy: 3 %f' % crossValidationAccuracy(FILTER_STOP_WORDS=True)
    elif partId == 4:  # testing without stopwords
        classifier.FILTER_STOP_WORDS = True
        return testCorpusGuesses('4')
    elif partId == 5:  # development binarized
        return 'accuracy: 5 %f' % crossValidationAccuracy(BOOLEAN_NB=True)
    elif partId == 6:  # testing binarized
        classifier.BOOLEAN_NB = True
        return testCorpusGuesses('6')
    elif partId == 7:  # development best model
        return 'accuracy: 7 %f' % crossValidationAccuracy(BEST_MODEL=True)
    elif partId == 8:  # testing best model
        classifier.BEST_MODEL = True
        return testCorpusGuesses('8')
    else:
        print 'Unknown partId: %d' % partId
        return None
def __init__(self, trainset, testset):
    NaiveBayes.__init__(self, trainset, testset)
    self.train(trainset.instances)
importer = Importer()

print('Loading stop words')
importer.add_stop_words('data/stopwords/german/')

# Import the training sets.
training_data = []
print('Loading training data')
training_data.append(importer.extract_training_data('data/politik/', label='politik'))
training_data.append(importer.extract_training_data('data/wirtschaft/', label='wirtschaft'))
training_data.append(importer.extract_training_data('data/sport/', label='sport'))

nb = NaiveBayes()
print('Training')
nb.train(training_data)

# Import the test sets.
test_data = []
print('Loading test data')
test_data.append(importer.extract_test_data('data/politik/', label='politik'))
test_data.append(importer.extract_test_data('data/sport/', label='sport'))
test_data.append(importer.extract_test_data('data/wirtschaft/', label='wirtschaft'))

print('Testing')
accuracy = nb.test(test_data)
print('accuracy: ' + str(accuracy))
    # True negatives; the loop variables deliberately avoid reusing the weight name a.
    tn = [(p, r) for p, r in accuracyResults if p == 0 and r == 0]
    precision = float(len(tp)) / (len(tp) + len(fp))
    recall = float(len(tp)) / (len(tp) + len(fn))
    # Weighted harmonic mean of precision and recall (F-measure with weight a).
    return 1. / ((a * (1 / precision)) + ((1 - a) * (1 / recall)))


if __name__ == "__main__":
    from DataSet import DataSet
    from NaiveBayes import NaiveBayes
    from IBk import IBk

    fileIn = "C:\\Users\\a5rjqzz\\Desktop\\Python\\pyClassifiers\\data\\IBk\\sample_set_life.gla"

    ds = DataSet(fileIn)
    nb = NaiveBayes()
    es = Estimator()
    ib = IBk()

    for i in xrange(30):
        train, test = ds.getTrainTestSet()
        crossValida = ds.getCrossValidationSet(2)
        # nb.train(ds)
        # results = nb.test(test)
        # print es.accuracy(results)
        # ib.train(train)
        # results = ib.test(test)
        # print es.accuracy(results)
def main():
    features = None
    IDs = None
    all_targets = None
    try:
        print 'load training features...'
        features, IDs = load_features(features_file_train, 1)
        print 'load training targets...'
        all_targets = load_targets(targets_file_train)
    except IOError:
        print "The corresponding files have not been created yet."
        print "Please run preprocessing with the same parameters and try again."
        raise SystemExit(0)

    print 'split data...'
    features_train, all_targets_train, IDs_train, features_test, targets_test, IDs_test \
        = splitdata(features, all_targets, IDs)

    # Run Naive Bayes for each target separately
    # (not 1 vs all, because the different targets are independent).
    all_targets_train = all_targets_train.T
    all_probabilities = []
    for i in xrange(15):
        print 'TARGET %d:' % (i)
        print 'train...'
        nb = NaiveBayes()
        targets = all_targets_train[i]
        nb.train(features_train, targets)
        # Prediction:
        print 'predict...'
        probabilities = nb.predict(features_test)
        all_probabilities.append(probabilities)

    print 'write predictions to file...'
    header = ['id', 's1', 's2', 's3', 's4', 's5', 'w1', 'w2', 'w3', 'w4',
              'k1', 'k2', 'k3', 'k4', 'k5', 'k6', 'k7', 'k8', 'k9', 'k10',
              'k11', 'k12', 'k13', 'k14', 'k15']
    predictions_file = open(predictions_filename, 'w')
    write_csv_row(predictions_file, header)
    prob_file = open(probabilities_filename, 'w')
    write_csv_row(prob_file, header)

    all_prob = (np.array(all_probabilities).T).tolist()
    all_predictions = []
    for i in range(len(all_prob)):
        prob = all_prob[i]
        ID = IDs_test[i]
        zeros = [int(ID)] + [0] * 9
        write_csv_row(prob_file, zeros + prob)
        # Make predictions from probabilities (either 0 or 1).
        pred = [round(p) for p in prob]
        write_csv_row(predictions_file, zeros + pred)
        all_predictions.append(pred)
    prob_file.close()
    predictions_file.close()

    print ''
    print 'EVALUATE PREDICTIONS'
    row_errors = 0
    total_errors = 0
    number_tweets = len(targets_test)
    predictions_total = 15 * number_tweets
    for i in xrange(number_tweets):
        target_row = targets_test[i]
        targets_rounded = [round(p) for p in target_row]
        predictions = all_predictions[i]
        row_wrong = False
        for j in xrange(len(targets_rounded)):
            if targets_rounded[j] != predictions[j]:
                row_wrong = True
                total_errors += 1
        if row_wrong:
            row_errors += 1
    row_accuracy = (float(number_tweets - row_errors) / number_tweets) * 100
    total_accuracy = float(predictions_total - total_errors) / predictions_total * 100
    print '%d/%d tweets contain an error in the predictions --> accuracy = %d percent' \
        % (row_errors, number_tweets, row_accuracy)
    print '%d/%d predictions in total wrong --> accuracy %d percent.' \
        % (total_errors, predictions_total, total_accuracy)
    print 'finished.'
    return
def output(partId, ch_aux):
    """Uses the student code to compute the output for test cases."""
    trainDir = "../data/imdb1/"
    classifier = NaiveBayes()

    def crossValidationAccuracy(filterStopWords=False):
        """Run 10-fold cross-validation, optionally filtering stop words."""
        splits = classifier.crossValidationSplits(trainDir)
        accuracy = 0.0
        for split in splits:
            nb = NaiveBayes()
            if filterStopWords:
                nb.FILTER_STOP_WORDS = True
            nb.train(split)
            guesses = nb.test(split)
            numCorrect = 0.0
            for i in range(0, len(guesses)):
                if guesses[i] == split.test[i].klass:
                    numCorrect += 1
            accuracy += numCorrect / len(guesses)
        return accuracy / 10.0

    def testCorpusGuesses(partLabel):
        """Train on the full training split and label the test corpus from ch_aux."""
        trainSplit = classifier.trainSplit(trainDir)
        classifier.train(trainSplit)
        testSplit = buildTestCorpus(ch_aux)
        guesses = classifier.test(testSplit)
        guesses.insert(0, partLabel)  # put in the part id
        return "\n".join(guesses)

    if partId == 1:    # development on all words
        return "accuracy: 1 %f" % crossValidationAccuracy()
    elif partId == 2:  # testing on all words
        return testCorpusGuesses("2")
    elif partId == 3:  # development without stopwords
        return "accuracy: 3 %f" % crossValidationAccuracy(filterStopWords=True)
    elif partId == 4:  # testing without stopwords
        classifier.FILTER_STOP_WORDS = True
        return testCorpusGuesses("4")
    else:
        print "Unknown partId: %d" % partId
        return None