def __init__(self, train_file, test_file, N):
    """Set up the classifier, drop the stop words and run the evaluation.

    Args:
        train_file: passed unchanged to ``NaiveBayes``.
        test_file: stored on the instance for ``self.test()`` to read.
        N: stored on the instance for ``self.exclude_stopwords()`` to read.
    """
    self.V = ['lib', 'con']
    self.test_file, self.N = test_file, N

    classifier = NaiveBayes(train_file)
    self.NB = classifier

    # The stop words are removed before calculate_p_wv() is called, and the
    # evaluation runs last.
    self.exclude_stopwords()
    classifier.calculate_p_wv()
    self.test()
class NBStopWords:
    """Evaluate a ``NaiveBayes`` model after removing its N most common words.

    The two class labels are ``'lib'`` and ``'con'`` (``self.V``). Constructing
    an instance trains the wrapped model, removes the stop words and
    immediately runs :meth:`test`.
    """

    def __init__(self, train_file, test_file, N):
        """Build the model from ``train_file`` and evaluate it on ``test_file``.

        Args:
            train_file: passed unchanged to ``NaiveBayes``.
            test_file: path of a text file listing one document path per line.
            N: number of most frequent vocabulary words to treat as stop words.
        """
        self.V = ['lib', 'con']
        self.test_file = test_file
        self.NB = NaiveBayes(train_file)
        self.N = N
        # Stop words are removed before calculate_p_wv() is called.
        self.exclude_stopwords()
        self.NB.calculate_p_wv()
        self.test()

    def exclude(self, word, counter):
        """Remove ``word`` from ``counter`` if present; do nothing otherwise."""
        counter.pop(word, None)

    def exclude_stopwords(self):
        """Delete the ``self.N`` most common vocabulary words from the model.

        The words are removed from ``NB.text_libs``, ``NB.text_cons`` and
        ``NB.vocabulary``.
        """
        # Materialise the list first so the vocabulary is not mutated while
        # its ranking is still being read.
        stopwords = [entry[0] for entry in self.NB.vocabulary.most_common(self.N)]
        for word in stopwords:
            self.exclude(word, self.NB.text_libs)
            self.exclude(word, self.NB.text_cons)
            self.exclude(word, self.NB.vocabulary)

    def test(self):
        """Classify every document listed in ``self.test_file`` and print accuracy.

        The expected label is derived from the document path: a path containing
        ``'lib'`` is labelled ``'lib'``, anything else ``'con'``. For each
        document ``L`` or ``C`` is printed for the predicted class, followed by
        a final ``Accuracy: x.xxxx`` line. An empty list reports 0.0000 instead
        of raising ``ZeroDivisionError``.
        """
        correct = 0
        total = 0  # was named `sum`, which shadowed the builtin
        with open(self.test_file, "r") as docs:
            for doc in docs:
                total += 1
                doc = doc.rstrip("\n\r")
                label = self.V[0] if self.V[0] in doc else self.V[1]
                with open(doc, "r") as text:
                    v_lib, v_con = self.NB.classify(text)
                # print(...) with a single argument behaves the same on
                # Python 2 and Python 3, unlike the old `print "L"` statement.
                if v_lib > v_con:
                    v_nb = self.V[0]
                    print("L")
                else:
                    v_nb = self.V[1]
                    print("C")
                if v_nb == label:
                    correct += 1
        accuracy = float(correct) / total if total else 0.0
        print("Accuracy: %.04f" % accuracy)
def _train_and_report(label, model, train_x, train_y, test_x, test_y, n_test):
    """Train ``model``, time training and prediction, and print the accuracy.

    Args:
        label: text printed in front of the accuracy percentage.
        model: object exposing ``train(x, y)`` and ``predict(x)``.
        train_x, train_y: training features and labels.
        test_x, test_y: test features and labels.
        n_test: denominator used for the accuracy.

    Returns:
        The accuracy as a fraction (``matches / n_test``).
    """
    start_time = time.time()
    model.train(train_x, train_y)
    elapsed_time = time.time() - start_time
    print('%.3f' % (elapsed_time) + " seconds for training")

    start_time = time.time()
    matches = list(model.predict(test_x) == test_y).count(True)
    elapsed_time = time.time() - start_time
    print('%.3f' % (elapsed_time) + " seconds for predicting")

    accuracy = matches / n_test
    print(label, 100 * accuracy, '%')
    print()
    return accuracy


def digitClassification(percent: int):
    """Train and evaluate Perceptron, Naive Bayes and KNN on the digit data.

    Args:
        percent: percentage (0-100) of the 450-image training pool to use.

    For every model the training time, prediction time and accuracy on the
    1000 test images are printed. Nothing is returned.
    """
    n_total_digits = 450
    n_samples_digits = int(n_total_digits * (percent / 100))
    n_testing_digits = 1000

    digit_list = read_digits_file("digitdata/trainingimages", n_samples_digits)
    digit_test_list = read_digits_file("digitdata/testimages", n_testing_digits)
    train_y = np.array(
        util.loadLabelsFile("digitdata/traininglabels", n_samples_digits))
    test_y = np.array(
        util.loadLabelsFile("digitdata/testlabels", n_testing_digits))

    # (report label, model factory, feature extractor). Factories are used so
    # each model is still constructed after its features are extracted.
    # NOTE(review): the extractors are named *_faces_features but are applied
    # to digit images here - confirm this is intended.
    experiments = (
        ('Perceptron accuracy:', lambda: Perceptron(100), perceptron_faces_features),
        ('nb accuracy:', NaiveBayes, nb_faces_features),
        ('KNN accuracy:', KNN, knn_faces_features),
    )
    for label, make_model, extract_features in experiments:
        train_x = extract_features(digit_list)
        test_x = extract_features(digit_test_list)
        _train_and_report(label, make_model(), train_x, train_y,
                          test_x, test_y, n_testing_digits)
import numpy as np
import scipy.io as sio
from sklearn.metrics import zero_one_loss

from nb import NaiveBayes

# Path of the data file; change it to wherever XwindowsDocData.mat is stored.
mat_dict = sio.loadmat('XwindowsDocData.mat')

# The feature matrices are converted with .toarray(); the label arrays are
# flattened to 1-D.
Xtrain, Xtest = (mat_dict[key].toarray() for key in ('xtrain', 'xtest'))
ytrain, ytest = (mat_dict[key].flatten() for key in ('ytrain', 'ytest'))

nb = NaiveBayes()
pi, theta = nb.fit(Xtrain, ytrain)

ypred_train = nb.predict(Xtrain)
ypred_test = nb.predict(Xtest)
print(ypred_train)
print(ypred_test)

# The classes are labelled 1 and 2, so shift the 0-based argmax by one.
ypred_train = ypred_train.argmax(axis=1) + 1
ypred_test = ypred_test.argmax(axis=1) + 1

# Compare the last 20 predictions with the last 20 training labels.
print(ypred_train[-20:])
print(ytrain[-20:])
print(ypred_test[-20:])
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

from nb import NaiveBayes


def accuracy(pred, label):
    """Return the fraction of positions where ``pred`` equals ``label``."""
    n_correct = np.sum(pred == label)
    return n_correct / len(label)


# Synthetic two-class problem with a fixed seed so runs are repeatable.
X, y = datasets.make_classification(
    n_samples=1000,
    n_features=4,
    n_classes=2,
    random_state=123,
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

model = NaiveBayes()
model.fit(X_train, y_train)
predict = model.predict(X_test)

acc = accuracy(predict, y_test)
print(acc)
# Benchmark each classifier on the mushroom data set: train, predict, then
# report the elapsed time and the hit rate returned by getPercentage().
for position, classifier in enumerate((ANN, SVM, NaiveBayes)):
    if position:
        # Blank line between reports (none after the last one).
        print('')
    data = Data("mushrooms.csv")
    method = classifier(data)
    i = time()
    method.train()
    method.predict()
    # The label promises milliseconds. This assumes `time` is `time.time`,
    # which returns seconds - TODO confirm against the file's imports.
    tempo = (time() - i) * 1000
    result = method.getPercentage()
    # Single-argument print(...) gives the same output on Python 2 and 3;
    # str() mirrors the conversion the old print statement applied.
    print('Tempo (ms): ' + str(tempo))
    print('Taxa de acerto: ' + str(result))
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

from nb import NaiveBayes


def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_true`` equals ``y_pred``."""
    return np.sum(y_true == y_pred) / len(y_true)


# Synthetic two-class problem with a fixed seed so runs are repeatable.
X, y = datasets.make_classification(
    n_samples=1000,
    n_features=10,
    n_classes=2,
    random_state=123,
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

print("Naive Bayes classification accuracy", accuracy(y_test, predictions))
def main():
    """Train and evaluate every classifier on the BBC corpus, then recommend.

    Documents are read from ``<cwd>/bbc/<topic>/``, shuffled per topic and
    split so that roughly one tenth of each topic is held out for testing.
    For each classifier the confusion matrix, the statistics table and the
    time elapsed since ``main`` started are printed. Finally
    ``recommendation`` is called with all documents, the test documents and
    the classifiers.
    """
    start_time = time.time()

    # Read documents, grouped by topic (one sub-directory per topic).
    t_path = os.getcwd() + "/bbc/"
    all_docs = defaultdict(list)
    topic_list = []
    print("Reading all the documents...\n")
    for topic in os.listdir(t_path):
        d_path = t_path + topic + '/'
        if not os.path.isdir(d_path):
            # Skip stray files in the corpus root; os.listdir would fail on them.
            continue
        topic_list.append(topic)
        all_docs[topic] = [Document(d_path + f, topic) for f in os.listdir(d_path)]

    # Separate the train and test data sets, topic by topic.
    fold_count = 10
    train_docs, test_docs = [], []
    for docs in all_docs.values():
        random.shuffle(docs)
        test_len = len(docs) // fold_count
        # Slice on an explicit index: with test_len == 0 the negative-index
        # form docs[:-0] / docs[-0:] would put every document in the test set.
        split_at = len(docs) - test_len
        train_docs += docs[:split_at]
        test_docs += docs[split_at:]

    # Create tfidf and tfidfie index of training docs, and store into the docs.
    index = Index(train_docs)  # kept for its effect on the documents
    print("Train Document Count: " + str(len(train_docs)))
    print("Test Document Count: " + str(len(test_docs)))

    test_topics = [d.topic for d in test_docs]
    for doc in train_docs:
        doc.vector = doc.tfidfie
    for doc in test_docs:
        doc.vector = doc.tf

    # Create classifier instances.
    nb = NaiveBayes()
    rc = RankClassifier()
    kmeans = KMeans(topic_list)
    classifier_list = [rc, nb, kmeans]

    for number, classifier in enumerate(classifier_list, start=1):
        print("\nClassifier #" + str(number) + "\n")
        classifier.confusion_matrix, c_dict = init_confusion_matrix(topic_list)

        print("Training...\n")
        classifier.train(train_docs)

        print("Testing... Classifying the test docs...\n")
        predictions = classifier.classify(test_docs)

        # Update the confusion matrix and statistics with updated values.
        classifier.confusion_matrix = update_confusion_matrix(
            test_topics, predictions, classifier.confusion_matrix, c_dict)
        classifier.stats = cal_stats(classifier.confusion_matrix)

        print("Confusion Matrix\n")
        for item in classifier.confusion_matrix:
            print(item)

        print("\nStatistics\n")
        print_table(get_stats_table(classifier.stats))
        # Time elapsed since main() started.
        print("Run time...{} secs \n".format(round(time.time() - start_time, 4)))

    # Call the recommendation system once the classifiers are ready.
    recommendation(all_docs, test_docs, classifier_list)
# Disabled alternative input: label-encode a categorical table instead of
# generating synthetic data.
# label_encoder = LabelEncoder()
# for col in X.columns:
#     X[col] = label_encoder.fit_transform(X[col])
# y['Play'] = label_encoder.fit_transform(y['Play'])
# print(X.head())
# print(y)
# X = np.array(X, dtype=np.float64)
# y = np.array(y, dtype=np.float64)

# Disabled larger variant of the synthetic problem:
# X, y = datasets.make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=123)
X, y = datasets.make_classification(
    n_samples=100,
    n_features=4,
    n_classes=2,
    random_state=123,
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

# Disabled class-balance check:
# print(len(y[y == 0]) / len(y))
# print(len(y[y == 1]) / len(y))

nb = NaiveBayes()
# print(X_test)
nb.fit(X_train, y_train)

# Disabled prediction / scoring steps:
# predictions = nb.predict(X_test)
# print(predictions)
# predictions = nb.predict([[ 0.83617024, 0.47576265, 0.76693704, 1.54433392]])
# print(predictions)
# print('Naive Bayes Classification Accuracy: ', accuracy(y_test, predictions))
import sys
from itertools import islice

from nb import NaiveBayes

if __name__ == '__main__':
    # Usage: pass the training file as the first command-line argument.
    nb = NaiveBayes(sys.argv[1])
    nb.calculate_p_wv()

    # nb.p_wv maps each word to an indexable pair of values. For each of the
    # two positions, print the 20 words with the largest value there.
    for class_index in (0, 1):
        if class_index:
            # A bare `print` is a no-op expression on Python 3; print("")
            # emits the blank separator line on both Python 2 and 3.
            print("")
        ranked = sorted(nb.p_wv.items(),
                        key=lambda pair: pair[1][class_index],
                        reverse=True)
        for w, p in islice(ranked, 20):
            print(w + " %.04f" % p[class_index])
def main():
    """Train and evaluate every classifier on the BBC corpus, then recommend.

    Documents are read from ``../data_set/bbc/<topic>/``, shuffled per topic
    and split so that roughly one tenth of each topic is held out for
    testing. For each classifier the confusion matrix, the statistics table
    and the time elapsed since ``main`` started are printed. Finally
    ``recommendation`` is called with all documents, the test documents and
    the classifiers.
    """
    start_time = time.time()

    t_path = "../data_set/bbc/"
    all_docs = defaultdict(list)
    topic_list = []
    print("Reading all the documents...\n")
    entries = os.listdir(t_path)
    print(entries)
    for topic in entries:
        d_path = t_path + topic + '/'
        if not os.path.isdir(d_path):
            # Skip stray files in the corpus root; os.listdir would fail on them.
            continue
        topic_list.append(topic)
        all_docs[topic] = [Document(d_path + f, topic) for f in os.listdir(d_path)]

    fold_count = 10
    train_docs, test_docs = [], []
    for docs in all_docs.values():
        random.shuffle(docs)
        test_len = len(docs) // fold_count
        # Slice on an explicit index: with test_len == 0 the negative-index
        # form docs[:-0] / docs[-0:] would put every document in the test set.
        split_at = len(docs) - test_len
        train_docs += docs[:split_at]
        test_docs += docs[split_at:]

    # NOTE(review): Index presumably populates doc.tfidfie / doc.tf, which are
    # read below - confirm against the Index implementation.
    index = Index(train_docs)
    print("Train Document Count: " + str(len(train_docs)))
    print("Test Document Count: " + str(len(test_docs)))

    test_topics = [d.topic for d in test_docs]
    for doc in train_docs:
        doc.vector = doc.tfidfie
    for doc in test_docs:
        doc.vector = doc.tf

    nb = NaiveBayes()
    rc = RankClassifier()
    kmeans = KMeans(topic_list)
    classifier_list = [rc, nb, kmeans]

    for number, classifier in enumerate(classifier_list, start=1):
        print("\nClassifier #" + str(number) + "\n")
        classifier.confusion_matrix, c_dict = init_confusion_matrix(topic_list)

        print("Training...\n")
        classifier.train(train_docs)

        print("Testing... Classifying the test docs...\n")
        predictions = classifier.classify(test_docs)

        classifier.confusion_matrix = update_confusion_matrix(
            test_topics, predictions, classifier.confusion_matrix, c_dict)
        classifier.stats = cal_stats(classifier.confusion_matrix)

        print("Confusion Matrix\n")
        for item in classifier.confusion_matrix:
            print(item)

        print("\nStatistics\n")
        print_table(get_stats_table(classifier.stats))
        # Time elapsed since main() started.
        print("Run time...{} secs \n".format(round(time.time() - start_time, 4)))

    recommendation(all_docs, test_docs, classifier_list)
def main():
    """Train the classifiers on the BBC corpus and hand off to the recommender.

    Documents are read from ``../bbc/<topic>/``, shuffled per topic and split
    so that roughly one tenth of each topic is held out for testing. Every
    classifier is trained and its confusion matrix and statistics are stored
    on the classifier object. The documents, test documents and classifiers
    are then published through the module-level ``lst``.

    Returns:
        The result of ``redirect('http://localhost:5000/recommend')``.
    """
    global lst

    # Read documents, divide according to the topics and separate train and
    # test data set.
    t_path = "../bbc/"
    all_docs = defaultdict(list)
    topic_list = []
    for topic in os.listdir(t_path):
        d_path = t_path + topic + '/'
        if not os.path.isdir(d_path):
            # Skip stray files in the corpus root; os.listdir would fail on them.
            continue
        topic_list.append(topic)
        all_docs[topic] = [Document(d_path + f, topic) for f in os.listdir(d_path)]

    fold_count = 10
    train_docs, test_docs = [], []
    for docs in all_docs.values():
        random.shuffle(docs)
        test_len = len(docs) // fold_count
        # The last test_len documents are held out. An explicit split index is
        # used because with test_len == 0 the negative-index form
        # docs[:-0] / docs[-0:] would put every document in the test set.
        split_at = len(docs) - test_len
        train_docs += docs[:split_at]
        test_docs += docs[split_at:]

    # Create tfidf and tfidfie index of training docs, and store into the docs.
    index = Index(train_docs)  # kept for its effect on the documents

    test_topics = [d.topic for d in test_docs]
    for doc in train_docs:
        doc.vector = doc.tfidfie
    for doc in test_docs:
        doc.vector = doc.tf

    # Create classifier instances.
    nb = NaiveBayes()
    rc = RankClassifier()
    kmeans = KMeans(topic_list)
    classifier_list = [nb, rc, kmeans]

    for classifier in classifier_list:
        classifier.confusion_matrix, c_dict = init_confusion_matrix(topic_list)
        classifier.train(train_docs)
        predictions = classifier.classify(test_docs)

        # Update the confusion matrix and statistics with updated values.
        classifier.confusion_matrix = update_confusion_matrix(
            test_topics, predictions, classifier.confusion_matrix, c_dict)
        classifier.stats = cal_stats(classifier.confusion_matrix)

    # Publish the results through the module-level list.
    lst = [all_docs, test_docs, classifier_list]
    return redirect('http://localhost:5000/recommend')