def cross_validation(X_train, y_train, num_folds, k_choices, m_choices):
    """Run k-fold cross-validation over distance metrics and k values.

    Args:
        X_train: training samples, shape (N, D).
        y_train: training labels, shape (N,).
        num_folds: number of cross-validation folds.
        k_choices: sequence of k values to evaluate.
        m_choices: sequence of distance-metric identifiers understood by
            KNearestNeighbor.predict_labels_diffrent_Ks.

    Returns:
        dict mapping distance metric -> {k: [accuracy for each fold]}.
    """
    # Split the training set into num_folds parts (sizes may differ by 1
    # when N is not divisible by num_folds).
    X_train_folds = np.array_split(X_train, num_folds)
    y_train_folds = np.array_split(y_train, num_folds)

    # Results keyed by distance metric, then by k.
    k_to_accuracies = {m: {} for m in m_choices}

    # Cross-validation core loop.
    for dist_m in m_choices:
        for n in range(num_folds):
            # Every fold except fold n forms the training data.
            train_idx = [x for x in range(num_folds) if x != n]
            x_training_dat = np.concatenate([X_train_folds[i] for i in train_idx])
            y_training_dat = np.concatenate([y_train_folds[i] for i in train_idx])

            classifier_k = KNearestNeighbor()
            classifier_k.train(x_training_dat, y_training_dat)
            ks_y_cross_validation_pred = classifier_k.predict_labels_diffrent_Ks(
                X_train_folds[n], k_choices, dist_m)

            # BUG FIX: the accuracy denominator must be the size of the
            # held-out fold, not X_train.shape[0] / num_folds — the latter is
            # float division in Python 3 and is wrong whenever the folds are
            # not all the same size.
            fold_size = y_train_folds[n].shape[0]
            for k in range(len(k_choices)):
                num_correct = np.sum(
                    ks_y_cross_validation_pred[k] == y_train_folds[n])
                accuracy = float(num_correct) / fold_size
                k_to_accuracies[dist_m].setdefault(k_choices[k], []).append(accuracy)
                print("num_folds: %d / %d, dist_m: %s, k: %d, acc: %f" %
                      (n + 1, num_folds, dist_m, k_choices[k], accuracy))
    return k_to_accuracies
def index():
    """Flask view: classify four random MNIST test digits with KNN.

    Loads MNIST, trains the KNN classifier on the full training split,
    predicts labels for four randomly chosen test images, saves those
    images as static/1.png .. static/4.png, and renders index.html with
    the predictions and the ground-truth labels.
    """
    mndata = MNIST('./data')
    # Load data to variables
    train_images, train_labels = mndata.load_training()
    test_images, test_labels = mndata.load_testing()
    # BUG FIX: np.random.random_integers has been deprecated and removed
    # from modern NumPy.  np.random.randint with an *exclusive* upper bound
    # of 10000 draws from the same inclusive range [0, 9999].
    exampleindeces = np.random.randint(0, 10000, size=4)
    # Assign test data to numpy arrays
    images = np.asarray(test_images)
    labels = np.asarray(test_labels)
    # Construct the KNN classifier and load it with the training data
    classifier = KNearestNeighbor()
    classifier.train(np.asarray(train_images), np.asarray(train_labels))
    # Predict the labels with KNN (k=3); round to the nearest integer label
    predictions = np.rint(classifier.predict(images[exampleindeces], 3))
    # Ground-truth labels for checking whether each prediction was correct
    truths = labels[exampleindeces]
    # Save each selected 28x28 image (scaled to 0..255 grayscale) for the page
    for i, example_idx in enumerate(exampleindeces, start=1):
        two_d = (np.reshape(images[example_idx], (28, 28)) * 255).astype(np.uint8)
        im = Image.fromarray(two_d, 'L')
        im.save("static/" + str(i) + ".png")
    # Render the page
    return render_template('index.html', preds=predictions, truths=truths)
def __init__(self):
    """Load CIFAR-10, fit the KNN, and restore the pre-trained Keras models."""
    # Full CIFAR-10 split: 50k training / 10k test, no validation samples.
    self.data = data.data_utilities.get_CIFAR10_data(num_training=50000,
                                                     num_validation=0,
                                                     num_test=10000)
    # KNN operates on flattened images: 50000 x (32*32*3) = 50000 x 3072.
    flattened_train = self.data['X_train'].reshape((50000, 3072))
    knn = KNearestNeighbor()
    knn.train(flattened_train, self.data['y_train'])
    self.knn = knn
    # Pre-trained Keras models for the MLP and CNN predictors.
    self.mlp = load_model("mlp_relu.h5")
    self.cnn = load_model("cnn_relu.h5")
    # Dataset mean as a 1x3072 row vector, used to center incoming images.
    self.datamean = np.resize(self.data['mean'], (1, 3072))
def run_test(best_k, best_m, X_train, y_train, X_test, y_test):
    """Score the chosen hyper-parameters (best_k, best_m) on the test set.

    Trains a fresh KNN on the full training split, predicts the test split,
    prints the result, and returns (num_correct, num_test, accuracy).
    """
    model = KNearestNeighbor()
    model.train(X_train, y_train)

    preds = model.predict(X_test, k=best_k, dist_m=best_m)

    num_test = X_test.shape[0]
    num_correct = np.sum(preds == y_test)
    accuracy = float(num_correct) / num_test

    print('Got %d / %d correct => accuracy: %f' %
          (num_correct, num_test, accuracy))
    return num_correct, num_test, accuracy
class ImageHandler:
    """Downloads an image from a URL and classifies it with KNN, MLP and CNN."""

    # Predictors initializer
    def __init__(self):
        """Load CIFAR-10, fit the KNN, and restore the pre-trained Keras models."""
        self.data = data.data_utilities.get_CIFAR10_data(num_training=50000,
                                                         num_validation=0,
                                                         num_test=10000)
        self.knn = KNearestNeighbor()
        self.knn.train(self.data['X_train'].reshape((50000, 3072)),
                       self.data['y_train'])
        self.mlp = load_model("mlp_relu.h5")
        self.cnn = load_model("cnn_relu.h5")
        # Dataset mean as a 1x3072 row vector for centering incoming images.
        self.datamean = np.resize(self.data['mean'], (1, 3072))

    # Supposing url is a valid string url
    def predict_image(self, url):
        """Fetch the image at `url` and return predicted labels per model.

        Returns:
            dict with keys "knn", "mlp", "cnn" mapping to each model's label.
        """
        req = Request(
            url,
            headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'})
        # FIX: close both the HTTP response and the output file
        # deterministically, even if the download raises.  The original
        # never closed the response and left the file open on error.
        with urlopen(req) as res, \
                open("image_to_predict.jpg", "wb") as image_file:
            image_file.write(res.read())

        ready_image = self.prepare_image()

        # Predicting
        labels_dict = {}
        labels_dict["knn"] = self.knn.predict(X=ready_image, k=3)
        labels_dict["mlp"] = np.argmax(self.mlp.predict(ready_image))
        labels_dict["cnn"] = np.argmax(
            self.cnn.predict(np.reshape(ready_image, (1, 3, 32, 32))))
        return labels_dict

    # Resize and vectorize the image
    def prepare_image(self):
        """Resize the downloaded image to 32x32, reorder to CHW, flatten, center."""
        img = Image.open("image_to_predict.jpg")
        resized = img.resize((32, 32))
        # Transpose HWC -> CHW so the layout matches the CIFAR-10 tensors.
        img_tensor = np.array(resized).transpose(2, 0, 1).copy()
        image_array = np.resize(img_tensor, (1, 3072)) - self.datamean
        return image_array
if i == 0:
    plt.title(cls)
# plt.show()
# plt.close()

# Subsample 5000 training images and 500 test images to speed things up.
num_training = 5000
# BUG FIX: the mask must cover num_training samples; the original used
# range(num_claesses), which kept only num_claesses training images
# instead of the intended 5000.
mask = range(num_training)
x_train = x_train[mask]
y_train = y_train[mask]
num_test = 500
mask = range(num_test)
x_test = x_test[mask]
y_test = y_test[mask]

# Flatten each image into a row vector for the distance computation.
x_train = np.reshape(x_train, (x_train.shape[0], -1))
x_test = np.reshape(x_test, (x_test.shape[0], -1))
print("x_train的shape:", x_train.shape)
print("x_test的shape:", x_test.shape)

# 3.2) Predict on the test set.
classifier = KNearestNeighbor()
classifier.train(x_train, y_train)
dists = classifier.compute_distances_no_loops(x_test)
print(dists)
y_test_pred = classifier.predict_labels(dists, k=1)
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('got %d / %d correct => accuracy: %f' %
      (num_correct, num_test, accuracy))
#raise ValueError(t) cm = metrics.confusion_matrix(testt, t) a = metrics.accuracy(testt, t) p, r = metrics.precision_and_recall(testt, t) try: f = metrics.f1_measure(testt, t) except: f = 0 print("Accuracy = %f\n" % a) print("Precision = %f, Recall = %f\n" % (p, r)) print("F1 measure = %f\n" % f) print("Tomorrow's Forecast: %f\n" % decision[-1]) print(sum(t) / len(t)) elif model == "knn": knn = KNearestNeighbor(10, distance_measure='euclidean', aggregator='mean') knn.fit(trainf, traint) labels = knn.predict(testf) binary_labels = metrics.make_binary(labels) cm = metrics.confusion_matrix(testt, binary_labels) a = metrics.accuracy(testt, binary_labels) p, r = metrics.precision_and_recall(testt, binary_labels) f = metrics.f1_measure(testt, binary_labels) print(binary_labels) print("Accuracy = %f\n" % a) print("Precision = %f, Recall = %f\n" % (p, r)) print("F1 measure = %f\n" % f) print(sum(binary_labels) / len(binary_labels))
def accuracy(actual, predicted):
    """Return the fraction of predictions equal to the ground-truth labels."""
    return sum(actual == predicted) / len(predicted)


# Car-evaluation dataset: every column is categorical; the file has no header.
data = pd.read_csv("car.csv", dtype="category", header=None)
data.columns = [
    "buying", "maint", "doors", "persons", "lug-boot", "safety", "accept"
]

# Hold out 25% of the rows for evaluation; seed fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :-1],
                                                    data['accept'],
                                                    test_size=0.25,
                                                    random_state=0)

from knn import KNearestNeighbor

clf = KNearestNeighbor(X_train, y_train)
preds = clf.fit_predict(X_test, k=1)
print(accuracy(y_test, preds))

# Alternative models kept for reference (disabled):
# model = MyNaiveBayes(smoothing=False)
# model.fit(X_train, y_train)
# predictions = model.predict(X_test)
# print(accuracy(y_test, predictions))

# model = MyNaiveBayes(smoothing=True)
# model.fit(X_train, y_train)
# predictions = model.predict(X_test)
# print(accuracy(y_test, predictions))
num_claesses = len(classes)
samples_per_class = 7

# Subsample the data to speed up the distance computations.
num_training = 5000
mask = range(num_training)  # indices (0, 5000), step=1
x_train = x_train[mask]  # 5000*32*32*3
y_train = y_train[mask]
num_test = 500
mask = range(num_test)
x_test = x_test[mask]
# BUG FIX: the original assigned y_test = x_test[mask] — image data instead
# of labels — which makes the accuracy comparison below meaningless.
y_test = y_test[mask]

# Flatten each image into a row vector.
x_train = np.reshape(x_train, (x_train.shape[0], -1))
x_test = np.reshape(x_test, (x_test.shape[0], -1))

classifier = KNearestNeighbor()
classifier.train(x_train, y_train)

# Compare accuracy.
# dists = classifier.compute_distance_two_loops(x_test)
dists = classifier.compute_distance_one_loops(x_test)
y_test_pred = classifier.predict_labels(dists, k=1)
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('get %d / %d correct =>accuracy : %f' % (num_correct, num_test, accuracy))

# dists_one = classifier.compute_distance_one_loops(x_test)
# difference = np.linalg.norm(dists - dists_one, ord='fro')  # matrix norm
# print('difference was : %f' % difference)
#为了加快我们的训练速度,我们只选取5000张训练集,500张测试集 num_training = 5000 mask = range(num_training) x_train = x_train[mask] y_train = y_train[mask] num_test = 500 mask = range(num_test) x_test = x_test[mask] y_test = y_test[mask] #至此,数据载入部分已经算是完成了,但是为了欧氏距离的计算,我们把得到的图像数据拉长成行向量 x_train = np.reshape(x_train, (x_train.shape[0], -1)) x_test = np.reshape(x_test, (x_test.shape[0], -1)) print(x_train.shape, x_test.shape) classifier = KNearestNeighbor() classifier.train(x_train, y_train) dists = classifier.compute_distances_two_loops(x_test) #也可用其他两种方法 y_test_pred = classifier.predict_labels(dists, k=1) #模型评估也是机器学习中的一个重要概念,这里我们使用准确率作为模型的评价指标, num_correct = np.sum(y_test_pred == y_test) accuracy = float(num_correct) / num_test print('got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy)) #这三种方法的差别在于它们的计算时间不同,我们来做下比较。比较代码如下: import time def time_function(f, *args): tic = time.time() f(*args)
from knn import KNearestNeighbor
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


def test_model(model):
    """Fit `model` on the train split and return its test-set accuracy."""
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return accuracy_score(y_test, predictions)


# Iris dataset; 33% of the samples are used for training, the rest for testing.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(np.array(iris.data),
                                                    np.array(iris.target),
                                                    train_size=0.33)

# Compare the hand-rolled KNN against scikit-learn's reference implementation.
my_accuracy = test_model(KNearestNeighbor(k=3))
sklearn_accuracy = test_model(KNeighborsClassifier(n_neighbors=3))
print('My accuracy: {} \nsklearn accuracy: {}'.format(my_accuracy,
                                                      sklearn_accuracy))
data = processor.get_processed_data()
data = np.array(data)

# Shuffle the examples in place before creating the folds.
np.random.shuffle(data)
print('Processed Data')

# 10-fold cross-validation.
kf = KFold(n_splits=10)

# One accuracy value per fold.
accuracies = []

for train_index, test_index in kf.split(data):
    print("TRAIN:", train_index, "TEST:", test_index)
    train, test = data[train_index], data[test_index]

    # Build a 5-nearest-neighbour classifier on this fold's training portion.
    knn = KNearestNeighbor(5, train, processor.labels)

    # Predict every held-out point, then compare with its true label.
    fold_preds = [knn.predict_class(point['data']) for point in test]
    hits = [int(pred == sample['label'])
            for pred, sample in zip(fold_preds, test)]

    # Fold accuracy = fraction of correct predictions.
    accuracy = sum(hits) / len(hits)
    print(accuracy)
    accuracies.append(accuracy)

# Report how well each fold performed.
print(accuracies)
import data.data_utilities
import time
import matplotlib.pyplot as plt

data_dict = data.data_utilities.get_CIFAR10_data()

# Preparing the data: flatten each CIFAR-10 image into a 3072-vector.
Xtr = np.reshape(data_dict['X_train'], (data_dict['X_train'].shape[0], 3072))
Ytr = data_dict['y_train']
Xte = np.reshape(data_dict['X_test'], (data_dict['X_test'].shape[0], 3072))
Yte = data_dict['y_test']
Xval = np.reshape(data_dict['X_val'], (data_dict['X_val'].shape[0], 3072))
Yval = data_dict['y_val']

# Instantiating the classifier.
classifier = KNearestNeighbor()
classifier.train(Xtr, Ytr)

# Doing validation on k to find the best parameters.
ks = [1, 3, 5, 7, 9, 11, 13, 15]
k_accuracies = {}
# NOTE(review): `log` is intentionally left open here — it may be written to
# and closed past the end of this chunk; confirm against the full file.
log = open("knn_log.txt", "w")
for k in ks:
    print("k : {0}".format(k))
    y_test_pred = classifier.predict(Xval, k=k)
    num_correct = np.sum(y_test_pred == Yval)
    # BUG FIX: accuracy must be normalized by the number of *validation*
    # samples; the original divided by the training-set size Xtr.shape[0],
    # grossly under-reporting every accuracy.
    k_accuracies[k] = [float(num_correct) / Xval.shape[0]]
    log.write("With k : {0} got accuracy : {1}\n\n".format(k, k_accuracies[k][0]))
from knn import KNearestNeighbor

if __name__ == "__main__":
    # Two clusters of 2-D points: label 0 on the left, label 1 on the right.
    features = [
        [2, 2], [1, 2], [3, 4], [1, 1], [3, 3],
        [7, 2], [5, 2], [6, 1], [8, -0], [7, 3],
    ]
    labels = [0] * 5 + [1] * 5

    # Fit a 3-nearest-neighbour classifier on the toy data.
    model = KNearestNeighbor(3)
    model.train(features, labels)

    # Classify two unseen points and print the predicted labels.
    print(model.predict([[8, 3], [1, 1]]))