def crossValidate(X_fold, y_fold, k, idx):
    """Train on all folds except fold `idx` and evaluate on fold `idx`."""
    # Fold idx is held out as the validation set.
    X_cross = X_fold[idx]
    y_cross = y_fold[idx]
    # The remaining folds form the training set. (The original looped over all
    # folds and rebuilt the training set on every non-idx iteration, which left
    # the wrong fold excluded; building it once from idx is both correct and faster.)
    X_train = np.vstack(X_fold[:idx] + X_fold[idx + 1:])
    y_train = np.hstack(y_fold[:idx] + y_fold[idx + 1:])

    classifier = KNearestNeighbor()
    classifier.train(X_train, y_train)
    dists = classifier.compute_distances_no_loops(X_cross)
    y_cross_pred = classifier.predict_labels(dists, k)

    num_correct = np.sum(y_cross_pred == y_cross)
    print('cross val has ', y_cross.shape)
    accuracy = float(num_correct) / len(y_cross)
    return accuracy
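# A minimal driver sketch for crossValidate above, assuming X_train/y_train are
# already flattened into rows; run_cross_validation is a hypothetical helper
# name, and num_folds/k are illustrative values:
import numpy as np

def run_cross_validation(X_train, y_train, num_folds=5, k=5):
    X_folds = np.array_split(X_train, num_folds)
    y_folds = np.array_split(y_train, num_folds)
    accuracies = [crossValidate(X_folds, y_folds, k, idx) for idx in range(num_folds)]
    print('mean accuracy over %d folds: %f' % (num_folds, np.mean(accuracies)))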
def cal_standard_knn():
    # Create a kNN classifier instance.
    # Remember that training a kNN classifier is a noop:
    # the classifier simply remembers the data and does no further processing.
    classifier = KNearestNeighbor()
    classifier.train(X_train, y_train)
    print('KNN Classifier Train Done\n')

    # ------------------------------------------------------------
    # Open cs231n/classifiers/k_nearest_neighbor.py and implement
    # compute_distances_two_loops. Test your implementation:
    print('Ready to test with 2 loops')
    # dists = classifier.compute_distances_two_loops(X_test)
    # print(dists.shape)

    print('Ready to test with 1 loop')
    # dists = classifier.compute_distances_one_loop(X_test)
    # print(dists.shape)

    print('Ready to test with 0 loops\n')
    dists = classifier.compute_distances_no_loops(X_test)
    print(dists.shape)

    # ------------------------------------------------------------
    print('Ready to predict')
    y_pred = classifier.predict_labels(dists, 3)
    print('Accuracy = %s' % np.mean(y_pred == y_test))
def use_classifier(x_train, y_train, x_test, y_test, k):
    classifier = KNearestNeighbor()
    classifier.train(x_train, y_train)
    # get_distance is assumed to be defined elsewhere in this module;
    # accuracy_score comes from sklearn.metrics.
    dists = get_distance(classifier, x_test)
    y_pred = classifier.predict_labels(dists, k)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy
def test(X_train, y_train, X_test, y_test, best_k):
    classifier = KNearestNeighbor()
    classifier.train(X_train, y_train)
    y_test_pred = classifier.predict(X_test, k=best_k)

    # Compute and display the accuracy.
    num_correct = np.sum(y_test_pred == y_test)
    accuracy = float(num_correct) / len(y_test)
    print('Best k=%d' % best_k)
    print('Got %d / %d correct => accuracy: %f' % (num_correct, len(y_test), accuracy))
def classifier():
    train_data = np.array([
        [1, 2, 2, 1],
        [4, 3, 4, 4],
        [3, 4, 4, 2],
    ])
    train_labels = np.array([1, 2, 2])
    knn = KNearestNeighbor()
    knn.train(train_data, train_labels)
    return knn
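# A minimal usage sketch of the classifier() fixture above; check_classifier
# is a hypothetical helper, and the query point and expected label are
# illustrative, not from the original test suite:
def check_classifier():
    import numpy as np
    knn = classifier()
    query = np.array([[4, 4, 4, 4]])
    # The nearest training row to the query is [4, 3, 4, 4], which has label 2.
    pred = knn.predict(query, k=1)
    assert pred[0] == 2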
def cross_validation(train_data, train_label):
    """Choose the best hyperparameter k via cross-validation."""
    num_folds = 5
    k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

    # Task:
    # Split the training data into folds. After splitting, x_train_folds and
    # y_train_folds are lists of length num_folds, where y_train_folds[i] is
    # a vector holding the labels of the samples in x_train_folds[i].
    # Hint: numpy's array_split does this directly.
    x_train_folds = np.array_split(train_data, num_folds)
    y_train_folds = np.array_split(train_label, num_folds)

    # Store the accuracies for each k in a dictionary. After cross-validation,
    # k_to_accuracies[k] holds a list of length num_folds with the accuracies
    # obtained for that value of k.
    k_to_accuracies = {}

    # Task:
    # Find the best k via k-fold cross-validation. For each k, run kNN
    # num_folds times; in each run one fold is the validation set and the
    # remaining folds are the training set. Store the per-fold results in
    # k_to_accuracies.
    classifiers = KNearestNeighbor()
    for k in k_choices:
        accuracies = np.zeros(num_folds)
        for fold in range(num_folds):
            temp_x = x_train_folds.copy()
            temp_y = y_train_folds.copy()
            # Build the validation set.
            x_validate_fold = temp_x.pop(fold)
            y_validate_fold = temp_y.pop(fold)
            # Build the training set from the remaining folds.
            x_temp_train_fold = np.array([x for x_fold in temp_x for x in x_fold])
            y_temp_train_fold = np.array([y for y_fold in temp_y for y in y_fold])
            classifiers.train(x_temp_train_fold, y_temp_train_fold)
            # Evaluate on the held-out fold.
            y_test_predicted = classifiers.predict(x_validate_fold, k, 0)
            num_correct = np.sum(y_test_predicted == y_validate_fold)
            accuracy = float(num_correct) / y_validate_fold.shape[0]
            accuracies[fold] = accuracy
        k_to_accuracies[k] = accuracies

    # Print the accuracies.
    for k in sorted(k_to_accuracies):
        for accuracy in k_to_accuracies[k]:
            print('k = %d, accuracy = %f' % (k, accuracy))

    # Scatter-plot every per-fold accuracy.
    for k in k_choices:
        accuracies = k_to_accuracies[k]
        plt.scatter([k] * len(accuracies), accuracies)

    # Plot the trend line with error bars showing the mean and standard
    # deviation of the accuracy at each k.
    accuracies_mean = np.array([np.mean(k_to_accuracies[k]) for k in sorted(k_to_accuracies)])
    accuracies_std = np.array([np.std(k_to_accuracies[k]) for k in sorted(k_to_accuracies)])
    plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
    plt.title('Cross-validation on k')
    plt.xlabel('k')
    plt.ylabel('Cross-validation accuracy')
    plt.show()
def test_cross_validation(X_train, y_train):
    print('Ready to test with cross_validation')
    num_folds = 5
    k_choices = [1, 3, 5, 8, 10]

    print('Train data shape = ', X_train.shape)
    y_train = y_train.reshape(-1, 1)
    print('Train label shape = ', y_train.shape)
    X_train_folds = np.array_split(X_train, num_folds)
    y_train_folds = np.array_split(y_train, num_folds)

    k_to_accuracies = {}
    for each_k in k_choices:
        k_to_accuracies.setdefault(each_k, [])
        for i in range(num_folds):
            classifier = KNearestNeighbor()
            # All folds except fold i form the training set.
            X_train_slice = np.vstack(X_train_folds[0:i] + X_train_folds[i + 1:num_folds])
            y_train_slice = np.vstack(y_train_folds[0:i] + y_train_folds[i + 1:num_folds]).reshape(-1)
            # Fold i is the validation set.
            X_test_slice = X_train_folds[i]
            y_test_slice = y_train_folds[i].reshape(-1)

            classifier.train(X_train_slice, y_train_slice)
            dists = classifier.compute_distances_no_loops(X_test_slice)
            y_predict = classifier.predict_labels(dists, each_k)
            acc = np.mean(y_predict == y_test_slice)
            k_to_accuracies[each_k].append(acc)

    for each_k in k_choices:
        for item in k_to_accuracies[each_k]:
            print('k = %d, acc = %f' % (each_k, item))
def main():
    X_train, y_train, X_test, y_test = gen_train_test(5000, 500)
    num_test = y_test.shape[0]

    classifier = KNearestNeighbor()
    classifier.train(X_train, y_train)

    starttime = datetime.datetime.now()
    dists = classifier.compute_distances_one_loop(X_test)
    endtime = datetime.datetime.now()
    print((endtime - starttime).seconds)
    print(dists.shape)

    y_test_pred = classifier.predict_labels(dists, k=5)

    # Compute and print the fraction of correctly predicted examples.
    num_correct = np.sum(y_test_pred == y_test)
    accuracy = float(num_correct) / num_test
    print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
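# A hedged sketch of what compute_distances_one_loop (timed above) might look
# like; it is one possible implementation, not the reference solution, shown
# standalone although it lives as a method on KNearestNeighbor:
import numpy as np

def compute_distances_one_loop(self, X):
    num_test = X.shape[0]
    num_train = self.X_train.shape[0]
    dists = np.zeros((num_test, num_train))
    for i in range(num_test):
        # Broadcasting subtracts test row i from every training row at once.
        dists[i, :] = np.sqrt(np.sum((self.X_train - X[i]) ** 2, axis=1))
    return dists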
def cross_validate(X_train, y_train):
    num_folds = 5
    k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

    N = len(X_train)
    # Split the training indices into num_folds folds.
    train_folds = np.array_split(np.arange(N), num_folds)

    k_to_accuracies = {}
    for k1 in k_choices:
        fold_eval = []
        for i in range(num_folds):
            # Boolean mask selecting every fold except fold i for training.
            mask = np.ones(N, dtype=bool)
            mask[train_folds[i]] = False
            X_train_cur = X_train[mask]
            y_train_cur = y_train[mask]

            classifier = KNearestNeighbor()
            classifier.train(X_train_cur, y_train_cur)

            X_test_cur = X_train[train_folds[i]]
            y_test_cur = y_train[train_folds[i]]
            dists = classifier.compute_distances_no_loops(X_test_cur)
            y_test_pred = classifier.predict_labels(dists, k=k1)

            num_correct = np.sum(y_test_pred == y_test_cur)
            accuracy = float(num_correct) / len(y_test_cur)
            fold_eval.append(accuracy)
        k_to_accuracies[k1] = fold_eval[:]

    for k in sorted(k_to_accuracies):
        for accuracy in k_to_accuracies[k]:
            print('k = %d, accuracy = %f' % (k, accuracy))

    for k in k_choices:
        accuracies = k_to_accuracies[k]
        plt.scatter([k] * len(accuracies), accuracies)

    accuracies_mean = np.array([np.mean(v) for k, v in sorted(k_to_accuracies.items())])
    accuracies_std = np.array([np.std(v) for k, v in sorted(k_to_accuracies.items())])
    plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
    plt.title('Cross-validation on k')
    plt.xlabel('k')
    plt.ylabel('Cross-validation accuracy')
    plt.savefig('./figures/validation_k')
def cross_validate(X_train, y_train, num_folds=5):
    k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]
    X_train_folds = np.array_split(X_train, num_folds)
    y_train_folds = np.array_split(y_train, num_folds)

    # A dictionary holding the accuracies for different values of k that we
    # find when running cross-validation. After running cross-validation,
    # k_to_accuracies[k] is a list of length num_folds giving the different
    # accuracy values that we found when using that value of k.
    k_to_accuracies = {k: [] for k in k_choices}

    for i in range(num_folds):
        X_train_cv = np.vstack(X_train_folds[:i] + X_train_folds[i + 1:])
        y_train_cv = np.hstack(y_train_folds[:i] + y_train_folds[i + 1:])
        X_val = X_train_folds[i]
        y_val = y_train_folds[i]

        classifier = KNearestNeighbor()
        classifier.train(X_train_cv, y_train_cv)
        # Compute the distance matrix once per fold and reuse it for every k.
        dists_cv = classifier.compute_distances_no_loops(X_val)
        for k in k_choices:
            y_val_pred = classifier.predict_labels(dists_cv, k=k)
            num_correct = np.sum(y_val_pred == y_val)
            accuracy = float(num_correct) / len(y_val)
            k_to_accuracies[k].append(accuracy)

    # Print out the computed accuracies.
    for k in sorted(k_to_accuracies):
        for accuracy in k_to_accuracies[k]:
            print('k = %d, accuracy = %f' % (k, accuracy))

    plot_cross_validation(k_choices, k_to_accuracies)
    # Return the k with the highest mean cross-validation accuracy.
    sort_by_accuracy = sorted(k_to_accuracies, key=lambda k: np.mean(k_to_accuracies[k]))
    return sort_by_accuracy[-1]
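# A minimal usage sketch for cross_validate above; select_and_evaluate is a
# hypothetical helper name, `test` is the helper defined earlier in this file,
# and the data is assumed to be subsampled and reshaped into rows as in the
# surrounding snippets:
def select_and_evaluate(X_train, y_train, X_test, y_test):
    best_k = cross_validate(X_train, y_train)
    test(X_train, y_train, X_test, y_test, best_k)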
def main():
    X_train, y_train, X_test, y_test = load_CIFAR10('../cifar-10-batches-py')

    num_training = 48000
    mask = list(range(num_training))
    X_train = X_train[mask]
    y_train = y_train[mask]
    num_test = 1000
    mask = list(range(num_test))
    X_test = X_test[mask]
    y_test = y_test[mask]

    # Reshape the image data into rows.
    print(X_train.shape)   # (48000, 32, 32, 3)
    X_train = np.reshape(X_train, (X_train.shape[0], -1))
    print(X_train.shape)   # (48000, 3072)
    X_test = np.reshape(X_test, (X_test.shape[0], -1))
    print(X_train.shape, X_test.shape)   # (48000, 3072) (1000, 3072)

    classifier = KNearestNeighbor()
    classifier.train(X_train, y_train)
    y_test_pred = classifier.predict(X_test, k=5)
    print(y_test_pred)

    # Compute and display the accuracy.
    num_correct = np.sum(y_test_pred == y_test)
    accuracy = float(num_correct) / num_test
    print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
xTrain, yTrain, xTest, yTest = load_CIFAR10(cifar10)
lengthTrain = 5000
lengthTest = 500
xTrain = xTrain[:lengthTrain]
yTrain = yTrain[:lengthTrain]
xTrainOrgnl = xTrain
yTrainOrgnl = yTrain
xTest = xTest[:lengthTest]
yTest = yTest[:lengthTest]

# Reshape the image data into rows.
xTrain = np.reshape(
    xTrain, (lengthTrain, xTrain.shape[1] * xTrain.shape[2] * xTrain.shape[3]))
xTest = np.reshape(
    xTest, (lengthTest, xTest.shape[1] * xTest.shape[2] * xTest.shape[3]))

clsfr = KNearestNeighbor()
cvFold = 5
# A randomly sampled k grid was tried and then replaced by the fixed list:
# kValue = np.random.random_integers(1, 100, 10)
# kValue = kValue[np.argsort(kValue)]
kValue = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

xTrain = np.array(np.split(xTrain, cvFold))
yTrain = np.array(np.split(yTrain, cvFold))
kAccuracies = []
for ptr, k in enumerate(kValue):
    kValueAcc = []
    for i in range(0, cvFold):
        xValid = xTrain[i]
# In[10]:

# Reshape the image data into rows
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
print(X_train.shape, X_test.shape)


# In[11]:

from cs231n.classifiers import KNearestNeighbor

# Create a kNN classifier instance.
# Remember that training a kNN classifier is a noop:
# the Classifier simply remembers the data and does no further processing
classifier = KNearestNeighbor()
classifier.train(X_train, y_train)


# In[12]:

# We would now like to classify the test data with the kNN classifier. Recall that we can break down this process into two steps:
#
# 1. First we must compute the distances between all test examples and all train examples.
# 2. Given these distances, for each test example we find the k nearest examples and have them vote for the label.
#
# Let's begin by computing the distance matrix between all training and test examples. For example, if there are **Ntr** training examples and **Nte** test examples, this stage should result in a **Nte x Ntr** matrix where each element (i,j) is the distance between the i-th test and j-th train example.
#
# First, open `cs231n/classifiers/k_nearest_neighbor.py` and implement the function `compute_distances_two_loops` that uses a (very inefficient) double loop over all pairs of (test, train) examples and computes the distance matrix one element at a time.

# In[25]:
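# A hedged sketch of what compute_distances_two_loops might look like; it is
# one possible solution, not the reference implementation graded in
# cs231n/classifiers/k_nearest_neighbor.py, and self.X_train is assumed to be
# the array remembered by train():
import numpy as np

def compute_distances_two_loops(self, X):
    num_test = X.shape[0]
    num_train = self.X_train.shape[0]
    dists = np.zeros((num_test, num_train))
    for i in range(num_test):
        for j in range(num_train):
            # Euclidean (L2) distance between test row i and train row j.
            dists[i, j] = np.sqrt(np.sum((X[i] - self.X_train[j]) ** 2))
    return dists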
y_train = y_train[mask]
num_test = 500
mask = list(range(num_test))
x_test = x_test[mask]
y_test = y_test[mask]

# %%
# Reshape the image data into rows.
x_train = np.reshape(x_train, (x_train.shape[0], -1))
x_test = np.reshape(x_test, (x_test.shape[0], -1))
print(x_train.shape, x_test.shape)

# %%
from cs231n.classifiers import KNearestNeighbor

classifier = KNearestNeighbor()
classifier.train(x_train, y_train)

# %%
dists = classifier.compute_distances_no_loops(x_test)
print(dists)

# %%
# Visualize the distance matrix: each row is a single test example and its
# distances to the training examples.
plt.imshow(dists, interpolation='none')
plt.show()

# %%
def get_accuracy(classifier, x_test, y_test, k):
    # predict_labels expects a distance matrix, not raw samples,
    # so compute the distances first.
    dists = classifier.compute_distances_no_loops(x_test)
    y_test_pred = classifier.predict_labels(dists, k)
    return np.mean(y_test_pred == y_test)
X_test = np.reshape(X_test, (X_test.shape[0], -1))

from cs231n.classifiers import KNearestNeighbor

num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

X_train_folds = np.split(X_train, num_folds)
y_train_folds = np.split(y_train, num_folds)

k_to_accuracies = {}
for k_choice in k_choices:
    for i in range(num_folds):
        knn = KNearestNeighbor()
        # Flatten all folds except fold i into one training set.
        xtrain = X_train_folds[:i] + X_train_folds[i + 1:]
        xtrain = np.asarray([item for sublist in xtrain for item in sublist])
        ytrain = y_train_folds[:i] + y_train_folds[i + 1:]
        ytrain = np.asarray([item for sublist in ytrain for item in sublist])
        knn.train(xtrain, ytrain)

        # Fold i is the validation set.
        dists = knn.compute_distances_no_loops(np.asarray(X_train_folds[i]))
        y_test_pred = knn.predict_labels(dists, k=k_choice)
        num_correct = np.sum(y_test_pred == y_train_folds[i])
        accuracy = float(num_correct) / len(y_train_folds[i])
        k_to_accuracies.setdefault(k_choice, []).append(accuracy)
        print('k = %d, accuracy = %f' % (k_choice, accuracy))

for k in k_choices:
    accuracies = k_to_accuracies[k]
    plt.scatter([k] * len(accuracies), accuracies)
mask = list(range(num_test))
X_test = X_test[mask]
y_test = y_test[mask]

# Reshape the image data into rows
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
print(X_train.shape, X_test.shape)

# %%
from cs231n.classifiers import KNearestNeighbor

# Create a kNN classifier instance.
# Remember that training a kNN classifier is a noop:
# the Classifier simply remembers the data and does no further processing
classifier = KNearestNeighbor()
classifier.train(X_train, y_train)

# %% [markdown]
# We would now like to classify the test data with the kNN classifier. Recall that we can break down this process into two steps:
#
# 1. First we must compute the distances between all test examples and all train examples.
# 2. Given these distances, for each test example we find the k nearest examples and have them vote for the label.
#
# Let's begin by computing the distance matrix between all training and test examples. For example, if there are **Ntr** training examples and **Nte** test examples, this stage should result in a **Nte x Ntr** matrix where each element (i,j) is the distance between the i-th test and j-th train example.
#
# **Note: For the three distance computations that we require you to implement in this notebook, you may not use the np.linalg.norm() function that numpy provides.**
#
# First, open `cs231n/classifiers/k_nearest_neighbor.py` and implement the function `compute_distances_two_loops` that uses a (very inefficient) double loop over all pairs of (test, train) examples and computes the distance matrix one element at a time.

# %%
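# A hedged sketch of the fully vectorized distance computation (one possible
# compute_distances_no_loops, not the reference solution). It uses the
# expansion ||x - y||^2 = ||x||^2 - 2*x.y + ||y||^2, which honors the
# no-np.linalg.norm rule stated above:
import numpy as np

def compute_distances_no_loops(self, X):
    test_sq = np.sum(X ** 2, axis=1, keepdims=True)    # shape (Nte, 1)
    train_sq = np.sum(self.X_train ** 2, axis=1)       # shape (Ntr,)
    cross = X.dot(self.X_train.T)                      # shape (Nte, Ntr)
    # Clip tiny negatives caused by floating-point error before the sqrt.
    return np.sqrt(np.maximum(test_sq - 2.0 * cross + train_sq, 0.0))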
num_test = 500
mask = list(range(num_test))
X_test = X_test[mask]
y_test = y_test[mask]

# Reshape the image data into rows
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
print(X_train.shape, X_test.shape)

from cs231n.classifiers import KNearestNeighbor

# Create a kNN classifier instance.
# Remember that training a kNN classifier is a noop:
# the Classifier simply remembers the data and does no further processing
classifier = KNearestNeighbor()
classifier.train(X_train, y_train)

"""
dists = classifier.compute_distances_two_loops(X_test)
print(dists.shape)
plt.imshow(dists, interpolation='none')
plt.show()

# Now implement the function predict_labels and run the code below:
# We use k = 1 (which is Nearest Neighbor).
y_test_pred = classifier.predict_labels(dists, k=1)

# Compute and print the fraction of correctly predicted examples
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
# In[6]:

# Reshape the image data into rows
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
print(X_train.shape, X_test.shape)


# In[7]:

from cs231n.classifiers import KNearestNeighbor

# Create a kNN classifier instance.
# Remember that training a kNN classifier is a noop:
# the Classifier simply remembers the data and does no further processing
classifier = KNearestNeighbor()
classifier.train(X_train, y_train)

# We would now like to classify the test data with the kNN classifier. Recall that we can break down this process into two steps:
#
# 1. First we must compute the distances between all test examples and all train examples.
# 2. Given these distances, for each test example we find the k nearest examples and have them vote for the label.
#
# Let's begin by computing the distance matrix between all training and test examples. For example, if there are **Ntr** training examples and **Nte** test examples, this stage should result in a **Nte x Ntr** matrix where each element (i,j) is the distance between the i-th test and j-th train example.
#
# First, open `cs231n/classifiers/k_nearest_neighbor.py` and implement the function `compute_distances_two_loops` that uses a (very inefficient) double loop over all pairs of (test, train) examples and computes the distance matrix one element at a time.


# In[8]:

# Open cs231n/classifiers/k_nearest_neighbor.py and implement
# compute_distances_two_loops.
num_test = 500
mask = list(range(num_test))
X_test = X_test[mask]
y_test = y_test[mask]

# Reshape the image data into rows
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
print('The shape of the new selected training dataset:', X_train.shape, X_test.shape)

from cs231n.classifiers import KNearestNeighbor

# Create a kNN classifier instance.
# The classifier simply remembers the data and does no further processing.
classifier = KNearestNeighbor()
classifier.train(X_train, y_train)

# Open cs231n/classifiers/k_nearest_neighbor.py and implement
# compute_distances_two_loops, then compute the distances:
dists = classifier.compute_distances_two_loops(X_test)
print(dists.shape)

# We can visualize the distance matrix: each row is a single test example
# and its distances to the training examples.
plt.imshow(dists, interpolation='none')
plt.show()

# Now implement the prediction function predict_labels and run the code below.
# First try k = 1.
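# A hedged sketch of what predict_labels might look like (one possible
# implementation, not the reference solution): for each test row, take the
# labels of the k closest training points and majority-vote; np.argmax over
# np.bincount breaks ties toward the smaller label, which assumes the labels
# are non-negative integers as in CIFAR-10.
import numpy as np

def predict_labels(self, dists, k=1):
    num_test = dists.shape[0]
    y_pred = np.zeros(num_test, dtype=int)
    for i in range(num_test):
        # Labels of the k nearest training examples for test example i.
        closest_y = self.y_train[np.argsort(dists[i])[:k]]
        y_pred[i] = np.argmax(np.bincount(closest_y))
    return y_pred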
def test1():
    cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)
    print('Training data shape:', X_train.shape)
    print('Training label shape:', y_train.shape)
    print('Test data shape:', X_test.shape)
    print('Test label shape:', y_test.shape)

    # Visualize a few samples per class:
    # classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
    # num_classes = len(classes)
    # sample_per_class = 7
    # for y, cls in enumerate(classes):
    #     idxs = np.flatnonzero(y_train == y)
    #     idxs = np.random.choice(idxs, sample_per_class, replace=False)
    #     for i, idx in enumerate(idxs):
    #         plt_idx = i * num_classes + y + 1
    #         plt.subplot(sample_per_class, num_classes, plt_idx)
    #         plt.imshow(X_train[idx].astype('uint8'))
    #         plt.axis('off')
    #         if i == 0:
    #             plt.title(cls)
    # plt.savefig("./figures/cifar_sample.png")
    # plt.show()
    # plt.close()

    num_training = 5000
    mask = range(num_training)
    X_train = X_train[mask]
    y_train = y_train[mask]
    num_test = 500
    mask = range(num_test)
    X_test = X_test[mask]
    y_test = y_test[mask]

    X_train = np.reshape(X_train, (X_train.shape[0], -1))
    X_test = np.reshape(X_test, (X_test.shape[0], -1))
    print(X_train.shape, X_test.shape)

    from cs231n.classifiers import KNearestNeighbor
    classifier = KNearestNeighbor()
    classifier.train(X_train, y_train)

    # Time the three distance implementations:
    # two_loop_time = time_function(classifier.compute_distances_two_loops, X_test)
    # print('two loop time %f' % two_loop_time)
    # one_loop_time = time_function(classifier.compute_distances_one_loop, X_test)
    # print('one loop time %f' % one_loop_time)
    # no_loop_time = time_function(classifier.compute_distances_no_loops, X_test)
    # print('no loop time %f' % no_loop_time)

    dists = classifier.compute_distances_no_loops(X_test)
    # Sanity-check the vectorized result against the loop versions:
    # dist_one_loop = classifier.compute_distances_one_loop(X_test)
    # dist_two_loops = classifier.compute_distances_two_loops(X_test)
    # matrix_compare(dists, dist_one_loop)
    # matrix_compare(dists, dist_two_loops)

    y_test_pred = classifier.predict_labels(dists, k=5)
    num_correct = np.sum(y_test_pred == y_test)
    accuracy = float(num_correct) / num_test
    print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))

    cross_validate(X_train, y_train)