def test_arange(self):
    train = np.arange(150).reshape(5, -1)
    test = np.square(np.arange(2, 122)).reshape(4, -1)
    knn = KNearestNeighbor()
    knn.train(train, None)
    d_two = knn.compute_distances_two_loops(test)
    d_one = knn.compute_distances_one_loop(test)
    d_no = knn.compute_distances_no_loops(test)
    self.assertAlmostEqual(0, np.linalg.norm(d_two - d_one, ord='fro'))
    self.assertAlmostEqual(0, np.linalg.norm(d_no - d_one, ord='fro'))
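# The test above checks that all three distance implementations agree. For
# reference, a minimal sketch of the fully vectorized version, assuming the
# classifier stores its training data as self.X_train (the attribute name is
# an assumption); it uses the expansion ||x - y||^2 = ||x||^2 - 2*x.y + ||y||^2.
def compute_distances_no_loops(self, X):
    # Squared norms of test rows, kept as a column for broadcasting.
    test_sq = np.sum(X ** 2, axis=1, keepdims=True)     # (num_test, 1)
    # Squared norms of training rows.
    train_sq = np.sum(self.X_train ** 2, axis=1)        # (num_train,)
    # Cross terms for every test/train pair.
    cross = X.dot(self.X_train.T)                       # (num_test, num_train)
    # Clamp at zero to guard against tiny negatives from floating point.
    return np.sqrt(np.maximum(test_sq - 2 * cross + train_sq, 0))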
def Cross_validation(X_train, y_train):
    """Cross-validation to choose the hyperparameter k, with a plot of the results.

    :param X_train: training data
    :param y_train: training labels
    """
    num_folds = 5
    k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]
    k_accuracy = {}
    # Split the data into 5 folds.
    X_train_folds = np.array_split(X_train, num_folds)
    y_train_folds = np.array_split(y_train, num_folds)
    # Evaluate each candidate k.
    for k in k_choices:
        k_accuracy[k] = []
        # For each k, compute the accuracy with each fold held out in turn.
        for index in range(num_folds):
            # Build the training/validation split. Integer division keeps
            # the shape argument an int (plain / fails under Python 3).
            X_te = X_train_folds[index]
            y_te = y_train_folds[index]
            fold_train_size = X_train.shape[0] * (num_folds - 1) // num_folds
            X_tr = np.reshape(
                X_train_folds[:index] + X_train_folds[index + 1:],
                (fold_train_size, -1))
            y_tr = np.reshape(
                y_train_folds[:index] + y_train_folds[index + 1:],
                (fold_train_size,))
            # Predict on the held-out fold.
            classify = KNearestNeighbor()
            classify.train(X_tr, y_tr)
            y_te_pred = classify.predict(X_te, k=k)
            accuracy = np.sum(y_te_pred == y_te) / float(X_te.shape[0])
            k_accuracy[k].append(accuracy)
    for k, accuracylist in k_accuracy.items():
        for accuracy in accuracylist:
            print("k = %d, accuracy = %.3f" % (k, accuracy))
    # Visualize the effect of each k.
    for k in k_choices:
        accuracies = k_accuracy[k]
        plt.scatter([k] * len(accuracies), accuracies)
    accuracies_mean = np.array(
        [np.mean(v) for k, v in sorted(k_accuracy.items())])
    accuracies_std = np.array(
        [np.std(v) for k, v in sorted(k_accuracy.items())])
    # Error-bar plot built from the mean and standard deviation per k.
    plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
    plt.title('Cross-validation on k')
    plt.xlabel('k')
    plt.ylabel('Cross-validation accuracy')
    plt.show()
# Parameters (from np.reshape):
#   a : array_like
#       Array to be reshaped.
#   newshape : int or tuple of ints
#       The new shape should be compatible with the original shape. If an
#       integer, then the result will be a 1-D array of that length. One
#       shape dimension can be -1.
from k_nearest_neighbor import KNearestNeighbor

# Create a kNN classifier instance.
# Remember that training a kNN classifier is a no-op:
# the classifier simply remembers the data and does no further processing.
classifier = KNearestNeighbor()
classifier.train(X_train, y_train)

# Open cs231n/classifiers/k_nearest_neighbor.py and implement
# compute_distances_two_loops.

# Test your implementation:
dists = classifier.compute_distances_two_loops(X_test)
print(dists.shape)

# We can visualize the distance matrix: each row is a single test example and
# its distances to training examples.
plt.imshow(dists, interpolation='none')
plt.show()

# Now implement the function predict_labels and run the code below:
# We use k = 1 (which is Nearest Neighbor).
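# A minimal sketch of what compute_distances_two_loops typically looks like,
# assuming the training data is stored as self.X_train (attribute name
# assumed); the vectorized variants must reproduce exactly this matrix.
def compute_distances_two_loops(self, X):
    num_test = X.shape[0]
    num_train = self.X_train.shape[0]
    dists = np.zeros((num_test, num_train))
    for i in range(num_test):
        for j in range(num_train):
            # Euclidean (L2) distance between test point i and training point j.
            dists[i, j] = np.sqrt(np.sum((X[i] - self.X_train[j]) ** 2))
    return dists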
    line_split = line.split(',')
    # list() is required under Python 3, where map() returns an iterator.
    data_instances.append(list(map(float, line_split)))
data_instances = np.array(data_instances)
np.random.shuffle(data_instances)

# 5-fold cross-validation.
learner_type = "CLASSIFICATION"
# Integer division so fold_size can be used as a slice index.
fold_size = data_instances.shape[0] // 5
data_indices = [idx for idx in range(data_instances.shape[0])]
for k in range(1, 100, 5):
    total_performance = 0.0
    for holdout_fold_idx in range(5):
        kNN_model = KNearestNeighbor(k, learner_type)
        # Train on everything outside the held-out fold.
        holdout = slice(fold_size * holdout_fold_idx,
                        fold_size * holdout_fold_idx + fold_size)
        kNN_model.train(data_instances[
            np.setdiff1d(data_indices, data_indices[holdout])])
        kNN_model.condense_training_data()
        # Predict the held-out fold using k-NN and average performance.
        predictions = kNN_model.predict(data_instances[holdout])
        successes = fold_size - sum(abs(
            predictions - data_instances[holdout, -1]))
        performance = successes / fold_size
# Create a kNN classifier instance.
# Remember that training a kNN classifier is a no-op:
# the classifier simply remembers the data and does no further processing.
# classifier = KNearestNeighbor()
# classifier.train(X_train, y_train)

# numK = [8, 9, 10, 11, 12, 13, 14, 15, 16]
numK = [12]
results = {}
bestValAcc = 0
bestK = None
for num in numK:
    knn = KNearestNeighbor()
    knn.train(X_train, y_train)
    y_train_pred = knn.predict(X_train, k=num)
    y_val_pred = knn.predict(X_val, k=num)
    trainAcc = np.mean(y_train == y_train_pred)
    valAcc = np.mean(y_val == y_val_pred)
    print('k: %d train accuracy: %.4f val accuracy: %.4f' % (num, trainAcc, valAcc))
    if valAcc > bestValAcc:
        bestValAcc = valAcc
        bestK = num
print('best validation accuracy achieved: %.4f, with best k: %d' % (bestValAcc, bestK))

# Based on the cross-validation results above, choose the best value for k,
# retrain the classifier using all the training data, and test it on the test
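# A minimal sketch of that follow-up step, assuming X_test and y_test are in
# scope alongside the names used above (bestK, X_train, y_train):
best_knn = KNearestNeighbor()
best_knn.train(X_train, y_train)
y_test_pred = best_knn.predict(X_test, k=bestK)
testAcc = np.mean(y_test == y_test_pred)
print('test accuracy with k = %d: %.4f' % (bestK, testAcc))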
k_to_accuracies = {}
for k in k_choices:
    k_to_accuracies.setdefault(k, [])
for i in range(num_folds):
    classifier = KNearestNeighbor()
    x_val_train = np.concatenate((x_train_folds[0:i], x_train_folds[i + 1:]), axis=0)
    x_val_train = x_val_train.reshape(-1, x_val_train.shape[2])
    y_val_train = np.concatenate((y_train_folds[0:i], y_train_folds[i + 1:]), axis=0)
    y_val_train = y_val_train.reshape(-1, y_val_train.shape[2])
    y_val_train = y_val_train[:, 0]
    classifier.train(x_val_train, y_val_train)
    for k in k_choices:
        y_val_pred = classifier.predict_labels(x_train_folds[i], k=k)
        num_correct = np.sum(y_val_pred == y_train_folds[i][:, 0])
        accuracy = float(num_correct) / len(y_val_pred)
        k_to_accuracies[k].append(accuracy)

for k in k_choices:
    accuracies = k_to_accuracies[k]
    plt.scatter([k] * len(accuracies), accuracies)
accuracies_mean = np.array(
    [np.mean(v) for k, v in sorted(k_to_accuracies.items())])
accuracies_std = np.array(
    [np.std(v) for k, v in sorted(k_to_accuracies.items())])
plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
y_train = y_train[mask]
num_test = 500
mask = range(num_test)
X_test = X_test[mask]
y_test = y_test[mask]

# Reshape the training and test data into rows.
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
print(X_train.shape)
print(X_test.shape)

classifier = KNearestNeighbor()
classifier.train(X_train, y_train)
dists = classifier.compute_distances_two_loops(X_test)
print(dists.shape)
plt.imshow(dists, interpolation='none')
plt.show()

# Now implement the function predict_labels and run the code below:
# with k = 1.
y_test_pred = classifier.predict_labels(dists, k=1)

# Compute and print the fraction of correctly predicted examples.
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
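# For reference, a minimal sketch of what predict_labels typically does,
# assuming integer class labels stored as self.y_train (attribute name
# assumed): take the k smallest distances in each row and return the
# majority label.
def predict_labels(self, dists, k=1):
    num_test = dists.shape[0]
    y_pred = np.zeros(num_test, dtype=int)
    for i in range(num_test):
        # Labels of the k closest training points for test point i.
        closest_y = self.y_train[np.argsort(dists[i])[:k]]
        # Majority vote; np.bincount breaks ties toward the smaller label.
        y_pred[i] = np.argmax(np.bincount(closest_y))
    return y_pred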
cifar_10_dir = './cifar-10-batches-py'
x_train, y_train, x_test, y_test = load_cifar10(cifar_10_dir)
print('train_data_shape:', x_train.shape)
print('train_labels_shape:', y_train.shape)
print('test_data_shape:', x_test.shape)
print('test_labels_shape:', y_test.shape)

x_train = x_train.reshape(x_train.shape[0], -1)
x_test = x_test.reshape(x_test.shape[0], -1)
num_train = x_train.shape[0]
num_test = x_test.shape[0]
# num_train = 5000
# mask = range(num_train)
# x_train = x_train[mask]
# y_train = y_train[mask]
# num_test = 500
# mask = range(num_test)
# x_test = x_test[mask]
# y_test = y_test[mask]

classifier = KNearestNeighbor()
classifier.train(x_train, y_train)
dists = classifier.compute_distance(x_test)
y_test_pred = classifier.predict_labels(dists, k=10)
num_correct = np.sum(y_test_pred == y_test)
# float() guards against integer division under Python 2.
accuracy = float(num_correct) / num_test
print('got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
X_train = X_train[mask]
y_train = y_train[mask]
num_test = 500
mask = list(range(num_test))
X_test = X_test[mask]
y_test = y_test[mask]

# Reshape the image data into rows.
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
print(X_train.shape, X_test.shape)

# Create a kNN classifier instance, k=1.
classifier = KNearestNeighbor()
classifier.train(X_train, y_train)
"""
dists = classifier.compute_distance_two_loops(X_test)
print('dists.shape is')
print(dists.shape)
# plt.imshow(dists, interpolation='none')
# plt.savefig('/home/hongyin/file/cs231n-assignment1/picFaster.jpg')
y_test_pred = classifier.predict_labels(dists, k=1)
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))

# k=5
y_test_pred = classifier.predict_labels(dists, k=5)
num_correct = np.sum(y_test_pred == y_test)
plt_idx = i * class_num + y + 1
plt.subplot(samples_pre_class, class_num, plt_idx)
plt.imshow(X_train[idx].astype('uint8'))
plt.axis('off')
if i == 0:
    plt.title(cls)
plt.show()

X_train = X_train.reshape(500, -1)
y_train = y_train.reshape(500, -1)
X_test = X_test.reshape(10, -1)
y_test = y_test.reshape(10, -1)
classifier = KNearestNeighbor()
classifier.train(X_train, y_train)
dists = classifier.compute_distance_two_loops(X_test)
dists_one = classifier.compute_distance_one_loop(X_test)
diff = np.linalg.norm(dists - dists_one, ord='fro')
if diff < 0.001:
    print('good')
else:
    print('bad')
y_pred = classifier.predict_labels(dists, 1)
# Count correct predictions against the test labels. The original compared
# against y_train and took len() of the np.where tuple, which is always 1.
num_correct = np.sum(y_pred == y_test.ravel())
print('accuracy: ', float(num_correct) / len(y_test))

# cross validation
num_folds = 5
k_chioces = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]
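# The comparison above checks the one-loop variant against the two-loop one.
# A minimal sketch of the partially vectorized version, again assuming the
# training data is stored as self.X_train (attribute name assumed):
def compute_distance_one_loop(self, X):
    num_test = X.shape[0]
    num_train = self.X_train.shape[0]
    dists = np.zeros((num_test, num_train))
    for i in range(num_test):
        # (num_train, D) - (D,) broadcasts across all training rows at once.
        dists[i, :] = np.sqrt(np.sum((self.X_train - X[i]) ** 2, axis=1))
    return dists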
# Visualize some images.
VisualizeImage(X_train, y_train)
input('Enter any key to Cross-validation...')

# Build a subsample for hyperparameter tuning by cross-validation
# (a plain validation set would also work, since the data set is large).
num_training = 5000
X_tr = X_train[:num_training, ::]
X_tr = np.reshape(X_tr, (X_tr.shape[0], -1))
y_tr = y_train[:num_training]
# print(X_tr.shape, y_tr.shape)
num_testing = 500
X_te = X_test[:num_testing, ::]
X_te = np.reshape(X_te, (X_te.shape[0], -1))
y_te = y_test[:num_testing]
# print(X_te.shape, y_te.shape)

# Cross-validation to determine k.
Cross_validation(X_tr, y_tr)
input('Enter any key to train model...')

# Train on the full data set (here 5000 examples serve as the full training
# set and 500 as the test set, since 60000 examples exceed this machine's
# memory; the plot shows k = 10 is best).
classify = KNearestNeighbor()
classify.train(X_tr, y_tr)
y_te_pred = classify.predict(X_te, k=10)
accuracy = np.sum(y_te_pred == y_te) / float(X_te.shape[0])
print('Final test: K = %d, accuracy = %.3f' % (10, accuracy))
centroids = kmeans_model.get_centroids()
for cluster_idx in range(len(clusters)):  # range, not Python 2 xrange
    ave_label = 0.0
    for instance in clusters[cluster_idx]:
        ave_label += instance[-1]
    if len(clusters[cluster_idx]) > 0:
        ave_label = ave_label / len(clusters[cluster_idx])
    if learner_type == "CLASSIFICATION":
        ave_label = int(round(ave_label))
    centroids[cluster_idx].append(ave_label)
# For classification, vote to determine the centroid's class;
# for regression, average to find the centroid's estimate.
# Feed the centroids into k-NN as training data.
kNN_model = KNearestNeighbor(best_ks[test[0]], learner_type)
kNN_model.train(centroids)
# Predict test data using k-NN and average performance.
holdout = slice(fold_size * holdout_fold_idx,
                fold_size * holdout_fold_idx + fold_size)
predictions = kNN_model.predict(data_instances[holdout])
if kNN_model.learner_type == "CLASSIFICATION":
    successes = fold_size - sum(abs(
        predictions - data_instances[holdout, -1]))
    performance = successes / fold_size
elif kNN_model.learner_type == "REGRESSION":
    performance = sum((predictions - \
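# Note on the snippet above: rounding the average label only amounts to a
# vote for binary 0/1 labels. For multi-class data a true majority vote is
# needed; a minimal sketch (the helper name majority_label is hypothetical):
from collections import Counter

def majority_label(instances):
    # Each instance stores its class label in the last position.
    labels = [instance[-1] for instance in instances]
    # most_common(1) returns [(label, count)] for the top label.
    return Counter(labels).most_common(1)[0][0]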
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]
X_train_folds = np.array(np.split(X_train, num_folds))
y_train_folds = np.array(np.split(y_train, num_folds))
k_to_accuracies = {}

# Test each k.
for k in k_choices:
    # Loop over each validation fold.
    for val_idx in range(num_folds):
        # Get the indexes of the training folds, e.g. [1, 2, 3, 4], [0, 2, 3, 4], ...
        train_idx = [i for i in range(num_folds) if i != val_idx]
        # Build the training x & y for this split.
        X_train_set = np.concatenate(X_train_folds[train_idx])
        y_train_set = np.concatenate(y_train_folds[train_idx])
        # Train.
        knn_classifer.train(X_train_set, y_train_set)
        # Predict on the current validation fold.
        predict_y = knn_classifer.predict(X_train_folds[val_idx], k)
        # Compute accuracy on the current validation fold.
        accuracy = np.mean(predict_y == y_train_folds[val_idx])
        # Store the accuracy.
        k_to_accuracies.setdefault(k, []).append(accuracy)

# Print out the computed accuracies.
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d is %f' % (k, accuracy))
    print('mean for k = %d is %f' % (k, np.mean(k_to_accuracies[k])))

# Plot.
for k in k_choices:
    accuracies = k_to_accuracies[k]
def run_model_with_cross_validation(model_name, knn_mode, k_number):
    # GET DATA - expect data_0 ... data_4.
    data_groups = [np.loadtxt('data_%d' % i, delimiter=',') for i in range(5)]
    NUM_GROUPS = len(data_groups)

    # For each data group, train on all the others and test on it.
    culminating_result = 0
    for test_group_id in range(NUM_GROUPS):
        # Form the training data from the other 4/5 of the data.
        train_data = np.array([])
        for train_group_id in range(len(data_groups)):
            if train_group_id != test_group_id:
                # Initialize train_data if necessary.
                if train_data.size == 0:
                    train_data = np.copy(data_groups[train_group_id])
                else:
                    train_data = np.concatenate(
                        (train_data, data_groups[train_group_id]), axis=0)
        print('train_data, group ', str(test_group_id), 'length: ', len(train_data))
        print(train_data)
        test_data = data_groups[test_group_id]

        result = 0
        if model_name == 'knn':
            model = KNearestNeighbor(train_data, k_number)
            model.train(train_data)
            print('KNN train data length', len(model.data))
            result = model.test(test_data, knn_mode)
        elif model_name == 'c_knn':
            model = CondensedKNearestNeighbor(train_data, k_number)
            # Mode is always majority; this is not used for regression.
            mode = "majority"
            model.train(train_data)
            print('condensed KNN train data length', len(model.data))
            result = model.test(test_data, mode)
        else:
            print('error - ', model_name, ' is not a supported model')
            return

        print('test_data, group ', str(test_group_id), 'length:', len(test_data))
        print(test_data)
        print()
        print('result of iteration ' + str(test_group_id))
        print(result)
        print()
        culminating_result = culminating_result + result

    final_average_result = culminating_result / NUM_GROUPS
    print()
    print('final average result:')
    print(final_average_result)
    print()
    return final_average_result
# Imports needed by this snippet.
import pickle
import numpy as np
import sklearn
from k_nearest_neighbor import KNearestNeighbor

if __name__ == '__main__':
    train_path = "/Users/zxj/Desktop/Mini1/train.pkl"
    train_data = pickle.load(open(train_path, "rb"))

    # Fixed parameters. Please do not change the fixed parameters.
    val_ratio = 0.2
    # Student parameters. You may want to change these in your experiment later.
    train_ratio = 1.0

    # Split train_data 0.8 : 0.2 into training and validation.
    train_num = int(train_data['data'].shape[0] * train_ratio * (1.0 - val_ratio))
    val_num = -1 * int(train_data['data'].shape[0] * train_ratio * val_ratio)

    KNN_classifier = KNearestNeighbor()
    KNN_classifier.train(train_data['data'][:train_num],
                         train_data['target'][:train_num])
    dists = KNN_classifier.compute_distances(train_data['data'][val_num:, :])
    k_choices = [2, 3, 5, 7, 9, 11, 15, 19]
    for k in k_choices:
        y_test_pred = KNN_classifier.predict_labels(dists, k)
        num_correct = np.sum(y_test_pred == train_data['target'][val_num:])
        accuracy = float(num_correct) / (-1 * val_num)
        print('For K= %d and train_ratio= %f, Got %d / %d correct => VAL_accuracy: %f'
              % (k, train_ratio, num_correct, -1 * val_num, accuracy))
x_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(Y_train, num_folds)
k_to_accuracies = {}

classifier = KNearestNeighbor()
for k in k_choices:
    accuracies = np.zeros(num_folds)
    for fold in range(num_folds):
        temp_X = x_train_folds[:]
        temp_y = y_train_folds[:]
        x_validate_fold = temp_X.pop(fold)
        y_validate_fold = temp_y.pop(fold)
        temp_X = np.concatenate(temp_X)
        temp_y = np.concatenate(temp_y)
        classifier.train(temp_X, temp_y)
        y_test_pred = classifier.predict(x_validate_fold, k=k)
        num_correct = np.sum(y_test_pred == y_validate_fold)
        # Divide by the size of the validation fold; the original divided by
        # num_test, which belongs to the held-out test set.
        accuracy = float(num_correct) / len(y_validate_fold)
        accuracies[fold] = accuracy
    k_to_accuracies[k] = accuracies

for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))