def Cross_validation(X_train, y_train):
    """Run 5-fold cross-validation over candidate K values and plot the result.

    For every candidate K, each fold is held out once as the validation set
    while a ``KNearestNeighbor`` classifier is trained on the remaining
    folds.  Every per-fold accuracy is printed, then all accuracies are drawn
    as a scatter plot with a mean / standard-deviation error bar per K.

    :param X_train: training set
    :param y_train: training labels
    """
    num_folds = 5
    k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]
    k_accuracy = {}

    # Split the data into num_folds parts (array_split returns a list and
    # tolerates sizes that are not a multiple of num_folds).
    X_train_folds = np.array_split(X_train, num_folds)
    y_train_folds = np.array_split(y_train, num_folds)

    # Evaluate every candidate K.
    for k in k_choices:
        k_accuracy[k] = []
        # Each fold takes one turn as the held-out validation set.
        for index in range(num_folds):
            X_te = X_train_folds[index]
            y_te = y_train_folds[index]

            # Join the remaining folds.  np.concatenate derives the row count
            # from the data itself, so no shape has to be computed with "/"
            # (which yields a float under Python 3 and is rejected by
            # np.reshape) and uneven folds are handled as well.
            X_tr = np.concatenate(
                X_train_folds[:index] + X_train_folds[index + 1:])
            X_tr = X_tr.reshape(X_tr.shape[0], -1)  # one flat row per sample
            y_tr = np.concatenate(
                y_train_folds[:index] + y_train_folds[index + 1:]).reshape(-1)

            # Predict on the held-out fold and record the accuracy.
            classify = KNearestNeighbor()
            classify.train(X_tr, y_tr)
            y_te_pred = classify.predict(X_te, k=k)
            accuracy = np.sum(y_te_pred == y_te) / float(X_te.shape[0])
            k_accuracy[k].append(accuracy)

    for k, accuracylist in k_accuracy.items():
        for accuracy in accuracylist:
            print("k = %d, accuracy = %.3f" % (k, accuracy))

    # Visualize the effect of K: raw per-fold accuracies first.
    for k in k_choices:
        accuracies = k_accuracy[k]
        plt.scatter([k] * len(accuracies), accuracies)

    # Use one sorted key list for the x-axis and the statistics so the error
    # bar stays aligned even if k_choices is ever listed out of order.
    sorted_ks = sorted(k_accuracy)
    accuracies_mean = np.array([np.mean(k_accuracy[key]) for key in sorted_ks])
    accuracies_std = np.array([np.std(k_accuracy[key]) for key in sorted_ks])

    # Error-bar plot built from the per-K mean and standard deviation.
    plt.errorbar(sorted_ks, accuracies_mean, yerr=accuracies_std)
    plt.title('Cross-validation on k')
    plt.xlabel('k')
    plt.ylabel('Cross-validation accuracy')
    plt.show()
################################################################################
# TODO:                                                                        #
# Perform k-fold cross validation to find the best value of k. For each        #
# possible value of k, run the k-nearest-neighbor algorithm num_folds times,   #
# where in each case you use all but one of the folds as training data and the #
# last fold as a validation set. Store the accuracies for all fold and all     #
# values of k in the k_to_accuracies dictionary.                               #
################################################################################
for k in k_choices:
    k_to_accuracies[k] = []
    for i in range(num_folds):
        # Validation data is fold i; training data is every other fold,
        # stacked in their original order.
        X_val_fold = X_train_folds[i]
        y_val_fold = y_train_folds[i]
        X_rest = np.concatenate(
            list(X_train_folds[:i]) + list(X_train_folds[i + 1:]))
        y_rest = np.concatenate(
            list(y_train_folds[:i]) + list(y_train_folds[i + 1:]))

        # Fit on the training folds and predict the held-out fold.
        classifier.train(X_rest, y_rest)
        y_val_pred = classifier.predict(X_val_fold, k=k, num_loops=0)

        # Fraction of held-out examples that were predicted correctly.
        accuracy = float(np.sum(y_val_pred == y_val_fold)) / X_val_fold.shape[0]
        k_to_accuracies[k].append(accuracy)
################################################################################
#                                 END OF YOUR CODE                             #
################################################################################

# Print out the computed accuracies
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))
# 5 fold cross validation
learner_type = "CLASSIFICATION"
# Floor division keeps fold_size an int: it is used as a slice bound below,
# and a float bound (what "/" produces on Python 3) raises TypeError.
fold_size = data_instances.shape[0] // 5
data_indices = list(range(data_instances.shape[0]))

for k in range(1, 100, 5):
    total_performance = 0.0
    for holdout_fold_idx in range(5):
        # Row range of the fold that is held out in this round.
        fold_start = fold_size * holdout_fold_idx
        fold_end = fold_start + fold_size
        holdout_fold = data_instances[fold_start:fold_end]

        # Train on every row that is not part of the held-out fold.
        train_indices = np.setdiff1d(
            data_indices, data_indices[fold_start:fold_end])
        kNN_model = KNearestNeighbor(k, learner_type)
        kNN_model.train(data_instances[train_indices])
        kNN_model.condense_training_data()

        # predict test data using k-NN and average performance
        predictions = kNN_model.predict(holdout_fold)
        # The last column holds the target value.  Counting successes as
        # fold_size - sum(|prediction - target|) presumably relies on 0/1
        # class labels -- TODO confirm against the data loader.
        successes = fold_size - sum(abs(predictions - holdout_fold[:, -1]))
        # Float denominator so the ratio is never truncated by integer
        # division (Python 2 semantics for int / int).
        performance = successes / float(fold_size)
        total_performance += performance

    ave_performance = total_performance / 5
    print("k = %d, score = %f" % (k, ave_performance))
# Create a kNN classifier instance.
# Remember that training a kNN classifier is a noop:
# the Classifier simply remembers the data and does no further processing.
# Because of that, a single trained instance is shared by every candidate k
# instead of being rebuilt on each loop iteration.
knn = KNearestNeighbor()
knn.train(X_train, y_train)

# Candidate values of k.  A wider sweep that was used previously:
# numK = [8,9,10,11,12,13,14,15,16]
numK = [12]
results = {}
bestValAcc = 0
bestK = None

for num in numK:
    y_train_pred = knn.predict(X_train, k=num)
    y_val_pred = knn.predict(X_val, k=num)
    trainAcc = np.mean(y_train == y_train_pred)
    valAcc = np.mean(y_val == y_val_pred)
    # print() with a single pre-formatted argument behaves the same on
    # Python 2 and Python 3; the bare print statement is a SyntaxError on 3.
    print('k: %d train accuracy: %.4f val accuracy: %.4f' % (num, trainAcc, valAcc))
    # Always accept the first candidate so bestK is never left as None
    # (e.g. when every validation accuracy is 0), which would make the
    # '%d' in the summary line below raise TypeError.
    if bestK is None or valAcc > bestValAcc:
        bestValAcc = valAcc
        bestK = num

print('best validation accuracy achieved: %.4f, with best k : %d' % (
    bestValAcc, bestK))

# Based on the cross-validation results above, choose the best value for k,
# retrain the classifier using all the training data, and test it on the test
# data.
# Split the training data and labels into num_folds folds (lists of arrays).
x_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(Y_train, num_folds)

# Maps each candidate k to an array holding one accuracy per fold.
k_to_accuracies = {}
classifier = KNearestNeighbor()

for k in k_choices:
    accuracies = np.zeros(num_folds)
    for fold in range(num_folds):
        # Fold `fold` is the validation set for this round.
        x_validate_fold = x_train_folds[fold]
        y_validate_fold = y_train_folds[fold]

        # Stack the remaining folds into one training set.  np.concatenate
        # does this in a single vectorized call instead of re-collecting
        # every row through a Python-level nested loop.
        temp_X = np.concatenate(
            x_train_folds[:fold] + x_train_folds[fold + 1:])
        temp_y = np.concatenate(
            y_train_folds[:fold] + y_train_folds[fold + 1:])

        classifier.train(temp_X, temp_y)
        y_test_pred = classifier.predict(x_validate_fold, k=k)

        num_correct = np.sum(y_test_pred == y_validate_fold)
        # Normalize by the size of the validation fold that was actually
        # predicted, not by the unrelated external num_test, so the value
        # is a true fraction of correct predictions.
        accuracy = float(num_correct) / x_validate_fold.shape[0]
        accuracies[fold] = accuracy
    k_to_accuracies[k] = accuracies

# Print out the computed accuracies
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))
# Visualize the images
VisualizeImage(X_train, y_train)
input('Enter any key to Cross-validation...')

# Build the data used for hyper-parameter tuning by cross-validation
# (a plain validation set would also do, since there is plenty of data).
num_training = 5000
X_tr = X_train[:num_training, :]
X_tr = X_tr.reshape((X_tr.shape[0], -1))  # one flat row per sample
y_tr = y_train[:num_training]
# print(X_tr.shape, y_tr.shape)

num_testing = 500
X_te = X_test[:num_testing, :]
X_te = X_te.reshape((X_te.shape[0], -1))  # one flat row per sample
y_te = y_test[:num_testing]
# print(X_te.shape, y_te.shape)

# Determine the hyper-parameter K by cross-validation
Cross_validation(X_tr, y_tr)
input('Enter any key to train model...')

# Train on the "full" data set.  Here the 5000 samples serve as the full
# training set and the 500 samples as the test set (all 60000 samples are
# more than this machine's memory can take); according to the plot,
# k = 10 should be the best value.
classify = KNearestNeighbor()
classify.train(X_tr, y_tr)
y_te_pred = classify.predict(X_te, k=10)
accuracy = np.sum(y_te_pred == y_te) / float(X_te.shape[0])
print('最终测试: ' ' K = %d, accuracy = %.3f' % (10, accuracy))
# Split data and labels into num_folds equally sized folds and stack them
# into arrays so a list of fold indexes can select several folds at once.
X_train_folds = np.array(np.split(X_train, num_folds))
y_train_folds = np.array(np.split(y_train, num_folds))
k_to_accuracies = {}

# Evaluate every candidate k, holding out each fold once.
for k in k_choices:
    for val_idx in range(num_folds):
        # Indexes of the training folds, e.g. [1,2,3,4] then [0,2,3,4]
        train_idx = list(range(val_idx)) + list(range(val_idx + 1, num_folds))

        # Merge the selected folds into one training set (x and y).
        X_train_set = np.concatenate(X_train_folds[train_idx])
        y_train_set = np.concatenate(y_train_folds[train_idx])

        # Train, then predict the current validation fold.
        knn_classifer.train(X_train_set, y_train_set)
        predict_y = knn_classifer.predict(X_train_folds[val_idx], k)

        # Accuracy on the current validation fold.
        accuracy = np.mean(predict_y == y_train_folds[val_idx])

        # Record it under the current k, creating the list on first use.
        if k not in k_to_accuracies:
            k_to_accuracies[k] = []
        k_to_accuracies[k].append(accuracy)

# Report every per-fold accuracy followed by the mean for that k.
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d is %f' % (k, accuracy))
    print('mean for k = %d is %f' % (k, np.mean(k_to_accuracies[k])))

# Scatter the raw per-fold accuracies at x = k.
for k in k_choices:
    accuracies = k_to_accuracies[k]
    plt.scatter([k] * len(accuracies), accuracies)