Example #1
import numpy as np
import matplotlib.pyplot as plt
# KNearestNeighbor is assumed to be the cs231n-style classifier defined elsewhere.


def Cross_validation(X_train, y_train):
    """Cross-validate to choose the hyperparameter K and visualize each K's accuracy.

    :param X_train: training set
    :param y_train: training labels
    """
    num_folds = 5
    k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]
    k_accuracy = {}
    # Split the training data into 5 folds
    X_train_folds = np.array_split(X_train, num_folds)
    y_train_folds = np.array_split(y_train, num_folds)
    # Evaluate each candidate K
    for k in k_choices:
        k_accuracy[k] = []
        # For each K, use each fold in turn as the validation set
        for index in range(num_folds):
            # Build the training/validation split for this fold
            X_te = X_train_folds[index]
            y_te = y_train_folds[index]
            # Concatenate the remaining folds (the original reshape computed
            # float dimensions with "/", which fails under Python 3)
            X_tr = np.concatenate(
                X_train_folds[:index] + X_train_folds[index + 1:])
            y_tr = np.concatenate(
                y_train_folds[:index] + y_train_folds[index + 1:])
            # Train on the remaining folds and score the held-out fold
            classify = KNearestNeighbor()
            classify.train(X_tr, y_tr)
            y_te_pred = classify.predict(X_te, k=k)
            accuracy = np.sum(y_te_pred == y_te) / float(X_te.shape[0])
            k_accuracy[k].append(accuracy)

    for k, accuracylist in k_accuracy.items():
        for accuracy in accuracylist:
            print("k = %d, accuracy = %.3f" % (k, accuracy))

    # Visualize the accuracies for each K
    for k in k_choices:
        accuracies = k_accuracy[k]
        plt.scatter([k] * len(accuracies), accuracies)
    accuracies_mean = np.array(
        [np.mean(v) for k, v in sorted(k_accuracy.items())])
    accuracies_std = np.array(
        [np.std(v) for k, v in sorted(k_accuracy.items())])
    # Error-bar plot from the per-K mean and standard deviation
    plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
    plt.title('Cross-validation on k')
    plt.xlabel('k')
    plt.ylabel('Cross-validation accuracy')
    plt.show()
Example #2
File: knn.py  Project: Jeeukrishnan/CS231N
################################################################################
# TODO:                                                                        #
# Perform k-fold cross validation to find the best value of k. For each       #
# possible value of k, run the k-nearest-neighbor algorithm num_folds times,  #
# where in each case you use all but one of the folds as training data and    #
# the last fold as a validation set. Store the accuracies for all folds and   #
# all values of k in the k_to_accuracies dictionary.                          #
################################################################################
for k in k_choices:
    k_to_accuracies[k] = []
    for i in range(num_folds):
        # prepare training data for the current fold
        X_train_fold = np.concatenate([ fold for j, fold in enumerate(X_train_folds) if i != j ])
        y_train_fold = np.concatenate([ fold for j, fold in enumerate(y_train_folds) if i != j ])
        
        # train on the remaining folds, then predict the held-out fold
        classifier.train(X_train_fold, y_train_fold)
        y_pred_fold = classifier.predict(X_train_folds[i], k=k, num_loops=0)

        # Compute the fraction of correctly predicted examples
        num_correct = np.sum(y_pred_fold == y_train_folds[i])
        accuracy = float(num_correct) / X_train_folds[i].shape[0]
        k_to_accuracies[k].append(accuracy)
        
################################################################################
#                                 END OF YOUR CODE                             #
################################################################################

# Print out the computed accuracies
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))
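The assignment then has you pick the best k from these results. A minimal follow-up sketch (best_k is a name introduced here for illustration, not part of the original snippet):

# choose the k whose mean cross-validation accuracy is highest
best_k = max(k_to_accuracies, key=lambda k: np.mean(k_to_accuracies[k]))
print('best k by mean cross-validation accuracy: %d' % best_k)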
Example #3
    # 5-fold cross-validation
    learner_type = "CLASSIFICATION"
    # Integer division so the slice bounds below are ints (plain "/" returns a
    # float in Python 3 and breaks the indexing)
    fold_size = data_instances.shape[0] // 5
    data_indices = np.arange(data_instances.shape[0])
    for k in range(1, 100, 5):
        total_performance = 0.0
        for holdout_fold_idx in range(5):
            # e.g. with 100 instances, fold_size = 20 and holdout_fold_idx = 2
            # hold out rows 40:60
            holdout = slice(fold_size * holdout_fold_idx,
                            fold_size * (holdout_fold_idx + 1))
            kNN_model = KNearestNeighbor(k, learner_type)
            # train on every instance outside the held-out fold
            kNN_model.train(data_instances[np.setdiff1d(data_indices,
                                                        data_indices[holdout])])
            kNN_model.condense_training_data()
            # predict the held-out fold and accumulate its performance
            predictions = kNN_model.predict(data_instances[holdout])
            # count successes; this assumes binary 0/1 labels in the last column
            successes = fold_size - np.sum(np.abs(
                predictions - data_instances[holdout, -1]))
            performance = successes / fold_size
            total_performance += performance
        ave_performance = total_performance / 5
        print("k = %d, score = %f" % (k, ave_performance))
Example #4
# Create a kNN classifier instance.
# Remember that training a kNN classifier is a noop:
# the Classifier simply remembers the data and does no further processing
# classifier = KNearestNeighbor()
# classifier.train(X_train, y_train)
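# A minimal sketch of that noop "training", in the spirit of the cs231n-style
# class (an assumption about its internals; the real class may differ):
#
#     def train(self, X, y):
#         self.X_train = X  # just memorize the data...
#         self.y_train = y  # ...no fitting happens for kNN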

# numK = [8,9,10,11,12,13,14,15,16]
numK = [12]
results = {}
bestValAcc = 0
bestK = None

for num in numK:
    knn = KNearestNeighbor()
    knn.train(X_train, y_train)
    y_train_pred = knn.predict(X_train, k=num)
    y_val_pred = knn.predict(X_val, k=num)
    trainAcc = np.mean(y_train == y_train_pred)
    valAcc = np.mean(y_val == y_val_pred)
    print('k: %d train accuracy: %.4f val accuracy: %.4f' % (num, trainAcc,
                                                             valAcc))
    if valAcc > bestValAcc:
        bestValAcc = valAcc
        bestK = num

print('best validation accuracy achieved: %.4f, with best k: %d' % (
    bestValAcc, bestK))

# Based on the cross-validation results above, choose the best value for k,
# retrain the classifier using all the training data, and test it on the test
# data.
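A hedged sketch of that final step (X_test and y_test are assumed to exist with the same layout as X_train and y_train; they are not defined in this snippet):

knn = KNearestNeighbor()
knn.train(X_train, y_train)  # kNN "training" just memorizes the data
y_test_pred = knn.predict(X_test, k=bestK)
testAcc = np.mean(y_test == y_test_pred)
print('test accuracy with k = %d: %.4f' % (bestK, testAcc))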
Example #5
x_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(Y_train, num_folds)

k_to_accuracies = {}

classifier = KNearestNeighbor()
for k in k_choices:
    accuracies = np.zeros(num_folds)
    for fold in range(num_folds):
        temp_X = x_train_folds[:]
        temp_y = y_train_folds[:]
        x_validate_fold = temp_X.pop(fold)
        y_validate_fold = temp_y.pop(fold)

        temp_X = np.concatenate(temp_X)
        temp_y = np.concatenate(temp_y)
        classifier.train(temp_X, temp_y)

        y_test_pred = classifier.predict(x_validate_fold, k=k)
        num_correct = np.sum(y_test_pred == y_validate_fold)
        # divide by the size of the held-out fold, not num_test (a bug in the
        # original, which used the test-set size here)
        accuracy = float(num_correct) / y_validate_fold.shape[0]
        accuracies[fold] = accuracy
    k_to_accuracies[k] = accuracies

for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))
Example #6
# Visualize the raw images
VisualizeImage(X_train, y_train)
input('Press Enter to run cross-validation...')

# Build a subset for hyperparameter tuning via cross-validation (a plain
# held-out validation set would also work, since there is still plenty of data)
num_training = 5000
X_tr = X_train[:num_training, ::]
X_tr = np.reshape(X_tr, (X_tr.shape[0], -1))
y_tr = y_train[:num_training]
# print(X_tr.shape, y_tr.shape)

num_testing = 500
X_te = X_test[:num_testing, ::]
X_te = np.reshape(X_te, (X_te.shape[0], -1))
y_te = y_test[:num_testing]
# print(X_te.shape, y_te.shape)

# Use cross-validation to choose K
Cross_validation(X_tr, y_tr)
input('Press Enter to train the model...')

# Train on the full training set (here 5,000 examples serve as the training set
# and 500 as the test set, since all 60,000 would exhaust memory; per the plot,
# k = 10 looks best)
classify = KNearestNeighbor()
classify.train(X_tr, y_tr)
y_te_pred = classify.predict(X_te, k=10)
accuracy = np.sum(y_te_pred == y_te) / float(X_te.shape[0])
print('Final test: K = %d, accuracy = %.3f' % (10, accuracy))


Example #7
X_train_folds = np.array(np.split(X_train, num_folds))
y_train_folds = np.array(np.split(y_train, num_folds))
k_to_accuracies = {}
# test each k
for k in k_choices:
    # loop for each validation fold
    for val_idx in range(num_folds):
        # get a list of indexes of training folds, e.g. [1,2,3,4] [0,2,3,4]
        train_idx = [i for i in range(num_folds) if i != val_idx]
        # get training set x & y
        X_train_set = np.concatenate(X_train_folds[train_idx])
        y_train_set = np.concatenate(y_train_folds[train_idx])
        # train
        knn_classifer.train(X_train_set, y_train_set)
        # get prediction with current validation fold
        predict_y = knn_classifer.predict(X_train_folds[val_idx], k)
        # compute acc for the current validation fold
        accuracy = np.mean(predict_y == y_train_folds[val_idx])
        # store the accuracy
        k_to_accuracies.setdefault(k, []).append(accuracy)

# print out the computed accuracies
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))
    print('mean for k = %d is %f' % (k, np.mean(k_to_accuracies[k])))
# plot
for k in k_choices:
    accuracies = k_to_accuracies[k]
    plt.scatter([k] * len(accuracies), accuracies)
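The snippet ends at the scatter plot; a typical continuation (mirroring the end of Example #1) adds error bars from the per-k mean and standard deviation and renders the figure:

accuracies_mean = np.array([np.mean(v) for k, v in sorted(k_to_accuracies.items())])
accuracies_std = np.array([np.std(v) for k, v in sorted(k_to_accuracies.items())])
plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
plt.title('Cross-validation on k')
plt.xlabel('k')
plt.ylabel('Cross-validation accuracy')
plt.show()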