def run(X_train, y_train, X_test, y_test, _k=(1,)):
    """
    Evaluate a k-nearest-neighbour classifier on a test set for several k.

    The train/test distance matrix is computed once and reused for every
    value in ``_k``. For each k the test labels are predicted and the
    percentage of correctly classified test examples is printed.

    :param X_train: np mat, dimensions: N x D
    :param y_train: np mat, dimensions: N
    :param X_test: np mat, dimensions: M x D
    :param y_test: np mat, dimensions: M
    :param _k: iterable of int. The values of k to evaluate.
    :return: None. The accuracies are only printed.
    """
    # The distances do not depend on k, so they are computed a single time.
    dists = mlBasics.compute_euclidean_distances(X_train, X_test)
    print("Distances computed")

    for k in _k:
        # Predict labels for the current neighbourhood size.
        y_test_pred = mlBasics.predict_labels(dists, y_train, k=k)
        accuracy = np.mean(y_test_pred == y_test) * 100
        # A single pre-formatted string prints identically on Python 2 and 3.
        print('{0:0.02f} of test examples classified correctly. k = {1}'.format(
            accuracy, k))
def task1d(X_train, y_train, X_test, y_test):
    """
    Compare the computational cost of classifying the test data for two k.

    Label prediction is timed for k = 1 and k = 3 on one shared distance
    matrix. The additional time needed by the second k, and the accuracy
    reached with the second k, are printed.

    :param X_train: training samples
    :param y_train: training labels
    :param X_test: test samples
    :param y_test: test labels
    :return: None. The results are only printed.
    """
    # k = 1 neighbor and the k neighbor which performed best in 1c question
    k_set = [1, 3]
    dists = mlBasics.compute_euclidean_distances(X_train, X_test)

    # Prediction time and accuracy, one entry per k in k_set.
    c_cost_lists = []
    acc_lists = []
    for k in k_set:
        # Only the prediction step is timed; the distances are shared.
        start_time = time.time()
        y_test_pred = mlBasics.predict_labels(dists, y_train, k)
        c_cost_lists.append(time.time() - start_time)
        acc_lists.append(np.mean(y_test_pred == y_test) * 100)

    increased_time = c_cost_lists[1] - c_cost_lists[0]
    acc_my_classifier = acc_lists[1]
    # Two spaces before the value keep the historical output layout; a single
    # pre-formatted string prints identically on Python 2 and 3.
    print("The increased computation time is  {0}".format(increased_time))
    print("The accuracy of my classifier is  {0}".format(acc_my_classifier))
def task1b(X_train, y_train, X_test, y_test):
    """
    Classify the first 10 test images with a 1000-sample training subset.

    For each of the classes 0-9, 100 training samples are drawn at random
    without replacement. The first 10 test samples are then classified with
    k = 1 and k = 5; for each k the accuracy is printed and a confusion
    matrix is plotted.

    :param X_train: training samples, indexable by a boolean mask over y_train
    :param y_train: training labels
    :param X_test: test samples
    :param y_test: test labels
    :return: None
    """
    exm_num_per_class = 100
    k_set = [1, 5]
    class_labels = list(range(10))

    # Randomly subset the training set, class by class.
    sampled_x = []
    sampled_y = []
    for i in class_labels:
        class_mask = y_train == i
        x_train_i = X_train[class_mask]
        y_train_i = y_train[class_mask]
        random_indexes = np.random.choice(len(y_train_i),
                                          size=exm_num_per_class,
                                          replace=False)
        sampled_x.append(x_train_i[random_indexes])
        sampled_y.append(y_train_i[random_indexes])
    X_training = np.concatenate(sampled_x)
    # Labels are handed over as an array, like every other caller of
    # predict_labels does, instead of as a Python list of scalars.
    y_training = np.concatenate(sampled_y)

    X_testing = np.array(X_test[0:10])
    y_testing = y_test[0:10]

    # The distances do not depend on k, so compute them once for all k.
    dists = mlBasics.compute_euclidean_distances(X_training, X_testing)

    for k in k_set:
        y_test_pred = mlBasics.predict_labels(dists, y_training, k=k)
        accuracy = np.mean(y_test_pred == y_testing) * 100
        # The spacing reproduces the historical output layout; a single
        # pre-formatted string prints identically on Python 2 and 3.
        print('For k =  {0}  : {1:0.02f} of test examples classified correctly.'
              .format(k, accuracy))
        cm_1 = confusion_matrix(y_testing, y_test_pred, labels=class_labels)
        title = "Confusion Matrix k = " + str(k)
        plot_confusion_matrix(cm_1, class_labels, title)
def five_fold_CV(dataset, labels, k):
    """
    Estimate kNN accuracy for a given k with 5-fold cross validation.

    The data is cut into 5 consecutive, equally sized segments (trailing
    samples that do not fill a segment are ignored). Each segment is used
    once as the validation fold while the other four form the training set.

    :param dataset: 2-D array, one sample per row
    :param labels: labels aligned with the rows of ``dataset``
    :param k: number of neighbours
    :return: mean validation accuracy over the 5 folds, in percent
    """
    fold = 5
    # Floor division: the segment size is used as a slice bound.
    seg_size = dataset.shape[0] // fold
    bounds = [(i * seg_size, (i + 1) * seg_size) for i in range(fold)]
    segments = [dataset[lo:hi] for lo, hi in bounds]
    segment_labels = [labels[lo:hi] for lo, hi in bounds]

    acc = []
    for i in range(fold):
        train_ids = [j for j in range(fold) if j != i]
        # Train on every segment EXCEPT the held-out one. The empty int seed
        # is kept so integer input is widened by vstack before distances
        # are computed.
        tSet = np.vstack([np.empty((0, dataset.shape[1]), int)] +
                         [segments[j] for j in train_ids])
        train_labels = np.concatenate([segment_labels[j] for j in train_ids])

        dists = mlBasics.compute_euclidean_distances(tSet, segments[i])
        test_pred = mlBasics.predict_labels(dists, train_labels, k)
        if k == 1:
            acc.append(np.mean(test_pred == segment_labels[i]) * 100)
        else:
            # NOTE(review): for k > 1 predict_labels is presumably returning
            # the k neighbour labels per test sample; take the most frequent
            # label of each row. Confirm against hw1_knn.
            classifications = np.array([
                np.argmax(np.bincount(y.astype(np.int64))).astype(np.float64)
                for y in test_pred
            ])
            acc.append(np.mean(classifications == segment_labels[i]) * 100)
    return np.mean(acc)
def cross_validation(x_train, y_train, knn=1, K=5):
    """
    Run shuffled K-fold cross validation of the kNN classifier.

    :param x_train: samples, indexable by an array of row indices
    :param y_train: labels aligned with ``x_train``
    :param knn: number of neighbours used for prediction
    :param K: number of folds
    :return: list with the validation accuracy (in percent) of every fold
    """
    kf = KFold(n_splits=K, shuffle=True)
    accuracy_all_fold = []
    for fold_idx, (train, test) in enumerate(kf.split(x_train)):
        dists = mlBasics.compute_euclidean_distances(x_train[train],
                                                     x_train[test])
        y_test_pred = mlBasics.predict_labels(dists, y_train[train], k=knn)
        accuracy = np.mean(y_test_pred == y_train[test]) * 100
        accuracy_all_fold.append(accuracy)
        # "K=" reports the neighbour count and "Fold" the fold index.
        print('K= {0} Fold {1} Accuracy {2:.2f}'.format(
            knn, fold_idx, accuracy))
    return accuracy_all_fold
def cross_validation(X, Y, num_folds=5, k=1):
    """
    Cross-validate the kNN classifier on consecutive folds of the data.

    :param X: samples, one per row
    :param Y: labels aligned with ``X``
    :param num_folds: number of folds the data is split into
    :param k: number of neighbours
    :return: mean accuracy over all folds, as a fraction in [0, 1]
    """
    # array_split returns a list of arrays; keeping it as a list also works
    # when the number of samples is not divisible by num_folds.
    X_folds = np.array_split(X, num_folds)
    y_folds = np.array_split(Y, num_folds)

    accuracies = []
    for i in range(num_folds):
        # Every fold except fold i forms the training data.
        X_train_data = np.concatenate(X_folds[:i] + X_folds[i + 1:])
        Y_train_data = np.concatenate(y_folds[:i] + y_folds[i + 1:])
        dists = mlBasics.compute_euclidean_distances(X_train_data, X_folds[i])
        y_test_pred = mlBasics.predict_labels(dists, Y_train_data, k)
        accuracies.append(np.mean(y_test_pred == y_folds[i]))

    mean_accuracy = np.mean(accuracies)
    print('for k=%d, mean acc=%f ' % (k, mean_accuracy))
    return mean_accuracy
# Fill sample_idx in consecutive windows of 100 entries, one window per key
# of label_sample_idx (both are defined earlier in the script).
i, j = 0, 100
for label in label_sample_idx:
    # NOTE(review): np.random.choice samples with replacement by default, so
    # the same index may be drawn more than once - confirm this is intended.
    sample_idx[i:j] = np.random.choice(label_sample_idx[label], size=100)
    # Advance the window to the next 100 slots.
    i, j = j, j + 100
# Select the sampled training images and their labels.
x_train_sample, y_train_sample = X_train[sample_idx], y_train[sample_idx]
# Reshape images: flatten every sample into a single row.
x_train_sample = np.reshape(x_train_sample, (x_train_sample.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
print('Compute Distances')
# One distance matrix is shared by both values of k below.
dists = mlBasics.compute_euclidean_distances(x_train_sample, X_test)
print('For k=1 Neighbour')
y_test_pred = mlBasics.predict_labels(dists, y_train_sample, k=1)
print('For k=5 Neighbours')
y_test_pred_5 = mlBasics.predict_labels(dists, y_train_sample, k=5)
from sklearn.metrics import confusion_matrix
# Confusion matrices of true vs. predicted test labels for k=1 and k=5.
conf_1 = confusion_matrix(y_test, y_test_pred)
conf_5 = confusion_matrix(y_test, y_test_pred_5)
# Accuracy is reported in percent with two decimals.
print('{0:0.02f}'.format(np.mean(y_test_pred == y_test) * 100), 'of test examples classified correctly for k=1 Neighbour(s).')
print('Confusion Matrix for k=1 Neighbour(s)')
print(conf_1)
# NOTE(review): conf_5 is computed above but not printed in this section.
print('{0:0.02f}'.format(np.mean(y_test_pred_5 == y_test) * 100), 'of test examples classified correctly for k=5 Neighbour(s).')
@author: fame
"""
from load_mnist import load_mnist
import hw1_knn as mlBasics
import numpy as np

# Load data - two class (only the digits 0 and 1 are requested)
X_train, y_train = load_mnist('training', [0, 1])
X_test, y_test = load_mnist('testing', [0, 1])

# Load data - ALL CLASSES
#X_train, y_train = load_mnist('training' )
#X_test, y_test = load_mnist('testing' )

# Reshape the image data into rows: one flattened sample per row
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))

# Test on test data
#1) Compute distances between the training and the test samples:
dists = mlBasics.compute_euclidean_distances(X_train, X_test)

#2) Run the code below and predict labels (predict_labels is called without
#   an explicit k, so its own default is used):
y_test_pred = mlBasics.predict_labels(dists, y_train)

#3) Report results: percentage of test labels predicted correctly
# you should get following message '99.91 of test examples classified correctly.'
print('{0:0.02f}'.format(np.mean(y_test_pred == y_test) * 100), "of test examples classified correctly.")
def task1c(X_train, y_train):
    '''
    Select k for the kNN classifier with 5-fold cross validation.

    For each of the classes 0-9, 100 training samples are drawn at random
    without replacement and cut into 5 consecutive folds of 20 samples.
    Fold n of every class together forms validation set n; the remaining
    samples of every class form training set n. Every k in 1..15 is
    evaluated on all 5 splits and the mean accuracy per k is plotted.

    :param X_train: training samples, indexable by a boolean mask over y_train
    :param y_train: training labels
    :return: None. The result is passed to plot_acc_for_k.
    '''
    # K hyper parameters that need to be tested
    k_set = range(1, 16)
    # n-fold
    fold_num = 5
    # Number of examples sampled per class
    exm_num_per_class = 100
    # Floor division: the step is used as a slice bound and must be an int.
    stepsize = exm_num_per_class // fold_num

    # Per split: one list entry per class, merged into one array further down.
    folds_training = [[] for _ in range(fold_num)]
    label_for_folds_training = [[] for _ in range(fold_num)]
    folds_testing = [[] for _ in range(fold_num)]
    label_for_folds_testing = [[] for _ in range(fold_num)]

    # For each class (digits 0-9), sample 100 images randomly.
    for i in range(10):
        class_mask = y_train == i
        x_train_i = X_train[class_mask]
        y_train_i = y_train[class_mask]
        random_indexes = np.random.choice(len(y_train_i),
                                          size=exm_num_per_class,
                                          replace=False)
        x_train_i = x_train_i[random_indexes]
        y_train_i = y_train_i[random_indexes]

        for n in range(fold_num):
            # Use this fold to validate, and the rest of the class to train.
            held_out = slice(stepsize * n, stepsize * (n + 1))
            folds_testing[n].append(x_train_i[held_out])
            label_for_folds_testing[n].append(y_train_i[held_out])
            folds_training[n].append(np.delete(x_train_i, held_out, axis=0))
            label_for_folds_training[n].append(
                np.delete(y_train_i, held_out, axis=0))

    # result_records = {1: [acc1,...,acc5], 2: [acc1,...,acc5], ...} records
    # the accuracy of every split for each k.
    result_records = {k: [] for k in k_set}
    for n in range(fold_num):
        # Merge the per-class pieces into one sample matrix / label vector.
        folds_train = np.reshape(folds_training[n],
                                 (-1, np.shape(folds_training[n])[-1]))
        folds_test = np.reshape(folds_testing[n],
                                (-1, np.shape(folds_testing[n])[-1]))
        label_for_folds_train = np.reshape(label_for_folds_training[n], -1)
        label_for_folds_test = np.reshape(label_for_folds_testing[n], -1)

        # The distances are shared by every k evaluated on this split.
        dists = mlBasics.compute_euclidean_distances(folds_train, folds_test)
        for k in k_set:
            y_test_pred = mlBasics.predict_labels(dists,
                                                  label_for_folds_train,
                                                  k=k)
            acc = np.mean(y_test_pred == label_for_folds_test) * 100
            result_records[k].append(acc)

    # Iterate k_set explicitly so that means and k values are paired in a
    # well-defined order, independent of dict iteration order.
    ks = list(k_set)
    mean_acc_record = [np.mean(result_records[k]) for k in ks]
    plot_acc_for_k(mean_acc_record, ks)
# training_set and training_labels are built earlier in the script.
training_labels = np.array(training_labels)
p = np.random.permutation(
    len(training_set)
)  ## This scrambles the training set/labels, which would otherwise be in order
# The same permutation is applied to both so samples and labels stay aligned.
training_set = training_set[p]  ## this is a list of indices
training_labels = training_labels[p]  ## this is a list of generated labels
# Pick the selected images; X_train is indexed as a 3-D array here.
X_train = X_train[training_set, :, :]
y_train = np.array(training_labels)
# Flatten every image into a single row.
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
# One distance matrix is shared by both values of k below.
dists = mlBasics.compute_euclidean_distances(X_train, X_test)
y_test_pred_k1 = mlBasics.predict_labels(dists, y_train, k=1)
print '{0:0.02f}'.format(np.mean(y_test_pred_k1 == y_test) * 100), "of test examples classified correctly (k = 1)."
# NOTE(review): for k = 5 the result is treated as one row of neighbour
# labels per test sample - confirm against predict_labels.
y_test_pred_k5 = mlBasics.predict_labels(dists, y_train, k=5)
classifications = np.array(
    list(
        map(
            lambda y: np.argmax(np.bincount(y.astype(np.int64))).astype(
                np.float64), y_test_pred_k5)))
## For each row in y_test_pred, take the maximum value of a bin count to find the mode of the top k neighbors
print '{0:0.02f}'.format(np.mean(classifications == y_test) * 100), "of test examples classified correctly (k = 5)."
X_train, y_train = load_mnist('training')
X_test, y_test = load_mnist('testing')

# Build a 1000-sample training subset: 100 randomly drawn images per digit.
new_X = np.empty((1000, 28, 28), dtype='int')
new_y = np.empty(1000)
k, l = 0, 100
# All ten digits 0-9 are visited so that every row of new_X / new_y is
# filled; np.empty leaves rows that are never written uninitialised.
for i in range(10):
    temp, temp_lbl = load_mnist('training', [i])
    # NOTE(review): randint draws with replacement, so duplicates may occur.
    random = np.random.randint(temp.shape[0], size=100)
    random_X = temp[random]
    random_y = temp_lbl[random]
    new_X[k:l] = random_X[0:100]
    new_y[k:l] = random_y[0:100]
    # Advance the write window to the next 100 rows.
    k, l = l, l + 100

# Reshape images: flatten every sample into a single row.
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
x_train_sample = np.reshape(new_X, (new_X.shape[0], -1))
y_train_sample = new_y

# Distances and labels must come from the same training set: both use the
# sampled subset, so neighbour indices map onto y_train_sample.
dists = mlBasics.compute_euclidean_distances(x_train_sample, X_test)
y_test_pred = mlBasics.predict_labels(dists, y_train_sample, k=1)
y_test_pred_5 = mlBasics.predict_labels(dists, y_train_sample, k=5)
def test_all_data(X_train, y_train, X_test, y_test, k):
    """
    Return the kNN test accuracy in percent for the given k.

    The labels of ``X_test`` are predicted from the distances to ``X_train``
    and compared element-wise with ``y_test``.
    """
    predicted = mlBasics.predict_labels(
        mlBasics.compute_euclidean_distances(X_train, X_test), y_train, k)
    hit_rate = np.mean(predicted == y_test)
    return hit_rate * 100
# Script entry point: evaluates kNN with k=1 and k=5 on a per-class sampled
# training subset and builds the confusion matrices of both runs.
if __name__ == '__main__':
    ''' (a) Load data - ALL class '''
    X_train, y_train, X_test, y_test = load_all_data()
    ''' (b) Load 1000 training example, 100 from each class and visualize 1 and 5 nearest neighbour for first 10 test examples '''
    sample_size = 100  #samples per class
    X_1000, Y_1000 = extract_samples_per_class(X_train, y_train, sample_size)
    # k=1
    # NOTE(review): the distances are computed against all of X_test, not
    # only its first 10 examples as the description above suggests.
    dists = mlBasics.compute_euclidean_distances(X_1000, X_test)
    y_test_pred_1 = mlBasics.predict_labels(dists, Y_1000, k=1)
    print '################## part b #########################'
    print 'for k=1, {0:0.02f}'.format(
        np.mean(y_test_pred_1 == y_test) * 100), "of test examples classified correctly."
    # k=5 (reuses the distance matrix computed for k=1)
    y_test_pred_5 = mlBasics.predict_labels(dists, Y_1000, k=5)
    print 'for k=5, {0:0.02f}'.format(
        np.mean(y_test_pred_5 == y_test) * 100), "of test examples classified correctly."
    #Confusion Matrix of true vs. predicted test labels for both values of k
    C_1 = metrics.confusion_matrix(y_test, y_test_pred_1)
    C_5 = metrics.confusion_matrix(y_test, y_test_pred_5)