def validate_k(X, y, k_list):
    """Run 5-fold stratified cross-validation over the k values in k_list.

    Used to derive some intuition about what range of k to validate over.

    Args:
        X: feature array, indexable by integer index arrays.
        y: label array aligned with X.
        k_list: iterable of candidate k values.

    Returns:
        np.array of shape (5, len(k_list)) with the test accuracy for each
        (fold, k) pair.
    """
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    accuracy = np.zeros((5, len(k_list)))
    # enumerate() replaces the original's manually incremented fold counter.
    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        for j, k in enumerate(k_list):
            classifier = kNN()
            classifier.train(X_train, y_train)
            y_pred = classifier.predict(X_test, k)
            acc = (y_pred == y_test).mean()
            accuracy[fold, j] = acc
            print("Q1 -- TIME: {} Fold {}, k: {}, Accuracy: {}".format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                fold + 1, k, acc))
    # NOTE(review): the original also filled a local 2 x len(k_list)
    # `statistics` array (mean/std per k) that was never used or returned;
    # removed as dead code. Callers still receive the raw accuracy array.
    return accuracy
def q1_regularised(X, y, k_list):
    """For 20 different stratified test/train splits, calculate the
    classification accuracy on the test set for each k in k_list.

    Args:
        X: feature array.
        y: label array aligned with X.
        k_list: iterable of candidate k values.

    Returns:
        test_acc_ar: np.array of shape (20, len(k_list)) holding the test
        ACCURACY for every (run, k) pair. (The original docstring called
        these "test errors", which contradicted the computation.)
    """
    test_acc_ar = np.zeros((20, len(k_list)))
    for i in range(20):
        # random_state=i makes each of the 20 splits distinct yet reproducible.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=i, stratify=y)
        print("Run {}".format(i))
        for j, k in enumerate(k_list):
            classifier = kNN()
            classifier.train(X_train, y_train)
            y_pred = classifier.predict(X_test, k)
            test_acc = (y_pred == y_test).mean()
            test_acc_ar[i, j] = test_acc
            print("k Parameter: {}, Accuracy: {}".format(k, test_acc))
    return test_acc_ar
def result(xtrain, xtest, ytrain, ytest, k):
    """Fit a kNN classifier and report accuracy plus confusion matrix.

    Args:
        xtrain, ytrain: training features and labels.
        xtest, ytest: held-out features and labels.
        k: number of neighbours.

    Returns:
        The test-set accuracy as returned by accuracy_score.
    """
    # Converted legacy Python 2 print statements to the function form.
    print('Results for Knn with k =', k)
    # `distance` is a module-level metric name defined outside this block.
    clf = knn.kNN(k=k, distance_m=distance)
    clf.fit(xtrain, ytrain)
    prd = clf.predict(xtest)
    # Compute the score once instead of calling accuracy_score twice
    # (once for the print, once for the return) as the original did.
    acc = accuracy_score(ytest, prd)
    print("Accuracy:", acc)
    print('Confusion Matrix')
    print(confusion_matrix(ytest, prd))
    return acc
def main():
    """Do a test if called from the command line.

    Loads the CSV at DATAFILE, fits kNN(k=10) on the full data, then on an
    80/20 train/test split, printing the accuracy of each fit.
    """
    data = pd.read_csv(DATAFILE, header=None, names=HEADER)
    X = data[FEATURES]
    y = data.species
    model = kNN(k=10)
    model.fit(X, y)
    # Converted legacy Python 2 print statements to the function form.
    print("Accuracy on training set:", model.score(X, y))
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.80)
    model.fit(X_train, y_train)
    print("Accuracy on test set: ", model.score(X_test, y_test))
def main():
    """Entry point: parse the patient CSV, stratify it into ten folds, and
    run 10-fold cross-validation for both kNN and naive Bayes classifiers."""
    # List of patient objects
    patient_list = parse_csv()
    # create the ten folds
    ten_folds_strat_list = stratify_data(patient_list)
    # create the classifer objects
    knn = kNN()
    naive_bayes = naiveBayes()
    # call the 10-fold cross validation
    # NOTE(review): the first call passes an explicit third argument (10)
    # while the second relies on the callee's default — confirm the default
    # matches, otherwise the two classifiers are validated differently.
    ten_fold_strat_cross_validation(knn, ten_folds_strat_list, 10)
    ten_fold_strat_cross_validation(naive_bayes, ten_folds_strat_list)
def runTests(trainning, tr_classes, test, t_classes, k):
    """Classify each row of `test` with kNN and print an accuracy summary.

    Args:
        trainning: training feature rows (name kept as-is for callers).
        tr_classes: labels aligned with `trainning`.
        test: test feature rows.
        t_classes: labels aligned with `test`.
        k: number of neighbours.

    Returns:
        Test accuracy as a float in [0, 1].
    """
    errors = 0
    for i in range(len(test)):
        c = knn.kNN(k, test[i], trainning, tr_classes)
        if c != t_classes[i]:
            errors += 1
    # BUG FIX: the original returned `1 - errors / len(test)`, which is
    # integer division under Python 2 (always 0 or 1), while the printed
    # value correctly used float(). Both now share the same computation.
    accuracy = 1 - float(errors) / len(test)
    print("######## K = " + str(k) + " ########")
    print("Dataset size: " + str(len(trainning) + len(test)))
    print("Trainning set size: " + str(len(trainning)))
    print("Test set size: " + str(len(test)))
    print("Errors: " + str(errors))
    print("Accuracy: " + str(accuracy))
    print("\n")
    return accuracy
def nca_mnist_experiment(trial, train_percentage=0.1, test_percentage=0.1):
    """Compare kNN accuracy on raw autoencoder MNIST encodings vs. the same
    encodings transformed by a learned NCA metric.

    Args:
        trial: number of kNN evaluations to average for each k.
        train_percentage: fraction of the training encodings to sample.
        test_percentage: fraction of the test encodings to sample.
    """
    encoding_train_imgs_path = './data/MNIST_encoding/tf_train.encoding'
    encoding_test_imgs_path = './data/MNIST_encoding/tf_test.encoding'
    train_labels_path = './data/MNIST_encoding/tf_train.labels'
    test_labels_path = './data/MNIST_encoding/tf_test.labels'

    # FIX: the original used bare open() calls inside pickle.load, leaking
    # the file handles; `with` guarantees closure.
    # SECURITY NOTE: pickle.load assumes these files are trusted.
    with open(encoding_train_imgs_path, 'rb') as f:
        encoding_train = pickle.load(f)
    with open(encoding_test_imgs_path, 'rb') as f:
        encoding_test = pickle.load(f)
    print(encoding_train.shape)
    with open(train_labels_path, 'rb') as f:
        train_labels = pickle.load(f)
    with open(test_labels_path, 'rb') as f:
        test_labels = pickle.load(f)
    print(train_labels.shape)

    # Sample a random subset of the training encodings.
    m = len(encoding_train)
    train_m = int(m * train_percentage)
    sel = random.sample(range(m), train_m)
    # BUG FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin float is the documented replacement.
    X = encoding_train.astype(float)[sel]
    y = train_labels[sel]
    print(X.shape)
    print(y.shape)

    # Sample a random subset of the test encodings.
    m = len(encoding_test)
    test_m = int(m * test_percentage)
    sel = random.sample(range(m), test_m)
    X_test = encoding_test.astype(float)[sel]
    y_test = test_labels[sel]
    print(X_test.shape)
    print(y_test.shape)

    knn = kNN()
    k_values = [1, 3, 5, 7]  # typo `k_valus` fixed (local name only)

    # Baseline: mean kNN accuracy on the raw encodings.
    for k in k_values:
        knn.k = k
        acc_list = [knn.evaluate(X, y, X_test, y_test) for _ in range(trial)]
        print(np.mean(np.array(acc_list)))

    # Learn an NCA transform on the training subset and re-evaluate.
    nca = NCA(max_iter=100, learning_rate=0.01)
    nca.fit(X, y)
    x_train = nca.transform()
    x_test = nca.transform(X_test)
    for k in k_values:
        knn.k = k
        acc_list = [knn.evaluate(x_train, y, x_test, y_test)
                    for _ in range(trial)]
        print(np.mean(np.array(acc_list)))
import datetime
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from load_data import *
from knn import kNN
from plot import plot

if __name__ == "__main__":
    # Load the toy movie dataset and plot it; columns are fight-scene and
    # kiss-scene counts, label is the movie genre.
    train_set = load_data(10)
    plot(train_set)
    new_data = ['ZL', 169, 2]
    train_data = np.array(train_set[['打斗镜头', '接吻镜头']])
    train_labels = np.array(train_set[['电影类别']])

    # =========================== hand-rolled kNN ======================
    time_s = datetime.datetime.now()
    label = kNN(new_data[1:], train_data, train_labels, k=3)
    time_e = datetime.datetime.now() - time_s
    print('用时:', time_e)
    print('新数据的类别:', label)

    # =========================== sklearn kNN ======================
    # BUG FIX: the original reused the first time_s, so this measurement
    # also included the hand-rolled run; restart the clock here.
    time_s = datetime.datetime.now()
    clf = KNeighborsClassifier(n_neighbors=3)
    clf.fit(train_data, train_labels)
    label = clf.predict([new_data[1:]])  # predict expects 2-D input
    time_e = datetime.datetime.now() - time_s
    print('用时:', time_e)
    print('新数据的类别:', label[0])
# -*- coding: utf-8 -*- import f_test as ftest import knn as knn import centroid as cc import lr as lr import svm as svm if __name__ == '__main__': file_name = 'GenomeTrainXY.txt' raw_data = ftest.get_data(file_name) features, scores = ftest.f_test(raw_data) ftest.print_scores(features, scores) train = knn.pickTrainingData(file_name, features) test = knn.pickTestData("GenomeTestX.txt", features) print("\n\nPredictions for KNN (k=3) Classifier: ") knn.kNN(3, train, test) print("\nPredictions for Centroid Classifier: ") cc.centroid_classifier(train, test) print("\nPredictions for Linear Regression: ") lr.linear_regression(train, test) print("\nPredictions for SVM: ") svm.svm_classifier(train, test)
def main_worker(gpu, ngpus_per_node, args):
    """Per-process worker: build a MoCo model, wrap it in
    DistributedDataParallel, then train it and run a kNN evaluation on the
    learned features after every epoch.

    Args:
        gpu: index of the GPU assigned to this process (or None).
        ngpus_per_node: number of GPUs on this node.
        args: parsed command-line namespace (mutated in place: gpu, rank,
            batch_size, workers, start_epoch).
    """
    args.gpu = gpu
    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:
        def print_pass(*args):
            pass
        builtins.print = print_pass
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    print("=> creating model '{}'".format(args.arch))
    model = moco.builder.MoCo(
        # models.__dict__[args.arch],
        netalexnet.alexnet,  # NOTE(review): architecture is hard-wired here;
                             # args.arch is only used for logging/checkpoints
        args.moco_dim, args.moco_k, args.moco_m, args.moco_t, args.mlp)
    print(model)
    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        # comment out the following line for debugging
        # NOTE(review): single-GPU mode deliberately raises after moving the
        # model to the device — only DDP is supported by the MoCo queue code.
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    cudnn.benchmark = True
    # Data loading code
    # NOTE(review): traindir/testdir/normalize are computed but never used —
    # the dataset paths below are hard-coded and the transforms carry their
    # own Normalize values. Confirm the hard-coded paths are intentional.
    traindir = os.path.join(args.data, 'train')
    testdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    transform_train = transforms.Compose([
        transforms.Resize(size=256),
        transforms.RandomResizedCrop(size=224, scale=(0.2, 1.)),
        transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
        transforms.RandomGrayscale(p=0.2),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])
    transform_test = transforms.Compose([
        transforms.Resize(size=256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])
    # two_crop=True yields the two augmented views MoCo contrasts.
    train_dataset = ImageFolderInstance(
        '/data2/zyf/ImageNet/ILSVRC2012-100/train',
        transform=transform_train, two_crop=True)
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None
    # train_loader = torch.utils.data.DataLoader(
    #     train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
    #     num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True)
    # NOTE(review): this loader ignores train_sampler and always shuffles, so
    # in distributed mode every process iterates the FULL dataset — confirm
    # this is intended (the commented-out loader above used the sampler).
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               drop_last=True)
    test_dataset = ImageFolderInstance(
        '/data2/zyf/ImageNet/ILSVRC2012-100/val', transform=transform_test)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=100,
                                              shuffle=False,
                                              num_workers=args.workers,
                                              drop_last=True)
    ndata = train_dataset.__len__()
    print(ndata)
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)
        # print('*******************')
        # acc = kNN(0, model, train_loader, test_loader, 200, 0.1, ndata, low_dim=128)
        # print('+++++++++++++++++')
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)
        print('----------Evaluation---------')
        start = time.time()
        # kNN evaluation on the learned features; result logged to TensorBoard.
        acc = kNN(0, model, train_loader, test_loader, 200, 0.1, ndata,
                  low_dim=128)
        print("Evaluation Time: '{}'s".format(time.time() - start))
        writer.add_scalar('nn_acc', acc, epoch)
        # Only the (per-node) master rank writes checkpoints.
        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                },
                is_best=False,
                filename='checkpoint_{:04d}.pth.tar'.format(epoch))
    writer.close()
# -*- coding: utf-8 -*- import numpy as np import cv2 as cv import Dataset import knn path = "ordo_2.csv" DS = Dataset.Dataset(path) df = DS.getDF() print(df.head()) X, Y = DS.getXY() print(X) k = 7 #gnb = GNB.GNB(X, Y) knn = knn.kNN(k, X, Y) accuracy = knn.getAccuracy() print(accuracy)
    plt.ylabel("m")
    plt.title("{} Sample Complexity".format(title))
    plt.show()


if __name__ == "__main__":
    # run search for all algorithms
    # NOTE(review): A holds constructor expressions evaluated with eval();
    # safe for these fixed literals, but a list of classes would be clearer.
    A = [
        "Perceptron()",
        "Winnow()",
        "LinearRegression()",
    ]
    # NOTE(review): Atitle has four entries but A has three — the fourth
    # ("One nearest-neighbours") is never used; the kNN plot below
    # hard-codes the title 'OneNN'.
    Atitle = [
        "Perceptron",
        "Winnow",
        "Least Squares",
        "One nearest-neighbours"
    ]
    for j, i in enumerate(A):
        alg = eval(i)
        # Winnow expects {0,1} labels; the others use {-1,+1}.
        neg = -1
        if i == "Winnow()":
            neg = 0
        mean, std = find_trend_m(alg, neg=neg)
        plot_trend(mean, std, Atitle[j])
    # NOTE(review): this relies on `neg` leaking out of the loop above
    # (-1 after the final LinearRegression iteration) — fragile; confirm
    # kNN indeed wants neg=-1 and consider setting it explicitly.
    alg = kNN()
    mean, std = find_trend_m(alg, neg=neg, num_runs=10, test_size=6000,
                             max_n=18)
    plot_trend(mean, std, 'OneNN')
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# Three-colour map kept for the (currently disabled) scatter plot below.
cmap = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# Load the iris data and split off a 20% hold-out set.
iris = datasets.load_iris()
X, y = iris['data'], iris['target']
# print(iris['target_names']) ['setosa' 'versicolor' 'virginica']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234)

from knn import kNN

# Fit the custom 5-nearest-neighbour classifier and report accuracy.
clf = kNN(k=5)
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)
acc = np.mean(prediction == y_test)
print(acc)

# plt.figure()
# plt.scatter(X[:, 2], X[:, 3], c=y, cmap=cmap, edgecolor='k', s=20)
# plt.show()
def q2_regularised(X, y, n, k_list):
    """For n different 5-fold stratified cross-validation runs, calculate the
    test-set classification accuracy for every k in k_list, then re-evaluate
    each run's best k on a fresh 80/20 split.

    Args:
        X: feature array.
        y: label array aligned with X.
        n: number of repeated cross-validation runs.
        k_list: iterable of candidate k values.

    Returns:
        statistics: (2, len(k_list)) array — row 0 the mean and row 1 the
            std of the accuracy for each k, pooled over all runs and folds.
            (The original docstring called this "n by len(k_list)" of
            "errors"; it is 2 x len(k_list) and holds accuracies.)
        k_max_index: per-run index into k_list of the most performant k.
    """
    accuracy = np.zeros((n, len(k_list), 5))
    for i in range(n):
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
        # `fold` replaces the ambiguously named counter `l`; enumerate()
        # replaces the manual increment.
        for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            for j, k in enumerate(k_list):
                classifier = kNN()
                classifier.train(X_train, y_train)
                y_pred = classifier.predict(X_test, k)
                acc = (y_pred == y_test).mean()
                accuracy[i, j, fold] = acc
                print("Q2 -- Run {}, Fold {}, k: {}/{}, Accuracy: {}".format(
                    i + 1, fold + 1, j + 1, len(k_list), acc))
    statistics = np.zeros((2, len(k_list)))
    statistics[0, :] = np.mean(accuracy, axis=(0, 2))
    statistics[1, :] = np.std(accuracy, axis=(0, 2))
    # Per-run mean accuracy over folds; argmax picks the best k per run.
    # (The original computed this twice — `k_stats` then `k_mean` — plus an
    # unused `k_std`; the duplicates and dead code are removed.)
    k_mean = np.mean(accuracy, axis=2)
    k_max_index = np.argmax(k_mean, axis=1)
    print(k_max_index)
    optimal_params = [k_list[idx] for idx in k_max_index]
    print(optimal_params)
    # Rerun classification with each run's best k on a fresh hold-out split.
    # NOTE(review): despite the names and printed label, `error` here is an
    # ACCURACY ((y_pred == y_test).mean()); names and output strings are
    # kept for compatibility.
    optimal_errors = []
    for i, k in enumerate(optimal_params):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=i, stratify=y)
        classifier = kNN()
        classifier.train(X_train, y_train)
        y_pred = classifier.predict(X_test, k)
        error = (y_pred == y_test).mean()
        print("K : {}, Error : {}".format(k, error))
        optimal_errors.append(error)
    optimal_mean = np.mean(optimal_errors)
    optimal_std = np.std(optimal_errors)
    print(optimal_errors)
    print(k_max_index)
    print("Optimal Mean Error: {} Optimal Std Error: {}".format(
        optimal_mean, optimal_std))
    return statistics, k_max_index
# projected training matrix
productMatrix = knnTrainingMatrix.dot(projMatrix)

# get test matrix
# FIX: the original read via a bare open() (handle never closed) and used
# np.fromstring(sep=' '), which is deprecated; parse each line explicitly.
with open('data/knntest.txt', 'r') as knnTestFile:
    knnTestData = knnTestFile.readlines()
knnTestMatrix = []
for line in knnTestData:
    knnTestMatrix.append(np.array(line.split(), dtype=int))
knnTestMatrix = np.array(knnTestMatrix)

# normal kNN test error
# Each row is [features..., label]; count misclassifications over the test set.
numErrors = 0
for row in knnTestMatrix:
    vector = row[:-1]
    predictedLabel = kNN(vector, knnTrainingMatrix, labels, 15)
    if predictedLabel != row[-1]:
        numErrors = numErrors + 1
print(float(numErrors) / len(knnTestMatrix))

# projected kNN test error
# Same loop, but both the query vector and the training set are projected.
numErrors = 0
for row in knnTestMatrix:
    vector = row[:-1]
    projVector = vector.dot(projMatrix)
    predictedLabel = kNN(projVector, productMatrix, labels, 15)
    if predictedLabel != row[-1]:
        numErrors = numErrors + 1
print(float(numErrors) / len(knnTestMatrix))

# ID3 Example Use
def modelKNN(self, instanceFeature, k):
    """Run kNN over this model's stored data for a single instance.

    Args:
        instanceFeature: feature vector of the instance to classify.
        k: number of nearest neighbours to poll.

    Returns:
        A dictionary mapping each class to its vote proportion among the
        k nearest instances, as produced by knn.kNN.
    """
    votes = knn.kNN(self.data, instanceFeature, k)
    return votes