def metric_lmnn_modular(train_fname=traindat, test_fname=testdat, label_train_fname=label_traindat, k=3):
    """Learn an LMNN metric from CSV data, then classify the test set with KNN.

    Returns (trained LMNN machine, predicted test labels), or None when
    Shogun's Python bindings are unavailable.
    """
    try:
        from modshogun import RealFeatures, MulticlassLabels, LMNN, KNN, CSVFile
    except ImportError:
        # Shogun bindings missing: skip the example quietly.
        return

    # Wrap the CSV inputs in Shogun feature/label containers.
    training_feats = RealFeatures(CSVFile(train_fname))
    testing_feats = RealFeatures(CSVFile(test_fname))
    training_labels = MulticlassLabels(CSVFile(label_train_fname))

    # Learn the Mahalanobis-style metric with LMNN.
    metric_learner = LMNN(training_feats, training_labels, k)
    metric_learner.train()
    learned_distance = metric_learner.get_distance()

    # Classify the test set with KNN under the learned distance.
    classifier = KNN(k, learned_distance, training_labels)
    classifier.train()
    predictions = classifier.apply(testing_feats).get_labels()
    return metric_learner, predictions
def RunAllKnnShogun(q):
    """Run Shogun's All-KNN benchmark and put the elapsed time (or -1 on
    failure) onto the result queue ``q``.

    NOTE(review): this function references ``self`` and ``options`` from an
    enclosing scope — presumably it is nested inside a benchmark method;
    those free names are left untouched.
    """
    totalTimer = Timer()

    # Load input dataset.
    # If the dataset contains two files then the second file is the query
    # file.
    try:
        Log.Info("Loading dataset", self.verbose)
        if len(self.dataset) == 2:
            referenceData = np.genfromtxt(self.dataset[0], delimiter=',')
            queryData = np.genfromtxt(self.dataset[1], delimiter=',')
            # BUG FIX: the original wrapped the not-yet-defined name
            # ``queryFeat`` (RealFeatures(queryFeat.T)), raising NameError
            # whenever a query file was supplied; wrap ``queryData``.
            queryFeat = RealFeatures(queryData.T)
        else:
            referenceData = np.genfromtxt(self.dataset, delimiter=',')

        # Labels are the last column of the dataset.
        labels = MulticlassLabels(
            referenceData[:, (referenceData.shape[1] - 1)])
        referenceData = referenceData[:, :-1]

        with totalTimer:
            # Parse the required -k option (number of neighbors).
            k = re.search("-k (\d+)", options)
            if not k:
                Log.Fatal(
                    "Required option: Number of furthest neighbors to find."
                )
                q.put(-1)
                return -1
            else:
                k = int(k.group(1))
                if (k < 1 or k > referenceData.shape[0]):
                    Log.Fatal("Invalid k: " + k.group(1) +
                              "; must be greater than 0" +
                              " and less or equal than " +
                              str(referenceData.shape[0]))
                    q.put(-1)
                    return -1

            referenceFeat = RealFeatures(referenceData.T)
            distance = EuclideanDistance(referenceFeat, referenceFeat)

            # Perform All K-Nearest-Neighbors.
            model = SKNN(k, distance, labels)
            model.train()

            if len(self.dataset) == 2:
                out = model.apply(queryFeat).get_labels()
            else:
                out = model.apply(referenceFeat).get_labels()
    except Exception as e:
        # Benchmark harness convention: -1 signals failure.
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def load_train(self):
    """Parse the data files, then fit the KNN classifier on them."""
    images, targets = self.load(self.test_images, self.test_labels)
    self.test_images = images
    self.test_labels = targets

    # Shogun stores samples column-wise, hence the transpose.
    label_container = MulticlassLabels(self.test_labels)
    feature_container = RealFeatures(self.test_images.T)
    metric = EuclideanDistance()

    self.knn = KNN(self.k, metric, label_container)
    self.knn.train(feature_container)
def knn(train_features, train_labels, test_features, test_labels, k=1): from modshogun import KNN, MulticlassAccuracy, EuclideanDistance distance = EuclideanDistance(train_features, train_features) knn = KNN(k, distance, train_labels) knn.train() train_output = knn.apply() test_output = knn.apply(test_features) evaluator = MulticlassAccuracy() print 'KNN training error is %.4f' % ((1-evaluator.evaluate(train_output, train_labels))*100) print 'KNN test error is %.4f' % ((1-evaluator.evaluate(test_output, test_labels))*100)
def RunAllKnnShogun(q):
    """Time Shogun's All-KNN on the configured dataset; push the elapsed
    time (or -1 on any failure) onto queue ``q``.

    NOTE(review): ``self`` and ``options`` are free names from an enclosing
    scope (this looks like a nested helper inside a method); kept as-is.
    """
    totalTimer = Timer()

    # Load input dataset.
    # If the dataset contains two files then the second file is the query
    # file.
    try:
        Log.Info("Loading dataset", self.verbose)
        if len(self.dataset) == 2:
            referenceData = np.genfromtxt(self.dataset[0], delimiter=',')
            queryData = np.genfromtxt(self.dataset[1], delimiter=',')
            # BUG FIX: original code read the undefined ``queryFeat``
            # (RealFeatures(queryFeat.T)) -> NameError; use ``queryData``.
            queryFeat = RealFeatures(queryData.T)
        else:
            referenceData = np.genfromtxt(self.dataset, delimiter=',')

        # Labels are the last column of the dataset.
        labels = MulticlassLabels(referenceData[:, (referenceData.shape[1] - 1)])
        referenceData = referenceData[:, :-1]

        with totalTimer:
            # Get all the parameters.
            k = re.search("-k (\d+)", options)
            if not k:
                Log.Fatal("Required option: Number of furthest neighbors to find.")
                q.put(-1)
                return -1
            else:
                k = int(k.group(1))
                if (k < 1 or k > referenceData.shape[0]):
                    Log.Fatal("Invalid k: " + k.group(1) +
                              "; must be greater than 0" +
                              " and less or equal than " +
                              str(referenceData.shape[0]))
                    q.put(-1)
                    return -1

            referenceFeat = RealFeatures(referenceData.T)
            distance = EuclideanDistance(referenceFeat, referenceFeat)

            # Perform All K-Nearest-Neighbors.
            model = SKNN(k, distance, labels)
            model.train()

            if len(self.dataset) == 2:
                out = model.apply(queryFeat).get_labels()
            else:
                out = model.apply(referenceFeat).get_labels()
    except Exception as e:
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
class Number_recognition:
    """Digit recognizer: parses IDX-format image/label files and classifies
    with a Shogun KNN under Euclidean distance.

    NOTE(review): Python 2 code (``print`` statements, ``xrange``).
    """

    def __init__(self, test_images, test_labels, k):
        # File paths until load_train() replaces them with parsed arrays.
        self.test_images = test_images;
        self.test_labels = test_labels;
        # Number of neighbors used by the KNN classifier.
        self.k = k;

    def load(self, path_img, path_lbl):
        """Parse one IDX image file and one IDX label file.

        Returns (images, labels): images is (size, rows*cols) with one
        flattened image per row; labels is a 1-D float array.
        Raises ValueError when either file's magic number is wrong.
        """
        with open(path_lbl, 'rb') as file:
            # Label files start with big-endian magic 2049 and item count.
            magic, size = struct.unpack(">II", file.read(8))
            if magic != 2049:
                raise ValueError('Magic number mismatch, expected 2049,'
                                 'got %d' % magic)
            labels = array("B", file.read())
            labels_result = np.zeros(shape=(size))
            for i in xrange(size):
                labels_result[i] = labels[i]
        with open(path_img, 'rb') as file:
            # Image files start with magic 2051, count, rows, cols.
            magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
            print "rows: " + str(rows) + " cols: " + str(cols)
            if magic != 2051:
                raise ValueError('Magic number mismatch, expected 2051,'
                                 'got %d' % magic)
            image_data = array("B", file.read())
            images = np.zeros(shape=(size, rows*cols))
            for i in xrange(size):
                # Copy each rows*cols image as one flattened row.
                images[i][:] = image_data[i*rows*cols : (i+1)*rows*cols]
        return images, labels_result

    def load_train(self):
        """Load the configured files and train the KNN classifier."""
        ims, labels = self.load(self.test_images, self.test_labels)
        self.test_images = ims
        self.test_labels = labels
        labels_numbers = MulticlassLabels(self.test_labels)
        # Shogun expects features column-wise, hence the transpose.
        feats = RealFeatures(self.test_images.T)
        dist = EuclideanDistance()
        self.knn = KNN(self.k, dist, labels_numbers)
        self.knn.train(feats)

    def predict(self, image):
        """Return predicted digit labels for ``image`` (samples as rows)."""
        feats_test = RealFeatures(image.T)
        pred = self.knn.apply_multiclass(feats_test)
        return pred[:]
def BuildModel(self, data, labels, options):
    """Build and train a KD-tree-backed KNN classifier.

    The ``-n`` option sets the neighbor count (default 5).
    """
    # Get all the parameters.
    match = re.search("-n (\d+)", options)
    self.n_neighbors = int(match.group(1)) if match else 5

    distance = EuclideanDistance(data, data)

    from modshogun import KNN_KDTREE
    model = KNN(self.n_neighbors, distance, labels, KNN_KDTREE)
    model.set_leaf_size(30)
    model.train()
    return model
def assign_labels(data, centroids, ncenters):
    """Label every point in ``data`` with the index of its nearest centroid.

    Implemented as 1-NN classification against the centroid set.
    """
    from modshogun import EuclideanDistance
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import KNN
    from numpy import arange

    # One class id per centroid: 0.0, 1.0, ..., ncenters-1.
    centroid_ids = MulticlassLabels(arange(0., ncenters))
    point_feats = RealFeatures(data)
    centroid_feats = RealFeatures(centroids)
    metric = EuclideanDistance(centroid_feats, centroid_feats)

    nearest = KNN(1, metric, centroid_ids)
    nearest.train()
    return nearest.apply(point_feats)
def assign_labels(data, centroids, ncenters):
    """Assign each sample in ``data`` the id of its closest centroid (1-NN)."""
    from modshogun import EuclideanDistance
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import KNN
    from numpy import arange

    # Centroid i gets class label i.
    cluster_labels = MulticlassLabels(arange(0., ncenters))
    samples = RealFeatures(data)
    centers = RealFeatures(centroids)

    classifier = KNN(1, EuclideanDistance(centers, centers), cluster_labels)
    classifier.train()
    return classifier.apply(samples)
def knn_classify(traindat, testdat, k=3):
    """Return the KNN misclassification rate (1 - accuracy) on ``testdat``."""
    from modshogun import KNN, MulticlassAccuracy, EuclideanDistance

    features, targets = traindat.features, traindat.labels
    classifier = KNN(k, EuclideanDistance(features, features), targets)
    classifier.train()

    predictions = classifier.apply(testdat.features)
    accuracy = MulticlassAccuracy().evaluate(predictions, testdat.labels)
    return 1 - accuracy
def knn_classify(traindat, testdat, k=3):
    """KNN test error for ``testdat`` using a model fit on ``traindat``."""
    from modshogun import KNN, MulticlassAccuracy, EuclideanDistance

    train_feats = traindat.features
    train_targets = traindat.labels
    metric = EuclideanDistance(train_feats, train_feats)

    model = KNN(k, metric, train_targets)
    model.train()

    guessed = model.apply(testdat.features)
    scorer = MulticlassAccuracy()
    return 1 - scorer.evaluate(guessed, testdat.labels)
def BuildModel(self, data, labels, options):
    """Build and train a KD-tree KNN classifier.

    Requires option 'k' (neighbor count); any leftover options are an
    error.  Raises Exception on missing/unknown parameters.
    """
    # Get all the parameters.
    if "k" in options:
        # BUG FIX: the parsed value was stored in a *local* n_neighbors
        # while KNN() below read the (possibly unset) attribute
        # self.n_neighbors -> AttributeError or a stale value.  Store it
        # on self so both agree.
        self.n_neighbors = int(options.pop("k"))
    else:
        Log.Fatal("Required parameter 'k' not specified!")
        raise Exception("missing parameter")
    if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

    distance = EuclideanDistance(data, data)
    knc = KNN(self.n_neighbors, distance, labels, KNN_KDTREE)
    knc.train()
    return knc
def lmnn_diagonal(train_features, train_labels, test_features, test_labels, k=1): from modshogun import LMNN, KNN, MSG_DEBUG, MulticlassAccuracy import numpy lmnn = LMNN(train_features, train_labels, k) lmnn.set_diagonal(True) lmnn.train() distance = lmnn.get_distance() knn = KNN(k, distance, train_labels) knn.train() train_output = knn.apply() test_output = knn.apply(test_features) evaluator = MulticlassAccuracy() print 'LMNN-diagonal training error is %.4f' % ((1-evaluator.evaluate(train_output, train_labels))*100) print 'LMNN-diagonal test error is %.4f' % ((1-evaluator.evaluate(test_output, test_labels))*100)
def classifier_knn_modular(train_fname=traindat, test_fname=testdat, label_train_fname=label_traindat, k=3):
    """KNN example on CSV data.

    Returns (classifier, train() result, test predictions, per-k results
    from classify_for_multiple_k()).
    """
    from modshogun import RealFeatures, MulticlassLabels, KNN, EuclideanDistance, CSVFile

    training_feats = RealFeatures(CSVFile(train_fname))
    testing_feats = RealFeatures(CSVFile(test_fname))
    targets = MulticlassLabels(CSVFile(label_train_fname))
    metric = EuclideanDistance(training_feats, training_feats)

    classifier = KNN(k, metric, targets)
    train_result = classifier.train()
    predictions = classifier.apply(testing_feats).get_labels()
    # Also classify for every neighbor count 1..k in one pass.
    per_k_output = classifier.classify_for_multiple_k()
    return classifier, train_result, predictions, per_k_output
def lmnn(train_features, train_labels, test_features, test_labels, k=1): from modshogun import LMNN, KNN, MSG_DEBUG, MulticlassAccuracy import numpy # dummy = LMNN() # dummy.io.set_loglevel(MSG_DEBUG) lmnn = LMNN(train_features, train_labels, k) lmnn.train() distance = lmnn.get_distance() knn = KNN(k, distance, train_labels) knn.train() train_output = knn.apply() test_output = knn.apply(test_features) evaluator = MulticlassAccuracy() print 'LMNN training error is %.4f' % ((1-evaluator.evaluate(train_output, train_labels))*100) print 'LMNN test error is %.4f' % ((1-evaluator.evaluate(test_output, test_labels))*100)
def lmnn_classify(traindat, testdat, k=3):
    """Train LMNN (capped at 1200 iterations, debug logging on) and return
    the resulting KNN test error."""
    from modshogun import LMNN, KNN, MulticlassAccuracy, MSG_DEBUG

    metric_learner = LMNN(traindat.features, traindat.labels, k)
    metric_learner.set_maxiter(1200)
    metric_learner.io.set_loglevel(MSG_DEBUG)
    metric_learner.train()

    model = KNN(k, metric_learner.get_distance(), traindat.labels)
    model.train()

    predictions = model.apply(testdat.features)
    return 1 - MulticlassAccuracy().evaluate(predictions, testdat.labels)
def metric_lmnn_modular(train_fname=traindat, test_fname=testdat, label_train_fname=label_traindat, k=3):
    """LMNN metric-learning example.

    Returns (trained LMNN machine, KNN predictions on the test file), or
    None when Shogun is not installed.
    """
    try:
        from modshogun import RealFeatures, MulticlassLabels, LMNN, KNN, CSVFile
    except ImportError:
        return

    # wrap features and labels into Shogun objects
    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))
    labels = MulticlassLabels(CSVFile(label_train_fname))

    # Fit LMNN on the training data.
    lmnn = LMNN(feats_train, labels, k)
    lmnn.train()

    # Classify with KNN under the learned distance.
    knn = KNN(k, lmnn.get_distance(), labels)
    knn.train()
    output = knn.apply(feats_test).get_labels()
    return lmnn, output
def lmnn_classify(traindat, testdat, k=3):
    """Return the KNN test error after learning an LMNN metric (max 1200
    iterations, debug log level)."""
    from modshogun import LMNN, KNN, MulticlassAccuracy, MSG_DEBUG

    feats = traindat.features
    targets = traindat.labels

    learner = LMNN(feats, targets, k)
    learner.set_maxiter(1200)
    learner.io.set_loglevel(MSG_DEBUG)
    learner.train()
    learned_dist = learner.get_distance()

    classifier = KNN(k, learned_dist, targets)
    classifier.train()

    guessed = classifier.apply(testdat.features)
    scorer = MulticlassAccuracy()
    accuracy = scorer.evaluate(guessed, testdat.labels)
    return 1 - accuracy
def KNNAccuracy(distance, data, k, flag):
    """Return KNN accuracy (%) of data[0]/data[1] after projecting the
    points through the linear transform ``distance``.

    flag=True uses inverse-square distance-weighted voting; flag=False
    uses majority voting with a tiny rank-based tie-breaker.
    """
    # Project the raw points through the learned transform.
    transformedData = np.dot(data[0], distance.T)
    feat = RealFeatures(transformedData.T)
    labels = MulticlassLabels(data[1].astype(np.float64))
    dist = EuclideanDistance(feat, feat)
    # k+1 neighbors because each point's nearest neighbor is itself;
    # that self-match row is deleted just below.
    knn = KNN(k + 1, dist, labels)
    knn.train(feat)
    # Get nearest neighbors.
    nn = knn.nearest_neighbors()
    nn = np.delete(nn, 0, 0)
    # Compute unique labels.
    uniqueLabels = np.unique(labels)
    # Keep count correct predictions.
    count = 0
    # Normalize labels in place to indices 0..len(uniqueLabels)-1 so they
    # can be used to index mapLabels below.
    for i in range(data[0].shape[0]):
        for j in range(len(uniqueLabels)):
            if (labels[i] == uniqueLabels[j]):
                labels[i] = j
                break
    for i in range(nn.shape[1]):
        # Vote accumulator, one slot per class.
        mapLabels = [0 for x in range(len(uniqueLabels))]
        for j in range(nn.shape[0]):
            if (flag):
                # Distance in the ORIGINAL (untransformed) space.
                distPoints = np.linalg.norm(data[0][nn[j][i], :] - data[0][i, :])
                # Add constant factor of 1 incase two points overlap
                mapLabels[int(labels[nn[j, i]])] += 1 / (distPoints + 1)**2
            else:
                # Subtract a variable factor to avoid draw condition without
                # affecting actual result.
                mapLabels[int(labels[nn[j, i]])] += 1 - j * 1e-8
        maxInd = np.argmax(mapLabels)
        if (maxInd == labels[i]):
            count += 1
    # NOTE(review): under Python 2 integer division count / nn.shape[1]
    # would truncate to 0; presumably this runs under Python 3 (true
    # division) — confirm.
    accuracy = (count / nn.shape[1]) * 100
    return accuracy
def run_knn(Xtrain, Ytrain, Xtest, Ytest):
    """Train KNN on (Xtrain, Ytrain), optionally under a serialized LMNN
    metric, classify Xtest, print accuracy, and return the confusion matrix.

    NOTE(review): ``K`` and ``NCLASSES`` are presumably module-level
    constants — confirm against the surrounding file.
    """
    prod_features = RealFeatures(Xtrain)
    prod_labels = MulticlassLabels(Ytrain)
    test_features = RealFeatures(Xtest)
    test_labels = MulticlassLabels(Ytest)

    # Prefer the pre-trained LMNN metric when its snapshot is on disk.
    if os.path.exists(".lmnn_model30000_5_reg05_cor20"):
        print "Using LMNN distance"
        lmnn = LMNN()
        sf = SerializableAsciiFile(".lmnn_model30000_5_reg05_cor20", 'r')
        lmnn.load_serializable(sf)
        diagonal = np.diag(lmnn.get_linear_transform())
        #print('%d out of %d elements are non-zero.' % (np.sum(diagonal != 0), diagonal.size))
        #diagonal = lmnn.get_linear_transform()
        np.set_printoptions(precision=1, threshold=1e10, linewidth=500)
        #lmnn.set_diagonal(True)
        dist = lmnn.get_distance()
    else:
        dist = EuclideanDistance()

    # classifier
    knn = KNN(K, dist, prod_labels)
    #knn.set_use_covertree(True)
    # Classify using 4 worker threads.
    parallel = knn.get_global_parallel()
    parallel.set_num_threads(4)
    knn.set_global_parallel(parallel)
    knn.train(prod_features)

    print "Classifying test set..."
    pred = knn.apply_multiclass(test_features)
    print "Accuracy = %2.2f%%" % (100*np.mean(pred == Ytest))

    cm = build_confusion_matrix(Ytest, pred, NCLASSES)
    #save_confusion_matrix(cm)
    #cm = load_confusion_matrix()
    print "Confusion matrix: "
    print cm
    #plot_confusion_matrix(cm)
    #results = predict_class_prob(pred, cm)
    #nn = build_neighbours_matrix(knn, prod_labels)
    #results = predict_class_from_neighbours(nn)
    #print "Log loss: " + str(calculate_log_loss(results, Ytest))
    #print_prediction_output(results)
    return cm
def BuildModel(self, data, labels, options):
    """Build and train the classifier; -n sets KNN neighbors (default 5).

    NOTE(review): the trained KNN is immediately discarded — ``knc`` is
    rebound to a LibSvm classifier, which is what gets returned.  This
    looks like a copy-paste leftover; kept as-is apart from the
    misspelled distance class, to avoid changing what callers receive.
    """
    # Get all the parameters.
    n = re.search("-n (\d+)", options)
    self.n_neighbors = 5 if not n else int(n.group(1))

    # BUG FIX: 'EuclidianDistance' is a misspelling that raises NameError
    # at runtime; Shogun's class is EuclideanDistance.
    distance = EuclideanDistance(data, data)
    knc = KNN(self.n_neighbors, distance, labels)
    knc.train()

    # Create and train the classifier.
    knc = LibSvm(self.C, self.kernel, labels)
    knc.train()
    return knc
def knn(train_features, train_labels, test_features, test_labels, k=1): from modshogun import KNN, MulticlassAccuracy, EuclideanDistance distance = EuclideanDistance(train_features, train_features) knn = KNN(k, distance, train_labels) knn.train() train_output = knn.apply() test_output = knn.apply(test_features) evaluator = MulticlassAccuracy() print 'KNN training error is %.4f' % ( (1 - evaluator.evaluate(train_output, train_labels)) * 100) print 'KNN test error is %.4f' % ( (1 - evaluator.evaluate(test_output, test_labels)) * 100)
def lmnn(train_features, train_labels, test_features, test_labels, k=1): from modshogun import LMNN, KNN, MSG_DEBUG, MulticlassAccuracy import numpy # dummy = LMNN() # dummy.io.set_loglevel(MSG_DEBUG) lmnn = LMNN(train_features, train_labels, k) lmnn.train() distance = lmnn.get_distance() knn = KNN(k, distance, train_labels) knn.train() train_output = knn.apply() test_output = knn.apply(test_features) evaluator = MulticlassAccuracy() print 'LMNN training error is %.4f' % ( (1 - evaluator.evaluate(train_output, train_labels)) * 100) print 'LMNN test error is %.4f' % ( (1 - evaluator.evaluate(test_output, test_labels)) * 100)
def lmnn_diagonal(train_features, train_labels, test_features, test_labels, k=1): from modshogun import LMNN, KNN, MSG_DEBUG, MulticlassAccuracy import numpy lmnn = LMNN(train_features, train_labels, k) lmnn.set_diagonal(True) lmnn.train() distance = lmnn.get_distance() knn = KNN(k, distance, train_labels) knn.train() train_output = knn.apply() test_output = knn.apply(test_features) evaluator = MulticlassAccuracy() print 'LMNN-diagonal training error is %.4f' % ( (1 - evaluator.evaluate(train_output, train_labels)) * 100) print 'LMNN-diagonal test error is %.4f' % ( (1 - evaluator.evaluate(test_output, test_labels)) * 100)
xs = [x[i,0], x[nn[1,i], 0]] ys = [x[i,1], x[nn[1,i], 1]] axis.plot(xs, ys, COLS[int(y[i])]) figure, axarr = pyplot.subplots(3, 1) x, y = sandwich_data() features = RealFeatures(x.T) labels = MulticlassLabels(y) print('%d vectors with %d features' % (features.get_num_vectors(), features.get_num_features())) assert(features.get_num_vectors() == labels.get_num_labels()) distance = EuclideanDistance(features, features) k = 2 knn = KNN(k, distance, labels) plot_data(x, y, axarr[0]) plot_neighborhood_graph(x, knn.nearest_neighbors(), axarr[0]) axarr[0].set_aspect('equal') axarr[0].set_xlim(-6, 4) axarr[0].set_ylim(-3, 2) lmnn = LMNN(features, labels, k) lmnn.set_maxiter(10000) lmnn.train() L = lmnn.get_linear_transform() knn.set_distance(lmnn.get_distance()) plot_data(x, y, axarr[1]) plot_neighborhood_graph(x, knn.nearest_neighbors(), axarr[1])
def evaluate(labels, feats, params={ 'n_neighbors': 2, 'use_cover_tree': 'True', 'dist': 'Manhattan' }, Nsplit=2): """ Run Cross-validation to evaluate the KNN. Parameters ---------- labels: 2d array Data set labels. feats: array Data set feats. params: dictionary Search scope parameters. Nsplit: int, default = 2 The n for n-fold cross validation. all_ks: range of int, default = range(1, 21) Numbers of neighbors. """ k = params.get('n_neighbors') use_cover_tree = params.get('use_cover_tree') == 'True' if params.get('dist' == 'Euclidean'): func_dist = EuclideanDistance else: func_dist = ManhattanMetric split = CrossValidationSplitting(labels, Nsplit) split.build_subsets() accuracy = np.zeros(Nsplit) acc_train = np.zeros(accuracy.shape) time_test = np.zeros(accuracy.shape) for i in range(Nsplit): idx_train = split.generate_subset_inverse(i) idx_test = split.generate_subset_indices(i) feats.add_subset(idx_train) labels.add_subset(idx_train) dist = func_dist(feats, feats) knn = KNN(k, dist, labels) knn.set_store_model_features(True) if use_cover_tree: knn.set_knn_solver_type(KNN_COVER_TREE) else: knn.set_knn_solver_type(KNN_BRUTE) knn.train() evaluator = MulticlassAccuracy() pred = knn.apply_multiclass() acc_train[i] = evaluator.evaluate(pred, labels) feats.remove_subset() labels.remove_subset() feats.add_subset(idx_test) labels.add_subset(idx_test) t_start = time.clock() pred = knn.apply_multiclass(feats) time_test[i] = (time.clock() - t_start) / labels.get_num_labels() accuracy[i] = evaluator.evaluate(pred, labels) feats.remove_subset() labels.remove_subset() print accuracy.mean() return accuracy
axis.plot(xs, ys, COLS[int(y[i])]) figure, axarr = pyplot.subplots(3, 1) x, y = sandwich_data() features = RealFeatures(x.T) labels = MulticlassLabels(y) print('%d vectors with %d features' % (features.get_num_vectors(), features.get_num_features())) assert (features.get_num_vectors() == labels.get_num_labels()) distance = EuclideanDistance(features, features) k = 2 knn = KNN(k, distance, labels) plot_data(x, y, axarr[0]) plot_neighborhood_graph(x, knn.nearest_neighbors(), axarr[0]) axarr[0].set_aspect('equal') axarr[0].set_xlim(-6, 4) axarr[0].set_ylim(-3, 2) lmnn = LMNN(features, labels, k) lmnn.set_maxiter(10000) lmnn.train() L = lmnn.get_linear_transform() knn.set_distance(lmnn.get_distance()) plot_data(x, y, axarr[1]) plot_neighborhood_graph(x, knn.nearest_neighbors(), axarr[1])
from modshogun import EuclideanDistance, KNN, MulticlassLabels, CSVFile, RealFeatures #![begin] #![load_data] trainf = CSVFile("../data/fm_train_real.dat") feats_train = RealFeatures(trainf) testf = CSVFile("../data/fm_test_real.dat") feats_test = RealFeatures(testf) train_labels = CSVFile("../data/label_train_multiclass.dat") labels = MulticlassLabels(train_labels) #![load_data] #![choose_distance] distance = EuclideanDistance(feats_train, feats_test) #![choose_distance] #![create_instance] knn = KNN(3, distance, labels) #![create_instance] #![train_and_apply] knn.train() test_labels = knn.apply(feats_test) output = test_labels.get_values() print output #![train_and_apply] #![end]
# load LMNN if os.path.exists(".lmnn_model30000_5_reg05_cor20"): sf = SerializableAsciiFile(".lmnn_model30000_5_reg05_cor20", 'r') lmnn = LMNN() lmnn.load_serializable(sf) diagonal = np.diag(lmnn.get_linear_transform()) print('%d out of %d elements are non-zero.' % (np.sum(diagonal != 0), diagonal.size)) #print diagonal dist = lmnn.get_distance() else: dist = EuclideanDistance() cm = load_confusion_matrix() print cm # classifier knn = KNN(k, dist, prod_labels) parallel = knn.get_global_parallel() parallel.set_num_threads(4) knn.set_global_parallel(parallel) knn.train(prod_features) print "Classifying test set..." pred = knn.apply_multiclass(test_features) results = predict_class_prob(pred, cm) print_prediction_output(results)