def classifier_multiclass_shareboost (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,label_test_multiclass=label_testdat,lawidth=2.1,C=1,epsilon=1e-5):
    """Train a ShareBoost multiclass classifier and predict the test set.

    Returns the predicted labels as a numpy array.  If test labels are
    supplied, accuracy is additionally computed (but not printed).
    """
    from modshogun import RealFeatures, RealSubsetFeatures, MulticlassLabels
    from modshogun import ShareBoost

    # Wrap the raw training matrix and labels in Shogun containers.
    feats_train = RealFeatures(fm_train_real)
    labels = MulticlassLabels(label_train_multiclass)

    # ShareBoost selects at most min(num_features - 1, 30) active features.
    shareboost = ShareBoost(feats_train, labels, min(fm_train_real.shape[0] - 1, 30))
    shareboost.train()

    # Restrict the test features to the feature subset chosen during training.
    feats_test = RealSubsetFeatures(RealFeatures(fm_test_real), shareboost.get_activeset())
    label_pred = shareboost.apply(feats_test)
    out = label_pred.get_labels()

    if label_test_multiclass is not None:
        from modshogun import MulticlassAccuracy
        labels_test = MulticlassLabels(label_test_multiclass)
        evaluator = MulticlassAccuracy()
        acc = evaluator.evaluate(label_pred, labels_test)

    return out
def classifier_multiclassliblinear_modular( fm_train_real=traindat, fm_test_real=testdat, label_train_multiclass=label_traindat, label_test_multiclass=label_testdat, width=2.1, C=1, epsilon=1e-5):
    """Train MulticlassLibLinear and return predictions for the test set.

    When test labels are given, the multiclass accuracy is printed.
    """
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import MulticlassLibLinear

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    labels = MulticlassLabels(label_train_multiclass)

    classifier = MulticlassLibLinear(C, feats_train, labels)
    classifier.train()

    label_pred = classifier.apply(feats_test)
    out = label_pred.get_labels()

    if label_test_multiclass is not None:
        from modshogun import MulticlassAccuracy
        labels_test = MulticlassLabels(label_test_multiclass)
        evaluator = MulticlassAccuracy()
        acc = evaluator.evaluate(label_pred, labels_test)
        print('Accuracy = %.4f' % acc)

    return out
def classifier_multiclasslogisticregression_modular( fm_train_real=traindat, fm_test_real=testdat, label_train_multiclass=label_traindat, label_test_multiclass=label_testdat, z=1, epsilon=1e-5):
    """Train multiclass logistic regression and predict on the test set.

    Returns the predicted labels, or None if the classifier class is not
    available in this Shogun build (it requires Eigen3).
    """
    from modshogun import RealFeatures, MulticlassLabels
    try:
        from modshogun import MulticlassLogisticRegression
    except ImportError:
        print("recompile shogun with Eigen3 support")
        return

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    labels = MulticlassLabels(label_train_multiclass)

    classifier = MulticlassLogisticRegression(z, feats_train, labels)
    classifier.train()

    label_pred = classifier.apply(feats_test)
    out = label_pred.get_labels()

    if label_test_multiclass is not None:
        from modshogun import MulticlassAccuracy
        labels_test = MulticlassLabels(label_test_multiclass)
        evaluator = MulticlassAccuracy()
        acc = evaluator.evaluate(label_pred, labels_test)
        print('Accuracy = %.4f' % acc)

    return out
def classifier_multiclass_relaxedtree(fm_train_real=traindat, fm_test_real=testdat, label_train_multiclass=label_traindat, label_test_multiclass=label_testdat, lawidth=2.1, C=1, epsilon=1e-5):
    """Train a RelaxedTree multiclass machine and predict on the test set.

    Uses MulticlassLibLinear for the internal confusion matrix and a
    Gaussian kernel.  Prints accuracy when test labels are supplied.
    """
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import RelaxedTree, MulticlassLibLinear
    from modshogun import GaussianKernel

    feats_train = RealFeatures(fm_train_real)
    labels = MulticlassLabels(label_train_multiclass)

    machine = RelaxedTree()
    machine.set_machine_for_confusion_matrix(MulticlassLibLinear())
    machine.set_kernel(GaussianKernel())
    machine.set_labels(labels)
    machine.train(feats_train)

    label_pred = machine.apply_multiclass(RealFeatures(fm_test_real))
    out = label_pred.get_labels()

    if label_test_multiclass is not None:
        from modshogun import MulticlassAccuracy
        labels_test = MulticlassLabels(label_test_multiclass)
        evaluator = MulticlassAccuracy()
        acc = evaluator.evaluate(label_pred, labels_test)
        print('Accuracy = %.4f' % acc)

    return out
def classifier_multiclasslinearmachine_modular( fm_train_real=traindat, fm_test_real=testdat, label_train_multiclass=label_traindat, label_test_multiclass=label_testdat, width=2.1, C=1, epsilon=1e-5):
    """One-vs-one linear multiclass machine built on LibLinear.

    NOTE(review): predictions are produced via ``apply()`` with no test
    features, i.e. on the training data -- preserved as-is; confirm intended.
    """
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import LibLinear, L2R_L2LOSS_SVC, LinearMulticlassMachine, MulticlassOneVsOneStrategy, MulticlassOneVsRestStrategy

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    labels = MulticlassLabels(label_train_multiclass)

    # Base binary learner shared by all one-vs-one sub-problems.
    classifier = LibLinear(L2R_L2LOSS_SVC)
    classifier.set_epsilon(epsilon)
    classifier.set_bias_enabled(True)

    mc_classifier = LinearMulticlassMachine(MulticlassOneVsOneStrategy(), feats_train, classifier, labels)
    mc_classifier.train()

    label_pred = mc_classifier.apply()
    out = label_pred.get_labels()

    if label_test_multiclass is not None:
        from modshogun import MulticlassAccuracy
        labels_test = MulticlassLabels(label_test_multiclass)
        evaluator = MulticlassAccuracy()
        acc = evaluator.evaluate(label_pred, labels_test)
        print('Accuracy = %.4f' % acc)

    return out
def evaluation_multiclassaccuracy_modular(ground_truth, predicted):
    """Return the multiclass accuracy of `predicted` against `ground_truth`.

    Both arguments are raw label arrays; they are wrapped into Shogun
    MulticlassLabels before evaluation.
    """
    from modshogun import MulticlassLabels
    from modshogun import MulticlassAccuracy

    ground_truth_labels = MulticlassLabels(ground_truth)
    predicted_labels = MulticlassLabels(predicted)

    evaluator = MulticlassAccuracy()
    return evaluator.evaluate(predicted_labels, ground_truth_labels)
def classifier_multiclass_ecoc_random(fm_train_real=traindat, fm_test_real=testdat, label_train_multiclass=label_traindat, label_test_multiclass=label_testdat, lawidth=2.1, C=1, epsilon=1e-5):
    """Compare ECOC random dense and sparse encodings over LibLinear learners.

    Returns (sparse_predictions, dense_predictions).  Accuracies for both
    encodings are printed when test labels are supplied.
    """
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import LibLinear, L2R_L2LOSS_SVC, LinearMulticlassMachine
    from modshogun import ECOCStrategy, ECOCRandomSparseEncoder, ECOCRandomDenseEncoder, ECOCHDDecoder
    from modshogun import Math_init_random

    # Fixed seed so the random code books are reproducible.
    Math_init_random(12345)

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    labels = MulticlassLabels(label_train_multiclass)

    classifier = LibLinear(L2R_L2LOSS_SVC)
    classifier.set_epsilon(epsilon)
    classifier.set_bias_enabled(True)

    rnd_dense_strategy = ECOCStrategy(ECOCRandomDenseEncoder(), ECOCHDDecoder())
    rnd_sparse_strategy = ECOCStrategy(ECOCRandomSparseEncoder(), ECOCHDDecoder())

    dense_classifier = LinearMulticlassMachine(rnd_dense_strategy, feats_train, classifier, labels)
    dense_classifier.train()
    label_dense = dense_classifier.apply(feats_test)
    out_dense = label_dense.get_labels()

    sparse_classifier = LinearMulticlassMachine(rnd_sparse_strategy, feats_train, classifier, labels)
    sparse_classifier.train()
    label_sparse = sparse_classifier.apply(feats_test)
    out_sparse = label_sparse.get_labels()

    if label_test_multiclass is not None:
        from modshogun import MulticlassAccuracy
        labels_test = MulticlassLabels(label_test_multiclass)
        evaluator = MulticlassAccuracy()
        acc_dense = evaluator.evaluate(label_dense, labels_test)
        acc_sparse = evaluator.evaluate(label_sparse, labels_test)
        print('Random Dense Accuracy = %.4f' % acc_dense)
        print('Random Sparse Accuracy = %.4f' % acc_sparse)

    return out_sparse, out_dense
def classifier_multiclass_ecoc_ovr(fm_train_real=traindat, fm_test_real=testdat, label_train_multiclass=label_traindat, label_test_multiclass=label_testdat, lawidth=2.1, C=1, epsilon=1e-5):
    """Compare plain one-vs-rest against its ECOC encoding equivalent.

    Returns (ecoc_predictions, ovr_predictions).
    """
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import LibLinear, L2R_L2LOSS_SVC, LinearMulticlassMachine
    from modshogun import ECOCStrategy, ECOCOVREncoder, ECOCLLBDecoder, MulticlassOneVsRestStrategy

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    labels = MulticlassLabels(label_train_multiclass)

    classifier = LibLinear(L2R_L2LOSS_SVC)
    classifier.set_epsilon(epsilon)
    classifier.set_bias_enabled(True)

    # Plain one-vs-rest machine.
    mc_classifier = LinearMulticlassMachine(MulticlassOneVsRestStrategy(), feats_train, classifier, labels)
    mc_classifier.train()
    label_mc = mc_classifier.apply(feats_test)
    out_mc = label_mc.get_labels()

    # Same scheme expressed as an ECOC strategy.
    ecoc_strategy = ECOCStrategy(ECOCOVREncoder(), ECOCLLBDecoder())
    ecoc_classifier = LinearMulticlassMachine(ecoc_strategy, feats_train, classifier, labels)
    ecoc_classifier.train()
    label_ecoc = ecoc_classifier.apply(feats_test)
    out_ecoc = label_ecoc.get_labels()

    # Number of test points on which the two schemes disagree.
    n_diff = (out_mc != out_ecoc).sum()

    if label_test_multiclass is not None:
        from modshogun import MulticlassAccuracy
        labels_test = MulticlassLabels(label_test_multiclass)
        evaluator = MulticlassAccuracy()
        acc_mc = evaluator.evaluate(label_mc, labels_test)
        acc_ecoc = evaluator.evaluate(label_ecoc, labels_test)

    return out_ecoc, out_mc
def fit(self, X, labels):
    """Fit a Shogun LMNN metric on (X, labels) and store the transform in self.L.

    Hyper-parameters are read from self.params; features are transposed
    because Shogun expects column-major samples.  Returns self (fluent).
    """
    self.X = X
    self.L = np.eye(X.shape[1])

    labels = MulticlassLabels(labels.astype(np.float64))
    self._lmnn = shogun_LMNN(RealFeatures(X.T), labels, self.params['k'])
    self._lmnn.set_maxiter(self.params['max_iter'])
    self._lmnn.set_obj_threshold(self.params['convergence_tol'])
    self._lmnn.set_regularization(self.params['regularization'])
    self._lmnn.set_stepsize(self.params['learn_rate'])

    # With PCA enabled LMNN picks its own initial transform; otherwise
    # start from the identity prepared above.
    if self.params['use_pca']:
        self._lmnn.train()
    else:
        self._lmnn.train(self.L)

    self.L = self._lmnn.get_linear_transform()
    return self
def fit(self, X, labels, verbose=False):
    """Fit a Shogun LMNN metric on (X, labels) and store the transform in self.L.

    `verbose` is accepted for API compatibility but currently unused.
    Returns self (fluent).
    """
    self.X = X
    self.L = np.eye(X.shape[1])

    labels = MulticlassLabels(labels.astype(np.float64))
    self._lmnn = shogun_LMNN(RealFeatures(X.T), labels, self.params['k'])
    self._lmnn.set_maxiter(self.params['max_iter'])
    self._lmnn.set_obj_threshold(self.params['convergence_tol'])
    self._lmnn.set_regularization(self.params['regularization'])
    self._lmnn.set_stepsize(self.params['learn_rate'])

    # With PCA enabled LMNN picks its own initial transform; otherwise
    # start from the identity prepared above.
    if self.params['use_pca']:
        self._lmnn.train()
    else:
        self._lmnn.train(self.L)

    self.L = self._lmnn.get_linear_transform()
    return self
def evaluation_clustering(features=fea, ground_truth=gnd_raw, ncenters=10):
    """Cluster `features` and score the result against `ground_truth`.

    Returns (ground_truth_labels, predicted_labels, accuracy).  Mutual
    information is also computed but not returned (it does not serialize).
    """
    from modshogun import ClusteringAccuracy, ClusteringMutualInformation
    from modshogun import MulticlassLabels
    from modshogun import Math

    # Fixed seed for reproducible clustering.
    Math.init_random(1)

    centroids = run_clustering(features, ncenters)
    gnd_hat = assign_labels(features, centroids, ncenters)
    gnd = MulticlassLabels(ground_truth)

    # Accuracy needs the best label permutation between prediction and truth.
    AccuracyEval = ClusteringAccuracy()
    AccuracyEval.best_map(gnd_hat, gnd)
    accuracy = AccuracyEval.evaluate(gnd_hat, gnd)

    MIEval = ClusteringMutualInformation()
    mutual_info = MIEval.evaluate(gnd_hat, gnd)

    return gnd, gnd_hat, accuracy
def metric_lmnn_modular(train_fname=traindat, test_fname=testdat, label_train_fname=label_traindat, k=3):
    """Learn an LMNN distance from CSV data and classify the test set via KNN.

    Returns (lmnn_machine, predictions), or None if Shogun is unavailable.
    """
    try:
        from modshogun import RealFeatures, MulticlassLabels, LMNN, KNN, CSVFile
    except ImportError:
        return

    # Wrap features and labels into Shogun objects.
    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))
    labels = MulticlassLabels(CSVFile(label_train_fname))

    # Learn the LMNN metric.
    lmnn = LMNN(feats_train, labels, k)
    lmnn.train()
    lmnn_distance = lmnn.get_distance()

    # Classify with KNN under the learnt distance.
    knn = KNN(k, lmnn_distance, labels)
    knn.train()
    output = knn.apply(feats_test).get_labels()

    return lmnn, output
def RunNBCShogun():
    """Benchmark Naive Bayes training + prediction on self.dataset.

    NOTE(review): references `self`, `options`, Timer, Log, np and the
    Shogun wrappers from an enclosing scope -- presumably a closure inside
    a benchmark method; confirm against the full file.
    Returns [time, predictions] (or [time]); [-1] on any failure.
    """
    totalTimer = Timer()
    self.predictions = None

    Log.Info("Loading dataset", self.verbose)
    try:
        # Load train and test datasets; labels are the last column of train.
        trainData = np.genfromtxt(self.dataset[0], delimiter=',')
        testData = np.genfromtxt(self.dataset[1], delimiter=',')
        labels = MulticlassLabels(trainData[:, (trainData.shape[1] - 1)])

        with totalTimer:
            # Transform into features (Shogun wants column-major samples).
            trainFeat = RealFeatures(trainData[:, :-1].T)
            testFeat = RealFeatures(testData.T)

            # Create, train, and apply the classifier.
            self.model = self.BuildModel(trainFeat, labels, options)
            self.predictions = self.model.apply_multiclass(testFeat).get_labels()
    except Exception as e:
        return [-1]

    time = totalTimer.ElapsedTime()
    if len(self.dataset) > 1:
        return [time, self.predictions]
    return [time]
def fit(self, feats, labels):
    """Project data through self.space_model, then learn an LMNN transform.

    Stores the learnt linear transform and the (column-normalized)
    projected data on self.  Returns (eigenvecs, projected_data).
    """
    # First reduce/transform the input space (e.g. PCA) before LMNN.
    self.eigenvecs, self.space = self.space_model.fit(feats, labels)

    feat = RealFeatures(self.space)
    self.metric_model = shogun_LMNN(
        feat, MulticlassLabels(labels.astype(np.float64)), self.k)
    self.metric_model.set_maxiter(1000)
    self.metric_model.set_regularization(0.50)
    self.metric_model.set_obj_threshold(0.001)
    self.metric_model.set_stepsize(1e-7)

    # Start the optimization from the identity transform.
    L = np.eye(self.space.shape[1])
    self.metric_model.train(L)

    stats = self.metric_model.get_statistics()

    self.linear_transform = self.metric_model.get_linear_transform()
    self.projected_data = np.dot(self.linear_transform, self.space)

    # Normalize each projected sample (column) to unit length.
    norms = np.linalg.norm(self.projected_data, axis=0)
    self.projected_data /= norms

    return self.eigenvecs, self.projected_data
def classifier_multiclassmachine_modular(fm_train_real=traindat, fm_test_real=testdat, label_train_multiclass=label_traindat, width=2.1, C=1, epsilon=1e-5):
    """Kernel one-vs-rest multiclass machine built on LibSVM + Gaussian kernel.

    Returns the predicted labels for the test set.
    """
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import GaussianKernel
    from modshogun import LibSVM, KernelMulticlassMachine, MulticlassOneVsRestStrategy

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    kernel = GaussianKernel(feats_train, feats_train, width)
    labels = MulticlassLabels(label_train_multiclass)

    classifier = LibSVM()
    classifier.set_epsilon(epsilon)

    mc_classifier = KernelMulticlassMachine(MulticlassOneVsRestStrategy(), kernel, classifier, labels)
    mc_classifier.train()

    # Re-initialize the kernel against the test features before applying.
    kernel.init(feats_train, feats_test)
    return mc_classifier.apply().get_labels()
def RunDTCShogun(q):
    """Benchmark a decision-tree (CARTree) classifier on self.dataset.

    NOTE(review): references `self`, `options`, Timer, Log and helpers from
    an enclosing scope (closure inside a benchmark method).
    Puts the elapsed time on queue `q`, or -1 on failure.
    """
    totalTimer = Timer()

    Log.Info("Loading dataset", self.verbose)
    trainData, labels = SplitTrainData(self.dataset)
    trainData = RealFeatures(trainData.T)
    labels = MulticlassLabels(labels)
    testData = RealFeatures(LoadDataset(self.dataset[1]).T)

    # This benchmark accepts no extra options.
    if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

    try:
        with totalTimer:
            self.model = self.BuildModel(trainData, labels, options)
            # Run the CARTree Classifier on the test dataset.
            self.model.apply_multiclass(testData).get_labels()
    except Exception as e:
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def BuildModel(self, data, responses):
    """Create and train a Shogun Perceptron on (data, responses).

    Applies self.iterations as the iteration cap when it is set.
    Returns the trained model.
    """
    model = Perceptron(RealFeatures(data.T), MulticlassLabels(responses))
    if self.iterations:
        model.set_max_iter(self.iterations)
    model.train()
    return model
def RunRandomForestShogun(q):
    """Benchmark a random-forest classifier on self.dataset.

    NOTE(review): references `self`, `options`, Timer, Log and helpers from
    an enclosing scope (closure inside a benchmark method).
    Puts the elapsed time on queue `q`, or -1 on failure.
    """
    totalTimer = Timer()

    Log.Info("Loading dataset", self.verbose)
    trainData, labels = SplitTrainData(self.dataset)
    trainData = RealFeatures(trainData.T)
    labels = MulticlassLabels(labels)
    testData = RealFeatures(LoadDataset(self.dataset[1]).T)

    # Parse options: -n = number of trees, -f = attributes sampled per split.
    n = re.search("-n (\d+)", options)
    f = re.search("-f (\d+)", options)
    self.form = 1 if not f else int(f.group(1))
    self.numTrees = 10 if not n else int(n.group(1))

    try:
        with totalTimer:
            self.model = self.BuildModel(trainData, labels, options)
            # Run the Random Forest Classifier on the test dataset.
            self.model.apply_multiclass(testData).get_labels()
    except Exception as e:
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def RunNBCShogun(q):
    """Benchmark Gaussian Naive Bayes on self.dataset.

    NOTE(review): references `self`, Timer, Log, np and the Shogun wrappers
    from an enclosing scope (closure inside a benchmark method).
    Puts the elapsed time on queue `q`, or -1 on failure.
    """
    totalTimer = Timer()

    Log.Info("Loading dataset", self.verbose)
    try:
        # Load train and test datasets; labels are the last column of train.
        trainData = np.genfromtxt(self.dataset[0], delimiter=',')
        testData = np.genfromtxt(self.dataset[1], delimiter=',')
        labels = MulticlassLabels(trainData[:, (trainData.shape[1] - 1)])

        with totalTimer:
            # Transform into features.
            trainFeat = RealFeatures(trainData[:, :-1].T)
            testFeat = RealFeatures(testData.T)

            # Create and train the classifier, then predict on the test set.
            nbc = GaussianNaiveBayes(trainFeat, labels)
            nbc.train()
            nbc.apply(testFeat).get_labels()
    except Exception as e:
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def multiclass_randomforest_modular(train=traindat, test=testdat, labels=label_traindat, ft=feattypes):
    """Train a 20-tree random forest from CSV files and classify the test set.

    Returns (forest, predictions), or None if Shogun cannot be imported.
    """
    try:
        from modshogun import RealFeatures, MulticlassLabels, CSVFile, RandomForest, MajorityVote
    except ImportError:
        print("Could not import Shogun modules")
        return

    # Wrap features and labels into Shogun objects.
    feats_train = RealFeatures(CSVFile(train))
    feats_test = RealFeatures(CSVFile(test))
    train_labels = MulticlassLabels(CSVFile(labels))

    # Random forest: 20 trees, 1 attribute sampled per split, majority vote.
    rand_forest = RandomForest(feats_train, train_labels, 20, 1)
    rand_forest.set_feature_types(ft)
    rand_forest.set_combination_rule(MajorityVote())
    rand_forest.train()

    # Classify test data.
    output = rand_forest.apply_multiclass(feats_test).get_labels()

    return rand_forest, output
def RunRandomForestShogun():
    """Benchmark a random-forest classifier on self.dataset (dict options).

    NOTE(review): references `self`, `options`, Timer, Log and helpers from
    an enclosing scope (closure inside a benchmark method).
    Returns [time, predictions], or [-1] on failure.
    """
    totalTimer = Timer()

    Log.Info("Loading dataset", self.verbose)
    trainData, labels = SplitTrainData(self.dataset)
    trainData = RealFeatures(trainData.T)
    labels = MulticlassLabels(labels)
    testData = RealFeatures(LoadDataset(self.dataset[1]).T)

    # 'num_trees' is mandatory; 'dimensions' optional (defaults to 1).
    if "num_trees" in options:
        self.numTrees = int(options.pop("num_trees"))
    else:
        Log.Fatal("Required parameter 'num_trees' not specified!")
        raise Exception("missing parameter")

    self.form = 1
    if "dimensions" in options:
        self.form = int(options.pop("dimensions"))

    if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

    try:
        with totalTimer:
            self.model = self.BuildModel(trainData, labels, options)
            # Run the Random Forest Classifier on the test dataset.
            self.predictions = self.model.apply_multiclass(testData).get_labels()
    except Exception as e:
        return [-1]

    time = totalTimer.ElapsedTime()
    return [time, self.predictions]
def classifier_multiclassocas_modular (num_vec=10,num_class=3,distance=15,width=2.1,C=1,epsilon=1e-5,seed=1):
    """Train MulticlassOCAS on synthetic linearly separable data.

    Each class pair is made separable by shifting one coordinate of each
    sample by `distance`.  Returns (predictions, classifier).
    """
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import MulticlassOCAS
    from modshogun import Math_init_random

    # Seed both numpy and Shogun RNGs for reproducible results.
    random.seed(seed)
    Math_init_random(seed)

    # Class of sample i is i mod num_class, for train and test alike.
    label_train = array([mod(x, num_class) for x in range(num_vec)], dtype="float64")
    label_test = array([mod(x, num_class) for x in range(num_vec)], dtype="float64")

    fm_train = array(random.randn(num_class, num_vec))
    fm_test = array(random.randn(num_class, num_vec))

    # Shift the coordinate matching each sample's class to separate classes.
    for i in range(len(label_train)):
        fm_train[label_train[i], i] += distance
        fm_test[label_test[i], i] += distance

    feats_train = RealFeatures(fm_train)
    feats_test = RealFeatures(fm_test)
    labels = MulticlassLabels(label_train)

    classifier = MulticlassOCAS(C, feats_train, labels)
    classifier.train()

    out = classifier.apply(feats_test).get_labels()
    return out, classifier
def fit(self, feats, labels):
    """Learn an LMNN metric directly on `feats` and store it on self.

    Hyper-parameters come from instance attributes (max_iter,
    regularization, convergence_tol, learn_rate, k).  Returns self.
    """
    self.X_tr = feats
    self.y_train = labels

    feats = feats.astype(np.float64)
    feat = RealFeatures(feats.T)  # Shogun expects column-major samples

    self.metric_model = shogun_LMNN(
        feat, MulticlassLabels(labels.astype(np.float64)), self.k)
    self.metric_model.set_maxiter(self.max_iter)
    self.metric_model.set_regularization(self.regularization)
    self.metric_model.set_obj_threshold(self.convergence_tol)
    self.metric_model.set_stepsize(self.learn_rate)
    self.metric_model.train()

    stats = self.metric_model.get_statistics()

    self.linear_transform = self.metric_model.get_linear_transform()
    return self
def multiclass_c45classifiertree_modular(train=traindat, test=testdat, labels=label_traindat, ft=feattypes):
    """Train, prune, and apply a C4.5 decision tree on multiclass CSV data.

    The training set is split 2/3 (training) : 1/3 (pruning/validation).
    Returns (tree, predictions, certainty_vector), or None if the required
    modules cannot be imported.
    """
    try:
        from modshogun import RealFeatures, MulticlassLabels, CSVFile, C45ClassifierTree
        from numpy import random, int32
    except ImportError:
        print("Could not import Shogun and/or numpy modules")
        return

    # wrap features and labels into Shogun objects
    feats_train = RealFeatures(CSVFile(train))
    feats_test = RealFeatures(CSVFile(test))
    train_labels = MulticlassLabels(CSVFile(labels))

    # divide train dataset into training and validation subsets in the
    # ratio 2/3 to 1/3.  BUG FIX: use floor division so the slice bounds
    # stay integers under Python 3 (true division yields floats and raises
    # a TypeError when slicing); on Python 2 ints, // is identical to /.
    subset = int32(random.permutation(feats_train.get_num_vectors()))
    # NOTE(review): element 0 of the permutation lands in neither subset;
    # preserved from the original split -- confirm intended.
    vsubset = subset[1:subset.size // 3]
    trsubset = subset[1 + subset.size // 3:subset.size]

    # C4.5 Tree formation using training subset
    train_labels.add_subset(trsubset)
    feats_train.add_subset(trsubset)

    c = C45ClassifierTree()
    c.set_labels(train_labels)
    c.set_feature_types(ft)
    c.train(feats_train)

    train_labels.remove_subset()
    feats_train.remove_subset()

    # prune tree using validation subset
    train_labels.add_subset(vsubset)
    feats_train.add_subset(vsubset)
    c.prune_tree(feats_train, train_labels)
    train_labels.remove_subset()
    feats_train.remove_subset()

    # Classify test data
    output = c.apply_multiclass(feats_test).get_labels()
    output_certainty = c.get_certainty_vector()

    return c, output, output_certainty
def BuildModel(self, data, responses):
    """Create and train a MulticlassLogisticRegression model.

    Uses self.z as the regularization parameter and caps iterations at
    self.max_iter when it is set.  Returns the trained model.
    """
    model = MulticlassLogisticRegression(self.z, RealFeatures(data.T),
                                         MulticlassLabels(responses))
    if self.max_iter is not None:
        model.set_max_iter(self.max_iter)
    model.train()
    return model
def RunAllKnnShogun(q):
    """Benchmark all-k-nearest-neighbors with Shogun's KNN.

    If self.dataset has two files, the second is a separate query set;
    otherwise neighbors are found within the reference set itself.
    NOTE(review): references `self`, `options`, Timer, Log, np and the
    Shogun wrappers from an enclosing scope (closure inside a benchmark
    method).  Puts the elapsed time on queue `q`, or -1 on failure.
    """
    totalTimer = Timer()

    try:
        Log.Info("Loading dataset", self.verbose)
        if len(self.dataset) == 2:
            referenceData = np.genfromtxt(self.dataset[0], delimiter=',')
            queryData = np.genfromtxt(self.dataset[1], delimiter=',')
            # BUG FIX: the original wrapped the not-yet-defined `queryFeat`
            # (NameError at runtime); the freshly loaded queryData is meant.
            queryFeat = RealFeatures(queryData.T)
        else:
            referenceData = np.genfromtxt(self.dataset, delimiter=',')

        # Labels are the last column of the reference dataset.
        labels = MulticlassLabels(
            referenceData[:, (referenceData.shape[1] - 1)])
        referenceData = referenceData[:, :-1]

        with totalTimer:
            # Parse and validate the required -k option.
            k = re.search("-k (\d+)", options)
            if not k:
                Log.Fatal(
                    "Required option: Number of furthest neighbors to find."
                )
                q.put(-1)
                return -1
            else:
                k = int(k.group(1))
                if (k < 1 or k > referenceData.shape[0]):
                    Log.Fatal("Invalid k: " + k.group(1) +
                              "; must be greater than 0" +
                              " and less or equal than " +
                              str(referenceData.shape[0]))
                    q.put(-1)
                    return -1

            referenceFeat = RealFeatures(referenceData.T)
            distance = EuclideanDistance(referenceFeat, referenceFeat)

            # Perform All K-Nearest-Neighbors.
            model = SKNN(k, distance, labels)
            model.train()

            if len(self.dataset) == 2:
                out = model.apply(queryFeat).get_labels()
            else:
                out = model.apply(referenceFeat).get_labels()
    except Exception as e:
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def RunLDAShogun():
    """Benchmark Shogun's MCLDA on self.dataset.

    MCLDA requires labels in {0,...,num_classes-1}, so labels are remapped
    before training and predictions are mapped back afterwards.
    NOTE(review): `testSet` is only assigned when len(self.dataset) > 1 but
    is used when len(self.dataset) > 0 -- preserved as-is; confirm callers
    always pass a test file.  Returns elapsed time, or -1 on failure.
    """
    totalTimer = Timer()

    try:
        if len(self.dataset) > 1:
            testSet = LoadDataset(self.dataset[1])

        # Use the last row of the training set as the responses.
        trainSet, trainLabels = SplitTrainData(self.dataset)

        # Map arbitrary labels onto {0,...,num_classes-1} and remember the
        # inverse mapping for the predictions.
        distinctLabels = list(set(trainLabels))
        mapping = {}
        reverseMapping = {}
        for idx, label in enumerate(distinctLabels):
            mapping[label] = idx
            reverseMapping[idx] = label
        for i in range(len(trainLabels)):
            trainLabels[i] = mapping[trainLabels[i]]

        trainFeat = RealFeatures(trainSet.T)
        trainLabels = MulticlassLabels(trainLabels)

        # Gather optional parameters.
        if "tolerance" in options:
            self.tolerance = float(options.pop("tolerance"))
        if "store" in options:
            self.store = bool(options.pop("store"))
        if (len(options) > 0):
            Log.Fatal("Unknown parameters: " + str(options))
            raise Exception("unknown parameters")

        with totalTimer:
            self.model = MCLDA(trainFeat, trainLabels, self.tolerance, self.store)
            self.model.train()

            if (len(self.dataset) > 0):
                self.predictions = self.model.apply_multiclass(
                    RealFeatures(testSet.T))
                self.predictions = self.predictions.get_labels()
                # Reverse-map the predicted labels to the original labels.
                for i in range(len(self.predictions)):
                    self.predictions[i] = reverseMapping[self.predictions[i]]
    except Exception as e:
        Log.Info("Exception: " + str(e))
        return -1

    time = totalTimer.ElapsedTime()
    return time
def _svm_new(self, kernel_width, c, epsilon):
    """Create a fresh GMNPSVM from the already-loaded training data.

    Stores the SVM on self.svm.

    Raises:
        Exception: if no training data has been loaded yet.
    """
    # BUG FIX: use `is None` instead of `== None`.  When self.x / self.y
    # are numpy arrays, `== None` compares elementwise and makes the `if`
    # raise "truth value of an array is ambiguous".
    if self.x is None or self.y is None:
        raise Exception("No training data loaded.")

    x = RealFeatures(self.x)
    y = MulticlassLabels(self.y)

    self.svm = GMNPSVM(c, GaussianKernel(x, x, kernel_width), y)
    self.svm.set_epsilon(epsilon)
def main():
    """Learn an LMNN (Mahalanobis) metric from a libSVM-format training file.

    Shuffles the training lines, keeps 5000 samples, trains Shogun LMNN,
    and saves the metric matrix M = L.T @ L to model.npy.
    """
    # Get training file name from the command line.
    traindatafile = sys.argv[1]

    # The training file is in libSVM format; shuffle its lines first.
    with open(traindatafile, mode="r") as myFile:
        lines = myFile.readlines()
    random.shuffle(lines)
    open("tempdata.dat", 'w').writelines(lines)

    tr_data = load_svmlight_file("tempdata.dat")

    # Keep the first (post-shuffle, i.e. random) 5000 points.
    Xtr = tr_data[0].toarray()  # converts sparse matrices to dense
    Ytr = tr_data[1]            # the training labels
    Xtr = Xtr[:5000]
    Ytr = Ytr[:5000]

    # Cast data to Shogun format to work with LMNN.
    features = RealFeatures(Xtr.T)
    labels = MulticlassLabels(Ytr.astype(np.float64))

    # Number of target neighbours per example -- tuned via validation.
    kmax = 25  # inductive bias
    values = list(range(1, kmax + 1))
    k = predict(Xtr, Ytr, values)

    print("K : "),
    print(k)
    # NOTE(review): the validated k is immediately overridden by a
    # hard-coded value; preserved as-is -- confirm intended.
    k = 5

    lmnn = LMNN(features, labels, k)
    init_transform = np.eye(Xtr.shape[1])
    lmnn.set_maxiter(25000)  # choose an appropriate timeout
    lmnn.train(init_transform)

    # LMNN returns a linear transformation corresponding to the
    # Mahalanobis metric it has learnt.
    L = lmnn.get_linear_transform()
    M = np.matrix(np.dot(L.T, L))
    print("LMNN done")

    # Save the model for use in the testing phase.
    # Warning: do not change this file name.
    np.save("model.npy", M)
def load_train(self):
    """Load image data and train a Euclidean-distance KNN on it.

    NOTE(review): despite its name, this method reads and overwrites
    self.test_images / self.test_labels -- preserved as-is; confirm the
    attribute names against the full class.
    """
    ims, labels = self.load(self.test_images, self.test_labels)
    self.test_images = ims
    self.test_labels = labels

    labels_numbers = MulticlassLabels(self.test_labels)
    feats = RealFeatures(self.test_images.T)
    dist = EuclideanDistance()

    self.knn = KNN(self.k, dist, labels_numbers)
    self.knn.train(feats)
def classifier_multiclass_ecoc_discriminant( fm_train_real=traindat, fm_test_real=testdat, label_train_multiclass=label_traindat, label_test_multiclass=label_testdat, lawidth=2.1, C=1, epsilon=1e-5):
    """ECOC multiclass classification with a discriminant encoder.

    A LibLinear base learner is combined with an ECOCDiscriminantEncoder /
    ECOCHDDecoder strategy.  Prints accuracy when test labels are given.
    """
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import LibLinear, L2R_L2LOSS_SVC, LinearMulticlassMachine
    from modshogun import ECOCStrategy, ECOCDiscriminantEncoder, ECOCHDDecoder

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    labels = MulticlassLabels(label_train_multiclass)

    classifier = LibLinear(L2R_L2LOSS_SVC)
    classifier.set_epsilon(epsilon)
    classifier.set_bias_enabled(True)

    # The discriminant encoder learns its code book from the data itself.
    encoder = ECOCDiscriminantEncoder()
    encoder.set_features(feats_train)
    encoder.set_labels(labels)
    encoder.set_sffs_iterations(50)

    strategy = ECOCStrategy(encoder, ECOCHDDecoder())

    # Note: `classifier` is rebound to the multiclass machine here,
    # mirroring the original code.
    classifier = LinearMulticlassMachine(strategy, feats_train, classifier, labels)
    classifier.train()

    label_pred = classifier.apply(feats_test)
    out = label_pred.get_labels()

    if label_test_multiclass is not None:
        from modshogun import MulticlassAccuracy
        labels_test = MulticlassLabels(label_test_multiclass)
        evaluator = MulticlassAccuracy()
        acc = evaluator.evaluate(label_pred, labels_test)
        print('Accuracy = %.4f' % acc)

    return out
def load_data(num_train_samples=7291, m_data_dict=data_dict):
    """Build Shogun train/test features and labels from a data dictionary.

    Expects m_data_dict with keys 'xTr', 'yTr', 'xTe', 'yTe' (matrices with
    samples in columns).  Returns
    (train_features, train_labels, test_features, test_labels).
    """
    from modshogun import RealFeatures, MulticlassLabels
    import numpy

    train_vec = m_data_dict['yTr'][0][:num_train_samples].astype(numpy.float64)
    train_labels = MulticlassLabels(train_vec)
    test_vec = m_data_dict['yTe'][0].astype(numpy.float64)
    test_labels = MulticlassLabels(test_vec)

    # BUG FIX: the original used Python 2 print statements, which are
    # syntax errors on Python 3.  The single-argument parenthesized form
    # behaves identically on both interpreters.
    print("#train_labels = " + str(train_labels.get_num_labels()))
    print("#test_labels = " + str(test_labels.get_num_labels()))

    train_mat = m_data_dict['xTr'][:, :num_train_samples].astype(numpy.float64)
    train_features = RealFeatures(train_mat)
    test_mat = m_data_dict['xTe'].astype(numpy.float64)
    test_features = RealFeatures(test_mat)

    print("#train_vectors = " + str(train_features.get_num_vectors()))
    print("#test_vectors = " + str(test_features.get_num_vectors()))
    print("data dimension = " + str(test_features.get_num_features()))

    return train_features, train_labels, test_features, test_labels
def multiclass_c45classifiertree_modular(train=traindat,test=testdat,labels=label_traindat,ft=feattypes):
    """Train, prune, and apply a C4.5 decision tree on multiclass CSV data.

    The training set is split 2/3 (training) : 1/3 (pruning/validation).
    Returns (tree, predictions, certainty_vector), or None if the required
    modules cannot be imported.
    """
    try:
        from modshogun import RealFeatures, MulticlassLabels, CSVFile, C45ClassifierTree
        from numpy import random, int32
    except ImportError:
        print("Could not import Shogun and/or numpy modules")
        return

    # wrap features and labels into Shogun objects
    feats_train = RealFeatures(CSVFile(train))
    feats_test = RealFeatures(CSVFile(test))
    train_labels = MulticlassLabels(CSVFile(labels))

    # divide train dataset into training and validation subsets in the
    # ratio 2/3 to 1/3.  BUG FIX: use floor division so the slice bounds
    # stay integers under Python 3 (true division yields floats and raises
    # a TypeError when slicing); on Python 2 ints, // is identical to /.
    subset = int32(random.permutation(feats_train.get_num_vectors()))
    # NOTE(review): element 0 of the permutation lands in neither subset;
    # preserved from the original split -- confirm intended.
    vsubset = subset[1:subset.size // 3]
    trsubset = subset[1 + subset.size // 3:subset.size]

    # C4.5 Tree formation using training subset
    train_labels.add_subset(trsubset)
    feats_train.add_subset(trsubset)

    c = C45ClassifierTree()
    c.set_labels(train_labels)
    c.set_feature_types(ft)
    c.train(feats_train)

    train_labels.remove_subset()
    feats_train.remove_subset()

    # prune tree using validation subset
    train_labels.add_subset(vsubset)
    feats_train.add_subset(vsubset)
    c.prune_tree(feats_train, train_labels)
    train_labels.remove_subset()
    feats_train.remove_subset()

    # Classify test data
    output = c.apply_multiclass(feats_test).get_labels()
    output_certainty = c.get_certainty_vector()

    return c, output, output_certainty
def metric_lmnn_statistics(
    k=3,
    fname_features="../../data/fm_train_multiclass_digits.dat.gz",
    fname_labels="../../data/label_train_multiclass_digits.dat",
):
    """Train LMNN on the digits data set and plot its objective per iteration.

    Returns None; the plot is shown interactively.  Requires modshogun and
    matplotlib.
    """
    try:
        from modshogun import LMNN, CSVFile, RealFeatures, MulticlassLabels, MSG_DEBUG
        import matplotlib.pyplot as pyplot
    except ImportError:
        # BUG FIX: the original used Python 2 print statements, which are
        # syntax errors on Python 3.  The single-argument parenthesized
        # form behaves identically on both interpreters.
        print("Error importing modshogun or other required modules. Please, verify their installation.")
        return

    features = RealFeatures(load_compressed_features(fname_features).T)
    labels = MulticlassLabels(CSVFile(fname_labels))

    assert features.get_num_vectors() == labels.get_num_labels()

    # train LMNN
    lmnn = LMNN(features, labels, k)
    lmnn.set_correction(100)
    print("Training LMNN, this will take about two minutes...")
    lmnn.train()
    print("Training done!")

    # plot objective obtained during training
    statistics = lmnn.get_statistics()

    pyplot.plot(statistics.obj.get())
    pyplot.grid(True)
    pyplot.xlabel("Iterations")
    pyplot.ylabel("LMNN objective")
    pyplot.title("LMNN objective during training for the multiclass digits data set")
    pyplot.show()
def features_io_modular (fm_train_real, label_train_twoclass):
    """Round-trip Shogun features and labels through several file formats.

    Saves/loads sparse and dense real features and multiclass labels via
    BinaryFile, LibSVMFile, CSVFile and HDF5File under tmp/, removes the
    temporary files, and returns (feats, feats2, lab, lab2).
    """
    import numpy
    from modshogun import SparseRealFeatures, RealFeatures, MulticlassLabels
    from modshogun import GaussianKernel
    from modshogun import LibSVMFile, CSVFile, BinaryFile, HDF5File

    # Sparse features: save and reload in binary and LibSVM formats.
    feats = SparseRealFeatures(fm_train_real)
    feats2 = SparseRealFeatures()

    f = BinaryFile("tmp/fm_train_sparsereal.bin", "w")
    feats.save(f)
    f = LibSVMFile("tmp/fm_train_sparsereal.ascii", "w")
    feats.save(f)

    f = BinaryFile("tmp/fm_train_sparsereal.bin")
    feats2.load(f)
    f = LibSVMFile("tmp/fm_train_sparsereal.ascii")
    feats2.load(f)

    # Dense features: save and reload in binary, HDF5 and CSV formats.
    feats = RealFeatures(fm_train_real)
    feats2 = RealFeatures()

    f = BinaryFile("tmp/fm_train_real.bin", "w")
    feats.save(f)
    f = HDF5File("tmp/fm_train_real.h5", "w", "/data/doubles")
    feats.save(f)
    f = CSVFile("tmp/fm_train_real.ascii", "w")
    feats.save(f)

    f = BinaryFile("tmp/fm_train_real.bin")
    feats2.load(f)
    f = CSVFile("tmp/fm_train_real.ascii")
    feats2.load(f)

    # Labels: save and reload in CSV, binary and HDF5 formats.
    lab = MulticlassLabels(numpy.array([0.0, 1.0, 2.0, 3.0]))
    lab2 = MulticlassLabels()

    f = CSVFile("tmp/label_train_twoclass.ascii", "w")
    lab.save(f)
    f = BinaryFile("tmp/label_train_twoclass.bin", "w")
    lab.save(f)
    f = HDF5File("tmp/label_train_real.h5", "w", "/data/labels")
    lab.save(f)

    f = CSVFile("tmp/label_train_twoclass.ascii")
    lab2.load(f)
    f = BinaryFile("tmp/label_train_twoclass.bin")
    lab2.load(f)

    f = HDF5File("tmp/fm_train_real.h5", "r", "/data/doubles")
    feats2.load(f)
    f = HDF5File("tmp/label_train_real.h5", "r", "/data/labels")
    lab2.load(f)

    # Clean up the temporary files.
    import os
    for f in ['tmp/fm_train_sparsereal.bin', 'tmp/fm_train_sparsereal.ascii',
              'tmp/fm_train_real.bin', 'tmp/fm_train_real.h5', 'tmp/fm_train_real.ascii',
              'tmp/label_train_real.h5', 'tmp/label_train_twoclass.ascii',
              'tmp/label_train_twoclass.bin']:
        os.unlink(f)

    # BUG FIX: the original return statement was split across two physical
    # lines without continuation, which is a syntax error; the four values
    # are returned as one tuple.
    return feats, feats2, lab, lab2
def __init__(self, X, labels, k=3):
    """Store the data and construct a Shogun LMNN learner with `k` neighbors.

    The transform self.L starts as the identity; features are transposed
    because Shogun expects column-major samples.
    """
    self.X = X
    self.L = np.eye(X.shape[1])

    labels = MulticlassLabels(labels.astype(np.float64))
    self._lmnn = shogun_LMNN(RealFeatures(X.T), labels, k)
def plot_data(x, y, axis):
    """Scatter-plot each class of `x` in its own color from COLS."""
    for idx, val in enumerate(numpy.unique(y)):
        xi = x[y == val]
        axis.scatter(xi[:, 0], xi[:, 1], s=50, facecolors='none', edgecolors=COLS[idx])


def plot_neighborhood_graph(x, nn, axis):
    """Draw an edge from each point to its nearest neighbor.

    NOTE(review): colors the edge with the global `y` (module-level labels),
    not a parameter -- preserved from the original.
    """
    for i in xrange(x.shape[0]):
        xs = [x[i, 0], x[nn[1, i], 0]]
        ys = [x[i, 1], x[nn[1, i], 1]]
        axis.plot(xs, ys, COLS[int(y[i])])


# Three stacked panels for the sandwich-data neighborhood plots.
figure, axarr = pyplot.subplots(3, 1)
x, y = sandwich_data()

features = RealFeatures(x.T)
labels = MulticlassLabels(y)

print('%d vectors with %d features' % (features.get_num_vectors(), features.get_num_features()))
assert(features.get_num_vectors() == labels.get_num_labels())

distance = EuclideanDistance(features, features)
k = 2
knn = KNN(k, distance, labels)

# Original data with its Euclidean nearest-neighbor graph.
plot_data(x, y, axarr[0])
plot_neighborhood_graph(x, knn.nearest_neighbors(), axarr[0])
axarr[0].set_aspect('equal')
axarr[0].set_xlim(-6, 4)
axarr[0].set_ylim(-3, 2)

lmnn = LMNN(features, labels, k)