def metric_lmnn_statistics(k=3, fname_features='../../data/fm_train_multiclass_digits.dat.gz', fname_labels='../../data/label_train_multiclass_digits.dat'):
    """Train LMNN on the multiclass digits data set and plot the objective.

    Parameters:
        k: number of target neighbours used by LMNN.
        fname_features: path to the gzipped feature-matrix file.
        fname_labels: path to the label file (CSV).
    """
    try:
        from shogun import LMNN, CSVFile, RealFeatures, MulticlassLabels, MSG_DEBUG
        import matplotlib.pyplot as pyplot
    except ImportError:
        # Fix: Python 2 `print` statements replaced with print() calls so the
        # function is also valid Python 3 (the rest of the file uses print()).
        print('Error importing shogun or other required modules. Please, verify their installation.')
        return

    # load_compressed_features is expected to come from the enclosing module.
    features = RealFeatures(load_compressed_features(fname_features).T)
    labels = MulticlassLabels(CSVFile(fname_labels))

    # print('number of examples = %d' % features.get_num_vectors())
    # print('number of features = %d' % features.get_num_features())

    assert(features.get_num_vectors() == labels.get_num_labels())

    # train LMNN
    lmnn = LMNN(features, labels, k)
    lmnn.set_correction(100)
    # lmnn.io.set_loglevel(MSG_DEBUG)
    print('Training LMNN, this will take about two minutes...')
    lmnn.train()
    print('Training done!')

    # plot objective obtained during training
    statistics = lmnn.get_statistics()

    pyplot.plot(statistics.obj.get())
    pyplot.grid(True)
    pyplot.xlabel('Iterations')
    pyplot.ylabel('LMNN objective')
    pyplot.title('LMNN objective during training for the multiclass digits data set')
    pyplot.show()
def evaluation_multiclassaccuracy (ground_truth, predicted):
    """Return the multiclass classification accuracy of *predicted* against
    *ground_truth* (both raw label vectors, wrapped into Shogun labels here)."""
    from shogun import MulticlassLabels
    from shogun import MulticlassAccuracy

    truth_labels = MulticlassLabels(ground_truth)
    pred_labels = MulticlassLabels(predicted)

    # MulticlassAccuracy.evaluate expects (predicted, ground truth) order.
    return MulticlassAccuracy().evaluate(pred_labels, truth_labels)
def BuildModel(self, data, responses):
    """Create and train a Perceptron classifier on *data* / *responses*.

    Features are transposed into Shogun's column-major layout. An optional
    iteration cap is taken from self.iterations when it is truthy.
    """
    classifier = Perceptron(RealFeatures(data.T), MulticlassLabels(responses))
    if self.iterations:
        classifier.set_max_iter(self.iterations)
    classifier.train()
    return classifier
def __init__(self, method_param, run_param):
    """Prepare train/test features, encoded labels and DTC options."""
    self.info = "SHOGUN_DTC (" + str(method_param) + ")"

    # Assemble run model parameter.
    self.data = load_dataset(method_param["datasets"], ["csv"])
    self.data_split = split_dataset(self.data[0])

    self.train_feat = RealFeatures(self.data_split[0].T)

    # Encode the labels into {0,1,2,3,......,num_classes-1}
    self.train_labels, self.label_map = label_encoder(self.data_split[1])
    self.train_labels = MulticlassLabels(self.train_labels)

    if len(self.data) >= 2:
        self.test_feat = RealFeatures(self.data[1].T)

    # Flag for Cross Validation Pruning (off unless requested).
    self.cv_prune = bool(method_param["pruning"]) if "pruning" in method_param else False

    # Number of folds, coerced to int; defaults to 2.
    self.num_folds = int(method_param["k"]) if "k" in method_param else 2
def RunNBCShogun():
    """Time Shogun's Naive Bayes classifier on the configured datasets.

    NOTE(review): this is a closure — `self`, `options`, `np`, `Timer` and
    `Log` come from the enclosing scope; verify against the caller.
    Returns [time, predictions] when a test set exists, [time] otherwise,
    or [-1] on any failure.
    """
    totalTimer = Timer()

    self.predictions = None
    Log.Info("Loading dataset", self.verbose)
    try:
        # Load train and test dataset.
        trainData = np.genfromtxt(self.dataset[0], delimiter=',')
        testData = np.genfromtxt(self.dataset[1], delimiter=',')

        # Labels are the last row of the training set.
        labels = MulticlassLabels(trainData[:, (trainData.shape[1] - 1)])

        with totalTimer:
            # Transform into features (transposed to Shogun's layout).
            trainFeat = RealFeatures(trainData[:, :-1].T)
            testFeat = RealFeatures(testData.T)

            # Create and train the classifier.
            self.model = self.BuildModel(trainFeat, labels, options)

            # Run Naive Bayes Classifier on the test dataset.
            self.predictions = self.model.apply_multiclass(testFeat).get_labels()
    except Exception as e:
        # Benchmark convention: any failure is reported as [-1].
        return [-1]

    time = totalTimer.ElapsedTime()
    if len(self.dataset) > 1:
        return [time, self.predictions]

    return [time]
def __init__(self, method_param, run_param):
    """Prepare train/test features, encoded labels and random-forest options."""
    self.info = "SHOGUN_RANDOMFOREST (" + str(method_param) + ")"

    # Assemble run model parameter.
    self.data = load_dataset(method_param["datasets"], ["csv"])
    self.data_split = split_dataset(self.data[0])

    self.train_feat = RealFeatures(self.data_split[0].T)

    # Encode the labels into {0,1,2,3,......,num_classes-1}
    self.train_labels, self.label_map = label_encoder(self.data_split[1])
    self.train_labels = MulticlassLabels(self.train_labels)

    if len(self.data) >= 2:
        self.test_feat = RealFeatures(self.data[1].T)

    # Forest size, split dimensionality and solver, each with a default.
    self.num_trees = int(method_param["num-trees"]) if "num-trees" in method_param else 50
    self.form = int(method_param["dimensions"]) if "dimensions" in method_param else 1
    self.solver = str(method_param["solver"]) if "solver" in method_param else "auto"
def classifier_multiclassocas (num_vec=10,num_class=3,distance=15,width=2.1,C=1,epsilon=1e-5,seed=1):
    """Train MulticlassOCAS on synthetic, pairwise linearly separable data.

    Returns (test predictions, trained classifier), or None when the
    MulticlassOCAS binding is unavailable.
    """
    from shogun import RealFeatures, MulticlassLabels
    from shogun import Math_init_random
    try:
        from shogun import MulticlassOCAS
    except ImportError:
        print("MulticlassOCAS not available")
        return

    # reproducible results (seed both numpy and Shogun RNGs, same order as before)
    random.seed(seed)
    Math_init_random(seed)

    # generate some training data where each class pair is linearly separable
    label_train = array([mod(x, num_class) for x in range(num_vec)], dtype="float64")
    label_test = array([mod(x, num_class) for x in range(num_vec)], dtype="float64")
    fm_train = array(random.randn(num_class, num_vec))
    fm_test = array(random.randn(num_class, num_vec))
    # Shift the coordinate matching each vector's class to make classes separable.
    for col, lab in enumerate(label_train):
        fm_train[int(lab), col] += distance
        fm_test[int(label_test[col]), col] += distance

    feats_train = RealFeatures(fm_train)
    feats_test = RealFeatures(fm_test)

    labels = MulticlassLabels(label_train)

    classifier = MulticlassOCAS(C, feats_train, labels)
    classifier.train()

    out = classifier.apply(feats_test).get_labels()
    #print label_test
    #print out
    return out, classifier
def __init__(self, method_param, run_param):
    """Prepare train/test features, encoded labels and LDA options."""
    self.info = "SHOGUN_LDA (" + str(method_param) + ")"

    # Assemble run model parameter.
    self.data = load_dataset(method_param["datasets"], ["csv"])
    self.data_split = split_dataset(self.data[0])

    self.train_feat = RealFeatures(self.data_split[0].T)

    # Encode the labels into {0,1,2,3,......,num_classes-1}
    self.train_labels, self.label_map = label_encoder(self.data_split[1])
    self.train_labels = MulticlassLabels(self.train_labels)

    if len(self.data) >= 2:
        self.test_feat = RealFeatures(self.data[1].T)

    # Optional solver settings, each falling back to a default.
    self.tolerance = float(method_param["tolerance"]) if "tolerance" in method_param else 1e-4
    self.store_cov = bool(method_param["store-covar"]) if "store-covar" in method_param else False
    self.solver = str(method_param["solver"]) if "solver" in method_param else "auto"
def __init__(self, method_param, run_param):
    """Prepare train/test features, encoded labels and KNN options."""
    self.info = "SHOGUN_KNN (" + str(method_param) + ")"

    # Assemble run model parameter.
    self.data = load_dataset(method_param["datasets"], ["csv"])
    self.data_split = split_dataset(self.data[0])

    self.train_feat = RealFeatures(self.data_split[0].T)

    # Encode the labels into {0,1,2,3,......,num_classes-1}
    self.train_labels, self.label_map = label_encoder(self.data_split[1])
    self.train_labels = MulticlassLabels(self.train_labels)

    if len(self.data) >= 2:
        self.test_feat = RealFeatures(self.data[1].T)

    # KNN hyper-parameters, each with its documented default.
    self.k = int(method_param["k"]) if "k" in method_param else 3
    self.distance = str(method_param["distance"]) if "distance" in method_param else "Euclidean"
    self.solver = str(method_param["solver"]) if "solver" in method_param else "Brute"
    self.degree = float(method_param["degree"]) if "degree" in method_param else 3
def evaluation_clustering(features=fea, ground_truth=gnd_raw, ncenters=10):
    """Cluster *features* into *ncenters* groups and score the result.

    Returns (ground-truth labels, predicted labels, clustering accuracy).
    """
    from shogun import ClusteringAccuracy, ClusteringMutualInformation
    from shogun import MulticlassLabels
    from shogun import Math

    # reproducable results
    Math.init_random(1)

    centroids = run_clustering(features, ncenters)
    predicted = assign_labels(features, centroids, ncenters)
    actual = MulticlassLabels(ground_truth)

    # Accuracy requires mapping cluster ids onto ground-truth classes first.
    acc_eval = ClusteringAccuracy()
    acc_eval.best_map(predicted, actual)
    accuracy = acc_eval.evaluate(predicted, actual)
    #print(('Clustering accuracy = %.4f' % accuracy))

    mi_eval = ClusteringMutualInformation()
    mutual_info = mi_eval.evaluate(predicted, actual)
    #print(('Clustering mutual information = %.4f' % mutual_info))

    # TODO mutual information does not work with serialization
    #return gnd, gnd_hat, accuracy, MIEval, mutual_info
    return actual, predicted, accuracy
def mkl_multiclass_1(fm_train_real, fm_test_real, label_train_multiclass, C):
    """Train multiclass MKL over 21 Gaussian sub-kernels and return test labels.

    NOTE(review): `num_threads` and `mkl_epsilon` are read from an enclosing
    scope (not parameters) — verify they are defined at the call site.
    """
    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    # One Gaussian sub-kernel per i in [-10, 10]; each sub-kernel gets its own
    # copy of the train/test feature objects appended in lockstep.
    # NOTE(review): GaussianKernel is constructed with the single argument
    # pow(2, i+1) — confirm which constructor overload this selects.
    for i in range(-10, 11):
        subkfeats_train = RealFeatures(fm_train_real)
        subkfeats_test = RealFeatures(fm_test_real)
        subkernel = GaussianKernel(pow(2, i + 1))
        feats_train.append_feature_obj(subkfeats_train)
        feats_test.append_feature_obj(subkfeats_test)
        kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)

    labels = MulticlassLabels(label_train_multiclass)

    mkl = MKLMulticlass(C, kernel, labels)
    mkl.set_epsilon(1e-2)
    mkl.parallel.set_num_threads(num_threads)
    mkl.set_mkl_epsilon(mkl_epsilon)
    mkl.set_mkl_norm(1)

    mkl.train()

    # Re-initialise the combined kernel against the test features to predict.
    kernel.init(feats_train, feats_test)

    out = mkl.apply().get_labels()
    return out
def metric_lmnn(train_fname=traindat, test_fname=testdat, label_train_fname=label_traindat, k=3):
    """Learn an LMNN metric, then classify the test set with KNN under it.

    Returns (trained LMNN instance, predicted test labels), or None when the
    required Shogun bindings are missing.
    """
    try:
        from shogun import RealFeatures, MulticlassLabels, LMNN, KNN, CSVFile
    except ImportError:
        return

    # wrap features and labels into Shogun objects
    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))
    train_labels = MulticlassLabels(CSVFile(label_train_fname))

    # learn the LMNN distance metric
    lmnn = LMNN(train_features, train_labels, k)
    lmnn.train()
    learned_distance = lmnn.get_distance()

    # perform classification with KNN using the learned metric
    knn = KNN(k, learned_distance, train_labels)
    knn.train()

    predictions = knn.apply(test_features).get_labels()

    return lmnn, predictions
def RunRandomForestShogun():
    """Time Shogun's random forest on the configured datasets.

    NOTE(review): closure — `self`, `options`, `Timer`, `Log`,
    `SplitTrainData` and `LoadDataset` come from the enclosing scope.
    Returns [time, predictions], or [-1] on failure inside the timed block.
    Raises on missing/unknown parameters before timing starts.
    """
    totalTimer = Timer()

    Log.Info("Loading dataset", self.verbose)
    trainData, labels = SplitTrainData(self.dataset)
    trainData = RealFeatures(trainData.T)
    labels = MulticlassLabels(labels)
    testData = RealFeatures(LoadDataset(self.dataset[1]).T)

    # 'num_trees' is mandatory; popping consumes it from the options dict.
    if "num_trees" in options:
        self.numTrees = int(options.pop("num_trees"))
    else:
        Log.Fatal("Required parameter 'num_trees' not specified!")
        raise Exception("missing parameter")

    self.form = 1
    if "dimensions" in options:
        self.form = int(options.pop("dimensions"))

    # Anything left in options at this point is unrecognised.
    if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

    try:
        with totalTimer:
            self.model = self.BuildModel(trainData, labels, options)
            # Run the Random Forest Classifier on the test dataset.
            self.predictions = self.model.apply_multiclass(testData).get_labels()
    except Exception as e:
        # Benchmark convention: failures are reported as [-1].
        return [-1]

    time = totalTimer.ElapsedTime()
    return [time, self.predictions]
def RunDTCShogun():
    """Time Shogun's decision-tree (CARTree) classifier on the datasets.

    NOTE(review): closure — `self`, `options`, `Timer`, `Log`,
    `SplitTrainData` and `LoadDataset` come from the enclosing scope.
    Returns [time, predictions] when a test set exists, [time] otherwise,
    or [-1] on failure inside the timed block.
    """
    totalTimer = Timer()

    Log.Info("Loading dataset", self.verbose)
    trainData, labels = SplitTrainData(self.dataset)
    trainData = RealFeatures(trainData.T)
    labels = MulticlassLabels(labels)
    testData = RealFeatures(LoadDataset(self.dataset[1]).T)

    # Any remaining option is unrecognised for this method.
    if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

    try:
        with totalTimer:
            self.model = self.BuildModel(trainData, labels, options)
            # Run the CARTree Classifier on the test dataset.
            self.predictions = self.model.apply_multiclass(testData).get_labels()
    except Exception as e:
        # Benchmark convention: failures are reported as [-1].
        return [-1]

    time = totalTimer.ElapsedTime()
    if len(self.dataset) > 1:
        return [time, self.predictions]

    return [time]
def multiclass_c45classifiertree(train=traindat, test=testdat, labels=label_traindat, ft=feattypes):
    """Train, prune and apply a C4.5 decision tree on CSV data.

    The training data is split 2/3 (tree formation) : 1/3 (pruning) via a
    random permutation. Returns (tree, test predictions, certainty vector),
    or None when Shogun/numpy are unavailable.
    """
    try:
        from shogun import RealFeatures, MulticlassLabels, CSVFile, C45ClassifierTree
        from numpy import random, int32
    except ImportError:
        print("Could not import Shogun and/or numpy modules")
        return

    # wrap features and labels into Shogun objects
    feats_train = RealFeatures(CSVFile(train))
    feats_test = RealFeatures(CSVFile(test))
    train_labels = MulticlassLabels(CSVFile(labels))

    # divide train dataset into training and validation subsets in the ratio 2/3 to 1/3
    subset = int32(random.permutation(feats_train.get_num_vectors()))
    # Fix: the original slices started at index 1 (and 1 + size/3), silently
    # dropping two examples from the permutation; use the full range instead.
    split = int(subset.size / 3)
    vsubset = subset[:split]
    trsubset = subset[split:]

    # C4.5 Tree formation using training subset
    train_labels.add_subset(trsubset)
    feats_train.add_subset(trsubset)

    c = C45ClassifierTree()
    c.set_labels(train_labels)
    c.set_feature_types(ft)
    c.train(feats_train)

    train_labels.remove_subset()
    feats_train.remove_subset()

    # prune tree using validation subset
    train_labels.add_subset(vsubset)
    feats_train.add_subset(vsubset)

    c.prune_tree(feats_train, train_labels)

    train_labels.remove_subset()
    feats_train.remove_subset()

    # Classify test data
    output = c.apply_multiclass(feats_test).get_labels()
    output_certainty = c.get_certainty_vector()

    return c, output, output_certainty
def BuildModel(self, data, responses):
    """Create and train a multiclass logistic-regression classifier.

    Uses self.z as the regularisation parameter and honours an optional
    self.max_iter iteration cap.
    """
    classifier = MulticlassLogisticRegression(
        self.z, RealFeatures(data.T), MulticlassLabels(responses))
    if self.max_iter is not None:
        classifier.set_max_iter(self.max_iter)
    classifier.train()
    return classifier
def RunLDAShogun():
    """Time Shogun's MCLDA on the configured datasets.

    NOTE(review): closure — `self`, `options`, `Timer`, `Log`,
    `SplitTrainData`, `LoadDataset`, `MCLDA` and `RealFeatures` come from
    the enclosing scope. Returns the elapsed time, or -1 on failure.
    """
    totalTimer = Timer()

    # Load input dataset.
    # If the dataset contains two files then the second file is the test file.
    try:
        if len(self.dataset) > 1:
            testSet = LoadDataset(self.dataset[1])

        # Use the last row of the training set as the responses.
        trainSet, trainLabels = SplitTrainData(self.dataset)

        # if the labels are not in {0,1,2,...,num_classes-1}, map them to this set and store the mapping
        # shogun's MCLDA class requires the labels to be in {0,1,2,...,num_classes-1}
        distinctLabels = list(set(trainLabels))
        mapping = {}
        reverseMapping = {}
        for idx, label in enumerate(distinctLabels):
            mapping[label] = idx
            reverseMapping[idx] = label
        for i in range(len(trainLabels)):
            trainLabels[i] = mapping[trainLabels[i]]

        trainFeat = RealFeatures(trainSet.T)
        trainLabels = MulticlassLabels(trainLabels)

        # Gather optional parameters.
        if "tolerance" in options:
            self.tolerance = float(options.pop("tolerance"))
        if "store" in options:
            self.store = bool(options.pop("store"))

        if (len(options) > 0):
            Log.Fatal("Unknown parameters: " + str(options))
            raise Exception("unknown parameters")

        with totalTimer:
            self.model = MCLDA(trainFeat, trainLabels, self.tolerance, self.store)
            self.model.train()

            # Fix: the original guard was `len(self.dataset) > 0`, which hit a
            # NameError on `testSet` (only bound when a test file exists) and
            # made single-dataset runs fail with -1. Predict only when a test
            # set was actually loaded.
            if (len(self.dataset) > 1):
                self.predictions = self.model.apply_multiclass(
                    RealFeatures(testSet.T))
                self.predictions = self.predictions.get_labels()
                # reverse map the predicted labels to actual labels
                for i in range(len(self.predictions)):
                    self.predictions[i] = reverseMapping[self.predictions[i]]

    except Exception as e:
        Log.Info("Exception: " + str(e))
        return -1

    time = totalTimer.ElapsedTime()
    return time
def _svm_new(self, kernel_width, c, epsilon):
    """Build a GMNPSVM with a Gaussian kernel over the loaded training data.

    Raises Exception when no training data (self.x / self.y) has been loaded.
    """
    # Fix: use identity comparison with None instead of `== None` — with
    # numpy arrays `==` broadcasts element-wise and does not test "unset".
    if self.x is None or self.y is None:
        raise Exception("No training data loaded.")

    x = RealFeatures(self.x)
    y = MulticlassLabels(self.y)

    self.svm = GMNPSVM(c, GaussianKernel(x, x, kernel_width), y)
    self.svm.set_epsilon(epsilon)
def RunAllKnnShogun():
    """Time Shogun's all-k-nearest-neighbours on the configured dataset(s).

    NOTE(review): closure — `self`, `options`, `np`, `Timer`, `Log`,
    `EuclideanDistance` and `SKNN` come from the enclosing scope.
    Returns the elapsed time, or -1 on failure.
    """
    totalTimer = Timer()

    # Load input dataset.
    # If the dataset contains two files then the second file is the query
    # file.
    try:
        Log.Info("Loading dataset", self.verbose)
        if len(self.dataset) == 2:
            referenceData = np.genfromtxt(self.dataset[0], delimiter=',')
            queryData = np.genfromtxt(self.dataset[1], delimiter=',')
            # Fix: the original read `RealFeatures(queryFeat.T)`, referencing
            # the not-yet-defined `queryFeat`; the loaded queryData is meant.
            queryFeat = RealFeatures(queryData.T)
        else:
            referenceData = np.genfromtxt(self.dataset, delimiter=',')

        # Labels are the last row of the dataset.
        labels = MulticlassLabels(
            referenceData[:, (referenceData.shape[1] - 1)])
        referenceData = referenceData[:, :-1]

        with totalTimer:
            # Get all the parameters.
            if "k" in options:
                k = int(options.pop("k"))
                if (k < 1 or k > referenceData.shape[0]):
                    # Fix: k is an int, not a regex match — the original
                    # `k.group(1)` raised AttributeError in the error path.
                    Log.Fatal("Invalid k: " + str(k) +
                              "; must be greater than 0" +
                              " and less or equal than " +
                              str(referenceData.shape[0]))
                    return -1
            else:
                Log.Fatal(
                    "Required option: Number of furthest neighbors to find."
                )
                return -1

            if len(options) > 0:
                Log.Fatal("Unknown parameters: " + str(options))
                raise Exception("unknown parameters")

            referenceFeat = RealFeatures(referenceData.T)
            distance = EuclideanDistance(referenceFeat, referenceFeat)

            # Perform All K-Nearest-Neighbors.
            model = SKNN(k, distance, labels)
            model.train()

            if len(self.dataset) == 2:
                out = model.apply(queryFeat).get_labels()
            else:
                out = model.apply(referenceFeat).get_labels()
    except Exception as e:
        return -1

    return totalTimer.ElapsedTime()
def assign_labels(data, centroids, ncenters):
    """Assign each point in *data* to its nearest centroid via 1-NN.

    The centroids act as the 1-NN "training set" labelled 0..ncenters-1;
    returns the resulting label object for *data*.
    """
    from shogun import EuclideanDistance
    from shogun import RealFeatures, MulticlassLabels
    from shogun import KNN
    from numpy import arange

    centroid_labels = MulticlassLabels(arange(0., ncenters))
    points = RealFeatures(data)
    centers = RealFeatures(centroids)

    metric = EuclideanDistance(centers, centers)
    nearest = KNN(1, metric, centroid_labels)
    nearest.train()
    return nearest.apply(points)
def metric_lmnn_statistics(
        k=3,
        fname_features='../../data/fm_train_multiclass_digits.dat.gz',
        fname_labels='../../data/label_train_multiclass_digits.dat'):
    """Train LMNN on the multiclass digits data set and plot the objective.

    Parameters:
        k: number of target neighbours used by LMNN.
        fname_features: path to the gzipped feature-matrix file.
        fname_labels: path to the label file (CSV).
    """
    try:
        from shogun import LMNN, CSVFile, RealFeatures, MulticlassLabels, MSG_DEBUG
        import matplotlib.pyplot as pyplot
    except ImportError:
        # Fix: Python 2 `print` statements replaced with print() calls so the
        # function is also valid Python 3 (the rest of the file uses print()).
        print('Error importing shogun or other required modules. Please, verify their installation.')
        return

    # load_compressed_features is expected to come from the enclosing module.
    features = RealFeatures(load_compressed_features(fname_features).T)
    labels = MulticlassLabels(CSVFile(fname_labels))

    # print('number of examples = %d' % features.get_num_vectors())
    # print('number of features = %d' % features.get_num_features())

    assert (features.get_num_vectors() == labels.get_num_labels())

    # train LMNN
    lmnn = LMNN(features, labels, k)
    lmnn.set_correction(100)
    # lmnn.io.set_loglevel(MSG_DEBUG)
    print('Training LMNN, this will take about two minutes...')
    lmnn.train()
    print('Training done!')

    # plot objective obtained during training
    statistics = lmnn.get_statistics()

    pyplot.plot(statistics.obj.get())
    pyplot.grid(True)
    pyplot.xlabel('Iterations')
    pyplot.ylabel('LMNN objective')
    pyplot.title(
        'LMNN objective during training for the multiclass digits data set')
    pyplot.show()
def classifier_multiclassliblinear(fm_train_real, fm_test_real, label_train_multiclass, C):
    """Train MulticlassLibLinear with regularisation C and return the
    predicted labels for the test features.

    NOTE: num_threads is expected to be defined in an enclosing scope.
    """
    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)
    train_labels = MulticlassLabels(label_train_multiclass)

    machine = MulticlassLibLinear(C, train_feats, train_labels)
    machine.parallel.set_num_threads(num_threads)
    machine.train()

    return machine.apply(test_feats).get_labels()
def classifier_multiclassliblinear (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,label_test_multiclass=label_testdat,width=2.1,C=1,epsilon=1e-5):
    """Train MulticlassLibLinear and predict the test set.

    When test labels are supplied, also computes and prints the accuracy.
    Returns the predicted label vector.
    """
    from shogun import RealFeatures, MulticlassLabels
    from shogun import MulticlassLibLinear

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)
    train_labels = MulticlassLabels(label_train_multiclass)

    machine = MulticlassLibLinear(C, train_feats, train_labels)
    machine.train()

    prediction = machine.apply(test_feats)
    out = prediction.get_labels()

    # Optionally score against the provided test labels.
    if label_test_multiclass is not None:
        from shogun import MulticlassAccuracy
        labels_test = MulticlassLabels(label_test_multiclass)
        acc = MulticlassAccuracy().evaluate(prediction, labels_test)
        print('Accuracy = %.4f' % acc)

    return out
def mkl_multiclass(fm_train_real, fm_test_real, label_train_multiclass, width,
                   C, epsilon, num_threads, mkl_epsilon, mkl_norm):
    """Train multiclass MKL over Gaussian, linear and polynomial sub-kernels.

    Parameters cover the Gaussian width, SVM regularisation C, solver
    epsilons, thread count and MKL norm. Returns the predicted test labels.
    """
    from shogun import CombinedFeatures, RealFeatures, MulticlassLabels
    from shogun import CombinedKernel, GaussianKernel, LinearKernel, PolyKernel
    from shogun import MKLMulticlass

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    def _append_subkernel(subkernel):
        # Each sub-kernel gets its own copy of the train/test features,
        # appended in lockstep with the kernel (was pasted three times).
        feats_train.append_feature_obj(RealFeatures(fm_train_real))
        feats_test.append_feature_obj(RealFeatures(fm_test_real))
        kernel.append_kernel(subkernel)

    _append_subkernel(GaussianKernel(10, width))
    _append_subkernel(LinearKernel())
    _append_subkernel(PolyKernel(10, 2))

    kernel.init(feats_train, feats_train)

    labels = MulticlassLabels(label_train_multiclass)

    mkl = MKLMulticlass(C, kernel, labels)
    mkl.set_epsilon(epsilon)
    mkl.parallel.set_num_threads(num_threads)
    mkl.set_mkl_epsilon(mkl_epsilon)
    mkl.set_mkl_norm(mkl_norm)

    mkl.train()

    # Re-initialise the combined kernel against the test features to predict.
    kernel.init(feats_train, feats_test)

    out = mkl.apply().get_labels()
    return out
def evaluation_clustering_simple(n_data=100, sqrt_num_blobs=4, distance=5):
    """Cluster streamed Gaussian blobs and score against approximate truth.

    NOTE(review): `array`, `run_clustering` and `assign_labels` come from
    the enclosing module scope (numpy star-import and sibling helpers).
    Returns (ground-truth labels, accuracy, mutual information).
    """
    from shogun import ClusteringAccuracy, ClusteringMutualInformation
    from shogun import MulticlassLabels, GaussianBlobsDataGenerator
    from shogun import Math

    # reproducable results
    Math.init_random(1)

    # produce sone Gaussian blobs to cluster
    ncenters = sqrt_num_blobs**2
    stretch = 1
    angle = 1
    gen = GaussianBlobsDataGenerator(sqrt_num_blobs, distance, stretch, angle)
    features = gen.get_streamed_features(n_data)
    X = features.get_feature_matrix()

    # compute approximate "ground truth" labels via taking the closest blob mean
    coords = array(range(0, sqrt_num_blobs * distance, distance))
    idx_0 = [abs(coords - x).argmin() for x in X[0]]
    idx_1 = [abs(coords - x).argmin() for x in X[1]]
    ground_truth = array(
        [idx_0[i] * sqrt_num_blobs + idx_1[i] for i in range(n_data)],
        dtype="float64")

    #for label in unique(ground_truth):
    #	indices=ground_truth==label
    #	plot(X[0][indices], X[1][indices], 'o')
    #show()

    centroids = run_clustering(features, ncenters)
    gnd_hat = assign_labels(features, centroids, ncenters)
    gnd = MulticlassLabels(ground_truth)

    AccuracyEval = ClusteringAccuracy()
    AccuracyEval.best_map(gnd_hat, gnd)
    accuracy = AccuracyEval.evaluate(gnd_hat, gnd)

    # in this case we know that the clustering has to be very good
    #print(('Clustering accuracy = %.4f' % accuracy))
    # NOTE(review): sanity check stripped under `python -O`; intentional here.
    assert (accuracy > 0.8)

    MIEval = ClusteringMutualInformation()
    mutual_info = MIEval.evaluate(gnd_hat, gnd)
    #print(('Clustering mutual information = %.4f' % mutual_info))

    return gnd, accuracy, mutual_info
def __init__(self, method_param, run_param):
    """Prepare train/test features and encoded labels for the GP classifier."""
    self.info = "SHOGUN_GPC (" + str(method_param) + ")"

    #Assemble run model parameter.
    self.data = load_dataset(method_param["datasets"], ["csv"])
    self.data_split = split_dataset(self.data[0])

    self.train_features = RealFeatures(self.data_split[0].T)

    # Encode the labels into {0,1,2,3,......,num_classes-1}
    self.train_labels, self.label_map = label_encoder(self.data_split[1])
    self.train_labels = MulticlassLabels(self.train_labels)

    if len(self.data) >= 2:
        self.test_features = RealFeatures(self.data[1].T)

    # Keep the raw parameter dict for later configuration.
    self.method_param = method_param
def classifier_larank(num_vec, num_class, distance, C=0.9, num_threads=1, num_iter=5, seed=1):
    """Train a LaRank SVM on synthetic, pairwise linearly separable data.

    Returns (prediction object, trained svm, predicted label vector).
    """
    from shogun import RealFeatures, MulticlassLabels
    from shogun import GaussianKernel
    from shogun import LaRank
    from shogun import Math_init_random

    # reproducible results (Shogun RNG first, then numpy — same order as before)
    Math_init_random(seed)
    random.seed(seed)

    # generate some training data where each class pair is linearly separable
    label_train = array([mod(x, num_class) for x in range(num_vec)],
                        dtype="float64")
    label_test = array([mod(x, num_class) for x in range(num_vec)],
                       dtype="float64")
    fm_train = array(random.randn(num_class, num_vec))
    fm_test = array(random.randn(num_class, num_vec))
    # Shift the coordinate matching each vector's class to separate classes.
    for col, lab in enumerate(label_train):
        fm_train[int(lab), col] += distance
        fm_test[int(label_test[col]), col] += distance

    feats_train = RealFeatures(fm_train)
    feats_test = RealFeatures(fm_test)

    width = 2.1
    kernel = GaussianKernel(feats_train, feats_train, width)

    epsilon = 1e-5
    labels = MulticlassLabels(label_train)

    svm = LaRank(C, kernel, labels)
    #svm.set_tau(1e-3)
    svm.set_batch_mode(False)
    #svm.io.enable_progress()
    svm.set_epsilon(epsilon)
    svm.train()

    out = svm.apply(feats_test).get_labels()
    predictions = svm.apply()
    return predictions, svm, predictions.get_labels()
def classifier_gmnpsvm(fm_train_real, fm_test_real, label_train_multiclass, C):
    """Train a GMNPSVM with a Gaussian kernel and return test predictions.

    NOTE(review): `width`, `epsilon` and `num_threads` are read from an
    enclosing scope (not parameters) — verify they exist at the call site.
    """
    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    kernel = GaussianKernel(feats_train, feats_train, width)
    # Removed dead benchmarking leftovers: the original imported `time`,
    # timed an unused `kernel.get_kernel_matrix()` call and discarded all
    # three locals — computing the full kernel matrix for nothing.

    labels = MulticlassLabels(label_train_multiclass)

    svm = GMNPSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train(feats_train)

    out = svm.apply(feats_test).get_labels()
    return out
def main(actual, predicted):
    """Evaluate multiclass SVM predictions against an SVMLight ground truth.

    Parameters:
        actual: path to the SVMLight file with true labels/features.
        predicted: path to a text file with one predicted label per line.
    Logs accuracy and prints the confusion matrix.
    """
    LOGGER.info("SVM Multiclass evaluator")

    # Load SVMLight dataset
    feats, labels = get_features_and_labels(LibSVMFile(actual))

    # Load predicted labels
    with open(predicted, 'r') as f:
        predicted_labels_arr = np.array([float(l) for l in f])
        predicted_labels = MulticlassLabels(predicted_labels_arr)

    # Evaluate accuracy
    multiclass_measures = MulticlassAccuracy()
    LOGGER.info("Accuracy = %s" % multiclass_measures.evaluate(
        labels, predicted_labels))
    LOGGER.info("Confusion matrix:")
    res = multiclass_measures.get_confusion_matrix(labels, predicted_labels)
    # Fix: Python 2 `print res` statement replaced with a print() call so the
    # function parses under Python 3 (the rest of the file uses print()).
    print(res)
def RunSVMShogun():
    """Time Shogun's SVM on the configured datasets.

    NOTE(review): closure — `self`, `options`, `Timer`, `Log`,
    `SplitTrainData` and `LoadDataset` come from the enclosing scope.
    Returns the elapsed time, or -1 on failure inside the timed block.
    """
    totalTimer = Timer()

    Log.Info("Loading dataset", self.verbose)
    trainData, labels = SplitTrainData(self.dataset)
    trainData = RealFeatures(trainData.T)
    labels = MulticlassLabels(labels)
    testData = RealFeatures(LoadDataset(self.dataset[1]).T)

    try:
        with totalTimer:
            self.model = self.BuildModel(trainData, labels, options)
            # Run Support vector machines on the test dataset.
            self.model.apply(testData).get_labels()
    except Exception as e:
        # Benchmark convention: failures are reported as -1.
        return -1

    return totalTimer.ElapsedTime()
def classifier_multiclassmachine (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,width=2.1,C=1,epsilon=1e-5):
    """Train a one-vs-rest kernel multiclass machine built on LibSVM.

    Returns the predicted labels for the test features.
    """
    from shogun import RealFeatures, MulticlassLabels
    from shogun import GaussianKernel
    from shogun import LibSVM, KernelMulticlassMachine, MulticlassOneVsRestStrategy

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)
    kernel = GaussianKernel(train_feats, train_feats, width)
    labels = MulticlassLabels(label_train_multiclass)

    # Base binary machine used by the one-vs-rest wrapper.
    base_svm = LibSVM()
    base_svm.set_epsilon(epsilon)
    #print labels.get_labels()

    mc_machine = KernelMulticlassMachine(
        MulticlassOneVsRestStrategy(), kernel, base_svm, labels)
    mc_machine.train()

    # Re-initialise the kernel against the test features before applying.
    kernel.init(train_feats, test_feats)
    return mc_machine.apply().get_labels()
def evaluation_cross_validation_multiclass_storage (traindat=traindat, label_traindat=label_traindat):
    """Cross-validate a multiclass MKL machine and inspect per-fold results.

    Runs 3-run, 3-fold stratified cross-validation with a parameter
    observer attached, then prints the first fold's ROC and F1 for class 0.
    """
    from shogun import CrossValidation, CrossValidationResult
    from shogun import ParameterObserverCV
    from shogun import MulticlassAccuracy, F1Measure
    from shogun import StratifiedCrossValidationSplitting
    from shogun import MulticlassLabels
    from shogun import RealFeatures, CombinedFeatures
    from shogun import GaussianKernel, CombinedKernel
    from shogun import MKLMulticlass
    from shogun import Statistics, MSG_DEBUG, Math
    from shogun import ROCEvaluation

    Math.init_random(1)

    # training data, combined features all on same data
    features = RealFeatures(traindat)
    comb_features = CombinedFeatures()
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    labels = MulticlassLabels(label_traindat)

    # kernel, different Gaussians combined
    kernel = CombinedKernel()
    kernel.append_kernel(GaussianKernel(10, 0.1))
    kernel.append_kernel(GaussianKernel(10, 1))
    kernel.append_kernel(GaussianKernel(10, 2))

    # create mkl using libsvm, due to a mem-bug, interleaved is not possible
    svm = MKLMulticlass(1.0, kernel, labels)
    svm.set_kernel(kernel)

    # splitting strategy for 5 fold cross-validation (for classification its better
    # to use "StratifiedCrossValidation", but the standard
    # "StratifiedCrossValidationSplitting" is also available
    splitting_strategy = StratifiedCrossValidationSplitting(labels, 3)

    # evaluation method
    evaluation_criterium = MulticlassAccuracy()

    # cross-validation instance
    cross_validation = CrossValidation(svm, comb_features, labels,
                                       splitting_strategy,
                                       evaluation_criterium)
    cross_validation.set_autolock(False)

    # append cross validation parameter observer
    multiclass_storage = ParameterObserverCV()
    cross_validation.subscribe_to_parameters(multiclass_storage)
    cross_validation.set_num_runs(3)

    # perform cross-validation
    result = cross_validation.evaluate()

    # get first observation and first fold
    obs = multiclass_storage.get_observations()[0]
    fold = obs.get_folds_results()[0]

    # get fold ROC for first class
    eval_ROC = ROCEvaluation()
    pred_lab_binary = MulticlassLabels.obtain_from_generic(fold.get_test_result()).get_binary_for_class(0)
    true_lab_binary = MulticlassLabels.obtain_from_generic(fold.get_test_true_result()).get_binary_for_class(0)
    eval_ROC.evaluate(pred_lab_binary, true_lab_binary)
    # Fix: Python 2 `print` statements replaced with print() calls so the
    # function parses under Python 3 (the rest of the file uses print()).
    print(eval_ROC.get_ROC())

    # get fold evaluation result
    acc_measure = F1Measure()
    print(acc_measure.evaluate(pred_lab_binary, true_lab_binary))
def features_io (fm_train_real, label_train_twoclass):
    """Round-trip Shogun features and labels through several file formats.

    Saves sparse/dense features and multiclass labels to binary, LibSVM,
    CSV and HDF5 temp files, then loads them back into fresh objects.
    Returns (feats, feats2, lab, lab2) — the originals and the reloaded
    copies — for comparison by the caller.
    """
    import numpy
    from shogun import SparseRealFeatures, RealFeatures, MulticlassLabels
    from shogun import GaussianKernel
    from shogun import LibSVMFile, CSVFile, BinaryFile, HDF5File
    from tempfile import NamedTemporaryFile

    # --- sparse features: save to binary + LibSVM, reload both ---
    feats = SparseRealFeatures(fm_train_real)
    feats2 = SparseRealFeatures()

    tmp_fm_train_sparsereal_bin = NamedTemporaryFile(suffix='sparsereal.bin')
    f = BinaryFile(tmp_fm_train_sparsereal_bin.name, "w")
    feats.save(f)

    tmp_fm_train_sparsereal_ascii = NamedTemporaryFile(suffix='sparsereal.ascii')
    f = LibSVMFile(tmp_fm_train_sparsereal_ascii.name, "w")
    feats.save(f)

    f = BinaryFile(tmp_fm_train_sparsereal_bin.name)
    feats2.load(f)

    f = LibSVMFile(tmp_fm_train_sparsereal_ascii.name)
    feats2.load(f)

    # --- dense features: save to binary + HDF5 + CSV, reload binary and CSV ---
    feats = RealFeatures(fm_train_real)
    feats2 = RealFeatures()

    tmp_fm_train_real_bin = NamedTemporaryFile(suffix='real.bin')
    f = BinaryFile(tmp_fm_train_real_bin.name, "w")
    feats.save(f)

    tmp_fm_train_real_h5 = NamedTemporaryFile(suffix='real.h5')
    f = HDF5File(tmp_fm_train_real_h5.name, "w", "/data/doubles")
    feats.save(f)

    tmp_fm_train_real_ascii = NamedTemporaryFile(suffix='real.ascii')
    f = CSVFile(tmp_fm_train_real_ascii.name, "w")
    feats.save(f)

    f = BinaryFile(tmp_fm_train_real_bin.name)
    feats2.load(f)
    #print("diff binary", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten())))

    f = CSVFile(tmp_fm_train_real_ascii.name)
    feats2.load(f)
    #print("diff ascii", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten())))

    # --- labels: save to CSV + binary + HDF5, reload all three ---
    # NOTE(review): a fixed 4-class label vector is used here, not the
    # label_train_twoclass parameter — appears intentional for the demo.
    lab = MulticlassLabels(numpy.array([0.0, 1.0, 2.0, 3.0]))
    lab2 = MulticlassLabels()

    tmp_label_train_twoclass_ascii = NamedTemporaryFile(suffix='twoclass.ascii')
    f = CSVFile(tmp_label_train_twoclass_ascii.name, "w")
    lab.save(f)

    tmp_label_train_twoclass_bin = NamedTemporaryFile(suffix='twoclass.bin')
    f = BinaryFile(tmp_label_train_twoclass_bin.name, "w")
    lab.save(f)

    tmp_label_train_real_h5 = NamedTemporaryFile(suffix='real.h5')
    f = HDF5File(tmp_label_train_real_h5.name, "w", "/data/labels")
    lab.save(f)

    f = CSVFile(tmp_label_train_twoclass_ascii.name)
    lab2.load(f)

    f = BinaryFile(tmp_label_train_twoclass_bin.name)
    lab2.load(f)

    # reload the dense features and labels from HDF5 as a final check
    f = HDF5File(tmp_fm_train_real_h5.name, "r", "/data/doubles")
    feats2.load(f)
    #print(feats2.get_feature_matrix())

    f = HDF5File(tmp_label_train_real_h5.name, "r", "/data/labels")
    lab2.load(f)
    #print(lab2.get_labels())

    return feats, feats2, lab, lab2