def converter_localitypreservingprojections_modular(data_fname, k):
    """Run Locality Preserving Projections on features loaded from a CSV file.

    The converter is configured with target dimension 1, ``set_k(k)`` and
    ``set_tau(2.0)`` and then applied to the loaded features.  The value
    handed back is the loaded features object itself; whatever ``apply``
    returns is not captured.

    Args:
        data_fname: path given to ``CSVFile``.
        k: value forwarded to the converter's ``set_k``.

    Returns:
        The ``RealFeatures`` built from ``data_fname``, or ``None`` (after
        printing a notice) when an ``ImportError`` is raised.
    """
    try:
        from modshogun import (
            RealFeatures,
            LocalityPreservingProjections,
            CSVFile,
        )

        csv_source = CSVFile(data_fname)
        feats = RealFeatures(csv_source)

        lpp = LocalityPreservingProjections()
        lpp.set_target_dim(1)
        lpp.set_k(k)
        lpp.set_tau(2.0)
        lpp.apply(feats)
    except ImportError:
        print('No Eigen3 available')
        return None
    return feats
def converter_localtangentspacealignment_modular(data_fname, k):
    """Run Local Tangent Space Alignment on features loaded from a CSV file.

    The converter is set to target dimension 1 with ``set_k(k)`` and applied
    to the loaded features.  The loaded features object is returned; the
    return value of ``apply`` is not kept.

    Args:
        data_fname: path given to ``CSVFile``.
        k: value forwarded to the converter's ``set_k``.

    Returns:
        The ``RealFeatures`` built from ``data_fname``, or ``None`` (after
        printing a notice) when an ``ImportError`` is raised.
    """
    try:
        from modshogun import (
            RealFeatures,
            LocalTangentSpaceAlignment,
            CSVFile,
        )

        feats = RealFeatures(CSVFile(data_fname))

        ltsa = LocalTangentSpaceAlignment()
        ltsa.set_target_dim(1)
        ltsa.set_k(k)
        ltsa.apply(feats)
    except ImportError:
        print('No Eigen3 available')
        return None
    return feats
def classifier_libsvm_modular(train_fname=traindat, test_fname=testdat,
                              label_fname=label_traindat, width=2.1, C=1,
                              epsilon=1e-5):
    """Train a LibSVM with a Gaussian kernel and classify the test set.

    Args:
        train_fname: CSV path of the training features.
        test_fname: CSV path of the test features.
        label_fname: CSV path of the binary training labels.
        width: third argument of the ``GaussianKernel`` constructor.
        C: first argument of the ``LibSVM`` constructor.
        epsilon: value forwarded to ``svm.set_epsilon``.

    Returns:
        A tuple ``(predictions, svm, labels)`` where ``labels`` is
        ``predictions.get_labels()``.
    """
    from modshogun import (
        RealFeatures,
        BinaryLabels,
        GaussianKernel,
        LibSVM,
        CSVFile,
    )

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))
    train_labels = BinaryLabels(CSVFile(label_fname))

    gauss = GaussianKernel(train_feats, train_feats, width)

    svm = LibSVM(C, gauss, train_labels)
    svm.set_epsilon(epsilon)
    svm.train()

    # The accessors are still called, as in the original example, even
    # though their results are not part of the return value.
    _support_vectors = svm.get_support_vectors()
    _alphas = svm.get_alphas()

    predictions = svm.apply(test_feats)
    predicted_labels = predictions.get_labels()
    return predictions, svm, predicted_labels
def evaluation_cross_validation_regression(train_fname=traindat, label_fname=label_traindat, width=0.8, tau=1e-6):
    """Cross-validate kernel ridge regression with a Gaussian kernel.

    Runs 5-fold cross-validation, repeated 10 times, scoring with mean
    squared error.

    Args:
        train_fname: CSV path of the training features.
        label_fname: CSV path of the regression labels.
        width: width of the Gaussian kernel.  It is forwarded to the kernel
            constructor so that the argument actually takes effect.
        tau: first argument of the ``KernelRidgeRegression`` constructor.

    Returns:
        The object returned by ``CrossValidation.evaluate()``.
    """
    from modshogun import CrossValidation, CrossValidationResult
    from modshogun import MeanSquaredError, CrossValidationSplitting
    from modshogun import RegressionLabels, RealFeatures
    from modshogun import GaussianKernel, KernelRidgeRegression, CSVFile

    # training data
    features = RealFeatures(CSVFile(train_fname))
    labels = RegressionLabels(CSVFile(label_fname))

    # kernel and predictor; the kernel is built with the (size, width)
    # constructor so the requested width is honoured.
    # NOTE(review): 10 is assumed to equal the default kernel cache size.
    kernel = GaussianKernel(10, width)
    predictor = KernelRidgeRegression(tau, kernel, labels)

    # splitting strategy for 5 fold cross-validation (for classification it
    # is better to use "StratifiedCrossValidation", but here the standard
    # cross-validation splitting is used)
    splitting_strategy = CrossValidationSplitting(labels, 5)

    # evaluation method
    evaluation_criterium = MeanSquaredError()

    # cross-validation instance
    cross_validation = CrossValidation(predictor, features, labels,
                                       splitting_strategy,
                                       evaluation_criterium)

    # (optional) repeat x-val 10 times
    cross_validation.set_num_runs(10)

    # (optional) tell machine to precompute kernel matrix. speeds up.
    # may not work
    predictor.data_lock(labels, features)

    # perform cross-validation and hand the result back to the caller
    result = cross_validation.evaluate()
    return result
def multiclass_chaidtree_modular(train=traindat, test=testdat, labels=label_traindat, ft=feattypes):
    """Train a CHAID tree on multiclass data and classify the test set.

    Args:
        train: CSV path of the training features.
        test: CSV path of the test features.
        labels: CSV path of the multiclass training labels.
        ft: feature types, passed as the second ``CHAIDTree`` constructor
            argument.

    Returns:
        A tuple ``(tree, output)`` where ``output`` is the label vector
        predicted for the test features, or ``None`` when the Shogun
        modules cannot be imported.
    """
    try:
        from modshogun import RealFeatures, MulticlassLabels, CSVFile, CHAIDTree
    except ImportError:
        print("Could not import Shogun modules")
        return

    # wrap features and labels into Shogun objects
    feats_train = RealFeatures(CSVFile(train))
    feats_test = RealFeatures(CSVFile(test))
    train_labels = MulticlassLabels(CSVFile(labels))

    # CHAID Tree formation with nominal dependent variable.  The feature
    # types come from the ``ft`` argument so callers can override them.
    c = CHAIDTree(0, ft, 10)
    c.set_labels(train_labels)
    c.train(feats_train)

    # Classify test data
    output = c.apply_multiclass(feats_test).get_labels()

    return c, output
def converter_diffusionmaps_modular(data_fname, t):
    """Run Diffusion Maps on features loaded from a CSV file.

    The converter uses target dimension 1, a ``GaussianKernel(10, 10.0)``
    and ``set_t(t)``.  The loaded features object is returned; the return
    value of ``apply`` is not kept.

    Args:
        data_fname: path given to ``CSVFile``.
        t: value forwarded to the converter's ``set_t``.

    Returns:
        The ``RealFeatures`` built from ``data_fname``, or ``None`` (after
        printing a notice) when an ``ImportError`` is raised.
    """
    try:
        from modshogun import (
            RealFeatures,
            DiffusionMaps,
            GaussianKernel,
            CSVFile,
        )

        feats = RealFeatures(CSVFile(data_fname))

        dmaps = DiffusionMaps()
        dmaps.set_target_dim(1)
        gauss = GaussianKernel(10, 10.0)
        dmaps.set_kernel(gauss)
        dmaps.set_t(t)
        dmaps.apply(feats)
    except ImportError:
        print('No Eigen3 available')
        return None
    return feats
def multiclass_cartree_modular(train=traindat, test=testdat, labels=label_traindat, ft=feattypes):
    """Train a CART tree on multiclass data and classify the test set.

    Args:
        train: CSV path of the training features.
        test: CSV path of the test features.
        labels: CSV path of the multiclass training labels.
        ft: feature types, passed as the first ``CARTree`` constructor
            argument.

    Returns:
        A tuple ``(tree, output)`` where ``output`` is the label vector
        predicted for the test features, or ``None`` when the Shogun
        modules cannot be imported.
    """
    try:
        from modshogun import (
            RealFeatures,
            MulticlassLabels,
            CSVFile,
            CARTree,
            PT_MULTICLASS,
        )
    except ImportError:
        print("Could not import Shogun modules")
        return None

    # Load everything from disk into Shogun containers.
    training_feats = RealFeatures(CSVFile(train))
    testing_feats = RealFeatures(CSVFile(test))
    training_labels = MulticlassLabels(CSVFile(labels))

    # Build the tree with the constructor arguments (ft, PT_MULTICLASS, 5,
    # True) — described in the original example as 5 fold cross-validation
    # pruning.
    tree = CARTree(ft, PT_MULTICLASS, 5, True)
    tree.set_labels(training_labels)
    tree.train(training_feats)

    # Predict on the held-out features.
    predicted = tree.apply_multiclass(testing_feats)
    return tree, predicted.get_labels()
def kernel_io_modular(train_fname=traindat, test_fname=testdat, width=1.9):
    """Compute Gaussian kernel matrices and save them to CSV files.

    The train/train and train/test kernels are each saved under ``tmp/``;
    both files are deleted again before returning.

    Args:
        train_fname: CSV path of the training features.
        test_fname: CSV path of the test features.
        width: third argument of the ``GaussianKernel`` constructor.

    Returns:
        A tuple ``(km_train, km_test, kernel)``.
    """
    from modshogun import RealFeatures, GaussianKernel, CSVFile
    import os

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))

    # Train-vs-train kernel, saved to disk.
    gauss = GaussianKernel(train_feats, train_feats, width)
    km_train = gauss.get_kernel_matrix()
    writer = CSVFile("tmp/gaussian_train.csv", "w")
    gauss.save(writer)
    del writer

    # Re-initialise on train-vs-test and save again.
    gauss.init(train_feats, test_feats)
    km_test = gauss.get_kernel_matrix()
    writer = CSVFile("tmp/gaussian_test.csv", "w")
    gauss.save(writer)
    del writer

    # Remove the temporary files (test file first, then train file).
    for leftover in ("tmp/gaussian_test.csv", "tmp/gaussian_train.csv"):
        os.unlink(leftover)

    return km_train, km_test, gauss
def converter_laplacianeigenmaps_modular(data_fname, k):
    """Run Laplacian Eigenmaps on features loaded from a CSV file.

    The converter uses target dimension 1, ``set_k(k)`` and
    ``set_tau(20.0)``.  The loaded features object is returned; the return
    value of ``apply`` is not kept.

    Args:
        data_fname: path given to ``CSVFile``.
        k: value forwarded to the converter's ``set_k``.

    Returns:
        The ``RealFeatures`` built from ``data_fname``, or ``None`` (after
        printing a notice) when an ``ImportError`` is raised.
    """
    try:
        from modshogun import (
            RealFeatures,
            LaplacianEigenmaps,
            CSVFile,
        )

        feats = RealFeatures(CSVFile(data_fname))

        eigenmaps = LaplacianEigenmaps()
        eigenmaps.set_target_dim(1)
        eigenmaps.set_k(k)
        eigenmaps.set_tau(20.0)
        eigenmaps.apply(feats)
    except ImportError:
        print('No Eigen3 available')
        return None
    return feats
def converter_kernellocallylinearembedding_modular(data_fname, k):
    """Run Kernel Locally Linear Embedding on features loaded from a CSV file.

    The converter is built around a ``LinearKernel`` and uses target
    dimension 1 and ``set_k(k)``.  The loaded features object is returned;
    the return value of ``apply`` is not kept.

    Args:
        data_fname: path given to ``CSVFile``.
        k: value forwarded to the converter's ``set_k``.

    Returns:
        The ``RealFeatures`` built from ``data_fname``, or ``None`` (after
        printing a notice) when an ``ImportError`` is raised.
    """
    try:
        from modshogun import (
            RealFeatures,
            KernelLocallyLinearEmbedding,
            LinearKernel,
            CSVFile,
        )

        feats = RealFeatures(CSVFile(data_fname))

        linear = LinearKernel()
        klle = KernelLocallyLinearEmbedding(linear)
        klle.set_target_dim(1)
        klle.set_k(k)
        klle.apply(feats)
    except ImportError:
        print('No Eigen3 available')
        return None
    return feats
def converter_factoranalysis_modular(data_fname):
    """Run Factor Analysis and test the embedding's Gram determinant.

    The features are embedded with target dimension 2; the determinant of
    ``X . X^T`` is then computed for the embedded feature matrix ``X``.

    Args:
        data_fname: path given to ``CSVFile``.

    Returns:
        The result of ``det(X . X^T) > 0``, or ``None`` (after printing a
        notice) when an ``ImportError`` is raised.
    """
    try:
        import numpy
        from modshogun import (
            RealFeatures,
            FactorAnalysis,
            EuclideanDistance,
            CSVFile,
        )

        feats = RealFeatures(CSVFile(data_fname))

        fa = FactorAnalysis()
        fa.set_target_dim(2)

        embedded = fa.apply(feats)
        matrix = embedded.get_feature_matrix()
        gram = numpy.dot(matrix, matrix.T)
        is_positive = numpy.linalg.det(gram) > 0
    except ImportError:
        print('No Eigen3 available')
        return None
    return is_positive
def converter_tdistributedstochasticneighborembedding_modular(data_fname, seed=1):
    """Embed features from a CSV file with t-SNE.

    Shogun's random generator is seeded first via ``Math_init_random`` so
    that repeated runs are reproducible; the target dimension is 2.

    Args:
        data_fname: path given to ``CSVFile``.
        seed: value forwarded to ``Math_init_random``.

    Returns:
        The object returned by the converter's ``apply``, or ``None``
        (after printing a notice) when an ``ImportError`` is raised.
    """
    try:
        from modshogun import (
            RealFeatures,
            TDistributedStochasticNeighborEmbedding,
        )
        from modshogun import Math_init_random, CSVFile

        # Seed before anything else touches the random generator.
        Math_init_random(seed)

        feats = RealFeatures(CSVFile(data_fname))

        tsne = TDistributedStochasticNeighborEmbedding()
        tsne.set_target_dim(2)
        result = tsne.apply(feats)
    except ImportError:
        print('No Eigen3 available')
        return None
    return result
def converter_locallylinearembedding_modular(data_fname, k):
    """Run Locally Linear Embedding on features loaded from a CSV file.

    The converter uses target dimension 1 and ``set_k(k)``.  The loaded
    features object is returned; the return value of ``apply`` is not kept.

    Args:
        data_fname: path given to ``CSVFile``.
        k: value forwarded to the converter's ``set_k``.

    Returns:
        The ``RealFeatures`` built from ``data_fname``, or ``None`` (after
        printing a notice) when a required import is unavailable.
    """
    try:
        from modshogun import RealFeatures, CSVFile
        try:
            from modshogun import LocallyLinearEmbedding
        except ImportError:
            print("LocallyLinearEmbedding not available")
            # Return instead of calling exit(): a missing optional class
            # must not terminate the caller's interpreter, and the other
            # converter examples signal unavailability by returning None.
            return None

        features = RealFeatures(CSVFile(data_fname))

        converter = LocallyLinearEmbedding()
        converter.set_target_dim(1)
        converter.set_k(k)
        converter.apply(features)

        return features
    except ImportError:
        print('No Eigen3 available')
        return None
def converter_stochasticproximityembedding_modular(data_fname, k):
    """Run Stochastic Proximity Embedding with two strategies.

    The converter uses target dimension 1, 40 updates and ``set_k(k)``.
    ``embed`` is called once with ``SPE_LOCAL`` and once with
    ``SPE_GLOBAL``.  The loaded features object is returned; the return
    values of ``embed`` are not kept.

    Args:
        data_fname: path given to ``CSVFile``.
        k: value forwarded to the converter's ``set_k``.

    Returns:
        The ``RealFeatures`` built from ``data_fname``, or ``None`` (after
        printing a notice) when an ``ImportError`` is raised.
    """
    try:
        from modshogun import (
            RealFeatures,
            StochasticProximityEmbedding,
            SPE_GLOBAL,
            SPE_LOCAL,
            CSVFile,
        )

        feats = RealFeatures(CSVFile(data_fname))

        spe = StochasticProximityEmbedding()
        spe.set_target_dim(1)
        spe.set_nupdates(40)
        spe.set_k(k)

        # Local strategy first, then global, on the same converter.
        for strategy in (SPE_LOCAL, SPE_GLOBAL):
            spe.set_strategy(strategy)
            spe.embed(feats)
    except ImportError:
        print('No Eigen3 available')
        return None
    return feats
def metric_lmnn_statistics(
        k=3,
        fname_features='../../data/fm_train_multiclass_digits.dat.gz',
        fname_labels='../../data/label_train_multiclass_digits.dat'):
    """Train LMNN and plot the objective recorded during training.

    Messages are emitted with ``print(...)`` calls taking a single string,
    which behave the same on Python 2 and Python 3.

    Args:
        k: third argument of the ``LMNN`` constructor.
        fname_features: path handed to ``load_compressed_features``.
        fname_labels: CSV path of the multiclass labels.

    Returns:
        ``None``.  When the required modules cannot be imported a message
        is printed and the function returns early.

    Raises:
        AssertionError: if the number of feature vectors differs from the
            number of labels.
    """
    try:
        from modshogun import LMNN, CSVFile, RealFeatures, MulticlassLabels, MSG_DEBUG
        import matplotlib.pyplot as pyplot
    except ImportError:
        print('Error importing modshogun or other required modules. Please, verify their installation.')
        return

    # The loaded matrix is transposed before being wrapped.
    # NOTE(review): presumably the file stores one example per row — confirm.
    features = RealFeatures(load_compressed_features(fname_features).T)
    labels = MulticlassLabels(CSVFile(fname_labels))

    assert (features.get_num_vectors() == labels.get_num_labels())

    # train LMNN
    lmnn = LMNN(features, labels, k)
    lmnn.set_correction(100)
    # To get verbose output: lmnn.io.set_loglevel(MSG_DEBUG)
    print('Training LMNN, this will take about two minutes...')
    lmnn.train()
    print('Training done!')

    # plot objective obtained during training
    statistics = lmnn.get_statistics()

    pyplot.plot(statistics.obj.get())
    pyplot.grid(True)
    pyplot.xlabel('Iterations')
    pyplot.ylabel('LMNN objective')
    pyplot.title(
        'LMNN objective during training for the multiclass digits data set')

    pyplot.show()
#!/usr/bin/python
# Script: load the wine features, rescale them with RescaleFeatures, centre
# the data, and show its covariance matrix in the first of two subplots.

from modshogun import CSVFile, RealFeatures, RescaleFeatures
from scipy.linalg import solve_triangular, cholesky, sqrtm, inv

import matplotlib.pyplot as pyplot
import numpy

# load wine features
features = RealFeatures(CSVFile('../data/fm_wine.dat'))

print('%d vectors with %d features.' % (features.get_num_vectors(), features.get_num_features()))
# NOTE(review): numpy.mean is handed the RealFeatures object itself rather
# than features.get_feature_matrix() — confirm this is intended.
print('original features mean = ' + str(numpy.mean(features, axis=1)))

# rescale the features to [0,1]
feature_rescaling = RescaleFeatures()
feature_rescaling.init(features)
features.add_preprocessor(feature_rescaling)
features.apply_preprocessor()

print('mean after rescaling = ' + str(numpy.mean(features, axis=1)))

# remove mean from data
data = features.get_feature_matrix()
data = data.T
data -= numpy.mean(data, axis=0)
# print() call with one argument: same output on Python 2 and Python 3
print(numpy.mean(data, axis=0))

fig, axarr = pyplot.subplots(1, 2)
axarr[0].matshow(numpy.cov(data.T))
def features_io_modular(fm_train_real, label_train_twoclass):
    """Round-trip features and labels through several Shogun file formats.

    Sparse features are saved and reloaded via ``BinaryFile`` and
    ``LibSVMFile``; dense features via ``BinaryFile``, ``HDF5File`` and
    ``CSVFile``; multiclass labels via ``CSVFile``, ``BinaryFile`` and
    ``HDF5File``.  All files are written to the current working directory
    and deleted before returning.

    Args:
        fm_train_real: feature matrix wrapped by ``SparseRealFeatures`` and
            ``RealFeatures``.
        label_train_twoclass: not referenced in the body; the labels that
            are saved are a fixed array ``[0.0, 1.0, 2.0, 3.0]``.

    Returns:
        A tuple ``(feats, feats2, lab, lab2)``: the dense features, the
        dense features object that was loaded into, the labels, and the
        labels object that was loaded into.
    """
    import numpy
    from modshogun import SparseRealFeatures, RealFeatures, MulticlassLabels
    from modshogun import GaussianKernel
    from modshogun import LibSVMFile, CSVFile, BinaryFile, HDF5File

    # --- sparse features: save in two formats, then load each back ---
    feats = SparseRealFeatures(fm_train_real)
    feats2 = SparseRealFeatures()

    # NOTE(review): `f` is rebound for every file; presumably dropping the
    # previous handle is what closes/flushes it before the file is read
    # back — confirm before reordering or renaming.
    f = BinaryFile("fm_train_sparsereal.bin", "w")
    feats.save(f)

    f = LibSVMFile("fm_train_sparsereal.ascii", "w")
    feats.save(f)

    f = BinaryFile("fm_train_sparsereal.bin")
    feats2.load(f)

    f = LibSVMFile("fm_train_sparsereal.ascii")
    feats2.load(f)

    # --- dense features: save in three formats, load binary and CSV back ---
    feats = RealFeatures(fm_train_real)
    feats2 = RealFeatures()

    f = BinaryFile("fm_train_real.bin", "w")
    feats.save(f)

    f = HDF5File("fm_train_real.h5", "w", "/data/doubles")
    feats.save(f)

    f = CSVFile("fm_train_real.ascii", "w")
    feats.save(f)

    f = BinaryFile("fm_train_real.bin")
    feats2.load(f)
    #print("diff binary", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten())))

    f = CSVFile("fm_train_real.ascii")
    feats2.load(f)
    #print("diff ascii", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten())))

    # --- labels: save in three formats, load CSV and binary back ---
    lab = MulticlassLabels(numpy.array([0.0, 1.0, 2.0, 3.0]))
    lab2 = MulticlassLabels()
    f = CSVFile("label_train_twoclass.ascii", "w")
    lab.save(f)

    f = BinaryFile("label_train_twoclass.bin", "w")
    lab.save(f)

    f = HDF5File("label_train_real.h5", "w", "/data/labels")
    lab.save(f)

    f = CSVFile("label_train_twoclass.ascii")
    lab2.load(f)

    f = BinaryFile("label_train_twoclass.bin")
    lab2.load(f)

    # --- HDF5: read the dense features and the labels back ---
    f = HDF5File("fm_train_real.h5", "r", "/data/doubles")
    feats2.load(f)
    #print(feats2.get_feature_matrix())
    f = HDF5File("label_train_real.h5", "r", "/data/labels")
    lab2.load(f)
    #print(lab2.get_labels())

    # clean up: the loop variable reuses the name `f`, so the last file
    # object is no longer referenced by it once the loop starts
    import os
    for f in [
            'fm_train_sparsereal.bin', 'fm_train_sparsereal.ascii',
            'fm_train_real.bin', 'fm_train_real.h5', 'fm_train_real.ascii',
            'label_train_real.h5', 'label_train_twoclass.ascii',
            'label_train_twoclass.bin'
    ]:
        os.unlink(f)
    return feats, feats2, lab, lab2
knn.train() test_features, test_labels = testdat.features, testdat.labels predicted_labels = knn.apply(test_features) evaluator = MulticlassAccuracy() acc = evaluator.evaluate(predicted_labels, test_labels) err = 1 - acc return err features_file = '../data/fm_ape_gut.txt' labels_file = '../data/label_ape_gut.txt' features = RealFeatures(CSVFile(features_file)) labels = MulticlassLabels(CSVFile(labels_file)) # reduce the number of features to use so that the training is faster but still # the results of feature selection are significant fm = features.get_feature_matrix() features = RealFeatures(fm[:500, :]) assert (features.get_num_vectors() == labels.get_num_labels()) print('Number of examples = %d, number of features = %d.' % (features.get_num_vectors(), features.get_num_features())) visualize_tdsne(features, labels) lmnn = diagonal_lmnn(features, labels, max_iter=1200)