def transfer_multitask_l12_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
    from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup
    from numpy import hstack
    try:
        from modshogun import MultitaskL12LogisticRegression
    except ImportError:
        print("MultitaskL12LogisticRegression not available")
        exit(0)

    features = RealFeatures(hstack((traindat,traindat)))
    labels = BinaryLabels(hstack((label_train,label_train)))

    n_vectors = features.get_num_vectors()
    task_one = Task(0,n_vectors//2)
    task_two = Task(n_vectors//2,n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)

    mtlr = MultitaskL12LogisticRegression(0.1,0.1,features,labels,task_group)
    mtlr.set_tolerance(1e-2) # use 1e-2 tolerance
    mtlr.set_max_iter(10)
    mtlr.train()
    mtlr.set_current_task(0)
    out = mtlr.apply_regression().get_labels()

    return out

def features_dense_modular (A=matrixA,B=matrixB,C=matrixC):
    from modshogun import RealFeatures, LongIntFeatures, ByteFeatures
    from numpy import array, float64, all

    a=RealFeatures(A)
    b=LongIntFeatures(B)
    c=ByteFeatures(C)

    # or 16bit wide ...
    #feat1 = f.ShortFeatures(N.zeros((10,5),N.short))
    #feat2 = f.WordFeatures(N.zeros((10,5),N.uint16))

    # print(some statistics about a)

    # set the first feature vector
    a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0)

    # get matrices
    a_out = a.get_feature_matrix()
    b_out = b.get_feature_matrix()
    c_out = c.get_feature_matrix()

    assert(all(a_out==A))
    assert(all(b_out==B))
    assert(all(c_out==C))
    return a_out,b_out,c_out,a,b,c

def features_dense_io_modular():
    from modshogun import RealFeatures, CSVFile

    feats=RealFeatures()
    f=CSVFile("../data/fm_train_real.dat","r")
    f.set_delimiter(" ")
    feats.load(f)
    return feats

def transfer_multitask_leastsquares_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
    from modshogun import RegressionLabels, RealFeatures, Task, TaskGroup
    try:
        from modshogun import MultitaskLeastSquaresRegression
    except ImportError:
        print("MultitaskLeastSquaresRegression not available")
        exit(0)

    features = RealFeatures(traindat)
    labels = RegressionLabels(label_train)

    n_vectors = features.get_num_vectors()
    task_one = Task(0,n_vectors//2)
    task_two = Task(n_vectors//2,n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)

    mtlsr = MultitaskLeastSquaresRegression(0.1,features,labels,task_group)
    mtlsr.set_regularization(1) # use regularization ratio
    mtlsr.set_tolerance(1e-2) # use 1e-2 tolerance
    mtlsr.train()
    mtlsr.set_current_task(0)
    out = mtlsr.apply_regression().get_labels()
    return out

def classifier_featureblock_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
    from modshogun import BinaryLabels, RealFeatures, IndexBlock, IndexBlockGroup
    from numpy import hstack
    try:
        from modshogun import FeatureBlockLogisticRegression
    except ImportError:
        print("FeatureBlockLogisticRegression not available")
        exit(0)

    features = RealFeatures(hstack((traindat,traindat)))
    labels = BinaryLabels(hstack((label_train,label_train)))

    n_features = features.get_num_features()
    block_one = IndexBlock(0,n_features//2)
    block_two = IndexBlock(n_features//2,n_features)
    block_group = IndexBlockGroup()
    block_group.add_block(block_one)
    block_group.add_block(block_two)

    mtlr = FeatureBlockLogisticRegression(0.1,features,labels,block_group)
    mtlr.set_regularization(1) # use regularization ratio
    mtlr.set_tolerance(1e-2) # use 1e-2 tolerance
    mtlr.train()
    out = mtlr.apply().get_labels()
    return out

def transfer_multitask_clustered_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
    from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup, MSG_DEBUG
    from numpy import hstack, sin, cos
    try:
        from modshogun import MultitaskClusteredLogisticRegression
    except ImportError:
        print("MultitaskClusteredLogisticRegression not available")
        exit(0)

    features = RealFeatures(hstack((traindat,sin(traindat),cos(traindat))))
    labels = BinaryLabels(hstack((label_train,label_train,label_train)))

    n_vectors = features.get_num_vectors()
    task_one = Task(0,n_vectors//3)
    task_two = Task(n_vectors//3,2*n_vectors//3)
    task_three = Task(2*n_vectors//3,n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)
    task_group.append_task(task_three)

    mtlr = MultitaskClusteredLogisticRegression(1.0,100.0,features,labels,task_group,2)
    #mtlr.io.set_loglevel(MSG_DEBUG)
    mtlr.set_tolerance(1e-3) # use 1e-3 tolerance
    mtlr.set_max_iter(100)
    mtlr.train()
    mtlr.set_current_task(0)
    #print(mtlr.get_w())
    out = mtlr.apply_regression().get_labels()

    return out

def features_dense_zero_copy_modular (in_data=data):
    import numpy
    from numpy import float64
    from modshogun import RealFeatures

    feats = None
    if numpy.__version__ >= '1.5':
        feats=numpy.array(in_data, dtype=float64, order='F')

        a=RealFeatures()
        a.frombuffer(feats, False)

        b=numpy.array(a, copy=False)
        c=numpy.array(a, copy=True)

        d=RealFeatures()
        d.frombuffer(a, False)

        e=RealFeatures()
        e.frombuffer(a, True)

        a[:,0]=0
        #print(a[0:4])
        #print(b[0:4])
        #print(c[0:4])
        #print(d[0:4])
        #print(e[0:4])
    else:
        print("numpy version >= 1.5 is needed")

    return feats

def modelselection_grid_search_kernel (num_subsets, num_vectors, dim_vectors):
    from modshogun import Math, RealFeatures, BinaryLabels, LibSVM
    from modshogun import StratifiedCrossValidationSplitting, ContingencyTableEvaluation, ACCURACY
    from modshogun import CrossValidation, CrossValidationResult, GridSearchModelSelection
    from numpy import random

    # init seed for reproducibility
    Math.init_random(1)
    random.seed(1)

    # create some (non-sense) data
    matrix=random.rand(dim_vectors, num_vectors)

    # create num_vectors vectors of dimension dim_vectors
    features=RealFeatures()
    features.set_feature_matrix(matrix)

    # create labels, two classes
    labels=BinaryLabels(num_vectors)
    for i in range(num_vectors):
        labels.set_label(i, 1 if i%2==0 else -1)

    # create svm
    classifier=LibSVM()

    # splitting strategy
    splitting_strategy=StratifiedCrossValidationSplitting(labels, num_subsets)

    # accuracy evaluation
    evaluation_criterion=ContingencyTableEvaluation(ACCURACY)

    # cross-validation class for evaluation in model selection
    cross=CrossValidation(classifier, features, labels, splitting_strategy, evaluation_criterion)
    cross.set_num_runs(1)

    # print all parameters available for model selection
    # Don't worry if yours is not included; simply write to the mailing list
    #classifier.print_modsel_params()

    # model parameter selection
    param_tree=create_param_tree()
    #param_tree.print_tree()

    grid_search=GridSearchModelSelection(cross, param_tree)

    print_state=False
    best_combination=grid_search.select_model(print_state)
    #print("best parameter(s):")
    #best_combination.print_tree()

    best_combination.apply_to_machine(classifier)

    # larger number of runs to have tighter confidence intervals
    cross.set_num_runs(10)
    cross.set_conf_int_alpha(0.01)
    result=cross.evaluate()
    casted=CrossValidationResult.obtain_from_generic(result)
    #print("result mean:", casted.mean)

    return classifier,result,casted.mean

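# `create_param_tree` is not defined in this snippet. A minimal sketch of such
# a helper, assuming the grid is built over a Gaussian kernel width with
# Shogun's ModelSelectionParameters; the parameter name "width" and the value
# range are illustrative assumptions:
def create_param_tree():
    from modshogun import ModelSelectionParameters, R_EXP, GaussianKernel

    root = ModelSelectionParameters()

    kernel = GaussianKernel()
    param_kernel = ModelSelectionParameters("kernel", kernel)
    root.append_child(param_kernel)

    param_width = ModelSelectionParameters("width")
    param_width.build_values(-1.0, 1.0, R_EXP)  # widths 2^-1 .. 2^1
    param_kernel.append_child(param_width)

    return root
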
def features_dense_real_modular (A=matrix):
    from modshogun import RealFeatures
    from numpy import array, float64, all

    # features of type Real
    a=RealFeatures(A)

    # print(some statistics about a)
    #print(a.get_num_vectors())
    #print(a.get_num_features())

    # get the first feature vector and set it
    #print(a.get_feature_vector(0))
    a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0)

    # get matrix
    a_out = a.get_feature_matrix()

    assert(all(a_out==A))
    return a_out

def multiclass_c45classifiertree_modular(train=traindat,test=testdat,labels=label_traindat,ft=feattypes):
    try:
        from modshogun import RealFeatures, MulticlassLabels, CSVFile, C45ClassifierTree
        from numpy import random, int32
    except ImportError:
        print("Could not import Shogun and/or numpy modules")
        return

    # wrap features and labels into Shogun objects
    feats_train=RealFeatures(CSVFile(train))
    feats_test=RealFeatures(CSVFile(test))
    train_labels=MulticlassLabels(CSVFile(labels))

    # divide train dataset into training and validation subsets in the ratio 2/3 to 1/3
    subset=int32(random.permutation(feats_train.get_num_vectors()))
    vsubset=subset[1:subset.size//3]
    trsubset=subset[1+subset.size//3:subset.size]

    # C4.5 tree formation using training subset
    train_labels.add_subset(trsubset)
    feats_train.add_subset(trsubset)

    c=C45ClassifierTree()
    c.set_labels(train_labels)
    c.set_feature_types(ft)
    c.train(feats_train)

    train_labels.remove_subset()
    feats_train.remove_subset()

    # prune tree using validation subset
    train_labels.add_subset(vsubset)
    feats_train.add_subset(vsubset)

    c.prune_tree(feats_train,train_labels)

    train_labels.remove_subset()
    feats_train.remove_subset()

    # classify test data
    output=c.apply_multiclass(feats_test).get_labels()
    output_certainty=c.get_certainty_vector()

    return c,output,output_certainty

def neuralnets_simple_modular (train_fname, test_fname, label_fname, C, epsilon):
    from modshogun import NeuralLayers, NeuralNetwork, RealFeatures, BinaryLabels
    from modshogun import Math_init_random, CSVFile

    Math_init_random(17)

    feats_train=RealFeatures(CSVFile(train_fname))
    feats_test=RealFeatures(CSVFile(test_fname))
    labels=BinaryLabels(CSVFile(label_fname))

    layers = NeuralLayers()
    network = NeuralNetwork(layers.input(feats_train.get_num_features()).linear(50).softmax(2).done())
    network.quick_connect()
    network.initialize_neural_network()

    network.set_labels(labels)
    network.train(feats_train)
    return network, network.apply_multiclass(feats_test)

def transfer_multitask_group_regression(fm_train=traindat,fm_test=testdat,label_train=label_traindat):
    from modshogun import RegressionLabels, RealFeatures, Task, TaskGroup, MultitaskLSRegression

    features = RealFeatures(traindat)
    labels = RegressionLabels(label_train)

    n_vectors = features.get_num_vectors()
    task_one = Task(0,n_vectors//2)
    task_two = Task(n_vectors//2,n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)

    mtlsr = MultitaskLSRegression(0.1,features,labels,task_group)
    mtlsr.train()
    mtlsr.set_current_task(0)
    out = mtlsr.apply_regression().get_labels()
    return out

def load_data(num_train_samples=7291, m_data_dict=data_dict):
    from modshogun import RealFeatures, MulticlassLabels
    import numpy

    train_vec = m_data_dict['yTr'][0][:num_train_samples].astype(numpy.float64)
    train_labels = MulticlassLabels(train_vec)
    test_vec = m_data_dict['yTe'][0].astype(numpy.float64)
    test_labels = MulticlassLabels(test_vec)
    print("#train_labels = " + str(train_labels.get_num_labels()))
    print("#test_labels = " + str(test_labels.get_num_labels()))

    train_mat = m_data_dict['xTr'][:,:num_train_samples].astype(numpy.float64)
    train_features = RealFeatures(train_mat)
    test_mat = m_data_dict['xTe'].astype(numpy.float64)
    test_features = RealFeatures(test_mat)
    print("#train_vectors = " + str(train_features.get_num_vectors()))
    print("#test_vectors = " + str(test_features.get_num_vectors()))
    print("data dimension = " + str(test_features.get_num_features()))

    return train_features, train_labels, test_features, test_labels

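# A usage sketch for load_data above. The .mat file path is an assumption; the
# key layout ('xTr', 'yTr', 'xTe', 'yTe') is the one the function itself indexes:
from scipy import io

data_dict = io.loadmat('../data/usps.mat')  # hypothetical path
train_features, train_labels, test_features, test_labels = load_data(1000, data_dict)
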
def preprocessor_randomfouriergausspreproc_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):
    from modshogun import Chi2Kernel
    from modshogun import RealFeatures
    from modshogun import RandomFourierGaussPreproc

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    preproc=RandomFourierGaussPreproc()
    preproc.init(feats_train)
    feats_train.add_preprocessor(preproc)
    feats_train.apply_preprocessor()
    feats_test.add_preprocessor(preproc)
    feats_test.apply_preprocessor()

    kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()

    return km_train,km_test,kernel

def transfer_multitask_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
    from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup, MultitaskLogisticRegression
    from numpy import hstack

    features = RealFeatures(hstack((traindat,traindat)))
    labels = BinaryLabels(hstack((label_train,label_train)))

    n_vectors = features.get_num_vectors()
    task_one = Task(0,n_vectors//2)
    task_two = Task(n_vectors//2,n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)

    mtlr = MultitaskLogisticRegression(0.1,features,labels,task_group)
    mtlr.set_regularization(1) # use regularization ratio
    mtlr.set_tolerance(1e-2) # use 1e-2 tolerance
    mtlr.train()
    mtlr.set_current_task(0)
    out = mtlr.apply().get_labels()
    return out

def preprocessor_prunevarsubmean_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):
    from modshogun import Chi2Kernel
    from modshogun import RealFeatures
    from modshogun import PruneVarSubMean

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    preproc=PruneVarSubMean()
    preproc.init(feats_train)
    feats_train.add_preprocessor(preproc)
    feats_train.apply_preprocessor()
    feats_test.add_preprocessor(preproc)
    feats_test.apply_preprocessor()

    kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()

    return km_train,km_test,kernel

def metric_lmnn_statistics(
    k=3,
    fname_features="../../data/fm_train_multiclass_digits.dat.gz",
    fname_labels="../../data/label_train_multiclass_digits.dat",
):
    try:
        from modshogun import LMNN, CSVFile, RealFeatures, MulticlassLabels, MSG_DEBUG
        import matplotlib.pyplot as pyplot
    except ImportError:
        print("Error importing modshogun or other required modules. Please verify their installation.")
        return

    features = RealFeatures(load_compressed_features(fname_features).T)
    labels = MulticlassLabels(CSVFile(fname_labels))

    # print('number of examples = %d' % features.get_num_vectors())
    # print('number of features = %d' % features.get_num_features())

    assert features.get_num_vectors() == labels.get_num_labels()

    # train LMNN
    lmnn = LMNN(features, labels, k)
    lmnn.set_correction(100)
    # lmnn.io.set_loglevel(MSG_DEBUG)
    print("Training LMNN, this will take about two minutes...")
    lmnn.train()
    print("Training done!")

    # plot objective obtained during training
    statistics = lmnn.get_statistics()

    pyplot.plot(statistics.obj.get())
    pyplot.grid(True)
    pyplot.xlabel("Iterations")
    pyplot.ylabel("LMNN objective")
    pyplot.title("LMNN objective during training for the multiclass digits data set")
    pyplot.show()

def train(self, images, labels):
    """Train eigenfaces"""
    print("Train...")
    # copy labels
    self._labels = labels
    # transform the numpy vector to a Shogun structure
    features = RealFeatures(images)
    # PCA
    self.pca = PCA()
    # set dimension
    self.pca.set_target_dim(self._num_components)
    # compute PCA
    self.pca.init(features)
    for sampleIdx in range(features.get_num_vectors()):
        v = features.get_feature_vector(sampleIdx)
        p = self.pca.apply_to_feature_vector(v)
        self._projections.insert(sampleIdx, p)
    print("ok!")

def preprocessor_normone_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):
    from modshogun import Chi2Kernel
    from modshogun import RealFeatures
    from modshogun import NormOne

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    preprocessor=NormOne()
    preprocessor.init(feats_train)
    feats_train.add_preprocessor(preprocessor)
    feats_train.apply_preprocessor()
    feats_test.add_preprocessor(preprocessor)
    feats_test.apply_preprocessor()

    kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()

    return km_train,km_test,kernel

def preprocessor_fisherlda_modular (data, labels, method):
    from modshogun import RealFeatures, MulticlassLabels, CANVAR_FLDA
    from modshogun import FisherLda

    sg_features = RealFeatures(data)
    sg_labels = MulticlassLabels(labels)

    preprocessor=FisherLda(method)
    preprocessor.init(sg_features, sg_labels, 1)
    yn=preprocessor.apply_to_feature_matrix(sg_features)

    return yn

def fit(self, X, y):
    self.X_, y = check_X_y(X, y, dtype=float)
    labels = MulticlassLabels(y)
    self._lmnn = shogun_LMNN(RealFeatures(self.X_.T), labels, self.k)
    self._lmnn.set_maxiter(self.max_iter)
    self._lmnn.set_obj_threshold(self.convergence_tol)
    self._lmnn.set_regularization(self.regularization)
    self._lmnn.set_stepsize(self.learn_rate)
    if self.use_pca:
        self._lmnn.train()
    else:
        self._lmnn.train(np.eye(X.shape[1]))
    self.L_ = self._lmnn.get_linear_transform()
    return self

def run_clustering(data, k):
    from modshogun import KMeans
    from modshogun import Math_init_random
    from modshogun import EuclideanDistance
    from modshogun import RealFeatures

    fea = RealFeatures(data)
    distance = EuclideanDistance(fea, fea)
    kmeans = KMeans(k, distance)

    #print("Running clustering...")
    kmeans.train()

    return kmeans.get_cluster_centers()

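# Minimal usage sketch for run_clustering above, assuming a numpy matrix whose
# columns are the vectors (the layout RealFeatures expects):
import numpy

data = numpy.random.rand(2, 100)  # 100 two-dimensional points
centers = run_clustering(data, k=3)
print(centers.shape)  # one column per cluster center: (2, 3)
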
def transfer_multitask_leastsquares_regression(fm_train=traindat, fm_test=testdat, label_train=label_traindat):
    from modshogun import RegressionLabels, RealFeatures, Task, TaskGroup, MultitaskLeastSquaresRegression

    features = RealFeatures(traindat)
    labels = RegressionLabels(label_train)

    n_vectors = features.get_num_vectors()
    task_one = Task(0, n_vectors // 2)
    task_two = Task(n_vectors // 2, n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)

    mtlsr = MultitaskLeastSquaresRegression(0.1, features, labels, task_group)
    mtlsr.set_regularization(1)  # use regularization ratio
    mtlsr.set_tolerance(1e-2)  # use 1e-2 tolerance
    mtlsr.train()
    mtlsr.set_current_task(0)
    out = mtlsr.apply_regression().get_labels()
    return out

def RunSVMShogun(q):
    totalTimer = Timer()

    Log.Info("Loading dataset", self.verbose)
    trainData, labels = SplitTrainData(self.dataset)
    trainData = RealFeatures(trainData.T)
    labels = MulticlassLabels(labels)
    testData = RealFeatures(LoadDataset(self.dataset[1]).T)

    try:
        with totalTimer:
            self.model = self.BuildModel(trainData, labels, options)
            # Run Support Vector Machines on the test dataset.
            self.model.apply(testData).get_labels()
    except Exception as e:
        Log.Debug(str(e))
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time

def converter_localitypreservingprojections_modular(data_fname, k):
    try:
        from modshogun import RealFeatures, LocalityPreservingProjections, CSVFile

        features = RealFeatures(CSVFile(data_fname))

        converter = LocalityPreservingProjections()
        converter.set_target_dim(1)
        converter.set_k(k)
        converter.set_tau(2.0)
        converter.apply(features)

        return features
    except ImportError:
        print('No Eigen3 available')

def classifier_libsvm_modular(train_fname=traindat, test_fname=testdat, label_fname=label_traindat, width=2.1, C=1, epsilon=1e-5):
    from modshogun import RealFeatures, BinaryLabels
    from modshogun import GaussianKernel, LibSVM, CSVFile

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))
    labels = BinaryLabels(CSVFile(label_fname))

    kernel = GaussianKernel(feats_train, feats_train, width)
    svm = LibSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    supportvectors = sv_idx = svm.get_support_vectors()
    alphas = svm.get_alphas()
    predictions = svm.apply(feats_test)
    #print(predictions.get_labels())
    return predictions, svm, predictions.get_labels()

def classifier_svmlin_modular(train_fname=traindat, test_fname=testdat, label_fname=label_traindat, C=0.9, epsilon=1e-5, num_threads=1):
    from modshogun import RealFeatures, SparseRealFeatures, BinaryLabels
    from modshogun import SVMLin, CSVFile

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))
    labels = BinaryLabels(CSVFile(label_fname))

    svm = SVMLin(C, feats_train, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.set_bias_enabled(True)
    svm.train()

    bias = svm.get_bias()
    w = svm.get_w()
    predictions = svm.apply(feats_test)
    return predictions, svm, predictions.get_labels()

def classifier_featureblock_logistic_regression(fm_train=traindat, fm_test=testdat, label_train=label_traindat):
    from modshogun import BinaryLabels, RealFeatures, IndexBlock, IndexBlockGroup, FeatureBlockLogisticRegression
    from numpy import hstack

    features = RealFeatures(hstack((traindat, traindat)))
    labels = BinaryLabels(hstack((label_train, label_train)))

    n_features = features.get_num_features()
    block_one = IndexBlock(0, n_features // 2)
    block_two = IndexBlock(n_features // 2, n_features)
    block_group = IndexBlockGroup()
    block_group.add_block(block_one)
    block_group.add_block(block_two)

    mtlr = FeatureBlockLogisticRegression(0.1, features, labels, block_group)
    mtlr.set_regularization(1)  # use regularization ratio
    mtlr.set_tolerance(1e-2)  # use 1e-2 tolerance
    mtlr.train()
    out = mtlr.apply().get_labels()

    return out

def train(self, images, labels):
    """Train eigenfaces"""
    print("Train...")
    # copy labels
    self._labels = labels
    # transform the numpy vector to a Shogun structure
    features = RealFeatures(images)
    # PCA
    self.pca = PCA()
    # set dimension
    self.pca.set_target_dim(self._num_components)
    # compute PCA
    self.pca.init(features)
    for sampleIdx in range(features.get_num_vectors()):
        v = features.get_feature_vector(sampleIdx)
        p = self.pca.apply_to_feature_vector(v)
        self._projections.insert(sampleIdx, p)
    print("Train ok!")

def RunKNCShogun():
    totalTimer = Timer()

    Log.Info("Loading dataset", self.verbose)
    trainData, labels = SplitTrainData(self.dataset)
    trainData = RealFeatures(trainData.T)
    labels = MulticlassLabels(labels)
    testData = RealFeatures(LoadDataset(self.dataset[1]).T)

    try:
        with totalTimer:
            self.model = self.BuildModel(trainData, labels, options)
            # Run the k-nearest neighbors classifier on the test dataset.
            self.predictions = self.model.apply_multiclass(testData).get_labels()
    except Exception as e:
        return [-1]

    time = totalTimer.ElapsedTime()
    if len(self.dataset) > 1:
        return [time, self.predictions]

    return [time]

def converter_isomap_modular(data_fname):
    try:
        from modshogun import RealFeatures, Isomap, CSVFile

        features = RealFeatures(CSVFile(data_fname))

        converter = Isomap()
        converter.set_k(20)
        converter.set_target_dim(1)
        converter.apply(features)

        return features
    except ImportError:
        print('No Eigen3 available')

def clustering_kmeans_modular (fm_train=traindat,k=3):
    from modshogun import EuclideanDistance, RealFeatures, KMeans, Math_init_random, CSVFile

    Math_init_random(17)

    feats_train=RealFeatures(CSVFile(fm_train))
    distance=EuclideanDistance(feats_train, feats_train)

    kmeans=KMeans(k, distance)
    kmeans.train()

    out_centers = kmeans.get_cluster_centers()
    kmeans.get_radiuses()

    return out_centers, kmeans

def converter_linearlocaltangentspacealignment_modular(data_fname, k):
    try:
        from modshogun import RealFeatures, LinearLocalTangentSpaceAlignment, CSVFile

        features = RealFeatures(CSVFile(data_fname))

        converter = LinearLocalTangentSpaceAlignment()
        converter.set_target_dim(1)
        converter.set_k(k)
        converter.apply(features)

        return features
    except ImportError:
        print('No Eigen3 available')

def RunMetrics(self, options):
    if len(self.dataset) >= 3:
        X, y = SplitTrainData(self.dataset)
        tau = re.search(r"-t (\d+)", options)
        tau = 1.0 if not tau else int(tau.group(1))
        model = LRR(tau, RealFeatures(X.T), RegressionLabels(y))
        model.train()

        testData = LoadDataset(self.dataset[1])
        truelabels = LoadDataset(self.dataset[2])
        predictedlabels = model.apply_regression(RealFeatures(testData.T)).get_labels()

        SimpleMSE = Metrics.SimpleMeanSquaredError(truelabels, predictedlabels)
        metrics_dict = {}
        metrics_dict['Simple MSE'] = SimpleMSE
        return metrics_dict
    else:
        Log.Fatal("This method requires three datasets!")

def classifier_multilabeloutputliblinear_modular(fm_train_real=traindat, fm_test_real=testdat, label_train_multiclass=label_traindat, label_test_multiclass=label_testdat, width=2.1, C=1, epsilon=1e-5):
    from modshogun import RealFeatures, MulticlassLabels, MultilabelLabels
    from modshogun import MulticlassLibLinear

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    labels = MulticlassLabels(label_train_multiclass)

    classifier = MulticlassLibLinear(C, feats_train, labels)
    classifier.train()

    label_pred = classifier.apply_multilabel_output(feats_test, 2)
    out = label_pred.get_labels()
    #print(out)
    return out

def converter_hessianlocallylinearembedding_modular(data_fname, k):
    try:
        from modshogun import RealFeatures, HessianLocallyLinearEmbedding, CSVFile

        features = RealFeatures(CSVFile(data_fname))

        converter = HessianLocallyLinearEmbedding()
        converter.set_target_dim(1)
        converter.set_k(k)
        converter.apply(features)

        return features
    except ImportError:
        print('No Eigen3 available')

def RunMetrics(self, options):
    Log.Info("Perform Linear Ridge Regression.", self.verbose)

    results = self.LinearRidgeRegressionShogun(options)
    if results < 0:
        return results

    metrics = {'Runtime': results}

    if len(self.dataset) >= 3:
        X, y = SplitTrainData(self.dataset)

        if "alpha" in options:
            tau = float(options.pop("alpha"))
        else:
            Log.Fatal("Required parameter 'alpha' not specified!")
            raise Exception("missing parameter")

        if len(options) > 0:
            Log.Fatal("Unknown parameters: " + str(options))
            raise Exception("unknown parameters")

        model = LRR(tau, RealFeatures(X.T), RegressionLabels(y))
        model.train()

        testData = LoadDataset(self.dataset[1])
        truelabels = LoadDataset(self.dataset[2])
        predictedlabels = model.apply_regression(RealFeatures(testData.T)).get_labels()

        SimpleMSE = Metrics.SimpleMeanSquaredError(truelabels, predictedlabels)
        metrics['Simple MSE'] = SimpleMSE
        return metrics
    else:
        Log.Fatal("This method requires three datasets!")

def converter_laplacianeigenmaps_modular (data_fname,k):
    try:
        from modshogun import RealFeatures, LaplacianEigenmaps, CSVFile

        features = RealFeatures(CSVFile(data_fname))

        converter = LaplacianEigenmaps()
        converter.set_target_dim(1)
        converter.set_k(k)
        converter.set_tau(20.0)
        converter.apply(features)

        return features
    except ImportError:
        print('No Eigen3 available')

def preprocessor_dimensionreductionpreprocessor_modular (data, k):
    from modshogun import RealFeatures
    from modshogun import DimensionReductionPreprocessor
    from modshogun import LocallyLinearEmbedding

    features = RealFeatures(data)

    converter = LocallyLinearEmbedding()
    converter.set_k(k)

    preprocessor = DimensionReductionPreprocessor(converter)
    preprocessor.init(features)
    preprocessor.apply_to_feature_matrix(features)

    return features

def multiclass_chaidtree_modular(train=traindat, test=testdat, labels=label_traindat, ft=feattypes):
    try:
        from modshogun import RealFeatures, MulticlassLabels, CSVFile, CHAIDTree
    except ImportError:
        print("Could not import Shogun modules")
        return

    # wrap features and labels into Shogun objects
    feats_train = RealFeatures(CSVFile(train))
    feats_test = RealFeatures(CSVFile(test))
    train_labels = MulticlassLabels(CSVFile(labels))

    # CHAID tree formation with nominal dependent variable
    c = CHAIDTree(0, ft, 10)
    c.set_labels(train_labels)
    c.train(feats_train)

    # classify test data
    output = c.apply_multiclass(feats_test).get_labels()

    return c, output

def regression_randomforest_modular(num_train=500, num_test=50, x_range=15, noise_var=0.2, ft=feattypes):
    try:
        from modshogun import RealFeatures, RegressionLabels, CSVFile, RandomForest, MeanRule, PT_REGRESSION
        from numpy import random, array
    except ImportError:
        print("Could not import Shogun modules")
        return

    random.seed(1)

    # form training dataset: y=x with noise
    X_train = random.rand(1, num_train) * x_range
    Y_train = X_train + random.randn(num_train) * noise_var

    # form test dataset
    X_test = array([[float(i) / num_test * x_range for i in range(num_test)]])

    # wrap features and labels into Shogun objects
    feats_train = RealFeatures(X_train)
    feats_test = RealFeatures(X_test)
    train_labels = RegressionLabels(Y_train[0])

    # Random Forest formation
    rand_forest = RandomForest(feats_train, train_labels, 20, 1)
    rand_forest.set_feature_types(ft)
    rand_forest.set_machine_problem_type(PT_REGRESSION)
    rand_forest.set_combination_rule(MeanRule())
    rand_forest.train()

    # regress test data
    output = rand_forest.apply_regression(feats_test).get_labels()

    return rand_forest, output

def fit(self, X, labels):
    self.X = X
    self.L = np.eye(X.shape[1])
    labels = MulticlassLabels(labels.astype(np.float64))
    self._lmnn = shogun_LMNN(RealFeatures(X.T), labels, self.params['k'])
    self._lmnn.set_maxiter(self.params['max_iter'])
    self._lmnn.set_obj_threshold(self.params['convergence_tol'])
    self._lmnn.set_regularization(self.params['regularization'])
    self._lmnn.set_stepsize(self.params['learn_rate'])
    if self.params['use_pca']:
        self._lmnn.train()
    else:
        self._lmnn.train(self.L)
    self.L = self._lmnn.get_linear_transform()
    return self

def kernel_io_modular(train_fname=traindat, test_fname=testdat, width=1.9):
    from modshogun import RealFeatures, GaussianKernel, CSVFile

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))

    kernel = GaussianKernel(feats_train, feats_train, width)
    km_train = kernel.get_kernel_matrix()
    f = CSVFile("tmp/gaussian_train.csv", "w")
    kernel.save(f)
    del f

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    f = CSVFile("tmp/gaussian_test.csv", "w")
    kernel.save(f)
    del f

    # clean up
    import os
    os.unlink("tmp/gaussian_test.csv")
    os.unlink("tmp/gaussian_train.csv")

    return km_train, km_test, kernel

def fit(self, X, y):
    X, y = self._prepare_inputs(X, y, dtype=float, ensure_min_samples=2)
    labels = MulticlassLabels(y)
    self._lmnn = shogun_LMNN(RealFeatures(X.T), labels, self.k)
    self._lmnn.set_maxiter(self.max_iter)
    self._lmnn.set_obj_threshold(self.convergence_tol)
    self._lmnn.set_regularization(self.regularization)
    self._lmnn.set_stepsize(self.learn_rate)
    if self.use_pca:
        self._lmnn.train()
    else:
        self._lmnn.train(np.eye(X.shape[1]))
    self.transformer_ = self._lmnn.get_linear_transform(X)
    return self

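# A hedged usage sketch for the fit() method above, assuming it belongs to a
# scikit-learn style LMNN wrapper. The class name LMNN_Wrapper and its
# constructor arguments are hypothetical; only fit() is shown in this section:
import numpy as np

X = np.random.rand(40, 5)          # 40 samples, 5 features
y = np.repeat(np.arange(4.0), 10)  # 4 classes, 10 samples each
metric = LMNN_Wrapper(k=3, max_iter=1000, convergence_tol=1e-5,
                      regularization=0.5, learn_rate=1e-7, use_pca=False)
metric.fit(X, y)
print(metric.transformer_)         # learned linear transform
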
def classifier_multiclasslibsvm_modular(fm_train_real=traindat, fm_test_real=testdat, label_train_multiclass=label_traindat, width=2.1, C=1, epsilon=1e-5):
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import GaussianKernel
    from modshogun import MulticlassLibSVM

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    kernel = GaussianKernel(feats_train, feats_train, width)

    labels = MulticlassLabels(label_train_multiclass)

    svm = MulticlassLibSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    kernel.init(feats_train, feats_test)
    predictions = svm.apply()
    return predictions, svm, predictions.get_labels()

def RunLinearRegressionShogun(q):
    totalTimer = Timer()

    # Load input dataset.
    # If the dataset contains two files then the second file is the responses
    # file.
    try:
        Log.Info("Loading dataset", self.verbose)
        if len(self.dataset) == 2:
            testSet = np.genfromtxt(self.dataset[1], delimiter=',')

        # Use the last row of the training set as the responses.
        X, y = SplitTrainData(self.dataset)

        if len(options) > 0:
            Log.Fatal("Unknown parameters: " + str(options))
            raise Exception("unknown parameters")

        with totalTimer:
            # Perform linear regression.
            model = LeastSquaresRegression(RealFeatures(X.T), RegressionLabels(y))
            model.train()
            b = model.get_w()

            if len(self.dataset) == 2:
                pred = model.apply(RealFeatures(testSet.T))
                self.predictions = pred.get_labels()
    except Exception as e:
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time

def compute_output_plot_isolines_sine(classifier, kernel, train, regression=False):
    from numpy.random import rand
    from modshogun import RealFeatures

    x = 4 * rand(1, 500) - 2
    x.sort()

    test = RealFeatures(x)
    kernel.init(train, test)

    if regression:
        y = classifier.apply().get_labels()
    else:
        y = classifier.apply().get_values()
    return x, y

def transfer_multitask_logistic_regression(fm_train=traindat, fm_test=testdat, label_train=label_traindat):
    from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup, MultitaskLogisticRegression
    from numpy import hstack

    features = RealFeatures(hstack((traindat, traindat)))
    labels = BinaryLabels(hstack((label_train, label_train)))

    n_vectors = features.get_num_vectors()
    task_one = Task(0, n_vectors // 2)
    task_two = Task(n_vectors // 2, n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)

    mtlr = MultitaskLogisticRegression(0.1, features, labels, task_group)
    mtlr.set_regularization(1)  # use regularization ratio
    mtlr.set_tolerance(1e-2)  # use 1e-2 tolerance
    mtlr.train()
    mtlr.set_current_task(0)
    out = mtlr.apply().get_labels()

    return out

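# Synthetic-data sketch for transfer_multitask_logistic_regression above.
# traindat and label_traindat are module-level defaults in these examples;
# here they are stood in with random data (an assumption for illustration):
from numpy import random

traindat = random.rand(5, 60)                        # 5 features, 60 vectors
label_traindat = 2.0*(random.rand(60) > 0.5) - 1.0   # +/-1 binary labels
out = transfer_multitask_logistic_regression(traindat, traindat, label_traindat)
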
def converter_diffusionmaps_modular(data_fname, t):
    try:
        from modshogun import RealFeatures, DiffusionMaps, GaussianKernel, CSVFile

        features = RealFeatures(CSVFile(data_fname))

        converter = DiffusionMaps()
        converter.set_target_dim(1)
        converter.set_kernel(GaussianKernel(10, 10.0))
        converter.set_t(t)
        converter.apply(features)

        return features
    except ImportError:
        print('No Eigen3 available')

def stochasticgbmachine_modular(train=traindat,train_labels=label_traindat,ft=feat_types):
    try:
        from modshogun import RealFeatures, RegressionLabels, CSVFile, CARTree, StochasticGBMachine, SquaredLoss
        import numpy as np
    except ImportError:
        print("Could not import Shogun modules")
        return

    # wrap features and labels into Shogun objects
    feats=RealFeatures(CSVFile(train))
    labels=RegressionLabels(CSVFile(train_labels))

    # divide into training (90%) and test dataset (10%)
    p=np.random.permutation(labels.get_num_labels())
    num=int(labels.get_num_labels()*0.9)

    cart=CARTree()
    cart.set_feature_types(ft)
    cart.set_max_depth(1)
    loss=SquaredLoss()
    s=StochasticGBMachine(cart,loss,500,0.01,0.6)

    # train
    feats.add_subset(np.int32(p[0:num]))
    labels.add_subset(np.int32(p[0:num]))
    s.set_labels(labels)
    s.train(feats)
    feats.remove_subset()
    labels.remove_subset()

    # apply
    feats.add_subset(np.int32(p[num:len(p)]))
    labels.add_subset(np.int32(p[num:len(p)]))
    output=s.apply_regression(feats)
    feats.remove_subset()
    labels.remove_subset()

    return s,output

def statistics_hsic (n, difference, angle):
    from modshogun import RealFeatures
    from modshogun import DataGenerator
    from modshogun import GaussianKernel
    from modshogun import HSIC
    from modshogun import BOOTSTRAP, HSIC_GAMMA
    from modshogun import EuclideanDistance
    from modshogun import Math, Statistics, IntVector
    from numpy import array

    # init seed for reproducibility
    Math.init_random(1)

    # note that the HSIC has to store kernel matrices,
    # which upper bounds the sample size

    # use data generator class to produce example data
    data=DataGenerator.generate_sym_mix_gauss(n,difference,angle)
    #plot(data[0], data[1], 'x');show()

    # create shogun feature representation
    features_x=RealFeatures(array([data[0]]))
    features_y=RealFeatures(array([data[1]]))

    # compute median data distance in order to use for Gaussian kernel width:
    # 0.5*median_distance normally (factor two in Gaussian kernel);
    # however, Shogun's kernel width is parametrized differently,
    # therefore 0.5*2*median_distance^2.
    # Use a subset of data for that, only 200 elements. Median is stable.
    subset=IntVector.randperm_vec(features_x.get_num_vectors())
    subset=subset[0:200]

    features_x.add_subset(subset)
    dist=EuclideanDistance(features_x, features_x)
    distances=dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance=Statistics.matrix_median(distances, True)
    sigma_x=median_distance**2

    features_y.add_subset(subset)
    dist=EuclideanDistance(features_y, features_y)
    distances=dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance=Statistics.matrix_median(distances, True)
    sigma_y=median_distance**2

    #print("median distance for Gaussian kernel on x:", sigma_x)
    #print("median distance for Gaussian kernel on y:", sigma_y)
    kernel_x=GaussianKernel(10,sigma_x)
    kernel_y=GaussianKernel(10,sigma_y)

    hsic=HSIC(kernel_x,kernel_y,features_x,features_y)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05 using different methods to approximate the
    # null-distribution
    statistic=hsic.compute_statistic()
    #print("HSIC:", statistic)
    alpha=0.05

    #print("computing p-value using bootstrapping")
    hsic.set_null_approximation_method(BOOTSTRAP)
    # normally, at least 250 iterations should be done, but that takes long
    hsic.set_bootstrap_iterations(100)
    # bootstrapping allows usage of unbiased or biased statistic
    p_value_boot=hsic.compute_p_value(statistic)
    thresh_boot=hsic.compute_threshold(alpha)
    #print("p_value:", p_value_boot)
    #print("threshold for 0.05 alpha:", thresh_boot)
    #print("p_value <", alpha, ", i.e. test says p and q are dependent:", p_value_boot<alpha)

    #print("computing p-value using gamma method")
    hsic.set_null_approximation_method(HSIC_GAMMA)
    p_value_gamma=hsic.compute_p_value(statistic)
    thresh_gamma=hsic.compute_threshold(alpha)
    #print("p_value:", p_value_gamma)
    #print("threshold for 0.05 alpha:", thresh_gamma)
    #print("p_value <", alpha, ", i.e. test says p and q are dependent:", p_value_gamma<alpha)

    # sample from null distribution (these may be plotted or whatsoever);
    # mean should be close to zero, variance strongly depends on data/kernel
    # bootstrapping, biased statistic
    #print("sampling null distribution using bootstrapping")
    hsic.set_null_approximation_method(BOOTSTRAP)
    hsic.set_bootstrap_iterations(100)
    null_samples=hsic.bootstrap_null()
    #print("null mean:", mean(null_samples))
    #print("null variance:", var(null_samples))
    #hist(null_samples, 100); show()

    return p_value_boot, thresh_boot, p_value_gamma, thresh_gamma, statistic, null_samples

def predict_new_data(graph_file, cons_file, tri_file, other_feature_file):
    import os
    import numpy as np
    from urllib.request import urlretrieve  # Python 3; under Python 2 use urllib.urlretrieve
    from modshogun import CombinedKernel, CombinedFeatures, RealFeatures
    from modshogun import GaussianKernel, SerializableAsciiFile, MKLClassification

    print("reading extracted features")
    graph_feature = read_feature_data(graph_file)
    graph_feature = get_normalized_given_max_min(graph_feature, "models/grtaph_max_size")
    cons_feature = read_feature_data(cons_file)
    cons_feature = get_normalized_given_max_min(cons_feature, "models/cons_max_size")
    CC_feature = read_feature_data(tri_file)
    CC_feature = get_normalized_given_max_min(CC_feature, "models/tri_max_size")
    ATOS_feature = read_feature_data(other_feature_file)
    ATOS_feature = get_normalized_given_max_min(ATOS_feature, "models/alu_max_size")

    width, C, epsilon, num_threads, mkl_epsilon, mkl_norm = 0.5, 1.2, 1e-5, 1, 0.001, 3.5

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()
    # pdb.set_trace()

    subkfeats_train = RealFeatures()
    subkfeats_test = RealFeatures(np.transpose(np.array(graph_feature)))
    subkernel = GaussianKernel(10, width)
    feats_test.append_feature_obj(subkfeats_test)
    fstream = SerializableAsciiFile("models/graph.dat", "r")
    status = subkfeats_train.load_serializable(fstream)
    feats_train.append_feature_obj(subkfeats_train)
    kernel.append_kernel(subkernel)

    subkfeats_train = RealFeatures()
    subkfeats_test = RealFeatures(np.transpose(np.array(cons_feature)))
    subkernel = GaussianKernel(10, width)
    feats_test.append_feature_obj(subkfeats_test)
    fstream = SerializableAsciiFile("models/cons.dat", "r")
    status = subkfeats_train.load_serializable(fstream)
    feats_train.append_feature_obj(subkfeats_train)
    kernel.append_kernel(subkernel)

    subkfeats_train = RealFeatures()
    subkfeats_test = RealFeatures(np.transpose(np.array(CC_feature)))
    subkernel = GaussianKernel(10, width)
    feats_test.append_feature_obj(subkfeats_test)
    fstream = SerializableAsciiFile("models/tri.dat", "r")
    status = subkfeats_train.load_serializable(fstream)
    feats_train.append_feature_obj(subkfeats_train)
    kernel.append_kernel(subkernel)

    subkfeats_train = RealFeatures()
    subkfeats_test = RealFeatures(np.transpose(np.array(ATOS_feature)))
    subkernel = GaussianKernel(10, width)
    feats_test.append_feature_obj(subkfeats_test)
    fstream = SerializableAsciiFile("models/alu.dat", "r")
    status = subkfeats_train.load_serializable(fstream)
    feats_train.append_feature_obj(subkfeats_train)
    kernel.append_kernel(subkernel)

    model_file = "models/mkl.dat"
    if not os.path.exists(model_file):
        print("downloading model file")
        url_add = "http://rth.dk/resources/mirnasponge/data/mkl.dat"
        urlretrieve(url_add, model_file)
    print("loading trained model")
    fstream = SerializableAsciiFile("models/mkl.dat", "r")
    new_mkl = MKLClassification()
    status = new_mkl.load_serializable(fstream)

    print("model predicting")
    kernel.init(feats_train, feats_test)
    new_mkl.set_kernel(kernel)
    y_out = new_mkl.apply().get_labels()

    return y_out

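# The next snippet relies on two helpers not shown here: COLS (a list of
# matplotlib color codes, one per class) and sandwich_data(). A hedged sketch
# under the assumption that the data is the classic "sandwich" layout of
# class-wise horizontal layers; the exact geometry is illustrative:
COLS = ['r', 'g', 'b', 'c', 'm', 'y']

def sandwich_data(num_classes=6, points_per_class=9):
    import numpy
    x = numpy.zeros((num_classes*points_per_class, 2))
    y = numpy.zeros(num_classes*points_per_class)
    for c in range(num_classes):
        idx = slice(c*points_per_class, (c+1)*points_per_class)
        x[idx, 0] = numpy.linspace(-4, 2, points_per_class)                    # spread along x
        x[idx, 1] = c - 2.5 + 0.1*numpy.random.randn(points_per_class)         # one layer per class
        y[idx] = c
    return x, y
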
def plot_data(x,y,axis):
    for idx,val in enumerate(numpy.unique(y)):
        xi = x[y==val]
        axis.scatter(xi[:,0], xi[:,1], s=50, facecolors='none', edgecolors=COLS[idx])

def plot_neighborhood_graph(x, nn, axis):
    # note: y and COLS are taken from the enclosing scope
    for i in range(x.shape[0]):
        xs = [x[i,0], x[nn[1,i], 0]]
        ys = [x[i,1], x[nn[1,i], 1]]
        axis.plot(xs, ys, COLS[int(y[i])])

figure, axarr = pyplot.subplots(3, 1)
x, y = sandwich_data()

features = RealFeatures(x.T)
labels = MulticlassLabels(y)

print('%d vectors with %d features' % (features.get_num_vectors(), features.get_num_features()))
assert(features.get_num_vectors() == labels.get_num_labels())

distance = EuclideanDistance(features, features)
k = 2
knn = KNN(k, distance, labels)

plot_data(x, y, axarr[0])
plot_neighborhood_graph(x, knn.nearest_neighbors(), axarr[0])
axarr[0].set_aspect('equal')
axarr[0].set_xlim(-6, 4)
axarr[0].set_ylim(-3, 2)

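# The next snippet calls diagonal_lmnn, which is not defined there. A minimal
# sketch, assuming it mirrors the diagonal LMNN setup used elsewhere in these
# examples (set_diagonal(True), identity initialization; the k default is an
# assumption):
def diagonal_lmnn(features, labels, max_iter=10000, k=3):
    import numpy
    from modshogun import LMNN

    lmnn = LMNN(features, labels, k)
    lmnn.set_diagonal(True)  # restrict the learned transform to a diagonal matrix
    lmnn.set_maxiter(max_iter)
    lmnn.train(numpy.eye(features.get_num_features()))
    return lmnn
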
# (reconstructed def line: the original helper's signature was missing from the
#  source; the parameters below are the free names used in the body)
def knn_test_error(k, distance, train_labels, testdat):
    knn = KNN(k, distance, train_labels)
    knn.train()

    test_features, test_labels = testdat.features, testdat.labels

    predicted_labels = knn.apply(test_features)
    evaluator = MulticlassAccuracy()
    acc = evaluator.evaluate(predicted_labels, test_labels)
    err = 1-acc

    return err

features_file = '../data/fm_ape_gut.txt'
labels_file = '../data/label_ape_gut.txt'

features = RealFeatures(CSVFile(features_file))
labels = MulticlassLabels(CSVFile(labels_file))

# reduce the number of features to use so that the training is faster but still
# the results of feature selection are significant
fm = features.get_feature_matrix()
features = RealFeatures(fm[:500, :])

assert(features.get_num_vectors() == labels.get_num_labels())

print('Number of examples = %d, number of features = %d.' % (features.get_num_vectors(), features.get_num_features()))

visualize_tdsne(features, labels)
lmnn = diagonal_lmnn(features, labels, max_iter=1200)

diagonal_transform = lmnn.get_linear_transform()

#!/usr/bin/python

from scipy import io
data_dict = io.loadmat('../data/NBData20_train_preprocessed.mat')

xt = data_dict['xt']
yt = data_dict['yt']

import numpy
from modshogun import RealFeatures,MulticlassLabels,LMNN,MSG_DEBUG

features = RealFeatures(xt.T)
labels = MulticlassLabels(numpy.squeeze(yt))

k = 6
lmnn = LMNN(features,labels,k)
lmnn.io.set_loglevel(MSG_DEBUG)
lmnn.set_diagonal(True)
lmnn.set_maxiter(10000)
lmnn.train(numpy.eye(features.get_num_features()))

def feature_function():
    from modshogun import RealFeatures
    from modshogun import CSVFile
    import numpy as np

    # 3x3 random matrix
    feat_arr = np.random.rand(3, 3)

    # initialize RealFeatures from numpy array
    features = RealFeatures(feat_arr)

    # get the feature matrix
    print(features.get_feature_matrix())

    # get selected column of matrix
    print(features.get_feature_vector(1))

    # get number of rows (features)
    print(features.get_num_features())

    # get number of columns (vectors)
    print(features.get_num_vectors())

    feats_from_csv = RealFeatures(CSVFile("csv/feature.csv"))
    print("csv is ", feats_from_csv.get_feature_matrix())

#!/usr/bin/env python2.7
#
# This software is distributed under BSD 3-clause license (see LICENSE file).
#
# Copyright (C) 2014 Thoralf Klein
#

from modshogun import RealFeatures, BinaryLabels, LibLinear
from numpy import random, mean

X_train = RealFeatures(random.randn(30, 100))
Y_train = BinaryLabels(random.randn(X_train.get_num_vectors()))

svm = LibLinear(1.0, X_train, Y_train)
svm.train()

Y_pred = svm.apply_binary(X_train)
print "accuracy:", mean(Y_train.get_labels() == Y_pred.get_labels())

#!/usr/bin/python

from modshogun import CSVFile, RealFeatures, RescaleFeatures
from scipy.linalg import solve_triangular, cholesky, sqrtm, inv
import matplotlib.pyplot as pyplot
import numpy

# load wine features
features = RealFeatures(CSVFile('../data/fm_wine.dat'))

print('%d vectors with %d features.' % (features.get_num_vectors(), features.get_num_features()))
print('original features mean = ' + str(numpy.mean(features, axis=1)))

# rescale the features to [0,1]
feature_rescaling = RescaleFeatures()
feature_rescaling.init(features)
features.add_preprocessor(feature_rescaling)
features.apply_preprocessor()

print('mean after rescaling = ' + str(numpy.mean(features, axis=1)))

# remove mean from data
data = features.get_feature_matrix()
data = data.T
data -= numpy.mean(data, axis=0)
print(numpy.mean(data, axis=0))

fig, axarr = pyplot.subplots(1,2)
axarr[0].matshow(numpy.cov(data.T))

#### whiten data

def serialization_complex_example (num=5, dist=1, dim=10, C=2.0, width=10):
    import os
    from numpy import concatenate, zeros, ones
    from numpy.random import randn, seed
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import GMNPSVM
    from modshogun import GaussianKernel
    try:
        from modshogun import SerializableHdf5File,SerializableAsciiFile, \
            SerializableJsonFile,SerializableXmlFile,MSG_DEBUG
    except ImportError:
        return
    from modshogun import NormOne, LogPlusOne

    seed(17)

    data=concatenate((randn(dim, num), randn(dim, num) + dist,
                      randn(dim, num) + 2*dist,
                      randn(dim, num) + 3*dist), axis=1)
    lab=concatenate((zeros(num), ones(num), 2*ones(num), 3*ones(num)))

    feats=RealFeatures(data)
    #feats.io.set_loglevel(MSG_DEBUG)
    #feats.io.enable_file_and_line()
    kernel=GaussianKernel(feats, feats, width)

    labels=MulticlassLabels(lab)

    svm = GMNPSVM(C, kernel, labels)

    feats.add_preprocessor(NormOne())
    feats.add_preprocessor(LogPlusOne())
    feats.set_preprocessed(1)
    svm.train(feats)
    bias_ref = svm.get_svm(0).get_bias()

    #svm.print_serializable()

    fstream = SerializableHdf5File("blaah.h5", "w")
    status = svm.save_serializable(fstream)
    check_status(status,'h5')

    fstream = SerializableAsciiFile("blaah.asc", "w")
    status = svm.save_serializable(fstream)
    check_status(status,'asc')

    fstream = SerializableJsonFile("blaah.json", "w")
    status = svm.save_serializable(fstream)
    check_status(status,'json')

    fstream = SerializableXmlFile("blaah.xml", "w")
    status = svm.save_serializable(fstream)
    check_status(status,'xml')

    fstream = SerializableHdf5File("blaah.h5", "r")
    new_svm=GMNPSVM()
    status = new_svm.load_serializable(fstream)
    check_status(status,'h5')
    new_svm.train()
    bias_h5 = new_svm.get_svm(0).get_bias()

    fstream = SerializableAsciiFile("blaah.asc", "r")
    new_svm=GMNPSVM()
    status = new_svm.load_serializable(fstream)
    check_status(status,'asc')
    new_svm.train()
    bias_asc = new_svm.get_svm(0).get_bias()

    fstream = SerializableJsonFile("blaah.json", "r")
    new_svm=GMNPSVM()
    status = new_svm.load_serializable(fstream)
    check_status(status,'json')
    new_svm.train()
    bias_json = new_svm.get_svm(0).get_bias()

    fstream = SerializableXmlFile("blaah.xml", "r")
    new_svm=GMNPSVM()
    status = new_svm.load_serializable(fstream)
    check_status(status,'xml')
    new_svm.train()
    bias_xml = new_svm.get_svm(0).get_bias()

    os.unlink("blaah.h5")
    os.unlink("blaah.asc")
    os.unlink("blaah.json")
    os.unlink("blaah.xml")

    return svm,new_svm, bias_ref, bias_h5, bias_asc, bias_json, bias_xml

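# `check_status` is not defined in this snippet; a minimal sketch matching how
# it is used above (save_serializable/load_serializable return a success flag;
# the message format is an assumption):
def check_status(status, suffix):
    assert status, "serialization with %s backend failed" % suffix
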
def hsic_graphical():
    from numpy import array, zeros, pi, int32, floor, random
    import numpy as np
    from numpy.random import gamma
    from pylab import figure, subplot, gca, grid, plot, title, xlabel, ylabel, hist, axvline, subplots_adjust
    from matplotlib.ticker import MaxNLocator
    from modshogun import RealFeatures, DataGenerator, GaussianKernel, HSIC
    from modshogun import EuclideanDistance, PERMUTATION, HSIC_GAMMA

    # parameters, change to get different results
    m=250
    difference=3

    # setting the angle lower makes a harder test
    angle=pi/30

    # number of samples taken from null and alternative distribution
    num_null_samples=500

    # use data generator class to produce example data
    data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)

    # create shogun feature representation
    features_x=RealFeatures(array([data[0]]))
    features_y=RealFeatures(array([data[1]]))

    # compute median data distance in order to use for Gaussian kernel width:
    # 0.5*median_distance normally (factor two in Gaussian kernel);
    # however, Shogun's kernel width is parametrized differently,
    # therefore 0.5*2*median_distance^2.
    # Use a subset of data for that, only 200 elements. Median is stable.
    subset=int32(array([x for x in range(features_x.get_num_vectors())]))
    subset=random.permutation(subset)
    subset=subset[0:200]

    features_x.add_subset(subset)
    dist=EuclideanDistance(features_x, features_x)
    distances=dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance=np.median(distances)
    sigma_x=median_distance**2

    features_y.add_subset(subset)
    dist=EuclideanDistance(features_y, features_y)
    distances=dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance=np.median(distances)
    sigma_y=median_distance**2

    print("median distance for Gaussian kernel on x:", sigma_x)
    print("median distance for Gaussian kernel on y:", sigma_y)
    kernel_x=GaussianKernel(10,sigma_x)
    kernel_y=GaussianKernel(10,sigma_y)

    # create HSIC instance. Note that this is a convenience constructor which
    # copies feature data; features_x and features_y are not the ones used in
    # hsic. This is only for user-friendliness; usually it's ok to do this.
    # Below, the alternative distribution is sampled, which means that new
    # feature objects have to be created in each iteration (slow).
    # However, normally, the alternative distribution is not sampled.
    hsic=HSIC(kernel_x,kernel_y,features_x,features_y)

    # sample alternative distribution
    alt_samples=zeros(num_null_samples)
    for i in range(len(alt_samples)):
        data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)
        features_x.set_feature_matrix(array([data[0]]))
        features_y.set_feature_matrix(array([data[1]]))

        # re-create hsic instance every time since feature objects are copied
        # due to usage of the convenience constructor
        hsic=HSIC(kernel_x,kernel_y,features_x,features_y)
        alt_samples[i]=hsic.compute_statistic()

    # sample from null distribution
    # permutation, biased statistic
    hsic.set_null_approximation_method(PERMUTATION)
    hsic.set_num_null_samples(num_null_samples)
    null_samples_boot=hsic.sample_null()

    # fit gamma distribution, biased statistic
    hsic.set_null_approximation_method(HSIC_GAMMA)
    gamma_params=hsic.fit_null_gamma()
    # sample gamma with parameters
    null_samples_gamma=array([gamma(gamma_params[0], gamma_params[1]) for _ in range(num_null_samples)])

    # plot
    figure()

    # plot data x and y
    subplot(2,2,1)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of y-ticks
    grid(True)
    plot(data[0], data[1], 'o')
    title('Data, rotation=$\pi$/'+str(1/angle*pi)+'\nm='+str(m))
    xlabel('$x$')
    ylabel('$y$')

    # compute threshold for test level
    alpha=0.05
    null_samples_boot.sort()
    null_samples_gamma.sort()
    thresh_boot=null_samples_boot[int(floor(len(null_samples_boot)*(1-alpha)))]
    thresh_gamma=null_samples_gamma[int(floor(len(null_samples_gamma)*(1-alpha)))]
    # type I error: fraction of null samples beyond the rejection threshold
    type_one_error_boot=sum(null_samples_boot>thresh_boot)/float(num_null_samples)
    type_one_error_gamma=sum(null_samples_gamma>thresh_gamma)/float(num_null_samples)

    # plot alternative distribution with threshold
    subplot(2,2,2)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range=[min([min(null_samples_boot), min(null_samples_gamma)]),
                max([max(null_samples_boot), max(null_samples_gamma)])]

    # plot null distribution with threshold
    subplot(2,2,3)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Sampled Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot))

    # plot null distribution gamma
    subplot(2,2,4)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(null_samples_gamma, 20, range=hist_range, normed=True)
    axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma))
    grid(True)

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)