def createFeatures(self, examples):
    """Converts numpy arrays or sequences into shogun features"""

    if self.kparam['name'] == 'gauss' or self.kparam['name'] == 'linear' or self.kparam['name'] == 'poly':
        examples = numpy.array(examples)
        feats = RealFeatures(examples)

    elif self.kparam['name'] == 'wd' or self.kparam['name'] == 'localalign' or self.kparam['name'] == 'localimprove':
        #examples = non_atcg_convert(examples, nuc_con)
        feats = StringCharFeatures(examples, DNA)

    elif self.kparam['name'] == 'spec':
        #examples = non_atcg_convert(examples, nuc_con)
        feats = StringCharFeatures(examples, DNA)

        wf = StringUlongFeatures(feats.get_alphabet())
        wf.obtain_from_char(feats, self.kparam['degree']-1, self.kparam['degree'], 0, self.kparam['name']=='cumspec')
        del feats

        # train_mode and preproc are assumed to be instance attributes here
        # (they are parameters in the standalone create_features below)
        if self.train_mode:
            self.preproc = SortUlongString()
            self.preproc.init(wf)
        wf.add_preproc(self.preproc)
        ret = wf.apply_preproc()
        feats = wf

    else:
        print 'Unknown kernel %s' % self.kparam['name']
        raise ValueError

    return feats
def features_simple_modular(A=matrixA, B=matrixB, C=matrixC):
    a=RealFeatures(A)
    b=LongIntFeatures(B)
    c=ByteFeatures(C)

    # or 16bit wide ...
    #feat1 = f.ShortFeatures(N.zeros((10,5),N.short))
    #feat2 = f.WordFeatures(N.zeros((10,5),N.uint16))

    # print some statistics about a

    # get first feature vector and set it
    a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0)

    # get matrices
    a_out = a.get_feature_matrix()
    b_out = b.get_feature_matrix()
    c_out = c.get_feature_matrix()

    assert(all(a_out==A))
    assert(all(b_out==B))
    assert(all(c_out==C))
    return a_out,b_out,c_out,a,b,c
def prune_var_sub_mean():
    print 'PruneVarSubMean'
    from shogun.Kernel import Chi2Kernel
    from shogun.Features import RealFeatures
    from shogun.PreProc import PruneVarSubMean

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    preproc=PruneVarSubMean()
    preproc.init(feats_train)
    feats_train.add_preproc(preproc)
    feats_train.apply_preproc()
    feats_test.add_preproc(preproc)
    feats_test.apply_preproc()

    width=1.4
    size_cache=10

    kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)

    km_train=kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
def norm_one():
    print 'NormOne'
    from shogun.Kernel import Chi2Kernel
    from shogun.Features import RealFeatures
    from shogun.PreProc import NormOne

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    preproc=NormOne()
    preproc.init(feats_train)
    feats_train.add_preproc(preproc)
    feats_train.apply_preproc()
    feats_test.add_preproc(preproc)
    feats_test.apply_preproc()

    width=1.4
    size_cache=10

    kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)

    km_train=kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
def features_dense_zero_copy_modular(in_data=data):
    feats = None
    if numpy.__version__ >= '1.5':
        feats=numpy.array(in_data, dtype=float64, order='F')

        a=RealFeatures()
        a.frombuffer(feats, False)

        b=numpy.array(a, copy=False)
        c=numpy.array(a, copy=True)

        d=RealFeatures()
        d.frombuffer(a, False)

        e=RealFeatures()
        e.frombuffer(a, True)

        a[:,0]=0
        print a[0:4]
        print b[0:4]
        print c[0:4]
        print d[0:4]
        print e[0:4]
    else:
        print "numpy version >= 1.5 is needed"

    return feats
def modelselection_grid_search_kernel():
    num_subsets=3
    num_vectors=20
    dim_vectors=3

    # create some (nonsense) data
    matrix=rand(dim_vectors, num_vectors)

    # create num_vectors vectors of dimension dim_vectors
    features=RealFeatures()
    features.set_feature_matrix(matrix)

    # create labels, two classes
    labels=BinaryLabels(num_vectors)
    for i in range(num_vectors):
        labels.set_label(i, 1 if i%2==0 else -1)

    # create svm
    classifier=LibSVM()

    # splitting strategy
    splitting_strategy=StratifiedCrossValidationSplitting(labels, num_subsets)

    # accuracy evaluation
    evaluation_criterion=ContingencyTableEvaluation(ACCURACY)

    # cross-validation class for evaluation in model selection
    cross=CrossValidation(classifier, features, labels, splitting_strategy, evaluation_criterion)
    cross.set_num_runs(1)

    # print all parameters available for model selection
    # (don't worry if yours is not included; simply write to the mailing list)
    classifier.print_modsel_params()

    # model parameter selection (a sketch of create_param_tree follows this function)
    param_tree=create_param_tree()
    param_tree.print_tree()

    grid_search=GridSearchModelSelection(param_tree, cross)

    print_state=True
    best_combination=grid_search.select_model(print_state)
    print("best parameter(s):")
    best_combination.print_tree()

    best_combination.apply_to_machine(classifier)

    # larger number of runs to get tighter confidence intervals
    cross.set_num_runs(10)
    cross.set_conf_int_alpha(0.01)
    result=cross.evaluate()
    print("result: ")
    result.print_result()

    return 0
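# The create_param_tree() helper called above is not defined in this section.
# What follows is a hypothetical minimal sketch, assuming the
# ModelSelectionParameters API used in modelselection_grid_search_linear_modular
# below: it builds a tree that varies the SVM parameters C1 and C2 over an
# exponential range.
def create_param_tree():
    from shogun.ModelSelection import ModelSelectionParameters, R_EXP
    root=ModelSelectionParameters()
    c1=ModelSelectionParameters("C1")
    root.append_child(c1)
    c1.build_values(-2.0, 2.0, R_EXP)  # assumed range: 10^-2 .. 10^2
    c2=ModelSelectionParameters("C2")
    root.append_child(c2)
    c2.build_values(-2.0, 2.0, R_EXP)
    return root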
def distance_mahalanobis_modular(fm_train_real=traindat, fm_test_real=testdat):
    from shogun.Features import RealFeatures
    from shogun.Distance import MahalanobisDistance

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    distance = MahalanobisDistance(feats_test, feats_train)
    for i in range(feats_test.get_num_vectors()):
        for j in range(feats_train.get_num_vectors()):
            dm = distance.distance(i, j)
            print dm
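# The Mahalanobis distance is d(x,y) = sqrt((x-y)^T S^{-1} (x-y)), with S the
# covariance matrix estimated from the training data. A hedged numpy sketch that
# cross-checks a single entry (exact agreement depends on Shogun's covariance
# estimator conventions):
import numpy as np
def mahalanobis_check(traindat, testdat):
    S_inv = np.linalg.inv(np.cov(traindat))    # traindat is dims x samples
    diff = testdat[:, 0] - traindat[:, 0]
    return np.sqrt(diff.dot(S_inv).dot(diff))  # compare to distance.distance(0, 0)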
def distance_braycurtis_modular(fm_train_real=traindat, fm_test_real=testdat):
    from shogun.Features import RealFeatures
    from shogun.Distance import BrayCurtisDistance

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    distance = BrayCurtisDistance(feats_train, feats_train)
    dm_train = distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test = distance.get_distance_matrix()
    return distance, dm_train, dm_test
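# Bray-Curtis distance between nonnegative vectors is
# d(x,y) = sum_i |x_i - y_i| / sum_i (x_i + y_i). A small numpy sketch that
# should reproduce one entry of dm_train above (hedged; assumes nonnegative data):
import numpy as np
def braycurtis_check(traindat):
    x, y = traindat[:, 0], traindat[:, 1]
    return np.abs(x - y).sum() / (x + y).sum()  # compare to dm_train[0, 1]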
def kernel_sigmoid_modular(fm_train_real=traindat, fm_test_real=testdat, size_cache=10, gamma=1.2, coef0=1.3):
    from shogun.Features import RealFeatures
    from shogun.Kernel import SigmoidKernel

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    kernel=SigmoidKernel(feats_train, feats_train, size_cache, gamma, coef0)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    return km_train,km_test,kernel
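# The sigmoid kernel computes k(x,y) = tanh(gamma*<x,y> + coef0). A numpy
# sketch of one kernel matrix entry using the defaults above (hedged):
import numpy as np
def sigmoid_entry(traindat, gamma=1.2, coef0=1.3):
    x = traindat[:, 0]
    return np.tanh(gamma * x.dot(x) + coef0)  # compare to km_train[0, 0]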
def classifier_gaussiannaivebayes_modular(fm_train_real=traindat, fm_test_real=testdat, label_train_multiclass=label_traindat):
    from shogun.Features import RealFeatures, Labels
    from shogun.Classifier import GaussianNaiveBayes

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    labels = Labels(label_train_multiclass)

    gnb = GaussianNaiveBayes(feats_train, labels)
    gnb_train = gnb.train()
    output = gnb.apply(feats_test).get_labels()
    return gnb, gnb_train, output
def distance_chebyshew_modular(fm_train_real=traindat, fm_test_real=testdat):
    from shogun.Features import RealFeatures
    from shogun.Distance import ChebyshewMetric

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    distance = ChebyshewMetric(feats_train, feats_train)
    dm_train = distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test = distance.get_distance_matrix()
    return distance, dm_train, dm_test
def classifier_multiclass_ecoc_random(fm_train_real=traindat, fm_test_real=testdat, label_train_multiclass=label_traindat, label_test_multiclass=label_testdat, lawidth=2.1, C=1, epsilon=1e-5):
    from shogun.Features import RealFeatures, MulticlassLabels
    from shogun.Classifier import LibLinear, L2R_L2LOSS_SVC, LinearMulticlassMachine
    from shogun.Classifier import ECOCStrategy, ECOCRandomSparseEncoder, ECOCRandomDenseEncoder, ECOCHDDecoder

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    labels = MulticlassLabels(label_train_multiclass)

    classifier = LibLinear(L2R_L2LOSS_SVC)
    classifier.set_epsilon(epsilon)
    classifier.set_bias_enabled(True)

    rnd_dense_strategy = ECOCStrategy(ECOCRandomDenseEncoder(), ECOCHDDecoder())
    rnd_sparse_strategy = ECOCStrategy(ECOCRandomSparseEncoder(), ECOCHDDecoder())

    dense_classifier = LinearMulticlassMachine(rnd_dense_strategy, feats_train, classifier, labels)
    dense_classifier.train()
    label_dense = dense_classifier.apply(feats_test)
    out_dense = label_dense.get_labels()

    sparse_classifier = LinearMulticlassMachine(rnd_sparse_strategy, feats_train, classifier, labels)
    sparse_classifier.train()
    label_sparse = sparse_classifier.apply(feats_test)
    out_sparse = label_sparse.get_labels()

    if label_test_multiclass is not None:
        from shogun.Evaluation import MulticlassAccuracy
        labels_test = MulticlassLabels(label_test_multiclass)
        evaluator = MulticlassAccuracy()
        acc_dense = evaluator.evaluate(label_dense, labels_test)
        acc_sparse = evaluator.evaluate(label_sparse, labels_test)
        print('Random Dense Accuracy  = %.4f' % acc_dense)
        print('Random Sparse Accuracy = %.4f' % acc_sparse)

    return out_sparse, out_dense
def compute_output_plot_isolines(classifier, kernel=None, train=None, sparse=False, pos=None, neg=None, regression=False):
    size=100

    if pos is not None and neg is not None:
        x1_max=max(1.2*pos[0,:])
        x1_min=min(1.2*neg[0,:])
        x2_min=min(1.2*neg[1,:])
        x2_max=max(1.2*pos[1,:])
        x1=linspace(x1_min, x1_max, size)
        x2=linspace(x2_min, x2_max, size)
    else:
        x1=linspace(-5, 5, size)
        x2=linspace(-5, 5, size)

    x, y=meshgrid(x1, x2)

    dense=RealFeatures(array((ravel(x), ravel(y))))
    if sparse:
        test=SparseRealFeatures()
        test.obtain_from_simple(dense)
    else:
        test=dense

    if kernel and train:
        kernel.init(train, test)
    else:
        classifier.set_features(test)

    labels = None
    if regression:
        labels=classifier.apply().get_labels()
    else:
        labels=classifier.apply().get_values()
    z=labels.reshape((size, size))

    return x, y, z
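# A hypothetical usage sketch for compute_output_plot_isolines: given a trained
# kernel machine and its training features (placeholders here, e.g. the LibSVM
# set up in bench_shogun below), evaluate it on the grid and draw the iso-lines
# with matplotlib's pylab interface:
def plot_isolines_example(svm, kernel, feats_train):
    import pylab
    x, y, z = compute_output_plot_isolines(svm, kernel, feats_train)
    pylab.pcolor(x, y, z)                                 # colored decision surface
    pylab.contour(x, y, z, linewidths=1, colors='black')  # iso-lines on top
    pylab.show()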
def kernel_wave_modular(fm_train_real=traindat, fm_test_real=testdat, theta=1.0):
    from shogun.Features import RealFeatures
    from shogun.Kernel import WaveKernel
    from shogun.Distance import EuclideanDistance

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    distance=EuclideanDistance(feats_train, feats_train)

    kernel=WaveKernel(feats_train, feats_train, theta, distance)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    return km_train,km_test,kernel
def statistics_kmm(n,d):
    from shogun.Features import RealFeatures
    from shogun.Features import DataGenerator
    from shogun.Kernel import GaussianKernel, MSG_DEBUG
    from shogun.Statistics import KernelMeanMatching
    from shogun.Mathematics import Math

    # init seed for reproducibility
    Math.init_random(1)
    random.seed(1)

    data = random.randn(d,n)

    # create shogun feature representation
    features=RealFeatures(data)

    # use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
    # which is k(x,y)=exp(-||x-y||^2 / tau), in contrast to the standard
    # k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
    kernel=GaussianKernel(10,8)
    kernel.init(features,features)

    kmm = KernelMeanMatching(kernel,array([0,1,2,3,7,8,9],dtype=int32),array([4,5,6],dtype=int32))
    w = kmm.compute_weights()
    #print w
    return w
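# The comment above describes SHOGUN's Gaussian kernel parametrization
# k(x,y)=exp(-||x-y||^2 / tau) with tau=2*sigma^2. A one-line helper makes the
# conversion explicit (plain arithmetic, no Shogun API assumptions):
def sigma_to_shogun_width(sigma):
    return 2.0 * sigma ** 2  # e.g. sigma=2 -> width 8, as used above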
def kernel_chi2_modular(fm_train_real=traindat, fm_test_real=testdat, width=1.4, size_cache=10):
    from shogun.Kernel import Chi2Kernel
    from shogun.Features import RealFeatures

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    kernel = Chi2Kernel(feats_train, feats_train, width, size_cache)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
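# Shogun's Chi2Kernel presumably computes
# k(x,y) = exp(-(1/width) * sum_i (x_i - y_i)^2 / (x_i + y_i)); a hedged numpy
# sketch of a single entry for cross-checking km_train (nonnegative data assumed):
import numpy as np
def chi2_entry(traindat, width=1.4):
    x, y = traindat[:, 0], traindat[:, 1]
    return np.exp(-((x - y) ** 2 / (x + y)).sum() / width)  # compare to km_train[0, 1]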
def RunLinearRegressionShogun(q):
    totalTimer = Timer()

    # Load input dataset.
    # If the dataset contains two files then the second file is the responses
    # file.
    try:
        Log.Info("Loading dataset", self.verbose)
        if len(self.dataset) == 2:
            X = np.genfromtxt(self.dataset[0], delimiter=',')
            y = np.genfromtxt(self.dataset[1], delimiter=',')
        else:
            X = np.genfromtxt(self.dataset, delimiter=',')
            y = X[:, (X.shape[1] - 1)]
            X = X[:, :-1]

        with totalTimer:
            # Perform linear regression.
            model = LeastSquaresRegression(RealFeatures(X.T), RegressionLabels(y))
            model.train()
            b = model.get_w()
    except Exception as e:
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
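# Least-squares regression solves the normal equations w = (X^T X)^{-1} X^T y,
# which is what LeastSquaresRegression fits above. A self-contained numpy
# cross-check on synthetic data (hedged sketch, names are illustrative):
import numpy as np
def normal_equations_demo():
    X = np.random.rand(10, 3)
    w_true = np.array([1.0, -2.0, 0.5])
    y = X.dot(w_true)
    w = np.linalg.solve(X.T.dot(X), X.T.dot(y))  # recovers w_true exactly here
    return w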
def so_multiclass(fm_train_real=traindat, label_train_multiclass=label_traindat):
    try:
        from shogun.Features import RealFeatures
        from shogun.Loss import HingeLoss
        from shogun.Structure import MulticlassModel, MulticlassSOLabels, PrimalMosekSOSVM, RealNumber
    except ImportError:
        print("Mosek not available")
        return

    labels = MulticlassSOLabels(label_train_multiclass)
    features = RealFeatures(fm_train_real.T)

    model = MulticlassModel(features, labels)
    loss = HingeLoss()
    sosvm = PrimalMosekSOSVM(model, loss, labels)
    sosvm.train()

    out = sosvm.apply()
    count = 0
    for i in xrange(out.get_num_labels()):
        yi_pred = RealNumber.obtain_from_generic(out.get_label(i))
        if yi_pred.value == label_train_multiclass[i]:
            count = count + 1

    print("Correct classification rate: %0.2f" % (100.0*count/out.get_num_labels()))
def prepare_feats(desc, l=2, as_shogun=False):
    if l==2:
        desc = np.sqrt(desc)  # bias not affected by sqrt
    norms = np.apply_along_axis(np.linalg.norm, 0, desc[:-1,:], l)  # leave bias alone
    np.seterr(divide='ignore', invalid='ignore')
    desc[:-1,:]=desc[:-1,:]/norms  # leave bias alone
    np.seterr(divide='warn', invalid='warn')
    if l==1:
        desc=desc[:-1,:]  # removing bias dim if L1 -> nonlinear  TODO find better way...
    desc[np.isnan(desc)]=0  # handle NaNs
    if as_shogun:
        desc=RealFeatures(desc.astype('float'))
    return desc
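# prepare_feats normalizes each column while leaving the last (bias) row
# untouched. A tiny self-contained numpy illustration of that pattern
# (assumes no all-zero feature columns, so no NaN handling is needed):
import numpy as np
def l2_normalize_keep_bias(desc):
    norms = np.apply_along_axis(np.linalg.norm, 0, desc[:-1, :], 2)
    desc[:-1, :] = desc[:-1, :] / norms  # feature rows now unit-length per column
    return desc                          # bias row is unchanged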
def preprocessor_kernelpca_modular(data, threshold, width):
    from shogun.Features import RealFeatures
    from shogun.Preprocessor import KernelPCA
    from shogun.Kernel import GaussianKernel

    features = RealFeatures(data)

    kernel = GaussianKernel(features, features, width)

    preprocessor = KernelPCA(kernel)
    preprocessor.init(features)
    preprocessor.set_target_dim(2)
    #X=preprocessor.get_transformation_matrix()
    X2 = preprocessor.apply_to_feature_matrix(features)

    lx0 = len(X2)
    modified_d1 = zeros((lx0, number_of_points_for_circle1))
    modified_d2 = zeros((lx0, number_of_points_for_circle2))
    modified_d1 = [X2[i][0:number_of_points_for_circle1] for i in range(lx0)]
    modified_d2 = [X2[i][number_of_points_for_circle1:(number_of_points_for_circle1+number_of_points_for_circle2)] for i in range(lx0)]

    p.plot(modified_d1[0][:], modified_d1[1][:], 'o', modified_d2[0][:], modified_d2[1][:], 'x')
    p.title('final data')
    p.show()

    return features
def distance_normsquared_modular(fm_train_real=traindat, fm_test_real=testdat):
    from shogun.Features import RealFeatures
    from shogun.Distance import EuclidianDistance

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    distance=EuclidianDistance(feats_train, feats_train)
    distance.set_disable_sqrt(True)
    dm_train=distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test=distance.get_distance_matrix()
    return distance,dm_train,dm_test
def converter_multidimensionalscaling_modular(data):
    try:
        from shogun.Features import RealFeatures
        from shogun.Converter import MultidimensionalScaling
        from shogun.Distance import EuclideanDistance

        features = RealFeatures(data)

        distance_before = EuclideanDistance()
        distance_before.init(features, features)

        converter = MultidimensionalScaling()
        converter.set_target_dim(2)
        converter.set_landmark(False)
        embedding = converter.apply(features)

        distance_after = EuclideanDistance()
        distance_after.init(embedding, embedding)

        distance_matrix_after = distance_after.get_distance_matrix()
        distance_matrix_before = distance_before.get_distance_matrix()

        return numpy.linalg.norm(distance_matrix_after - distance_matrix_before) / numpy.linalg.norm(distance_matrix_before)
    except ImportError:
        print('No Eigen3 available')
def log_pdf(self, thetas):
    assert(len(shape(thetas)) == 2)
    assert(shape(thetas)[1] == self.dimension)

    result = zeros(len(thetas))
    for i in range(len(thetas)):
        labels = BinaryLabels(self.y)
        feats_train = RealFeatures(self.X.T)

        # ARD: set theta, which is in log-scale, as kernel weights
        kernel = GaussianARDKernel(10, 1)
        kernel.set_weights(exp(thetas[i]))

        mean = ZeroMean()
        likelihood = LogitLikelihood()
        inference = LaplacianInferenceMethod(kernel, feats_train, mean, labels, likelihood)

        # fix kernel scaling for now
        inference.set_scale(exp(0))

        if self.ridge is not None:
            log_ml_estimate = inference.get_marginal_likelihood_estimate(self.n_importance, self.ridge)
        else:
            log_ml_estimate = inference.get_marginal_likelihood_estimate(self.n_importance)

        # prior is also in log-domain, so no exp of theta
        log_prior = self.prior.log_pdf(thetas[i].reshape(1, len(thetas[i])))
        result[i] = log_ml_estimate + log_prior

    return result
def kernel_tstudent_modular(fm_train_real=traindat, fm_test_real=testdat, degree=2.0):
    from shogun.Features import RealFeatures
    from shogun.Kernel import TStudentKernel
    from shogun.Distance import EuclidianDistance

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    distance=EuclidianDistance(feats_train, feats_train)

    kernel=TStudentKernel(feats_train, feats_train, degree, distance)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    return km_train,km_test,kernel
def classifier_conjugateindex_modular(fm_train_real=traindat, fm_test_real=testdat, label_train_multiclass=label_traindat):
    from shogun.Features import RealFeatures, MulticlassLabels
    from shogun.Classifier import ConjugateIndex

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    labels = MulticlassLabels(label_train_multiclass)

    ci = ConjugateIndex(feats_train, labels)
    ci.train()

    res = ci.apply(feats_test).get_labels()
    return ci, res
def kernel_distance_modular(fm_train_real=traindat, fm_test_real=testdat, width=1.7):
    from shogun.Kernel import DistanceKernel
    from shogun.Features import RealFeatures
    from shogun.Distance import EuclidianDistance

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    distance=EuclidianDistance()

    kernel=DistanceKernel(feats_train, feats_test, width, distance)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    return km_train,km_test,kernel
def classifier_multiclassmultipleoutputliblinear_modular(fm_train_real=traindat, fm_test_real=testdat, label_train_multiclass=label_traindat, label_test_multiclass=label_testdat, width=2.1, C=1, epsilon=1e-5):
    from shogun.Features import RealFeatures, MulticlassLabels, MulticlassMultipleOutputLabels
    from shogun.Classifier import MulticlassLibLinear

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    labels=MulticlassLabels(label_train_multiclass)

    classifier = MulticlassLibLinear(C,feats_train,labels)
    classifier.train()

    label_pred = classifier.apply_multiclass_multiple_output(feats_test,2)
    out = label_pred.get_labels()
    #print out
    return out
def kernel_multiquadric_modular(fm_train_real=traindat, fm_test_real=testdat, shift_coef=1.0):
    from shogun.Features import RealFeatures
    from shogun.Kernel import MultiquadricKernel
    from shogun.Distance import EuclidianDistance

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    distance=EuclidianDistance(feats_train, feats_train)

    kernel=MultiquadricKernel(feats_train, feats_train, shift_coef, distance)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    return km_train,km_test,kernel
def modelselection_grid_search_linear_modular(traindat=traindat, label_traindat=label_traindat):
    from shogun.Evaluation import CrossValidation, CrossValidationResult
    from shogun.Evaluation import ContingencyTableEvaluation, ACCURACY
    from shogun.Evaluation import StratifiedCrossValidationSplitting
    from shogun.ModelSelection import GridSearchModelSelection
    from shogun.ModelSelection import ModelSelectionParameters, R_EXP
    from shogun.ModelSelection import ParameterCombination
    from shogun.Features import Labels
    from shogun.Features import RealFeatures
    from shogun.Classifier import LibLinear, L2R_L2LOSS_SVC

    # build parameter tree to select C1 and C2
    param_tree_root = ModelSelectionParameters()
    c1 = ModelSelectionParameters("C1")
    param_tree_root.append_child(c1)
    c1.build_values(-2.0, 2.0, R_EXP)

    c2 = ModelSelectionParameters("C2")
    param_tree_root.append_child(c2)
    c2.build_values(-2.0, 2.0, R_EXP)

    # training data
    features = RealFeatures(traindat)
    labels = Labels(label_traindat)

    # classifier
    classifier = LibLinear(L2R_L2LOSS_SVC)

    # print all parameters available for model selection
    # (don't worry if yours is not included; simply write to the mailing list)
    classifier.print_modsel_params()

    # splitting strategy for cross-validation
    splitting_strategy = StratifiedCrossValidationSplitting(labels, 10)

    # evaluation method
    evaluation_criterium = ContingencyTableEvaluation(ACCURACY)

    # cross-validation instance
    cross_validation = CrossValidation(classifier, features, labels, splitting_strategy, evaluation_criterium)

    # model selection instance
    model_selection = GridSearchModelSelection(param_tree_root, cross_validation)

    # perform model selection with selected methods
    #print "performing model selection of"
    #param_tree_root.print_tree()
    best_parameters = model_selection.select_model()

    # print best parameters
    #print "best parameters:"
    #best_parameters.print_tree()

    # apply them and print result
    best_parameters.apply_to_machine(classifier)
    result = cross_validation.evaluate()
def RunLARSShogun(q):
    totalTimer = Timer()

    # Load input dataset.
    try:
        Log.Info("Loading dataset", self.verbose)
        inputData = np.genfromtxt(self.dataset[0], delimiter=',')
        responsesData = np.genfromtxt(self.dataset[1], delimiter=',')
        inputFeat = RealFeatures(inputData.T)
        responsesFeat = RegressionLabels(responsesData)

        # Get all the parameters.
        lambda1 = re.search("-l (\d+)", options)
        lambda1 = 0.0 if not lambda1 else int(lambda1.group(1))

        with totalTimer:
            # Perform LARS.
            model = LeastAngleRegression(False)
            model.set_max_l1_norm(lambda1)
            model.set_labels(responsesFeat)
            model.train(inputFeat)
            model.get_w(model.get_path_size() - 1)
    except Exception as e:
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def kernel_exponential_modular(fm_train_real=traindat, fm_test_real=testdat, tau_coef=1.0):
    from shogun.Features import RealFeatures
    from shogun.Kernel import ExponentialKernel
    from shogun.Distance import EuclidianDistance

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    distance = EuclidianDistance(feats_train, feats_train)

    kernel=ExponentialKernel(feats_train, feats_train, tau_coef, distance, 10)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    return km_train,km_test,kernel
def RunKMeansShogun(q):
    import numpy as np
    from shogun.Distance import EuclideanDistance
    from shogun.Features import RealFeatures
    from shogun import Clustering
    from shogun.Mathematics import Math_init_random

    totalTimer = Timer()

    if seed:
        Math_init_random(seed.group(1))
    try:
        data = np.genfromtxt(self.dataset, delimiter=',')
        dataFeat = RealFeatures(data.T)
        distance = EuclideanDistance(dataFeat, dataFeat)

        # Create the K-Means object and perform K-Means clustering.
        with totalTimer:
            model = Clustering.KMeans(int(clusters.group(1)), distance)
            model.set_max_iter(maxIterations)
            model.train()

            labels = model.apply().get_labels()
            centers = model.get_cluster_centers()
    except Exception as e:
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def RunGMMShogun(q):
    totalTimer = Timer()

    try:
        # Load input dataset.
        Log.Info("Loading dataset", self.verbose)
        dataPoints = np.genfromtxt(self.dataset, delimiter=',')
        dataFeat = RealFeatures(dataPoints.T)

        # Get all the parameters.
        g = re.search("-g (\d+)", options)
        n = re.search("-n (\d+)", options)
        s = re.search("-s (\d+)", options)  # seed

        g = 1 if not g else int(g.group(1))
        n = 250 if not n else int(n.group(1))

        # Create the Gaussian Mixture Model.
        model = Clustering.GMM(g)
        model.set_features(dataFeat)
        with totalTimer:
            model.train_em(1e-9, n, 1e-9)
    except Exception as e:
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def kernel_cauchy_modular(fm_train_real=traindat, fm_test_real=testdat, sigma=1.0):
    from shogun.Features import RealFeatures
    from shogun.Kernel import CauchyKernel
    from shogun.Distance import EuclidianDistance

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    distance=EuclidianDistance(feats_train, feats_train)

    kernel=CauchyKernel(feats_train, feats_train, sigma, distance)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    return km_train,km_test,kernel
def kernel_wavelet_modular(fm_train_real=traindat, fm_test_real=testdat, dilation=1.5, translation=1.0):
    from shogun.Features import RealFeatures
    from shogun.Kernel import WaveletKernel

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    kernel = WaveletKernel(feats_train, feats_train, 10, dilation, translation)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def create_features(examples, param):
    """
    factory for features

    @param examples: list/array of examples
    @type examples: list

    @return subclass of shogun Features object
    @rtype: Features
    """

    assert(len(examples) > 0)

    feat = None

    #TODO: refactor
    if param and param.flags.has_key("svm_type") and param.flags["svm_type"] == "liblineardual":
        # create hashed promoter features
        return create_hashed_promoter_features(examples, param.flags)

    if param and param.kernel == "Promoter":
        print "creating promoter features"
        # create promoter features
        return create_promoter_features(examples, param.flags)

    # auto-detect string type
    if type(examples[0]) == str:
        # check what alphabet is used
        longstr = ""
        num_seqs = min(len(examples), 20)
        for i in range(num_seqs):
            longstr += examples[i]

        if len(set([letter for letter in longstr])) > 5:
            feat = StringCharFeatures(PROTEIN)
            if param and param.flags.has_key("debug"):
                print "FEATURES: StringCharFeatures(PROTEIN)"
        else:
            feat = StringCharFeatures(DNA)
            if param and param.flags.has_key("debug"):
                print "FEATURES: StringCharFeatures(DNA)"

        feat.set_features(examples)
    else:
        # assume real features
        examples = numpy.array(examples, dtype=numpy.float64)
        examples = numpy.transpose(examples)
        feat = RealFeatures(examples)
        if param and param.flags.has_key("debug"):
            print "FEATURES: RealFeatures"

    return feat
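# Illustrative calls to the factory above; param=None takes the auto-detection
# path (string inputs become StringCharFeatures, numeric ones RealFeatures).
# Assumes the module-level shogun/numpy imports that create_features itself
# relies on; the inputs are placeholders:
def create_features_examples():
    dna_feats = create_features(["ACGT", "TTGA"], None)           # -> StringCharFeatures(DNA)
    real_feats = create_features([[1.0, 2.0], [3.0, 4.0]], None)  # -> RealFeatures
    return dna_feats, real_feats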
def classifier_knn_modular(fm_train_real=traindat, fm_test_real=testdat, label_train_multiclass=label_traindat, k=3):
    from shogun.Features import RealFeatures, Labels
    from shogun.Classifier import KNN
    from shogun.Distance import EuclidianDistance

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)
    distance=EuclidianDistance(feats_train, feats_train)

    labels=Labels(label_train_multiclass)

    knn=KNN(k, distance, labels)
    knn_train = knn.train()
    output=knn.apply(feats_test).get_labels()
    multiple_k=knn.classify_for_multiple_k()
    return knn,knn_train,output,multiple_k
def bench_shogun(X, y, T, valid):
    #
    # .. Shogun ..
    #
    from shogun.Classifier import LibSVM
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel

    start = datetime.now()
    feat = RealFeatures(X.T)
    feat_test = RealFeatures(T.T)
    labels = Labels(y.astype(np.float64))
    kernel = GaussianKernel(feat, feat, sigma)
    shogun_svm = LibSVM(1., kernel, labels)
    shogun_svm.train()
    dec_func = shogun_svm.classify(feat_test).get_labels()
    score = np.mean(np.sign(dec_func) == valid)
    return score, datetime.now() - start
def features_dense_real_modular(A=matrix):
    # ... of type Real, LongInt and Byte
    a = RealFeatures(A)

    # print(some statistics about a)
    # print(a.get_num_vectors())
    # print(a.get_num_features())

    # get first feature vector and set it
    # print(a.get_feature_vector(0))
    a.set_feature_vector(array([1, 4, 0, 0, 0, 9], dtype=float64), 0)

    # get matrix
    a_out = a.get_feature_matrix()

    assert all(a_out == A)
    return a_out
def kernel_anova_modular(fm_train_real=traindat, fm_test_real=testdat, cardinality=2, size_cache=10):
    from shogun.Kernel import ANOVAKernel
    from shogun.Features import RealFeatures

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    kernel=ANOVAKernel(feats_train, feats_train, cardinality, size_cache)

    for i in range(0,feats_train.get_num_vectors()):
        for j in range(0,feats_train.get_num_vectors()):
            k1 = kernel.compute_rec1(i,j)
            k2 = kernel.compute_rec2(i,j)
            #if abs(k1-k2) > 1e-10:
            #    print "|%s|%s|" % (k1, k2)

    km_train=kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def preproc_prunevarsubmean_modular(fm_train_real=traindat, fm_test_real=testdat, width=1.4, size_cache=10):
    from shogun.Kernel import Chi2Kernel
    from shogun.Features import RealFeatures
    from shogun.PreProc import PruneVarSubMean

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    preproc = PruneVarSubMean()
    preproc.init(feats_train)
    feats_train.add_preproc(preproc)
    feats_train.apply_preproc()
    feats_test.add_preproc(preproc)
    feats_test.apply_preproc()

    kernel = Chi2Kernel(feats_train, feats_train, width, size_cache)

    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def preprocessor_randomfouriergausspreproc_modular(fm_train_real=traindat, fm_test_real=testdat, width=1.4, size_cache=10):
    from shogun.Kernel import Chi2Kernel
    from shogun.Features import RealFeatures
    from shogun.Preprocessor import RandomFourierGaussPreproc

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    preproc=RandomFourierGaussPreproc()
    preproc.init(feats_train)
    feats_train.add_preprocessor(preproc)
    feats_train.apply_preprocessor()
    feats_test.add_preprocessor(preproc)
    feats_test.apply_preprocessor()

    kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)

    km_train=kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    return km_train,km_test,kernel
def preprocessor_normone_modular(fm_train_real=traindat, fm_test_real=testdat, width=1.4, size_cache=10):
    from shogun.Kernel import Chi2Kernel
    from shogun.Features import RealFeatures
    from shogun.Preprocessor import NormOne

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)

    preprocessor=NormOne()
    preprocessor.init(feats_train)
    feats_train.add_preprocessor(preprocessor)
    feats_train.apply_preprocessor()
    feats_test.add_preprocessor(preprocessor)
    feats_test.apply_preprocessor()

    kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)

    km_train=kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    return km_train,km_test,kernel
def preproc_logplusone_modular(fm_train_real=traindat, fm_test_real=testdat, width=1.4, size_cache=10):
    from shogun.Kernel import Chi2Kernel
    from shogun.Features import RealFeatures
    from shogun.PreProc import LogPlusOne

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    preproc = LogPlusOne()
    preproc.init(feats_train)
    feats_train.add_preproc(preproc)
    feats_train.apply_preproc()
    feats_test.add_preproc(preproc)
    feats_test.apply_preproc()

    kernel = Chi2Kernel(feats_train, feats_train, width, size_cache)

    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
from shogun.Features import RealFeatures, LongIntFeatures, ByteFeatures
from numpy import array, float64, int64, uint8, all

# create dense matrices A,B,C
A=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=float64)
B=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=int64)
C=array([[1,2,3],[4,0,0],[0,0,0],[0,5,0],[0,0,6],[9,9,9]], dtype=uint8)

# ... of type Real, LongInt and Byte
a=RealFeatures(A)
b=LongIntFeatures(B)
c=ByteFeatures(C)

# or 16bit wide ...
#feat1 = f.ShortFeatures(N.zeros((10,5),N.short))
#feat2 = f.WordFeatures(N.zeros((10,5),N.uint16))

# print some statistics about a
print a.get_num_vectors()
print a.get_num_features()

# get first feature vector and set it
print a.get_feature_vector(0)
a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0)

# get matrices
a_out = a.get_feature_matrix()
b_out = b.get_feature_matrix()
c_out = c.get_feature_matrix()
def serialization_complex_example(num=5, dist=1, dim=10, C=2.0, width=10):
    import os
    from numpy import concatenate, zeros, ones
    from numpy.random import randn, seed
    from shogun.Features import RealFeatures, Labels
    from shogun.Classifier import GMNPSVM
    from shogun.Kernel import GaussianKernel
    from shogun.IO import SerializableHdf5File,SerializableAsciiFile, \
        SerializableJsonFile,SerializableXmlFile,MSG_DEBUG
    from shogun.Preprocessor import NormOne, LogPlusOne

    seed(17)

    data=concatenate((randn(dim, num), randn(dim, num) + dist,
                      randn(dim, num) + 2*dist, randn(dim, num) + 3*dist), axis=1)
    lab=concatenate((zeros(num), ones(num), 2*ones(num), 3*ones(num)))

    feats=RealFeatures(data)
    #feats.io.set_loglevel(MSG_DEBUG)
    kernel=GaussianKernel(feats, feats, width)

    labels=Labels(lab)

    svm = GMNPSVM(C, kernel, labels)

    feats.add_preprocessor(NormOne())
    feats.add_preprocessor(LogPlusOne())
    feats.set_preprocessed(1)
    svm.train(feats)
    #svm.print_serializable()

    fstream = SerializableHdf5File("blaah.h5", "w")
    status = svm.save_serializable(fstream)
    check_status(status)

    fstream = SerializableAsciiFile("blaah.asc", "w")
    status = svm.save_serializable(fstream)
    check_status(status)

    fstream = SerializableJsonFile("blaah.json", "w")
    status = svm.save_serializable(fstream)
    check_status(status)

    fstream = SerializableXmlFile("blaah.xml", "w")
    status = svm.save_serializable(fstream)
    check_status(status)

    fstream = SerializableHdf5File("blaah.h5", "r")
    new_svm=GMNPSVM()
    status = new_svm.load_serializable(fstream)
    check_status(status)
    new_svm.train()

    fstream = SerializableAsciiFile("blaah.asc", "r")
    new_svm=GMNPSVM()
    status = new_svm.load_serializable(fstream)
    check_status(status)
    new_svm.train()

    fstream = SerializableJsonFile("blaah.json", "r")
    new_svm=GMNPSVM()
    status = new_svm.load_serializable(fstream)
    check_status(status)
    new_svm.train()

    fstream = SerializableXmlFile("blaah.xml", "r")
    new_svm=GMNPSVM()
    status = new_svm.load_serializable(fstream)
    check_status(status)
    new_svm.train()

    os.unlink("blaah.h5")
    os.unlink("blaah.asc")
    os.unlink("blaah.json")
    os.unlink("blaah.xml")

    return svm,new_svm
def create_features(kname, examples, kparam, train_mode, preproc, seq_source, nuc_con):
    """Converts numpy arrays or sequences into shogun features"""

    if kname == 'gauss' or kname == 'linear' or kname == 'poly':
        examples = numpy.array(examples)
        feats = RealFeatures(examples)

    elif kname == 'wd' or kname == 'localalign' or kname == 'localimprove':
        if seq_source == 'dna':
            examples = non_atcg_convert(examples, nuc_con)
            feats = StringCharFeatures(examples, DNA)
        elif seq_source == 'protein':
            examples = non_aminoacid_converter(examples, nuc_con)
            feats = StringCharFeatures(examples, PROTEIN)
        else:
            sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n")
            sys.exit(-1)

    elif kname == 'spec' or kname == 'cumspec':
        if seq_source == 'dna':
            examples = non_atcg_convert(examples, nuc_con)
            feats = StringCharFeatures(examples, DNA)
        elif seq_source == 'protein':
            examples = non_aminoacid_converter(examples, nuc_con)
            feats = StringCharFeatures(examples, PROTEIN)
        else:
            sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n")
            sys.exit(-1)

        wf = StringUlongFeatures(feats.get_alphabet())
        wf.obtain_from_char(feats, kparam['degree']-1, kparam['degree'], 0, kname=='cumspec')
        del feats

        if train_mode:
            preproc = SortUlongString()
            preproc.init(wf)
        wf.add_preproc(preproc)
        ret = wf.apply_preproc()
        #assert(ret)
        feats = wf

    elif kname == 'spec2' or kname == 'cumspec2':
        # spectrum kernel on two sequences
        feats = {}
        feats['combined'] = CombinedFeatures()

        reversed = kname=='cumspec2'

        (ex0,ex1) = zip(*examples)

        f0 = StringCharFeatures(list(ex0), DNA)
        wf = StringWordFeatures(f0.get_alphabet())
        wf.obtain_from_char(f0, kparam['degree']-1, kparam['degree'], 0, reversed)
        del f0

        if train_mode:
            preproc = SortWordString()
            preproc.init(wf)
        wf.add_preprocessor(preproc)
        ret = wf.apply_preprocessors()
        assert(ret)
        feats['combined'].append_feature_obj(wf)
        feats['f0'] = wf

        f1 = StringCharFeatures(list(ex1), DNA)
        wf = StringWordFeatures(f1.get_alphabet())
        wf.obtain_from_char(f1, kparam['degree']-1, kparam['degree'], 0, reversed)
        del f1

        if train_mode:
            preproc = SortWordString()
            preproc.init(wf)
        wf.add_preprocessor(preproc)
        ret = wf.apply_preprocessors()
        assert(ret)
        feats['combined'].append_feature_obj(wf)
        feats['f1'] = wf

    else:
        print 'Unknown kernel %s' % kname
        raise ValueError

    return (feats,preproc)
# parameters, change to get different results
m=250
difference=3

# setting the angle lower makes a harder test
angle=pi/30

# number of samples taken from null and alternative distribution
num_null_samples=500

# use data generator class to produce example data
data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)

# create shogun feature representation
features_x=RealFeatures(array([data[0]]))
features_y=RealFeatures(array([data[1]]))

# use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
# which is k(x,y)=exp(-||x-y||^2 / tau), in contrast to the standard
# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
# Note that kernels per data can be different
kernel_x=GaussianKernel(10,8)
kernel_y=GaussianKernel(10,8)

# create HSIC instance. Note that this is a convenience constructor which copies
# feature data; features_x and features_y are not the ones used inside HSIC.
# This is only for user-friendliness. Usually, it is ok to do this.
# Below, the alternative distribution is sampled, which means
# that new feature objects have to be created in each iteration (slow).
# However, normally the alternative distribution is not sampled.
def statistics_hsic(n, difference, angle):
    from shogun.Features import RealFeatures
    from shogun.Features import DataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import HSIC
    from shogun.Statistics import BOOTSTRAP, HSIC_GAMMA
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Math, Statistics, IntVector

    # init seed for reproducibility
    Math.init_random(1)

    # note that the HSIC has to store kernel matrices
    # which upper bounds the sample size

    # use data generator class to produce example data
    data=DataGenerator.generate_sym_mix_gauss(n,difference,angle)
    #plot(data[0], data[1], 'x');show()

    # create shogun feature representation
    features_x=RealFeatures(array([data[0]]))
    features_y=RealFeatures(array([data[1]]))

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shogun's kernel width is parametrized differently
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    subset=IntVector.randperm_vec(features_x.get_num_vectors())
    subset=subset[0:200]

    features_x.add_subset(subset)
    dist=EuclideanDistance(features_x, features_x)
    distances=dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance=Statistics.matrix_median(distances, True)
    sigma_x=median_distance**2

    features_y.add_subset(subset)
    dist=EuclideanDistance(features_y, features_y)
    distances=dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance=Statistics.matrix_median(distances, True)
    sigma_y=median_distance**2

    #print "median distance for Gaussian kernel on x:", sigma_x
    #print "median distance for Gaussian kernel on y:", sigma_y
    kernel_x=GaussianKernel(10,sigma_x)
    kernel_y=GaussianKernel(10,sigma_y)

    hsic=HSIC(kernel_x,kernel_y,features_x,features_y)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05 using different methods to approximate
    # null-distribution
    statistic=hsic.compute_statistic()
    #print "HSIC:", statistic
    alpha=0.05

    #print "computing p-value using bootstrapping"
    hsic.set_null_approximation_method(BOOTSTRAP)
    # normally, at least 250 iterations should be done, but that takes long
    hsic.set_bootstrap_iterations(100)
    # bootstrapping allows usage of unbiased or biased statistic
    p_value_boot=hsic.compute_p_value(statistic)
    thresh_boot=hsic.compute_threshold(alpha)
    #print "p_value:", p_value_boot
    #print "threshold for 0.05 alpha:", thresh_boot
    #print "p_value <", alpha, ", i.e. test says p and q are dependent:", p_value_boot<alpha

    #print "computing p-value using gamma method"
    hsic.set_null_approximation_method(HSIC_GAMMA)
    p_value_gamma=hsic.compute_p_value(statistic)
    thresh_gamma=hsic.compute_threshold(alpha)
    #print "p_value:", p_value_gamma
    #print "threshold for 0.05 alpha:", thresh_gamma
    #print "p_value <", alpha, ", i.e. test says p and q are dependent:", p_value_gamma<alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance strongly depends on data/kernel
    # bootstrapping, biased statistic
    #print "sampling null distribution using bootstrapping"
    hsic.set_null_approximation_method(BOOTSTRAP)
    hsic.set_bootstrap_iterations(100)
    null_samples=hsic.bootstrap_null()
    #print "null mean:", mean(null_samples)
    #print "null variance:", var(null_samples)
    #hist(null_samples, 100); show()

    return p_value_boot, thresh_boot, p_value_gamma, thresh_gamma, statistic, null_samples
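# The median-distance heuristic used above, as a self-contained numpy sketch
# (no Shogun objects; subsamples at most 200 points like the code above does;
# Shogun's matrix_median may treat the diagonal slightly differently):
import numpy as np
def median_heuristic_width(X, max_points=200):
    idx = np.random.permutation(X.shape[1])[:max_points]
    S = X[:, idx]                                        # X is dims x samples
    sq_dists = ((S[:, :, None] - S[:, None, :]) ** 2).sum(0)
    median_distance = np.median(np.sqrt(sq_dists))
    return median_distance ** 2                          # width for GaussianKernel above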
def features_dense_protocols_modular(in_data=data):
    m_real=array(in_data, dtype=float64, order='F')
    f_real=RealFeatures(m_real)

    print m_real
    print f_real

    print f_real[-1]
    print f_real[1, 2]
    print f_real[-1:3]
    print f_real[2, 0:2]
    print f_real[0:3, 1]
    print f_real[0:3, 1:2]
    print f_real[:,1]
    print f_real[1,:]

    print m_real[-2]
    f_real[-1]=m_real[-2]
    print f_real[-1]

    print m_real[0, 1]
    f_real[1,2]=m_real[0,1]
    print f_real[1, 2]

    print m_real[0:2]
    f_real[1:3]=m_real[0:2]
    print f_real[1:3]

    print m_real[0, 0:2]
    f_real[2, 0:2]=m_real[0,0:2]
    print f_real[2, 0:2]

    print m_real[0:3, 2]
    f_real[0:3,1]=m_real[0:3, 2]
    print f_real[0:3, 1]

    print m_real[0:3, 0:1]
    f_real[0:3,1:2]=m_real[0:3,0:1]
    print f_real[0:3, 1:2]

    f_real[:,0]=0
    print f_real.get_feature_matrix()

    if numpy.__version__ >= '1.5':
        f_real+=m_real
        f_real*=m_real
        f_real-=m_real
    else:
        print "numpy version >= 1.5 is needed"
        return None

    f_real+=f_real
    f_real*=f_real
    f_real-=f_real

    print f_real
    print f_real.get_feature_matrix()

    try:
        mem_real=memoryview(f_real)
    except NameError:
        print "Python2.7 is needed for memoryview class"
        return None

    ret_real=array(f_real)
    print ret_real

    return f_real[:,0]
              np.dot(np.random.randn(N, dim), covs[1]) + np.array([-10, -10]),
              np.dot(np.random.randn(N, dim), covs[2]) + np.array([10, -10])];
    Y = np.hstack((np.zeros(N), np.ones(N), 2*np.ones(N)))
    return X, Y

# Number of classes
M = 3
# Number of samples of each class
N = 50
# Dimension of the data
dim = 2

X, y = gen_data()

labels = MulticlassSOLabels(y)
features = RealFeatures(X.T)

model = MulticlassModel(features, labels)
loss = HingeLoss()
risk = MulticlassRiskFunction()
risk_data = MulticlassRiskData(features, labels, model.get_dim(), features.get_num_vectors())

lambda_ = 1e3
sosvm = DualLibQPBMSOSVM(model, loss, labels, features, lambda_, risk, risk_data)

sosvm.set_cleanAfter(10)  # number of iterations that cutting plane has to be inactive for to be removed
sosvm.set_cleanICP(True)  # enables inactive cutting plane removal feature
sosvm.set_TolRel(0.001)   # set relative tolerance
sosvm.set_verbose(True)   # enables verbosity of the solver
# parameters, change to get different results
m=1000  # set to 10000 for a good test result
dim=2

# setting the difference of the first dimension smaller makes a harder test
difference=1

# number of samples taken from null and alternative distribution
num_null_samples=500

# use data generator class to produce example data
data=DataGenerator.generate_mean_data(m,dim,difference)

# create shogun feature representation
features=RealFeatures(data)

# compute median data distance in order to use for Gaussian kernel width
# 0.5*median_distance normally (factor two in Gaussian kernel)
# However, shogun's kernel width is parametrized differently
# Therefore 0.5*2*median_distance^2
# Use a subset of data for that, only 200 elements. Median is stable
# Using all distances here would blow up memory
subset=Math.randperm_vec(features.get_num_vectors())
subset=subset[0:200]
features.add_subset(subset)
dist=EuclideanDistance(features, features)
distances=dist.get_distance_matrix()
features.remove_subset()
median_distance=Statistics.matrix_median(distances, True)
sigma=median_distance**2
def features_director_dot_modular(fm_train_real, fm_test_real, label_train_twoclass, C, epsilon):
    try:
        from shogun.Features import DirectorDotFeatures
        from shogun.Library import RealVector
    except ImportError:
        print "recompile shogun with --enable-swig-directors"
        return

    class NumpyFeatures(DirectorDotFeatures):
        # variables
        data=numpy.empty((1,1))

        # constructor
        def __init__(self, d):
            DirectorDotFeatures.__init__(self)
            self.data = d

        # overloaded methods
        def add_to_dense_sgvec(self, alpha, vec_idx1, vec2, abs):
            if abs:
                vec2+=alpha*numpy.abs(self.data[:,vec_idx1])
            else:
                vec2+=alpha*self.data[:,vec_idx1]

        def dot(self, vec_idx1, df, vec_idx2):
            return numpy.dot(self.data[:,vec_idx1], df.get_computed_dot_feature_vector(vec_idx2))

        def dense_dot_sgvec(self, vec_idx1, vec2):
            return numpy.dot(self.data[:,vec_idx1], vec2[0:vec2.vlen])

        def get_num_vectors(self):
            return self.data.shape[1]

        def get_dim_feature_space(self):
            return self.data.shape[0]

        # operators
    #   def __add__(self, other):
    #       return NumpyFeatures(self.data+other.data)
    #   def __sub__(self, other):
    #       return NumpyFeatures(self.data-other.data)
    #   def __iadd__(self, other):
    #       return NumpyFeatures(self.data+other.data)
    #   def __isub__(self, other):
    #       return NumpyFeatures(self.data-other.data)

    from shogun.Features import RealFeatures, SparseRealFeatures, BinaryLabels
    from shogun.Classifier import LibLinear, L2R_L2LOSS_SVC_DUAL
    from shogun.Mathematics import Math_init_random
    Math_init_random(17)

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)
    labels=BinaryLabels(label_train_twoclass)

    dfeats_train=NumpyFeatures(fm_train_real)
    dfeats_test=NumpyFeatures(fm_test_real)
    dlabels=BinaryLabels(label_train_twoclass)

    print feats_train.get_computed_dot_feature_matrix()
    print dfeats_train.get_computed_dot_feature_matrix()

    svm=LibLinear(C, feats_train, labels)
    svm.set_liblinear_solver_type(L2R_L2LOSS_SVC_DUAL)
    svm.set_epsilon(epsilon)
    svm.set_bias_enabled(True)
    svm.train()

    svm.set_features(feats_test)
    svm.apply().get_labels()
    predictions = svm.apply()

    dfeats_train.__disown__()
    dfeats_train.parallel.set_num_threads(1)
    dsvm=LibLinear(C, dfeats_train, dlabels)
    dsvm.set_liblinear_solver_type(L2R_L2LOSS_SVC_DUAL)
    dsvm.set_epsilon(epsilon)
    dsvm.set_bias_enabled(True)
    dsvm.train()

    dfeats_test.__disown__()
    dfeats_test.parallel.set_num_threads(1)
    dsvm.set_features(dfeats_test)
    dsvm.apply().get_labels()
    dpredictions = dsvm.apply()

    return predictions, svm, predictions.get_labels()
def statistics_linear_time_mmd():
    from shogun.Features import RealFeatures
    from shogun.Features import DataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import LinearTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math

    # note that the linear time statistic is designed for much larger datasets
    n=10000
    dim=2
    difference=0.5

    # use data generator class to produce example data
    # in practice, this generate-data function could be replaced by a method
    # that obtains data from a stream
    data=DataGenerator.generate_mean_data(n,dim,difference)
    print "dimension means of X", mean(data.T[0:n].T)
    print "dimension means of Y", mean(data.T[n:2*n+1].T)

    # create shogun feature representation
    features=RealFeatures(data)

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shogun's kernel width is parametrized differently
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    # Using all distances here would blow up memory
    subset=Math.randperm_vec(features.get_num_vectors())
    subset=subset[0:200]
    features.add_subset(subset)
    dist=EuclideanDistance(features, features)
    distances=dist.get_distance_matrix()
    features.remove_subset()
    median_distance=Statistics.matrix_median(distances, True)
    sigma=median_distance**2
    print "median distance for Gaussian kernel:", sigma
    kernel=GaussianKernel(10,sigma)

    mmd=LinearTimeMMD(kernel,features, n)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05
    statistic=mmd.compute_statistic()
    print "test statistic:", statistic

    # do the same thing using two different ways to approximate the null
    # distribution: bootstrapping and gaussian approximation (only for really
    # large samples)
    alpha=0.05

    print "computing p-value using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(50)  # normally, far more iterations are needed
    p_value=mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test says p!=q:", p_value<alpha

    print "computing p-value using gaussian approximation"
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    p_value=mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test says p!=q:", p_value<alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance strongly depends on data/kernel
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(10)  # normally, far more iterations are needed
    null_samples=mmd.bootstrap_null()
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)
# parameters, change to get different results
m=100
dim=2

# setting the difference of the first dimension smaller makes a harder test
difference=0.5

# number of samples taken from null and alternative distribution
num_null_samples=500

# use data generator class to produce example data
data=DataGenerator.generate_mean_data(m,dim,difference)

# create shogun feature representation
features=RealFeatures(data)

# use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
# which is k(x,y)=exp(-||x-y||^2 / tau), in contrast to the standard
# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
kernel=GaussianKernel(10,8)

# use biased statistic
mmd=QuadraticTimeMMD(kernel,features, m)
mmd.set_statistic_type(BIASED)

# sample alternative distribution
alt_samples=zeros(num_null_samples)
for i in range(len(alt_samples)):
    data=DataGenerator.generate_mean_data(m,dim,difference)
    features.set_feature_matrix(data)
def hsic_graphical():
    # parameters, change to get different results
    m=250
    difference=3

    # setting the angle lower makes a harder test
    angle=pi/30

    # number of samples taken from null and alternative distribution
    num_null_samples=500

    # use data generator class to produce example data
    data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)

    # create shogun feature representation
    features_x=RealFeatures(array([data[0]]))
    features_y=RealFeatures(array([data[1]]))

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shogun's kernel width is parametrized differently
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    subset=int32(array([x for x in range(features_x.get_num_vectors())]))  # numpy
    subset=random.permutation(subset)  # numpy permutation
    subset=subset[0:200]

    features_x.add_subset(subset)
    dist=EuclideanDistance(features_x, features_x)
    distances=dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance=Statistics.matrix_median(distances, True)
    sigma_x=median_distance**2

    features_y.add_subset(subset)
    dist=EuclideanDistance(features_y, features_y)
    distances=dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance=Statistics.matrix_median(distances, True)
    sigma_y=median_distance**2

    print "median distance for Gaussian kernel on x:", sigma_x
    print "median distance for Gaussian kernel on y:", sigma_y
    kernel_x=GaussianKernel(10,sigma_x)
    kernel_y=GaussianKernel(10,sigma_y)

    # create HSIC instance. Note that this is a convenience constructor which copies
    # feature data; features_x and features_y are not the ones used inside HSIC.
    # This is only for user-friendliness. Usually, it is ok to do this.
    # Below, the alternative distribution is sampled, which means
    # that new feature objects have to be created in each iteration (slow).
    # However, normally the alternative distribution is not sampled.
    hsic=HSIC(kernel_x,kernel_y,features_x,features_y)

    # sample alternative distribution
    alt_samples=zeros(num_null_samples)
    for i in range(len(alt_samples)):
        data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)
        features_x.set_feature_matrix(array([data[0]]))
        features_y.set_feature_matrix(array([data[1]]))

        # re-create hsic instance every time since feature objects are copied due to
        # usage of convenience constructor
        hsic=HSIC(kernel_x,kernel_y,features_x,features_y)
        alt_samples[i]=hsic.compute_statistic()

    # sample from null distribution
    # bootstrapping, biased statistic
    hsic.set_null_approximation_method(BOOTSTRAP)
    hsic.set_bootstrap_iterations(num_null_samples)
    null_samples_boot=hsic.bootstrap_null()

    # fit gamma distribution, biased statistic
    hsic.set_null_approximation_method(HSIC_GAMMA)
    gamma_params=hsic.fit_null_gamma()
    # sample gamma with parameters
    null_samples_gamma=array([gamma(gamma_params[0], gamma_params[1]) for _ in range(num_null_samples)])

    # plot
    figure()

    # plot data x and y
    subplot(2,2,1)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of y-ticks
    grid(True)
    plot(data[0], data[1], 'o')
    title('Data, rotation=$\pi$/'+str(1/angle*pi)+'\nm='+str(m))
    xlabel('$x$')
    ylabel('$y$')

    # compute threshold for test level
    alpha=0.05
    null_samples_boot.sort()
    null_samples_gamma.sort()
    thresh_boot=null_samples_boot[int(floor(len(null_samples_boot)*(1-alpha)))]
    thresh_gamma=null_samples_gamma[int(floor(len(null_samples_gamma)*(1-alpha)))]

    # type I error is the fraction of null samples that exceed the threshold
    type_one_error_boot=sum(null_samples_boot>thresh_boot)/float(num_null_samples)
    type_one_error_gamma=sum(null_samples_gamma>thresh_gamma)/float(num_null_samples)

    # plot alternative distribution with threshold
    subplot(2,2,2)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range=[min([min(null_samples_boot), min(null_samples_gamma)]),
                max([max(null_samples_boot), max(null_samples_gamma)])]

    # plot null distribution with threshold
    subplot(2,2,3)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Bootstrapped Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot))

    # plot null distribution gamma
    subplot(2,2,4)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(null_samples_gamma, 20, range=hist_range, normed=True)
    axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma))

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
# parameters, change to get different results
m=250
difference=3

# setting the angle lower makes a harder test
angle=pi/30

# number of samples taken from null and alternative distribution
num_null_samples=500

# use data generator class to produce example data
data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)

# create shogun feature representation
features_x=RealFeatures(array([data[0]]))
features_y=RealFeatures(array([data[1]]))

# compute median data distance in order to use for Gaussian kernel width
# 0.5*median_distance normally (factor two in Gaussian kernel)
# However, shogun's kernel width is parametrized differently
# Therefore 0.5*2*median_distance^2
# Use a subset of data for that, only 200 elements. Median is stable
subset=Math.randperm_vec(features_x.get_num_vectors())
subset=subset[0:200]
features_x.add_subset(subset)
dist=EuclideanDistance(features_x, features_x)
distances=dist.get_distance_matrix()
features_x.remove_subset()
median_distance=Statistics.matrix_median(distances, True)
sigma_x=median_distance**2
def statistics_quadratic_time_mmd():
    from shogun.Features import RealFeatures
    from shogun.Features import DataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import QuadraticTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math

    # note that the quadratic time mmd has to store kernel matrices
    # which upper bounds the sample size
    n=500
    dim=2
    difference=0.5

    # use data generator class to produce example data
    data=DataGenerator.generate_mean_data(n,dim,difference)
    print "dimension means of X", mean(data.T[0:n].T)
    print "dimension means of Y", mean(data.T[n:2*n+1].T)

    # create shogun feature representation
    features=RealFeatures(data)

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shogun's kernel width is parametrized differently
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    subset=Math.randperm_vec(features.get_num_vectors())
    subset=subset[0:200]
    features.add_subset(subset)
    dist=EuclideanDistance(features, features)
    distances=dist.get_distance_matrix()
    features.remove_subset()
    median_distance=Statistics.matrix_median(distances, True)
    sigma=median_distance**2
    print "median distance for Gaussian kernel:", sigma
    kernel=GaussianKernel(10,sigma)

    mmd=QuadraticTimeMMD(kernel,features, n)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05 using different methods to approximate
    # null-distribution
    statistic=mmd.compute_statistic()
    alpha=0.05

    print "computing p-value using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    # normally, at least 250 iterations should be done, but that takes long
    mmd.set_bootstrap_iterations(10)
    # bootstrapping allows usage of unbiased or biased statistic
    mmd.set_statistic_type(UNBIASED)
    p_value=mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test says p!=q:", p_value<alpha

    # only can do this if SHOGUN was compiled with LAPACK, so check
    if "sample_null_spectrum" in dir(QuadraticTimeMMD):
        print "computing p-value using spectrum method"
        mmd.set_null_approximation_method(MMD2_SPECTRUM)
        # normally, at least 250 iterations should be done, but that takes long
        mmd.set_num_samples_sepctrum(50)
        mmd.set_num_eigenvalues_spectrum(n-10)
        # spectrum method computes p-value for biased statistics only
        mmd.set_statistic_type(BIASED)
        p_value=mmd.compute_p_value(statistic)
        print "p_value:", p_value
        print "p_value <", alpha, ", i.e. test says p!=q:", p_value<alpha

    print "computing p-value using gamma method"
    mmd.set_null_approximation_method(MMD2_GAMMA)
    # gamma method computes p-value for biased statistics only
    mmd.set_statistic_type(BIASED)
    p_value=mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test says p!=q:", p_value<alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance strongly depends on data/kernel
    # bootstrapping, biased statistic
    print "sampling null distribution using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_statistic_type(BIASED)
    mmd.set_bootstrap_iterations(10)
    null_samples=mmd.bootstrap_null()
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance strongly depends on data/kernel
    # spectrum, biased statistic
    print "sampling null distribution using spectrum method"
    mmd.set_null_approximation_method(MMD2_SPECTRUM)
    mmd.set_statistic_type(BIASED)
    # 50 samples using 10 eigenvalues
    null_samples=mmd.sample_null_spectrum(50,10)
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)