def evaluation_cross_validation_classification(traindat=traindat, label_traindat=label_traindat):
    from shogun.Evaluation import CrossValidation, CrossValidationResult
    from shogun.Evaluation import CrossValidationPrintOutput
    from shogun.Evaluation import CrossValidationMKLStorage
    from shogun.Evaluation import ContingencyTableEvaluation, ACCURACY
    from shogun.Evaluation import StratifiedCrossValidationSplitting
    from shogun.Features import BinaryLabels
    from shogun.Features import RealFeatures, CombinedFeatures
    from shogun.Kernel import GaussianKernel, CombinedKernel
    from shogun.Classifier import LibSVM, MKLClassification
    from shogun.Mathematics import Statistics

    # training data; the combined features all use the same underlying data
    features = RealFeatures(traindat)
    comb_features = CombinedFeatures()
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    labels = BinaryLabels(label_traindat)

    # kernel: combination of Gaussians with different widths
    kernel = CombinedKernel()
    kernel.append_kernel(GaussianKernel(10, 0.1))
    kernel.append_kernel(GaussianKernel(10, 1))
    kernel.append_kernel(GaussianKernel(10, 2))

    # create MKL using LibSVM; due to a memory bug, interleaved optimization
    # is not possible here
    svm = MKLClassification(LibSVM())
    svm.set_interleaved_optimization_enabled(False)
    svm.set_kernel(kernel)

    # splitting strategy for 5-fold cross-validation (for classification it's
    # better to use stratified splitting as done here; the plain
    # "CrossValidationSplitting" is also available)
    splitting_strategy = StratifiedCrossValidationSplitting(labels, 5)

    # evaluation method
    evaluation_criterium = ContingencyTableEvaluation(ACCURACY)

    # cross-validation instance
    cross_validation = CrossValidation(svm, comb_features, labels,
                                       splitting_strategy, evaluation_criterium)
    cross_validation.set_autolock(False)

    # append cross-validation output classes
    cross_validation.add_cross_validation_output(CrossValidationPrintOutput())
    mkl_storage = CrossValidationMKLStorage()
    cross_validation.add_cross_validation_output(mkl_storage)
    cross_validation.set_num_runs(3)

    # perform cross-validation
    result = cross_validation.evaluate()

    # print mkl weights
    weights = mkl_storage.get_mkl_weights()
    print "mkl weights during cross-validation"
    print weights
    print "mean per kernel"
    print Statistics.matrix_mean(weights, False)
    print "variance per kernel"
    print Statistics.matrix_variance(weights, False)
    print "std-dev per kernel"
    print Statistics.matrix_std_deviation(weights, False)
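# A minimal plain-numpy sketch of the per-kernel statistics printed above,
# under the assumption that the stored weight matrix has one row per kernel
# and one column per fold/run (the orientation of shogun's storage is an
# assumption here; the values are hypothetical).
import numpy

example_weights = numpy.array([[0.2, 0.3, 0.25],
                               [0.5, 0.4, 0.45],
                               [0.3, 0.3, 0.30]])  # hypothetical 3 kernels x 3 folds
print "mean per kernel", numpy.mean(example_weights, axis=1)
print "variance per kernel", numpy.var(example_weights, axis=1)
print "std-dev per kernel", numpy.std(example_weights, axis=1)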
def statistics_hsic():
    from numpy import array, mean, var, pi
    from shogun.Features import RealFeatures
    from shogun.Features import DataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import HSIC
    from shogun.Statistics import BOOTSTRAP, HSIC_GAMMA
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, IntVector

    # note that HSIC has to store kernel matrices,
    # which upper-bounds the feasible sample size
    n = 250
    difference = 3
    angle = pi / 3

    # use data generator class to produce example data
    data = DataGenerator.generate_sym_mix_gauss(n, difference, angle)
    #plot(data[0], data[1], 'x');show()

    # create shogun feature representation
    features_x = RealFeatures(array([data[0]]))
    features_y = RealFeatures(array([data[1]]))

    # compute median data distance in order to use it for the Gaussian kernel
    # width: 0.5*median_distance normally (factor two in Gaussian kernel).
    # However, shogun's kernel width uses a different parametrization,
    # therefore use 0.5*2*median_distance^2.
    # Use a subset of the data for that, only 200 elements; the median is stable.
    subset = IntVector.randperm_vec(features_x.get_num_vectors())
    subset = subset[0:200]
    features_x.add_subset(subset)
    dist = EuclideanDistance(features_x, features_x)
    distances = dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma_x = median_distance**2
    features_y.add_subset(subset)
    dist = EuclideanDistance(features_y, features_y)
    distances = dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma_y = median_distance**2
    print "median distance for Gaussian kernel on x:", sigma_x
    print "median distance for Gaussian kernel on y:", sigma_y
    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # perform test: compute p-value and test if the null hypothesis is
    # rejected for a test level of 0.05, using different methods to
    # approximate the null distribution
    statistic = hsic.compute_statistic()
    print "HSIC:", statistic
    alpha = 0.05

    print "computing p-value using bootstrapping"
    hsic.set_null_approximation_method(BOOTSTRAP)
    # normally, at least 250 iterations should be done, but that takes long
    hsic.set_bootstrap_iterations(100)
    # bootstrapping allows usage of the unbiased or the biased statistic
    p_value = hsic.compute_p_value(statistic)
    thresh = hsic.compute_threshold(alpha)
    print "p_value:", p_value
    print "threshold for 0.05 alpha:", thresh
    print "p_value <", alpha, ", i.e. test says p and q are dependent:", p_value < alpha

    print "computing p-value using gamma method"
    hsic.set_null_approximation_method(HSIC_GAMMA)
    p_value = hsic.compute_p_value(statistic)
    thresh = hsic.compute_threshold(alpha)
    print "p_value:", p_value
    print "threshold for 0.05 alpha:", thresh
    print "p_value <", alpha, ", i.e. test says p and q are dependent:", p_value < alpha

    # sample from the null distribution (these may be plotted or otherwise used);
    # the mean should be close to zero, the variance strongly depends on data/kernel
    # bootstrapping, biased statistic
    print "sampling null distribution using bootstrapping"
    hsic.set_null_approximation_method(BOOTSTRAP)
    hsic.set_bootstrap_iterations(100)
    null_samples = hsic.bootstrap_null()
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)
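# A hedged numpy sketch of the biased HSIC estimate that compute_statistic()
# above is conceptually based on: with kernel matrices K on x and L on y,
# HSIC_b = trace(K H L H) / n^2, where H is the centering matrix. Shogun may
# apply a different scaling, so treat this as an illustration only.
import numpy

def biased_hsic(K, L):
    n = K.shape[0]
    H = numpy.eye(n) - numpy.ones((n, n)) / n  # centering matrix
    return numpy.trace(K.dot(H).dot(L).dot(H)) / n ** 2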
def statistics_linear_time_mmd():
    from numpy import mean, var
    from shogun.Features import RealFeatures
    from shogun.Features import DataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import LinearTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math

    # note that the linear time statistic is designed for much larger datasets
    n = 10000
    dim = 2
    difference = 0.5

    # use data generator class to produce example data;
    # in practice, this generate-data call could be replaced by a method
    # that obtains data from a stream
    data = DataGenerator.generate_mean_data(n, dim, difference)

    print "dimension means of X", mean(data.T[0:n].T)
    print "dimension means of Y", mean(data.T[n:2*n+1].T)

    # create shogun feature representation
    features = RealFeatures(data)

    # compute median data distance in order to use it for the Gaussian kernel
    # width: 0.5*median_distance normally (factor two in Gaussian kernel).
    # However, shogun's kernel width uses a different parametrization,
    # therefore use 0.5*2*median_distance^2.
    # Use a subset of the data for that, only 200 elements; the median is stable.
    # Using all pairwise distances here would blow up memory.
    subset = Math.randperm_vec(features.get_num_vectors())
    subset = subset[0:200]
    features.add_subset(subset)
    dist = EuclideanDistance(features, features)
    distances = dist.get_distance_matrix()
    features.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma = median_distance**2
    print "median distance for Gaussian kernel:", sigma
    kernel = GaussianKernel(10, sigma)

    mmd = LinearTimeMMD(kernel, features, n)

    # perform test: compute p-value and test if the null hypothesis is
    # rejected for a test level of 0.05
    statistic = mmd.compute_statistic()
    print "test statistic:", statistic

    # do the same thing using two different ways to approximate the null
    # distribution: bootstrapping and Gaussian approximation (the latter only
    # for really large samples)
    alpha = 0.05

    print "computing p-value using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(50)  # normally, far more iterations are needed
    p_value = mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test says p!=q:", p_value < alpha

    print "computing p-value using gaussian approximation"
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    p_value = mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test says p!=q:", p_value < alpha

    # sample from the null distribution (these may be plotted or otherwise used);
    # the mean should be close to zero, the variance strongly depends on data/kernel
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(10)  # normally, far more iterations are needed
    null_samples = mmd.bootstrap_null()
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)
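# A hedged numpy sketch of the linear-time MMD statistic that the example's
# compute_statistic() call implements conceptually (following Gretton et al.;
# shogun's exact scaling may differ). x and y are (num, dim) arrays and the
# Gaussian kernel helper is illustrative.
import numpy

def gaussian_k(a, b, sigma=1.0):
    return numpy.exp(-numpy.sum((a - b) ** 2, axis=1) / (2 * sigma ** 2))

def linear_time_mmd(x, y, k=gaussian_k):
    num = x.shape[0] - x.shape[0] % 2  # use an even number of samples
    h = (k(x[0:num:2], x[1:num:2]) + k(y[0:num:2], y[1:num:2])
         - k(x[0:num:2], y[1:num:2]) - k(x[1:num:2], y[0:num:2]))
    return numpy.mean(h)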
def statistics_quadratic_time_mmd():
    from numpy import mean, var
    from shogun.Features import RealFeatures
    from shogun.Features import MeanShiftRealDataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import QuadraticTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, IntVector

    # note that the quadratic-time mmd has to store kernel matrices,
    # which upper-bounds the feasible sample size
    n = 500
    dim = 2
    difference = 0.5

    # streaming data generators for mean-shift distributions
    gen_p = MeanShiftRealDataGenerator(0, dim)
    gen_q = MeanShiftRealDataGenerator(difference, dim)

    # stream examples and merge them in order to compute the median on the
    # joint sample; an alternative is to call a different constructor of
    # QuadraticTimeMMD
    features = gen_p.get_streamed_features(n)
    features = features.create_merged_copy(gen_q.get_streamed_features(n))

    # use the merged feature matrix as example data
    data = features.get_feature_matrix()

    print "dimension means of X", mean(data.T[0:n].T)
    print "dimension means of Y", mean(data.T[n:2*n+1].T)

    # compute median data distance in order to use it for the Gaussian kernel
    # width: 0.5*median_distance normally (factor two in Gaussian kernel).
    # However, shogun's kernel width uses a different parametrization,
    # therefore use 0.5*2*median_distance^2.
    # Use a subset of the data for that, only 200 elements; the median is stable.
    # Use a permutation subset to temporarily mix the merged examples.
    subset = IntVector.randperm_vec(features.get_num_vectors())
    subset = subset[0:200]
    features.add_subset(subset)
    dist = EuclideanDistance(features, features)
    distances = dist.get_distance_matrix()
    features.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma = median_distance**2
    print "median distance for Gaussian kernel:", sigma
    kernel = GaussianKernel(10, sigma)

    mmd = QuadraticTimeMMD(kernel, features, n)

    # perform test: compute p-value and test if the null hypothesis is
    # rejected for a test level of 0.05, using different methods to
    # approximate the null distribution
    statistic = mmd.compute_statistic()
    alpha = 0.05

    print "computing p-value using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    # normally, at least 250 iterations should be done, but that takes long
    mmd.set_bootstrap_iterations(10)
    # bootstrapping allows usage of the unbiased or the biased statistic
    mmd.set_statistic_type(UNBIASED)
    p_value = mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test says p!=q:", p_value < alpha

    # this is only possible if SHOGUN was compiled with LAPACK, so check
    if "sample_null_spectrum" in dir(QuadraticTimeMMD):
        print "computing p-value using spectrum method"
        mmd.set_null_approximation_method(MMD2_SPECTRUM)
        # normally, at least 250 samples should be drawn, but that takes long
        mmd.set_num_samples_sepctrum(50)
        mmd.set_num_eigenvalues_spectrum(n - 10)
        # the spectrum method computes p-values for the biased statistic only
        mmd.set_statistic_type(BIASED)
        p_value = mmd.compute_p_value(statistic)
        print "p_value:", p_value
        print "p_value <", alpha, ", i.e. test says p!=q:", p_value < alpha

    print "computing p-value using gamma method"
    mmd.set_null_approximation_method(MMD2_GAMMA)
    # the gamma method computes p-values for the biased statistic only
    mmd.set_statistic_type(BIASED)
    p_value = mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test says p!=q:", p_value < alpha

    # sample from the null distribution (these may be plotted or otherwise used);
    # the mean should be close to zero, the variance strongly depends on data/kernel
    # bootstrapping, biased statistic
    print "sampling null distribution using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_statistic_type(BIASED)
    mmd.set_bootstrap_iterations(10)
    null_samples = mmd.bootstrap_null()
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)

    # spectrum method, biased statistic
    print "sampling null distribution using spectrum method"
    mmd.set_null_approximation_method(MMD2_SPECTRUM)
    mmd.set_statistic_type(BIASED)
    # 50 samples using 10 eigenvalues
    null_samples = mmd.sample_null_spectrum(50, 10)
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)
def statistics_linear_time_mmd(n, dim, difference):
    from shogun.Features import RealFeatures
    from shogun.Features import MeanShiftDataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import LinearTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math

    # init seed for reproducibility
    Math.init_random(1)

    # note that the linear time statistic is designed for much larger
    # datasets, so increase n to get reasonable results

    # streaming data generators for mean-shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # compute median data distance in order to use it for the Gaussian kernel
    # width: 0.5*median_distance normally (factor two in Gaussian kernel).
    # However, shogun's kernel width uses a different parametrization,
    # therefore use 0.5*2*median_distance^2.
    # Use a subset of the data for that, only 200 elements; the median is stable.
    # Stream examples and merge them in order to compute the median on the joint sample.
    features = gen_p.get_streamed_features(100)
    features = features.create_merged_copy(gen_q.get_streamed_features(100))

    # compute all pairwise distances
    dist = EuclideanDistance(features, features)
    distances = dist.get_distance_matrix()

    # compute median and determine kernel width (using shogun)
    median_distance = Statistics.matrix_median(distances, True)
    sigma = median_distance**2
    #print "median distance for Gaussian kernel:", sigma
    kernel = GaussianKernel(10, sigma)

    # mmd instance using streaming features, blocksize of 10000
    mmd = LinearTimeMMD(kernel, gen_p, gen_q, n, 10000)

    # perform test: compute p-value and test if the null hypothesis is
    # rejected for a test level of 0.05
    statistic = mmd.compute_statistic()
    #print "test statistic:", statistic

    # do the same thing using two different ways to approximate the null
    # distribution: bootstrapping and Gaussian approximation (the latter only
    # for really large samples)
    alpha = 0.05

    #print "computing p-value using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(50)  # normally, far more iterations are needed
    p_value_boot = mmd.compute_p_value(statistic)
    #print "p_value_boot:", p_value_boot
    #print "p_value_boot <", alpha, ", i.e. test says p!=q:", p_value_boot<alpha

    #print "computing p-value using gaussian approximation"
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    p_value_gaussian = mmd.compute_p_value(statistic)
    #print "p_value_gaussian:", p_value_gaussian
    #print "p_value_gaussian <", alpha, ", i.e. test says p!=q:", p_value_gaussian<alpha

    # sample from the null distribution (these may be plotted or otherwise used);
    # the mean should be close to zero, the variance strongly depends on data/kernel
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(10)  # normally, far more iterations are needed
    null_samples = mmd.bootstrap_null()
    #print "null mean:", mean(null_samples)
    #print "null variance:", var(null_samples)

    # compute type I and type II errors for the Gaussian approximation;
    # the number of trials should be larger to compute tight confidence bounds
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    num_trials = 5
    alpha = 0.05  # test level
    typeIerrors = [0 for x in range(num_trials)]
    typeIIerrors = [0 for x in range(num_trials)]
    for i in range(num_trials):
        # this effectively means that p=q, so rejecting is a type I error
        mmd.set_simulate_h0(True)
        typeIerrors[i] = mmd.perform_test() > alpha
        mmd.set_simulate_h0(False)
        typeIIerrors[i] = mmd.perform_test() > alpha
    #print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)

    return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors
def mathematics_logdet(matrix=mtx, max_iter_eig=1000, max_iter_lin=1000, num_samples=1):
    from scipy.sparse import eye

    # create a Hermitian (symmetric positive definite) sparse matrix
    rows = matrix.shape[0]
    cols = matrix.shape[1]
    A = matrix.transpose() * matrix + eye(rows, cols)

    try:
        from shogun.Mathematics import RealSparseMatrixOperator
        from shogun.Mathematics import LanczosEigenSolver
        from shogun.Mathematics import CGMShiftedFamilySolver
        from shogun.Mathematics import LogRationalApproximationCGM
        from shogun.Mathematics import ProbingSampler
        from shogun.Mathematics import LogDetEstimator
        from shogun.Mathematics import Statistics
        from shogun.Library import SerialComputationEngine

        # create the linear operator and the eigen-solver
        op = RealSparseMatrixOperator(A.tocsc())
        eig_solver = LanczosEigenSolver(op)
        # we can set the iteration limit high for poorly conditioned matrices
        eig_solver.set_max_iteration_limit(max_iter_eig)

        # alternatively, if the matrix is small, we can compute eigenvalues
        # externally and set min/max eigenvalues in the eigen-solver:
        # from scipy.sparse.linalg import eigsh
        # eigenvalues = eigsh(A, rows-1)
        # eig_solver.set_min_eigenvalue(eigenvalues[0][0])
        # eig_solver.set_max_eigenvalue(eigenvalues[0][-1])

        # create the shifted-family linear solver, which solves for all the
        # shifts using as many matrix-vector products as one shift in CG iterations
        lin_solver = CGMShiftedFamilySolver()
        lin_solver.set_iteration_limit(max_iter_lin)

        # computation engine
        engine = SerialComputationEngine()

        # set the desired accuracy tighter to obtain better results;
        # this determines the number of contour points in the conformal mapping
        # of the rational approximation of the Cauchy integral of f(A)*s, f=log
        desired_accuracy = 1e-5

        # create the log-linear-operator function
        op_func = LogRationalApproximationCGM(op, engine, eig_solver, lin_solver,
                                              desired_accuracy)

        # set the trace sampler to a probing sampler, in which samples are
        # obtained by greedy graph coloring of a power of the sparse matrix
        # (default is power=1, 2-distance coloring)
        trace_sampler = ProbingSampler(op)

        # estimate the log-determinant, drawing as many samples as required
        log_det_estimator = LogDetEstimator(trace_sampler, op_func, engine)
        estimates = log_det_estimator.sample(num_samples)

        estimated_logdet = sum(estimates) / len(estimates)
        actual_logdet = Statistics.log_det(A)

        print actual_logdet, estimated_logdet

        return estimates

    except ImportError:
        print "One or more of the dependencies (Eigen3/LAPACK/ColPack) not found!"
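# A hedged sanity check (an assumption, not part of the example above): for
# matrices small enough to densify, numpy's exact log-determinant can be
# compared against the stochastic estimate.
import numpy

def dense_logdet(A_sparse):
    sign, logdet = numpy.linalg.slogdet(numpy.asarray(A_sparse.todense()))
    assert sign > 0, "matrix must be positive definite"
    return logdet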
def statistics_linear_time_mmd():
    from numpy import mean, var
    from shogun.Features import RealFeatures
    from shogun.Features import MeanShiftRealDataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import LinearTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math

    # note that the linear time statistic is designed for much larger datasets
    n = 10000
    dim = 2
    difference = 0.5

    # streaming data generators for mean-shift distributions
    gen_p = MeanShiftRealDataGenerator(0, dim)
    gen_q = MeanShiftRealDataGenerator(difference, dim)

    # compute median data distance in order to use it for the Gaussian kernel
    # width: 0.5*median_distance normally (factor two in Gaussian kernel).
    # However, shogun's kernel width uses a different parametrization,
    # therefore use 0.5*2*median_distance^2.
    # Use a subset of the data for that, only 200 elements; the median is stable.
    # Stream examples and merge them in order to compute the median on the joint sample.
    features = gen_p.get_streamed_features(100)
    features = features.create_merged_copy(gen_q.get_streamed_features(100))

    # compute all pairwise distances
    dist = EuclideanDistance(features, features)
    distances = dist.get_distance_matrix()

    # compute median and determine kernel width (using shogun)
    median_distance = Statistics.matrix_median(distances, True)
    sigma = median_distance**2
    print "median distance for Gaussian kernel:", sigma
    kernel = GaussianKernel(10, sigma)

    # mmd instance using streaming features, blocksize of 10000
    mmd = LinearTimeMMD(kernel, gen_p, gen_q, n, 10000)

    # perform test: compute p-value and test if the null hypothesis is
    # rejected for a test level of 0.05
    statistic = mmd.compute_statistic()
    print "test statistic:", statistic

    # do the same thing using two different ways to approximate the null
    # distribution: bootstrapping and Gaussian approximation (the latter only
    # for really large samples)
    alpha = 0.05

    print "computing p-value using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(50)  # normally, far more iterations are needed
    p_value = mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test says p!=q:", p_value < alpha

    print "computing p-value using gaussian approximation"
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    p_value = mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test says p!=q:", p_value < alpha

    # sample from the null distribution (these may be plotted or otherwise used);
    # the mean should be close to zero, the variance strongly depends on data/kernel
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(10)  # normally, far more iterations are needed
    null_samples = mmd.bootstrap_null()
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)
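# A plain-numpy version of the median-distance heuristic that these examples
# use, assuming `data` is a (dim, n) array laid out like shogun's feature
# matrix. As the comments above note, shogun's GaussianKernel expects
# width = 2*sigma^2, hence the squared median distance is returned.
import numpy

def median_heuristic_width(data, subset_size=200):
    n = data.shape[1]
    idx = numpy.random.permutation(n)[0:min(subset_size, n)]
    sub = data[:, idx]
    sq_norms = numpy.sum(sub ** 2, 0)
    d2 = sq_norms[:, None] + sq_norms[None, :] - 2 * sub.T.dot(sub)
    distances = numpy.sqrt(numpy.maximum(d2, 0))
    return numpy.median(distances) ** 2  # 0.5*2*median_distance^2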
from numpy import *
from pylab import *
from shogun.Features import RealFeatures
from shogun.Features import DataGenerator
from shogun.Kernel import GaussianKernel
from shogun.Statistics import HSIC
from shogun.Statistics import BOOTSTRAP, HSIC_GAMMA
from shogun.Distance import EuclideanDistance
from shogun.Mathematics import Statistics

def hsic_graphical():
    # parameters, change to get different results
    m = 250
    difference = 3
    # setting the angle lower makes a harder test
    angle = pi / 30
    # number of samples taken from null and alternative distribution
    num_null_samples = 500

    # use data generator class to produce example data
    data = DataGenerator.generate_sym_mix_gauss(m, difference, angle)

    # create shogun feature representation
    features_x = RealFeatures(array([data[0]]))
    features_y = RealFeatures(array([data[1]]))

    # compute median data distance in order to use it for the Gaussian kernel
    # width: 0.5*median_distance normally (factor two in Gaussian kernel).
    # However, shogun's kernel width uses a different parametrization,
    # therefore use 0.5*2*median_distance^2.
    # Use a subset of the data for that, only 200 elements; the median is stable.
    subset = int32(array([x for x in range(features_x.get_num_vectors())]))  # numpy
    subset = random.permutation(subset)  # numpy permutation
    subset = subset[0:200]
    features_x.add_subset(subset)
    dist = EuclideanDistance(features_x, features_x)
    distances = dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma_x = median_distance**2
    features_y.add_subset(subset)
    dist = EuclideanDistance(features_y, features_y)
    distances = dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma_y = median_distance**2
    print "median distance for Gaussian kernel on x:", sigma_x
    print "median distance for Gaussian kernel on y:", sigma_y
    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    # create hsic instance. Note that this is a convenience constructor which
    # copies feature data: features_x and features_y are not the objects used
    # inside hsic. This is only for user-friendliness, and usually it is fine.
    # Below, however, the alternative distribution is sampled, which means that
    # new feature objects have to be created in each iteration (slow);
    # normally, the alternative distribution is not sampled.
    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # sample alternative distribution
    alt_samples = zeros(num_null_samples)
    for i in range(len(alt_samples)):
        data = DataGenerator.generate_sym_mix_gauss(m, difference, angle)
        features_x.set_feature_matrix(array([data[0]]))
        features_y.set_feature_matrix(array([data[1]]))

        # re-create the hsic instance every time, since the feature objects
        # are copied due to the usage of the convenience constructor
        hsic = HSIC(kernel_x, kernel_y, features_x, features_y)
        alt_samples[i] = hsic.compute_statistic()

    # sample from null distribution
    # bootstrapping, biased statistic
    hsic.set_null_approximation_method(BOOTSTRAP)
    hsic.set_bootstrap_iterations(num_null_samples)
    null_samples_boot = hsic.bootstrap_null()

    # fit gamma distribution, biased statistic
    hsic.set_null_approximation_method(HSIC_GAMMA)
    gamma_params = hsic.fit_null_gamma()
    # sample gamma with the fitted parameters
    null_samples_gamma = array([gamma(gamma_params[0], gamma_params[1])
                                for _ in range(num_null_samples)])

    # plot
    figure()

    # plot data x and y
    subplot(2, 2, 1)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of y-ticks
    grid(True)
    plot(data[0], data[1], 'o')
    title('Data, rotation=$\pi$/' + str(1 / angle * pi) + '\nm=' + str(m))
    xlabel('$x$')
    ylabel('$y$')

    # compute thresholds for the test level
    alpha = 0.05
    null_samples_boot.sort()
    null_samples_gamma.sort()
    thresh_boot = null_samples_boot[int(floor(len(null_samples_boot) * (1 - alpha)))]
    thresh_gamma = null_samples_gamma[int(floor(len(null_samples_gamma) * (1 - alpha)))]

    type_one_error_boot = sum(null_samples_boot < thresh_boot) / float(num_null_samples)
    type_one_error_gamma = sum(null_samples_gamma < thresh_gamma) / float(num_null_samples)

    # plot alternative distribution with threshold
    subplot(2, 2, 2)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range = [min([min(null_samples_boot), min(null_samples_gamma)]),
                  max([max(null_samples_boot), max(null_samples_gamma)])]

    # plot bootstrapped null distribution with threshold
    subplot(2, 2, 3)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Bootstrapped Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot))

    # plot gamma null distribution with threshold
    subplot(2, 2, 4)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(null_samples_gamma, 20, range=hist_range, normed=True)
    axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma))
    grid(True)

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
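# A hypothetical driver for the plotting function above (the show() call is an
# assumption; hsic_graphical() itself only builds the figure).
if __name__ == '__main__':
    hsic_graphical()
    show()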
def statistics_hsic(n, difference, angle):
    from numpy import array
    from shogun.Features import RealFeatures
    from shogun.Features import DataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import HSIC
    from shogun.Statistics import BOOTSTRAP, HSIC_GAMMA
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Math, Statistics, IntVector

    # init seed for reproducibility
    Math.init_random(1)

    # note that HSIC has to store kernel matrices,
    # which upper-bounds the feasible sample size

    # use data generator class to produce example data
    data = DataGenerator.generate_sym_mix_gauss(n, difference, angle)
    #plot(data[0], data[1], 'x');show()

    # create shogun feature representation
    features_x = RealFeatures(array([data[0]]))
    features_y = RealFeatures(array([data[1]]))

    # compute median data distance in order to use it for the Gaussian kernel
    # width: 0.5*median_distance normally (factor two in Gaussian kernel).
    # However, shogun's kernel width uses a different parametrization,
    # therefore use 0.5*2*median_distance^2.
    # Use a subset of the data for that, only 200 elements; the median is stable.
    subset = IntVector.randperm_vec(features_x.get_num_vectors())
    subset = subset[0:200]
    features_x.add_subset(subset)
    dist = EuclideanDistance(features_x, features_x)
    distances = dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma_x = median_distance**2
    features_y.add_subset(subset)
    dist = EuclideanDistance(features_y, features_y)
    distances = dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma_y = median_distance**2
    #print "median distance for Gaussian kernel on x:", sigma_x
    #print "median distance for Gaussian kernel on y:", sigma_y
    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # perform test: compute p-value and test if the null hypothesis is
    # rejected for a test level of 0.05, using different methods to
    # approximate the null distribution
    statistic = hsic.compute_statistic()
    #print "HSIC:", statistic
    alpha = 0.05

    #print "computing p-value using bootstrapping"
    hsic.set_null_approximation_method(BOOTSTRAP)
    # normally, at least 250 iterations should be done, but that takes long
    hsic.set_bootstrap_iterations(100)
    # bootstrapping allows usage of the unbiased or the biased statistic
    p_value_boot = hsic.compute_p_value(statistic)
    thresh_boot = hsic.compute_threshold(alpha)
    #print "p_value:", p_value_boot
    #print "threshold for 0.05 alpha:", thresh_boot
    #print "p_value <", alpha, ", i.e. test says p and q are dependent:", p_value_boot<alpha

    #print "computing p-value using gamma method"
    hsic.set_null_approximation_method(HSIC_GAMMA)
    p_value_gamma = hsic.compute_p_value(statistic)
    thresh_gamma = hsic.compute_threshold(alpha)
    #print "p_value:", p_value_gamma
    #print "threshold for 0.05 alpha:", thresh_gamma
    #print "p_value <", alpha, ", i.e. test says p and q are dependent:", p_value_gamma<alpha

    # sample from the null distribution (these may be plotted or otherwise used);
    # the mean should be close to zero, the variance strongly depends on data/kernel
    # bootstrapping, biased statistic
    #print "sampling null distribution using bootstrapping"
    hsic.set_null_approximation_method(BOOTSTRAP)
    hsic.set_bootstrap_iterations(100)
    null_samples = hsic.bootstrap_null()
    #print "null mean:", mean(null_samples)
    #print "null variance:", var(null_samples)
    #hist(null_samples, 100); show()

    return p_value_boot, thresh_boot, p_value_gamma, thresh_gamma, statistic, null_samples
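# A hypothetical invocation of the function above; with angle=pi/4 the two
# variables are clearly dependent, so both p-values should come out small.
if __name__ == '__main__':
    from numpy import pi
    p_boot, t_boot, p_gamma, t_gamma, statistic, null_samples = statistics_hsic(250, 3, pi / 4)
    print "bootstrap p-value:", p_boot, "gamma p-value:", p_gamma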
# create shogun feature representation
features = RealFeatures(data)

# compute median data distance in order to use it for the Gaussian kernel
# width: 0.5*median_distance normally (factor two in Gaussian kernel).
# However, shogun's kernel width uses a different parametrization,
# therefore use 0.5*2*median_distance^2.
# Use a subset of the data for that, only 200 elements; the median is stable.
# Using all pairwise distances here would blow up memory.
subset = Math.randperm_vec(features.get_num_vectors())
subset = subset[0:200]
features.add_subset(subset)
dist = EuclideanDistance(features, features)
distances = dist.get_distance_matrix()
features.remove_subset()
median_distance = Statistics.matrix_median(distances, True)
sigma = median_distance**2
print "median distance for Gaussian kernel:", sigma
kernel = GaussianKernel(10, sigma)

# use biased statistic
mmd = LinearTimeMMD(kernel, features, m)

# sample alternative distribution
alt_samples = zeros(num_null_samples)
for i in range(len(alt_samples)):
    data = DataGenerator.generate_mean_data(m, dim, difference)
    features.set_feature_matrix(data)
    alt_samples[i] = mmd.compute_statistic()

# sample from null distribution
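# A hedged sketch of what typically follows this fragment: estimate test power
# as the fraction of alternative-statistic samples exceeding a threshold taken
# from sorted null samples (all names here are illustrative).
import numpy

def estimate_power(alt_samples, null_samples, alpha=0.05):
    null_sorted = numpy.sort(null_samples)
    thresh = null_sorted[int(numpy.floor(len(null_sorted) * (1 - alpha)))]
    return numpy.mean(alt_samples > thresh)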
def statistics_quadratic_time_mmd():
    from numpy import mean, var
    from shogun.Features import RealFeatures
    from shogun.Features import MeanShiftDataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import QuadraticTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, IntVector

    # note that the quadratic-time mmd has to store kernel matrices,
    # which upper-bounds the feasible sample size
    n = 100
    dim = 2
    difference = 0.5

    # streaming data generators for mean-shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # stream examples and merge them in order to compute the median on the
    # joint sample; an alternative is to call a different constructor of
    # QuadraticTimeMMD
    features = gen_p.get_streamed_features(n)
    features = features.create_merged_copy(gen_q.get_streamed_features(n))

    # use the merged feature matrix as example data
    data = features.get_feature_matrix()

    print "dimension means of X", mean(data.T[0:n].T)
    print "dimension means of Y", mean(data.T[n:2*n+1].T)

    # compute median data distance in order to use it for the Gaussian kernel
    # width: 0.5*median_distance normally (factor two in Gaussian kernel).
    # However, shogun's kernel width uses a different parametrization,
    # therefore use 0.5*2*median_distance^2.
    # Use a subset of the data for that, only 200 elements; the median is stable.
    # Use a permutation subset to temporarily mix the merged examples.
    subset = IntVector.randperm_vec(features.get_num_vectors())
    subset = subset[0:200]
    features.add_subset(subset)
    dist = EuclideanDistance(features, features)
    distances = dist.get_distance_matrix()
    features.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma = median_distance**2
    print "median distance for Gaussian kernel:", sigma
    kernel = GaussianKernel(10, sigma)

    mmd = QuadraticTimeMMD(kernel, features, n)

    # perform test: compute p-value and test if the null hypothesis is
    # rejected for a test level of 0.05, using different methods to
    # approximate the null distribution
    statistic = mmd.compute_statistic()
    alpha = 0.05

    print "computing p-value using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    # normally, at least 250 iterations should be done, but that takes long
    mmd.set_bootstrap_iterations(10)
    # bootstrapping allows usage of the unbiased or the biased statistic
    mmd.set_statistic_type(UNBIASED)
    p_value = mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test says p!=q:", p_value < alpha

    # this is only possible if SHOGUN was compiled with LAPACK, so check
    if "sample_null_spectrum" in dir(QuadraticTimeMMD):
        print "computing p-value using spectrum method"
        mmd.set_null_approximation_method(MMD2_SPECTRUM)
        # normally, at least 250 samples should be drawn, but that takes long
        mmd.set_num_samples_sepctrum(50)
        mmd.set_num_eigenvalues_spectrum(n - 10)
        # the spectrum method computes p-values for the biased statistic only
        mmd.set_statistic_type(BIASED)
        p_value = mmd.compute_p_value(statistic)
        print "p_value:", p_value
        print "p_value <", alpha, ", i.e. test says p!=q:", p_value < alpha

    print "computing p-value using gamma method"
    mmd.set_null_approximation_method(MMD2_GAMMA)
    # the gamma method computes p-values for the biased statistic only
    mmd.set_statistic_type(BIASED)
    p_value = mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test says p!=q:", p_value < alpha

    # sample from the null distribution (these may be plotted or otherwise used);
    # the mean should be close to zero, the variance strongly depends on data/kernel
    # bootstrapping, biased statistic
    print "sampling null distribution using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_statistic_type(BIASED)
    mmd.set_bootstrap_iterations(10)
    null_samples = mmd.bootstrap_null()
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)

    # spectrum method, biased statistic
    print "sampling null distribution using spectrum method"
    mmd.set_null_approximation_method(MMD2_SPECTRUM)
    mmd.set_statistic_type(BIASED)
    # 50 samples using 10 eigenvalues
    null_samples = mmd.sample_null_spectrum(50, 10)
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)