def statistics_linear_time_mmd(): from shogun.Features import RealFeatures from shogun.Features import DataGenerator from shogun.Kernel import GaussianKernel from shogun.Statistics import LinearTimeMMD from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN # note that the linear time statistic is designed for much larger datasets n=10000 dim=2 difference=0.5 # use data generator class to produce example data # in pratice, this generate data function could be replaced by a method # that obtains data from a stream data=DataGenerator.generate_mean_data(n,dim,difference) print "dimension means of X", mean(data.T[0:n].T) print "dimension means of Y", mean(data.T[n:2*n+1].T) # create shogun feature representation features=RealFeatures(data) # use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization # which is k(x,y)=exp(-||x-y||^2 / tau), in constrast to the standard # k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2 kernel=GaussianKernel(10,8) mmd=LinearTimeMMD(kernel,features, n) # perform test: compute p-value and test if null-hypothesis is rejected for # a test level of 0.05 statistic=mmd.compute_statistic() print "test statistic:", statistic # do the same thing using two different way to approximate null-dstribution # bootstrapping and gaussian approximation (ony for really large samples) alpha=0.05 print "computing p-value using bootstrapping" mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations(50) # normally, far more iterations are needed p_value=mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha print "computing p-value using gaussian approximation" mmd.set_null_approximation_method(MMD1_GAUSSIAN) p_value=mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. 
test sais p!=q:", p_value<alpha # sample from null distribution (these may be plotted or whatsoever) # mean should be close to zero, variance stronly depends on data/kernel mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations(10) # normally, far more iterations are needed null_samples=mmd.bootstrap_null() print "null mean:", mean(null_samples) print "null variance:", var(null_samples)
def statistics_linear_time_mmd_kernel_choice(): from shogun.Features import RealFeatures, CombinedFeatures from shogun.Features import DataGenerator from shogun.Kernel import GaussianKernel, CombinedKernel from shogun.Statistics import LinearTimeMMD from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN # note that the linear time statistic is designed for much larger datasets n=50000 dim=5 difference=2 # use data generator class to produce example data # in pratice, this generate data function could be replaced by a method # that obtains data from a stream data=DataGenerator.generate_mean_data(n,dim,difference) print "dimension means of X", mean(data.T[0:n].T) print "dimension means of Y", mean(data.T[n:2*n+1].T) # create kernels/features to choose from # here: just a bunch of Gaussian Kernels with different widths # real sigmas are 2^-5, ..., 2^10 sigmas=array([pow(2,x) for x in range(-5,10)]) # shogun has a different parametrization of the Gaussian kernel shogun_sigmas=array([x*x*2 for x in sigmas]) # We will use multiple kernels kernel=CombinedKernel() # two separate feature objects here, could also be one with appended data features=CombinedFeatures() # all kernels work on same features for i in range(len(sigmas)): kernel.append_kernel(GaussianKernel(10, shogun_sigmas[i])) features.append_feature_obj(RealFeatures(data)) mmd=LinearTimeMMD(kernel,features, n) print "start learning kernel weights" mmd.set_opt_regularization_eps(10E-5) mmd.set_opt_low_cut(10E-5) mmd.set_opt_max_iterations(1000) mmd.set_opt_epsilon(10E-7) mmd.optimize_kernel_weights() weights=kernel.get_subkernel_weights() print "learned weights:", weights #pyplot.plot(array(range(len(sigmas))), weights) #pyplot.show() print "index of max weight", weights.argmax()
def statistics_linear_time_mmd_kernel_choice(): from shogun.Features import RealFeatures, CombinedFeatures from shogun.Features import DataGenerator from shogun.Kernel import GaussianKernel, CombinedKernel from shogun.Statistics import LinearTimeMMD from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN # note that the linear time statistic is designed for much larger datasets n = 50000 dim = 5 difference = 2 # use data generator class to produce example data # in pratice, this generate data function could be replaced by a method # that obtains data from a stream data = DataGenerator.generate_mean_data(n, dim, difference) print "dimension means of X", mean(data.T[0:n].T) print "dimension means of Y", mean(data.T[n:2 * n + 1].T) # create kernels/features to choose from # here: just a bunch of Gaussian Kernels with different widths # real sigmas are 2^-5, ..., 2^10 sigmas = array([pow(2, x) for x in range(-5, 10)]) # shogun has a different parametrization of the Gaussian kernel shogun_sigmas = array([x * x * 2 for x in sigmas]) # We will use multiple kernels kernel = CombinedKernel() # two separate feature objects here, could also be one with appended data features = CombinedFeatures() # all kernels work on same features for i in range(len(sigmas)): kernel.append_kernel(GaussianKernel(10, shogun_sigmas[i])) features.append_feature_obj(RealFeatures(data)) mmd = LinearTimeMMD(kernel, features, n) print "start learning kernel weights" mmd.set_opt_regularization_eps(10E-5) mmd.set_opt_low_cut(10E-5) mmd.set_opt_max_iterations(1000) mmd.set_opt_epsilon(10E-7) mmd.optimize_kernel_weights() weights = kernel.get_subkernel_weights() print "learned weights:", weights #pyplot.plot(array(range(len(sigmas))), weights) #pyplot.show() print "index of max weight", weights.argmax()
# for nice plotting that fits into our shogun tutorial import latex_plot_inits # parameters, change to get different results m=1000 # set to 10000 for a good test result dim=2 # setting the difference of the first dimension smaller makes a harder test difference=1 # number of samples taken from null and alternative distribution num_null_samples=500 # use data generator class to produce example data data=DataGenerator.generate_mean_data(m,dim,difference) # create shogun feature representation features=RealFeatures(data) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable # Using all distances here would blow up memory subset=Math.randperm_vec(features.get_num_vectors()) subset=subset[0:200] features.add_subset(subset) dist=EuclideanDistance(features, features) distances=dist.get_distance_matrix()
def statistics_hsic (): from shogun.Features import RealFeatures from shogun.Features import DataGenerator from shogun.Kernel import GaussianKernel from shogun.Statistics import HSIC from shogun.Statistics import BOOTSTRAP, HSIC_GAMMA from shogun.Distance import EuclideanDistance from shogun.Mathematics import Statistics, IntVector # note that the HSIC has to store kernel matrices # which upper bounds the sample size n=250 difference=3 angle=pi/3 # use data generator class to produce example data data=DataGenerator.generate_sym_mix_gauss(n,difference,angle) #plot(data[0], data[1], 'x');show() # create shogun feature representation features_x=RealFeatures(array([data[0]])) features_y=RealFeatures(array([data[1]])) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. 
Median is stable subset=IntVector.randperm_vec(features_x.get_num_vectors()) subset=subset[0:200] features_x.add_subset(subset) dist=EuclideanDistance(features_x, features_x) distances=dist.get_distance_matrix() features_x.remove_subset() median_distance=Statistics.matrix_median(distances, True) sigma_x=median_distance**2 features_y.add_subset(subset) dist=EuclideanDistance(features_y, features_y) distances=dist.get_distance_matrix() features_y.remove_subset() median_distance=Statistics.matrix_median(distances, True) sigma_y=median_distance**2 print "median distance for Gaussian kernel on x:", sigma_x print "median distance for Gaussian kernel on y:", sigma_y kernel_x=GaussianKernel(10,sigma_x) kernel_y=GaussianKernel(10,sigma_y) hsic=HSIC(kernel_x,kernel_y,features_x,features_y) # perform test: compute p-value and test if null-hypothesis is rejected for # a test level of 0.05 using different methods to approximate # null-distribution statistic=hsic.compute_statistic() print "HSIC:", statistic alpha=0.05 print "computing p-value using bootstrapping" hsic.set_null_approximation_method(BOOTSTRAP) # normally, at least 250 iterations should be done, but that takes long hsic.set_bootstrap_iterations(100) # bootstrapping allows usage of unbiased or biased statistic p_value=hsic.compute_p_value(statistic) thresh=hsic.compute_threshold(alpha) print "p_value:", p_value print "threshold for 0.05 alpha:", thresh print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value<alpha print "computing p-value using gamma method" hsic.set_null_approximation_method(HSIC_GAMMA) p_value=hsic.compute_p_value(statistic) thresh=hsic.compute_threshold(alpha) print "p_value:", p_value print "threshold for 0.05 alpha:", thresh print "p_value <", alpha, ", i.e. 
test sais p and q are dependend::", p_value<alpha # sample from null distribution (these may be plotted or whatsoever) # mean should be close to zero, variance stronly depends on data/kernel # bootstrapping, biased statistic print "sampling null distribution using bootstrapping" hsic.set_null_approximation_method(BOOTSTRAP) hsic.set_bootstrap_iterations(100) null_samples=hsic.bootstrap_null() print "null mean:", mean(null_samples) print "null variance:", var(null_samples)
def statistics_linear_time_mmd (): from shogun.Features import RealFeatures from shogun.Features import DataGenerator from shogun.Kernel import GaussianKernel from shogun.Statistics import LinearTimeMMD from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN from shogun.Distance import EuclideanDistance from shogun.Mathematics import Statistics, Math # note that the linear time statistic is designed for much larger datasets n=10000 dim=2 difference=0.5 # use data generator class to produce example data # in pratice, this generate data function could be replaced by a method # that obtains data from a stream data=DataGenerator.generate_mean_data(n,dim,difference) print "dimension means of X", mean(data.T[0:n].T) print "dimension means of Y", mean(data.T[n:2*n+1].T) # create shogun feature representation features=RealFeatures(data) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. 
Median is stable # Using all distances here would blow up memory subset=Math.randperm_vec(features.get_num_vectors()) subset=subset[0:200] features.add_subset(subset) dist=EuclideanDistance(features, features) distances=dist.get_distance_matrix() features.remove_subset() median_distance=Statistics.matrix_median(distances, True) sigma=median_distance**2 print "median distance for Gaussian kernel:", sigma kernel=GaussianKernel(10,sigma) mmd=LinearTimeMMD(kernel,features, n) # perform test: compute p-value and test if null-hypothesis is rejected for # a test level of 0.05 statistic=mmd.compute_statistic() print "test statistic:", statistic # do the same thing using two different way to approximate null-dstribution # bootstrapping and gaussian approximation (ony for really large samples) alpha=0.05 print "computing p-value using bootstrapping" mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations(50) # normally, far more iterations are needed p_value=mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha print "computing p-value using gaussian approximation" mmd.set_null_approximation_method(MMD1_GAUSSIAN) p_value=mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha # sample from null distribution (these may be plotted or whatsoever) # mean should be close to zero, variance stronly depends on data/kernel mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations(10) # normally, far more iterations are needed null_samples=mmd.bootstrap_null() print "null mean:", mean(null_samples) print "null variance:", var(null_samples)
def hsic_graphical(): # parameters, change to get different results m=250 difference=3 # setting the angle lower makes a harder test angle=pi/30 # number of samples taken from null and alternative distribution num_null_samples=500 # use data generator class to produce example data data=DataGenerator.generate_sym_mix_gauss(m,difference,angle) # create shogun feature representation features_x=RealFeatures(array([data[0]])) features_y=RealFeatures(array([data[1]])) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable subset=int32(array([x for x in range(features_x.get_num_vectors())])) # numpy subset=random.permutation(subset) # numpy permutation subset=subset[0:200] features_x.add_subset(subset) dist=EuclideanDistance(features_x, features_x) distances=dist.get_distance_matrix() features_x.remove_subset() median_distance=Statistics.matrix_median(distances, True) sigma_x=median_distance**2 features_y.add_subset(subset) dist=EuclideanDistance(features_y, features_y) distances=dist.get_distance_matrix() features_y.remove_subset() median_distance=Statistics.matrix_median(distances, True) sigma_y=median_distance**2 print "median distance for Gaussian kernel on x:", sigma_x print "median distance for Gaussian kernel on y:", sigma_y kernel_x=GaussianKernel(10,sigma_x) kernel_y=GaussianKernel(10,sigma_y) # create hsic instance. Note that this is a convienience constructor which copies # feature data. features_x and features_y are not these used in hsic. # This is only for user-friendlyness. Usually, its ok to do this. 
# Below, the alternative distribution is sampled, which means # that new feature objects have to be created in each iteration (slow) # However, normally, the alternative distribution is not sampled hsic=HSIC(kernel_x,kernel_y,features_x,features_y) # sample alternative distribution alt_samples=zeros(num_null_samples) for i in range(len(alt_samples)): data=DataGenerator.generate_sym_mix_gauss(m,difference,angle) features_x.set_feature_matrix(array([data[0]])) features_y.set_feature_matrix(array([data[1]])) # re-create hsic instance everytime since feature objects are copied due to # useage of convienience constructor hsic=HSIC(kernel_x,kernel_y,features_x,features_y) alt_samples[i]=hsic.compute_statistic() # sample from null distribution # bootstrapping, biased statistic hsic.set_null_approximation_method(BOOTSTRAP) hsic.set_bootstrap_iterations(num_null_samples) null_samples_boot=hsic.bootstrap_null() # fit gamma distribution, biased statistic hsic.set_null_approximation_method(HSIC_GAMMA) gamma_params=hsic.fit_null_gamma() # sample gamma with parameters null_samples_gamma=array([gamma(gamma_params[0], gamma_params[1]) for _ in range(num_null_samples)]) # plot figure() # plot data x and y subplot(2,2,1) gca().xaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks grid(True) plot(data[0], data[1], 'o') title('Data, rotation=$\pi$/'+str(1/angle*pi)+'\nm='+str(m)) xlabel('$x$') ylabel('$y$') # compute threshold for test level alpha=0.05 null_samples_boot.sort() null_samples_gamma.sort() thresh_boot=null_samples_boot[floor(len(null_samples_boot)*(1-alpha))]; thresh_gamma=null_samples_gamma[floor(len(null_samples_gamma)*(1-alpha))]; type_one_error_boot=sum(null_samples_boot<thresh_boot)/float(num_null_samples) type_one_error_gamma=sum(null_samples_gamma<thresh_boot)/float(num_null_samples) # plot alternative distribution with threshold subplot(2,2,2) 
gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks grid(True) hist(alt_samples, 20, normed=True); axvline(thresh_boot, 0, 1, linewidth=2, color='red') type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples) title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error)) # compute range for all null distribution histograms hist_range=[min([min(null_samples_boot), min(null_samples_gamma)]), max([max(null_samples_boot), max(null_samples_gamma)])] # plot null distribution with threshold subplot(2,2,3) gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks grid(True) hist(null_samples_boot, 20, range=hist_range, normed=True); axvline(thresh_boot, 0, 1, linewidth=2, color='red') title('Bootstrapped Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot)) # plot null distribution gamma subplot(2,2,4) gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks grid(True) hist(null_samples_gamma, 20, range=hist_range, normed=True); axvline(thresh_gamma, 0, 1, linewidth=2, color='red') title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma)) grid(True) # pull plots a bit apart subplots_adjust(hspace=0.5) subplots_adjust(wspace=0.5)
def statistics_quadratic_time_mmd(): from shogun.Features import RealFeatures from shogun.Features import DataGenerator from shogun.Kernel import GaussianKernel from shogun.Statistics import QuadraticTimeMMD from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED # note that the quadratic time mmd has to store kernel matrices # which upper bounds the sample size n=500 dim=2 difference=0.5 # use data generator class to produce example data data=DataGenerator.generate_mean_data(n,dim,difference) print "dimension means of X", mean(data.T[0:n].T) print "dimension means of Y", mean(data.T[n:2*n+1].T) # create shogun feature representation features=RealFeatures(data) # use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization # which is k(x,y)=exp(-||x-y||^2 / tau), in constrast to the standard # k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2 kernel=GaussianKernel(10,8) mmd=QuadraticTimeMMD(kernel,features, n) # perform test: compute p-value and test if null-hypothesis is rejected for # a test level of 0.05 using different methods to approximate # null-distribution statistic=mmd.compute_statistic() alpha=0.05 print "computing p-value using bootstrapping" mmd.set_null_approximation_method(BOOTSTRAP) # normally, at least 250 iterations should be done, but that takes long mmd.set_bootstrap_iterations(10) # bootstrapping allows usage of unbiased or biased statistic mmd.set_statistic_type(UNBIASED) p_value=mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. 
test sais p!=q:", p_value<alpha # only can do this if SHOGUN was compiled with LAPACK so check if "sample_null_spectrum" in dir(QuadraticTimeMMD): print "computing p-value using spectrum method" mmd.set_null_approximation_method(MMD2_SPECTRUM) # normally, at least 250 iterations should be done, but that takes long mmd.set_num_samples_sepctrum(50) mmd.set_num_eigenvalues_spectrum(n-10) # spectrum method computes p-value for biased statistics only mmd.set_statistic_type(BIASED) p_value=mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha print "computing p-value using gamma method" mmd.set_null_approximation_method(MMD2_GAMMA) # gamma method computes p-value for biased statistics only mmd.set_statistic_type(BIASED) p_value=mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha # sample from null distribution (these may be plotted or whatsoever) # mean should be close to zero, variance stronly depends on data/kernel # bootstrapping, biased statistic print "sampling null distribution using bootstrapping" mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_statistic_type(BIASED) mmd.set_bootstrap_iterations(10) null_samples=mmd.bootstrap_null() print "null mean:", mean(null_samples) print "null variance:", var(null_samples) # sample from null distribution (these may be plotted or whatsoever) # mean should be close to zero, variance stronly depends on data/kernel # spectrum, biased statistic print "sampling null distribution using spectrum method" mmd.set_null_approximation_method(MMD2_SPECTRUM) mmd.set_statistic_type(BIASED) # 200 samples using 100 eigenvalues null_samples=mmd.sample_null_spectrum(50,10) print "null mean:", mean(null_samples) print "null variance:", var(null_samples)
# for nice plotting that fits into our shogun tutorial import latex_plot_inits # parameters, change to get different results m=250 difference=3 # setting the angle lower makes a harder test angle=pi/30 # number of samples taken from null and alternative distribution num_null_samples=500 # use data generator class to produce example data data=DataGenerator.generate_sym_mix_gauss(m,difference,angle) # create shogun feature representation features_x=RealFeatures(array([data[0]])) features_y=RealFeatures(array([data[1]])) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable subset=Math.randperm_vec(features_x.get_num_vectors()) subset=subset[0:200] features_x.add_subset(subset) dist=EuclideanDistance(features_x, features_x) distances=dist.get_distance_matrix()
def statistics_quadratic_time_mmd(): from shogun.Features import RealFeatures from shogun.Features import DataGenerator from shogun.Kernel import GaussianKernel from shogun.Statistics import QuadraticTimeMMD from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED from shogun.Distance import EuclideanDistance from shogun.Mathematics import Statistics, Math # note that the quadratic time mmd has to store kernel matrices # which upper bounds the sample size n = 500 dim = 2 difference = 0.5 # use data generator class to produce example data data = DataGenerator.generate_mean_data(n, dim, difference) print "dimension means of X", mean(data.T[0:n].T) print "dimension means of Y", mean(data.T[n:2 * n + 1].T) # create shogun feature representation features = RealFeatures(data) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. 
Median is stable subset = Math.randperm_vec(features.get_num_vectors()) subset = subset[0:200] features.add_subset(subset) dist = EuclideanDistance(features, features) distances = dist.get_distance_matrix() features.remove_subset() median_distance = Statistics.matrix_median(distances, True) sigma = median_distance**2 print "median distance for Gaussian kernel:", sigma kernel = GaussianKernel(10, sigma) mmd = QuadraticTimeMMD(kernel, features, n) # perform test: compute p-value and test if null-hypothesis is rejected for # a test level of 0.05 using different methods to approximate # null-distribution statistic = mmd.compute_statistic() alpha = 0.05 print "computing p-value using bootstrapping" mmd.set_null_approximation_method(BOOTSTRAP) # normally, at least 250 iterations should be done, but that takes long mmd.set_bootstrap_iterations(10) # bootstrapping allows usage of unbiased or biased statistic mmd.set_statistic_type(UNBIASED) p_value = mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha # only can do this if SHOGUN was compiled with LAPACK so check if "sample_null_spectrum" in dir(QuadraticTimeMMD): print "computing p-value using spectrum method" mmd.set_null_approximation_method(MMD2_SPECTRUM) # normally, at least 250 iterations should be done, but that takes long mmd.set_num_samples_sepctrum(50) mmd.set_num_eigenvalues_spectrum(n - 10) # spectrum method computes p-value for biased statistics only mmd.set_statistic_type(BIASED) p_value = mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha print "computing p-value using gamma method" mmd.set_null_approximation_method(MMD2_GAMMA) # gamma method computes p-value for biased statistics only mmd.set_statistic_type(BIASED) p_value = mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. 
test sais p!=q:", p_value < alpha # sample from null distribution (these may be plotted or whatsoever) # mean should be close to zero, variance stronly depends on data/kernel # bootstrapping, biased statistic print "sampling null distribution using bootstrapping" mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_statistic_type(BIASED) mmd.set_bootstrap_iterations(10) null_samples = mmd.bootstrap_null() print "null mean:", mean(null_samples) print "null variance:", var(null_samples) # sample from null distribution (these may be plotted or whatsoever) # mean should be close to zero, variance stronly depends on data/kernel # spectrum, biased statistic print "sampling null distribution using spectrum method" mmd.set_null_approximation_method(MMD2_SPECTRUM) mmd.set_statistic_type(BIASED) # 200 samples using 100 eigenvalues null_samples = mmd.sample_null_spectrum(50, 10) print "null mean:", mean(null_samples) print "null variance:", var(null_samples)
# for nice plotting that fits into our shogun tutorial import latex_plot_inits # parameters, change to get different results m = 1000 # set to 10000 for a good test result dim = 2 # setting the difference of the first dimension smaller makes a harder test difference = 1 # number of samples taken from null and alternative distribution num_null_samples = 500 # use data generator class to produce example data data = DataGenerator.generate_mean_data(m, dim, difference) # create shogun feature representation features = RealFeatures(data) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable # Using all distances here would blow up memory subset = Math.randperm_vec(features.get_num_vectors()) subset = subset[0:200] features.add_subset(subset) dist = EuclideanDistance(features, features) distances = dist.get_distance_matrix()
def hsic_graphical(): # parameters, change to get different results m = 250 difference = 3 # setting the angle lower makes a harder test angle = pi / 30 # number of samples taken from null and alternative distribution num_null_samples = 500 # use data generator class to produce example data data = DataGenerator.generate_sym_mix_gauss(m, difference, angle) # create shogun feature representation features_x = RealFeatures(array([data[0]])) features_y = RealFeatures(array([data[1]])) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable subset = int32(array([x for x in range(features_x.get_num_vectors()) ])) # numpy subset = random.permutation(subset) # numpy permutation subset = subset[0:200] features_x.add_subset(subset) dist = EuclideanDistance(features_x, features_x) distances = dist.get_distance_matrix() features_x.remove_subset() median_distance = Statistics.matrix_median(distances, True) sigma_x = median_distance**2 features_y.add_subset(subset) dist = EuclideanDistance(features_y, features_y) distances = dist.get_distance_matrix() features_y.remove_subset() median_distance = Statistics.matrix_median(distances, True) sigma_y = median_distance**2 print "median distance for Gaussian kernel on x:", sigma_x print "median distance for Gaussian kernel on y:", sigma_y kernel_x = GaussianKernel(10, sigma_x) kernel_y = GaussianKernel(10, sigma_y) # create hsic instance. Note that this is a convienience constructor which copies # feature data. features_x and features_y are not these used in hsic. # This is only for user-friendlyness. Usually, its ok to do this. 
# Below, the alternative distribution is sampled, which means # that new feature objects have to be created in each iteration (slow) # However, normally, the alternative distribution is not sampled hsic = HSIC(kernel_x, kernel_y, features_x, features_y) # sample alternative distribution alt_samples = zeros(num_null_samples) for i in range(len(alt_samples)): data = DataGenerator.generate_sym_mix_gauss(m, difference, angle) features_x.set_feature_matrix(array([data[0]])) features_y.set_feature_matrix(array([data[1]])) # re-create hsic instance everytime since feature objects are copied due to # useage of convienience constructor hsic = HSIC(kernel_x, kernel_y, features_x, features_y) alt_samples[i] = hsic.compute_statistic() # sample from null distribution # bootstrapping, biased statistic hsic.set_null_approximation_method(BOOTSTRAP) hsic.set_bootstrap_iterations(num_null_samples) null_samples_boot = hsic.bootstrap_null() # fit gamma distribution, biased statistic hsic.set_null_approximation_method(HSIC_GAMMA) gamma_params = hsic.fit_null_gamma() # sample gamma with parameters null_samples_gamma = array([ gamma(gamma_params[0], gamma_params[1]) for _ in range(num_null_samples) ]) # plot figure() # plot data x and y subplot(2, 2, 1) gca().xaxis.set_major_locator( MaxNLocator(nbins=4)) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins=4)) # reduce number of x-ticks grid(True) plot(data[0], data[1], 'o') title('Data, rotation=$\pi$/' + str(1 / angle * pi) + '\nm=' + str(m)) xlabel('$x$') ylabel('$y$') # compute threshold for test level alpha = 0.05 null_samples_boot.sort() null_samples_gamma.sort() thresh_boot = null_samples_boot[floor( len(null_samples_boot) * (1 - alpha))] thresh_gamma = null_samples_gamma[floor( len(null_samples_gamma) * (1 - alpha))] type_one_error_boot = sum( null_samples_boot < thresh_boot) / float(num_null_samples) type_one_error_gamma = sum( null_samples_gamma < thresh_boot) / float(num_null_samples) # plot alternative 
distribution with threshold subplot(2, 2, 2) gca().xaxis.set_major_locator( MaxNLocator(nbins=3)) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins=3)) # reduce number of x-ticks grid(True) hist(alt_samples, 20, normed=True) axvline(thresh_boot, 0, 1, linewidth=2, color='red') type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples) title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error)) # compute range for all null distribution histograms hist_range = [ min([min(null_samples_boot), min(null_samples_gamma)]), max([max(null_samples_boot), max(null_samples_gamma)]) ] # plot null distribution with threshold subplot(2, 2, 3) gca().xaxis.set_major_locator( MaxNLocator(nbins=3)) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins=3)) # reduce number of x-ticks grid(True) hist(null_samples_boot, 20, range=hist_range, normed=True) axvline(thresh_boot, 0, 1, linewidth=2, color='red') title('Bootstrapped Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot)) # plot null distribution gamma subplot(2, 2, 4) gca().xaxis.set_major_locator( MaxNLocator(nbins=3)) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins=3)) # reduce number of x-ticks grid(True) hist(null_samples_gamma, 20, range=hist_range, normed=True) axvline(thresh_gamma, 0, 1, linewidth=2, color='red') title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma)) grid(True) # pull plots a bit apart subplots_adjust(hspace=0.5) subplots_adjust(wspace=0.5)
def statistics_hsic(n, difference, angle):
    """Run an HSIC test on generated data and return its results.

    The random generator is seeded with 1, data is drawn with
    ``DataGenerator.generate_sym_mix_gauss(n, difference, angle)``, and the
    two rows of the result are used as the x and y samples. Each sample gets
    its own Gaussian kernel whose width is the squared median pairwise
    Euclidean distance measured on a random subset of the data.

    Args:
        n, difference, angle: forwarded unchanged to
            ``DataGenerator.generate_sym_mix_gauss``; their exact meaning is
            defined by that generator (presumably sample count, component
            distance and rotation angle -- confirm against its docs).

    Returns:
        The tuple ``(p_value_boot, thresh_boot, p_value_gamma, thresh_gamma,
        statistic, null_samples)``: p-value and threshold (test level 0.05)
        from the bootstrap approximation, the same pair from the gamma
        approximation, the HSIC statistic, and the samples returned by
        ``bootstrap_null()``.
    """
    from shogun.Features import RealFeatures
    from shogun.Features import DataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import HSIC
    from shogun.Statistics import BOOTSTRAP, HSIC_GAMMA
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Math, Statistics, IntVector

    def median_width(feats, indices):
        # Kernel width for one sample: restrict the features to `indices`,
        # take the median of the pairwise Euclidean distances, square it.
        # The usual heuristic is 0.5*median_distance (factor two in the
        # Gaussian kernel); shogun parametrizes the width differently, hence
        # 0.5*2*median_distance^2 == median_distance^2.
        feats.add_subset(indices)
        metric = EuclideanDistance(feats, feats)
        pairwise = metric.get_distance_matrix()
        feats.remove_subset()
        return Statistics.matrix_median(pairwise, True) ** 2

    # fixed seed for reproducibility
    Math.init_random(1)

    # HSIC has to store kernel matrices, which upper-bounds the sample size
    samples = DataGenerator.generate_sym_mix_gauss(n, difference, angle)

    # one single-row feature matrix per variable
    features_x = RealFeatures(array([samples[0]]))
    features_y = RealFeatures(array([samples[1]]))

    # The median is stable, so at most 200 randomly chosen elements are used;
    # the same index subset serves both x and y.
    chosen = IntVector.randperm_vec(features_x.get_num_vectors())[0:200]
    width_x = median_width(features_x, chosen)
    width_y = median_width(features_y, chosen)

    test = HSIC(
        GaussianKernel(10, width_x),
        GaussianKernel(10, width_y),
        features_x,
        features_y,
    )

    observed = test.compute_statistic()
    level = 0.05

    # Bootstrap approximation of the null distribution. Normally at least
    # 250 iterations should be done, but that takes long.
    test.set_null_approximation_method(BOOTSTRAP)
    test.set_bootstrap_iterations(100)
    boot_p = test.compute_p_value(observed)
    boot_threshold = test.compute_threshold(level)

    # Gamma approximation of the null distribution.
    test.set_null_approximation_method(HSIC_GAMMA)
    gamma_p = test.compute_p_value(observed)
    gamma_threshold = test.compute_threshold(level)

    # Draw samples from the bootstrapped null distribution (for plotting
    # etc.); their mean should be close to zero, while the variance depends
    # strongly on data and kernel.
    test.set_null_approximation_method(BOOTSTRAP)
    test.set_bootstrap_iterations(100)
    null_draws = test.bootstrap_null()

    return (
        boot_p,
        boot_threshold,
        gamma_p,
        gamma_threshold,
        observed,
        null_draws,
    )
def statistics_hsic():
    """Demonstrate the HSIC independence test on generated example data.

    Generates data with ``DataGenerator.generate_sym_mix_gauss(250, 3, pi/3)``,
    uses its two rows as x and y samples, and prints the HSIC statistic,
    the p-value and threshold (test level 0.05) under both the bootstrap and
    the gamma approximation of the null distribution, followed by mean and
    variance of bootstrapped null samples. Returns nothing.

    The names ``array``, ``pi``, ``mean`` and ``var`` are not imported here
    and must be available at module scope (presumably from numpy -- confirm
    against the file's imports).
    """
    from shogun.Features import RealFeatures
    from shogun.Features import DataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import HSIC
    from shogun.Statistics import BOOTSTRAP, HSIC_GAMMA

    def report(*items):
        # Emit the items separated by single spaces, exactly as the
        # Python 2 statement ``print a, b, c`` does, but through a
        # single-argument call that is also valid syntax on Python 3.
        print(" ".join(str(item) for item in items))

    # note that the HSIC has to store kernel matrices
    # which upper bounds the sample size
    n = 250
    difference = 3
    angle = pi / 3

    # use data generator class to produce example data
    data = DataGenerator.generate_sym_mix_gauss(n, difference, angle)

    # create shogun feature representation, one single-row matrix per variable
    features_x = RealFeatures(array([data[0]]))
    features_y = RealFeatures(array([data[1]]))

    # use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
    # which is k(x,y)=exp(-||x-y||^2 / tau), in contrast to the standard
    # k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
    kernel = GaussianKernel(10, 8)

    # the same kernel instance is passed for both x and y, as in the original
    hsic = HSIC(kernel, kernel, features_x, features_y)

    # perform test: compute p-value and test if null-hypothesis is rejected
    # for a test level of 0.05 using different methods to approximate
    # the null-distribution
    statistic = hsic.compute_statistic()
    report("HSIC:", statistic)
    alpha = 0.05

    report("computing p-value using bootstrapping")
    hsic.set_null_approximation_method(BOOTSTRAP)
    # normally, at least 250 iterations should be done, but that takes long
    hsic.set_bootstrap_iterations(100)
    # bootstrapping allows usage of unbiased or biased statistic
    p_value = hsic.compute_p_value(statistic)
    thresh = hsic.compute_threshold(alpha)
    report("p_value:", p_value)
    report("threshold for 0.05 alpha:", thresh)
    report("p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value < alpha)

    report("computing p-value using gamma method")
    hsic.set_null_approximation_method(HSIC_GAMMA)
    p_value = hsic.compute_p_value(statistic)
    thresh = hsic.compute_threshold(alpha)
    report("p_value:", p_value)
    report("threshold for 0.05 alpha:", thresh)
    report("p_value <", alpha, ", i.e. test sais p and q are dependend::", p_value < alpha)

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance strongly depends on data/kernel
    # bootstrapping, biased statistic
    report("sampling null distribution using bootstrapping")
    hsic.set_null_approximation_method(BOOTSTRAP)
    hsic.set_bootstrap_iterations(100)
    null_samples = hsic.bootstrap_null()
    report("null mean:", mean(null_samples))
    report("null variance:", var(null_samples))
def statistics_quadratic_time_mmd():
    """Demonstrate the quadratic-time MMD two-sample test on example data.

    Generates data with ``DataGenerator.generate_mean_data(500, 2, 0.5)``,
    selects the Gaussian kernel width as the squared median pairwise
    Euclidean distance on a random subset of at most 200 vectors, and prints
    p-values for the bootstrap, spectrum and gamma approximations of the
    null distribution, followed by mean and variance of null samples.
    Returns nothing.

    Everything that relies on the spectrum method is executed only when
    ``QuadraticTimeMMD`` exposes ``sample_null_spectrum``; the original
    comment states this requires SHOGUN to be compiled with LAPACK.

    The names ``mean`` and ``var`` are not imported here and must be
    available at module scope (presumably from numpy -- confirm against the
    file's imports).
    """
    from shogun.Features import RealFeatures
    from shogun.Features import DataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import QuadraticTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math

    def report(*items):
        # Emit the items separated by single spaces, exactly as the
        # Python 2 statement ``print a, b, c`` does, but through a
        # single-argument call that is also valid syntax on Python 3.
        print(" ".join(str(item) for item in items))

    # note that the quadratic time mmd has to store kernel matrices
    # which upper bounds the sample size
    n = 500
    dim = 2
    difference = 0.5

    # use data generator class to produce example data
    data = DataGenerator.generate_mean_data(n, dim, difference)

    # NOTE(review): mean() without an axis yields one overall value, not one
    # value per dimension as the label suggests -- kept as in the original.
    report("dimension means of X", mean(data.T[0:n].T))
    report("dimension means of Y", mean(data.T[n:2 * n + 1].T))

    # create shogun feature representation
    features = RealFeatures(data)

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shogun's kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    subset = Math.randperm_vec(features.get_num_vectors())
    subset = subset[0:200]
    features.add_subset(subset)
    dist = EuclideanDistance(features, features)
    distances = dist.get_distance_matrix()
    features.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma = median_distance ** 2
    report("median distance for Gaussian kernel:", sigma)
    kernel = GaussianKernel(10, sigma)

    mmd = QuadraticTimeMMD(kernel, features, n)

    # perform test: compute p-value and test if null-hypothesis is rejected
    # for a test level of 0.05 using different methods to approximate
    # the null-distribution
    statistic = mmd.compute_statistic()
    alpha = 0.05

    report("computing p-value using bootstrapping")
    mmd.set_null_approximation_method(BOOTSTRAP)
    # normally, at least 250 iterations should be done, but that takes long
    mmd.set_bootstrap_iterations(10)
    # bootstrapping allows usage of unbiased or biased statistic
    mmd.set_statistic_type(UNBIASED)
    p_value = mmd.compute_p_value(statistic)
    report("p_value:", p_value)
    report("p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha)

    # The spectrum method is only available if SHOGUN was compiled with
    # LAPACK; check once and reuse the answer for every spectrum-based step.
    spectrum_available = "sample_null_spectrum" in dir(QuadraticTimeMMD)

    if spectrum_available:
        report("computing p-value using spectrum method")
        mmd.set_null_approximation_method(MMD2_SPECTRUM)
        # normally, at least 250 iterations should be done, but that takes long
        # ("sepctrum" is the method name used by the original call)
        mmd.set_num_samples_sepctrum(50)
        mmd.set_num_eigenvalues_spectrum(n - 10)
        # spectrum method computes p-value for biased statistics only
        mmd.set_statistic_type(BIASED)
        p_value = mmd.compute_p_value(statistic)
        report("p_value:", p_value)
        report("p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha)

    report("computing p-value using gamma method")
    mmd.set_null_approximation_method(MMD2_GAMMA)
    # gamma method computes p-value for biased statistics only
    mmd.set_statistic_type(BIASED)
    p_value = mmd.compute_p_value(statistic)
    report("p_value:", p_value)
    report("p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha)

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance strongly depends on data/kernel
    # bootstrapping, biased statistic
    report("sampling null distribution using bootstrapping")
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_statistic_type(BIASED)
    mmd.set_bootstrap_iterations(10)
    null_samples = mmd.bootstrap_null()
    report("null mean:", mean(null_samples))
    report("null variance:", var(null_samples))

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance strongly depends on data/kernel
    # spectrum, biased statistic
    # Guarded like the spectrum p-value above: without the method the call
    # below would raise AttributeError instead of being skipped.
    if spectrum_available:
        report("sampling null distribution using spectrum method")
        mmd.set_null_approximation_method(MMD2_SPECTRUM)
        mmd.set_statistic_type(BIASED)
        # 50 samples using 10 eigenvalues
        null_samples = mmd.sample_null_spectrum(50, 10)
        report("null mean:", mean(null_samples))
        report("null variance:", var(null_samples))