def statistics_linear_time_mmd(): from shogun.Features import RealFeatures from shogun.Features import DataGenerator from shogun.Kernel import GaussianKernel from shogun.Statistics import LinearTimeMMD from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN # note that the linear time statistic is designed for much larger datasets n=10000 dim=2 difference=0.5 # use data generator class to produce example data # in pratice, this generate data function could be replaced by a method # that obtains data from a stream data=DataGenerator.generate_mean_data(n,dim,difference) print "dimension means of X", mean(data.T[0:n].T) print "dimension means of Y", mean(data.T[n:2*n+1].T) # create shogun feature representation features=RealFeatures(data) # use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization # which is k(x,y)=exp(-||x-y||^2 / tau), in constrast to the standard # k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2 kernel=GaussianKernel(10,8) mmd=LinearTimeMMD(kernel,features, n) # perform test: compute p-value and test if null-hypothesis is rejected for # a test level of 0.05 statistic=mmd.compute_statistic() print "test statistic:", statistic # do the same thing using two different way to approximate null-dstribution # bootstrapping and gaussian approximation (ony for really large samples) alpha=0.05 print "computing p-value using bootstrapping" mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations(50) # normally, far more iterations are needed p_value=mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha print "computing p-value using gaussian approximation" mmd.set_null_approximation_method(MMD1_GAUSSIAN) p_value=mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha # sample from null distribution (these may be plotted or whatsoever) # mean should be close to zero, variance stronly depends on data/kernel mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations(10) # normally, far more iterations are needed null_samples=mmd.bootstrap_null() print "null mean:", mean(null_samples) print "null variance:", var(null_samples)
def statistics_linear_time_mmd(): from shogun.Features import RealFeatures from shogun.Kernel import GaussianKernel from shogun.Statistics import LinearTimeMMD from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN # note that the linear time statistic is designed for much larger datasets n=10000 dim=2 difference=0.5 # data is standard normal distributed. only one dimension of Y has a mean # shift of difference # in pratice, this generate data function could be replaced by a method # that obtains data from a stream (X,Y)=gen_data.create_mean_data(n,dim,difference) print "dimension means of X", [mean(x) for x in X] print "dimension means of Y", [mean(x) for x in Y] # create shogun feature representation features_x=RealFeatures(X) features_y=RealFeatures(Y) # use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization # which is k(x,y)=exp(-||x-y||^2 / tau), in constrast to the standard # k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2 kernel=GaussianKernel(10,8) mmd=LinearTimeMMD(kernel,features_x, features_y) # perform test: compute p-value and test if null-hypothesis is rejected for # a test level of 0.05 # for the linear time mmd, the statistic has to be computed on different # data than the p-value, so first, compute statistic, and then compute # p-value on other data # this annoying property is since the null-distribution should stay normal # which is not the case if "training/test" data would be the same statistic=mmd.compute_statistic() print "test statistic:", statistic # generate new data (same distributions as old) and new statistic object (X,Y)=gen_data.create_mean_data(n,dim,difference) features_x=RealFeatures(X) features_y=RealFeatures(Y) mmd=LinearTimeMMD(kernel,features_x, features_y) # do the same thing using two different way to approximate null-dstribution # bootstrapping and gaussian approximation (ony for really large samples) alpha=0.05 print "computing p-value using bootstrapping" mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations(50) # normally, far more iterations are needed p_value=mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha print "computing p-value using gaussian approximation" mmd.set_null_approximation_method(MMD1_GAUSSIAN) p_value=mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha # sample from null distribution (these may be plotted or whatsoever) # mean should be close to zero, variance stronly depends on data/kernel mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations(10) # normally, far more iterations are needed null_samples=mmd.bootstrap_null() print "null mean:", mean(null_samples) print "null variance:", var(null_samples)
def statistics_linear_time_mmd (): from shogun.Features import RealFeatures from shogun.Features import DataGenerator from shogun.Kernel import GaussianKernel from shogun.Statistics import LinearTimeMMD from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN from shogun.Distance import EuclideanDistance from shogun.Mathematics import Statistics, Math # note that the linear time statistic is designed for much larger datasets n=10000 dim=2 difference=0.5 # use data generator class to produce example data # in pratice, this generate data function could be replaced by a method # that obtains data from a stream data=DataGenerator.generate_mean_data(n,dim,difference) print "dimension means of X", mean(data.T[0:n].T) print "dimension means of Y", mean(data.T[n:2*n+1].T) # create shogun feature representation features=RealFeatures(data) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable # Using all distances here would blow up memory subset=Math.randperm_vec(features.get_num_vectors()) subset=subset[0:200] features.add_subset(subset) dist=EuclideanDistance(features, features) distances=dist.get_distance_matrix() features.remove_subset() median_distance=Statistics.matrix_median(distances, True) sigma=median_distance**2 print "median distance for Gaussian kernel:", sigma kernel=GaussianKernel(10,sigma) mmd=LinearTimeMMD(kernel,features, n) # perform test: compute p-value and test if null-hypothesis is rejected for # a test level of 0.05 statistic=mmd.compute_statistic() print "test statistic:", statistic # do the same thing using two different way to approximate null-dstribution # bootstrapping and gaussian approximation (ony for really large samples) alpha=0.05 print "computing p-value using bootstrapping" mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations(50) # normally, far more iterations are needed p_value=mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha print "computing p-value using gaussian approximation" mmd.set_null_approximation_method(MMD1_GAUSSIAN) p_value=mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha # sample from null distribution (these may be plotted or whatsoever) # mean should be close to zero, variance stronly depends on data/kernel mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations(10) # normally, far more iterations are needed null_samples=mmd.bootstrap_null() print "null mean:", mean(null_samples) print "null variance:", var(null_samples)
def statistics_linear_time_mmd (n,dim,difference): from shogun.Features import RealFeatures from shogun.Features import MeanShiftDataGenerator from shogun.Kernel import GaussianKernel from shogun.Statistics import LinearTimeMMD from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN from shogun.Distance import EuclideanDistance from shogun.Mathematics import Statistics, Math # init seed for reproducability Math.init_random(1) # note that the linear time statistic is designed for much larger datasets # so increase to get reasonable results # streaming data generator for mean shift distributions gen_p=MeanShiftDataGenerator(0, dim) gen_q=MeanShiftDataGenerator(difference, dim) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable # Stream examples and merge them in order to compute median on joint sample features=gen_p.get_streamed_features(100) features=features.create_merged_copy(gen_q.get_streamed_features(100)) # compute all pairwise distances dist=EuclideanDistance(features, features) distances=dist.get_distance_matrix() # compute median and determine kernel width (using shogun) median_distance=Statistics.matrix_median(distances, True) sigma=median_distance**2 #print "median distance for Gaussian kernel:", sigma kernel=GaussianKernel(10,sigma) # mmd instance using streaming features, blocksize of 10000 mmd=LinearTimeMMD(kernel, gen_p, gen_q, n, 10000) # perform test: compute p-value and test if null-hypothesis is rejected for # a test level of 0.05 statistic=mmd.compute_statistic() #print "test statistic:", statistic # do the same thing using two different way to approximate null-dstribution # bootstrapping and gaussian approximation (ony for really large samples) alpha=0.05 #print "computing p-value using bootstrapping" mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations(50) # normally, far more iterations are needed p_value_boot=mmd.compute_p_value(statistic) #print "p_value_boot:", p_value_boot #print "p_value_boot <", alpha, ", i.e. test sais p!=q:", p_value_boot<alpha #print "computing p-value using gaussian approximation" mmd.set_null_approximation_method(MMD1_GAUSSIAN) p_value_gaussian=mmd.compute_p_value(statistic) #print "p_value_gaussian:", p_value_gaussian #print "p_value_gaussian <", alpha, ", i.e. test sais p!=q:", p_value_gaussian<alpha # sample from null distribution (these may be plotted or whatsoever) # mean should be close to zero, variance stronly depends on data/kernel mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations(10) # normally, far more iterations are needed null_samples=mmd.bootstrap_null() #print "null mean:", mean(null_samples) #print "null variance:", var(null_samples) # compute type I and type II errors for Gaussian approximation # number of trials should be larger to compute tight confidence bounds mmd.set_null_approximation_method(MMD1_GAUSSIAN) num_trials=5; alpha=0.05 # test power typeIerrors=[0 for x in range(num_trials)] typeIIerrors=[0 for x in range(num_trials)] for i in range(num_trials): # this effectively means that p=q - rejecting is tpye I error mmd.set_simulate_h0(True) typeIerrors[i]=mmd.perform_test()>alpha mmd.set_simulate_h0(False) typeIIerrors[i]=mmd.perform_test()>alpha #print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors) return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors
def statistics_linear_time_mmd (): from shogun.Features import RealFeatures from shogun.Features import MeanShiftRealDataGenerator from shogun.Kernel import GaussianKernel from shogun.Statistics import LinearTimeMMD from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN from shogun.Distance import EuclideanDistance from shogun.Mathematics import Statistics, Math # note that the linear time statistic is designed for much larger datasets n=10000 dim=2 difference=0.5 # streaming data generator for mean shift distributions gen_p=MeanShiftRealDataGenerator(0, dim) gen_q=MeanShiftRealDataGenerator(difference, dim) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable # Stream examples and merge them in order to compute median on joint sample features=gen_p.get_streamed_features(100) features=features.create_merged_copy(gen_q.get_streamed_features(100)) # compute all pairwise distances dist=EuclideanDistance(features, features) distances=dist.get_distance_matrix() # compute median and determine kernel width (using shogun) median_distance=Statistics.matrix_median(distances, True) sigma=median_distance**2 print "median distance for Gaussian kernel:", sigma kernel=GaussianKernel(10,sigma) # mmd instance using streaming features, blocksize of 10000 mmd=LinearTimeMMD(kernel, gen_p, gen_q, n, 10000) # perform test: compute p-value and test if null-hypothesis is rejected for # a test level of 0.05 statistic=mmd.compute_statistic() print "test statistic:", statistic # do the same thing using two different way to approximate null-dstribution # bootstrapping and gaussian approximation (ony for really large samples) alpha=0.05 print "computing p-value using bootstrapping" mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations(50) # normally, far more iterations are needed p_value=mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha print "computing p-value using gaussian approximation" mmd.set_null_approximation_method(MMD1_GAUSSIAN) p_value=mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha # sample from null distribution (these may be plotted or whatsoever) # mean should be close to zero, variance stronly depends on data/kernel mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations(10) # normally, far more iterations are needed null_samples=mmd.bootstrap_null() print "null mean:", mean(null_samples) print "null variance:", var(null_samples)
def statistics_linear_time_mmd(): from shogun.Features import RealFeatures from shogun.Features import MeanShiftRealDataGenerator from shogun.Kernel import GaussianKernel from shogun.Statistics import LinearTimeMMD from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN from shogun.Distance import EuclideanDistance from shogun.Mathematics import Statistics, Math # note that the linear time statistic is designed for much larger datasets n = 10000 dim = 2 difference = 0.5 # streaming data generator for mean shift distributions gen_p = MeanShiftRealDataGenerator(0, dim) gen_q = MeanShiftRealDataGenerator(difference, dim) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable # Stream examples and merge them in order to compute median on joint sample features = gen_p.get_streamed_features(100) features = features.create_merged_copy(gen_q.get_streamed_features(100)) # compute all pairwise distances dist = EuclideanDistance(features, features) distances = dist.get_distance_matrix() # compute median and determine kernel width (using shogun) median_distance = Statistics.matrix_median(distances, True) sigma = median_distance**2 print "median distance for Gaussian kernel:", sigma kernel = GaussianKernel(10, sigma) # mmd instance using streaming features, blocksize of 10000 mmd = LinearTimeMMD(kernel, gen_p, gen_q, n, 10000) # perform test: compute p-value and test if null-hypothesis is rejected for # a test level of 0.05 statistic = mmd.compute_statistic() print "test statistic:", statistic # do the same thing using two different way to approximate null-dstribution # bootstrapping and gaussian approximation (ony for really large samples) alpha = 0.05 print "computing p-value using bootstrapping" mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations( 50) # normally, far more iterations are needed p_value = mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha print "computing p-value using gaussian approximation" mmd.set_null_approximation_method(MMD1_GAUSSIAN) p_value = mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha # sample from null distribution (these may be plotted or whatsoever) # mean should be close to zero, variance stronly depends on data/kernel mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations( 10) # normally, far more iterations are needed null_samples = mmd.bootstrap_null() print "null mean:", mean(null_samples) print "null variance:", var(null_samples)
def statistics_linear_time_mmd(n, dim, difference): from shogun.Features import RealFeatures from shogun.Features import MeanShiftDataGenerator from shogun.Kernel import GaussianKernel from shogun.Statistics import LinearTimeMMD from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN from shogun.Distance import EuclideanDistance from shogun.Mathematics import Statistics, Math # init seed for reproducability Math.init_random(1) # note that the linear time statistic is designed for much larger datasets # so increase to get reasonable results # streaming data generator for mean shift distributions gen_p = MeanShiftDataGenerator(0, dim) gen_q = MeanShiftDataGenerator(difference, dim) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable # Stream examples and merge them in order to compute median on joint sample features = gen_p.get_streamed_features(100) features = features.create_merged_copy(gen_q.get_streamed_features(100)) # compute all pairwise distances dist = EuclideanDistance(features, features) distances = dist.get_distance_matrix() # compute median and determine kernel width (using shogun) median_distance = Statistics.matrix_median(distances, True) sigma = median_distance**2 #print "median distance for Gaussian kernel:", sigma kernel = GaussianKernel(10, sigma) # mmd instance using streaming features, blocksize of 10000 mmd = LinearTimeMMD(kernel, gen_p, gen_q, n, 10000) # perform test: compute p-value and test if null-hypothesis is rejected for # a test level of 0.05 statistic = mmd.compute_statistic() #print "test statistic:", statistic # do the same thing using two different way to approximate null-dstribution # bootstrapping and gaussian approximation (ony for really large samples) alpha = 0.05 #print "computing p-value using bootstrapping" mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations( 50) # normally, far more iterations are needed p_value_boot = mmd.compute_p_value(statistic) #print "p_value_boot:", p_value_boot #print "p_value_boot <", alpha, ", i.e. test sais p!=q:", p_value_boot<alpha #print "computing p-value using gaussian approximation" mmd.set_null_approximation_method(MMD1_GAUSSIAN) p_value_gaussian = mmd.compute_p_value(statistic) #print "p_value_gaussian:", p_value_gaussian #print "p_value_gaussian <", alpha, ", i.e. test sais p!=q:", p_value_gaussian<alpha # sample from null distribution (these may be plotted or whatsoever) # mean should be close to zero, variance stronly depends on data/kernel mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations( 10) # normally, far more iterations are needed null_samples = mmd.bootstrap_null() #print "null mean:", mean(null_samples) #print "null variance:", var(null_samples) # compute type I and type II errors for Gaussian approximation # number of trials should be larger to compute tight confidence bounds mmd.set_null_approximation_method(MMD1_GAUSSIAN) num_trials = 5 alpha = 0.05 # test power typeIerrors = [0 for x in range(num_trials)] typeIIerrors = [0 for x in range(num_trials)] for i in range(num_trials): # this effectively means that p=q - rejecting is tpye I error mmd.set_simulate_h0(True) typeIerrors[i] = mmd.perform_test() > alpha mmd.set_simulate_h0(False) typeIIerrors[i] = mmd.perform_test() > alpha #print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors) return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors