def distance_director_euclidean_modular(fm_train_real=traindat, fm_test_real=testdat, scale=1.2): from shogun.Distance import EuclideanDistance from modshogun import Time feats_train = RealFeatures(fm_train_real) feats_train.io.set_loglevel(0) feats_train.parallel.set_num_threads(1) feats_test = RealFeatures(fm_test_real) distance = EuclideanDistance() distance.init(feats_train, feats_test) ddistance = DirectorEuclideanDistance() ddistance.init(feats_train, feats_test) print "dm_train" t = Time() dm_train = distance.get_distance_matrix() t1 = t.cur_time_diff(True) print "ddm_train" t = Time() ddm_train = ddistance.get_distance_matrix() t2 = t.cur_time_diff(True) print "dm_train", dm_train print "ddm_train", ddm_train return dm_train, ddm_train
def converter_multidimensionalscaling_modular(data):
    """Embed *data* in 2-D via classic multidimensional scaling and return
    the relative change of the pairwise distance matrix,
    ||D_after - D_before|| / ||D_before||.

    Prints a message and returns None if shogun was built without Eigen3.
    """
    try:
        from shogun.Features import RealFeatures
        from shogun.Converter import MultidimensionalScaling
        from shogun.Distance import EuclideanDistance

        feats = RealFeatures(data)

        # pairwise distances in the original space
        before = EuclideanDistance()
        before.init(feats, feats)

        mds = MultidimensionalScaling()
        mds.set_target_dim(2)
        mds.set_landmark(False)
        embedding = mds.apply(feats)

        # pairwise distances in the 2-D embedding
        after = EuclideanDistance()
        after.init(embedding, embedding)

        d_after = after.get_distance_matrix()
        d_before = before.get_distance_matrix()
        return numpy.linalg.norm(d_after - d_before) / numpy.linalg.norm(d_before)
    except ImportError:
        print('No Eigen3 available')
def converter_multidimensionalscaling_modular (data):
    """Variant of the MDS example that returns True when the 2-D embedding
    preserves the pairwise distance matrix to within relative error 1e-6.

    Prints a message and returns None if shogun was built without Eigen3.
    """
    try:
        from shogun.Features import RealFeatures
        from shogun.Converter import MultidimensionalScaling
        from shogun.Distance import EuclideanDistance

        feats = RealFeatures(data)

        # distances before embedding
        before = EuclideanDistance()
        before.init(feats, feats)

        mds = MultidimensionalScaling()
        mds.set_target_dim(2)
        mds.set_landmark(False)
        embedding = mds.apply(feats)

        # distances after embedding
        after = EuclideanDistance()
        after.init(embedding, embedding)

        d_after = after.get_distance_matrix()
        d_before = before.get_distance_matrix()
        rel_err = numpy.linalg.norm(d_after - d_before) / numpy.linalg.norm(d_before)
        return rel_err < 1e-6
    except ImportError:
        print('No Eigen3 available')
def distance_euclidean_modular (fm_train_real=traindat,fm_test_real=testdat):
    """Build a EuclideanDistance over train features and return the distance
    object plus the train/train and train/test distance matrices."""
    from shogun.Features import RealFeatures
    from shogun.Distance import EuclideanDistance

    train = RealFeatures(fm_train_real)
    test = RealFeatures(fm_test_real)

    distance = EuclideanDistance(train, train)
    dm_train = distance.get_distance_matrix()

    # re-initialise the same object against the test set
    distance.init(train, test)
    dm_test = distance.get_distance_matrix()

    return distance, dm_train, dm_test
def RunKMeansShogun(q):
    """Time shogun K-Means clustering on self.dataset and put the elapsed
    time on queue *q*; enqueues and returns -1 on any failure.

    NOTE(review): this function reads names from an enclosing scope
    (self, seed, clusters, maxIterations, Timer) -- presumably a closure
    inside a benchmark method; verify against the caller.
    """
    import numpy as np
    from shogun.Distance import EuclideanDistance
    from shogun.Features import RealFeatures
    from shogun import Clustering
    from shogun.Mathematics import Math_init_random

    totalTimer = Timer()

    if seed:
        # seed shogun's RNG so clustering is reproducible
        Math_init_random(seed.group(1))
    try:
        data = np.genfromtxt(self.dataset, delimiter=',')
        dataFeat = RealFeatures(data.T)
        distance = EuclideanDistance(dataFeat, dataFeat)

        # Create the K-Means object and perform K-Means clustering.
        with totalTimer:
            model = Clustering.KMeans(int(clusters.group(1)), distance)
            model.set_max_iter(maxIterations)
            model.train()

            labels = model.apply().get_labels()
            centers = model.get_cluster_centers()
    except Exception as e:
        # any failure (bad file, bad options, shogun error) reports -1
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def distance_director_euclidean_modular(fm_train_real=traindat, fm_test_real=testdat, scale=1.2):
    """Compare shogun's built-in EuclideanDistance with a python director
    distance implementing the same metric; return both distance matrices.

    Returns None when shogun was built without SWIG director support.
    """
    try:
        from shogun.Distance import DirectorDistance
    except ImportError:
        print("recompile shogun with --enable-swig-directors")
        return

    class DirectorEuclideanDistance(DirectorDistance):
        """Python-side euclidean distance plugged into shogun via directors."""

        def __init__(self):
            DirectorDistance.__init__(self, True)

        def distance_function(self, idx_a, idx_b):
            lhs_vec = self.get_lhs().get_feature_vector(idx_a)
            rhs_vec = self.get_rhs().get_feature_vector(idx_b)
            return numpy.linalg.norm(lhs_vec - rhs_vec)

    from shogun.Distance import EuclideanDistance
    from modshogun import Time

    feats_train = RealFeatures(fm_train_real)
    feats_train.parallel.set_num_threads(1)
    feats_test = RealFeatures(fm_test_real)

    builtin = EuclideanDistance()
    builtin.init(feats_train, feats_test)

    director = DirectorEuclideanDistance()
    director.init(feats_train, feats_test)

    t = Time()
    dm_train = builtin.get_distance_matrix()

    t = Time()
    ddm_train = director.get_distance_matrix()

    return dm_train, ddm_train
def RunAllKnnShogun(q):
    """Run shogun all-k-nearest-neighbors on self.dataset, timing the core
    computation, and put the elapsed time on queue *q* (-1 on any failure).

    NOTE(review): reads names from an enclosing scope (self, options, Timer,
    Log, np, re, RealFeatures, MulticlassLabels, EuclideanDistance, SKNN) --
    presumably a closure inside a benchmark method; verify against the caller.
    """
    totalTimer = Timer()

    # Load input dataset.
    # If the dataset contains two files then the second file is the query
    # file.
    try:
        Log.Info("Loading dataset", self.verbose)
        if len(self.dataset) == 2:
            referenceData = np.genfromtxt(self.dataset[0], delimiter=',')
            queryData = np.genfromtxt(self.dataset[1], delimiter=',')
            # FIX: was RealFeatures(queryFeat.T), which referenced queryFeat
            # before assignment (NameError); the features come from queryData.
            queryFeat = RealFeatures(queryData.T)
        else:
            referenceData = np.genfromtxt(self.dataset, delimiter=',')

        # Labels are the last row of the dataset.
        labels = MulticlassLabels(
            referenceData[:, (referenceData.shape[1] - 1)])
        referenceData = referenceData[:, :-1]

        with totalTimer:
            # Get all the parameters.
            k = re.search(r"-k (\d+)", options)
            if not k:
                Log.Fatal(
                    "Required option: Number of furthest neighbors to find.")
                q.put(-1)
                return -1
            else:
                k = int(k.group(1))
                if (k < 1 or k > referenceData.shape[0]):
                    # FIX: k has already been rebound to int above, so the
                    # original k.group(1) here raised AttributeError.
                    Log.Fatal("Invalid k: " + str(k) +
                              "; must be greater than 0" +
                              " and less or equal than " +
                              str(referenceData.shape[0]))
                    q.put(-1)
                    return -1

            referenceFeat = RealFeatures(referenceData.T)
            distance = EuclideanDistance(referenceFeat, referenceFeat)

            # Perform All K-Nearest-Neighbors.
            model = SKNN(k, distance, labels)
            model.train()

            if len(self.dataset) == 2:
                out = model.apply(queryFeat).get_labels()
            else:
                out = model.apply(referenceFeat).get_labels()
    except Exception as e:
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def distance_director_euclidean_modular (fm_train_real=traindat,fm_test_real=testdat,scale=1.2):
    """Benchmark shogun's builtin EuclideanDistance against an equivalent
    python director distance, printing both matrices and timing each
    (Python 2 demo script).

    Returns (dm_train, ddm_train), or None when shogun was built without
    SWIG director support.
    """
    try:
        from shogun.Distance import DirectorDistance
    except ImportError:
        print "recompile shogun with --enable-swig-directors"
        return

    class DirectorEuclideanDistance(DirectorDistance):
        def __init__(self):
            # NOTE(review): confirm the meaning of the boolean flag passed
            # to DirectorDistance against the shogun director API docs.
            DirectorDistance.__init__(self, True)

        def distance_function(self, idx_a, idx_b):
            # euclidean norm between lhs vector idx_a and rhs vector idx_b
            seq1 = self.get_lhs().get_feature_vector(idx_a)
            seq2 = self.get_rhs().get_feature_vector(idx_b)
            return numpy.linalg.norm(seq1-seq2)

    from shogun.Distance import EuclideanDistance
    from modshogun import Time

    feats_train=RealFeatures(fm_train_real)
    feats_train.io.set_loglevel(0)
    # single-threaded so both timings are comparable
    feats_train.parallel.set_num_threads(1)
    feats_test=RealFeatures(fm_test_real)

    distance=EuclideanDistance()
    distance.init(feats_train, feats_test)

    ddistance=DirectorEuclideanDistance()
    ddistance.init(feats_train, feats_test)

    # time the builtin implementation
    print "dm_train"
    t=Time()
    dm_train=distance.get_distance_matrix()
    t1=t.cur_time_diff(True)

    # time the director (python callback) implementation
    print "ddm_train"
    t=Time()
    ddm_train=ddistance.get_distance_matrix()
    t2=t.cur_time_diff(True)

    print "dm_train", dm_train
    print "ddm_train", ddm_train

    return dm_train, ddm_train
def run_clustering(data, k):
    """Cluster *data* into *k* groups with shogun KMeans and return the
    resulting cluster centers."""
    from shogun.Clustering import KMeans
    from shogun.Distance import EuclideanDistance
    from shogun.Features import RealFeatures

    features = RealFeatures(data)
    metric = EuclideanDistance(features, features)

    model = KMeans(k, metric)
    model.train()

    return model.get_cluster_centers()
def assign_labels(data, centroids, ncenters):
    """Label each point in *data* with the index of its nearest centroid by
    running a 1-nearest-neighbour classifier over the centroids."""
    from shogun.Distance import EuclideanDistance
    from shogun.Features import RealFeatures, MulticlassLabels
    from shogun.Classifier import KNN
    from numpy import arange

    # one label per centroid: 0.0 .. ncenters-1
    centroid_labels = MulticlassLabels(arange(0., ncenters))

    points = RealFeatures(data)
    centers = RealFeatures(centroids)

    metric = EuclideanDistance(centers, centers)
    classifier = KNN(1, metric, centroid_labels)
    classifier.train()

    return classifier.apply(points)
def distance_normsquared_modular(fm_train_real=traindat, fm_test_real=testdat):
    """Squared euclidean distance matrices (sqrt disabled) for train/train
    and train/test; returns (distance, dm_train, dm_test)."""
    from shogun.Features import RealFeatures
    from shogun.Distance import EuclideanDistance

    train = RealFeatures(fm_train_real)
    test = RealFeatures(fm_test_real)

    distance = EuclideanDistance(train, train)
    distance.set_disable_sqrt(True)  # skip sqrt -> squared norms
    dm_train = distance.get_distance_matrix()

    distance.init(train, test)
    dm_test = distance.get_distance_matrix()

    return distance, dm_train, dm_test
def kernel_spherical_modular (fm_train_real=traindat,fm_test_real=testdat, sigma=1.0):
    """MultiquadricKernel matrices for train/train and train/test;
    returns (km_train, km_test, kernel)."""
    from shogun.Features import RealFeatures
    from shogun.Kernel import MultiquadricKernel
    from shogun.Distance import EuclideanDistance

    train = RealFeatures(fm_train_real)
    test = RealFeatures(fm_test_real)

    metric = EuclideanDistance(train, train)
    kernel = MultiquadricKernel(train, train, sigma, metric)
    km_train = kernel.get_kernel_matrix()

    kernel.init(train, test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def kernel_rationalquadratic_modular (fm_train_real=traindat,fm_test_real=testdat, shift_coef=1.0):
    """RationalQuadraticKernel matrices for train/train and train/test;
    returns (km_train, km_test, kernel)."""
    from shogun.Features import RealFeatures
    from shogun.Kernel import RationalQuadraticKernel
    from shogun.Distance import EuclideanDistance

    train = RealFeatures(fm_train_real)
    test = RealFeatures(fm_test_real)

    metric = EuclideanDistance(train, train)
    kernel = RationalQuadraticKernel(train, train, shift_coef, metric)
    km_train = kernel.get_kernel_matrix()

    kernel.init(train, test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def clustering_hierarchical_modular(fm_train=traindat, merges=3):
    """Agglomerative (hierarchical) clustering on euclidean distances;
    returns (model, merge_distances, cluster_pairs)."""
    from shogun.Distance import EuclideanDistance
    from shogun.Features import RealFeatures
    from shogun.Clustering import Hierarchical

    features = RealFeatures(fm_train)
    metric = EuclideanDistance(features, features)

    model = Hierarchical(merges, metric)
    model.train()

    merge_distances = model.get_merge_distances()
    cluster_pairs = model.get_cluster_pairs()

    return model, merge_distances, cluster_pairs
def kernel_wave_modular (fm_train_real=traindat,fm_test_real=testdat, theta=1.0):
    """WaveKernel matrices for train/train and train/test;
    returns (km_train, km_test, kernel)."""
    from shogun.Features import RealFeatures
    from shogun.Kernel import WaveKernel
    from shogun.Distance import EuclideanDistance

    train = RealFeatures(fm_train_real)
    test = RealFeatures(fm_test_real)

    metric = EuclideanDistance(train, train)
    kernel = WaveKernel(train, train, theta, metric)
    km_train = kernel.get_kernel_matrix()

    kernel.init(train, test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def clustering_kmeans_modular(fm_train=traindat, k=3):
    """KMeans clustering with a fixed RNG seed for reproducibility;
    returns (cluster_centers, model)."""
    from shogun.Distance import EuclideanDistance
    from shogun.Features import RealFeatures
    from shogun.Clustering import KMeans
    from shogun.Mathematics import Math_init_random

    Math_init_random(17)  # deterministic initialisation

    features = RealFeatures(fm_train)
    metric = EuclideanDistance(features, features)

    model = KMeans(k, metric)
    model.train()

    centers = model.get_cluster_centers()
    model.get_radiuses()  # called for its side effect; result unused

    return centers, model
def kernel_distance_modular(fm_train_real=traindat, fm_test_real=testdat, width=1.7):
    """DistanceKernel (built on a euclidean distance) matrices;
    returns (km_train, km_test, kernel)."""
    from shogun.Kernel import DistanceKernel
    from shogun.Features import RealFeatures
    from shogun.Distance import EuclideanDistance

    train = RealFeatures(fm_train_real)
    test = RealFeatures(fm_test_real)

    metric = EuclideanDistance()
    # note: constructed directly on (train, test), matching the original
    kernel = DistanceKernel(train, test, width, metric)
    km_train = kernel.get_kernel_matrix()

    kernel.init(train, test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def kernel_power_modular(fm_train_real=traindat, fm_test_real=testdat, degree=2.0):
    """PowerKernel matrices for train/train and train/test;
    returns (km_train, km_test, kernel)."""
    from shogun.Features import RealFeatures
    from shogun.Kernel import PowerKernel
    from shogun.Distance import EuclideanDistance

    train = RealFeatures(fm_train_real)
    test = RealFeatures(fm_test_real)

    metric = EuclideanDistance(train, train)
    kernel = PowerKernel(train, train, degree, metric)
    km_train = kernel.get_kernel_matrix()

    kernel.init(train, test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def classifier_knn_modular(fm_train_real=traindat, fm_test_real=testdat, label_train_multiclass=label_traindat, k=3):
    """Train a k-nearest-neighbour classifier on euclidean distances;
    returns (knn, train_result, test_label_values, multiple_k_output)."""
    from shogun.Features import RealFeatures, MulticlassLabels
    from shogun.Classifier import KNN
    from shogun.Distance import EuclideanDistance

    train = RealFeatures(fm_train_real)
    test = RealFeatures(fm_test_real)

    metric = EuclideanDistance(train, train)
    labels = MulticlassLabels(label_train_multiclass)

    knn = KNN(k, metric, labels)
    knn_train = knn.train()

    output = knn.apply(test).get_labels()
    # predictions for all neighbourhood sizes 1..k at once
    multiple_k = knn.classify_for_multiple_k()

    return knn, knn_train, output, multiple_k
def kernel_exponential_modular(fm_train_real=traindat, fm_test_real=testdat, tau_coef=1.0):
    """ExponentialKernel (cache size 10) matrices for train/train and
    train/test; returns (km_train, km_test, kernel)."""
    from shogun.Features import RealFeatures
    from shogun.Kernel import ExponentialKernel
    from shogun.Distance import EuclideanDistance

    train = RealFeatures(fm_train_real)
    test = RealFeatures(fm_test_real)

    metric = EuclideanDistance(train, train)
    kernel = ExponentialKernel(train, train, tau_coef, metric, 10)
    km_train = kernel.get_kernel_matrix()

    kernel.init(train, test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def statistics_linear_time_mmd (n,dim,difference):
    """Linear-time MMD two-sample test on streamed mean-shift data.

    Tests H0: p == q between a zero-mean and a *difference*-shifted
    Gaussian in *dim* dimensions, using *n* streamed samples.

    Returns (statistic, p_value_boot, p_value_gaussian, null_samples,
    typeIerrors, typeIIerrors).
    """
    from shogun.Features import RealFeatures
    from shogun.Features import MeanShiftDataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import LinearTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math

    # init seed for reproducability
    Math.init_random(1)

    # note that the linear time statistic is designed for much larger datasets
    # so increase to get reasonable results

    # streaming data generator for mean shift distributions
    gen_p=MeanShiftDataGenerator(0, dim)
    gen_q=MeanShiftDataGenerator(difference, dim)

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shoguns kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable

    # Stream examples and merge them in order to compute median on joint sample
    features=gen_p.get_streamed_features(100)
    features=features.create_merged_copy(gen_q.get_streamed_features(100))

    # compute all pairwise distances
    dist=EuclideanDistance(features, features)
    distances=dist.get_distance_matrix()

    # compute median and determine kernel width (using shogun)
    median_distance=Statistics.matrix_median(distances, True)
    sigma=median_distance**2
    #print "median distance for Gaussian kernel:", sigma
    kernel=GaussianKernel(10,sigma)

    # mmd instance using streaming features, blocksize of 10000
    mmd=LinearTimeMMD(kernel, gen_p, gen_q, n, 10000)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05
    statistic=mmd.compute_statistic()
    #print "test statistic:", statistic

    # do the same thing using two different way to approximate null-dstribution
    # bootstrapping and gaussian approximation (ony for really large samples)
    alpha=0.05

    #print "computing p-value using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(50) # normally, far more iterations are needed
    p_value_boot=mmd.compute_p_value(statistic)
    #print "p_value_boot:", p_value_boot
    #print "p_value_boot <", alpha, ", i.e. test sais p!=q:", p_value_boot<alpha

    #print "computing p-value using gaussian approximation"
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    p_value_gaussian=mmd.compute_p_value(statistic)
    #print "p_value_gaussian:", p_value_gaussian
    #print "p_value_gaussian <", alpha, ", i.e. test sais p!=q:", p_value_gaussian<alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance stronly depends on data/kernel
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(10) # normally, far more iterations are needed
    null_samples=mmd.bootstrap_null()
    #print "null mean:", mean(null_samples)
    #print "null variance:", var(null_samples)

    # compute type I and type II errors for Gaussian approximation
    # number of trials should be larger to compute tight confidence bounds
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    num_trials=5;
    alpha=0.05 # test power
    typeIerrors=[0 for x in range(num_trials)]
    typeIIerrors=[0 for x in range(num_trials)]
    for i in range(num_trials):
        # this effectively means that p=q - rejecting is tpye I error
        mmd.set_simulate_h0(True)
        typeIerrors[i]=mmd.perform_test()>alpha
        mmd.set_simulate_h0(False)
        typeIIerrors[i]=mmd.perform_test()>alpha

    #print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)

    return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors
def statistics_hsic (n, difference, angle):
    """HSIC statistical independence test on a symmetric Gaussian mixture.

    Generates *n* dependent (x, y) samples (dependence controlled by
    *difference* and *angle*), selects Gaussian kernel widths via the
    median heuristic, and tests H0: x independent of y.

    Returns (p_value_boot, thresh_boot, p_value_gamma, thresh_gamma,
    statistic, null_samples).
    """
    from shogun.Features import RealFeatures
    from shogun.Features import DataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import HSIC
    from shogun.Statistics import BOOTSTRAP, HSIC_GAMMA
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Math, Statistics, IntVector

    # init seed for reproducability
    Math.init_random(1)

    # note that the HSIC has to store kernel matrices
    # which upper bounds the sample size

    # use data generator class to produce example data
    data=DataGenerator.generate_sym_mix_gauss(n,difference,angle)
    #plot(data[0], data[1], 'x');show()

    # create shogun feature representation
    features_x=RealFeatures(array([data[0]]))
    features_y=RealFeatures(array([data[1]]))

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shoguns kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    subset=IntVector.randperm_vec(features_x.get_num_vectors())
    subset=subset[0:200]

    # median distance on x (temporary subset keeps memory bounded)
    features_x.add_subset(subset)
    dist=EuclideanDistance(features_x, features_x)
    distances=dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance=Statistics.matrix_median(distances, True)
    sigma_x=median_distance**2

    # median distance on y (same subset indices)
    features_y.add_subset(subset)
    dist=EuclideanDistance(features_y, features_y)
    distances=dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance=Statistics.matrix_median(distances, True)
    sigma_y=median_distance**2
    #print "median distance for Gaussian kernel on x:", sigma_x
    #print "median distance for Gaussian kernel on y:", sigma_y

    kernel_x=GaussianKernel(10,sigma_x)
    kernel_y=GaussianKernel(10,sigma_y)

    hsic=HSIC(kernel_x,kernel_y,features_x,features_y)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05 using different methods to approximate
    # null-distribution
    statistic=hsic.compute_statistic()
    #print "HSIC:", statistic
    alpha=0.05

    #print "computing p-value using bootstrapping"
    hsic.set_null_approximation_method(BOOTSTRAP)
    # normally, at least 250 iterations should be done, but that takes long
    hsic.set_bootstrap_iterations(100)
    # bootstrapping allows usage of unbiased or biased statistic
    p_value_boot=hsic.compute_p_value(statistic)
    thresh_boot=hsic.compute_threshold(alpha)
    #print "p_value:", p_value_boot
    #print "threshold for 0.05 alpha:", thresh_boot
    #print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value_boot<alpha

    #print "computing p-value using gamma method"
    hsic.set_null_approximation_method(HSIC_GAMMA)
    p_value_gamma=hsic.compute_p_value(statistic)
    thresh_gamma=hsic.compute_threshold(alpha)
    #print "p_value:", p_value_gamma
    #print "threshold for 0.05 alpha:", thresh_gamma
    #print "p_value <", alpha, ", i.e. test sais p and q are dependend::", p_value_gamma<alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance stronly depends on data/kernel
    # bootstrapping, biased statistic
    #print "sampling null distribution using bootstrapping"
    hsic.set_null_approximation_method(BOOTSTRAP)
    hsic.set_bootstrap_iterations(100)
    null_samples=hsic.bootstrap_null()
    #print "null mean:", mean(null_samples)
    #print "null variance:", var(null_samples)
    #hist(null_samples, 100); show()

    return p_value_boot, thresh_boot, p_value_gamma, thresh_gamma, statistic, null_samples
def statistics_quadratic_time_mmd ():
    """Quadratic-time MMD two-sample test demo (Python 2 script).

    Streams two mean-shifted Gaussian samples, picks the Gaussian kernel
    width by the median heuristic, then computes p-values via bootstrap,
    spectrum (LAPACK-only) and gamma approximations, and finally samples
    the null distribution. Prints results; returns nothing.

    NOTE(review): relies on mean/var from a file-level numpy import;
    indentation of the LAPACK-guarded block was reconstructed -- confirm
    against the upstream shogun example.
    """
    from shogun.Features import RealFeatures
    from shogun.Features import MeanShiftRealDataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import QuadraticTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, IntVector

    # note that the quadratic time mmd has to store kernel matrices
    # which upper bounds the sample size
    n=500
    dim=2
    difference=0.5

    # streaming data generator for mean shift distributions
    gen_p=MeanShiftRealDataGenerator(0, dim)
    gen_q=MeanShiftRealDataGenerator(difference, dim)

    # Stream examples and merge them in order to compute median on joint sample
    # alternative is to call a different constructor of QuadraticTimeMMD
    features=gen_p.get_streamed_features(n)
    features=features.create_merged_copy(gen_q.get_streamed_features(n))

    # use data generator class to produce example data
    data=features.get_feature_matrix()

    print "dimension means of X", mean(data.T[0:n].T)
    print "dimension means of Y", mean(data.T[n:2*n+1].T)

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shoguns kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    # Use a permutation set to temporarily merge features in merged examples
    subset=IntVector.randperm_vec(features.get_num_vectors())
    subset=subset[0:200]
    features.add_subset(subset)
    dist=EuclideanDistance(features, features)
    distances=dist.get_distance_matrix()
    features.remove_subset()
    median_distance=Statistics.matrix_median(distances, True)
    sigma=median_distance**2
    print "median distance for Gaussian kernel:", sigma
    kernel=GaussianKernel(10,sigma)

    mmd=QuadraticTimeMMD(kernel,features, n)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05 using different methods to approximate
    # null-distribution
    statistic=mmd.compute_statistic()
    alpha=0.05

    print "computing p-value using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    # normally, at least 250 iterations should be done, but that takes long
    mmd.set_bootstrap_iterations(10)
    # bootstrapping allows usage of unbiased or biased statistic
    mmd.set_statistic_type(UNBIASED)
    p_value=mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha

    # only can do this if SHOGUN was compiled with LAPACK so check
    if "sample_null_spectrum" in dir(QuadraticTimeMMD):
        print "computing p-value using spectrum method"
        mmd.set_null_approximation_method(MMD2_SPECTRUM)
        # normally, at least 250 iterations should be done, but that takes long
        mmd.set_num_samples_sepctrum(50)
        mmd.set_num_eigenvalues_spectrum(n-10)
        # spectrum method computes p-value for biased statistics only
        mmd.set_statistic_type(BIASED)
        p_value=mmd.compute_p_value(statistic)
        print "p_value:", p_value
        print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha

    print "computing p-value using gamma method"
    mmd.set_null_approximation_method(MMD2_GAMMA)
    # gamma method computes p-value for biased statistics only
    mmd.set_statistic_type(BIASED)
    p_value=mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance stronly depends on data/kernel
    # bootstrapping, biased statistic
    print "sampling null distribution using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_statistic_type(BIASED)
    mmd.set_bootstrap_iterations(10)
    null_samples=mmd.bootstrap_null()
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance stronly depends on data/kernel
    # spectrum, biased statistic
    print "sampling null distribution using spectrum method"
    mmd.set_null_approximation_method(MMD2_SPECTRUM)
    mmd.set_statistic_type(BIASED)
    # 200 samples using 100 eigenvalues
    null_samples=mmd.sample_null_spectrum(50,10)
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)
def statistics_linear_time_mmd ():
    """Linear-time MMD two-sample test demo on generated mean-shift data
    (Python 2 script).

    Builds a Gaussian kernel via the median-distance heuristic, computes
    the linear-time MMD statistic on n=10000 samples, and compares
    bootstrap vs. gaussian null approximations. Prints results; returns
    nothing.

    NOTE(review): relies on mean/var from a file-level numpy import.
    """
    from shogun.Features import RealFeatures
    from shogun.Features import DataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import LinearTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math

    # note that the linear time statistic is designed for much larger datasets
    n=10000
    dim=2
    difference=0.5

    # use data generator class to produce example data
    # in pratice, this generate data function could be replaced by a method
    # that obtains data from a stream
    data=DataGenerator.generate_mean_data(n,dim,difference)

    print "dimension means of X", mean(data.T[0:n].T)
    print "dimension means of Y", mean(data.T[n:2*n+1].T)

    # create shogun feature representation
    features=RealFeatures(data)

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shoguns kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    # Using all distances here would blow up memory
    subset=Math.randperm_vec(features.get_num_vectors())
    subset=subset[0:200]
    features.add_subset(subset)
    dist=EuclideanDistance(features, features)
    distances=dist.get_distance_matrix()
    features.remove_subset()
    median_distance=Statistics.matrix_median(distances, True)
    sigma=median_distance**2
    print "median distance for Gaussian kernel:", sigma
    kernel=GaussianKernel(10,sigma)

    mmd=LinearTimeMMD(kernel,features, n)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05
    statistic=mmd.compute_statistic()
    print "test statistic:", statistic

    # do the same thing using two different way to approximate null-dstribution
    # bootstrapping and gaussian approximation (ony for really large samples)
    alpha=0.05

    print "computing p-value using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(50) # normally, far more iterations are needed
    p_value=mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha

    print "computing p-value using gaussian approximation"
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    p_value=mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance stronly depends on data/kernel
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(10) # normally, far more iterations are needed
    null_samples=mmd.bootstrap_null()
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)
def statistics_linear_time_mmd():
    """Linear-time MMD two-sample test demo using streaming generators
    (Python 2 script).

    Like the non-streaming variant above, but the MMD instance consumes
    the two MeanShiftRealDataGenerator streams directly with a blocksize
    of 10000. Prints results; returns nothing.

    NOTE(review): relies on mean/var from a file-level numpy import.
    """
    from shogun.Features import RealFeatures
    from shogun.Features import MeanShiftRealDataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import LinearTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math

    # note that the linear time statistic is designed for much larger datasets
    n = 10000
    dim = 2
    difference = 0.5

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftRealDataGenerator(0, dim)
    gen_q = MeanShiftRealDataGenerator(difference, dim)

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shoguns kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable

    # Stream examples and merge them in order to compute median on joint sample
    features = gen_p.get_streamed_features(100)
    features = features.create_merged_copy(gen_q.get_streamed_features(100))

    # compute all pairwise distances
    dist = EuclideanDistance(features, features)
    distances = dist.get_distance_matrix()

    # compute median and determine kernel width (using shogun)
    median_distance = Statistics.matrix_median(distances, True)
    sigma = median_distance**2
    print "median distance for Gaussian kernel:", sigma
    kernel = GaussianKernel(10, sigma)

    # mmd instance using streaming features, blocksize of 10000
    mmd = LinearTimeMMD(kernel, gen_p, gen_q, n, 10000)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05
    statistic = mmd.compute_statistic()
    print "test statistic:", statistic

    # do the same thing using two different way to approximate null-dstribution
    # bootstrapping and gaussian approximation (ony for really large samples)
    alpha = 0.05

    print "computing p-value using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(
        50)  # normally, far more iterations are needed
    p_value = mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha

    print "computing p-value using gaussian approximation"
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    p_value = mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance stronly depends on data/kernel
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(
        10)  # normally, far more iterations are needed
    null_samples = mmd.bootstrap_null()
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)
def hsic_graphical(): # parameters, change to get different results m = 250 difference = 3 # setting the angle lower makes a harder test angle = pi / 30 # number of samples taken from null and alternative distribution num_null_samples = 500 # use data generator class to produce example data data = DataGenerator.generate_sym_mix_gauss(m, difference, angle) # create shogun feature representation features_x = RealFeatures(array([data[0]])) features_y = RealFeatures(array([data[1]])) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable subset = int32(array([x for x in range(features_x.get_num_vectors()) ])) # numpy subset = random.permutation(subset) # numpy permutation subset = subset[0:200] features_x.add_subset(subset) dist = EuclideanDistance(features_x, features_x) distances = dist.get_distance_matrix() features_x.remove_subset() median_distance = Statistics.matrix_median(distances, True) sigma_x = median_distance**2 features_y.add_subset(subset) dist = EuclideanDistance(features_y, features_y) distances = dist.get_distance_matrix() features_y.remove_subset() median_distance = Statistics.matrix_median(distances, True) sigma_y = median_distance**2 print "median distance for Gaussian kernel on x:", sigma_x print "median distance for Gaussian kernel on y:", sigma_y kernel_x = GaussianKernel(10, sigma_x) kernel_y = GaussianKernel(10, sigma_y) # create hsic instance. Note that this is a convienience constructor which copies # feature data. features_x and features_y are not these used in hsic. # This is only for user-friendlyness. Usually, its ok to do this. 
# Below, the alternative distribution is sampled, which means # that new feature objects have to be created in each iteration (slow) # However, normally, the alternative distribution is not sampled hsic = HSIC(kernel_x, kernel_y, features_x, features_y) # sample alternative distribution alt_samples = zeros(num_null_samples) for i in range(len(alt_samples)): data = DataGenerator.generate_sym_mix_gauss(m, difference, angle) features_x.set_feature_matrix(array([data[0]])) features_y.set_feature_matrix(array([data[1]])) # re-create hsic instance everytime since feature objects are copied due to # useage of convienience constructor hsic = HSIC(kernel_x, kernel_y, features_x, features_y) alt_samples[i] = hsic.compute_statistic() # sample from null distribution # bootstrapping, biased statistic hsic.set_null_approximation_method(BOOTSTRAP) hsic.set_bootstrap_iterations(num_null_samples) null_samples_boot = hsic.bootstrap_null() # fit gamma distribution, biased statistic hsic.set_null_approximation_method(HSIC_GAMMA) gamma_params = hsic.fit_null_gamma() # sample gamma with parameters null_samples_gamma = array([ gamma(gamma_params[0], gamma_params[1]) for _ in range(num_null_samples) ]) # plot figure() # plot data x and y subplot(2, 2, 1) gca().xaxis.set_major_locator( MaxNLocator(nbins=4)) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins=4)) # reduce number of x-ticks grid(True) plot(data[0], data[1], 'o') title('Data, rotation=$\pi$/' + str(1 / angle * pi) + '\nm=' + str(m)) xlabel('$x$') ylabel('$y$') # compute threshold for test level alpha = 0.05 null_samples_boot.sort() null_samples_gamma.sort() thresh_boot = null_samples_boot[floor( len(null_samples_boot) * (1 - alpha))] thresh_gamma = null_samples_gamma[floor( len(null_samples_gamma) * (1 - alpha))] type_one_error_boot = sum( null_samples_boot < thresh_boot) / float(num_null_samples) type_one_error_gamma = sum( null_samples_gamma < thresh_boot) / float(num_null_samples) # plot alternative 
distribution with threshold subplot(2, 2, 2) gca().xaxis.set_major_locator( MaxNLocator(nbins=3)) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins=3)) # reduce number of x-ticks grid(True) hist(alt_samples, 20, normed=True) axvline(thresh_boot, 0, 1, linewidth=2, color='red') type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples) title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error)) # compute range for all null distribution histograms hist_range = [ min([min(null_samples_boot), min(null_samples_gamma)]), max([max(null_samples_boot), max(null_samples_gamma)]) ] # plot null distribution with threshold subplot(2, 2, 3) gca().xaxis.set_major_locator( MaxNLocator(nbins=3)) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins=3)) # reduce number of x-ticks grid(True) hist(null_samples_boot, 20, range=hist_range, normed=True) axvline(thresh_boot, 0, 1, linewidth=2, color='red') title('Bootstrapped Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot)) # plot null distribution gamma subplot(2, 2, 4) gca().xaxis.set_major_locator( MaxNLocator(nbins=3)) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins=3)) # reduce number of x-ticks grid(True) hist(null_samples_gamma, 20, range=hist_range, normed=True) axvline(thresh_gamma, 0, 1, linewidth=2, color='red') title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma)) grid(True) # pull plots a bit apart subplots_adjust(hspace=0.5) subplots_adjust(wspace=0.5)
def statistics_linear_time_mmd(n, dim, difference):
    """Linear-time MMD two-sample test on mean-shift data.

    Parameters
    ----------
    n : number of samples the MMD instance works on (this statistic is
        designed for much larger datasets, so increase for sensible results)
    dim : dimensionality of the generated data
    difference : mean shift between distributions p and q

    Returns
    -------
    (statistic, p_value_boot, p_value_gaussian, null_samples,
     typeIerrors, typeIIerrors)
    """
    from shogun.Features import RealFeatures
    from shogun.Features import MeanShiftDataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import LinearTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math

    # init seed for reproducability
    Math.init_random(1)

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # Median-distance heuristic for the Gaussian kernel width:
    # 0.5*median_distance normally (factor two in the Gaussian kernel), but
    # shogun's width parametrization differs, hence 0.5*2*median_distance^2.
    # Stream 100 examples per distribution and merge them so the median is
    # computed on the joint sample (median is stable on a small subset).
    joint = gen_p.get_streamed_features(100)
    joint = joint.create_merged_copy(gen_q.get_streamed_features(100))

    # all pairwise distances on the joint sample
    pairwise = EuclideanDistance(joint, joint)
    median_distance = Statistics.matrix_median(pairwise.get_distance_matrix(), True)
    sigma = median_distance**2
    kernel = GaussianKernel(10, sigma)

    # mmd instance using streaming features, blocksize of 10000
    mmd = LinearTimeMMD(kernel, gen_p, gen_q, n, 10000)

    # test statistic for a level-0.05 test
    statistic = mmd.compute_statistic()
    alpha = 0.05

    # p-value via bootstrapping (normally, far more iterations are needed)
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(50)
    p_value_boot = mmd.compute_p_value(statistic)

    # p-value via the Gaussian approximation of the null distribution
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    p_value_gaussian = mmd.compute_p_value(statistic)

    # draw samples from the null distribution (mean should be close to zero,
    # variance strongly depends on data/kernel); few iterations for speed
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(10)
    null_samples = mmd.bootstrap_null()

    # Type I / II error estimates using the Gaussian approximation.
    # The number of trials should be larger to get tight confidence bounds.
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    num_trials = 5
    alpha = 0.05  # test power
    typeIerrors = [0 for _ in range(num_trials)]
    typeIIerrors = [0 for _ in range(num_trials)]
    for trial in range(num_trials):
        # simulating H0 effectively means p=q, so rejecting is a type I error
        mmd.set_simulate_h0(True)
        # NOTE(review): comparing perform_test() against alpha with '>' assumes
        # perform_test returns a p-value-like quantity -- confirm against the
        # shogun API before changing.
        typeIerrors[trial] = mmd.perform_test() > alpha
        mmd.set_simulate_h0(False)
        typeIIerrors[trial] = mmd.perform_test() > alpha

    return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors
import numpy as np import os from shogun.Features import RealFeatures, BinaryLabels from shogun.Classifier import KMeans from shogun.Distance import EuclideanDistance # Load the data. f = open(os.path.dirname(__file__) + '../data/two_cluster.data') data = np.fromfile(f, dtype=np.float64, sep=' ') data = data.reshape(-1, 2) f.close() # Perform kmean with 2 clusters. feat = RealFeatures(data.T) distance = EuclideanDistance(feat, feat) kmeans = KMeans(2, distance) kmeans.train() # Show cluster association. print kmeans.apply().get_labels().T # Show cluster centers. print kmeans.get_cluster_centers().T
def hsic_graphical(): # parameters, change to get different results m=250 difference=3 # setting the angle lower makes a harder test angle=pi/30 # number of samples taken from null and alternative distribution num_null_samples=500 # use data generator class to produce example data data=DataGenerator.generate_sym_mix_gauss(m,difference,angle) # create shogun feature representation features_x=RealFeatures(array([data[0]])) features_y=RealFeatures(array([data[1]])) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable subset=int32(array([x for x in range(features_x.get_num_vectors())])) # numpy subset=random.permutation(subset) # numpy permutation subset=subset[0:200] features_x.add_subset(subset) dist=EuclideanDistance(features_x, features_x) distances=dist.get_distance_matrix() features_x.remove_subset() median_distance=Statistics.matrix_median(distances, True) sigma_x=median_distance**2 features_y.add_subset(subset) dist=EuclideanDistance(features_y, features_y) distances=dist.get_distance_matrix() features_y.remove_subset() median_distance=Statistics.matrix_median(distances, True) sigma_y=median_distance**2 print "median distance for Gaussian kernel on x:", sigma_x print "median distance for Gaussian kernel on y:", sigma_y kernel_x=GaussianKernel(10,sigma_x) kernel_y=GaussianKernel(10,sigma_y) # create hsic instance. Note that this is a convienience constructor which copies # feature data. features_x and features_y are not these used in hsic. # This is only for user-friendlyness. Usually, its ok to do this. 
# Below, the alternative distribution is sampled, which means # that new feature objects have to be created in each iteration (slow) # However, normally, the alternative distribution is not sampled hsic=HSIC(kernel_x,kernel_y,features_x,features_y) # sample alternative distribution alt_samples=zeros(num_null_samples) for i in range(len(alt_samples)): data=DataGenerator.generate_sym_mix_gauss(m,difference,angle) features_x.set_feature_matrix(array([data[0]])) features_y.set_feature_matrix(array([data[1]])) # re-create hsic instance everytime since feature objects are copied due to # useage of convienience constructor hsic=HSIC(kernel_x,kernel_y,features_x,features_y) alt_samples[i]=hsic.compute_statistic() # sample from null distribution # bootstrapping, biased statistic hsic.set_null_approximation_method(BOOTSTRAP) hsic.set_bootstrap_iterations(num_null_samples) null_samples_boot=hsic.bootstrap_null() # fit gamma distribution, biased statistic hsic.set_null_approximation_method(HSIC_GAMMA) gamma_params=hsic.fit_null_gamma() # sample gamma with parameters null_samples_gamma=array([gamma(gamma_params[0], gamma_params[1]) for _ in range(num_null_samples)]) # plot figure() # plot data x and y subplot(2,2,1) gca().xaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks grid(True) plot(data[0], data[1], 'o') title('Data, rotation=$\pi$/'+str(1/angle*pi)+'\nm='+str(m)) xlabel('$x$') ylabel('$y$') # compute threshold for test level alpha=0.05 null_samples_boot.sort() null_samples_gamma.sort() thresh_boot=null_samples_boot[floor(len(null_samples_boot)*(1-alpha))]; thresh_gamma=null_samples_gamma[floor(len(null_samples_gamma)*(1-alpha))]; type_one_error_boot=sum(null_samples_boot<thresh_boot)/float(num_null_samples) type_one_error_gamma=sum(null_samples_gamma<thresh_boot)/float(num_null_samples) # plot alternative distribution with threshold subplot(2,2,2) 
gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks grid(True) hist(alt_samples, 20, normed=True); axvline(thresh_boot, 0, 1, linewidth=2, color='red') type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples) title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error)) # compute range for all null distribution histograms hist_range=[min([min(null_samples_boot), min(null_samples_gamma)]), max([max(null_samples_boot), max(null_samples_gamma)])] # plot null distribution with threshold subplot(2,2,3) gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks grid(True) hist(null_samples_boot, 20, range=hist_range, normed=True); axvline(thresh_boot, 0, 1, linewidth=2, color='red') title('Bootstrapped Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot)) # plot null distribution gamma subplot(2,2,4) gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks grid(True) hist(null_samples_gamma, 20, range=hist_range, normed=True); axvline(thresh_gamma, 0, 1, linewidth=2, color='red') title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma)) grid(True) # pull plots a bit apart subplots_adjust(hspace=0.5) subplots_adjust(wspace=0.5)
def statistics_linear_time_mmd (): from shogun.Features import RealFeatures from shogun.Features import MeanShiftRealDataGenerator from shogun.Kernel import GaussianKernel from shogun.Statistics import LinearTimeMMD from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN from shogun.Distance import EuclideanDistance from shogun.Mathematics import Statistics, Math # note that the linear time statistic is designed for much larger datasets n=10000 dim=2 difference=0.5 # streaming data generator for mean shift distributions gen_p=MeanShiftRealDataGenerator(0, dim) gen_q=MeanShiftRealDataGenerator(difference, dim) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable # Stream examples and merge them in order to compute median on joint sample features=gen_p.get_streamed_features(100) features=features.create_merged_copy(gen_q.get_streamed_features(100)) # compute all pairwise distances dist=EuclideanDistance(features, features) distances=dist.get_distance_matrix() # compute median and determine kernel width (using shogun) median_distance=Statistics.matrix_median(distances, True) sigma=median_distance**2 print "median distance for Gaussian kernel:", sigma kernel=GaussianKernel(10,sigma) # mmd instance using streaming features, blocksize of 10000 mmd=LinearTimeMMD(kernel, gen_p, gen_q, n, 10000) # perform test: compute p-value and test if null-hypothesis is rejected for # a test level of 0.05 statistic=mmd.compute_statistic() print "test statistic:", statistic # do the same thing using two different way to approximate null-dstribution # bootstrapping and gaussian approximation (ony for really large samples) alpha=0.05 print "computing p-value using bootstrapping" mmd.set_null_approximation_method(BOOTSTRAP) 
mmd.set_bootstrap_iterations(50) # normally, far more iterations are needed p_value=mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha print "computing p-value using gaussian approximation" mmd.set_null_approximation_method(MMD1_GAUSSIAN) p_value=mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha # sample from null distribution (these may be plotted or whatsoever) # mean should be close to zero, variance stronly depends on data/kernel mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations(10) # normally, far more iterations are needed null_samples=mmd.bootstrap_null() print "null mean:", mean(null_samples) print "null variance:", var(null_samples)
# use data generator class to produce example data data=DataGenerator.generate_mean_data(m,dim,difference) # create shogun feature representation features=RealFeatures(data) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable # Using all distances here would blow up memory subset=Math.randperm_vec(features.get_num_vectors()) subset=subset[0:200] features.add_subset(subset) dist=EuclideanDistance(features, features) distances=dist.get_distance_matrix() features.remove_subset() median_distance=Statistics.matrix_median(distances, True) sigma=median_distance**2 print "median distance for Gaussian kernel:", sigma kernel=GaussianKernel(10,sigma) # use biased statistic mmd=LinearTimeMMD(kernel,features, m) # sample alternative distribution alt_samples=zeros(num_null_samples) for i in range(len(alt_samples)): data=DataGenerator.generate_mean_data(m,dim,difference) features.set_feature_matrix(data)
def statistics_hsic (): from shogun.Features import RealFeatures from shogun.Features import DataGenerator from shogun.Kernel import GaussianKernel from shogun.Statistics import HSIC from shogun.Statistics import BOOTSTRAP, HSIC_GAMMA from shogun.Distance import EuclideanDistance from shogun.Mathematics import Statistics, IntVector # note that the HSIC has to store kernel matrices # which upper bounds the sample size n=250 difference=3 angle=pi/3 # use data generator class to produce example data data=DataGenerator.generate_sym_mix_gauss(n,difference,angle) #plot(data[0], data[1], 'x');show() # create shogun feature representation features_x=RealFeatures(array([data[0]])) features_y=RealFeatures(array([data[1]])) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. 
Median is stable subset=IntVector.randperm_vec(features_x.get_num_vectors()) subset=subset[0:200] features_x.add_subset(subset) dist=EuclideanDistance(features_x, features_x) distances=dist.get_distance_matrix() features_x.remove_subset() median_distance=Statistics.matrix_median(distances, True) sigma_x=median_distance**2 features_y.add_subset(subset) dist=EuclideanDistance(features_y, features_y) distances=dist.get_distance_matrix() features_y.remove_subset() median_distance=Statistics.matrix_median(distances, True) sigma_y=median_distance**2 print "median distance for Gaussian kernel on x:", sigma_x print "median distance for Gaussian kernel on y:", sigma_y kernel_x=GaussianKernel(10,sigma_x) kernel_y=GaussianKernel(10,sigma_y) hsic=HSIC(kernel_x,kernel_y,features_x,features_y) # perform test: compute p-value and test if null-hypothesis is rejected for # a test level of 0.05 using different methods to approximate # null-distribution statistic=hsic.compute_statistic() print "HSIC:", statistic alpha=0.05 print "computing p-value using bootstrapping" hsic.set_null_approximation_method(BOOTSTRAP) # normally, at least 250 iterations should be done, but that takes long hsic.set_bootstrap_iterations(100) # bootstrapping allows usage of unbiased or biased statistic p_value=hsic.compute_p_value(statistic) thresh=hsic.compute_threshold(alpha) print "p_value:", p_value print "threshold for 0.05 alpha:", thresh print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value<alpha print "computing p-value using gamma method" hsic.set_null_approximation_method(HSIC_GAMMA) p_value=hsic.compute_p_value(statistic) thresh=hsic.compute_threshold(alpha) print "p_value:", p_value print "threshold for 0.05 alpha:", thresh print "p_value <", alpha, ", i.e. 
test sais p and q are dependend::", p_value<alpha # sample from null distribution (these may be plotted or whatsoever) # mean should be close to zero, variance stronly depends on data/kernel # bootstrapping, biased statistic print "sampling null distribution using bootstrapping" hsic.set_null_approximation_method(BOOTSTRAP) hsic.set_bootstrap_iterations(100) null_samples=hsic.bootstrap_null() print "null mean:", mean(null_samples) print "null variance:", var(null_samples)
# streaming data generator for mean shift distributions gen_p = MeanShiftRealDataGenerator(0, dim) gen_q = MeanShiftRealDataGenerator(difference, dim) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable # Stream examples and merge them in order to compute median on joint sample features = gen_p.get_streamed_features(100) features = features.create_merged_copy(gen_q.get_streamed_features(100)) # compute all pairwise distances dist = EuclideanDistance(features, features) distances = dist.get_distance_matrix() # compute median and determine kernel width (using shogun) median_distance = Statistics.matrix_median(distances, True) sigma = median_distance**2 print "median distance for Gaussian kernel:", sigma kernel = GaussianKernel(10, sigma) # Stream examples and merge them in order to compute median on joint sample # alternative is to call a different constructor of QuadraticTimeMMD features = gen_p.get_streamed_features(m) features = features.create_merged_copy(gen_q.get_streamed_features(m)) # use biased statistic mmd = QuadraticTimeMMD(kernel, features, m)
def statistics_quadratic_time_mmd(): from shogun.Features import RealFeatures from shogun.Features import MeanShiftDataGenerator from shogun.Kernel import GaussianKernel from shogun.Statistics import QuadraticTimeMMD from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED from shogun.Distance import EuclideanDistance from shogun.Mathematics import Statistics, IntVector # note that the quadratic time mmd has to store kernel matrices # which upper bounds the sample size n = 100 dim = 2 difference = 0.5 # streaming data generator for mean shift distributions gen_p = MeanShiftDataGenerator(0, dim) gen_q = MeanShiftDataGenerator(difference, dim) # Stream examples and merge them in order to compute median on joint sample # alternative is to call a different constructor of QuadraticTimeMMD features = gen_p.get_streamed_features(n) features = features.create_merged_copy(gen_q.get_streamed_features(n)) # use data generator class to produce example data data = features.get_feature_matrix() print "dimension means of X", mean(data.T[0:n].T) print "dimension means of Y", mean(data.T[n : 2 * n + 1].T) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. 
Median is stable # Use a permutation set to temporarily merge features in merged examples subset = IntVector.randperm_vec(features.get_num_vectors()) subset = subset[0:200] features.add_subset(subset) dist = EuclideanDistance(features, features) distances = dist.get_distance_matrix() features.remove_subset() median_distance = Statistics.matrix_median(distances, True) sigma = median_distance ** 2 print "median distance for Gaussian kernel:", sigma kernel = GaussianKernel(10, sigma) mmd = QuadraticTimeMMD(kernel, features, n) # perform test: compute p-value and test if null-hypothesis is rejected for # a test level of 0.05 using different methods to approximate # null-distribution statistic = mmd.compute_statistic() alpha = 0.05 print "computing p-value using bootstrapping" mmd.set_null_approximation_method(BOOTSTRAP) # normally, at least 250 iterations should be done, but that takes long mmd.set_bootstrap_iterations(10) # bootstrapping allows usage of unbiased or biased statistic mmd.set_statistic_type(UNBIASED) p_value = mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha # only can do this if SHOGUN was compiled with LAPACK so check if "sample_null_spectrum" in dir(QuadraticTimeMMD): print "computing p-value using spectrum method" mmd.set_null_approximation_method(MMD2_SPECTRUM) # normally, at least 250 iterations should be done, but that takes long mmd.set_num_samples_sepctrum(50) mmd.set_num_eigenvalues_spectrum(n - 10) # spectrum method computes p-value for biased statistics only mmd.set_statistic_type(BIASED) p_value = mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. 
test sais p!=q:", p_value < alpha print "computing p-value using gamma method" mmd.set_null_approximation_method(MMD2_GAMMA) # gamma method computes p-value for biased statistics only mmd.set_statistic_type(BIASED) p_value = mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha # sample from null distribution (these may be plotted or whatsoever) # mean should be close to zero, variance stronly depends on data/kernel # bootstrapping, biased statistic print "sampling null distribution using bootstrapping" mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_statistic_type(BIASED) mmd.set_bootstrap_iterations(10) null_samples = mmd.bootstrap_null() print "null mean:", mean(null_samples) print "null variance:", var(null_samples) # sample from null distribution (these may be plotted or whatsoever) # mean should be close to zero, variance stronly depends on data/kernel # spectrum, biased statistic print "sampling null distribution using spectrum method" mmd.set_null_approximation_method(MMD2_SPECTRUM) mmd.set_statistic_type(BIASED) # 200 samples using 100 eigenvalues null_samples = mmd.sample_null_spectrum(50, 10) print "null mean:", mean(null_samples) print "null variance:", var(null_samples)
# use data generator class to produce example data data=DataGenerator.generate_sym_mix_gauss(m,difference,angle) # create shogun feature representation features_x=RealFeatures(array([data[0]])) features_y=RealFeatures(array([data[1]])) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable subset=Math.randperm_vec(features_x.get_num_vectors()) subset=subset[0:200] features_x.add_subset(subset) dist=EuclideanDistance(features_x, features_x) distances=dist.get_distance_matrix() features_x.remove_subset() median_distance=Statistics.matrix_median(distances, True) sigma_x=median_distance**2 features_y.add_subset(subset) dist=EuclideanDistance(features_y, features_y) distances=dist.get_distance_matrix() features_y.remove_subset() median_distance=Statistics.matrix_median(distances, True) sigma_y=median_distance**2 print "median distance for Gaussian kernel on x:", sigma_x print "median distance for Gaussian kernel on y:", sigma_y kernel_x=GaussianKernel(10,sigma_x) kernel_y=GaussianKernel(10,sigma_y)