def distance_director_euclidean_modular(fm_train_real=traindat, fm_test_real=testdat, scale=1.2):

    from shogun.Distance import EuclideanDistance
    from modshogun import Time

    feats_train = RealFeatures(fm_train_real)
    feats_train.io.set_loglevel(0)
    feats_train.parallel.set_num_threads(1)
    feats_test = RealFeatures(fm_test_real)

    distance = EuclideanDistance()
    distance.init(feats_train, feats_test)

    ddistance = DirectorEuclideanDistance()
    ddistance.init(feats_train, feats_test)

    print "dm_train"
    t = Time()
    dm_train = distance.get_distance_matrix()
    t1 = t.cur_time_diff(True)

    print "ddm_train"
    t = Time()
    ddm_train = ddistance.get_distance_matrix()
    t2 = t.cur_time_diff(True)

    print "dm_train", dm_train
    print "ddm_train", ddm_train

    return dm_train, ddm_train
示例#2
0
def distance_director_euclidean_modular(fm_train_real=traindat,
                                        fm_test_real=testdat,
                                        scale=1.2):

    from shogun.Distance import EuclideanDistance
    from modshogun import Time

    feats_train = RealFeatures(fm_train_real)
    feats_train.io.set_loglevel(0)
    feats_train.parallel.set_num_threads(1)
    feats_test = RealFeatures(fm_test_real)

    distance = EuclideanDistance()
    distance.init(feats_train, feats_test)

    ddistance = DirectorEuclideanDistance()
    ddistance.init(feats_train, feats_test)

    print "dm_train"
    t = Time()
    dm_train = distance.get_distance_matrix()
    t1 = t.cur_time_diff(True)

    print "ddm_train"
    t = Time()
    ddm_train = ddistance.get_distance_matrix()
    t2 = t.cur_time_diff(True)

    print "dm_train", dm_train
    print "ddm_train", ddm_train

    return dm_train, ddm_train
示例#3
0
def converter_multidimensionalscaling_modular(data):
    try:
        from shogun.Features import RealFeatures
        from shogun.Converter import MultidimensionalScaling
        from shogun.Distance import EuclideanDistance

        features = RealFeatures(data)

        distance_before = EuclideanDistance()
        distance_before.init(features, features)

        converter = MultidimensionalScaling()
        converter.set_target_dim(2)
        converter.set_landmark(False)
        embedding = converter.apply(features)

        distance_after = EuclideanDistance()
        distance_after.init(embedding, embedding)

        distance_matrix_after = distance_after.get_distance_matrix()
        distance_matrix_before = distance_before.get_distance_matrix()

        return numpy.linalg.norm(distance_matrix_after - distance_matrix_before
                                 ) / numpy.linalg.norm(distance_matrix_before)
    except ImportError:
        print('No Eigen3 available')
def converter_multidimensionalscaling_modular (data):
	try:
		from shogun.Features import RealFeatures
		from shogun.Converter import MultidimensionalScaling
		from shogun.Distance import EuclideanDistance
		
		features = RealFeatures(data)
			
		distance_before = EuclideanDistance()
		distance_before.init(features,features)

		converter = MultidimensionalScaling()
		converter.set_target_dim(2)
		converter.set_landmark(False)
		embedding = converter.apply(features)

		distance_after = EuclideanDistance()
		distance_after.init(embedding,embedding)

		distance_matrix_after = distance_after.get_distance_matrix()
		distance_matrix_before = distance_before.get_distance_matrix()

		return numpy.linalg.norm(distance_matrix_after-distance_matrix_before)/numpy.linalg.norm(distance_matrix_before) < 1e-6
	except ImportError:
		print('No Eigen3 available')
def distance_euclidean_modular (fm_train_real=traindat,fm_test_real=testdat):

	from shogun.Features import RealFeatures
	from shogun.Distance import EuclideanDistance

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)

	distance=EuclideanDistance(feats_train, feats_train)

	dm_train=distance.get_distance_matrix()
	distance.init(feats_train, feats_test)
	dm_test=distance.get_distance_matrix()

	return distance,dm_train,dm_test
示例#6
0
            def RunKMeansShogun(q):
                import numpy as np
                from shogun.Distance import EuclideanDistance
                from shogun.Features import RealFeatures
                from shogun import Clustering
                from shogun.Mathematics import Math_init_random

                totalTimer = Timer()

                if seed:
                    Math_init_random(seed.group(1))
                try:
                    data = np.genfromtxt(self.dataset, delimiter=',')

                    dataFeat = RealFeatures(data.T)
                    distance = EuclideanDistance(dataFeat, dataFeat)

                    # Create the K-Means object and perform K-Means clustering.
                    with totalTimer:
                        model = Clustering.KMeans(int(clusters.group(1)),
                                                  distance)
                        model.set_max_iter(maxIterations)
                        model.train()

                        labels = model.apply().get_labels()
                        centers = model.get_cluster_centers()
                except Exception as e:
                    q.put(-1)
                    return -1

                time = totalTimer.ElapsedTime()
                q.put(time)
                return time
def distance_director_euclidean_modular(fm_train_real=traindat,
                                        fm_test_real=testdat,
                                        scale=1.2):
    try:
        from shogun.Distance import DirectorDistance
    except ImportError:
        print("recompile shogun with --enable-swig-directors")
        return

    class DirectorEuclideanDistance(DirectorDistance):
        def __init__(self):
            DirectorDistance.__init__(self, True)

        def distance_function(self, idx_a, idx_b):
            seq1 = self.get_lhs().get_feature_vector(idx_a)
            seq2 = self.get_rhs().get_feature_vector(idx_b)
            return numpy.linalg.norm(seq1 - seq2)

    from shogun.Distance import EuclideanDistance
    from modshogun import Time

    feats_train = RealFeatures(fm_train_real)
    #feats_train.io.set_loglevel(MSG_DEBUG)
    feats_train.parallel.set_num_threads(1)
    feats_test = RealFeatures(fm_test_real)

    distance = EuclideanDistance()
    distance.init(feats_train, feats_test)

    ddistance = DirectorEuclideanDistance()
    ddistance.init(feats_train, feats_test)

    #print  "dm_train"
    t = Time()
    dm_train = distance.get_distance_matrix()
    #t1=t.cur_time_diff(True)

    #print  "ddm_train"
    t = Time()
    ddm_train = ddistance.get_distance_matrix()
    #t2=t.cur_time_diff(True)

    #print "dm_train", dm_train
    #print "ddm_train", ddm_train

    return dm_train, ddm_train
示例#8
0
        def RunAllKnnShogun(q):
            totalTimer = Timer()

            # Load input dataset.
            # If the dataset contains two files then the second file is the query
            # file.
            try:
                Log.Info("Loading dataset", self.verbose)
                if len(self.dataset) == 2:
                    referenceData = np.genfromtxt(self.dataset[0],
                                                  delimiter=',')
                    queryData = np.genfromtxt(self.dataset[1], delimiter=',')
                    queryFeat = RealFeatures(queryFeat.T)
                else:
                    referenceData = np.genfromtxt(self.dataset, delimiter=',')

                # Labels are the last row of the dataset.
                labels = MulticlassLabels(
                    referenceData[:, (referenceData.shape[1] - 1)])
                referenceData = referenceData[:, :-1]

                with totalTimer:
                    # Get all the parameters.
                    k = re.search("-k (\d+)", options)
                    if not k:
                        Log.Fatal(
                            "Required option: Number of furthest neighbors to find."
                        )
                        q.put(-1)
                        return -1
                    else:
                        k = int(k.group(1))
                        if (k < 1 or k > referenceData.shape[0]):
                            Log.Fatal("Invalid k: " + k.group(1) +
                                      "; must be greater than 0" +
                                      " and less or equal than " +
                                      str(referenceData.shape[0]))
                            q.put(-1)
                            return -1

                    referenceFeat = RealFeatures(referenceData.T)
                    distance = EuclideanDistance(referenceFeat, referenceFeat)

                    # Perform All K-Nearest-Neighbors.
                    model = SKNN(k, distance, labels)
                    model.train()

                    if len(self.dataset) == 2:
                        out = model.apply(queryFeat).get_labels()
                    else:
                        out = model.apply(referenceFeat).get_labels()
            except Exception as e:
                q.put(-1)
                return -1

            time = totalTimer.ElapsedTime()
            q.put(time)
            return time
def distance_director_euclidean_modular (fm_train_real=traindat,fm_test_real=testdat,scale=1.2):
	try:
		from shogun.Distance import DirectorDistance
	except ImportError:
		print "recompile shogun with --enable-swig-directors"
		return

	class DirectorEuclideanDistance(DirectorDistance):
		def __init__(self):
			DirectorDistance.__init__(self, True)
		def distance_function(self, idx_a, idx_b):
			seq1 = self.get_lhs().get_feature_vector(idx_a)
			seq2 = self.get_rhs().get_feature_vector(idx_b)
			return numpy.linalg.norm(seq1-seq2)

	from shogun.Distance import EuclideanDistance
	from modshogun import Time

	feats_train=RealFeatures(fm_train_real)
	feats_train.io.set_loglevel(0)
	feats_train.parallel.set_num_threads(1)
	feats_test=RealFeatures(fm_test_real)

	distance=EuclideanDistance()
	distance.init(feats_train, feats_test)

	ddistance=DirectorEuclideanDistance()
	ddistance.init(feats_train, feats_test)

	print  "dm_train"
	t=Time()
	dm_train=distance.get_distance_matrix()
	t1=t.cur_time_diff(True)

	print  "ddm_train"
	t=Time()
	ddm_train=ddistance.get_distance_matrix()
	t2=t.cur_time_diff(True)	

	print "dm_train", dm_train
	print "ddm_train", ddm_train

	return dm_train, ddm_train
示例#10
0
def run_clustering(data, k):
    from shogun.Clustering import KMeans
    from shogun.Distance import EuclideanDistance
    from shogun.Features import RealFeatures

    fea = RealFeatures(data)
    distance = EuclideanDistance(fea, fea)
    kmeans = KMeans(k, distance)

    #print("Running clustering...")
    kmeans.train()

    return kmeans.get_cluster_centers()
示例#11
0
def assign_labels(data, centroids, ncenters):
    from shogun.Distance import EuclideanDistance
    from shogun.Features import RealFeatures, MulticlassLabels
    from shogun.Classifier import KNN
    from numpy import arange

    labels = MulticlassLabels(arange(0., ncenters))
    fea = RealFeatures(data)
    fea_centroids = RealFeatures(centroids)
    distance = EuclideanDistance(fea_centroids, fea_centroids)
    knn = KNN(1, distance, labels)
    knn.train()
    return knn.apply(fea)
def distance_normsquared_modular(fm_train_real=traindat, fm_test_real=testdat):

    from shogun.Features import RealFeatures
    from shogun.Distance import EuclideanDistance

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    distance = EuclideanDistance(feats_train, feats_train)
    distance.set_disable_sqrt(True)

    dm_train = distance.get_distance_matrix()
    distance.init(feats_train, feats_test)
    dm_test = distance.get_distance_matrix()

    return distance, dm_train, dm_test
def kernel_spherical_modular (fm_train_real=traindat,fm_test_real=testdat, sigma=1.0):
	from shogun.Features import RealFeatures
	from shogun.Kernel import MultiquadricKernel
	from shogun.Distance import EuclideanDistance

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)
	
	distance=EuclideanDistance(feats_train, feats_train)

	kernel=MultiquadricKernel(feats_train, feats_train, sigma, distance)
	km_train=kernel.get_kernel_matrix()

	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
示例#14
0
def kernel_rationalquadratic_modular (fm_train_real=traindat,fm_test_real=testdat, shift_coef=1.0):
	from shogun.Features import RealFeatures
	from shogun.Kernel import RationalQuadraticKernel
	from shogun.Distance import EuclideanDistance

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)
	
	distance=EuclideanDistance(feats_train, feats_train)

	kernel=RationalQuadraticKernel(feats_train, feats_train, shift_coef, distance)
	km_train=kernel.get_kernel_matrix()

	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
def clustering_hierarchical_modular(fm_train=traindat, merges=3):

    from shogun.Distance import EuclideanDistance
    from shogun.Features import RealFeatures
    from shogun.Clustering import Hierarchical

    feats_train = RealFeatures(fm_train)
    distance = EuclideanDistance(feats_train, feats_train)

    hierarchical = Hierarchical(merges, distance)
    hierarchical.train()

    out_distance = hierarchical.get_merge_distances()
    out_cluster = hierarchical.get_cluster_pairs()

    return hierarchical, out_distance, out_cluster
示例#16
0
def kernel_wave_modular (fm_train_real=traindat,fm_test_real=testdat, theta=1.0):
	from shogun.Features import RealFeatures
	from shogun.Kernel import WaveKernel
	from shogun.Distance import EuclideanDistance

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)
	
	distance=EuclideanDistance(feats_train, feats_train)

	kernel=WaveKernel(feats_train, feats_train, theta, distance)
	km_train=kernel.get_kernel_matrix()

	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
示例#17
0
def clustering_kmeans_modular(fm_train=traindat, k=3):

    from shogun.Distance import EuclideanDistance
    from shogun.Features import RealFeatures
    from shogun.Clustering import KMeans
    from shogun.Mathematics import Math_init_random
    Math_init_random(17)

    feats_train = RealFeatures(fm_train)
    distance = EuclideanDistance(feats_train, feats_train)

    kmeans = KMeans(k, distance)
    kmeans.train()

    out_centers = kmeans.get_cluster_centers()
    kmeans.get_radiuses()

    return out_centers, kmeans
示例#18
0
def kernel_distance_modular(fm_train_real=traindat,
                            fm_test_real=testdat,
                            width=1.7):
    from shogun.Kernel import DistanceKernel
    from shogun.Features import RealFeatures
    from shogun.Distance import EuclideanDistance

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    distance = EuclideanDistance()

    kernel = DistanceKernel(feats_train, feats_test, width, distance)

    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
示例#19
0
def kernel_power_modular(fm_train_real=traindat,
                         fm_test_real=testdat,
                         degree=2.0):
    from shogun.Features import RealFeatures
    from shogun.Kernel import PowerKernel
    from shogun.Distance import EuclideanDistance

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    distance = EuclideanDistance(feats_train, feats_train)

    kernel = PowerKernel(feats_train, feats_train, degree, distance)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
示例#20
0
def classifier_knn_modular(fm_train_real=traindat,
                           fm_test_real=testdat,
                           label_train_multiclass=label_traindat,
                           k=3):
    from shogun.Features import RealFeatures, MulticlassLabels
    from shogun.Classifier import KNN
    from shogun.Distance import EuclideanDistance

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    distance = EuclideanDistance(feats_train, feats_train)

    labels = MulticlassLabels(label_train_multiclass)

    knn = KNN(k, distance, labels)
    knn_train = knn.train()
    output = knn.apply(feats_test).get_labels()
    multiple_k = knn.classify_for_multiple_k()
    return knn, knn_train, output, multiple_k
def kernel_exponential_modular(fm_train_real=traindat,
                               fm_test_real=testdat,
                               tau_coef=1.0):
    from shogun.Features import RealFeatures
    from shogun.Kernel import ExponentialKernel
    from shogun.Distance import EuclideanDistance

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    distance = EuclideanDistance(feats_train, feats_train)

    kernel = ExponentialKernel(feats_train, feats_train, tau_coef, distance,
                               10)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
示例#22
0
def statistics_linear_time_mmd (n,dim,difference):
	from shogun.Features import RealFeatures
	from shogun.Features import MeanShiftDataGenerator
	from shogun.Kernel import GaussianKernel
	from shogun.Statistics import LinearTimeMMD
	from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
	from shogun.Distance import EuclideanDistance
	from shogun.Mathematics import Statistics, Math

	# init seed for reproducability
	Math.init_random(1)

	# note that the linear time statistic is designed for much larger datasets
	# so increase to get reasonable results

	# streaming data generator for mean shift distributions
	gen_p=MeanShiftDataGenerator(0, dim)
	gen_q=MeanShiftDataGenerator(difference, dim)

	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, shoguns kernel width is different to usual parametrization
	# Therefore 0.5*2*median_distance^2
	# Use a subset of data for that, only 200 elements. Median is stable
	
	# Stream examples and merge them in order to compute median on joint sample
	features=gen_p.get_streamed_features(100)
	features=features.create_merged_copy(gen_q.get_streamed_features(100))
	
	# compute all pairwise distances
	dist=EuclideanDistance(features, features)
	distances=dist.get_distance_matrix()
	
	# compute median and determine kernel width (using shogun)
	median_distance=Statistics.matrix_median(distances, True)
	sigma=median_distance**2
	#print "median distance for Gaussian kernel:", sigma
	kernel=GaussianKernel(10,sigma)

	# mmd instance using streaming features, blocksize of 10000
	mmd=LinearTimeMMD(kernel, gen_p, gen_q, n, 10000)

	# perform test: compute p-value and test if null-hypothesis is rejected for
	# a test level of 0.05
	statistic=mmd.compute_statistic()
	#print "test statistic:", statistic
	
	# do the same thing using two different way to approximate null-dstribution
	# bootstrapping and gaussian approximation (ony for really large samples)
	alpha=0.05

	#print "computing p-value using bootstrapping"
	mmd.set_null_approximation_method(BOOTSTRAP)
	mmd.set_bootstrap_iterations(50) # normally, far more iterations are needed
	p_value_boot=mmd.compute_p_value(statistic)
	#print "p_value_boot:", p_value_boot
	#print "p_value_boot <", alpha, ", i.e. test sais p!=q:", p_value_boot<alpha
	
	#print "computing p-value using gaussian approximation"
	mmd.set_null_approximation_method(MMD1_GAUSSIAN)
	p_value_gaussian=mmd.compute_p_value(statistic)
	#print "p_value_gaussian:", p_value_gaussian
	#print "p_value_gaussian <", alpha, ", i.e. test sais p!=q:", p_value_gaussian<alpha
	
	# sample from null distribution (these may be plotted or whatsoever)
	# mean should be close to zero, variance stronly depends on data/kernel
	mmd.set_null_approximation_method(BOOTSTRAP)
	mmd.set_bootstrap_iterations(10) # normally, far more iterations are needed
	null_samples=mmd.bootstrap_null()
	#print "null mean:", mean(null_samples)
	#print "null variance:", var(null_samples)
	
	# compute type I and type II errors for Gaussian approximation
	# number of trials should be larger to compute tight confidence bounds
	mmd.set_null_approximation_method(MMD1_GAUSSIAN)
	num_trials=5;
	alpha=0.05 # test power
	typeIerrors=[0 for x in range(num_trials)]
	typeIIerrors=[0 for x in range(num_trials)]
	for i in range(num_trials):
		# this effectively means that p=q - rejecting is tpye I error
		mmd.set_simulate_h0(True)
		typeIerrors[i]=mmd.perform_test()>alpha
		mmd.set_simulate_h0(False)
		
		typeIIerrors[i]=mmd.perform_test()>alpha
	
	#print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)
	
	return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors
示例#23
0
def statistics_hsic (n, difference, angle):
	from shogun.Features import RealFeatures
	from shogun.Features import DataGenerator
	from shogun.Kernel import GaussianKernel
	from shogun.Statistics import HSIC
	from shogun.Statistics import BOOTSTRAP, HSIC_GAMMA
	from shogun.Distance import EuclideanDistance
	from shogun.Mathematics import Math, Statistics, IntVector
	
	# init seed for reproducability
	Math.init_random(1)

	# note that the HSIC has to store kernel matrices
	# which upper bounds the sample size

	# use data generator class to produce example data
	data=DataGenerator.generate_sym_mix_gauss(n,difference,angle)
	#plot(data[0], data[1], 'x');show()

	# create shogun feature representation
	features_x=RealFeatures(array([data[0]]))
	features_y=RealFeatures(array([data[1]]))

	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, shoguns kernel width is different to usual parametrization
	# Therefore 0.5*2*median_distance^2
	# Use a subset of data for that, only 200 elements. Median is stable
	subset=IntVector.randperm_vec(features_x.get_num_vectors())
	subset=subset[0:200]
	features_x.add_subset(subset)
	dist=EuclideanDistance(features_x, features_x)
	distances=dist.get_distance_matrix()
	features_x.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma_x=median_distance**2
	features_y.add_subset(subset)
	dist=EuclideanDistance(features_y, features_y)
	distances=dist.get_distance_matrix()
	features_y.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma_y=median_distance**2
	#print "median distance for Gaussian kernel on x:", sigma_x
	#print "median distance for Gaussian kernel on y:", sigma_y
	kernel_x=GaussianKernel(10,sigma_x)
	kernel_y=GaussianKernel(10,sigma_y)

	hsic=HSIC(kernel_x,kernel_y,features_x,features_y)

	# perform test: compute p-value and test if null-hypothesis is rejected for
	# a test level of 0.05 using different methods to approximate
	# null-distribution
	statistic=hsic.compute_statistic()
	#print "HSIC:", statistic
	alpha=0.05

	#print "computing p-value using bootstrapping"
	hsic.set_null_approximation_method(BOOTSTRAP)
	# normally, at least 250 iterations should be done, but that takes long
	hsic.set_bootstrap_iterations(100)
	# bootstrapping allows usage of unbiased or biased statistic
	p_value_boot=hsic.compute_p_value(statistic)
	thresh_boot=hsic.compute_threshold(alpha)
	#print "p_value:", p_value_boot
	#print "threshold for 0.05 alpha:", thresh_boot
	#print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value_boot<alpha

	#print "computing p-value using gamma method"
	hsic.set_null_approximation_method(HSIC_GAMMA)
	p_value_gamma=hsic.compute_p_value(statistic)
	thresh_gamma=hsic.compute_threshold(alpha)
	#print "p_value:", p_value_gamma
	#print "threshold for 0.05 alpha:", thresh_gamma
	#print "p_value <", alpha, ", i.e. test sais p and q are dependend::", p_value_gamma<alpha

	# sample from null distribution (these may be plotted or whatsoever)
	# mean should be close to zero, variance stronly depends on data/kernel
	# bootstrapping, biased statistic
	#print "sampling null distribution using bootstrapping"
	hsic.set_null_approximation_method(BOOTSTRAP)
	hsic.set_bootstrap_iterations(100)
	null_samples=hsic.bootstrap_null()
	#print "null mean:", mean(null_samples)
	#print "null variance:", var(null_samples)
	#hist(null_samples, 100); show()
	
	return p_value_boot, thresh_boot, p_value_gamma, thresh_gamma, statistic, null_samples
def statistics_quadratic_time_mmd ():
	from shogun.Features import RealFeatures
	from shogun.Features import MeanShiftRealDataGenerator
	from shogun.Kernel import GaussianKernel
	from shogun.Statistics import QuadraticTimeMMD
	from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED
	from shogun.Distance import EuclideanDistance
	from shogun.Mathematics import Statistics, IntVector

	# note that the quadratic time mmd has to store kernel matrices
	# which upper bounds the sample size
	n=500
	dim=2
	difference=0.5

	# streaming data generator for mean shift distributions
	gen_p=MeanShiftRealDataGenerator(0, dim)
	gen_q=MeanShiftRealDataGenerator(difference, dim)
	
	# Stream examples and merge them in order to compute median on joint sample
	# alternative is to call a different constructor of QuadraticTimeMMD
	features=gen_p.get_streamed_features(n)
	features=features.create_merged_copy(gen_q.get_streamed_features(n))
	
	# use data generator class to produce example data
	data=features.get_feature_matrix()
	
	print "dimension means of X", mean(data.T[0:n].T)
	print "dimension means of Y", mean(data.T[n:2*n+1].T)

	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, shoguns kernel width is different to usual parametrization
	# Therefore 0.5*2*median_distance^2
	# Use a subset of data for that, only 200 elements. Median is stable
	# Use a permutation set to temporarily merge features in merged examples
	subset=IntVector.randperm_vec(features.get_num_vectors())
	subset=subset[0:200]
	features.add_subset(subset)
	dist=EuclideanDistance(features, features)
	distances=dist.get_distance_matrix()
	features.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma=median_distance**2
	print "median distance for Gaussian kernel:", sigma
	kernel=GaussianKernel(10,sigma)

	mmd=QuadraticTimeMMD(kernel,features, n)

	# perform test: compute p-value and test if null-hypothesis is rejected for
	# a test level of 0.05 using different methods to approximate
	# null-distribution
	statistic=mmd.compute_statistic()
	alpha=0.05
	
	print "computing p-value using bootstrapping"
	mmd.set_null_approximation_method(BOOTSTRAP)
	# normally, at least 250 iterations should be done, but that takes long
	mmd.set_bootstrap_iterations(10)
	# bootstrapping allows usage of unbiased or biased statistic
	mmd.set_statistic_type(UNBIASED)
	p_value=mmd.compute_p_value(statistic)
	print "p_value:", p_value
	print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha
	
	# only can do this if SHOGUN was compiled with LAPACK so check
	if "sample_null_spectrum" in dir(QuadraticTimeMMD):
		print "computing p-value using spectrum method"
		mmd.set_null_approximation_method(MMD2_SPECTRUM)
		# normally, at least 250 iterations should be done, but that takes long
		mmd.set_num_samples_sepctrum(50)
		mmd.set_num_eigenvalues_spectrum(n-10)
		# spectrum method computes p-value for biased statistics only
		mmd.set_statistic_type(BIASED)
		p_value=mmd.compute_p_value(statistic)
		print "p_value:", p_value
		print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha
	
	print "computing p-value using gamma method"
	mmd.set_null_approximation_method(MMD2_GAMMA)
	# gamma method computes p-value for biased statistics only
	mmd.set_statistic_type(BIASED)
	p_value=mmd.compute_p_value(statistic)
	print "p_value:", p_value
	print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha
	
	# sample from null distribution (these may be plotted or whatsoever)
	# mean should be close to zero, variance stronly depends on data/kernel
	# bootstrapping, biased statistic
	print "sampling null distribution using bootstrapping"
	mmd.set_null_approximation_method(BOOTSTRAP)
	mmd.set_statistic_type(BIASED)
	mmd.set_bootstrap_iterations(10)
	null_samples=mmd.bootstrap_null()
	print "null mean:", mean(null_samples)
	print "null variance:", var(null_samples)
	
	# sample from null distribution (these may be plotted or whatsoever)
	# mean should be close to zero, variance stronly depends on data/kernel
	# spectrum, biased statistic
	print "sampling null distribution using spectrum method"
	mmd.set_null_approximation_method(MMD2_SPECTRUM)
	mmd.set_statistic_type(BIASED)
	# 200 samples using 100 eigenvalues
	null_samples=mmd.sample_null_spectrum(50,10)
	print "null mean:", mean(null_samples)
	print "null variance:", var(null_samples)
示例#25
0
def statistics_linear_time_mmd ():
	from shogun.Features import RealFeatures
	from shogun.Features import DataGenerator
	from shogun.Kernel import GaussianKernel
	from shogun.Statistics import LinearTimeMMD
	from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
	from shogun.Distance import EuclideanDistance
	from shogun.Mathematics import Statistics, Math

	# note that the linear time statistic is designed for much larger datasets
	n=10000
	dim=2
	difference=0.5

	# use data generator class to produce example data
	# in pratice, this generate data function could be replaced by a method
	# that obtains data from a stream
	data=DataGenerator.generate_mean_data(n,dim,difference)
	
	print "dimension means of X", mean(data.T[0:n].T)
	print "dimension means of Y", mean(data.T[n:2*n+1].T)

	# create shogun feature representation
	features=RealFeatures(data)

	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, shoguns kernel width is different to usual parametrization
	# Therefore 0.5*2*median_distance^2
	# Use a subset of data for that, only 200 elements. Median is stable
	# Using all distances here would blow up memory
	subset=Math.randperm_vec(features.get_num_vectors())
	subset=subset[0:200]
	features.add_subset(subset)
	dist=EuclideanDistance(features, features)
	distances=dist.get_distance_matrix()
	features.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma=median_distance**2
	print "median distance for Gaussian kernel:", sigma
	kernel=GaussianKernel(10,sigma)

	mmd=LinearTimeMMD(kernel,features, n)

	# perform test: compute p-value and test if null-hypothesis is rejected for
	# a test level of 0.05
	statistic=mmd.compute_statistic()
	print "test statistic:", statistic
	
	# do the same thing using two different way to approximate null-dstribution
	# bootstrapping and gaussian approximation (ony for really large samples)
	alpha=0.05

	print "computing p-value using bootstrapping"
	mmd.set_null_approximation_method(BOOTSTRAP)
	mmd.set_bootstrap_iterations(50) # normally, far more iterations are needed
	p_value=mmd.compute_p_value(statistic)
	print "p_value:", p_value
	print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha
	
	print "computing p-value using gaussian approximation"
	mmd.set_null_approximation_method(MMD1_GAUSSIAN)
	p_value=mmd.compute_p_value(statistic)
	print "p_value:", p_value
	print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha
	
	# sample from null distribution (these may be plotted or whatsoever)
	# mean should be close to zero, variance stronly depends on data/kernel
	mmd.set_null_approximation_method(BOOTSTRAP)
	mmd.set_bootstrap_iterations(10) # normally, far more iterations are needed
	null_samples=mmd.bootstrap_null()
	print "null mean:", mean(null_samples)
	print "null variance:", var(null_samples)
示例#26
0
def statistics_linear_time_mmd():
    from shogun.Features import RealFeatures
    from shogun.Features import MeanShiftRealDataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import LinearTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math

    # note that the linear time statistic is designed for much larger datasets
    n = 10000
    dim = 2
    difference = 0.5

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftRealDataGenerator(0, dim)
    gen_q = MeanShiftRealDataGenerator(difference, dim)

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shoguns kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable

    # Stream examples and merge them in order to compute median on joint sample
    features = gen_p.get_streamed_features(100)
    features = features.create_merged_copy(gen_q.get_streamed_features(100))

    # compute all pairwise distances
    dist = EuclideanDistance(features, features)
    distances = dist.get_distance_matrix()

    # compute median and determine kernel width (using shogun)
    median_distance = Statistics.matrix_median(distances, True)
    sigma = median_distance**2
    print "median distance for Gaussian kernel:", sigma
    kernel = GaussianKernel(10, sigma)

    # mmd instance using streaming features, blocksize of 10000
    mmd = LinearTimeMMD(kernel, gen_p, gen_q, n, 10000)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05
    statistic = mmd.compute_statistic()
    print "test statistic:", statistic

    # do the same thing using two different way to approximate null-dstribution
    # bootstrapping and gaussian approximation (ony for really large samples)
    alpha = 0.05

    print "computing p-value using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(
        50)  # normally, far more iterations are needed
    p_value = mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha

    print "computing p-value using gaussian approximation"
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    p_value = mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance stronly depends on data/kernel
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(
        10)  # normally, far more iterations are needed
    null_samples = mmd.bootstrap_null()
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)
示例#27
0
def hsic_graphical():
    # parameters, change to get different results
    m = 250
    difference = 3

    # setting the angle lower makes a harder test
    angle = pi / 30

    # number of samples taken from null and alternative distribution
    num_null_samples = 500

    # use data generator class to produce example data
    data = DataGenerator.generate_sym_mix_gauss(m, difference, angle)

    # create shogun feature representation
    features_x = RealFeatures(array([data[0]]))
    features_y = RealFeatures(array([data[1]]))

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shoguns kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    subset = int32(array([x for x in range(features_x.get_num_vectors())
                          ]))  # numpy
    subset = random.permutation(subset)  # numpy permutation
    subset = subset[0:200]
    features_x.add_subset(subset)
    dist = EuclideanDistance(features_x, features_x)
    distances = dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma_x = median_distance**2
    features_y.add_subset(subset)
    dist = EuclideanDistance(features_y, features_y)
    distances = dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma_y = median_distance**2
    print "median distance for Gaussian kernel on x:", sigma_x
    print "median distance for Gaussian kernel on y:", sigma_y
    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    # create hsic instance. Note that this is a convienience constructor which copies
    # feature data. features_x and features_y are not these used in hsic.
    # This is only for user-friendlyness. Usually, its ok to do this.
    # Below, the alternative distribution is sampled, which means
    # that new feature objects have to be created in each iteration (slow)
    # However, normally, the alternative distribution is not sampled
    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # sample alternative distribution
    alt_samples = zeros(num_null_samples)
    for i in range(len(alt_samples)):
        data = DataGenerator.generate_sym_mix_gauss(m, difference, angle)
        features_x.set_feature_matrix(array([data[0]]))
        features_y.set_feature_matrix(array([data[1]]))

        # re-create hsic instance everytime since feature objects are copied due to
        # useage of convienience constructor
        hsic = HSIC(kernel_x, kernel_y, features_x, features_y)
        alt_samples[i] = hsic.compute_statistic()

    # sample from null distribution
    # bootstrapping, biased statistic
    hsic.set_null_approximation_method(BOOTSTRAP)
    hsic.set_bootstrap_iterations(num_null_samples)
    null_samples_boot = hsic.bootstrap_null()

    # fit gamma distribution, biased statistic
    hsic.set_null_approximation_method(HSIC_GAMMA)
    gamma_params = hsic.fit_null_gamma()
    # sample gamma with parameters
    null_samples_gamma = array([
        gamma(gamma_params[0], gamma_params[1])
        for _ in range(num_null_samples)
    ])

    # plot
    figure()

    # plot data x and y
    subplot(2, 2, 1)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=4))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=4))  # reduce number of x-ticks
    grid(True)
    plot(data[0], data[1], 'o')
    title('Data, rotation=$\pi$/' + str(1 / angle * pi) + '\nm=' + str(m))
    xlabel('$x$')
    ylabel('$y$')

    # compute threshold for test level
    alpha = 0.05
    null_samples_boot.sort()
    null_samples_gamma.sort()
    thresh_boot = null_samples_boot[floor(
        len(null_samples_boot) * (1 - alpha))]
    thresh_gamma = null_samples_gamma[floor(
        len(null_samples_gamma) * (1 - alpha))]

    type_one_error_boot = sum(
        null_samples_boot < thresh_boot) / float(num_null_samples)
    type_one_error_gamma = sum(
        null_samples_gamma < thresh_boot) / float(num_null_samples)

    # plot alternative distribution with threshold
    subplot(2, 2, 2)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    grid(True)
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range = [
        min([min(null_samples_boot),
             min(null_samples_gamma)]),
        max([max(null_samples_boot),
             max(null_samples_gamma)])
    ]

    # plot null distribution with threshold
    subplot(2, 2, 3)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    grid(True)
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Bootstrapped Null Dist.\n' + 'Type I error is ' +
          str(type_one_error_boot))

    # plot null distribution gamma
    subplot(2, 2, 4)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    grid(True)
    hist(null_samples_gamma, 20, range=hist_range, normed=True)
    axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma))
    grid(True)

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
示例#28
0
def statistics_linear_time_mmd(n, dim, difference):
    from shogun.Features import RealFeatures
    from shogun.Features import MeanShiftDataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import LinearTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math

    # init seed for reproducability
    Math.init_random(1)

    # note that the linear time statistic is designed for much larger datasets
    # so increase to get reasonable results

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shoguns kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable

    # Stream examples and merge them in order to compute median on joint sample
    features = gen_p.get_streamed_features(100)
    features = features.create_merged_copy(gen_q.get_streamed_features(100))

    # compute all pairwise distances
    dist = EuclideanDistance(features, features)
    distances = dist.get_distance_matrix()

    # compute median and determine kernel width (using shogun)
    median_distance = Statistics.matrix_median(distances, True)
    sigma = median_distance**2
    #print "median distance for Gaussian kernel:", sigma
    kernel = GaussianKernel(10, sigma)

    # mmd instance using streaming features, blocksize of 10000
    mmd = LinearTimeMMD(kernel, gen_p, gen_q, n, 10000)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05
    statistic = mmd.compute_statistic()
    #print "test statistic:", statistic

    # do the same thing using two different way to approximate null-dstribution
    # bootstrapping and gaussian approximation (ony for really large samples)
    alpha = 0.05

    #print "computing p-value using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(
        50)  # normally, far more iterations are needed
    p_value_boot = mmd.compute_p_value(statistic)
    #print "p_value_boot:", p_value_boot
    #print "p_value_boot <", alpha, ", i.e. test sais p!=q:", p_value_boot<alpha

    #print "computing p-value using gaussian approximation"
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    p_value_gaussian = mmd.compute_p_value(statistic)
    #print "p_value_gaussian:", p_value_gaussian
    #print "p_value_gaussian <", alpha, ", i.e. test sais p!=q:", p_value_gaussian<alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance stronly depends on data/kernel
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(
        10)  # normally, far more iterations are needed
    null_samples = mmd.bootstrap_null()
    #print "null mean:", mean(null_samples)
    #print "null variance:", var(null_samples)

    # compute type I and type II errors for Gaussian approximation
    # number of trials should be larger to compute tight confidence bounds
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    num_trials = 5
    alpha = 0.05  # test power
    typeIerrors = [0 for x in range(num_trials)]
    typeIIerrors = [0 for x in range(num_trials)]
    for i in range(num_trials):
        # this effectively means that p=q - rejecting is tpye I error
        mmd.set_simulate_h0(True)
        typeIerrors[i] = mmd.perform_test() > alpha
        mmd.set_simulate_h0(False)

        typeIIerrors[i] = mmd.perform_test() > alpha

    #print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)

    return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors
示例#29
0
import numpy as np
import os
from shogun.Features import RealFeatures, BinaryLabels
from shogun.Classifier import KMeans
from shogun.Distance import EuclideanDistance

# Load the data.
f = open(os.path.dirname(__file__) + '../data/two_cluster.data')
data = np.fromfile(f, dtype=np.float64, sep=' ')
data = data.reshape(-1, 2)
f.close()

# Perform kmean with 2 clusters.
feat = RealFeatures(data.T)
distance = EuclideanDistance(feat, feat)
kmeans = KMeans(2, distance)
kmeans.train()

# Show cluster association.
print kmeans.apply().get_labels().T

# Show cluster centers.
print kmeans.get_cluster_centers().T
示例#30
0
def hsic_graphical():
	# parameters, change to get different results
	m=250
	difference=3
	
	# setting the angle lower makes a harder test
	angle=pi/30
	
	# number of samples taken from null and alternative distribution
	num_null_samples=500
	
	# use data generator class to produce example data
	data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)
	
	# create shogun feature representation
	features_x=RealFeatures(array([data[0]]))
	features_y=RealFeatures(array([data[1]]))
	
	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, shoguns kernel width is different to usual parametrization
	# Therefore 0.5*2*median_distance^2
	# Use a subset of data for that, only 200 elements. Median is stable
	subset=int32(array([x for x in range(features_x.get_num_vectors())])) # numpy
	subset=random.permutation(subset) # numpy permutation
	subset=subset[0:200]
	features_x.add_subset(subset)
	dist=EuclideanDistance(features_x, features_x)
	distances=dist.get_distance_matrix()
	features_x.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma_x=median_distance**2
	features_y.add_subset(subset)
	dist=EuclideanDistance(features_y, features_y)
	distances=dist.get_distance_matrix()
	features_y.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma_y=median_distance**2
	print "median distance for Gaussian kernel on x:", sigma_x
	print "median distance for Gaussian kernel on y:", sigma_y
	kernel_x=GaussianKernel(10,sigma_x)
	kernel_y=GaussianKernel(10,sigma_y)
	
	# create hsic instance. Note that this is a convienience constructor which copies
	# feature data. features_x and features_y are not these used in hsic.
	# This is only for user-friendlyness. Usually, its ok to do this.
	# Below, the alternative distribution is sampled, which means
	# that new feature objects have to be created in each iteration (slow)
	# However, normally, the alternative distribution is not sampled
	hsic=HSIC(kernel_x,kernel_y,features_x,features_y)
	
	# sample alternative distribution
	alt_samples=zeros(num_null_samples)
	for i in range(len(alt_samples)):
		data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)
		features_x.set_feature_matrix(array([data[0]]))
		features_y.set_feature_matrix(array([data[1]]))
		
		# re-create hsic instance everytime since feature objects are copied due to
		# useage of convienience constructor
		hsic=HSIC(kernel_x,kernel_y,features_x,features_y)
		alt_samples[i]=hsic.compute_statistic()
	
	# sample from null distribution
	# bootstrapping, biased statistic
	hsic.set_null_approximation_method(BOOTSTRAP)
	hsic.set_bootstrap_iterations(num_null_samples)
	null_samples_boot=hsic.bootstrap_null()
	
	# fit gamma distribution, biased statistic
	hsic.set_null_approximation_method(HSIC_GAMMA)
	gamma_params=hsic.fit_null_gamma()
	# sample gamma with parameters
	null_samples_gamma=array([gamma(gamma_params[0], gamma_params[1]) for _ in range(num_null_samples)])
	
	# plot
	figure()
	
	# plot data x and y
	subplot(2,2,1)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks
	grid(True)
	plot(data[0], data[1], 'o')
	title('Data, rotation=$\pi$/'+str(1/angle*pi)+'\nm='+str(m))
	xlabel('$x$')
	ylabel('$y$')
	
	# compute threshold for test level
	alpha=0.05
	null_samples_boot.sort()
	null_samples_gamma.sort()
	thresh_boot=null_samples_boot[floor(len(null_samples_boot)*(1-alpha))];
	thresh_gamma=null_samples_gamma[floor(len(null_samples_gamma)*(1-alpha))];
	
	type_one_error_boot=sum(null_samples_boot<thresh_boot)/float(num_null_samples)
	type_one_error_gamma=sum(null_samples_gamma<thresh_boot)/float(num_null_samples)
	
	# plot alternative distribution with threshold
	subplot(2,2,2)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	grid(True)
	hist(alt_samples, 20, normed=True);
	axvline(thresh_boot, 0, 1, linewidth=2, color='red')
	type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples)
	title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))
	
	# compute range for all null distribution histograms
	hist_range=[min([min(null_samples_boot), min(null_samples_gamma)]), max([max(null_samples_boot), max(null_samples_gamma)])]
	
	# plot null distribution with threshold
	subplot(2,2,3)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	grid(True)
	hist(null_samples_boot, 20, range=hist_range, normed=True);
	axvline(thresh_boot, 0, 1, linewidth=2, color='red')
	title('Bootstrapped Null Dist.\n' + 'Type I error is '  + str(type_one_error_boot))
	
	# plot null distribution gamma
	subplot(2,2,4)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	grid(True)
	hist(null_samples_gamma, 20, range=hist_range, normed=True);
	axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
	title('Null Dist. Gamma\nType I error is '  + str(type_one_error_gamma))
	grid(True)
	
	# pull plots a bit apart
	subplots_adjust(hspace=0.5)
	subplots_adjust(wspace=0.5)
def statistics_linear_time_mmd ():
	from shogun.Features import RealFeatures
	from shogun.Features import MeanShiftRealDataGenerator
	from shogun.Kernel import GaussianKernel
	from shogun.Statistics import LinearTimeMMD
	from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
	from shogun.Distance import EuclideanDistance
	from shogun.Mathematics import Statistics, Math

	# note that the linear time statistic is designed for much larger datasets
	n=10000
	dim=2
	difference=0.5

	# streaming data generator for mean shift distributions
	gen_p=MeanShiftRealDataGenerator(0, dim)
	gen_q=MeanShiftRealDataGenerator(difference, dim)

	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, shoguns kernel width is different to usual parametrization
	# Therefore 0.5*2*median_distance^2
	# Use a subset of data for that, only 200 elements. Median is stable
	
	# Stream examples and merge them in order to compute median on joint sample
	features=gen_p.get_streamed_features(100)
	features=features.create_merged_copy(gen_q.get_streamed_features(100))
	
	# compute all pairwise distances
	dist=EuclideanDistance(features, features)
	distances=dist.get_distance_matrix()
	
	# compute median and determine kernel width (using shogun)
	median_distance=Statistics.matrix_median(distances, True)
	sigma=median_distance**2
	print "median distance for Gaussian kernel:", sigma
	kernel=GaussianKernel(10,sigma)

	# mmd instance using streaming features, blocksize of 10000
	mmd=LinearTimeMMD(kernel, gen_p, gen_q, n, 10000)

	# perform test: compute p-value and test if null-hypothesis is rejected for
	# a test level of 0.05
	statistic=mmd.compute_statistic()
	print "test statistic:", statistic
	
	# do the same thing using two different way to approximate null-dstribution
	# bootstrapping and gaussian approximation (ony for really large samples)
	alpha=0.05

	print "computing p-value using bootstrapping"
	mmd.set_null_approximation_method(BOOTSTRAP)
	mmd.set_bootstrap_iterations(50) # normally, far more iterations are needed
	p_value=mmd.compute_p_value(statistic)
	print "p_value:", p_value
	print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha
	
	print "computing p-value using gaussian approximation"
	mmd.set_null_approximation_method(MMD1_GAUSSIAN)
	p_value=mmd.compute_p_value(statistic)
	print "p_value:", p_value
	print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha
	
	# sample from null distribution (these may be plotted or whatsoever)
	# mean should be close to zero, variance stronly depends on data/kernel
	mmd.set_null_approximation_method(BOOTSTRAP)
	mmd.set_bootstrap_iterations(10) # normally, far more iterations are needed
	null_samples=mmd.bootstrap_null()
	print "null mean:", mean(null_samples)
	print "null variance:", var(null_samples)
# use data generator class to produce example data
data=DataGenerator.generate_mean_data(m,dim,difference)

# create shogun feature representation
features=RealFeatures(data)

# compute median data distance in order to use for Gaussian kernel width
# 0.5*median_distance normally (factor two in Gaussian kernel)
# However, shoguns kernel width is different to usual parametrization
# Therefore 0.5*2*median_distance^2
# Use a subset of data for that, only 200 elements. Median is stable
# Using all distances here would blow up memory
subset=Math.randperm_vec(features.get_num_vectors())
subset=subset[0:200]
features.add_subset(subset)
dist=EuclideanDistance(features, features)
distances=dist.get_distance_matrix()
features.remove_subset()
median_distance=Statistics.matrix_median(distances, True)
sigma=median_distance**2
print "median distance for Gaussian kernel:", sigma
kernel=GaussianKernel(10,sigma)

# use biased statistic
mmd=LinearTimeMMD(kernel,features, m)

# sample alternative distribution
alt_samples=zeros(num_null_samples)
for i in range(len(alt_samples)):
	data=DataGenerator.generate_mean_data(m,dim,difference)
	features.set_feature_matrix(data)
示例#33
0
def statistics_hsic ():
	from shogun.Features import RealFeatures
	from shogun.Features import DataGenerator
	from shogun.Kernel import GaussianKernel
	from shogun.Statistics import HSIC
	from shogun.Statistics import BOOTSTRAP, HSIC_GAMMA
	from shogun.Distance import EuclideanDistance
	from shogun.Mathematics import Statistics, IntVector

	# note that the HSIC has to store kernel matrices
	# which upper bounds the sample size
	n=250
	difference=3
	angle=pi/3

	# use data generator class to produce example data
	data=DataGenerator.generate_sym_mix_gauss(n,difference,angle)
	#plot(data[0], data[1], 'x');show()

	# create shogun feature representation
	features_x=RealFeatures(array([data[0]]))
	features_y=RealFeatures(array([data[1]]))

	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, shoguns kernel width is different to usual parametrization
	# Therefore 0.5*2*median_distance^2
	# Use a subset of data for that, only 200 elements. Median is stable
	subset=IntVector.randperm_vec(features_x.get_num_vectors())
	subset=subset[0:200]
	features_x.add_subset(subset)
	dist=EuclideanDistance(features_x, features_x)
	distances=dist.get_distance_matrix()
	features_x.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma_x=median_distance**2
	features_y.add_subset(subset)
	dist=EuclideanDistance(features_y, features_y)
	distances=dist.get_distance_matrix()
	features_y.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma_y=median_distance**2
	print "median distance for Gaussian kernel on x:", sigma_x
	print "median distance for Gaussian kernel on y:", sigma_y
	kernel_x=GaussianKernel(10,sigma_x)
	kernel_y=GaussianKernel(10,sigma_y)

	hsic=HSIC(kernel_x,kernel_y,features_x,features_y)

	# perform test: compute p-value and test if null-hypothesis is rejected for
	# a test level of 0.05 using different methods to approximate
	# null-distribution
	statistic=hsic.compute_statistic()
	print "HSIC:", statistic
	alpha=0.05

	print "computing p-value using bootstrapping"
	hsic.set_null_approximation_method(BOOTSTRAP)
	# normally, at least 250 iterations should be done, but that takes long
	hsic.set_bootstrap_iterations(100)
	# bootstrapping allows usage of unbiased or biased statistic
	p_value=hsic.compute_p_value(statistic)
	thresh=hsic.compute_threshold(alpha)
	print "p_value:", p_value
	print "threshold for 0.05 alpha:", thresh
	print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value<alpha

	print "computing p-value using gamma method"
	hsic.set_null_approximation_method(HSIC_GAMMA)
	p_value=hsic.compute_p_value(statistic)
	thresh=hsic.compute_threshold(alpha)
	print "p_value:", p_value
	print "threshold for 0.05 alpha:", thresh
	print "p_value <", alpha, ", i.e. test sais p and q are dependend::", p_value<alpha

	# sample from null distribution (these may be plotted or whatsoever)
	# mean should be close to zero, variance stronly depends on data/kernel
	# bootstrapping, biased statistic
	print "sampling null distribution using bootstrapping"
	hsic.set_null_approximation_method(BOOTSTRAP)
	hsic.set_bootstrap_iterations(100)
	null_samples=hsic.bootstrap_null()
	print "null mean:", mean(null_samples)
	print "null variance:", var(null_samples)
示例#34
0
# streaming data generator for mean shift distributions
gen_p = MeanShiftRealDataGenerator(0, dim)
gen_q = MeanShiftRealDataGenerator(difference, dim)

# compute median data distance in order to use for Gaussian kernel width
# 0.5*median_distance normally (factor two in Gaussian kernel)
# However, shoguns kernel width is different to usual parametrization
# Therefore 0.5*2*median_distance^2
# Use a subset of data for that, only 200 elements. Median is stable

# Stream examples and merge them in order to compute median on joint sample
features = gen_p.get_streamed_features(100)
features = features.create_merged_copy(gen_q.get_streamed_features(100))

# compute all pairwise distances
dist = EuclideanDistance(features, features)
distances = dist.get_distance_matrix()

# compute median and determine kernel width (using shogun)
median_distance = Statistics.matrix_median(distances, True)
sigma = median_distance**2
print "median distance for Gaussian kernel:", sigma
kernel = GaussianKernel(10, sigma)

# Stream examples and merge them in order to compute median on joint sample
# alternative is to call a different constructor of QuadraticTimeMMD
features = gen_p.get_streamed_features(m)
features = features.create_merged_copy(gen_q.get_streamed_features(m))

# use biased statistic
mmd = QuadraticTimeMMD(kernel, features, m)
def statistics_quadratic_time_mmd():
    from shogun.Features import RealFeatures
    from shogun.Features import MeanShiftDataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import QuadraticTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, IntVector

    # note that the quadratic time mmd has to store kernel matrices
    # which upper bounds the sample size
    n = 100
    dim = 2
    difference = 0.5

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # Stream examples and merge them in order to compute median on joint sample
    # alternative is to call a different constructor of QuadraticTimeMMD
    features = gen_p.get_streamed_features(n)
    features = features.create_merged_copy(gen_q.get_streamed_features(n))

    # use data generator class to produce example data
    data = features.get_feature_matrix()

    print "dimension means of X", mean(data.T[0:n].T)
    print "dimension means of Y", mean(data.T[n : 2 * n + 1].T)

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shoguns kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    # Use a permutation set to temporarily merge features in merged examples
    subset = IntVector.randperm_vec(features.get_num_vectors())
    subset = subset[0:200]
    features.add_subset(subset)
    dist = EuclideanDistance(features, features)
    distances = dist.get_distance_matrix()
    features.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma = median_distance ** 2
    print "median distance for Gaussian kernel:", sigma
    kernel = GaussianKernel(10, sigma)

    mmd = QuadraticTimeMMD(kernel, features, n)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05 using different methods to approximate
    # null-distribution
    statistic = mmd.compute_statistic()
    alpha = 0.05

    print "computing p-value using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    # normally, at least 250 iterations should be done, but that takes long
    mmd.set_bootstrap_iterations(10)
    # bootstrapping allows usage of unbiased or biased statistic
    mmd.set_statistic_type(UNBIASED)
    p_value = mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha

    # only can do this if SHOGUN was compiled with LAPACK so check
    if "sample_null_spectrum" in dir(QuadraticTimeMMD):
        print "computing p-value using spectrum method"
        mmd.set_null_approximation_method(MMD2_SPECTRUM)
        # normally, at least 250 iterations should be done, but that takes long
        mmd.set_num_samples_sepctrum(50)
        mmd.set_num_eigenvalues_spectrum(n - 10)
        # spectrum method computes p-value for biased statistics only
        mmd.set_statistic_type(BIASED)
        p_value = mmd.compute_p_value(statistic)
        print "p_value:", p_value
        print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha

    print "computing p-value using gamma method"
    mmd.set_null_approximation_method(MMD2_GAMMA)
    # gamma method computes p-value for biased statistics only
    mmd.set_statistic_type(BIASED)
    p_value = mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance stronly depends on data/kernel
    # bootstrapping, biased statistic
    print "sampling null distribution using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_statistic_type(BIASED)
    mmd.set_bootstrap_iterations(10)
    null_samples = mmd.bootstrap_null()
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance stronly depends on data/kernel
    # spectrum, biased statistic
    print "sampling null distribution using spectrum method"
    mmd.set_null_approximation_method(MMD2_SPECTRUM)
    mmd.set_statistic_type(BIASED)
    # 200 samples using 100 eigenvalues
    null_samples = mmd.sample_null_spectrum(50, 10)
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)
示例#36
0
# use data generator class to produce example data
data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)

# create shogun feature representation
features_x=RealFeatures(array([data[0]]))
features_y=RealFeatures(array([data[1]]))

# compute median data distance in order to use for Gaussian kernel width
# 0.5*median_distance normally (factor two in Gaussian kernel)
# However, shoguns kernel width is different to usual parametrization
# Therefore 0.5*2*median_distance^2
# Use a subset of data for that, only 200 elements. Median is stable
subset=Math.randperm_vec(features_x.get_num_vectors())
subset=subset[0:200]
features_x.add_subset(subset)
dist=EuclideanDistance(features_x, features_x)
distances=dist.get_distance_matrix()
features_x.remove_subset()
median_distance=Statistics.matrix_median(distances, True)
sigma_x=median_distance**2
features_y.add_subset(subset)
dist=EuclideanDistance(features_y, features_y)
distances=dist.get_distance_matrix()
features_y.remove_subset()
median_distance=Statistics.matrix_median(distances, True)
sigma_y=median_distance**2
print "median distance for Gaussian kernel on x:", sigma_x
print "median distance for Gaussian kernel on y:", sigma_y
kernel_x=GaussianKernel(10,sigma_x)
kernel_y=GaussianKernel(10,sigma_y)