def converter_multidimensionalscaling_modular(data_fname):
    try:
        import numpy
        from modshogun import RealFeatures, MultidimensionalScaling, EuclideanDistance, CSVFile

        features = RealFeatures(CSVFile(data_fname))

        distance_before = EuclideanDistance()
        distance_before.init(features, features)

        converter = MultidimensionalScaling()
        converter.set_target_dim(2)
        converter.set_landmark(False)
        embedding = converter.apply(features)

        distance_after = EuclideanDistance()
        distance_after.init(embedding, embedding)

        distance_matrix_after = distance_after.get_distance_matrix()
        distance_matrix_before = distance_before.get_distance_matrix()

        # MDS should preserve pairwise distances up to numerical error.
        return numpy.linalg.norm(distance_matrix_after - distance_matrix_before) \
            / numpy.linalg.norm(distance_matrix_before) < 1e-6
    except ImportError:
        print('No Eigen3 available')
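# Hypothetical usage sketch for the check above. '../data/fm_train_real.dat'
# stands in for any CSV of real-valued features (Shogun's CSVFile reads one
# sample per column); prints True when the 2D embedding preserves the
# pairwise distances of the input.
if __name__ == '__main__':
    print(converter_multidimensionalscaling_modular('../data/fm_train_real.dat'))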
def RunHierarchicalShogun():
    totalTimer = Timer()

    try:
        # Load input dataset.
        Log.Info("Loading dataset", self.verbose)
        dataPoints = np.genfromtxt(self.dataset, delimiter=',')
        dataFeat = RealFeatures(dataPoints.T)

        # Gather all the parameters.
        if "merges" in options:
            merges = int(options.pop("merges"))
        else:
            Log.Fatal("Missing parameter: number of merges to be done while clustering bottom-up")
            raise Exception("missing parameter")

        # If a distance metric is specified, select it; otherwise default to
        # Euclidean distance.
        if "distance" in options:
            distance = str(options.pop("distance")).lower()
            if distance not in ["euclidean", "cosine", "manhattan", "chebyshev"]:
                Log.Fatal("Distance option should be one of Euclidean, Manhattan, Cosine or Chebyshev only")
                raise Exception("unknown distance metric")

            if distance == "euclidean":
                distance = EuclideanDistance(dataFeat, dataFeat)
            elif distance == "manhattan":
                distance = ManhattanMetric(dataFeat, dataFeat)
            elif distance == "cosine":
                distance = CosineDistance(dataFeat, dataFeat)
            elif distance == "chebyshev":
                distance = ChebyshewMetric(dataFeat, dataFeat)
        else:
            # Distance option not specified; default to Euclidean distance.
            distance = EuclideanDistance(dataFeat, dataFeat)

        if len(options) > 0:
            Log.Fatal("Unknown options: " + str(options))
            raise Exception("unknown options")

        # Create the Hierarchical object and perform Hierarchical clustering.
        with totalTimer:
            model = Hierarchical(merges, distance)
            model.train()

        merge_distances = model.get_merge_distances()
        cluster_pairs = model.get_cluster_pairs()
    except Exception as e:
        Log.Info("Exception: " + str(e))
        return [-1]

    return [totalTimer.ElapsedTime(), merge_distances, cluster_pairs]
def predict(self, image):
    """Predict the face."""
    # Flatten the image into a single column vector.
    imageAsRow = np.asarray(
        image.reshape(image.shape[0] * image.shape[1], 1), np.float64)
    # Project into the PCA subspace.
    p = self.pca.apply_to_feature_vector(
        RealFeatures(imageAsRow).get_feature_vector(0))

    # Minimum distance found so far and the class it belongs to.
    minDist = 1e100
    minClass = -1

    # Search for the stored face that best matches the projection.
    for sampleIdx in range(len(self._projections)):
        test = RealFeatures(np.asmatrix(p, np.float64).T)
        projection = RealFeatures(
            np.asmatrix(self._projections[sampleIdx], np.float64).T)
        dist = EuclideanDistance(test, projection).distance(0, 0)

        if dist < minDist:
            minDist = dist
            minClass = self._labels[sampleIdx]

    return minClass
def RunAllKnnShogun(q):
    totalTimer = Timer()

    # Load input dataset.
    # If the dataset contains two files then the second file is the query
    # file.
    try:
        Log.Info("Loading dataset", self.verbose)
        if len(self.dataset) == 2:
            referenceData = np.genfromtxt(self.dataset[0], delimiter=',')
            queryData = np.genfromtxt(self.dataset[1], delimiter=',')
            queryFeat = RealFeatures(queryData.T)
        else:
            referenceData = np.genfromtxt(self.dataset, delimiter=',')

        # Labels are the last row of the dataset.
        labels = MulticlassLabels(referenceData[:, (referenceData.shape[1] - 1)])
        referenceData = referenceData[:, :-1]

        with totalTimer:
            # Get all the parameters.
            k = re.search(r"-k (\d+)", options)
            if not k:
                Log.Fatal("Required option: Number of nearest neighbors to find.")
                q.put(-1)
                return -1
            else:
                k = int(k.group(1))
                if k < 1 or k > referenceData.shape[0]:
                    Log.Fatal("Invalid k: " + str(k) + "; must be greater than 0"
                              + " and less than or equal to " + str(referenceData.shape[0]))
                    q.put(-1)
                    return -1

            referenceFeat = RealFeatures(referenceData.T)
            distance = EuclideanDistance(referenceFeat, referenceFeat)

            # Perform All K-Nearest-Neighbors.
            model = SKNN(k, distance, labels)
            model.train()

            if len(self.dataset) == 2:
                out = model.apply(queryFeat).get_labels()
            else:
                out = model.apply(referenceFeat).get_labels()
    except Exception as e:
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def RunKMeansShogun(q):
    totalTimer = Timer()

    # Load input dataset.
    # If the dataset contains two files then the second file is the centroids
    # file.
    Log.Info("Loading dataset", self.verbose)
    if len(self.dataset) == 2:
        data = np.genfromtxt(self.dataset[0], delimiter=',')
        centroids = np.genfromtxt(self.dataset[1], delimiter=',')
    else:
        data = np.genfromtxt(self.dataset[0], delimiter=',')

    # Gather parameters.
    clusters = re.search(r"-c (\d+)", options)
    maxIterations = re.search(r"-m (\d+)", options)
    seed = re.search(r"-s (\d+)", options)

    # Now do validation of options.
    if not clusters and len(self.dataset) != 2:
        Log.Fatal("Required option: Number of clusters or cluster locations.")
        q.put(-1)
        return -1
    elif (not clusters or int(clusters.group(1)) < 1) and len(self.dataset) != 2:
        Log.Fatal("Invalid number of clusters requested! Must be greater than"
                  + " or equal to 1.")
        q.put(-1)
        return -1

    m = 1000 if not maxIterations else int(maxIterations.group(1))

    if seed:
        Math_init_random(int(seed.group(1)))

    try:
        dataFeat = RealFeatures(data.T)
        distance = EuclideanDistance(dataFeat, dataFeat)

        # Create the K-Means object and perform K-Means clustering.
        with totalTimer:
            if len(self.dataset) == 2:
                model = KMeans(int(clusters.group(1)), distance, RealFeatures(centroids))
            else:
                model = KMeans(int(clusters.group(1)), distance)

            model.set_max_iter(m)
            model.train()

            labels = model.apply().get_labels()
            centers = model.get_cluster_centers()
    except Exception as e:
        print(e)
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def RunKMeansShogun(q):
    totalTimer = Timer()

    # Load input dataset.
    # If the dataset contains two files then the second file is the centroids
    # file.
    Log.Info("Loading dataset", self.verbose)
    if len(self.dataset) == 2:
        data = np.genfromtxt(self.dataset[0], delimiter=',')
        centroids = np.genfromtxt(self.dataset[1], delimiter=',')
    else:
        data = np.genfromtxt(self.dataset[0], delimiter=',')

    # Gather parameters.
    if "clusters" in options:
        clusters = int(options.pop("clusters"))
    elif len(self.dataset) != 2:
        Log.Fatal("Required option: Number of clusters or cluster locations.")
        q.put(-1)
        return -1

    # Default to 1000 iterations when no maximum is given.
    maxIterations = 1000
    if "max_iterations" in options:
        maxIterations = int(options.pop("max_iterations"))

    seed = None
    if "seed" in options:
        seed = int(options.pop("seed"))

    if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

    if seed:
        Math_init_random(seed)

    try:
        dataFeat = RealFeatures(data.T)
        distance = EuclideanDistance(dataFeat, dataFeat)

        # Create the K-Means object and perform K-Means clustering.
        with totalTimer:
            if len(self.dataset) == 2:
                model = KMeans(clusters, distance, centroids.T)
            else:
                model = KMeans(clusters, distance)

            model.set_max_iter(maxIterations)
            model.train()

            labels = model.apply().get_labels()
            centers = model.get_cluster_centers()
    except Exception as e:
        print(e)
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def load_train(self):
    ims, labels = self.load(self.test_images, self.test_labels)
    self.test_images = ims
    self.test_labels = labels

    labels_numbers = MulticlassLabels(self.test_labels)
    feats = RealFeatures(self.test_images.T)
    dist = EuclideanDistance()

    self.knn = KNN(self.k, dist, labels_numbers)
    self.knn.train(feats)
def kernel_cauchy_modular(train_fname=traindat, test_fname=testdat, sigma=1.0):
    from modshogun import RealFeatures, CauchyKernel, CSVFile, EuclideanDistance

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))

    distance = EuclideanDistance(feats_train, feats_train)
    kernel = CauchyKernel(feats_train, feats_train, sigma, distance)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def clustering_hierarchical_modular(fm_train=traindat, merges=3):
    from modshogun import EuclideanDistance, RealFeatures, Hierarchical, CSVFile

    feats_train = RealFeatures(CSVFile(fm_train))
    distance = EuclideanDistance(feats_train, feats_train)

    hierarchical = Hierarchical(merges, distance)
    hierarchical.train()

    out_distance = hierarchical.get_merge_distances()
    out_cluster = hierarchical.get_cluster_pairs()

    return hierarchical, out_distance, out_cluster
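# Hypothetical usage sketch; substitute any CSV of real-valued column
# vectors. merge_distances holds the distance at each of the 3 merges,
# cluster_pairs the indices of the clusters merged at each step.
if __name__ == '__main__':
    _, merge_distances, cluster_pairs = clustering_hierarchical_modular(
        '../data/fm_train_real.dat', merges=3)
    print(merge_distances)
    print(cluster_pairs)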
def knn(train_features, train_labels, test_features, test_labels, k=1):
    from modshogun import KNN, MulticlassAccuracy, EuclideanDistance

    distance = EuclideanDistance(train_features, train_features)
    knn = KNN(k, distance, train_labels)
    knn.train()

    train_output = knn.apply()
    test_output = knn.apply(test_features)

    evaluator = MulticlassAccuracy()
    print('KNN training error is %.4f%%' % (
        (1 - evaluator.evaluate(train_output, train_labels)) * 100))
    print('KNN test error is %.4f%%' % (
        (1 - evaluator.evaluate(test_output, test_labels)) * 100))
def run_clustering(data, k):
    from modshogun import KMeans
    from modshogun import EuclideanDistance
    from modshogun import RealFeatures

    fea = RealFeatures(data)
    distance = EuclideanDistance(fea, fea)
    kmeans = KMeans(k, distance)

    #print("Running clustering...")
    kmeans.train()

    return kmeans.get_cluster_centers()
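# Minimal usage sketch, assuming a (dim x n) numpy array -- RealFeatures
# treats each column as one sample, so the returned centers should also come
# back one per column.
import numpy as np
centers = run_clustering(np.random.rand(2, 100), k=3)
print(centers.shape)  # expected (2, 3): one 2-D center per column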
def kernel_exponential_modular(train_fname=traindat, test_fname=testdat, tau_coef=1.0):
    from modshogun import RealFeatures, ExponentialKernel, EuclideanDistance, CSVFile

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))

    distance = EuclideanDistance(feats_train, feats_train)
    kernel = ExponentialKernel(feats_train, feats_train, tau_coef, distance, 10)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def BuildModel(self, data, labels, options):
    # Get all the parameters.
    n = re.search(r"-n (\d+)", options)
    self.n_neighbors = 5 if not n else int(n.group(1))

    distance = EuclideanDistance(data, data)

    from modshogun import KNN_KDTREE
    knc = KNN(self.n_neighbors, distance, labels, KNN_KDTREE)
    knc.set_leaf_size(30)
    knc.train()

    return knc
def assign_labels(data, centroids, ncenters):
    from modshogun import EuclideanDistance
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import KNN
    from numpy import arange

    # Each centroid gets its own label; a 1-NN classifier trained on the
    # centroids then assigns every point to its nearest center.
    labels = MulticlassLabels(arange(0., ncenters))
    fea = RealFeatures(data)
    fea_centroids = RealFeatures(centroids)

    distance = EuclideanDistance(fea_centroids, fea_centroids)
    knn = KNN(1, distance, labels)
    knn.train()

    return knn.apply(fea)
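# Usage sketch chaining run_clustering and assign_labels above (random data,
# columns are samples). The result is a MulticlassLabels object holding the
# index of the nearest center for every point.
import numpy as np
data = np.random.rand(2, 100)
centers = run_clustering(data, 3)
print(assign_labels(data, centers, 3).get_labels())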
def clustering_kmeans_modular(fm_train=traindat, k=3):
    from modshogun import EuclideanDistance, RealFeatures, KMeans, Math_init_random, CSVFile

    # Fix the random seed so cluster initialization is reproducible.
    Math_init_random(17)

    feats_train = RealFeatures(CSVFile(fm_train))
    distance = EuclideanDistance(feats_train, feats_train)

    kmeans = KMeans(k, distance)
    kmeans.train()

    out_centers = kmeans.get_cluster_centers()
    kmeans.get_radiuses()

    return out_centers, kmeans
def kernel_rationalquadratic_modular(train_fname=traindat, test_fname=testdat, shift_coef=1.0):
    from modshogun import RealFeatures, RationalQuadraticKernel, EuclideanDistance, CSVFile

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))

    distance = EuclideanDistance(feats_train, feats_train)
    kernel = RationalQuadraticKernel(feats_train, feats_train, shift_coef, distance)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def distance_normsquared_modular(train_fname=traindat, test_fname=testdat):
    from modshogun import RealFeatures, EuclideanDistance, CSVFile

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))

    distance = EuclideanDistance(feats_train, feats_train)
    # Disable the square root so the matrix holds squared Euclidean distances.
    distance.set_disable_sqrt(True)

    dm_train = distance.get_distance_matrix()
    distance.init(feats_train, feats_test)
    dm_test = distance.get_distance_matrix()

    return distance, dm_train, dm_test
def kernel_power_modular(train_fname=traindat, test_fname=testdat, degree=2.0):
    from modshogun import RealFeatures, PowerKernel, EuclideanDistance, CSVFile

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))

    distance = EuclideanDistance(feats_train, feats_train)
    kernel = PowerKernel(feats_train, feats_train, degree, distance)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def kernel_distance_modular(train_fname=traindat, test_fname=testdat, width=1.7):
    from modshogun import RealFeatures, DistanceKernel, EuclideanDistance, CSVFile

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))

    distance = EuclideanDistance()
    kernel = DistanceKernel(feats_train, feats_test, width, distance)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def kernel_spherical_modular(fm_train_real=traindat, fm_test_real=testdat, sigma=1.0):
    from modshogun import RealFeatures
    from modshogun import MultiquadricKernel
    from modshogun import EuclideanDistance

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    distance = EuclideanDistance(feats_train, feats_train)
    kernel = MultiquadricKernel(feats_train, feats_train, sigma, distance)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def BuildModel(self, data, labels, options):
    # Get all the parameters.
    if "k" in options:
        self.n_neighbors = int(options.pop("k"))
    else:
        Log.Fatal("Required parameter 'k' not specified!")
        raise Exception("missing parameter")

    if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

    distance = EuclideanDistance(data, data)
    knc = KNN(self.n_neighbors, distance, labels, KNN_KDTREE)
    knc.train()

    return knc
def knn_classify(traindat, testdat, k=3):
    from modshogun import KNN, MulticlassAccuracy, EuclideanDistance

    train_features, train_labels = traindat.features, traindat.labels
    distance = EuclideanDistance(train_features, train_features)

    knn = KNN(k, distance, train_labels)
    knn.train()

    test_features, test_labels = testdat.features, testdat.labels
    predicted_labels = knn.apply(test_features)

    evaluator = MulticlassAccuracy()
    acc = evaluator.evaluate(predicted_labels, test_labels)
    err = 1 - acc

    return err
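# Hypothetical usage sketch: 'Dataset' is just a stand-in container with the
# .features/.labels attributes knn_classify expects; adapt it to however your
# data is wrapped.
from collections import namedtuple
from modshogun import RealFeatures, MulticlassLabels
import numpy as np

Dataset = namedtuple('Dataset', ['features', 'labels'])
train = Dataset(RealFeatures(np.random.rand(2, 40)),
                MulticlassLabels(np.floor(np.random.rand(40) * 3)))
test = Dataset(RealFeatures(np.random.rand(2, 20)),
               MulticlassLabels(np.floor(np.random.rand(20) * 3)))
print('test error: %.4f' % knn_classify(train, test, k=3))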
def distance_director_euclidean_modular(fm_train_real=traindat, fm_test_real=testdat, scale=1.2):
    try:
        from modshogun import DirectorDistance
    except ImportError:
        print("recompile shogun with --enable-swig-directors")
        return

    import numpy
    from modshogun import RealFeatures, EuclideanDistance, Time

    class DirectorEuclideanDistance(DirectorDistance):
        def __init__(self):
            DirectorDistance.__init__(self, True)

        def distance_function(self, idx_a, idx_b):
            seq1 = self.get_lhs().get_feature_vector(idx_a)
            seq2 = self.get_rhs().get_feature_vector(idx_b)
            return numpy.linalg.norm(seq1 - seq2)

    feats_train = RealFeatures(fm_train_real)
    #feats_train.io.set_loglevel(MSG_DEBUG)
    feats_train.parallel.set_num_threads(1)
    feats_test = RealFeatures(fm_test_real)

    distance = EuclideanDistance()
    distance.init(feats_train, feats_test)

    ddistance = DirectorEuclideanDistance()
    ddistance.init(feats_train, feats_test)

    #print("dm_train")
    t = Time()
    dm_train = distance.get_distance_matrix()
    #t1 = t.cur_time_diff(True)

    #print("ddm_train")
    t = Time()
    ddm_train = ddistance.get_distance_matrix()
    #t2 = t.cur_time_diff(True)

    #print("dm_train", dm_train)
    #print("ddm_train", ddm_train)
    return dm_train, ddm_train
def kernel_wave_modular(fm_train_real=traindat, fm_test_real=testdat, theta=1.0):
    from modshogun import RealFeatures
    from modshogun import WaveKernel
    from modshogun import EuclideanDistance

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    distance = EuclideanDistance(feats_train, feats_train)
    kernel = WaveKernel(feats_train, feats_train, theta, distance)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def kernel_tstudent_modular(fm_train_real=traindat, fm_test_real=testdat, degree=2.0):
    from modshogun import RealFeatures
    from modshogun import TStudentKernel
    from modshogun import EuclideanDistance

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    distance = EuclideanDistance(feats_train, feats_train)
    kernel = TStudentKernel(feats_train, feats_train, degree, distance)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def classifier_knn_modular(train_fname=traindat, test_fname=testdat,
                           label_train_fname=label_traindat, k=3):
    from modshogun import RealFeatures, MulticlassLabels, KNN, EuclideanDistance, CSVFile

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))
    distance = EuclideanDistance(feats_train, feats_train)

    labels = MulticlassLabels(CSVFile(label_train_fname))

    knn = KNN(k, distance, labels)
    knn_train = knn.train()
    output = knn.apply(feats_test).get_labels()
    multiple_k = knn.classify_for_multiple_k()

    return knn, knn_train, output, multiple_k
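# Hypothetical usage sketch with the conventional Shogun example data paths;
# substitute your own CSVs (features stored one sample per column, labels as
# a single column of class ids).
knn, _, predictions, multiple_k = classifier_knn_modular(
    '../data/fm_train_real.dat', '../data/fm_test_real.dat',
    '../data/label_train_multiclass.dat', k=3)
print(predictions)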
def KNNAccuracy(distance, data, k, flag):
    # Transform the data by the learned linear transformation.
    transformedData = np.dot(data[0], distance.T)
    feat = RealFeatures(transformedData.T)
    labels = MulticlassLabels(data[1].astype(np.float64))

    dist = EuclideanDistance(feat, feat)
    # Use k+1 neighbors so each point's own occurrence can be dropped below.
    knn = KNN(k + 1, dist, labels)
    knn.train(feat)

    # Get nearest neighbors; the first row is the point itself, so remove it.
    nn = knn.nearest_neighbors()
    nn = np.delete(nn, 0, 0)

    # Compute unique labels.
    uniqueLabels = np.unique(labels)

    # Keep count of correct predictions.
    count = 0

    # Normalize labels to the indices 0..len(uniqueLabels)-1.
    for i in range(data[0].shape[0]):
        for j in range(len(uniqueLabels)):
            if labels[i] == uniqueLabels[j]:
                labels[i] = j
                break

    for i in range(nn.shape[1]):
        mapLabels = [0 for x in range(len(uniqueLabels))]
        for j in range(nn.shape[0]):
            if flag:
                # Distance-weighted vote; add a constant factor of 1 in case
                # two points overlap.
                distPoints = np.linalg.norm(data[0][nn[j][i], :] - data[0][i, :])
                mapLabels[int(labels[nn[j, i]])] += 1 / (distPoints + 1)**2
            else:
                # Subtract a variable factor to avoid a draw condition without
                # affecting the actual result.
                mapLabels[int(labels[nn[j, i]])] += 1 - j * 1e-8

        maxInd = np.argmax(mapLabels)
        if maxInd == labels[i]:
            count += 1

    accuracy = (count / nn.shape[1]) * 100
    return accuracy
def hsic_graphical():
    # Parameters; change to get different results.
    m = 250
    difference = 3

    # Setting the angle lower makes a harder test.
    angle = pi / 30

    # Number of samples taken from null and alternative distribution.
    num_null_samples = 500

    # Use the data generator class to produce example data.
    data = DataGenerator.generate_sym_mix_gauss(m, difference, angle)

    # Create Shogun feature representation.
    features_x = RealFeatures(array([data[0]]))
    features_y = RealFeatures(array([data[1]]))

    # Compute the median data distance in order to use it for the Gaussian
    # kernel width: 0.5*median_distance normally (factor two in the Gaussian
    # kernel). However, Shogun's kernel width is parametrized differently, so
    # use 0.5*2*median_distance^2. Use a subset of the data for that, only
    # 200 elements; the median is stable.
    subset = int32(array([x for x in range(features_x.get_num_vectors())]))  # numpy
    subset = random.permutation(subset)  # numpy permutation
    subset = subset[0:200]

    features_x.add_subset(subset)
    dist = EuclideanDistance(features_x, features_x)
    distances = dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance = np.median(distances)
    sigma_x = median_distance**2

    features_y.add_subset(subset)
    dist = EuclideanDistance(features_y, features_y)
    distances = dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance = np.median(distances)
    sigma_y = median_distance**2

    print("median distance for Gaussian kernel on x:", sigma_x)
    print("median distance for Gaussian kernel on y:", sigma_y)

    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    # Create the HSIC instance. Note that this is a convenience constructor
    # which copies the feature data: features_x and features_y are not the
    # ones used inside hsic. This is only for user-friendliness; usually it
    # is OK to do this. Below, the alternative distribution is sampled, which
    # means that new feature objects have to be created in each iteration
    # (slow). However, normally the alternative distribution is not sampled.
    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # Sample the alternative distribution.
    alt_samples = zeros(num_null_samples)
    for i in range(len(alt_samples)):
        data = DataGenerator.generate_sym_mix_gauss(m, difference, angle)
        features_x.set_feature_matrix(array([data[0]]))
        features_y.set_feature_matrix(array([data[1]]))

        # Re-create the HSIC instance every time since feature objects are
        # copied due to the use of the convenience constructor.
        hsic = HSIC(kernel_x, kernel_y, features_x, features_y)
        alt_samples[i] = hsic.compute_statistic()

    # Sample from the null distribution: permutation, biased statistic.
    hsic.set_null_approximation_method(PERMUTATION)
    hsic.set_num_null_samples(num_null_samples)
    null_samples_boot = hsic.sample_null()

    # Fit a gamma distribution, biased statistic.
    hsic.set_null_approximation_method(HSIC_GAMMA)
    gamma_params = hsic.fit_null_gamma()
    # Sample gamma with those parameters.
    null_samples_gamma = array([
        gamma(gamma_params[0], gamma_params[1])
        for _ in range(num_null_samples)
    ])

    # Plot.
    figure()

    # Plot data x and y.
    subplot(2, 2, 1)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of y-ticks
    grid(True)
    plot(data[0], data[1], 'o')
    title('Data, rotation=$\pi$/' + str(1 / angle * pi) + '\nm=' + str(m))
    xlabel('$x$')
    ylabel('$y$')

    # Compute thresholds for the test level.
    alpha = 0.05
    null_samples_boot.sort()
    null_samples_gamma.sort()
    thresh_boot = null_samples_boot[int(floor(len(null_samples_boot) * (1 - alpha)))]
    thresh_gamma = null_samples_gamma[int(floor(len(null_samples_gamma) * (1 - alpha)))]

    type_one_error_boot = sum(null_samples_boot < thresh_boot) / float(num_null_samples)
    type_one_error_gamma = sum(null_samples_gamma < thresh_gamma) / float(num_null_samples)

    # Plot alternative distribution with threshold.
    subplot(2, 2, 2)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # Compute a common range for all null distribution histograms.
    hist_range = [
        min([min(null_samples_boot), min(null_samples_gamma)]),
        max([max(null_samples_boot), max(null_samples_gamma)])
    ]

    # Plot sampled null distribution with threshold.
    subplot(2, 2, 3)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Sampled Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot))

    # Plot gamma null distribution with threshold.
    subplot(2, 2, 4)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(null_samples_gamma, 20, range=hist_range, normed=True)
    axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma))
    grid(True)

    # Pull plots a bit apart.
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
def statistics_hsic(n, difference, angle):
    from modshogun import RealFeatures
    from modshogun import DataGenerator
    from modshogun import GaussianKernel
    from modshogun import HSIC
    from modshogun import PERMUTATION, HSIC_GAMMA
    from modshogun import EuclideanDistance
    from modshogun import Statistics, Math

    # For reproducible results (the numpy one might not be reproducible across
    # different OS/Python distributions).
    Math.init_random(1)
    np.random.seed(1)

    # Note that the HSIC has to store kernel matrices, which upper-bounds the
    # sample size.

    # Use the data generator class to produce example data.
    data = DataGenerator.generate_sym_mix_gauss(n, difference, angle)
    #plot(data[0], data[1], 'x'); show()

    # Create Shogun feature representation.
    features_x = RealFeatures(np.array([data[0]]))
    features_y = RealFeatures(np.array([data[1]]))

    # Compute the median data distance in order to use it for the Gaussian
    # kernel width: 0.5*median_distance normally (factor two in the Gaussian
    # kernel). However, Shogun's kernel width is parametrized differently, so
    # use 0.5*2*median_distance^2. Use a subset of the data for that, only
    # 200 elements; the median is stable.
    subset = np.random.permutation(features_x.get_num_vectors()).astype(np.int32)
    subset = subset[0:200]

    features_x.add_subset(subset)
    dist = EuclideanDistance(features_x, features_x)
    distances = dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance = np.median(distances)
    sigma_x = median_distance**2

    features_y.add_subset(subset)
    dist = EuclideanDistance(features_y, features_y)
    distances = dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance = np.median(distances)
    sigma_y = median_distance**2
    #print("median distance for Gaussian kernel on x:", sigma_x)
    #print("median distance for Gaussian kernel on y:", sigma_y)

    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # Perform the test: compute the p-value and test whether the null
    # hypothesis is rejected for a test level of 0.05, using different
    # methods to approximate the null distribution.
    statistic = hsic.compute_statistic()
    #print("HSIC:", statistic)
    alpha = 0.05

    #print("computing p-value using sampled null distribution")
    hsic.set_null_approximation_method(PERMUTATION)
    # Normally at least 250 iterations should be done, but that takes long.
    hsic.set_num_null_samples(100)
    # Sampling the null allows usage of an unbiased or biased statistic.
    p_value_boot = hsic.compute_p_value(statistic)
    thresh_boot = hsic.compute_threshold(alpha)
    #print("p_value:", p_value_boot)
    #print("threshold for 0.05 alpha:", thresh_boot)
    #print("p_value <", alpha, ", i.e. test says p and q are dependent:", p_value_boot < alpha)

    #print("computing p-value using gamma method")
    hsic.set_null_approximation_method(HSIC_GAMMA)
    p_value_gamma = hsic.compute_p_value(statistic)
    thresh_gamma = hsic.compute_threshold(alpha)
    #print("p_value:", p_value_gamma)
    #print("threshold for 0.05 alpha:", thresh_gamma)
    #print("p_value <", alpha, ", i.e. test says p and q are dependent:", p_value_gamma < alpha)

    # Sample from the null distribution (these may be plotted or whatever).
    # The mean should be close to zero; the variance strongly depends on the
    # data/kernel. Sampling the null, biased statistic.
    #print("sampling null distribution using sample_null")
    hsic.set_null_approximation_method(PERMUTATION)
    hsic.set_num_null_samples(100)
    null_samples = hsic.sample_null()
    #print("null mean:", np.mean(null_samples))
    #print("null variance:", np.var(null_samples))
    #hist(null_samples, 100); show()

    return p_value_boot, thresh_boot, p_value_gamma, thresh_gamma, statistic, null_samples
# The opening of plot_neighborhood_graph was lost; the signature below is
# reconstructed from the call site further down (nn comes from
# knn.nearest_neighbors(), axis is a matplotlib Axes).
def plot_neighborhood_graph(x, nn, axis):
    # Draw a line from each point to its nearest neighbor, colored by class.
    for i in range(x.shape[0]):
        xs = [x[i, 0], x[nn[1, i], 0]]
        ys = [x[i, 1], x[nn[1, i], 1]]
        axis.plot(xs, ys, COLS[int(y[i])])


figure, axarr = pyplot.subplots(3, 1)
x, y = sandwich_data()

features = RealFeatures(x.T)
labels = MulticlassLabels(y)

print('%d vectors with %d features' % (features.get_num_vectors(),
                                       features.get_num_features()))
assert (features.get_num_vectors() == labels.get_num_labels())

distance = EuclideanDistance(features, features)
k = 2
knn = KNN(k, distance, labels)

plot_data(x, y, axarr[0])
plot_neighborhood_graph(x, knn.nearest_neighbors(), axarr[0])
axarr[0].set_aspect('equal')
axarr[0].set_xlim(-6, 4)
axarr[0].set_ylim(-3, 2)

lmnn = LMNN(features, labels, k)
lmnn.set_maxiter(10000)
lmnn.train()
L = lmnn.get_linear_transform()
knn.set_distance(lmnn.get_distance())