def evaluation_clustering_simple(n_data=100, sqrt_num_blobs=4, distance=5):
    from shogun.Evaluation import ClusteringAccuracy, ClusteringMutualInformation
    from shogun.Features import MulticlassLabels, GaussianBlobsDataGenerator
    from shogun.Mathematics import Math
    from numpy import array

    # reproducible results
    Math.init_random(1)

    # produce some Gaussian blobs to cluster
    ncenters = sqrt_num_blobs**2
    stretch = 1
    angle = 1
    gen = GaussianBlobsDataGenerator(sqrt_num_blobs, distance, stretch, angle)
    features = gen.get_streamed_features(n_data)
    X = features.get_feature_matrix()

    # compute approximate "ground truth" labels by taking the closest blob mean
    coords = array(range(0, sqrt_num_blobs * distance, distance))
    idx_0 = [abs(coords - x).argmin() for x in X[0]]
    idx_1 = [abs(coords - x).argmin() for x in X[1]]
    ground_truth = array(
        [idx_0[i] * sqrt_num_blobs + idx_1[i] for i in range(n_data)],
        dtype="float64")

    #for label in unique(ground_truth):
    #    indices = ground_truth == label
    #    plot(X[0][indices], X[1][indices], 'o')
    #show()

    centroids = run_clustering(features, ncenters)
    gnd_hat = assign_labels(features, centroids, ncenters)
    gnd = MulticlassLabels(ground_truth)

    AccuracyEval = ClusteringAccuracy()
    AccuracyEval.best_map(gnd_hat, gnd)
    accuracy = AccuracyEval.evaluate(gnd_hat, gnd)

    # in this case we know that the clustering has to be very good
    #print('Clustering accuracy = %.4f' % accuracy)
    assert accuracy > 0.8

    MIEval = ClusteringMutualInformation()
    mutual_info = MIEval.evaluate(gnd_hat, gnd)
    #print('Clustering mutual information = %.4f' % mutual_info)

    # TODO add multiclass labels and MI once the serialization works
    #return gnd, accuracy, mutual_info
    return accuracy
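
# The helpers run_clustering() and assign_labels() used above are not defined in this
# section. Below is a minimal sketch of plausible implementations, assuming shogun's
# KMeans (Clustering module) and KNN (Classifier module) APIs; treat it as an
# illustration under those assumptions, not the original helper code.
def run_clustering(data, k):
    from shogun.Clustering import KMeans
    from shogun.Distance import EuclideanDistance

    # cluster the features with k-means under Euclidean distance
    distance = EuclideanDistance(data, data)
    kmeans = KMeans(k, distance)
    kmeans.train()

    return kmeans.get_cluster_centers()

def assign_labels(data, centroids, ncenters):
    from shogun.Classifier import KNN
    from shogun.Distance import EuclideanDistance
    from shogun.Features import RealFeatures, MulticlassLabels
    from numpy import arange

    # label each point with the index of its nearest centroid via a 1-NN
    # classifier trained on the centroids themselves
    labels = MulticlassLabels(arange(0., ncenters))
    fea_centroids = RealFeatures(centroids)
    distance = EuclideanDistance(fea_centroids, fea_centroids)
    knn = KNN(1, distance, labels)
    knn.train(fea_centroids)
    return knn.apply(data)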
def statistics_mmd_kernel_selection_combined(m, distance, stretch, num_blobs, angle, selection_method):
    from shogun.Features import RealFeatures
    from shogun.Features import GaussianBlobsDataGenerator
    from shogun.Kernel import GaussianKernel, CombinedKernel
    from shogun.Statistics import LinearTimeMMD
    from shogun.Statistics import MMDKernelSelectionCombMaxL2
    from shogun.Statistics import MMDKernelSelectionCombOpt
    from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math

    # init seed for reproducibility
    Math.init_random(1)

    # note that the linear time statistic is designed for much larger datasets;
    # results for this low number of samples will be bad (unstable, wrong type I error)

    # streaming data generators
    gen_p = GaussianBlobsDataGenerator(num_blobs, distance, 1, 0)
    gen_q = GaussianBlobsDataGenerator(num_blobs, distance, stretch, angle)

    # stream some data and plot
    num_plot = 1000
    features = gen_p.get_streamed_features(num_plot)
    features = features.create_merged_copy(
        gen_q.get_streamed_features(num_plot))
    data = features.get_feature_matrix()

    #figure()
    #subplot(2,2,1)
    #grid(True)
    #plot(data[0][0:num_plot], data[1][0:num_plot], 'r.', label='$x$')
    #title('$X\sim p$')
    #subplot(2,2,2)
    #grid(True)
    #plot(data[0][num_plot+1:2*num_plot], data[1][num_plot+1:2*num_plot], 'b.', label='$x$', alpha=0.5)
    #title('$Y\sim q$')

    # create combined kernel with Gaussian kernels inside (shogun's Gaussian kernel is
    # parametrised by width = 2*sigma^2, not by sigma; see documentation)
    sigmas = [2**x for x in range(-3, 10)]
    widths = [x * x * 2 for x in sigmas]
    combined = CombinedKernel()
    for i in range(len(sigmas)):
        combined.append_kernel(GaussianKernel(10, widths[i]))

    # mmd instance using streaming features, blocksize of 10000
    block_size = 10000
    mmd = LinearTimeMMD(combined, gen_p, gen_q, m, block_size)

    # kernel selection instance (this can easily be replaced by the other methods
    # for selecting combined kernels)
    if selection_method == "opt":
        selection = MMDKernelSelectionCombOpt(mmd)
    elif selection_method == "l2":
        selection = MMDKernelSelectionCombMaxL2(mmd)
    else:
        raise ValueError("unknown selection_method: %s" % selection_method)

    # perform kernel selection (kernel is automatically set)
    kernel = selection.select_kernel()
    kernel = CombinedKernel.obtain_from_generic(kernel)
    #print("selected kernel weights:", kernel.get_subkernel_weights())
    #subplot(2,2,3)
    #plot(kernel.get_subkernel_weights())
    #title("Kernel weights")

    # compute type I and type II errors (use many more trials in practice). Type I
    # error is only estimated to check the MMD1_GAUSSIAN method for approximating
    # the null distribution. Note that testing has to happen on different data than
    # kernel selection, but the linear time mmd does this implicitly
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)

    # number of trials should be larger to compute tight confidence bounds
    num_trials = 5
    alpha = 0.05  # significance level of the test

    typeIerrors = [0 for x in range(num_trials)]
    typeIIerrors = [0 for x in range(num_trials)]
    for i in range(num_trials):
        # simulating H0 effectively means that p=q, so rejecting (p-value below
        # alpha) is a type I error
        mmd.set_simulate_h0(True)
        typeIerrors[i] = mmd.perform_test() < alpha
        mmd.set_simulate_h0(False)
        # failing to reject although p!=q is a type II error
        typeIIerrors[i] = mmd.perform_test() > alpha

    #print("type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors))

    return kernel, typeIerrors, typeIIerrors
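
# A hypothetical driver in the style of shogun's example scripts, so both functions
# can be run directly. The parameter values below are illustrative assumptions only,
# not taken from the original examples.
if __name__ == '__main__':
    print('Clustering evaluation')
    evaluation_clustering_simple(n_data=100, sqrt_num_blobs=4, distance=5)
    print('MMD combined kernel selection')
    # a small m keeps the run fast; see the note in the function about sample size
    statistics_mmd_kernel_selection_combined(1000, 10, 5, 3, 0, "opt")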