# sample alternative distribution
alt_samples=zeros(num_null_samples)
for i in range(len(alt_samples)):
	data=DataGenerator.generate_mean_data(m,dim,difference)
	features.set_feature_matrix(data)
	alt_samples[i]=mmd.compute_statistic()

# sample from null distribution
# bootstrapping, biased statistic
mmd.set_null_approximation_method(BOOTSTRAP)
mmd.set_bootstrap_iterations(num_null_samples)
null_samples_boot=mmd.bootstrap_null()

# fit normal distribution to null and sample a normal distribution
mmd.set_null_approximation_method(MMD1_GAUSSIAN)
variance=mmd.compute_variance_estimate()
null_samples_gaussian=normal(0,sqrt(variance),num_null_samples)

# plot
figure()

# plot data of p and q
subplot(2,3,1)
grid(True)
gca().xaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks
gca().yaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks
plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
plot(data[0][m+1:2*m], data[1][m+1:2*m], 'bo', label='$x$', alpha=0.5)
title('Data, shift in $x_1$='+str(difference)+'\nm='+str(m))
xlabel('$x_1, y_1$')
ylabel('$x_2, y_2$')
Пример #2
0
def linear_time_mmd_graphical():

	
	# parameters, change to get different results
	m=1000 # set to 10000 for a good test result
	dim=2
	
	# setting the difference of the first dimension smaller makes a harder test
	difference=1
	
	# number of samples taken from null and alternative distribution
	num_null_samples=150
	
	# streaming data generator for mean shift distributions
	gen_p=MeanShiftDataGenerator(0, dim)
	gen_q=MeanShiftDataGenerator(difference, dim)
	
	# use the median kernel selection
	# create combined kernel with Gaussian kernels inside (shoguns Gaussian kernel is
	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, shoguns kernel width is different to usual parametrization
	# Therefore 0.5*2*median_distance^2
	# Use a subset of data for that, only 200 elements. Median is stable
	sigmas=[2**x for x in range(-3,10)]
	widths=[x*x*2 for x in sigmas]
	print "kernel widths:", widths
	combined=CombinedKernel()
	for i in range(len(sigmas)):
		combined.append_kernel(GaussianKernel(10, widths[i]))

	# mmd instance using streaming features, blocksize of 10000
	block_size=1000
	mmd=LinearTimeMMD(combined, gen_p, gen_q, m, block_size)
	
	# kernel selection instance (this can easily replaced by the other methods for selecting
	# single kernels
	selection=MMDKernelSelectionOpt(mmd)

	# perform kernel selection
	kernel=selection.select_kernel()
	kernel=GaussianKernel.obtain_from_generic(kernel)
	mmd.set_kernel(kernel);
	print "selected kernel width:", kernel.get_width()
	
	# sample alternative distribution, stream ensures different samples each run
	alt_samples=zeros(num_null_samples)
	for i in range(len(alt_samples)):
		alt_samples[i]=mmd.compute_statistic()
	
	# sample from null distribution
	# bootstrapping, biased statistic
	mmd.set_null_approximation_method(BOOTSTRAP)
	mmd.set_bootstrap_iterations(num_null_samples)
	null_samples_boot=mmd.bootstrap_null()
	
	# fit normal distribution to null and sample a normal distribution
	mmd.set_null_approximation_method(MMD1_GAUSSIAN)
	variance=mmd.compute_variance_estimate()
	null_samples_gaussian=normal(0,sqrt(variance),num_null_samples)
	
	# to plot data, sample a few examples from stream first
	features=gen_p.get_streamed_features(m)
	features=features.create_merged_copy(gen_q.get_streamed_features(m))
	data=features.get_feature_matrix()
	
	# plot
	figure()
	
	# plot data of p and q
	subplot(2,3,1)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks
	plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
	plot(data[0][m+1:2*m], data[1][m+1:2*m], 'bo', label='$x$', alpha=0.5)
	title('Data, shift in $x_1$='+str(difference)+'\nm='+str(m))
	xlabel('$x_1, y_1$')
	ylabel('$x_2, y_2$')
	
	# histogram of first data dimension and pdf
	subplot(2,3,2)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	hist(data[0], bins=50, alpha=0.5, facecolor='r', normed=True)
	hist(data[1], bins=50, alpha=0.5, facecolor='b', normed=True)
	xs=linspace(min(data[0])-1,max(data[0])+1, 50)
	plot(xs,normpdf( xs, 0, 1), 'r', linewidth=3)
	plot(xs,normpdf( xs, difference, 1), 'b', linewidth=3)
	xlabel('$x_1, y_1$')
	ylabel('$p(x_1), p(y_1)$')
	title('Data PDF in $x_1, y_1$')
	
	# compute threshold for test level
	alpha=0.05
	null_samples_boot.sort()
	null_samples_gaussian.sort()
	thresh_boot=null_samples_boot[floor(len(null_samples_boot)*(1-alpha))];
	thresh_gaussian=null_samples_gaussian[floor(len(null_samples_gaussian)*(1-alpha))];
	
	type_one_error_boot=sum(null_samples_boot<thresh_boot)/float(num_null_samples)
	type_one_error_gaussian=sum(null_samples_gaussian<thresh_boot)/float(num_null_samples)
	
	# plot alternative distribution with threshold
	subplot(2,3,4)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	hist(alt_samples, 20, normed=True);
	axvline(thresh_boot, 0, 1, linewidth=2, color='red')
	type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples)
	title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))
	
	# compute range for all null distribution histograms
	hist_range=[min([min(null_samples_boot), min(null_samples_gaussian)]), max([max(null_samples_boot), max(null_samples_gaussian)])]
	
	# plot null distribution with threshold
	subplot(2,3,3)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	hist(null_samples_boot, 20, range=hist_range, normed=True);
	axvline(thresh_boot, 0, 1, linewidth=2, color='red')
	title('Bootstrapped Null Dist.\n' + 'Type I error is '  + str(type_one_error_boot))
	
	# plot null distribution gaussian
	subplot(2,3,5)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	hist(null_samples_gaussian, 20, range=hist_range, normed=True);
	axvline(thresh_gaussian, 0, 1, linewidth=2, color='red')
	title('Null Dist. Gaussian\nType I error is '  + str(type_one_error_gaussian))
	
	# pull plots a bit apart
	subplots_adjust(hspace=0.5)
	subplots_adjust(wspace=0.5)
# sample alternative distribution
alt_samples = zeros(num_null_samples)
for i in range(len(alt_samples)):
    data = DataGenerator.generate_mean_data(m, dim, difference)
    features.set_feature_matrix(data)
    alt_samples[i] = mmd.compute_statistic()

# sample from null distribution
# bootstrapping, biased statistic
mmd.set_null_approximation_method(BOOTSTRAP)
mmd.set_bootstrap_iterations(num_null_samples)
null_samples_boot = mmd.bootstrap_null()

# fit normal distribution to null and sample a normal distribution
mmd.set_null_approximation_method(MMD1_GAUSSIAN)
variance = mmd.compute_variance_estimate()
null_samples_gaussian = normal(0, sqrt(variance), num_null_samples)

# plot
figure()

# plot data of p and q
subplot(2, 3, 1)
grid(True)
gca().xaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of x-ticks
gca().yaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of x-ticks
plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
plot(data[0][m + 1:2 * m], data[1][m + 1:2 * m], 'bo', label='$x$', alpha=0.5)
title('Data, shift in $x_1$=' + str(difference) + '\nm=' + str(m))
xlabel('$x_1, y_1$')
ylabel('$x_2, y_2$')
def linear_time_mmd_graphical():

    # parameters, change to get different results
    m = 1000  # set to 10000 for a good test result
    dim = 2

    # setting the difference of the first dimension smaller makes a harder test
    difference = 1

    # number of samples taken from null and alternative distribution
    num_null_samples = 150

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # use the median kernel selection
    # create combined kernel with Gaussian kernels inside (shoguns Gaussian kernel is
    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shoguns kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    sigmas = [2**x for x in range(-3, 10)]
    widths = [x * x * 2 for x in sigmas]
    print "kernel widths:", widths
    combined = CombinedKernel()
    for i in range(len(sigmas)):
        combined.append_kernel(GaussianKernel(10, widths[i]))

    # mmd instance using streaming features, blocksize of 10000
    block_size = 1000
    mmd = LinearTimeMMD(combined, gen_p, gen_q, m, block_size)

    # kernel selection instance (this can easily replaced by the other methods for selecting
    # single kernels
    selection = MMDKernelSelectionOpt(mmd)

    # perform kernel selection
    kernel = selection.select_kernel()
    kernel = GaussianKernel.obtain_from_generic(kernel)
    mmd.set_kernel(kernel)
    print "selected kernel width:", kernel.get_width()

    # sample alternative distribution, stream ensures different samples each run
    alt_samples = zeros(num_null_samples)
    for i in range(len(alt_samples)):
        alt_samples[i] = mmd.compute_statistic()

    # sample from null distribution
    # bootstrapping, biased statistic
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(num_null_samples)
    null_samples_boot = mmd.bootstrap_null()

    # fit normal distribution to null and sample a normal distribution
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    variance = mmd.compute_variance_estimate()
    null_samples_gaussian = normal(0, sqrt(variance), num_null_samples)

    # to plot data, sample a few examples from stream first
    features = gen_p.get_streamed_features(m)
    features = features.create_merged_copy(gen_q.get_streamed_features(m))
    data = features.get_feature_matrix()

    # plot
    figure()

    # plot data of p and q
    subplot(2, 3, 1)
    grid(True)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=4))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=4))  # reduce number of x-ticks
    plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
    plot(data[0][m + 1:2 * m],
         data[1][m + 1:2 * m],
         'bo',
         label='$x$',
         alpha=0.5)
    title('Data, shift in $x_1$=' + str(difference) + '\nm=' + str(m))
    xlabel('$x_1, y_1$')
    ylabel('$x_2, y_2$')

    # histogram of first data dimension and pdf
    subplot(2, 3, 2)
    grid(True)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    hist(data[0], bins=50, alpha=0.5, facecolor='r', normed=True)
    hist(data[1], bins=50, alpha=0.5, facecolor='b', normed=True)
    xs = linspace(min(data[0]) - 1, max(data[0]) + 1, 50)
    plot(xs, normpdf(xs, 0, 1), 'r', linewidth=3)
    plot(xs, normpdf(xs, difference, 1), 'b', linewidth=3)
    xlabel('$x_1, y_1$')
    ylabel('$p(x_1), p(y_1)$')
    title('Data PDF in $x_1, y_1$')

    # compute threshold for test level
    alpha = 0.05
    null_samples_boot.sort()
    null_samples_gaussian.sort()
    thresh_boot = null_samples_boot[floor(
        len(null_samples_boot) * (1 - alpha))]
    thresh_gaussian = null_samples_gaussian[floor(
        len(null_samples_gaussian) * (1 - alpha))]

    type_one_error_boot = sum(
        null_samples_boot < thresh_boot) / float(num_null_samples)
    type_one_error_gaussian = sum(
        null_samples_gaussian < thresh_boot) / float(num_null_samples)

    # plot alternative distribution with threshold
    subplot(2, 3, 4)
    grid(True)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range = [
        min([min(null_samples_boot),
             min(null_samples_gaussian)]),
        max([max(null_samples_boot),
             max(null_samples_gaussian)])
    ]

    # plot null distribution with threshold
    subplot(2, 3, 3)
    grid(True)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Bootstrapped Null Dist.\n' + 'Type I error is ' +
          str(type_one_error_boot))

    # plot null distribution gaussian
    subplot(2, 3, 5)
    grid(True)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    hist(null_samples_gaussian, 20, range=hist_range, normed=True)
    axvline(thresh_gaussian, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gaussian\nType I error is ' +
          str(type_one_error_gaussian))

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)