def job_lin_mmd(sample_source, tr, te, r):
    """Time a linear-time MMD test whose Gaussian width is picked by grid search.

    A median-distance heuristic is computed on at most the first 1000 rows of
    each training sample, 40 candidate widths are derived from it, and
    ``tst.LinearMMDTest.grid_search_kernel`` selects one kernel on ``tr``.
    The selected kernel is then used to test ``te``.

    ``sample_source`` and ``r`` are accepted but not used in this body.
    ``alpha`` is not a parameter; it is resolved from the enclosing scope.

    Returns:
        dict with keys ``'test_method'`` (the ``LinearMMDTest`` instance),
        ``'test_result'`` (output of ``perform_test``) and ``'time_secs'``.
    """
    with util.ContextTimer() as timer:
        # Restrict the median heuristic to a prefix of each sample: the
        # pairwise-distance computation can run out of memory for large n.
        x_tr, y_tr = tr.xy()
        stacked_head = np.vstack((x_tr[:1000, :], y_tr[:1000, :]))
        med = util.meddistance(stacked_head)

        # Candidate widths are med * 2**e for 40 exponents in [-1, 4];
        # each kernel is given the square of its width.
        scales = 2.0**np.linspace(-1, 4, 40)
        candidates = [kernel.KGauss((med * s)**2) for s in scales]

        chosen, _ = tst.LinearMMDTest.grid_search_kernel(tr, candidates, alpha)
        lin_mmd_test = tst.LinearMMDTest(candidates[chosen], alpha)
        outcome = lin_mmd_test.perform_test(te)

    # Read after the timed block has exited (presumably the timer only
    # finalises `secs` on exit -- confirm against util.ContextTimer).
    return {
        'test_method': lin_mmd_test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
def test_basic_H1(self):
    """Smoke-test ``tst.UMETest`` on data for which H1 holds.

    For dimensions 1 and 4 and for 1 and 6 test locations, the p-value must
    be non-negative and at most 0.1.
    """
    seed = 12
    n = 271
    alpha = 0.01

    for d in (1, 4):
        # Mean-difference source; the original author notes H1 is true here.
        source = data.SSGaussMeanDiff(d=d, my=2.0)
        sample = source.sample(n, seed=seed)
        stacked = sample.stack_xy()

        # Gaussian kernel parameterised by the squared median distance.
        med = util.meddistance(stacked, subsample=1000)
        gauss = kernel.KGauss(med**2)

        for J in (1, 6):
            # Test locations drawn from a Gaussian fitted to the data.
            locs = util.fit_gaussian_draw(stacked, J, seed=seed+1)
            ume_test = tst.UMETest(locs, gauss, n_simulate=2000, alpha=alpha)
            pvalue = ume_test.perform_test(sample)['pvalue']

            self.assertGreaterEqual(pvalue, 0.0)
            # H1 is true, so the test is expected to produce a small p-value.
            self.assertLessEqual(pvalue, 0.1)
def test_basic_H1(self):
    """Basic sanity check of ``tst.UMETest`` under a true alternative.

    Loops over data dimension (1, 4) and number of test locations (1, 6) and
    asserts that the reported p-value lies in [0, 0.1].
    """
    seed = 12
    n = 271
    alpha = 0.01

    for d in (1, 4):
        # Per the original author's note, H1 is true for this sample source.
        ss = data.SSGaussMeanDiff(d=d, my=2.0)
        dat = ss.sample(n, seed=seed)
        pooled = dat.stack_xy()

        # Kernel is given the squared median distance of the pooled sample.
        sq_width = util.meddistance(pooled, subsample=1000)**2
        kern = kernel.KGauss(sq_width)

        for J in (1, 6):
            # Random test locations, seeded differently from the sample.
            test_locs = util.fit_gaussian_draw(pooled, J, seed=seed + 1)
            tester = tst.UMETest(test_locs, kern, n_simulate=2000, alpha=alpha)
            outcome = tester.perform_test(dat)

            self.assertGreaterEqual(outcome['pvalue'], 0.0)
            # Under H1 the test should reject, i.e. give a small p-value.
            self.assertLessEqual(outcome['pvalue'], 0.1)
def job_quad_mmd_2U(sample_source, tr, te, r):
    """Time a quadratic MMD test with a grid-searched Gaussian width.

    The test is constructed with ``use_1sample_U=False`` (the original
    docstring describes this as using two-sample U-statistics for k(X, Y)).

    ``sample_source`` and ``r`` are accepted but not used in this body.
    ``alpha`` is not a parameter; it is resolved from the enclosing scope.

    Returns:
        dict with keys ``'test_method'``, ``'test_result'`` and
        ``'time_secs'``.
    """
    with util.ContextTimer() as timer:
        # Subsample argument of 1000: the pairwise median computation can run
        # out of memory when n is large.
        med = util.meddistance(tr.stack_xy(), 1000)

        # 40 squared widths: med**2 scaled by 2**e, e in [-4, 4], ascending.
        sq_widths = np.sort((med**2) * (2.0**np.linspace(-4, 4, 40)))
        candidates = [kernel.KGauss(sq_w) for sq_w in sq_widths]

        # Grid search on the training set, then test on the held-out set.
        chosen, _ = tst.QuadMMDTest.grid_search_kernel(tr, candidates, alpha)
        mmd_test = tst.QuadMMDTest(
            candidates[chosen], n_permute=1000, alpha=alpha,
            use_1sample_U=False)
        outcome = mmd_test.perform_test(te)

    # Read after the timed block has exited (presumably the timer only
    # finalises `secs` on exit -- confirm against util.ContextTimer).
    return {
        'test_method': mmd_test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
def test(self, X, Y):
    """Run a mean-embedding two-sample test on ``X`` and ``Y``.

    The inputs are passed through ``self.preprocess``; ``self.J`` test
    locations are picked with ``MeanEmbeddingTest.init_locs_subset`` and the
    Gaussian width comes from the median-distance heuristic. The same data
    is used both to pick locations/width and to perform the test.

    Returns:
        The ``'pvalue'`` entry of the result of ``perform_test``.
    """
    XY = self.preprocess(X, Y)
    locations = fot_tst.MeanEmbeddingTest.init_locs_subset(XY, self.J)
    med = fot_util.meddistance(XY.stack_xy(), 1000)
    # The width argument is a squared Gaussian width (every other caller of
    # this API passes med**2), so square the median distance here as well.
    # The previously constructed, never-used KGauss kernel has been dropped.
    ME = fot_tst.MeanEmbeddingTest(locations, med**2, alpha=self.alpha)
    result = ME.perform_test(XY)
    return result['pvalue']
def test_optimize_locs_width(self):
    """Check that ``GaussUMETest.optimize_locs_width`` gives sane output.

    Optimises test locations and Gaussian width on a training split, runs the
    test on the held-out split, and asserts that H0 is rejected and that the
    optimised parameters are real, positive (width) and finite (locations).
    """
    n = 600
    dim = 2
    seed = 17

    # Sample source and train/test split.
    ss = data.SSGaussMeanDiff(dim, my=1.0)
    dim = ss.dim()
    sample = ss.sample(n, seed=seed)
    tr, te = sample.split_tr_te(tr_proportion=0.5, seed=10)
    pooled_tr = tr.stack_xy()

    # Initial test locations: J draws from a Gaussian fitted to the data.
    J = 3
    V0 = util.fit_gaussian_draw(pooled_tr, J, seed=seed + 1)

    # Initial squared width from the median heuristic.
    gwidth0 = util.meddistance(pooled_tr, subsample=1000)**2
    assert gwidth0 > 0

    optimised = tst.GaussUMETest.optimize_locs_width(
        tr, V0, gwidth0, reg=1e-2, max_iter=100, tol_fun=1e-5, disp=False,
        locs_bounds_frac=100, gwidth_lb=None, gwidth_ub=None)
    V_opt, gw2_opt, opt_info = optimised

    # Run the test on the held-out split with the optimised parameters.
    alpha = 0.01
    ume_opt = tst.GaussUMETest(V_opt, gw2_opt, n_simulate=2000, alpha=alpha)
    outcome = ume_opt.perform_test(te)

    assert outcome['h0_rejected']
    assert util.is_real_num(gw2_opt)
    assert gw2_opt > 0
    assert not np.any(np.isnan(V_opt))
    assert not np.any(np.isinf(V_opt))
def test(self, X, Y):
    """Run a quadratic MMD permutation test on ``X`` and ``Y``.

    The inputs are passed through ``self.preprocess`` and a single Gaussian
    kernel is built from the median-distance heuristic. The test uses
    ``self.n_permute`` permutations at level ``self.alpha``.

    Returns:
        The ``'pvalue'`` entry of the result of ``perform_test``.
    """
    XY = self.preprocess(X, Y)
    med = fot_util.meddistance(XY.stack_xy(), 1000)
    # KGauss is parameterised by the squared width (every other KGauss call
    # site in this code base passes a squared value), so square the median
    # distance rather than passing it raw.
    kernel = fot_kernel.KGauss(med**2)
    MMD = fot_tst.QuadMMDTest(kernel, n_permute=self.n_permute,
                              alpha=self.alpha)
    result = MMD.perform_test(XY)
    return result['pvalue']
def job_met_gwgrid(sample_source, tr, te, r, J):
    """Mean-embedding test with fixed locations and a grid-searched width.

    ``J`` test locations are initialised once with ``init_locs_2randn``
    (seeded with ``r + 92856``) and kept fixed; only the squared Gaussian
    width is selected, by grid search on ``tr``. The test is run on ``te``.

    ``sample_source`` is accepted but not used in this body. ``alpha`` is not
    a parameter; it is resolved from the enclosing scope.

    Returns:
        Whatever ``MeanEmbeddingTest.perform_test`` returns for ``te``.
    """
    # Everything up to the final line works on the training set only.
    locs = tst.MeanEmbeddingTest.init_locs_2randn(tr, J, seed=r+92856)
    med = util.meddistance(tr.stack_xy(), 1000)

    # 40 candidate squared widths: med**2 * 2**e for e in [-5, 5].
    exponents = np.linspace(-5, 5, 40)
    sq_widths = np.hstack((med**2) * (2.0**exponents))
    sq_widths.sort()

    chosen, _ = tst.MeanEmbeddingTest.grid_search_gwidth(
        tr, locs, sq_widths, alpha)
    met_grid = tst.MeanEmbeddingTest(locs, sq_widths[chosen], alpha)
    return met_grid.perform_test(te)
def job_met_gwgrid(sample_source, tr, te, r, J):
    """Run ``MeanEmbeddingTest`` after grid-searching only the Gaussian width.

    The ``J`` test locations come from ``init_locs_2randn`` with seed
    ``r + 92856`` and are not optimised. The squared width is picked from a
    40-point grid around the squared median distance using ``tr``; the
    resulting test is applied to ``te``.

    ``sample_source`` is accepted but not used in this body. ``alpha`` is not
    a parameter; it is resolved from the enclosing scope.

    Returns:
        The result of ``perform_test(te)``.
    """
    # Fixed test locations, initialised from the training set.
    test_locs = tst.MeanEmbeddingTest.init_locs_2randn(tr, J, seed=r + 92856)

    # Ascending grid of squared widths centred (in log2 scale) on med**2.
    med = util.meddistance(tr.stack_xy(), 1000)
    grid = np.sort((med**2) * (2.0**np.linspace(-5, 5, 40)))

    best_idx, _ = tst.MeanEmbeddingTest.grid_search_gwidth(
        tr, test_locs, grid, alpha)
    best_width2 = grid[best_idx]

    return tst.MeanEmbeddingTest(test_locs, best_width2, alpha).perform_test(te)
def test(self, X, Y):
    """Run a mean-embedding test with an optimised Gaussian width.

    The preprocessed data is split by ``self.split_ratio``. On the training
    part, ``self.J`` locations are chosen with ``init_locs_subset`` and the
    width is optimised by ``optimize_gwidth`` starting from the squared
    median distance. The test is then performed on the held-out part.

    Returns:
        The ``'pvalue'`` entry of the result of ``perform_test``.
    """
    XY = self.preprocess(X, Y)
    fit_part, eval_part = XY.split_tr_te(tr_proportion=self.split_ratio)

    # Locations and initial width are both derived from the training part.
    locs = fot_tst.MeanEmbeddingTest.init_locs_subset(fit_part, self.J)
    init_width2 = fot_util.meddistance(fit_part.stack_xy(), 1000)**2

    # Second return value of the optimiser is not used here.
    gwidth, _ = fot_tst.MeanEmbeddingTest.optimize_gwidth(
        fit_part, locs, init_width2)

    me_test = fot_tst.MeanEmbeddingTest(locs, gwidth, alpha=self.alpha)
    return me_test.perform_test(eval_part)['pvalue']
def mmd(p, q, alpha=0.05):
    """Linear-time MMD two-sample test between samples ``p`` and ``q``.

    One-dimensional inputs are promoted to single-column 2-D arrays. The data
    is split in half; one half is used to grid-search a Gaussian kernel over
    20 widths derived from the median distance, the other half to run the
    test.

    Args:
        p, q: arrays exposing ``ndim`` and NumPy-style indexing.
        alpha: significance level passed to the grid search and the test.

    Returns:
        Tuple ``(test_stat, pvalue)`` taken from the test result.
    """
    # Promote 1-D samples to shape (n, 1).
    if p.ndim == 1:
        p = p[:, None]
    if q.ndim == 1:
        q = q[:, None]

    fit_half, eval_half = data.TSTData(p, q).split_tr_te(tr_proportion=0.5)

    # Candidate widths are med * 2**e for 20 exponents in [-1, 4];
    # each kernel is given the square of its width.
    med = util.meddistance(fit_half.stack_xy())
    scales = 2.0**np.linspace(-1, 4, 20)
    candidates = [kernel.KGauss((med * s)**2) for s in scales]

    chosen, _ = tst.LinearMMDTest.grid_search_kernel(
        fit_half, candidates, alpha)
    outcome = tst.LinearMMDTest(candidates[chosen], alpha).perform_test(eval_half)
    return outcome['test_stat'], outcome['pvalue']
def job_quad_mmd(sample_source, tr, te, r):
    """Quadratic MMD test with the Gaussian width chosen by grid search.

    Thirty squared widths around the squared median distance are tried on
    ``tr``; the selected kernel is used for a 400-permutation test on ``te``.

    ``sample_source`` and ``r`` are accepted but not used in this body.
    ``alpha`` is not a parameter; it is resolved from the enclosing scope.

    Returns:
        The result of ``QuadMMDTest.perform_test`` on ``te``.
    """
    # Subsample argument of 1000: the pairwise median computation can run
    # out of memory when n is large.
    med = util.meddistance(tr.stack_xy(), 1000)

    # Ascending grid: med**2 * 2**e for 30 exponents in [-4, 4].
    sq_widths = np.sort((med**2) * (2.0**np.linspace(-4, 4, 30)))
    candidates = [kernel.KGauss(sq_w) for sq_w in sq_widths]

    chosen, _ = tst.QuadMMDTest.grid_search_kernel(tr, candidates, alpha)
    mmd_test = tst.QuadMMDTest(candidates[chosen], n_permute=400, alpha=alpha)
    return mmd_test.perform_test(te)
def job_lin_mmd(sample_source, tr, te, r):
    """Linear-time MMD test with the Gaussian width chosen by grid search.

    The median-distance heuristic is computed on at most the first 1000 rows
    of each training sample. Forty widths derived from it are tried on ``tr``
    and the selected kernel is used to test ``te``. No seed is passed anywhere
    in this body.

    ``sample_source`` and ``r`` are accepted but not used in this body.
    ``alpha`` is not a parameter; it is resolved from the enclosing scope.

    Returns:
        The result of ``LinearMMDTest.perform_test`` on ``te``.
    """
    # Use only a prefix of each sample for the median heuristic: pairwise
    # distances on a large n can cause a memory error.
    x_tr, y_tr = tr.xy()
    head = np.vstack((x_tr[:1000, :], y_tr[:1000, :]))
    med = util.meddistance(head)

    # Widths med * 2**e, e in [-1, 4]; kernels get the squared width.
    scales = 2.0**np.linspace(-1, 4, 40)
    candidates = [kernel.KGauss((med * s)**2) for s in scales]

    chosen, _ = tst.LinearMMDTest.grid_search_kernel(tr, candidates, alpha)
    return tst.LinearMMDTest(candidates[chosen], alpha).perform_test(te)
def test_optimize_locs_width(self):
    """Exercise ``GaussUMETest.optimize_locs_width`` and sanity-check it.

    After optimising on the training half, the test built from the optimised
    locations and width must reject H0 on the test half; the width must be a
    positive real number and the locations must contain no NaN or inf.
    """
    n, dim, seed = 600, 2, 17

    # Draw the data and split it into training and test halves.
    ss = data.SSGaussMeanDiff(dim, my=1.0)
    dim = ss.dim()
    dat = ss.sample(n, seed=seed)
    tr, te = dat.split_tr_te(tr_proportion=0.5, seed=10)
    xy_tr = tr.stack_xy()

    # Starting point: 3 locations drawn from a Gaussian fitted to the
    # training data, and the squared median distance as the width.
    J = 3
    V0 = util.fit_gaussian_draw(xy_tr, J, seed=seed+1)
    med = util.meddistance(xy_tr, subsample=1000)
    gwidth0 = med**2
    assert gwidth0 > 0

    V_opt, gw2_opt, opt_info = tst.GaussUMETest.optimize_locs_width(
        tr,
        V0,
        gwidth0,
        reg=1e-2,
        max_iter=100,
        tol_fun=1e-5,
        disp=False,
        locs_bounds_frac=100,
        gwidth_lb=None,
        gwidth_ub=None,
    )

    # Evaluate the optimised parameters on the held-out half.
    alpha = 0.01
    ume_opt = tst.GaussUMETest(V_opt, gw2_opt, n_simulate=2000, alpha=alpha)
    test_result = ume_opt.perform_test(te)

    assert test_result['h0_rejected']
    assert util.is_real_num(gw2_opt)
    assert gw2_opt > 0
    assert np.all(~np.isnan(V_opt))
    assert np.all(~np.isinf(V_opt))
def job_quad_mmd(sample_source, tr, te, r):
    """Time a quadratic MMD test (one-sample U-statistic variant).

    The original author marks this variant as one that should NOT be used
    anymore. The Gaussian width is grid-searched over 40 squared widths on
    ``tr`` and the test is built with ``use_1sample_U=True`` and 1000
    permutations.

    ``sample_source`` and ``r`` are accepted but not used in this body.
    ``alpha`` is not a parameter; it is resolved from the enclosing scope.

    Returns:
        dict with keys ``'test_method'``, ``'test_result'`` and
        ``'time_secs'``.
    """
    with util.ContextTimer() as timer:
        # Subsample argument of 1000: the pairwise median computation can run
        # out of memory when n is large.
        med = util.meddistance(tr.stack_xy(), 1000)

        # Grid of squared widths: med**2 * 2**e for 40 exponents in [-4, 4].
        exponents = np.linspace(-4, 4, 40)
        sq_widths = np.hstack((med**2) * (2.0**exponents))
        sq_widths.sort()
        candidates = [kernel.KGauss(sq_w) for sq_w in sq_widths]

        chosen, _ = tst.QuadMMDTest.grid_search_kernel(tr, candidates, alpha)
        mmd_test = tst.QuadMMDTest(
            candidates[chosen], n_permute=1000, alpha=alpha,
            use_1sample_U=True)
        outcome = mmd_test.perform_test(te)

    # Read after the timed block has exited (presumably the timer only
    # finalises `secs` on exit -- confirm against util.ContextTimer).
    return {
        'test_method': mmd_test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
def test(self, X, Y):
    """Run a quadratic MMD test with a grid-searched Gaussian kernel.

    The preprocessed data is split by ``self.split_ratio``. Twenty squared
    bandwidths around the squared median distance of the training part are
    tried with ``grid_search_kernel``; the selected kernel is used for a
    permutation test (``self.n_permute`` permutations) on the held-out part.

    Returns:
        The ``'pvalue'`` entry of the result of ``perform_test``.
    """
    XY = self.preprocess(X, Y)
    fit_part, eval_part = XY.split_tr_te(tr_proportion=self.split_ratio)

    # Candidate kernels: med**2 * 2**e for 20 exponents in [-4, 4].
    med = fot_util.meddistance(fit_part.stack_xy(), 1000)
    scales = 2.**np.linspace(-4, 4, 20)
    candidates = [fot_kernel.KGauss(sq_w) for sq_w in (med**2) * scales]

    # stdout is redirected to None for the duration of the grid search.
    with contextlib.redirect_stdout(None):
        chosen, _ = fot_tst.QuadMMDTest.grid_search_kernel(
            fit_part, candidates, alpha=self.alpha)

    mmd_test = fot_tst.QuadMMDTest(
        candidates[chosen], n_permute=self.n_permute, alpha=self.alpha)
    return mmd_test.perform_test(eval_part)['pvalue']
def job_met_gwgrid(prob_label, tr, te, r, ni, n):
    """Time a mean-embedding test with fixed locations and grid-searched width.

    Test locations are initialised with ``init_locs_2randn`` (seed
    ``r + 92856``) and kept fixed; only the squared Gaussian width is chosen,
    by grid search on ``tr``. The test is then run on ``te``.

    ``prob_label``, ``ni`` and ``n`` are accepted but not used in this body.
    ``J`` and ``alpha`` are not parameters; they are resolved from the
    enclosing scope.

    Returns:
        dict with keys ``'test_result'`` and ``'time_secs'``. The test object
        itself is not included.
    """
    with util.ContextTimer() as timer:
        # Locations and width are both determined from the training set.
        locs = tst.MeanEmbeddingTest.init_locs_2randn(tr, J, seed=r+92856)
        med = util.meddistance(tr.stack_xy(), 1000)

        # 40 candidate squared widths: med**2 * 2**e for e in [-5, 5].
        exponents = np.linspace(-5, 5, 40)
        sq_widths = np.hstack((med**2) * (2.0**exponents))
        sq_widths.sort()

        chosen, _ = tst.MeanEmbeddingTest.grid_search_gwidth(
            tr, locs, sq_widths, alpha)
        met_grid = tst.MeanEmbeddingTest(locs, sq_widths[chosen], alpha)
        outcome = met_grid.perform_test(te)

    # Read after the timed block has exited (presumably the timer only
    # finalises `secs` on exit -- confirm against util.ContextTimer).
    return {'test_result': outcome, 'time_secs': timer.secs}
def job_met_gwgrid(prob_label, tr, te, r, ni, n):
    """Run and time ``MeanEmbeddingTest`` with only the width grid-searched.

    The test locations come from ``init_locs_2randn`` with seed
    ``r + 92856`` and are not optimised. The squared width is picked from a
    40-point grid around the squared median distance using ``tr``, and the
    resulting test is applied to ``te``.

    ``prob_label``, ``ni`` and ``n`` are accepted but not used in this body.
    ``J`` and ``alpha`` are not parameters; they are resolved from the
    enclosing scope.

    Returns:
        dict with keys ``'test_result'`` and ``'time_secs'``. The test object
        itself is not included.
    """
    with util.ContextTimer() as timer:
        # Fixed test locations, initialised from the training set.
        test_locs = tst.MeanEmbeddingTest.init_locs_2randn(
            tr, J, seed=r + 92856)

        # Ascending grid of squared widths centred (in log2 scale) on med**2.
        med = util.meddistance(tr.stack_xy(), 1000)
        grid = np.sort((med**2) * (2.0**np.linspace(-5, 5, 40)))

        best_idx, _ = tst.MeanEmbeddingTest.grid_search_gwidth(
            tr, test_locs, grid, alpha)
        met_grid = tst.MeanEmbeddingTest(test_locs, grid[best_idx], alpha)
        met_grid_result = met_grid.perform_test(te)

    # Read after the timed block has exited (presumably the timer only
    # finalises `secs` on exit -- confirm against util.ContextTimer).
    return {
        'test_result': met_grid_result,
        'time_secs': timer.secs,
    }