def perform_mmd_test(train_miss_impute, test_miss_impute, train_full,
                     test_full, alpha, mmd_miss_impute=None, mmd_full=None):
    """Run a QuadMMDTest on the imputed pair and on the full pair of samples.

    Each (train, test) pair is wrapped in a ``TSTData`` object. If no test
    object is supplied for a pair, one is created with a ``kernel.KGauss``
    whose argument is the standard deviation of
    ``metrics.pairwise_distances`` between the two samples of that pair.

    Args:
        train_miss_impute, test_miss_impute: samples of the imputed pair.
        train_full, test_full: samples of the full pair.
        alpha: forwarded as ``alpha`` to any ``tst.QuadMMDTest`` built here.
        mmd_miss_impute: optional existing test object for the imputed pair.
        mmd_full: optional existing test object for the full pair.

    Returns:
        ``(mmd_result, mmd_miss_impute, mmd_full)`` where ``mmd_result`` is a
        length-2 array holding 1 where the corresponding test reported
        ``h0_rejected`` (index 0: imputed pair, index 1: full pair) and 0
        otherwise, followed by the two test objects actually used.
    """
    mmd_result = np.zeros(2)

    # --- imputed pair ---
    imputed_pair = TSTData(train_miss_impute, test_miss_impute)
    if mmd_miss_impute is None:
        print('ini')
        sample_a, sample_b = imputed_pair.xy()
        spread = metrics.pairwise_distances(sample_a, sample_b).std()
        mmd_miss_impute = tst.QuadMMDTest(kernel.KGauss(spread), alpha=alpha)
    outcome = mmd_miss_impute.perform_test(imputed_pair)
    if outcome['h0_rejected']:
        mmd_result[0] = 1

    # --- full pair ---
    full_pair = TSTData(train_full, test_full)
    if mmd_full is None:
        sample_a, sample_b = full_pair.xy()
        spread = metrics.pairwise_distances(sample_a, sample_b).std()
        mmd_full = tst.QuadMMDTest(kernel.KGauss(spread), alpha=alpha)
    outcome = mmd_full.perform_test(full_pair)
    if outcome['h0_rejected']:
        mmd_result[1] = 1

    return mmd_result, mmd_miss_impute, mmd_full
def perform_mmd_test(train_miss_impute, test_miss_impute, train_full,
                     test_full, alpha, mmd_miss_impute=None, mmd_full=None):
    """Run a QuadMMDTest on the imputed pair and on the full pair of samples.

    Each (train, test) pair is wrapped in a ``TSTData`` object. If no test
    object is supplied for a pair, one is created with a ``kernel.KGauss``
    parameterised by that pair's ``TSTData.mean_std()`` value.

    Args:
        train_miss_impute, test_miss_impute: samples of the imputed pair.
        train_full, test_full: samples of the full pair.
        alpha: forwarded as ``alpha`` to any ``tst.QuadMMDTest`` built here.
        mmd_miss_impute: optional existing test object for the imputed pair.
        mmd_full: optional existing test object for the full pair.

    Returns:
        ``(mmd_result, mmd_miss_impute, mmd_full)`` where ``mmd_result`` is a
        length-2 array holding 1 where the corresponding test reported
        ``h0_rejected`` (index 0: imputed pair, index 1: full pair) and 0
        otherwise, followed by the two test objects actually used.
    """
    mmd_result = np.zeros(2)

    # --- imputed pair ---
    imputed_pair = TSTData(train_miss_impute, test_miss_impute)
    if mmd_miss_impute is None:
        print('ini')
        mmd_miss_impute = tst.QuadMMDTest(
            kernel.KGauss(imputed_pair.mean_std()), alpha=alpha)
    outcome = mmd_miss_impute.perform_test(imputed_pair)
    if outcome['h0_rejected']:
        mmd_result[0] = 1

    # --- full pair ---
    full_pair = TSTData(train_full, test_full)
    if mmd_full is None:
        mmd_full = tst.QuadMMDTest(
            kernel.KGauss(full_pair.mean_std()), alpha=alpha)
    outcome = mmd_full.perform_test(full_pair)
    if outcome['h0_rejected']:
        mmd_result[1] = 1

    return mmd_result, mmd_miss_impute, mmd_full
def job_lin_mmd(sample_source, tr, te, r):
    """Linear MMD test; the Gaussian width is picked by grid search on `tr`.

    `sample_source` and `r` are accepted but not used in this body.
    `alpha` is not a parameter: it is read from the enclosing scope.

    Returns a dict with the test object ('test_method'), the output of
    `perform_test(te)` ('test_result') and the timer's `secs` value
    ('time_secs').
    """
    # should be completely deterministic
    with util.ContextTimer() as timer:
        # If n is too large, the pairwise median computation can cause a
        # memory error, so at most the first 1000 rows of each sample are used.
        sample_x, sample_y = tr.xy()
        head_x = sample_x[:min(sample_x.shape[0], 1000), :]
        head_y = sample_y[:min(sample_y.shape[0], 1000), :]
        median_dist = util.meddistance(np.vstack((head_x, head_y)))

        # candidate kernels: (median * factor)**2 for 40 factors in 2**[-1, 4]
        factors = 2.0**np.linspace(-1, 4, 40)
        candidates = [kernel.KGauss((median_dist * f)**2) for f in factors]

        # grid search to choose the best Gaussian width
        best_index, _ = tst.LinearMMDTest.grid_search_kernel(
            tr, candidates, alpha)

        # perform the test with the selected kernel
        lin_mmd_test = tst.LinearMMDTest(candidates[best_index], alpha)
        test_result = lin_mmd_test.perform_test(te)

    return {
        'test_method': lin_mmd_test,
        'test_result': test_result,
        'time_secs': timer.secs,
    }
def job_quad_mmd_2U(sample_source, tr, te, r):
    """Quadratic MMD test; the Gaussian width is picked by grid search on `tr`.

    The final test object is created with `use_1sample_U=False`
    (two-sample U statistics to compute k(X,Y)) and `n_permute=1000`.

    `sample_source` and `r` are accepted but not used in this body.
    `alpha` is not a parameter: it is read from the enclosing scope.

    Returns a dict with the test object ('test_method'), the output of
    `perform_test(te)` ('test_result') and the timer's `secs` value
    ('time_secs').
    """
    with util.ContextTimer() as timer:
        # If n is too large, the pairwise median computation can cause a
        # memory error; 1000 is passed as meddistance's second argument.
        median_dist = util.meddistance(tr.stack_xy(), 1000)

        # 40 candidate squared widths: median**2 scaled by 2**[-4, 4], ascending
        squared_widths = np.sort(
            (median_dist**2) * (2.0**np.linspace(-4, 4, 40)))
        candidates = [kernel.KGauss(w2) for w2 in squared_widths]

        # grid search to choose the best Gaussian width
        best_index, _ = tst.QuadMMDTest.grid_search_kernel(
            tr, candidates, alpha)

        # perform the test with the selected kernel
        mmd_test = tst.QuadMMDTest(
            candidates[best_index],
            n_permute=1000,
            alpha=alpha,
            use_1sample_U=False,
        )
        test_result = mmd_test.perform_test(te)

    return {
        'test_method': mmd_test,
        'test_result': test_result,
        'time_secs': timer.secs,
    }
def test_basic_H1(self):
    """Sanity check of UMETest on data where H1 holds.

    Samples come from `data.SSGaussMeanDiff(d, my=2.0)`; for each
    combination of dimension and number of test locations the p-value
    must lie in [0, 0.1].
    """
    base_seed = 12
    sample_size = 271
    level = 0.01

    for dim in (1, 4):
        # h1 is true
        source = data.SSGaussMeanDiff(d=dim, my=2.0)
        sample = source.sample(sample_size, seed=base_seed)
        stacked = sample.stack_xy()

        # Gaussian kernel parameterised by the squared meddistance value
        squared_width = util.meddistance(stacked, subsample=1000)**2
        gauss_kernel = kernel.KGauss(squared_width)

        for n_locs in (1, 6):
            # random test locations
            locations = util.fit_gaussian_draw(
                stacked, n_locs, seed=base_seed + 1)
            ume_test = tst.UMETest(
                locations, gauss_kernel, n_simulate=2000, alpha=level)
            pvalue = ume_test.perform_test(sample)['pvalue']

            self.assertGreaterEqual(pvalue, 0.0)
            # H1 is true. Should reject with a small p-value
            self.assertLessEqual(pvalue, 0.1)
def test(self, X, Y):
    """Return the p-value of a MeanEmbeddingTest run on X and Y.

    The inputs are combined by `self.preprocess`; `self.J` is passed to
    `MeanEmbeddingTest.init_locs_subset` to obtain the test locations and
    `self.alpha` is forwarded to the test as `alpha`.

    Returns:
        The 'pvalue' entry of the dict returned by `perform_test`.
    """
    paired = self.preprocess(X, Y)
    test_locs = fot_tst.MeanEmbeddingTest.init_locs_subset(paired, self.J)
    # 1000 is forwarded as the second argument of meddistance
    width = fot_util.meddistance(paired.stack_xy(), 1000)
    # MeanEmbeddingTest receives the width value directly, so no separate
    # KGauss kernel object is constructed here.
    # NOTE(review): the meddistance value is passed unsquared -- confirm
    # whether MeanEmbeddingTest expects a squared Gaussian width instead.
    me_test = fot_tst.MeanEmbeddingTest(test_locs, width, alpha=self.alpha)
    outcome = me_test.perform_test(paired)
    return outcome['pvalue']
def test(self, X, Y):
    """Return the p-value of a QuadMMDTest run on X and Y.

    The inputs are combined by `self.preprocess`. The test uses a single
    `KGauss` kernel parameterised by the `meddistance` value of the stacked
    samples, with `self.n_permute` and `self.alpha` forwarded to the test.

    Returns:
        The 'pvalue' entry of the dict returned by `perform_test`.
    """
    paired = self.preprocess(X, Y)
    # 1000 is forwarded as the second argument of meddistance
    width = fot_util.meddistance(paired.stack_xy(), 1000)
    mmd_test = fot_tst.QuadMMDTest(
        fot_kernel.KGauss(width),
        n_permute=self.n_permute,
        alpha=self.alpha,
    )
    return mmd_test.perform_test(paired)['pvalue']
def mmd(p, q, alpha=0.05):
    """Linear MMD two-sample test between samples `p` and `q`.

    The data is split in half: the training half is used to grid-search a
    Gaussian kernel among 20 candidates, the other half is used for the
    actual test.

    Args:
        p, q: arrays exposing `ndim`; 1-D inputs are turned into a single
            column via `[:, np.newaxis]`.
        alpha: passed to the kernel grid search and to the test.

    Returns:
        `(test_stat, pvalue)` taken from the test result dict.
    """
    # promote 1-D samples to single-column 2-D arrays
    p = p[:, np.newaxis] if p.ndim == 1 else p
    q = q[:, np.newaxis] if q.ndim == 1 else q

    train_part, test_part = data.TSTData(p, q).split_tr_te(tr_proportion=0.5)

    # candidate kernels: (median * factor)**2 for 20 factors in 2**[-1, 4]
    median_dist = util.meddistance(train_part.stack_xy())
    factors = 2.0**np.linspace(-1, 4, 20)
    candidates = [kernel.KGauss((median_dist * f)**2) for f in factors]

    best_index, _ = tst.LinearMMDTest.grid_search_kernel(
        train_part, candidates, alpha)

    lin_mmd_test = tst.LinearMMDTest(candidates[best_index], alpha)
    outcome = lin_mmd_test.perform_test(test_part)
    return outcome['test_stat'], outcome['pvalue']
def job_quad_mmd(sample_source, tr, te, r):
    """Quadratic MMD test; the Gaussian width is picked by grid search on `tr`.

    `sample_source` and `r` are accepted but not used in this body.
    `alpha` is not a parameter: it is read from the enclosing scope.

    Returns whatever `QuadMMDTest.perform_test(te)` returns.
    """
    # If n is too large, the pairwise median computation can cause a memory
    # error; 1000 is passed as meddistance's second argument.
    median_dist = util.meddistance(tr.stack_xy(), 1000)

    # 30 candidate squared widths: median**2 scaled by 2**[-4, 4], ascending
    squared_widths = np.sort((median_dist**2) * (2.0**np.linspace(-4, 4, 30)))
    candidates = [kernel.KGauss(w2) for w2 in squared_widths]

    # grid search to choose the best Gaussian width
    best_index, _ = tst.QuadMMDTest.grid_search_kernel(tr, candidates, alpha)

    # perform the test with the selected kernel
    mmd_test = tst.QuadMMDTest(
        candidates[best_index], n_permute=400, alpha=alpha)
    return mmd_test.perform_test(te)
def test(self, X, Y):
    """Return the p-value of a QuadMMDTest with a grid-searched kernel.

    The inputs are combined by `self.preprocess` and split with
    `tr_proportion=self.split_ratio`. The training part selects one of 20
    `KGauss` kernels; the remaining part is used for the test itself, with
    `self.n_permute` and `self.alpha` forwarded to the test.

    Returns:
        The 'pvalue' entry of the dict returned by `perform_test`.
    """
    paired = self.preprocess(X, Y)
    train_part, test_part = paired.split_tr_te(tr_proportion=self.split_ratio)

    # candidate squared widths: median**2 scaled by 2**[-4, 4]
    median_dist = fot_util.meddistance(train_part.stack_xy(), 1000)
    squared_widths = (median_dist**2) * (2.**np.linspace(-4, 4, 20))
    candidates = [fot_kernel.KGauss(w2) for w2 in squared_widths]

    # stdout is redirected to None for the duration of the grid search
    with contextlib.redirect_stdout(None):
        best_index, _ = fot_tst.QuadMMDTest.grid_search_kernel(
            train_part, candidates, alpha=self.alpha)

    mmd_test = fot_tst.QuadMMDTest(
        candidates[best_index],
        n_permute=self.n_permute,
        alpha=self.alpha,
    )
    return mmd_test.perform_test(test_part)['pvalue']
def test_perform_test(self):
    """METest.perform_test yields a non-negative p-value and statistic.

    Checked for three seeds, once with a single test location and once
    with two test locations, on `data.SSGaussMeanDiff` samples.
    """
    sample_size = 200  # full sample size
    mean_shift = 0.1
    dim = 3
    source = data.SSGaussMeanDiff(dim, my=mean_shift)

    for seed in (2, 8, 9):
        with util.NumpySeedContext(seed=seed):
            sample = source.sample(sample_size, seed=seed)
            locs = np.random.randn(2, dim)
            gauss_kernel = kernel.KGauss(1)

            # first with only the first location, then with both locations
            for test_locs in (locs[[0], :], locs):
                me_test = tst.METest(test_locs, gauss_kernel, alpha=0.01)
                outcome = me_test.perform_test(sample)
                self.assertGreaterEqual(outcome['pvalue'], 0)
                self.assertGreaterEqual(outcome['test_stat'], 0)