def perform_mmd_test(
    train_miss_impute,
    test_miss_impute,
    train_full,
    test_full,
    alpha,
    mmd_miss_impute=None,
    mmd_full=None,
):
    """Run a quadratic-time MMD test on the imputed pair and on the full pair.

    Each (train, test) pair is wrapped in a ``TSTData`` object and handed to a
    ``tst.QuadMMDTest``.  When no test object is supplied for a pair, one is
    created with a ``kernel.KGauss`` built from the standard deviation of the
    pairwise distances between the two samples returned by ``TSTData.xy()``.

    Args:
        train_miss_impute, test_miss_impute: the two samples of the imputed
            data set.
        train_full, test_full: the two samples of the full data set.
        alpha: significance level forwarded to newly created tests.
        mmd_miss_impute: existing test object for the imputed pair, or
            ``None`` to create one.
        mmd_full: existing test object for the full pair, or ``None`` to
            create one.

    Returns:
        A tuple ``(mmd_result, mmd_miss_impute, mmd_full)``.  ``mmd_result``
        is a length-2 array of zeros in which entry 0 (imputed pair) and
        entry 1 (full pair) are set to 1 when the corresponding test reports
        ``'h0_rejected'``.  The two test objects are returned so that a
        caller can pass them back in on a later call.
    """
    # --- imputed data -----------------------------------------------------
    imputed_pair = TSTData(train_miss_impute, test_miss_impute)
    if mmd_miss_impute is None:
        print('ini')
        sample_x, sample_y = imputed_pair.xy()
        cross_dists = metrics.pairwise_distances(sample_x, sample_y)
        imputed_kernel = kernel.KGauss(cross_dists.std())
        mmd_miss_impute = tst.QuadMMDTest(imputed_kernel, alpha=alpha)
    imputed_rejected = mmd_miss_impute.perform_test(imputed_pair)['h0_rejected']

    # --- full data --------------------------------------------------------
    full_pair = TSTData(train_full, test_full)
    if mmd_full is None:
        sample_x, sample_y = full_pair.xy()
        cross_dists = metrics.pairwise_distances(sample_x, sample_y)
        full_kernel = kernel.KGauss(cross_dists.std())
        mmd_full = tst.QuadMMDTest(full_kernel, alpha=alpha)
    full_rejected = mmd_full.perform_test(full_pair)['h0_rejected']

    # Index 0 -> imputed pair, index 1 -> full pair.
    mmd_result = np.zeros(2)
    if imputed_rejected:
        mmd_result[0] = 1
    if full_rejected:
        mmd_result[1] = 1
    return mmd_result, mmd_miss_impute, mmd_full
def perform_mmd_test(
    train_miss_impute,
    test_miss_impute,
    train_full,
    test_full,
    alpha,
    mmd_miss_impute=None,
    mmd_full=None,
):
    """Run a quadratic-time MMD test on the imputed pair and on the full pair.

    Each (train, test) pair is wrapped in a ``TSTData`` object.  When the
    matching test object is ``None`` a new ``tst.QuadMMDTest`` is created
    whose ``kernel.KGauss`` is built from ``TSTData.mean_std()`` of that pair.

    Args:
        train_miss_impute, test_miss_impute: the two samples of the imputed
            data set.
        train_full, test_full: the two samples of the full data set.
        alpha: significance level forwarded to newly created tests.
        mmd_miss_impute: existing test object for the imputed pair, or
            ``None`` to create one.
        mmd_full: existing test object for the full pair, or ``None`` to
            create one.

    Returns:
        A tuple ``(mmd_result, mmd_miss_impute, mmd_full)`` where
        ``mmd_result`` is a length-2 array whose entries are 1 for each pair
        (0: imputed, 1: full) whose test reports ``'h0_rejected'`` and 0
        otherwise.
    """
    imputed_data = TSTData(train_miss_impute, test_miss_impute)
    if mmd_miss_impute is None:
        print('ini')
        mmd_miss_impute = tst.QuadMMDTest(
            kernel.KGauss(imputed_data.mean_std()), alpha=alpha
        )
    imputed_outcome = mmd_miss_impute.perform_test(imputed_data)

    full_data = TSTData(train_full, test_full)
    if mmd_full is None:
        mmd_full = tst.QuadMMDTest(
            kernel.KGauss(full_data.mean_std()), alpha=alpha
        )
    full_outcome = mmd_full.perform_test(full_data)

    # Flag each pair for which the null hypothesis was rejected.
    mmd_result = np.zeros(2)
    if imputed_outcome['h0_rejected']:
        mmd_result[0] = 1
    if full_outcome['h0_rejected']:
        mmd_result[1] = 1
    return mmd_result, mmd_miss_impute, mmd_full
def job_quad_mmd_2U(sample_source, tr, te, r):
    """Quadratic MMD test with a grid search over Gaussian widths.

    The test is constructed with ``use_1sample_U=False``, i.e. two-sample
    U statistics are used to compute k(X,Y).

    Candidate kernels are ``kernel.KGauss`` instances for 40 values
    ``med**2 * 2**e`` with ``e`` evenly spaced in [-4, 4], where ``med`` is
    ``util.meddistance`` of the stacked training data.  The kernel picked by
    ``QuadMMDTest.grid_search_kernel`` on ``tr`` is then used to test ``te``
    with 1000 permutations.

    ``sample_source`` and ``r`` are not used in the body.  The significance
    level ``alpha`` is not a parameter; it is read from the enclosing scope.

    Returns:
        dict with keys ``'test_method'`` (the ``QuadMMDTest`` object),
        ``'test_result'`` (its ``perform_test`` output) and ``'time_secs'``
        (elapsed time reported by ``util.ContextTimer``).
    """
    # If n is too large, pairwise median computation can cause a memory error.
    with util.ContextTimer() as timer:
        median_dist = util.meddistance(tr.stack_xy(), 1000)
        scales = 2.0 ** np.linspace(-4, 4, 40)
        squared_widths = np.hstack(((median_dist ** 2) * scales))
        squared_widths.sort()
        candidates = [kernel.KGauss(width2) for width2 in squared_widths]

        # Grid search to choose the best Gaussian width.
        best_index, _powers = tst.QuadMMDTest.grid_search_kernel(
            tr, candidates, alpha
        )

        # Run the actual test with the selected kernel.
        mmd_test = tst.QuadMMDTest(
            candidates[best_index],
            n_permute=1000,
            alpha=alpha,
            use_1sample_U=False,
        )
        test_result = mmd_test.perform_test(te)

    # The elapsed time is read only after the timer context has exited.
    return {
        'test_method': mmd_test,
        'test_result': test_result,
        'time_secs': timer.secs,
    }
def perform_test(
    self,
    dat,
    candidate_kernels=None,
    return_mmdtest=False,
    tr_proportion=0.2,
    reg=1e-3,
):
    """Select a kernel on a training split, then run the quadratic MMD test.

    A sample of the same size as ``dat`` is drawn from the data source of
    ``self.p``.  Both that sample and ``dat`` are split into training and
    test portions; the training portions drive the kernel grid search and
    the test portions are used for the actual test.

    dat: an instance of Data
    candidate_kernels: a list of Kernel's to choose from.  If None, ten
        Gaussian kernels are built from multiples of the squared median
        distance of the training data.
    return_mmdtest: if True, the QuadMMDTest object used is stored in the
        returned results under the key "mmdtest".
    tr_proportion: proportion of sample to be used to choosing the best
        kernel
    reg: regularization parameter for the test power criterion

    Returns the results object produced by ``QuadMMDTest.perform_test``,
    with "time_secs" (and optionally "mmdtest") added.
    """
    with util.ContextTimer() as timer:
        base_seed = self.seed

        # Draw from the model and split; distinct seed offsets are used for
        # sampling and for each of the two splits.
        model_sample = self.p.get_datasource().sample(
            dat.sample_size(), seed=base_seed + 77
        )
        model_tr, model_te = model_sample.split_tr_te(
            tr_proportion=tr_proportion, seed=base_seed + 18
        )
        # obs_tr, obs_te are of type data.Data
        obs_tr, obs_te = dat.split_tr_te(
            tr_proportion=tr_proportion, seed=base_seed + 12
        )

        # Two-sample data sets for kernel selection and for testing.
        train_pair = fdata.TSTData(model_tr.data(), obs_tr.data())
        test_pair = fdata.TSTData(model_te.data(), obs_te.data())

        if candidate_kernels is None:
            # Assume a Gaussian kernel.  Construct a list of kernels to try
            # based on multiples of the median heuristic.
            median_dist = util.meddistance(train_pair.stack_xy(), 1000)
            multipliers = 2.0 ** np.linspace(-4, 4, 10)
            squared_widths = np.hstack(((median_dist ** 2) * multipliers))
            squared_widths.sort()
            candidate_kernels = [kernel.KGauss(w2) for w2 in squared_widths]

        level = self.alpha

        # Grid search to choose the best kernel on the training pair.
        chosen, _powers = tst.QuadMMDTest.grid_search_kernel(
            train_pair, candidate_kernels, level, reg=reg
        )

        # Perform the test on the held-out pair.
        mmdtest = tst.QuadMMDTest(
            candidate_kernels[chosen], self.n_permute, alpha=level
        )
        results = mmdtest.perform_test(test_pair)
        if return_mmdtest:
            results["mmdtest"] = mmdtest

    # Recorded after the timer context closes.
    results["time_secs"] = timer.secs
    return results
def test(self, X, Y):
    """Return the p-value of a quadratic MMD test with a median-heuristic kernel.

    ``X`` and ``Y`` are passed through ``self.preprocess`` to obtain the
    two-sample data object; the Gaussian kernel width is set from the median
    pairwise distance of the stacked samples (``meddistance`` is called with
    1000 as its second argument), and the test is run with
    ``self.n_permute`` permutations at level ``self.alpha``.

    Returns:
        The ``'pvalue'`` entry of the ``QuadMMDTest.perform_test`` result.
    """
    XY = self.preprocess(X, Y)
    med = fot_util.meddistance(XY.stack_xy(), 1000)
    # KGauss is parameterised by the *squared* bandwidth, so the median
    # distance has to be squared for the median heuristic (this also matches
    # the med**2-centred grids used by the grid-search variants).
    gauss_kernel = fot_kernel.KGauss(med ** 2)
    mmd = fot_tst.QuadMMDTest(
        gauss_kernel, n_permute=self.n_permute, alpha=self.alpha
    )
    result = mmd.perform_test(XY)
    return result['pvalue']
def job_quad_mmd(sample_source, tr, te, r):
    """Quadratic MMD test with a grid search over Gaussian widths.

    Thirty ``kernel.KGauss`` candidates are built from ``med**2 * 2**e`` with
    ``e`` evenly spaced in [-4, 4], where ``med`` is ``util.meddistance`` of
    the stacked training data.  The kernel chosen by
    ``QuadMMDTest.grid_search_kernel`` on ``tr`` is used to test ``te`` with
    400 permutations.

    ``sample_source`` and ``r`` are not used in the body.  The significance
    level ``alpha`` is read from the enclosing scope, not passed in.

    Returns:
        The result of ``QuadMMDTest.perform_test`` on ``te``.
    """
    # If n is too large, pairwise median computation can cause a memory error.
    median_dist = util.meddistance(tr.stack_xy(), 1000)
    multipliers = 2.0 ** np.linspace(-4, 4, 30)
    squared_widths = np.hstack(((median_dist ** 2) * multipliers))
    squared_widths.sort()
    candidates = [kernel.KGauss(width2) for width2 in squared_widths]

    # Grid search to choose the best Gaussian width.
    best_index, _powers = tst.QuadMMDTest.grid_search_kernel(
        tr, candidates, alpha
    )

    # Perform the test with the winning kernel.
    mmd_test = tst.QuadMMDTest(candidates[best_index], n_permute=400, alpha=alpha)
    return mmd_test.perform_test(te)
def __init__(self, p, k, n_permute=400, alpha=0.01, seed=28):
    """
    p: an instance of UnnormalizedDensity
    k: an instance of Kernel
    n_permute: number of times to permute the samples to simulate from the
        null distribution (permutation test)
    alpha: significance level
    seed: random seed

    Raises ValueError if p.get_datasource() returns None.
    """
    super(QuadMMDGof, self).__init__(p, alpha)
    # Construct the MMD test
    self.mmdtest = tst.QuadMMDTest(k, n_permute=n_permute, alpha=alpha)
    self.k = k
    self.seed = seed
    ds = p.get_datasource()
    if ds is None:
        # The message must be %-formatted here: passing the class as a second
        # positional argument to ValueError leaves the "%s" placeholder
        # unfilled and turns the exception args into a 2-tuple.
        raise ValueError(
            '%s test requires a density p which implements get_datasource()'
            % str(QuadMMDGof)
        )
def __init__(self, p, k, l, n_permute=400, alpha=0.01, seed=11):
    """Set up an MMD test that compares against samples drawn from ``p``.

    p: model object; it must provide ``get_condsource()`` (returning a
        non-None source), ``dx()`` and ``dy()``.
    k, l: the two kernels combined into a ``ker.KTwoProduct``.
    n_permute: number of permutations passed to ``QuadMMDTest``.
    alpha: significance level.
    seed: random seed stored on the instance.

    Raises ValueError if ``p.get_condsource()`` returns None.
    """
    # NOTE: a logging.warning stating that this test does not accept PyTorch
    # kernels (those starting with prefix PT) used to be emitted here; it is
    # currently disabled.
    import freqopttest.tst as tst

    super(MMDTest, self).__init__(p, alpha)
    self.p = p
    self.k = k
    self.l = l

    cond_source = self.p.get_condsource()
    self.ds_p = cond_source
    if cond_source is None:
        raise ValueError(
            'The test requires that p can be sampled. Must implement p.get_condsource().'
        )

    self.alpha = alpha
    self.seed = seed
    self.n_permute = n_permute

    # Product kernel built from k, l and the dimensions reported by p.
    product_kernel = ker.KTwoProduct(k, l, p.dx(), p.dy())
    self.mmdtest = tst.QuadMMDTest(product_kernel, n_permute, alpha=alpha)
def test(self, X, Y):
    """Return the p-value of a quadratic MMD test with a grid-searched kernel.

    The preprocessed data are split into train and test parts using
    ``self.split_ratio`` as the training proportion.  Twenty Gaussian kernels
    with parameters ``med**2 * 2**e`` (``e`` evenly spaced in [-4, 4], ``med``
    from ``meddistance`` on the stacked training data) are compared by
    ``QuadMMDTest.grid_search_kernel`` on the training part; the selected
    kernel is then used to test the held-out part.

    Returns:
        The ``'pvalue'`` entry of the ``QuadMMDTest.perform_test`` result.
    """
    data_pair = self.preprocess(X, Y)
    train_part, test_part = data_pair.split_tr_te(tr_proportion=self.split_ratio)

    median_dist = fot_util.meddistance(train_part.stack_xy(), 1000)
    exponents = np.linspace(-4, 4, 20)
    squared_widths = (median_dist ** 2) * (2. ** exponents)
    candidates = [fot_kernel.KGauss(w2) for w2 in squared_widths]

    # stdout is redirected to None for the duration of the grid search, so
    # anything it prints is discarded.
    with contextlib.redirect_stdout(None):
        chosen, _powers = fot_tst.QuadMMDTest.grid_search_kernel(
            train_part, candidates, alpha=self.alpha
        )

    mmd = fot_tst.QuadMMDTest(
        candidates[chosen], n_permute=self.n_permute, alpha=self.alpha
    )
    return mmd.perform_test(test_part)['pvalue']