def _run_single_me_test(sample_x, sample_y, alpha, op, test_locs=None,
                        gwidth=None):
    """Run one mean embedding (ME) test on the pair (sample_x, sample_y).

    The two samples are wrapped in a ``TSTData``. When ``test_locs`` is None,
    the test locations and Gaussian width are obtained from
    ``tst.MeanEmbeddingTest.optimize_locs_width``; otherwise the supplied
    ``test_locs`` / ``gwidth`` are used unchanged.

    Returns:
        tuple: ``(test_result, test_locs, gwidth)`` where ``test_result`` is
        whatever ``MeanEmbeddingTest.perform_test`` returns.
    """
    paired = TSTData(sample_x, sample_y)
    # NOTE(review): tr_proportion=1 / tr_proportion=0 presumably make both
    # the optimisation split and the evaluation split contain all rows, i.e.
    # the parameters are tuned on the data they are tested on -- confirm
    # against TSTData.split_tr_te.
    optim_data, _ = paired.split_tr_te(tr_proportion=1, seed=1)
    _, eval_data = paired.split_tr_te(tr_proportion=0, seed=1)

    if test_locs is None:
        test_locs, gwidth, _ = tst.MeanEmbeddingTest.optimize_locs_width(
            optim_data, alpha, **op)

    me_test = tst.MeanEmbeddingTest(test_locs, gwidth, alpha)
    return me_test.perform_test(eval_data), test_locs, gwidth


def perform_me_test(train_miss_impute, test_miss_impute, train_full,
                    test_full, alpha, test_locs_miss=None, gwidth_miss=None,
                    test_locs_full=None, gwidth_full=None):
    """Run the ME test on the imputed pair and on the full pair.

    Args:
        train_miss_impute, test_miss_impute: samples compared in the first
            test (passed as the two samples of a ``TSTData``).
        train_full, test_full: samples compared in the second test.
        alpha: significance level handed to ``MeanEmbeddingTest``.
        test_locs_miss, gwidth_miss: pre-computed test locations / width for
            the first test; optimised when ``test_locs_miss`` is None.
        test_locs_full, gwidth_full: same, for the second test.

    Returns:
        tuple: ``(me_result, test_locs_miss, gwidth_miss, test_locs_full,
        gwidth_full)``. ``me_result`` is a length-2 float array whose entry
        0 / 1 is set to 1 when the first / second test reports
        ``'h0_rejected'``, and is 0 otherwise. The locations and widths are
        the ones actually used, so they can be fed back into a later call.
    """
    op = {
        'n_test_locs': 10,  # number of test locations to optimize
        'max_iter': 200,  # maximum number of gradient ascent iterations
        'locs_step_size': 1.0,  # step size for the test locations (features)
        'gwidth_step_size': 0.1,  # step size for the Gaussian width
        'tol_fun': 1e-4,  # stop if the objective does not increase more than this.
        'seed': 0,  # random seed
    }
    me_result = np.zeros(2)

    miss_outcome, test_locs_miss, gwidth_miss = _run_single_me_test(
        train_miss_impute, test_miss_impute, alpha, op,
        test_locs_miss, gwidth_miss)
    if miss_outcome['h0_rejected']:
        me_result[0] = 1

    full_outcome, test_locs_full, gwidth_full = _run_single_me_test(
        train_full, test_full, alpha, op, test_locs_full, gwidth_full)
    if full_outcome['h0_rejected']:
        me_result[1] = 1

    return me_result, test_locs_miss, gwidth_miss, test_locs_full, gwidth_full
def TST_ME(Fea, N1, alpha, is_train, test_locs, gwidth, J=1, seed=15):
    """Train or evaluate an ME two-sample test on the rows of ``Fea``.

    ``Fea`` is first converted with ``get_item(Fea, is_cuda)``; rows
    ``[0, N1)`` form the first sample and rows ``[N1, end)`` the second.

    Args:
        Fea: feature matrix holding both samples stacked row-wise.
        N1: number of rows belonging to the first sample.
        alpha: significance level of the test.
        is_train: when truthy, optimise the test locations and width;
            otherwise run the test with the supplied ones.
        test_locs, gwidth: test parameters used when ``is_train`` is falsy.
        J: number of test locations to optimise.
        seed: base seed; the optimiser receives ``seed + 5``.

    Returns:
        ``(test_locs, gwidth)`` when training, otherwise ``1`` if the test
        reports ``'h0_rejected'`` and ``0`` if not.
    """
    features = get_item(Fea, is_cuda)
    sample_pair = data.TSTData(features[0:N1, :], features[N1:, :])

    if not is_train:
        me_test = tst.MeanEmbeddingTest(test_locs, gwidth, alpha)
        outcome = me_test.perform_test(sample_pair)
        return 1 if outcome['h0_rejected'] else 0

    optim_options = {
        'n_test_locs': J,  # number of test locations to optimize
        'max_iter': 300,  # maximum number of gradient ascent iterations
        'locs_step_size': 1.0,  # step size for the test locations (features)
        'gwidth_step_size': 0.1,  # step size for the Gaussian width
        'tol_fun': 1e-4,  # stop if the objective does not increase more than this.
        'seed': seed + 5,  # random seed
    }
    test_locs, gwidth, _ = tst.MeanEmbeddingTest.optimize_locs_width(
        sample_pair, alpha, **optim_options)
    return test_locs, gwidth
def _get_metest_opt(self, dat, op=None):
    """Build an optimised ME test of ``self.p`` against ``dat``.

    A sample of the same size as ``dat`` is drawn from the data source of
    ``self.p``. Both the drawn sample and ``dat`` are split with
    ``self.tr_proportion``; the training halves are used to optimise the
    test locations and width, the test halves are returned for evaluation.

    Args:
        dat: observed data; must offer ``sample_size()``, ``split_tr_te()``
            and its parts must offer ``data()``.
        op: keyword options for ``MeanEmbeddingTest.optimize_locs_width``.
            When None a default set derived from ``self.n_locs`` and
            ``self.seed`` is used.

    Returns:
        tuple: ``(metest, tr_tst_data, te_tst_data)``.
    """
    base_seed = self.seed
    if op is None:
        op = {
            'n_test_locs': self.n_locs,
            'seed': base_seed + 5,
            'max_iter': 100,
            'batch_proportion': 1.0,
            'locs_step_size': 1.0,
            'gwidth_step_size': 0.1,
            'tol_fun': 1e-4,
            'reg': 1e-6,
        }
    level = self.alpha
    proportion = self.tr_proportion

    # Draw as many points from p as there are in dat.
    model_sample = self.p.get_datasource().sample(
        dat.sample_size(), seed=base_seed)
    # Different seed offsets are used for the two splits.
    model_tr, model_te = model_sample.split_tr_te(
        tr_proportion=proportion, seed=base_seed + 18)
    obs_tr, obs_te = dat.split_tr_te(
        tr_proportion=proportion, seed=base_seed + 12)

    tr_pair = fdata.TSTData(model_tr.data(), obs_tr.data())
    te_pair = fdata.TSTData(model_te.data(), obs_te.data())

    # Optimise locations and width on the training pair only.
    locs, width2, _ = tst.MeanEmbeddingTest.optimize_locs_width(
        tr_pair, level, **op)
    return tst.MeanEmbeddingTest(locs, width2, level), tr_pair, te_pair
def job_met_gwopt(sample_source, tr, te, r):
    """Retired job: ME test with fixed locations and an optimised width.

    This job is disabled and always raises; callers should use
    ``job_met_gwgrid``. The error is raised before any work is done so that
    no optimisation time is spent on a result that was never returned.

    Args:
        sample_source, tr, te, r: accepted for interface compatibility with
            the other job functions; unused because the job is disabled.

    Raises:
        ValueError: always.
    """
    # Fail fast: previously the optimisation below ran to completion and
    # only then was this error raised.
    raise ValueError('Use job_met_gwgrid instead')

    # Retired implementation, intentionally unreachable; kept for reference.
    op_gwidth = {'max_iter': 200, 'gwidth_step_size': 0.1,
                 'batch_proportion': 1.0, 'tol_fun': 1e-3}
    # optimize on the training set
    T_randn = tst.MeanEmbeddingTest.init_locs_2randn(tr, J, seed=r+92856)
    gwidth, info = tst.MeanEmbeddingTest.optimize_gwidth(
        tr, T_randn, **op_gwidth)
    met_gwopt = tst.MeanEmbeddingTest(T_randn, gwidth, alpha)
    return met_gwopt.perform_test(te)
def job_met_opt(sample_source, tr, te, r):
    """Run the ME test with optimised test locations and width.

    Locations and width are optimised on ``tr`` (using the module-level
    ``J`` and ``alpha``) and the resulting test is applied to ``te``.

    Args:
        sample_source: unused here.
        tr: training data passed to ``optimize_locs_width``.
        te: test data passed to ``perform_test``.
        r: repetition index; the optimiser seed is ``r + 92856``.

    Returns:
        The value returned by ``MeanEmbeddingTest.perform_test(te)``.
    """
    optim_options = {
        'n_test_locs': J,
        'max_iter': 200,
        'locs_step_size': 0.1,
        'gwidth_step_size': 0.1,
        'seed': r + 92856,
        'tol_fun': 1e-3,
    }
    locs, width, _ = tst.MeanEmbeddingTest.optimize_locs_width(
        tr, alpha, **optim_options)
    return tst.MeanEmbeddingTest(locs, width, alpha).perform_test(te)
def test(self, X, Y):
    """Return the p-value of an un-optimised ME test of ``X`` against ``Y``.

    The inputs are combined by ``self.preprocess``; ``self.J`` test locations
    are picked with ``MeanEmbeddingTest.init_locs_subset`` and the kernel
    width is taken from ``fot_util.meddistance`` on the stacked data.

    Args:
        X, Y: the two samples, in whatever form ``self.preprocess`` accepts.

    Returns:
        The ``'pvalue'`` entry of the ``perform_test`` result.
    """
    XY = self.preprocess(X, Y)
    locations = fot_tst.MeanEmbeddingTest.init_locs_subset(XY, self.J)
    med = fot_util.meddistance(XY.stack_xy(), 1000)
    # NOTE(review): other call sites in this code base hand MeanEmbeddingTest
    # a width derived from med**2, while this one passes med itself --
    # confirm which one the constructor expects.
    ME = fot_tst.MeanEmbeddingTest(locations, med, alpha=self.alpha)
    result = ME.perform_test(XY)
    return result['pvalue']
def job_met_opt(sample_source, tr, te, r):
    """Timed ME test with optimised test locations and width.

    Both the optimisation on ``tr`` and the test on ``te`` run inside a
    ``util.ContextTimer``. ``J`` and ``alpha`` are module-level values.

    Args:
        sample_source: unused here.
        tr: training data for ``optimize_locs_width``.
        te: test data for ``perform_test``.
        r: repetition index; the optimiser seed is ``r + 92856``.

    Returns:
        dict: ``'test_method'`` (the fitted ``MeanEmbeddingTest``),
        ``'test_result'`` (output of ``perform_test``) and ``'time_secs'``
        (``secs`` reported by the timer).
    """
    optim_options = {
        'n_test_locs': J,
        'max_iter': 200,
        'locs_step_size': 500.0,
        'gwidth_step_size': 0.2,
        'seed': r + 92856,
        'tol_fun': 1e-4,
    }
    with util.ContextTimer() as timer:
        locs, width, _ = tst.MeanEmbeddingTest.optimize_locs_width(
            tr, alpha, **optim_options)
        fitted_test = tst.MeanEmbeddingTest(locs, width, alpha)
        outcome = fitted_test.perform_test(te)
    return {
        'test_method': fitted_test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
def job_met_gwgrid(sample_source, tr, te, r, J):
    """ME test with fixed locations and a grid-searched Gaussian width.

    Test locations come from ``init_locs_2randn`` on ``tr``. Forty candidate
    widths, ``med**2 * 2**e`` for ``e`` evenly spaced in ``[-5, 5]``, are
    scored by ``grid_search_gwidth`` and the selected one is used for the
    test on ``te``. ``alpha`` is a module-level value.

    Args:
        sample_source: unused here.
        tr: training data used to pick locations and width.
        te: test data passed to ``perform_test``.
        r: repetition index; the location seed is ``r + 92856``.
        J: number of test locations.

    Returns:
        The value returned by ``MeanEmbeddingTest.perform_test(te)``.
    """
    locs = tst.MeanEmbeddingTest.init_locs_2randn(tr, J, seed=r + 92856)

    median_dist = util.meddistance(tr.stack_xy(), 1000)
    scales = 2.0 ** np.linspace(-5, 5, 40)
    candidate_widths = np.hstack(((median_dist ** 2) * scales))
    candidate_widths.sort()

    best_idx, _ = tst.MeanEmbeddingTest.grid_search_gwidth(
        tr, locs, candidate_widths, alpha)
    chosen_width2 = candidate_widths[best_idx]
    return tst.MeanEmbeddingTest(locs, chosen_width2, alpha).perform_test(te)
def test(self, X, Y):
    """Return the p-value of an ME test whose width is tuned on a split.

    The preprocessed data are split with ``self.split_ratio``. On the first
    part, ``self.J`` locations are chosen with ``init_locs_subset`` and the
    width is optimised by ``optimize_gwidth`` starting from the squared
    ``meddistance`` value. The test is then run on the second part.

    Args:
        X, Y: the two samples, in whatever form ``self.preprocess`` accepts.

    Returns:
        The ``'pvalue'`` entry of the ``perform_test`` result.
    """
    paired = self.preprocess(X, Y)
    fit_part, eval_part = paired.split_tr_te(tr_proportion=self.split_ratio)

    locs = fot_tst.MeanEmbeddingTest.init_locs_subset(fit_part, self.J)
    median_dist = fot_util.meddistance(fit_part.stack_xy(), 1000)
    tuned_width, _ = fot_tst.MeanEmbeddingTest.optimize_gwidth(
        fit_part, locs, median_dist ** 2)

    me_test = fot_tst.MeanEmbeddingTest(locs, tuned_width, alpha=self.alpha)
    return me_test.perform_test(eval_part)['pvalue']
def job_met_opt5(sample_source, tr, te, r):
    """Timed ME test with optimised locations (location step size 0.5).

    Optimisation on ``tr`` and the test on ``te`` both run inside a
    ``util.ContextTimer``. ``J`` and ``alpha`` are module-level values.

    Args:
        sample_source: unused here.
        tr: training data for ``optimize_locs_width``.
        te: test data for ``perform_test``.
        r: repetition index; the optimiser seed is ``r + 92856``.

    Returns:
        dict: ``'test_result'`` (output of ``perform_test``) and
        ``'time_secs'`` (``secs`` reported by the timer). The fitted test
        object itself is not included.
    """
    optim_options = {
        'n_test_locs': J,
        'max_iter': 200,
        'locs_step_size': 0.5,
        'gwidth_step_size': 0.1,
        'seed': r + 92856,
        'tol_fun': 1e-3,
    }
    with util.ContextTimer() as timer:
        locs, width, _ = tst.MeanEmbeddingTest.optimize_locs_width(
            tr, alpha, **optim_options)
        fitted_test = tst.MeanEmbeddingTest(locs, width, alpha)
        outcome = fitted_test.perform_test(te)
    return {'test_result': outcome, 'time_secs': timer.secs}
def test(self, X, Y):
    """Return the p-value of an ME test with optimised locations and width.

    The preprocessed data are split with ``self.split_ratio``; locations and
    width are optimised on the first part (with stdout redirected to None
    for the duration of the optimiser call) and the test is run on the
    second part.

    Args:
        X, Y: the two samples, in whatever form ``self.preprocess`` accepts.

    Returns:
        The ``'pvalue'`` entry of the ``perform_test`` result.
    """
    paired = self.preprocess(X, Y)
    fit_part, eval_part = paired.split_tr_te(tr_proportion=self.split_ratio)

    # Silence whatever the optimiser prints while it runs.
    with contextlib.redirect_stdout(None):
        optimised = fot_tst.MeanEmbeddingTest.optimize_locs_width(
            fit_part, self.alpha, n_test_locs=self.J)
    locs, width, _ = optimised

    me_test = fot_tst.MeanEmbeddingTest(locs, width, alpha=self.alpha)
    return me_test.perform_test(eval_part)['pvalue']
def __init__(self, p, gwidth2, test_locs, alpha=0.01, seed=28):
    """
    p: an instance of UnnormalizedDensity
    gwidth2: Gaussian width squared for the Gaussian kernel
    test_locs: J x d numpy array of J locations to test the difference
    alpha: significance level
    seed: stored on the instance as ``self.seed``

    Raises ValueError when ``p.get_datasource()`` returns None.
    """
    super(GaussMETest, self).__init__(p, alpha)
    self.gwidth2 = gwidth2
    self.test_locs = test_locs
    self.seed = seed

    ds = p.get_datasource()
    if ds is None:
        # The class name must be interpolated into the message; passing it
        # as a second argument to ValueError leaves the '%s' unformatted.
        raise ValueError(
            '%s test requires a density p which implements get_datasource()'
            % str(GaussMETest))

    # Construct the ME test
    self.metest = tst.MeanEmbeddingTest(test_locs, gwidth2, alpha=alpha)
def job_met_gwopt(prob_label, tr, te, r, ni, n):
    """Retired job: ME test with fixed locations and an optimised width.

    The function raises immediately; ``job_met_gwgrid`` is the replacement.
    The statements after the ``raise`` are never executed.

    Args:
        prob_label, tr, te, r, ni, n: accepted for interface compatibility
            with the other job functions; never used because of the raise.

    Raises:
        ValueError: always.
    """
    raise ValueError('Use job_met_gwgrid instead')

    # Unreachable: former implementation.
    with util.ContextTimer() as timer:
        width_options = {
            'max_iter': 200,
            'gwidth_step_size': 0.1,
            'batch_proportion': 1.0,
            'tol_fun': 1e-3,
        }
        # optimize on the training set
        locs = tst.MeanEmbeddingTest.init_locs_2randn(tr, J, seed=r + 92856)
        tuned_width, _ = tst.MeanEmbeddingTest.optimize_gwidth(
            tr, locs, **width_options)
        fitted_test = tst.MeanEmbeddingTest(locs, tuned_width, alpha)
    return {
        'test_result': fitted_test.perform_test(te),
        'time_secs': timer.secs,
    }
def job_met_gwgrid(prob_label, tr, te, r, ni, n):
    """Timed ME test with fixed locations and a grid-searched width.

    Inside a ``util.ContextTimer``: locations are drawn with
    ``init_locs_2randn`` on ``tr``; forty candidate widths, ``med**2 * 2**e``
    for ``e`` evenly spaced in ``[-5, 5]``, are scored by
    ``grid_search_gwidth``; the selected width is used for the test on
    ``te``. ``J`` and ``alpha`` are module-level values.

    Args:
        prob_label, ni, n: unused here.
        tr: training data used to pick locations and width.
        te: test data passed to ``perform_test``.
        r: repetition index; the location seed is ``r + 92856``.

    Returns:
        dict: ``'test_result'`` (output of ``perform_test``) and
        ``'time_secs'`` (``secs`` reported by the timer).
    """
    with util.ContextTimer() as timer:
        locs = tst.MeanEmbeddingTest.init_locs_2randn(tr, J, seed=r + 92856)

        median_dist = util.meddistance(tr.stack_xy(), 1000)
        scales = 2.0 ** np.linspace(-5, 5, 40)
        candidate_widths = np.hstack(((median_dist ** 2) * scales))
        candidate_widths.sort()

        best_idx, _ = tst.MeanEmbeddingTest.grid_search_gwidth(
            tr, locs, candidate_widths, alpha)
        chosen_width2 = candidate_widths[best_idx]
        outcome = tst.MeanEmbeddingTest(
            locs, chosen_width2, alpha).perform_test(te)
    return {'test_result': outcome, 'time_secs': timer.secs}
def job_met_opt10(prob_label, tr, te, r, ni, n):
    """Timed ME test with optimised locations (location step size 5.0).

    Optimisation on ``tr`` and the test on ``te`` both run inside a
    ``util.ContextTimer``. ``J`` and ``alpha`` are module-level values.

    Args:
        prob_label, ni, n: unused here.
        tr: training data for ``optimize_locs_width``.
        te: test data for ``perform_test``.
        r: repetition index; the optimiser seed is ``r + 92856``.

    Returns:
        dict: ``'test_result'`` (output of ``perform_test``) and
        ``'time_secs'`` (``secs`` reported by the timer).
    """
    optim_options = {
        'n_test_locs': J,
        'max_iter': 200,
        'locs_step_size': 5.0,
        'gwidth_step_size': 0.2,
        'seed': r + 92856,
        'tol_fun': 1e-3,
    }
    with util.ContextTimer() as timer:
        locs, width, _ = tst.MeanEmbeddingTest.optimize_locs_width(
            tr, alpha, **optim_options)
        fitted_test = tst.MeanEmbeddingTest(locs, width, alpha)
        outcome = fitted_test.perform_test(te)
    return {'test_result': outcome, 'time_secs': timer.secs}
def wtest(p, q, alpha=0.05):
    """Run an optimised ME two-sample test on samples ``p`` and ``q``.

    One-dimensional inputs are turned into single-column 2-D arrays. The
    paired data are split in half; two test locations and the width are
    optimised on one half and the test is run on the other.

    Args:
        p, q: arrays holding the two samples (anything exposing ``ndim`` and
            numpy-style indexing).
        alpha: significance level of the test.

    Returns:
        tuple: ``(test_stat, pvalue)``. When the reported statistic equals
        ``-1`` both entries are returned as ``np.nan`` instead.
    """
    optim_options = {
        'n_test_locs': 2,
        'seed': 0,
        'max_iter': 200,
        'batch_proportion': 1.0,
        'locs_step_size': 1.0,
        'gwidth_step_size': 0.1,
        'tol_fun': 1e-4,
    }

    # Promote flat vectors to column form.
    if p.ndim == 1:
        p = p[:, np.newaxis]
    if q.ndim == 1:
        q = q[:, np.newaxis]

    fit_half, eval_half = data.TSTData(p, q).split_tr_te(tr_proportion=0.5)
    locs, width, _ = tst.MeanEmbeddingTest.optimize_locs_width(
        fit_half, alpha, **optim_options)
    outcome = tst.MeanEmbeddingTest(locs, width, alpha).perform_test(eval_half)

    stat = outcome['test_stat']
    # A statistic of -1 is reported back as NaN for both values.
    if stat == -1:
        return np.nan, np.nan
    return stat, outcome['pvalue']