def compute(self):
    """
    Run the method stored in this job once, submit the outcome to the
    job's aggregator, and save the raw result to disk.

    Reads self.prob_label, self.rep, self.n, self.met_func and
    self.aggregator. The saved file name also depends on the module-level
    `alpha`, and the save call on the module-level `ex`.

    Returns nothing; the result leaves through the aggregator and through
    glo.ex_save_result().
    """
    # from prob_label, get p, rx, cs. The first returned item is not
    # needed here, so it is discarded.
    _, p, rx, cs = get_ns_model_source(self.prob_label)
    r = self.rep
    n = self.n
    met_func = self.met_func
    prob_label = self.prob_label
    func_name = met_func.__name__

    # Keep the message in a single literal: a backslash continuation inside
    # the string would leak stray characters into the log output.
    logger.info("computing. %s. prob=%s, r=%d, n=%d"
                % (func_name, prob_label, r, n))

    with util.ContextTimer() as t:
        job_result = met_func(p, rx, cs, n, r)

        # wrap the raw result and submit it to my own aggregator
        result = SingleResult(job_result)
        self.aggregator.submit_result(result)

    logger.info("done. ex1: %s, prob=%s, r=%d, n=%d. Took: %.3g s "
                % (func_name, prob_label, r, n, t.secs))

    # save result. The file name encodes problem, method, n, r and alpha.
    fname = '%s-%s-n%d_r%d_a%.3f.p' % (prob_label, func_name, n, r, alpha)
    glo.ex_save_result(ex, job_result, prob_label, fname)
def met_gkcsd_med(p, rx, cond_source, n, r):
    """
    KCSD test using a Gaussian kernel on X and a Gaussian kernel on Y.

    Naming: "g" = Gaussian kernel, "med" = bandwidths chosen with the median
    heuristic, computed separately on X and on Y.

    p: model given to cgof.KCSDTest
    rx, cond_source, n: forwarded to sample_xy() to draw the data (X, Y)
    r: repetition index. Forwarded to sample_xy() and used to offset
        every seed in this function.

    Return a dictionary with keys 'test_result' (output of perform_test)
    and 'time_secs' (time spent in the timed block; sampling is excluded).
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    with util.ContextTimer() as timer:
        # one bandwidth per variable, each from the median heuristic
        med_x = util.pt_meddistance(X, subsample=600, seed=r + 3)
        med_y = util.pt_meddistance(Y, subsample=600, seed=r + 38)

        kernel_x = ker.PTKGauss(sigma2=med_x**2)
        kernel_y = ker.PTKGauss(sigma2=med_y**2)

        tester = cgof.KCSDTest(
            p, kernel_x, kernel_y, alpha=alpha, n_bootstrap=400, seed=r + 88
        )
        outcome = tester.perform_test(X, Y)

    return {'test_result': outcome, 'time_secs': timer.secs}
def perform_test(self, X, Y):
    """
    Two-sample MMD test on split data.

    The data are split in two halves. The first half (X1, Y1) is used as
    the "real" joint sample. For the second half, only X2 is kept and the
    paired outputs are re-drawn from self.ds_p, giving the "model" joint
    sample. The two joint samples are then compared by self.mmdtest.

    X, Y: torch tensors. They are concatenated along dim=1, so both are
        expected to be 2-dimensional with the same number of rows.

    Return the dictionary produced by self.mmdtest.perform_test(), with
    an extra key 'time_secs'.
    """
    import freqopttest.data as fdata

    model_source = self.ds_p
    two_sample_test = self.mmdtest
    base_seed = self.seed

    with util.ContextTimer() as timer:
        # the observed outputs of the second half are not used
        X1, Y1, X2, _ = MMDSplitTest._split_half(X, Y, seed=self.seed + 330)

        # outputs for the second half come from the model instead
        Y2_model = model_source.cond_pair_sample(X2, seed=base_seed + 13)

        real_joint = torch.cat([X1, Y1], dim=1).numpy()
        model_joint = torch.cat([X2, Y2_model], dim=1).numpy()

        # wrap both samples and hand them to the two-sample test
        results = two_sample_test.perform_test(
            fdata.TSTData(real_joint, model_joint)
        )

    results['time_secs'] = timer.secs
    return results
def met_gmmd_med(p, rx, cond_source, n, r):
    """
    Naive MMD baseline. The conditional model p is used to generate a new
    joint sample, which is compared with the observed joint sample by a
    two-sample MMD test (cgof.MMDTest; requires the freqopttest package).

    Gaussian kernels are used on both X and Y, with bandwidths from the
    median heuristic.

    rx, cond_source, n: forwarded to sample_xy() to draw the data (X, Y)
    r: repetition index. Forwarded to sample_xy() and used to offset
        every seed in this function.

    Return a dictionary with keys 'test_result' and 'time_secs'.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    with util.ContextTimer() as timer:
        med_x = util.pt_meddistance(X, subsample=600, seed=r + 3)
        med_y = util.pt_meddistance(Y, subsample=600, seed=r + 38)

        # kgof kernels here (not the torch ones): the MMD test needs
        # kernels that operate on numpy arrays
        kernel_x = kgof.kernel.KGauss(sigma2=med_x**2)
        kernel_y = kgof.kernel.KGauss(sigma2=med_y**2)

        tester = cgof.MMDTest(
            p, kernel_x, kernel_y, n_permute=400, alpha=alpha, seed=r + 37
        )
        outcome = tester.perform_test(X, Y)

    return {'test_result': outcome, 'time_secs': timer.secs}
def perform_test(self, X, Y):
    """
    Perform the test with a bootstrap approximation of the null
    distribution.

    In each of self.n_bootstrap rounds, n rows of X are drawn with
    replacement, matching outputs are sampled from the conditional source
    of self.p, and the statistic sum((Hn - Hn0)**2) is recomputed. The
    p-value is the fraction of bootstrapped statistics strictly larger
    than the observed one.

    X, Y: torch tensors; n is taken to be X.shape[0].

    Return a dictionary with keys 'alpha', 'pvalue', 'test_stat',
    'h0_rejected', 'n_simulate' and 'time_secs'.
    """
    with util.ContextTimer() as timer:
        level = self.alpha
        n_rounds = self.n_bootstrap
        n = X.shape[0]
        cond_src = self.p.get_condsource()
        observed = self.compute_stat(X, Y)

        boot_stats = torch.zeros(n_rounds)
        with torch.no_grad(), util.TorchSeedContext(seed=self.seed):
            for b in range(n_rounds):
                # resample the inputs, then draw outputs from the model
                picked = torch.randint(0, n, [n])
                X_b = X[picked]
                Y_b = cond_src.cond_pair_sample(X_b, self.seed + b)

                H_b = CramerVonMisesTest.Hn(X_b, Y_b, X, Y)
                H0_b = self.Hn0(X_b, Y_b, X, Y)
                boot_stats[b] = torch.sum((H_b - H0_b)**2)

        # share of simulated statistics exceeding the observed statistic
        exceeds = boot_stats > observed
        pvalue = exceeds.type(torch.float).mean().item()

    return {
        'alpha': self.alpha,
        'pvalue': pvalue,
        'test_stat': observed.item(),
        'h0_rejected': pvalue < level,
        'n_simulate': n_rounds,
        'time_secs': timer.secs,
    }
def perform_test(self, X, Y):
    """
    Perform the goodness-of-fit test. The p-value is the upper tail
    probability of the statistic under a standard normal distribution.

    X: Torch tensor of size n x dx
    Y: Torch tensor of size n x dy

    Return a dictionary of the form
        {
            alpha: 0.01,
            pvalue: 0.0002,
            test_stat: 2.3,
            h0_rejected: True,
            time_secs: ...
        }
    """
    with util.ContextTimer() as timer:
        level = self.alpha
        statistic = self.compute_stat(X, Y)
        # P(Z > statistic) for Z ~ N(0, 1)
        std_normal = dists.Normal(0, 1)
        upper_tail = 1 - std_normal.cdf(statistic)
        pvalue = upper_tail.item()

    return {
        'alpha': self.alpha,
        'pvalue': pvalue,
        'test_stat': statistic.item(),
        'h0_rejected': pvalue < level,
        'time_secs': timer.secs,
    }
def met_gmmd_split_med(p, rx, cond_source, n, r):
    """
    Variant of met_gmmd_med that uses cgof.MMDSplitTest: the data are split
    so that the two samples being compared are independent. The effective
    sample size is therefore n/2.

    Gaussian kernels on X and Y, bandwidths from the median heuristic.

    rx, cond_source, n: forwarded to sample_xy() to draw the data (X, Y)
    r: repetition index. Forwarded to sample_xy() and used to offset
        every seed in this function.

    Return a dictionary with keys 'test_result' and 'time_secs'.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    with util.ContextTimer() as timer:
        med_x = util.pt_meddistance(X, subsample=600, seed=r + 4)
        med_y = util.pt_meddistance(Y, subsample=600, seed=r + 39)

        # kgof kernels: the MMD test needs kernels that operate on numpy
        # arrays
        kernel_x = kgof.kernel.KGauss(sigma2=med_x**2)
        kernel_y = kgof.kernel.KGauss(sigma2=med_y**2)

        # requires the freqopttest package
        tester = cgof.MMDSplitTest(
            p, kernel_x, kernel_y, n_permute=400, alpha=alpha, seed=r + 47
        )
        outcome = tester.perform_test(X, Y)

    return {'test_result': outcome, 'time_secs': timer.secs}
def perform_test(self, X, Y):
    """
    Perform the test on the data (X, Y).

    The statistic from self.compute_stat(X, Y) is compared against a
    standard normal distribution: the p-value is 1 - cdf(statistic).

    Return a dictionary with keys 'alpha', 'pvalue', 'test_stat',
    'h0_rejected' and 'time_secs'.
    """
    with util.ContextTimer() as timer:
        level = self.alpha
        statistic = self.compute_stat(X, Y)
        null_dist = dists.Normal(0, 1)
        pvalue = (1 - null_dist.cdf(statistic)).item()

    outcome = {
        'alpha': self.alpha,
        'pvalue': pvalue,
        'test_stat': statistic.item(),
        'h0_rejected': pvalue < level,
        'time_secs': timer.secs,
    }
    return outcome
def met_zheng_cdf(p, rx, cond_source, n, r):
    """
    Run cgof.ZhengCDFTest on data drawn with sample_xy().

    p: model given to the test
    rx, cond_source, n, r: forwarded to sample_xy() to draw the data (X, Y)

    Return a dictionary with keys 'test_result' and 'time_secs'. Only the
    construction and execution of the test are timed, not the sampling.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    with util.ContextTimer() as timer:
        tester = cgof.ZhengCDFTest(p, alpha)
        outcome = tester.perform_test(X, Y)

    return {'test_result': outcome, 'time_secs': timer.secs}
def perform_test(self, X, Y, return_simulated_stats=False, return_ustat_gram=False):
    """
    Perform the test, approximating the null distribution by a
    multinomial-weighted bootstrap on the matrix H returned by
    compute_stat(..., return_ustat_gram=True).

    X,Y: torch tensors. n is taken to be X.shape[0].
    return_simulated_stats: If True, also include the bootstrapped
        statistics (as a numpy array) under the key 'sim_stats'.
    return_ustat_gram: If True, also include H under the key 'H'.

    Return a dictionary with keys 'alpha', 'pvalue', 'test_stat',
    'h0_rejected', 'n_simulate', 'time_secs', plus the optional keys above.
    The p-value is the fraction of bootstrapped statistics strictly larger
    than the observed statistic.
    """
    with util.ContextTimer() as timer:
        level = self.alpha
        n_rounds = self.n_bootstrap
        n = X.shape[0]

        test_stat, H = self.compute_stat(X, Y, return_ustat_gram=True)

        boot_stats = torch.zeros(n_rounds)
        # n draws over n equally likely categories
        resampler = dists.multinomial.Multinomial(
            total_count=n, probs=torch.ones(n) / n
        )
        with torch.no_grad(), util.TorchSeedContext(seed=self.seed):
            H_diag = torch.diag(H)
            for b in range(n_rounds):
                counts = resampler.sample()
                # centred, scaled weights
                w = (counts - 1.0) / n
                # quadratic form in w with the diagonal terms removed
                boot_stats[b] = n * (H.matmul(w).dot(w) - H_diag.dot(w**2))

        exceeds = boot_stats > test_stat
        pvalue = exceeds.type(torch.float).mean().item()

    results = {
        'alpha': self.alpha,
        'pvalue': pvalue,
        'test_stat': test_stat.item(),
        'h0_rejected': pvalue < level,
        'n_simulate': n_rounds,
        'time_secs': timer.secs,
    }
    if return_simulated_stats:
        results['sim_stats'] = boot_stats.detach().numpy()
    if return_ustat_gram:
        results['H'] = H
    return results
def met_zhengkl_gh(p, rx, cond_source, n, r):
    """
    Zheng 2000 test, Gauss-Hermite quadrature version
    (cgof.ZhengKLTestGaussHerm).

    The `rate` argument of the test is set to 4/5 of the total dimension
    cond_source.dx() + cond_source.dy().

    rx, cond_source, n, r: forwarded to sample_xy() to draw the data (X, Y)

    Return a dictionary with keys 'test_result' and 'time_secs'.
    """
    X, Y = sample_xy(rx, cond_source, n, r)
    joint_dim = cond_source.dx() + cond_source.dy()
    rate = joint_dim * 4. / 5

    with util.ContextTimer() as timer:
        tester = cgof.ZhengKLTestGaussHerm(p, alpha, rate=rate)
        outcome = tester.perform_test(X, Y)

    return {'test_result': outcome, 'time_secs': timer.secs}
def met_zhengkl_mc(p, rx, cond_source, n, r):
    """
    Zheng 2000 test, Monte Carlo integration version (cgof.ZhengKLTestMC),
    run with 10000 Monte Carlo particles.

    rx, cond_source, n, r: forwarded to sample_xy() to draw the data (X, Y)

    Return a dictionary with keys 'test_result' and 'time_secs'.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    with util.ContextTimer() as timer:
        tester = cgof.ZhengKLTestMC(p, alpha, n_mc=10000)
        outcome = tester.perform_test(X, Y)

    return {'test_result': outcome, 'time_secs': timer.secs}
def met_gfscd_J1_rand(p, rx, cond_source, n, r, J=1):
    """
    FSCD test with Gaussian kernels on both X and Y and random test
    locations.

    * J test locations are used (J=1 by default).
    * The locations are drawn from a Gaussian fitted to the X part of a
      training split (tr_proportion=0.3) of the data.
    * Kernel bandwidths are set by the median heuristic on the full X, Y.
    * The test itself is run on the full sample.

    rx, cond_source, n: forwarded to sample_xy() to draw the data (X, Y)
    r: repetition index. Forwarded to sample_xy() and used to offset
        every seed in this function.

    Return a dictionary with keys 'test_result' and 'time_secs'.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    with util.ContextTimer() as timer:
        # only the inputs of the training part are needed
        train_part, _ = cdat.CondData(X, Y).split_tr_te(tr_proportion=0.3)
        Xtr, _ = train_part.xy()

        # test locations: J draws from a Gaussian fitted to Xtr
        locations_np = util.fit_gaussian_sample(
            Xtr.detach().numpy(), J, seed=r + 750
        )
        V = torch.tensor(locations_np, dtype=torch.float)

        med_x = util.pt_meddistance(X, subsample=600, seed=2 + r)
        med_y = util.pt_meddistance(Y, subsample=600, seed=93 + r)

        kernel_x = ker.PTKGauss(sigma2=med_x**2)
        kernel_y = ker.PTKGauss(sigma2=med_y**2)

        tester = cgof.FSCDTest(
            p, kernel_x, kernel_y, V, alpha=alpha, n_bootstrap=400, seed=r + 8
        )
        outcome = tester.perform_test(X, Y)

    return {'test_result': outcome, 'time_secs': timer.secs}
def met_cramer_vm(p, rx, cond_source, n, r):
    """
    Run cgof.CramerVonMisesTest with 200 bootstrap rounds on data drawn
    with sample_xy().

    p: model given to the test
    rx, cond_source, n: forwarded to sample_xy() to draw the data (X, Y)
    r: repetition index. Forwarded to sample_xy() and used to offset
        the seed of the test (r + 88).

    Return a dictionary with keys 'test_result' and 'time_secs'.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    with util.ContextTimer() as timer:
        tester = cgof.CramerVonMisesTest(
            p, alpha=alpha, n_bootstrap=200, seed=r + 88
        )
        outcome = tester.perform_test(X, Y)

    return {'test_result': outcome, 'time_secs': timer.secs}
def met_zhengkl(p, rx, cond_source, n, r):
    """
    Test from "Zheng 2000, A CONSISTENT TEST OF CONDITIONAL PARAMETRIC
    DISTRIBUTIONS". The decision criterion is the first order
    approximation of the KL divergence. Implemented by cgof.ZhengKLTest.

    rx, cond_source, n, r: forwarded to sample_xy() to draw the data (X, Y)

    Return a dictionary with keys 'test_result' and 'time_secs'.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    with util.ContextTimer() as timer:
        tester = cgof.ZhengKLTest(p, alpha)
        outcome = tester.perform_test(X, Y)

    return {'test_result': outcome, 'time_secs': timer.secs}
def met_gkcsd_opt_tr50(p, rx, cond_source, n, r, tr_proportion=0.5):
    """
    KCSD test with Gaussian kernels on X and Y, with the two bandwidths
    tuned by optimizing the power criterion of the KCSD test
    (cgof.KCSDPowerCriterion).

    Procedure:
    * Initialize both bandwidths with the median heuristic (computed on
      the full X and Y separately).
    * Split the data; a proportion tr_proportion is the training part.
    * Optimize the two squared bandwidths on the training part. During
      optimization each is clamped to [1e-1, 10 * (median heuristic)**2].
    * Run the test on the remaining (test) part only.

    rx, cond_source, n: forwarded to sample_xy() to draw the data (X, Y)
    r: repetition index. Forwarded to sample_xy() and used to offset
        every seed in this function.

    Return a dictionary with keys 'test_result' and 'time_secs'.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    with util.ContextTimer() as timer:
        # starting bandwidths
        med_x = util.pt_meddistance(X, subsample=600, seed=r + 7)
        med_y = util.pt_meddistance(Y, subsample=600, seed=r + 99)

        k = ker.PTKGauss(sigma2=med_x**2)
        l = ker.PTKGauss(sigma2=med_y**2)

        train_part, test_part = cdat.CondData(X, Y).split_tr_te(
            tr_proportion=tr_proportion
        )
        Xtr, Ytr = train_part.xy()

        power_criterion = cgof.KCSDPowerCriterion(p, k, l, Xtr, Ytr)

        def keep_in_bounds(params):
            # params = [squared bandwidth of k, squared bandwidth of l];
            # clamp both in place
            k_sigma2, l_sigma2 = params[0], params[1]
            k_sigma2.data.clamp_(min=1e-1, max=10 * med_x**2)
            l_sigma2.data.clamp_(min=1e-1, max=10 * med_y**2)

        # lr = learning rate, reg = regularization in the power criterion
        power_criterion.optimize_params(
            [k.sigma2, l.sigma2],
            constraint_f=keep_in_bounds,
            lr=1e-3,
            reg=1e-3,
            max_iter=100,
        )

        # test with the kernels k, l after the optimization call
        tester = cgof.KCSDTest(
            p, k, l, alpha=alpha, n_bootstrap=400, seed=r + 88
        )
        Xte, Yte = test_part.xy()
        outcome = tester.perform_test(Xte, Yte)

    return {'test_result': outcome, 'time_secs': timer.secs}
def met_gfscd_J1_opt_tr50(p, rx, cond_source, n, r, J=1, tr_proportion=0.5):
    """
    FSCD test with Gaussian kernels on both X and Y, where the bandwidths
    and the J test locations are tuned with cgof.FSCDPowerCriterion
    (maximizing the test power, per the original description).

    Procedure:
    * Split the data; a proportion tr_proportion is the training part.
    * Initialize the J test locations V by sampling from a Gaussian fitted
      to the training inputs, and the bandwidths by the median heuristic
      (on the full X and Y).
    * Optimize on the training part. Each squared bandwidth is clamped to
      [1e-1, 10 * (median heuristic)**2]; V is clamped to the range of the
      training inputs widened by two standard deviations on each side.
    * Run the test on the test part only.

    rx, cond_source, n: forwarded to sample_xy() to draw the data (X, Y)
    r: repetition index. Forwarded to sample_xy() and used to offset
        every seed in this function.

    Return a dictionary with keys 'test_result' and 'time_secs'.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    with util.ContextTimer() as timer:
        train_part, test_part = cdat.CondData(X, Y).split_tr_te(
            tr_proportion=tr_proportion
        )
        Xtr, Ytr = train_part.xy()

        # initial test locations: J draws from a Gaussian fitted to Xtr
        init_locs = util.fit_gaussian_sample(
            Xtr.detach().numpy(), J, seed=r + 75
        )
        V = torch.tensor(init_locs, dtype=torch.float)

        # starting bandwidths
        med_x = util.pt_meddistance(X, subsample=600, seed=30 + r)
        med_y = util.pt_meddistance(Y, subsample=600, seed=40 + r)

        k = ker.PTKGauss(sigma2=med_x**2)
        l = ker.PTKGauss(sigma2=med_y**2)

        # box for the test locations, derived from the training inputs
        x_low = torch.min(Xtr).item()
        x_high = torch.max(Xtr).item()
        x_std = torch.std(Xtr).item()

        power_criterion = cgof.FSCDPowerCriterion(p, k, l, Xtr, Ytr)

        def keep_in_bounds(params, V):
            # params = [squared bandwidth of k, squared bandwidth of l];
            # everything is clamped in place
            k_sigma2, l_sigma2 = params[0], params[1]
            k_sigma2.data.clamp_(min=1e-1, max=10 * med_x**2)
            l_sigma2.data.clamp_(min=1e-1, max=10 * med_y**2)
            V.data.clamp_(min=x_low - 2.0 * x_std, max=x_high + 2.0 * x_std)

        # Parameters are optimized in-place.
        # lr = learning rate, reg = regularization of the power criterion
        power_criterion.optimize_params(
            [k.sigma2, l.sigma2],
            V,
            constraint_f=keep_in_bounds,
            lr=1e-2,
            reg=1e-4,
            max_iter=200,
        )

        # k, l and V are now optimized; build the test from them
        tester = cgof.FSCDTest(
            p, k, l, V, alpha=alpha, n_bootstrap=400, seed=r + 8
        )
        Xte, Yte = test_part.xy()
        outcome = tester.perform_test(Xte, Yte)

    return {'test_result': outcome, 'time_secs': timer.secs}