def met_gmmd_med(p, rx, cond_source, n, r):
    """
    A naive baseline which samples from the conditional density model p to
    create a new joint sample. The test is performed with a two-sample MMD
    test comparing the two joint samples. Use a Gaussian kernel for both X
    and Y, with bandwidths chosen by the median heuristic.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    # start timing
    with util.ContextTimer() as t:
        # Median-heuristic bandwidths, computed separately for X and Y.
        medx = util.pt_meddistance(X, subsample=600, seed=r + 3)
        medy = util.pt_meddistance(Y, subsample=600, seed=r + 38)

        # Gaussian kernels. These need to operate on numpy arrays.
        kx = kgof.kernel.KGauss(sigma2=medx**2)
        ky = kgof.kernel.KGauss(sigma2=medy**2)

        # MMD two-sample test object. Requires the freqopttest package.
        mmdtest = cgof.MMDTest(p, kx, ky, n_permute=400, alpha=alpha,
                               seed=r + 37)
        result = mmdtest.perform_test(X, Y)

    return {
        # 'test': mmdtest,
        'test_result': result,
        'time_secs': t.secs,
    }
def met_gkcsd_med(p, rx, cond_source, n, r):
    """
    KCSD test with Gaussian kernels on both X and Y.
    Prefix g = Gaussian kernel. med = median heuristic for both bandwidths,
    computed separately on the X sample and the Y sample.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    # start timing
    with util.ContextTimer() as t:
        # Median-heuristic bandwidths, computed separately for X and Y.
        medx = util.pt_meddistance(X, subsample=600, seed=r + 3)
        medy = util.pt_meddistance(Y, subsample=600, seed=r + 38)

        # Gaussian kernels on X and Y respectively.
        kx = ker.PTKGauss(sigma2=medx**2)
        ky = ker.PTKGauss(sigma2=medy**2)

        # KCSD test object.
        kcsdtest = cgof.KCSDTest(p, kx, ky, alpha=alpha, n_bootstrap=400,
                                 seed=r + 88)
        result = kcsdtest.perform_test(X, Y)

    return {
        # 'test': kcsdtest,
        'test_result': result,
        'time_secs': t.secs,
    }
def met_gmmd_split_med(p, rx, cond_source, n, r):
    """
    Same as met_gmmd_med, but split the data so that the two compared
    samples are independent. The effective sample size is therefore n/2.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    # start timing
    with util.ContextTimer() as t:
        # Median-heuristic bandwidths, computed separately for X and Y.
        medx = util.pt_meddistance(X, subsample=600, seed=r + 4)
        medy = util.pt_meddistance(Y, subsample=600, seed=r + 39)

        # Gaussian kernels. These need to operate on numpy arrays.
        kx = kgof.kernel.KGauss(sigma2=medx**2)
        ky = kgof.kernel.KGauss(sigma2=medy**2)

        # MMD test with internal data splitting. Requires the freqopttest
        # package.
        mmdtest = cgof.MMDSplitTest(p, kx, ky, n_permute=400, alpha=alpha,
                                    seed=r + 47)
        result = mmdtest.perform_test(X, Y)

    return {
        # 'test': mmdtest,
        'test_result': result,
        'time_secs': t.secs,
    }
def met_gfscd_J1_rand(p, rx, cond_source, n, r, J=1):
    """
    FSCD test with Gaussian kernels on both X and Y.

    * Uses J=1 random test location by default.
    * The test locations are drawn from a Gaussian fitted to a subset of
      the data drawn from rx.
    * Bandwidths of the Gaussian kernels are set with the median heuristic.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    # start timing
    with util.ContextTimer() as t:
        # The split is used only to pick the test locations V; the test
        # itself runs on the full sample.
        tr, _ = cdat.CondData(X, Y).split_tr_te(tr_proportion=0.3)
        Xtr, _ = tr.xy()

        # Fit a Gaussian to the training X and draw J locations from it.
        npV = util.fit_gaussian_sample(Xtr.detach().numpy(), J, seed=r + 750)
        V = torch.tensor(npV, dtype=torch.float)

        # Median-heuristic bandwidths, computed separately for X and Y.
        medx = util.pt_meddistance(X, subsample=600, seed=2 + r)
        medy = util.pt_meddistance(Y, subsample=600, seed=93 + r)

        # Gaussian kernels on X and Y respectively.
        kx = ker.PTKGauss(sigma2=medx**2)
        ky = ker.PTKGauss(sigma2=medy**2)

        # FSCD test object; evaluate on the full samples.
        fscdtest = cgof.FSCDTest(p, kx, ky, V, alpha=alpha, n_bootstrap=400,
                                 seed=r + 8)
        result = fscdtest.perform_test(X, Y)

    return {
        # 'test': fscdtest,
        'test_result': result,
        'time_secs': t.secs,
    }
def met_gfscd_J1_opt_tr50(p, rx, cond_source, n, r, J=1, tr_proportion=0.5):
    """
    FSCD test with Gaussian kernels on both X and Y. Optimize both Gaussian
    bandwidths and the test locations by maximizing the test power. The
    fraction of data used for the optimization is tr_proportion; the test
    is then performed on the remaining held-out portion.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    # start timing
    with util.ContextTimer() as t:
        # Split into a training part (for optimization) and a test part.
        tr, te = cdat.CondData(X, Y).split_tr_te(tr_proportion=tr_proportion)
        Xtr, Ytr = tr.xy()

        # Fit a Gaussian to the training X; J draws give the initial V.
        npV = util.fit_gaussian_sample(Xtr.detach().numpy(), J, seed=r + 75)
        V = torch.tensor(npV, dtype=torch.float)

        # Median-heuristic bandwidths serve as initial values and also
        # bound the feasible region during optimization.
        medx = util.pt_meddistance(X, subsample=600, seed=30 + r)
        medy = util.pt_meddistance(Y, subsample=600, seed=40 + r)

        # Gaussian kernels on X and Y respectively.
        kx = ker.PTKGauss(sigma2=medx**2)
        ky = ker.PTKGauss(sigma2=medy**2)

        # Data-driven box constraints for the test locations.
        abs_min = torch.min(Xtr).item()
        abs_max = torch.max(Xtr).item()
        abs_std = torch.std(Xtr).item()

        # Power-criterion objective for parameter tuning.
        fscd_pc = cgof.FSCDPowerCriterion(p, kx, ky, Xtr, Ytr)
        max_iter = 200
        lr = 1e-2   # learning rate
        reg = 1e-4  # regularization when forming the power criterion

        def con_f(params, V):
            # Project the parameters back into their feasible boxes
            # (in-place) after each gradient step.
            ksigma2, lsigma2 = params[0], params[1]
            ksigma2.data.clamp_(min=1e-1, max=10 * medx**2)
            lsigma2.data.clamp_(min=1e-1, max=10 * medy**2)
            V.data.clamp_(min=abs_min - 2.0 * abs_std,
                          max=abs_max + 2.0 * abs_std)

        # Optimize the bandwidths and V; parameters are updated in-place.
        fscd_pc.optimize_params([kx.sigma2, ky.sigma2], V, constraint_f=con_f,
                                lr=lr, reg=reg, max_iter=max_iter)

        # kx, ky, and V are now optimized. Test on the held-out portion only.
        fscdtest = cgof.FSCDTest(p, kx, ky, V, alpha=alpha, n_bootstrap=400,
                                 seed=r + 8)
        Xte, Yte = te.xy()
        result = fscdtest.perform_test(Xte, Yte)

    return {
        # 'test': fscdtest,
        'test_result': result,
        'time_secs': t.secs,
    }
def met_gkcsd_opt_tr50(p, rx, cond_source, n, r, tr_proportion=0.5):
    """
    KCSD test with Gaussian kernels on both X and Y. The two kernel
    bandwidths are optimized by maximizing the power criterion of the KCSD
    test on a training split (tr_proportion of the data); the test is then
    performed on the held-out remainder. Median-heuristic bandwidths
    (computed separately on X and Y) provide the initial values.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    # start timing
    with util.ContextTimer() as t:
        # Median-heuristic bandwidths serve as initial values and also
        # bound the feasible region during optimization.
        medx = util.pt_meddistance(X, subsample=600, seed=r + 7)
        medy = util.pt_meddistance(Y, subsample=600, seed=r + 99)

        # Gaussian kernels on X and Y respectively.
        kx = ker.PTKGauss(sigma2=medx**2)
        ky = ker.PTKGauss(sigma2=medy**2)

        # Split into a training part (for optimization) and a test part.
        tr, te = cdat.CondData(X, Y).split_tr_te(tr_proportion=tr_proportion)
        Xtr, Ytr = tr.xy()

        # Power-criterion objective for bandwidth tuning.
        kcsd_pc = cgof.KCSDPowerCriterion(p, kx, ky, Xtr, Ytr)
        max_iter = 100
        lr = 1e-3   # learning rate
        reg = 1e-3  # regularization in the power criterion

        def con_f(params):
            # Project the bandwidths back into their feasible boxes
            # (in-place) after each gradient step.
            ksigma2, lsigma2 = params[0], params[1]
            ksigma2.data.clamp_(min=1e-1, max=10 * medx**2)
            lsigma2.data.clamp_(min=1e-1, max=10 * medy**2)

        # Optimize the bandwidths; parameters are updated in-place.
        kcsd_pc.optimize_params([kx.sigma2, ky.sigma2], constraint_f=con_f,
                                lr=lr, reg=reg, max_iter=max_iter)

        # KCSD test object; evaluate on the held-out portion only.
        kcsdtest = cgof.KCSDTest(p, kx, ky, alpha=alpha, n_bootstrap=400,
                                 seed=r + 88)
        Xte, Yte = te.xy()
        result = kcsdtest.perform_test(Xte, Yte)

    return {
        # 'test': kcsdtest,
        'test_result': result,
        'time_secs': t.secs,
    }