# Example #1
0
def met_gmmd_med(p, rx, cond_source, n, r):
    """
    Naive MMD baseline with Gaussian kernels and median-heuristic bandwidths.

    The conditional density model p is sampled to form a second joint
    sample, which a two-sample MMD test compares against the observed joint
    sample. Both the kernel on X and the kernel on Y are Gaussian, and each
    bandwidth is set with the median heuristic.

    Args:
        p: conditional density model handed to cgof.MMDTest.
        rx, cond_source, n: forwarded unchanged to sample_xy.
        r: forwarded to sample_xy and used as the offset for every seed
            in this function.

    Returns:
        A dict with the test outcome under 'test_result' and the elapsed
        time reported by the timer under 'time_secs'. Sampling the data is
        not part of the timed region.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    # Only the work inside this context is timed.
    with util.ContextTimer() as timer:
        # Median heuristic, computed separately for X and for Y.
        width_x = util.pt_meddistance(X, subsample=600, seed=r + 3)
        width_y = util.pt_meddistance(Y, subsample=600, seed=r + 38)

        # Kernel on X. It has to be a kernel that can operate on numpy
        # arrays.
        kernel_x = kgof.kernel.KGauss(sigma2=width_x**2)
        # Kernel on Y.
        kernel_y = kgof.kernel.KGauss(sigma2=width_y**2)

        # Build the MMD test object. Requires the freqopttest package.
        test_options = dict(n_permute=400, alpha=alpha, seed=r + 37)
        mmd = cgof.MMDTest(p, kernel_x, kernel_y, **test_options)
        outcome = mmd.perform_test(X, Y)

    return {'test_result': outcome, 'time_secs': timer.secs}
# Example #2
0
def met_gkcsd_med(p, rx, cond_source, n, r):
    """
    KCSD test with Gaussian kernels and median-heuristic bandwidths.

    Naming: the prefix "g" stands for Gaussian kernel (used for both
    kernels) and "med" for the median heuristic. The heuristic is evaluated
    on X and on Y separately, giving one bandwidth per kernel.

    Args:
        p: conditional density model handed to cgof.KCSDTest.
        rx, cond_source, n: forwarded unchanged to sample_xy.
        r: forwarded to sample_xy and used as the offset for every seed
            in this function.

    Returns:
        A dict with the test outcome under 'test_result' and the elapsed
        time reported by the timer under 'time_secs'. Sampling the data is
        not part of the timed region.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    # Only the work inside this context is timed.
    with util.ContextTimer() as timer:
        # Median heuristic, computed separately for X and for Y.
        width_x = util.pt_meddistance(X, subsample=600, seed=r + 3)
        width_y = util.pt_meddistance(Y, subsample=600, seed=r + 38)

        # Gaussian kernel on X, then on Y.
        kernel_x = ker.PTKGauss(sigma2=width_x**2)
        kernel_y = ker.PTKGauss(sigma2=width_y**2)

        # Build the KCSD test object and run it on the full sample.
        test_options = dict(alpha=alpha, n_bootstrap=400, seed=r + 88)
        kcsd = cgof.KCSDTest(p, kernel_x, kernel_y, **test_options)
        outcome = kcsd.perform_test(X, Y)

    return {'test_result': outcome, 'time_secs': timer.secs}
# Example #3
0
def met_gmmd_split_med(p, rx, cond_source, n, r):
    """
    Data-splitting variant of met_gmmd_med.

    The setup is the same as met_gmmd_med, except that cgof.MMDSplitTest is
    used so that the two sets of samples being compared are guaranteed to
    be independent. The effective sample size is then n/2.

    Args:
        p: conditional density model handed to cgof.MMDSplitTest.
        rx, cond_source, n: forwarded unchanged to sample_xy.
        r: forwarded to sample_xy and used as the offset for every seed
            in this function.

    Returns:
        A dict with the test outcome under 'test_result' and the elapsed
        time reported by the timer under 'time_secs'. Sampling the data is
        not part of the timed region.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    # Only the work inside this context is timed.
    with util.ContextTimer() as timer:
        # Median heuristic, computed separately for X and for Y.
        width_x = util.pt_meddistance(X, subsample=600, seed=r + 4)
        width_y = util.pt_meddistance(Y, subsample=600, seed=r + 39)

        # Kernel on X. It has to be a kernel that can operate on numpy
        # arrays.
        kernel_x = kgof.kernel.KGauss(sigma2=width_x**2)
        # Kernel on Y.
        kernel_y = kgof.kernel.KGauss(sigma2=width_y**2)

        # Build the MMD test object. Requires the freqopttest package.
        test_options = dict(n_permute=400, alpha=alpha, seed=r + 47)
        mmd_split = cgof.MMDSplitTest(p, kernel_x, kernel_y, **test_options)
        outcome = mmd_split.perform_test(X, Y)

    return {'test_result': outcome, 'time_secs': timer.secs}
# Example #4
0
def met_gfscd_J1_rand(p, rx, cond_source, n, r, J=1):
    """
    FSCD test with Gaussian kernels on X and Y and random test locations.

    * J test locations are used (J=1 by default).
    * The locations are drawn from a Gaussian fitted to the X part of a
      training split (tr_proportion=0.3) of the sampled data.
    * The bandwidths of the Gaussian kernels come from the median
      heuristic evaluated on the full X and Y.
    * The test itself is run on the full sample, not just a held-out part.

    Args:
        p: conditional density model handed to cgof.FSCDTest.
        rx, cond_source, n: forwarded unchanged to sample_xy.
        r: forwarded to sample_xy and used as the offset for every seed
            in this function.
        J: number of test locations to draw.

    Returns:
        A dict with the test outcome under 'test_result' and the elapsed
        time reported by the timer under 'time_secs'. Sampling the data is
        not part of the timed region.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    # Only the work inside this context is timed.
    with util.ContextTimer() as timer:
        # Only the training inputs are needed to place the test locations.
        train_part, _ = cdat.CondData(X, Y).split_tr_te(tr_proportion=0.3)
        Xtr, _ = train_part.xy()

        # Fit a Gaussian to the training inputs and draw J locations.
        drawn_locs = util.fit_gaussian_sample(
            Xtr.detach().numpy(), J, seed=r + 750)
        V = torch.tensor(drawn_locs, dtype=torch.float)

        # Median heuristic, computed separately for X and for Y.
        width_x = util.pt_meddistance(X, subsample=600, seed=2 + r)
        width_y = util.pt_meddistance(Y, subsample=600, seed=93 + r)

        # Gaussian kernel on X, then on Y.
        kernel_x = ker.PTKGauss(sigma2=width_x**2)
        kernel_y = ker.PTKGauss(sigma2=width_y**2)

        # Build the FSCD test object and run it on the full samples.
        test_options = dict(alpha=alpha, n_bootstrap=400, seed=r + 8)
        fscd = cgof.FSCDTest(p, kernel_x, kernel_y, V, **test_options)
        outcome = fscd.perform_test(X, Y)

    return {'test_result': outcome, 'time_secs': timer.secs}
# Example #5
0
def met_gfscd_J1_opt_tr50(p, rx, cond_source, n, r, J=1, tr_proportion=0.5):
    """
    FSCD test with Gaussian kernels on X and Y and optimized parameters.

    Both Gaussian bandwidths and the J test locations are tuned by
    maximizing the test power on a training split, and the test is then
    run on the remaining test split only.

    Args:
        p: conditional density model handed to cgof.FSCDPowerCriterion and
            cgof.FSCDTest.
        rx, cond_source, n: forwarded unchanged to sample_xy.
        r: forwarded to sample_xy and used as the offset for every seed
            in this function.
        J: number of test locations.
        tr_proportion: proportion of the data used for the optimization.

    Returns:
        A dict with the test outcome under 'test_result' and the elapsed
        time reported by the timer under 'time_secs'. Sampling the data is
        not part of the timed region; the optimization is.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    # Only the work inside this context is timed.
    with util.ContextTimer() as timer:
        # Split into a part for tuning and a part for testing.
        dataset = cdat.CondData(X, Y)
        train_part, test_part = dataset.split_tr_te(
            tr_proportion=tr_proportion)
        Xtr, Ytr = train_part.xy()

        # Initial point for V: J draws from a Gaussian fitted to the
        # training inputs.
        init_locs = util.fit_gaussian_sample(
            Xtr.detach().numpy(), J, seed=r + 75)
        V = torch.tensor(init_locs, dtype=torch.float)

        # Median heuristic on the full X and Y gives the initial
        # bandwidths.
        width_x = util.pt_meddistance(X, subsample=600, seed=30 + r)
        width_y = util.pt_meddistance(Y, subsample=600, seed=40 + r)

        # Gaussian kernel on X, then on Y.
        kernel_x = ker.PTKGauss(sigma2=width_x**2)
        kernel_y = ker.PTKGauss(sigma2=width_y**2)

        # Summary statistics over all entries of Xtr, used to box in V.
        x_low = torch.min(Xtr).item()
        x_high = torch.max(Xtr).item()
        x_spread = torch.std(Xtr).item()

        criterion = cgof.FSCDPowerCriterion(p, kernel_x, kernel_y, Xtr, Ytr)

        def keep_in_bounds(params, V):
            """Clamp both squared bandwidths and the locations in place."""
            k_sigma2, l_sigma2 = params[0], params[1]
            # Squared bandwidths stay within [0.1, 10 * median-heuristic^2].
            k_sigma2.data.clamp_(min=1e-1, max=10 * width_x**2)
            l_sigma2.data.clamp_(min=1e-1, max=10 * width_y**2)
            # Locations stay within two standard deviations of the range
            # of the training inputs.
            V.data.clamp_(min=x_low - 2.0 * x_spread,
                          max=x_high + 2.0 * x_spread)

        # Run the optimization; the parameters are optimized in place.
        criterion.optimize_params(
            [kernel_x.sigma2, kernel_y.sigma2],
            V,
            constraint_f=keep_in_bounds,
            lr=1e-2,  # learning rate
            reg=1e-4,  # regularization when forming the power criterion
            max_iter=200,
        )

        # With the kernels and V tuned, build the FSCD test object.
        test_options = dict(alpha=alpha, n_bootstrap=400, seed=r + 8)
        fscd = cgof.FSCDTest(p, kernel_x, kernel_y, V, **test_options)

        # Test only on the held-out samples.
        Xte, Yte = test_part.xy()
        outcome = fscd.perform_test(Xte, Yte)

    return {'test_result': outcome, 'time_secs': timer.secs}
# Example #6
0
def met_gkcsd_opt_tr50(p, rx, cond_source, n, r, tr_proportion=0.5):
    """
    KCSD test with Gaussian kernels (for both kernels) and tuned bandwidths.

    The bandwidths start from the median heuristic, evaluated on X and on
    Y separately, and are then optimized by maximizing the power criterion
    of the KCSD test on a training split. The test is run on the remaining
    test split.

    Args:
        p: conditional density model handed to cgof.KCSDPowerCriterion and
            cgof.KCSDTest.
        rx, cond_source, n: forwarded unchanged to sample_xy.
        r: forwarded to sample_xy and used as the offset for every seed
            in this function.
        tr_proportion: proportion of the data used for the optimization.

    Returns:
        A dict with the test outcome under 'test_result' and the elapsed
        time reported by the timer under 'time_secs'. Sampling the data is
        not part of the timed region; the optimization is.
    """
    X, Y = sample_xy(rx, cond_source, n, r)

    # Only the work inside this context is timed.
    with util.ContextTimer() as timer:
        # Median heuristic on the full X and Y gives the initial
        # bandwidths.
        width_x = util.pt_meddistance(X, subsample=600, seed=r + 7)
        width_y = util.pt_meddistance(Y, subsample=600, seed=r + 99)

        # Gaussian kernel on X, then on Y.
        kernel_x = ker.PTKGauss(sigma2=width_x**2)
        kernel_y = ker.PTKGauss(sigma2=width_y**2)

        # Split into a part for tuning and a part for testing.
        dataset = cdat.CondData(X, Y)
        train_part, test_part = dataset.split_tr_te(
            tr_proportion=tr_proportion)
        Xtr, Ytr = train_part.xy()

        criterion = cgof.KCSDPowerCriterion(p, kernel_x, kernel_y, Xtr, Ytr)

        def keep_in_bounds(params):
            """Clamp both squared bandwidths in place."""
            k_sigma2, l_sigma2 = params[0], params[1]
            # Squared bandwidths stay within [0.1, 10 * median-heuristic^2].
            k_sigma2.data.clamp_(min=1e-1, max=10 * width_x**2)
            l_sigma2.data.clamp_(min=1e-1, max=10 * width_y**2)

        # The same kernel objects are passed to the test below, so the
        # optimizer presumably updates their sigma2 in place.
        criterion.optimize_params(
            [kernel_x.sigma2, kernel_y.sigma2],
            constraint_f=keep_in_bounds,
            lr=1e-3,  # learning rate
            reg=1e-3,  # regularization in the power criterion
            max_iter=100,
        )

        # Build the KCSD test object from the tuned kernels.
        test_options = dict(alpha=alpha, n_bootstrap=400, seed=r + 88)
        kcsd = cgof.KCSDTest(p, kernel_x, kernel_y, **test_options)

        # Test on the held-out samples.
        Xte, Yte = test_part.xy()
        outcome = kcsd.perform_test(Xte, Yte)

    return {'test_result': outcome, 'time_secs': timer.secs}