Example #1
    def ume_test(X, Y, Z, V, alpha=0.01, mode='mean'):
        """
        Perform a UME three-sample test.
        All the data are assumed to be preprocessed.

        Args:
            - X: n x d ndarray, a sample from P
            - Y: n x d ndarray, a sample from Q
            - Z: n x d ndarray, a sample from R
            - V: J x d ndarray, a set of J test locations
            - alpha: a user-specified significance level
            - mode: if 'mean', set the squared Gaussian width with
                SC_MMD.median_heuristic_bounliphone; otherwise, use the
                squared median distance on the pooled sample (X, Y, Z)

        Returns:
            - a dictionary of the form
                {
                    alpha: 0.01,
                    pvalue: 0.0002,
                    test_stat: 2.3,
                    h0_rejected: True,
                    time_secs: ...
                }
        """
        if mode == 'mean':
            mean_medxyz2 = SC_MMD.median_heuristic_bounliphone(X,
                                                               Y,
                                                               Z,
                                                               subsample=1000)
            gwidth = mean_medxyz2
        else:
            XYZ = np.vstack((X, Y, Z))
            med2 = util.meddistance(XYZ, subsample=1000)**2
            gwidth = med2
        k = kernel.KGauss(gwidth)
        scume = SC_UME(data.Data(X), data.Data(Y), k, k, V, V, alpha)
        return scume.perform_test(data.Data(Z))
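A minimal usage sketch for ume_test on synthetic Gaussian samples, treating it as a module-level function and assuming the names it references come from the kmod package (numpy as np; data, kernel, util from kmod; SC_UME and SC_MMD from kmod.mctest):

import numpy as np
from kmod import data, kernel, util
from kmod.mctest import SC_UME, SC_MMD

n, d, J = 500, 2, 5
with util.NumpySeedContext(seed=7):
    X = np.random.randn(n, d) + 1.0   # sample from P
    Y = np.random.randn(n, d) - 1.0   # sample from Q
    Z = np.random.randn(n, d)         # sample from R
    V = np.random.randn(J, d)         # J random test locations

result = ume_test(X, Y, Z, V, alpha=0.01, mode='mean')
print(result['h0_rejected'], result['pvalue'])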
Example #2
def run_optimization(args, gp, gq, img_data, model_name, J=10):
    """
    Wrapper for noise space optimization

    """

    model = load_pretrained_model(model_name)
    model.eval()
    if model_name == 'inceptionv3':
        feat_func = model.pool3
    else:
        feat_func = model.features

    sample_size = args.sample_size  # number of images we want to generate
    samples_p = sample_images(gp, sample_size)
    datap = go.extract_feats(samples_p, feat_func, upsample=True)

    samples_q = sample_images(gq, sample_size)
    dataq = go.extract_feats(samples_q, feat_func, upsample=True)

    ind = util.subsample_ind(img_data.shape[0], sample_size)
    datar = img_data[ind]
    # NHWC -> NCHW before feature extraction
    datar = go.extract_feats(datar.transpose((0, 3, 1, 2)),
                             feat_func,
                             upsample=True)
    datap = data.Data(datap)
    dataq = data.Data(dataq)
    datar = data.Data(datar)

    Zp0 = np.random.uniform(-1, 1, (J, gp.z_size))
    Zq0 = np.random.uniform(-1, 1, (J, gq.z_size))
    XYZ = np.vstack((datap.data(), dataq.data(), datar.data()))
    med2 = util.meddistance(XYZ, subsample=1000)**2

    if args.exp == 2:
        # assumption: experiment 2 compares a model against itself, so reuse
        # gq as the generator for both sides of the optimization
        gp = gq

    with util.ContextTimer() as t:
        Z_opt, gw_opt, opt_result = go.optimize_3sample_criterion(datap,
                                                                  dataq,
                                                                  datar,
                                                                  gp,
                                                                  gq,
                                                                  feat_func,
                                                                  Zp0,
                                                                  Zq0,
                                                                  gwidth0=med2)

    results = {}
    results['Z'] = Z_opt
    results['width'] = gw_opt
    results['opt'] = opt_result
    results['t'] = t.secs  # elapsed time in seconds
    results['ind'] = ind

    return results
Example #3
def met_kid_mmd(mix_ratios, data_loader, n, r, alpha=0.01):
    """
    The MMD-based three-sample test of Bounliphone et al., 2016, using the
    KID kernel of Binkowski et al., 2018.
    """

    sample_size = [n] * 3
    X, Y, Z, _ = sample_data_mixing(mix_ratios, data_loader, sample_size, r)

    k = kernel.KKID()
    scmmd = SC_MMD(data.Data(X), data.Data(Y), k, alpha)
    return scmmd.perform_test(data.Data(Z))
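sample_data_mixing and data_loader are experiment-specific helpers, but the core of this test can be exercised directly on feature arrays. A sketch under the same kmod-module assumptions as above:

import numpy as np
from kmod import data, kernel
from kmod.mctest import SC_MMD

n, d = 300, 16
rng = np.random.default_rng(0)
X = rng.normal(1.0, 1.0, (n, d))  # features of samples from model P
Y = rng.normal(0.5, 1.0, (n, d))  # features of samples from model Q
Z = rng.normal(0.0, 1.0, (n, d))  # features of the reference sample R

k = kernel.KKID()  # KID kernel of Binkowski et al., 2018
scmmd = SC_MMD(data.Data(X), data.Data(Y), k, 0.01)  # alpha = 0.01
print(scmmd.perform_test(data.Data(Z)))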
Example #4
    def test_basic(self):
        """
        Test basic things. Make sure SC_UME runs under normal usage.
        """
        mp, varp = 4, 1
        # q must not be the true model; if it were, our assumption would be
        # violated and the asymptotic null distribution would not hold.
        mq, varq = 0.5, 1

        # draw some data
        n = 2999 # sample size
        seed = 89
        with util.NumpySeedContext(seed=seed):
            X = np.random.randn(n, 1)*varp**0.5 + mp
            Y = np.random.randn(n, 1)*varq**0.5 + mq
            Z = np.random.randn(n, 1)
            
            datap = data.Data(X)
            dataq = data.Data(Y)
            datar = data.Data(Z)

        # hyperparameters of the test
        medxz = util.meddistance(np.vstack((X, Z)), subsample=1000)
        medyz = util.meddistance(np.vstack((Y, Z)), subsample=1000)
        k = kernel.KGauss(sigma2=medxz**2)
        l = kernel.KGauss(sigma2=medyz**2)

        # 2 sets of test locations
        J = 3
        Jp = J
        Jq = J
        V = util.fit_gaussian_draw(X, Jp, seed=seed+2)
        W = util.fit_gaussian_draw(Y, Jq, seed=seed+3)

        # construct a UME test
        alpha = 0.01 # significance level 
        scume = mct.SC_UME(datap, dataq, k, l, V, W, alpha=alpha)
        test_result = scume.perform_test(datar)

        # make sure it rejects
        #print(test_result)
        assert test_result['h0_rejected']
Example #5
    def test_basic(self):
        """
        Nothing special. Just test basic things.
        """
        seed = 13
        # sample
        n = 103
        alpha = 0.01
        for d in [1, 4]:
            mean = np.zeros(d)
            variance = 1
            p = density.IsotropicNormal(mean, variance)
            q = density.IsotropicNormal(mean, variance+3)

            # no mean shift: only the variance of the drawing distribution
            # differs from p. (The commented line instead shifts one
            # dimension of the mean.)
            #draw_mean = mean + np.hstack((1, np.zeros(d-1)))
            draw_mean = mean + 0
            draw_variance = variance + 1
            X = util.randn(n, d, seed=seed)*np.sqrt(draw_variance) + draw_mean
            dat = data.Data(X)

            # Test
            for J in [1, 3]:
                sig2 = util.meddistance(X, subsample=1000)**2
                k = kernel.KGauss(sig2)

                # random test locations
                V = util.fit_gaussian_draw(X, J, seed=seed+1)
                W = util.fit_gaussian_draw(X, J, seed=seed+8)

                mcfssd = mct.DC_FSSD(p, q, k, k, V, W, alpha=alpha)
                s = mcfssd.compute_stat(dat)
                s2, var = mcfssd.get_H1_mean_variance(dat)

                tresult = mcfssd.perform_test(dat)

                # assertions
                self.assertGreaterEqual(tresult['pvalue'], 0)
                self.assertLessEqual(tresult['pvalue'], 1)
                testing.assert_approx_equal(s, (n**0.5)*s2)
Example #6
def met_gumeJ1_2V_rand(P, Q, data_source, n, r, J=1, use_1set_locs=False,
                       alpha=0.01):
    """
    UME-based three-sample test. 
        * Use J=1 test location by default. 
        * Use two sets (2V) of test locations by default: V and W, each having J
            locations.  Will constrain V=W if use_1set_locs=True.
        * The test locations are drawn at random; the first J points of each
            sample are set aside and removed from the data before testing.
        * Gaussian kernels for the two UME statistics. Median heuristic is used
            to select each width.
    """
    if not P.has_datasource() or not Q.has_datasource():
        # Not applicable. Return {}.
        return {}
    assert J >= 1

    ds_p = P.get_datasource()
    ds_q = Q.get_datasource()
    # sample some data
    datp, datq, datr = sample_pqr(ds_p,
                                  ds_q,
                                  data_source,
                                  n,
                                  r,
                                  only_from_r=False)

    # Start the timer here
    with util.ContextTimer() as t:

        # remove the first J points from each set
        X, Y, Z = datp.data(), datq.data(), datr.data()

        # containing 3*J points
        pool3J = np.vstack((X[:J, :], Y[:J, :], Z[:J, :]))
        X, Y, Z = (X[J:, :], Y[J:, :], Z[J:, :])

        datp, datq, datr = [data.Data(a) for a in [X, Y, Z]]
        assert X.shape[0] == Y.shape[0]
        assert Y.shape[0] == Z.shape[0]
        assert Z.shape[0] == n - J
        assert datp.sample_size() == n - J
        assert datq.sample_size() == n - J
        assert datr.sample_size() == n - J

        #XYZ = np.vstack((X, Y, Z))
        #stds = np.std(util.subsample_rows(XYZ, min(n-3*J, 500),
        #    seed=r+87), axis=0)
        d = X.shape[1]
        # draw the pool of candidate locations from a scaled standard normal.
        # (The commented-out variant perturbs the sampled points instead.)
        with util.NumpySeedContext(seed=r * 191):
            #pool3J = pool3J + np.random.randn(3*J, d)*np.max(stds)*3
            pool3J = np.random.randn(3 * J, d) * 2

        # median heuristic to set the Gaussian widths
        medxz = util.meddistance(np.vstack((X, Z)), subsample=1000)
        medyz = util.meddistance(np.vstack((Z, Y)), subsample=1000)
        if use_1set_locs:
            # randomly select J points from the pool3J for the J test locations
            #V = util.subsample_rows(pool3J, J, r)
            V = pool3J[:J, :]
            W = V
            k = kernel.KGauss(sigma2=np.mean([medxz, medyz])**2)
            l = k
        else:
            # use two sets of locations: V and W
            #VW = util.subsample_rows(pool3J, 2*J, r)
            VW = pool3J[:2 * J, :]
            V = VW[:J, :]
            W = VW[J:, :]

            # 2 Gaussian kernels
            k = kernel.KGauss(sigma2=medxz**2)
            l = kernel.KGauss(sigma2=medyz**2)

        # construct the test
        scume = mct.SC_UME(datp, datq, k, l, V, W, alpha=alpha)
        scume_rand_result = scume.perform_test(datr)

    return {
        # This key "test" can be removed. Storing V, W can take quite a lot
        # of space, especially when the input dimension d is high.
        #'test':scume,
        'test_result': scume_rand_result,
        'time_secs': t.secs
    }
Example #7
def opt_greedy_3sample_criterion(datap,
                                 dataq,
                                 datar,
                                 locs,
                                 k,
                                 J,
                                 reg=1e-3,
                                 maximize=True,
                                 featurizer=None):
    """
    Obtains a set of J test locations by maximizing (or minimizing)
    the power criterion of the UME three-sample test.
    The test locations are given by choosing a subset from given
    candidate locations by the greedy forward selection.

    Args:
        - datap: a kgof.data.Data from P (model 1)
        - dataq: a kgof.data.Data from Q (model 2)
        - datar: a kgof.data.Data from R (data generating distribution)
        - locs: an n_c x d numpy array representing a set of n_c candidate locations
        - k: a kernel object
        - J: the number of test locations
        - reg: reg to add to the mean/sqrt(variance) criterion to become
            mean/sqrt(variance + reg)
        - maximize: if True, maximize the power criterion, otherwise minimize
        - featurizer: if given, the data are transformed by the given feature
          extractor model before evaluating the criterion

    Returns:
        A set of indices representing obtained locations
    """

    # transform inputs to power criterion with feature extractor
    if featurizer is None:
        dp = datap
        dq = dataq
        dr = datar
        fV = locs
    else:
        dp = featurizer(datap.data()).cpu().data.numpy()
        dp = data.Data(dp)
        dq = featurizer(dataq.data()).cpu().data.numpy()
        dq = data.Data(dq)
        dr = featurizer(datar.data()).cpu().data.numpy()
        dr = data.Data(dr)
        fV = featurizer(locs).cpu().data.numpy()

    def obj(V):
        if len(V.shape) < 2:
            V = V.reshape((-1, ) + V.shape)
        if maximize:
            return SC_UME.power_criterion(dp, dq, dr, k, k, V, V, reg=reg)
        else:
            return -SC_UME.power_criterion(dp, dq, dr, k, k, V, V, reg=reg)

    def greedy_search(num_locs, loc_pool):
        best_loc_idx = []
        n = loc_pool.shape[0]
        current_pool_idx_set = set(range(n))
        for _ in range(num_locs):
            best_locs = loc_pool[best_loc_idx]
            max_val = -np.inf
            for idx in current_pool_idx_set:
                V = np.vstack([loc_pool[[idx]], best_locs])
                # evaluate the power criterion (score)
                score = obj(V)
                if score > max_val:
                    max_val = score
                    best_idx = idx
            best_loc_idx.append(best_idx)
            current_pool_idx_set.remove(best_idx)
            # print(best_loc_idx)
        return best_loc_idx

    return greedy_search(J, fV)
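A sketch of calling the greedy selector on synthetic data, again assuming the kmod modules used above and applying no featurizer:

import numpy as np
from kmod import data, kernel, util

n, d, J = 400, 2, 3
rng = np.random.default_rng(1)
datap = data.Data(rng.normal(1.0, 1.0, (n, d)))   # from model P
dataq = data.Data(rng.normal(-1.0, 1.0, (n, d)))  # from model Q
datar = data.Data(rng.normal(0.0, 1.0, (n, d)))   # from the data distribution

# a pool of 20 candidate locations and a median-heuristic Gaussian kernel
locs = rng.normal(0.0, 2.0, (20, d))
med2 = util.meddistance(datar.data(), subsample=1000)**2
k = kernel.KGauss(med2)

best_idx = opt_greedy_3sample_criterion(datap, dataq, datar, locs, k, J)
V = locs[best_idx]  # the J selected test locations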
Example #8
    def test_optimize_2sets_locs_widths(self):
        mp, varp = 2, 1
        # q must not be the true model; if it were, our assumption would be
        # violated and the asymptotic null distribution would not hold.
        mq, varq = 1, 1

        # draw some data
        n = 800 # sample size
        seed = 6
        with util.NumpySeedContext(seed=seed):
            X = np.random.randn(n, 1)*varp**0.5 + mp
            Y = np.random.randn(n, 1)*varq**0.5 + mq
            Z = np.random.randn(n, 1)
            
            datap = data.Data(X)
            dataq = data.Data(Y)
            datar = data.Data(Z)

        # split the data into training/test sets
        [(datptr, datpte), (datqtr, datqte), (datrtr, datrte)] = \
            [D.split_tr_te(tr_proportion=0.3, seed=85) for D in [datap, dataq, datar]]
        Xtr, Ytr, Ztr = [D.data() for D in [datptr, datqtr, datrtr]]

        # initialize optimization parameters.
        # Initialize the Gaussian widths with the median heuristic
        medxz = util.meddistance(np.vstack((Xtr, Ztr)), subsample=1000)
        medyz = util.meddistance(np.vstack((Ytr, Ztr)), subsample=1000)
        gwidth0p = medxz**2
        gwidth0q = medyz**2

        # numbers of test locations in V, W
        J = 2
        Jp = J
        Jq = J

        # pick a subset of points in the training set for V, W
        Xyztr = np.vstack((Xtr, Ytr, Ztr))
        VW = util.subsample_rows(Xyztr, Jp+Jq, seed=73)
        V0 = VW[:Jp, :]
        W0 = VW[Jp:, :]

        # optimization options
        opt_options = {
            'max_iter': 100,
            'reg': 1e-4,
            'tol_fun': 1e-6,
            'locs_bounds_frac': 100,
            'gwidth_lb': None,
            'gwidth_ub': None,
        }

        umep_params, umeq_params = mct.SC_GaussUME.optimize_2sets_locs_widths(
            datptr, datqtr, datrtr, V0, W0, gwidth0p, gwidth0q, 
            **opt_options)
        (V_opt, gw2p_opt, opt_infop) = umep_params
        (W_opt, gw2q_opt, opt_infoq) = umeq_params
        k_opt = kernel.KGauss(gw2p_opt)
        l_opt = kernel.KGauss(gw2q_opt)
        # construct a UME test
        alpha = 0.01 # significance level 
        scume_opt2 = mct.SC_UME(datpte, datqte, k_opt, l_opt, V_opt, W_opt, alpha=alpha)
        scume_opt2.perform_test(datrte)