示例#1
0
    def _get_metest_opt(self, dat, op=None):
        """
        Build a mean embedding (ME) test whose parameters are tuned on a
        training split.

        A sample with as many points as `dat` is drawn from the data source
        of `self.p`. Both that sample and `dat` are split into training and
        test parts using `self.tr_proportion`. The training parts are given
        to `tst.MeanEmbeddingTest.optimize_locs_width`, and its first two
        outputs parameterize the returned test.

        dat: observed data; must provide sample_size() and split_tr_te()
        op: dictionary of keyword arguments forwarded to
            optimize_locs_width. If None, a default set derived from
            self.n_locs and self.seed is used.

        Return (metest, tr_tst_data, te_tst_data)
        """
        base_seed = self.seed
        if op is None:
            # default optimization options
            default_op = {}
            default_op['n_test_locs'] = self.n_locs
            default_op['seed'] = base_seed + 5
            default_op['max_iter'] = 100
            default_op['batch_proportion'] = 1.0
            default_op['locs_step_size'] = 1.0
            default_op['gwidth_step_size'] = 0.1
            default_op['tol_fun'] = 1e-4
            default_op['reg'] = 1e-6
            op = default_op

        level = self.alpha
        split_frac = self.tr_proportion

        # Sample from the model p; the sample size matches that of dat
        source = self.p.get_datasource()
        model_sample = source.sample(dat.sample_size(), seed=base_seed)

        # Different seeds are used for splitting the two samples
        model_tr, model_te = model_sample.split_tr_te(
            tr_proportion=split_frac, seed=base_seed + 18)
        obs_tr, obs_te = dat.split_tr_te(
            tr_proportion=split_frac, seed=base_seed + 12)

        # Pair up the training parts and the test parts
        tr_tst_data = fdata.TSTData(model_tr.data(), obs_tr.data())
        te_tst_data = fdata.TSTData(model_te.data(), obs_te.data())

        # Optimize the ME test parameters on the training pair only
        optimized = tst.MeanEmbeddingTest.optimize_locs_width(
            tr_tst_data, level, **op)
        locs, width2, _ = optimized
        return (tst.MeanEmbeddingTest(locs, width2, level),
                tr_tst_data,
                te_tst_data)
示例#2
0
    def get_H1_mean_variance(self, dat, return_variance=True):
        r"""
        Return the mean and variance under H1 of the
        test statistic = \sqrt{n}(UME(P, R)^2 - UME(Q, R)^2).
        The estimator of the mean is unbiased (can be negative). The variance
        is also valid under H0.

        The docstring is a raw string so that the backslash in the formula
        is not parsed as an (invalid) escape sequence.

        :param dat: data from R; must provide data()
        :param return_variance: if False, only the mean is returned

        :returns: (mean, variance)

        If return_variance is False,
        :returns: mean
        """
        umep = self.umep
        umeq = self.umeq
        # form a two-sample test dataset between datap and dat (data from R)
        Z = dat.data()
        datapr = tstdata.TSTData(self.datap.data(), Z)
        dataqr = tstdata.TSTData(self.dataq.data(), Z)

        # get the feature matrices (correlated, since both share Z)
        fea_pr = umep.feature_matrix(datapr)  # n x Jp
        fea_qr = umeq.feature_matrix(dataqr)  # n x Jq
        assert fea_pr.shape[1] == self.V.shape[0]
        assert fea_qr.shape[1] == self.W.shape[0]

        # umehp = ume_hat(p, r), umehq = ume_hat(q, r)
        umehp, var_pr = tst.UMETest.ustat_h1_mean_variance(
            fea_pr, return_variance=True, use_unbiased=True)
        umehq, var_qr = tst.UMETest.ustat_h1_mean_variance(
            fea_qr, return_variance=True, use_unbiased=True)

        # Non-positive variance estimates are only reported, not treated as
        # errors, so the caller still gets a result.
        if var_pr <= 0:
            log.l().warning(
                'Non-positive var_pr detected. Was {}'.format(var_pr))
        if var_qr <= 0:
            log.l().warning(
                'Non-positive var_qr detected. Was {}'.format(var_qr))
        mean_h1 = umehp - umehq

        if not return_variance:
            return mean_h1

        # mean features
        mean_pr = np.mean(fea_pr, axis=0)
        mean_qr = np.mean(fea_qr, axis=0)
        t1 = 4.0 * np.mean(np.dot(fea_pr, mean_pr) * np.dot(fea_qr, mean_qr))
        t2 = 4.0 * np.sum(mean_pr**2) * np.sum(mean_qr**2)

        # compute the cross-covariance between the two estimates
        var_pqr = t1 - t2
        # variance of a difference: var(a) - 2 cov(a, b) + var(b)
        var_h1 = var_pr - 2.0 * var_pqr + var_qr
        return mean_h1, var_h1
示例#3
0
文件: mmd.py 项目: mackelab/sbibm
    def perform_test(
        self,
        dat,
        candidate_kernels=None,
        return_mmdtest=False,
        tr_proportion=0.2,
        reg=1e-3,
    ):
        """
        Run a quadratic MMD test, with the kernel selected on a held-out
        training split.

        dat: an instance of Data
        candidate_kernels: a list of Kernel's to choose from. If None,
            Gaussian kernels are built from multiples of the median
            heuristic computed on the training split.
        return_mmdtest: if True, the test object is added to the results
            under the key "mmdtest"
        tr_proportion: proportion of sample to be used to choosing the best
            kernel
        reg: regularization parameter for the test power criterion

        Return the results dictionary of the test, with the elapsed time
        stored under "time_secs".
        """
        with util.ContextTimer() as timer:
            base_seed = self.seed
            level = self.alpha

            # model sample with the same size as the observed data
            source = self.p.get_datasource()
            model_sample = source.sample(dat.sample_size(),
                                         seed=base_seed + 77)

            # split both samples (the observed parts are of type data.Data)
            model_tr, model_te = model_sample.split_tr_te(
                tr_proportion=tr_proportion, seed=base_seed + 18)
            obs_tr, obs_te = dat.split_tr_te(
                tr_proportion=tr_proportion, seed=base_seed + 12)

            tr_tst_data = fdata.TSTData(model_tr.data(), obs_tr.data())
            te_tst_data = fdata.TSTData(model_te.data(), obs_te.data())

            if candidate_kernels is None:
                # Gaussian kernels: squared widths are the squared median
                # distance scaled by powers of two
                median_dist = util.meddistance(tr_tst_data.stack_xy(), 1000)
                scales = 2.0**np.linspace(-4, 4, 10)
                list_gwidth = np.hstack(((median_dist**2) * scales))
                list_gwidth.sort()
                candidate_kernels = [
                    kernel.KGauss(width2) for width2 in list_gwidth
                ]

            # grid search over the candidates on the training split
            best_index, _ = tst.QuadMMDTest.grid_search_kernel(
                tr_tst_data, candidate_kernels, level, reg=reg)

            # test on the held-out split with the selected kernel
            mmdtest = tst.QuadMMDTest(candidate_kernels[best_index],
                                      self.n_permute,
                                      alpha=level)
            results = mmdtest.perform_test(te_tst_data)
            if return_mmdtest:
                results["mmdtest"] = mmdtest

        results["time_secs"] = timer.secs
        return results
示例#4
0
def TST_ME(Fea, N1, alpha, is_train, test_locs, gwidth, J=1, seed=15):
    """
    Run the ME test on features whose first N1 rows form the first sample.

    When is_train is true, the test locations and Gaussian width are
    optimized and returned as (test_locs, gwidth). Otherwise the test is run
    with the given test_locs and gwidth, and 1 is returned if H0 is
    rejected, 0 if not.
    """
    Fea = get_item(Fea, is_cuda)
    first_sample, second_sample = Fea[0:N1, :], Fea[N1:, :]
    tst_data = data.TSTData(first_sample, second_sample)

    if not is_train:
        me_test = tst.MeanEmbeddingTest(test_locs, gwidth, alpha)
        outcome = me_test.perform_test(tst_data)
        return 1 if outcome['h0_rejected'] else 0

    # Options for the optimization of the test parameters:
    #   n_test_locs: number of test locations to optimize
    #   max_iter: maximum number of gradient ascent iterations
    #   locs_step_size: step size for the test locations (features)
    #   gwidth_step_size: step size for the Gaussian width
    #   tol_fun: stop if the objective does not increase more than this
    #   seed: random seed
    train_options = {
        'n_test_locs': J,
        'max_iter': 300,
        'locs_step_size': 1.0,
        'gwidth_step_size': 0.1,
        'tol_fun': 1e-4,
        'seed': seed + 5,
    }
    test_locs, gwidth, _ = tst.MeanEmbeddingTest.optimize_locs_width(
        tst_data, alpha, **train_options)
    return test_locs, gwidth
示例#5
0
    def perform_test(self, X, Y):
        """
        Run the MMD test on a split of the paired sample (X, Y).

        The pairs are split with MMDSplitTest._split_half. The first part
        (X1, Y1) is used as is; for the second part only X2 is kept and its
        Y values are drawn from the model via ds_p.cond_pair_sample.

        X, Y: Torch tensors (they are concatenated with torch.cat)

        Return the results dictionary of the underlying test, with the
        elapsed time stored under 'time_secs'.
        """
        import freqopttest.data as fdata

        model_source = self.ds_p
        two_sample_test = self.mmdtest
        base_seed = self.seed

        with util.ContextTimer() as timer:
            # the Y values of the second part are replaced by model draws
            X1, Y1, X2, _ = MMDSplitTest._split_half(
                X, Y, seed=self.seed + 330)
            Y2_model = model_source.cond_pair_sample(X2, seed=base_seed + 13)

            observed_pairs = torch.cat([X1, Y1], dim=1).numpy()
            model_pairs = torch.cat([X2, Y2_model], dim=1).numpy()

            # two-sample test between the observed and the model pairs
            results = two_sample_test.perform_test(
                fdata.TSTData(observed_pairs, model_pairs))

        results['time_secs'] = timer.secs
        return results
示例#6
0
    def compute_stat(self, dat):
        """
        Compute the statistic of self.mmdtest between a sample drawn from
        the model self.p and the observed data dat.

        The model sample has as many points as dat and is drawn with
        self.seed.
        """
        source = self.p.get_datasource()
        model_sample = source.sample(dat.sample_size(), seed=self.seed)

        # model sample first, observed data second
        paired = fdata.TSTData(model_sample.data(), dat.data())
        return self.mmdtest.compute_stat(paired)
def mmd(p, q, alpha=0.05):
    """
    Linear-time MMD two-sample test between samples p and q.

    Half of the data is used to pick a Gaussian kernel by grid search over
    multiples of the median distance; the other half is used for the test.

    p, q: numpy arrays; 1-d inputs are turned into single-column 2-d arrays
    alpha: significance level

    Return (test statistic, p-value)
    """
    if p.ndim == 1:
        p = p[:, np.newaxis]
    if q.ndim == 1:
        q = q[:, np.newaxis]

    train_part, test_part = data.TSTData(p, q).split_tr_te(tr_proportion=0.5)

    # candidate Gaussian kernels: squared (median distance * factor)
    median_dist = util.meddistance(train_part.stack_xy())
    factors = 2.0**np.linspace(-1, 4, 20)
    candidates = [kernel.KGauss((median_dist * f)**2) for f in factors]

    best_index, _ = tst.LinearMMDTest.grid_search_kernel(
        train_part, candidates, alpha)
    chosen_test = tst.LinearMMDTest(candidates[best_index], alpha)
    outcome = chosen_test.perform_test(test_part)
    return outcome['test_stat'], outcome['pvalue']
示例#8
0
def TST_SCF(Fea, N1, alpha, is_train, test_freqs, gwidth, J=1, seed=15):
    """
    Run the SCF test on features whose first N1 rows form the first sample.

    When is_train is true, the test frequencies and Gaussian width are
    optimized and returned as (test_freqs, gwidth). Otherwise the test is
    run with the given test_freqs and gwidth, and 1 is returned if H0 is
    rejected, 0 if not.
    """
    Fea = get_item(Fea, is_cuda)
    first_sample, second_sample = Fea[0:N1, :], Fea[N1:, :]
    tst_data = data.TSTData(first_sample, second_sample)

    if not is_train:
        scf_test = tst.SmoothCFTest(test_freqs, gwidth, alpha=alpha)
        outcome = scf_test.perform_test(tst_data)
        return 1 if outcome['h0_rejected'] else 0

    # options forwarded to the optimizer of the frequencies and width
    train_options = {
        'n_test_freqs': J,
        'seed': seed,
        'max_iter': 300,
        'batch_proportion': 1.0,
        'freqs_step_size': 0.1,
        'gwidth_step_size': 0.01,
        'tol_fun': 1e-4,
    }
    test_freqs, gwidth, _ = tst.SmoothCFTest.optimize_freqs_width(
        tst_data, alpha, **train_options)
    return test_freqs, gwidth
示例#9
0
def load_nips_TSTData(fname):
    """
    Load a pickled pair of samples and wrap it in a TSTData.

    The pickled object must be indexable by 'P' and 'Q', each giving a 2-d
    array. The longer one is truncated so both have the same number of rows.
    Results are memoized in cache_loaded, keyed by fname.

    fname: file name, resolved to a path with glo.data_file

    Return (tst_data, n_min) where n_min is the common number of rows.
    """
    if fname in cache_loaded:
        return cache_loaded[fname]

    fpath = glo.data_file(fname)
    # Pickle data is binary: the file must be opened in 'rb' mode. Text mode
    # makes pickle.load fail on Python 3 and may corrupt data on Windows.
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # from a trusted source.
    with open(fpath, 'rb') as f:
        loaded = pickle.load(f)

    X = loaded['P']
    Y = loaded['Q']
    # keep the same number of rows in both samples
    n_min = min(X.shape[0], Y.shape[0])
    X = X[:n_min, :]
    Y = Y[:n_min, :]
    assert (X.shape[0] == Y.shape[0])
    tst_data = data.TSTData(X, Y)
    cache_loaded[fname] = (tst_data, n_min)
    return tst_data, n_min
示例#10
0
    def compute_stat(self, X, Y):
        """
        Compute the statistic of self.mmdtest between the observed pairs
        (X, Y) and model pairs (X, Y_), where Y_ is drawn from the model
        given X via ds_p.cond_pair_sample.

        X: Torch tensor of size n x dx
        Y: Torch tensor of size n x dy

        Return a test statistic
        """
        import freqopttest.data as fdata

        # model draws of Y conditioned on the same X
        Y_model = self.ds_p.cond_pair_sample(X, seed=self.seed + 13)

        observed_pairs = torch.cat([X, Y], dim=1).numpy()
        model_pairs = torch.cat([X, Y_model], dim=1).numpy()

        # observed pairs first, model pairs second
        paired = fdata.TSTData(observed_pairs, model_pairs)
        return self.mmdtest.compute_stat(paired)
示例#11
0
    def perform_test(self, dat):
        """
        Run self.mmdtest between a sample drawn from the model self.p and
        the observed data.

        dat: an instance of Data

        Return the results dictionary of the test, with the elapsed time
        stored under 'time_secs'.
        """
        with util.ContextTimer() as timer:
            two_sample_test = self.mmdtest

            # model sample with as many points as dat
            source = self.p.get_datasource()
            model_sample = source.sample(dat.sample_size(),
                                         seed=self.seed + 12)

            # model sample first, observed data second
            paired = fdata.TSTData(model_sample.data(), dat.data())
            results = two_sample_test.perform_test(paired)

        results['time_secs'] = timer.secs
        return results
def wtest(p, q, alpha=0.05):
    """
    Mean embedding two-sample test between samples p and q with optimized
    test locations and Gaussian width.

    Half of the data is used to optimize the test parameters and the other
    half to run the test.

    p, q: numpy arrays; 1-d inputs are turned into single-column 2-d arrays
    alpha: significance level

    Return (test statistic, p-value). Both are NaN when the test reports a
    statistic equal to -1.
    """
    if p.ndim == 1:
        p = p[:, np.newaxis]
    if q.ndim == 1:
        q = q[:, np.newaxis]

    # options forwarded to the optimizer
    optimizer_options = {
        'n_test_locs': 2,
        'seed': 0,
        'max_iter': 200,
        'batch_proportion': 1.0,
        'locs_step_size': 1.0,
        'gwidth_step_size': 0.1,
        'tol_fun': 1e-4,
    }

    train_part, test_part = data.TSTData(p, q).split_tr_te(tr_proportion=0.5)
    locs, width, _ = tst.MeanEmbeddingTest.optimize_locs_width(
        train_part, alpha, **optimizer_options)

    outcome = tst.MeanEmbeddingTest(locs, width, alpha).perform_test(test_part)
    if outcome['test_stat'] == -1:
        # a statistic of -1 is mapped to NaN for both outputs
        outcome['test_stat'] = np.nan
        outcome['pvalue'] = np.nan
    return outcome['test_stat'], outcome['pvalue']
示例#13
0
    def compute_stat(self, X, Y):
        """
        Compute the statistic of self.mmdtest on a split of the paired
        sample (X, Y).

        The pairs are split with MMDSplitTest._split_half. The first part
        (X1, Y1) is used as is; for the second part only X2 is kept and its
        Y values are drawn from the model via ds_p.cond_pair_sample.

        X: Torch tensor of size n x dx
        Y: Torch tensor of size n x dy

        Return a test statistic
        """
        import freqopttest.data as fdata

        base_seed = self.seed
        model_source = self.ds_p
        two_sample_test = self.mmdtest

        # the Y values of the second part are replaced by model draws
        X1, Y1, X2, _ = MMDSplitTest._split_half(X, Y, seed=self.seed + 330)
        Y2_model = model_source.cond_pair_sample(X2, seed=base_seed + 13)

        observed_pairs = torch.cat([X1, Y1], dim=1).numpy()
        model_pairs = torch.cat([X2, Y2_model], dim=1).numpy()

        # observed pairs first, model pairs second
        paired = fdata.TSTData(observed_pairs, model_pairs)
        return two_sample_test.compute_stat(paired)
示例#14
0
 def preprocess(self, X, Y):
     """
     Wrap the two samples X and Y in a TSTData.

     When X has more than two dimensions, both X and Y are first reshaped
     to 2-d, keeping the first dimension and flattening the rest.
     """
     needs_flattening = len(X.shape) > 2
     if needs_flattening:
         X, Y = X.reshape(len(X), -1), Y.reshape(len(Y), -1)
     return fot_data.TSTData(X, Y)
示例#15
0
    def optimize_2sets_locs_widths(datap,
                                   dataq,
                                   datar,
                                   V0,
                                   W0,
                                   gwidth0p,
                                   gwidth0q,
                                   reg=1e-3,
                                   max_iter=100,
                                   tol_fun=1e-6,
                                   disp=False,
                                   locs_bounds_frac=100,
                                   gwidth_lb=None,
                                   gwidth_ub=None):
        """
        Optimize two sets of test locations together with their Gaussian
        kernel widths, each by maximizing the test power criterion of the
        UME two-sample test (not the three-sample test):
            1. The set V of test locations for UME(P, R) is optimized with
            the two-sample power criterion.
            2. The set W for UME(Q, R) is optimized in the same way.

        The two problems are solved independently, one after the other, with
        identical settings; they only share the data from R. This
        optimization function is deterministic.

        - datap: a kgof.data.Data from P (model 1)
        - dataq: a kgof.data.Data from Q (model 2)
        - datar: a kgof.data.Data from R (data generating distribution)
        - V0: Jpxd numpy array. Initial V.
        - W0: Jqxd numpy array. Initial W.
        - gwidth0p: initial value of the Gaussian width^2 for UME(P, R)
        - gwidth0q: initial value of the Gaussian width^2 for UME(Q, R)
        - reg: reg to add to the mean/sqrt(variance) criterion to become
            mean/sqrt(variance + reg)
        - max_iter: #gradient descent iterations
        - tol_fun: termination tolerance of the objective value
        - disp: True to print convergence messages
        - locs_bounds_frac: When making box bounds for the test_locs, extend
              the box defined by coordinate-wise min-max by std of each
              coordinate (of the aggregated data) multiplied by this number.
        - gwidth_lb: absolute lower bound on both the Gaussian width^2
        - gwidth_ub: absolute upper bound on both the Gaussian width^2

        If the lb, ub bounds are None, use fraction of the median heuristics
            to automatically set the bounds.

        Return (
            (V test_locs, gaussian width^2 for UME(P, R), optimization info log),
            (W test_locs, gaussian width^2 for UME(Q, R), optimization info log),
                )
        """
        # both two-sample datasets share the sample from R
        shared_r = datar.data()
        datapr = tstdata.TSTData(datap.data(), shared_r)
        dataqr = tstdata.TSTData(dataq.data(), shared_r)

        # settings common to both optimization problems
        shared_settings = {
            'reg': reg,
            'max_iter': max_iter,
            'tol_fun': tol_fun,
            'disp': disp,
            'locs_bounds_frac': locs_bounds_frac,
            'gwidth_lb': gwidth_lb,
            'gwidth_ub': gwidth_ub,
        }
        optimize = tst.GaussUMETest.optimize_locs_width

        # optimization for UME(P,R)
        V_opt, gw2p_opt, opt_infop = optimize(
            datapr, V0, gwidth0p, **shared_settings)

        # optimization for UME(Q,R)
        W_opt, gw2q_opt, opt_infoq = optimize(
            dataqr, W0, gwidth0q, **shared_settings)

        result_p = (V_opt, gw2p_opt, opt_infop)
        result_q = (W_opt, gw2q_opt, opt_infoq)
        return (result_p, result_q)