예제 #1
0
    def ume_test(X, Y, Z, V, alpha=0.01, mode='mean'):
        """
        Run a UME three-sample test. All data are assumed to be
        preprocessed.

        Args:
            - X: n x d ndarray, a sample from P
            - Y: n x d ndarray, a sample from Q
            - Z: n x d ndarray, a sample from R
            - V: J x d ndarray, a set of J test locations
            - alpha: a user specified significance level
            - mode: 'mean' to pick the Gaussian width with the
                Bounliphone et al. median heuristic; any other value uses
                the plain median heuristic on the pooled sample.

        Returns:
            - a dictionary of the form
                {
                    alpha: 0.01,
                    pvalue: 0.0002,
                    test_stat: 2.3,
                    h0_rejected: True,
                    time_secs: ...
                }
        """
        # Choose the squared Gaussian width with one of two heuristics.
        if mode == 'mean':
            gwidth = SC_MMD.median_heuristic_bounliphone(
                X, Y, Z, subsample=1000)
        else:
            pooled = np.vstack((X, Y, Z))
            gwidth = util.meddistance(pooled, subsample=1000)**2
        # One Gaussian kernel and one shared location set for both UME
        # statistics.
        k = kernel.KGauss(gwidth)
        scume = SC_UME(data.Data(X), data.Data(Y), k, k, V, V, alpha)
        return scume.perform_test(data.Data(Z))
예제 #2
0
 def obj(sqrt_gwidth, V):
     # The optimizer parameterizes the Gaussian width by its square root;
     # square it back before building the kernel.
     width2 = sqrt_gwidth**2
     ker = kernel.KGauss(width2)
     # Negate because the optimizer minimizes while we want to maximize
     # the power criterion (plus any extra objective term).
     total = DC_FSSD.power_criterion(
         p, q, datar, ker, ker, V, V, reg=reg)
     if added_obj is not None:
         total = total + added_obj(width2, V)
     return -total
예제 #3
0
    def test_basic(self):
        """
        Sanity check that SC_UME runs end-to-end under normal usage and
        rejects H0 when P fits R clearly worse than Q does.
        """
        mean_p, var_p = 4, 1
        # q cannot be the true model: that would violate our assumption
        # and the asymptotic null distribution would not hold.
        mean_q, var_q = 0.5, 1

        # draw some data
        n = 2999  # sample size
        seed = 89
        with util.NumpySeedContext(seed=seed):
            X = np.random.randn(n, 1)*var_p**0.5 + mean_p
            Y = np.random.randn(n, 1)*var_q**0.5 + mean_q
            Z = np.random.randn(n, 1)

            datap = data.Data(X)
            dataq = data.Data(Y)
            datar = data.Data(Z)

        # kernel bandwidths from the median heuristic
        med_xz = util.meddistance(np.vstack((X, Z)), subsample=1000)
        med_yz = util.meddistance(np.vstack((Y, Z)), subsample=1000)
        k = kernel.KGauss(sigma2=med_xz**2)
        l = kernel.KGauss(sigma2=med_yz**2)

        # two sets of J test locations each
        J = 3
        V = util.fit_gaussian_draw(X, J, seed=seed+2)
        W = util.fit_gaussian_draw(Y, J, seed=seed+3)

        # construct a UME test at significance level alpha
        alpha = 0.01
        scume = mct.SC_UME(datap, dataq, k, l, V, W, alpha=alpha)
        test_result = scume.perform_test(datar)

        # make sure it rejects
        assert test_result['h0_rejected']
예제 #4
0
    def __init__(self, p, q, gwidth2p, gwidth2q, V, W, alpha=0.01):
        """
        Gaussian-kernel FSSD-based model comparison test. Builds one
        Gaussian kernel per model and delegates to DC_FSSD.

        :param p: a kmod.density.UnnormalizedDensity (model 1)
        :param q: a kmod.density.UnnormalizedDensity (model 2)
        :param gwidth2p: squared Gaussian width for the kernel k in FSSD(p, k, V)
        :param gwidth2q: squared Gaussian width for the kernel l in FSSD(q, l, W)
        :param V: Jp x d numpy array of Jp test locations used in FSSD(p, k, V)
        :param W: Jq x d numpy array of Jq test locations used in FSSD(q, l, W)
        :param alpha: significance level of the test
        :raises ValueError: if either squared width is not a positive real
            number
        """

        # Validate both squared widths before constructing the kernels.
        if not util.is_real_num(gwidth2p) or gwidth2p <= 0:
            raise ValueError(
                'gwidth2p must be positive real. Was {}'.format(gwidth2p))
        if not util.is_real_num(gwidth2q) or gwidth2q <= 0:
            raise ValueError(
                'gwidth2q must be positive real. Was {}'.format(gwidth2q))

        k = kernel.KGauss(gwidth2p)
        l = kernel.KGauss(gwidth2q)
        super(DC_GaussFSSD, self).__init__(p, q, k, l, V, W, alpha)
예제 #5
0
    def __init__(self, datap, dataq, gwidth2p, gwidth2q, V, W, alpha=0.01):
        """
        Gaussian-kernel specialization of the UME three-sample test: one
        Gaussian kernel per model, then delegate to the superclass.

        :param datap: a kmod.data.Data object representing an i.i.d. sample X
            (from model 1)
        :param dataq: a kmod.data.Data object representing an i.i.d. sample Y
            (from model 2)
        :param gwidth2p: squared Gaussian width for UME(P, R)
        :param gwidth2q: squared Gaussian width for UME(Q, R)
        :param V: Jp x d numpy array of Jp test locations used in UME(p, r)
        :param W: Jq x d numpy array of Jq test locations used in UME(q, r)
        :param alpha: significance level of the test
        :raises ValueError: if either squared width is not a positive real
            number
        """
        # Reject non-positive or non-numeric squared widths up front.
        if not util.is_real_num(gwidth2p) or gwidth2p <= 0:
            raise ValueError(
                'gwidth2p must be positive real. Was {}'.format(gwidth2p))
        if not util.is_real_num(gwidth2q) or gwidth2q <= 0:
            raise ValueError(
                'gwidth2q must be positive real. Was {}'.format(gwidth2q))

        super(SC_GaussUME, self).__init__(
            datap, dataq,
            kernel.KGauss(gwidth2p), kernel.KGauss(gwidth2q),
            V, W, alpha)
예제 #6
0
def met_gmmd_med(P, Q, data_source, n, r):
    """
    Use met_gmmd_med_bounliphone(). It uses the median heuristic following
    Bounliphone et al., 2016.

    Bounliphone et al., 2016's MMD-based 3-sample test.
    * Gaussian kernel. 
    * Gaussian width = mean of (median heuristic on (X, Z), median heuristic on
        (Y, Z))
    * Use full sample for testing (no
    holding out for optimization)

    :param P: model 1; must provide has_datasource()/get_datasource()
    :param Q: model 2; must provide has_datasource()/get_datasource()
    :param data_source: source of the reference sample Z
    :param n: sample size drawn via sample_pqr
    :param r: trial index, forwarded to sample_pqr
    :return: {'test_result': ..., 'time_secs': ...}, or {} if either model
        has no data source.
    """
    if not P.has_datasource() or not Q.has_datasource():
        # Not applicable. Return {}.
        return {}

    ds_p = P.get_datasource()
    ds_q = Q.get_datasource()
    # sample some data
    datp, datq, datr = sample_pqr(ds_p,
                                  ds_q,
                                  data_source,
                                  n,
                                  r,
                                  only_from_r=False)

    # Start the timer here
    with util.ContextTimer() as t:
        X, Y, Z = datp.data(), datq.data(), datr.data()

        # hyperparameters of the test: average the two pairwise median
        # heuristics and square to get the Gaussian width
        medxz = util.meddistance(np.vstack((X, Z)), subsample=1000)
        medyz = util.meddistance(np.vstack((Y, Z)), subsample=1000)
        medxyz = np.mean([medxz, medyz])
        k = kernel.KGauss(sigma2=medxyz**2)

        # NOTE(review): alpha is not defined in this function; presumably
        # a module-level constant — confirm.
        scmmd = mct.SC_MMD(datp, datq, k, alpha=alpha)
        scmmd_result = scmmd.perform_test(datr)

    return {
        # This key "test" can be removed.
        #'test': scmmd,
        'test_result': scmmd_result,
        'time_secs': t.secs
    }
예제 #7
0
def met_gmmd_med_bounliphone(P, Q, data_source, n, r):
    """
    Bounliphone et al., 2016's MMD-based 3-sample test.
    * Gaussian kernel. 
    * Gaussian width = chosen as described in https://github.com/wbounliphone/relative_similarity_test/blob/4884786aa3fe0f41b3ee76c9587de535a6294aee/relativeSimilarityTest_finalversion.m 
    * Use full sample for testing (no
    holding out for optimization)

    :param P: model 1; must provide has_datasource()/get_datasource()
    :param Q: model 2; must provide has_datasource()/get_datasource()
    :param data_source: source of the reference sample Z
    :param n: sample size drawn via sample_pqr
    :param r: trial index; also seeds the width heuristic (r + 3)
    :return: {'test_result': ..., 'time_secs': ...}, or {} if either model
        has no data source.
    """
    if not P.has_datasource() or not Q.has_datasource():
        # Not applicable. Return {}.
        return {}

    ds_p = P.get_datasource()
    ds_q = Q.get_datasource()
    # sample some data
    datp, datq, datr = sample_pqr(ds_p,
                                  ds_q,
                                  data_source,
                                  n,
                                  r,
                                  only_from_r=False)

    # Start the timer here
    with util.ContextTimer() as t:
        X, Y, Z = datp.data(), datq.data(), datr.data()

        # med2 is already a squared width (Bounliphone-style heuristic),
        # so it is passed to KGauss without squaring again.
        med2 = mct.SC_MMD.median_heuristic_bounliphone(X,
                                                       Y,
                                                       Z,
                                                       subsample=1000,
                                                       seed=r + 3)
        k = kernel.KGauss(sigma2=med2)

        # NOTE(review): alpha is not defined in this function; presumably
        # a module-level constant — confirm.
        scmmd = mct.SC_MMD(datp, datq, k, alpha=alpha)
        scmmd_result = scmmd.perform_test(datr)

    return {
        # This key "test" can be removed.
        # 'test': scmmd,
        'test_result': scmmd_result,
        'time_secs': t.secs
    }
예제 #8
0
    def test_basic(self):
        """
        Basic smoke test for DC_FSSD: the p-value is a valid probability
        and the test statistic equals sqrt(n) times the estimated H1 mean.
        """
        seed = 13
        # sample
        n = 103
        alpha = 0.01
        for d in [1, 4]:
            mean = np.zeros(d)
            variance = 1
            p = density.IsotropicNormal(mean, variance)
            q = density.IsotropicNormal(mean, variance + 3)

            # only one dimension of the mean is shifted
            #draw_mean = mean + np.hstack((1, np.zeros(d-1)))
            draw_mean = mean + 0
            draw_variance = variance + 1
            X = util.randn(n, d, seed=seed)*np.sqrt(draw_variance) + draw_mean
            dat = data.Data(X)

            # Test
            for J in [1, 3]:
                sig2 = util.meddistance(X, subsample=1000)**2
                k = kernel.KGauss(sig2)

                # random test locations
                V = util.fit_gaussian_draw(X, J, seed=seed+1)
                W = util.fit_gaussian_draw(X, J, seed=seed+8)

                # Use the local alpha instead of re-hardcoding 0.01, so a
                # single edit changes the significance level everywhere.
                mcfssd = mct.DC_FSSD(p, q, k, k, V, W, alpha=alpha)
                s = mcfssd.compute_stat(dat)
                s2, var = mcfssd.get_H1_mean_variance(dat)

                tresult = mcfssd.perform_test(dat)

                # assertions
                self.assertGreaterEqual(tresult['pvalue'], 0)
                self.assertLessEqual(tresult['pvalue'], 1)
                testing.assert_approx_equal(s, (n**0.5)*s2)
예제 #9
0
def met_gumeJ1_2V_rand(P, Q, data_source, n, r, J=1, use_1set_locs=False):
    """
    UME-based three-sample test. 
        * Use J=1 test location by default. 
        * Use two sets (2V) of test locations by default: V and W, each having J
            locations.  Will constrain V=W if use_1set_locs=True.
        * The first J points of each sample are removed before testing.
          NOTE(review): despite the original description, the test
          locations used below are drawn as pure Gaussian noise
          (np.random.randn * 2), not subsampled from the data — see the
          in-body note and the commented-out code.
        * Gaussian kernels for the two UME statistics. Median heuristic is used
            to select each width.

    :return: {'test_result': ..., 'time_secs': ...}, or {} if either model
        has no data source.
    """
    if not P.has_datasource() or not Q.has_datasource():
        # Not applicable. Return {}.
        return {}
    assert J >= 1

    ds_p = P.get_datasource()
    ds_q = Q.get_datasource()
    # sample some data
    datp, datq, datr = sample_pqr(ds_p,
                                  ds_q,
                                  data_source,
                                  n,
                                  r,
                                  only_from_r=False)

    # Start the timer here
    with util.ContextTimer() as t:

        # remove the first J points from each set
        X, Y, Z = datp.data(), datq.data(), datr.data()

        # containing 3*J points
        pool3J = np.vstack((X[:J, :], Y[:J, :], Z[:J, :]))
        X, Y, Z = (X[J:, :], Y[J:, :], Z[J:, :])

        # each remaining sample has n - J points
        datp, datq, datr = [data.Data(a) for a in [X, Y, Z]]
        assert X.shape[0] == Y.shape[0]
        assert Y.shape[0] == Z.shape[0]
        assert Z.shape[0] == n - J
        assert datp.sample_size() == n - J
        assert datq.sample_size() == n - J
        assert datr.sample_size() == n - J

        #XYZ = np.vstack((X, Y, Z))
        #stds = np.std(util.subsample_rows(XYZ, min(n-3*J, 500),
        #    seed=r+87), axis=0)
        d = X.shape[1]
        # add a little noise to the locations.
        # NOTE(review): this assignment discards the data-based pool3J
        # entirely and replaces it with N(0, 2^2) noise; the commented-out
        # line suggests it was once data plus noise — confirm this is
        # intentional.
        with util.NumpySeedContext(seed=r * 191):
            #pool3J = pool3J + np.random.randn(3*J, d)*np.max(stds)*3
            pool3J = np.random.randn(3 * J, d) * 2

        # median heuristic to set the Gaussian widths
        medxz = util.meddistance(np.vstack((X, Z)), subsample=1000)
        medyz = util.meddistance(np.vstack((Z, Y)), subsample=1000)
        if use_1set_locs:
            # randomly select J points from the pool3J for the J test locations
            #V = util.subsample_rows(pool3J, J, r)
            V = pool3J[:J, :]
            W = V
            # one shared kernel: width from the mean of the two medians
            k = kernel.KGauss(sigma2=np.mean([medxz, medyz])**2)
            l = k
        else:
            # use two sets of locations: V and W
            #VW = util.subsample_rows(pool3J, 2*J, r)
            VW = pool3J[:2 * J, :]
            V = VW[:J, :]
            W = VW[J:, :]

            # 2 Gaussian kernels
            k = kernel.KGauss(sigma2=medxz**2)
            l = kernel.KGauss(sigma2=medyz**2)

        # construct the test
        # NOTE(review): alpha is not defined in this function; presumably
        # a module-level constant — confirm.
        scume = mct.SC_UME(datp, datq, k, l, V, W, alpha=alpha)
        scume_rand_result = scume.perform_test(datr)

    return {
        # This key "test" can be removed. Storing V, W can take quite a lot
        # of space, especially when the input dimension d is high.
        #'test':scume,
        'test_result': scume_rand_result,
        'time_secs': t.secs
    }
예제 #10
0
def met_gumeJ1_3sopt_tr50(P, Q, data_source, n, r, J=1, tr_proportion=0.5):
    """
    UME-based three-sample test
        * Use J=1 test location by default (in the set V=W). 
        * 3sopt = optimize the test locations by maximizing the 3-sample test's
        power criterion. There is only one set of test locations.
        * One Gaussian kernel for the two UME statistics. Optimize the Gaussian width

    :param tr_proportion: fraction of each sample used for the
        optimization (training) split; the rest is used for testing
    :return: {'test_result': ..., 'time_secs': ...}, or {} if either model
        has no data source.
    """
    if not P.has_datasource() or not Q.has_datasource():
        # Not applicable. Return {}.
        return {}
    assert J >= 1

    ds_p = P.get_datasource()
    ds_q = Q.get_datasource()
    # sample some data
    datp, datq, datr = sample_pqr(ds_p,
                                  ds_q,
                                  data_source,
                                  n,
                                  r,
                                  only_from_r=False)

    # Start the timer here
    with util.ContextTimer() as t:
        # split the data into training/test sets
        [(datptr, datpte), (datqtr, datqte), (datrtr, datrte)] = \
            [D.split_tr_te(tr_proportion=tr_proportion, seed=r) for D in [datp, datq, datr]]
        Xtr, Ytr, Ztr = [D.data() for D in [datptr, datqtr, datrtr]]
        Xyztr = np.vstack((Xtr, Ytr, Ztr))
        # initialize optimization parameters.
        # Initialize the Gaussian widths with the median heuristic
        medxz = util.meddistance(np.vstack((Xtr, Ztr)), subsample=1000)
        medyz = util.meddistance(np.vstack((Ztr, Ytr)), subsample=1000)
        gwidth0 = np.mean([medxz, medyz])**2

        # pick a subset of points in the training set for V, W
        V0 = util.subsample_rows(Xyztr, J, seed=r + 2)

        # optimization options
        opt_options = {
            'max_iter': 100,
            'reg': 1e-6,
            'tol_fun': 1e-7,
            'locs_bounds_frac': 50,
            'gwidth_lb': 0.1,
            'gwidth_ub': 6**2,
        }
        # jointly optimize the shared locations and the Gaussian width on
        # the training split only
        V_opt, gw2_opt, opt_result = mct.SC_GaussUME.optimize_3sample_criterion(
            datptr, datqtr, datrtr, V0, gwidth0, **opt_options)
        k_opt = kernel.KGauss(gw2_opt)

        # construct a UME test on the held-out test split
        # NOTE(review): alpha is not defined in this function; presumably
        # a module-level constant — confirm.
        scume_opt3 = mct.SC_UME(datpte,
                                datqte,
                                k_opt,
                                k_opt,
                                V_opt,
                                V_opt,
                                alpha=alpha)
        scume_opt3_result = scume_opt3.perform_test(datrte)

    return {
        # This key "test" can be removed. Storing V, W can take quite a lot
        # of space, especially when the input dimension d is high.
        #'test':scume,
        'test_result': scume_opt3_result,
        'time_secs': t.secs
    }
예제 #11
0
def met_gumeJ1_2sopt_tr50(P, Q, data_source, n, r, J=1, tr_proportion=0.5):
    """
    UME-based three-sample test
        * Use J=1 test location by default. 
        * 2sopt = optimize the two sets of test locations by maximizing the
            2-sample test's power criterion. Each set is optmized separately.
        * Gaussian kernels for the two UME statistics. The Gaussian widths are
        also optimized separately.

    :param tr_proportion: fraction of each sample used for the
        optimization (training) split; the rest is used for testing
    :return: {'test_result': ..., 'time_secs': ...}, or {} if either model
        has no data source.
    """
    if not P.has_datasource() or not Q.has_datasource():
        # Not applicable. Return {}.
        return {}
    assert J >= 1

    ds_p = P.get_datasource()
    ds_q = Q.get_datasource()
    # sample some data
    datp, datq, datr = sample_pqr(ds_p,
                                  ds_q,
                                  data_source,
                                  n,
                                  r,
                                  only_from_r=False)

    # Start the timer here
    with util.ContextTimer() as t:
        # split the data into training/test sets
        [(datptr, datpte), (datqtr, datqte), (datrtr, datrte)] = \
            [D.split_tr_te(tr_proportion=tr_proportion, seed=r) for D in [datp, datq, datr]]
        Xtr, Ytr, Ztr = [D.data() for D in [datptr, datqtr, datrtr]]

        # initialize optimization parameters.
        # Initialize the Gaussian widths with the median heuristic
        medxz = util.meddistance(np.vstack((Xtr, Ztr)), subsample=1000)
        medyz = util.meddistance(np.vstack((Ytr, Ztr)), subsample=1000)
        gwidth0p = medxz**2
        gwidth0q = medyz**2

        # numbers of test locations in V, W
        Jp = J
        Jq = J

        # pick a subset of points in the training set for V, W
        Xyztr = np.vstack((Xtr, Ytr, Ztr))
        VW = util.subsample_rows(Xyztr, Jp + Jq, seed=r + 1)
        V0 = VW[:Jp, :]
        W0 = VW[Jp:, :]

        # optimization options
        opt_options = {
            'max_iter': 100,
            'reg': 1e-4,
            'tol_fun': 1e-6,
            'locs_bounds_frac': 50,
            'gwidth_lb': 0.1,
            'gwidth_ub': 10**2,
        }

        # optimize each (locations, width) pair separately on the
        # training split
        umep_params, umeq_params = mct.SC_GaussUME.optimize_2sets_locs_widths(
            datptr, datqtr, datrtr, V0, W0, gwidth0p, gwidth0q, **opt_options)
        (V_opt, gw2p_opt, opt_infop) = umep_params
        (W_opt, gw2q_opt, opt_infoq) = umeq_params
        k_opt = kernel.KGauss(gw2p_opt)
        l_opt = kernel.KGauss(gw2q_opt)

        # construct a UME test on the held-out test split
        # NOTE(review): alpha is not defined in this function; presumably
        # a module-level constant — confirm.
        scume_opt2 = mct.SC_UME(datpte,
                                datqte,
                                k_opt,
                                l_opt,
                                V_opt,
                                W_opt,
                                alpha=alpha)
        scume_opt2_result = scume_opt2.perform_test(datrte)

    return {
        # This key "test" can be removed. Storing V, W can take quite a lot
        # of space, especially when the input dimension d is high.
        #'test':scume,
        'test_result': scume_opt2_result,
        'time_secs': t.secs
    }
예제 #12
0
 def obj_feat_space(sqrt_gwidth, V):
     # The optimizer works with the square root of the Gaussian width;
     # square it to recover the actual width. Negate the power criterion
     # because the optimizer minimizes.
     ker = kernel.KGauss(sqrt_gwidth**2)
     crit = SC_UME.power_criterion(
         datap, dataq, datar, ker, ker, V, V, reg=reg)
     return -crit
예제 #13
0
    def test_optimize_2sets_locs_widths(self):
        """
        Smoke test: optimize_2sets_locs_widths runs on a training split
        and the optimized (locations, widths) can be used to build and run
        an SC_UME test on the held-out split.
        """
        mp, varp = 2, 1
        # q cannot be the true model. 
        # That violates our assumption and the asymptotic null distribution
        # does not hold.
        mq, varq = 1, 1

        # draw some data
        n = 800 # sample size
        seed = 6
        with util.NumpySeedContext(seed=seed):
            X = np.random.randn(n, 1)*varp**0.5 + mp
            Y = np.random.randn(n, 1)*varq**0.5 + mq
            Z = np.random.randn(n, 1)
            
            datap = data.Data(X)
            dataq = data.Data(Y)
            datar = data.Data(Z)

        # split the data into training/test sets
        [(datptr, datpte), (datqtr, datqte), (datrtr, datrte)] = \
            [D.split_tr_te(tr_proportion=0.3, seed=85) for D in [datap, dataq, datar]]
        Xtr, Ytr, Ztr = [D.data() for D in [datptr, datqtr, datrtr]]

        # initialize optimization parameters.
        # Initialize the Gaussian widths with the median heuristic
        medxz = util.meddistance(np.vstack((Xtr, Ztr)), subsample=1000)
        medyz = util.meddistance(np.vstack((Ytr, Ztr)), subsample=1000)
        gwidth0p = medxz**2
        gwidth0q = medyz**2

        # numbers of test locations in V, W
        J = 2
        Jp = J
        Jq = J

        # pick a subset of points in the training set for V, W
        Xyztr = np.vstack((Xtr, Ytr, Ztr))
        VW = util.subsample_rows(Xyztr, Jp+Jq, seed=73)
        V0 = VW[:Jp, :]
        W0 = VW[Jp:, :]

        # optimization options (no bounds on the Gaussian widths here)
        opt_options = {
            'max_iter': 100,
            'reg': 1e-4,
            'tol_fun': 1e-6,
            'locs_bounds_frac': 100,
            'gwidth_lb': None,
            'gwidth_ub': None,
        }

        # optimize each (locations, width) pair separately
        umep_params, umeq_params = mct.SC_GaussUME.optimize_2sets_locs_widths(
            datptr, datqtr, datrtr, V0, W0, gwidth0p, gwidth0q, 
            **opt_options)
        (V_opt, gw2p_opt, opt_infop) = umep_params
        (W_opt, gw2q_opt, opt_infoq) = umeq_params
        k_opt = kernel.KGauss(gw2p_opt)
        l_opt = kernel.KGauss(gw2q_opt)
        # construct a UME test on the held-out split; only checks that it
        # runs without error (no assertion on the outcome)
        alpha = 0.01 # significance level 
        scume_opt2 = mct.SC_UME(datpte, datqte, k_opt, l_opt, V_opt, W_opt, alpha=alpha)
        scume_opt2.perform_test(datrte)