示例#1
0
    def test_basic(self):
        """
        Smoke test for SC_UME: run the test under normal usage on a simple
        one-dimensional Gaussian problem and check that H0 is rejected.
        """
        # Samples X and Y are drawn from N(4, 1) and N(0.5, 1) respectively;
        # Z (the sample handed to perform_test) is standard normal.
        # q must not be the true model: that would violate the test's
        # assumption, and the asymptotic null distribution would not hold.
        mean_p, var_p = 4, 1
        mean_q, var_q = 0.5, 1

        # sample size and random seed
        n = 2999
        seed = 89
        with util.NumpySeedContext(seed=seed):
            # The draw order (X, then Y, then Z) fixes the generated data.
            X = np.random.randn(n, 1) * var_p**0.5 + mean_p
            Y = np.random.randn(n, 1) * var_q**0.5 + mean_q
            Z = np.random.randn(n, 1)

            datap, dataq, datar = data.Data(X), data.Data(Y), data.Data(Z)

        # One Gaussian kernel per statistic; each squared bandwidth comes from
        # the median heuristic on the corresponding stacked pair of samples.
        sigma2_xz = util.meddistance(np.vstack((X, Z)), subsample=1000)**2
        sigma2_yz = util.meddistance(np.vstack((Y, Z)), subsample=1000)**2
        kern_p = kernel.KGauss(sigma2=sigma2_xz)
        kern_q = kernel.KGauss(sigma2=sigma2_yz)

        # Two sets of test locations, the same number in each set.
        n_locs = 3
        locs_p = util.fit_gaussian_draw(X, n_locs, seed=seed + 2)
        locs_q = util.fit_gaussian_draw(Y, n_locs, seed=seed + 3)

        # Build the test at significance level 0.01 and run it.
        significance = 0.01
        tester = mct.SC_UME(datap, dataq, kern_p, kern_q, locs_p, locs_q,
                            alpha=significance)
        outcome = tester.perform_test(datar)

        # The test is expected to reject on this problem.
        assert outcome['h0_rejected']
示例#2
0
def met_gumeJ1_2V_rand(P, Q, data_source, n, r, J=1, use_1set_locs=False):
    """
    UME-based three-sample test with randomly drawn test locations.
        * Use J=1 test location by default.
        * Use two sets (2V) of test locations by default: V and W, each having J
            locations.  Will constrain V=W if use_1set_locs=True.
        * The test locations are NOT taken from the data. They are drawn from
            a zero-mean isotropic Gaussian with standard deviation 2, using a
            seed derived from r. The first J points of each sample are still
            held out, so the test itself runs on n - J points per sample.
        * Gaussian kernels for the two UME statistics. Median heuristic is used
            to select each width.

    Parameters:
        P, Q: model objects exposing has_datasource() and get_datasource().
        data_source: forwarded to sample_pqr as its third argument.
        n: number of points sampled per data set (checked by the asserts
            below).
        r: trial index; forwarded to sample_pqr and used to seed the draw of
            the test locations.
        J: number of test locations in each set. Must be at least 1.
        use_1set_locs: if True, use a single set of locations (V = W) and a
            single kernel whose width is the mean of the two median distances.

    Returns:
        An empty dict if P or Q has no data source. Otherwise a dict with
        'test_result' (the output of SC_UME.perform_test) and 'time_secs'
        (the time measured by util.ContextTimer).

    Note: the significance level `alpha` is not a parameter. It is read from
    the enclosing scope and must be defined there.
    """
    if not P.has_datasource() or not Q.has_datasource():
        # Not applicable. Return {}.
        return {}
    assert J >= 1

    ds_p = P.get_datasource()
    ds_q = Q.get_datasource()
    # sample some data
    datp, datq, datr = sample_pqr(ds_p,
                                  ds_q,
                                  data_source,
                                  n,
                                  r,
                                  only_from_r=False)

    # Start the timer here
    with util.ContextTimer() as t:

        # Hold out the first J points of each sample. They are not used as
        # test locations (see below); they are dropped only so that the test
        # is run on n - J points per sample, exactly as before.
        X, Y, Z = [D.data()[J:, :] for D in (datp, datq, datr)]

        datp, datq, datr = [data.Data(a) for a in [X, Y, Z]]
        assert X.shape[0] == Y.shape[0]
        assert Y.shape[0] == Z.shape[0]
        assert Z.shape[0] == n - J
        assert datp.sample_size() == n - J
        assert datq.sample_size() == n - J
        assert datr.sample_size() == n - J

        d = X.shape[1]
        # Draw a pool of 3*J candidate locations from N(0, 2^2 I). The pool
        # size is kept at 3*J (even though at most 2*J rows are used) so the
        # random draw is unchanged.
        with util.NumpySeedContext(seed=r * 191):
            pool3J = np.random.randn(3 * J, d) * 2

        # median heuristic to set the Gaussian widths
        medxz = util.meddistance(np.vstack((X, Z)), subsample=1000)
        medyz = util.meddistance(np.vstack((Z, Y)), subsample=1000)
        if use_1set_locs:
            # One shared set of locations: the first J rows of the pool.
            V = pool3J[:J, :]
            W = V
            # One shared kernel; its width averages the two median distances.
            k = kernel.KGauss(sigma2=np.mean([medxz, medyz])**2)
            l = k
        else:
            # Two sets of locations: rows [0, J) for V and rows [J, 2J) for W.
            V = pool3J[:J, :]
            W = pool3J[J:2 * J, :]

            # 2 Gaussian kernels
            k = kernel.KGauss(sigma2=medxz**2)
            l = kernel.KGauss(sigma2=medyz**2)

        # construct the test
        scume = mct.SC_UME(datp, datq, k, l, V, W, alpha=alpha)
        scume_rand_result = scume.perform_test(datr)

    return {
        # The test object itself is deliberately not stored. Keeping V, W can
        # take quite a lot of space, especially when the input dimension d is
        # high.
        'test_result': scume_rand_result,
        'time_secs': t.secs
    }
示例#3
0
def met_gumeJ1_3sopt_tr50(P, Q, data_source, n, r, J=1, tr_proportion=0.5):
    """
    UME-based three-sample test with optimized test locations.
        * J=1 test location by default; a single set is shared (V=W).
        * 3sopt = the locations are optimized by maximizing the power
          criterion of the 3-sample test.
        * A single Gaussian kernel serves both UME statistics; its width is
          optimized together with the locations.

    Each sample is split into a training part (a fraction tr_proportion,
    split with seed r) used for the optimization, and a test part on which
    the test is performed.

    Returns {} when P or Q has no data source. Otherwise returns a dict with
    'test_result' (output of SC_UME.perform_test) and 'time_secs' (time
    measured by util.ContextTimer). The significance level `alpha` is read
    from the enclosing scope.
    """
    if not (P.has_datasource() and Q.has_datasource()):
        # The method does not apply without data sources for both models.
        return {}
    assert J >= 1

    # Draw the three samples.
    datp, datq, datr = sample_pqr(P.get_datasource(),
                                  Q.get_datasource(),
                                  data_source,
                                  n,
                                  r,
                                  only_from_r=False)

    # Everything from here on is timed.
    with util.ContextTimer() as t:
        # Training/test split of each sample, all with the same seed.
        datptr, datpte = datp.split_tr_te(tr_proportion=tr_proportion, seed=r)
        datqtr, datqte = datq.split_tr_te(tr_proportion=tr_proportion, seed=r)
        datrtr, datrte = datr.split_tr_te(tr_proportion=tr_proportion, seed=r)

        Xtr = datptr.data()
        Ytr = datqtr.data()
        Ztr = datrtr.data()
        train_pool = np.vstack((Xtr, Ytr, Ztr))

        # Initial squared Gaussian width: the mean of the two median
        # distances, squared.
        med_xz = util.meddistance(np.vstack((Xtr, Ztr)), subsample=1000)
        med_yz = util.meddistance(np.vstack((Ztr, Ytr)), subsample=1000)
        init_width2 = np.mean([med_xz, med_yz])**2

        # Initial locations: J rows subsampled from the pooled training data.
        init_locs = util.subsample_rows(train_pool, J, seed=r + 2)

        # Optimize the locations and the width on the training data.
        V_opt, gw2_opt, _opt_info = mct.SC_GaussUME.optimize_3sample_criterion(
            datptr,
            datqtr,
            datrtr,
            init_locs,
            init_width2,
            max_iter=100,
            reg=1e-6,
            tol_fun=1e-7,
            locs_bounds_frac=50,
            gwidth_lb=0.1,
            gwidth_ub=6**2,
        )
        shared_kernel = kernel.KGauss(gw2_opt)

        # Run the test on the held-out parts with the shared kernel and
        # shared locations.
        tester = mct.SC_UME(datpte,
                            datqte,
                            shared_kernel,
                            shared_kernel,
                            V_opt,
                            V_opt,
                            alpha=alpha)
        outcome = tester.perform_test(datrte)

    # The test object is deliberately not stored: V, W can take a lot of
    # space, especially when the input dimension is high.
    return {'test_result': outcome, 'time_secs': t.secs}
示例#4
0
def met_gumeJ1_2sopt_tr50(P, Q, data_source, n, r, J=1, tr_proportion=0.5):
    """
    UME-based three-sample test with two separately optimized location sets.
        * J=1 test location per set by default.
        * 2sopt = the two sets of test locations (V and W) are optimized by
          mct.SC_GaussUME.optimize_2sets_locs_widths; per the original notes,
          each set maximizes the 2-sample test's power criterion separately.
        * Gaussian kernels for the two UME statistics; the two widths are
          optimized as well, one per statistic.

    Each sample is split into a training part (a fraction tr_proportion,
    split with seed r) used for the optimization, and a test part on which
    the test is performed.

    Returns {} when P or Q has no data source. Otherwise returns a dict with
    'test_result' (output of SC_UME.perform_test) and 'time_secs' (time
    measured by util.ContextTimer). The significance level `alpha` is read
    from the enclosing scope.
    """
    if not (P.has_datasource() and Q.has_datasource()):
        # The method does not apply without data sources for both models.
        return {}
    assert J >= 1

    # Draw the three samples.
    datp, datq, datr = sample_pqr(P.get_datasource(),
                                  Q.get_datasource(),
                                  data_source,
                                  n,
                                  r,
                                  only_from_r=False)

    # Everything from here on is timed.
    with util.ContextTimer() as t:
        # Training/test split of each sample, all with the same seed.
        datptr, datpte = datp.split_tr_te(tr_proportion=tr_proportion, seed=r)
        datqtr, datqte = datq.split_tr_te(tr_proportion=tr_proportion, seed=r)
        datrtr, datrte = datr.split_tr_te(tr_proportion=tr_proportion, seed=r)

        Xtr = datptr.data()
        Ytr = datqtr.data()
        Ztr = datrtr.data()

        # Initial squared widths from the median heuristic, one per kernel.
        init_width2_p = util.meddistance(np.vstack((Xtr, Ztr)),
                                         subsample=1000)**2
        init_width2_q = util.meddistance(np.vstack((Ytr, Ztr)),
                                         subsample=1000)**2

        # Initial locations: subsample 2*J rows of the pooled training data;
        # the first J rows start V, the remaining J rows start W.
        train_pool = np.vstack((Xtr, Ytr, Ztr))
        init_locs = util.subsample_rows(train_pool, J + J, seed=r + 1)
        init_V = init_locs[:J, :]
        init_W = init_locs[J:, :]

        # Settings handed to the optimizer.
        optimizer_settings = dict(
            max_iter=100,
            reg=1e-4,
            tol_fun=1e-6,
            locs_bounds_frac=50,
            gwidth_lb=0.1,
            gwidth_ub=10**2,
        )

        params_p, params_q = mct.SC_GaussUME.optimize_2sets_locs_widths(
            datptr, datqtr, datrtr, init_V, init_W, init_width2_p,
            init_width2_q, **optimizer_settings)
        # Each result is a triple: (locations, squared width, optimizer info).
        V_opt, gw2p_opt, _info_p = params_p
        W_opt, gw2q_opt, _info_q = params_q
        kern_p = kernel.KGauss(gw2p_opt)
        kern_q = kernel.KGauss(gw2q_opt)

        # Run the test on the held-out parts.
        tester = mct.SC_UME(datpte,
                            datqte,
                            kern_p,
                            kern_q,
                            V_opt,
                            W_opt,
                            alpha=alpha)
        outcome = tester.perform_test(datrte)

    # The test object is deliberately not stored: V, W can take a lot of
    # space, especially when the input dimension is high.
    return {'test_result': outcome, 'time_secs': t.secs}
示例#5
0
    def test_optimize_2sets_locs_widths(self):
        """
        Smoke test: optimize two sets of test locations and two Gaussian
        widths with SC_GaussUME.optimize_2sets_locs_widths on training data,
        then check that an SC_UME test built from the result runs on the
        held-out data. No assertion is made on the test outcome.
        """
        # Samples X and Y are drawn from N(2, 1) and N(1, 1) respectively;
        # Z (the sample handed to perform_test) is standard normal.
        # q must not be the true model: that would violate the test's
        # assumption, and the asymptotic null distribution would not hold.
        mean_p, var_p = 2, 1
        mean_q, var_q = 1, 1

        # sample size and random seed
        n = 800
        seed = 6
        with util.NumpySeedContext(seed=seed):
            # The draw order (X, then Y, then Z) fixes the generated data.
            X = np.random.randn(n, 1) * var_p**0.5 + mean_p
            Y = np.random.randn(n, 1) * var_q**0.5 + mean_q
            Z = np.random.randn(n, 1)

            datap, dataq, datar = data.Data(X), data.Data(Y), data.Data(Z)

        # 30% of each sample goes to training; the rest is kept for testing.
        datptr, datpte = datap.split_tr_te(tr_proportion=0.3, seed=85)
        datqtr, datqte = dataq.split_tr_te(tr_proportion=0.3, seed=85)
        datrtr, datrte = datar.split_tr_te(tr_proportion=0.3, seed=85)

        Xtr = datptr.data()
        Ytr = datqtr.data()
        Ztr = datrtr.data()

        # Initial squared widths from the median heuristic, one per kernel.
        init_width2_p = util.meddistance(np.vstack((Xtr, Ztr)),
                                         subsample=1000)**2
        init_width2_q = util.meddistance(np.vstack((Ytr, Ztr)),
                                         subsample=1000)**2

        # Initial locations: subsample 2*n_locs rows of the pooled training
        # data; the first half starts V and the second half starts W.
        n_locs = 2
        train_pool = np.vstack((Xtr, Ytr, Ztr))
        init_locs = util.subsample_rows(train_pool, n_locs + n_locs, seed=73)
        init_V = init_locs[:n_locs, :]
        init_W = init_locs[n_locs:, :]

        # Settings handed to the optimizer; the width bounds are left as None.
        optimizer_settings = dict(
            max_iter=100,
            reg=1e-4,
            tol_fun=1e-6,
            locs_bounds_frac=100,
            gwidth_lb=None,
            gwidth_ub=None,
        )

        params_p, params_q = mct.SC_GaussUME.optimize_2sets_locs_widths(
            datptr, datqtr, datrtr, init_V, init_W, init_width2_p,
            init_width2_q, **optimizer_settings)
        # Each result is a triple: (locations, squared width, optimizer info).
        V_opt, gw2p_opt, _info_p = params_p
        W_opt, gw2q_opt, _info_q = params_q
        kern_p = kernel.KGauss(gw2p_opt)
        kern_q = kernel.KGauss(gw2q_opt)

        # Build the test at significance level 0.01 and run it on the
        # held-out data.
        significance = 0.01
        tester = mct.SC_UME(datpte, datqte, kern_p, kern_q, V_opt, W_opt,
                            alpha=significance)
        tester.perform_test(datrte)