Пример #1
0
def job_lin_mmd(sample_source, tr, te, r):
    """Linear-time MMD test whose Gaussian width is picked by grid search.

    A grid of 40 candidate kernels is built around the median pairwise
    distance of the training data, the best one is selected on ``tr`` with
    ``LinearMMDTest.grid_search_kernel`` and the test is then run on ``te``.
    ``sample_source`` and ``r`` are accepted but not used here; the
    significance level ``alpha`` is resolved from the enclosing scope.

    Returns a dict with keys 'test_method' (the test object), 'test_result'
    (the output of ``perform_test``) and 'time_secs' (the time reported by
    the timer).
    """
    # Intended to be completely deterministic.
    with util.ContextTimer() as timer:
        xs, ys = tr.xy()
        # Only the first 1000 rows of each sample feed the median heuristic:
        # the pairwise median computation can run out of memory for large n.
        pooled = np.vstack((xs[:1000, :], ys[:1000, :]))
        median_dist = util.meddistance(pooled)

        # Candidate widths are median * 2**e for 40 exponents in [-1, 4];
        # KGauss receives the square of each width.
        scales = 2.0**np.linspace(-1, 4, 40)
        candidates = [kernel.KGauss((median_dist * s)**2) for s in scales]

        # Kernel selection on the training split.
        best_idx, _ = tst.LinearMMDTest.grid_search_kernel(
            tr, candidates, alpha)

        # Actual test on the held-out split.
        chosen_test = tst.LinearMMDTest(candidates[best_idx], alpha)
        outcome = chosen_test.perform_test(te)

    return {
        'test_method': chosen_test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
Пример #2
0
    def test_basic_H1(self):
        """
        Smoke test of UMETest on data where H1 holds (Gaussian mean shift).

        For every combination of dimension (1, 4) and number of test
        locations (1, 6) the p-value must be in [0, 0.1].
        """
        seed = 12
        n = 271
        alpha = 0.01
        for d in (1, 4):
            # H1 is true: the sample source is built with my=2.0.
            source = data.SSGaussMeanDiff(d=d, my=2.0)
            dat = source.sample(n, seed=seed)
            stacked = dat.stack_xy()

            # Median heuristic: squared median distance as kernel parameter.
            med = util.meddistance(stacked, subsample=1000)
            k = kernel.KGauss(med**2)

            for J in (1, 6):
                # Test locations drawn from a Gaussian fitted to the data.
                locs = util.fit_gaussian_draw(stacked, J, seed=seed + 1)
                ume = tst.UMETest(locs, k, n_simulate=2000, alpha=alpha)
                outcome = ume.perform_test(dat)

                # A valid p-value is non-negative ...
                self.assertGreaterEqual(outcome['pvalue'], 0.0)
                # ... and, because H1 is true, it should be small.
                self.assertLessEqual(outcome['pvalue'], 0.1)
Пример #3
0
    def test_basic_H1(self):
        """
        Basic check that UMETest yields a small, valid p-value under H1.

        Runs over dimensions 1 and 4 and over 1 and 6 random test locations.
        """
        seed = 12
        sample_size = 271
        level = 0.01
        dims = [1, 4]
        loc_counts = [1, 6]

        for d in dims:
            # Sample source with a mean difference (my=2.0), so H1 is true.
            dat = data.SSGaussMeanDiff(d=d, my=2.0).sample(sample_size,
                                                           seed=seed)
            xy = dat.stack_xy()

            # Gaussian kernel parameterised by the squared median distance.
            sigma2 = util.meddistance(xy, subsample=1000)**2
            gauss_kernel = kernel.KGauss(sigma2)

            for J in loc_counts:
                # Random test locations.
                V = util.fit_gaussian_draw(xy, J, seed=seed + 1)
                ume_test = tst.UMETest(V,
                                       gauss_kernel,
                                       n_simulate=2000,
                                       alpha=level)
                result = ume_test.perform_test(dat)

                self.assertGreaterEqual(result['pvalue'], 0.0)
                # H1 is true, so the test should reject with a small p-value.
                self.assertLessEqual(result['pvalue'], 0.1)
Пример #4
0
def job_quad_mmd_2U(sample_source, tr, te, r):
    """Quadratic-time MMD test with a grid-searched Gaussian width.

    The test is constructed with ``use_1sample_U=False`` (two-sample
    U-statistics for k(X, Y)). The kernel is chosen on ``tr`` from 40
    candidates and the test is performed on ``te`` with 1000 permutations.
    ``sample_source`` and ``r`` are unused; ``alpha`` is resolved from the
    enclosing scope.

    Returns a dict with keys 'test_method', 'test_result' and 'time_secs'.
    """
    with util.ContextTimer() as timer:
        # The median distance is computed with a subsample argument of 1000:
        # with large n the pairwise median can cause a memory error.
        median_dist = util.meddistance(tr.stack_xy(), 1000)

        # Squared widths: median**2 scaled by 2**e, e in [-4, 4], ascending.
        gwidth2_grid = np.sort(
            (median_dist**2) * (2.0**np.linspace(-4, 4, 40)))
        candidates = [kernel.KGauss(gw2) for gw2 in gwidth2_grid]

        # Kernel selection on the training split.
        best_idx, _ = tst.QuadMMDTest.grid_search_kernel(
            tr, candidates, alpha)

        # Test on the held-out split with the selected kernel.
        quad_test = tst.QuadMMDTest(candidates[best_idx],
                                    n_permute=1000,
                                    alpha=alpha,
                                    use_1sample_U=False)
        outcome = quad_test.perform_test(te)

    return {
        'test_method': quad_test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
Пример #5
0
    def test(self, X, Y):
        """Run the mean-embedding (ME) two-sample test and return its p-value.

        ``X`` and ``Y`` are passed through ``self.preprocess``; the resulting
        object supplies both the test locations (a subset of size ``self.J``)
        and the data the test is performed on. The Gaussian width follows the
        median heuristic.

        :param X: first sample, forwarded to ``self.preprocess``
        :param Y: second sample, forwarded to ``self.preprocess``
        :return: the 'pvalue' entry of the ``perform_test`` result
        """
        XY = self.preprocess(X, Y)

        locations = fot_tst.MeanEmbeddingTest.init_locs_subset(XY, self.J)
        med = fot_util.meddistance(XY.stack_xy(), 1000)
        # The width argument of MeanEmbeddingTest is a *squared* Gaussian
        # width, so the median distance has to be squared before being passed
        # in. The previously constructed (and never used) KGauss object was
        # dropped.
        ME = fot_tst.MeanEmbeddingTest(locations, med**2, alpha=self.alpha)

        result = ME.perform_test(XY)
        return result['pvalue']
Пример #6
0
    def test_optimize_locs_width(self):
        """
        Exercise GaussUMETest.optimize_locs_width(..) and verify that the
        optimized parameters are sane and lead to a rejection of H0.
        """
        n = 600
        dim = 2
        seed = 17

        # Sample source: Gaussians with a mean difference (my=1.0).
        # Other sources considered here: SSGaussVarDiff(dim),
        # SSSameGauss(dim), SSBlobs().
        ss = data.SSGaussMeanDiff(dim, my=1.0)
        dim = ss.dim()

        full_data = ss.sample(n, seed=seed)
        tr, te = full_data.split_tr_te(tr_proportion=0.5, seed=10)
        stacked_tr = tr.stack_xy()

        # Initial test locations: J draws from a Gaussian fitted to the
        # training data. Initial width: squared median distance.
        J = 3
        init_locs = util.fit_gaussian_draw(stacked_tr, J, seed=seed + 1)
        init_gwidth2 = util.meddistance(stacked_tr, subsample=1000)**2
        assert init_gwidth2 > 0

        # Joint optimization of locations and width on the training split.
        optimizer_options = {
            'reg': 1e-2,
            'max_iter': 100,
            'tol_fun': 1e-5,
            'disp': False,
            'locs_bounds_frac': 100,
            'gwidth_lb': None,
            'gwidth_ub': None,
        }
        V_opt, gw2_opt, opt_info = tst.GaussUMETest.optimize_locs_width(
            tr, init_locs, init_gwidth2, **optimizer_options)

        # Test on the held-out split with the optimized parameters.
        alpha = 0.01
        optimized_test = tst.GaussUMETest(
            V_opt, gw2_opt, n_simulate=2000, alpha=alpha)
        outcome = optimized_test.perform_test(te)

        assert outcome['h0_rejected']
        assert util.is_real_num(gw2_opt)
        assert gw2_opt > 0
        assert not np.any(np.isnan(V_opt))
        assert not np.any(np.isinf(V_opt))
Пример #7
0
    def test(self, X, Y):
        """Run the quadratic-time MMD permutation test and return its p-value.

        A single Gaussian kernel chosen by the median heuristic is used; no
        kernel selection or train/test split takes place.

        :param X: first sample, forwarded to ``self.preprocess``
        :param Y: second sample, forwarded to ``self.preprocess``
        :return: the 'pvalue' entry of the ``perform_test`` result
        """
        XY = self.preprocess(X, Y)

        med = fot_util.meddistance(XY.stack_xy(), 1000)
        # KGauss is parameterised by the *squared* Gaussian width, so the
        # median distance must be squared before it is handed over.
        kernel = fot_kernel.KGauss(med**2)

        MMD = fot_tst.QuadMMDTest(kernel,
                                  n_permute=self.n_permute,
                                  alpha=self.alpha)

        result = MMD.perform_test(XY)
        return result['pvalue']
Пример #8
0
def job_met_gwgrid(sample_source, tr, te, r, J):
    """Mean-embedding test with fixed locations and a grid-searched width.

    The ``J`` test locations come from ``init_locs_2randn`` (seeded with
    ``r + 92856``) and stay fixed; only the squared Gaussian width is chosen,
    by grid search on ``tr``. The test is then performed on ``te`` and its
    result returned. ``sample_source`` is unused; ``alpha`` is resolved from
    the enclosing scope.
    """
    # Everything up to the final test is fitted on the training split.
    locs = tst.MeanEmbeddingTest.init_locs_2randn(tr, J, seed=r + 92856)
    median_dist = util.meddistance(tr.stack_xy(), 1000)

    # 40 squared widths: median**2 * 2**e for e in [-5, 5], ascending.
    gwidth2_grid = np.sort((median_dist**2) * (2.0**np.linspace(-5, 5, 40)))
    best_idx, _ = tst.MeanEmbeddingTest.grid_search_gwidth(
        tr, locs, gwidth2_grid, alpha)

    chosen_test = tst.MeanEmbeddingTest(locs, gwidth2_grid[best_idx], alpha)
    return chosen_test.perform_test(te)
Пример #9
0
def job_met_gwgrid(sample_source, tr, te, r, J):
    """MeanEmbeddingTest where only the Gaussian width is tuned.

    Test locations are drawn once with ``init_locs_2randn`` and kept fixed.
    The squared width is picked from a 40-point grid using the training data
    ``tr``; the returned value is the result of ``perform_test`` on ``te``.
    ``sample_source`` is not used and ``alpha`` comes from the enclosing
    scope.
    """
    seed = r + 92856
    T_fixed = tst.MeanEmbeddingTest.init_locs_2randn(tr, J, seed=seed)

    # Grid of squared widths centred (in log2 scale) on the squared median
    # distance of the stacked training data.
    med2 = util.meddistance(tr.stack_xy(), 1000)**2
    powers_of_two = 2.0**np.linspace(-5, 5, 40)
    width2_candidates = np.sort(med2 * powers_of_two)

    selection = tst.MeanEmbeddingTest.grid_search_gwidth(
        tr, T_fixed, width2_candidates, alpha)
    best_idx, _ = selection

    met = tst.MeanEmbeddingTest(T_fixed, width2_candidates[best_idx], alpha)
    return met.perform_test(te)
Пример #10
0
    def test(self, X, Y):
        """Mean-embedding test with an optimized Gaussian width; returns the p-value.

        The preprocessed data are split according to ``self.split_ratio``.
        Test locations (a subset of size ``self.J``) and the width are fitted
        on the training part, and the test itself runs on the other part.
        """
        XY = self.preprocess(X, Y)
        data_tr, data_te = XY.split_tr_te(tr_proportion=self.split_ratio)

        locs = fot_tst.MeanEmbeddingTest.init_locs_subset(data_tr, self.J)

        # The squared median distance is the starting point of the width
        # optimization.
        init_gwidth2 = fot_util.meddistance(data_tr.stack_xy(), 1000)**2
        gwidth_opt, _ = fot_tst.MeanEmbeddingTest.optimize_gwidth(
            data_tr, locs, init_gwidth2)

        me_test = fot_tst.MeanEmbeddingTest(locs, gwidth_opt,
                                            alpha=self.alpha)
        return me_test.perform_test(data_te)['pvalue']
Пример #11
0
def mmd(p, q, alpha=0.05):
    """Linear-time MMD two-sample test between samples ``p`` and ``q``.

    One-dimensional inputs are turned into single-column 2-D arrays. The
    data are split in half: a Gaussian kernel is selected on one half from a
    grid of 20 widths and the test is performed on the other half.

    :param p: first sample (array; 1-D inputs get a trailing axis)
    :param q: second sample (array; 1-D inputs get a trailing axis)
    :param alpha: significance level passed to the grid search and the test
    :return: tuple ``(test_stat, pvalue)`` taken from the test result
    """
    if p.ndim == 1:
        p = p[:, None]
    if q.ndim == 1:
        q = q[:, None]

    half_tr, half_te = data.TSTData(p, q).split_tr_te(tr_proportion=0.5)

    # Candidate widths: median distance times 2**e, e in [-1, 4]; KGauss
    # receives the square of each width.
    median_dist = util.meddistance(half_tr.stack_xy())
    scales = 2.0**np.linspace(-1, 4, 20)
    candidates = [kernel.KGauss((median_dist * s)**2) for s in scales]

    best_idx, _ = tst.LinearMMDTest.grid_search_kernel(
        half_tr, candidates, alpha)

    outcome = tst.LinearMMDTest(candidates[best_idx],
                                alpha).perform_test(half_te)
    return outcome['test_stat'], outcome['pvalue']
Пример #12
0
def job_quad_mmd(sample_source, tr, te, r):
    """Quadratic-time MMD test with a grid-searched Gaussian width.

    The kernel is selected on ``tr`` from 30 candidates; the test is run on
    ``te`` with 400 permutations and its result is returned.
    ``sample_source`` and ``r`` are unused; ``alpha`` is resolved from the
    enclosing scope.
    """
    # A subsample argument of 1000 is passed to meddistance: with large n the
    # pairwise median computation can cause a memory error.
    median_dist = util.meddistance(tr.stack_xy(), 1000)

    # Squared widths: median**2 * 2**e for e in [-4, 4], in ascending order.
    gwidth2_grid = np.sort((median_dist**2) * (2.0**np.linspace(-4, 4, 30)))
    candidates = [kernel.KGauss(gw2) for gw2 in gwidth2_grid]

    # Kernel selection on the training split.
    best_idx, _ = tst.QuadMMDTest.grid_search_kernel(tr, candidates, alpha)

    # Test on the held-out split.
    quad_test = tst.QuadMMDTest(candidates[best_idx],
                                n_permute=400,
                                alpha=alpha)
    return quad_test.perform_test(te)
Пример #13
0
def job_quad_mmd(sample_source, tr, te, r):
    """Run QuadMMDTest on ``te`` using a kernel selected on ``tr``.

    Thirty Gaussian kernels, with squared widths spread around the squared
    median distance of the training data, are compared by
    ``QuadMMDTest.grid_search_kernel``. The winner is used for a permutation
    test (400 permutations). ``sample_source`` and ``r`` are not used, and
    ``alpha`` is taken from the enclosing scope.
    """
    # meddistance gets 1000 as its second argument because the pairwise
    # median can trigger a memory error when n is too large.
    med2 = util.meddistance(tr.stack_xy(), 1000)**2
    powers_of_two = 2.0**np.linspace(-4, 4, 30)
    sigma2_candidates = np.sort(med2 * powers_of_two)

    kernels = [kernel.KGauss(sigma2) for sigma2 in sigma2_candidates]
    selection = tst.QuadMMDTest.grid_search_kernel(tr, kernels, alpha)
    best_idx, _ = selection

    mmd_test = tst.QuadMMDTest(kernels[best_idx], n_permute=400, alpha=alpha)
    return mmd_test.perform_test(te)
Пример #14
0
def job_lin_mmd(sample_source, tr, te, r):
    """Linear-time MMD test with a grid-searched Gaussian width.

    The kernel is selected on ``tr`` from 40 candidates and the result of
    ``perform_test`` on ``te`` is returned. ``sample_source`` and ``r`` are
    unused; ``alpha`` is resolved from the enclosing scope.
    """
    # Intended to be completely deterministic.
    xs, ys = tr.xy()

    # At most the first 1000 rows of each sample are used for the median
    # heuristic: the pairwise median can cause a memory error for large n.
    median_dist = util.meddistance(np.vstack((xs[:1000, :], ys[:1000, :])))

    # Widths are median * 2**e for e in [-1, 4]; KGauss takes their squares.
    scales = 2.0**np.linspace(-1, 4, 40)
    candidates = [kernel.KGauss((median_dist * s)**2) for s in scales]

    # Kernel selection on the training split, test on the held-out split.
    best_idx, _ = tst.LinearMMDTest.grid_search_kernel(tr, candidates, alpha)
    return tst.LinearMMDTest(candidates[best_idx], alpha).perform_test(te)
Пример #15
0
    def test_optimize_locs_width(self):
        """
        Check that optimize_locs_width(..) returns well-behaved values: a
        positive real width, finite locations, and parameters with which the
        test rejects H0 on held-out data.
        """
        n = 600
        dim = 2
        seed = 17

        # Gaussian mean-difference problem. Alternatives considered in the
        # past: SSGaussVarDiff(dim), SSSameGauss(dim), SSBlobs().
        ss = data.SSGaussMeanDiff(dim, my=1.0)
        dim = ss.dim()

        tr, te = ss.sample(n, seed=seed).split_tr_te(tr_proportion=0.5,
                                                     seed=10)
        xy_tr = tr.stack_xy()

        # Starting point of the optimization: J locations drawn from a
        # Gaussian fitted to the training data, squared median distance as
        # the width.
        J = 3
        V0 = util.fit_gaussian_draw(xy_tr, J, seed=seed + 1)
        med = util.meddistance(xy_tr, subsample=1000)
        gwidth0 = med**2
        assert gwidth0 > 0

        V_opt, gw2_opt, opt_info = tst.GaussUMETest.optimize_locs_width(
            tr, V0, gwidth0,
            reg=1e-2, max_iter=100, tol_fun=1e-5, disp=False,
            locs_bounds_frac=100, gwidth_lb=None, gwidth_ub=None)

        # Evaluate the optimized parameters on the test split.
        alpha = 0.01
        ume_opt = tst.GaussUMETest(V_opt,
                                   gw2_opt,
                                   n_simulate=2000,
                                   alpha=alpha)
        test_result = ume_opt.perform_test(te)

        assert test_result['h0_rejected']
        assert util.is_real_num(gw2_opt)
        assert gw2_opt > 0
        assert not np.any(np.isnan(V_opt))
        assert not np.any(np.isinf(V_opt))
Пример #16
0
def job_quad_mmd(sample_source, tr, te, r):
    """Quadratic-time MMD test with a grid-searched Gaussian width.

    Uses the one-sample U-statistic (``use_1sample_U=True``). This variant
    should NOT be used anymore. ``sample_source`` and ``r`` are unused;
    ``alpha`` is resolved from the enclosing scope.

    Returns a dict with keys 'test_method', 'test_result' and 'time_secs'.
    """
    with util.ContextTimer() as timer:
        # A subsample argument of 1000 is passed to meddistance: with large n
        # the pairwise median computation can cause a memory error.
        median_dist = util.meddistance(tr.stack_xy(), 1000)

        # 40 squared widths: median**2 * 2**e for e in [-4, 4], ascending.
        gwidth2_grid = np.sort(
            (median_dist**2) * (2.0**np.linspace(-4, 4, 40)))
        candidates = [kernel.KGauss(gw2) for gw2 in gwidth2_grid]

        # Kernel selection on the training split.
        best_idx, _ = tst.QuadMMDTest.grid_search_kernel(
            tr, candidates, alpha)

        # Test on the held-out split with the selected kernel.
        quad_test = tst.QuadMMDTest(candidates[best_idx],
                                    n_permute=1000,
                                    alpha=alpha,
                                    use_1sample_U=True)
        outcome = quad_test.perform_test(te)

    return {
        'test_method': quad_test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
Пример #17
0
    def test(self, X, Y):
        """Quadratic MMD test with a grid-searched Gaussian kernel; returns the p-value.

        The preprocessed data are split according to ``self.split_ratio``.
        The kernel is chosen on the training part from 20 candidates and the
        permutation test is performed on the remaining part.
        """
        XY = self.preprocess(X, Y)
        data_tr, data_te = XY.split_tr_te(tr_proportion=self.split_ratio)

        # Squared widths: median**2 * 2**e for e in [-4, 4].
        median_dist = fot_util.meddistance(data_tr.stack_xy(), 1000)
        exponents = np.linspace(-4, 4, 20)
        candidates = [
            fot_kernel.KGauss(sigma2)
            for sigma2 in (median_dist**2) * (2.**exponents)
        ]

        # stdout is redirected to None for the duration of the grid search so
        # that nothing it prints reaches the console.
        with contextlib.redirect_stdout(None):
            chosen_idx, _ = fot_tst.QuadMMDTest.grid_search_kernel(
                data_tr, candidates, alpha=self.alpha)

        mmd_test = fot_tst.QuadMMDTest(candidates[chosen_idx],
                                       n_permute=self.n_permute,
                                       alpha=self.alpha)
        return mmd_test.perform_test(data_te)['pvalue']
Пример #18
0
def job_met_gwgrid(prob_label, tr, te, r, ni, n):
    """Mean-embedding test with fixed locations and a grid-searched width.

    Test locations come from ``init_locs_2randn`` (seeded with
    ``r + 92856``) and stay fixed; only the squared Gaussian width is chosen,
    by grid search on ``tr``. ``prob_label``, ``ni`` and ``n`` are unused;
    ``J`` and ``alpha`` are resolved from the enclosing scope.

    Returns a dict with keys 'test_result' and 'time_secs'. The test object
    itself is deliberately not included.
    """
    with util.ContextTimer() as timer:
        # Locations and width are fitted on the training split.
        locs = tst.MeanEmbeddingTest.init_locs_2randn(tr, J, seed=r + 92856)
        median_dist = util.meddistance(tr.stack_xy(), 1000)

        # 40 squared widths: median**2 * 2**e for e in [-5, 5], ascending.
        gwidth2_grid = np.sort(
            (median_dist**2) * (2.0**np.linspace(-5, 5, 40)))
        best_idx, _ = tst.MeanEmbeddingTest.grid_search_gwidth(
            tr, locs, gwidth2_grid, alpha)

        chosen_test = tst.MeanEmbeddingTest(
            locs, gwidth2_grid[best_idx], alpha)
        outcome = chosen_test.perform_test(te)

    return {'test_result': outcome, 'time_secs': timer.secs}
Пример #19
0
def job_met_gwgrid(prob_label, tr, te, r, ni, n):
    """MeanEmbeddingTest where only the Gaussian width is tuned (timed).

    The locations drawn by ``init_locs_2randn`` are kept fixed and the
    squared width is selected from a 40-point grid on the training data
    ``tr``. The test is then run on ``te``. ``prob_label``, ``ni`` and ``n``
    are not used; ``J`` and ``alpha`` come from the enclosing scope.

    Returns a dict holding the test result ('test_result') and the time
    reported by the timer ('time_secs'); the test object is left out.
    """
    with util.ContextTimer() as t:
        seed = r + 92856
        T_fixed = tst.MeanEmbeddingTest.init_locs_2randn(tr, J, seed=seed)

        # Grid of squared widths centred (in log2 scale) on the squared
        # median distance of the stacked training data.
        med2 = util.meddistance(tr.stack_xy(), 1000)**2
        powers_of_two = 2.0**np.linspace(-5, 5, 40)
        width2_candidates = np.sort(med2 * powers_of_two)

        selection = tst.MeanEmbeddingTest.grid_search_gwidth(
            tr, T_fixed, width2_candidates, alpha)
        best_idx, _ = selection

        met = tst.MeanEmbeddingTest(
            T_fixed, width2_candidates[best_idx], alpha)
        met_result = met.perform_test(te)

    elapsed = t.secs
    return {'test_result': met_result, 'time_secs': elapsed}