Пример #1
0
def test_fssd():
    """Test FSSD with Gaussian kernel (median heuristic) and randomized test locations

    Following the example in:
    https://github.com/wittawatj/kernel-gof/blob/master/kgof/ex/ex1_vary_n.py
    """
    seed = 42

    d = 2  # dimensionality
    n = 800  # samples

    # Density
    mean = np.zeros(d)
    variance = 1.0
    p = density.IsotropicNormal(mean, variance)

    # Samples from same density
    ds = data.DSIsotropicNormal(mean, variance)
    samples = ds.sample(n, seed=seed + 1)

    # Gaussian kernel with median heuristic
    sig2 = util.meddistance(samples.data(), subsample=1000) ** 2
    k = kernel.KGauss(sig2)
    print(f"Kernel bandwidth: {sig2}")

    # FSSD
    J = 10
    null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=seed)
    # Fit a multivariate normal to the data X (n x d) and draw J points from the fit.
    V = util.fit_gaussian_draw(samples.data(), J=J, seed=seed + 1)
    fssd_med = gof.FSSD(p, k, V, null_sim=null_sim, alpha=0.01)
    test_result = fssd_med.perform_test(samples)
    print(test_result)
    assert test_result["h0_rejected"] == False

    # FSSD with samples from different density
    J = 10  # Fails with J=8, passes with J=10 (chance)
    ds = data.DSLaplace(d=d, loc=0, scale=1.0 / np.sqrt(2))
    samples = ds.sample(n, seed=seed + 1)
    sig2 = util.meddistance(samples.data(), subsample=1000) ** 2
    # NOTE: Works much better with the bandwidth that was optimized under FSSD:
    # sig2 = 0.3228712361986835
    k = kernel.KGauss(sig2)
    print(f"Kernel bandwidth: {sig2}")
    null_sim = gof.FSSDH0SimCovObs(n_simulate=3000, seed=seed)
    # TODO: is this what we want if samples come from another distribution ?!
    V = util.fit_gaussian_draw(samples.data(), J=J, seed=seed + 1)
    fssd_med = gof.FSSD(p, k, V, null_sim=null_sim, alpha=0.01)
    test_result = fssd_med.perform_test(samples)
    print(test_result)
    assert test_result["h0_rejected"] == True
Пример #2
0
def job_fssdJ1q_med(p, data_source, tr, te, r, J=1, null_sim=None):
    """
    FSSD test with a Gaussian kernel, where the test locations are randomized,
    and the Gaussian width is set with the median heuristic. Use full sample.
    No training/testing splits.

    p: an UnnormalizedDensity
    data_source: a DataSource
    tr, te: Data
    r: trial number (positive integer)
    """
    if null_sim is None:
        null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=r)

    # full data
    data = tr + te
    X = data.data()
    with util.ContextTimer() as t:
        # median heuristic
        med = util.meddistance(X, subsample=1000)
        k = kernel.KGauss(med**2)
        V = util.fit_gaussian_draw(X, J, seed=r + 1)

        fssd_med = gof.FSSD(p, k, V, null_sim=null_sim, alpha=alpha)
        fssd_med_result = fssd_med.perform_test(data)
    return {"test_result": fssd_med_result, "time_secs": t.secs}
Пример #3
0
    def test_ustat_h1_mean_variance(self):
        seed = 20
        # sample
        n = 200
        alpha = 0.01
        for d in [1, 4]:
            mean = np.zeros(d)
            variance = 1
            isonorm = density.IsotropicNormal(mean, variance)

            draw_mean = mean + 2
            draw_variance = variance + 1
            X = util.randn(n, d,
                           seed=seed) * np.sqrt(draw_variance) + draw_mean
            dat = data.Data(X)

            # Test
            for J in [1, 3]:
                sig2 = util.meddistance(X, subsample=1000)**2
                k = kernel.KGauss(sig2)

                # random test locations
                V = util.fit_gaussian_draw(X, J, seed=seed + 1)

                null_sim = gof.FSSDH0SimCovObs(n_simulate=200, seed=3)
                fssd = gof.FSSD(isonorm, k, V, null_sim=null_sim, alpha=alpha)
                fea_tensor = fssd.feature_tensor(X)

                u_mean, u_variance = gof.FSSD.ustat_h1_mean_variance(
                    fea_tensor)

                # assertions
                self.assertGreaterEqual(u_variance, 0)
                # should reject H0
                self.assertGreaterEqual(u_mean, 0)
Пример #4
0
def job_fssdJ1q_imq_optv(p,
                         data_source,
                         tr,
                         te,
                         r,
                         J=1,
                         b=-0.5,
                         null_sim=None):
    """
    FSSD with optimization on tr. Test on te. Use an inverse multiquadric
    kernel (IMQ). Optimize only the test locations (V). Fix the kernel
    parameters to b = -0.5, c=1. These are the recommended values from

        Measuring Sample Quality with Kernels
        Jackson Gorham, Lester Mackey
    """
    if null_sim is None:
        null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=r)

    Xtr = tr.data()
    with util.ContextTimer() as t:
        # IMQ kernel parameters: b and c
        c = 1.0

        # fit a Gaussian to the data and draw to initialize V0
        V0 = util.fit_gaussian_draw(Xtr, J, seed=r + 1, reg=1e-6)

        ops = {
            "reg": 1e-5,
            "max_iter": 30,
            "tol_fun": 1e-6,
            "disp": True,
            "locs_bounds_frac": 20.0,
        }

        V_opt, info = gof.IMQFSSD.optimize_locs(p, tr, b, c, V0, **ops)

        k_imq = kernel.KIMQ(b=b, c=c)

        # Use the optimized parameters to construct a test
        fssd_imq = gof.FSSD(p, k_imq, V_opt, null_sim=null_sim, alpha=alpha)
        fssd_imq_result = fssd_imq.perform_test(te)

    return {
        "test_result": fssd_imq_result,
        "time_secs": t.secs,
        "goftest": fssd_imq,
        "opt_info": info,
    }
Пример #5
0
def job_fssdJ1q_opt(p, data_source, tr, te, r, J=1, null_sim=None):
    """
    FSSD with optimization on tr. Test on te. Use a Gaussian kernel.
    """
    if null_sim is None:
        null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=r)

    Xtr = tr.data()
    with util.ContextTimer() as t:
        # Use grid search to initialize the gwidth
        n_gwidth_cand = 5
        gwidth_factors = 2.0**np.linspace(-3, 3, n_gwidth_cand)
        med2 = util.meddistance(Xtr, 1000)**2

        k = kernel.KGauss(med2)
        # fit a Gaussian to the data and draw to initialize V0
        V0 = util.fit_gaussian_draw(Xtr, J, seed=r + 1, reg=1e-6)
        list_gwidth = np.hstack(((med2) * gwidth_factors))
        besti, objs = gof.GaussFSSD.grid_search_gwidth(p, tr, V0, list_gwidth)
        gwidth = list_gwidth[besti]
        assert util.is_real_num(
            gwidth), "gwidth not real. Was %s" % str(gwidth)
        assert gwidth > 0, "gwidth not positive. Was %.3g" % gwidth
        logging.info("After grid search, gwidth=%.3g" % gwidth)

        ops = {
            "reg": 1e-2,
            "max_iter": 30,
            "tol_fun": 1e-5,
            "disp": True,
            "locs_bounds_frac": 30.0,
            "gwidth_lb": 1e-1,
            "gwidth_ub": 1e4,
        }

        V_opt, gwidth_opt, info = gof.GaussFSSD.optimize_locs_widths(
            p, tr, gwidth, V0, **ops)
        # Use the optimized parameters to construct a test
        k_opt = kernel.KGauss(gwidth_opt)
        fssd_opt = gof.FSSD(p, k_opt, V_opt, null_sim=null_sim, alpha=alpha)
        fssd_opt_result = fssd_opt.perform_test(te)
    return {
        "test_result": fssd_opt_result,
        "time_secs": t.secs,
        "goftest": fssd_opt,
        "opt_info": info,
    }
Пример #6
0
def job_fssdJ1q_imq_optbv(p, data_source, tr, te, r, J=1, null_sim=None):
    """
    FSSD with optimization on tr. Test on te. Use an inverse multiquadric
    kernel (IMQ). Optimize the test locations (V), and b. Fix c (in the kernel)
    """
    if null_sim is None:
        null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=r)

    Xtr = tr.data()
    with util.ContextTimer() as t:
        # Initial IMQ kernel parameters: b and c
        b0 = -0.5
        # Fix c to this value
        c = 1.0
        c0 = c

        # fit a Gaussian to the data and draw to initialize V0
        V0 = util.fit_gaussian_draw(Xtr, J, seed=r + 1, reg=1e-6)

        ops = {
            "reg": 1e-5,
            "max_iter": 40,
            "tol_fun": 1e-6,
            "disp": True,
            "locs_bounds_frac": 20.0,
            # IMQ kernel bounds
            "b_lb": -20,
            "c_lb": c,
            "c_ub": c,
        }

        V_opt, b_opt, c_opt, info = gof.IMQFSSD.optimize_locs_params(
            p, tr, b0, c0, V0, **ops)

        k_imq = kernel.KIMQ(b=b_opt, c=c_opt)

        # Use the optimized parameters to construct a test
        fssd_imq = gof.FSSD(p, k_imq, V_opt, null_sim=null_sim, alpha=alpha)
        fssd_imq_result = fssd_imq.perform_test(te)

    return {
        "test_result": fssd_imq_result,
        "time_secs": t.secs,
        "goftest": fssd_imq,
        "opt_info": info,
    }
Пример #7
0
    def test_optimized_fssd(self):
        """
        Test FSSD test with parameter optimization.
        """
        seed = 4
        # sample size
        n = 179
        alpha = 0.01
        for d in [1, 3]:
            mean = np.zeros(d)
            variance = 1.0
            p = density.IsotropicNormal(mean, variance)
            # Mean difference. obvious reject
            ds = data.DSIsotropicNormal(mean + 4, variance + 0)
            dat = ds.sample(n, seed=seed)
            # test
            for J in [1, 4]:
                opts = {
                    "reg": 1e-2,
                    "max_iter": 10,
                    "tol_fun": 1e-3,
                    "disp": False
                }
                tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed + 1)

                Xtr = tr.X
                gwidth0 = util.meddistance(Xtr, subsample=1000)**2
                # random test locations
                V0 = util.fit_gaussian_draw(Xtr, J, seed=seed + 1)
                V_opt, gw_opt, opt_result = gof.GaussFSSD.optimize_locs_widths(
                    p, tr, gwidth0, V0, **opts)

                # construct a test
                k_opt = kernel.KGauss(gw_opt)
                null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
                fssd_opt = gof.FSSD(p,
                                    k_opt,
                                    V_opt,
                                    null_sim=null_sim,
                                    alpha=alpha)
                fssd_opt_result = fssd_opt.perform_test(
                    te, return_simulated_stats=True)
                assert fssd_opt_result["h0_rejected"]
Пример #8
0
    def test_auto_init_opt_fssd(self):
        """
        Test FSSD-opt test with automatic parameter initialization.
        """
        seed = 5
        # sample size
        n = 191
        alpha = 0.01
        for d in [1, 4]:
            mean = np.zeros(d)
            variance = 1.0
            p = density.IsotropicNormal(mean, variance)
            # Mean difference. obvious reject
            ds = data.DSIsotropicNormal(mean + 4, variance + 0)
            dat = ds.sample(n, seed=seed)
            # test
            for J in [1, 3]:
                opts = {
                    "reg": 1e-2,
                    "max_iter": 10,
                    "tol_fun": 1e-3,
                    "disp": False
                }
                tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed + 1)

                V_opt, gw_opt, opt_result = gof.GaussFSSD.optimize_auto_init(
                    p, tr, J, **opts)

                # construct a test
                k_opt = kernel.KGauss(gw_opt)
                null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
                fssd_opt = gof.FSSD(p,
                                    k_opt,
                                    V_opt,
                                    null_sim=null_sim,
                                    alpha=alpha)
                fssd_opt_result = fssd_opt.perform_test(
                    te, return_simulated_stats=True)
                assert fssd_opt_result["h0_rejected"]
Пример #9
0
    def test_basic(self):
        """
        Nothing special. Just test basic things.
        """
        seed = 12
        # sample
        n = 100
        alpha = 0.01
        for d in [1, 4]:
            mean = np.zeros(d)
            variance = 1
            isonorm = density.IsotropicNormal(mean, variance)

            # only one dimension of the mean is shifted
            # draw_mean = mean + np.hstack((1, np.zeros(d-1)))
            draw_mean = mean + 0
            draw_variance = variance + 1
            X = util.randn(n, d,
                           seed=seed) * np.sqrt(draw_variance) + draw_mean
            dat = data.Data(X)

            # Test
            for J in [1, 3]:
                sig2 = util.meddistance(X, subsample=1000)**2
                k = kernel.KGauss(sig2)

                # random test locations
                V = util.fit_gaussian_draw(X, J, seed=seed + 1)
                null_sim = gof.FSSDH0SimCovObs(n_simulate=200, seed=3)
                fssd = gof.FSSD(isonorm, k, V, null_sim=null_sim, alpha=alpha)

                tresult = fssd.perform_test(dat, return_simulated_stats=True)

                # assertions
                self.assertGreaterEqual(tresult["pvalue"], 0)
                self.assertLessEqual(tresult["pvalue"], 1)