Пример #1
0
    def test_ustat_h1_mean_variance(self):
        """
        Sanity-check the H1 mean/variance estimates of the FSSD U-statistic.

        The model is IsotropicNormal(0, 1) while the sample is generated with
        its mean shifted by 2 and its variance increased by 1, so the model
        does not match the data. Both estimates returned by
        gof.FSSD.ustat_h1_mean_variance are required to be non-negative.
        """
        seed = 20
        # sample size
        n = 200
        alpha = 0.01
        for d in [1, 4]:
            mean = np.zeros(d)
            variance = 1
            isonorm = density.IsotropicNormal(mean, variance)

            draw_mean = mean + 2
            draw_variance = variance + 1
            # presumably util.randn returns seeded standard normal draws;
            # they are rescaled and shifted here -- confirm against util.
            X = util.randn(n, d,
                           seed=seed) * np.sqrt(draw_variance) + draw_mean

            # The bandwidth depends only on X, which is fixed for this d,
            # so it is computed once instead of once per J.
            sig2 = util.meddistance(X, subsample=1000)**2

            for J in [1, 3]:
                k = kernel.KGauss(sig2)

                # random test locations
                V = util.fit_gaussian_draw(X, J, seed=seed + 1)

                null_sim = gof.FSSDH0SimCovObs(n_simulate=200, seed=3)
                fssd = gof.FSSD(isonorm, k, V, null_sim=null_sim, alpha=alpha)
                fea_tensor = fssd.feature_tensor(X)

                u_mean, u_variance = gof.FSSD.ustat_h1_mean_variance(
                    fea_tensor)

                # a variance estimate can never be negative
                self.assertGreaterEqual(u_variance, 0)
                # H0 is false here, so the statistic's mean should not be
                # negative
                self.assertGreaterEqual(u_mean, 0)
Пример #2
0
    def test_basic(self):
        """Evaluate a SteinWitness at a few points and check the output shape."""
        dim = 3
        # The model and the sampling distribution differ only in their means.
        model = density.IsotropicNormal(mean=np.zeros(dim), variance=3.0)
        shifted = density.IsotropicNormal(mean=np.zeros(dim) + 2, variance=3.0)
        gauss_kernel = kernel.KGauss(2.0)

        source = shifted.get_datasource()
        sample_size = 97
        sample = source.sample(sample_size, seed=3)

        stein_witness = gof.SteinWitness(model, gauss_kernel, sample)

        # Locations at which the witness is evaluated (drawn from the global
        # NumPy random state; only the output shape is checked below).
        n_locations = 4
        locations = np.random.randn(n_locations, dim) * 2
        values = stein_witness(locations)

        # one row per location, one column per input dimension
        testing.assert_equal(values.shape, (n_locations, dim))
Пример #3
0
def test_ksd():
    """Test quadratic time KSD

    The model p is an isotropic normal. The test is run twice: once on a
    sample from the matching DSIsotropicNormal source (H0 must not be
    rejected) and once on a sample from a DSLaplace source (H0 must be
    rejected).

    Following the example in:
    https://github.com/wittawatj/kernel-gof/blob/master/ipynb/gof_kernel_stein.ipynb
    """
    seed = 42

    d = 2  # dimensionality
    n = 800  # samples

    # Density
    mean = np.zeros(d)
    variance = 1.0
    p = density.IsotropicNormal(mean, variance)

    def run_ksd(samples):
        """Run the KSD test of p on `samples` and return the result dict."""
        # Gaussian kernel with median heuristic
        sig2 = util.meddistance(samples.data(), subsample=1000)**2
        print(f"Kernel bandwidth: {sig2}")
        k = kernel.KGauss(sig2)
        kstein = gof.KernelSteinTest(p,
                                     k,
                                     bootstrapper=gof.bootstrapper_rademacher,
                                     alpha=0.01,
                                     n_simulate=500,
                                     seed=seed + 1)
        test_result = kstein.perform_test(samples,
                                          return_simulated_stats=False,
                                          return_ustat_gram=False)
        print(test_result)
        return test_result

    # Samples from same density
    ds = data.DSIsotropicNormal(mean, variance)
    samples = ds.sample(n, seed=seed + 1)
    # Truthiness check instead of `== False` (PEP 8).
    assert not run_ksd(samples)["h0_rejected"]

    # KSD with samples from different density
    ds = data.DSLaplace(d=d, loc=0, scale=1.0 / np.sqrt(2))
    samples = ds.sample(n, seed=seed + 1)
    assert run_ksd(samples)["h0_rejected"]
Пример #4
0
def get_ns_pqsource(prob_label):
    """
    Look up a problem in which the sample size is varied.

    Return (ns, p, ds), a tuple where
    - ns: a list of sample sizes
    - p: a Density representing the distribution p
    - ds: a DataSource which generates samples from q.

    Raise ValueError when prob_label is not one of the known labels.

    Every entry of the table (including each gbrbm_perturb call) is built on
    each invocation, whichever label is requested.
    """

    def thousands(first, last):
        # [first * 1000, ..., last * 1000], both ends included
        return [i * 1000 for i in range(first, last + 1)]

    gauss_mean_shift_ns = [1000, 3000, 5000]

    problems = {
        # d = 10. P = N(0, I), Q = N((0.03, 0, ..., 0), I)
        "gmd_p03_d10_ns": (
            gauss_mean_shift_ns,
            density.IsotropicNormal(np.zeros(10), 1),
            data.DSIsotropicNormal(np.hstack((0.03, np.zeros(10 - 1))), 1),
        ),
        # Gaussian-Bernoulli RBM, dx=50, dh=10, var_perturb_B=0.1
        "gbrbm_dx50_dh10_vp1":
        (thousands(1, 4), ) + gbrbm_perturb(var_perturb_B=0.1, dx=50, dh=10),
        # Gaussian-Bernoulli RBM, dx=50, dh=40, var_perturb_B=0.1
        "gbrbm_dx50_dh40_vp1":
        (thousands(1, 4), ) + gbrbm_perturb(var_perturb_B=0.1, dx=50, dh=40),
        # Gaussian-Bernoulli RBM, dx=50, dh=10, no perturbation
        "gbrbm_dx50_dh10_h0":
        (thousands(1, 4), ) + gbrbm_perturb(var_perturb_B=0, dx=50, dh=10),
        # Gaussian-Bernoulli RBM, dx=50, dh=40, no perturbation
        "gbrbm_dx50_dh40_h0":
        (thousands(1, 4), ) + gbrbm_perturb(var_perturb_B=0, dx=50, dh=40),
        # Gaussian-Bernoulli RBM, dx=20, dh=10, var_perturb_B=0.1
        "gbrbm_dx20_dh10_vp1":
        (thousands(2, 5), ) + gbrbm_perturb(var_perturb_B=0.1, dx=20, dh=10),
        # Gaussian-Bernoulli RBM, dx=20, dh=10, no perturbation
        "gbrbm_dx20_dh10_h0":
        (thousands(2, 5), ) + gbrbm_perturb(var_perturb_B=0, dx=20, dh=10),
    }
    if prob_label in problems:
        return problems[prob_label]
    raise ValueError("Unknown problem label. Need to be one of %s" %
                     str(problems.keys()))
Пример #5
0
def test_fssd():
    """Test FSSD with Gaussian kernel (median heuristic) and randomized test locations

    The model p is an isotropic normal. The test is run once on a sample from
    the matching DSIsotropicNormal source (H0 must not be rejected) and once
    on a sample from a DSLaplace source (H0 must be rejected).

    Following the example in:
    https://github.com/wittawatj/kernel-gof/blob/master/kgof/ex/ex1_vary_n.py
    """
    seed = 42

    d = 2  # dimensionality
    n = 800  # samples
    J = 10  # number of test locations

    # Density
    mean = np.zeros(d)
    variance = 1.0
    p = density.IsotropicNormal(mean, variance)

    def run_fssd(samples, n_simulate):
        """Run the FSSD test of p on `samples` and return the result dict."""
        # Gaussian kernel with median heuristic
        sig2 = util.meddistance(samples.data(), subsample=1000) ** 2
        k = kernel.KGauss(sig2)
        print(f"Kernel bandwidth: {sig2}")

        null_sim = gof.FSSDH0SimCovObs(n_simulate=n_simulate, seed=seed)
        # Fit a multivariate normal to the data X (n x d) and draw J points from the fit.
        V = util.fit_gaussian_draw(samples.data(), J=J, seed=seed + 1)
        fssd_med = gof.FSSD(p, k, V, null_sim=null_sim, alpha=0.01)
        test_result = fssd_med.perform_test(samples)
        print(test_result)
        return test_result

    # Samples from same density
    ds = data.DSIsotropicNormal(mean, variance)
    samples = ds.sample(n, seed=seed + 1)
    # Truthiness check instead of `== False` (PEP 8).
    assert not run_fssd(samples, n_simulate=2000)["h0_rejected"]

    # FSSD with samples from different density
    # NOTE: with this seed the rejection is sensitive to J -- it fails with
    # J=8 and passes with J=10 (chance). It works much better with the
    # bandwidth that was optimized under FSSD: sig2 = 0.3228712361986835
    # TODO: is fitting the test locations to the sample what we want if the
    # samples come from another distribution ?!
    ds = data.DSLaplace(d=d, loc=0, scale=1.0 / np.sqrt(2))
    samples = ds.sample(n, seed=seed + 1)
    assert run_fssd(samples, n_simulate=3000)["h0_rejected"]
Пример #6
0
    def test_grad_log(self):
        """Compare IsotropicNormal.grad_log with -(x - mean) / variance."""
        n_points = 8
        with util.NumpySeedContext(seed=17):
            for dim in [4, 1]:
                sigma2 = 1.2
                mu = np.random.randn(dim) + 1
                points = np.random.rand(n_points, dim) - 2

                model = density.IsotropicNormal(mu, sigma2)
                actual = model.grad_log(points)
                # closed-form gradient of the Gaussian log-density
                expected = -(points - mu) / sigma2

                np.testing.assert_almost_equal(actual, expected)
Пример #7
0
    def test_log_den(self):
        """Compare IsotropicNormal.log_den with -||x - mean||^2 / (2 * variance)."""
        n_points = 7
        with util.NumpySeedContext(seed=16):
            for dim in [3, 1]:
                sigma2 = 1.1
                mu = np.random.randn(dim)
                points = np.random.rand(n_points, dim) + 1

                model = density.IsotropicNormal(mu, sigma2)
                actual = model.log_den(points)
                # The reference value has no normalizing constant, so log_den
                # is expected to be the unnormalized log-density.
                sq_dist = np.sum((points - mu)**2, 1)
                expected = -sq_dist / (2.0 * sigma2)

                np.testing.assert_almost_equal(actual, expected)
Пример #8
0
    def test_optimized_fssd(self):
        """
        Run the FSSD test after optimizing test locations and kernel width.

        The sample mean is shifted by 4 relative to the model p, so the test
        on the held-out part of the data is expected to reject H0.
        """
        seed = 4
        n = 179  # sample size
        alpha = 0.01
        # keyword settings forwarded to the optimizer
        opts = {"reg": 1e-2, "max_iter": 10, "tol_fun": 1e-3, "disp": False}

        for d in [1, 3]:
            mean = np.zeros(d)
            variance = 1.0
            p = density.IsotropicNormal(mean, variance)
            # Mean difference: an obvious reject.
            source = data.DSIsotropicNormal(mean + 4, variance + 0)
            sample = source.sample(n, seed=seed)

            for J in [1, 4]:
                train, held_out = sample.split_tr_te(tr_proportion=0.3,
                                                     seed=seed + 1)

                # Initial parameters: median-heuristic width and locations
                # drawn from a Gaussian fitted to the training part.
                X_train = train.X
                width_init = util.meddistance(X_train, subsample=1000)**2
                locs_init = util.fit_gaussian_draw(X_train, J, seed=seed + 1)
                V_opt, gw_opt, _ = gof.GaussFSSD.optimize_locs_widths(
                    p, train, width_init, locs_init, **opts)

                # Build the test from the optimized parameters and run it on
                # the held-out part.
                simulator = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
                optimized_test = gof.FSSD(p,
                                          kernel.KGauss(gw_opt),
                                          V_opt,
                                          null_sim=simulator,
                                          alpha=alpha)
                result = optimized_test.perform_test(
                    held_out, return_simulated_stats=True)
                assert result["h0_rejected"]
Пример #9
0
    def test_auto_init_opt_fssd(self):
        """
        Run the FSSD-opt test with automatically initialized parameters.

        The sample mean is shifted by 4 relative to the model p, so the test
        on the held-out part of the data is expected to reject H0.
        """
        seed = 5
        n = 191  # sample size
        alpha = 0.01
        # keyword settings forwarded to the optimizer
        opts = {"reg": 1e-2, "max_iter": 10, "tol_fun": 1e-3, "disp": False}

        for d in [1, 4]:
            mean = np.zeros(d)
            variance = 1.0
            p = density.IsotropicNormal(mean, variance)
            # Mean difference: an obvious reject.
            source = data.DSIsotropicNormal(mean + 4, variance + 0)
            sample = source.sample(n, seed=seed)

            for J in [1, 3]:
                train, held_out = sample.split_tr_te(tr_proportion=0.3,
                                                     seed=seed + 1)

                V_opt, gw_opt, _ = gof.GaussFSSD.optimize_auto_init(
                    p, train, J, **opts)

                # Build the test from the optimized parameters and run it on
                # the held-out part.
                simulator = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
                optimized_test = gof.FSSD(p,
                                          kernel.KGauss(gw_opt),
                                          V_opt,
                                          null_sim=simulator,
                                          alpha=alpha)
                result = optimized_test.perform_test(
                    held_out, return_simulated_stats=True)
                assert result["h0_rejected"]
Пример #10
0
    def test_basic(self):
        """
        Smoke test: the p-value reported by the FSSD test lies in [0, 1].
        """
        seed = 12
        n = 100  # sample size
        alpha = 0.01
        for d in [1, 4]:
            model_mean = np.zeros(d)
            model_var = 1
            model = density.IsotropicNormal(model_mean, model_var)

            # The sample keeps the model's mean but has a larger variance.
            sample_mean = model_mean + 0
            sample_var = model_var + 1
            noise = util.randn(n, d, seed=seed)
            X = noise * np.sqrt(sample_var) + sample_mean
            dat = data.Data(X)

            for J in [1, 3]:
                # Gaussian kernel with the median-heuristic bandwidth
                bandwidth = util.meddistance(X, subsample=1000)**2
                gauss_kernel = kernel.KGauss(bandwidth)

                # random test locations
                locations = util.fit_gaussian_draw(X, J, seed=seed + 1)
                simulator = gof.FSSDH0SimCovObs(n_simulate=200, seed=3)
                fssd = gof.FSSD(model,
                                gauss_kernel,
                                locations,
                                null_sim=simulator,
                                alpha=alpha)

                outcome = fssd.perform_test(dat, return_simulated_stats=True)

                # a p-value must be a probability
                self.assertGreaterEqual(outcome["pvalue"], 0)
                self.assertLessEqual(outcome["pvalue"], 1)
Пример #11
0
def test_fssd_opt():
    """Test FSSD with optimized test locations

    The model p is an isotropic normal. Test locations and kernel bandwidth
    are optimized on a training split and the test is run on the remaining
    split: once for a sample from the matching DSIsotropicNormal source (H0
    must not be rejected) and once for a sample from a DSLaplace source (H0
    must be rejected).

    Following the example in:
    https://github.com/wittawatj/kernel-gof/blob/master/ipynb/demo_kgof.ipynb
    """
    seed = 42

    d = 2  # dimensionality
    n = 800  # samples

    # Density
    mean = np.zeros(d)
    variance = 1.0
    p = density.IsotropicNormal(mean, variance)

    # Optimization settings, shared by both runs. They are passed with **,
    # so the callee never receives this dict object itself.
    opts = {
        "reg": 1e-2,  # regularization parameter in the optimization objective
        "max_iter": 50,  # maximum number of gradient ascent iterations
        "tol_fun": 1e-7,  # termination tolerance of the objective
    }
    # J is the number of test locations (or features). Typically not larger than 10
    J = 1

    # Samples from same density
    ds = data.DSIsotropicNormal(mean, variance)
    samples = ds.sample(n, seed=seed + 1)

    # Split dataset
    tr, te = samples.split_tr_te(tr_proportion=0.2, seed=2)

    V_opt, gw_opt, opt_info = gof.GaussFSSD.optimize_auto_init(p, tr, J, **opts)
    print(V_opt)
    print(f"Kernel bandwidth: {gw_opt}")
    print(opt_info)

    # FSSD
    fssd_opt = gof.GaussFSSD(p, gw_opt, V_opt, alpha=0.01)
    test_result = fssd_opt.perform_test(te)
    print(test_result)
    # Truthiness check instead of `== False` (PEP 8).
    assert not test_result["h0_rejected"]

    # FSSD with samples from different density
    ds = data.DSLaplace(d=d, loc=0, scale=1.0 / np.sqrt(2))
    samples = ds.sample(n, seed=seed + 1)
    tr, te = samples.split_tr_te(tr_proportion=0.2, seed=2)
    V_opt, gw_opt, opt_info = gof.GaussFSSD.optimize_auto_init(p, tr, J, **opts)
    print(f"Kernel bandwidth: {gw_opt}")

    # FSSD
    fssd_opt = gof.GaussFSSD(p, gw_opt, V_opt, alpha=0.01)
    test_result = fssd_opt.perform_test(te)
    print(test_result)
    assert test_result["h0_rejected"]
Пример #12
0
def get_pqsource_list(prob_label):
    """
    Look up a family of problems indexed by one scalar parameter.

    Return [(prob_param, p, ds) for ... ], a list of tuples where
    - prob_param: the problem parameter. Each parameter has to be a scalar
      (so that it can be plotted later), preferably a positive integer.
    - p: a Density representing the distribution p
    - ds: a DataSource for that parameter setting. The DataSource generates
      samples from q.

    Raise ValueError when prob_label is not one of the known labels.

    NOTE(review): `sample_size` and `gaussbern_rbm_probs` are not defined in
    this function; they must be provided by the enclosing module. Every
    entry of the table is built on each call, whichever label is requested.
    """
    # parameter grids, one per problem family
    same_gauss_dims = [1, 5, 10, 15]
    mean_shift_dims = [5, 20, 40, 60]
    mean_shifts_d10 = [0, 0.02, 0.04, 0.06]
    inc_variances_d1 = [1, 1.5, 2, 2.5]
    inc_variances_d5 = [1, 1.5, 2, 2.5]
    sub1_variances_d1 = [0.1, 0.3, 0.5, 0.7]
    var_diff_dims = [1, 5, 10, 15]
    rbm_stds_dh10 = [0, 0.02, 0.04, 0.06]
    rbm_stds_dh40 = [0, 0.01, 0.02, 0.04, 0.06]
    laplace_dims = [1, 5, 10, 15]

    def std_normal(dim):
        # the model shared by most families: N(0, I) in `dim` dimensions
        return density.IsotropicNormal(np.zeros(dim), 1)

    problems = {
        # H0 is true. P = Q = N(0, I). Vary d.
        "sg": [(dim, std_normal(dim),
                data.DSIsotropicNormal(np.zeros(dim), 1))
               for dim in same_gauss_dims],
        # P = N(0, I), Q = N((1, 0, ..., 0), I). Vary d.
        "gmd": [(dim, std_normal(dim),
                 data.DSIsotropicNormal(
                     np.hstack((1, np.zeros(dim - 1))), 1))
                for dim in mean_shift_dims],
        # d=10. P = N(0, I), Q = N((m, 0, ..., 0), I). Vary m.
        "gmd_d10_ms": [(shift, std_normal(10),
                        data.DSIsotropicNormal(
                            np.hstack((shift, np.zeros(9))), 1))
                       for shift in mean_shifts_d10],
        # d=1. P = N(0, 1), Q = N(0, v). Increase v from 1.
        "gvinc_d1": [(v, std_normal(1),
                      data.DSIsotropicNormal(np.zeros(1), v))
                     for v in inc_variances_d1],
        # d=5. P = N(0, I), Q = N(0, v*I). Increase v from 1.
        "gvinc_d5": [(v, std_normal(5),
                      data.DSIsotropicNormal(np.zeros(5), v))
                     for v in inc_variances_d5],
        # d=1. P = N(0, 1), Q = N(0, v) with v below 1.
        "gvsub1_d1": [(v, std_normal(1),
                       data.DSIsotropicNormal(np.zeros(1), v))
                      for v in sub1_variances_d1],
        # Gaussian variance difference: only the variance of the first
        # dimension differs (2 instead of 1). Vary d.
        "gvd": [(dim, density.Normal(np.zeros(dim), np.eye(dim)),
                 data.DSNormal(
                     np.zeros(dim),
                     np.diag(np.hstack((2, np.ones(dim - 1))))))
                for dim in var_diff_dims],
        # Gaussian-Bernoulli RBM. dx=50, dh=10
        "gbrbm_dx50_dh10":
        gaussbern_rbm_probs(rbm_stds_dh10, dx=50, dh=10, n=sample_size),
        # Gaussian-Bernoulli RBM. dx=50, dh=40
        "gbrbm_dx50_dh40":
        gaussbern_rbm_probs(rbm_stds_dh40, dx=50, dh=40, n=sample_size),
        # P = N(0, I), Q = Laplace with scale 1/sqrt(2), which makes the
        # variance 1. Vary d.
        "glaplace": [(dim, std_normal(dim),
                      data.DSLaplace(d=dim, loc=0, scale=1.0 / np.sqrt(2)))
                     for dim in laplace_dims],
    }
    if prob_label in problems:
        return problems[prob_label]
    raise ValueError("Unknown problem label. Need to be one of %s" %
                     str(problems.keys()))
Пример #13
0
def get_pqsource(prob_label):
    """
    Look up a single fixed problem by its label.

    Return (p, ds), a tuple of
    - p: a Density representing the distribution p
    - ds: a DataSource which generates samples from q.

    Raise ValueError when prob_label is not one of the known labels.

    NOTE(review): `sample_size` and `gaussbern_rbm_tuple` are not defined in
    this function; they must be provided by the enclosing module. Every
    entry of the table is built on each call, whichever label is requested.
    """

    def std_normal(dim):
        # the model shared by most problems: N(0, I) in `dim` dimensions
        return density.IsotropicNormal(np.zeros(dim), 1)

    problems = {
        # H0 is true. d=5. P = Q = N(0, I)
        "sg5": (
            std_normal(5),
            data.DSIsotropicNormal(np.zeros(5), 1),
        ),
        # d=5. P = N(0, I), Q = N((0.2, 0, ..., 0), I)
        "gmd5": (
            std_normal(5),
            data.DSIsotropicNormal(np.hstack((0.2, np.zeros(4))), 1),
        ),
        # d=1. P = N(0, 1), Q = N(0.2, 1)
        "gmd1": (
            std_normal(1),
            data.DSIsotropicNormal(np.ones(1) * 0.2, 1),
        ),
        # d=100. P = N(0, I), Q = N((1, 0, ..., 0), I)
        "gmd100": (
            std_normal(100),
            data.DSIsotropicNormal(np.hstack((1, np.zeros(99))), 1),
        ),
        # Gaussian variance difference: only the variance of the first
        # dimension differs (2 instead of 1). d=5.
        "gvd5": (
            density.Normal(np.zeros(5), np.eye(5)),
            data.DSNormal(np.zeros(5), np.diag(np.hstack((2, np.ones(4))))),
        ),
        # Same as above with d=10.
        "gvd10": (
            density.Normal(np.zeros(10), np.eye(10)),
            data.DSNormal(np.zeros(10), np.diag(np.hstack((2, np.ones(9))))),
        ),
        # Gaussian-Bernoulli RBM. dx=50, dh=10. H0 is true
        "gbrbm_dx50_dh10_v0":
        gaussbern_rbm_tuple(0, dx=50, dh=10, n=sample_size),
        # Gaussian-Bernoulli RBM. dx=5, dh=3. H0 is true
        "gbrbm_dx5_dh3_v0":
        gaussbern_rbm_tuple(0, dx=5, dh=3, n=sample_size),
        # Gaussian-Bernoulli RBM. dx=50, dh=10. First argument is 1e-3.
        "gbrbm_dx50_dh10_v1em3":
        gaussbern_rbm_tuple(1e-3, dx=50, dh=10, n=sample_size),
        # Gaussian-Bernoulli RBM. dx=5, dh=3. First argument is 5e-3.
        # NOTE(review): the previous comment said "noise = 1e-2"; the code
        # passes 5e-3 -- confirm which one is intended.
        "gbrbm_dx5_dh3_v5em3":
        gaussbern_rbm_tuple(5e-3, dx=5, dh=3, n=sample_size),
        # 1-d Gaussian mixtures of two components, default mixture weights.
        # p: means (0, 3) with variances (1, 0.01)
        # q: means (-3, 0) with variances (0.01, 1)
        "gmm_d1": (
            density.IsoGaussianMixture(np.array([[0], [3.0]]),
                                       np.array([1, 0.01])),
            data.DSIsoGaussianMixture(np.array([[-3.0], [0]]),
                                      np.array([0.01, 1])),
        ),
        # d=5. p = N(0, I)
        # q = 0.1*N(0, 0.0001) + 0.9*N(0, 1)
        # NOTE(review): the previous comment described the first component
        # as N([-10, 0, ..., 0], 0.001); the code uses a zero mean and
        # variance 0.0001 -- confirm which one is intended.
        "g_vs_gmm_d5": (
            std_normal(5),
            data.DSIsoGaussianMixture(
                np.vstack((np.hstack((0.0, np.zeros(4))), np.zeros(5))),
                np.array([0.0001, 1]),
                pmix=[0.1, 0.9],
            ),
        ),
        # d=2. p = N(0, I), q = 0.1*N(0, 0.01) + 0.9*N(0, 1)
        "g_vs_gmm_d2": (
            std_normal(2),
            data.DSIsoGaussianMixture(
                np.vstack((np.hstack((0.0, np.zeros(1))), np.zeros(2))),
                np.array([0.01, 1]),
                pmix=[0.1, 0.9],
            ),
        ),
        # d=1. p = N(0, 1), q = 0.1*N(0, 0.01) + 0.9*N(0, 1)
        "g_vs_gmm_d1": (
            std_normal(1),
            data.DSIsoGaussianMixture(np.array([[0.0], [0]]),
                                      np.array([0.01, 1]),
                                      pmix=[0.1, 0.9]),
        ),
    }
    if prob_label in problems:
        return problems[prob_label]
    raise ValueError("Unknown problem label. Need to be one of %s" %
                     str(problems.keys()))