def test_ustat_h1_mean_variance(self):
    """Sanity-check ``gof.FSSD.ustat_h1_mean_variance``.

    For every combination of dimension ``d`` and number of test locations
    ``J``, a sample is drawn with a mean and variance that differ from the
    model's, and both returned estimates are required to be non-negative.
    """
    seed = 20
    n = 200  # sample size
    alpha = 0.01
    for d in (1, 4):
        # Model under test.
        mean = np.zeros(d)
        variance = 1
        isonorm = density.IsotropicNormal(mean, variance)

        # The sample is drawn with a shifted mean and a larger variance.
        shifted_mean = mean + 2
        sample_std = np.sqrt(variance + 1)
        X = util.randn(n, d, seed=seed) * sample_std + shifted_mean
        # Not used below; kept so that the Data constructor still runs on X.
        dat = data.Data(X)

        for J in (1, 3):
            # Gaussian kernel with the median heuristic.
            gwidth2 = util.meddistance(X, subsample=1000) ** 2
            k = kernel.KGauss(gwidth2)

            # Random test locations.
            V = util.fit_gaussian_draw(X, J, seed=seed + 1)

            fssd = gof.FSSD(
                isonorm,
                k,
                V,
                null_sim=gof.FSSDH0SimCovObs(n_simulate=200, seed=3),
                alpha=alpha,
            )
            u_mean, u_variance = gof.FSSD.ustat_h1_mean_variance(
                fssd.feature_tensor(X))

            self.assertGreaterEqual(u_variance, 0)
            # The sample does not follow the model, so H0 should be rejected.
            self.assertGreaterEqual(u_mean, 0)
def test_basic(self):
    """Check that a Stein witness evaluated at J points yields a (J, d) array.

    The witness is built from a model ``p`` and a sample drawn from ``q``,
    an isotropic normal with the same variance and a shifted mean.
    """
    d = 3
    p = density.IsotropicNormal(mean=np.zeros(d), variance=3.0)
    q = density.IsotropicNormal(mean=np.zeros(d) + 2, variance=3.0)
    k = kernel.KGauss(2.0)

    ds = q.get_datasource()
    n = 97
    dat = ds.sample(n, seed=3)

    witness = gof.SteinWitness(p, k, dat)

    # Points at which to evaluate the witness. A locally seeded generator
    # keeps the test reproducible and leaves the global NumPy RNG state
    # untouched.
    J = 4
    rng = np.random.RandomState(11)
    V = rng.randn(J, d) * 2

    evals = witness(V)
    testing.assert_equal(evals.shape, (J, d))
def test_ksd():
    """Test quadratic time KSD

    Following the example in:
    https://github.com/wittawatj/kernel-gof/blob/master/ipynb/gof_kernel_stein.ipynb

    The model ``p`` is a 2-d isotropic normal. The test must not reject H0
    for a sample drawn from the same normal, and must reject H0 for a sample
    drawn from a Laplace data source.
    """
    seed = 42
    d = 2  # dimensionality
    n = 800  # samples

    # Density
    mean = np.zeros(d)
    variance = 1.0
    p = density.IsotropicNormal(mean, variance)

    def run_ksd(samples):
        """Run the KSD test of ``p`` on ``samples`` and return its result."""
        # Gaussian kernel with median heuristic
        sig2 = util.meddistance(samples.data(), subsample=1000) ** 2
        print(f"Kernel bandwidth: {sig2}")
        k = kernel.KGauss(sig2)

        kstein = gof.KernelSteinTest(
            p,
            k,
            bootstrapper=gof.bootstrapper_rademacher,
            alpha=0.01,
            n_simulate=500,
            seed=seed + 1,
        )
        test_result = kstein.perform_test(
            samples,
            return_simulated_stats=False,
            return_ustat_gram=False,
        )
        print(test_result)
        return test_result

    # Samples from same density
    ds = data.DSIsotropicNormal(mean, variance)
    test_result = run_ksd(ds.sample(n, seed=seed + 1))
    # Truthiness rather than `== False`, so plain and NumPy booleans both work.
    assert not test_result["h0_rejected"], "H0 rejected for samples drawn from p"

    # KSD with samples from different density
    ds = data.DSLaplace(d=d, loc=0, scale=1.0 / np.sqrt(2))
    test_result = run_ksd(ds.sample(n, seed=seed + 1))
    assert test_result["h0_rejected"], "H0 not rejected for Laplace samples"
def get_ns_pqsource(prob_label):
    """
    Look up a "vary the sample size" problem by its label.

    Return (ns, p, ds), a tuple where
    - ns: a list of sample sizes
    - p: a Density representing the distribution p
    - ds: a DataSource which generates the sample from q

    Raise ValueError when prob_label is not one of the known labels.
    Note that every problem in the table is constructed on each call, not
    only the requested one.
    """

    def thousands(first, last):
        # Sample sizes first*1000, ..., last*1000 (inclusive), as a new list.
        return [1000 * i for i in range(first, last + 1)]

    gmd_p01_d10_ns = [1000, 3000, 5000]

    prob2tuples = {
        # d = 10. P = N(0, I), Q = N((0.03, 0, ..., 0), I)
        "gmd_p03_d10_ns": (
            gmd_p01_d10_ns,
            density.IsotropicNormal(np.zeros(10), 1),
            data.DSIsotropicNormal(np.hstack((0.03, np.zeros(10 - 1))), 1),
        ),
        # Gaussian-Bernoulli RBM problems. gbrbm_perturb supplies the
        # remaining items of each tuple; var_perturb_B=0 is the
        # "no perturbation" setting.
        # dx=50, dh=10, var_perturb_B=0.1
        "gbrbm_dx50_dh10_vp1":
        (thousands(1, 4), ) + gbrbm_perturb(var_perturb_B=0.1, dx=50, dh=10),
        # dx=50, dh=40, var_perturb_B=0.1
        "gbrbm_dx50_dh40_vp1":
        (thousands(1, 4), ) + gbrbm_perturb(var_perturb_B=0.1, dx=50, dh=40),
        # dx=50, dh=10, no perturbation
        "gbrbm_dx50_dh10_h0":
        (thousands(1, 4), ) + gbrbm_perturb(var_perturb_B=0, dx=50, dh=10),
        # dx=50, dh=40, no perturbation
        "gbrbm_dx50_dh40_h0":
        (thousands(1, 4), ) + gbrbm_perturb(var_perturb_B=0, dx=50, dh=40),
        # dx=20, dh=10, var_perturb_B=0.1
        "gbrbm_dx20_dh10_vp1":
        (thousands(2, 5), ) + gbrbm_perturb(var_perturb_B=0.1, dx=20, dh=10),
        # dx=20, dh=10, no perturbation
        "gbrbm_dx20_dh10_h0":
        (thousands(2, 5), ) + gbrbm_perturb(var_perturb_B=0, dx=20, dh=10),
    }

    if prob_label in prob2tuples:
        return prob2tuples[prob_label]
    raise ValueError("Unknown problem label. Need to be one of %s" %
                     str(prob2tuples.keys()))
def test_fssd():
    """Test FSSD with Gaussian kernel (median heuristic) and randomized test locations

    Following the example in:
    https://github.com/wittawatj/kernel-gof/blob/master/kgof/ex/ex1_vary_n.py

    The model ``p`` is a 2-d isotropic normal. The test must not reject H0
    for a sample drawn from the same normal, and must reject H0 for a sample
    drawn from a Laplace data source.
    """
    seed = 42
    d = 2  # dimensionality
    n = 800  # samples
    J = 10  # number of test locations. Fails with J=8, passes with J=10 (chance)

    # Density
    mean = np.zeros(d)
    variance = 1.0
    p = density.IsotropicNormal(mean, variance)

    def run_fssd(samples, n_simulate):
        """Run the FSSD test of ``p`` on ``samples`` and return its result."""
        # Gaussian kernel with median heuristic
        # NOTE: For the Laplace sample this works much better with the
        # bandwidth that was optimized under FSSD:
        # sig2 = 0.3228712361986835
        sig2 = util.meddistance(samples.data(), subsample=1000) ** 2
        k = kernel.KGauss(sig2)
        print(f"Kernel bandwidth: {sig2}")

        null_sim = gof.FSSDH0SimCovObs(n_simulate=n_simulate, seed=seed)
        # Fit a multivariate normal to the data X (n x d) and draw J points
        # from the fit.
        # TODO: is this what we want if samples come from another
        # distribution ?!
        V = util.fit_gaussian_draw(samples.data(), J=J, seed=seed + 1)

        fssd_med = gof.FSSD(p, k, V, null_sim=null_sim, alpha=0.01)
        test_result = fssd_med.perform_test(samples)
        print(test_result)
        return test_result

    # Samples from same density
    ds = data.DSIsotropicNormal(mean, variance)
    test_result = run_fssd(ds.sample(n, seed=seed + 1), n_simulate=2000)
    # Truthiness rather than `== False`, so plain and NumPy booleans both work.
    assert not test_result["h0_rejected"], "H0 rejected for samples drawn from p"

    # FSSD with samples from different density
    ds = data.DSLaplace(d=d, loc=0, scale=1.0 / np.sqrt(2))
    test_result = run_fssd(ds.sample(n, seed=seed + 1), n_simulate=3000)
    assert test_result["h0_rejected"], "H0 not rejected for Laplace samples"
def test_grad_log(self):
    """Compare ``IsotropicNormal.grad_log`` with ``-(X - mean) / variance``.

    Runs once for d=4 and once for d=1, with random means and inputs drawn
    inside a fixed NumPy seed context.
    """
    n = 8
    variance = 1.2
    with util.NumpySeedContext(seed=17):
        for d in (4, 1):
            # The order of these two draws matters for reproducibility.
            mean = np.random.randn(d) + 1
            X = np.random.rand(n, d) - 2

            actual = density.IsotropicNormal(mean, variance).grad_log(X)
            expected = -(X - mean) / variance

            np.testing.assert_almost_equal(actual, expected)
def test_log_den(self):
    """Compare ``IsotropicNormal.log_den`` with a direct computation.

    The reference value is ``-||x - mean||^2 / (2 * variance)`` for each row
    of X; it carries no additive normalising term.
    """
    n = 7
    variance = 1.1
    with util.NumpySeedContext(seed=16):
        for d in (3, 1):
            # The order of these two draws matters for reproducibility.
            mean = np.random.randn(d)
            X = np.random.rand(n, d) + 1

            actual = density.IsotropicNormal(mean, variance).log_den(X)
            sq_dist = np.sum((X - mean) ** 2, 1)
            expected = -sq_dist / (2.0 * variance)

            np.testing.assert_almost_equal(actual, expected)
def test_optimized_fssd(self):
    """
    FSSD test whose test locations and Gaussian width are optimized on a
    training split, starting from a hand-made initialization.

    The sample mean is shifted by 4 relative to the model, so H0 is expected
    to be rejected on the held-out split for every (d, J) combination.
    """
    seed = 4
    n = 179  # sample size
    alpha = 0.01
    # Optimizer settings; unpacked as keyword arguments for each call.
    opts = {"reg": 1e-2, "max_iter": 10, "tol_fun": 1e-3, "disp": False}

    for d in (1, 3):
        mean = np.zeros(d)
        variance = 1.0
        p = density.IsotropicNormal(mean, variance)

        # Mean difference: an obvious reject.
        ds = data.DSIsotropicNormal(mean + 4, variance + 0)
        dat = ds.sample(n, seed=seed)

        for J in (1, 4):
            tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed + 1)

            # Initial width (median heuristic) and random initial locations,
            # both computed from the training part only.
            Xtr = tr.X
            gwidth0 = util.meddistance(Xtr, subsample=1000) ** 2
            V0 = util.fit_gaussian_draw(Xtr, J, seed=seed + 1)

            V_opt, gw_opt, opt_result = gof.GaussFSSD.optimize_locs_widths(
                p, tr, gwidth0, V0, **opts)

            # Build the test from the optimized parameters and run it on
            # the held-out part.
            fssd_opt = gof.FSSD(
                p,
                kernel.KGauss(gw_opt),
                V_opt,
                null_sim=gof.FSSDH0SimCovObs(n_simulate=2000, seed=10),
                alpha=alpha,
            )
            fssd_opt_result = fssd_opt.perform_test(
                te, return_simulated_stats=True)
            assert fssd_opt_result["h0_rejected"]
def test_auto_init_opt_fssd(self):
    """
    FSSD-opt test where ``gof.GaussFSSD.optimize_auto_init`` picks the
    initial parameters itself.

    The sample mean is shifted by 4 relative to the model, so H0 is expected
    to be rejected on the held-out split for every (d, J) combination.
    """
    seed = 5
    n = 191  # sample size
    alpha = 0.01
    # Optimizer settings; unpacked as keyword arguments for each call.
    opts = {"reg": 1e-2, "max_iter": 10, "tol_fun": 1e-3, "disp": False}

    for d in (1, 4):
        mean = np.zeros(d)
        variance = 1.0
        p = density.IsotropicNormal(mean, variance)

        # Mean difference: an obvious reject.
        ds = data.DSIsotropicNormal(mean + 4, variance + 0)
        dat = ds.sample(n, seed=seed)

        for J in (1, 3):
            tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed + 1)

            V_opt, gw_opt, opt_result = gof.GaussFSSD.optimize_auto_init(
                p, tr, J, **opts)

            # Build the test from the optimized parameters and run it on
            # the held-out part.
            fssd_opt = gof.FSSD(
                p,
                kernel.KGauss(gw_opt),
                V_opt,
                null_sim=gof.FSSDH0SimCovObs(n_simulate=2000, seed=10),
                alpha=alpha,
            )
            fssd_opt_result = fssd_opt.perform_test(
                te, return_simulated_stats=True)
            assert fssd_opt_result["h0_rejected"]
def test_basic(self):
    """
    Smoke test for ``gof.FSSD.perform_test``.

    Only checks that the reported p-value lies in [0, 1] for every
    combination of dimension d and number of test locations J.
    """
    seed = 12
    n = 100  # sample size
    alpha = 0.01
    for d in (1, 4):
        mean = np.zeros(d)
        variance = 1
        isonorm = density.IsotropicNormal(mean, variance)

        # The sample keeps the model's mean but has a larger variance.
        sample_mean = mean + 0
        sample_std = np.sqrt(variance + 1)
        X = util.randn(n, d, seed=seed) * sample_std + sample_mean
        dat = data.Data(X)

        for J in (1, 3):
            # Gaussian kernel with the median heuristic.
            gwidth2 = util.meddistance(X, subsample=1000) ** 2
            k = kernel.KGauss(gwidth2)

            # Random test locations.
            V = util.fit_gaussian_draw(X, J, seed=seed + 1)

            fssd = gof.FSSD(
                isonorm,
                k,
                V,
                null_sim=gof.FSSDH0SimCovObs(n_simulate=200, seed=3),
                alpha=alpha,
            )
            tresult = fssd.perform_test(dat, return_simulated_stats=True)

            pvalue = tresult["pvalue"]
            self.assertGreaterEqual(pvalue, 0)
            self.assertLessEqual(pvalue, 1)
def test_fssd_opt():
    """Test FSSD with optimized test locations

    Following the example in:
    https://github.com/wittawatj/kernel-gof/blob/master/ipynb/demo_kgof.ipynb

    The model ``p`` is a 2-d isotropic normal. Test locations and the
    Gaussian width are optimized on a training split, and the test is run on
    the held-out split. It must not reject H0 for a sample drawn from the
    same normal, and must reject H0 for a sample drawn from a Laplace data
    source.
    """
    seed = 42
    d = 2  # dimensionality
    n = 800  # samples

    # Density
    mean = np.zeros(d)
    variance = 1.0
    p = density.IsotropicNormal(mean, variance)

    # Optimization settings, shared by both scenarios.
    opts = {
        "reg": 1e-2,  # regularization parameter in the optimization objective
        "max_iter": 50,  # maximum number of gradient ascent iterations
        "tol_fun": 1e-7,  # termination tolerance of the objective
    }
    # J is the number of test locations (or features). Typically not larger
    # than 10.
    J = 1

    # Samples from same density
    ds = data.DSIsotropicNormal(mean, variance)
    samples = ds.sample(n, seed=seed + 1)

    # Split dataset
    tr, te = samples.split_tr_te(tr_proportion=0.2, seed=2)

    V_opt, gw_opt, opt_info = gof.GaussFSSD.optimize_auto_init(p, tr, J, **opts)
    print(V_opt)
    print(f"Kernel bandwidth: {gw_opt}")
    print(opt_info)

    # FSSD
    fssd_opt = gof.GaussFSSD(p, gw_opt, V_opt, alpha=0.01)
    test_result = fssd_opt.perform_test(te)
    print(test_result)
    # Truthiness rather than `== False`, so plain and NumPy booleans both work.
    assert not test_result["h0_rejected"], "H0 rejected for samples drawn from p"

    # FSSD with samples from different density
    ds = data.DSLaplace(d=d, loc=0, scale=1.0 / np.sqrt(2))
    samples = ds.sample(n, seed=seed + 1)
    tr, te = samples.split_tr_te(tr_proportion=0.2, seed=2)

    V_opt, gw_opt, opt_info = gof.GaussFSSD.optimize_auto_init(p, tr, J, **opts)
    print(f"Kernel bandwidth: {gw_opt}")

    # FSSD
    fssd_opt = gof.GaussFSSD(p, gw_opt, V_opt, alpha=0.01)
    test_result = fssd_opt.perform_test(te)
    print(test_result)
    assert test_result["h0_rejected"], "H0 not rejected for Laplace samples"
def get_pqsource_list(prob_label):
    """
    Look up a family of problems by its label.

    Return [(prob_param, p, ds) for ... ], a list of tuples where
    - prob_param: the problem parameter of that setting. Each parameter has
        to be a scalar (so that it can be plotted later), preferably a
        positive integer.
    - p: a Density representing the distribution p
    - ds: a DataSource for that parameter setting. The DataSource generates
        the sample from q.

    Raise ValueError when prob_label is not one of the known labels.
    Note that every family in the table is constructed on each call, not
    only the requested one.
    """
    # Parameter grids, one per problem family.
    sg_ds = [1, 5, 10, 15]
    gmd_ds = [5, 20, 40, 60]
    gmd_d10_ms = [0, 0.02, 0.04, 0.06]  # values of the shifted mean entry
    gvinc_d1_vs = [1, 1.5, 2, 2.5]
    gvinc_d5_vs = [1, 1.5, 2, 2.5]
    gvsub1_d1_vs = [0.1, 0.3, 0.5, 0.7]
    gvd_ds = [1, 5, 10, 15]
    gb_rbm_dx50_dh10_stds = [0, 0.02, 0.04, 0.06]
    gb_rbm_dx50_dh40_stds = [0, 0.01, 0.02, 0.04, 0.06]
    glaplace_ds = [1, 5, 10, 15]

    def std_normal(dim):
        # The model p = N(0, I) in `dim` dimensions.
        return density.IsotropicNormal(np.zeros(dim), 1)

    def first_coord(value, dim):
        # Length-`dim` vector (value, 0, ..., 0).
        return np.hstack((value, np.zeros(dim - 1)))

    prob2tuples = {
        # H0 is true. Vary d. P = Q = N(0, I)
        "sg": [
            (dim, std_normal(dim), data.DSIsotropicNormal(np.zeros(dim), 1))
            for dim in sg_ds
        ],
        # Vary d. P = N(0, I), Q = N((1, 0, ..., 0), I)
        "gmd": [
            (dim, std_normal(dim),
             data.DSIsotropicNormal(first_coord(1, dim), 1))
            for dim in gmd_ds
        ],
        # d=10. P = N(0, I), Q = N((m, 0, ..., 0), I). Vary m
        "gmd_d10_ms": [
            (shift, std_normal(10),
             data.DSIsotropicNormal(first_coord(shift, 10), 1))
            for shift in gmd_d10_ms
        ],
        # d=1. Increase the variance. P = N(0, I), Q = N(0, v*I)
        "gvinc_d1": [
            (v, std_normal(1), data.DSIsotropicNormal(np.zeros(1), v))
            for v in gvinc_d1_vs
        ],
        # d=5. Increase the variance. P = N(0, I), Q = N(0, v*I)
        "gvinc_d5": [
            (v, std_normal(5), data.DSIsotropicNormal(np.zeros(5), v))
            for v in gvinc_d5_vs
        ],
        # d=1. P = N(0, 1), Q = N(0, v) with variances below 1
        "gvsub1_d1": [
            (v, std_normal(1), data.DSIsotropicNormal(np.zeros(1), v))
            for v in gvsub1_d1_vs
        ],
        # Gaussian variance difference problem. Only the variance of the
        # first dimension differs (2 instead of 1). Vary d.
        "gvd": [
            (
                dim,
                density.Normal(np.zeros(dim), np.eye(dim)),
                data.DSNormal(np.zeros(dim),
                              np.diag(np.hstack((2, np.ones(dim - 1))))),
            )
            for dim in gvd_ds
        ],
        # Gaussian Bernoulli RBM. dx=50, dh=10
        "gbrbm_dx50_dh10":
        gaussbern_rbm_probs(gb_rbm_dx50_dh10_stds, dx=50, dh=10,
                            n=sample_size),
        # Gaussian Bernoulli RBM. dx=50, dh=40
        "gbrbm_dx50_dh40":
        gaussbern_rbm_probs(gb_rbm_dx50_dh40_stds, dx=50, dh=40,
                            n=sample_size),
        # P = N(0, I), Q = Laplace. Vary d.
        # The original note says the 1/sqrt(2) scale makes the variance 1.
        "glaplace": [
            (dim, std_normal(dim),
             data.DSLaplace(d=dim, loc=0, scale=1.0 / np.sqrt(2)))
            for dim in glaplace_ds
        ],
    }

    if prob_label in prob2tuples:
        return prob2tuples[prob_label]
    raise ValueError("Unknown problem label. Need to be one of %s" %
                     str(prob2tuples.keys()))
def get_pqsource(prob_label):
    """
    Look up a single problem by its label.

    Return (p, ds), a tuple of
    - p: a Density representing the distribution p
    - ds: a DataSource which generates the sample from q

    Raise ValueError when prob_label is not one of the known labels.
    Note that every problem in the table is constructed on each call, not
    only the requested one.
    """

    def std_normal(dim):
        # The model p = N(0, I) in `dim` dimensions.
        return density.IsotropicNormal(np.zeros(dim), 1)

    def first_coord(value, dim):
        # Length-`dim` vector (value, 0, ..., 0).
        return np.hstack((value, np.zeros(dim - 1)))

    def var_diff(dim):
        # P = N(0, I); Q has covariance diag(2, 1, ..., 1).
        return (
            density.Normal(np.zeros(dim), np.eye(dim)),
            data.DSNormal(np.zeros(dim),
                          np.diag(np.hstack((2, np.ones(dim - 1))))),
        )

    prob2tuples = {
        # H0 is true. d=5. P = Q = N(0, I)
        "sg5": (std_normal(5), data.DSIsotropicNormal(np.zeros(5), 1)),
        # d=5. P = N(0, I), Q = N((0.2, 0, ..., 0), I)
        "gmd5": (std_normal(5),
                 data.DSIsotropicNormal(first_coord(0.2, 5), 1)),
        # d=1. P = N(0, 1), Q = N(0.2, 1)
        "gmd1": (std_normal(1), data.DSIsotropicNormal(np.ones(1) * 0.2, 1)),
        # d=100. P = N(0, I), Q = N((1, 0, ..., 0), I)
        "gmd100": (std_normal(100),
                   data.DSIsotropicNormal(first_coord(1, 100), 1)),
        # Gaussian variance difference problems. Only the variance of the
        # first dimension differs.
        "gvd5": var_diff(5),
        "gvd10": var_diff(10),
        # Gaussian Bernoulli RBM problems. The first argument of
        # gaussbern_rbm_tuple is 0 for the entries labelled "v0", which the
        # original notes describe as "H0 is true".
        "gbrbm_dx50_dh10_v0":
        gaussbern_rbm_tuple(0, dx=50, dh=10, n=sample_size),
        "gbrbm_dx5_dh3_v0":
        gaussbern_rbm_tuple(0, dx=5, dh=3, n=sample_size),
        # dx=50, dh=10, first argument 1e-3
        "gbrbm_dx50_dh10_v1em3":
        gaussbern_rbm_tuple(1e-3, dx=50, dh=10, n=sample_size),
        # dx=5, dh=3, first argument 5e-3
        "gbrbm_dx5_dh3_v5em3":
        gaussbern_rbm_tuple(5e-3, dx=5, dh=3, n=sample_size),
        # d=1, two-component Gaussian mixtures with no explicit weights.
        # Presumably (second argument = per-component variances, uniform
        # default weights -- TODO confirm):
        # p = 0.5*N(0, 1) + 0.5*N(3, 0.01)
        # q = 0.5*N(-3, 0.01) + 0.5*N(0, 1)
        "gmm_d1": (
            density.IsoGaussianMixture(np.array([[0], [3.0]]),
                                       np.array([1, 0.01])),
            data.DSIsoGaussianMixture(np.array([[-3.0], [0]]),
                                      np.array([0.01, 1])),
        ),
        # p = N(0, I). q is a two-component mixture with pmix = [0.1, 0.9].
        # NOTE(review): both component means are the zero vector here and
        # the second argument is [0.0001, 1]; an earlier note described a
        # first component centred at (-10, 0, ..., 0) -- confirm the intent.
        "g_vs_gmm_d5": (
            std_normal(5),
            data.DSIsoGaussianMixture(
                np.vstack((first_coord(0.0, 5), np.zeros(5))),
                np.array([0.0001, 1]),
                pmix=[0.1, 0.9],
            ),
        ),
        # Same construction in d=2, with second argument [0.01, 1].
        "g_vs_gmm_d2": (
            std_normal(2),
            data.DSIsoGaussianMixture(
                np.vstack((first_coord(0.0, 2), np.zeros(2))),
                np.array([0.01, 1]),
                pmix=[0.1, 0.9],
            ),
        ),
        # Same construction in d=1, with second argument [0.01, 1].
        "g_vs_gmm_d1": (
            std_normal(1),
            data.DSIsoGaussianMixture(np.array([[0.0], [0]]),
                                      np.array([0.01, 1]),
                                      pmix=[0.1, 0.9]),
        ),
    }

    if prob_label in prob2tuples:
        return prob2tuples[prob_label]
    raise ValueError("Unknown problem label. Need to be one of %s" %
                     str(prob2tuples.keys()))