def test_fssd():
    """Test FSSD with Gaussian kernel (median heuristic) and randomized test locations.

    The model ``p`` is an isotropic normal with zero mean and unit variance.
    Two cases are checked at ``alpha=0.01``:

    1. Samples drawn from the same isotropic normal: H0 must not be rejected.
    2. Samples drawn from ``data.DSLaplace``: H0 must be rejected.

    Following the example in:
    https://github.com/wittawatj/kernel-gof/blob/master/kgof/ex/ex1_vary_n.py
    """
    seed = 42
    d = 2  # dimensionality
    n = 800  # samples
    # Number of test locations, shared by both cases.
    # The Laplace case fails with J=8, passes with J=10 (chance).
    J = 10

    # Density
    mean = np.zeros(d)
    variance = 1.0
    p = density.IsotropicNormal(mean, variance)

    # Samples from same density
    ds = data.DSIsotropicNormal(mean, variance)
    samples = ds.sample(n, seed=seed + 1)
    test_result = _fssd_median_heuristic_test(
        p, samples, J=J, n_simulate=2000, seed=seed
    )
    # Truthiness check instead of `== False` (PEP 8 / E712).
    assert not test_result["h0_rejected"]

    # FSSD with samples from different density
    ds = data.DSLaplace(d=d, loc=0, scale=1.0 / np.sqrt(2))
    samples = ds.sample(n, seed=seed + 1)
    # NOTE: Works much better with the bandwidth that was optimized under FSSD:
    # sig2 = 0.3228712361986835
    # TODO: the test locations are drawn from a Gaussian fitted to `samples`;
    # is this what we want if samples come from another distribution ?!
    test_result = _fssd_median_heuristic_test(
        p, samples, J=J, n_simulate=3000, seed=seed
    )
    assert test_result["h0_rejected"]


def _fssd_median_heuristic_test(p, samples, J, n_simulate, seed, alpha=0.01):
    """Run one FSSD test of ``samples`` against ``p`` and return its result.

    The Gaussian kernel width is the squared median distance of the sample
    (median heuristic), and the ``J`` test locations are drawn from a Gaussian
    fitted to the sample. The bandwidth and the test result are printed.

    p: the model density handed to ``gof.FSSD``
    samples: object exposing ``data()``; also passed to ``perform_test``
    J: number of test locations to draw
    n_simulate: ``n_simulate`` argument of ``gof.FSSDH0SimCovObs``
    seed: seeds the null simulator; ``seed + 1`` seeds the location draw
    alpha: significance level handed to ``gof.FSSD``

    Returns whatever ``gof.FSSD.perform_test`` returns (indexed with
    ``"h0_rejected"`` by the caller).
    """
    # Gaussian kernel with median heuristic
    sig2 = util.meddistance(samples.data(), subsample=1000) ** 2
    k = kernel.KGauss(sig2)
    print(f"Kernel bandwidth: {sig2}")

    null_sim = gof.FSSDH0SimCovObs(n_simulate=n_simulate, seed=seed)

    # Fit a multivariate normal to the data X (n x d) and draw J points from the fit.
    V = util.fit_gaussian_draw(samples.data(), J=J, seed=seed + 1)

    fssd_med = gof.FSSD(p, k, V, null_sim=null_sim, alpha=alpha)
    test_result = fssd_med.perform_test(samples)
    print(test_result)
    return test_result
def job_fssdJ1q_med(p, data_source, tr, te, r, J=1, null_sim=None):
    """
    Run FSSD with a Gaussian kernel on the combined sample ``tr + te``.

    Test locations are drawn at random from a Gaussian fitted to the sample,
    and the Gaussian kernel width is the squared median distance (median
    heuristic). There is no training/testing split: the full sample is used
    both to set the parameters and to run the test.

    p: an UnnormalizedDensity
    data_source: a DataSource (not read by this function)
    tr, te: Data
    r: trial number (positive integer); seeds the null simulator (r) and the
        location draw (r + 1)
    J: passed to util.fit_gaussian_draw as the number of locations
    null_sim: null simulator to use; a gof.FSSDH0SimCovObs is created when None

    Return a dict with keys "test_result" and "time_secs".

    NOTE(review): ``alpha`` is not defined in this function; it is resolved
    from the enclosing scope - presumably a module-level constant, confirm.
    """
    h0_sim = null_sim
    if h0_sim is None:
        h0_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=r)

    # Merge both splits; the whole sample is used.
    full_sample = tr + te
    X_full = full_sample.data()

    with util.ContextTimer() as t:
        # Median heuristic: squared median distance as the Gaussian width.
        median_dist = util.meddistance(X_full, subsample=1000)
        gauss_kernel = kernel.KGauss(median_dist**2)

        locations = util.fit_gaussian_draw(X_full, J, seed=r + 1)

        fssd_med = gof.FSSD(
            p, gauss_kernel, locations, null_sim=h0_sim, alpha=alpha
        )
        outcome = fssd_med.perform_test(full_sample)

    # t.secs is read after the timer context has exited.
    return {"test_result": outcome, "time_secs": t.secs}
def test_ustat_h1_mean_variance(self):
    """
    Check that gof.FSSD.ustat_h1_mean_variance returns a non-negative mean
    and a non-negative variance.

    The model is an isotropic normal (zero mean, variance 1). The sample is
    util.randn output scaled by sqrt(variance + 1) and shifted by 2, so it
    does not match the model. Runs for d in {1, 4} and J in {1, 3}.
    """
    seed = 20
    n = 200  # sample size
    alpha = 0.01

    for d in [1, 4]:
        model_mean = np.zeros(d)
        model_variance = 1
        isonorm = density.IsotropicNormal(model_mean, model_variance)

        # Sample whose mean and variance both differ from the model's.
        shifted_mean = model_mean + 2
        inflated_variance = model_variance + 1
        X = util.randn(n, d, seed=seed) * np.sqrt(inflated_variance) + shifted_mean
        dat = data.Data(X)  # constructed but not used below

        for J in [1, 3]:
            # Gaussian kernel, width from the median heuristic.
            width2 = util.meddistance(X, subsample=1000) ** 2
            gauss_kernel = kernel.KGauss(width2)

            # Random test locations drawn from a Gaussian fitted to X.
            locations = util.fit_gaussian_draw(X, J, seed=seed + 1)
            h0_sim = gof.FSSDH0SimCovObs(n_simulate=200, seed=3)
            fssd = gof.FSSD(
                isonorm, gauss_kernel, locations, null_sim=h0_sim, alpha=alpha
            )

            features = fssd.feature_tensor(X)
            stat_mean, stat_variance = gof.FSSD.ustat_h1_mean_variance(features)

            # A variance can never be negative.
            self.assertGreaterEqual(stat_variance, 0)
            # Sample differs from the model (H0 should be rejected), so the
            # mean is expected to be non-negative.
            self.assertGreaterEqual(stat_mean, 0)
def job_fssdJ1q_imq_optv(p, data_source, tr, te, r, J=1, b=-0.5, null_sim=None):
    """
    FSSD with an inverse multiquadric (IMQ) kernel: optimize on tr, test on te.

    Only the test locations (V) are optimized. The kernel parameters stay
    fixed at the given b (default -0.5) and c = 1, the values recommended in

        Measuring Sample Quality with Kernels
        Jackson Gorham, Lester Mackey

    data_source is not read by this function.
    r seeds the null simulator (r) and the initial location draw (r + 1).

    Return a dict with keys "test_result", "time_secs", "goftest" and
    "opt_info".

    NOTE(review): ``alpha`` is not defined in this function; it is resolved
    from the enclosing scope - presumably a module-level constant, confirm.
    """
    h0_sim = null_sim
    if h0_sim is None:
        h0_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=r)

    train_X = tr.data()

    with util.ContextTimer() as t:
        # Fixed IMQ parameter c; b comes from the argument.
        c = 1.0

        # Settings forwarded to the location optimizer.
        optimizer_options = {
            "reg": 1e-5,
            "max_iter": 30,
            "tol_fun": 1e-6,
            "disp": True,
            "locs_bounds_frac": 20.0,
        }

        # Initial locations: draws from a Gaussian fitted to the training data.
        initial_locs = util.fit_gaussian_draw(train_X, J, seed=r + 1, reg=1e-6)

        best_locs, info = gof.IMQFSSD.optimize_locs(
            p, tr, b, c, initial_locs, **optimizer_options
        )

        # Build the test from the optimized locations and run it on te.
        imq_kernel = kernel.KIMQ(b=b, c=c)
        fssd_imq = gof.FSSD(p, imq_kernel, best_locs, null_sim=h0_sim, alpha=alpha)
        outcome = fssd_imq.perform_test(te)

    # t.secs is read after the timer context has exited.
    return {
        "test_result": outcome,
        "time_secs": t.secs,
        "goftest": fssd_imq,
        "opt_info": info,
    }
def job_fssdJ1q_opt(p, data_source, tr, te, r, J=1, null_sim=None):
    """
    FSSD with a Gaussian kernel: optimize on tr, test on te.

    Steps, all timed together:
      1. Grid-search the Gaussian width over 5 candidates, namely the squared
         median distance of tr scaled by factors 2**-3 ... 2**3.
      2. Jointly optimize the test locations and the width, starting from
         the grid-search winner and random initial locations.
      3. Build the FSSD test from the optimized values and run it on te.

    data_source is not read by this function.
    r seeds the null simulator (r) and the initial location draw (r + 1).

    Return a dict with keys "test_result", "time_secs", "goftest" and
    "opt_info".

    NOTE(review): ``alpha`` is not defined in this function; it is resolved
    from the enclosing scope - presumably a module-level constant, confirm.
    """
    h0_sim = null_sim
    if h0_sim is None:
        h0_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=r)

    train_X = tr.data()

    with util.ContextTimer() as t:
        # Candidate widths for the grid search: med2 times powers of two.
        num_candidates = 5
        scale_factors = 2.0 ** np.linspace(-3, 3, num_candidates)
        med2 = util.meddistance(train_X, 1000) ** 2

        # This kernel object is constructed but never used afterwards.
        k = kernel.KGauss(med2)

        # Initial locations: draws from a Gaussian fitted to the training data.
        initial_locs = util.fit_gaussian_draw(train_X, J, seed=r + 1, reg=1e-6)

        candidate_widths = np.hstack(((med2) * scale_factors))
        best_index, objs = gof.GaussFSSD.grid_search_gwidth(
            p, tr, initial_locs, candidate_widths
        )
        gwidth = candidate_widths[best_index]

        # Sanity checks on the grid-search winner.
        assert util.is_real_num(gwidth), "gwidth not real. Was %s" % str(gwidth)
        assert gwidth > 0, "gwidth not positive. Was %.3g" % gwidth
        logging.info("After grid search, gwidth=%.3g" % gwidth)

        # Settings forwarded to the joint location/width optimizer.
        optimizer_options = {
            "reg": 1e-2,
            "max_iter": 30,
            "tol_fun": 1e-5,
            "disp": True,
            "locs_bounds_frac": 30.0,
            "gwidth_lb": 1e-1,
            "gwidth_ub": 1e4,
        }
        best_locs, best_width, info = gof.GaussFSSD.optimize_locs_widths(
            p, tr, gwidth, initial_locs, **optimizer_options
        )

        # Build the test from the optimized parameters and run it on te.
        tuned_kernel = kernel.KGauss(best_width)
        fssd_opt = gof.FSSD(p, tuned_kernel, best_locs, null_sim=h0_sim, alpha=alpha)
        outcome = fssd_opt.perform_test(te)

    # t.secs is read after the timer context has exited.
    return {
        "test_result": outcome,
        "time_secs": t.secs,
        "goftest": fssd_opt,
        "opt_info": info,
    }
def job_fssdJ1q_imq_optbv(p, data_source, tr, te, r, J=1, null_sim=None):
    """
    FSSD with an inverse multiquadric (IMQ) kernel: optimize on tr, test on te.

    Both the test locations (V) and the kernel parameter b are optimized,
    starting from b = -0.5. The kernel parameter c is held at 1.0 by giving
    the optimizer identical lower and upper bounds for it.

    data_source is not read by this function.
    r seeds the null simulator (r) and the initial location draw (r + 1).

    Return a dict with keys "test_result", "time_secs", "goftest" and
    "opt_info".

    NOTE(review): ``alpha`` is not defined in this function; it is resolved
    from the enclosing scope - presumably a module-level constant, confirm.
    """
    h0_sim = null_sim
    if h0_sim is None:
        h0_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=r)

    train_X = tr.data()

    with util.ContextTimer() as t:
        # Starting value for b, and the fixed value for c.
        b0 = -0.5
        c = 1.0
        c0 = c

        # Settings forwarded to the optimizer.
        optimizer_options = {
            "reg": 1e-5,
            "max_iter": 40,
            "tol_fun": 1e-6,
            "disp": True,
            "locs_bounds_frac": 20.0,
            # IMQ kernel bounds; c_lb == c_ub pins c to its initial value.
            "b_lb": -20,
            "c_lb": c,
            "c_ub": c,
        }

        # Initial locations: draws from a Gaussian fitted to the training data.
        initial_locs = util.fit_gaussian_draw(train_X, J, seed=r + 1, reg=1e-6)

        best_locs, best_b, best_c, info = gof.IMQFSSD.optimize_locs_params(
            p, tr, b0, c0, initial_locs, **optimizer_options
        )

        # Build the test from the optimized parameters and run it on te.
        imq_kernel = kernel.KIMQ(b=best_b, c=best_c)
        fssd_imq = gof.FSSD(p, imq_kernel, best_locs, null_sim=h0_sim, alpha=alpha)
        outcome = fssd_imq.perform_test(te)

    # t.secs is read after the timer context has exited.
    return {
        "test_result": outcome,
        "time_secs": t.secs,
        "goftest": fssd_imq,
        "opt_info": info,
    }
def test_optimized_fssd(self):
    """
    Check that FSSD with optimized locations and Gaussian width rejects H0
    when the sample mean is shifted by 4 relative to the model.

    The model is an isotropic normal with zero mean and unit variance. Runs
    for d in {1, 3} and J in {1, 4}. Parameters are optimized on a 30%
    training split and the test is performed on the remaining split.
    """
    seed = 4
    n = 179  # sample size
    alpha = 0.01

    # Optimizer settings; identical for every (d, J) combination.
    optimizer_options = {
        "reg": 1e-2,
        "max_iter": 10,
        "tol_fun": 1e-3,
        "disp": False,
    }

    for d in [1, 3]:
        mean = np.zeros(d)
        variance = 1.0
        p = density.IsotropicNormal(mean, variance)

        # Mean difference. Obvious reject.
        source = data.DSIsotropicNormal(mean + 4, variance + 0)
        dat = source.sample(n, seed=seed)

        for J in [1, 4]:
            tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed + 1)
            train_X = tr.X

            # Initial width (median heuristic) and random initial locations.
            initial_width = util.meddistance(train_X, subsample=1000) ** 2
            initial_locs = util.fit_gaussian_draw(train_X, J, seed=seed + 1)

            best_locs, best_width, opt_result = gof.GaussFSSD.optimize_locs_widths(
                p, tr, initial_width, initial_locs, **optimizer_options
            )

            # Construct the test from the optimized parameters.
            tuned_kernel = kernel.KGauss(best_width)
            h0_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
            fssd_opt = gof.FSSD(
                p, tuned_kernel, best_locs, null_sim=h0_sim, alpha=alpha
            )
            outcome = fssd_opt.perform_test(te, return_simulated_stats=True)
            assert outcome["h0_rejected"]
def test_auto_init_opt_fssd(self):
    """
    Check that FSSD-opt with automatic parameter initialization
    (gof.GaussFSSD.optimize_auto_init) rejects H0 when the sample mean is
    shifted by 4 relative to the model.

    The model is an isotropic normal with zero mean and unit variance. Runs
    for d in {1, 4} and J in {1, 3}. Parameters are optimized on a 30%
    training split and the test is performed on the remaining split.
    """
    seed = 5
    n = 191  # sample size
    alpha = 0.01

    # Optimizer settings; identical for every (d, J) combination.
    optimizer_options = {
        "reg": 1e-2,
        "max_iter": 10,
        "tol_fun": 1e-3,
        "disp": False,
    }

    for d in [1, 4]:
        mean = np.zeros(d)
        variance = 1.0
        p = density.IsotropicNormal(mean, variance)

        # Mean difference. Obvious reject.
        source = data.DSIsotropicNormal(mean + 4, variance + 0)
        dat = source.sample(n, seed=seed)

        for J in [1, 3]:
            tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed + 1)

            best_locs, best_width, opt_result = gof.GaussFSSD.optimize_auto_init(
                p, tr, J, **optimizer_options
            )

            # Construct the test from the optimized parameters.
            tuned_kernel = kernel.KGauss(best_width)
            h0_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
            fssd_opt = gof.FSSD(
                p, tuned_kernel, best_locs, null_sim=h0_sim, alpha=alpha
            )
            outcome = fssd_opt.perform_test(te, return_simulated_stats=True)
            assert outcome["h0_rejected"]
def test_basic(self):
    """
    Basic sanity check: the p-value reported by the FSSD test lies in [0, 1].

    The model is an isotropic normal (zero mean, variance 1). The sample is
    util.randn output scaled by sqrt(variance + 1) with no mean shift.
    Runs for d in {1, 4} and J in {1, 3}.
    """
    seed = 12
    n = 100  # sample size
    alpha = 0.01

    for d in [1, 4]:
        model_mean = np.zeros(d)
        model_variance = 1
        isonorm = density.IsotropicNormal(model_mean, model_variance)

        # The mean is left unshifted here. An alternative that shifts only
        # one dimension of the mean would be
        #   mean + np.hstack((1, np.zeros(d-1)))
        sample_mean = model_mean + 0
        sample_variance = model_variance + 1
        X = util.randn(n, d, seed=seed) * np.sqrt(sample_variance) + sample_mean
        dat = data.Data(X)

        for J in [1, 3]:
            # Gaussian kernel, width from the median heuristic.
            width2 = util.meddistance(X, subsample=1000) ** 2
            gauss_kernel = kernel.KGauss(width2)

            # Random test locations drawn from a Gaussian fitted to X.
            locations = util.fit_gaussian_draw(X, J, seed=seed + 1)
            h0_sim = gof.FSSDH0SimCovObs(n_simulate=200, seed=3)
            fssd = gof.FSSD(
                isonorm, gauss_kernel, locations, null_sim=h0_sim, alpha=alpha
            )

            outcome = fssd.perform_test(dat, return_simulated_stats=True)

            # A p-value must be a probability.
            self.assertGreaterEqual(outcome["pvalue"], 0)
            self.assertLessEqual(outcome["pvalue"], 1)