def job_fssdJ1q_med(p, data_source, tr, te, r, J=1, null_sim=None):
    """
    Run an FSSD test with a Gaussian kernel on the pooled sample tr + te.

    No train/test split is used: tr and te are combined with `+` and the
    test is performed on the combined data. The Gaussian kernel parameter
    is the square of util.meddistance(X, subsample=1000) (median
    heuristic), and the J test locations come from
    util.fit_gaussian_draw seeded with r + 3.

    The significance level `alpha` is not a parameter; it is read from
    the enclosing module scope.

    p: an UnnormalizedDensity
    data_source: a DataSource (not used in the body)
    tr, te: Data
    r: trial number (positive integer); also used to derive the seeds
    J: number of test locations requested from util.fit_gaussian_draw
    null_sim: null simulator handed to gof.FSSD. When None, a
        gof.FSSDH0SimCovObs(n_simulate=2000, seed=r) is created.

    Return a dictionary with keys 'goftest' (the gof.FSSD object),
    'test_result' (output of perform_test) and 'time_secs' (the `secs`
    attribute of the util.ContextTimer wrapped around the test).
    """
    null_sim = (
        gof.FSSDH0SimCovObs(n_simulate=2000, seed=r)
        if null_sim is None
        else null_sim
    )

    # Pool both splits; the whole sample is used for testing.
    pooled = tr + te
    samples = pooled.data()

    with util.ContextTimer() as timer:
        # Median heuristic for the Gaussian width.
        median_dist = util.meddistance(samples, subsample=1000)
        gauss_kernel = kernel.KGauss(median_dist**2)
        test_locs = util.fit_gaussian_draw(samples, J, seed=r + 3)

        fssd_test = gof.FSSD(
            p, gauss_kernel, test_locs, null_sim=null_sim, alpha=alpha)
        outcome = fssd_test.perform_test(pooled)

    # timer.secs is read only after the `with` block has exited.
    return {
        'goftest': fssd_test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
def test_ustat_h1_mean_variance(self):
    """
    Check the outputs of gof.FSSD.ustat_h1_mean_variance.

    The model is an isotropic normal with zero mean and unit variance,
    while the sample is drawn with mean 2 and variance 2. For each
    dimension in (1, 4) and each J in (1, 3), both the returned mean and
    the returned variance must be non-negative.
    """
    seed = 20
    sample_size = 200
    alpha = 0.01

    for dim in (1, 4):
        model_mean = np.zeros(dim)
        model_var = 1
        model = density.IsotropicNormal(model_mean, model_var)

        # The sample is shifted by 2 in the mean and has variance 2.
        shifted_mean = model_mean + 2
        sample_std = np.sqrt(model_var + 1)
        X = util.randn(sample_size, dim, seed=seed) * sample_std \
            + shifted_mean
        # NOTE(review): this Data object is never used below; it is still
        # constructed so that the test behaves exactly as before.
        data.Data(X)

        for J in (1, 3):
            width = util.meddistance(X, subsample=1000)**2
            gauss_kernel = kernel.KGauss(width)

            # random test locations
            locs = util.fit_gaussian_draw(X, J, seed=seed + 1)
            simulator = gof.FSSDH0SimCovObs(n_simulate=200, seed=3)
            fssd = gof.FSSD(
                model, gauss_kernel, locs, null_sim=simulator, alpha=alpha)

            features = fssd.feature_tensor(X)
            stat_mean, stat_var = gof.FSSD.ustat_h1_mean_variance(features)

            self.assertGreaterEqual(stat_var, 0)
            # The sample differs from the model, so H0 should be rejected.
            self.assertGreaterEqual(stat_mean, 0)
def __init__(self, p, q, k, l, V, W, alpha=0.01):
    """
    Store the kernels and test locations, and build one FSSD object per
    model.

    :param p: a kmod.density.UnnormalizedDensity (model 1)
    :param q: a kmod.density.UnnormalizedDensity (model 2)
    :param k: a DifferentiableKernel for defining the Stein function
        class of p
    :param l: a DifferentiableKernel for defining the Stein function
        class of q
    :param V: Jp x d numpy array of Jp test locations used in
        FSSD(p, k, V)
    :param W: Jq x d numpy array of Jq test locations used in
        FSSD(q, l, W)
    :param alpha: significance level of the test
    """
    super(DC_FSSD, self).__init__(p, q, alpha)

    # Kernels first, then test locations (same order as before).
    self.k, self.l = k, l
    self.V, self.W = V, W

    # Both FSSD objects are created without a null simulator and share
    # the same significance level.
    shared = {'null_sim': None, 'alpha': alpha}
    self.fssdp = gof.FSSD(p=p, k=k, V=V, **shared)
    self.fssdq = gof.FSSD(p=q, k=l, V=W, **shared)
def job_fssdJ1q_imq_optv(p, data_source, tr, te, r, J=1, b=-0.5,
                         null_sim=None):
    """
    FSSD with an inverse multiquadric (IMQ) kernel: optimize the test
    locations on tr, then test on te.

    Only the test locations V are optimized
    (gof.IMQFSSD.optimize_locs). The kernel parameters stay fixed: b is
    the argument of this function (default -0.5) and c is set to 1.0.
    According to the original note, b = -0.5, c = 1 are the values
    recommended in

        Measuring Sample Quality with Kernels
        Jackson Gorham, Lester Mackey

    The significance level `alpha` is not a parameter; it is read from
    the enclosing module scope. `data_source` is not used in the body.

    Return a dictionary with keys 'test_result', 'time_secs', 'goftest'
    and 'opt_info'.
    """
    if null_sim is None:
        null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=r)

    train_X = tr.data()
    with util.ContextTimer() as timer:
        # b comes from the caller; c is held fixed.
        c = 1.0

        # Initialize the locations by fitting a Gaussian to the training
        # data and drawing J points from it.
        init_locs = util.fit_gaussian_draw(train_X, J, seed=r + 1, reg=1e-6)

        best_locs, opt_info = gof.IMQFSSD.optimize_locs(
            p, tr, b, c, init_locs,
            reg=1e-5,
            max_iter=30,
            tol_fun=1e-6,
            disp=True,
            locs_bounds_frac=20.0,
        )

        # Build the test from the optimized locations and run it on te.
        imq_kernel = kernel.KIMQ(b=b, c=c)
        fssd_test = gof.FSSD(
            p, imq_kernel, best_locs, null_sim=null_sim, alpha=alpha)
        outcome = fssd_test.perform_test(te)

    # timer.secs is read only after the `with` block has exited.
    return {
        'test_result': outcome,
        'time_secs': timer.secs,
        'goftest': fssd_test,
        'opt_info': opt_info,
    }
def job_fssdJ1q_opt(p, data_source, tr, te, r, J=1, null_sim=None):
    """
    FSSD with a Gaussian kernel: optimize on tr, then test on te.

    Steps, all executed inside the timed section:
      1. Form 5 candidate Gaussian widths by scaling the squared
         util.meddistance of the training data with factors 2**-3 .. 2**3.
      2. Draw J initial test locations with util.fit_gaussian_draw.
      3. Choose the initial width with
         gof.GaussFSSD.grid_search_gwidth.
      4. Jointly optimize locations and width with
         gof.GaussFSSD.optimize_locs_widths.
      5. Build gof.FSSD from the optimized values and test on te.

    The significance level `alpha` is not a parameter; it is read from
    the enclosing module scope. `data_source` is not used in the body.

    Return a dictionary with keys 'test_result', 'time_secs', 'goftest'
    and 'opt_info'.
    """
    if null_sim is None:
        null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=r)

    train_X = tr.data()
    with util.ContextTimer() as timer:
        # Grid of multiplicative factors used to initialize the width.
        scale_factors = 2.0**np.linspace(-3, 3, 5)
        med_sq = util.meddistance(train_X, 1000)**2
        # NOTE(review): this kernel object is never used afterwards; it
        # is still constructed so the block behaves exactly as before.
        kernel.KGauss(med_sq * 2)

        # Fit a Gaussian to the data and draw from it to initialize V0.
        init_locs = util.fit_gaussian_draw(train_X, J, seed=r + 1, reg=1e-6)

        width_candidates = np.hstack(med_sq * scale_factors)
        best_index, _ = gof.GaussFSSD.grid_search_gwidth(
            p, tr, init_locs, width_candidates)
        gwidth = width_candidates[best_index]
        assert util.is_real_num(gwidth), \
            'gwidth not real. Was %s' % str(gwidth)
        assert gwidth > 0, 'gwidth not positive. Was %.3g' % gwidth
        logging.info('After grid search, gwidth=%.3g' % gwidth)

        best_locs, best_width, opt_info = \
            gof.GaussFSSD.optimize_locs_widths(
                p, tr, gwidth, init_locs,
                reg=1e-2,
                max_iter=40,
                tol_fun=1e-4,
                disp=True,
                locs_bounds_frac=10.0,
                gwidth_lb=1e-1,
                gwidth_ub=1e4,
            )

        # Build the test from the optimized parameters and run it on te.
        tuned_kernel = kernel.KGauss(best_width)
        fssd_test = gof.FSSD(
            p, tuned_kernel, best_locs, null_sim=null_sim, alpha=alpha)
        outcome = fssd_test.perform_test(te)

    # timer.secs is read only after the `with` block has exited.
    return {
        'test_result': outcome,
        'time_secs': timer.secs,
        'goftest': fssd_test,
        'opt_info': opt_info,
    }
def job_fssdJ1q_imq_optbv(p, data_source, tr, te, r, J=1, null_sim=None):
    """
    FSSD with an inverse multiquadric (IMQ) kernel: optimize on tr, then
    test on te.

    The test locations V and the kernel parameter b are optimized with
    gof.IMQFSSD.optimize_locs_params, starting from b = -0.5. The kernel
    parameter c is held at 1.0 by passing the same value as both its
    lower and upper bound.

    The significance level `alpha` is not a parameter; it is read from
    the enclosing module scope. `data_source` is not used in the body.

    Return a dictionary with keys 'test_result', 'time_secs', 'goftest'
    and 'opt_info'.
    """
    if null_sim is None:
        null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=r)

    train_X = tr.data()
    with util.ContextTimer() as timer:
        # Starting point for b; c is pinned to a single value.
        init_b = -0.5
        fixed_c = 1.0

        # Fit a Gaussian to the data and draw from it to initialize V0.
        init_locs = util.fit_gaussian_draw(train_X, J, seed=r + 1, reg=1e-6)

        best_locs, best_b, best_c, opt_info = \
            gof.IMQFSSD.optimize_locs_params(
                p, tr, init_b, fixed_c, init_locs,
                reg=1e-5,
                max_iter=40,
                tol_fun=1e-6,
                disp=True,
                locs_bounds_frac=20.0,
                # IMQ kernel bounds; equal c bounds keep c fixed.
                b_lb=-20,
                c_lb=fixed_c,
                c_ub=fixed_c,
            )

        # Build the test from the optimized parameters and run it on te.
        imq_kernel = kernel.KIMQ(b=best_b, c=best_c)
        fssd_test = gof.FSSD(
            p, imq_kernel, best_locs, null_sim=null_sim, alpha=alpha)
        outcome = fssd_test.perform_test(te)

    # timer.secs is read only after the `with` block has exited.
    return {
        'test_result': outcome,
        'time_secs': timer.secs,
        'goftest': fssd_test,
        'opt_info': opt_info,
    }
def test_optimized_fssd(self):
    """
    Test FSSD test with parameter optimization.

    The model is a zero-mean, unit-variance isotropic normal and the
    data are sampled from an isotropic normal whose mean is shifted by 4,
    so the test is expected to reject H0 for every (d, J) combination.
    """
    seed = 4
    sample_size = 179
    alpha = 0.01
    opt_settings = {
        'reg': 1e-2,
        'max_iter': 10,
        'tol_fun': 1e-3,
        'disp': False,
    }

    for dim in (1, 3):
        model_mean = np.zeros(dim)
        model_var = 1.0
        p = density.IsotropicNormal(model_mean, model_var)

        # Mean difference. Obvious reject.
        source = data.DSIsotropicNormal(model_mean + 4, model_var + 0)
        dat = source.sample(sample_size, seed=seed)

        for J in (1, 4):
            tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed + 1)
            train_X = tr.X
            init_width = util.meddistance(train_X, subsample=1000)**2

            # random test locations
            init_locs = util.fit_gaussian_draw(train_X, J, seed=seed + 1)
            best_locs, best_width, _ = gof.GaussFSSD.optimize_locs_widths(
                p, tr, init_width, init_locs, **opt_settings)

            # Construct the test from the optimized parameters.
            tuned_kernel = kernel.KGauss(best_width)
            simulator = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
            fssd_test = gof.FSSD(
                p, tuned_kernel, best_locs, null_sim=simulator, alpha=alpha)
            outcome = fssd_test.perform_test(
                te, return_simulated_stats=True)
            assert outcome['h0_rejected']
def test_auto_init_opt_fssd(self):
    """
    Test FSSD-opt test with automatic parameter initialization.

    Parameters are obtained from gof.GaussFSSD.optimize_auto_init on the
    training split. The data are sampled with a mean shifted by 4
    relative to the model, so H0 is expected to be rejected for every
    (d, J) combination.
    """
    seed = 5
    sample_size = 191
    alpha = 0.01
    opt_settings = {
        'reg': 1e-2,
        'max_iter': 10,
        'tol_fun': 1e-3,
        'disp': False,
    }

    for dim in (1, 4):
        model_mean = np.zeros(dim)
        model_var = 1.0
        p = density.IsotropicNormal(model_mean, model_var)

        # Mean difference. Obvious reject.
        source = data.DSIsotropicNormal(model_mean + 4, model_var + 0)
        dat = source.sample(sample_size, seed=seed)

        for J in (1, 3):
            tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed + 1)
            best_locs, best_width, _ = gof.GaussFSSD.optimize_auto_init(
                p, tr, J, **opt_settings)

            # Construct the test from the optimized parameters.
            tuned_kernel = kernel.KGauss(best_width)
            simulator = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
            fssd_test = gof.FSSD(
                p, tuned_kernel, best_locs, null_sim=simulator, alpha=alpha)
            outcome = fssd_test.perform_test(
                te, return_simulated_stats=True)
            assert outcome['h0_rejected']
def test_basic(self):
    """
    Nothing special. Just test basic things.

    Runs the FSSD test for d in (1, 4) and J in (1, 3) on a sample whose
    mean matches the model but whose variance is larger by 1, and checks
    that the reported p-value lies in [0, 1].
    """
    seed = 12
    sample_size = 100
    alpha = 0.01

    for dim in (1, 4):
        model_mean = np.zeros(dim)
        model_var = 1
        model = density.IsotropicNormal(model_mean, model_var)

        # No shift in the mean; only the variance differs from the model.
        sample_mean = model_mean + 0
        sample_std = np.sqrt(model_var + 1)
        X = util.randn(sample_size, dim, seed=seed) * sample_std \
            + sample_mean
        dat = data.Data(X)

        for J in (1, 3):
            width = util.meddistance(X, subsample=1000)**2
            gauss_kernel = kernel.KGauss(width)

            # random test locations
            locs = util.fit_gaussian_draw(X, J, seed=seed + 1)
            simulator = gof.FSSDH0SimCovObs(n_simulate=200, seed=3)
            fssd = gof.FSSD(
                model, gauss_kernel, locs, null_sim=simulator, alpha=alpha)

            outcome = fssd.perform_test(dat, return_simulated_stats=True)

            # A p-value must be a valid probability.
            pvalue = outcome['pvalue']
            self.assertGreaterEqual(pvalue, 0)
            self.assertLessEqual(pvalue, 1)