import numpy as np
import pandas as pd
from numpy.testing import assert_allclose

import statsmodels.api as sm
from statsmodels.imputation.bayes_mi import BayesGaussMI, MI


def test_MI():
    np.random.seed(414)
    x = np.random.normal(size=(200, 4))
    x[[1, 3, 9], 0] = np.nan
    x[[1, 4, 3], 1] = np.nan
    x[[2, 11, 21], 2] = np.nan
    x[[11, 22, 99], 3] = np.nan

    def model_args_fn(x):
        # Return endog, exog
        # Regress x0 on x1, x2 and x3
        if isinstance(x, np.ndarray):
            return (x[:, 0], x[:, 1:])
        else:
            return (x.iloc[:, 0].values, x.iloc[:, 1:].values)

    # First pass tests ndarray input, second pass tests DataFrame input
    for j in (0, 1):
        np.random.seed(2342)
        imp = BayesGaussMI(x.copy())
        mi = MI(imp, sm.OLS, model_args_fn, burn=0)
        r = mi.fit()
        r.summary()  # smoke test

        # TODO: why does the test tolerance need to be so slack?
        # There is unexpected variation across versions on travis.
        assert_allclose(r.params,
                        np.r_[-0.05347919, -0.02479701, 0.10075517],
                        0.25, 0)

        c = np.asarray([[0.00418232, 0.00029746, -0.00035057],
                        [0.00029746, 0.00407264, 0.00019496],
                        [-0.00035057, 0.00019496, 0.00509413]])
        assert_allclose(r.cov_params(), c, 0.3, 0)

        # Convert to a DataFrame for the second pass
        x = pd.DataFrame(x)

def test_mi_formula():
    np.random.seed(414)
    x = np.random.normal(size=(200, 4))
    x[[1, 3, 9], 0] = np.nan
    x[[1, 4, 3], 1] = np.nan
    x[[2, 11, 21], 2] = np.nan
    x[[11, 22, 99], 3] = np.nan
    df = pd.DataFrame({"y": x[:, 0], "x1": x[:, 1],
                       "x2": x[:, 2], "x3": x[:, 3]})
    fml = "y ~ 0 + x1 + x2 + x3"

    np.random.seed(2342)
    imp = BayesGaussMI(df.copy())
    mi = MI(imp, sm.OLS, formula=fml, burn=0)
    r = mi.fit()
    r.summary()  # smoke test

    # TODO: why does the test tolerance need to be so slack?
    # There is unexpected variation across versions on travis.
    assert_allclose(r.params,
                    np.r_[-0.05347919, -0.02479701, 0.10075517],
                    0.25, 0)

    c = np.asarray([[0.00418232, 0.00029746, -0.00035057],
                    [0.00029746, 0.00407264, 0.00019496],
                    [-0.00035057, 0.00019496, 0.00509413]])
    assert_allclose(r.cov_params(), c, 0.3, 0)
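
# The pooling performed inside MI.fit() follows Rubin's rules.  The
# sketch below is an illustration only, not part of the test suite: it
# draws imputed datasets directly from BayesGaussMI, fits OLS to each,
# and pools the estimates by hand.  It assumes only that BayesGaussMI
# exposes update() and .data, which the MI class itself relies on; the
# helper name is hypothetical.
def _manual_rubin_sketch(nrep=20):
    np.random.seed(414)
    x = np.random.normal(size=(200, 3))
    x[::17, 1] = np.nan  # introduce missing values in x1

    imp = BayesGaussMI(x.copy())
    params, covs = [], []
    for _ in range(nrep):
        imp.update()  # one Gibbs step; refreshes imp.data in place
        r = sm.OLS(imp.data[:, 0], imp.data[:, 1:]).fit()
        params.append(r.params)
        covs.append(r.cov_params())

    params = np.asarray(params)
    qbar = params.mean(0)              # pooled point estimate
    ubar = np.mean(covs, axis=0)       # within-imputation variance
    b = np.cov(params.T, ddof=1)       # between-imputation variance
    t = ubar + (1 + 1 / nrep) * b      # Rubin's total variance
    return qbar, np.sqrt(np.diag(t))   # pooled params and SEs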

def make_imp(fml):
    # Build an MI analysis around the project's custom `mimi` imputer;
    # `mimi`, the arguments passed to it, and `n_imp` come from the
    # enclosing script's scope.
    imp = MI(mimi(impvar, vx, vb, mn, proj, varls, df, log, bp_var, bp_dir),
             sm.OLS,
             formula=fml,
             model_kwds_fn=lambda x: {"data": x},
             burn=0,
             nrep=n_imp,
             skip=0)
    return imp
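
# Hypothetical usage of the helper above (the formula is illustrative):
#
#     imp = make_imp("y ~ x1 + x2")
#     rslt = imp.fit()
#     rslt.summary()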

def test_MI_stat():
    # Test for MI where we know statistically what should happen.  The
    # analysis model is x0 ~ x1, with standard error 1/sqrt(n) for the
    # slope parameter.  The nominal n is 1000, but half of the cases
    # have missing x1.  Then we introduce x2, which is either
    # independent of x1 or almost perfectly correlated with x1.  In the
    # first case the SE is 1/sqrt(500); in the second case the SE is
    # 1/sqrt(1000).
    np.random.seed(414)
    z = np.random.normal(size=(1000, 3))
    z[:, 0] += 0.5 * z[:, 1]

    # Expected SE and FMI for the two scenarios
    exp = [1 / np.sqrt(500), 1 / np.sqrt(1000)]
    fmi = [0.5, 0]

    # rho controls the degree to which x2 proxies for x1
    for j, rho in enumerate((0, 0.9999)):
        x = z.copy()
        x[:, 2] = rho * x[:, 1] + np.sqrt(1 - rho**2) * x[:, 2]
        x[0:500, 1] = np.nan

        def model_args(x):
            # Return endog, exog
            # Regress x0 on x1
            return (x[:, 0], x[:, 1])

        np.random.seed(2342)
        imp = BayesGaussMI(x.copy())
        mi = MI(imp, sm.OLS, model_args, nrep=100, skip=10)
        r = mi.fit()

        # Check the SE
        d = np.abs(r.bse[0] - exp[j]) / exp[j]
        assert d < 0.03

        # Check the FMI
        d = np.abs(r.fmi[0] - fmi[j])
        assert d < 0.05
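
# Sanity check for the expected values used above: with standard normal
# x1 and unit-variance errors, the no-intercept OLS slope SE is
# sigma / sqrt(sum(x1**2)), which is approximately 1/sqrt(n).
# Illustration only, not part of the test suite; the helper name is
# hypothetical.
def _slope_se_sketch(n=1000):
    np.random.seed(0)
    x1 = np.random.normal(size=n)
    y = 0.5 * x1 + np.random.normal(size=n)
    r = sm.OLS(y, x1).fit()
    return r.bse[0], 1 / np.sqrt(n)  # these should nearly agree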
"groups": "MomIdUnique", "data": x, "re_formula": "1", "vc_formula": vcf } def fit_kwds_fn(x): return {"method": "lbfgs", "reml": False} imp = MI(mimi(impvar, vx, vb, mn, proj, varls, df, log, bp_var, bp_dir), sm.MixedLM, model_args_fn=None, formula=fml, model_kwds_fn=model_kwds_fn, fit_kwds=fit_kwds_fn, burn=0, nrep=20, skip=0) if (impvar == "BMI") and (ndim == 1) and controlcbs: import json f = open("centering.json") centering = json.load(f) f.close() # Create a table 1, based on counting people m = mimi(impvar, vx, vb, mn, proj, varls, df, log, bp_var, bp_dir) m.update()