def test_MI():
    """Run MI with an OLS analysis model on ndarray and DataFrame input.

    A 200 x 4 Gaussian data set with three missing values per column is
    imputed with BayesGaussMI.  Column 0 is regressed on the remaining
    three columns, and the fitted parameters and their covariance are
    compared to stored reference values.  The same check is run once with
    the data as a numpy array and once as a pandas DataFrame.
    """
    np.random.seed(414)
    x = np.random.normal(size=(200, 4))
    x[[1, 3, 9], 0] = np.nan
    x[[1, 4, 3], 1] = np.nan
    x[[2, 11, 21], 2] = np.nan
    x[[11, 22, 99], 3] = np.nan

    def model_args_fn(x):
        # Return endog, exog
        # Regress column 0 on all remaining columns (x1, x2 and x3)
        if isinstance(x, np.ndarray):
            return (x[:, 0], x[:, 1:])
        return (x.iloc[:, 0].values, x.iloc[:, 1:].values)

    # Reference values shared by both input types.
    expected_params = np.r_[-0.05347919, -0.02479701, 0.10075517]
    expected_cov = np.asarray([[0.00418232, 0.00029746, -0.00035057],
                               [0.00029746, 0.00407264, 0.00019496],
                               [-0.00035057, 0.00019496, 0.00509413]])

    # Test with ndarray and pandas input
    for data in (x, pd.DataFrame(x)):
        # Re-seed so both input types see the same random stream.
        np.random.seed(2342)
        imp = BayesGaussMI(data.copy())
        mi = MI(imp, sm.OLS, model_args_fn, burn=0)
        r = mi.fit()
        r.summary()  # smoke test

        # TODO: why does the test tolerance need to be so slack?
        # There is unexpected variation across versions on travis.
        # NOTE(review): the trailing positional arguments are presumably
        # rtol and atol of numpy.testing.assert_allclose -- confirm.
        assert_allclose(r.params, expected_params, 0.25, 0)
        assert_allclose(r.cov_params(), expected_cov, 0.3, 0)
def test_mi_formula():
    """Run MI with an OLS analysis model specified through a formula.

    The data are a 200 x 4 Gaussian array with three missing values per
    column, wrapped in a DataFrame with columns y, x1, x2, x3.  The fitted
    parameters and their covariance are compared to stored reference
    values.
    """
    np.random.seed(414)
    raw = np.random.normal(size=(200, 4))

    # Rows set to NaN, listed per column (column 0 first).
    missing_rows = ([1, 3, 9], [1, 4, 3], [2, 11, 21], [11, 22, 99])
    for col, rows in enumerate(missing_rows):
        raw[rows, col] = np.nan

    names = ("y", "x1", "x2", "x3")
    frame = pd.DataFrame({name: raw[:, k] for k, name in enumerate(names)})

    formula = "y ~ 0 + x1 + x2 + x3"

    np.random.seed(2342)
    imputer = BayesGaussMI(frame.copy())
    pooled = MI(imputer, sm.OLS, formula=formula, burn=0).fit()
    pooled.summary()  # smoke test

    # TODO: why does the test tolerance need to be so slack?
    # There is unexpected variation across versions on travis.
    expected_params = np.r_[-0.05347919, -0.02479701, 0.10075517]
    assert_allclose(pooled.params, expected_params, 0.25, 0)

    expected_cov = np.asarray([[0.00418232, 0.00029746, -0.00035057],
                               [0.00029746, 0.00407264, 0.00019496],
                               [-0.00035057, 0.00019496, 0.00509413]])
    assert_allclose(pooled.cov_params(), expected_cov, 0.3, 0)
def test_MI():
    """Run MI with an OLS analysis model on ndarray and DataFrame input.

    A 200 x 4 Gaussian data set with three missing values per column is
    imputed with BayesGaussMI.  Column 0 is regressed on the remaining
    three columns, and the fitted parameters and their covariance are
    compared to stored reference values.  The same check is run once with
    the data as a numpy array and once as a pandas DataFrame.
    """
    np.random.seed(414)
    x = np.random.normal(size=(200, 4))
    x[[1, 3, 9], 0] = np.nan
    x[[1, 4, 3], 1] = np.nan
    x[[2, 11, 21], 2] = np.nan
    x[[11, 22, 99], 3] = np.nan

    def model_args_fn(x):
        # Return endog, exog
        # Regress column 0 on all remaining columns (x1, x2 and x3)
        if isinstance(x, np.ndarray):
            return (x[:, 0], x[:, 1:])
        return (x.iloc[:, 0].values, x.iloc[:, 1:].values)

    # Reference values shared by both input types.
    expected_params = np.r_[-0.05347919, -0.02479701, 0.10075517]
    expected_cov = np.asarray([[0.00418232, 0.00029746, -0.00035057],
                               [0.00029746, 0.00407264, 0.00019496],
                               [-0.00035057, 0.00019496, 0.00509413]])

    # Test with ndarray and pandas input
    for data in (x, pd.DataFrame(x)):
        # Re-seed so both input types see the same random stream.
        np.random.seed(2342)
        imp = BayesGaussMI(data.copy())
        mi = MI(imp, sm.OLS, model_args_fn, burn=0)
        r = mi.fit()
        r.summary()  # smoke test

        # TODO: why does the test tolerance need to be so slack?
        # There is unexpected variation across versions on travis.
        # NOTE(review): the trailing positional arguments are presumably
        # rtol and atol of numpy.testing.assert_allclose -- confirm.
        assert_allclose(r.params, expected_params, 0.25, 0)
        assert_allclose(r.cov_params(), expected_cov, 0.3, 0)
def test_mi_formula():
    """Check the formula interface of MI against stored reference values.

    Builds a DataFrame (columns y, x1, x2, x3) from a 200 x 4 Gaussian
    array in which three entries per column are NaN, fits
    ``y ~ 0 + x1 + x2 + x3`` by OLS through MI, and compares the
    parameters and covariance to the reference values.
    """
    np.random.seed(414)
    values = np.random.normal(size=(200, 4))

    # (rows, column) pairs that are made missing.
    for rows, col in (([1, 3, 9], 0), ([1, 4, 3], 1),
                      ([2, 11, 21], 2), ([11, 22, 99], 3)):
        values[rows, col] = np.nan

    columns = {}
    for pos, label in enumerate(["y", "x1", "x2", "x3"]):
        columns[label] = values[:, pos]
    data = pd.DataFrame(columns)

    np.random.seed(2342)
    imputer = BayesGaussMI(data.copy())
    analysis = MI(imputer, sm.OLS, formula="y ~ 0 + x1 + x2 + x3", burn=0)
    fit = analysis.fit()
    fit.summary()  # smoke test

    # TODO: why does the test tolerance need to be so slack?
    # There is unexpected variation across versions on travis.
    want_params = np.r_[-0.05347919, -0.02479701, 0.10075517]
    want_cov = np.asarray([[0.00418232, 0.00029746, -0.00035057],
                           [0.00029746, 0.00407264, 0.00019496],
                           [-0.00035057, 0.00019496, 0.00509413]])
    assert_allclose(fit.params, want_params, 0.25, 0)
    assert_allclose(fit.cov_params(), want_cov, 0.3, 0)
def test_MI_stat():
    """Test MI in a setting where the statistical answer is known.

    The analysis model is x0 ~ x1 with standard error 1/sqrt(n) for the
    slope parameter.  The nominal n is 1000, but half of the cases have
    missing x1.  A third variable x2 is then introduced that is either
    independent of x1 or almost perfectly correlated with x1.  In the
    first case the SE is 1/sqrt(500), in the second case the SE is
    1/sqrt(1000).
    """
    np.random.seed(414)
    z = np.random.normal(size=(1000, 3))
    z[:, 0] += 0.5 * z[:, 1]

    def model_args(x):
        # Return endog, exog
        # Regress column 0 on column 1
        return (x[:, 0], x[:, 1])

    # Each case: (degree to which x2 proxies for x1, expected SE,
    # expected FMI)
    cases = (
        (0, 1 / np.sqrt(500), 0.5),
        (0.9999, 1 / np.sqrt(1000), 0),
    )

    for rho, se_expected, fmi_expected in cases:
        x = z.copy()
        x[:, 2] = rho * x[:, 1] + np.sqrt(1 - rho**2) * x[:, 2]
        x[0:500, 1] = np.nan

        np.random.seed(2342)
        imp = BayesGaussMI(x.copy())
        mi = MI(imp, sm.OLS, model_args, nrep=100, skip=10)
        result = mi.fit()

        # Check the SE
        se_rel_err = np.abs(result.bse[0] - se_expected) / se_expected
        assert se_rel_err < 0.03

        # Check the FMI
        fmi_abs_err = np.abs(result.fmi[0] - fmi_expected)
        assert fmi_abs_err < 0.05
def test_MI_stat():
    """Compare MI standard errors and FMI with their known values.

    The analysis model is x0 ~ x1 with standard error 1/sqrt(n) for the
    slope parameter.  The nominal n is 1000, but x1 is missing for half
    of the cases.  An extra variable x2 is either independent of x1 or
    almost perfectly correlated with it; the SE is 1/sqrt(500) in the
    first case and 1/sqrt(1000) in the second.
    """
    np.random.seed(414)
    base = np.random.normal(size=(1000, 3))
    base[:, 0] += 0.5 * base[:, 1]

    def model_args(x):
        # Return endog, exog
        # Regress column 0 on column 1
        return (x[:, 0], x[:, 1])

    # Control the degree to which x2 proxies for x1
    proxy_strength = (0, 0.9999)
    se_target = [1 / np.sqrt(500), 1 / np.sqrt(1000)]
    fmi_target = [0.5, 0]

    for rho, se_want, fmi_want in zip(proxy_strength, se_target, fmi_target):
        data = base.copy()
        data[:, 2] = rho * data[:, 1] + np.sqrt(1 - rho**2) * data[:, 2]
        data[0:500, 1] = np.nan

        np.random.seed(2342)
        imputer = BayesGaussMI(data.copy())
        fitted = MI(imputer, sm.OLS, model_args, nrep=100, skip=10).fit()

        # Check the SE
        assert np.abs(fitted.bse[0] - se_want) / se_want < 0.03

        # Check the FMI
        assert np.abs(fitted.fmi[0] - fmi_want) < 0.05
a = dx.agg(stats) a = a.T a["Female"] = female a["Visit"] = "first" if dz is first else "last" astats.append(a) astats = pd.concat(astats, axis=0) astats = astats.rename(columns={"mean": "Mean", "len": "N"}) sname = "%s_table1.csv" % bp_var sname = sname.lower() sname = sname.replace("mean_", "") astats.to_csv(sname) # Table based on observations y = x.groupby("Female").ID.agg(len) rslt = imp.fit(results_cb=lambda x: x) mm = rslt.results[0].model nobs = sum([x.shape[1] for x in mm.exog_vc.mats[0]]) ic = [x.llf for x in rslt.results] sca = [x.scale for x in rslt.results] out.write("%s\n" % impvar) out.write("%d distinct subjects\n" % nobs) out.write("%d distinct mothers\n" % mm.n_groups) out.write("mean IC %f\n" % np.mean(ic)) out.write("mean scale %f\n" % np.mean(sca)) out.write(rslt.summary().as_text())