def test_missing_formula(self):
    # Formula interface with missing values: fitting with missing='drop'
    # on data containing NaN must give the same estimates as fitting on
    # data that was cleaned by hand with `dropna`.
    n_obs = 100
    endog = np.random.normal(size=n_obs)
    exog1 = np.random.normal(size=n_obs)
    exog2 = np.random.normal(size=n_obs)
    exog3 = np.random.normal(size=n_obs)
    groups = np.kron(lrange(20), np.ones(5))

    # Five distinct rows get a NaN: 0, 5, 6 (response) and 10, 11 (exog2).
    endog[0] = np.nan
    endog[5:7] = np.nan
    exog2[10:12] = np.nan

    columns = {
        "endog": endog,
        "exog1": exog1,
        "exog2": exog2,
        "exog3": exog3,
        "groups": groups,
    }
    data = pd.DataFrame(columns)
    formula = "endog ~ exog1 + exog2 + exog3"

    # Let the model drop the incomplete rows itself.
    dropped_model = GEE.from_formula(formula, groups, data, missing='drop')
    dropped_fit = dropped_model.fit()
    assert_almost_equal(len(dropped_model.endog), 95)
    assert_almost_equal(np.asarray(dropped_model.exog.shape),
                        np.r_[95, 4])

    # Remove the incomplete rows (and their group labels) manually.
    clean = data.dropna()
    clean_groups = groups[clean.index.values]
    clean_model = GEE.from_formula(formula, clean_groups, clean,
                                   missing='none')
    clean_fit = clean_model.fit()

    assert_almost_equal(dropped_fit.params.values, clean_fit.params.values)
    assert_almost_equal(dropped_fit.bse.values, clean_fit.bse.values)
def test_missing_formula(self):
    # Formula interface with missing values: fitting with missing='drop'
    # on data containing NaN must give the same estimates as fitting on
    # data that was cleaned by hand with `dropna`.
    n_obs = 100
    endog = np.random.normal(size=n_obs)
    exog1 = np.random.normal(size=n_obs)
    exog2 = np.random.normal(size=n_obs)
    exog3 = np.random.normal(size=n_obs)
    groups = np.kron(lrange(20), np.ones(5))

    # Five distinct rows get a NaN: 0, 5, 6 (response) and 10, 11 (exog2).
    endog[0] = np.nan
    endog[5:7] = np.nan
    exog2[10:12] = np.nan

    columns = {
        "endog": endog,
        "exog1": exog1,
        "exog2": exog2,
        "exog3": exog3,
        "groups": groups,
    }
    data = pd.DataFrame(columns)
    formula = "endog ~ exog1 + exog2 + exog3"

    # Let the model drop the incomplete rows itself.
    dropped_model = GEE.from_formula(formula, groups, data, missing='drop')
    dropped_fit = dropped_model.fit()
    assert_almost_equal(len(dropped_model.endog), 95)
    assert_almost_equal(np.asarray(dropped_model.exog.shape),
                        np.r_[95, 4])

    # Remove the incomplete rows (and their group labels) manually.
    clean = data.dropna()
    clean_groups = groups[clean.index.values]
    clean_model = GEE.from_formula(formula, clean_groups, clean,
                                   missing='none')
    clean_fit = clean_model.fit()

    assert_almost_equal(dropped_fit.params.values, clean_fit.params.values)
    assert_almost_equal(dropped_fit.bse.values, clean_fit.bse.values)
def test_formulas(self):
    """
    Fit one Gaussian model with an Autoregressive covariance structure
    five ways -- once from arrays, then through `from_formula` with
    `groups` and `time` supplied either as arrays or as column names --
    and require the parameter estimates to agree.
    """
    n = 100
    response = np.random.normal(size=n)
    covariate = np.random.normal(size=n)
    design = np.concatenate((np.ones((n, 1)), covariate[:, None]), axis=1)
    Time = np.random.uniform(size=n)
    groups = np.kron(lrange(20), np.ones(5))

    data = pd.DataFrame({"Y": response, "X1": covariate,
                         "Time": Time, "groups": groups})

    # The same covariance-structure and family objects are shared by
    # every model below.
    va = Autoregressive()
    family = Gaussian()

    # Reference fit built directly from arrays.
    array_model = GEE(response, design, groups, time=Time,
                      family=family, cov_struct=va)
    array_fit = array_model.fit()

    # Formula fits, in the order: (array, array), (array, name),
    # (name, array), (name, name) for (groups, time).
    formula_fits = []
    for groups_arg in (groups, "groups"):
        for time_arg in (Time, "Time"):
            model = GEE.from_formula("Y ~ X1", groups_arg, data,
                                     time=time_arg, family=family,
                                     cov_struct=va)
            formula_fits.append(model.fit())

    for fit in formula_fits:
        assert_almost_equal(array_fit.params, fit.params, decimal=8)

    check_wrapper(formula_fits[0])
def BuildPoissonModels(hist_data, feature_list, comp_data=None):
    """
    Build score predictions via Poisson regression, one model per team.

    Each team's score is regressed on `feature_list` using GEE with a
    Poisson family and an Independence covariance structure.  The fitted
    models are used to predict scores on the historical data and, when
    given, on the competition data, where win/tie/loss probabilities are
    derived from the Skellam distribution of the score difference.

    Parameters
    ----------
    hist_data : pandas.DataFrame
        Historical data containing ``team_1_score``, ``team_2_score`` and
        every column in `feature_list`.  Modified in place: the columns
        ``team_1_score_pred`` and ``team_2_score_pred`` are added.
    feature_list : list of str
        Column names used as regressors in both models.
    comp_data : pandas.DataFrame, optional
        Data to score.  Modified in place: predicted scores plus
        ``team_1_prob``, ``team_tie_prob`` and ``team_2_prob`` are added.

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        `hist_data` alone when `comp_data` is None, otherwise the pair
        ``(hist_data, comp_data)``.
    """
    rhs = " + ".join(feature_list)
    formula_1 = "team_1_score ~ " + rhs
    formula_2 = "team_2_score ~ " + rhs

    # Using the GEE package along with independence assumptions to fit
    # the Poisson models.
    # NOTE(review): the response column itself is passed as the `groups`
    # argument of each model -- confirm this grouping is intended.
    fam = Poisson()
    ind = Independence()

    model_1 = GEE.from_formula(formula_1, "team_1_score", hist_data,
                               cov_struct=ind, family=fam)
    model_2 = GEE.from_formula(formula_2, "team_2_score", hist_data,
                               cov_struct=ind, family=fam)

    model_1_fit = model_1.fit()
    model_2_fit = model_2.fit()

    print(model_1_fit.summary())

    hist_data['team_1_score_pred'] = model_1_fit.predict(hist_data)
    hist_data['team_2_score_pred'] = model_2_fit.predict(hist_data)

    # Return historical data if comp_data wasn't passed.
    if comp_data is None:
        return hist_data

    # Prepare comp data.
    comp_data['team_1_score_pred'] = model_1_fit.predict(
        comp_data[feature_list])
    comp_data['team_2_score_pred'] = model_2_fit.predict(
        comp_data[feature_list])

    # The Skellam functions broadcast over arrays, so all rows are
    # evaluated in one call instead of a per-row `apply`.
    mu_1 = comp_data['team_1_score_pred'].values
    mu_2 = comp_data['team_2_score_pred'].values

    # P(score_1 - score_2 > 0), P(difference == 0), P(difference <= -1).
    comp_data['team_1_prob'] = 1 - skellam.cdf(0, mu_1, mu_2)
    comp_data['team_tie_prob'] = skellam.pmf(0, mu_1, mu_2)
    comp_data['team_2_prob'] = skellam.cdf(-1, mu_1, mu_2)

    return hist_data, comp_data
def test_predict_exposure(self):
    # Poisson GEE fitted with both an offset and an exposure column:
    # `predict` called with those quantities passed explicitly must
    # reproduce the default in-sample prediction.
    n = 50
    X1 = np.random.normal(size=n)
    X2 = np.random.normal(size=n)
    groups = np.kron(np.arange(25), np.r_[1, 1])
    offset = np.random.uniform(1, 2, size=n)
    exposure = np.random.uniform(1, 2, size=n)
    rate = 0.1*(X1 + X2) + offset + np.log(exposure)
    Y = np.random.poisson(rate, size=n)

    data = pd.DataFrame({
        "Y": Y,
        "X1": X1,
        "X2": X2,
        "groups": groups,
        "offset": offset,
        "exposure": exposure,
    })

    model = GEE.from_formula("Y ~ X1 + X2", groups, data,
                             family=Poisson(), offset="offset",
                             exposure="exposure")
    result = model.fit()
    assert_equal(result.converged, True)

    baseline = result.predict()

    # Full-sample predictions with offset and/or exposure given explicitly.
    explicit = [
        result.predict(offset=data["offset"]),
        result.predict(exposure=data["exposure"]),
        result.predict(offset=data["offset"], exposure=data["exposure"]),
    ]

    # Last ten rows, once through the formula machinery ...
    tail_formula = result.predict(exog=data[-10:],
                                  offset=data["offset"][-10:],
                                  exposure=data["exposure"][-10:])

    # ... and once without patsy, using the design matrix directly.
    tail_matrix = result.predict(exog=result.model.exog[-10:],
                                 offset=data["offset"][-10:],
                                 exposure=data["exposure"][-10:],
                                 transform=False)

    for pred in explicit:
        assert_allclose(baseline, pred)
    assert_allclose(baseline[-10:], tail_formula)
    assert_allclose(baseline[-10:], tail_matrix)
def test_compare_OLS(self):
    # A Gaussian GEE with an Independence covariance structure is checked
    # against OLS on the same formula: parameters, naive standard errors
    # and naive t-values must match to 10 decimals.
    vs = Independence()
    family = Gaussian()

    n_obs = 100
    Y = np.random.normal(size=n_obs)
    X1 = np.random.normal(size=n_obs)
    X2 = np.random.normal(size=n_obs)
    X3 = np.random.normal(size=n_obs)
    groups = np.kron(lrange(20), np.ones(5))

    frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})
    formula = "Y ~ X1 + X2 + X3"

    gee_model = GEE.from_formula(formula, groups, frame, family=family,
                                 cov_struct=vs)
    gee_fit = gee_model.fit()

    # Work with the raw results object: the asserts don't work on the
    # wrapper.
    ols_fit = smf.ols(formula, data=frame).fit()._results

    assert_almost_equal(ols_fit.params, gee_fit.params, decimal=10)

    naive_se = gee_fit.standard_errors(cov_type="naive")
    assert_almost_equal(ols_fit.bse, naive_se, decimal=10)

    naive_sd = np.sqrt(np.diag(gee_fit.cov_naive))
    naive_tvalues = gee_fit.params / naive_sd
    assert_almost_equal(naive_tvalues, ols_fit.tvalues, decimal=10)
def test_predict_exposure(self):
    # Poisson GEE fitted with both an offset and an exposure column:
    # `predict` called with those quantities passed explicitly must
    # reproduce the default in-sample prediction.
    n = 50
    X1 = np.random.normal(size=n)
    X2 = np.random.normal(size=n)
    groups = np.kron(np.arange(25), np.r_[1, 1])
    offset = np.random.uniform(1, 2, size=n)
    exposure = np.random.uniform(1, 2, size=n)
    rate = 0.1*(X1 + X2) + offset + np.log(exposure)
    Y = np.random.poisson(rate, size=n)

    data = pd.DataFrame({
        "Y": Y,
        "X1": X1,
        "X2": X2,
        "groups": groups,
        "offset": offset,
        "exposure": exposure,
    })

    model = GEE.from_formula("Y ~ X1 + X2", groups, data,
                             family=Poisson(), offset="offset",
                             exposure="exposure")
    result = model.fit()
    assert_equal(result.converged, True)

    baseline = result.predict()

    # Full-sample predictions with offset and/or exposure given explicitly.
    explicit = [
        result.predict(offset=data["offset"]),
        result.predict(exposure=data["exposure"]),
        result.predict(offset=data["offset"], exposure=data["exposure"]),
    ]

    # Last ten rows, once through the formula machinery ...
    tail_formula = result.predict(exog=data[-10:],
                                  offset=data["offset"][-10:],
                                  exposure=data["exposure"][-10:])

    # ... and once without patsy, using the design matrix directly.
    tail_matrix = result.predict(exog=result.model.exog[-10:],
                                 offset=data["offset"][-10:],
                                 exposure=data["exposure"][-10:],
                                 transform=False)

    for pred in explicit:
        assert_allclose(baseline, pred)
    assert_allclose(baseline[-10:], tail_formula)
    assert_allclose(baseline[-10:], tail_matrix)
def test_sensitivity(self):
    # `params_sensitivity(0, 0.5, 2)` on an Exchangeable Gaussian fit must
    # return two results whose dependence parameters are 0.0 and 0.5; the
    # first coefficient of each is pinned as a regression value.
    va = Exchangeable()
    family = Gaussian()

    # Fixed seed so the regression values below are reproducible.
    np.random.seed(34234)
    n = 100
    Y = np.random.normal(size=n)
    X1 = np.random.normal(size=n)
    X2 = np.random.normal(size=n)
    groups = np.kron(np.arange(50), np.r_[1, 1])

    frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2})

    model = GEE.from_formula("Y ~ X1 + X2", groups, frame, family=family,
                             cov_struct=va)
    fit = model.fit()

    sens = fit.params_sensitivity(0, 0.5, 2)
    assert_almost_equal(len(sens), 2)

    dep_params = [res.cov_struct.dep_params for res in sens]
    assert_almost_equal(dep_params, [0.0, 0.5])

    # Regression test
    first_coefs = [res.params[0] for res in sens]
    assert_almost_equal(first_coefs,
                        [0.1696214707458818, 0.17836097387799127])
def t_est_missing(self):
    # Builds data with NaN in five distinct rows and checks that a model
    # created with missing='drop' keeps 95 rows and 4 design columns.
    # NOTE(review): the `t_est_` prefix presumably keeps the test runner
    # from collecting this function -- confirm whether that is deliberate.
    n_obs = 100
    Y = np.random.normal(size=n_obs)
    X1 = np.random.normal(size=n_obs)
    X2 = np.random.normal(size=n_obs)
    X3 = np.random.normal(size=n_obs)
    groups = np.kron(lrange(20), np.ones(5))

    # Rows 0, 5, 6 lose the response; rows 10, 11 lose X2.
    Y[0] = np.nan
    Y[5:7] = np.nan
    X2[10:12] = np.nan

    frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3,
                          "groups": groups})

    model = GEE.from_formula("Y ~ X1 + X2 + X3", frame, None,
                             groups=frame["groups"], missing='drop')
    # The fit result itself is not inspected; the call only has to succeed.
    model.fit()

    assert len(model.endog) == 95
    assert model.exog.shape == (95, 4)
def test_sensitivity(self):
    # `params_sensitivity(0, 0.5, 2)` on an Exchangeable Gaussian fit must
    # return two results whose dependence parameters are 0.0 and 0.5; the
    # first coefficient of each is pinned as a regression value.
    # NOTE(review): no seed is set inside this test although the final
    # assertion compares against fixed numbers -- presumably the random
    # state is fixed elsewhere; confirm.
    va = Exchangeable()
    family = Gaussian()

    n = 100
    Y = np.random.normal(size=n)
    X1 = np.random.normal(size=n)
    X2 = np.random.normal(size=n)
    groups = np.kron(np.arange(50), np.r_[1, 1])

    frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2})

    model = GEE.from_formula("Y ~ X1 + X2", groups, frame, family=family,
                             cov_struct=va)
    fit = model.fit()

    sens = fit.params_sensitivity(0, 0.5, 2)
    assert_almost_equal(len(sens), 2)

    dep_params = [res.cov_struct.dep_params for res in sens]
    assert_almost_equal(dep_params, [0.0, 0.5])

    # Regression test
    first_coefs = [res.params[0] for res in sens]
    assert_almost_equal(first_coefs, np.r_[-0.1256575, -0.126747036])
def setup_class(cls):
    """
    Load "gee_poisson_1.csv", build a formula-based Poisson GEE with an
    Independence covariance structure, and store the model and a set of
    starting parameters on the class.
    """
    endog, exog, group_n = load_data("gee_poisson_1.csv")

    family = Poisson()
    vi = Independence()

    # Test with formulas: response, group id, then every exog column
    # except the first one.
    stacked = np.column_stack((endog, group_n, exog[:, 1:]))
    names = ["Y", "Id"]
    names += ["X%d" % k for k in range(1, exog.shape[1])]
    frame = pd.DataFrame(stacked)
    frame.columns = names

    cls.mod = GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5", "Id", frame,
                               family=family, cov_struct=vi)

    cls.start_params = np.array([
        -0.03644504,
        -0.05432094,
        0.01566427,
        0.57628591,
        -0.0046566,
        -0.47709315,
    ])
def test_compare_OLS(self):
    """
    Compare a Gaussian GEE using an Independence covariance structure
    with an OLS fit of the same formula: the parameter estimates and the
    t-values based on the naive covariance must match to 10 decimals.
    """
    vs = Independence()
    family = Gaussian()

    n_obs = 100
    Y = np.random.normal(size=n_obs)
    X1 = np.random.normal(size=n_obs)
    X2 = np.random.normal(size=n_obs)
    X3 = np.random.normal(size=n_obs)
    groups = np.kron(range(20), np.ones(5))

    frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})
    formula = "Y ~ X1 + X2 + X3"

    gee_model = GEE.from_formula(formula, frame, None, groups=groups,
                                 family=family, covstruct=vs)
    gee_fit = gee_model.fit()

    ols_fit = sm.ols(formula, data=frame).fit()

    assert_almost_equal(ols_fit.params.values, gee_fit.params, decimal=10)

    naive_sd = np.sqrt(np.diag(gee_fit.naive_covariance))
    naive_tvalues = gee_fit.params / naive_sd
    assert_almost_equal(naive_tvalues, ols_fit.tvalues, decimal=10)
def test_compare_OLS(self):
    # A Gaussian GEE with an Independence covariance structure is checked
    # against OLS on the same formula: parameters, naive standard errors
    # and naive t-values must match to 10 decimals.
    vs = Independence()
    family = Gaussian()

    n_obs = 100
    Y = np.random.normal(size=n_obs)
    X1 = np.random.normal(size=n_obs)
    X2 = np.random.normal(size=n_obs)
    X3 = np.random.normal(size=n_obs)
    groups = np.kron(lrange(20), np.ones(5))

    frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})
    formula = "Y ~ X1 + X2 + X3"

    gee_model = GEE.from_formula(formula, groups, frame, family=family,
                                 cov_struct=vs)
    gee_fit = gee_model.fit()

    # Work with the raw results object: the asserts don't work on the
    # wrapper.
    ols_fit = smf.ols(formula, data=frame).fit()._results

    assert_almost_equal(ols_fit.params, gee_fit.params, decimal=10)

    naive_se = gee_fit.standard_errors(cov_type="naive")
    assert_almost_equal(ols_fit.bse, naive_se, decimal=10)

    naive_sd = np.sqrt(np.diag(gee_fit.cov_naive))
    naive_tvalues = gee_fit.params / naive_sd
    assert_almost_equal(naive_tvalues, ols_fit.tvalues, decimal=10)
def test_poisson_epil(self):
    # Fit "y ~ age + trt + base" on results/epil.csv with a Poisson GEE
    # (Independence structure, grouped by subject) and with a Poisson GLM;
    # parameters and scale must agree to 6 decimals.
    here = os.path.dirname(os.path.abspath(__file__))
    data = pd.read_csv(os.path.join(here, "results", "epil.csv"))
    formula = "y ~ age + trt + base"

    fam = Poisson()
    ind = Independence()
    gee_model = GEE.from_formula(formula, data["subject"], data,
                                 cov_struct=ind, family=fam)
    gee_fit = gee_model.fit()

    # Coefficients should agree with GLM
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod import families

    glm_model = GLM.from_formula(formula, data, family=families.Poisson())
    glm_fit = glm_model.fit(scale="X2")

    # Compare the raw results objects: the asserts don't work on the
    # wrappers.
    gee_raw = gee_fit._results
    glm_raw = glm_fit._results

    assert_almost_equal(gee_raw.params, glm_raw.params, decimal=6)
    assert_almost_equal(gee_raw.scale, glm_raw.scale, decimal=6)
def test_poisson_epil(self):
    # Fit "y ~ age + trt + base" on results/epil.csv with a Poisson GEE
    # (Independence structure, grouped by subject) and with a Poisson GLM;
    # parameters and scale must agree to 6 decimals.
    here = os.path.dirname(os.path.abspath(__file__))
    data = pd.read_csv(os.path.join(here, "results", "epil.csv"))
    formula = "y ~ age + trt + base"

    fam = Poisson()
    ind = Independence()
    gee_model = GEE.from_formula(formula, data, groups=data["subject"],
                                 cov_struct=ind, family=fam)
    gee_fit = gee_model.fit()

    # Coefficients should agree with GLM
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod import families

    glm_model = GLM.from_formula(formula, data, family=families.Poisson())
    glm_fit = glm_model.fit(scale="X2")

    assert_almost_equal(gee_fit.params, glm_fit.params, decimal=6)
    assert_almost_equal(gee_fit.scale, glm_fit.scale, decimal=6)
def test_compare_OLS(self):
    """
    Compare a Gaussian GEE using an Independence covariance structure
    with an OLS fit of the same formula: the parameter estimates and the
    t-values based on the naive covariance must match to 10 decimals.
    """
    vs = Independence()
    family = Gaussian()

    n_obs = 100
    Y = np.random.normal(size=n_obs)
    X1 = np.random.normal(size=n_obs)
    X2 = np.random.normal(size=n_obs)
    X3 = np.random.normal(size=n_obs)
    groups = np.kron(range(20), np.ones(5))

    frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})
    formula = "Y ~ X1 + X2 + X3"

    gee_model = GEE.from_formula(formula, frame, None, groups=groups,
                                 family=family, covstruct=vs)
    gee_fit = gee_model.fit()

    ols_fit = sm.ols(formula, data=frame).fit()

    assert_almost_equal(ols_fit.params.values, gee_fit.params, decimal=10)

    naive_sd = np.sqrt(np.diag(gee_fit.naive_covariance))
    naive_tvalues = gee_fit.params / naive_sd
    assert_almost_equal(naive_tvalues, ols_fit.tvalues, decimal=10)
def test_predict(self):
    # Gaussian GEE fitted with an offset column: `predict` must give the
    # same values whether the offset is implicit, explicit, or added back
    # after predicting with a zero offset, and must match a hand-computed
    # linear predictor on new data.
    n = 50
    X1 = np.random.normal(size=n)
    X2 = np.random.normal(size=n)
    groups = np.kron(np.arange(25), np.r_[1, 1])
    offset = np.random.uniform(1, 2, size=n)
    Y = np.random.normal(0.1*(X1 + X2) + offset, size=n)

    data = pd.DataFrame({
        "Y": Y,
        "X1": X1,
        "X2": X2,
        "groups": groups,
        "offset": offset,
    })

    model = GEE.from_formula("Y ~ X1 + X2", groups, data,
                             family=Gaussian(), offset="offset")
    result = model.fit()
    assert_equal(result.converged, True)

    off = data["offset"]
    regressors = ["X1", "X2"]

    baseline = result.predict()
    with_offset = result.predict(offset=off)
    with_exog_offset = result.predict(exog=data[regressors], offset=off)
    exog_zero_offset = result.predict(exog=data[regressors], offset=0*off)
    zero_offset = result.predict(offset=0*off)

    assert_allclose(baseline, with_offset)
    assert_allclose(baseline, with_exog_offset)
    assert_allclose(baseline, exog_zero_offset + off)
    assert_allclose(baseline, zero_offset + off)

    # Out-of-sample prediction without an offset.
    x1_new = np.random.normal(size=10)
    x2_new = np.random.normal(size=10)
    new_exog = pd.DataFrame({"X1": x1_new, "X2": x2_new})
    predicted = result.predict(exog=new_exog)

    params = result.params
    expected = params[0] + params[1]*x1_new + params[2]*x2_new
    assert_allclose(predicted, expected)
def test_compare_poisson(self):
    # Poisson GEE with an Independence covariance structure versus the
    # `sm.poisson` fit of the same formula: parameter values must agree
    # to 10 decimals.
    vs = Independence()
    family = Poisson()

    n_obs = 100
    uniform = np.random.uniform(size=n_obs)
    Y = np.ceil(-np.log(uniform))
    X1 = np.random.normal(size=n_obs)
    X2 = np.random.normal(size=n_obs)
    X3 = np.random.normal(size=n_obs)
    groups = np.random.randint(0, 4, size=n_obs)

    frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})
    formula = "Y ~ X1 + X2 + X3"

    gee_model = GEE.from_formula(formula, groups, frame, family=family,
                                 cov_struct=vs)
    gee_fit = gee_model.fit()

    poisson_model = sm.poisson(formula, data=frame)
    poisson_fit = poisson_model.fit(disp=False)

    assert_almost_equal(gee_fit.params.values, poisson_fit.params.values,
                        decimal=10)
def test_poisson_epil(self):
    # Fit "y ~ age + trt + base" on results/epil.csv with a Poisson GEE
    # (Independence structure, grouped by subject) and with a Poisson GLM;
    # parameters and scale must agree to 6 decimals.
    here = os.path.dirname(os.path.abspath(__file__))
    data = pd.read_csv(os.path.join(here, "results", "epil.csv"))
    formula = "y ~ age + trt + base"

    fam = Poisson()
    ind = Independence()
    gee_model = GEE.from_formula(formula, data["subject"], data,
                                 cov_struct=ind, family=fam)
    gee_fit = gee_model.fit()

    # Coefficients should agree with GLM
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod import families

    glm_model = GLM.from_formula(formula, data, family=families.Poisson())
    glm_fit = glm_model.fit(scale="X2")

    # Compare the raw results objects: the asserts don't work on the
    # wrappers.
    gee_raw = gee_fit._results
    glm_raw = glm_fit._results

    assert_almost_equal(gee_raw.params, glm_raw.params, decimal=6)
    assert_almost_equal(gee_raw.scale, glm_raw.scale, decimal=6)
def test_formulas(self):
    """
    Fit one Gaussian model with an Autoregressive covariance structure
    five ways -- once from arrays, then through `from_formula` with
    `groups` and `time` supplied either as arrays or as column names --
    and require the parameter estimates to agree.
    """
    n = 100
    response = np.random.normal(size=n)
    covariate = np.random.normal(size=n)
    design = np.concatenate((np.ones((n, 1)), covariate[:, None]), axis=1)
    Time = np.random.uniform(size=n)
    groups = np.kron(lrange(20), np.ones(5))

    data = pd.DataFrame({"Y": response, "X1": covariate,
                         "Time": Time, "groups": groups})

    # The same covariance-structure and family objects are shared by
    # every model below.
    va = Autoregressive()
    family = Gaussian()

    # Reference fit built directly from arrays.
    array_model = GEE(response, design, groups, time=Time,
                      family=family, cov_struct=va)
    array_fit = array_model.fit()

    # Formula fits, in the order: (array, array), (array, name),
    # (name, array), (name, name) for (groups, time).
    formula_fits = []
    for groups_arg in (groups, "groups"):
        for time_arg in (Time, "Time"):
            model = GEE.from_formula("Y ~ X1", groups_arg, data,
                                     time=time_arg, family=family,
                                     cov_struct=va)
            formula_fits.append(model.fit())

    for fit in formula_fits:
        assert_almost_equal(array_fit.params, fit.params, decimal=8)

    check_wrapper(formula_fits[0])
def setup_class(cls):
    """
    Fit "Y ~ X1 + X2 + X3" with the Poisson family twice on seeded random
    data -- as a GEE with an Independence covariance structure
    (`cls.result1`) and as a GLM (`cls.result2`).
    """
    vs = Independence()
    family = families.Poisson()

    np.random.seed(987126)
    n_obs = 100
    Y = np.exp(1 + np.random.normal(size=n_obs))
    X1 = np.random.normal(size=n_obs)
    X2 = np.random.normal(size=n_obs)
    X3 = np.random.normal(size=n_obs)
    groups = np.random.randint(0, 4, size=n_obs)

    frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})
    formula = "Y ~ X1 + X2 + X3"

    gee_model = GEE.from_formula(formula, groups, frame, family=family,
                                 cov_struct=vs)
    cls.result1 = gee_model.fit()

    glm_model = GLM.from_formula(formula, data=frame, family=family)
    cls.result2 = glm_model.fit(disp=False)
def test_compare_poisson(self):
    # Poisson GEE with an Independence covariance structure versus the
    # `sm.poisson` fit of the same formula: parameter values must agree
    # to 10 decimals.
    vs = Independence()
    family = Poisson()

    Y = np.ceil(-np.log(np.random.uniform(size=100)))
    X1 = np.random.normal(size=100)
    X2 = np.random.normal(size=100)
    X3 = np.random.normal(size=100)
    groups = np.random.randint(0, 4, size=100)

    D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

    md = GEE.from_formula("Y ~ X1 + X2 + X3", D, None, groups=groups,
                          family=family, covstruct=vs).fit()

    # disp=False keeps the optimizer's convergence report out of the test
    # output, as the other comparison tests already do.
    sml = sm.poisson("Y ~ X1 + X2 + X3", data=D).fit(disp=False)

    assert_almost_equal(sml.params.values, md.params, decimal=10)
def setup_class(cls):
    """
    Fit "Y ~ X1 + X2 + X3" twice on seeded random data -- as a Gaussian
    GEE with an Independence covariance structure (`cls.result1`) and as
    a GLM created without an explicit family (`cls.result2`).
    """
    vs = Independence()
    family = families.Gaussian()

    np.random.seed(987126)
    n_obs = 100
    Y = np.random.normal(size=n_obs)
    X1 = np.random.normal(size=n_obs)
    X2 = np.random.normal(size=n_obs)
    X3 = np.random.normal(size=n_obs)
    groups = np.kron(np.arange(20), np.ones(5))

    frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})
    formula = "Y ~ X1 + X2 + X3"

    gee_model = GEE.from_formula(formula, groups, frame, family=family,
                                 cov_struct=vs)
    cls.result1 = gee_model.fit()

    glm_model = GLM.from_formula(formula, data=frame)
    cls.result2 = glm_model.fit()
def setup_class(cls):
    """
    Load "gee_poisson_1.csv", build a formula-based Poisson GEE with an
    Independence covariance structure, and store the model and a set of
    starting parameters on the class.
    """
    endog, exog, group_n = load_data("gee_poisson_1.csv")

    family = Poisson()
    vi = Independence()

    # Test with formulas: response, group id, then every exog column
    # except the first one.
    stacked = np.column_stack((endog, group_n, exog[:, 1:]))
    names = ["Y", "Id"]
    names += ["X%d" % k for k in range(1, exog.shape[1])]
    frame = pd.DataFrame(stacked)
    frame.columns = names

    cls.mod = GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5", "Id", frame,
                               family=family, cov_struct=vi)

    cls.start_params = np.array([
        -0.03644504,
        -0.05432094,
        0.01566427,
        0.57628591,
        -0.0046566,
        -0.47709315,
    ])
def test_compare_logit(self):
    # Binomial GEE with an Independence covariance structure versus the
    # `sm.logit` fit of the same formula: parameter values must agree to
    # 10 decimals.
    vs = Independence()
    family = Binomial()

    n_obs = 100
    latent = np.random.normal(size=n_obs)
    Y = 1*(latent < 0)
    X1 = np.random.normal(size=n_obs)
    X2 = np.random.normal(size=n_obs)
    X3 = np.random.normal(size=n_obs)
    groups = np.random.randint(0, 4, size=n_obs)

    frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})
    formula = "Y ~ X1 + X2 + X3"

    gee_model = GEE.from_formula(formula, frame, None, groups=groups,
                                 family=family, cov_struct=vs)
    gee_fit = gee_model.fit()

    logit_model = sm.logit(formula, data=frame)
    logit_fit = logit_model.fit(disp=False)

    assert_almost_equal(logit_fit.params.values, gee_fit.params,
                        decimal=10)
def test_predict(self):
    # Gaussian GEE fitted with an offset column: `predict` must give the
    # same values whether the offset is implicit, explicit, or added back
    # after predicting with a zero offset, and must match a hand-computed
    # linear predictor on new data.
    n = 50
    X1 = np.random.normal(size=n)
    X2 = np.random.normal(size=n)
    groups = np.kron(np.arange(25), np.r_[1, 1])
    offset = np.random.uniform(1, 2, size=n)
    Y = np.random.normal(0.1 * (X1 + X2) + offset, size=n)

    data = pd.DataFrame({
        "Y": Y,
        "X1": X1,
        "X2": X2,
        "groups": groups,
        "offset": offset,
    })

    model = GEE.from_formula("Y ~ X1 + X2", groups, data,
                             family=Gaussian(), offset="offset")
    result = model.fit()
    assert_equal(result.converged, True)

    off = data["offset"]
    regressors = ["X1", "X2"]

    baseline = result.predict()
    with_offset = result.predict(offset=off)
    with_exog_offset = result.predict(exog=data[regressors], offset=off)
    exog_zero_offset = result.predict(exog=data[regressors],
                                      offset=0 * off)
    zero_offset = result.predict(offset=0 * off)

    assert_allclose(baseline, with_offset)
    assert_allclose(baseline, with_exog_offset)
    assert_allclose(baseline, exog_zero_offset + off)
    assert_allclose(baseline, zero_offset + off)

    # Out-of-sample prediction without an offset.
    x1_new = np.random.normal(size=10)
    x2_new = np.random.normal(size=10)
    new_exog = pd.DataFrame({"X1": x1_new, "X2": x2_new})
    predicted = result.predict(exog=new_exog)

    params = result.params
    expected = params[0] + params[1] * x1_new + params[2] * x2_new
    assert_allclose(predicted, expected)
def test_offset_formula(self):
    """
    Check that `from_formula` accepts offset and exposure either as
    arrays or as column names, and that equivalent specifications
    produce the same fitted parameters.
    """
    n = 50
    X1 = np.random.normal(size=n)
    X2 = np.random.normal(size=n)
    groups = np.kron(np.arange(25), np.r_[1, 1])
    offset = np.random.uniform(1, 2, size=n)
    exposure = np.exp(offset)
    Y = np.random.poisson(0.1*(X1 + X2) + 2*offset, size=n)

    data = pd.DataFrame({
        "Y": Y,
        "X1": X1,
        "X2": X2,
        "groups": groups,
        "offset": offset,
        "exposure": exposure,
    })
    fml = "Y ~ X1 + X2"

    # Reference: offset given as a column name.
    reference = GEE.from_formula(fml, groups, data, family=Poisson(),
                                 offset="offset").fit()
    assert_equal(reference.converged, True)

    # exposure = exp(offset), so each of these must reproduce `reference`.
    equivalents = (
        {"offset": offset},
        {"exposure": exposure},
        {"exposure": "exposure"},
    )
    for extra in equivalents:
        model = GEE.from_formula(fml, groups, data, family=Poisson(),
                                 **extra)
        fit = model.fit(start_params=reference.params)
        assert_allclose(reference.params, fit.params)
        assert_equal(fit.converged, True)

    # Exposure and offset together ...
    both = GEE.from_formula(fml, groups, data, family=Poisson(),
                            exposure="exposure", offset="offset").fit()
    assert_equal(both.converged, True)

    # ... must agree with a doubled offset alone.
    doubled = GEE.from_formula(fml, groups, data, family=Poisson(),
                               offset=2*offset)
    doubled_fit = doubled.fit(start_params=both.params)
    assert_allclose(both.params, doubled_fit.params)
    assert_equal(doubled_fit.converged, True)
def setup_class(cls):
    """
    Fit "Y ~ X1 + X2 + X3" with a Gamma family and log link twice on
    seeded random data -- as a GEE with an Independence covariance
    structure (`cls.result1`) and as a GLM (`cls.result2`).
    """
    # adjusted for Gamma, not in test_gee.py
    vs = Independence()
    family = families.Gamma(link=links.log)

    np.random.seed(987126)
    n_obs = 100
    # log-normal response
    Y = np.exp(0.1 + np.random.normal(size=n_obs))
    X1 = np.random.normal(size=n_obs)
    X2 = np.random.normal(size=n_obs)
    X3 = np.random.normal(size=n_obs)
    groups = np.random.randint(0, 4, size=n_obs)

    frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})
    formula = "Y ~ X1 + X2 + X3"

    gee_model = GEE.from_formula(formula, groups, frame, family=family,
                                 cov_struct=vs)
    cls.result1 = gee_model.fit()

    glm_model = GLM.from_formula(formula, data=frame, family=family)
    cls.result2 = glm_model.fit(disp=False)
def setup_class(cls):
    """
    Fit "Y ~ X1 + X2 + X3" twice on seeded random data -- as a Gaussian
    GEE with an Independence covariance structure (`cls.result1`) and as
    a GLM created without an explicit family (`cls.result2`).
    """
    vs = Independence()
    family = families.Gaussian()

    np.random.seed(987126)
    n_obs = 100
    Y = np.random.normal(size=n_obs)
    X1 = np.random.normal(size=n_obs)
    X2 = np.random.normal(size=n_obs)
    X3 = np.random.normal(size=n_obs)
    groups = np.kron(np.arange(20), np.ones(5))

    frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})
    formula = "Y ~ X1 + X2 + X3"

    gee_model = GEE.from_formula(formula, groups, frame, family=family,
                                 cov_struct=vs)
    cls.result1 = gee_model.fit()

    glm_model = GLM.from_formula(formula, data=frame)
    cls.result2 = glm_model.fit()
def test_compare_logit(self):
    # Binomial GEE with an Independence covariance structure versus the
    # `sm.logit` fit of the same formula: parameter estimates must agree
    # to 10 decimals.
    vs = Independence()
    family = Binomial()

    Y = 1*(np.random.normal(size=100) < 0)
    X1 = np.random.normal(size=100)
    X2 = np.random.normal(size=100)
    X3 = np.random.normal(size=100)
    groups = np.random.randint(0, 4, size=100)

    D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

    mod1 = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                            family=family, cov_struct=vs)
    rslt1 = mod1.fit()

    mod2 = sm.logit("Y ~ X1 + X2 + X3", data=D)
    # disp=False keeps the optimizer's convergence report out of the test
    # output, as the other comparison tests already do.
    rslt2 = mod2.fit(disp=False)

    assert_almost_equal(rslt1.params, rslt2.params, decimal=10)
def test_compare_poisson(self):
    # Poisson GEE with an Independence covariance structure versus the
    # `sm.poisson` fit of the same formula: parameter estimates must
    # agree to 10 decimals.
    vs = Independence()
    family = Poisson()

    n_obs = 100
    uniform = np.random.uniform(size=n_obs)
    Y = np.ceil(-np.log(uniform))
    X1 = np.random.normal(size=n_obs)
    X2 = np.random.normal(size=n_obs)
    X3 = np.random.normal(size=n_obs)
    groups = np.random.randint(0, 4, size=n_obs)

    frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})
    formula = "Y ~ X1 + X2 + X3"

    gee_model = GEE.from_formula(formula, groups, frame, family=family,
                                 cov_struct=vs)
    gee_fit = gee_model.fit()

    poisson_model = sm.poisson(formula, data=frame)
    poisson_fit = poisson_model.fit(disp=False)

    assert_almost_equal(gee_fit.params, poisson_fit.params, decimal=10)
def setup_class(cls):
    """
    Fit "Y ~ X1 + X2 + X3" with the Poisson family twice on seeded random
    data -- as a GEE with an Independence covariance structure
    (`cls.result1`) and as a GLM (`cls.result2`).
    """
    vs = Independence()
    family = families.Poisson()

    np.random.seed(987126)
    n_obs = 100
    Y = np.exp(1 + np.random.normal(size=n_obs))
    X1 = np.random.normal(size=n_obs)
    X2 = np.random.normal(size=n_obs)
    X3 = np.random.normal(size=n_obs)
    groups = np.random.randint(0, 4, size=n_obs)

    frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})
    formula = "Y ~ X1 + X2 + X3"

    gee_model = GEE.from_formula(formula, groups, frame, family=family,
                                 cov_struct=vs)
    cls.result1 = gee_model.fit()

    glm_model = GLM.from_formula(formula, data=frame, family=family)
    cls.result2 = glm_model.fit(disp=False)
def t_est_missing(self):
    # Builds data with NaN in five distinct rows and checks that a model
    # created with missing='drop' keeps 95 rows and 4 design columns.
    # NOTE(review): the `t_est_` prefix presumably keeps the test runner
    # from collecting this function -- confirm whether that is deliberate.
    n_obs = 100
    Y = np.random.normal(size=n_obs)
    X1 = np.random.normal(size=n_obs)
    X2 = np.random.normal(size=n_obs)
    X3 = np.random.normal(size=n_obs)
    groups = np.kron(lrange(20), np.ones(5))

    # Rows 0, 5, 6 lose the response; rows 10, 11 lose X2.
    Y[0] = np.nan
    Y[5:7] = np.nan
    X2[10:12] = np.nan

    frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3,
                          "groups": groups})

    model = GEE.from_formula("Y ~ X1 + X2 + X3", frame, None,
                             groups=frame["groups"], missing='drop')
    # The fit result itself is not inspected; the call only has to succeed.
    model.fit()

    assert len(model.endog) == 95
    assert model.exog.shape == (95, 4)
def test_offset_formula(self):
    # `from_formula` must accept offset and exposure either as arrays or
    # as column names, and equivalent specifications must produce the
    # same fitted parameters.
    n = 50
    X1 = np.random.normal(size=n)
    X2 = np.random.normal(size=n)
    groups = np.kron(np.arange(25), np.r_[1, 1])
    offset = np.random.uniform(1, 2, size=n)
    exposure = np.exp(offset)
    Y = np.random.poisson(0.1*(X1 + X2) + 2*offset, size=n)

    data = pd.DataFrame({
        "Y": Y,
        "X1": X1,
        "X2": X2,
        "groups": groups,
        "offset": offset,
        "exposure": exposure,
    })
    fml = "Y ~ X1 + X2"

    # Reference: offset given as a column name.
    reference = GEE.from_formula(fml, groups, data, family=Poisson(),
                                 offset="offset").fit()
    assert_equal(reference.converged, True)

    # exposure = exp(offset), so each of these must reproduce `reference`.
    equivalents = (
        {"offset": offset},
        {"exposure": exposure},
        {"exposure": "exposure"},
    )
    for extra in equivalents:
        model = GEE.from_formula(fml, groups, data, family=Poisson(),
                                 **extra)
        fit = model.fit(start_params=reference.params)
        assert_allclose(reference.params, fit.params)
        assert_equal(fit.converged, True)

    # Exposure and offset together ...
    both = GEE.from_formula(fml, groups, data, family=Poisson(),
                            exposure="exposure", offset="offset").fit()
    assert_equal(both.converged, True)

    # ... must agree with a doubled offset alone.
    doubled = GEE.from_formula(fml, groups, data, family=Poisson(),
                               offset=2*offset)
    doubled_fit = doubled.fit(start_params=both.params)
    assert_allclose(both.params, doubled_fit.params)
    assert_equal(doubled_fit.converged, True)
def test_compare_poisson(self):
    # Poisson GEE with an Independence covariance structure versus the
    # `sm.poisson` fit of the same formula: parameter values must agree
    # to 10 decimals.
    vs = Independence()
    family = Poisson()

    Y = np.ceil(-np.log(np.random.uniform(size=100)))
    X1 = np.random.normal(size=100)
    X2 = np.random.normal(size=100)
    X3 = np.random.normal(size=100)
    groups = np.random.randint(0, 4, size=100)

    D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

    md = GEE.from_formula("Y ~ X1 + X2 + X3", D, None, groups=groups,
                          family=family, covstruct=vs).fit()

    # disp=False keeps the optimizer's convergence report out of the test
    # output, as the other comparison tests already do.
    sml = sm.poisson("Y ~ X1 + X2 + X3", data=D).fit(disp=False)

    assert_almost_equal(sml.params.values, md.params, decimal=10)
def setup_class(cls):
    """
    Fit "Y ~ X1 + X2 + X3" with a Gamma family and log link twice on
    seeded random data -- as a GEE with an Independence covariance
    structure (`cls.result1`) and as a GLM (`cls.result2`).
    """
    # adjusted for Gamma, not in test_gee.py
    vs = Independence()
    family = families.Gamma(link=links.log)

    np.random.seed(987126)
    n_obs = 100
    # log-normal response
    Y = np.exp(0.1 + np.random.normal(size=n_obs))
    X1 = np.random.normal(size=n_obs)
    X2 = np.random.normal(size=n_obs)
    X3 = np.random.normal(size=n_obs)
    groups = np.random.randint(0, 4, size=n_obs)

    frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})
    formula = "Y ~ X1 + X2 + X3"

    gee_model = GEE.from_formula(formula, groups, frame, family=family,
                                 cov_struct=vs)
    cls.result1 = gee_model.fit()

    glm_model = GLM.from_formula(formula, data=frame, family=family)
    cls.result2 = glm_model.fit(disp=False)
def test_poisson_epil(self):
    # Fit "y ~ age + trt + base" on results/epil.csv with a Poisson GEE
    # (Independence structure, grouped by subject) and with a Poisson GLM;
    # parameters and scale must agree to 6 decimals.
    here = os.path.dirname(os.path.abspath(__file__))
    data = pd.read_csv(os.path.join(here, "results", "epil.csv"))
    formula = "y ~ age + trt + base"

    fam = Poisson()
    ind = Independence()
    gee_model = GEE.from_formula(formula, data, groups=data["subject"],
                                 cov_struct=ind, family=fam)
    gee_fit = gee_model.fit()

    # Coefficients should agree with GLM
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod import families

    glm_model = GLM.from_formula(formula, data, family=families.Poisson())
    glm_fit = glm_model.fit(scale="X2")

    assert_almost_equal(gee_fit.params, glm_fit.params, decimal=6)
    assert_almost_equal(gee_fit.scale, glm_fit.scale, decimal=6)
def test_compare_logit(self):
    # Binomial GEE with an Independence covariance structure versus the
    # `sm.logit` fit of the same formula: parameter values must agree to
    # 10 decimals.
    vs = Independence()
    family = Binomial()

    n_obs = 100
    latent = np.random.normal(size=n_obs)
    Y = 1 * (latent < 0)
    X1 = np.random.normal(size=n_obs)
    X2 = np.random.normal(size=n_obs)
    X3 = np.random.normal(size=n_obs)
    groups = np.random.randint(0, 4, size=n_obs)

    frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})
    formula = "Y ~ X1 + X2 + X3"

    gee_model = GEE.from_formula(formula, frame, None, groups=groups,
                                 family=family, cov_struct=vs)
    gee_fit = gee_model.fit()

    logit_model = sm.logit(formula, data=frame)
    logit_fit = logit_model.fit(disp=False)

    assert_almost_equal(logit_fit.params.values, gee_fit.params,
                        decimal=10)
def test_compare_logit(self):
    # Binomial GEE with an Independence covariance structure versus the
    # `sm.logit` fit of the same formula: parameter estimates must agree
    # to 10 decimals.
    vs = Independence()
    family = Binomial()

    Y = 1 * (np.random.normal(size=100) < 0)
    X1 = np.random.normal(size=100)
    X2 = np.random.normal(size=100)
    X3 = np.random.normal(size=100)
    groups = np.random.randint(0, 4, size=100)

    D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

    mod1 = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                            family=family, cov_struct=vs)
    rslt1 = mod1.fit()

    mod2 = sm.logit("Y ~ X1 + X2 + X3", data=D)
    # disp=False keeps the optimizer's convergence report out of the test
    # output, as the other comparison tests already do.
    rslt2 = mod2.fit(disp=False)

    assert_almost_equal(rslt1.params, rslt2.params, decimal=10)
def _folded_exceedance_fraction(permuted, observed, number_of_permutations):
    """Return the share of permuted values above ``observed``, folded to <= 0.5."""
    p = (permuted > observed).sum() / float(number_of_permutations)
    return 1 - p if p > 0.5 else p


def _draw_permutation_panel(ax, permuted, observed, label,
                            number_of_permutations):
    """Draw one histogram of permuted coefficients with the observed value marked."""
    ax.hist(permuted, bins=100)
    ax.axvline(x=observed, color='#fc9272')
    p = _folded_exceedance_fraction(permuted, observed, number_of_permutations)
    ax.set_xlabel(label + '{0:.2f}'.format(p))


def run_permutation_test(dependent, network, number_of_permutations,
                         output_path):
    """Run a permutation test on Poisson GEE coefficients and plot the result.

    A node-level table is built from ``network`` (node attributes, degree
    and betweenness centrality).  ``dependent ~ Age + Sex`` is fit as a
    Poisson GEE with an independence working correlation, grouped by the
    ``Location`` column.  The dependent column is then shuffled
    ``number_of_permutations`` times and the model refit each time.  For
    the terms ``Age[T.HY]``, ``Age[T.UNK]`` and ``Sex[T.M]`` a histogram of
    the permuted coefficients is drawn with the observed coefficient
    marked, and the figure is saved and shown.

    Parameters
    ----------
    dependent : str
        Name of the response column; also used in the plot title and the
        output file name.
    network
        Graph passed to ``nx.betweenness_centrality`` and queried through
        ``.nodes(data=True)`` and ``.degree()``.
    number_of_permutations : int
        Number of shuffles of the dependent column.
    output_path : str
        Directory in which ``<dependent>_Permutation_test.png`` is written.
    """
    import os

    from statsmodels.genmod.generalized_estimating_equations import GEE
    from statsmodels.genmod.cov_struct import Independence
    from statsmodels.genmod.families import Poisson

    nodes = pd.DataFrame.from_dict(dict(network.nodes(data=True)),
                                   orient='index')
    degree = pd.DataFrame.from_dict(dict(network.degree()), orient='index')
    centrality = pd.DataFrame.from_dict(
        dict(nx.betweenness_centrality(network)), orient='index')

    h1 = pd.concat([nodes, degree, centrality], axis=1).reset_index(0)
    # NOTE(review): this rename assumes every node carries exactly the
    # attributes Age, Species, type, Location, Sex in this order -- confirm
    # against the code that builds the network.
    h1.columns = [
        'ID', 'Age', 'Species', 'type', 'Location', 'Sex', 'degree',
        'centrality'
    ]
    h1['degree_dist'] = h1['degree'] / float(h1['degree'].max())

    equation = dependent + "~ Age + Sex"

    def fit_poisson_gee(frame):
        # Fresh family and covariance objects are created for every fit,
        # as the original code did.
        model = GEE.from_formula(equation, "Location", frame,
                                 cov_struct=Independence(), family=Poisson())
        return model.fit()

    main_model_result = fit_poisson_gee(h1)
    main_result = pd.DataFrame(main_model_result.params).T

    degree_random_coeff = []
    for _ in range(number_of_permutations):
        rand_h1 = h1.copy()
        rand_h1[dependent] = np.random.permutation(h1[dependent])
        degree_random_coeff.append(fit_poisson_gee(rand_h1).params)

    d = pd.DataFrame.from_records(degree_random_coeff)

    # NOTE(review): seaborn is not referenced below; the import is kept in
    # case it is relied on for import-time plot styling -- confirm, then
    # drop if unnecessary.
    import seaborn as sns  # noqa: F401

    panels = (
        ('Age[T.HY]',
         'Coefficient Age: Hatch Year\n(ref: After Hatch Year)\np= '),
        ('Age[T.UNK]',
         'Coefficient Age: Unknown\n(ref: After Hatch Year)\np= '),
        ('Sex[T.M]', 'Coefficient Sex: Male\n (ref: Female)\np= '),
    )

    f, axes = plt.subplots(1, 3, sharey=True)
    for ax, (term, label) in zip(axes, panels):
        _draw_permutation_panel(ax, d[term], main_result[term].values[0],
                                label, number_of_permutations)
    # The y axis is shared, so only the first panel is labelled.
    axes[0].set_ylabel('Frequency')

    title = 'permutation test for ' + dependent
    f.suptitle(title)
    plt.tight_layout()
    plt.savefig(
        os.path.join(output_path, dependent + '_Permutation_test.png'),
        dpi=300)
    plt.show()
# Likelihood-ratio statistic comparing the two panel models, referred to a
# chi-square distribution with 2 degrees of freedom.
LR = 2 * (model_panel2_results.llf - model_panel1_results.llf)
p = chi2.sf(LR, 2)
print('p: %.30f' % p)

# Summarise how many zeros the response contains.
cases_pos = US_cases_long_demogr_week['cases_count_pos']
print(cases_pos.describe())
print(cases_pos.value_counts())
count_total = sum(cases_pos.value_counts().to_dict().values())
count_zero = cases_pos.value_counts()[0.0]
print("Count of zero is {}, about {:.4f} of the data.".format(
    count_zero, count_zero / count_total))

# Approach one to generalized linear models for panel data:
# Generalized Estimating Equations.
gee_formula = ("cases_count_pos ~ week_of_year + percent_age65over"
               " + percent_female + percent_black")

# Poisson model with an autoregressive working correlation.
poi = Poisson()
ar = Autoregressive()
gee_model0 = GEE.from_formula(
    gee_formula,
    groups="state",
    data=US_cases_long_demogr_week,
    time='week_of_year',
    cov_struct=ar,
    family=poi,
    offset=np.log(np.asarray(US_cases_long_demogr_week["pop_count_2019"])),
)
gee_model0_results = gee_model0.fit(maxiter=200)
print(gee_model0_results.summary())
print(ar.summary())
print("scale=%.2f" % (gee_model0_results.scale))

# The fit above warns "IterationLimitWarning: Iteration limit reached prior
# to convergence", even with maxiter = 2000, so specific starting values are
# needed to get the estimating algorithm to converge.
# First run with an exchangeable dependence structure. We know from this
# model that the within-state correlation is roughly 0.077.
fam = Poisson()
ex = Exchangeable()
ex_model = GEE.from_formula(
    gee_formula,
    groups="state",
    data=US_cases_long_demogr_week,
    cov_struct=ex,
    family=fam,
    offset=np.log(np.asarray(US_cases_long_demogr_week["pop_count_2019"])),
)
ex_results = ex_model.fit()
print(ex_results.summary())
print(ex.summary())
import math

import numpy as np
import pandas as pd
from statsmodels.genmod.generalized_estimating_equations import GEE
from statsmodels.genmod.cov_struct import (Exchangeable, Independence,
                                           Autoregressive)
from statsmodels.genmod.families import Poisson

# Rolling-window forecast: for every end position ``i`` a Poisson GEE
# ``liczba ~ indeks`` is fit on the 60 rows preceding ``i``.  When both
# coefficients have p-values below 0.05 the fitted exponential trend is
# summed over the following steps; otherwise the window mean is used.
fam = Poisson()
ind = Independence()

df = pd.read_csv("file:///C:/Users/Luke/Documents/drugi_test.csv")
count = 0

for i in range(60, len(df)):
    # Rows i-60 .. i-1 of the frame.
    df_tmp = df.head(i).tail(60)
    model1 = GEE.from_formula("liczba ~ indeks", "indeks", df_tmp,
                              cov_struct=ind, family=fam)
    results = model1.fit()

    if 117 < i < 120:
        # Debug output; ``.at`` is the label-based scalar accessor that
        # replaces the removed ``DataFrame.get_value``.
        print(df_tmp.at[60, "indeks"])

    if results.pvalues.Intercept < 0.05 and results.pvalues.indeks < 0.05:
        # NOTE(review): this sums 14 steps (n = 1..14) while the fallback
        # below multiplies the mean by 15 -- confirm the intended horizon.
        suma = sum(
            math.exp(results.params.Intercept
                     + results.params.indeks * (i + n))
            for n in range(1, 15)
        )
        print(str(i)+": " + str(suma))
    else:
        # Take the mean of the one column needed, so non-numeric columns
        # in the frame cannot break the reduction.
        prediction = df_tmp["liczba"].mean()*15
        print(str(i)+": " + str(prediction))
def test_poisson(self):
    # Compare Poisson GEE fits (independence and exchangeable structures)
    # with reference values from the R ``gee`` package, using both the
    # array and the formula interface.  R code used for the reference:
    #
    #library(gee)
    #Z = read.csv("results/gee_poisson_1.csv", header=FALSE)
    #Y = Z[,2]
    #Id = Z[,1]
    #X1 = Z[,3]
    #X2 = Z[,4]
    #X3 = Z[,5]
    #X4 = Z[,6]
    #X5 = Z[,7]
    #mi = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
    #        corstr="independence", scale.fix=TRUE)
    #smi = summary(mi)
    #u = coefficients(smi)
    #cfi = paste(u[,1], collapse=",")
    #sei = paste(u[,4], collapse=",")
    #me = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
    #        corstr="exchangeable", scale.fix=TRUE)
    #sme = summary(me)
    #u = coefficients(sme)
    #cfe = paste(u[,1], collapse=",")
    #see = paste(u[,4], collapse=",")
    #sprintf("cf = [[%s],[%s]]", cfi, cfe)
    #sprintf("se = [[%s],[%s]]", sei, see)

    poisson_family = Poisson()
    endog, exog, group_n = load_data("gee_poisson_1.csv")

    # The same two covariance-structure objects are used for the array
    # fits and then again for the formula fits.
    structures = (Independence(), Exchangeable())

    # From R gee
    coef_ref = [
        [-0.0364450410793481, -0.0543209391301178,
         0.0156642711741052, 0.57628591338724,
         -0.00465659951186211, -0.477093153099256],
        [-0.0315615554826533, -0.0562589480840004,
         0.0178419412298561, 0.571512795340481,
         -0.00363255566297332, -0.475971696727736],
    ]
    se_ref = [
        [0.0611309237214186, 0.0390680524493108,
         0.0334234174505518, 0.0366860768962715,
         0.0304758505008105, 0.0316348058881079],
        [0.0610840153582275, 0.0376887268649102,
         0.0325168379415177, 0.0369786751362213,
         0.0296141014225009, 0.0306115470200955],
    ]

    for cov, want_coef, want_se in zip(structures, coef_ref, se_ref):
        fitted = GEE(endog, exog, group_n, None, poisson_family, cov).fit()
        assert_almost_equal(fitted.params, want_coef, decimal=5)
        assert_almost_equal(fitted.standard_errors(), want_se, decimal=6)

    # Test with formulas
    n_covariates = exog.shape[1] - 1
    frame = pd.DataFrame(
        np.concatenate((endog[:, None], group_n[:, None], exog[:, 1:]),
                       axis=1)
    )
    frame.columns = ["Y", "Id"] + [
        "X%d" % k for k in range(1, n_covariates + 1)
    ]

    for cov, want_coef, want_se in zip(structures, coef_ref, se_ref):
        fitted = GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5", "Id", frame,
                                  family=poisson_family,
                                  cov_struct=cov).fit()
        assert_almost_equal(fitted.params, want_coef, decimal=5)
        assert_almost_equal(fitted.standard_errors(), want_se, decimal=6)
def test_linear(self):
    # Compare Gaussian GEE fits (independence and exchangeable structures)
    # with reference values from the R ``gee`` package, using both the
    # array and the formula interface.  R code used for the reference:
    #
    #library(gee)
    #Z = read.csv("results/gee_linear_1.csv", header=FALSE)
    #Y = Z[,2]
    #Id = Z[,1]
    #X1 = Z[,3]
    #X2 = Z[,4]
    #X3 = Z[,5]
    #mi = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
    #         corstr="independence", tol=1e-8, maxit=100)
    #smi = summary(mi)
    #u = coefficients(smi)
    #cfi = paste(u[,1], collapse=",")
    #sei = paste(u[,4], collapse=",")
    #me = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
    #         corstr="exchangeable", tol=1e-8, maxit=100)
    #sme = summary(me)
    #u = coefficients(sme)
    #cfe = paste(u[,1], collapse=",")
    #see = paste(u[,4], collapse=",")
    #sprintf("cf = [[%s],[%s]]", cfi, cfe)
    #sprintf("se = [[%s],[%s]]", sei, see)

    gaussian_family = Gaussian()
    endog, exog, group = load_data("gee_linear_1.csv")

    # The same two covariance-structure objects are used for the array
    # fits and then again for the formula fits.
    structures = (Independence(), Exchangeable())

    # From R gee
    coef_ref = [
        [-0.01850226507491, 0.81436304278962,
         -1.56167635393184, 0.794239361055003],
        [-0.0182920577154767, 0.814898414022467,
         -1.56194040106201, 0.793499517527478],
    ]
    se_ref = [
        [0.0440733554189401, 0.0479993639119261,
         0.0496045952071308, 0.0479467597161284],
        [0.0440369906460754, 0.0480069787567662,
         0.049519758758187, 0.0479760443027526],
    ]

    for cov, want_coef, want_se in zip(structures, coef_ref, se_ref):
        fitted = GEE(endog, exog, group, None, gaussian_family, cov).fit()
        assert_almost_equal(fitted.params, want_coef, decimal=10)
        assert_almost_equal(fitted.standard_errors(), want_se, decimal=10)

    # Test with formulas
    n_covariates = exog.shape[1] - 1
    frame = pd.DataFrame(
        np.concatenate((endog[:, None], group[:, None], exog[:, 1:]),
                       axis=1)
    )
    frame.columns = ["Y", "Id"] + [
        "X%d" % k for k in range(1, n_covariates + 1)
    ]

    for cov, want_coef, want_se in zip(structures, coef_ref, se_ref):
        fitted = GEE.from_formula("Y ~ X1 + X2 + X3", "Id", frame,
                                  family=gaussian_family,
                                  cov_struct=cov).fit()
        assert_almost_equal(fitted.params, want_coef, decimal=10)
        assert_almost_equal(fitted.standard_errors(), want_se, decimal=10)
def test_logistic(self):
    # Compare binomial GEE fits with reference values from the R ``gee``
    # package.  The autoregressive structure is fit but its estimates are
    # not compared.  R code for comparing results:
    #
    #library(gee)
    #Z = read.csv("results/gee_logistic_1.csv", header=FALSE)
    #Y = Z[,2]
    #Id = Z[,1]
    #X1 = Z[,3]
    #X2 = Z[,4]
    #X3 = Z[,5]
    #mi = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
    #         corstr="independence")
    #smi = summary(mi)
    #u = coefficients(smi)
    #cfi = paste(u[,1], collapse=",")
    #sei = paste(u[,4], collapse=",")
    #me = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
    #         corstr="exchangeable")
    #sme = summary(me)
    #u = coefficients(sme)
    #cfe = paste(u[,1], collapse=",")
    #see = paste(u[,4], collapse=",")
    #ma = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
    #         corstr="AR-M")
    #sma = summary(ma)
    #u = coefficients(sma)
    #cfa = paste(u[,1], collapse=",")
    #sea = paste(u[,4], collapse=",")
    #sprintf("cf = [[%s],[%s],[%s]]", cfi, cfe, cfa)
    #sprintf("se = [[%s],[%s],[%s]]", sei, see, sea)

    endog, exog, group = load_data("gee_logistic_1.csv")

    # Time values for the autoregressive model: 0, 1, 2, ... within each
    # group, in order of appearance.
    T = np.zeros(len(endog))
    for label in set(group):
        members = np.flatnonzero(group == label)
        T[members] = lrange(len(members))

    binomial_family = Binomial()
    exch = Exchangeable()
    indep = Independence()
    autoreg = Autoregressive()

    # From R gee
    coef_ref = [
        [0.0167272965285882, 1.13038654425893,
         -1.86896345082962, 1.09397608331333],
        [0.0178982283915449, 1.13118798191788,
         -1.86133518416017, 1.08944256230299],
        [0.0109621937947958, 1.13226505028438,
         -1.88278757333046, 1.09954623769449],
    ]
    se_ref = [
        [0.127291720283049, 0.166725808326067,
         0.192430061340865, 0.173141068839597],
        [0.127045031730155, 0.165470678232842,
         0.192052750030501, 0.173174779369249],
        [0.127240302296444, 0.170554083928117,
         0.191045527104503, 0.169776150974586],
    ]

    array_cases = zip((indep, exch, autoreg), coef_ref, se_ref)
    for cov, want_coef, want_se in array_cases:
        fitted = GEE(endog, exog, group, T, binomial_family, cov).fit()
        if cov is autoreg:
            # The autoregressive model is only fit, not compared.
            continue
        assert_almost_equal(fitted.params, want_coef, decimal=6)
        assert_almost_equal(fitted.standard_errors(), want_se, decimal=6)

    # Test with formulas
    n_covariates = exog.shape[1] - 1
    frame = pd.DataFrame(
        np.concatenate((endog[:, None], group[:, None], exog[:, 1:]),
                       axis=1)
    )
    frame.columns = ["Y", "Id"] + [
        "X%d" % k for k in range(1, n_covariates + 1)
    ]

    for cov, want_coef, want_se in zip((indep, exch), coef_ref, se_ref):
        fitted = GEE.from_formula("Y ~ X1 + X2 + X3", "Id", frame,
                                  family=binomial_family,
                                  cov_struct=cov).fit()
        assert_almost_equal(fitted.params, want_coef, decimal=6)
        assert_almost_equal(fitted.standard_errors(), want_se, decimal=6)
# Script fragment: histogram of one column, correlation bar chart, a
# Poisson GEE of TC on MJJ/JJA/JAS, and a real-vs-predicted plot.
#
# NOTE(review): ``i`` is not defined in this fragment; the first statements
# presumably belonged to a loop over column positions whose header is
# outside this view -- confirm where that loop starts and ends.

# Number of histogram bins: distinct values in column ``i``, capped at 100.
vals = np.size(dataset2.iloc[:, i].unique())
if vals >= 100: vals = 100
plt.hist(dataset2.iloc[:, i], bins=vals, color='#3F5D7D')
plt.tight_layout(rect=[0, 0.03, 1, 0.95])

# Correlation with Response Variable
dataset2.corrwith(dataset.TC).plot.bar(figsize=(20, 10), title="Correlation with TC", fontsize=15, rot=45, grid=True)

X = ['MJJ', 'JJA', 'JAS']

# building the model
fam = Poisson()
ind = Independence()
# NOTE(review): no ``groups`` or ``data`` argument is passed here, unlike
# the other GEE.from_formula calls in this file -- this call looks
# incomplete; confirm which frame and grouping column were intended.
model1 = GEE.from_formula("TC ~ MJJ + JJA + JAS", cov_struct=ind, family=fam)
result1 = model1.fit()
print(result1.summary())

# testing the model
# NOTE(review): ``poisson_res`` and ``TC`` are not defined in this
# fragment, and ``X`` is a list of column names rather than data --
# presumably ``result1`` and the corresponding data columns were meant;
# verify.
predVals = poisson_res.predict(X)
plt.plot(range(len(TC)), TC, 'r*-', range(len(TC)), predVals, 'bo-')
plt.title('Train dataset Real vs. Predicted Values')
plt.legend(['Real Values', 'Predicted Values'])
plt.show()
def test_linear(self):
    """
    Compare Gaussian GEE fits with reference values from R ``gee``.

    Independence and exchangeable structures are fit through both the
    array and the formula interface; parameters and standard errors are
    asserted equal to the R values to ten decimals.  R code used to
    produce the reference values:

    library(gee)
    Z = read.csv("results/gee_linear_1.csv", header=FALSE)
    Y = Z[,2]
    Id = Z[,1]
    X1 = Z[,3]
    X2 = Z[,4]
    X3 = Z[,5]
    mi = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
             corstr="independence", tol=1e-8, maxit=100)
    smi = summary(mi)
    u = coefficients(smi)
    cfi = paste(u[,1], collapse=",")
    sei = paste(u[,4], collapse=",")
    me = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
             corstr="exchangeable", tol=1e-8, maxit=100)
    sme = summary(me)
    u = coefficients(sme)
    cfe = paste(u[,1], collapse=",")
    see = paste(u[,4], collapse=",")
    sprintf("cf = [[%s],[%s]]", cfi, cfe)
    sprintf("se = [[%s],[%s]]", sei, see)
    """

    family = Gaussian()

    endog, exog, group = load_data("gee_linear_1.csv")

    vi = Independence()
    ve = Exchangeable()

    # From R gee
    cf = [[
        -0.01850226507491, 0.81436304278962, -1.56167635393184,
        0.794239361055003
    ], [
        -0.0182920577154767, 0.814898414022467, -1.56194040106201,
        0.793499517527478
    ]]
    se = [[
        0.0440733554189401, 0.0479993639119261, 0.0496045952071308,
        0.0479467597161284
    ], [
        0.0440369906460754, 0.0480069787567662, 0.049519758758187,
        0.0479760443027526
    ]]

    for j, v in enumerate((vi, ve)):
        md = GEE(endog, exog, group, None, family, v)
        mdf = md.fit()
        assert_almost_equal(mdf.params, cf[j], decimal=10)
        assert_almost_equal(mdf.standard_errors(), se[j], decimal=10)

    # Test with formulas
    D = np.concatenate((endog[:, None], group[:, None], exog[:, 1:]),
                       axis=1)
    D = pd.DataFrame(D)
    D.columns = [
        "Y",
        "Id",
    ] + ["X%d" % (k + 1) for k in range(exog.shape[1] - 1)]
    for j, v in enumerate((vi, ve)):
        # ``groups`` is given as the second positional argument (here the
        # column name) and the structure via ``cov_struct``, matching the
        # other GEE.from_formula calls in this file.
        md = GEE.from_formula("Y ~ X1 + X2 + X3", "Id", D,
                              family=family, cov_struct=v)
        mdf = md.fit()
        assert_almost_equal(mdf.params, cf[j], decimal=10)
        assert_almost_equal(mdf.standard_errors(), se[j], decimal=10)
def test_poisson(self):
    """
    Compare Poisson GEE fits with reference values from R ``gee``.

    Independence and exchangeable structures are fit through both the
    array and the formula interface; parameters are asserted equal to the
    R values to five decimals and standard errors to six.  R code used to
    produce the reference values:

    library(gee)
    Z = read.csv("results/gee_poisson_1.csv", header=FALSE)
    Y = Z[,2]
    Id = Z[,1]
    X1 = Z[,3]
    X2 = Z[,4]
    X3 = Z[,5]
    X4 = Z[,6]
    X5 = Z[,7]
    mi = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
            corstr="independence", scale.fix=TRUE)
    smi = summary(mi)
    u = coefficients(smi)
    cfi = paste(u[,1], collapse=",")
    sei = paste(u[,4], collapse=",")
    me = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
            corstr="exchangeable", scale.fix=TRUE)
    sme = summary(me)
    u = coefficients(sme)
    cfe = paste(u[,1], collapse=",")
    see = paste(u[,4], collapse=",")
    sprintf("cf = [[%s],[%s]]", cfi, cfe)
    sprintf("se = [[%s],[%s]]", sei, see)
    """

    family = Poisson()

    endog, exog, group_n = load_data("gee_poisson_1.csv")

    vi = Independence()
    ve = Exchangeable()

    # From R gee
    cf = [[
        -0.0364450410793481, -0.0543209391301178, 0.0156642711741052,
        0.57628591338724, -0.00465659951186211, -0.477093153099256
    ], [
        -0.0315615554826533, -0.0562589480840004, 0.0178419412298561,
        0.571512795340481, -0.00363255566297332, -0.475971696727736
    ]]
    se = [[
        0.0611309237214186, 0.0390680524493108, 0.0334234174505518,
        0.0366860768962715, 0.0304758505008105, 0.0316348058881079
    ], [
        0.0610840153582275, 0.0376887268649102, 0.0325168379415177,
        0.0369786751362213, 0.0296141014225009, 0.0306115470200955
    ]]

    for j, v in enumerate((vi, ve)):
        md = GEE(endog, exog, group_n, None, family, v)
        mdf = md.fit()
        assert_almost_equal(mdf.params, cf[j], decimal=5)
        assert_almost_equal(mdf.standard_errors(), se[j], decimal=6)

    # Test with formulas
    D = np.concatenate((endog[:, None], group_n[:, None], exog[:, 1:]),
                       axis=1)
    D = pd.DataFrame(D)
    D.columns = [
        "Y",
        "Id",
    ] + ["X%d" % (k + 1) for k in range(exog.shape[1] - 1)]
    for j, v in enumerate((vi, ve)):
        # ``groups`` is given as the second positional argument (here the
        # column name) and the structure via ``cov_struct``, matching the
        # other GEE.from_formula calls in this file.
        md = GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5", "Id", D,
                              family=family, cov_struct=v)
        mdf = md.fit()
        assert_almost_equal(mdf.params, cf[j], decimal=5)
        assert_almost_equal(mdf.standard_errors(), se[j], decimal=6)
def test_logistic(self):
    """
    Compare binomial GEE fits with reference values from R ``gee``.

    Independence and exchangeable structures are compared with the R
    values to six decimals through both the array and the formula
    interface.  The autoregressive structure is fit through the array
    interface but its estimates are not compared.  Finally ``summary()``
    is called on the last fit to check that it runs.

    R code for comparing results:

    library(gee)
    Z = read.csv("results/gee_logistic_1.csv", header=FALSE)
    Y = Z[,2]
    Id = Z[,1]
    X1 = Z[,3]
    X2 = Z[,4]
    X3 = Z[,5]
    mi = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
             corstr="independence")
    smi = summary(mi)
    u = coefficients(smi)
    cfi = paste(u[,1], collapse=",")
    sei = paste(u[,4], collapse=",")
    me = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
             corstr="exchangeable")
    sme = summary(me)
    u = coefficients(sme)
    cfe = paste(u[,1], collapse=",")
    see = paste(u[,4], collapse=",")
    ma = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
             corstr="AR-M")
    sma = summary(ma)
    u = coefficients(sma)
    cfa = paste(u[,1], collapse=",")
    sea = paste(u[,4], collapse=",")
    sprintf("cf = [[%s],[%s],[%s]]", cfi, cfe, cfa)
    sprintf("se = [[%s],[%s],[%s]]", sei, see, sea)
    """

    endog, exog, group = load_data("gee_logistic_1.csv")

    # Time values for the autoregressive model
    T = np.zeros(len(endog))
    idx = set(group)
    for ii in idx:
        jj = np.flatnonzero(group == ii)
        # An explicit array avoids assigning a lazy ``range`` object.
        T[jj] = np.arange(len(jj))

    family = Binomial()
    ve = Exchangeable()
    vi = Independence()
    va = Autoregressive()

    # From R gee
    cf = [[
        0.0167272965285882, 1.13038654425893, -1.86896345082962,
        1.09397608331333
    ], [
        0.0178982283915449, 1.13118798191788, -1.86133518416017,
        1.08944256230299
    ], [
        0.0109621937947958, 1.13226505028438, -1.88278757333046,
        1.09954623769449
    ]]
    se = [[
        0.127291720283049, 0.166725808326067, 0.192430061340865,
        0.173141068839597
    ], [
        0.127045031730155, 0.165470678232842, 0.192052750030501,
        0.173174779369249
    ], [
        0.127240302296444, 0.170554083928117, 0.191045527104503,
        0.169776150974586
    ]]

    for j, v in enumerate((vi, ve, va)):
        md = GEE(endog, exog, group, T, family, v)
        mdf = md.fit()
        # The autoregressive fit is exercised but not compared.
        if v is not va:
            assert_almost_equal(mdf.params, cf[j], decimal=6)
            assert_almost_equal(mdf.standard_errors(), se[j], decimal=6)

    # Test with formulas
    D = np.concatenate((endog[:, None], group[:, None], exog[:, 1:]),
                       axis=1)
    D = pd.DataFrame(D)
    D.columns = [
        "Y",
        "Id",
    ] + ["X%d" % (k + 1) for k in range(exog.shape[1] - 1)]
    for j, v in enumerate((vi, ve)):
        # ``groups`` is given as the second positional argument (here the
        # column name) and the structure via ``cov_struct``, matching the
        # other GEE.from_formula calls in this file.
        md = GEE.from_formula("Y ~ X1 + X2 + X3", "Id", D,
                              family=family, cov_struct=v)
        mdf = md.fit()
        assert_almost_equal(mdf.params, cf[j], decimal=6)
        assert_almost_equal(mdf.standard_errors(), se[j], decimal=6)

    # Check for run-time exceptions in summary
    print(mdf.summary())
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 12 11:36:51 2016

@author: emg
"""

import numpy as np
import pandas as pd
from statsmodels.genmod.generalized_estimating_equations import GEE
from statsmodels.genmod.cov_struct import (Exchangeable, Independence,
                                           Autoregressive)
from statsmodels.genmod.families import Poisson

# Poisson GEE of ``author_count`` on ``top`` and ``mod`` with an
# independence working correlation, grouped by the ``author`` column.
# NOTE(review): ``authors`` is not defined in this script -- presumably a
# DataFrame loaded elsewhere (e.g. in an interactive session); confirm.
fam = Poisson()
ind = Independence()
model1 = GEE.from_formula(
    formula="author_count ~ top + mod",
    groups="author",
    data=authors,
    cov_struct=ind,
    family=fam,
)
result1 = model1.fit()
print(result1.summary())