def test_poisson_formula():
    """Compare array-based and formula-based Poisson fits.

    For both the MAP fit and the VB fit, a model built directly from the
    design arrays must produce the same parameters as a model built with
    ``from_formula`` from grouping columns reconstructed from ``exog_vc``.
    """
    from collections import OrderedDict

    y, exog_fe, exog_vc, ident = gen_crossed_poisson(10, 10, 1, 0.5)
    nobs = len(y)

    for use_vb in (False, True):
        array_model = PoissonBayesMixedGLM(y, exog_fe, exog_vc, ident)
        array_fit = array_model.fit_vb() if use_vb else array_model.fit_map()

        # Build categorical variables that match exog_vc: for each variance
        # component, number its indicator columns and store, per row, the
        # number of the column that equals 1.
        frame = pd.DataFrame({"y": y, "x1": exog_fe[:, 0]})
        for column, component in (("z1", 0), ("z2", 1)):
            codes = np.zeros(nobs)
            for level, col in enumerate(np.flatnonzero(ident == component)):
                codes[exog_vc[:, col] == 1] = level
            frame[column] = codes

        vc_formulas = OrderedDict()
        vc_formulas["z1"] = "0 + C(z1)"
        vc_formulas["z2"] = "0 + C(z2)"

        formula_model = PoissonBayesMixedGLM.from_formula(
            "y ~ 0 + x1", vc_formulas, frame)
        if use_vb:
            formula_fit = formula_model.fit_vb()
        else:
            formula_fit = formula_model.fit_map()

        assert_allclose(array_fit.params, formula_fit.params, rtol=1e-5)
def test_simple_poisson_vb():
    """Pin MAP and VB estimates for data from ``gen_simple_poisson``.

    The VB fit is started from the MAP parameters.  The first five
    parameters and the first five covariance entries of each fit are
    compared with stored reference values.
    """
    y, exog_fe, exog_vc, ident = gen_simple_poisson(10, 10, 1)
    vc_sparse = sparse.csr_matrix(exog_vc)

    map_fit = PoissonBayesMixedGLM(
        y, exog_fe, vc_sparse, ident, vcp_p=0.5).fit_map()
    vb_fit = PoissonBayesMixedGLM(
        y, exog_fe, vc_sparse, ident, vcp_p=0.5).fit_vb(map_fit.params)

    # Smoke test: building the summaries must not raise.
    map_fit.summary()
    vb_fit.summary()

    tol = {"rtol": 1e-4, "atol": 1e-4}

    map_params = np.array([
        -0.07233493, -0.06706505, -0.47159649, 1.12575122, -1.02442201])
    assert_allclose(map_fit.params[0:5], map_params, **tol)

    # NOTE(review): cov_params is read as an attribute here, not called;
    # confirm this matches the results API of the version under test.
    map_cov = np.array([
        0.00790914, 0.00080666, -0.00050719, 0.00022648, 0.00046235])
    assert_allclose(map_fit.cov_params.flat[0:5], map_cov, **tol)

    vb_params = np.array([
        -0.07088814, -0.06373107, -0.22770786, 1.12923746, -1.26161339])
    assert_allclose(vb_fit.params[0:5], vb_params, **tol)

    vb_cov = np.array([
        0.00747782, 0.0092554, 0.04508904, 0.02934488, 0.20312746])
    assert_allclose(vb_fit.cov_params[0:5], vb_cov, **tol)
def test_simple_poisson_map():
    """Check the MAP fit and ``predict`` on data from ``gen_simple_poisson``.

    Verifies that the log-posterior gradient is close to zero at the MAP
    estimate, that a second identical model reproduces the estimate, and
    that the results-level and model-level ``predict`` methods agree.
    """
    y, exog_fe, exog_vc, ident = gen_simple_poisson(10, 10, 0.2)
    vc_sparse = sparse.csr_matrix(exog_vc)

    first_model = PoissonBayesMixedGLM(
        y, exog_fe, vc_sparse, ident, vcp_p=0.5)
    first_fit = first_model.fit_map()
    score = first_model.logposterior_grad(first_fit.params)
    assert_allclose(score, np.zeros_like(first_fit.params), atol=1e-3)

    # This should give the same answer as above
    second_model = PoissonBayesMixedGLM(
        y, exog_fe, vc_sparse, ident, vcp_p=0.5)
    second_fit = second_model.fit_map()
    assert_allclose(first_fit.params, second_fit.params, atol=1e-4)

    # Test the predict method
    for linear in (False, True):
        for exog in (None, exog_fe):
            options = {"linear": linear, "exog": exog}
            predictions = [
                first_fit.predict(**options),
                second_fit.predict(**options),
                first_model.predict(first_fit.params, **options),
                second_model.predict(second_fit.params, **options),
            ]

            # Compare each prediction with the next one in the list.
            for left, right in zip(predictions[:-1], predictions[1:]):
                assert_allclose(left, right, rtol=1e-5)

            if not linear:
                # Non-negativity is checked on the first three predictions.
                for pred in predictions[:3]:
                    assert_equal(pred.min() >= 0, True)
def test_crossed_poisson_map():
    """Check that the MAP fit on ``gen_crossed_poisson`` data is stationary."""
    y, exog_fe, exog_vc, ident = gen_crossed_poisson(10, 10, 1, 1)
    vc_sparse = sparse.csr_matrix(exog_vc)

    model = PoissonBayesMixedGLM(y, exog_fe, vc_sparse, ident, vcp_p=0.5)
    fit = model.fit_map()

    # The log-posterior gradient should vanish at the fitted parameters.
    score = model.logposterior_grad(fit.params)
    assert_allclose(score, np.zeros_like(fit.params), atol=1e-4)
def test_simple_poisson_map():
    """Check that the MAP fit is stationary and reproducible.

    The log-posterior gradient must be close to zero at the estimate, and
    a second model built from the same inputs must give the same estimate.
    """
    y, exog_fe, exog_vc, ident = gen_simple_poisson(10, 10, 0.2)
    vc_sparse = sparse.csr_matrix(exog_vc)

    first_model = PoissonBayesMixedGLM(
        y, exog_fe, vc_sparse, ident, vcp_p=0.5)
    first_fit = first_model.fit_map()
    score = first_model.logposterior_grad(first_fit.params)
    assert_allclose(score, np.zeros_like(first_fit.params), atol=1e-3)

    # This should give the same answer as above
    second_fit = PoissonBayesMixedGLM(
        y, exog_fe, vc_sparse, ident, vcp_p=0.5).fit_map()
    assert_allclose(first_fit.params, second_fit.params, atol=1e-4)
def test_doc_examples():
    """Smoke-test the formula examples for the Binomial and Poisson models.

    Simulated village/year data are fit with ``from_formula`` followed by
    ``fit_vb``.  Nothing is asserted about the estimates; the test only
    requires that both fits run without raising.
    """
    np.random.seed(8767)
    n = 200
    m = 20
    data = pd.DataFrame({"Year": np.random.uniform(0, 1, n),
                         "Village": np.random.randint(0, m, n)})
    data['year_cen'] = data['Year'] - data.Year.mean()

    # Binomial outcome
    lpr = np.random.normal(size=m)[data.Village]
    lpr += np.random.normal(size=m)[data.Village] * data.year_cen
    y = (np.random.uniform(size=n) < 1 / (1 + np.exp(-lpr)))
    # Use the builtin int: the np.int alias was removed in NumPy 1.24.
    data["y"] = y.astype(int)

    # These lines should agree with the example in the class docstring.
    vc_formulas = {"a": '0 + C(Village)', "b": '0 + C(Village)*year_cen'}
    model = BinomialBayesMixedGLM.from_formula(
        'y ~ year_cen', vc_formulas, data)
    model.fit_vb()

    # Poisson outcome
    lpr = np.random.normal(size=m)[data.Village]
    lpr += np.random.normal(size=m)[data.Village] * data.year_cen
    data["y"] = np.random.poisson(np.exp(lpr))

    # These lines should agree with the example in the class docstring.
    vc_formulas = {"a": '0 + C(Village)', "b": '0 + C(Village)*year_cen'}
    model = PoissonBayesMixedGLM.from_formula(
        'y ~ year_cen', vc_formulas, data)
    model.fit_vb()
def test_simple_poisson_vb():
    """Pin MAP and VB estimates and check the shape of ``cov_params()``.

    The VB fit is started from the MAP parameters.  Leading parameters and
    covariance entries are compared with stored reference values.  The MAP
    covariance must be a ``p x p`` matrix that admits a Cholesky
    factorization; the VB covariance must be a length-``p`` vector of
    positive values.
    """
    y, exog_fe, exog_vc, ident = gen_simple_poisson(10, 10, 1)
    vc_sparse = sparse.csr_matrix(exog_vc)

    map_model = PoissonBayesMixedGLM(y, exog_fe, vc_sparse, ident, vcp_p=0.5)
    map_fit = map_model.fit_map()

    vb_model = PoissonBayesMixedGLM(y, exog_fe, vc_sparse, ident, vcp_p=0.5)
    vb_fit = vb_model.fit_vb(map_fit.params)

    # Smoke test: building the summaries must not raise.
    map_fit.summary()
    vb_fit.summary()

    tol = {"rtol": 1e-4, "atol": 1e-4}

    expected_map_params = np.array([
        -0.07233493, -0.06706505, -0.47159649, 1.12575122, -1.02442201])
    assert_allclose(map_fit.params[0:5], expected_map_params, **tol)

    expected_map_cov = np.array([
        0.00790914, 0.00080666, -0.00050719, 0.00022648, 0.00046235])
    assert_allclose(map_fit.cov_params().flat[0:5], expected_map_cov, **tol)

    expected_vb_params = np.array([
        -0.07088814, -0.06373107, -0.22770786, 1.12923746, -1.26161339])
    assert_allclose(vb_fit.params[0:5], expected_vb_params, **tol)

    expected_vb_cov = np.array([
        0.00747782, 0.0092554, 0.04508904, 0.02934488, 0.20312746])
    assert_allclose(vb_fit.cov_params()[0:5], expected_vb_cov, **tol)

    # MAP covariance: full matrix; cholesky raises if it is not
    # positive definite.
    map_cov = map_fit.cov_params()
    n_map = len(map_fit.params)
    assert_equal(map_cov.shape, np.r_[n_map, n_map])
    np.linalg.cholesky(map_cov)

    # VB covariance: one positive value per parameter.
    vb_cov = vb_fit.cov_params()
    n_vb = len(vb_fit.params)
    assert_equal(vb_cov.shape, np.r_[n_vb, ])
    assert_equal(vb_cov > 0, True * np.ones(n_vb))
def test_elbo_grad():
    """Compare the analytic ELBO gradient with a numerical gradient.

    Four cases are run in order: Binomial models on ``gen_simple_logit``
    and ``gen_crossed_logit`` data, then Poisson models on
    ``gen_simple_poisson`` and ``gen_crossed_poisson`` data.  For each
    model the gradient is checked at three points: the MAP estimate with
    unit standard deviations, the zero vector with unit standard
    deviations, and a randomly drawn mean/standard-deviation pair.
    """
    # Each entry pairs a model class with a deferred call to its data
    # generator, so the generators still run one at a time, in order.
    cases = (
        (BinomialBayesMixedGLM, lambda: gen_simple_logit(10, 10, 2)),
        (BinomialBayesMixedGLM, lambda: gen_crossed_logit(10, 10, 1, 2)),
        (PoissonBayesMixedGLM, lambda: gen_simple_poisson(10, 10, 0.5)),
        (PoissonBayesMixedGLM, lambda: gen_crossed_poisson(10, 10, 1, 0.5)),
    )

    for model_cls, simulate in cases:
        y, exog_fe, exog_vc, ident = simulate()
        exog_vc = sparse.csr_matrix(exog_vc)

        model = model_cls(y, exog_fe, exog_vc, ident, vcp_p=0.5)
        map_fit = model.fit_map()
        n_par = len(map_fit.params)

        def elbo(vec):
            # vec stacks the variational means on top of the
            # variational standard deviations.
            half = len(vec) // 2
            return model.vb_elbo(vec[:half], vec[half:])

        for trial in range(3):
            if trial == 0:
                vb_mean = map_fit.params
                vb_sd = np.ones_like(vb_mean)
            elif trial == 1:
                vb_mean = np.zeros(n_par)
                vb_sd = np.ones_like(vb_mean)
            else:
                vb_mean = np.random.normal(size=n_par)
                vb_sd = np.random.uniform(1, 2, size=n_par)

            mean_grad, sd_grad = model.vb_elbo_grad(vb_mean, vb_sd)

            stacked = np.concatenate((vb_mean, vb_sd))
            numeric = approx_fprime(stacked, elbo, 1e-5)
            split = len(stacked) // 2

            assert_allclose(mean_grad, numeric[:split], atol=1e-2, rtol=1e-2)
            assert_allclose(sd_grad, numeric[split:], atol=1e-2, rtol=1e-2)
def test_poisson_formula():
    """Compare array-based and formula-based Poisson fits.

    For both the MAP fit and the VB fit, the two interfaces must give the
    same parameters.  The shape of ``cov_params()`` is also checked: a
    length-``p`` positive vector for VB, and a ``p x p`` matrix that admits
    a Cholesky factorization for MAP.
    """
    from collections import OrderedDict

    y, exog_fe, exog_vc, ident = gen_crossed_poisson(10, 10, 1, 0.5)
    nobs = len(y)

    for use_vb in (False, True):
        array_model = PoissonBayesMixedGLM(y, exog_fe, exog_vc, ident)
        array_fit = array_model.fit_vb() if use_vb else array_model.fit_map()

        # Build categorical variables that match exog_vc: for each variance
        # component, number its indicator columns and store, per row, the
        # number of the column that equals 1.
        frame = pd.DataFrame({"y": y, "x1": exog_fe[:, 0]})
        for column, component in (("z1", 0), ("z2", 1)):
            codes = np.zeros(nobs)
            for level, col in enumerate(np.flatnonzero(ident == component)):
                codes[exog_vc[:, col] == 1] = level
            frame[column] = codes

        vc_formulas = OrderedDict()
        vc_formulas["z1"] = "0 + C(z1)"
        vc_formulas["z2"] = "0 + C(z2)"

        formula_model = PoissonBayesMixedGLM.from_formula(
            "y ~ 0 + x1", vc_formulas, frame)
        if use_vb:
            formula_fit = formula_model.fit_vb()
        else:
            formula_fit = formula_model.fit_map()

        assert_allclose(array_fit.params, formula_fit.params, rtol=1e-5)

        for fit in (array_fit, formula_fit):
            cov = fit.cov_params()
            n_par = len(fit.params)
            if not use_vb:
                # cholesky raises if the matrix is not positive definite.
                assert_equal(cov.shape, np.r_[n_par, n_par])
                np.linalg.cholesky(cov)
            else:
                assert_equal(cov.shape, np.r_[n_par, ])
                assert_equal(cov > 0, True * np.ones(n_par))
def test_crossed_poisson_map():
    """Check the MAP fit and its covariance on ``gen_crossed_poisson`` data.

    The log-posterior gradient must be close to zero at the estimate, and
    ``cov_params()`` must be a ``p x p`` matrix that admits a Cholesky
    factorization.
    """
    y, exog_fe, exog_vc, ident = gen_crossed_poisson(10, 10, 1, 1)
    vc_sparse = sparse.csr_matrix(exog_vc)

    model = PoissonBayesMixedGLM(y, exog_fe, vc_sparse, ident, vcp_p=0.5)
    fit = model.fit_map()

    score = model.logposterior_grad(fit.params)
    assert_allclose(score, np.zeros_like(fit.params), atol=1e-4)

    # Check dimensions and PSD status of cov_params
    cov = fit.cov_params()
    n_par = len(fit.params)
    assert_equal(cov.shape, np.r_[n_par, n_par])
    np.linalg.cholesky(cov)
def test_poisson_formula():
    """Compare array-based and formula-based Poisson fits.

    For both the MAP fit and the VB fit, the two interfaces must give the
    same parameters.  The shape of ``cov_params()`` is also checked: a
    length-``p`` positive vector for VB, and a ``p x p`` matrix that admits
    a Cholesky factorization for MAP.
    """
    y, exog_fe, exog_vc, ident = gen_crossed_poisson(10, 10, 1, 0.5)
    nobs = len(y)

    for use_vb in (False, True):
        array_model = PoissonBayesMixedGLM(y, exog_fe, exog_vc, ident)
        array_fit = array_model.fit_vb() if use_vb else array_model.fit_map()

        # Build categorical variables that match exog_vc: for each variance
        # component, number its indicator columns and store, per row, the
        # number of the column that equals 1.
        frame = pd.DataFrame({"y": y, "x1": exog_fe[:, 0]})
        for column, component in (("z1", 0), ("z2", 1)):
            codes = np.zeros(nobs)
            for level, col in enumerate(np.flatnonzero(ident == component)):
                codes[exog_vc[:, col] == 1] = level
            frame[column] = codes

        # Insertion order (z1, then z2) is kept as in a step-by-step build.
        vc_formulas = {}
        vc_formulas["z1"] = "0 + C(z1)"
        vc_formulas["z2"] = "0 + C(z2)"

        formula_model = PoissonBayesMixedGLM.from_formula(
            "y ~ 0 + x1", vc_formulas, frame)
        if use_vb:
            formula_fit = formula_model.fit_vb()
        else:
            formula_fit = formula_model.fit_map()

        assert_allclose(array_fit.params, formula_fit.params, rtol=1e-5)

        for fit in (array_fit, formula_fit):
            cov = fit.cov_params()
            n_par = len(fit.params)
            if not use_vb:
                # cholesky raises if the matrix is not positive definite.
                assert_equal(cov.shape, np.r_[n_par, n_par])
                np.linalg.cholesky(cov)
            else:
                assert_equal(cov.shape, np.r_[n_par, ])
                assert_equal(cov > 0, True * np.ones(n_par))
def test_crossed_poisson_map():
    """Check the MAP fit and its covariance on ``gen_crossed_poisson`` data.

    The log-posterior gradient must be close to zero at the estimate, and
    ``cov_params()`` must be a ``p x p`` matrix that admits a Cholesky
    factorization.
    """
    y, exog_fe, exog_vc, ident = gen_crossed_poisson(10, 10, 1, 1)
    vc_sparse = sparse.csr_matrix(exog_vc)

    model = PoissonBayesMixedGLM(y, exog_fe, vc_sparse, ident, vcp_p=0.5)
    fit = model.fit_map()

    gradient = model.logposterior_grad(fit.params)
    zero = np.zeros_like(fit.params)
    assert_allclose(gradient, zero, atol=1e-4)

    # Check dimensions and PSD status of cov_params
    n_par = len(fit.params)
    cov = fit.cov_params()
    assert_equal(cov.shape, np.r_[n_par, n_par])
    np.linalg.cholesky(cov)
def test_crossed_poisson_vb():
    """Pin MAP and VB estimates for data from ``gen_crossed_poisson``.

    The VB fit is started from the MAP parameters; the first five
    parameters of each fit are compared with stored reference values.
    """
    y, exog_fe, exog_vc, ident = gen_crossed_poisson(10, 10, 1, 0.5)
    priors = {"vcp_p": 0.5, "fe_p": 0.5}

    map_fit = PoissonBayesMixedGLM(
        y, exog_fe, exog_vc, ident, **priors).fit_map()
    vb_fit = PoissonBayesMixedGLM(
        y, exog_fe, exog_vc, ident, **priors).fit_vb(mean=map_fit.params)

    # Smoke test: building the summaries must not raise.
    map_fit.summary()
    vb_fit.summary()

    expected_map = np.array([
        -0.54855281, 0.10458834, -0.68777741, -0.01699925, 0.77200546])
    assert_allclose(map_fit.params[0:5], expected_map, rtol=1e-4, atol=1e-4)

    expected_vb = np.array([
        -0.54691502, 0.22297158, -0.52673802, -0.06218684, 0.74385237])
    assert_allclose(vb_fit.params[0:5], expected_vb, rtol=1e-4, atol=1e-4)
def test_crossed_poisson_vb():
    """Pin MAP and VB estimates for data from ``gen_crossed_poisson``.

    The VB fit is started from the MAP parameters; the first five
    parameters of each fit are compared with stored reference values.
    """
    y, exog_fe, exog_vc, ident = gen_crossed_poisson(10, 10, 1, 0.5)

    map_model = PoissonBayesMixedGLM(
        y, exog_fe, exog_vc, ident, vcp_p=0.5, fe_p=0.5)
    map_fit = map_model.fit_map()

    vb_model = PoissonBayesMixedGLM(
        y, exog_fe, exog_vc, ident, vcp_p=0.5, fe_p=0.5)
    vb_fit = vb_model.fit_vb(mean=map_fit.params)

    # Smoke test: building the summaries must not raise.
    for fit in (map_fit, vb_fit):
        fit.summary()

    tol = {"rtol": 1e-4, "atol": 1e-4}
    expected_map = np.array([
        -0.54855281, 0.10458834, -0.68777741, -0.01699925, 0.77200546])
    expected_vb = np.array([
        -0.54691502, 0.22297158, -0.52673802, -0.06218684, 0.74385237])

    assert_allclose(map_fit.params[0:5], expected_map, **tol)
    assert_allclose(vb_fit.params[0:5], expected_vb, **tol)
def test_crossed_poisson_vb():
    """Pin MAP and VB estimates and check the shape of ``cov_params()``.

    The VB fit is started from the MAP parameters.  The first five
    parameters of each fit are compared with stored reference values.  The
    MAP covariance must be a ``p x p`` matrix that admits a Cholesky
    factorization; the VB covariance must be a length-``p`` vector of
    positive values.
    """
    y, exog_fe, exog_vc, ident = gen_crossed_poisson(10, 10, 1, 0.5)
    priors = {"vcp_p": 0.5, "fe_p": 0.5}

    map_fit = PoissonBayesMixedGLM(
        y, exog_fe, exog_vc, ident, **priors).fit_map()
    vb_fit = PoissonBayesMixedGLM(
        y, exog_fe, exog_vc, ident, **priors).fit_vb(mean=map_fit.params)

    # Smoke test: building the summaries must not raise.
    map_fit.summary()
    vb_fit.summary()

    expected_map = np.array([
        -0.54855281, 0.10458834, -0.68777741, -0.01699925, 0.77200546])
    assert_allclose(map_fit.params[0:5], expected_map, rtol=1e-4, atol=1e-4)

    expected_vb = np.array([
        -0.54691502, 0.22297158, -0.52673802, -0.06218684, 0.74385237])
    assert_allclose(vb_fit.params[0:5], expected_vb, rtol=1e-4, atol=1e-4)

    # MAP covariance: full matrix; cholesky raises if it is not
    # positive definite.
    map_cov = map_fit.cov_params()
    n_map = len(map_fit.params)
    assert_equal(map_cov.shape, np.r_[n_map, n_map])
    np.linalg.cholesky(map_cov)

    # VB covariance: one positive value per parameter.
    vb_cov = vb_fit.cov_params()
    n_vb = len(vb_fit.params)
    assert_equal(vb_cov.shape, np.r_[n_vb, ])
    assert_equal(vb_cov > 0, True * np.ones(n_vb))
def test_simple_poisson_map():
    """Check the MAP fit, ``predict`` and ``cov_params()``.

    Verifies that the log-posterior gradient is close to zero at the MAP
    estimate, that a second identical model reproduces the estimate, that
    the results-level and model-level ``predict`` methods agree, and that
    each covariance is a ``p x p`` matrix that admits a Cholesky
    factorization.
    """
    y, exog_fe, exog_vc, ident = gen_simple_poisson(10, 10, 0.2)
    vc_sparse = sparse.csr_matrix(exog_vc)

    first_model = PoissonBayesMixedGLM(
        y, exog_fe, vc_sparse, ident, vcp_p=0.5)
    first_fit = first_model.fit_map()
    score = first_model.logposterior_grad(first_fit.params)
    assert_allclose(score, np.zeros_like(first_fit.params), atol=1e-3)

    # This should give the same answer as above
    second_model = PoissonBayesMixedGLM(
        y, exog_fe, vc_sparse, ident, vcp_p=0.5)
    second_fit = second_model.fit_map()
    assert_allclose(first_fit.params, second_fit.params, atol=1e-4)

    # Test the predict method
    for linear in (False, True):
        for exog in (None, exog_fe):
            options = {"linear": linear, "exog": exog}
            predictions = [
                first_fit.predict(**options),
                second_fit.predict(**options),
                first_model.predict(first_fit.params, **options),
                second_model.predict(second_fit.params, **options),
            ]

            # Compare each prediction with the next one in the list.
            for left, right in zip(predictions[:-1], predictions[1:]):
                assert_allclose(left, right, rtol=1e-5)

            if not linear:
                # Non-negativity is checked on the first three predictions.
                for pred in predictions[:3]:
                    assert_equal(pred.min() >= 0, True)

    # Check dimensions and PSD status of cov_params
    for fit in (first_fit, second_fit):
        cov = fit.cov_params()
        n_par = len(fit.params)
        assert_equal(cov.shape, np.r_[n_par, n_par])
        np.linalg.cholesky(cov)
def test_simple_poisson_vb():
    """Pin MAP and VB estimates and check the shape of ``cov_params()``.

    The VB fit is started from the MAP parameters.  Leading parameters and
    covariance entries are compared with stored reference values.  The MAP
    covariance must be a ``p x p`` matrix that admits a Cholesky
    factorization; the VB covariance must be a length-``p`` vector of
    positive values.
    """
    y, exog_fe, exog_vc, ident = gen_simple_poisson(10, 10, 1)
    vc_sparse = sparse.csr_matrix(exog_vc)

    map_fit = PoissonBayesMixedGLM(
        y, exog_fe, vc_sparse, ident, vcp_p=0.5).fit_map()
    vb_fit = PoissonBayesMixedGLM(
        y, exog_fe, vc_sparse, ident, vcp_p=0.5).fit_vb(map_fit.params)

    # Smoke test: building the summaries must not raise.
    for fit in (map_fit, vb_fit):
        fit.summary()

    tol = {"rtol": 1e-4, "atol": 1e-4}

    ref_map_params = np.array([
        -0.07233493, -0.06706505, -0.47159649, 1.12575122, -1.02442201])
    assert_allclose(map_fit.params[0:5], ref_map_params, **tol)

    ref_map_cov = np.array([
        0.00790914, 0.00080666, -0.00050719, 0.00022648, 0.00046235])
    assert_allclose(map_fit.cov_params().flat[0:5], ref_map_cov, **tol)

    ref_vb_params = np.array([
        -0.07088814, -0.06373107, -0.22770786, 1.12923746, -1.26161339])
    assert_allclose(vb_fit.params[0:5], ref_vb_params, **tol)

    ref_vb_cov = np.array([
        0.00747782, 0.0092554, 0.04508904, 0.02934488, 0.20312746])
    assert_allclose(vb_fit.cov_params()[0:5], ref_vb_cov, **tol)

    for fit in (map_fit, vb_fit):
        cov = fit.cov_params()
        n_par = len(fit.params)
        if fit is vb_fit:
            # VB covariance: one positive value per parameter.
            assert_equal(cov.shape, np.r_[n_par, ])
            assert_equal(cov > 0, True * np.ones(n_par))
        else:
            # MAP covariance: full matrix; cholesky raises if it is not
            # positive definite.
            assert_equal(cov.shape, np.r_[n_par, n_par])
            np.linalg.cholesky(cov)
def test_crossed_poisson_vb():
    """Pin MAP and VB estimates and check the shape of ``cov_params()``.

    The VB fit is started from the MAP parameters.  The first five
    parameters of each fit are compared with stored reference values.  The
    MAP covariance must be a ``p x p`` matrix that admits a Cholesky
    factorization; the VB covariance must be a length-``p`` vector of
    positive values.
    """
    y, exog_fe, exog_vc, ident = gen_crossed_poisson(10, 10, 1, 0.5)

    map_model = PoissonBayesMixedGLM(
        y, exog_fe, exog_vc, ident, vcp_p=0.5, fe_p=0.5)
    map_fit = map_model.fit_map()

    vb_model = PoissonBayesMixedGLM(
        y, exog_fe, exog_vc, ident, vcp_p=0.5, fe_p=0.5)
    vb_fit = vb_model.fit_vb(mean=map_fit.params)

    # Smoke test: building the summaries must not raise.
    for fit in (map_fit, vb_fit):
        fit.summary()

    tol = {"rtol": 1e-4, "atol": 1e-4}
    ref_map = np.array([
        -0.54855281, 0.10458834, -0.68777741, -0.01699925, 0.77200546])
    ref_vb = np.array([
        -0.54691502, 0.22297158, -0.52673802, -0.06218684, 0.74385237])

    assert_allclose(map_fit.params[0:5], ref_map, **tol)
    assert_allclose(vb_fit.params[0:5], ref_vb, **tol)

    for fit in (map_fit, vb_fit):
        cov = fit.cov_params()
        n_par = len(fit.params)
        if fit is vb_fit:
            # VB covariance: one positive value per parameter.
            assert_equal(cov.shape, np.r_[n_par, ])
            assert_equal(cov > 0, True * np.ones(n_par))
        else:
            # MAP covariance: full matrix; cholesky raises if it is not
            # positive definite.
            assert_equal(cov.shape, np.r_[n_par, n_par])
            np.linalg.cholesky(cov)
nfl_rush_2019 = nfl_2019[nfl_2019["play_type"] == "rush"] # first, fit rush outcome models # 1 - touchdown rush_penalty_mod = BinomialBayesMixedGLM.from_formula( 'penalty ~ shotgun + no_huddle + qb_dropback + run_location + run_gap', ['0 + rusher_id', '0 + def_id'], data=nfl_rush_2019) rush_penalty_result = rush_penalty_mod.fit_vb # 2 - rushing yards rush_yard_mod = PoissonBayesMixedGLM.from_formula( 'yards_gained ~ shotgun + no_huddle + qb_dropback + run_location + run_gap', ['0 + rusher_id', '0 + def_id'], data=nfl_rush_2019) rush_yard_result = rush_yard_mod.fit_vb() # 3 - rushing turnovers (fumbles) rush_turnover_mod = BinomialBayesMixedGLM.from_formula( 'turnover ~ shotgun + no_huddle + qb_dropback + run_location + run_gap', ['0 + rusher_id', '0 + def_id'], data=nfl_rush_2019) rush_turnover_result = rush_turnover_mod.fit_vb() # 4 - penalty (holding) rush_penalty_mod = BinomialBayesMixedGLM.from_formula( 'penalty ~ shotgun + no_huddle + qb_dropback + run_location + run_gap',
# Q-Q plot of the Pearson residuals of the second GEE model.
stats.probplot(gee_model2_results.resid_pearson, plot=plt, fit=True)
axs[1,0].set_xlabel("Theoretical quantiles",fontsize=10)
axs[1,0].set_ylabel("Sample quantiles",fontsize=10)
axs[1,0].set_title("Q-Q plot of normalized residuals",fontsize=10)
plt.subplots_adjust(left=0.12, hspace=0.25)
plt.show()

# Model comparison: print the QIC of each GEE model.
print(gee_model2_results.qic())
print(gee_model1_results.qic())

# Both models improve, but are not satisfactory, probably because they cannot
# take account of excessive zeros and they only use cluster-robust standard
# errors, and thus cannot model how lower-level coefficients vary across
# groups of the higher level. Python statsmodels has zero-inflated count
# model methods, but they cannot deal with panel/clustered data.

# Approach two to generalized linear models for panel data: generalized
# linear mixed effects models, which support Poisson models using Bayesian
# methods.

# Poisson mixed effects model with one random effect
formula = "cases_count_pos ~ week_of_year + percent_age65over + percent_female + percent_black"
po_bay_panel1 = PoissonBayesMixedGLM.from_formula(formula, {'state': '0 + C(state)'}, US_cases_long_demogr_week)
po_bay_panel1_results = po_bay_panel1.fit_map()
print(po_bay_panel1_results.summary())

# Poisson mixed effects model with two independent random effects
formula = "cases_count_pos ~ week_of_year + percent_age65over + percent_female + percent_black"
po_bay_panel2 = PoissonBayesMixedGLM.from_formula(formula, {'state': '0 + C(state)', "week_of_year": '0 + C(week_of_year)'}, US_cases_long_demogr_week)
po_bay_panel2_results = po_bay_panel2.fit_map()
print(po_bay_panel2_results.summary())