def setup_class(cls):
    """Simulate Poisson data and fit the reference and offset models.

    The coefficient of the first regressor, taken from an unrestricted
    ``Poisson`` fit, turns that regressor into an offset.  The benchmark
    ``Poisson`` model and ``PoissonOffsetGMLE`` are then fitted on the
    remaining columns with that offset.

    Sets ``cls.res_glm``, ``cls.res_discrete`` and ``cls.res``.
    """
    # generate artificial data; the seed keeps the draws reproducible
    np.random.seed(98765678)
    nobs = 200
    rvs = np.random.randn(nobs, 6)
    data_exog = sm.add_constant(rvs, prepend=False)
    xbeta = 1 + 0.1 * rvs.sum(1)
    data_endog = np.random.poisson(np.exp(xbeta))

    # create offset variable based on first exog
    res_full = Poisson(data_endog, data_exog).fit(disp=0)
    offset = res_full.params[0] * data_exog[:, 0]

    # estimate discretemod.Poisson as benchmark, now has offset
    cls.res_discrete = Poisson(data_endog, data_exog[:, 1:],
                               offset=offset).fit(disp=0)

    # GLM on the full design (no offset).  One fit is enough: a second fit
    # with identical inputs would only repeat the same work.
    mod_glm = sm.GLM(data_endog, data_exog, family=sm.families.Poisson())
    cls.res_glm = mod_glm.fit()

    modo = PoissonOffsetGMLE(data_endog, data_exog[:, 1:], offset=offset)
    cls.res = modo.fit(start_params=0.9 * cls.res_discrete.params,
                       method='bfgs', disp=0)
def __init__(self):
    """Simulate Poisson data and fit GLM, Poisson and zero-inflated GMLE.

    Stores the fits as ``self.res_glm``, ``self.res_discrete`` and
    ``self.res`` and sets the comparison precision ``self.decimal``.
    """
    # artificial data, seeded for reproducibility
    np.random.seed(98765678)
    n_obs = 200
    draws = np.random.randn(n_obs, 6)
    design = sm.add_constant(draws, prepend=False)
    linpred = 1 + 0.1 * draws.sum(1)
    counts = np.random.poisson(np.exp(linpred))

    glm_model = sm.GLM(counts, design, family=sm.families.Poisson())
    self.res_glm = glm_model.fit()

    # offset built from the first regressor and its unrestricted estimate
    self.res_discrete = Poisson(counts, design).fit(disp=0)
    offset = self.res_discrete.params[0] * design[:, 0]

    # discrete Poisson benchmark, now estimated with the offset
    regressors = design[:, 1:]
    self.res_discrete = Poisson(counts, regressors,
                                offset=offset).fit(disp=0)

    # Note: ZI has one extra parameter, appended to the start values
    start = np.r_[0.9 * self.res_discrete.params, 10]
    zi_model = PoissonZiGMLE(counts, regressors, offset=offset)
    self.res = zi_model.fit(start_params=start, method='bfgs', disp=0)

    self.decimal = 4
def junk():
    """Scratch example of constrained Poisson fits (not collected as a test).

    Relies on the module-level ``data`` frame and ``Poisson`` model.
    """
    # FIXME: make this into a test, or move/remove
    # Singular Matrix in mod1a.fit()
    import patsy

    # same as Stata default
    formula2 = 'deaths ~ C(agecat) + C(smokes) : C(agecat)'
    mod = Poisson.from_formula(formula2, data=data,
                               exposure=data['pyears'].values)
    mod.fit()

    # Both sides must name existing design columns; a stray character inside
    # the term name would make patsy fail to resolve the constraint.
    constraints = 'C(smokes)[T.1]:C(agecat)[3] = C(smokes)[T.1]:C(agecat)[4]'
    lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constraints)
    R, q = lc.coefs, lc.constants
    mod.fit_constrained(R, q, fit_kwds={'method': 'bfgs'})

    # example without offset
    formula1a = 'deaths ~ logpyears + smokes + C(agecat)'
    mod1a = Poisson.from_formula(formula1a, data=data)
    mod1a.fit()

    lc_1a = patsy.DesignInfo(mod1a.exog_names).linear_constraint(
        'C(agecat)[T.4] = C(agecat)[T.5]')
    mod1a.fit_constrained(lc_1a.coefs, lc_1a.constants,
                          fit_kwds={'method': 'newton'})
def __init__(self, endog, exog, exog_infl=None, offset=None, exposure=None,
             inflation='logit', missing='none', **kwargs):
    """Set up the zero-inflated model with a Poisson main model.

    All arguments are forwarded to the parent initializer; ``offset`` and
    ``exposure`` are additionally passed to the ``Poisson`` main model,
    which is built from the ``endog``/``exog`` stored on the instance.
    """
    super(ZeroInflatedPoisson, self).__init__(
        endog,
        exog,
        exog_infl=exog_infl,
        offset=offset,
        exposure=exposure,
        inflation=inflation,
        missing=missing,
        **kwargs)

    # result containers for the plain and the L1-regularized fits
    self.result_class = ZeroInflatedPoissonResults
    self.result_class_wrapper = ZeroInflatedPoissonResultsWrapper
    self.result_class_reg = L1ZeroInflatedPoissonResults
    self.result_class_reg_wrapper = L1ZeroInflatedPoissonResultsWrapper

    self.distribution = zipoisson
    self.model_main = Poisson(self.endog, self.exog,
                              offset=offset, exposure=exposure)
def junk():
    """Scratch example: constrained Poisson fits with and without exposure.

    Prints the design shape of the second model and the two elements of
    its constrained fit.  Relies on the module-level ``data`` frame.
    """
    # Singular Matrix in mod1a.fit()
    import patsy

    # same as Stata default
    formula2 = 'deaths ~ C(agecat) + C(smokes) : C(agecat)'
    mod = Poisson.from_formula(formula2, data=data,
                               exposure=data['pyears'].values)
    # fit results are not inspected here, so they are not bound to names
    mod.fit()

    constraints = 'C(smokes)[T.1]:C(agecat)[3] = C(smokes)[T.1]:C(agecat)[4]'
    lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constraints)
    R, q = lc.coefs, lc.constants
    mod.fit_constrained(R, q, fit_kwds={'method': 'bfgs'})

    # example without offset
    formula1a = 'deaths ~ logpyears + smokes + C(agecat)'
    mod1a = Poisson.from_formula(formula1a, data=data)
    print(mod1a.exog.shape)
    mod1a.fit()

    lc_1a = patsy.DesignInfo(mod1a.exog_names).linear_constraint(
        'C(agecat)[T.4] = C(agecat)[T.5]')
    resc1a = mod1a.fit_constrained(lc_1a.coefs, lc_1a.constants,
                                   fit_kwds={'method': 'newton'})
    print(resc1a[0])
    print(resc1a[1])
def setup_class(cls):
    """Simulate two-group Poisson counts and fit the reference model.

    Stores ``exog``, ``endog``, the fitted result ``res`` and ``nobs`` on
    the class.
    """
    expected_params = [1, 1, 0.5]
    np.random.seed(987123)
    nobs = 500

    # constant plus an indicator that is 0 for the first half of the sample
    indicator = (np.arange(nobs) >= nobs // 2).astype(float)
    exog = np.column_stack((np.ones(nobs), indicator))

    # offset is used to create misspecification of the model for the
    # predicted probabilities conditional moment test; disabled here
    offset = 0
    mu_true = np.exp(exog.dot(expected_params[:-1]) + offset)
    endog_poi = np.random.poisson(mu_true / 5)

    res_poi = Poisson(endog_poi, exog).fit(method='bfgs', maxiter=5000,
                                           maxfun=5000)

    cls.nobs = nobs
    cls.res = res_poi
    cls.endog = endog_poi
    cls.exog = exog
def poisson_regression(self, endog, exog, clean_data="greedy"):
    """Fit a Poisson regression of ``endog`` on ``exog`` and describe it.

    The columns are cleaned through the sheet returned by
    ``map_column_to_sheet``, a constant is added to the regressors, and the
    fit summary is returned in a ``QueryResult`` together with a textual
    description.
    """
    requested_endog, requested_exog = endog, exog
    sheet = self.map_column_to_sheet(endog)

    # prepare data: clean all involved columns together
    columns = np.append(np.copy(exog), endog)
    cleaned = sheet.cleanData(columns, clean_data)
    design = sm.add_constant(cleaned[exog])
    response = cleaned[endog]

    fit = Poisson(response, design).fit()

    utterance = "".join([
        "Here are the results of a Poisson regression with endogenous variables ",
        str(requested_endog),
        " and exogenous variables ",
        str(requested_exog),
        ".\n",
        str(fit.summary()),
    ])
    return QueryResult(fit.summary(), utterance)
def junk():
    """Scratch example of constrained Poisson estimation.

    Fits one model with exposure and one with ``logpyears`` as regressor,
    each under a linear constraint, and prints the second constrained fit.
    Relies on the module-level ``data`` frame.
    """
    # Singular Matrix in mod1a.fit()
    import patsy

    # --- model with exposure (same as Stata default) ---
    formula2 = 'deaths ~ C(agecat) + C(smokes) : C(agecat)'
    mod = Poisson.from_formula(
        formula2, data=data, exposure=data['pyears'].values)
    # results of the plain fits are never used, so no names are bound
    mod.fit()

    constraints = 'C(smokes)[T.1]:C(agecat)[3] = C(smokes)[T.1]:C(agecat)[4]'
    lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constraints)
    mod.fit_constrained(lc.coefs, lc.constants, fit_kwds={'method': 'bfgs'})

    # --- example without offset ---
    formula1a = 'deaths ~ logpyears + smokes + C(agecat)'
    mod1a = Poisson.from_formula(formula1a, data=data)
    print(mod1a.exog.shape)
    mod1a.fit()

    lc_1a = patsy.DesignInfo(mod1a.exog_names).linear_constraint(
        'C(agecat)[T.4] = C(agecat)[T.5]')
    resc1a = mod1a.fit_constrained(
        lc_1a.coefs, lc_1a.constants, fit_kwds={'method': 'newton'})
    print(resc1a[0])
    print(resc1a[1])
def test_netchop_improvement(key):
    """Fit a Poisson model of ``ddf[key]`` on ``method_simultaneous``.

    Prints the fit summary and returns the fitted result.
    """
    counts = ddf[key].values
    design = add_constant(ddf.method_simultaneous)
    model = Poisson(counts, design)
    res = model.fit()
    print(res.summary())
    return res
def setup_class(cls):
    """Create simulated Poisson data and attach the fitted model to ``cls``.

    Sets ``cls.exog``, ``cls.endog``, ``cls.res`` and ``cls.nobs``.
    """
    expected_params = [1, 1, 0.5]
    np.random.seed(987123)
    nobs = 500
    half = nobs // 2

    exog = np.ones((nobs, 2))
    exog[:half, 1] = 0

    # An offset could be used to misspecify the model for the predicted
    # probabilities conditional moment test; it is switched off here.
    offset = 0
    linpred = exog.dot(expected_params[:-1]) + offset
    mu_true = np.exp(linpred)
    endog_poi = np.random.poisson(mu_true / 5)

    model_poi = Poisson(endog_poi, exog)
    cls.res = model_poi.fit(method='bfgs', maxiter=5000, maxfun=5000)
    cls.exog = exog
    cls.endog = endog_poi
    cls.nobs = nobs
def _initialize(cls):
    """Fit the plain Poisson reference and the penalized model without penalty.

    With ``pen_weight`` set to zero the penalized fit (``cls.res1``) is
    compared against the unpenalized one (``cls.res2``) at ``cls.atol``.
    """
    endog, exog = cls.y, cls.x

    cls.res2 = Poisson(endog, exog).fit(disp=0)

    penalized = PoissonPenalized(endog, exog)
    penalized.pen_weight = 0
    cls.res1 = penalized.fit(method='bfgs', maxiter=100, disp=0)

    cls.atol = 5e-6
def setup_class(cls):
    """Fit Poisson from fixed start values and compute dummy marginal effects.

    The Stata reference results are attached as ``cls.res1``;
    ``cls.res1_slice`` lists the positions used for the comparison.
    """
    # here we don't need to check convergence from default start_params
    start_params = [14.1709, 0.7085, -3.4548, -0.539,
                    3.2368, -7.9299, -5.0529]
    fitted = Poisson(endog, exog).fit(start_params=start_params)

    cls.res = fitted
    cls.margeff = fitted.get_margeff(dummy=True)
    cls.res1_slice = [0, 1, 2, 3, 5, 6]
    cls.res1 = res_stata.results_poisson_margins_dummy
def setup_class(cls):
    """Fit the same Poisson regression with GLM and discrete ``Poisson``.

    The influence objects of the two fits are stored as ``cls.infl0`` (GLM)
    and ``cls.infl1`` (discrete model).
    """
    from statsmodels.discrete.discrete_model import Poisson

    frame = data_bin
    response = frame['constrict']

    glm_model = GLM(response, frame[['const', 'log_rate', 'log_volumne']],
                    family=families.Poisson())
    glm_res = glm_model.fit(attach_wls=True, atol=1e-10)

    discrete_model = Poisson(response,
                             frame[['const', 'log_rate', 'log_volumne']])
    discrete_res = discrete_model.fit(tol=1e-10)

    cls.infl0 = glm_res.get_influence()
    cls.infl1 = discrete_res.get_influence()
def _initialize(cls):
    """Fit the oracle Poisson model and the penalized model on all columns.

    The oracle (``cls.res2``) only uses the first ``cls.k_nonzero``
    columns; ``cls.exog_index`` selects the same columns from the
    penalized fit (``cls.res1``).
    """
    endog, exog = cls.y, cls.x
    k_nonzero = cls.k_nonzero

    oracle = Poisson(endog, exog[:, :k_nonzero])
    cls.res2 = oracle.fit(disp=0)

    penalized = PoissonPenalized(endog, exog, penal=cls.penalty)
    # tuning of the penalty has to happen before the fit
    penalized.pen_weight *= 1.5
    penalized.penal.tau = 0.05
    cls.res1 = penalized.fit(method='bfgs', maxiter=100, disp=0)

    cls.exog_index = slice(None, cls.k_nonzero, None)
    cls.atol = 5e-3
def setup_class(cls):
    """Prepare the Poisson fit and dummy marginal effects for comparison.

    Reference values come from ``res_stata.results_poisson_margins_dummy``.
    """
    # convergence from default start_params is not what is tested here,
    # so the optimizer is started from fixed values
    start_params = [
        14.1709, 0.7085, -3.4548, -0.539, 3.2368, -7.9299, -5.0529,
    ]
    model = Poisson(endog, exog)
    result = model.fit(start_params=start_params)
    effects = result.get_margeff(dummy=True)

    cls.res1 = res_stata.results_poisson_margins_dummy
    cls.res1_slice = [0, 1, 2, 3, 5, 6]
    cls.margeff = effects
    cls.res = result
def setup_class(cls):
    """Fit Poisson from fixed start values and compute marginal effects.

    The Stata reference results for continuous margins are attached as
    ``cls.res1``; all entries are compared (``cls.res1_slice``).
    """
    # here we don't need to check convergence from default start_params
    start_params = [14.1709, 0.7085, -3.4548, -0.539,
                    3.2368, -7.9299, -5.0529]
    fitted = Poisson(endog, exog).fit(start_params=start_params)

    cls.res = fitted
    cls.margeff = fitted.get_margeff()
    cls.rtol_fac = 1
    cls.res1_slice = slice(None, None, None)
    cls.res1 = res_stata.results_poisson_margins_cont
def setup_class(cls):
    """Prepare the Poisson fit and default marginal effects for comparison.

    Reference values come from ``res_stata.results_poisson_margins_cont``.
    """
    # convergence from default start_params is not what is tested here,
    # so the optimizer is started from fixed values
    start_params = [
        14.1709, 0.7085, -3.4548, -0.539, 3.2368, -7.9299, -5.0529,
    ]
    model = Poisson(endog, exog)
    result = model.fit(start_params=start_params)
    effects = result.get_margeff()

    cls.res1 = res_stata.results_poisson_margins_cont
    cls.res1_slice = slice(None, None, None)
    cls.rtol_fac = 1
    cls.margeff = effects
    cls.res = result
def setup_class(cls):
    """Constrained Poisson fit with ``log(pyears)`` passed as offset.

    ``cls.res1`` comes from the ``fit_constrained`` function, ``cls.res1m``
    from the model method started at the parameters of ``cls.res1``.
    """
    cls.res2 = results.results_exposure_constraint2
    cls.idx = [6, 2, 3, 4, 5, 0]  # 2 is dropped baseline for categorical

    log_exposure = np.log(data['pyears'].values)
    mod = Poisson.from_formula('deaths ~ smokes + C(agecat)', data=data,
                               offset=log_exposure)

    constr = 'C(agecat)[T.5] - C(agecat)[T.4] = 0.5'
    lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
    newton_kwds = {'method': 'newton', 'disp': 0}
    cls.res1 = fit_constrained(mod, lc.coefs, lc.constants,
                               fit_kwds=newton_kwds)
    cls.constraints = lc

    # TODO: bfgs fails
    # test method of Poisson, not monkey patched
    cls.res1m = mod.fit_constrained(constr, method='bfgs', disp=0,
                                    start_params=cls.res1[0])
def setup_class(cls):
    """Constrained Poisson fit with ``logpyears`` as a regular regressor.

    ``cls.res1`` comes from the ``fit_constrained`` function, ``cls.res1m``
    from the model method; both start from the same explicit start values.
    """
    cls.res2 = results.results_noexposure_constraint2
    cls.idx = [7, 3, 4, 5, 6, 0, 1]  # 2 is dropped baseline for categorical

    # example without offset
    formula = 'deaths ~ logpyears + smokes + C(agecat)'
    mod = Poisson.from_formula(formula, data=data)

    # get start_params, example fails to converge on one py TravisCI
    k_vars = len(mod.exog_names)
    start_params = np.zeros(k_vars)
    start_params[0] = np.log(mod.endog.mean())
    # If better start values are ever needed, these are the desired params:
    #   [-9.43762015, 1.52762442, 2.74155711, 3.58730007, 4.08730007,
    #    1.15987869, 0.12111539]

    constr = 'C(agecat)[T.5] - C(agecat)[T.4] = 0.5'
    lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
    cls.res1 = fit_constrained(mod, lc.coefs, lc.constants,
                               start_params=start_params,
                               fit_kwds={'method': 'bfgs', 'disp': 0})

    # TODO: Newton fails
    # test method of Poisson, not monkey patched
    cls.res1m = mod.fit_constrained(constr, start_params=start_params,
                                    method='bfgs', disp=0)
def setupClass(cls):
    """Fit Poisson on the RAND HIE data and load the reference results."""
    randhie = sm.datasets.randhie.load()
    design = sm.add_constant(randhie.exog, prepend=False)
    model = Poisson(randhie.endog, design)
    cls.res1 = model.fit(method='newton', disp=0)

    reference = RandHIE()
    reference.poisson()
    cls.res2 = reference
def setup_class(cls):
    """Poisson fit with an equality constraint, ``logpyears`` as regressor.

    ``cls.res1`` comes from the ``fit_constrained`` function, ``cls.res1m``
    from the model method; both start from the same explicit start values.
    """
    cls.res2 = results.results_noexposure_constraint
    cls.idx = [7, 3, 4, 5, 6, 0, 1]  # 2 is dropped baseline for categorical

    # example without offset
    formula = 'deaths ~ logpyears + smokes + C(agecat)'
    mod = Poisson.from_formula(formula, data=data)

    # get start_params, example fails to converge on one py TravisCI
    k_vars = len(mod.exog_names)
    start_params = np.zeros(k_vars)
    start_params[0] = np.log(mod.endog.mean())
    # If better start values are ever needed, these are the desired params:
    #   [-3.93478643, 1.37276214, 2.33077032, 2.71338891, 2.71338891,
    #    0.57966535, 0.97254074]

    constr = 'C(agecat)[T.4] = C(agecat)[T.5]'
    lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
    cls.res1 = fit_constrained(mod, lc.coefs, lc.constants,
                               start_params=start_params,
                               fit_kwds={'method': 'bfgs', 'disp': 0})

    # TODO: Newton fails
    # test method of Poisson, not monkey patched
    cls.res1m = mod.fit_constrained(constr, start_params=start_params,
                                    method='bfgs', disp=0)
def __init__(self):
    """Simulate Poisson data and fit GLM, Poisson and the offset GMLE.

    The offset is the first regressor scaled by its ``Poisson`` estimate;
    ``PoissonOffsetGMLE`` is started near the remaining estimates.
    """
    # artificial data, seeded for reproducibility
    np.random.seed(98765678)
    n_obs = 200
    draws = np.random.randn(n_obs, 6)
    design = sm.add_constant(draws, prepend=False)
    linpred = 1 + 0.1 * draws.sum(1)
    counts = np.random.poisson(np.exp(linpred))

    # discrete Poisson as benchmark
    self.res_discrete = Poisson(counts, design).fit(disp=0)

    glm_model = sm.GLM(counts, design, family=sm.families.Poisson())
    self.res_glm = glm_model.fit()

    benchmark_params = self.res_discrete.params
    offset = benchmark_params[0] * design[:, 0]

    offset_model = PoissonOffsetGMLE(counts, design[:, 1:], offset=offset)
    self.res = offset_model.fit(
        start_params=0.9 * self.res_discrete.params[1:],
        method='nm',
        disp=0)
def test_compare_glm_poisson(self):
    """Compare ``self.res1m`` with a constrained Poisson fit using exposure.

    The Poisson model is started at the parameters of ``self.res1m``, so
    both fits are expected to agree to high precision.
    """
    res1 = self.res1m

    formula = 'deaths ~ smokes + C(agecat)'
    mod = Poisson.from_formula(formula, data=data,
                               exposure=data['pyears'].values)

    constr = 'C(agecat)[T.4] = C(agecat)[T.5]'
    res2 = mod.fit_constrained(constr, start_params=self.res1m.params,
                               method='newton', warn_convergence=False,
                               disp=0)

    # we get high precision because we use the params as start_params

    # basic, just as check that we have the same model
    assert_allclose(res1.params, res2.params, rtol=1e-12)
    assert_allclose(res1.bse, res2.bse, rtol=1e-12)

    # check predict, fitted, ...
    predicted = res1.predict()
    assert_allclose(predicted, res2.predict(), rtol=1e-10)
    assert_allclose(res1.mu, predicted, rtol=1e-10)
    assert_allclose(res1.fittedvalues, predicted, rtol=1e-10)
    # compare the linear predictor across the two fits; comparing one
    # result with itself could never fail
    assert_allclose(res1.predict(linear=True), res2.predict(linear=True),
                    rtol=1e-10)
def test_compare_glm_poisson(self):
    """Check that ``self.res1m`` matches a constrained Poisson exposure fit.

    The Poisson model is started at the parameters of ``self.res1m``, so
    both fits are expected to agree to high precision.
    """
    res1 = self.res1m

    formula = 'deaths ~ smokes + C(agecat)'
    mod = Poisson.from_formula(
        formula, data=data,
        exposure=data['pyears'].values)

    constr = 'C(agecat)[T.4] = C(agecat)[T.5]'
    res2 = mod.fit_constrained(constr,
                               start_params=self.res1m.params,
                               method='newton',
                               warn_convergence=False,
                               disp=0)

    # we get high precision because we use the params as start_params

    # basic, just as check that we have the same model
    assert_allclose(res1.params, res2.params, rtol=1e-12)
    assert_allclose(res1.bse, res2.bse, rtol=1e-11)

    # check predict, fitted, ...
    predicted = res1.predict()
    assert_allclose(predicted, res2.predict(), rtol=1e-10)
    assert_allclose(res1.mu, predicted, rtol=1e-10)
    assert_allclose(res1.fittedvalues, predicted, rtol=1e-10)
    # the linear predictor has to be compared between res1 and res2; a
    # self-comparison of res2 would pass regardless of the models
    assert_allclose(res1.predict(linear=True),
                    res2.predict(linear=True),
                    rtol=1e-10)
def setupClass(cls):
    """Fit Poisson on the RAND HIE data and load the reference results."""
    from results.results_discrete import RandHIE

    randhie = sm.datasets.randhie.load()
    design = sm.add_constant(randhie.exog)
    model = Poisson(randhie.endog, design)
    cls.res1 = model.fit(method='newton', disp=0)

    reference = RandHIE()
    reference.poisson()
    cls.res2 = reference
def testSimulate(self):
    """Simulated data has the requested shape and recovers the parameters."""
    np.random.seed(123)
    n_obs = 100
    beta0 = np.r_[1.1, 2.2, 3.3, 4.4]

    y, X = poisson.simulate(n_obs, beta0)
    self.assertEqual(X.shape, (100, 4))
    self.assertEqual(y.shape, (100, ))

    # try to recover params using frequentist regression
    ml_fit = Poisson(y, X).fit()
    distance = np.linalg.norm(beta0 - ml_fit.params, 2)
    self.assertLess(distance, 2.0)
def setup_class(cls):
    """Simulate counts and build the full and the reduced Poisson model.

    ``cls.dispersed`` switches on an extra normal term in the linear
    predictor.  ``cls.exog_extra`` holds the two columns that are only
    part of ``cls.model_full``.
    """
    # copy-paste except for model
    nobs, k_vars = 500, 5
    np.random.seed(786452)

    x = np.random.randn(nobs, k_vars)
    x[:, 0] = 1
    x2 = np.random.randn(nobs, 2)

    linpred = x.sum(1) * 0.5
    if cls.dispersed:
        # unobserved heterogeneity produces overdispersed counts
        linpred = linpred + np.random.randn(nobs)
    y = np.random.poisson(np.exp(linpred))

    cls.exog_extra = x2
    cls.model_full = Poisson(y, np.column_stack((x, x2)))
    cls.model_drop = Poisson(y, x)
def setup_class(cls):
    """Unconstrained Poisson fit with ``log(pyears)`` passed as offset.

    ``cls.res1`` mirrors the ``(params, cov_params)`` pair used by the
    constrained cases so the same checks can be applied.
    """
    cls.res2 = results.results_exposure_noconstraint
    cls.idx = [6, 2, 3, 4, 5, 0]  # 1 is dropped baseline for categorical

    log_exposure = np.log(data['pyears'].values)
    mod = Poisson.from_formula('deaths ~ smokes + C(agecat)', data=data,
                               offset=log_exposure)
    raw_results = mod.fit(disp=0)._results

    # res1 is duplicate check, so we can follow the same pattern
    cls.res1 = (raw_results.params, raw_results.cov_params())
    cls.res1m = raw_results
def setup_class(cls):
    """Fit Poisson without constraints, using ``log(pyears)`` as offset.

    The unwrapped results are stored as ``cls.res1m``; ``cls.res1`` holds
    their parameters and covariance as a tuple.
    """
    cls.res2 = results.results_exposure_noconstraint
    cls.idx = [6, 2, 3, 4, 5, 0]  # 1 is dropped baseline for categorical

    formula = 'deaths ~ smokes + C(agecat)'
    offset = np.log(data['pyears'].values)
    mod = Poisson.from_formula(formula, data=data, offset=offset)

    fitted = mod.fit(disp=0)
    unwrapped = fitted._results
    # res1 is duplicate check, so we can follow the same pattern
    cls.res1 = (unwrapped.params, unwrapped.cov_params())
    cls.res1m = unwrapped
def fit(self, rs: "RecordSet") -> None:
    """
    Fit a regularized Poisson regression model.

    The last column of ``rs.entries`` is used as the response, all other
    columns as regressors.  The fitted result replaces ``self.model``.

    :param rs: The record set to fit with.
    :raises ValueError: if ``self.alpha`` is 0.
    """
    # keep a private copy so later changes to ``rs`` do not affect the model
    self.data = cp.deepcopy(rs)
    patterns = self.data.entries[:, :-1]
    out = self.data.entries[:, -1:]

    if self.add_intercept:
        intercept = np.ones((patterns.shape[0], 1))
        patterns = np.hstack((intercept, patterns))

    # avoid error: a zero penalty weight is rejected before fitting
    if self.alpha == 0:
        raise ValueError(
            "alpha must be non-zero to obtain reliable results from the "
            "regularized Poisson fit")

    self.model = Poisson(endog=out.ravel(), exog=patterns)
    # iteration limit is passed as an integer (same value as 10e8)
    self.model = self.model.fit_regularized(alpha=self.alpha,
                                            maxiter=10 ** 9,
                                            disp=False)
def setup_class(cls):
    """Fit a Poisson model with HC0 covariance on the anova test data.

    The first three rows of the anova data set are dropped.  ``term_name``
    and ``constraints`` describe the term whose Wald test is checked.
    """
    import statsmodels.stats.tests.test_anova as ttmod
    from statsmodels.discrete.discrete_model import Poisson

    anova_case = ttmod.TestAnova3()
    anova_case.setup_class()
    cls.data = anova_case.data.drop([0, 1, 2])

    model = Poisson.from_formula("Days ~ C(Duration) + C(Weight)", cls.data)
    cls.res = model.fit(cov_type='HC0')

    cls.term_name = "C(Weight)"
    cls.constraints = [
        'C(Weight)[T.2]',
        'C(Weight)[T.3]',
        'C(Weight)[T.3] - C(Weight)[T.2]',
    ]
def setup_class(cls):
    """Equality-constrained Poisson fit with ``log(pyears)`` as offset.

    ``cls.res1`` comes from the ``fit_constrained`` function, ``cls.res1m``
    from the model method, both using Newton.
    """
    cls.res2 = results.results_exposure_constraint
    cls.idx = [6, 2, 3, 4, 5, 0]  # 2 is dropped baseline for categorical

    log_exposure = np.log(data['pyears'].values)
    mod = Poisson.from_formula('deaths ~ smokes + C(agecat)', data=data,
                               offset=log_exposure)

    constr = 'C(agecat)[T.4] = C(agecat)[T.5]'
    lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
    cls.res1 = fit_constrained(mod, lc.coefs, lc.constants,
                               fit_kwds={'method': 'newton'})
    cls.constraints = lc

    # TODO: bfgs fails
    # test method of Poisson, not monkey patched
    cls.res1m = mod.fit_constrained(constr, method='newton')
def __init__(self):
    """Simulate Poisson data and fit GLM, Poisson and the generic MLE.

    ``PoissonGMLE`` is kept as ``self.mod`` and started near the discrete
    Poisson estimates.
    """
    # artificial data, seeded for reproducibility
    np.random.seed(98765678)
    n_obs = 200
    draws = np.random.randn(n_obs, 6)
    design = sm.add_constant(draws, prepend=False)
    linpred = 0.1 + 0.1 * draws.sum(1)
    counts = np.random.poisson(np.exp(linpred))

    # discrete Poisson as benchmark
    self.res_discrete = Poisson(counts, design).fit(disp=0)

    glm_model = sm.GLM(counts, design, family=sm.families.Poisson())
    self.res_glm = glm_model.fit()

    # generic MLE
    self.mod = PoissonGMLE(counts, design)
    start = 0.9 * self.res_discrete.params
    self.res = self.mod.fit(start_params=start, method='nm', disp=0)
def setup_class(cls):
    """Constrained Poisson fit with ``pyears`` passed as exposure.

    ``cls.res1`` comes from the ``fit_constrained`` function, ``cls.res1m``
    from the model method started at the parameters of ``cls.res1``.
    """
    cls.res2 = results.results_exposure_constraint2
    cls.idx = [6, 2, 3, 4, 5, 0]  # 2 is dropped baseline for categorical

    exposure = data['pyears'].values
    mod = Poisson.from_formula('deaths ~ smokes + C(agecat)', data=data,
                               exposure=exposure)

    constr = 'C(agecat)[T.5] - C(agecat)[T.4] = 0.5'
    lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
    newton_kwds = {'method': 'newton', 'disp': 0}
    cls.res1 = fit_constrained(mod, lc.coefs, lc.constants,
                               fit_kwds=newton_kwds)
    cls.constraints = lc

    # TODO: bfgs fails to converge. overflow somewhere?
    # test method of Poisson, not monkey patched
    cls.res1m = mod.fit_constrained(constr, method='bfgs', disp=0,
                                    start_params=cls.res1[0])
def test_spec_tests(self):
    """Regression test for dispersion and zero-inflation diagnostics."""
    # regression test, numbers similar to Monte Carlo simulation
    expected_dispersion = np.array([
        [0.1396096387543, 0.8889684245877],
        [0.1396096387543, 0.8889684245877],
        [0.2977840351238, 0.7658680002106],
        [0.1307899995877, 0.8959414342111],
        [0.1307899995877, 0.8959414342111],
        [0.1357101381056, 0.8920504328246],
        [0.2776587511235, 0.7812743277372],
    ])
    expected_zi = np.array([
        [0.1389582826821, 0.7093188241734],
        [-0.3727710861669, 0.7093188241734],
        [-0.2496729648642, 0.8028402670888],
        [0.0601651553909, 0.8062350958880],
    ])

    fitted = Poisson(self.endog, self.exog).fit(disp=0)
    diagnostic = PoissonDiagnostic(fitted)

    dispersion = diagnostic.test_dispersion()[0]
    assert_allclose(dispersion, expected_dispersion, rtol=1e-8)

    nobs = self.endog.shape[0]
    zi_tests = [
        diagnostic.test_poisson_zeroinflation(method="broek",
                                              exog_infl=np.ones(nobs)),
        diagnostic.test_poisson_zeroinflation(method="broek"),
        diagnostic.test_poisson_zeroinflation(method="prob"),
        diagnostic.test_chisquare_prob(bin_edges=np.arange(3)),
    ]
    # only the first two entries of every test result are compared
    stacked = np.vstack([outcome[:2] for outcome in zi_tests])
    assert_allclose(stacked, expected_zi, rtol=1e-8)

    # test jansakul and hinde with exog_infl
    with_exog = diagnostic.test_poisson_zeroinflation(method="broek",
                                                      exog_infl=self.exog)
    expected_with_exog = np.array([3.7813218150779, 0.1509719973257])
    assert_allclose(with_exog[:2], expected_with_exog, rtol=1e-8)
def test_poisson_screening():
    """Variable screening recovers the true regressors of a Poisson model.

    The selected indices must equal the true non-zero ones, and the final
    estimates must agree with an oracle fit on the true regressors.
    """
    np.random.seed(987865)
    y, x, idx_nonzero_true, beta = _get_poisson_data()
    nobs = len(y)

    def _labels(indices):
        # first selected column is the constant
        names = ['var%4d' % ii for ii in indices]
        names[0] = 'const'
        return names

    xnames_true = _labels(idx_nonzero_true)
    parameters = pd.DataFrame(beta[idx_nonzero_true], index=xnames_true,
                              columns=['true'])

    # oracle: fit with knowledge of the true regressors
    oracle_exog = pd.DataFrame(x[:, idx_nonzero_true], columns=xnames_true)
    parameters['oracle'] = Poisson(y, oracle_exog).fit().params

    # screening starts from a constant-only, heavily penalized model
    mod_initial = PoissonPenalized(y, np.ones(nobs), pen_weight=nobs * 5)
    screener = VariableScreening(mod_initial)
    res_screen = screener.screen_exog(x[:, 1:], maxiter=10)

    assert_equal(np.sort(res_screen.idx_nonzero), idx_nonzero_true)

    xnames = _labels(res_screen.idx_nonzero)

    # smoke test
    final = res_screen.results_final
    final.summary(xname=xnames)
    res_screen.results_pen.summary()
    assert_equal(final.mle_retvals['converged'], True)

    final_params = pd.Series(final.params, index=xnames, name='final')
    parameters = parameters.join(final_params, how='outer')

    assert_allclose(parameters['oracle'], parameters['final'], atol=5e-6)
def initialize(cls):
    """Fit the interaction Poisson model with HC0 covariance on ``cls.data``."""
    from statsmodels.discrete.discrete_model import Poisson

    formula = "Days ~ C(Duration, Sum)*C(Weight, Sum)"
    model = Poisson.from_formula(formula, cls.data)
    cls.res = model.fit(cov_type='HC0')
class PredictPlayerStats(ConvertMixin):
    """Predict a player statistic with a Poisson regression.

    On construction the player statistics are loaded (and cached in SQL
    tables through ``engine``), the predictors are assembled and a
    ``Poisson`` model is trained.  ``predict_player_stat`` then evaluates
    the model at the latest predictor values of the requested player.
    """

    def __init__(self, engine, player_name, stat_to_predict, opposing_team_name,
                 predictor_stats=('csum_min_kills', 'csum_min_minions_killed'),
                 defense_predictor_stats=('csum_prev_min_allowed_kills',
                                          'csum_prev_min_allowed_assists'),
                 game_range=None):
        """Store the configuration, then load the data and train the model.

        NOTE(review): ``predictor_stats`` is accepted for compatibility but
        has never influenced the predictors -- both branches of the former
        conditional assigned the same fixed tuple.  Confirm whether the
        argument is meant to be honoured before changing this.
        """
        self.engine = engine
        self.player_name = player_name
        self.stat_to_predict = stat_to_predict

        base_stats = ('csum_prev_min_kills', 'csum_prev_min_minions_killed')
        role_stats = ('Jungler', 'Mid', 'Coach', 'Support', 'AD', 'Sub', 'Top')
        # tuple() lets callers pass any sequence of defense predictor names
        self.predictor_stats = (
            base_stats + tuple(defense_predictor_stats) + role_stats)

        self.opposing_team_name = opposing_team_name
        self.player_stats_table_name = 'player_stats_df'
        self.processed_player_stars_table_name = 'processed_player_stats_df'
        self.key_stats = ('kills', 'deaths', 'assists', 'minions_killed',
                          'gold', 'k_a', 'a_over_k')
        self.game_range = game_range
        self._process_player_stats_and_train()

    def _process_player_stats_and_train(self):
        """Load processed stats, remember the latest predictors and train."""
        processed_player_stats_df = self._get_processed_player_stats_in_df()
        self.latest_predictor_numpy_array = \
            self._get_latest_player_stats_numpy_array(processed_player_stats_df)
        print('latest predictors numpy array {}'.format(
            self.latest_predictor_numpy_array))
        predictors, y_array = self._get_predictors_in_numpy_arrays(
            processed_player_stats_df)
        self._train_model(predictors, y_array)

    def _get_latest_player_stats_numpy_array(self, processed_player_stats_df):
        """Return the predictor values of the player's most recent game.

        The result is a 2-d array with a single row, ordered like
        ``self.predictor_stats``.
        """
        player_id = self._get_player_id_by_player_name(self.player_name)
        player_stats_df = processed_player_stats_df[
            processed_player_stats_df['player_id'] == player_id]
        # NOTE(review): ``DataFrame.sort`` only exists in old pandas
        # releases; newer ones need ``sort_values`` -- confirm the pinned
        # pandas version before upgrading.
        latest_player_stats_df = player_stats_df.sort(
            ['game_id'], ascending=False).head(1)
        dict_player = latest_player_stats_df.to_dict('records')[0]
        player_predictor_stats = [dict_player[predictor_stat]
                                  for predictor_stat in self.predictor_stats]
        return numpy.array([player_predictor_stats])

    def _get_predictors_in_numpy_arrays(self, processed_player_stats_df):
        """Build the predictor matrix and the response array for training.

        Records with missing previous kill statistics or with zero
        previous assists are left out.  The response is returned with a
        leading axis of length one.
        """
        player_game_records = self._get_predictors(processed_player_stats_df)
        game_list = []
        y_array_list = []
        for player_game_record in player_game_records:
            if (numpy.isnan(player_game_record['csum_prev_min_kills'])
                    or numpy.isnan(
                        player_game_record['csum_prev_min_allowed_kills'])):
                continue
            if player_game_record['csum_prev_min_assists'] == 0:
                continue
            prev_predictor_stats = self._convert_predictors_to_prev_csum(
                self.predictor_stats)
            game_list.append([player_game_record[prev_predictor_stat]
                              for prev_predictor_stat in prev_predictor_stats])
            y_array_list.append(player_game_record['y_element'])
        predictors = numpy.array(game_list)
        y_array = numpy.array([y_array_list])
        return predictors, y_array

    def _get_predictors(self, processed_player_stats_df):
        """Return the records sorted by game with ``y_element`` filled in."""
        player_game_records = processed_player_stats_df.to_dict('records')
        player_game_records.sort(key=itemgetter('game_id'))
        for player_game_record in player_game_records:
            player_game_record['y_element'] = \
                player_game_record[self.stat_to_predict]
        return player_game_records

    def _train_model(self, predictors, y_array):
        """Fit the Poisson model with BFGS and keep model and result."""
        y_1darray = numpy.squeeze(y_array)
        self.poisson = Poisson(y_1darray, predictors)
        self.pos_result = self.poisson.fit(method='bfgs')

    def _get_game_ids_from_database(self):
        """Return the ids of all games as a list."""
        game_ids_row = Game.objects.values_list('id', flat=True)
        return list(game_ids_row)

    def _get_lastest_processed_team_stats_by_name(self):
        """Return the newest processed team stats row of the opposing team."""
        return ProcessedTeamStatsDf.objects.filter(
            name=self.opposing_team_name).order_by('-id').first()

    def _get_game_by_ids(self, game_ids):
        """Return the games whose id is in ``game_ids``."""
        return Game.objects.filter(id__in=game_ids)

    def _get_player_id_by_player_name(self, player_name):
        """Return the id of the first player with the given name."""
        player = Player.objects.filter(name=player_name)
        return player[0].id

    def _get_processed_player_stats_in_df(self):
        """Bring the cached tables up to date and load the processed stats.

        Missing games are fetched and inserted first; afterwards the
        processed table (or one of its limited variants, depending on
        ``self.game_range``) is read and returned.
        """
        game_ids = self._get_game_ids_from_database()
        last_game_number = game_ids[-1]
        has_processed_team_stats_table = self.engine.has_table(
            self.processed_player_stars_table_name)
        if has_processed_team_stats_table:
            df_game_stats = pandas.read_sql(self.player_stats_table_name,
                                            self.engine)
            df_game_stats_all = df_game_stats[
                df_game_stats.game_id.isin(game_ids)]
            # Using game_numbers here since we need the last few games to check.
            max_game_id_cached = df_game_stats_all['game_id'].max()
            max_index_cached = df_game_stats_all['index'].max()
            if pandas.isnull(max_game_id_cached):
                max_game_id_cached = game_ids[0]
            # Check if all the game numbers have been cached,
            # if not return what game to start from and what game to end from.
            if max_game_id_cached != last_game_number:
                # Get the index of the max_game_id
                max_game_id_index = game_ids.index(max_game_id_cached)
                # Trim down the list to only the games that need to be
                # retrieved.
                # NOTE(review): the stated intent is to start after the
                # newest cached game, but the slice includes it, so that
                # game looks like it is fetched and appended again.  The
                # new row index also starts at the largest cached index.
                # Confirm against the stored data before changing either.
                game_ids_to_find = game_ids[max_game_id_index:]
                games = self._get_game_by_ids(game_ids_to_find)
                player_stats_df = self._get_player_stats_in_df(
                    games, max_index_cached)
                self._insert_into_player_stats_df_tables(player_stats_df)
            else:
                # Everything was cached, nothing has to be fetched.
                print("everything cached no need to retrieve from api")
        else:
            # Table did not exist, have to get all
            start_index = 0
            games = self._get_game_by_ids(game_ids)
            player_stats_df = self._get_player_stats_in_df(games, start_index)
            print('table does not exist inserting full table')
            self._insert_into_player_stats_df_tables(player_stats_df)
            print('table inserted')

        if self.game_range == '5':
            processed_player_stats_df = pandas.read_sql(
                'select * from processed_player_stats_df_limit_5',
                con=self.engine)
        elif self.game_range == '10':
            processed_player_stats_df = pandas.read_sql(
                'select * from processed_player_stats_df_limit_10',
                con=self.engine)
        else:
            processed_player_stats_df = pandas.read_sql_table(
                self.processed_player_stars_table_name, self.engine)
        return processed_player_stats_df

    def _process_player_stats_df(self, player_stats_df):
        """Add derived, role and cumulative columns to the raw player stats.

        Returns the frame sorted by ``game_id``.
        """
        # NOTE(review): ``DataFrame.sort`` and ``.ix`` only exist in old
        # pandas releases -- confirm the pinned version before upgrading.
        player_stats_df = player_stats_df.sort(['game_id', 'player_id'])
        key_stats = ['game_length_minutes'] + (list(self.key_stats))

        # kills with zeros replaced by one, used as a safe denominator
        player_stats_df['clean_kills'] = player_stats_df['kills']
        player_stats_df.ix[player_stats_df.clean_kills == 0, 'clean_kills'] = 1
        player_stats_df['k_a'] = \
            player_stats_df['kills'] + player_stats_df['assists']
        player_stats_df['a_over_k'] = \
            player_stats_df['assists'] / player_stats_df['clean_kills']

        # one indicator column per role; the explicit copy avoids assigning
        # to a slice of the original frame
        player_stats_for_pivot = player_stats_df[['player_name', 'role']].copy()
        player_stats_for_pivot['value'] = 1
        player_pivot_df = player_stats_for_pivot.pivot_table(
            index='player_name', columns='role', values='value')
        player_pivot_df.fillna(0, inplace=True)
        player_pivot_df.reset_index(inplace=True)
        player_stats_df = pandas.merge(player_stats_df, player_pivot_df,
                                       on='player_name')

        for key_stat in key_stats:
            print('doing key stats {}'.format(key_stat))
            csum_name = 'csum_{}'.format(key_stat)
            csum_prev_name = 'csum_prev_{}'.format(key_stat)
            player_stats_df[csum_name] = \
                player_stats_df.groupby(by='player_id')[key_stat].cumsum()
            # cumulative value before the current game
            player_stats_df[csum_prev_name] = \
                player_stats_df[csum_name] - player_stats_df[key_stat]
            player_stats_df['per_min_{}'.format(key_stat)] = \
                player_stats_df[key_stat] / \
                player_stats_df['game_length_minutes']
            if key_stat not in ['game_number', 'game_length_minutes']:
                print('doing stats not game_number {}'.format(key_stat))
                csum_prev_min_name = 'csum_prev_min_{}'.format(key_stat)
                player_stats_df['csum_min_{}'.format(key_stat)] = \
                    player_stats_df[csum_name] / \
                    player_stats_df['csum_game_length_minutes']
                player_stats_df[csum_prev_min_name] = \
                    player_stats_df[csum_prev_name] / \
                    player_stats_df['csum_prev_game_length_minutes']
                # assign the filled column back so the frame itself is
                # updated, not a temporary selection
                player_stats_df[csum_prev_min_name] = \
                    player_stats_df[csum_prev_min_name].fillna(0)
        player_stats_df = player_stats_df.sort(['game_id'])
        return player_stats_df

    def _get_player_stats_in_df(self, games, max_index_cached):
        """Return the player stats of ``games`` as one frame.

        Every game gets ten consecutive index values starting at
        ``max_index_cached``.  ``None`` is returned when ``games`` is empty.
        """
        frames = []
        for game in games:
            players_stats = self._convert_game_to_player_stats_df(game)
            frames.append(pandas.DataFrame(
                players_stats,
                index=list(range(max_index_cached, (max_index_cached + 10)))))
            max_index_cached += 10
        if not frames:
            return None
        # a single concat avoids re-copying the growing frame for every game
        return pandas.concat(frames)

    def _convert_game_to_player_stats_df(self, game):
        """Return one dict per player stats row of ``game``.

        Each dict is extended with the game length, the player name and the
        defense statistics of the opposing team.
        """
        players_stats = game.playerstats_set.all()
        players_stats_dict = game.playerstats_set.all().values()
        player_stats_list = []
        for player_stats, player_stats_dict in zip(players_stats,
                                                   players_stats_dict):
            player_stats_dict['game_length_minutes'] = \
                float(game.game_length_minutes)
            player_stats_dict['gold'] = float(player_stats_dict['gold'])
            player_stats_dict['player_name'] = player_stats.player.name
            self._populate_player_stats_with_defense_stats(
                player_stats_dict, player_stats, game)
            player_stats_list.append(player_stats_dict)
        return player_stats_list

    def _populate_player_stats_with_defense_stats(self, player_stats_dict,
                                                  player_stats, game):
        """Copy the other team's "allowed" statistics into the dict."""
        current_team = player_stats.team
        processed_team_stats_dict = game.processedteamstatsdf_set.exclude(
            team_name=current_team).values()[0]
        for key_stat in self.key_stats:
            prev_name = 'csum_prev_min_allowed_{}'.format(key_stat)
            current_name = 'csum_min_allowed_{}'.format(key_stat)
            player_stats_dict[prev_name] = processed_team_stats_dict[prev_name]
            player_stats_dict[current_name] = \
                processed_team_stats_dict[current_name]

    def _insert_into_player_stats_df_tables(self, player_stats_df):
        """Append the raw stats, then re-process and append the processed ones."""
        player_stats_df.to_sql(self.player_stats_table_name, self.engine,
                               if_exists='append')
        # Could be optimized kinda a hack
        # NOTE(review): the whole raw table is re-read and appended to the
        # processed table on every call -- verify this does not duplicate
        # rows on incremental updates.
        player_stats_df = pandas.read_sql(
            "select ps.*, p.role, p.image from player_stats_df ps, player p "
            "where ps.player_id = p.id", self.engine)
        processed_team_stats_df = self._process_player_stats_df(player_stats_df)
        processed_team_stats_df.to_sql(self.processed_player_stars_table_name,
                                       self.engine, if_exists='append')

    def predict_player_stat(self):
        """Return ``{player_name: prediction}`` from the trained model."""
        probability_in_numpy_array = self.poisson.predict(
            self.pos_result.params, self.latest_predictor_numpy_array)
        return {self.player_name: probability_in_numpy_array}
def _train_model(self, predictors, y_array):
    """Fit a Poisson model and keep both the model and its fit result.

    ``y_array`` is passed through ``numpy.squeeze`` before being handed to
    ``Poisson`` together with ``predictors``. The model is fitted with
    ``method='bfgs'``.

    Side effects
    ------------
    Sets ``self.poisson`` (the model) and ``self.pos_result`` (the fit
    result). Returns ``None``.
    """
    model = Poisson(numpy.squeeze(y_array), predictors)
    self.poisson = model
    self.pos_result = model.fit(method='bfgs')
class ZeroInflatedPoisson(GenericZeroInflated):
    __doc__ = """
    Poisson Zero Inflated model for count data

    %(params)s
    %(extra_params)s

    Attributes
    -----------
    endog : array
        A reference to the endogenous response variable
    exog : array
        A reference to the exogenous design.
    exog_infl: array
        A reference to the zero-inflated exogenous design.
    """ % {'params' : base._model_params_doc,
           'extra_params' : _doc_zi_params + base._missing_param_doc}

    def __init__(self, endog, exog, exog_infl=None, offset=None, exposure=None,
                 inflation='logit', missing='none', **kwargs):
        # The parent constructor receives every argument; the count part of
        # the model is then built as a plain Poisson on the stored
        # endog/exog with the same offset and exposure.
        super(ZeroInflatedPoisson, self).__init__(endog, exog, offset=offset,
                                                  inflation=inflation,
                                                  exog_infl=exog_infl,
                                                  exposure=exposure,
                                                  missing=missing, **kwargs)
        self.model_main = Poisson(self.endog, self.exog, offset=offset,
                                  exposure=exposure)
        self.distribution = zipoisson
        self.result_class = ZeroInflatedPoissonResults
        self.result_class_wrapper = ZeroInflatedPoissonResultsWrapper
        self.result_class_reg = L1ZeroInflatedPoissonResults
        self.result_class_reg_wrapper = L1ZeroInflatedPoissonResultsWrapper

    def _hessian_main(self, params):
        """Second derivatives of the log-likelihood w.r.t. the main parameters.

        Parameters
        ----------
        params : array_like
            Full parameter vector; the first ``k_inflate`` entries belong to
            the inflation model, the remainder to the main (count) model.

        Returns
        -------
        hess_arr : ndarray, shape (k_exog, k_exog)
            Only the lower triangle (``j <= i``) is filled in; entries above
            the diagonal are left at zero.
        """
        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        y = self.endog
        w = self.model_infl.predict(params_infl)
        # Keep the inflation prediction strictly inside (0, 1).
        w = np.clip(w, np.finfo(float).eps, 1 - np.finfo(float).eps)
        mu = self.model_main.predict(params_main)

        zero_idx = np.nonzero(y == 0)[0]
        nonzero_idx = np.nonzero(y)[0]

        # Everything below depends only on the observation, not on the
        # (i, j) pair, so it is computed once instead of inside the loop.
        exog_zero = self.exog[zero_idx]
        exog_nonzero = self.exog[nonzero_idx]
        mu_zero = mu[zero_idx]
        mu_nonzero = mu[nonzero_idx]
        w_zero = w[zero_idx]
        exp_mu_zero = np.exp(mu_zero)

        coeff = 1 + w_zero * (exp_mu_zero - 1)
        w_minus_one = w_zero - 1
        curvature = 1 / coeff - w_zero * mu_zero * exp_mu_zero / coeff**2

        hess_arr = np.zeros((self.k_exog, self.k_exog))

        # d2l/dp2; operand order of each product is kept as in the
        # un-hoisted expression.
        for i in range(self.k_exog):
            for j in range(i, -1, -1):
                zero_part = (exog_zero[:, i] * exog_zero[:, j] * mu_zero *
                             w_minus_one * curvature).sum()
                nonzero_part = (mu_nonzero * exog_nonzero[:, i] *
                                exog_nonzero[:, j]).sum()
                hess_arr[i, j] = zero_part - nonzero_part

        return hess_arr

    def _predict_prob(self, params, exog, exog_infl, exposure, offset):
        """Evaluate the distribution's pmf on counts ``0 .. max(endog)``.

        Parameters
        ----------
        params : array_like
            Full parameter vector (inflation parameters first).
        exog, exog_infl : array_like
            Designs passed to the main and inflation models respectively.
        exposure
            NOTE(review): accepted but not forwarded to the main-model
            prediction below; confirm whether callers fold it into
            ``offset``.
        offset
            Forwarded to the main-model prediction.

        Returns
        -------
        ndarray
            ``self.distribution.pmf`` evaluated for every count; when
            ``exog_infl`` has fewer than two dimensions only the first row
            of that result is returned.
        """
        params_infl = params[:self.k_inflate]
        params_main = params[self.k_inflate:]

        counts = np.atleast_2d(np.arange(0, np.max(self.endog)+1))

        transform = len(exog_infl.shape) < 2
        w = self.model_infl.predict(params_infl, exog_infl)
        if transform:
            w = np.atleast_2d(w)
        w = w[:, None]

        w = np.clip(w, np.finfo(float).eps, 1 - np.finfo(float).eps)
        mu = self.model_main.predict(params_main, exog,
                                     offset=offset)[:, None]
        result = self.distribution.pmf(counts, mu, w)
        return result[0] if transform else result

    def _get_start_params(self):
        """Build starting values: 0.1 per inflation parameter, then the
        main model's own Nelder-Mead fit."""
        start_params = self.model_main.fit(disp=0, method="nm").params
        start_params = np.append(np.ones(self.k_inflate) * 0.1, start_params)
        return start_params
# Demo: print Wald tests for model terms across several estimators fitted on
# the TestAnova3 data set (first three rows dropped).
from statsmodels.formula.api import ols, glm, poisson
from statsmodels.discrete.discrete_model import Poisson
from statsmodels.discrete.discrete_model import NegativeBinomial
import statsmodels.stats.tests.test_anova as ttmod


def _show_wald_terms(header, result, **kwargs):
    # Header goes out first, then the table is computed and printed.
    print(header)
    print(result.wald_test_terms(**kwargs))


test = ttmod.TestAnova3()
test.setupClass()

data = test.data.drop([0,1,2])

res_ols = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)",
              data).fit(use_t=False)
res_glm = glm("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)",
              data).fit()
res_poi = Poisson.from_formula("Days ~ C(Weight) * C(Duration)",
                               data).fit(cov_type='HC0')
res_poi_2 = poisson("Days ~ C(Weight) + C(Duration)",
                    data).fit(cov_type='HC0')

_show_wald_terms('\nOLS', res_ols)
_show_wald_terms('\nGLM', res_glm, skip_single=False,
                 combine_terms=['Duration', 'Weight'])
_show_wald_terms('\nPoisson 1', res_poi, skip_single=False,
                 combine_terms=['Duration', 'Weight'])
_show_wald_terms('\nPoisson 2', res_poi_2, skip_single=False)

# The negative binomial model is fitted only after the reports above.
res_nb2 = NegativeBinomial.from_formula("Days ~ C(Weight) * C(Duration)",
                                        data).fit()
_show_wald_terms('\nNegative Binomial nb2', res_nb2, skip_single=False)
# Simulate Poisson counts from six standard-normal regressors plus a
# constant, then compare PoissonGMLE against the discrete Poisson model and
# a Poisson-family GLM on the same data.
nobs = 1000
rvs = np.random.randn(nobs, 6)
data_exog = sm.add_constant(rvs)
xbeta = 1 + 0.1 * rvs.sum(1)
data_endog = np.random.poisson(np.exp(xbeta))

modp = PoissonGMLE(data_endog, data_exog)
resp = modp.fit()
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(resp.params)
print(resp.bse)

from statsmodels.discrete.discrete_model import Poisson
resdp = Poisson(data_endog, data_exog).fit()
print('\ncompare with discretemod')
print('compare params')
print(resdp.params - resp.params)
print('compare bse')
print(resdp.bse - resp.bse)

gmlp = sm.GLM(data_endog, data_exog, family=sm.families.Poisson())
resgp = gmlp.fit()
# this creates a warning, bug bse is double defined ???
# c:\josef\eclipsegworkspace\statsmodels-josef-experimental-gsoc\scikits\statsmodels\decorators.py:105:
# CacheWriteWarning: The attribute 'bse' cannot be overwritten
#   warnings.warn(errmsg, CacheWriteWarning)
print('\ncompare with GLM')
print('compare params')
print(resgp.params - resp.params)