def poisson_reg(formula, data, cov='normal', group=None):
    """
    Fit a Poisson model (count y variable), print its summary and its
    average partial effects (get_margeff), and return the fitted model.

    Documentation: https://www.statsmodels.org/stable/examples/notebooks/generated/discrete_choice_example.html

    Remember to use mod = poisson_reg(...)!

    :param formula: patsy formula
    :param data: dataframe
    :param cov: str
        normal: common standard errors
        robust: HC1 standard errors
        cluster or clustered: clustered standard errors (must specify group)
    :param group: str, optional
        name of the column of ``data`` that holds the cluster labels; only
        used when ``cov`` is "cluster"/"clustered". When omitted, the column
        name is asked for interactively with ``input`` (the historical
        behaviour).
    :return: the object returned by ``poisson(formula, data).fit(...)``, or
        an error message (str) when the group column is not found in ``data``
    """
    # Creating and fitting the model
    if cov == "robust":
        mod = poisson(formula, data).fit(use_t=True, cov_type='HC1')
    elif cov == "cluster" or cov == "clustered":
        if group is None:
            # Keep the interactive prompt for callers that do not pass `group`.
            group = str(input("What is the group column?"))
        try:
            # Look the column up first so a misspelled name fails before fitting.
            groups = data[group]
            mod = poisson(formula, data).fit(use_t=True, cov_type='cluster',
                                             cov_kwds={'groups': groups})
        except KeyError:
            erro = "It was not possible to find the desired group. Check the spelling and the data and try again!"
            return erro
    else:
        mod = poisson(formula, data).fit(use_t=True)

    ## Calculating under/overdispersion
    # Square root of (sum of squared residuals scaled by the fitted mean) / residual df.
    sigma = np.around((sum(mod.resid ** 2 / mod.predict()) / mod.df_resid) ** (1 / 2), 2)

    ## Capturing marginal effects
    mfx = mod.get_margeff(at='overall')

    clear_output()
    print(mod.summary())
    print(
        f"The coefficient to determine over/underdispersion is σ = {sigma}, "
        + "which must be close to one for standard errors to be valid. "
        + f"If not, they must be multiplied by {sigma}.")
    print("##############################################################################")
    print(mfx.summary())
    # The previous hint ended in `.update({0:1})`, which evaluates to None;
    # the recipe below builds the dict in one expression instead.
    print(
        "\nMarginal effects on certain values can be found using 'mod.get_margeff(atexog = values).summary()', "
        + "where values must be generated using:\nvalues = {0: 1, **dict(zip(range(1, n), values.tolist()))}")
    print(
        "\nUsually, the wanted effect of the poisson coefficients is its semi-elasticity, which is 100*[exp(ß) - 1].")
    print("To predict values using the CDF, use mod.predict(X).\n"
          "X can be blank (use values from the dataset)")
    print("or a K x N Dimensional array, where K = number of variables and N = number of observations.")
    return mod
def test_compare_poisson(self):
    """Check that a Poisson GEE with Independence structure reproduces the
    coefficients of the plain formula-API Poisson fit (10 decimals)."""
    dependence = Independence()
    poisson_family = Poisson()
    n_obs = 100
    formula = "Y ~ X1 + X2 + X3"

    # The draw order (uniform, three normals, randint) is kept unchanged so a
    # seeded run generates exactly the same data as before.
    response = np.ceil(-np.log(np.random.uniform(size=n_obs)))
    regressors = [np.random.normal(size=n_obs) for _ in range(3)]
    groups = np.random.randint(0, 4, size=n_obs)

    frame = pd.DataFrame({
        "Y": response,
        "X1": regressors[0],
        "X2": regressors[1],
        "X3": regressors[2],
    })

    gee_fit = GEE.from_formula(formula, groups, frame,
                               family=poisson_family,
                               cov_struct=dependence).fit()
    mle_fit = sm.poisson(formula, data=frame).fit(disp=False)

    assert_almost_equal(gee_fit.params.values, mle_fit.params.values,
                        decimal=10)
def GoMiningNumBabes(df):
    """Fit one single-predictor Poisson model of `numbabes` per column.

    Columns whose variance is below 1e-7, whose model keeps fewer than half
    of the rows of `df`, or whose model raises ValueError/TypeError are
    skipped.

    df: DataFrame of pregnancy records

    return: list of (pseudo R-squared, variable name) pairs
    """
    def _fit_single(column):
        """Return the fitted results for `column`, or None when it is skipped."""
        try:
            if df[column].var() < 1e-7:
                return None
            candidate = smf.poisson('numbabes ~ ' + column, data=df)
            if len(candidate.endog) < len(df) / 2:
                return None
            return candidate.fit()
        except (ValueError, TypeError):
            return None

    scored = []
    for column in df.columns:
        fitted = _fit_single(column)
        if fitted is None:
            continue
        # prsquared is read outside the guarded section, as before.
        scored.append((fitted.prsquared, column))
    return scored
def VariableMiningPoisson(df, y):
    """Searches variables using Poisson regression to find ones that predict
    the target dependent variable 'y'.

    One model ``y ~ <column>`` is fitted per column of ``df``. A column is
    skipped when its variance is below 1e-7, when the model keeps fewer than
    half of the rows of ``df``, or when building/fitting the model raises an
    ordinary exception.

    Args:
        df (DataFrame): DataFrame that holds all the variables.
        y (string): Column name of dependent variable y.

    Returns:
        variables (list): A list of tuples each containing the pseudo
            r-squared value (``results.prsquared``) and variable name
    """
    variables = []
    for name in df.columns:
        try:
            if df[name].var() < 1e-7:
                continue

            formula = '{} ~ {}'.format(y, name)
            model = smf.poisson(formula, data=df)
            if len(model.endog) < len(df) / 2:
                continue

            results = model.fit()
        except Exception:
            # Best-effort scan: a column that cannot be modelled is skipped.
            # `Exception` (instead of a bare except) lets KeyboardInterrupt
            # and SystemExit propagate so a long scan can still be aborted.
            continue

        variables.append((results.prsquared, name))

    return variables
def analyze_stats(df_og, list_of_artificials, formula, covariates):
    """Fit the same Poisson formula on the real frame and on every artificial
    frame, and stack the coefficient vectors into one DataFrame.

    Each row carries an extra 'real' entry: "Yes" for `df_og`, "No" for the
    frames in `list_of_artificials`. `covariates` is accepted but not used.
    """
    def _tagged_params(frame, tag):
        """Fit `formula` on `frame` and label the coefficients with `tag`."""
        coefficients = sm.poisson(formula=formula, data=frame).fit(disp=0).params
        coefficients['real'] = tag
        return coefficients

    rows = [_tagged_params(df_og, "Yes")]
    for artificial in list_of_artificials:
        rows.append(_tagged_params(artificial, "No"))
    return pd.DataFrame(rows)
def analyze_stats_for_single(df, formula, covariates):
    """Fit a Poisson model of `formula` on `df` and return its coefficients
    as a one-row DataFrame. `covariates` is accepted but not used.
    """
    fitted = sm.poisson(formula=formula, data=df).fit(disp=0)
    # A single-element list yields a one-row frame, one column per coefficient.
    return pd.DataFrame([fitted.params])
def fitmodel(ldf, ycol, xcols, modeltype, interactionpairs):
    """Build a formula from column names, fit the requested model, return the fit.

    The formula is ``ycol ~ x1 + x2 + ...`` followed by one ``a:b`` term per
    entry of `interactionpairs`.

    ldf: data passed to the model constructor
    ycol: name of the response column
    xcols: iterable of predictor column names
    modeltype: one of 'logistic', 'linear', 'poisson', 'probit'
    interactionpairs: iterable of pairs; items [0] and [1] form an ``a:b`` term

    return: result of ``model.fit(maxiter=10000)``
    raises: ValueError when `modeltype` is not one of the supported names
        (previously this surfaced as an unbound-local error on ``model``)
    """
    # Model type -> attribute name on the formula API module.
    constructors = {
        'logistic': 'logit',
        'linear': 'ols',
        'poisson': 'poisson',
        'probit': 'probit',
    }
    if modeltype not in constructors:
        raise ValueError(
            "Unknown modeltype {!r}; expected one of: {}".format(
                modeltype, ', '.join(sorted(constructors))))

    formula = "{} ~ {}".format(ycol, ' + '.join(xcols))
    for intpair in interactionpairs:
        formula += ' + ' + intpair[0] + ':' + intpair[1]

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Running {} regression model:".format(modeltype))
    print(formula)
    print("***************************************************************************************")
    print("***************************************************************************************")

    model = getattr(smfa, constructors[modeltype])(formula, ldf)
    result = model.fit(maxiter=10000)
    return result
def test_compare_poisson(self):
    """Compare Poisson GEE (Independence structure) coefficients with the
    formula-API Poisson fit on the same simulated data (10 decimals)."""
    dependence = Independence()
    poisson_family = Poisson()
    size = 100
    formula = "Y ~ X1 + X2 + X3"

    # Random draws stay in the original order: uniform, three normals, randint.
    counts = np.ceil(-np.log(np.random.uniform(size=size)))
    x1 = np.random.normal(size=size)
    x2 = np.random.normal(size=size)
    x3 = np.random.normal(size=size)
    groups = np.random.randint(0, 4, size=size)

    frame = pd.DataFrame({"Y": counts, "X1": x1, "X2": x2, "X3": x3})

    gee_model = GEE.from_formula(formula, frame, None, groups=groups,
                                 family=poisson_family, covstruct=dependence)
    gee_fit = gee_model.fit()
    poisson_fit = sm.poisson(formula, data=frame).fit()

    assert_almost_equal(poisson_fit.params.values, gee_fit.params, decimal=10)
def test_compare_poisson(self):
    """Poisson GEE with Independence structure and the formula-API Poisson
    fit must agree on the coefficients to 10 decimals."""
    dependence = Independence()
    poisson_family = Poisson()
    size = 100

    # Same sequence of random draws as before: uniform, normal x3, randint.
    columns = {"Y": np.ceil(-np.log(np.random.uniform(size=size)))}
    for label in ("X1", "X2", "X3"):
        columns[label] = np.random.normal(size=size)
    groups = np.random.randint(0, 4, size=size)
    frame = pd.DataFrame(columns)

    formula = "Y ~ X1 + X2 + X3"
    gee_fit = GEE.from_formula(formula, groups, frame,
                               family=poisson_family,
                               cov_struct=dependence).fit()
    mle_fit = sm.poisson(formula, data=frame).fit(disp=False)

    assert_almost_equal(gee_fit.params, mle_fit.params, decimal=10)
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd

# Script: fit a Poisson regression of Claims on log(Holders) using the R
# "Insurance" dataset and print the sum of the model residuals.

# Load the R dataset Insurance from the MASS package (get_rdataset returns a
# wrapper; `.data` is the table itself).
insurance_set = sm.datasets.get_rdataset('Insurance','MASS').data
# Capture the data as a pandas dataframe.
df = pd.DataFrame(insurance_set)
# Build a Poisson regression model: dependent variable Claims, single
# regressor log(Holders) (the log is applied inside the formula).
poisson_model = smf.poisson('Claims ~ np.log(Holders)', df)
# Fit the model with data, and print the sum of the residuals.
poisson_model_result = poisson_model.fit()
print(np.sum(poisson_model_result.resid))
from sklearn import linear_model

# Script fragment: logistic regression with scikit-learn, then count models
# of NumChildren with statsmodels (discrete Poisson, GLM Poisson, GLM
# negative binomial). `predictors` and `acs` are defined outside this view.
lr = linear_model.LogisticRegression(max_iter=1000)
# Call fit to estimate the model, the same way a linear model is fitted
results = lr.fit(X=predictors, y=acs['ge150k_i'])
# Print the coefficients
print(results.coef_)
print(results.intercept_)
values = np.append(results.intercept_, results.coef_)
# Get the names that go with the values
names = np.append('intercept', predictors.columns)
# Put everything into one labelled DataFrame
results = pd.DataFrame(values, index=names, columns=['coef'])
print(results)
# Exponentiated coefficients, stored in the 'or' column
results['or'] = np.exp(results['coef'])
print(results)

# NOTE(review): smf is used here before the import below — presumably it is
# already imported earlier in the file; confirm.
results = smf.poisson('NumChildren ~ FamilyIncome + FamilyType + OwnRent', data=acs).fit()
print(results.summary())

import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Same formula fitted as a GLM with a Poisson family and log link
model = smf.glm('NumChildren ~ FamilyIncome + FamilyType + OwnRent', data=acs,
                family=sm.families.Poisson(sm.genmod.families.links.log))
results = model.fit()
print(results.summary())
# Same formula with a negative binomial family and log link; the model is
# only constructed here (no fit call in this fragment)
model = smf.glm('NumChildren ~ FamilyIncome + FamilyType + OwnRent', data=acs,
                family=sm.families.NegativeBinomial(
                    sm.genmod.families.links.log))
# Script fragment: for every (entrance site, vehicle class) pair, fit an
# intercept-only Poisson model to the weekly mean vehicle counts of 2019
# E-ZPass records and collect the estimated rate.
# `entrance_site`, `vehicle_class` and `file_copy` are defined outside this view.
entrance, vclass, lambda_parameter, intercept, alpha, chisqstat, pvalue \
    = [], [], [], [], [], [], []
for i in range(len(entrance_site)):
    for j in range(len(vehicle_class)):
        # Rows for this site/class, restricted to year 2019 and E-ZPass payments
        file = file_copy[(file_copy["entrance_site"] \
            == entrance_site[i]) & (file_copy["vehicle_class"] \
            == vehicle_class[j]) & (file_copy["year"] == 2019) \
            & (file_copy["payment_type"] == "E-ZPass")]
        # Sorted distinct weeks present in the selection
        x = file["week_of_year"].unique()
        x.sort()
        # Mean of the remaining columns per week of year
        dataframe = file.drop(["exit_site","payment_type","profit","year"],\
            axis = 1).groupby("week_of_year").mean()
        y = dataframe["vehicle_count"].values
        # Intercept-only model: exp(intercept) is the fitted Poisson rate
        model_p = smf.poisson("vehicle_count ~ 1", data=dataframe)
        result_p = model_p.fit()
        lambda_ = float(np.exp(result_p.params))
        entrance.append(entrance_site[i])
        vclass.append(vehicle_class[j])
        lambda_parameter.append(lambda_)
        # Widen the histogram range outwards to multiples of 10
        minimum = int(y.min())
        maximum = int(y.max())
        while minimum % 10 != 0:
            minimum -= 1
        while maximum % 10 != 0:
            maximum += 1
        # Frozen Poisson distribution with the fitted rate
        X = stats.poisson(lambda_)
        # Density-normalised histogram of the observed weekly means
        v, k = np.histogram(y, range=(minimum, maximum), density=True)
# b) all_results = defaultdict(dict) all_regressions = [ "claim_rate ~ C(age) + C(car) + C(dist) + C(age):C(car) + C(age):C(dist) + C(car):C(dist) ", "claim_rate ~ C(age) + C(car) + C(age):C(car)", "claim_rate ~ C(age) + C(car) + C(age):C(car)", "claim_rate ~ C(age) + C(dist) + C(age):C(dist)", "claim_rate ~ C(car) + C(dist) + C(car):C(dist)", "claim_rate ~ C(car)", "claim_rate ~ C(dist)", "claim_rate ~ C(age)", ] for reg in all_regressions: regression = stt.poisson(reg,data=data) results = regression.fit() helper.wrap_regression_results(all_results,results) results = pd.DataFrame(all_results).transpose() combinaisons = it.permutations(range(7),2) for pair in combinaisons: a,b = pair if results.iloc[a].degree_of_freedom > results.iloc[b].degree_of_freedom: Deviance = -2*(results.iloc[a].log - results.iloc[b].log) Degree_of_freedom = results.iloc[a].degree_of_freedom - results.iloc[b].degree_of_freedom if Deviance < dist.chi2.ppf(0.95,Degree_of_freedom): print(results.iloc[a].name, 'est rejete pour', results.iloc[b].name) # Best regression : claim_rate ~ C(dist)
from statsmodels.formula.api import ols, glm, poisson
from statsmodels.discrete.discrete_model import Poisson
import statsmodels.stats.tests.test_anova as ttmod

# Script: run wald_test_terms on OLS, GLM, Poisson and negative binomial fits
# built from the data of the statsmodels TestAnova3 test case.
test = ttmod.TestAnova3()
# NOTE(review): the class-setup hook name depends on the statsmodels version
# (setupClass vs setup_class) — confirm against the installed release.
test.setupClass()
# Drop the first three rows (index labels 0, 1, 2)
data = test.data.drop([0,1,2])

res_ols = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)", data).fit(use_t=False)
res_glm = glm("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)", data).fit()
# Two Poisson fits with HC0 covariance: with interaction (class API) and
# additive only (formula shortcut)
res_poi = Poisson.from_formula("Days ~ C(Weight) * C(Duration)", data).fit(cov_type='HC0')
res_poi_2 = poisson("Days ~ C(Weight) + C(Duration)", data).fit(cov_type='HC0')

print('\nOLS')
print(res_ols.wald_test_terms())
print('\nGLM')
print(res_glm.wald_test_terms(skip_single=False, combine_terms=['Duration', 'Weight']))
print('\nPoisson 1')
print(res_poi.wald_test_terms(skip_single=False, combine_terms=['Duration', 'Weight']))
print('\nPoisson 2')
print(res_poi_2.wald_test_terms(skip_single=False))

from statsmodels.discrete.discrete_model import NegativeBinomial
res_nb2 = NegativeBinomial.from_formula("Days ~ C(Weight) * C(Duration)", data).fit()
print('\nNegative Binomial nb2')
print(res_nb2.wald_test_terms(skip_single=False))
import pandas as pd import matplotlib.pylab as plt import statsmodels.formula.api as smf import numpy as np import scipy.optimize # 今回使用するデータ data = pd.read_csv('./psn_data.csv') ## 答え ################################################## results = smf.poisson('y ~ x', data=data).fit() print(results.summary()) # パラメータの推定値を取得 a, b = results.params ## 自作 #################################################### params = [np.random.rand(), np.random.rand()] def likelihood(params, y_vector, x_vector): ret = 0 # ポアソン分布のパラメータ theta_reg = lambda params, x: np.exp(params[0] + params[1] * x) for i in range(y_vector.shape[0]): ret += y_vector[i] * np.log(theta_reg( params, x_vector[i])) - theta_reg(params, x_vector[i]) return -ret new_params = scipy.optimize.minimize(likelihood,
# Script fragment: accuracy of a 0/1 prediction, then a Poisson model of
# numbabes and a multinomial logit of rmarital on the `join` DataFrame.
# `predict`, `actual`, `true_pos`, `baseline`, `join` and MiningReport are
# defined outside this view.
true_neg = (1 - predict) * (1 - actual)
# Bare expression: only displays a value in an interactive session
sum(true_pos), sum(true_neg)

## calculate accuracy vs baseline
print('baseline:\n', baseline)
acc = (sum(true_pos) + sum(true_neg)) / len(actual)
print('acc:\n', acc)

## mine the join DataFrame
variables = GoMiningNumBabes(join)

## read the variables
relevant = MiningReport(variables, n=60)

## make model with relevant predictive variables
model = smf.poisson('numbabes ~ ager + educat + C(race) + totincr', data=join)
results = model.fit()
# Bare expression: the summary is built but not printed
results.summary()

## predict numbabes
# One-row frame with the predictor values to evaluate the model at
columns = ['ager', 'race', 'educat', 'totincr']
new = pd.DataFrame([[35, 1, 16, 14]], columns=columns)
predict_babes = results.predict(new)
print('predict_babes:\n', predict_babes)

## predict married/divorced
# `model` and `results` are rebound to the multinomial logit from here on
model = smf.mnlogit('rmarital ~ ager + C(race) + totincr + educat', data=join)
results = model.fit()
results.summary()
import pandas as pd
import statsmodels.formula.api as smf

# A Poisson model describes a process where the dependent variable is the
# success count of many attempts, each with a very low probability of success.
# This script fits a Poisson regression to a data set from the UCLA repository.
# Per the original author's notes, the data set holds the number of awards
# earned, the type of program enrolled in, and the final math exam score of
# students at a high school.

# The data set is fetched over the network as a pandas data frame.
awards_df = pd.read_csv(
    "https://stats.idre.ucla.edu/stat/data/poisson_sim.csv")
print(awards_df.head(3))

# Create a Poisson model with the patsy formula num_awards ~ math + C(prog);
# C(prog) treats prog as categorical.
poisson_model = smf.poisson('num_awards ~ math + C(prog)', awards_df)

# Fitting the model
poisson_model_result = poisson_model.fit()

# Viewing and analyzing the model summary.
# The interpretation below is the original author's reading of the output
# (values not verifiable from this script alone):
# - math coefficient 0.07: one unit increase in math raises the log count by 0.07.
# - prog=2 ("Academic") instead of "Generic": changes the log count by 1.08.
# - prog=3 ("Vocational") instead of "Generic": changes the log count by 0.37.
print(poisson_model_result.summary())
Created on: 2013/10/19 03:31:02 Purpose: To show Copyright: BSD / Apache ---------------------------------------------------------------------------- """ from pandas import read_csv from statsmodels.formula.api import poisson data_0 = read_csv('nosipsson.csv', header=None) data_0 = data_0.drop(range(8, 12), 1) data_0.columns = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'] fit_0 = poisson('H ~ E + F + G', data_0).fit() """ ---------------------------------------------------------------------------- End note: (end note starts here) ============================================================================ """
# Notebook-style fragment: estimate a Poisson regression of `ghost` on `y`
# by minimising a hand-written negative log-likelihood, then with statsmodels.
# NOTE(review): neg_lklh and theta_init are used here before the definitions
# below — presumably a leftover of out-of-order notebook cells; confirm.
opt_results = minimize(neg_lklh, theta_init, args=fbock)
opt_results
np.exp(opt_results.x)

# Poisson regression
# rate_i = exp(a * 1 + b * y_i)
# ghost_i | rate_i ~ Poisson(rate_i)
# theta = (a, b)
def neg_lklh(theta, fbock):
    # Negative Poisson log-likelihood of theta = (a, b) given the columns
    # fbock['y'] and fbock['ghost']; the term that does not depend on theta
    # (log of ghost!) is left out of ln_prob.
    a, b = theta
    rate = np.exp(a + b * fbock['y'])
    ln_prob = -rate + fbock['ghost'] * np.log(rate)
    lklh = np.sum(ln_prob) # summed because the days are independent
    return -lklh

theta_init = [0, 0]
opt_results = minimize(neg_lklh, theta_init, args=fbock)
opt_results

# many regression models live in statsmodels
import statsmodels.formula.api as smf
model = smf.poisson('ghost ~ 1 + y', data=fbock).fit()
model.summary()
# Script fragment (Python 2 print statements): plot a fitted line, then fit a
# Poisson model and an OLS model from a dict of columns. The input sequences
# (team_ID, log_teamID, year_ID, Salary, Ig_ID, slope, ...) are defined
# outside this view.
print "line fit for age and log(teamID)"
print "slope: ", slope_log, "intercept: ", intercept_log
# Fitted line evaluated at every team_ID value
pointsLineLog = [intercept_log + slope_log * x for x in team_ID]
plt.figure()
plt.plot(team_ID, log_teamID, "ro")
plt.plot(team_ID, pointsLineLog, '--')
plt.show()

# NOTE(review): the formula names `yearID` but the dict key below is
# "yearId" (different case) — confirm which spelling is intended.
formula = 'yearID ~ teamID + salary + IgID'
data_dict = {
    "yearId": year_ID,
    "teamID": team_ID,
    "salary": Salary,
    "IgID": Ig_ID
}
model = smf.poisson(formula, data=data_dict)
# `model` is rebound from the model object to its fitted results
model = model.fit()
# NOTE(review): the message says "predicted salary" while the formula's
# response is yearID, and the dict passed to predict has no teamID — confirm.
print "predicted salary for 870000 NL with year of born 1985: ", model.predict(
    {
        "salary": 8700,
        "IgID": 'NL',
        "yearID": 1985
    })
print "slope for yearID and salary is: ", slope

print "\n\nMultiple regression model:"
# NOTE(review): salary appears on both sides of this formula, and `YearId`
# matches neither "yearId" nor `yearID` used above — confirm the intended terms.
modelMR = smf.ols('salary ~ salary + YearId + IgID', data=data_dict)
modelMR = modelMR.fit()
print modelMR.summary()
# b) all_results = defaultdict(dict) all_regressions = [ "claim_rate ~ C(age) + C(car) + C(dist) + C(age):C(car) + C(age):C(dist) + C(car):C(dist) ", "claim_rate ~ C(age) + C(car) + C(age):C(car)", "claim_rate ~ C(age) + C(car) + C(age):C(car)", "claim_rate ~ C(age) + C(dist) + C(age):C(dist)", "claim_rate ~ C(car) + C(dist) + C(car):C(dist)", "claim_rate ~ C(car)", "claim_rate ~ C(dist)", "claim_rate ~ C(age)", ] for reg in all_regressions: regression = stt.poisson(reg, data=data) results = regression.fit() helper.wrap_regression_results(all_results, results) results = pd.DataFrame(all_results).transpose() combinaisons = it.permutations(range(7), 2) for pair in combinaisons: a, b = pair if results.iloc[a].degree_of_freedom > results.iloc[b].degree_of_freedom: Deviance = -2 * (results.iloc[a].log - results.iloc[b].log) Degree_of_freedom = results.iloc[a].degree_of_freedom - results.iloc[ b].degree_of_freedom if Deviance < dist.chi2.ppf(0.95, Degree_of_freedom): print(results.iloc[a].name, 'est rejete pour', results.iloc[b].name)
# Script fragment: model Parasitized by Treatment with Poisson and negative
# binomial models, through both the GLM and the discrete-model interfaces.
# `data`, `plt`, `smf` and `stats` are defined outside this view.
plt.savefig("histogram poisson dist.pdf")

from patsy import dmatrices
import statsmodels.api as sm

# GLM interface: build the design matrices explicitly from the formula
formula = """Parasitized ~ C(Treatment)"""
response, predictors = dmatrices(formula, data, return_type='dataframe')
po_results = sm.GLM(response, predictors, family=sm.families.Poisson()).fit()
print(po_results.summary())

# Same design, negative binomial family; po_results is rebound
formula = """Parasitized ~ C(Treatment)"""
response, predictors = dmatrices(formula, data, return_type='dataframe')
po_results = sm.GLM(response, predictors, family=sm.families.NegativeBinomial()).fit()
print(po_results.summary())

# Discrete-model interface fitted directly from the formula
modpoiss = smf.poisson(formula, data).fit()
print(modpoiss.summary())
modNB = smf.negativebinomial(formula, data).fit()
print(modNB.summary())

# Probability plots of the residuals against reference distributions with
# hard-coded shape parameters
stats.probplot(modpoiss.resid, dist='poisson', sparams=(2.4, ), plot=plt)
plt.show()
stats.probplot(modNB.resid, dist='nbinom', sparams=(2.15, 0.4), plot=plt)
plt.show()

# Marginal effects of the negative binomial model (at='mean' passed positionally)
print(modNB.get_margeff('mean').summary())
'qemp86 + inc86 + black + hispan + born60', data=crime1) results_lin = reg_lin.fit() # print regression table: table_lin = pd.DataFrame({ 'b': round(results_lin.params, 4), 'se': round(results_lin.bse, 4), 't': round(results_lin.tvalues, 4), 'pval': round(results_lin.pvalues, 4) }) print(f'table_lin: \n{table_lin}\n') # estimate Poisson model: reg_poisson = smf.poisson(formula='narr86 ~ pcnv + avgsen + tottime +' 'ptime86 + qemp86 + inc86 + black +' 'hispan + born60', data=crime1) results_poisson = reg_poisson.fit(disp=0) # print regression table: table_poisson = pd.DataFrame({ 'b': round(results_poisson.params, 4), 'se': round(results_poisson.bse, 4), 't': round(results_poisson.tvalues, 4), 'pval': round(results_poisson.pvalues, 4) }) print(f'table_poisson: \n{table_poisson}\n') # estimate Quasi-Poisson model: reg_qpoisson = smf.glm(formula='narr86 ~ pcnv + avgsen + tottime + ptime86 +' 'qemp86 + inc86 + black + hispan + born60',
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Script: fit a Poisson regression of Claims on log(Holders) using the R
# "Insurance" dataset (MASS package) and print the residuals.

# Earlier experiment kept for reference (UCLA awards data), disabled:
# awards_df = pd.read_csv("https://stats.idre.ucla.edu/stat/data/poisson_sim.csv")
# poission_model = smf.poisson("num_awards ~ math + C(prog)",data=awards_df)
# model_result = poission_model.fit()
# print(model_result.summary())

# get_rdataset returns a wrapper; `.data` is the table itself
insurance_data = sm.datasets.get_rdataset("Insurance", package="MASS").data
# The log of Holders is applied inside the formula
poission_model = smf.poisson("Claims ~ np.log(Holders)", data=insurance_data)
model_result = poission_model.fit()
print(model_result.resid)
# reading data data_crime = pd.read_excel(r'C:\Users\digiovanniyani\Desktop\NY_CRIMES.xlsx') #, names = ['MH_MURD', 'MH_RAPE', 'MH_ROBB', 'MH_ASSA', 'MH_BURG', 'MH_LARC'])#, delim_whitespace=True, header=0) total_nyc_crime=data_crime["MH_TOT"]+ data_crime["BK_TOT"]+ data_crime["QN_TOT"] + data_crime["BX_TOT"] + data_crime["SI_TOT"] sumcols_brooklyn= data_crime["BK_LARC"]+ data_crime["BK_ROBB"]+ data_crime["BK_MOTO"] + data_crime["BK_BURG"] + data_crime["BK_ASSA"] sumcols_queens= data_crime["QN_LARC"]+ data_crime["QN_ROBB"]+ data_crime["QN_MOTO"] + data_crime["QN_BURG"] + data_crime["QN_ASSA"] sumcols_manhattan= data_crime["MH_LARC"]+ data_crime["MH_ROBB"]+ data_crime["MH_MOTO"] + data_crime["MH_BURG"] #+ data_crime["MH_ASSA"] sumcols_staten= data_crime["SI_LARC"]+ data_crime["SI_ROBB"]+ data_crime["SI_MOTO"] + data_crime["SI_BURG"] + data_crime["SI_ASSA"] import random dates=(list(range(2005,2020))) str_dates = [str(i) for i in dates] #+ BK_HSGR + BK_UNEM m1 = poisson('QN_TOT ~ QN_HSGR + QN_INC + QN_UNEM ', data = data_crime).fit() #maxiter=1000, method='nm') print (m1.summary()) Y=data_crime['QN_TOT'] print(Y) preds = m1.predict() plt.plot(range(len(Y)), Y, 'r*-', range(len(Y)), preds, 'bo-') plt.title('NYC total crimes VS queens grad rate, unemployment,income') plt.xticks(np.arange(0,15 , step=1)) # Set label locations. plt.xticks(np.arange(0,15), [i for i in str_dates], rotation=30) plt.show() """ data_row=data.iloc[2, 1:5]
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series
from pandas import read_csv
from pandas import DataFrame

# Script: load an hourly workload series, draw it as a bar chart and fit an
# intercept-only Poisson model to the `count` column.

# The `squeeze=True` keyword of read_csv was removed in pandas 2.0; calling
# .squeeze("columns") on the result gives the same Series for a one-column file.
series = read_csv('wc98_workload_hour.csv', header=0, parse_dates=[0],
                  index_col=0).squeeze("columns")
df = DataFrame(series)

fig, ax = plt.subplots(1, 1)
df.plot(kind='bar', ax=ax)

# Intercept-only model: a single constant is estimated for `count`
model = smf.poisson("count ~ 1", data=df)
result = model.fit()
print(result.summary())
from statsmodels.discrete.discrete_model import Poisson
import statsmodels.stats.tests.test_anova as ttmod

# Script fragment: run wald_test_terms on OLS, GLM and Poisson fits built
# from the data of the statsmodels TestAnova3 test case.
# `ols`, `glm` and `poisson` are imported outside this view.
test = ttmod.TestAnova3()
test.setup_class()
# Drop the first three rows (index labels 0, 1, 2)
data = test.data.drop([0, 1, 2])

res_ols = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)",
              data).fit(use_t=False)
res_glm = glm("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)", data).fit()
# Two Poisson fits with HC0 covariance: with interaction (class API) and
# additive only (formula shortcut)
res_poi = Poisson.from_formula("Days ~ C(Weight) * C(Duration)",
                               data).fit(cov_type='HC0')
res_poi_2 = poisson("Days ~ C(Weight) + C(Duration)",
                    data).fit(cov_type='HC0')

print('\nOLS')
print(res_ols.wald_test_terms())
print('\nGLM')
print(
    res_glm.wald_test_terms(skip_single=False,
                            combine_terms=['Duration', 'Weight']))
print('\nPoisson 1')
print(
    res_poi.wald_test_terms(skip_single=False,
                            combine_terms=['Duration', 'Weight']))
print('\nPoisson 2')
print(res_poi_2.wald_test_terms(skip_single=False))

from statsmodels.discrete.discrete_model import NegativeBinomial
df = dataset.data.set_index("time") # In[126]: df.head(10).T # In[127]: fig, ax = plt.subplots(1, 1, figsize=(16, 4)) df.plot(kind='bar', ax=ax) fig.tight_layout() fig.savefig("ch14-discoveries.pdf") # In[128]: model = smf.poisson("discoveries ~ 1", data=df) # In[129]: result = model.fit() # In[130]: print(result.summary()) # In[131]: lmbda = np.exp(result.params) # In[132]:
import patsy import matplotlib.pyplot as plt import numpy as np import pandas as pd from scipy import stats import ssl ssl._create_default_https_context = ssl._create_unverified_context dataset = sm.datasets.get_rdataset("discoveries") df = dataset.data.set_index('time').rename(columns={'values': 'discoveries'}) print(df.head(10).T) fig, ax = plt.subplots(1, 1, figsize=(16, 4)) df.plot(kind='bar', ax=ax) plt.show() model = smf.poisson("df.values ~ 1", data=df) result = model.fit() print(result.summary()) lmbda = np.exp(result.params) X = stats.poisson(lmbda) print(result.conf_int) X_ci_l = stats.poisson(np.exp(result.conf_int().values)[0, 0]) X_ci_u = stats.poisson(np.exp(result.conf_int().values)[0, 1]) v, k = np.histogram(df.values, bins=12, range=(0, 12), normed=True) fig, ax = plt.subplots(1, 1, figsize=(12, 4)) ax.bar(k[:-1], v,