def test_summary_col(): from statsmodels.iolib.summary2 import summary_col ids = [1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3] x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] # hard coded simulated y # ids = np.asarray(ids) # np.random.seed(123987) # y = x + np.array([-1, 0, 1])[ids - 1] + 2 * np.random.randn(len(y)) y = np.array([ 1.727, -1.037, 2.904, 3.569, 4.629, 5.736, 6.747, 7.020, 5.624, 10.155, 10.400, 17.164, 17.276, 14.988, 14.453 ]) d = {'Y': y, 'X': x, 'IDS': ids} d = pd.DataFrame(d) # provide start_params to speed up convergence sp1 = np.array([-1.26722599, 1.1617587, 0.19547518]) mod1 = MixedLM.from_formula('Y ~ X', d, groups=d['IDS']) results1 = mod1.fit(start_params=sp1) sp2 = np.array([3.48416861, 0.55287862, 1.38537901]) mod2 = MixedLM.from_formula('X ~ Y', d, groups=d['IDS']) results2 = mod2.fit(start_params=sp2) out = summary_col([results1, results2], stars=True) s = ('\n=============================\n Y X \n' '-----------------------------\nGroup Var 0.1955 1.3854 \n' ' (0.6032) (2.7377) \nIntercept -1.2672 3.4842* \n' ' (1.6546) (1.8882) \nX 1.1618*** \n' ' (0.1959) \nY 0.5529***\n' ' (0.2080) \n=============================\n' 'Standard errors in\nparentheses.\n* p<.1, ** p<.05, ***p<.01') assert_equal(str(out), s)
def test_summarycol_drop_omitted(self): # gh-3702 x = [1, 5, 7, 3, 5] x = add_constant(x) x2 = np.concatenate([x, np.array([[3], [9], [-1], [4], [0]])], 1) y1 = [6, 4, 2, 7, 4] y2 = [8, 5, 0, 12, 4] reg1 = OLS(y1, x).fit() reg2 = OLS(y2, x2).fit() actual = summary_col([reg1, reg2], regressor_order=['const', 'x1'], drop_omitted=True) assert 'x2' not in str(actual) actual = summary_col([reg1, reg2], regressor_order=['x1'], drop_omitted=False) assert 'const' in str(actual) assert 'x2' in str(actual)
def test_summarycol(self): # Test for latex output of summary_col object desired = r''' \begin{table} \caption{} \begin{center} \begin{tabular}{lcc} \hline & y I & y II \\ \midrule \midrule const & 7.7500 & 12.4231 \\ & (1.1058) & (3.1872) \\ x1 & -0.7500 & -1.5769 \\ & (0.2368) & (0.6826) \\ \hline \end{tabular} \end{center} \end{table} ''' x = [1,5,7,3,5] x = add_constant(x) y1 = [6,4,2,7,4] y2 = [8,5,0,12,4] reg1 = OLS(y1,x).fit() reg2 = OLS(y2,x).fit() actual = summary_col([reg1,reg2]).as_latex() actual = '\n%s\n' % actual assert_equal(desired, actual)
def test_summary_col_ordering_preserved(self): # gh-3767 x = [1, 5, 7, 3, 5] x = add_constant(x) x2 = np.concatenate([x, np.array([[3], [9], [-1], [4], [0]])], 1) y1 = [6, 4, 2, 7, 4] y2 = [8, 5, 0, 12, 4] reg1 = OLS(y1, x2).fit() reg2 = OLS(y2, x2).fit() info_dict = {'R2': lambda x: '{:.3f}'.format(int(x.rsquared)), 'N': lambda x: '{0:d}'.format(int(x.nobs))} original = actual = summary_col([reg1, reg2], float_format='%0.4f') actual = summary_col([reg1, reg2], regressor_order=['x2', 'x1'], float_format='%0.4f', info_dict=info_dict) variables = ('const', 'x1', 'x2') for line in str(original).split('\n'): for variable in variables: if line.startswith(variable): assert line in str(actual)
def test_summarycol_float_format(self): # Test for latex output of summary_col object desired = r""" ================= y I y II ----------------- const 7.7 12.4 (1.1) (3.2) x1 -0.7 -1.6 (0.2) (0.7) ================= Standard errors in parentheses. """ x = [1, 5, 7, 3, 5] x = add_constant(x) y1 = [6, 4, 2, 7, 4] y2 = [8, 5, 0, 12, 4] reg1 = OLS(y1, x).fit() reg2 = OLS(y2, x).fit() actual = summary_col([reg1, reg2], float_format='%0.1f').as_text() actual = '%s\n' % actual assert_equal(actual, desired)
# QRs same ls_same_qr_fits = [('pct_same_Q{:.2f}'.format(quantile), smf.quantreg('pct_same~{:s} {:s}'.format(col_sc, str_ctrls), data = df_ppd_reg).fit(quantile))\ for quantile in ls_quantiles] # Prepare for output: OLS & QRs ls_rr_op = [ls_dis_ols_fits[1][1]] + [x[1] for x in ls_rr_qr_fits] ls_std_op = [ls_dis_ols_fits[2][1]] + [x[1] for x in ls_std_qr_fits] ls_same_op = [ls_dis_ols_fits[3][1]] + [x[1] for x in ls_same_qr_fits] ls_model_names = ['OLS'] + [u'Q{:2.0f}'.format(quantile*100) for quantile in ls_quantiles] su_rr = summary_col(ls_rr_op, model_names=ls_model_names, stars=True, float_format='%0.2f', info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)), 'R2':lambda x: "{:.2f}".format(x.rsquared)}) su_std = summary_col(ls_std_op, model_names= ls_model_names, float_format='%0.2f', stars=True, info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)), 'R2':lambda x: "{:.2f}".format(x.rsquared)}) su_same = summary_col(ls_same_op, model_names= ls_model_names, float_format='%0.2f', stars=True, info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
def regression(self, portfolio_ret, ff3_factors, umd_factor): # Cleaning DataFrame ff3_factors.index = pd.to_datetime(ff3_factors.index, format='%Y%m%d') umd_factor.index = pd.to_datetime(umd_factor.index, format='%Y%m%d') ff3_factors.rename(columns={'Mkt-RF': 'MKT'}, inplace=True) factors = pd.concat([ff3_factors, umd_factor], axis=1) # Convert in percentile factors = factors.apply(lambda x: x / 100) # Filter factors = factors[factors.index > "2014-01-01"] # Merging the stock and factor returns dataframes together df_stock_factor = pd.merge(portfolio_ret, factors, left_index=True, right_index=True) df_stock_factor['XsRet'] = df_stock_factor['Portfolio Returns'] - \ df_stock_factor['RF'] # Calculating excess returns # Running CAPM and FF3 models. CAPM = smf.ols(formula='XsRet ~ MKT', data=df_stock_factor).fit(cov_type='HAC', cov_kwds={'maxlags': 1}) FF3 = smf.ols(formula='XsRet ~ MKT + SMB + HML', data=df_stock_factor).fit(cov_type='HAC', cov_kwds={'maxlags': 1}) UMD = smf.ols(formula='XsRet ~ MKT + SMB + HML + WML', data=df_stock_factor).fit(cov_type='HAC', cov_kwds={'maxlags': 1}) # t-Stats CAPMtstat = CAPM.tvalues FF3tstat = FF3.tvalues UMDtstat = UMD.tvalues # Coeffs CAPMcoeff = CAPM.params FF3coeff = FF3.params UMDcoeff = UMD.params # DataFrame with coefficients and t-stats results_df = pd.DataFrame( { 'CAPMcoeff': CAPMcoeff, 'CAPMtstat': CAPMtstat, 'FF3coeff': FF3coeff, 'FF3tstat': FF3tstat, 'UMDcoeff': UMDcoeff, 'UMDtstat': UMDtstat }, index=['Intercept', 'MKT', 'SMB', 'HML', 'UMD']) dfoutput = summary_col( [CAPM, FF3, UMD], stars=True, float_format='%0.4f', model_names=['CAPM', 'FF3', 'UMD'], info_dict={ 'N': lambda x: "{0:d}".format(int(x.nobs)), 'Adjusted R2': lambda x: "{:.4f}".format(x.rsquared_adj) }, regressor_order=['Intercept', 'MKT', 'SMB', 'HML', 'UMD']) print(dfoutput) return { 'DataFrame': { 'Portfolio_Factors': df_stock_factor, 'Results': results_df }, 'Factors': { 'Fama-French': FF3, 'CAPM': CAPM, 'UMD': UMD } }
ols4 = sm.ols(formula=" lny ~ lnk + lnA +lnm +lnl", data=data_maize).fit() ols4.summary() data_groundnuts = data.loc[data['cropID'] == 'GROUNDNUTS', :] ols5 = sm.ols(formula=" lny ~ lnk + lnA +lnm +lnl", data=data_groundnuts).fit() ols5.summary() data_bananafood = data.loc[data['cropID'] == 'BANANA FOOD', :] ols6 = sm.ols(formula=" lny ~ lnk + lnA +lnm +lnl", data=data_bananafood).fit() ols6.summary() data_sorghum = data.loc[data['cropID'] == 'SORGHUM', :] ols7 = sm.ols(formula=" lny ~ lnk + lnA +lnm +lnl", data=data_sorghum).fit() ols7.summary() results = summary_col([ols1, ols2, ols3, ols4, ols5, ols7], stars=True) print(results) #%% As in the model data_cassava = data.loc[data['cropID'] == 'CASSAVA', :] ols1 = sm.ols(formula=" lny ~ +lnm", data=data_cassava).fit() ols1.summary() data_swpotatoes = data.loc[data['cropID'] == 'SWEET POTATOES', :] ols2 = sm.ols(formula=" lny ~ +lnm", data=data_swpotatoes).fit() ols2.summary() data_beans = data.loc[data['cropID'] == 'BEANS', :] ols3 = sm.ols(formula=" lny ~ +lnm", data=data_beans).fit() ols3.summary()
fit2.rsquared fit2.rsquared_adj # ## factor (from continuous variable) auto.columns auto['displacement'].describe() pd.qcut(auto['displacement'], 3, labels=['low', 'med', 'high']) auto['disp2'] = pd.qcut(auto['displacement'], 3, labels=['low', 'med', 'high']) fit3 = ols('mpg ~ weight + disp2', data=auto).fit() fit3.summary() fit3b = ols("mpg ~ weight + C(disp2, Treatment(reference='med'))", data=auto).fit() fit3b.summary() # ## compare models summary_col([fit1, fit2, fit3], stars=True) # ## write results to CSV file with open('mod3_summary.csv', 'w') as f: f.write(fit3.summary().as_csv()) # ## table for regression models tab_params = pd.concat([fit1.params, fit2.params, fit3.params], axis=1, keys=['Model 1', 'Model 2', 'Model 3']) tab_bse = pd.concat([fit1.bse, fit2.bse, fit3.bse], axis=1, keys=['Model 1', 'Model 2', 'Model 3']) tab_params['stat'] = 'Beta' tab_bse['stat'] = 'Std Error' tab_all = tab_params.append(tab_bse)
print(reg2.summary()) reg3 = sm.OLS( y, mergeddata[['const', 'gdp', 'PopChange', 'easebus', 'Inno', 'highedu']]).fit() print(reg3.summary()) reg4 = sm.OLS(y, mergeddata[['const', 'gdp', 'PopChange', 'Inno', 'highedu']]).fit() print(reg4.summary()) reg5 = sm.OLS(y, mergeddata[['const', 'PopChange', 'Inno', 'highedu']]).fit() print(reg5.summary()) allreg = [reg0, reg1, reg2, reg3, reg4, reg5] output = summary_col(allreg, stars=True, float_format='%0.2f', info_dict={ 'N': lambda x: "{0:d}".format(int(x.nobs)), 'R2': lambda x: "{:.2f}".format(x.rsquared) }) print(output) output_as_html = output.as_html() outputdf = pd.read_html(output_as_html, header=0, index_col=0)[0] outputdf.to_csv(os.getcwd() + '/multiregression table.csv') ## Visualizations #Inno & HighEdu fig, ax = plt.subplots() ax.scatter(mergeddata['Inno'],
reg = smf.ols("spread ~ selic + inad + ibc", data=series).fit() reg_info = { "Observações": lambda x: x.nobs, "R^2": lambda x: x.rsquared, "R^2 Ajustado": lambda x: x.rsquared_adj, "Estatística F": lambda x: f"{x.fvalue:.3f} ({x.f_pvalue:.3f})", "Jarque-Bera": lambda x: f"{jarque_bera(x.resid)[0]:.3f} ({jarque_bera(x.resid)[1]:.3f})", "Dickey-Fuller": lambda x: f"{adfuller(x.resid, maxlag=1, autolag=None)[0]:.3f} ({adfuller(x.resid, maxlag=1, autolag=None)[1]:.3f})", "Durbin-Watson": lambda x: f"{durbin_watson(x.resid):.3f}" } print(summary_col([reg], stars=True, info_dict=reg_info).as_latex()) print(Stargazer([reg]).render_latex()) reg_resid = reg.resid.shift(1).dropna() reg_resid.name = "equilibrio" y = d_series.spread, X = pd.concat([reg_resid, d_series.selic, d_series.inad, d_series.ibc], axis="columns") ecm = sm.OLS( endog=d_series.spread, exog=pd.concat([reg_resid, d_series.selic, d_series.inad, d_series.ibc], axis="columns"), ).fit()
# summary
reg2.summary()

# display the results in a single table: summary_col
from statsmodels.iolib.summary2 import summary_col

info_dict = {
    'R_squared': lambda x: "{:.2f}".format(x.rsquared),
    'No. observations': lambda x: "{0:d}".format(int(x.nobs))
}

results_table = summary_col(
    results=[reg1, reg2, reg3],
    float_format='%0.2f',
    stars=True,
    model_names=['Model 1', 'Model 3', 'Model 4'],
    info_dict=info_dict,
    regressor_order=['const', 'avexpr', 'lat_abst', 'asia', 'africa'])

results_table.add_title('Table 2 - OLS Regressions')
print(results_table)

## Two-stage least squares (2SLS) regression: addresses endogeneity issues (biased and inconsistent estimates)

# Dropping NA's is required to use numpy's polyfit
df1_subset2 = df1.dropna(subset=['logem4', 'avexpr'])
df1_subset2.head()

X = df1_subset2['logem4']
y = df1_subset2['avexpr']
def PortfolioFactorReg(df_stk): # Reading in factor data df_factors = web.DataReader('F-F_Research_Data_5_Factors_2x3_daily', 'famafrench')[0] df_factors.rename(columns={'Mkt-RF': 'MKT'}, inplace=True) #Convert PCT Returns back to log returns df_factors['MKT'] = np.log(df_factors['MKT'] / 100 + 1) #equiv of np.log(FV/PV) df_factors['SMB'] = np.log(df_factors['SMB'] / 100 + 1) df_factors['HML'] = np.log(df_factors['HML'] / 100 + 1) df_factors['RMW'] = np.log(df_factors['RMW'] / 100 + 1) df_factors['CMA'] = np.log(df_factors['CMA'] / 100 + 1) df_stk.name = "Returns" df_stock_factor = pd.concat([df_stk, df_factors], axis=1).dropna( ) # Merging the stock and factor returns dataframes together print("Factor Regression Start: {}".format(df_stock_factor.index[0])) print("Factor Regression End: {}".format(df_stock_factor.index[-1])) df_stock_factor['XsRet'] = df_stock_factor['Returns'] - df_stock_factor[ 'RF'] # Calculating excess returns # Running CAPM, FF3, and FF5 models. CAPM = sm.ols(formula='XsRet ~ MKT', data=df_stock_factor).fit(cov_type='HAC', cov_kwds={'maxlags': 1}) FF3 = sm.ols(formula='XsRet ~ MKT + SMB + HML', data=df_stock_factor).fit(cov_type='HAC', cov_kwds={'maxlags': 1}) FF5 = sm.ols(formula='XsRet ~ MKT + SMB + HML + RMW + CMA', data=df_stock_factor).fit(cov_type='HAC', cov_kwds={'maxlags': 1}) CAPMtstat = CAPM.tvalues FF3tstat = FF3.tvalues FF5tstat = FF5.tvalues CAPMcoeff = CAPM.params FF3coeff = FF3.params FF5coeff = FF5.params # DataFrame with coefficients and t-stats results_df = pd.DataFrame( { 'CAPMcoeff': CAPMcoeff, 'CAPMtstat': CAPMtstat, 'FF3coeff': FF3coeff, 'FF3tstat': FF3tstat, 'FF5coeff': FF5coeff, 'FF5tstat': FF5tstat }, index=['Intercept', 'MKT', 'SMB', 'HML', 'RMW', 'CMA']) dfoutput = summary_col( [CAPM, FF3, FF5], stars=True, float_format='%0.4f', model_names=['CAPM', 'FF3', 'FF5'], info_dict={ 'N': lambda x: "{0:d}".format(int(x.nobs)), 'Adjusted R2': lambda x: "{:.4f}".format(x.rsquared_adj) }, regressor_order=['Intercept', 'MKT', 'SMB', 'HML', 'RMW', 'CMA']) print("MKT Cummulative Returns: {}".format(df_factors['MKT'].sum())) print(dfoutput) return results_df
for column in columns:
    new_columns.append(column)

# build the patsy formula string: 'felony_in_one_year ~ x1 + x2 + ...'
dmatrix = 'felony_in_one_year ~ ' + " + ".join(new_columns)
# print(dmatrix)
print("\n")

y, X = dmatrices(dmatrix, data=evaluation_data, return_type='dataframe')
input_data = sm.add_constant(X)
logit_mod = sm.Probit(y, input_data)
logit_res = logit_mod.fit()
# print(logit_res.summary())
# print(logit_res.params)
A = np.identity(len(logit_res.params))
A = A[1:, :]
list_for_printing.append(logit_res)
# print(logit_res.f_test(A))

from statsmodels.iolib.summary2 import summary_col

dfoutput = summary_col(list_for_printing, stars=True)
print(dfoutput)
ols33 = sm.ols(formula=" Δc ~ Δclimate", data=data).fit() ols33.summary() ols34 = sm.ols(formula=" Δc ~ Δprices", data=data).fit() ols34.summary() ols35 = sm.ols(formula=" Δc ~ Δhealth", data=data).fit() ols35.summary() ols36 = sm.ols(formula=" Δc ~ Δjob", data=data).fit() ols36.summary() ols37 = sm.ols(formula=" Δc ~ Δpests", data=data).fit() ols37.summary() results = summary_col([ols3, ols31, ols32, ols33, ols34, ols35, ols36, ols37], stars=True) print(results) print(results.as_latex()) ols3.bse store_c = pd.DataFrame( np.array([ ols3.params, ols3.bse, ols31.params, ols31.bse, ols32.params, ols32.bse, ols33.params, ols33.bse, ols34.params, ols34.bse, ols35.params, ols35.bse, ols36.params, ols36.bse, ols37.params, ols37.bse ])) print(store_c.to_latex()) # ============================================================================= # Shocks and consumption through gifts?? # =============================================================================
print ("Parameters: ", results.params) print ("Standard errors: ", results.bse) print ("Predicted values: ", results.predict()) preds = results.predict() residuals = results.resid np.savetxt("residuals.csv", residuals, delimiter=",") np.savetxt("fitted_values.csv", preds, delimiter=",") coef = results.params se = results.bse np.savetxt("coef.csv", coef, delimiter=",") np.savetxt("se.csv", se, delimiter=",") print summary_col([results]) # LASSO model ls_model = linear_model.Lasso(alpha=0.1) ls_model.fit(design, tips) coef = list(ls_model.coef_) print ", ".join(map(str, coef)) # Reduced model formula2 = "tip_percent ~ passenger_count + trip_distance + surcharge + tolls_amount + fare_amount + C(tod)" model = smf.ols(formula=formula2, data=data) results2 = model.fit() print results2.summary()
#est_salary_alone.conf_int #confidence interval of coefficient #regression of emission content against sexe est_sex_alone = estimate_OLS( np.array(bts.full_insee_table['SEXE']).reshape((-1, 1))) print('Regressing mean emission content against sex') print(est_sex_alone.summary()) #regression of emission content against wages and sexe est_wages_and_sex = estimate_OLS( bts.full_insee_table[['log_salary_value', 'SEXE']]) print('Regressing mean emission content against wages and sex') print(est_wages_and_sex.summary()) df = summary_col([est_wages_and_sex], stars=True, float_format='%0.3f', info_dict={'$R^2$': lambda x: "{:.3f}".format(x.rsquared)}) latex_str = df.as_latex() eof = '\n' list_of_line = latex_str.split(eof) ##to test output #with open('econometric_results.tex','w') as file: # file.write(df.as_latex()) # file.close() #then tweak the string to format as wanted with open(OUTPUTS_PATH + 'econometric_results.tex', 'w') as file: file.write('\\begin{tabular}{N{3cm}N{2cm}}' + eof) file.write('\\toprule' + eof) file.write('dependent variable & log carbon intensity\\\\' + eof)
print('Parameters: ', results.params)
print('Standard errors: ', results.bse)
print('Predicted values: ', results.predict())

preds = results.predict()
residuals = results.resid
np.savetxt('residuals.csv', residuals, delimiter=",")
np.savetxt('fitted_values.csv', preds, delimiter=",")

coef = results.params
se = results.bse
np.savetxt('coef.csv', coef, delimiter=",")
np.savetxt('se.csv', se, delimiter=",")

print(summary_col([results]))

# LASSO model
ls_model = linear_model.Lasso(alpha=0.1)
ls_model.fit(design, tips)
coef = list(ls_model.coef_)
print(', '.join(map(str, coef)))

# Reduced model
formula2 = 'tip_percent ~ passenger_count + trip_distance + surcharge + tolls_amount + fare_amount + C(tod)'
model = smf.ols(formula=formula2, data=data)
results2 = model.fit()
print(results2.summary())

preds = results2.predict()
residuals = results2.resid
est0.summary()
predict = est0.predict()

# now use the transformed, within-congress ("wc") specificity as DV
est1 = sm.ols(formula='wc_specZ ~ ideoDiff', missing='drop', data=hd).fit()
est1.summary()
est1.mse_resid
est1.mse_total

est2 = sm.ols(formula='wc_specZ ~ ideoDiff + seniority', missing='drop', data=hd).fit()
est2.summary()

# using the between-congress ("bc") specificity as DV
est2 = sm.ols(formula='bc_specZ ~ ideoDiff', missing='drop', data=hd).fit()
est2.summary()

print(summary_col([est0, est1, est2], stars=True, float_format='%0.2f',
                  info_dict={'N': lambda x: "{0:d}".format(int(x.nobs)),
                             'R2': lambda x: "{:.2f}".format(x.rsquared)}).as_latex())

est3 = sm.ols(formula='wc_specZ ~ seniority', missing='drop', data=hd).fit()
est3.summary()

est4 = sm.ols(formula='bc_specZ ~ seniority', missing='drop', data=hd).fit()
est4.summary()

print(summary_col([est3, est4], stars=True, float_format='%0.2f',
                  info_dict={'N': lambda x: "{0:d}".format(int(x.nobs)),
                             'R2': lambda x: "{:.2f}".format(x.rsquared)}).as_latex())

# using the wc DV, but with more variables for robustness check
est5 = sm.ols(formula='wc_specZ ~ ideoDiff + divgov + interactive + seniority', missing='drop', data=hd).fit()
est5.summary()

# using the between-congress ("bc") specificity as DV
est4 = sm.ols(formula='bc_specZ ~ ideoDiff', missing='drop', data=hd).fit()
est4.summary()
"abs_bias_int5~log_ret+volatility+skewness+amihud+maxmin_ratio+time+vol_pre+spread+open_interest+slope+volume+contract_is_call+inter_call_money+inter_put_money+inter_call_skewness", data=used_data, hasconst=True).fit() model_2 = stf.ols( "relative_bias_int5~log_ret+volatility+skewness+amihud+maxmin_ratio+time+vol_pre+spread+open_interest+slope+volume+contract_is_call+inter_call_money+inter_put_money+inter_call_skewness", data=used_data, hasconst=True).fit() model_2_abs = stf.ols( "relative_abs_bias_int5~log_ret+volatility+skewness+amihud+maxmin_ratio+time+vol_pre+spread+open_interest+slope+volume+contract_is_call+inter_call_money+inter_put_money+inter_call_skewness", data=used_data, hasconst=True).fit() summaries = summary_col( [model_1, model_1_abs, model_2, model_2_abs], stars=True, info_dict={ "observations": lambda x: x.nobs, "R-Squared": lambda x: x.rsquared, "Adjusted R-Squared": lambda x: x.rsquared_adj }) re_for_tabular = re.compile(r"\\begin{tabular}[\d\D]*\\end{tabular}") def cut(x): x = re_for_tabular.findall(x)[0] return x with open("drift/regression_table.tex", "w") as f: tex = summaries.as_latex() tex = cut(tex) f.write(tex)
# visualizer.show() # Finalize and render the figure # In[108]: # # Load classification dataset # X, y = load_credit() # cv = StratifiedKFold(5) # visualizer = RFECV(RandomForestClassifier(), cv=cv, scoring='f1_weighted') # visualizer.fit(X, y) # Fit the data to the visualizer # visualizer.show() # Finalize and render the figure # In[125]: #Adding constant column of ones, mandatory for sm.OLS model X_1 = sm.add_constant(X) #Fitting sm.OLS model model = sm.OLS(y, X_1).fit() (model.pvalues).sort_values(ascending=False) # In[154]: # outreg2 output of Stata dfoutput = summary_col([lm, lm2], stars=True) print(dfoutput) # In[ ]: # In[ ]:
# estimate three linear models for all three stock features
X = ['daily_returns_index', 'bullishness', 'news_count', 'sent_std']
#X = ['bullishness_d', 'bullishness_a', 'bullishness_b']
Y1 = 'abnormal_returns'
Y2 = 'volume_dollar'
Y3 = 'volatility_parks'

model1 = panel_regression(X, Y1, 'SentimentHE', 'news', 'robust')
model2 = panel_regression(X, Y2, 'SentimentHE', 'news', 'robust')
model3 = panel_regression(X, Y3, 'SentimentHE', 'news', 'robust')

# generate output for all three models
dfoutput = summary_col([model1, model2, model3], stars=True)
print(dfoutput)

# save output (Summary.as_latex() returns a string, so write it to the file)
today = str(datetime.date.today().strftime("%Y%m%d"))
file_path = "C:\\Users\\jonas\\Documents\\BA_JonasIls\\Literatur & Analysen\\Regression\\{}_panel_regression".format(today)
with open(file_path, "w") as f:
    f.write(dfoutput.as_latex())

sentiment_dict = 'SentimentHE'
company = 'AAPL'
vol_min = 0
sent_min = 0

df_twi = open_df_c2c('AAPL', 'SentimentHE', 0, 0, 'twitter')
df_news = open_df_c2c('AAPL', 'SentimentHE', 0, 0, 'news')
# REGRESSIONS ls_res, ls_names = [], [] for title_temp, df_temp in [['All', df_mds], ['Before', df_mds[df_mds['date'] <= '2012-07-01']], ['After', df_mds[df_mds['date'] >= '2013-02-01']]]: #print() #print('-'*60) #print(title_temp) #print() for disp_stat in ['range', 'std']: formula = '{:s} ~ cost + nb_c_3km'.format(disp_stat) res = smf.ols(formula, data = df_temp).fit() res = res.get_robustcov_results(cov_type = 'cluster', groups = df_temp[['int_id', 'int_date']].values, use_correction = True) #print(disp_stat) #print(res.summary()) ls_res.append(res) ls_names.append(title_temp[0:2] + '-' + disp_stat) su = summary_col(ls_res, model_names=ls_names, stars=True, float_format='%0.2f', info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)), 'R2':lambda x: "{:.2f}".format(x.rsquared)}) print() print(su)
## model_names = [ 'Model ' + str( i ) for i in range( 1, 8) ] ## ## create a variable to hold the statistics to print; this is a dictionary ## info_dict = { '\nn': lambda x: "{0:d}".format( int( x.nobs ) ), 'R2 Adjusted': lambda x: "{:0.3f}".format( x.rsquared_adj ), 'AIC': lambda x: "{:0.2f}".format( x.aic ), 'F': lambda x: "{:0.2f}".format( x.fvalue ), } ## ## create the portfolio summary table ## summary_table = summary_col( [ reg01, reg02, reg03, reg04, reg05, reg06,reg07 ], float_format = '%0.2f', model_names = model_names, stars = True, info_dict = info_dict ) summary_table.add_title( 'Summary Table for House Price Models' ) print( summary_table ) # In[48]: ##elasticities dYdX = reg02.params[1] eta = dYdX * (df.belowpovlevel.mean()/df.drugabuse.mean()) print( 'eta = ', round(eta, 4))
formula="win ~ teamId + C(firstBlood) + C(firstTower) + C(firstInhibitor) \ + C(firstBaron) + C(firstDragon) + C(firstRiftHerald) + towerKills + \ inhibitorKills + baronKills + dragonKills + riftHeraldKills", data=teamStats, ) res1 = mod.fit() textfile = open("output/team/regressions/win_on_teamStats.txt", "w") print( summary_col( [res1], stars=True, float_format="%0.2f", model_names=["\n(0)"], info_dict={ "N": lambda x: "{:d}".format(int(x.nobs)), "R2": lambda x: f"{x.rsquared:.2f}", }, ).as_text() ) textfile.write( summary_col( [res1], stars=True, float_format="%0.2f", model_names=["\n(0)"], info_dict={ "N": lambda x: "{:d}".format(int(x.nobs)), "R2": lambda x: f"{x.rsquared:.2f}", },
stats = { 'R\sq': lambda x: f"{x.rsquared:.4f}", 'Adjusted R\sq': lambda x: f"{x.rsquared_adj:.4f}", 'Observations': lambda x: f"{int(x.nobs):d}" } coefs = ['lshares', 'ldiv0', 'lalpha'] #'\\multicolumn{2}{c|}{\\textbf{BLP}} & \\multicolumn{2}{c}{\\textbf{Nevo}} caption = '\\\\caption*{Table 7: Correlation with WTP Measure for Nevo (2000b) and Berry et al. (1999).}' outreg = summary_col(results=models, float_format='%0.4f', stars=True, info_dict=stats, model_names=names, regressor_order=coefs, drop_omitted=True) tab_wtp = outreg.as_latex() tab_wtp = re.sub(r'\*\*\*', '*', tab_wtp) tab_wtp = re.sub(r'hline', 'toprule', tab_wtp, count=1) tab_wtp = re.sub(r'hline', 'bottomrule', tab_wtp, count=1) tab_wtp = re.sub(r'lshares', '$\\\log(s_{jt})$', tab_wtp) tab_wtp = re.sub(r'ldiv0', '$\\\log(D_{j,0})$', tab_wtp) tab_wtp = re.sub(r'lalpha', '$\\\log(\\\\abs{\\\\alpha_i})$', tab_wtp) tab_wtp = re.sub(r'\nR\\sq', '\n \\\hline $R^2$', tab_wtp) tab_wtp = re.sub(r'R\\sq', '$R^2$', tab_wtp) tab_wtp = re.sub(r'\begin{table}', '\begin{table}\footnotesize', tab_wtp) tab_wtp = re.sub(r'\\caption{}', caption, tab_wtp)
os.chdir( "/Users/manunavjeevan/Desktop/UCLA/Second Year/Winter 2020/IO/Problem Set 1" ) data = pd.read_csv('dataCleaned.csv') data.head() data #Part 1: Logit ## Want to run a regression of logged share differences against ## price and promotion y = data['shareDiff'] x = data[['price', 'prom']] #x = sm.add_constant(x) model1 = sm.OLS(y, x).fit() print(model1.summary()) print(Stargazer([model1]).render_latex()) summary_col([model1]).as_latex() ## price, promotion, and a dummy for brand brandDummies = pd.get_dummies(data['brand'], prefix='brand') x = data[['price', 'prom']].join(brandDummies) #x = sm.add_constant(x) model2 = sm.OLS(y, x).fit() print(model2.summary()) print(Stargazer([model2]).render_latex()) print(summary_col([model2]).as_latex()) ## Price, promotion and store*brand data['storeBrand'] = data.store + data['brand'] / 100 storeBrandDummies = pd.get_dummies(data['storeBrand']) storeBrandDummies x = data[['price', 'prom']].join(storeBrandDummies)
def summary(fit_list): return summary_col(fit_list, float_format='%0.4f', info_dict={'N': lambda x: "{0:d}".format(int(x.nobs)), 'R2': lambda x: "{:.2f}".format(x.rsquared)}, stars=True ).tables[0]
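# A minimal usage sketch for the summary() helper above. The toy data, variable
# names, and formulas here are made up purely for illustration.
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
toy = pd.DataFrame({"x": rng.normal(size=100), "z": rng.normal(size=100)})
toy["y"] = 1.0 + 2.0 * toy["x"] - 0.5 * toy["z"] + rng.normal(size=100)

fit1 = smf.ols("y ~ x", data=toy).fit()
fit2 = smf.ols("y ~ x + z", data=toy).fit()

# summary() returns the coefficient/SE DataFrame with the N and R2 rows appended
print(summary([fit1, fit2]))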
schema = { 'Date': "date", 'Daily change in cumulative total': "daily_tests", 'Cumulative total': "total_tests", 'Cumulative total per thousand': "total_per_thousand", 'Daily change in cumulative total per thousand': "delta_per_thousand", '7-day smoothed daily change': "smoothed_delta", '7-day smoothed daily change per thousand': "smoothed_delta_per_thousand", 'Short-term positive rate': "positivity", 'Short-term tests per case': "tests_per_case" } testing = pd.read_csv("data/covid-testing-all-observations.csv", parse_dates=["Date"]) testing = testing[testing["ISO code"] == "IND"]\ .dropna()\ [schema.keys()]\ .rename(columns = schema) testing["month"] = testing.date.dt.month def formula(order: int) -> str: powers = " + ".join(f"np.power(delta_per_thousand, {i + 1})" for i in range(order)) return f"smoothed_delta ~ -1 + daily_tests + C(month)*({powers})" model = OLS.from_formula(formula(order = 3), data = testing).fit() print(summary_col(model, regressor_order = ["daily_tests"], drop_omitted = True)) plt.plot(0.2093 * df["TT"][:, "delta", "tested"], label = "test-scaled") plt.plot( df["TT"][:, "delta", "confirmed"], label = "confirmed") plt.legend() plt.show()
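# For reference, the formula string that the formula() helper above builds for a
# quadratic fit (order=2 shown for brevity; the model above uses order=3 with the
# same pattern):
assert formula(2) == (
    "smoothed_delta ~ -1 + daily_tests + "
    "C(month)*(np.power(delta_per_thousand, 1) + np.power(delta_per_thousand, 2))"
)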
def generate_yearly_dict(table_reviews, table_census): models = [] mor = [] morp = [] for year in years: #prep census = pd.read_csv(table_census) reviews = pd.read_csv(table_reviews) sf = clean_dataframe(census, reviews, year, False) apply_log(sf) sf = transform_dataframe(sf) model = lin_reg(sf, "index") models.append(model) print(model.summary()) residuals_dict = dict(model.resid) residuals_df = pd.DataFrame.from_dict(data=residuals_dict, orient="index", columns=["resid"]) residuals_df.index.name = "Ward" mor_index = moran(shapefile, residuals_df) mor.append(round(mor_index[0], 2)) morp.append(round(mor_index[1], 2)) dfoutput = summary_col(models, stars=True, float_format='%0.2f', info_dict={ 'R2': lambda x: "{:.2f}".format(x.rsquared), 'Adj-R2': lambda x: "{:.2f}".format(x.rsquared_adj), 'F-stat': lambda x: "{:.2f}".format(x.f_pvalue) }) print(dfoutput) html_format = dfoutput.as_html() summ = pd.read_html(html_format, header=0, index_col=0)[0] columns = list(summ) morans = pd.Series(dict(zip(columns, mor))) morans.name = "Moran's test" moransp = pd.Series(dict(zip(columns, morp))) moransp.name = "Moran's p" print(columns) summ = summ.append(morans) summ = summ.append(moransp) print(list(summ.index)) summ = summ[summ.index.notnull()] #remove NaN rows cols = [ 'R2', 'Adj-R2', 'F-stat', 'median_age', 'bohemian', 'stem', 'foreign_born', 'diversity', 'number_bedrooms', 'dep_children', 'economic_activity', 'students', 'distance_to_center', 'distance_to_work', "Moran's test", "Moran's p" ] summ = summ.reindex(index=cols) summ.fillna('-') print(summ) latex_format = format_table(summ.to_latex()) return latex_format
# Panel regression with fixed effects panel_ols = sm.ols(formula='gdppc ~ elepc + C(country) + C(year)', data=data_set).fit(cov_type='HC1') print(panel_ols.summary()) # Panel regression with fixed effects and lagged independent variable. panel_ols_lagele = sm.ols(formula='gdppc ~ lagelepc + C(country) + C(year)', data=data_set).fit(cov_type='HC1') print(panel_ols_lagele.summary()) tble = summary_col( [normal_ols, panel_ols, panel_ols_lagele], stars=True, float_format='%0.2f', model_names=['OLS\n(1)', 'Panel\n(2)', 'Lagged Panel\n(3)'], info_dict={ 'N': lambda x: "{0:d}".format(int(x.nobs)), 'R2': lambda x: "{:.2f}".format(x.rsquared) }, regressor_order=['elepc', 'lagelepc', 'Intercept'], drop_omitted=True) print(tble) f = open('res.tex', 'w') f.write(tble.as_latex()) f.close() # Quantile regression, plot the results. mod = sm.quantreg('gdppc ~ elepc + C(country) + C(year)', data_set) quantiles = np.arange(.05, .96, .1)
list_n = [] for item in list_crops: print(item) ols = sm.ols(formula=" lny ~ lnk + lnA +lnm +lnl", data=data.loc[data['crop_name'] == item, :]).fit() print(ols.summary()) ftest = ols.f_test(" lnk +lnm +lnA +lnl = 1") list_ols.append(ols) list_ftest.append(ftest) n = len(ols.fittedvalues) list_n.append(n) results_1 = summary_col([ list_ols[0], list_ols[1], list_ols[2], list_ols[3], list_ols[4], list_ols[5], list_ols[6], list_ols[7] ], stars=True) results_1 = summary_col([ list_ols[1], list_ols[2], list_ols[4], list_ols[5], list_ols[6], list_ols[7] ], stars=True) print(results_1) print(results_1.as_latex()) results_2 = summary_col([ list_ols[8], list_ols[9], list_ols[11], list_ols[12], list_ols[13], list_ols[14] ],
ols_res = smf.ols('pct_rr ~ {:s}'.format(d_dist), data = df_compa).fit() #print() #print(ols_res.summary()) ls_res = [] ls_quantiles = [0.25, 0.5, 0.75] # use 0.7501 if issue for quantile in ls_quantiles: #print() #print(quantile) #print(smf.quantreg('pct_rr~d_dist_5', data = df_repro_compa).fit(quantile).summary()) ls_res.append(smf.quantreg('pct_rr ~ {:s}'.format(d_dist), data = df_compa[~df_compa[d_dist].isnull()]).fit(quantile)) print(summary_col([ols_res] + ls_res, stars=True, float_format='%0.2f', model_names=['OLS'] + [u'Q{:2.0f}'.format(quantile*100) for quantile in ls_quantiles], info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)), 'R2':lambda x: "{:.2f}".format(x.rsquared)})) # WITH CONTROLS ols_res_ctrl = smf.ols('pct_rr ~ {:s} + {:s}'.format(d_dist, str_ev), data = df_compa).fit() #print() #print(ols_res_ctrl.summary()) ls_res_ctrl = ([smf.quantreg('pct_rr ~ {:s} + {:s}'.format(d_dist, str_ev), data = df_compa[~df_compa[d_dist].isnull()]).fit(quantile) for quantile in ls_quantiles]) print(summary_col([ols_res_ctrl] + ls_res_ctrl, stars=True,
WML = portreturn11[['date', 'long_short', 'mret', 'IBt-1', 'IU', 'vola']] WML = WML.rename(columns={'IBt-1': 'IBtm1'}) WML['volatm1'] = WML['vola'].shift(1) WML['vartm1'] = WML['volatm1'] * WML['volatm1'] WML['IBmret'] = WML['IBtm1'] * WML['mret'] WML['IBIUmret'] = WML['IBtm1'] * WML['IU'] * WML['mret'] WML['IBvar'] = WML['IBtm1'] * WML['vartm1'] #table3 market return result = sm.formula.ols('long_short ~ 1+mret', missing='drop', data=WML).fit() result2 = sm.formula.ols('long_short ~ 1+IBtm1+mret+IBmret', missing='drop', data=WML).fit() result3 = sm.formula.ols('long_short ~ 1+IBtm1+mret+IBmret+IBIUmret', missing='drop', data=WML).fit() output = summary_col([result, result2, result3], stars=True) print(output) #table5 lagged market variance result4 = sm.formula.ols('long_short ~ 1+IBtm1', missing='drop', data=WML).fit() result5 = sm.formula.ols('long_short ~ 1+vartm1', missing='drop', data=WML).fit() result6 = sm.formula.ols('long_short ~ 1+IBtm1+vartm1', missing='drop', data=WML).fit() result7 = sm.formula.ols('long_short ~ 1+IBvar', missing='drop', data=WML).fit() result8 = sm.formula.ols('long_short ~ 1+IBtm1+vartm1+IBvar', missing='drop', data=WML).fit() output2 = summary_col([result4, result5, result6, result7, result8],
rsq_update(r)

# Print Output
info_dict = {'R\sq': lambda x: f"{x.rsquared:.4f}",
             'N': lambda x: f"{int(x.nobs):d}"}

dfoutput = summary_col(results=[reg1, reg2, reg3, reg4],
                       float_format='%0.4f',
                       stars=True,
                       model_names=['(1)', '(2)', '(3)', '(4)'],
                       info_dict=info_dict,
                       # regressor_order takes the raw regressor names used in the models;
                       # pretty display names are substituted into the LaTeX below
                       regressor_order=['retail_share', 'lcap', 'marginsq',
                                        'normalized_l2', 'big3', 'blackrock',
                                        'vanguard', 'statestreet'],
                       drop_omitted=True)

# Clean up the TeX by hand for the table
tab_reg2 = re.sub(r'\*\*\*', '*', dfoutput.as_latex())
tab_reg3 = re.sub(r'hline', 'toprule', tab_reg2, count=1)
tab_reg4 = re.sub(r'hline', 'bottomrule', tab_reg3, count=1)
tab_reg5 = re.sub(r'retail\_share', 'Retail Share', tab_reg4)

# Display table and save
Xl = Xl.rename(columns=lambda x: re.sub("\[|\]", "_", x)) # Estimate average treatment effects from statsmodels.iolib.summary2 import summary_col tmp = pd.DataFrame( dict( birthweight=bw, treatment=treatment, assisted_delivery=df.loc[X.index, "good_assisted_delivery"], )) usage = smf.ols("assisted_delivery ~ treatment", data=tmp).fit(cov_type="cluster", cov_kwds={"groups": loc_id}) health = smf.ols("bw ~ treatment", data=tmp).fit(cov_type="cluster", cov_kwds={"groups": loc_id}) print(summary_col([usage, health])) # for clustering standard errors def get_treatment_se(fit, cluster_id, rows=None): if cluster_id is not None: if rows is None: rows = [True] * len(cluster_id) vcov = sm.stats.sandwich_covariance.cov_cluster( fit, cluster_id.loc[rows]) return np.sqrt(np.diag(vcov)) return fit.HC0_se # Creating generic ml model
"cv ~ C(section) + mean", "iq_pct ~ C(section) + mean", "std_res ~ C(section) + mean", "iq_pct_res ~ C(section) + mean"] ls_res = [smf.ols(formula, data = df_sub).fit() for formula in ls_formulas] from statsmodels.iolib.summary2 import summary_col #print(summary_col(ls_res, # stars=True, # float_format='%0.2f')) print() print(summary_col(ls_res, stars=True, float_format='%0.2f', model_names=['{:d}'.format(i) for i in range(len(ls_formulas))], info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)), 'R2':lambda x: "{:.2f}".format(x.rsquared)})) print() print(summary_col(ls_res, stars=True, float_format='%0.2f', model_names=['{:d}'.format(i) for i in range(len(ls_formulas))], info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)), 'R2':lambda x: "{:.2f}".format(x.rsquared)}).as_latex()) # todo: stats des by family (mean std) # (two tables + merge under excel?) # nb prods / price / std / cv / res std / iq / iq res ? ls_desc_cols = ['mean', 'std', 'cv', 'std_res',
# Simple ols ls_dis_ols_fits = [(str_formula, smf.ols(formula = str_formula, data = df_ppd_reg).fit())\ for str_formula in ls_sc_ols_formulas] df_dis_ols_res = format_ls_reg_fits_to_df(ls_dis_ols_fits, [col_sc]) print() print(df_dis_ols_res.to_string()) # QRs rank reversals ls_rr_qr_fits = [('rr_sc_Q{:.2f}'.format(quantile), smf.quantreg('pct_rr~{:s}'.format(col_sc), data = df_ppd_reg).fit(quantile))\ for quantile in ls_quantiles] df_rr_qr_fits = format_ls_reg_fits_to_df(ls_rr_qr_fits, [col_sc]) print() print(df_rr_qr_fits.to_string()) # QRs standard deviation ls_std_qr_fits = [('std_sc_Q{:.2f}'.format(quantile), smf.quantreg('std_spread~{:s}'.format(col_sc), data = df_ppd_reg).fit(quantile))\ for quantile in ls_quantiles] df_std_qr_fits = format_ls_reg_fits_to_df(ls_std_qr_fits, [col_sc]) print() print(df_std_qr_fits.to_string()) print(summary_col(ls_std_qr_fits, stars=True, float_format='%0.2f', model_names=[u'Q{:2.0f}'.format(quantile*100) for quantile in ls_quantiles], info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)), 'R2':lambda x: "{:.2f}".format(x.rsquared)}))
## read data into a data frame which we call df df = pd.read_csv(path_to_data) ## describe the data df.describe() ## Regression reg1 = smf.ols('effort ~ wage ', data=df).fit() reg2 = smf.ols('effort ~ wage + period', data=df).fit() print(reg1.summary()) ## Regression: print summary print(summary_col([reg1, reg2],stars=True)) ## Scatter plots sns.set_context("poster", font_scale=1) sns.relplot(x='wage', y='effort', data=df) ## Plot equitable effort for each wage level w = np.linspace(0,10,101) plt.plot(w, 3.84 *(1.04*w + 2.25)**(0.5)-5.77) ## Clear plot plt.clf()
def reg_table(models, **kwargs): """ Take a list or dict of sm.RegressionResults objects and create a nice table. Summary: (Default) If True, return a summary_col object (from sm.iolib.summary2), which allows for as_text and as_latex Orgtbl: If True, return an orgtable (uses df_to_orgtbl) for the OLS model params. Resultdf: Returns the coefficient and SE df's for modification and subsequent entry into df_to_orgtbl. Useful for adding other columns/rows, like control-group means table_info: A list of model statistics that can be included at the bottom (like with stata's esttab) Allows for "N", "R2", "R2-adj", "F-stat" Defaults to just "N" Transpose: Places outcomes on left with regressors on top. """ summary = kwargs.setdefault("summary", True) orgtbl = kwargs.setdefault("orgtbl", False) resultdf = kwargs.setdefault("resultdf", False) table_info = kwargs.setdefault("table_info", "N") Transpose = kwargs.setdefault("Transpose", False) summary = not any((orgtbl, resultdf)) #~ Summary by default #~ Construct the Summary table, using either table or df_to_orgtbl if table_info: if type(table_info) not in (list, tuple): table_info = [table_info] info_dict = { "N": lambda model: model.nobs, "R2": lambda model: model.rsquared, "R2-adj": lambda model: model.rsquared_adj, "F-stat": lambda model: model.fvalue } info_dict = dict([(x, info_dict[x]) for x in table_info]) if summary: from statsmodels.iolib import summary2 Summary = summary2.summary_col(list(models.values()), stars=True, float_format='%.3f', info_dict=info_dict) #~ This mangles much of the pretty left to the Summary2 object and returns a pd.DF w/o se's if Transpose: Summary = Summary.tables[0].T.drop("", 1) else: # Extras = lambda model: pd.Series({"N":model.nobs}) # results = pd.DataFrame({Var:model.params.append(Extras(model)) for Var,model in models.iteritems()}) try: xrange Ms = lambda: models.iteritems() except NameError: Ms = lambda: models.items() results = pd.DataFrame({Var: model.params for Var, model in Ms()}) SEs = pd.DataFrame({Var: model.bse for Var, model in Ms()}) if table_info: try: info_dict.iteritems() info_items = lambda: info_dict.iteritems() except AttributeError: info_items = lambda: info_dict.items() extras = pd.DataFrame({ Var: pd.Series({name: stat(model) for name, stat in info_items()}) for Var, model in Ms() }) results = results.append(extras) if Transpose: results, SEs = results.T, SEs.T if orgtbl: Summary = df_to_orgtbl(results, sedf=SEs) else: assert (resultdf) Summary = results, SEs return Summary
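# A small usage sketch for reg_table() above, exercising only the default
# summary=True path (toy data; model names and variables are illustrative).
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(1)
toy = pd.DataFrame({"x": rng.normal(size=50), "d": rng.integers(0, 2, size=50)})
toy["y"] = 0.5 + 1.5 * toy["x"] + 0.8 * toy["d"] + rng.normal(size=50)

models = {"Bivariate": smf.ols("y ~ x", data=toy).fit(),
          "With control": smf.ols("y ~ x + d", data=toy).fit()}

# on this path reg_table returns a summary2 Summary object, so it prints
# like regular summary_col output with the requested N and R2 rows
tab = reg_table(models, table_info=["N", "R2"])
print(tab)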
def table_3(df): s = (' All', 'classes of', ' groom', 'Excluding ', 'grooms of class ', '1 and 2') row_name = [ 'Dependent variable', 'Percent of soldiers killed x postwar', '', 'Postwar', '', 'Rural', '', 'Bride’s age (/100)', '', 'Groom’s Age (/100)', '', 'Groom class dummies', 'Département dummies', '$R^{2}$', 'Observations' ] table3_1 = pd.DataFrame.from_dict( { s[0]: [], s[1]: [], s[2]: [], s[3]: [], s[4]: [], s[5]: [] }, orient='index').T table3_1[' '] = row_name table3_1 = table3_1.set_index(' ') table3_1.loc['Dependent variable'] = [ 'Class difference', 'Married down', 'Low-class bride', 'Class difference', 'Married down', 'Low-class bride' ] for ind in [ 'classdiff', 'mardn', 'lowbr', 'post_mortality', 'post', 'rural', 'agebrd', 'agegrd', 'clgr', 'depc' ]: df[ind] = pd.to_numeric(df[ind], downcast='float') lst = ['classdiff', 'mardn', 'lowbr'] sample = [df, df[df['clgr'] >= 3]] i = 0 for k in [0, 1]: df_a = sample[k] for var in lst: i += 1 formula = var + "~ post_mortality + post + rural + agebrd + agegrd + C(clgr) + C(depc)" results = smf.ols(formula, data=df_a).fit(cov_type='cluster', cov_kwds={ 'groups': df_a[[ var, 'post_mortality', 'post', 'rural', 'agebrd', 'agegrd', 'clgr', 'depc' ]].dropna()['depc'] }) table_star = summary_col([results], stars=True).tables[0] a = [] a.append(int(results.nobs)) b = table_star.loc['post_mortality':'R-squared', ].to_numpy( ).tolist() + ['Yes', 'Yes'] + a b = list(flatten(b)) b[10], b[11], b[12] = b[12], b[11], b[10] j = i - 1 table3_1.iloc[1:15, j] = b return table3_1