Example #1
def test_summary_col():
    from statsmodels.iolib.summary2 import summary_col
    ids = [1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3]
    x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    # hard coded simulated y
    # ids = np.asarray(ids)
    # np.random.seed(123987)
    # y = x + np.array([-1, 0, 1])[ids - 1] + 2 * np.random.randn(len(x))
    y = np.array([
        1.727, -1.037, 2.904, 3.569, 4.629, 5.736, 6.747, 7.020, 5.624, 10.155,
        10.400, 17.164, 17.276, 14.988, 14.453
    ])
    d = {'Y': y, 'X': x, 'IDS': ids}
    d = pd.DataFrame(d)

    # provide start_params to speed up convergence
    sp1 = np.array([-1.26722599, 1.1617587, 0.19547518])
    mod1 = MixedLM.from_formula('Y ~ X', d, groups=d['IDS'])
    results1 = mod1.fit(start_params=sp1)
    sp2 = np.array([3.48416861, 0.55287862, 1.38537901])
    mod2 = MixedLM.from_formula('X ~ Y', d, groups=d['IDS'])
    results2 = mod2.fit(start_params=sp2)

    out = summary_col([results1, results2], stars=True)
    s = ('\n=============================\n              Y         X    \n'
         '-----------------------------\nGroup Var 0.1955    1.3854   \n'
         '          (0.6032)  (2.7377) \nIntercept -1.2672   3.4842*  \n'
         '          (1.6546)  (1.8882) \nX         1.1618***          \n'
         '          (0.1959)           \nY                   0.5529***\n'
         '                    (0.2080) \n=============================\n'
         'Standard errors in\nparentheses.\n* p<.1, ** p<.05, ***p<.01')
    assert_equal(str(out), s)
Example #2
 def test_summarycol_drop_omitted(self):
     # gh-3702
     x = [1, 5, 7, 3, 5]
     x = add_constant(x)
     x2 = np.concatenate([x, np.array([[3], [9], [-1], [4], [0]])], 1)
     y1 = [6, 4, 2, 7, 4]
     y2 = [8, 5, 0, 12, 4]
     reg1 = OLS(y1, x).fit()
     reg2 = OLS(y2, x2).fit()
     actual = summary_col([reg1, reg2], regressor_order=['const', 'x1'],
                          drop_omitted=True)
     assert 'x2' not in str(actual)
     actual = summary_col([reg1, reg2], regressor_order=['x1'],
                          drop_omitted=False)
     assert 'const' in str(actual)
     assert 'x2' in str(actual)
Example #3
    def test_summarycol(self):
        # Test for latex output of summary_col object
        desired = r'''
\begin{table}
\caption{}
\begin{center}
\begin{tabular}{lcc}
\hline
      &   y I    &   y II    \\
\midrule
\midrule
const & 7.7500   & 12.4231   \\
      & (1.1058) & (3.1872)  \\
x1    & -0.7500  & -1.5769   \\
      & (0.2368) & (0.6826)  \\
\hline
\end{tabular}
\end{center}
\end{table}
'''
        x = [1,5,7,3,5]
        x = add_constant(x)
        y1 = [6,4,2,7,4]
        y2 = [8,5,0,12,4]
        reg1 = OLS(y1,x).fit()
        reg2 = OLS(y2,x).fit()
        actual = summary_col([reg1,reg2]).as_latex()
        actual = '\n%s\n' % actual
        assert_equal(desired, actual)
Example #4
 def test_summary_col_ordering_preserved(self):
     # gh-3767
     x = [1, 5, 7, 3, 5]
     x = add_constant(x)
     x2 = np.concatenate([x, np.array([[3], [9], [-1], [4], [0]])], 1)
     y1 = [6, 4, 2, 7, 4]
     y2 = [8, 5, 0, 12, 4]
     reg1 = OLS(y1, x2).fit()
     reg2 = OLS(y2, x2).fit()
     info_dict = {'R2': lambda x: '{:.3f}'.format(x.rsquared),
                  'N': lambda x: '{0:d}'.format(int(x.nobs))}
     original = actual = summary_col([reg1, reg2], float_format='%0.4f')
     actual = summary_col([reg1, reg2], regressor_order=['x2', 'x1'],
                          float_format='%0.4f',
                          info_dict=info_dict)
     variables = ('const', 'x1', 'x2')
     for line in str(original).split('\n'):
         for variable in variables:
             if line.startswith(variable):
                 assert line in str(actual)
Example #5
    def test_summarycol_float_format(self):
        # Test for text output of summary_col object with float_format
        desired = r"""
=================
       y I   y II
-----------------
const 7.7   12.4 
      (1.1) (3.2)
x1    -0.7  -1.6 
      (0.2) (0.7)
=================
Standard errors
in parentheses.
"""
        x = [1, 5, 7, 3, 5]
        x = add_constant(x)
        y1 = [6, 4, 2, 7, 4]
        y2 = [8, 5, 0, 12, 4]
        reg1 = OLS(y1, x).fit()
        reg2 = OLS(y2, x).fit()
        actual = summary_col([reg1, reg2], float_format='%0.1f').as_text()
        actual = '%s\n' % actual
        assert_equal(actual, desired)
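Example #6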
  # QRs same
  ls_same_qr_fits  = [('pct_same_Q{:.2f}'.format(quantile),
                       smf.quantreg('pct_same~{:s} {:s}'.format(col_sc, str_ctrls),
                                     data = df_ppd_reg).fit(quantile))\
                          for quantile in ls_quantiles]

  # Prepare for output: OLS & QRs
  ls_rr_op = [ls_dis_ols_fits[1][1]] + [x[1] for x in ls_rr_qr_fits]
  ls_std_op = [ls_dis_ols_fits[2][1]] + [x[1] for x in ls_std_qr_fits]
  ls_same_op = [ls_dis_ols_fits[3][1]] + [x[1] for x in ls_same_qr_fits]

  ls_model_names = ['OLS'] + [u'Q{:2.0f}'.format(quantile*100) for quantile in ls_quantiles]

  su_rr = summary_col(ls_rr_op,
                      model_names=ls_model_names,
                      stars=True,
                      float_format='%0.2f',
                      info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                                 'R2':lambda x: "{:.2f}".format(x.rsquared)})
  
  su_std = summary_col(ls_std_op,
                       model_names= ls_model_names,
                       float_format='%0.2f',
                       stars=True,
                       info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                                  'R2':lambda x: "{:.2f}".format(x.rsquared)})

  su_same = summary_col(ls_same_op,
                        model_names= ls_model_names,
                        float_format='%0.2f',
                        stars=True,
                        info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                                   'R2':lambda x: "{:.2f}".format(x.rsquared)})
Example #7
    def regression(self, portfolio_ret, ff3_factors, umd_factor):
        # Cleaning DataFrame
        ff3_factors.index = pd.to_datetime(ff3_factors.index, format='%Y%m%d')
        umd_factor.index = pd.to_datetime(umd_factor.index, format='%Y%m%d')
        ff3_factors.rename(columns={'Mkt-RF': 'MKT'}, inplace=True)
        factors = pd.concat([ff3_factors, umd_factor], axis=1)
        # Convert percent returns to decimals
        factors = factors.apply(lambda x: x / 100)
        # Filter
        factors = factors[factors.index > "2014-01-01"]

        # Merging the stock and factor returns dataframes together
        df_stock_factor = pd.merge(portfolio_ret,
                                   factors,
                                   left_index=True,
                                   right_index=True)


        df_stock_factor['XsRet'] = df_stock_factor['Portfolio Returns'] - \
            df_stock_factor['RF']  # Calculating excess returns

        # Running CAPM and FF3 models.
        CAPM = smf.ols(formula='XsRet ~ MKT',
                       data=df_stock_factor).fit(cov_type='HAC',
                                                 cov_kwds={'maxlags': 1})

        FF3 = smf.ols(formula='XsRet ~ MKT + SMB + HML',
                      data=df_stock_factor).fit(cov_type='HAC',
                                                cov_kwds={'maxlags': 1})

        UMD = smf.ols(formula='XsRet ~ MKT + SMB + HML + WML',
                      data=df_stock_factor).fit(cov_type='HAC',
                                                cov_kwds={'maxlags': 1})

        # t-Stats
        CAPMtstat = CAPM.tvalues
        FF3tstat = FF3.tvalues
        UMDtstat = UMD.tvalues

        # Coeffs
        CAPMcoeff = CAPM.params
        FF3coeff = FF3.params
        UMDcoeff = UMD.params

        # DataFrame with coefficients and t-stats
        results_df = pd.DataFrame(
            {
                'CAPMcoeff': CAPMcoeff,
                'CAPMtstat': CAPMtstat,
                'FF3coeff': FF3coeff,
                'FF3tstat': FF3tstat,
                'UMDcoeff': UMDcoeff,
                'UMDtstat': UMDtstat
            },
            index=['Intercept', 'MKT', 'SMB', 'HML', 'UMD'])

        dfoutput = summary_col(
            [CAPM, FF3, UMD],
            stars=True,
            float_format='%0.4f',
            model_names=['CAPM', 'FF3', 'UMD'],
            info_dict={
                'N': lambda x: "{0:d}".format(int(x.nobs)),
                'Adjusted R2': lambda x: "{:.4f}".format(x.rsquared_adj)
            },
            regressor_order=['Intercept', 'MKT', 'SMB', 'HML', 'UMD'])
        print(dfoutput)
        return {
            'DataFrame': {
                'Portfolio_Factors': df_stock_factor,
                'Results': results_df
            },
            'Factors': {
                'Fama-French': FF3,
                'CAPM': CAPM,
                'UMD': UMD
            }
        }
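Example #8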
ols4 = sm.ols(formula=" lny ~ lnk + lnA +lnm +lnl", data=data_maize).fit()
ols4.summary()

data_groundnuts = data.loc[data['cropID'] == 'GROUNDNUTS', :]
ols5 = sm.ols(formula=" lny ~ lnk + lnA +lnm +lnl", data=data_groundnuts).fit()
ols5.summary()

data_bananafood = data.loc[data['cropID'] == 'BANANA FOOD', :]
ols6 = sm.ols(formula=" lny ~ lnk + lnA +lnm +lnl", data=data_bananafood).fit()
ols6.summary()

data_sorghum = data.loc[data['cropID'] == 'SORGHUM', :]
ols7 = sm.ols(formula=" lny ~ lnk + lnA +lnm +lnl", data=data_sorghum).fit()
ols7.summary()

results = summary_col([ols1, ols2, ols3, ols4, ols5, ols7], stars=True)
print(results)

#%% As in the model
data_cassava = data.loc[data['cropID'] == 'CASSAVA', :]
ols1 = sm.ols(formula=" lny ~  +lnm", data=data_cassava).fit()
ols1.summary()

data_swpotatoes = data.loc[data['cropID'] == 'SWEET POTATOES', :]
ols2 = sm.ols(formula=" lny ~  +lnm", data=data_swpotatoes).fit()
ols2.summary()

data_beans = data.loc[data['cropID'] == 'BEANS', :]
ols3 = sm.ols(formula=" lny ~  +lnm", data=data_beans).fit()
ols3.summary()
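Example #9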
fit2.rsquared
fit2.rsquared_adj

# ## factor (from continuous variable)
auto.columns
auto['displacement'].describe()
pd.qcut(auto['displacement'], 3, labels=['low', 'med', 'high'])
auto['disp2'] = pd.qcut(auto['displacement'], 3, labels=['low', 'med', 'high'])
fit3 = ols('mpg ~ weight + disp2', data=auto).fit()
fit3.summary()
fit3b = ols("mpg ~ weight + C(disp2, Treatment(reference='med'))",
            data=auto).fit()
fit3b.summary()

# ## compare models
summary_col([fit1, fit2, fit3], stars=True)

# ## write results to CSV file
with open('mod3_summary.csv', 'w') as f:
    f.write(fit3.summary().as_csv())

# ## table for regression models
tab_params = pd.concat([fit1.params, fit2.params, fit3.params],
                       axis=1,
                       keys=['Model 1', 'Model 2', 'Model 3'])
tab_bse = pd.concat([fit1.bse, fit2.bse, fit3.bse],
                    axis=1,
                    keys=['Model 1', 'Model 2', 'Model 3'])
tab_params['stat'] = 'Beta'
tab_bse['stat'] = 'Std Error'
tab_all = pd.concat([tab_params, tab_bse])
Example #10
print(reg2.summary())
reg3 = sm.OLS(
    y, mergeddata[['const', 'gdp', 'PopChange', 'easebus', 'Inno',
                   'highedu']]).fit()
print(reg3.summary())
reg4 = sm.OLS(y, mergeddata[['const', 'gdp', 'PopChange', 'Inno',
                             'highedu']]).fit()
print(reg4.summary())
reg5 = sm.OLS(y, mergeddata[['const', 'PopChange', 'Inno', 'highedu']]).fit()
print(reg5.summary())

allreg = [reg0, reg1, reg2, reg3, reg4, reg5]
output = summary_col(allreg,
                     stars=True,
                     float_format='%0.2f',
                     info_dict={
                         'N': lambda x: "{0:d}".format(int(x.nobs)),
                         'R2': lambda x: "{:.2f}".format(x.rsquared)
                     })

print(output)
output_as_html = output.as_html()
outputdf = pd.read_html(output_as_html, header=0, index_col=0)[0]

outputdf.to_csv(os.getcwd() + '/multiregression table.csv')

## Visualizations

#Inno & HighEdu
fig, ax = plt.subplots()
ax.scatter(mergeddata['Inno'], mergeddata['highedu'])
Example #11
File: coint.py Project: phelipetls/mono
reg = smf.ols("spread ~ selic + inad + ibc", data=series).fit()

reg_info = {
    "Observações": lambda x: x.nobs,
    "R^2": lambda x: x.rsquared,
    "R^2 Ajustado": lambda x: x.rsquared_adj,
    "Estatística F": lambda x: f"{x.fvalue:.3f} ({x.f_pvalue:.3f})",
    "Jarque-Bera":
    lambda x: f"{jarque_bera(x.resid)[0]:.3f} ({jarque_bera(x.resid)[1]:.3f})",
    "Dickey-Fuller": lambda x:
    f"{adfuller(x.resid, maxlag=1, autolag=None)[0]:.3f} ({adfuller(x.resid, maxlag=1, autolag=None)[1]:.3f})",
    "Durbin-Watson": lambda x: f"{durbin_watson(x.resid):.3f}"
}

print(summary_col([reg], stars=True, info_dict=reg_info).as_latex())

print(Stargazer([reg]).render_latex())

reg_resid = reg.resid.shift(1).dropna()
reg_resid.name = "equilibrio"

y = d_series.spread
X = pd.concat([reg_resid, d_series.selic, d_series.inad, d_series.ibc],
              axis="columns")

ecm = sm.OLS(endog=y, exog=X).fit()
Example #12
# summary
reg2.summary()

# display the results in a single table: summary_col

from statsmodels.iolib.summary2 import summary_col

info_dict = {
    'R_squared': lambda x: "{:.2f}".format(x.rsquared),
    'No. observations': lambda x: "{0:d}".format(int(x.nobs))
}

results_table = summary_col(
    results=[reg1, reg2, reg3],
    float_format='%0.2f',
    stars=True,
    model_names=['Model 1', 'Model 3', 'Model 4'],
    info_dict=info_dict,
    regressor_order=['const', 'avexpr', 'lat_abst', 'asia', 'africa'])

results_table.add_title('Table 2 - OLS Regressions')

print(results_table)

## Two-stage least squares (2SLS) regression: addresses endogeneity (biased and inconsistent OLS estimates)

# Dropping NA's is required to use numpy's polyfit
df1_subset2 = df1.dropna(subset=['logem4', 'avexpr'])
df1_subset2.head()
X = df1_subset2['logem4']
y = df1_subset2['avexpr']
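
# Hedged sketch of the 2SLS step the comment above points toward, using
# statsmodels' sandbox IV2SLS: instrument avexpr with logem4. The outcome
# column name 'logpgp95' is an assumption based on the Acemoglu-style
# setup this snippet appears to follow.
import statsmodels.api as sm
from statsmodels.sandbox.regression.gmm import IV2SLS

df_iv = df1.dropna(subset=['logpgp95', 'avexpr', 'logem4'])
iv_res = IV2SLS(endog=df_iv['logpgp95'],                      # outcome
                exog=sm.add_constant(df_iv['avexpr']),        # endogenous regressor
                instrument=sm.add_constant(df_iv['logem4'])   # instrument
                ).fit()
print(iv_res.summary())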
Example #13
def PortfolioFactorReg(df_stk):
    # Reading in factor data
    df_factors = web.DataReader('F-F_Research_Data_5_Factors_2x3_daily',
                                'famafrench')[0]
    df_factors.rename(columns={'Mkt-RF': 'MKT'}, inplace=True)
    #Convert PCT Returns back to log returns
    df_factors['MKT'] = np.log(df_factors['MKT'] / 100 +
                               1)  #equiv of np.log(FV/PV)
    df_factors['SMB'] = np.log(df_factors['SMB'] / 100 + 1)
    df_factors['HML'] = np.log(df_factors['HML'] / 100 + 1)
    df_factors['RMW'] = np.log(df_factors['RMW'] / 100 + 1)
    df_factors['CMA'] = np.log(df_factors['CMA'] / 100 + 1)
    df_stk.name = "Returns"
    df_stock_factor = pd.concat([df_stk, df_factors], axis=1).dropna(
    )  # Merging the stock and factor returns dataframes together
    print("Factor Regression Start: {}".format(df_stock_factor.index[0]))
    print("Factor Regression End: {}".format(df_stock_factor.index[-1]))

    df_stock_factor['XsRet'] = df_stock_factor['Returns'] - df_stock_factor[
        'RF']  # Calculating excess returns
    # Running CAPM, FF3, and FF5 models.
    CAPM = sm.ols(formula='XsRet ~ MKT',
                  data=df_stock_factor).fit(cov_type='HAC',
                                            cov_kwds={'maxlags': 1})
    FF3 = sm.ols(formula='XsRet ~ MKT + SMB + HML',
                 data=df_stock_factor).fit(cov_type='HAC',
                                           cov_kwds={'maxlags': 1})
    FF5 = sm.ols(formula='XsRet ~ MKT + SMB + HML + RMW + CMA',
                 data=df_stock_factor).fit(cov_type='HAC',
                                           cov_kwds={'maxlags': 1})

    CAPMtstat = CAPM.tvalues
    FF3tstat = FF3.tvalues
    FF5tstat = FF5.tvalues

    CAPMcoeff = CAPM.params
    FF3coeff = FF3.params
    FF5coeff = FF5.params

    # DataFrame with coefficients and t-stats
    results_df = pd.DataFrame(
        {
            'CAPMcoeff': CAPMcoeff,
            'CAPMtstat': CAPMtstat,
            'FF3coeff': FF3coeff,
            'FF3tstat': FF3tstat,
            'FF5coeff': FF5coeff,
            'FF5tstat': FF5tstat
        },
        index=['Intercept', 'MKT', 'SMB', 'HML', 'RMW', 'CMA'])

    dfoutput = summary_col(
        [CAPM, FF3, FF5],
        stars=True,
        float_format='%0.4f',
        model_names=['CAPM', 'FF3', 'FF5'],
        info_dict={
            'N': lambda x: "{0:d}".format(int(x.nobs)),
            'Adjusted R2': lambda x: "{:.4f}".format(x.rsquared_adj)
        },
        regressor_order=['Intercept', 'MKT', 'SMB', 'HML', 'RMW', 'CMA'])
    print("MKT Cummulative Returns: {}".format(df_factors['MKT'].sum()))
    print(dfoutput)

    return results_df
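Example #14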
for column in columns:
    new_columns.append(column)
    # build the regression formula from the accumulated indicator columns
    dmatrix = 'felony_in_one_year ~ ' + ' + '.join(new_columns)
    # print(dmatrix)
    print("\n")
    y, X = dmatrices(dmatrix, data=evaluation_data, return_type='dataframe')
    input_data = sm.add_constant(X)
    logit_mod = sm.Probit(y, input_data)
    logit_res = logit_mod.fit()
    # print(logit_res.summary())
    # print(logit_res.params)
    A = np.identity(len(logit_res.params))
    A = A[1:, :]
    list_for_printing.append(logit_res)
    # print(logit_res.f_test(A))

from statsmodels.iolib.summary2 import summary_col
dfoutput = summary_col(list_for_printing, stars=True)
print(dfoutput)
Example #15
def reg_table(models,**kwargs):
    """ Take a list or dict of sm.RegressionResults objects and create a nice table.
     Summary: (Default)
       If True, return a summary_col object (from sm.iolib.summary2), which allows for as_text and as_latex
     Orgtbl:
       If True, return an orgtable (uses df_to_orgtbl) for the OLS model params.
     Resultdf:
       Returns the coefficient and SE df's for modification and subsequent entry into df_to_orgtbl.
       Useful for adding other columns/rows, like control-group means
     table_info:
       A list of model statistics that can be included at the bottom (like with stata's esttab)
       Allows for "N", "R2", "R2-adj", "F-stat"
       Defaults to just "N"
     Transpose:
       Places outcomes on left with regressors on top.
    """

    summary    = kwargs.setdefault("summary",   True)
    orgtbl     = kwargs.setdefault("orgtbl",    False)
    resultdf   = kwargs.setdefault("resultdf",  False)
    table_info = kwargs.setdefault("table_info", "N")
    Transpose  = kwargs.setdefault("Transpose", False)
    summary    = not any((orgtbl, resultdf)) #~ Summary by default
 
    #~ Construct the Summary table, using either table or df_to_orgtbl
    if table_info:
        if type(table_info) not in (list,tuple): table_info=[table_info]
        info_dict = {"N": lambda model: model.nobs,
                     "R2": lambda model: model.rsquared,
                     "R2-adj": lambda model: model.rsquared_adj,
                     "F-stat": lambda model: model.fvalue}
        info_dict = dict([(x,info_dict[x]) for x in table_info])

    if summary:
        from statsmodels.iolib import summary2
        Summary = summary2.summary_col(list(models.values()), stars=True, float_format='%.3f',info_dict=info_dict)
        #~ This mangles much of the pretty left to the Summary2 object and returns a pd.DF w/o se's
        if Transpose: Summary = Summary.tables[0].T.drop("", axis=1)

    else:
        # Extras = lambda model: pd.Series({"N":model.nobs})
        # results = pd.DataFrame({Var:model.params.append(Extras(model)) for Var,model in models.iteritems()})
        try:
            xrange
            Ms = lambda: models.iteritems()
        except NameError: Ms = lambda: models.items()
        results = pd.DataFrame({Var:model.params for Var,model in Ms()})
        SEs     = pd.DataFrame({Var:model.bse    for Var,model in Ms()})
        if table_info:
            try:
                info_dict.iteritems()
                info_items = lambda: info_dict.iteritems()
            except AttributeError: info_items = lambda: info_dict.items()
            extras = pd.DataFrame({Var: pd.Series({name:stat(model) for name,stat in info_items()}) for Var,model in Ms()})
            results = pd.concat([results, extras])
        if Transpose: results,SEs = results.T, SEs.T

        if orgtbl: Summary = df_to_orgtbl(results,sedf=SEs)
        else:
            assert(resultdf)
            Summary = results, SEs

    return Summary
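
# Hedged usage sketch for reg_table (illustrative names, not from the
# original project): fit two OLS models on synthetic data, pass them in
# as a dict (reg_table calls models.values()), and print the default
# summary_col table.
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.RandomState(0)
demo = pd.DataFrame({'x': rng.randn(50)})
demo['y'] = 1 + 2 * demo['x'] + rng.randn(50)
exog = sm.add_constant(demo[['x']])
fits = {'(1)': sm.OLS(demo['y'], exog).fit(),
        '(2)': sm.OLS(demo['y'].abs(), exog).fit()}
print(reg_table(fits, table_info=['N', 'R2']).as_text())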
Example #16
ols33 = sm.ols(formula=" Δc ~ Δclimate", data=data).fit()
ols33.summary()

ols34 = sm.ols(formula=" Δc ~ Δprices", data=data).fit()
ols34.summary()

ols35 = sm.ols(formula=" Δc ~ Δhealth", data=data).fit()
ols35.summary()

ols36 = sm.ols(formula=" Δc ~ Δjob", data=data).fit()
ols36.summary()

ols37 = sm.ols(formula=" Δc ~ Δpests", data=data).fit()
ols37.summary()

results = summary_col([ols3, ols31, ols32, ols33, ols34, ols35, ols36, ols37],
                      stars=True)
print(results)
print(results.as_latex())

ols3.bse
store_c = pd.DataFrame(
    np.array([
        ols3.params, ols3.bse, ols31.params, ols31.bse, ols32.params,
        ols32.bse, ols33.params, ols33.bse, ols34.params, ols34.bse,
        ols35.params, ols35.bse, ols36.params, ols36.bse, ols37.params,
        ols37.bse
    ]))
print(store_c.to_latex())
# =============================================================================
# Shocks and consumption through gifts??
# =============================================================================
Example #17
File: lm.py Project: itw2000/4201
print ("Parameters: ", results.params)
print ("Standard errors: ", results.bse)
print ("Predicted values: ", results.predict())

preds = results.predict()
residuals = results.resid
np.savetxt("residuals.csv", residuals, delimiter=",")
np.savetxt("fitted_values.csv", preds, delimiter=",")
coef = results.params
se = results.bse

np.savetxt("coef.csv", coef, delimiter=",")
np.savetxt("se.csv", se, delimiter=",")

print(summary_col([results]))


# LASSO model
ls_model = linear_model.Lasso(alpha=0.1)
ls_model.fit(design, tips)
coef = list(ls_model.coef_)
print ", ".join(map(str, coef))


# Reduced model
formula2 = "tip_percent ~ passenger_count + trip_distance + surcharge + tolls_amount + fare_amount + C(tod)"
model = smf.ols(formula=formula2, data=data)
results2 = model.fit()
print(results2.summary())
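Example #18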
#est_salary_alone.conf_int #confidence interval of coefficient

#regression of emission content against sex
est_sex_alone = estimate_OLS(
    np.array(bts.full_insee_table['SEXE']).reshape((-1, 1)))
print('Regressing mean emission content against sex')
print(est_sex_alone.summary())

#regression of emission content against wages and sex
est_wages_and_sex = estimate_OLS(
    bts.full_insee_table[['log_salary_value', 'SEXE']])
print('Regressing mean emission content against wages and sex')
print(est_wages_and_sex.summary())

df = summary_col([est_wages_and_sex],
                 stars=True,
                 float_format='%0.3f',
                 info_dict={'$R^2$': lambda x: "{:.3f}".format(x.rsquared)})
latex_str = df.as_latex()
eof = '\n'
list_of_line = latex_str.split(eof)

##to test output
#with open('econometric_results.tex','w') as file:
#    file.write(df.as_latex())
#    file.close()

#then tweak the string to format as wanted
with open(OUTPUTS_PATH + 'econometric_results.tex', 'w') as file:
    file.write('\\begin{tabular}{N{3cm}N{2cm}}' + eof)
    file.write('\\toprule' + eof)
    file.write('dependent variable & log carbon intensity\\\\' + eof)
Example #19
File: lm.py Project: itw2000/4201
print('Parameters: ', results.params)
print('Standard errors: ', results.bse)
print('Predicted values: ', results.predict())

preds = results.predict()
residuals = results.resid
np.savetxt('residuals.csv', residuals, delimiter=",")
np.savetxt('fitted_values.csv', preds, delimiter=",")
coef = results.params
se = results.bse

np.savetxt('coef.csv', coef, delimiter=",")
np.savetxt('se.csv', se, delimiter=",")

print(summary_col([results]))

#LASSO model
ls_model = linear_model.Lasso(alpha=0.1)
ls_model.fit(design, tips)
coef = list(ls_model.coef_)
print(', '.join(map(str, coef)))

#Reduced model
formula2 = 'tip_percent ~ passenger_count + trip_distance + surcharge + tolls_amount + fare_amount + C(tod)'
model = smf.ols(formula=formula2, data=data)
results2 = model.fit()
print(results2.summary())

preds = results2.predict()
residuals = results2.resid
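Example #20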
est0.summary()
predict = est0.predict()


# now use the transformed, within-congress ("wc") specificity as DV
est1 = sm.ols(formula='wc_specZ ~ ideoDiff', missing='drop', data=hd).fit()
est1.summary()
est1.mse_resid
est1.mse_total
est2 = sm.ols(formula='wc_specZ ~ ideoDiff + seniority', missing='drop', data=hd).fit()
est2.summary()

# using the between-congress ("bc") specificity as DV
est2 = sm.ols(formula='bc_specZ ~ ideoDiff', missing='drop', data=hd).fit()
est2.summary()
print(summary_col([est0, est1, est2], stars=True, float_format='%0.2f', info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),'R2': lambda x: "{:.2f}".format(x.rsquared)}).as_latex())

est3 = sm.ols(formula='wc_specZ ~ seniority ', missing='drop', data=hd).fit()
est3.summary()

est4 = sm.ols(formula='bc_specZ ~ seniority', missing='drop', data=hd).fit()
est4.summary()
print(summary_col([est3, est4], stars=True, float_format='%0.2f', info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),'R2': lambda x: "{:.2f}".format(x.rsquared)}).as_latex())

# using the wc DV, but with more variables for robustness check
est5 = sm.ols(formula='wc_specZ ~ ideoDiff + divgov + interactive + seniority', missing='drop', data=hd).fit()
est5.summary()

# using the between-congress ("bc") specificity as DV
est4 = sm.ols(formula='bc_specZ ~ ideoDiff', missing='drop', data=hd).fit()
est4.summary()
Example #21
    "abs_bias_int5~log_ret+volatility+skewness+amihud+maxmin_ratio+time+vol_pre+spread+open_interest+slope+volume+contract_is_call+inter_call_money+inter_put_money+inter_call_skewness",
    data=used_data,
    hasconst=True).fit()

model_2 = stf.ols(
    "relative_bias_int5~log_ret+volatility+skewness+amihud+maxmin_ratio+time+vol_pre+spread+open_interest+slope+volume+contract_is_call+inter_call_money+inter_put_money+inter_call_skewness",
    data=used_data,
    hasconst=True).fit()
model_2_abs = stf.ols(
    "relative_abs_bias_int5~log_ret+volatility+skewness+amihud+maxmin_ratio+time+vol_pre+spread+open_interest+slope+volume+contract_is_call+inter_call_money+inter_put_money+inter_call_skewness",
    data=used_data,
    hasconst=True).fit()
summaries = summary_col(
    [model_1, model_1_abs, model_2, model_2_abs],
    stars=True,
    info_dict={
        "observations": lambda x: x.nobs,
        "R-Squared": lambda x: x.rsquared,
        "Adjusted R-Squared": lambda x: x.rsquared_adj
    })
re_for_tabular = re.compile(r"\\begin{tabular}[\d\D]*\\end{tabular}")


def cut(x):
    x = re_for_tabular.findall(x)[0]
    return x


with open("drift/regression_table.tex", "w") as f:
    tex = summaries.as_latex()
    tex = cut(tex)
    f.write(tex)
Example #23
# visualizer.show()           # Finalize and render the figure

# In[108]:

# # Load classification dataset
# X, y = load_credit()

# cv = StratifiedKFold(5)
# visualizer = RFECV(RandomForestClassifier(), cv=cv, scoring='f1_weighted')

# visualizer.fit(X, y)        # Fit the data to the visualizer
# visualizer.show()           # Finalize and render the figure

# In[125]:

#Adding constant column of ones, mandatory for sm.OLS model
X_1 = sm.add_constant(X)
#Fitting sm.OLS model
model = sm.OLS(y, X_1).fit()
(model.pvalues).sort_values(ascending=False)

# In[154]:

# outreg2 output of Stata
dfoutput = summary_col([lm, lm2], stars=True)
print(dfoutput)

# In[ ]:

# In[ ]:
Example #24

# estimate three linear models for all three stock features
X = ['daily_returns_index', 'bullishness', 'news_count', 'sent_std']
#X = ['bullishness_d', 'bullishness_a', 'bullishness_b']

Y1 = 'abnormal_returns'
Y2 = 'volume_dollar'
Y3 = 'volatility_parks'

model1 = panel_regression(X, Y1, 'SentimentHE', 'news', 'robust')
model2 = panel_regression(X, Y2, 'SentimentHE', 'news', 'robust')
model3 = panel_regression(X, Y3, 'SentimentHE', 'news', 'robust')

# generate output for all three models
dfoutput = summary_col([model1, model2, model3], stars=True)
print(dfoutput)

# save output
today = str(datetime.date.today().strftime("%Y%m%d"))
file_path = "C:\\Users\\jonas\\Documents\\BA_JonasIls\\Literatur & Analysen\\Regression\\{}_panel_regression".format(
    today)
with open(file_path, "w") as f:
    f.write(dfoutput.as_latex())

sentiment_dict = 'SentimentHE'
company = 'AAPL'
vol_min = 0
sent_min = 0

df_twi = open_df_c2c('AAPL', 'SentimentHE', 0, 0, 'twitter')
df_news = open_df_c2c('AAPL', 'SentimentHE', 0, 0, 'news')
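Example #25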
  # REGRESSIONS
  ls_res, ls_names = [], []
  for title_temp, df_temp in [['All', df_mds],
                              ['Before', df_mds[df_mds['date'] <= '2012-07-01']],
                              ['After', df_mds[df_mds['date'] >= '2013-02-01']]]:
    #print()
    #print('-'*60)
    #print(title_temp)
    #print()
    for disp_stat in ['range', 'std']:
      formula = '{:s} ~ cost + nb_c_3km'.format(disp_stat)
      res = smf.ols(formula,
                     data = df_temp).fit()
      res = res.get_robustcov_results(cov_type = 'cluster',
                                      groups = df_temp[['int_id', 'int_date']].values,
                                      use_correction = True)
      #print(disp_stat)
      #print(res.summary())
      ls_res.append(res)
      ls_names.append(title_temp[0:2] + '-' + disp_stat)
  
  su = summary_col(ls_res,
                   model_names=ls_names,
                   stars=True,
                   float_format='%0.2f',
                   info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                              'R2':lambda x: "{:.2f}".format(x.rsquared)})

  print()
  print(su)
Example #26
## 
model_names = [ 'Model ' + str( i ) for i in range( 1, 8) ]
##
## create a variable to hold the statistics to print; this is a dictionary
##
info_dict = { '\nn': lambda x: "{0:d}".format( int( x.nobs ) ),
              'R2 Adjusted': lambda x: "{:0.3f}".format( x.rsquared_adj ),
              'AIC': lambda x: "{:0.2f}".format( x.aic ),
              'F': lambda x: "{:0.2f}".format( x.fvalue ),
}
##
## create the portfolio summary table
##
summary_table = summary_col( [ reg01, reg02, reg03, reg04, reg05, reg06,reg07 ],
            float_format = '%0.2f',
            model_names = model_names,
            stars = True, 
            info_dict = info_dict 
)
summary_table.add_title( 'Summary Table for House Price Models' )
print( summary_table )


# In[48]:


##elasticities

dYdX = reg02.params[1]
eta = dYdX * (df.belowpovlevel.mean()/df.drugabuse.mean())
print( 'eta = ', round(eta, 4))
Example #27
    formula="win ~ teamId + C(firstBlood) + C(firstTower) +  C(firstInhibitor) \
    + C(firstBaron) + C(firstDragon) + C(firstRiftHerald) + towerKills + \
    inhibitorKills + baronKills + dragonKills + riftHeraldKills",
    data=teamStats,
)
res1 = mod.fit()


textfile = open("output/team/regressions/win_on_teamStats.txt", "w")

print(
    summary_col(
        [res1],
        stars=True,
        float_format="%0.2f",
        model_names=["\n(0)"],
        info_dict={
            "N": lambda x: "{:d}".format(int(x.nobs)),
            "R2": lambda x: f"{x.rsquared:.2f}",
        },
    ).as_text()
)
textfile.write(
    summary_col(
        [res1],
        stars=True,
        float_format="%0.2f",
        model_names=["\n(0)"],
        info_dict={
            "N": lambda x: "{:d}".format(int(x.nobs)),
            "R2": lambda x: f"{x.rsquared:.2f}",
        },
    ).as_text()
)
Example #28
stats = {
    'R\\sq': lambda x: f"{x.rsquared:.4f}",
    'Adjusted R\\sq': lambda x: f"{x.rsquared_adj:.4f}",
    'Observations': lambda x: f"{int(x.nobs):d}"
}

coefs = ['lshares', 'ldiv0', 'lalpha']

#'\\multicolumn{2}{c|}{\\textbf{BLP}} & \\multicolumn{2}{c}{\\textbf{Nevo}}

caption = '\\\\caption*{Table 7: Correlation with WTP Measure for Nevo (2000b) and Berry et al. (1999).}'

outreg = summary_col(results=models,
                     float_format='%0.4f',
                     stars=True,
                     info_dict=stats,
                     model_names=names,
                     regressor_order=coefs,
                     drop_omitted=True)

tab_wtp = outreg.as_latex()
tab_wtp = re.sub(r'\*\*\*', '*', tab_wtp)
tab_wtp = re.sub(r'hline', 'toprule', tab_wtp, count=1)
tab_wtp = re.sub(r'hline', 'bottomrule', tab_wtp, count=1)
tab_wtp = re.sub(r'lshares', '$\\\log(s_{jt})$', tab_wtp)
tab_wtp = re.sub(r'ldiv0', '$\\\log(D_{j,0})$', tab_wtp)
tab_wtp = re.sub(r'lalpha', '$\\\log(\\\\abs{\\\\alpha_i})$', tab_wtp)
tab_wtp = re.sub(r'\nR\\sq', '\n \\\hline $R^2$', tab_wtp)
tab_wtp = re.sub(r'R\\sq', '$R^2$', tab_wtp)
tab_wtp = re.sub(r'\\begin{table}', r'\\begin{table}\\footnotesize', tab_wtp)
tab_wtp = re.sub(r'\\caption{}', caption, tab_wtp)
Example #29
os.chdir(
    "/Users/manunavjeevan/Desktop/UCLA/Second Year/Winter 2020/IO/Problem Set 1"
)
data = pd.read_csv('dataCleaned.csv')
data.head()
data
#Part 1: Logit
## Want to run a regression of logged share differences against
## price and promotion
y = data['shareDiff']
x = data[['price', 'prom']]
#x = sm.add_constant(x)
model1 = sm.OLS(y, x).fit()
print(model1.summary())
print(Stargazer([model1]).render_latex())
summary_col([model1]).as_latex()

## price, promotion, and a dummy for brand
brandDummies = pd.get_dummies(data['brand'], prefix='brand')
x = data[['price', 'prom']].join(brandDummies)
#x = sm.add_constant(x)
model2 = sm.OLS(y, x).fit()
print(model2.summary())
print(Stargazer([model2]).render_latex())
print(summary_col([model2]).as_latex())

## Price, promotion and store*brand
data['storeBrand'] = data.store + data['brand'] / 100
storeBrandDummies = pd.get_dummies(data['storeBrand'])
storeBrandDummies
x = data[['price', 'prom']].join(storeBrandDummies)
Example #30
def summary(fit_list):
    return summary_col(fit_list,
                       float_format='%0.4f',
                       info_dict={'N': lambda x: "{0:d}".format(int(x.nobs)),
                                  'R2': lambda x: "{:.2f}".format(x.rsquared)}, stars=True
                       ).tables[0]
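Example #31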
schema = { 
    'Date': "date",
    'Daily change in cumulative total': "daily_tests",
    'Cumulative total': "total_tests",
    'Cumulative total per thousand': "total_per_thousand",
    'Daily change in cumulative total per thousand': "delta_per_thousand",
    '7-day smoothed daily change': "smoothed_delta",
    '7-day smoothed daily change per thousand': "smoothed_delta_per_thousand",
    'Short-term positive rate': "positivity",
    'Short-term tests per case': "tests_per_case"
}

testing = pd.read_csv("data/covid-testing-all-observations.csv", parse_dates=["Date"])
testing = testing[testing["ISO code"] == "IND"]\
            .dropna()\
            [schema.keys()]\
            .rename(columns = schema)
testing["month"]     = testing.date.dt.month

def formula(order: int) -> str: 
    powers = " + ".join(f"np.power(delta_per_thousand, {i + 1})" for i in range(order))
    return f"smoothed_delta ~ -1 + daily_tests + C(month)*({powers})"

model = OLS.from_formula(formula(order = 3), data = testing).fit()
print(summary_col(model, regressor_order = ["daily_tests"], drop_omitted = True))

plt.plot(0.2093 * df["TT"][:, "delta", "tested"],    label = "test-scaled")
plt.plot(         df["TT"][:, "delta", "confirmed"], label = "confirmed")
plt.legend()
plt.show()
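Example #32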
def generate_yearly_dict(table_reviews, table_census):
    models = []
    mor = []
    morp = []

    for year in years:
        #prep
        census = pd.read_csv(table_census)
        reviews = pd.read_csv(table_reviews)
        sf = clean_dataframe(census, reviews, year, False)
        apply_log(sf)
        sf = transform_dataframe(sf)
        model = lin_reg(sf, "index")
        models.append(model)
        print(model.summary())
        residuals_dict = dict(model.resid)
        residuals_df = pd.DataFrame.from_dict(data=residuals_dict,
                                              orient="index",
                                              columns=["resid"])
        residuals_df.index.name = "Ward"
        mor_index = moran(shapefile, residuals_df)
        mor.append(round(mor_index[0], 2))
        morp.append(round(mor_index[1], 2))

    dfoutput = summary_col(models,
                           stars=True,
                           float_format='%0.2f',
                           info_dict={
                               'R2': lambda x: "{:.2f}".format(x.rsquared),
                               'Adj-R2':
                               lambda x: "{:.2f}".format(x.rsquared_adj),
                               'F-stat': lambda x: "{:.2f}".format(x.f_pvalue)
                           })
    print(dfoutput)
    html_format = dfoutput.as_html()
    summ = pd.read_html(html_format, header=0, index_col=0)[0]
    columns = list(summ)
    morans = pd.Series(dict(zip(columns, mor)))
    morans.name = "Moran's test"
    moransp = pd.Series(dict(zip(columns, morp)))
    moransp.name = "Moran's p"
    print(columns)
    summ = pd.concat([summ, morans.to_frame().T])
    summ = pd.concat([summ, moransp.to_frame().T])
    print(list(summ.index))
    summ = summ[summ.index.notnull()]  #remove NaN rows

    cols = [
        'R2', 'Adj-R2', 'F-stat', 'median_age', 'bohemian', 'stem',
        'foreign_born', 'diversity', 'number_bedrooms', 'dep_children',
        'economic_activity', 'students', 'distance_to_center',
        'distance_to_work', "Moran's test", "Moran's p"
    ]
    summ = summ.reindex(index=cols)
    summ = summ.fillna('-')

    print(summ)

    latex_format = format_table(summ.to_latex())

    return latex_format
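Example #33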
# Panel regression with fixed effects
panel_ols = sm.ols(formula='gdppc ~ elepc + C(country) + C(year)',
                   data=data_set).fit(cov_type='HC1')
print(panel_ols.summary())

# Panel regression with fixed effects and lagged independent variable.
panel_ols_lagele = sm.ols(formula='gdppc ~ lagelepc + C(country) + C(year)',
                          data=data_set).fit(cov_type='HC1')
print(panel_ols_lagele.summary())

tble = summary_col(
    [normal_ols, panel_ols, panel_ols_lagele],
    stars=True,
    float_format='%0.2f',
    model_names=['OLS\n(1)', 'Panel\n(2)', 'Lagged Panel\n(3)'],
    info_dict={
        'N': lambda x: "{0:d}".format(int(x.nobs)),
        'R2': lambda x: "{:.2f}".format(x.rsquared)
    },
    regressor_order=['elepc', 'lagelepc', 'Intercept'],
    drop_omitted=True)
print(tble)
f = open('res.tex', 'w')
f.write(tble.as_latex())
f.close()

# Quantile regression, plot the results.
mod = sm.quantreg('gdppc ~ elepc + C(country) + C(year)', data_set)
quantiles = np.arange(.05, .96, .1)

Example #34
list_n = []

for item in list_crops:
    print(item)
    ols = sm.ols(formula=" lny ~ lnk + lnA +lnm +lnl",
                 data=data.loc[data['crop_name'] == item, :]).fit()
    print(ols.summary())
    ftest = ols.f_test(" lnk +lnm +lnA +lnl = 1")
    list_ols.append(ols)
    list_ftest.append(ftest)
    n = len(ols.fittedvalues)
    list_n.append(n)

results_1 = summary_col([
    list_ols[0], list_ols[1], list_ols[2], list_ols[3], list_ols[4],
    list_ols[5], list_ols[6], list_ols[7]
],
                        stars=True)
results_1 = summary_col([
    list_ols[1], list_ols[2], list_ols[4], list_ols[5], list_ols[6],
    list_ols[7]
],
                        stars=True)

print(results_1)
print(results_1.as_latex())

results_2 = summary_col([
    list_ols[8], list_ols[9], list_ols[11], list_ols[12], list_ols[13],
    list_ols[14]
],
                        stars=True)
Example #35
 ols_res = smf.ols('pct_rr ~ {:s}'.format(d_dist), data = df_compa).fit()
 #print()
 #print(ols_res.summary())
 
 ls_res = []
 ls_quantiles = [0.25, 0.5, 0.75] # use 0.7501 if issue
 for quantile in ls_quantiles:
   #print()
   #print(quantile)
   #print(smf.quantreg('pct_rr~d_dist_5', data = df_repro_compa).fit(quantile).summary())
   ls_res.append(smf.quantreg('pct_rr ~ {:s}'.format(d_dist),
                              data = df_compa[~df_compa[d_dist].isnull()]).fit(quantile))
 
 print(summary_col([ols_res] + ls_res,
                   stars=True,
                   float_format='%0.2f',
                   model_names=['OLS'] + [u'Q{:2.0f}'.format(quantile*100) for quantile in ls_quantiles],
                   info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                              'R2':lambda x: "{:.2f}".format(x.rsquared)}))
 
 # WITH CONTROLS
 ols_res_ctrl = smf.ols('pct_rr ~ {:s} + {:s}'.format(d_dist, str_ev),
                          data = df_compa).fit()
 #print()
 #print(ols_res_ctrl.summary())
 
 ls_res_ctrl = ([smf.quantreg('pct_rr ~ {:s} + {:s}'.format(d_dist, str_ev),
                              data = df_compa[~df_compa[d_dist].isnull()]).fit(quantile)
                  for quantile in ls_quantiles])
 
 print(summary_col([ols_res_ctrl] + ls_res_ctrl,
                   stars=True,
                   float_format='%0.2f',
                   model_names=['OLS'] + [u'Q{:2.0f}'.format(quantile*100) for quantile in ls_quantiles],
                   info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                              'R2':lambda x: "{:.2f}".format(x.rsquared)}))
Example #36
WML = portreturn11[['date', 'long_short', 'mret', 'IBt-1', 'IU', 'vola']]
WML = WML.rename(columns={'IBt-1': 'IBtm1'})
WML['volatm1'] = WML['vola'].shift(1)
WML['vartm1'] = WML['volatm1'] * WML['volatm1']
WML['IBmret'] = WML['IBtm1'] * WML['mret']
WML['IBIUmret'] = WML['IBtm1'] * WML['IU'] * WML['mret']
WML['IBvar'] = WML['IBtm1'] * WML['vartm1']
#table3 market return
result = sm.formula.ols('long_short ~ 1+mret', missing='drop', data=WML).fit()
result2 = sm.formula.ols('long_short ~ 1+IBtm1+mret+IBmret',
                         missing='drop',
                         data=WML).fit()
result3 = sm.formula.ols('long_short ~ 1+IBtm1+mret+IBmret+IBIUmret',
                         missing='drop',
                         data=WML).fit()
output = summary_col([result, result2, result3], stars=True)
print(output)
#table5 lagged market variance
result4 = sm.formula.ols('long_short ~ 1+IBtm1', missing='drop',
                         data=WML).fit()
result5 = sm.formula.ols('long_short ~ 1+vartm1', missing='drop',
                         data=WML).fit()
result6 = sm.formula.ols('long_short ~ 1+IBtm1+vartm1',
                         missing='drop',
                         data=WML).fit()
result7 = sm.formula.ols('long_short ~ 1+IBvar', missing='drop',
                         data=WML).fit()
result8 = sm.formula.ols('long_short ~ 1+IBtm1+vartm1+IBvar',
                         missing='drop',
                         data=WML).fit()
output2 = summary_col([result4, result5, result6, result7, result8],
                      stars=True)
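Example #37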
    rsq_update(r)

# Print Output
info_dict = {
    'R\\sq': lambda x: f"{x.rsquared:.4f}",
    'N': lambda x: f"{int(x.nobs):d}"
}

dfoutput = summary_col(results=[reg1, reg2, reg3, reg4],
                       float_format='%0.4f',
                       stars=True,
                       model_names=['(1)', '(2)', '(3)', '(4)'],
                       info_dict=info_dict,
                       regressor_order=[('retail_share', 'Retail Share'),
                                        ('lcap', 'Log(Market Cap)'),
                                        ('marginsq', 'Operating Margin'),
                                        ('normalized_l2', 'Indexing'),
                                        ('big3', 'Big 3 Share'),
                                        ('blackrock', 'BlackRock'),
                                        ('vanguard', 'Vanguard'),
                                        ('statestreet', 'StateStreet')
                                        ],
                       drop_omitted=True)

# Clean up the TeX by hand for the table
tab_reg2 = re.sub(r'\*\*\*', '*', dfoutput.as_latex())
tab_reg3 = re.sub(r'hline', 'toprule', tab_reg2, count=1)
tab_reg4 = re.sub(r'hline', 'bottomrule', tab_reg3, count=1)
tab_reg5 = re.sub(r'retail\\_share', 'Retail Share', tab_reg4)

# Display table and save
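Example #38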
Xl = Xl.rename(columns=lambda x: re.sub(r"\[|\]", "_", x))

# Estimate average treatment effects
from statsmodels.iolib.summary2 import summary_col

tmp = pd.DataFrame(
    dict(
        birthweight=bw,
        treatment=treatment,
        assisted_delivery=df.loc[X.index, "good_assisted_delivery"],
    ))
usage = smf.ols("assisted_delivery ~ treatment",
                data=tmp).fit(cov_type="cluster", cov_kwds={"groups": loc_id})
health = smf.ols("bw ~ treatment", data=tmp).fit(cov_type="cluster",
                                                 cov_kwds={"groups": loc_id})
print(summary_col([usage, health]))


# for clustering standard errors
def get_treatment_se(fit, cluster_id, rows=None):
    if cluster_id is not None:
        if rows is None:
            rows = [True] * len(cluster_id)
        vcov = sm.stats.sandwich_covariance.cov_cluster(
            fit, cluster_id.loc[rows])
        return np.sqrt(np.diag(vcov))

    return fit.HC0_se
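
# Hedged usage sketch: cluster-robust standard errors for the 'usage'
# fit above, clustered on loc_id (both come from earlier in this
# snippet; loc_id is assumed to be a pandas Series of cluster ids).
usage_se = get_treatment_se(usage, loc_id)
print(dict(zip(usage.params.index, usage_se)))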


# Creating generic ml model
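Example #39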
               "cv ~ C(section) + mean",
               "iq_pct ~ C(section) + mean",
               "std_res ~ C(section) + mean",
               "iq_pct_res ~ C(section) + mean"]

ls_res = [smf.ols(formula, data = df_sub).fit() for formula in ls_formulas]

from statsmodels.iolib.summary2 import summary_col
#print(summary_col(ls_res,
#                  stars=True,
#                  float_format='%0.2f'))

print()
print(summary_col(ls_res,
                  stars=True,
                  float_format='%0.2f',
                  model_names=['{:d}'.format(i) for i in range(len(ls_formulas))],
                  info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                             'R2':lambda x: "{:.2f}".format(x.rsquared)}))

print()
print(summary_col(ls_res,
                  stars=True,
                  float_format='%0.2f',
                  model_names=['{:d}'.format(i) for i in range(len(ls_formulas))],
                  info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                             'R2':lambda x: "{:.2f}".format(x.rsquared)}).as_latex())

# todo: stats des by family (mean std)
# (two tables + merge under excel?)
# nb prods / price / std / cv / res std / iq / iq res ?
ls_desc_cols = ['mean', 'std', 'cv', 'std_res',
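Example #40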
  # Simple ols
  ls_dis_ols_fits = [(str_formula,
                     smf.ols(formula = str_formula, data = df_ppd_reg).fit())\
                       for str_formula in ls_sc_ols_formulas]
  df_dis_ols_res = format_ls_reg_fits_to_df(ls_dis_ols_fits, [col_sc])
  print()
  print(df_dis_ols_res.to_string())
  # QRs rank reversals
  ls_rr_qr_fits  = [('rr_sc_Q{:.2f}'.format(quantile),
                     smf.quantreg('pct_rr~{:s}'.format(col_sc),
                                  data = df_ppd_reg).fit(quantile))\
                       for quantile in ls_quantiles]
  df_rr_qr_fits = format_ls_reg_fits_to_df(ls_rr_qr_fits, [col_sc])
  print()
  print(df_rr_qr_fits.to_string())
  # QRs standard deviation
  ls_std_qr_fits  = [('std_sc_Q{:.2f}'.format(quantile),
                      smf.quantreg('std_spread~{:s}'.format(col_sc),
                                    data = df_ppd_reg).fit(quantile))\
                         for quantile in ls_quantiles]
  df_std_qr_fits = format_ls_reg_fits_to_df(ls_std_qr_fits, [col_sc])
  print()
  print(df_std_qr_fits.to_string())

print(summary_col([fit for _, fit in ls_std_qr_fits],
                  stars=True,
                  float_format='%0.2f',
                  model_names=[u'Q{:2.0f}'.format(quantile*100) for quantile in ls_quantiles],
                  info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                             'R2':lambda x: "{:.2f}".format(x.rsquared)}))
Example #41
## read data into a data frame which we call df
df = pd.read_csv(path_to_data)

## describe the data
df.describe()



## Regression
reg1 = smf.ols('effort ~ wage ', data=df).fit()
reg2 = smf.ols('effort ~ wage + period', data=df).fit()
print(reg1.summary())

## Regression: print summary
print(summary_col([reg1, reg2],stars=True))

## Scatter plots
sns.set_context("poster", font_scale=1)

sns.relplot(x='wage', y='effort', data=df)


## Plot equitable effort for each wage level
w = np.linspace(0,10,101)
plt.plot(w, 3.84 *(1.04*w + 2.25)**(0.5)-5.77)



## Clear plot
plt.clf()
Example #42
def reg_table(models, **kwargs):
    """ Take a list or dict of sm.RegressionResults objects and create a nice table.
     Summary: (Default)
       If True, return a summary_col object (from sm.iolib.summary2), which allows for as_text and as_latex
     Orgtbl:
       If True, return an orgtable (uses df_to_orgtbl) for the OLS model params.
     Resultdf:
       Returns the coefficient and SE df's for modification and subsequent entry into df_to_orgtbl.
       Useful for adding other columns/rows, like control-group means
     table_info:
       A list of model statistics that can be included at the bottom (like with stata's esttab)
       Allows for "N", "R2", "R2-adj", "F-stat"
       Defaults to just "N"
     Transpose:
       Places outcomes on left with regressors on top.
    """

    summary = kwargs.setdefault("summary", True)
    orgtbl = kwargs.setdefault("orgtbl", False)
    resultdf = kwargs.setdefault("resultdf", False)
    table_info = kwargs.setdefault("table_info", "N")
    Transpose = kwargs.setdefault("Transpose", False)
    summary = not any((orgtbl, resultdf))  #~ Summary by default

    #~ Construct the Summary table, using either table or df_to_orgtbl
    if table_info:
        if type(table_info) not in (list, tuple): table_info = [table_info]
        info_dict = {
            "N": lambda model: model.nobs,
            "R2": lambda model: model.rsquared,
            "R2-adj": lambda model: model.rsquared_adj,
            "F-stat": lambda model: model.fvalue
        }
        info_dict = dict([(x, info_dict[x]) for x in table_info])

    if summary:
        from statsmodels.iolib import summary2
        Summary = summary2.summary_col(list(models.values()),
                                       stars=True,
                                       float_format='%.3f',
                                       info_dict=info_dict)
        #~ This mangles much of the pretty left to the Summary2 object and returns a pd.DF w/o se's
        if Transpose: Summary = Summary.tables[0].T.drop("", axis=1)

    else:
        # Extras = lambda model: pd.Series({"N":model.nobs})
        # results = pd.DataFrame({Var:model.params.append(Extras(model)) for Var,model in models.iteritems()})
        try:
            xrange
            Ms = lambda: models.iteritems()
        except NameError:
            Ms = lambda: models.items()
        results = pd.DataFrame({Var: model.params for Var, model in Ms()})
        SEs = pd.DataFrame({Var: model.bse for Var, model in Ms()})
        if table_info:
            try:
                info_dict.iteritems()
                info_items = lambda: info_dict.iteritems()
            except AttributeError:
                info_items = lambda: info_dict.items()
            extras = pd.DataFrame({
                Var:
                pd.Series({name: stat(model)
                           for name, stat in info_items()})
                for Var, model in Ms()
            })
            results = pd.concat([results, extras])
        if Transpose: results, SEs = results.T, SEs.T

        if orgtbl: Summary = df_to_orgtbl(results, sedf=SEs)
        else:
            assert (resultdf)
            Summary = results, SEs

    return Summary
Example #43
def table_3(df):

    s = (' All', 'classes of', ' groom', 'Excluding ', 'grooms of class ',
         '1 and 2')
    row_name = [
        'Dependent variable', 'Percent of soldiers killed x postwar', '',
        'Postwar', '', 'Rural', '', 'Bride’s age (/100)', '',
        'Groom’s Age (/100)', '', 'Groom class dummies', 'Département dummies',
        '$R^{2}$', 'Observations'
    ]

    table3_1 = pd.DataFrame.from_dict(
        {
            s[0]: [],
            s[1]: [],
            s[2]: [],
            s[3]: [],
            s[4]: [],
            s[5]: []
        },
        orient='index').T
    table3_1[' '] = row_name
    table3_1 = table3_1.set_index(' ')
    table3_1.loc['Dependent variable'] = [
        'Class difference', 'Married down', 'Low-class bride',
        'Class difference', 'Married down', 'Low-class bride'
    ]

    for ind in [
            'classdiff', 'mardn', 'lowbr', 'post_mortality', 'post', 'rural',
            'agebrd', 'agegrd', 'clgr', 'depc'
    ]:
        df[ind] = pd.to_numeric(df[ind], downcast='float')

    lst = ['classdiff', 'mardn', 'lowbr']
    sample = [df, df[df['clgr'] >= 3]]

    i = 0

    for k in [0, 1]:
        df_a = sample[k]
        for var in lst:
            i += 1
            formula = var + "~ post_mortality + post + rural + agebrd + agegrd + C(clgr) + C(depc)"
            results = smf.ols(formula,
                              data=df_a).fit(cov_type='cluster',
                                             cov_kwds={
                                                 'groups':
                                                 df_a[[
                                                     var, 'post_mortality',
                                                     'post', 'rural', 'agebrd',
                                                     'agegrd', 'clgr', 'depc'
                                                 ]].dropna()['depc']
                                             })
            table_star = summary_col([results], stars=True).tables[0]

            a = []
            a.append(int(results.nobs))
            b = table_star.loc['post_mortality':'R-squared', ].to_numpy(
            ).tolist() + ['Yes', 'Yes'] + a
            b = list(flatten(b))
            b[10], b[11], b[12] = b[12], b[11], b[10]
            j = i - 1
            table3_1.iloc[1:15, j] = b

    return table3_1