def test_panel_robust_cov(): import pandas as pa import statsmodels.datasets.grunfeld as gr from .results.results_panelrobust import results as res_stata dtapa = gr.data.load_pandas() #Stata example/data seems to miss last firm dtapa_endog = dtapa.endog[:200] dtapa_exog = dtapa.exog[:200] res = OLS(dtapa_endog, add_constant(dtapa_exog[['value', 'capital']], prepend=False)).fit() #time indicator in range(max Ti) time = np.asarray(dtapa_exog[['year']]) time -= time.min() time = np.squeeze(time).astype(int) #sw.cov_nw_panel requires bounds instead of index tidx = [(i * 20, 20 * (i + 1)) for i in range(10)] #firm index in range(n_firms) firm_names, firm_id = np.unique(np.asarray(dtapa_exog[['firm']], 'S20'), return_inverse=True) #panel newey west standard errors cov = sw.cov_nw_panel(res, 0, tidx, use_correction='hac') #dropping numpy 1.4 soon #np.testing.assert_allclose(cov, res_stata.cov_pnw0_stata, rtol=1e-6) assert_almost_equal(cov, res_stata.cov_pnw0_stata, decimal=4) cov = sw.cov_nw_panel(res, 1, tidx, use_correction='hac') #np.testing.assert_allclose(cov, res_stata.cov_pnw1_stata, rtol=1e-6) assert_almost_equal(cov, res_stata.cov_pnw1_stata, decimal=4) cov = sw.cov_nw_panel(res, 4, tidx) #check default #np.testing.assert_allclose(cov, res_stata.cov_pnw4_stata, rtol=1e-6) assert_almost_equal(cov, res_stata.cov_pnw4_stata, decimal=4) #cluster robust standard errors cov_clu = sw.cov_cluster(res, firm_id) assert_almost_equal(cov_clu, res_stata.cov_clu_stata, decimal=4) #cluster robust standard errors, non-int groups cov_clu = sw.cov_cluster(res, lmap(str, firm_id)) assert_almost_equal(cov_clu, res_stata.cov_clu_stata, decimal=4) #Driscoll and Kraay panel robust standard errors rcov = sw.cov_nw_groupsum(res, 0, time, use_correction=0) assert_almost_equal(rcov, res_stata.cov_dk0_stata, decimal=4) rcov = sw.cov_nw_groupsum(res, 1, time, use_correction=0) assert_almost_equal(rcov, res_stata.cov_dk1_stata, decimal=4) rcov = sw.cov_nw_groupsum(res, 4, time) #check default assert_almost_equal(rcov, res_stata.cov_dk4_stata, decimal=4)
def test_panel_robust_cov(): import pandas as pa import statsmodels.datasets.grunfeld as gr from results.results_panelrobust import results as res_stata dtapa = gr.data.load_pandas() #Stata example/data seems to miss last firm dtapa_endog = dtapa.endog[:200] dtapa_exog = dtapa.exog[:200] res = OLS(dtapa_endog, add_constant(dtapa_exog[['value', 'capital']], prepend=False)).fit() #time indicator in range(max Ti) time = np.asarray(dtapa_exog[['year']]) time -= time.min() time = np.squeeze(time).astype(int) #sw.cov_nw_panel requires bounds instead of index tidx = [(i*20, 20*(i+1)) for i in range(10)] #firm index in range(n_firms) firm_names, firm_id = np.unique(np.asarray(dtapa_exog[['firm']], 'S20'), return_inverse=True) #panel newey west standard errors cov = sw.cov_nw_panel(res, 0, tidx, use_correction='hac') #dropping numpy 1.4 soon #np.testing.assert_allclose(cov, res_stata.cov_pnw0_stata, rtol=1e-6) assert_almost_equal(cov, res_stata.cov_pnw0_stata, decimal=4) cov = sw.cov_nw_panel(res, 1, tidx, use_correction='hac') #np.testing.assert_allclose(cov, res_stata.cov_pnw1_stata, rtol=1e-6) assert_almost_equal(cov, res_stata.cov_pnw1_stata, decimal=4) cov = sw.cov_nw_panel(res, 4, tidx) #check default #np.testing.assert_allclose(cov, res_stata.cov_pnw4_stata, rtol=1e-6) assert_almost_equal(cov, res_stata.cov_pnw4_stata, decimal=4) #cluster robust standard errors cov_clu = sw.cov_cluster(res, firm_id) assert_almost_equal(cov_clu, res_stata.cov_clu_stata, decimal=4) #cluster robust standard errors, non-int groups cov_clu = sw.cov_cluster(res, map(str, firm_id)) assert_almost_equal(cov_clu, res_stata.cov_clu_stata, decimal=4) #Driscoll and Kraay panel robust standard errors rcov = sw.cov_nw_groupsum(res, 0, time, use_correction=0) assert_almost_equal(rcov, res_stata.cov_dk0_stata, decimal=4) rcov = sw.cov_nw_groupsum(res, 1, time, use_correction=0) assert_almost_equal(rcov, res_stata.cov_dk1_stata, decimal=4) rcov = sw.cov_nw_groupsum(res, 4, time) #check default assert_almost_equal(rcov, res_stata.cov_dk4_stata, decimal=4)
def robust_reg(fit_result, dataframe, cluster_field): # Fails: #clustered = cov_cluster(fit, dfreg['city']) # TODO: submit bug. try: clustered = cov_cluster(fit_result, dataframe[cluster_field], use_correction=True) except: print fit_result.resid.shape, dataframe[cluster_field].shape, set( dataframe[cluster_field].values) raise # From https://github.com/spillz/sci-comp/blob/master/statsmodels-fun/clustered_se.py (def clustered_output(fit_results,group)): cse = numpy.diag(clustered)**0.5 scse = pandas.Series(cse, index=fit.params.index) outp = pandas.DataFrame([fit.params, fit.bse, scse]).transpose() outp.columns = ['Coef', 'SE', 'C1. S'] #print outp # OK, è identica al risultato con STATA, mancano solo i p-value. # Translation of http://www.stata.com/statalist/archive/2006-08/msg00840.html from scipy.stats import t as students_t deg_f = 2 outp['tstat'] = abs(outp['Coef'] / outp['C1. S']) outp['pval'] = 2 * (1 - students_t.cdf(outp['tstat'], deg_f)) # OK, è identica al risultato con STATA. return outp
def cluster_se(fit, gp_name): """ Compute robust "clustered" standard errors. Parameters ========== fit : statsmodels.regression.linear_model.RegressionResultsWrapper The statsmodels fit object obtained from the original regression gp_name : str The name of the group on which the clustering should happen. This needs to be the name of a column in the original DataFrame used to create and fit the model. Returns ======= ser : pd.Series A pandas Series with the variable names and robust standard errors. """ from statsmodels.stats.sandwich_covariance import cov_cluster grp = fit.model.data.frame[gp_name] se = np.diag(cov_cluster(fit, grp))**(1 / 2.) return pd.Series(se, index=fit.params.index)
def cluster_se(fit, gp_name): """ Compute robust "clustered" standard errors. Parameters ========== fit : statsmodels.regression.linear_model.RegressionResultsWrapper The statsmodels fit object obtained from the original regression gp_name : str The name of the group on which the clustering should happen. This needs to be the name of a column in the original DataFrame used to create and fit the model. Returns ======= ser : pd.Series A pandas Series with the variable names and robust standard errors. """ from statsmodels.stats.sandwich_covariance import cov_cluster grp = fit.model.data.frame[gp_name] se = np.diag(cov_cluster(fit, grp)) ** (1/2.) return pd.Series(se, index=fit.params.index)
def setup(self): model = OLS(self.res1.model.endog, self.res1.model.exog) # res_ols = self.res1.model.fit(cov_type='cluster', res_ols = model.fit( cov_type="cluster", cov_kwds=dict( groups=self.groups, use_correction=False, use_t=False, df_correction=True, ), ) self.res3 = self.res1 self.res1 = res_ols self.bse_robust = res_ols.bse self.cov_robust = res_ols.cov_params() cov1 = sw.cov_cluster(self.res1, self.groups, use_correction=False) se1 = sw.se_cov(cov1) self.bse_robust2 = se1 self.cov_robust2 = cov1 self.small = False self.res2 = res2.results_cluster_large self.skip_f = True self.rtol = 1e-6 self.rtolh = 1e-10
def get_robust_clu(cls): res1 = cls.res1 cov_clu = sc.cov_cluster(res1, group) cls.bse_rob = sc.se_cov(cov_clu) nobs, k_vars = res1.model.exog.shape k_params = len(res1.params) #n_groups = len(np.unique(group)) corr_fact = (nobs - 1.) / float(nobs - k_params) # for bse we need sqrt of correction factor cls.corr_fact = np.sqrt(corr_fact)
def get_robust_clu(cls): res1 = cls.res1 cov_clu = sw.cov_cluster(res1, group) cls.bse_rob = sw.se_cov(cov_clu) nobs, k_vars = res1.model.exog.shape k_params = len(res1.params) #n_groups = len(np.unique(group)) corr_fact = (nobs-1.) / float(nobs - k_params) # for bse we need sqrt of correction factor cls.corr_fact = np.sqrt(corr_fact)
def setup(self): res_ols = self.res1.get_robustcov_results("cluster", groups=self.groups, use_correction=True, use_t=True) self.res3 = self.res1 self.res1 = res_ols self.bse_robust = res_ols.bse self.cov_robust = res_ols.cov_params() cov1 = sw.cov_cluster(self.res1, self.groups, use_correction=True) se1 = sw.se_cov(cov1) self.bse_robust2 = se1 self.cov_robust2 = cov1 self.small = True self.res2 = res2.results_cluster_wls_small self.rtol = 1e-6 self.rtolh = 1e-10
def setup(self): res_ols = self.res1.get_robustcov_results( "cluster", groups=self.groups, use_correction=True, use_t=True ) self.res3 = self.res1 self.res1 = res_ols self.bse_robust = res_ols.bse self.cov_robust = res_ols.cov_params() cov1 = sw.cov_cluster(self.res1, self.groups, use_correction=True) se1 = sw.se_cov(cov1) self.bse_robust2 = se1 self.cov_robust2 = cov1 self.small = True self.res2 = res2.results_cluster_wls_small self.rtol = 1e-6 self.rtolh = 1e-10
def setup(self): res_ols = self.res1.model.fit( cov_type="cluster", cov_kwds=dict(groups=self.groups, use_correction=True, use_t=True) ) self.res3 = self.res1 self.res1 = res_ols self.bse_robust = res_ols.bse self.cov_robust = res_ols.cov_params() cov1 = sw.cov_cluster(self.res1, self.groups, use_correction=True) se1 = sw.se_cov(cov1) self.bse_robust2 = se1 self.cov_robust2 = cov1 self.small = True self.res2 = res2.results_cluster self.rtol = 1e-6 self.rtolh = 1e-10
def setup(self): res_ols = self.res1.model.fit(cov_type='cluster', cov_kwds=dict(groups=self.groups, use_correction=True, use_t=True)) self.res3 = self.res1 self.res1 = res_ols self.bse_robust = res_ols.bse self.cov_robust = res_ols.cov_params() cov1 = sw.cov_cluster(self.res1, self.groups, use_correction=True) se1 = sw.se_cov(cov1) self.bse_robust2 = se1 self.cov_robust2 = cov1 self.small = True self.res2 = res2.results_cluster self.rtol = 1e-6 self.rtolh = 1e-10
def se_cluster(fe_results, df, group): """ degrees of freedom adjustment for nested-within-cluster std. err. see http://www.stata.com/statalist/archive/2013-01/msg00596.html """ from numpy.linalg import matrix_rank from statsmodels.stats.sandwich_covariance import cov_cluster cl_se = np.sqrt(cov_cluster(fe_results, df[group]).diagonal()) cols = [v for v in X.columns if not v.startswith("panid")] V = fe_results.cov_params().ix[cols, cols] N = fe_results.nobs N_grp = len(np.unique(df[group].squeeze())) rank = matrix_rank(V) df_cl = N - (rank - 1) - (N_grp - 1) # degrees of freedom for cluster-robust but no nested adjustmnet df_xt = N - (rank - 1) # degrees of freedom for cluster-nested-robust cl_se_adj = cl_se * np.sqrt(df_cl/df_xt) return cl_se_adj
def setup(self): res_ols = self.res1.get_robustcov_results('cluster', groups=self.groups, use_correction=False, use_t=False, df_correction=True) self.res3 = self.res1 self.res1 = res_ols self.bse_robust = res_ols.bse self.cov_robust = res_ols.cov_params() cov1 = sw.cov_cluster(self.res1, self.groups, use_correction=False) se1 = sw.se_cov(cov1) self.bse_robust2 = se1 self.cov_robust2 = cov1 self.small = False self.res2 = res2.results_cluster_large self.skip_f = True self.rtol = 1e-6 self.rtolh = 1e-10
def se_cluster(fe_results, df, group): """ degrees of freedom adjustment for nested-within-cluster std. err. see http://www.stata.com/statalist/archive/2013-01/msg00596.html """ import numpy as np from numpy.linalg import matrix_rank from statsmodels.stats.sandwich_covariance import cov_cluster cl_se = np.sqrt(cov_cluster(fe_results, df[group]).diagonal()) cols = [v for v in X.columns if not v.startswith("panid")] V = fe_results.cov_params().ix[cols, cols] N = fe_results.nobs N_grp = len(np.unique(df[group].squeeze())) rank = matrix_rank(V) df_cl = N - (rank - 1) - (N_grp - 1) # degrees of freedom for cluster-robust but no nested adjustmnet df_xt = N - (rank - 1) # degrees of freedom for cluster-nested-robust cl_se_adj = cl_se * np.sqrt(df_cl/df_xt) return cl_se_adj
def setup(self): model = OLS(self.res1.model.endog, self.res1.model.exog) # res_ols = self.res1.model.fit(cov_type='cluster', res_ols = model.fit( cov_type="cluster", cov_kwds=dict(groups=self.groups, use_correction=False, use_t=False, df_correction=True) ) self.res3 = self.res1 self.res1 = res_ols self.bse_robust = res_ols.bse self.cov_robust = res_ols.cov_params() cov1 = sw.cov_cluster(self.res1, self.groups, use_correction=False) se1 = sw.se_cov(cov1) self.bse_robust2 = se1 self.cov_robust2 = cov1 self.small = False self.res2 = res2.results_cluster_large self.skip_f = True self.rtol = 1e-6 self.rtolh = 1e-10
def setup(self): import pandas as pd fat_array = self.groups.reshape(-1, 1) fat_groups = pd.DataFrame(fat_array) res_ols = self.res1.get_robustcov_results('cluster', groups=fat_groups, use_correction=True, use_t=True) self.res3 = self.res1 self.res1 = res_ols self.bse_robust = res_ols.bse self.cov_robust = res_ols.cov_params() cov1 = sw.cov_cluster(self.res1, self.groups, use_correction=True) se1 = sw.se_cov(cov1) self.bse_robust2 = se1 self.cov_robust2 = cov1 self.small = True self.res2 = res2.results_cluster self.rtol = 1e-6 self.rtolh = 1e-10
#res.resid is of transformed model #np.corrcoef(res.resid.reshape(-1,n_groups, order='F')) y_pred = np.dot(mod.exog, res.params) resid = y - y_pred print(np.corrcoef(resid.reshape(-1, n_groups, order='F'))) print(resid.std()) err = y_pred - dgp.y_true print(err.std()) #OLS standard errors are too small mod.res_pooled.params mod.res_pooled.bse #heteroscedasticity robust doesn't help mod.res_pooled.HC1_se #compare with cluster robust se print(sw.se_cov(sw.cov_cluster(mod.res_pooled, dgp.groups.astype(int)))) #not bad, pretty close to panel estimator #and with Newey-West Hac print(sw.se_cov(sw.cov_nw_panel(mod.res_pooled, 4, mod.group.groupidx))) #too small, assuming no bugs, #see Peterson assuming it refers to same kind of model print(dgp.cov) mod2 = ShortPanelGLS(y, dgp.exog, dgp.groups) res2 = mod2.fit_iterative(2) print(res2.params) print(res2.bse) #both implementations produce the same results: from numpy.testing import assert_almost_equal assert_almost_equal(res.params, res2.params, decimal=12) assert_almost_equal(res.bse, res2.bse, decimal=13)
#remove nan observation mask = (xx!=-999.0).all(1) #nan code in dta file mask.shape y = y[mask] xx = xx[mask] group = group[mask] #run OLS res_srs = sm.OLS(y, xx).fit() print 'params ', res_srs.params print 'bse_OLS ', res_srs.bse #get cluster robust standard errors and compare with STATA cov_cr = sw.cov_cluster(res_srs, group.astype(int)) bse_cr = sw.se_cov(cov_cr) print 'bse_rob ', bse_cr res_stata = np.rec.array( [ ('growth', '|', -0.1027121, 0.22917029999999999, -0.45000000000000001, 0.65500000000000003, -0.55483519999999997, 0.34941109999999997), ('emer', '|', -5.4449319999999997, 0.72939690000000001, -7.46, 0.0, -6.8839379999999997, -4.0059269999999998), ('yr_rnd', '|', -51.075690000000002, 22.83615, -2.2400000000000002, 0.027, -96.128439999999998, -6.0229350000000004), ('_cons', '|', 740.3981, 13.460760000000001, 55.0, 0.0, 713.84180000000003, 766.95439999999996)], dtype=[('exogname', '|S6'), ('del', '|S1'), ('params', '<f8'), ('bse', '<f8'), ('tvalues', '<f8'), ('pvalues', '<f8'), ('cilow', '<f8'), ('ciupp', '<f8')]) print 'diff Stata', bse_cr - res_stata.bse assert_almost_equal(bse_cr, res_stata.bse, decimal=6)
reg_df.to_csv("estimated_data/final_regression_dfs/rn_prob_decline_ret.csv") # Regression in levels: res = smf.ols(formula='ret ~ ' + var, data=reg_df).fit(cov_type="HC1") res.summary() # Creating a dummy of 20% decline: reg_df["dummy_ret_neg_20"] = np.where(reg_df['ret'] <= -0.2, 1, 0) res = smf.ols(formula='dummy_ret_neg_20 ~ ' + var, data=reg_df).fit(cov_type="HC1") res.summary() # Calculating clustered standard errors: cov_hetero = sandwich_covariance.cov_hc1( res) # Replicates robust option in STATA cov_cluster_firm = sandwich_covariance.cov_cluster( res, reg_df['secid']) # Matches with one in STATA cov_cluster_month = sandwich_covariance.cov_cluster( res, reg_df['month_lead']) # Matches with one in STATA #### Double clustering by firm and year ############################################ # To do it we need to calculate the covariance matrix with clustering # by firm and by time and add them together, then subtract covariance matrix # with Heteroskedascity correction cov_double_cluster = cov_cluster_firm + cov_cluster_month - cov_hetero np.sqrt(np.diag(cov_double_cluster)) # Excluding 2007, 2008 and 2009: reg_df_excl_crisis = reg_df[(reg_df["month_lead"] < "2007-01-01") | (reg_df["month_lead"] > "2009-12-31")] reg_df_excl_crisis["dummy_ret_neg_20"] = np.where(
# res.resid is of transformed model # np.corrcoef(res.resid.reshape(-1,n_groups, order='F')) y_pred = np.dot(mod.exog, res.params) resid = y - y_pred print np.corrcoef(resid.reshape(-1, n_groups, order="F")) print resid.std() err = y_pred - dgp.y_true print err.std() # OLS standard errors are too small mod.res_pooled.params mod.res_pooled.bse # heteroscedasticity robust doesn't help mod.res_pooled.HC1_se # compare with cluster robust se print sw.se_cov(sw.cov_cluster(mod.res_pooled, dgp.groups.astype(int))) # not bad, pretty close to panel estimator # and with Newey-West Hac print sw.se_cov(sw.cov_nw_panel(mod.res_pooled, 4, mod.group.groupidx)) # too small, assuming no bugs, # see Peterson assuming it refers to same kind of model print dgp.cov mod2 = ShortPanelGLS(y, dgp.exog, dgp.groups) res2 = mod2.fit_iterative(2) print res2.params print res2.bse # both implementations produce the same results: from numpy.testing import assert_almost_equal assert_almost_equal(res.params, res2.params, decimal=12)
def test_short_panel(): #this checks that some basic statistical properties are satisfied by the #results, not verified results against other packages #Note: the ranking of robust bse is different if within=True #I added within keyword to PanelSample to be able to use old example #if within is False, then there is no within group variation in exog. nobs = 100 nobs_i = 5 n_groups = nobs // nobs_i k_vars = 3 dgp = PanelSample(nobs, k_vars, n_groups, corr_structure=cs.corr_arma, corr_args=([1], [1., -0.9],), seed=377769, within=False) #print 'seed', dgp.seed y = dgp.generate_panel() noise = y - dgp.y_true #test dgp dgp_cov_e = np.array( [[ 1. , 0.9 , 0.81 , 0.729 , 0.6561], [ 0.9 , 1. , 0.9 , 0.81 , 0.729 ], [ 0.81 , 0.9 , 1. , 0.9 , 0.81 ], [ 0.729 , 0.81 , 0.9 , 1. , 0.9 ], [ 0.6561, 0.729 , 0.81 , 0.9 , 1. ]]) npt.assert_almost_equal(dgp.cov, dgp_cov_e, 13) cov_noise = np.cov(noise.reshape(-1,n_groups, order='F')) corr_noise = cov2corr(cov_noise) npt.assert_almost_equal(corr_noise, dgp.cov, 1) #estimate panel model mod2 = ShortPanelGLS(y, dgp.exog, dgp.groups) res2 = mod2.fit_iterative(2) #whitened residual should be uncorrelated corr_wresid = np.corrcoef(res2.wresid.reshape(-1,n_groups, order='F')) assert_maxabs(corr_wresid, np.eye(5), 0.1) #residual should have same correlation as dgp corr_resid = np.corrcoef(res2.resid.reshape(-1,n_groups, order='F')) assert_maxabs(corr_resid, dgp.cov, 0.1) assert_almost_equal(res2.resid.std(),1, decimal=0) y_pred = np.dot(mod2.exog, res2.params) assert_almost_equal(res2.fittedvalues, y_pred, 13) #compare with OLS res2_ols = mod2._fit_ols() npt.assert_(mod2.res_pooled is res2_ols) res2_ols = mod2.res_pooled #TODO: BUG: requires call to _fit_ols #fitting once is the same as OLS #note: I need to create new instance, otherwise it continuous fitting mod1 = ShortPanelGLS(y, dgp.exog, dgp.groups) res1 = mod1.fit_iterative(1) assert_almost_equal(res1.params, res2_ols.params, decimal=13) assert_almost_equal(res1.bse, res2_ols.bse, decimal=13) res_ols = OLS(y, dgp.exog).fit() assert_almost_equal(res1.params, res_ols.params, decimal=13) assert_almost_equal(res1.bse, res_ols.bse, decimal=13) #compare with old version mod_old = ShortPanelGLS2(y, dgp.exog, dgp.groups) res_old = mod_old.fit() assert_almost_equal(res2.params, res_old.params, decimal=13) assert_almost_equal(res2.bse, res_old.bse, decimal=13) mod5 = ShortPanelGLS(y, dgp.exog, dgp.groups) res5 = mod5.fit_iterative(5) #make sure it's different #npt.assert_array_less(0.009, em.maxabs(res5.bse, res2.bse)) cov_clu = sw.cov_cluster(mod2.res_pooled, dgp.groups.astype(int)) clubse = se_cov(cov_clu) pnwbse = se_cov(sw.cov_nw_panel(mod2.res_pooled, 4, mod2.group.groupidx)) bser = np.vstack((res2.bse, res5.bse, clubse, pnwbse)) bser_mean = np.mean(bser, axis=0) #cov_cluster close to robust and PanelGLS #is up to 24% larger than mean of bser #npt.assert_array_less(0, clubse / bser_mean - 1) npt.assert_array_less(clubse / bser_mean - 1, 0.25) #cov_nw_panel close to robust and PanelGLS npt.assert_array_less(pnwbse / bser_mean - 1, 0.1) #OLS underestimates bse, robust at least 60% larger npt.assert_array_less(0.6, bser_mean / res_ols.bse - 1) #cov_hac_panel with uniform_kernel is the same as cov_cluster for balanced #panel with full length kernel #I fixe default correction to be equal cov_uni = sw.cov_nw_panel(mod2.res_pooled, 4, mod2.group.groupidx, weights_func=sw.weights_uniform, use_correction='c') assert_almost_equal(cov_uni, cov_clu, decimal=13) #without correction cov_clu2 = sw.cov_cluster(mod2.res_pooled, dgp.groups.astype(int), use_correction=False) cov_uni2 = sw.cov_nw_panel(mod2.res_pooled, 4, mod2.group.groupidx, weights_func=sw.weights_uniform, use_correction=False) assert_almost_equal(cov_uni2, cov_clu2, decimal=13) cov_white = sw.cov_white_simple(mod2.res_pooled) cov_pnw0 = sw.cov_nw_panel(mod2.res_pooled, 0, mod2.group.groupidx, use_correction='hac') assert_almost_equal(cov_pnw0, cov_white, decimal=13)
#remove nan observation mask = (xx!=-999.0).all(1) #nan code in dta file mask.shape y = y[mask] xx = xx[mask] group = group[mask] #run OLS res_srs = sm.OLS(y, xx).fit() print('params ', res_srs.params) print('bse_OLS ', res_srs.bse) #get cluster robust standard errors and compare with STATA cov_cr = sw.cov_cluster(res_srs, group.astype(int)) bse_cr = sw.se_cov(cov_cr) print('bse_rob ', bse_cr) res_stata = np.rec.array( [('growth', '|', -0.1027121, 0.22917029999999999, -0.45000000000000001, 0.65500000000000003, -0.55483519999999997, 0.34941109999999997), ('emer', '|', -5.4449319999999997, 0.72939690000000001, -7.46, 0.0, -6.8839379999999997, -4.0059269999999998), ('yr_rnd', '|', -51.075690000000002, 22.83615, -2.2400000000000002, 0.027, -96.128439999999998, -6.0229350000000004), ('_cons', '|', 740.3981, 13.460760000000001, 55.0, 0.0, 713.84180000000003, 766.95439999999996)], dtype=[('exogname', '|S6'), ('del', '|S1'), ('params', 'float'), ('bse', 'float'), ('tvalues', 'float'), ('pvalues', 'float'), ('cilow', 'float'), ('ciupp', 'float')]) print('diff Stata', bse_cr - res_stata.bse) assert_almost_equal(bse_cr, res_stata.bse, decimal=6)
def get_robustcov_results(self, cov_type='HC1', use_t=None, **kwds): """create new results instance with robust covariance as default Parameters ---------- cov_type : string the type of robust sandwich estimator to use. see Notes below use_t : bool If true, then the t distribution is used for inference. If false, then the normal distribution is used. kwds : depends on cov_type Required or optional arguments for robust covariance calculation. see Notes below Returns ------- results : results instance This method creates a new results instance with the requested robust covariance as the default covariance of the parameters. Inferential statistics like p-values and hypothesis tests will be based on this covariance matrix. Notes ----- Warning: Some of the options and defaults in cov_kwds may be changed in a future version. The covariance keywords provide an option 'scaling_factor' to adjust the scaling of the covariance matrix, that is the covariance is multiplied by this factor if it is given and is not `None`. This allows the user to adjust the scaling of the covariance matrix to match other statistical packages. For example, `scaling_factor=(nobs - 1.) / (nobs - k_params)` provides a correction so that the robust covariance matrices match those of Stata in some models like GLM and discrete Models. The following covariance types and required or optional arguments are currently available: - 'HC0', 'HC1', 'HC2', 'HC3' and no keyword arguments: heteroscedasticity robust covariance - 'HAC' and keywords - `maxlag` integer (required) : number of lags to use - `kernel` string (optional) : kernel, default is Bartlett - `use_correction` bool (optional) : If true, use small sample correction - 'cluster' and required keyword `groups`, integer group indicator - `groups` array_like, integer (required) : index of clusters or groups - `use_correction` bool (optional) : If True the sandwich covariance is calulated with a small sample correction. If False the the sandwich covariance is calulated without small sample correction. - `df_correction` bool (optional) If True (default), then the degrees of freedom for the inferential statistics and hypothesis tests, such as pvalues, f_pvalue, conf_int, and t_test and f_test, are based on the number of groups minus one instead of the total number of observations minus the number of explanatory variables. `df_resid` of the results instance is adjusted. If False, then `df_resid` of the results instance is not adjusted. - 'hac-groupsum' Driscoll and Kraay, heteroscedasticity and autocorrelation robust standard errors in panel data keywords - `time` array_like (required) : index of time periods - `maxlag` integer (required) : number of lags to use - `kernel` string (optional) : kernel, default is Bartlett - `use_correction` False or string in ['hac', 'cluster'] (optional) : If False the the sandwich covariance is calulated without small sample correction. If `use_correction = 'cluster'` (default), then the same small sample correction as in the case of 'covtype='cluster'' is used. - `df_correction` bool (optional) adjustment to df_resid, see cov_type 'cluster' above #TODO: we need more options here - 'hac-panel' heteroscedasticity and autocorrelation robust standard errors in panel data. The data needs to be sorted in this case, the time series for each panel unit or cluster need to be stacked. The membership to a timeseries of an individual or group can be either specified by group indicators or by increasing time periods. keywords - either `groups` or `time` : array_like (required) `groups` : indicator for groups `time` : index of time periods - `maxlag` integer (required) : number of lags to use - `kernel` string (optional) : kernel, default is Bartlett - `use_correction` False or string in ['hac', 'cluster'] (optional) : If False the the sandwich covariance is calulated without small sample correction. - `df_correction` bool (optional) adjustment to df_resid, see cov_type 'cluster' above #TODO: we need more options here Reminder: `use_correction` in "hac-groupsum" and "hac-panel" is not bool, needs to be in [False, 'hac', 'cluster'] TODO: Currently there is no check for extra or misspelled keywords, except in the case of cov_type `HCx` """ import statsmodels.stats.sandwich_covariance as sw #normalize names if cov_type == 'nw-panel': cov_type = 'hac-panel' if cov_type == 'nw-groupsum': cov_type = 'hac-groupsum' if 'kernel' in kwds: kwds['weights_func'] = kwds.pop('kernel') # pop because HCx raises if any kwds sc_factor = kwds.pop('scaling_factor', None) # TODO: make separate function that returns a robust cov plus info use_self = kwds.pop('use_self', False) if use_self: res = self else: # this doesn't work for most models, use raw instance instead from fit res = self.__class__(self.model, self.params, normalized_cov_params=self.normalized_cov_params, scale=self.scale) res.cov_type = cov_type # use_t might already be defined by the class, and already set if use_t is None: use_t = self.use_t res.cov_kwds = {'use_t':use_t} # store for information res.use_t = use_t adjust_df = False if cov_type in ['cluster', 'hac-panel', 'hac-groupsum']: df_correction = kwds.get('df_correction', None) # TODO: check also use_correction, do I need all combinations? if df_correction is not False: # i.e. in [None, True]: # user didn't explicitely set it to False adjust_df = True res.cov_kwds['adjust_df'] = adjust_df # verify and set kwds, and calculate cov # TODO: this should be outsourced in a function so we can reuse it in # other models # TODO: make it DRYer repeated code for checking kwds if cov_type.upper() in ('HC0', 'HC1', 'HC2', 'HC3'): if kwds: raise ValueError('heteroscedasticity robust covarians ' + 'does not use keywords') res.cov_kwds['description'] = ('Standard Errors are heteroscedasticity ' + 'robust ' + '(' + cov_type + ')') res.cov_params_default = getattr(self, 'cov_' + cov_type.upper(), None) if res.cov_params_default is None: # results classes that don't have cov_HCx attribute res.cov_params_default = sw.cov_white_simple(self, use_correction=False) elif cov_type.lower() == 'hac': maxlags = kwds['maxlags'] # required?, default in cov_hac_simple res.cov_kwds['maxlags'] = maxlags weights_func = kwds.get('weights_func', sw.weights_bartlett) res.cov_kwds['weights_func'] = weights_func use_correction = kwds.get('use_correction', False) res.cov_kwds['use_correction'] = use_correction res.cov_kwds['description'] = ('Standard Errors are heteroscedasticity ' + 'and autocorrelation robust (HAC) using %d lags and %s small ' + 'sample correction') % (maxlags, ['without', 'with'][use_correction]) res.cov_params_default = sw.cov_hac_simple(self, nlags=maxlags, weights_func=weights_func, use_correction=use_correction) elif cov_type.lower() == 'cluster': #cluster robust standard errors, one- or two-way groups = kwds['groups'] if not hasattr(groups, 'shape'): groups = np.asarray(groups).T if groups.ndim >= 2: groups = groups.squeeze() res.cov_kwds['groups'] = groups use_correction = kwds.get('use_correction', True) res.cov_kwds['use_correction'] = use_correction if groups.ndim == 1: if adjust_df: # need to find number of groups # duplicate work self.n_groups = n_groups = len(np.unique(groups)) res.cov_params_default = sw.cov_cluster(self, groups, use_correction=use_correction) elif groups.ndim == 2: if hasattr(groups, 'values'): groups = groups.values if adjust_df: # need to find number of groups # duplicate work n_groups0 = len(np.unique(groups[:,0])) n_groups1 = len(np.unique(groups[:, 1])) self.n_groups = (n_groups0, n_groups1) n_groups = min(n_groups0, n_groups1) # use for adjust_df # Note: sw.cov_cluster_2groups has 3 returns res.cov_params_default = sw.cov_cluster_2groups(self, groups, use_correction=use_correction)[0] else: raise ValueError('only two groups are supported') res.cov_kwds['description'] = ('Standard Errors are robust to' + 'cluster correlation ' + '(' + cov_type + ')') elif cov_type.lower() == 'hac-panel': #cluster robust standard errors res.cov_kwds['time'] = time = kwds.get('time', None) res.cov_kwds['groups'] = groups = kwds.get('groups', None) #TODO: nlags is currently required #nlags = kwds.get('nlags', True) #res.cov_kwds['nlags'] = nlags #TODO: `nlags` or `maxlags` res.cov_kwds['maxlags'] = maxlags = kwds['maxlags'] use_correction = kwds.get('use_correction', 'hac') res.cov_kwds['use_correction'] = use_correction weights_func = kwds.get('weights_func', sw.weights_bartlett) res.cov_kwds['weights_func'] = weights_func # TODO: clumsy time index in cov_nw_panel if groups is not None: groups = np.asarray(groups) tt = (np.nonzero(groups[:-1] != groups[1:])[0] + 1).tolist() nobs_ = len(groups) elif time is not None: # TODO: clumsy time index in cov_nw_panel time = np.asarray(time) tt = (np.nonzero(time[1:] < time[:-1])[0] + 1).tolist() nobs_ = len(time) else: raise ValueError('either time or groups needs to be given') groupidx = lzip([0] + tt, tt + [nobs_]) self.n_groups = n_groups = len(groupidx) res.cov_params_default = sw.cov_nw_panel(self, maxlags, groupidx, weights_func=weights_func, use_correction=use_correction) res.cov_kwds['description'] = ('Standard Errors are robust to' + 'cluster correlation ' + '(' + cov_type + ')') elif cov_type.lower() == 'hac-groupsum': # Driscoll-Kraay standard errors res.cov_kwds['time'] = time = kwds['time'] #TODO: nlags is currently required #nlags = kwds.get('nlags', True) #res.cov_kwds['nlags'] = nlags #TODO: `nlags` or `maxlags` res.cov_kwds['maxlags'] = maxlags = kwds['maxlags'] use_correction = kwds.get('use_correction', 'cluster') res.cov_kwds['use_correction'] = use_correction weights_func = kwds.get('weights_func', sw.weights_bartlett) res.cov_kwds['weights_func'] = weights_func if adjust_df: # need to find number of groups tt = (np.nonzero(time[1:] < time[:-1])[0] + 1) self.n_groups = n_groups = len(tt) + 1 res.cov_params_default = sw.cov_nw_groupsum(self, maxlags, time, weights_func=weights_func, use_correction=use_correction) res.cov_kwds['description'] = ( 'Driscoll and Kraay Standard Errors are robust to ' + 'cluster correlation ' + '(' + cov_type + ')') else: raise ValueError('cov_type not recognized. See docstring for ' + 'available options and spelling') # generic optional factor to scale covariance res.cov_kwds['scaling_factor'] = sc_factor if sc_factor is not None: res.cov_params_default *= sc_factor if adjust_df: # Note: df_resid is used for scale and others, add new attribute res.df_resid_inference = n_groups - 1 return res
def test_short_panel(): #this checks that some basic statistical properties are satisfied by the ##1lab_results, not verified #1lab_results against other packages #Note: the ranking of robust bse is different if within=True #I added within keyword to PanelSample to be able to use old example #if within is False, then there is no within group variation in exog. nobs = 100 nobs_i = 5 n_groups = nobs // nobs_i k_vars = 3 dgp = PanelSample(nobs, k_vars, n_groups, corr_structure=cs.corr_arma, corr_args=([1], [1., -0.9],), seed=377769, within=False) #print 'seed', dgp.seed y = dgp.generate_panel() noise = y - dgp.y_true #test dgp dgp_cov_e = np.array( [[ 1. , 0.9 , 0.81 , 0.729 , 0.6561], [ 0.9 , 1. , 0.9 , 0.81 , 0.729 ], [ 0.81 , 0.9 , 1. , 0.9 , 0.81 ], [ 0.729 , 0.81 , 0.9 , 1. , 0.9 ], [ 0.6561, 0.729 , 0.81 , 0.9 , 1. ]]) npt.assert_almost_equal(dgp.cov, dgp_cov_e, 13) cov_noise = np.cov(noise.reshape(-1,n_groups, order='F')) corr_noise = cov2corr(cov_noise) npt.assert_almost_equal(corr_noise, dgp.cov, 1) #estimate panel model mod2 = ShortPanelGLS(y, dgp.exog, dgp.groups) res2 = mod2.fit_iterative(2) #whitened residual should be uncorrelated corr_wresid = np.corrcoef(res2.wresid.reshape(-1,n_groups, order='F')) assert_maxabs(corr_wresid, np.eye(5), 0.1) #residual should have same correlation as dgp corr_resid = np.corrcoef(res2.resid.reshape(-1,n_groups, order='F')) assert_maxabs(corr_resid, dgp.cov, 0.1) assert_almost_equal(res2.resid.std(),1, decimal=0) y_pred = np.dot(mod2.exog, res2.params) assert_almost_equal(res2.fittedvalues, y_pred, 13) #compare with OLS res2_ols = mod2._fit_ols() npt.assert_(mod2.res_pooled is res2_ols) res2_ols = mod2.res_pooled #TODO: BUG: requires call to _fit_ols #fitting once is the same as OLS #note: I need to create new instance, otherwise it continuous fitting mod1 = ShortPanelGLS(y, dgp.exog, dgp.groups) res1 = mod1.fit_iterative(1) assert_almost_equal(res1.params, res2_ols.params, decimal=13) assert_almost_equal(res1.bse, res2_ols.bse, decimal=13) res_ols = OLS(y, dgp.exog).fit() assert_almost_equal(res1.params, res_ols.params, decimal=13) assert_almost_equal(res1.bse, res_ols.bse, decimal=13) #compare with old version mod_old = ShortPanelGLS2(y, dgp.exog, dgp.groups) res_old = mod_old.fit() assert_almost_equal(res2.params, res_old.params, decimal=13) assert_almost_equal(res2.bse, res_old.bse, decimal=13) mod5 = ShortPanelGLS(y, dgp.exog, dgp.groups) res5 = mod5.fit_iterative(5) #make sure it's different #npt.assert_array_less(0.009, em.maxabs(res5.bse, res2.bse)) cov_clu = sw.cov_cluster(mod2.res_pooled, dgp.groups.astype(int)) clubse = se_cov(cov_clu) pnwbse = se_cov(sw.cov_nw_panel(mod2.res_pooled, 4, mod2.group.groupidx)) bser = np.vstack((res2.bse, res5.bse, clubse, pnwbse)) bser_mean = np.mean(bser, axis=0) #cov_cluster close to robust and PanelGLS #is up to 24% larger than mean of bser #npt.assert_array_less(0, clubse / bser_mean - 1) npt.assert_array_less(clubse / bser_mean - 1, 0.25) #cov_nw_panel close to robust and PanelGLS npt.assert_array_less(pnwbse / bser_mean - 1, 0.1) #OLS underestimates bse, robust at least 60% larger npt.assert_array_less(0.6, bser_mean / res_ols.bse - 1) #cov_hac_panel with uniform_kernel is the same as cov_cluster for balanced #panel with full length kernel #I fixe default correction to be equal cov_uni = sw.cov_nw_panel(mod2.res_pooled, 4, mod2.group.groupidx, weights_func=sw.weights_uniform, use_correction='c') assert_almost_equal(cov_uni, cov_clu, decimal=13) #without correction cov_clu2 = sw.cov_cluster(mod2.res_pooled, dgp.groups.astype(int), use_correction=False) cov_uni2 = sw.cov_nw_panel(mod2.res_pooled, 4, mod2.group.groupidx, weights_func=sw.weights_uniform, use_correction=False) assert_almost_equal(cov_uni2, cov_clu2, decimal=13) cov_white = sw.cov_white_simple(mod2.res_pooled) cov_pnw0 = sw.cov_nw_panel(mod2.res_pooled, 0, mod2.group.groupidx, use_correction='hac') assert_almost_equal(cov_pnw0, cov_white, decimal=13)
#test White assert_almost_equal(bse_w, self.HC0_se, 15) bse_wc = sw.se_cov(sw.cov_white_simple(self, use_correction=True)) print bse_wc #test White assert_almost_equal(bse_wc, self.HC1_se, 15) groups = np.repeat(np.arange(5), 20) idx = np.nonzero(np.diff(groups))[0].tolist() groupidx = zip([0]+idx, idx+[len(groups)]) ngroups = len(groupidx) print sw.se_cov(sw.cov_cluster(self, groups)) #two strange looking corner cases BUG? print sw.se_cov(sw.cov_cluster(self, np.ones(len(endog), int), use_correction=False)) print sw.se_cov(sw.cov_crosssection_0(self, np.arange(len(endog)))) #these results are close to simple (no group) white, 50 groups 2 obs each groups = np.repeat(np.arange(50), 100//50) print sw.se_cov(sw.cov_cluster(self, groups)) #2 groups with 50 obs each, what was the interpretation again? groups = np.repeat(np.arange(2), 100//2) print sw.se_cov(sw.cov_cluster(self, groups)) "http://www.kellogg.northwestern.edu/faculty/petersen/htm/papers/se/test_data.txt" ''' test <- read.table( url(paste("http://www.kellogg.northwestern.edu/", "faculty/petersen/htm/papers/se/",
def get_robust_clu(cls): res1 = cls.res1 cov_clu = sw.cov_cluster(res1, group) cls.bse_rob = sw.se_cov(cov_clu) cls.corr_fact = cls.get_correction_factor(res1)