Python cov_cluster示例，statsmodels.stats.sandwich_covariance.cov_cluster Python示例

示例#1

0

显示文件

文件： test_panel_robustcov.py 项目： kasunsp/pinalpha_mvp

def test_panel_robust_cov():
    import pandas as pa
    import statsmodels.datasets.grunfeld as gr
    from .results.results_panelrobust import results as res_stata

    dtapa = gr.data.load_pandas()
    #Stata example/data seems to miss last firm
    dtapa_endog = dtapa.endog[:200]
    dtapa_exog = dtapa.exog[:200]
    res = OLS(dtapa_endog,
              add_constant(dtapa_exog[['value', 'capital']],
                           prepend=False)).fit()

    #time indicator in range(max Ti)
    time = np.asarray(dtapa_exog[['year']])
    time -= time.min()
    time = np.squeeze(time).astype(int)

    #sw.cov_nw_panel requires bounds instead of index
    tidx = [(i * 20, 20 * (i + 1)) for i in range(10)]

    #firm index in range(n_firms)
    firm_names, firm_id = np.unique(np.asarray(dtapa_exog[['firm']], 'S20'),
                                    return_inverse=True)

    #panel newey west standard errors
    cov = sw.cov_nw_panel(res, 0, tidx, use_correction='hac')
    #dropping numpy 1.4 soon
    #np.testing.assert_allclose(cov, res_stata.cov_pnw0_stata, rtol=1e-6)
    assert_almost_equal(cov, res_stata.cov_pnw0_stata, decimal=4)

    cov = sw.cov_nw_panel(res, 1, tidx, use_correction='hac')
    #np.testing.assert_allclose(cov, res_stata.cov_pnw1_stata, rtol=1e-6)
    assert_almost_equal(cov, res_stata.cov_pnw1_stata, decimal=4)

    cov = sw.cov_nw_panel(res, 4, tidx)  #check default
    #np.testing.assert_allclose(cov, res_stata.cov_pnw4_stata, rtol=1e-6)
    assert_almost_equal(cov, res_stata.cov_pnw4_stata, decimal=4)

    #cluster robust standard errors
    cov_clu = sw.cov_cluster(res, firm_id)
    assert_almost_equal(cov_clu, res_stata.cov_clu_stata, decimal=4)

    #cluster robust standard errors, non-int groups
    cov_clu = sw.cov_cluster(res, lmap(str, firm_id))
    assert_almost_equal(cov_clu, res_stata.cov_clu_stata, decimal=4)

    #Driscoll and Kraay panel robust standard errors
    rcov = sw.cov_nw_groupsum(res, 0, time, use_correction=0)
    assert_almost_equal(rcov, res_stata.cov_dk0_stata, decimal=4)

    rcov = sw.cov_nw_groupsum(res, 1, time, use_correction=0)
    assert_almost_equal(rcov, res_stata.cov_dk1_stata, decimal=4)

    rcov = sw.cov_nw_groupsum(res, 4, time)  #check default
    assert_almost_equal(rcov, res_stata.cov_dk4_stata, decimal=4)

示例#2

0

显示文件

文件： test_panel_robustcov.py 项目： SuperXrooT/statsmodels

def test_panel_robust_cov():
    import pandas as pa
    import statsmodels.datasets.grunfeld as gr
    from results.results_panelrobust import results as res_stata

    dtapa = gr.data.load_pandas()
    #Stata example/data seems to miss last firm
    dtapa_endog = dtapa.endog[:200]
    dtapa_exog = dtapa.exog[:200]
    res = OLS(dtapa_endog, add_constant(dtapa_exog[['value', 'capital']],
              prepend=False)).fit()

    #time indicator in range(max Ti)
    time = np.asarray(dtapa_exog[['year']])
    time -= time.min()
    time = np.squeeze(time).astype(int)

    #sw.cov_nw_panel requires bounds instead of index
    tidx = [(i*20, 20*(i+1)) for i in range(10)]

    #firm index in range(n_firms)
    firm_names, firm_id = np.unique(np.asarray(dtapa_exog[['firm']], 'S20'),
                                    return_inverse=True)

    #panel newey west standard errors
    cov = sw.cov_nw_panel(res, 0, tidx, use_correction='hac')
    #dropping numpy 1.4 soon
    #np.testing.assert_allclose(cov, res_stata.cov_pnw0_stata, rtol=1e-6)
    assert_almost_equal(cov, res_stata.cov_pnw0_stata, decimal=4)

    cov = sw.cov_nw_panel(res, 1, tidx, use_correction='hac')
    #np.testing.assert_allclose(cov, res_stata.cov_pnw1_stata, rtol=1e-6)
    assert_almost_equal(cov, res_stata.cov_pnw1_stata, decimal=4)

    cov = sw.cov_nw_panel(res, 4, tidx)  #check default
    #np.testing.assert_allclose(cov, res_stata.cov_pnw4_stata, rtol=1e-6)
    assert_almost_equal(cov, res_stata.cov_pnw4_stata, decimal=4)

    #cluster robust standard errors
    cov_clu = sw.cov_cluster(res, firm_id)
    assert_almost_equal(cov_clu, res_stata.cov_clu_stata, decimal=4)

    #cluster robust standard errors, non-int groups
    cov_clu = sw.cov_cluster(res, map(str, firm_id))
    assert_almost_equal(cov_clu, res_stata.cov_clu_stata, decimal=4)

    #Driscoll and Kraay panel robust standard errors
    rcov = sw.cov_nw_groupsum(res, 0, time, use_correction=0)
    assert_almost_equal(rcov, res_stata.cov_dk0_stata, decimal=4)

    rcov = sw.cov_nw_groupsum(res, 1, time, use_correction=0)
    assert_almost_equal(rcov, res_stata.cov_dk1_stata, decimal=4)

    rcov = sw.cov_nw_groupsum(res, 4, time) #check default
    assert_almost_equal(rcov, res_stata.cov_dk4_stata, decimal=4)

示例#3

0

显示文件

文件： statistics.py 项目： toobaz/generic_utils

def robust_reg(fit_result, dataframe, cluster_field):
    # Fails:
    #clustered = cov_cluster(fit, dfreg['city'])
    # TODO: submit bug.
    try:
        clustered = cov_cluster(fit_result,
                                dataframe[cluster_field],
                                use_correction=True)
    except:
        print fit_result.resid.shape, dataframe[cluster_field].shape, set(
            dataframe[cluster_field].values)
        raise

    # From https://github.com/spillz/sci-comp/blob/master/statsmodels-fun/clustered_se.py (def clustered_output(fit_results,group)):
    cse = numpy.diag(clustered)**0.5
    scse = pandas.Series(cse, index=fit.params.index)
    outp = pandas.DataFrame([fit.params, fit.bse, scse]).transpose()
    outp.columns = ['Coef', 'SE', 'C1. S']
    #print outp
    # OK, è identica al risultato con STATA, mancano solo i p-value.

    # Translation of http://www.stata.com/statalist/archive/2006-08/msg00840.html

    from scipy.stats import t as students_t

    deg_f = 2
    outp['tstat'] = abs(outp['Coef'] / outp['C1. S'])
    outp['pval'] = 2 * (1 - students_t.cdf(outp['tstat'], deg_f))
    # OK, è identica al risultato con STATA.
    return outp

示例#4

0

显示文件

文件： metrics.py 项目： sglyon/econtools

def cluster_se(fit, gp_name):
    """
    Compute robust "clustered" standard errors.

    Parameters
    ==========
    fit : statsmodels.regression.linear_model.RegressionResultsWrapper
        The statsmodels fit object obtained from the original regression

    gp_name : str
        The name of the group on which the clustering should happen.
        This needs to be the name of a column in the original DataFrame
        used to create and fit the model.

    Returns
    =======
    ser : pd.Series
        A pandas Series with the variable names and robust standard
        errors.

    """
    from statsmodels.stats.sandwich_covariance import cov_cluster
    grp = fit.model.data.frame[gp_name]
    se = np.diag(cov_cluster(fit, grp))**(1 / 2.)
    return pd.Series(se, index=fit.params.index)

示例#5

0

显示文件

文件： metrics.py 项目： spencerlyon2/econtools

def cluster_se(fit, gp_name):
    """
    Compute robust "clustered" standard errors.

    Parameters
    ==========
    fit : statsmodels.regression.linear_model.RegressionResultsWrapper
        The statsmodels fit object obtained from the original regression

    gp_name : str
        The name of the group on which the clustering should happen.
        This needs to be the name of a column in the original DataFrame
        used to create and fit the model.

    Returns
    =======
    ser : pd.Series
        A pandas Series with the variable names and robust standard
        errors.

    """
    from statsmodels.stats.sandwich_covariance import cov_cluster
    grp = fit.model.data.frame[gp_name]
    se = np.diag(cov_cluster(fit, grp)) ** (1/2.)
    return pd.Series(se, index=fit.params.index)

示例#6

0

显示文件

文件： test_robustcov.py 项目： quuhua911/statsmodels

    def setup(self):
        model = OLS(self.res1.model.endog, self.res1.model.exog)
        # res_ols = self.res1.model.fit(cov_type='cluster',
        res_ols = model.fit(
            cov_type="cluster",
            cov_kwds=dict(
                groups=self.groups,
                use_correction=False,
                use_t=False,
                df_correction=True,
            ),
        )
        self.res3 = self.res1
        self.res1 = res_ols
        self.bse_robust = res_ols.bse
        self.cov_robust = res_ols.cov_params()
        cov1 = sw.cov_cluster(self.res1, self.groups, use_correction=False)
        se1 = sw.se_cov(cov1)
        self.bse_robust2 = se1
        self.cov_robust2 = cov1
        self.small = False
        self.res2 = res2.results_cluster_large

        self.skip_f = True
        self.rtol = 1e-6
        self.rtolh = 1e-10

示例#7

0

显示文件

文件： test_sandwich_cov.py 项目： yutiansut/statsmodels

    def get_robust_clu(cls):
        res1 = cls.res1
        cov_clu = sc.cov_cluster(res1, group)
        cls.bse_rob = sc.se_cov(cov_clu)

        nobs, k_vars = res1.model.exog.shape
        k_params = len(res1.params)
        #n_groups = len(np.unique(group))
        corr_fact = (nobs - 1.) / float(nobs - k_params)
        # for bse we need sqrt of correction factor
        cls.corr_fact = np.sqrt(corr_fact)

示例#8

0

显示文件

文件： test_sandwich_cov.py 项目： tadeze/statsmodels

    def get_robust_clu(cls):
        res1 = cls.res1
        cov_clu = sw.cov_cluster(res1, group)
        cls.bse_rob = sw.se_cov(cov_clu)

        nobs, k_vars = res1.model.exog.shape
        k_params = len(res1.params)
        #n_groups = len(np.unique(group))
        corr_fact = (nobs-1.) / float(nobs - k_params)
        # for bse we need sqrt of correction factor
        cls.corr_fact = np.sqrt(corr_fact)

示例#9

0

显示文件

    def setup(self):
        res_ols = self.res1.get_robustcov_results("cluster", groups=self.groups, use_correction=True, use_t=True)
        self.res3 = self.res1
        self.res1 = res_ols
        self.bse_robust = res_ols.bse
        self.cov_robust = res_ols.cov_params()
        cov1 = sw.cov_cluster(self.res1, self.groups, use_correction=True)
        se1 = sw.se_cov(cov1)
        self.bse_robust2 = se1
        self.cov_robust2 = cov1
        self.small = True
        self.res2 = res2.results_cluster_wls_small

        self.rtol = 1e-6
        self.rtolh = 1e-10

示例#10

0

显示文件

文件： test_robustcov.py 项目： timgates42/statsmodels

    def setup(self):
        res_ols = self.res1.get_robustcov_results(
            "cluster", groups=self.groups, use_correction=True, use_t=True
        )
        self.res3 = self.res1
        self.res1 = res_ols
        self.bse_robust = res_ols.bse
        self.cov_robust = res_ols.cov_params()
        cov1 = sw.cov_cluster(self.res1, self.groups, use_correction=True)
        se1 = sw.se_cov(cov1)
        self.bse_robust2 = se1
        self.cov_robust2 = cov1
        self.small = True
        self.res2 = res2.results_cluster_wls_small

        self.rtol = 1e-6
        self.rtolh = 1e-10

示例#11

0

显示文件

    def setup(self):
        res_ols = self.res1.model.fit(
            cov_type="cluster", cov_kwds=dict(groups=self.groups, use_correction=True, use_t=True)
        )
        self.res3 = self.res1
        self.res1 = res_ols
        self.bse_robust = res_ols.bse
        self.cov_robust = res_ols.cov_params()
        cov1 = sw.cov_cluster(self.res1, self.groups, use_correction=True)
        se1 = sw.se_cov(cov1)
        self.bse_robust2 = se1
        self.cov_robust2 = cov1
        self.small = True
        self.res2 = res2.results_cluster

        self.rtol = 1e-6
        self.rtolh = 1e-10

示例#12

0

显示文件

    def setup(self):
        res_ols = self.res1.model.fit(cov_type='cluster',
                                      cov_kwds=dict(groups=self.groups,
                                                    use_correction=True,
                                                    use_t=True))
        self.res3 = self.res1
        self.res1 = res_ols
        self.bse_robust = res_ols.bse
        self.cov_robust = res_ols.cov_params()
        cov1 = sw.cov_cluster(self.res1, self.groups, use_correction=True)
        se1 = sw.se_cov(cov1)
        self.bse_robust2 = se1
        self.cov_robust2 = cov1
        self.small = True
        self.res2 = res2.results_cluster

        self.rtol = 1e-6
        self.rtolh = 1e-10

示例#13

0

显示文件

文件： fe_functions.py 项目： bprest/Duke_PUBPOL590

def se_cluster(fe_results, df, group):

    """
    degrees of freedom adjustment for nested-within-cluster std. err.
        see http://www.stata.com/statalist/archive/2013-01/msg00596.html
    """
    from numpy.linalg import matrix_rank
    from statsmodels.stats.sandwich_covariance import cov_cluster

    cl_se = np.sqrt(cov_cluster(fe_results, df[group]).diagonal())
    cols = [v for v in X.columns if not v.startswith("panid")]
    V = fe_results.cov_params().ix[cols, cols]
    N = fe_results.nobs
    N_grp = len(np.unique(df[group].squeeze()))
    rank = matrix_rank(V)
    df_cl = N - (rank - 1) - (N_grp - 1) # degrees of freedom for cluster-robust but no nested adjustmnet
    df_xt = N - (rank - 1) # degrees of freedom for cluster-nested-robust
    cl_se_adj = cl_se * np.sqrt(df_cl/df_xt)
    return cl_se_adj

示例#14

0

显示文件

    def setup(self):
        res_ols = self.res1.get_robustcov_results('cluster',
                                                  groups=self.groups,
                                                  use_correction=False,
                                                  use_t=False,
                                                  df_correction=True)
        self.res3 = self.res1
        self.res1 = res_ols
        self.bse_robust = res_ols.bse
        self.cov_robust = res_ols.cov_params()
        cov1 = sw.cov_cluster(self.res1, self.groups, use_correction=False)
        se1 =  sw.se_cov(cov1)
        self.bse_robust2 = se1
        self.cov_robust2 = cov1
        self.small = False
        self.res2 = res2.results_cluster_large

        self.skip_f = True
        self.rtol = 1e-6
        self.rtolh = 1e-10

示例#15

0

显示文件

def se_cluster(fe_results, df, group):

    """
    degrees of freedom adjustment for nested-within-cluster std. err.
        see http://www.stata.com/statalist/archive/2013-01/msg00596.html
    """
    import numpy as np
    from numpy.linalg import matrix_rank
    from statsmodels.stats.sandwich_covariance import cov_cluster

    cl_se = np.sqrt(cov_cluster(fe_results, df[group]).diagonal())
    cols = [v for v in X.columns if not v.startswith("panid")]
    V = fe_results.cov_params().ix[cols, cols]
    N = fe_results.nobs
    N_grp = len(np.unique(df[group].squeeze()))
    rank = matrix_rank(V)
    df_cl = N - (rank - 1) - (N_grp - 1) # degrees of freedom for cluster-robust but no nested adjustmnet
    df_xt = N - (rank - 1) # degrees of freedom for cluster-nested-robust
    cl_se_adj = cl_se * np.sqrt(df_cl/df_xt)
    return cl_se_adj

示例#16

0

显示文件

    def setup(self):
        model = OLS(self.res1.model.endog, self.res1.model.exog)
        # res_ols = self.res1.model.fit(cov_type='cluster',
        res_ols = model.fit(
            cov_type="cluster", cov_kwds=dict(groups=self.groups, use_correction=False, use_t=False, df_correction=True)
        )
        self.res3 = self.res1
        self.res1 = res_ols
        self.bse_robust = res_ols.bse
        self.cov_robust = res_ols.cov_params()
        cov1 = sw.cov_cluster(self.res1, self.groups, use_correction=False)
        se1 = sw.se_cov(cov1)
        self.bse_robust2 = se1
        self.cov_robust2 = cov1
        self.small = False
        self.res2 = res2.results_cluster_large

        self.skip_f = True
        self.rtol = 1e-6
        self.rtolh = 1e-10

示例#17

0

显示文件

    def setup(self):
        import pandas as pd
        fat_array = self.groups.reshape(-1, 1)
        fat_groups = pd.DataFrame(fat_array)

        res_ols = self.res1.get_robustcov_results('cluster',
                                                  groups=fat_groups,
                                                  use_correction=True,
                                                  use_t=True)
        self.res3 = self.res1
        self.res1 = res_ols
        self.bse_robust = res_ols.bse
        self.cov_robust = res_ols.cov_params()
        cov1 = sw.cov_cluster(self.res1, self.groups, use_correction=True)
        se1 =  sw.se_cov(cov1)
        self.bse_robust2 = se1
        self.cov_robust2 = cov1
        self.small = True
        self.res2 = res2.results_cluster

        self.rtol = 1e-6
        self.rtolh = 1e-10

示例#18

0

显示文件

文件： ex_random_panel.py 项目： PaulGureghian1/Statsmodels

    #res.resid is of transformed model
    #np.corrcoef(res.resid.reshape(-1,n_groups, order='F'))
    y_pred = np.dot(mod.exog, res.params)
    resid = y - y_pred
    print(np.corrcoef(resid.reshape(-1, n_groups, order='F')))
    print(resid.std())
    err = y_pred - dgp.y_true
    print(err.std())
    #OLS standard errors are too small
    mod.res_pooled.params
    mod.res_pooled.bse
    #heteroscedasticity robust doesn't help
    mod.res_pooled.HC1_se
    #compare with cluster robust se

    print(sw.se_cov(sw.cov_cluster(mod.res_pooled, dgp.groups.astype(int))))
    #not bad, pretty close to panel estimator
    #and with Newey-West Hac
    print(sw.se_cov(sw.cov_nw_panel(mod.res_pooled, 4, mod.group.groupidx)))
    #too small, assuming no bugs,
    #see Peterson assuming it refers to same kind of model
    print(dgp.cov)

    mod2 = ShortPanelGLS(y, dgp.exog, dgp.groups)
    res2 = mod2.fit_iterative(2)
    print(res2.params)
    print(res2.bse)
    #both implementations produce the same results:
    from numpy.testing import assert_almost_equal
    assert_almost_equal(res.params, res2.params, decimal=12)
    assert_almost_equal(res.bse, res2.bse, decimal=13)

示例#19

0

显示文件

文件： ex_sandwich2.py 项目： changhiskhan/statsmodels

#remove nan observation
mask = (xx!=-999.0).all(1)   #nan code in dta file
mask.shape
y = y[mask]
xx = xx[mask]
group = group[mask]

#run OLS

res_srs = sm.OLS(y, xx).fit()
print 'params    ', res_srs.params
print 'bse_OLS   ', res_srs.bse

#get cluster robust standard errors and compare with STATA

cov_cr = sw.cov_cluster(res_srs, group.astype(int))
bse_cr = sw.se_cov(cov_cr)
print 'bse_rob   ', bse_cr

res_stata = np.rec.array(
     [ ('growth', '|', -0.1027121, 0.22917029999999999, -0.45000000000000001, 0.65500000000000003, -0.55483519999999997, 0.34941109999999997),
       ('emer', '|', -5.4449319999999997, 0.72939690000000001, -7.46, 0.0, -6.8839379999999997, -4.0059269999999998),
       ('yr_rnd', '|', -51.075690000000002, 22.83615, -2.2400000000000002, 0.027, -96.128439999999998, -6.0229350000000004),
       ('_cons', '|', 740.3981, 13.460760000000001, 55.0, 0.0, 713.84180000000003, 766.95439999999996)],
      dtype=[('exogname', '|S6'), ('del', '|S1'), ('params', '<f8'),
             ('bse', '<f8'), ('tvalues', '<f8'), ('pvalues', '<f8'),
             ('cilow', '<f8'), ('ciupp', '<f8')])

print 'diff Stata', bse_cr - res_stata.bse
assert_almost_equal(bse_cr, res_stata.bse, decimal=6)

示例#20

0

显示文件

reg_df.to_csv("estimated_data/final_regression_dfs/rn_prob_decline_ret.csv")

# Regression in levels:
res = smf.ols(formula='ret ~ ' + var, data=reg_df).fit(cov_type="HC1")
res.summary()

# Creating a dummy of 20% decline:
reg_df["dummy_ret_neg_20"] = np.where(reg_df['ret'] <= -0.2, 1, 0)
res = smf.ols(formula='dummy_ret_neg_20 ~ ' + var,
              data=reg_df).fit(cov_type="HC1")
res.summary()

# Calculating clustered standard errors:
cov_hetero = sandwich_covariance.cov_hc1(
    res)  # Replicates robust option in STATA
cov_cluster_firm = sandwich_covariance.cov_cluster(
    res, reg_df['secid'])  # Matches with one in STATA
cov_cluster_month = sandwich_covariance.cov_cluster(
    res, reg_df['month_lead'])  # Matches with one in STATA

#### Double clustering by firm and year ############################################
# To do it we need to calculate the covariance matrix with clustering
# by firm and by time and add them together, then subtract covariance matrix
# with Heteroskedascity correction
cov_double_cluster = cov_cluster_firm + cov_cluster_month - cov_hetero

np.sqrt(np.diag(cov_double_cluster))

# Excluding 2007, 2008 and 2009:
reg_df_excl_crisis = reg_df[(reg_df["month_lead"] < "2007-01-01") |
                            (reg_df["month_lead"] > "2009-12-31")]
reg_df_excl_crisis["dummy_ret_neg_20"] = np.where(

示例#21

0

显示文件

文件： ex_random_panel.py 项目： r0k3/statsmodels

    # res.resid is of transformed model
    # np.corrcoef(res.resid.reshape(-1,n_groups, order='F'))
    y_pred = np.dot(mod.exog, res.params)
    resid = y - y_pred
    print np.corrcoef(resid.reshape(-1, n_groups, order="F"))
    print resid.std()
    err = y_pred - dgp.y_true
    print err.std()
    # OLS standard errors are too small
    mod.res_pooled.params
    mod.res_pooled.bse
    # heteroscedasticity robust doesn't help
    mod.res_pooled.HC1_se
    # compare with cluster robust se

    print sw.se_cov(sw.cov_cluster(mod.res_pooled, dgp.groups.astype(int)))
    # not bad, pretty close to panel estimator
    # and with Newey-West Hac
    print sw.se_cov(sw.cov_nw_panel(mod.res_pooled, 4, mod.group.groupidx))
    # too small, assuming no bugs,
    # see Peterson assuming it refers to same kind of model
    print dgp.cov

    mod2 = ShortPanelGLS(y, dgp.exog, dgp.groups)
    res2 = mod2.fit_iterative(2)
    print res2.params
    print res2.bse
    # both implementations produce the same results:
    from numpy.testing import assert_almost_equal

    assert_almost_equal(res.params, res2.params, decimal=12)

示例#22

0

显示文件

文件： test_random_panel.py 项目： 0ceangypsy/statsmodels

def test_short_panel():
    #this checks that some basic statistical properties are satisfied by the
    #results, not verified results against other packages
    #Note: the ranking of robust bse is different if within=True
    #I added within keyword to PanelSample to be able to use old example
    #if within is False, then there is no within group variation in exog.
    nobs = 100
    nobs_i = 5
    n_groups = nobs // nobs_i
    k_vars = 3

    dgp = PanelSample(nobs, k_vars, n_groups, corr_structure=cs.corr_arma,
                      corr_args=([1], [1., -0.9],), seed=377769, within=False)
    #print 'seed', dgp.seed
    y = dgp.generate_panel()
    noise = y - dgp.y_true

    #test dgp

    dgp_cov_e = np.array(
              [[ 1.    ,  0.9   ,  0.81  ,  0.729 ,  0.6561],
               [ 0.9   ,  1.    ,  0.9   ,  0.81  ,  0.729 ],
               [ 0.81  ,  0.9   ,  1.    ,  0.9   ,  0.81  ],
               [ 0.729 ,  0.81  ,  0.9   ,  1.    ,  0.9   ],
               [ 0.6561,  0.729 ,  0.81  ,  0.9   ,  1.    ]])

    npt.assert_almost_equal(dgp.cov, dgp_cov_e, 13)

    cov_noise = np.cov(noise.reshape(-1,n_groups, order='F'))
    corr_noise = cov2corr(cov_noise)
    npt.assert_almost_equal(corr_noise, dgp.cov, 1)

    #estimate panel model
    mod2 = ShortPanelGLS(y, dgp.exog, dgp.groups)
    res2 = mod2.fit_iterative(2)


    #whitened residual should be uncorrelated
    corr_wresid = np.corrcoef(res2.wresid.reshape(-1,n_groups, order='F'))
    assert_maxabs(corr_wresid, np.eye(5), 0.1)

    #residual should have same correlation as dgp
    corr_resid = np.corrcoef(res2.resid.reshape(-1,n_groups, order='F'))
    assert_maxabs(corr_resid, dgp.cov, 0.1)

    assert_almost_equal(res2.resid.std(),1, decimal=0)

    y_pred = np.dot(mod2.exog, res2.params)
    assert_almost_equal(res2.fittedvalues, y_pred, 13)


    #compare with OLS

    res2_ols = mod2._fit_ols()
    npt.assert_(mod2.res_pooled is res2_ols)

    res2_ols = mod2.res_pooled  #TODO: BUG: requires call to _fit_ols

    #fitting once is the same as OLS
    #note: I need to create new instance, otherwise it continuous fitting
    mod1 = ShortPanelGLS(y, dgp.exog, dgp.groups)
    res1 = mod1.fit_iterative(1)

    assert_almost_equal(res1.params, res2_ols.params, decimal=13)
    assert_almost_equal(res1.bse, res2_ols.bse, decimal=13)

    res_ols = OLS(y, dgp.exog).fit()
    assert_almost_equal(res1.params, res_ols.params, decimal=13)
    assert_almost_equal(res1.bse, res_ols.bse, decimal=13)


    #compare with old version
    mod_old = ShortPanelGLS2(y, dgp.exog, dgp.groups)
    res_old = mod_old.fit()

    assert_almost_equal(res2.params, res_old.params, decimal=13)
    assert_almost_equal(res2.bse, res_old.bse, decimal=13)


    mod5 = ShortPanelGLS(y, dgp.exog, dgp.groups)
    res5 = mod5.fit_iterative(5)

    #make sure it's different
    #npt.assert_array_less(0.009, em.maxabs(res5.bse, res2.bse))

    cov_clu = sw.cov_cluster(mod2.res_pooled, dgp.groups.astype(int))
    clubse = se_cov(cov_clu)
    pnwbse = se_cov(sw.cov_nw_panel(mod2.res_pooled, 4, mod2.group.groupidx))
    bser = np.vstack((res2.bse, res5.bse, clubse, pnwbse))
    bser_mean = np.mean(bser, axis=0)

    #cov_cluster close to robust and PanelGLS
    #is up to 24% larger than mean of bser
    #npt.assert_array_less(0, clubse / bser_mean - 1)
    npt.assert_array_less(clubse / bser_mean - 1, 0.25)
    #cov_nw_panel close to robust and PanelGLS
    npt.assert_array_less(pnwbse / bser_mean - 1, 0.1)
    #OLS underestimates bse, robust at least 60% larger
    npt.assert_array_less(0.6, bser_mean / res_ols.bse  - 1)

    #cov_hac_panel with uniform_kernel is the same as cov_cluster for balanced
    #panel with full length kernel
    #I fixe default correction to be equal
    cov_uni = sw.cov_nw_panel(mod2.res_pooled, 4, mod2.group.groupidx,
                              weights_func=sw.weights_uniform,
                              use_correction='c')
    assert_almost_equal(cov_uni, cov_clu, decimal=13)

    #without correction
    cov_clu2 = sw.cov_cluster(mod2.res_pooled, dgp.groups.astype(int),
                              use_correction=False)
    cov_uni2 = sw.cov_nw_panel(mod2.res_pooled, 4, mod2.group.groupidx,
                              weights_func=sw.weights_uniform,
                              use_correction=False)
    assert_almost_equal(cov_uni2, cov_clu2, decimal=13)

    cov_white = sw.cov_white_simple(mod2.res_pooled)
    cov_pnw0 = sw.cov_nw_panel(mod2.res_pooled, 0, mod2.group.groupidx,
                              use_correction='hac')
    assert_almost_equal(cov_pnw0, cov_white, decimal=13)

示例#23

0

显示文件

#remove nan observation
mask = (xx!=-999.0).all(1)   #nan code in dta file
mask.shape
y = y[mask]
xx = xx[mask]
group = group[mask]

#run OLS

res_srs = sm.OLS(y, xx).fit()
print('params    ', res_srs.params)
print('bse_OLS   ', res_srs.bse)

#get cluster robust standard errors and compare with STATA

cov_cr = sw.cov_cluster(res_srs, group.astype(int))
bse_cr = sw.se_cov(cov_cr)
print('bse_rob   ', bse_cr)

res_stata = np.rec.array(
    [('growth', '|', -0.1027121, 0.22917029999999999, -0.45000000000000001, 0.65500000000000003, -0.55483519999999997, 0.34941109999999997),
     ('emer', '|', -5.4449319999999997, 0.72939690000000001, -7.46, 0.0, -6.8839379999999997, -4.0059269999999998),
     ('yr_rnd', '|', -51.075690000000002, 22.83615, -2.2400000000000002, 0.027, -96.128439999999998, -6.0229350000000004),
     ('_cons', '|', 740.3981, 13.460760000000001, 55.0, 0.0, 713.84180000000003, 766.95439999999996)],
    dtype=[('exogname', '|S6'), ('del', '|S1'), ('params', 'float'),
           ('bse', 'float'), ('tvalues', 'float'), ('pvalues', 'float'),
           ('cilow', 'float'), ('ciupp', 'float')])

print('diff Stata', bse_cr - res_stata.bse)
assert_almost_equal(bse_cr, res_stata.bse, decimal=6)

示例#24

0

显示文件

def get_robustcov_results(self, cov_type='HC1', use_t=None, **kwds):
    """create new results instance with robust covariance as default

    Parameters
    ----------
    cov_type : string
        the type of robust sandwich estimator to use. see Notes below
    use_t : bool
        If true, then the t distribution is used for inference.
        If false, then the normal distribution is used.
    kwds : depends on cov_type
        Required or optional arguments for robust covariance calculation.
        see Notes below

    Returns
    -------
    results : results instance
        This method creates a new results instance with the requested
        robust covariance as the default covariance of the parameters.
        Inferential statistics like p-values and hypothesis tests will be
        based on this covariance matrix.

    Notes
    -----
    Warning: Some of the options and defaults in cov_kwds may be changed in a
    future version.

    The covariance keywords provide an option 'scaling_factor' to adjust the
    scaling of the covariance matrix, that is the covariance is multiplied by
    this factor if it is given and is not `None`. This allows the user to
    adjust the scaling of the covariance matrix to match other statistical
    packages.
    For example, `scaling_factor=(nobs - 1.) / (nobs - k_params)` provides a
    correction so that the robust covariance matrices match those of Stata in
    some models like GLM and discrete Models.

    The following covariance types and required or optional arguments are
    currently available:

    - 'HC0', 'HC1', 'HC2', 'HC3' and no keyword arguments:
        heteroscedasticity robust covariance
    - 'HAC' and keywords

        - `maxlag` integer (required) : number of lags to use
        - `kernel` string (optional) : kernel, default is Bartlett
        - `use_correction` bool (optional) : If true, use small sample
              correction

    - 'cluster' and required keyword `groups`, integer group indicator

        - `groups` array_like, integer (required) :
              index of clusters or groups
        - `use_correction` bool (optional) :
              If True the sandwich covariance is calulated with a small
              sample correction.
              If False the the sandwich covariance is calulated without
              small sample correction.
        - `df_correction` bool (optional)
              If True (default), then the degrees of freedom for the
              inferential statistics and hypothesis tests, such as
              pvalues, f_pvalue, conf_int, and t_test and f_test, are
              based on the number of groups minus one instead of the
              total number of observations minus the number of explanatory
              variables. `df_resid` of the results instance is adjusted.
              If False, then `df_resid` of the results instance is not
              adjusted.

    - 'hac-groupsum' Driscoll and Kraay, heteroscedasticity and
        autocorrelation robust standard errors in panel data
        keywords

        - `time` array_like (required) : index of time periods
        - `maxlag` integer (required) : number of lags to use
        - `kernel` string (optional) : kernel, default is Bartlett
        - `use_correction` False or string in ['hac', 'cluster'] (optional) :
              If False the the sandwich covariance is calulated without
              small sample correction.
              If `use_correction = 'cluster'` (default), then the same
              small sample correction as in the case of 'covtype='cluster''
              is used.
        - `df_correction` bool (optional)
              adjustment to df_resid, see cov_type 'cluster' above
              #TODO: we need more options here

    - 'hac-panel' heteroscedasticity and autocorrelation robust standard
        errors in panel data.
        The data needs to be sorted in this case, the time series for
        each panel unit or cluster need to be stacked. The membership to
        a timeseries of an individual or group can be either specified by
        group indicators or by increasing time periods.

        keywords

        - either `groups` or `time` : array_like (required)
          `groups` : indicator for groups
          `time` : index of time periods
        - `maxlag` integer (required) : number of lags to use
        - `kernel` string (optional) : kernel, default is Bartlett
        - `use_correction` False or string in ['hac', 'cluster'] (optional) :
              If False the the sandwich covariance is calulated without
              small sample correction.
        - `df_correction` bool (optional)
              adjustment to df_resid, see cov_type 'cluster' above
              #TODO: we need more options here

    Reminder:
    `use_correction` in "hac-groupsum" and "hac-panel" is not bool,
    needs to be in [False, 'hac', 'cluster']

    TODO: Currently there is no check for extra or misspelled keywords,
    except in the case of cov_type `HCx`

    """

    import statsmodels.stats.sandwich_covariance as sw

    #normalize names
    if cov_type == 'nw-panel':
        cov_type = 'hac-panel'
    if cov_type == 'nw-groupsum':
        cov_type = 'hac-groupsum'
    if 'kernel' in kwds:
            kwds['weights_func'] = kwds.pop('kernel')

    # pop because HCx raises if any kwds
    sc_factor = kwds.pop('scaling_factor', None)

    # TODO: make separate function that returns a robust cov plus info
    use_self = kwds.pop('use_self', False)
    if use_self:
        res = self
    else:
        # this doesn't work for most models, use raw instance instead from fit
        res = self.__class__(self.model, self.params,
                   normalized_cov_params=self.normalized_cov_params,
                   scale=self.scale)

    res.cov_type = cov_type
    # use_t might already be defined by the class, and already set
    if use_t is None:
        use_t = self.use_t
    res.cov_kwds = {'use_t':use_t}  # store for information
    res.use_t = use_t

    adjust_df = False
    if cov_type in ['cluster', 'hac-panel', 'hac-groupsum']:
        df_correction = kwds.get('df_correction', None)
        # TODO: check also use_correction, do I need all combinations?
        if df_correction is not False: # i.e. in [None, True]:
            # user didn't explicitely set it to False
            adjust_df = True

    res.cov_kwds['adjust_df'] = adjust_df

    # verify and set kwds, and calculate cov
    # TODO: this should be outsourced in a function so we can reuse it in
    #       other models
    # TODO: make it DRYer   repeated code for checking kwds
    if cov_type.upper() in ('HC0', 'HC1', 'HC2', 'HC3'):
        if kwds:
            raise ValueError('heteroscedasticity robust covarians ' +
                             'does not use keywords')
        res.cov_kwds['description'] = ('Standard Errors are heteroscedasticity ' +
                                       'robust ' + '(' + cov_type + ')')

        res.cov_params_default = getattr(self, 'cov_' + cov_type.upper(), None)
        if res.cov_params_default is None:
            # results classes that don't have cov_HCx attribute
            res.cov_params_default = sw.cov_white_simple(self,
                                                         use_correction=False)
    elif cov_type.lower() == 'hac':
        maxlags = kwds['maxlags']   # required?, default in cov_hac_simple
        res.cov_kwds['maxlags'] = maxlags
        weights_func = kwds.get('weights_func', sw.weights_bartlett)
        res.cov_kwds['weights_func'] = weights_func
        use_correction = kwds.get('use_correction', False)
        res.cov_kwds['use_correction'] = use_correction
        res.cov_kwds['description'] = ('Standard Errors are heteroscedasticity ' +
             'and autocorrelation robust (HAC) using %d lags and %s small ' +
             'sample correction') % (maxlags, ['without', 'with'][use_correction])

        res.cov_params_default = sw.cov_hac_simple(self, nlags=maxlags,
                                             weights_func=weights_func,
                                             use_correction=use_correction)
    elif cov_type.lower() == 'cluster':
        #cluster robust standard errors, one- or two-way
        groups = kwds['groups']
        if not hasattr(groups, 'shape'):
            groups = np.asarray(groups).T

        if groups.ndim >= 2:
            groups = groups.squeeze()

        res.cov_kwds['groups'] = groups
        use_correction = kwds.get('use_correction', True)
        res.cov_kwds['use_correction'] = use_correction
        if groups.ndim == 1:
            if adjust_df:
                # need to find number of groups
                # duplicate work
                self.n_groups = n_groups = len(np.unique(groups))
            res.cov_params_default = sw.cov_cluster(self, groups,
                                             use_correction=use_correction)

        elif groups.ndim == 2:
            if hasattr(groups, 'values'):
                groups = groups.values

            if adjust_df:
                # need to find number of groups
                # duplicate work
                n_groups0 = len(np.unique(groups[:,0]))
                n_groups1 = len(np.unique(groups[:, 1]))
                self.n_groups = (n_groups0, n_groups1)
                n_groups = min(n_groups0, n_groups1) # use for adjust_df

            # Note: sw.cov_cluster_2groups has 3 returns
            res.cov_params_default = sw.cov_cluster_2groups(self, groups,
                                         use_correction=use_correction)[0]
        else:
            raise ValueError('only two groups are supported')
        res.cov_kwds['description'] = ('Standard Errors are robust to' +
                            'cluster correlation ' + '(' + cov_type + ')')

    elif cov_type.lower() == 'hac-panel':
        #cluster robust standard errors
        res.cov_kwds['time'] = time = kwds.get('time', None)
        res.cov_kwds['groups'] = groups = kwds.get('groups', None)
        #TODO: nlags is currently required
        #nlags = kwds.get('nlags', True)
        #res.cov_kwds['nlags'] = nlags
        #TODO: `nlags` or `maxlags`
        res.cov_kwds['maxlags'] = maxlags = kwds['maxlags']
        use_correction = kwds.get('use_correction', 'hac')
        res.cov_kwds['use_correction'] = use_correction
        weights_func = kwds.get('weights_func', sw.weights_bartlett)
        res.cov_kwds['weights_func'] = weights_func
        # TODO: clumsy time index in cov_nw_panel
        if groups is not None:
            groups = np.asarray(groups)
            tt = (np.nonzero(groups[:-1] != groups[1:])[0] + 1).tolist()
            nobs_ = len(groups)
        elif time is not None:
            # TODO: clumsy time index in cov_nw_panel
            time = np.asarray(time)
            tt = (np.nonzero(time[1:] < time[:-1])[0] + 1).tolist()
            nobs_ = len(time)
        else:
            raise ValueError('either time or groups needs to be given')
        groupidx = lzip([0] + tt, tt + [nobs_])
        self.n_groups = n_groups = len(groupidx)
        res.cov_params_default = sw.cov_nw_panel(self, maxlags, groupidx,
                                            weights_func=weights_func,
                                            use_correction=use_correction)
        res.cov_kwds['description'] = ('Standard Errors are robust to' +
                            'cluster correlation ' + '(' + cov_type + ')')
    elif cov_type.lower() == 'hac-groupsum':
        # Driscoll-Kraay standard errors
        res.cov_kwds['time'] = time = kwds['time']
        #TODO: nlags is currently required
        #nlags = kwds.get('nlags', True)
        #res.cov_kwds['nlags'] = nlags
        #TODO: `nlags` or `maxlags`
        res.cov_kwds['maxlags'] = maxlags = kwds['maxlags']
        use_correction = kwds.get('use_correction', 'cluster')
        res.cov_kwds['use_correction'] = use_correction
        weights_func = kwds.get('weights_func', sw.weights_bartlett)
        res.cov_kwds['weights_func'] = weights_func
        if adjust_df:
            # need to find number of groups
            tt = (np.nonzero(time[1:] < time[:-1])[0] + 1)
            self.n_groups = n_groups = len(tt) + 1
        res.cov_params_default = sw.cov_nw_groupsum(self, maxlags, time,
                                        weights_func=weights_func,
                                        use_correction=use_correction)
        res.cov_kwds['description'] = (
                    'Driscoll and Kraay Standard Errors are robust to ' +
                    'cluster correlation ' + '(' + cov_type + ')')
    else:
        raise ValueError('cov_type not recognized. See docstring for ' +
                         'available options and spelling')

    # generic optional factor to scale covariance

    res.cov_kwds['scaling_factor'] = sc_factor
    if sc_factor is not None:
        res.cov_params_default *= sc_factor

    if adjust_df:
        # Note: df_resid is used for scale and others, add new attribute
        res.df_resid_inference = n_groups - 1

    return res

示例#25

0

显示文件

def test_short_panel():
    #this checks that some basic statistical properties are satisfied by the
    ##1lab_results, not verified #1lab_results against other packages
    #Note: the ranking of robust bse is different if within=True
    #I added within keyword to PanelSample to be able to use old example
    #if within is False, then there is no within group variation in exog.
    nobs = 100
    nobs_i = 5
    n_groups = nobs // nobs_i
    k_vars = 3

    dgp = PanelSample(nobs, k_vars, n_groups, corr_structure=cs.corr_arma,
                      corr_args=([1], [1., -0.9],), seed=377769, within=False)
    #print 'seed', dgp.seed
    y = dgp.generate_panel()
    noise = y - dgp.y_true

    #test dgp

    dgp_cov_e = np.array(
              [[ 1.    ,  0.9   ,  0.81  ,  0.729 ,  0.6561],
               [ 0.9   ,  1.    ,  0.9   ,  0.81  ,  0.729 ],
               [ 0.81  ,  0.9   ,  1.    ,  0.9   ,  0.81  ],
               [ 0.729 ,  0.81  ,  0.9   ,  1.    ,  0.9   ],
               [ 0.6561,  0.729 ,  0.81  ,  0.9   ,  1.    ]])

    npt.assert_almost_equal(dgp.cov, dgp_cov_e, 13)

    cov_noise = np.cov(noise.reshape(-1,n_groups, order='F'))
    corr_noise = cov2corr(cov_noise)
    npt.assert_almost_equal(corr_noise, dgp.cov, 1)

    #estimate panel model
    mod2 = ShortPanelGLS(y, dgp.exog, dgp.groups)
    res2 = mod2.fit_iterative(2)


    #whitened residual should be uncorrelated
    corr_wresid = np.corrcoef(res2.wresid.reshape(-1,n_groups, order='F'))
    assert_maxabs(corr_wresid, np.eye(5), 0.1)

    #residual should have same correlation as dgp
    corr_resid = np.corrcoef(res2.resid.reshape(-1,n_groups, order='F'))
    assert_maxabs(corr_resid, dgp.cov, 0.1)

    assert_almost_equal(res2.resid.std(),1, decimal=0)

    y_pred = np.dot(mod2.exog, res2.params)
    assert_almost_equal(res2.fittedvalues, y_pred, 13)


    #compare with OLS

    res2_ols = mod2._fit_ols()
    npt.assert_(mod2.res_pooled is res2_ols)

    res2_ols = mod2.res_pooled  #TODO: BUG: requires call to _fit_ols

    #fitting once is the same as OLS
    #note: I need to create new instance, otherwise it continuous fitting
    mod1 = ShortPanelGLS(y, dgp.exog, dgp.groups)
    res1 = mod1.fit_iterative(1)

    assert_almost_equal(res1.params, res2_ols.params, decimal=13)
    assert_almost_equal(res1.bse, res2_ols.bse, decimal=13)

    res_ols = OLS(y, dgp.exog).fit()
    assert_almost_equal(res1.params, res_ols.params, decimal=13)
    assert_almost_equal(res1.bse, res_ols.bse, decimal=13)


    #compare with old version
    mod_old = ShortPanelGLS2(y, dgp.exog, dgp.groups)
    res_old = mod_old.fit()

    assert_almost_equal(res2.params, res_old.params, decimal=13)
    assert_almost_equal(res2.bse, res_old.bse, decimal=13)


    mod5 = ShortPanelGLS(y, dgp.exog, dgp.groups)
    res5 = mod5.fit_iterative(5)

    #make sure it's different
    #npt.assert_array_less(0.009, em.maxabs(res5.bse, res2.bse))

    cov_clu = sw.cov_cluster(mod2.res_pooled, dgp.groups.astype(int))
    clubse = se_cov(cov_clu)
    pnwbse = se_cov(sw.cov_nw_panel(mod2.res_pooled, 4, mod2.group.groupidx))
    bser = np.vstack((res2.bse, res5.bse, clubse, pnwbse))
    bser_mean = np.mean(bser, axis=0)

    #cov_cluster close to robust and PanelGLS
    #is up to 24% larger than mean of bser
    #npt.assert_array_less(0, clubse / bser_mean - 1)
    npt.assert_array_less(clubse / bser_mean - 1, 0.25)
    #cov_nw_panel close to robust and PanelGLS
    npt.assert_array_less(pnwbse / bser_mean - 1, 0.1)
    #OLS underestimates bse, robust at least 60% larger
    npt.assert_array_less(0.6, bser_mean / res_ols.bse  - 1)

    #cov_hac_panel with uniform_kernel is the same as cov_cluster for balanced
    #panel with full length kernel
    #I fixe default correction to be equal
    cov_uni = sw.cov_nw_panel(mod2.res_pooled, 4, mod2.group.groupidx,
                              weights_func=sw.weights_uniform,
                              use_correction='c')
    assert_almost_equal(cov_uni, cov_clu, decimal=13)

    #without correction
    cov_clu2 = sw.cov_cluster(mod2.res_pooled, dgp.groups.astype(int),
                              use_correction=False)
    cov_uni2 = sw.cov_nw_panel(mod2.res_pooled, 4, mod2.group.groupidx,
                              weights_func=sw.weights_uniform,
                              use_correction=False)
    assert_almost_equal(cov_uni2, cov_clu2, decimal=13)

    cov_white = sw.cov_white_simple(mod2.res_pooled)
    cov_pnw0 = sw.cov_nw_panel(mod2.res_pooled, 0, mod2.group.groupidx,
                              use_correction='hac')
    assert_almost_equal(cov_pnw0, cov_white, decimal=13)

示例#26

0

显示文件

文件： ex_sandwich.py 项目： zhisheng/statsmodels

#test White
assert_almost_equal(bse_w, self.HC0_se, 15)

bse_wc = sw.se_cov(sw.cov_white_simple(self, use_correction=True))
print bse_wc
#test White
assert_almost_equal(bse_wc, self.HC1_se, 15)


groups = np.repeat(np.arange(5), 20)

idx = np.nonzero(np.diff(groups))[0].tolist()
groupidx = zip([0]+idx, idx+[len(groups)])
ngroups = len(groupidx)

print sw.se_cov(sw.cov_cluster(self, groups))
#two strange looking corner cases BUG?
print sw.se_cov(sw.cov_cluster(self, np.ones(len(endog), int), use_correction=False))
print sw.se_cov(sw.cov_crosssection_0(self, np.arange(len(endog))))
#these results are close to simple (no group) white, 50 groups 2 obs each
groups = np.repeat(np.arange(50), 100//50)
print sw.se_cov(sw.cov_cluster(self, groups))
#2 groups with 50 obs each, what was the interpretation again?
groups = np.repeat(np.arange(2), 100//2)
print sw.se_cov(sw.cov_cluster(self, groups))

"http://www.kellogg.northwestern.edu/faculty/petersen/htm/papers/se/test_data.txt"
'''
test <- read.table(
      url(paste("http://www.kellogg.northwestern.edu/",
            "faculty/petersen/htm/papers/se/",

示例#27

0

显示文件

文件： test_sandwich_cov.py 项目： GonzaloUlla/Bankruptcy-Prediction-using-Machine-Learning-Algorithms-in-Python

    def get_robust_clu(cls):
        res1 = cls.res1
        cov_clu = sw.cov_cluster(res1, group)
        cls.bse_rob = sw.se_cov(cov_clu)

        cls.corr_fact = cls.get_correction_factor(res1)