def vif_cal(input_data, dependent_col, endo, instrument, reg):
    # Compute variance inflation factors (VIFs) for the explanatory variables,
    # using auxiliary OLS regressions or auxiliary 2SLS regressions.
    if reg == '2SLS':
        x_vars = input_data.drop([dependent_col, instrument], axis=1)
        ins_vars = input_data.drop([dependent_col, endo], axis=1)
    else:
        x_vars = input_data.drop([dependent_col, instrument], axis=1)
        ins_vars = input_data.drop([dependent_col, instrument], axis=1)
    xvar_names = x_vars.columns
    vif = list()
    for i in range(0, xvar_names.shape[0]):
        # Regress each explanatory variable on the remaining ones and convert
        # the resulting R-squared into a VIF.
        y = x_vars[xvar_names[i]]
        x = x_vars[xvar_names.drop(xvar_names[i])]
        if reg == '2SLS':
            rsq = IV2SLS(y, x, ins_vars).fit().rsquared
        else:
            rsq = smf.ols(formula="y~x", data=x_vars).fit().rsquared
        vif.append(round(1 / (1 - rsq), 2))
    if reg == 'OLS':
        return pd.DataFrame({'Var': xvar_names[i], 'VIF/OLS': vif[i]}
                            for i in range(0, xvar_names.shape[0]))
    elif reg == '2SLS':
        return pd.DataFrame({'Var': xvar_names[i], 'VIF/2SLS': vif[i]}
                            for i in range(0, xvar_names.shape[0]))
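# Illustrative usage sketch for vif_cal (not from the original source). The DataFrame
# and column names (y, x1, x2, z1) are hypothetical: z1 instruments the endogenous x1.
# Assumes the pandas / statsmodels imports used by vif_cal are already in scope.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 500
z1 = rng.normal(size=n)                        # instrument
u = rng.normal(size=n)                         # unobserved confounder
x1 = 0.8 * z1 + u + rng.normal(size=n)         # endogenous regressor
x2 = rng.normal(size=n)                        # exogenous regressor
y = 1.5 * x1 - 0.5 * x2 + u + rng.normal(size=n)
demo = pd.DataFrame({'y': y, 'x1': x1, 'x2': x2, 'z1': z1})

print(vif_cal(demo, dependent_col='y', endo='x1', instrument='z1', reg='OLS'))
print(vif_cal(demo, dependent_col='y', endo='x1', instrument='z1', reg='2SLS'))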
def _estimate_effect(self):
    if len(self.estimating_instrument_names) == 1 and len(self._treatment_name) == 1:
        instrument = self._estimating_instruments.iloc[:, 0]
        self.logger.debug("Instrument Variable values: {0}".format(instrument))
        num_unique_values = len(np.unique(instrument))
        instrument_is_binary = (num_unique_values <= 2)
        if instrument_is_binary:
            # Obtain estimate by Wald Estimator
            y1_z = np.mean(self._outcome[instrument == 1])
            y0_z = np.mean(self._outcome[instrument == 0])
            x1_z = np.mean(self._treatment[self._treatment_name[0]][instrument == 1])
            x0_z = np.mean(self._treatment[self._treatment_name[0]][instrument == 0])
            num = y1_z - y0_z
            deno = x1_z - x0_z
            iv_est = num / deno
        else:
            # Obtain estimate by 2SLS estimator: Cov(y, z) / Cov(x, z)
            num_yz = np.cov(self._outcome, instrument)[0, 1]
            deno_xz = np.cov(self._treatment[self._treatment_name[0]], instrument)[0, 1]
            iv_est = num_yz / deno_xz
    else:
        # More than one instrument. Use 2SLS.
        est_treatment = self._treatment.astype(np.float32)
        est_outcome = self._outcome.astype(np.float32)
        ivmodel = IV2SLS(est_outcome, est_treatment, self._estimating_instruments)
        reg_results = ivmodel.fit()
        print(reg_results.summary())
        # the effect is the same for any treatment value (assume treatment goes from 0 to 1)
        iv_est = sum(reg_results.params)
    estimate = CausalEstimate(estimate=iv_est,
                              control_value=self._control_value,
                              treatment_value=self._treatment_value,
                              target_estimand=self._target_estimand,
                              realized_estimand_expr=self.symbolic_estimator)
    return estimate
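# Self-contained sketch of the Wald (binary-instrument) estimator used in the branch
# above, run on simulated data. All names and the true effect of 2.0 are illustrative.
import numpy as np

rng = np.random.default_rng(42)
n = 20_000
z = rng.integers(0, 2, size=n)                                     # binary instrument
u = rng.normal(size=n)                                             # unobserved confounder
x = (0.8 * z + 0.5 * u + rng.normal(size=n) > 0.5).astype(float)   # treatment
y = 2.0 * x + u + rng.normal(size=n)                               # outcome, true effect 2.0

wald = (y[z == 1].mean() - y[z == 0].mean()) / (x[z == 1].mean() - x[z == 0].mean())
print(f"Wald estimate: {wald:.3f}")  # should be close to 2.0 in large samples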
def IVRegression2(data):
    """
    Estimate the model parameters using an instrumental variable.
    """
    data = sm.add_constant(data[["Y", "X1", "X2", "IV1"]])
    model = IV2SLS(data[["Y"]], data[["X1", "X2", "const"]],
                   data[["IV1", "X2", "const"]])
    re = model.fit()
    print("With instrumental variable")
    print(re.summary())
    print("Durbin–Wu–Hausman test")
    print(re.spec_hausman())
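# Hypothetical usage of IVRegression2 on simulated data (not from the original source).
# Column names Y, X1, X2, IV1 follow the naming the function expects; X1 is endogenous
# and IV1 is its instrument.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 500
iv1 = rng.normal(size=n)
u = rng.normal(size=n)                               # confounder making X1 endogenous
x1 = 0.9 * iv1 + u + rng.normal(size=n)
x2 = rng.normal(size=n)
y = 1.0 + 2.0 * x1 - 1.0 * x2 + u + rng.normal(size=n)

IVRegression2(pd.DataFrame({'Y': y, 'X1': x1, 'X2': x2, 'IV1': iv1}))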
def TSLS_FIX(df, y_var, first_y, X_vars, IV, fix1, fix2=None, add_intercept=True):
    """
    Two-stage least squares with fixed effects (within transformation).
    At least one fixed-effect variable is required, and at most two are allowed.

    Inputs.
    ---------
    df: pd.DataFrame, the data for the regression.
    y_var: str, the column name of the dependent variable
    first_y: str, the column name of the first-stage y (the endogenous regressor)
    X_vars: list of str, the list of explanatory variable names
    IV: list of str, the list of instrumental variable names
    fix1: str, the column name of the first fixed-effect variable
    fix2: str, the column name of the second fixed-effect variable

    Outputs.
    ---------
    res: obj
    """
    new_df = df.copy()
    new_df = new_df.dropna()
    if fix2 is None:
        # data.dropna(subset=[fix1], inplace=True)
        fix2 = 'time_index'
        # new_df = new_df[[y_var] + [first_y] + X_vars + IV + [fix1]]
        new_df = new_df.groupby(fix1).apply(demean)
    else:
        # data.dropna(subset=[fix1, fix2], inplace=True)
        # new_df = new_df[[y_var] + [first_y] + X_vars + IV + [fix1, fix2]]
        new_df = new_df.groupby([fix1, fix2]).apply(demean)
    y = new_df[y_var]
    if add_intercept:
        new_df['intercept'] = 1.0
        X = new_df[['intercept'] + [first_y] + X_vars]
        X_vars = ['intercept'] + X_vars
    else:
        X = new_df[[first_y] + X_vars]
    # Instruments: the IV plus every exogenous regressor
    TSLS_mod = IV2SLS(endog=y, exog=X, instrument=new_df[X_vars + IV])
    res = TSLS_mod.fit()
    return res
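# The `demean` helper used by TSLS_FIX above is not shown in this snippet. A minimal
# group-wise within-transformation sketch (an assumption, not the original helper):
# subtract the group mean from every numeric column of the group.
def demean(group):
    numeric = group.select_dtypes(include='number')
    return numeric - numeric.mean()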
def two_stage_least_squares(z: np.ndarray, x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Fit 2SLS model to data.

    Args:
        z: Instrument
        x: Treatment
        y: Outcome

    Returns:
        coeff: The coefficients of the estimated linear cause-effect relation.
    """
    x = add_constant(onp.array(x))
    z = add_constant(onp.array(z))
    y = onp.array(y)
    iv2sls = IV2SLS(y, x, z).fit()
    logging.info(iv2sls.summary())
    return np.array(iv2sls.params)
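# Illustrative call to two_stage_least_squares on simulated data (not from the original
# source); the data-generating process and the true effect of 1.5 are assumptions.
import numpy as onp

rng = onp.random.default_rng(0)
n = 5_000
z = rng.normal(size=(n, 1))                         # instrument
u = rng.normal(size=(n, 1))                         # confounder
x = 0.7 * z + u + rng.normal(size=(n, 1))           # treatment
y = 1.5 * x[:, 0] + u[:, 0] + rng.normal(size=n)    # outcome, true effect 1.5

coeffs = two_stage_least_squares(z, x, y)           # [intercept, effect], effect near 1.5
print(coeffs)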
def fit(self, X, treatment, y, w):
    """Fits the 2SLS model.

    Args:
        X (np.matrix or np.array or pd.DataFrame): a feature matrix
        treatment (np.array or pd.Series): a treatment vector
        y (np.array or pd.Series): an outcome vector
        w (np.array or pd.Series): an instrument vector
    """
    X, treatment, y, w = convert_pd_to_np(X, treatment, y, w)

    # exog = [const, features, treatment]; instrument = [const, features, instrument]
    exog = sm.add_constant(np.c_[X, treatment])
    endog = y
    instrument = sm.add_constant(np.c_[X, w])

    self.iv_model = IV2SLS(endog=endog, exog=exog, instrument=instrument)
    self.iv_fit = self.iv_model.fit()
def weak_instruments(self, n_sims=20):
    np.random.seed(1692)
    model = feedforward.FeedForwardModel(19, 1, dense_size=60, n_dense_layers=2)

    treatment_effects = []
    ols_betas, ols_ses = [], []
    old_corrs, new_corrs = [], []
    for _ in range(n_sims):
        df = self.treatment_gen.simulate_data(False)

        X = np.hstack((self.x, df['new_treat'].values[:, None]))
        Z = np.hstack((self.x, df['instrument'].values[:, None]))

        ols_beta, ols_se = self.fit_ols(df['treatment_effect'], X)
        ols_betas.append(ols_beta)
        ols_ses.append(ols_se)

        old_corr = df[['instrument', 'new_treat']].corr().values[0, 1]
        new_instrument, new_corr = model.fit_instruments(
            X, Z, df['treatment_effect'].values, batchsize=128)
        new_corrs.append(new_corr)
        old_corrs.append(old_corr)

        Z2 = Z.copy()
        Z2[:, -1] = new_instrument[:, 0]

        iv = IV2SLS(df['treatment_effect'].values.flatten(),
                    add_constant(X), add_constant(Z2))
        # Record the 2SLS estimate of the treatment coefficient (last column of X).
        # The original snippet built `iv` without using it; appending the fitted
        # coefficient here is an assumed completion.
        treatment_effects.append(iv.fit().params[-1])
        model.reset_params()

        if new_corr:
            logger.info("Old corr: %.2f, New corr: %.2f",
                        np.mean(old_corrs), np.mean(new_corrs))

    logger.info("Treatment effect (OLS): %.3f (%.4f)",
                np.mean(ols_betas), np.mean(ols_ses))
    logger.info("Treatment effect: %.3f (%.4f)",
                np.mean(treatment_effects), np.std(treatment_effects))
def TSLS(df, y_var, firsts_y, X_vars, IV, add_intercept=True):
    """
    Two-stage least squares (without fixed effects).

    Inputs.
    ---------
    df: pd.DataFrame, the data for the regression.
    y_var: str, the column name of the dependent variable
    firsts_y: str, the column name of the first-stage y (the endogenous regressor)
    X_vars: list of str, the list of explanatory variable names
    IV: list of str, the list of instrumental variable names

    Outputs.
    ---------
    res: obj
    """
    new_df = df.copy()
    new_df = new_df.dropna()
    y = new_df[y_var]
    if add_intercept:
        new_df['intercept'] = 1.0
        x = ['intercept'] + [firsts_y] + X_vars
        # new_df.dropna(subset=temp, inplace=True)
        X = new_df[x]
        # X = new_df[['intercept'] + [firsts_y] + X_vars]
        X_vars = ['intercept'] + X_vars
    else:
        x = [firsts_y] + X_vars
        # new_df.dropna(subset=temp, inplace=True)
        X = new_df[x]
        # X = new_df[[firsts_y] + X_vars]
    # Instruments: the IV plus every exogenous regressor
    TSLS_mod = IV2SLS(endog=y, exog=X, instrument=new_df[X_vars + IV])
    res = TSLS_mod.fit()
    return res
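# Hypothetical usage of TSLS on simulated data (not from the original source); the
# DataFrame and column names are illustrative.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
n = 1_000
z = rng.normal(size=n)                               # instrument
u = rng.normal(size=n)                               # unobserved confounder
x_endog = 0.8 * z + u + rng.normal(size=n)           # endogenous regressor
control = rng.normal(size=n)                         # exogenous control
y = 2.0 * x_endog + 0.5 * control + u + rng.normal(size=n)
demo = pd.DataFrame({'y': y, 'x_endog': x_endog, 'control': control, 'z': z})

res = TSLS(demo, y_var='y', firsts_y='x_endog', X_vars=['control'], IV=['z'])
print(res.params)   # the coefficient on x_endog should be near 2.0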
plt.show()

# Or it can be plotted another way
plt.scatter(CollegeDistance['distance'], CollegeDistance['education'])

# Run the regression
reg = smf.ols('education ~ distance', data=CollegeDistance).fit()
print(reg.summary())

# Still true with controls?
reg = smf.ols('education ~ distance + gender + ethnicity + unemp + urban',
              data=CollegeDistance).fit()
print(reg.summary())

# And robust standard errors
reg_robust = reg.get_robustcov_results(cov_type='HC1')
print(reg_robust.summary())

# Ok, so let's use it as an instrument.
# To run an instrumental variables regression, use the command IV2SLS:
CollegeDistance['const'] = 1
iv = IV2SLS(dependent=CollegeDistance['wage'],
            exog=CollegeDistance[['const', 'gender', 'ethnicity', 'unemp', 'urban']],
            endog=CollegeDistance['education'],
            instruments=CollegeDistance['distance']).fit()
print(iv.summary)
def table13_ext9(df, name, trust):
    dependent = ['voice', 'PolStab', 'GovEffec', 'RegQual', 'RulLaw', 'ConCorr']
    table = [f'table{i}' for i in range(6)]
    for dep, i in zip(dependent, range(6)):
        df_13 = df[[
            f'{name}_C2', f'{name}_instrument_C2_thresh', f'{name}_I', 'voice',
            'PolStab', 'GovEffec', 'RegQual', 'RulLaw', 'ConCorr', 'trust',
            'ethnic_party_dum', 'dummy_sepx_nm'
        ]].dropna(axis=0)
        y1 = df_13[f'{dep}']
        x1 = sm.add_constant(df_13[[f'{name}_C2', f'{name}_I']])
        x2 = sm.add_constant(df_13[[f'{name}_C2', f'{name}_I', 'trust']])
        x3 = sm.add_constant(df_13[[
            f'{name}_C2', f'{name}_I', 'trust', 'ethnic_party_dum', 'dummy_sepx_nm'
        ]])
        ins1 = sm.add_constant(df_13[[f'{name}_instrument_C2_thresh', f'{name}_I']])
        ins2 = sm.add_constant(df_13[[f'{name}_instrument_C2_thresh', f'{name}_I', 'trust']])
        ins3 = sm.add_constant(df_13[[
            f'{name}_instrument_C2_thresh', f'{name}_I', 'trust',
            'ethnic_party_dum', 'dummy_sepx_nm'
        ]])
        est = [f'est{i}' for i in range(6)]
        est[0] = sm.OLS(y1, x1).fit(cov_type='HC1')
        est[1] = sm.OLS(y1, x2).fit(cov_type='HC1')
        est[2] = sm.OLS(y1, x3).fit(cov_type='HC1')
        est[3] = IV2SLS(y1, x1, ins1).fit()
        est[4] = IV2SLS(y1, x2, ins2).fit()
        est[5] = IV2SLS(y1, x3, ins3).fit()
        if trust == 'trust':
            table[i] = pd.DataFrame(
                {
                    'OLS / Trust': [est[1].params.values[3], est[1].bse.values[3], est[1].pvalues[3]],
                    'OLS / All': [est[2].params.values[3], est[2].bse.values[3], est[2].pvalues[3]],
                    '2SLS / Trust': [est[4].params.values[3], est[4].bse.values[3], est[4].pvalues[3]],
                    '2SLS / All': [est[5].params.values[3], est[5].bse.values[3], est[5].pvalues[3]]
                },
                index=['Trust', 'Standard Error', 'p-value'])
            table[i].index = pd.MultiIndex.from_product([[f'{dep}'], table[i].index])
        else:
            table[i] = pd.DataFrame(
                {
                    'OLS / None': [est[0].params.values[1], est[0].bse.values[1], est[0].pvalues[1]],
                    'OLS / Trust': [est[1].params.values[1], est[1].bse.values[1], est[1].pvalues[1]],
                    'OLS / All': [est[2].params.values[1], est[2].bse.values[1], est[2].pvalues[1]],
                    '2SLS / None': [est[3].params.values[1], est[3].bse.values[1], est[3].pvalues[1]],
                    '2SLS / Trust': [est[4].params.values[1], est[4].bse.values[1], est[4].pvalues[1]],
                    '2SLS / All': [est[5].params.values[1], est[5].bse.values[1], est[5].pvalues[1]]
                },
                index=['Segregation', 'Standard Error', 'p-value'])
            table[i].index = pd.MultiIndex.from_product([[f'{dep}'], table[i].index])
    table = pd.concat(table)
    table = table.rename(index={
        'voice': 'Voice',
        'PolStab': 'Political stability',
        'GovEffec': 'Govern-t effectiv.',
        'RegQual': 'Regul. quality',
        'RulLaw': 'Rule of law',
        'ConCorr': 'Control of corr'
    })
    table.index.names = ['Dependent Var', '']
    return table
def df_table12(df, name):
    df_table12 = df[[
        f'{name}_C2', f'{name}_instrument_C2_thresh', f'{name}_I', 'trust',
        'democ', 'lnpopulation', 'lnArea', 'lnGDP_pc', 'protestants',
        'muslims', 'catholics', 'latitude', 'LOEnglish', 'LOGerman',
        'LOSocialist', 'LOScandin', 'mtnall'
    ]].dropna(axis=0)
    df_demo = df_table12[df_table12.democ > 1]

    dep1 = df_table12['trust']
    dep2 = df_demo['trust']
    exo1 = sm.add_constant(df_table12[f'{name}_C2'])
    exo2 = sm.add_constant(df_table12[[
        f'{name}_C2', f'{name}_I', 'lnpopulation', 'lnArea', 'lnGDP_pc',
        'protestants', 'muslims', 'catholics', 'latitude', 'LOEnglish',
        'LOGerman', 'LOSocialist', 'LOScandin', 'democ', 'mtnall'
    ]])
    exo3 = sm.add_constant(df_demo[[
        f'{name}_C2', f'{name}_I', 'lnpopulation', 'lnArea', 'lnGDP_pc',
        'protestants', 'muslims', 'catholics', 'latitude', 'LOEnglish',
        'LOGerman', 'LOSocialist', 'LOScandin', 'democ', 'mtnall'
    ]])
    ins1 = sm.add_constant(df_table12[f'{name}_instrument_C2_thresh'])
    ins2 = sm.add_constant(df_table12[[
        f'{name}_instrument_C2_thresh', f'{name}_I', 'lnpopulation', 'lnArea',
        'lnGDP_pc', 'protestants', 'muslims', 'catholics', 'latitude',
        'LOEnglish', 'LOGerman', 'LOSocialist', 'LOScandin', 'democ', 'mtnall'
    ]])
    ins3 = sm.add_constant(df_demo[[
        f'{name}_instrument_C2_thresh', f'{name}_I', 'lnpopulation', 'lnArea',
        'lnGDP_pc', 'protestants', 'muslims', 'catholics', 'latitude',
        'LOEnglish', 'LOGerman', 'LOSocialist', 'LOScandin', 'democ', 'mtnall'
    ]])

    reg1 = sm.OLS(dep1, exo1).fit(cov_type='HC1')
    reg2 = sm.OLS(dep1, exo2).fit(cov_type='HC1')
    reg3 = sm.OLS(dep2, exo3).fit(cov_type='HC1')
    reg4 = IV2SLS(dep1, exo1, ins1).fit()
    reg5 = IV2SLS(dep1, exo2, ins2).fit()
    reg6 = IV2SLS(dep2, exo3, ins3).fit()

    stargazer = Stargazer([reg1, reg2, reg3, reg4, reg5, reg6])
    stargazer.covariate_order([f'{name}_C2', f'{name}_I'])
    stargazer.rename_covariates({
        f'{name}_C2': 'Segregation $\hat{S}$ (' f'{name}' ')',
        f'{name}_I': 'Fractionalization $F$ (' f'{name}' ')'
    })
    stargazer.custom_columns(['OLS', 'OLS', 'OLS', '2SLS', '2SLS', '2SLS'],
                             [1, 1, 1, 1, 1, 1])
    stargazer.add_line('Controls', ['No', 'Yes', 'Yes', 'No', 'Yes', 'Yes'])
    stargazer.add_line('Sample', ['Full', 'Full', 'Democ', 'Full', 'Full', 'Democ'])
    if name == 'ethnicity':
        stargazer.title('Panel A. Ethnicity')
        return stargazer
    else:
        stargazer.title('Panel B. Language')
        return stargazer
def table10_11(df, name, democ):
    full_x = [
        f'{name}_I', f'{name}_C2', 'lnpopulation', 'lnGDP_pc', 'protestants',
        'muslims', 'catholics', 'latitude', 'LOEnglish', 'LOGerman',
        'LOSocialist', 'LOScandin', 'democ', 'mtnall'
    ]
    ins = [
        f'{name}_I', f'{name}_instrument_C2_thresh', 'lnpopulation',
        'lnGDP_pc', 'protestants', 'muslims', 'catholics', 'latitude',
        'LOEnglish', 'LOGerman', 'LOSocialist', 'LOScandin', 'democ', 'mtnall'
    ]
    df_10_11_1 = df[[
        f'{name}_C2', f'{name}_I', f'{name}_instrument_C2_thresh',
        'lnpopulation', 'lnGDP_pc', 'protestants', 'muslims', 'catholics',
        'latitude', 'LOEnglish', 'LOGerman', 'LOSocialist', 'LOScandin',
        'democ', 'mtnall', 'icrg_qog'
    ]].dropna(axis=0)
    df_10_11_2 = df[[
        f'{name}_C2', f'{name}_I', f'{name}_instrument_C2_thresh',
        'lnpopulation', 'lnGDP_pc', 'protestants', 'muslims', 'catholics',
        'latitude', 'LOEnglish', 'LOGerman', 'LOSocialist', 'LOScandin',
        'democ', 'mtnall', 'ef_regul', 'ef_corruption', 'ef_property_rights'
    ]].dropna(axis=0)
    df_10_11_3 = df[[
        f'{name}_C2', f'{name}_I', f'{name}_instrument_C2_thresh',
        'lnpopulation', 'lnGDP_pc', 'protestants', 'muslims', 'catholics',
        'latitude', 'LOEnglish', 'LOGerman', 'LOSocialist', 'LOScandin',
        'democ', 'mtnall', 'taxevas'
    ]].dropna(axis=0)

    if democ == 'democracy':
        df_10_11_1 = df_10_11_1[df_10_11_1.democ >= 1]
        df_10_11_2 = df_10_11_2[df_10_11_2.democ >= 1]
        df_10_11_3 = df_10_11_3[df_10_11_3.democ >= 1]
        x1 = sm.add_constant(df_10_11_1[full_x])
        x2 = sm.add_constant(df_10_11_2[full_x])
        x3 = sm.add_constant(df_10_11_3[full_x])
        ins1 = sm.add_constant(df_10_11_1[ins])
        ins2 = sm.add_constant(df_10_11_2[ins])
        ins3 = sm.add_constant(df_10_11_3[ins])
    else:
        x1 = sm.add_constant(df_10_11_1[[f'{name}_I', f'{name}_C2']])
        x2 = sm.add_constant(df_10_11_2[[f'{name}_I', f'{name}_C2']])
        x3 = sm.add_constant(df_10_11_3[[f'{name}_I', f'{name}_C2']])
        ins1 = sm.add_constant(df_10_11_1[[f'{name}_I', f'{name}_instrument_C2_thresh']])
        ins2 = sm.add_constant(df_10_11_2[[f'{name}_I', f'{name}_instrument_C2_thresh']])
        ins3 = sm.add_constant(df_10_11_3[[f'{name}_I', f'{name}_instrument_C2_thresh']])

    y1 = df_10_11_1['icrg_qog']
    y2 = df_10_11_2['ef_corruption']
    y3 = df_10_11_2['ef_property_rights']
    y4 = df_10_11_2['ef_regul']
    y5 = df_10_11_3['taxevas']

    est1 = sm.OLS(y1, x1).fit(cov_type='HC1')
    est2 = IV2SLS(y1, x1, ins1).fit()
    est3 = sm.OLS(y2, x2).fit(cov_type='HC1')
    est4 = IV2SLS(y2, x2, ins2).fit()
    est5 = sm.OLS(y3, x2).fit(cov_type='HC1')
    est6 = IV2SLS(y3, x2, ins2).fit()
    est7 = sm.OLS(y4, x2).fit(cov_type='HC1')
    est8 = IV2SLS(y4, x2, ins2).fit()
    est9 = sm.OLS(y5, x3).fit(cov_type='HC1')
    est10 = IV2SLS(y5, x3, ins3).fit()

    stargazer = Stargazer(
        [est1, est2, est3, est4, est5, est6, est7, est8, est9, est10])
    stargazer.custom_columns([
        'ICRG quality of gov', 'EF Corruption', 'EF Property rights',
        'EF Regulation', 'Tax eva'
    ], [2, 2, 2, 2, 2])
    stargazer.show_model_numbers(False)
    stargazer.covariate_order([f'{name}_C2', f'{name}_I'])
    stargazer.rename_covariates({
        f'{name}_C2': 'Segregation $\hat{S}$ (' f'{name}' ')',
        f'{name}_I': 'Fractionalization $F$ (' f'{name}' ')'
    })
    stargazer.add_line('Method', [
        'OLS', '2SLS', 'OLS', '2SLS', 'OLS', '2SLS', 'OLS', '2SLS', 'OLS', '2SLS'
    ])
    if democ == 'democracy':
        stargazer.title('Panel B. Democracies sample, all controls')
        return stargazer
    else:
        stargazer.title('Panel A. Full sample, no additional controls')
        return stargazer
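# Hypothetical calls to the table builders above (not from the original source),
# assuming `df` is the replication DataFrame that contains the segregation,
# instrument, and control columns referenced in the functions.
panel_a = table10_11(df, 'ethnicity', democ='full')        # full sample, no extra controls
panel_b = table10_11(df, 'ethnicity', democ='democracy')   # democracies sample, all controls
trust_panel = df_table12(df, 'language')                   # OLS vs 2SLS trust regressions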
# -*- coding: utf-8 -*-
#%% Load NumPy
import numpy as np
# Load SciPy's stats module
import scipy.stats as st
# Load statsmodels
import statsmodels.api as sm
# Load IV2SLS (two-stage least squares) and IVGMM (GMM estimation)
from statsmodels.sandbox.regression.gmm import IV2SLS, IVGMM

#%% Load the Mroz data set from Rdatasets
mroz = sm.datasets.get_rdataset('Mroz', 'Ecdat')
mroz.data = mroz.data[mroz.data['hearnw'] > 0]
print(st.pearsonr(mroz.data['educw'], mroz.data['educwf']))
print(st.pearsonr(mroz.data['educw'], mroz.data['educwm']))

#%% Simple regression explaining earnings by years of education
y = np.log(mroz.data['hearnw'])
x = mroz.data[['educw']]
X = sm.add_constant(x)
results_ols = sm.OLS(y, X).fit(use_t=False)
print(results_ols.summary())

#%% 2SLS using the parents' years of education as instruments
z = mroz.data[['educwf', 'educwm']]
Z = sm.add_constant(z)
results_iv = IV2SLS(y, X, instrument=Z).fit()
print(results_iv.summary())

#%% IV estimation by GMM instead of 2SLS
results_gmm = IVGMM(y, X, instrument=Z).fit()
print(results_gmm.summary())
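#%% Side-by-side comparison of the three estimates of the return to education
# (this comparison block is an addition to the script above, for illustration only)
import pandas as pd
print(pd.DataFrame({'OLS': np.asarray(results_ols.params),
                    '2SLS': np.asarray(results_iv.params),
                    'GMM': np.asarray(results_gmm.params)},
                   index=X.columns))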
def table3_7(df, regression_type):
    df_3_7E = df[[
        'ethnicity_C2', 'ethnicity_instrument_C2_thresh', 'ethnicity_I',
        'lnpopulation', 'lnGDP_pc', 'protestants', 'muslims', 'catholics',
        'latitude', 'LOEnglish', 'LOGerman', 'LOSocialist', 'lnArea',
        'LOScandin', 'democ', 'mtnall', 'RulLaw'
    ]].dropna(axis=0)
    df_3_7L = df[[
        'language_C2', 'language_instrument_C2_thresh', 'language_I',
        'lnpopulation', 'lnGDP_pc', 'protestants', 'muslims', 'catholics',
        'latitude', 'LOEnglish', 'LOGerman', 'LOSocialist', 'lnArea',
        'LOScandin', 'democ', 'mtnall', 'RulLaw'
    ]].dropna(axis=0)
    df_3_7R = df[[
        'religion_C2', 'religion_instrument_C2_thresh', 'religion_I',
        'lnpopulation', 'lnGDP_pc', 'protestants', 'muslims', 'catholics',
        'latitude', 'LOEnglish', 'LOGerman', 'LOSocialist', 'lnArea',
        'LOScandin', 'democ', 'mtnall', 'RulLaw'
    ]].dropna(axis=0)

    exo = sm.add_constant(df_3_7E[[
        'ethnicity_C2', 'ethnicity_I', 'lnpopulation', 'lnGDP_pc',
        'protestants', 'muslims', 'catholics', 'latitude', 'LOEnglish',
        'LOGerman', 'LOSocialist', 'LOScandin', 'lnArea', 'democ', 'mtnall'
    ]])
    exo2 = sm.add_constant(df_3_7E[['ethnicity_C2', 'ethnicity_I']])
    exo3 = sm.add_constant(df_3_7L[[
        'language_C2', 'language_I', 'lnpopulation', 'lnGDP_pc', 'protestants',
        'lnArea', 'muslims', 'catholics', 'latitude', 'LOEnglish', 'LOGerman',
        'LOSocialist', 'LOScandin', 'democ', 'mtnall'
    ]])
    exo4 = sm.add_constant(df_3_7L[['language_C2', 'language_I']])
    exo5 = sm.add_constant(df_3_7R[[
        'religion_C2', 'religion_I', 'lnpopulation', 'lnGDP_pc', 'protestants',
        'muslims', 'catholics', 'latitude', 'LOEnglish', 'LOGerman',
        'LOSocialist', 'lnArea', 'democ', 'mtnall'
    ]])
    exo6 = sm.add_constant(df_3_7R[['religion_C2', 'religion_I']])

    if regression_type == 'IV2SLS':
        reg = IV2SLS(
            df_3_7E['RulLaw'], exo,
            sm.add_constant(df_3_7E[[
                'ethnicity_instrument_C2_thresh', 'ethnicity_I', 'lnpopulation',
                'lnGDP_pc', 'protestants', 'muslims', 'catholics', 'latitude',
                'LOEnglish', 'LOGerman', 'LOSocialist', 'LOScandin', 'democ',
                'mtnall', 'lnArea'
            ]])).fit()
        reg2 = IV2SLS(
            df_3_7E['RulLaw'], exo2,
            sm.add_constant(
                df_3_7E[['ethnicity_instrument_C2_thresh', 'ethnicity_I']])).fit()
        reg3 = IV2SLS(
            df_3_7L['RulLaw'], exo3,
            sm.add_constant(df_3_7L[[
                'language_instrument_C2_thresh', 'language_I', 'lnpopulation',
                'lnGDP_pc', 'protestants', 'muslims', 'catholics', 'latitude',
                'LOEnglish', 'LOGerman', 'LOSocialist', 'LOScandin', 'democ',
                'mtnall', 'lnArea'
            ]])).fit()
        reg4 = IV2SLS(
            df_3_7L['RulLaw'], exo4,
            sm.add_constant(
                df_3_7L[['language_instrument_C2_thresh', 'language_I']])).fit()
        reg5 = IV2SLS(
            df_3_7R['RulLaw'], exo5,
            sm.add_constant(df_3_7R[[
                'religion_instrument_C2_thresh', 'religion_I', 'lnpopulation',
                'lnGDP_pc', 'protestants', 'muslims', 'catholics', 'latitude',
                'LOEnglish', 'LOGerman', 'LOSocialist', 'democ', 'mtnall',
                'lnArea'
            ]])).fit()
        reg6 = IV2SLS(
            df_3_7R['RulLaw'], exo6,
            sm.add_constant(
                df_3_7R[['religion_instrument_C2_thresh', 'religion_I']])).fit()
    elif regression_type == 'OLS':
        reg2 = sm.OLS(df_3_7E['RulLaw'], exo2).fit(cov_type='HC1')
        reg = sm.OLS(df_3_7E['RulLaw'], exo).fit(cov_type='HC1')
        reg4 = sm.OLS(df_3_7L['RulLaw'], exo4).fit(cov_type='HC1')
        reg3 = sm.OLS(df_3_7L['RulLaw'], exo3).fit(cov_type='HC1')
        reg6 = sm.OLS(df_3_7R['RulLaw'], exo6).fit(cov_type='HC1')
        reg5 = sm.OLS(df_3_7R['RulLaw'], exo5).fit(cov_type='HC1')

    stargazer = Stargazer([reg2, reg, reg4, reg3, reg6, reg5])
    stargazer.covariate_order([
        'ethnicity_C2', 'ethnicity_I', 'language_C2', 'language_I',
        'religion_C2', 'religion_I', 'lnpopulation', 'lnGDP_pc', 'lnArea',
        'protestants', 'muslims', 'catholics', 'latitude', 'LOEnglish',
        'LOGerman', 'LOSocialist', 'LOScandin', 'democ', 'mtnall', 'const'
    ])
    stargazer.rename_covariates({
        'ethnicity_C2': 'Segregation $\hat{S}$ (ethnicity)',
        'ethnicity_I': 'Fractionalization $F$ (ethnicity)',
        'language_C2': 'Segregation $\hat{S}$ (language)',
        'language_I': 'Fractionalization $F$ (language)',
        'religion_C2': 'Segregation $\hat{S}$ (religion)',
        'religion_I': 'Fractionalization $F$ (religion)',
        'lnpopulation': 'ln (population)',
        'lnGDP_pc': 'ln (GDP per capita)',
        'lnArea': 'ln (average size of region)',
        'protestants': 'Protestants share',
        'muslims': 'Muslims share',
        'catholics': 'Catholics share',
        'latitude': 'Latitude',
        'LOEnglish': 'English legal origin',
        'LOGerman': 'German legal origin',
        'LOSocialist': 'Socialist legal origin',
        'LOScandin': 'Scandinavian legal origin',
        'democ': 'Democratic tradition',
        'mtnall': 'Mountains',
        'const': 'Constant'
    })
    return HTML(stargazer.render_html())
### STORE DATA FOR 1995 ONLY TO AVOID WORKING WITH PANEL DATA
df_1995 = df[df['year'] == 1995]

### CALCULATE LINEAR REGRESSION - 1 STAGE OLS
lm = smf.ols('np.log(packpc) ~ np.log(real_price)', data=df_1995)
# ==> Coefficients are interpreted as the demand elasticity for cigarettes.
# ==> An increase of 1% in prices should cause a 1.21% decrease in demand.
fit_lm = lm.fit()
print(fit_lm.summary())

### CALCULATE LINEAR REGRESSION WITH INSTRUMENTAL VARIABLES AS ENDOGENOUS VARIABLES ARE INVOLVED IN THE CASE - 2 STAGE OLS
lm2_1 = smf.ols('np.log(real_price) ~ sales_tax', data=df_1995)
fit_lm2_1 = lm2_1.fit()
print(fit_lm2_1.summary())
# ==> As expected from the calculated correlations, R2 is relatively high.
# ==> In a univariate regression, R2 equals the squared correlation.

### 2 STAGE
orig = np.log(df_1995['real_price']).values
fitted = fit_lm2_1.fittedvalues.values
lm2_2 = smf.ols('np.log(packpc) ~ fitted', data=df_1995)
fit_lm2_2 = lm2_2.fit()
print(fit_lm2_2.summary())
# ==> This gives the corrected regression with the revised elasticity for cigarettes.
# ==> However, the standard errors from this manual second stage are not valid,
#     since they ignore the estimation error in the first-stage fitted values.

fit_lm2_2_iv = IV2SLS(np.log(df_1995['packpc']), fitted,
                      instrument=df_1995['sales_tax']).fit()
print(fit_lm2_2_iv.summary())
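### ONE-CALL 2SLS SKETCH (ILLUSTRATIVE ADDITION, NOT PART OF THE ORIGINAL SCRIPT)
# Equivalent to the two manual stages above: regress log(packs) on a constant and
# log(real_price), instrumenting the price with a constant and sales_tax.
import statsmodels.api as sm
exog_iv = sm.add_constant(np.log(df_1995['real_price']))
instr_iv = sm.add_constant(df_1995['sales_tax'])
fit_iv_direct = IV2SLS(np.log(df_1995['packpc']), exog_iv, instrument=instr_iv).fit()
print(fit_iv_direct.summary())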
# Run the hypothesis test that the coefficient on electric is 0:
hypothesis = '(electric = 0)'
print(relevancy_results.f_test(hypothesis))

# Part 4: Instrumenting using two-stage least squares
#
no_null_iv = fertility[(fertility['agefbrth'].notnull()) &
                       (fertility['electric'].notnull()) &
                       (fertility['monthfm'].notnull()) &
                       (fertility['ceb'].notnull()) &
                       (fertility['educ'].notnull()) &
                       (fertility['idlnchld'].notnull())]
endog = no_null_iv['agefbrth']
exog = no_null_iv[['monthfm', 'ceb', 'idlnchld', 'educ']]
instr = no_null_iv[['monthfm', 'ceb', 'idlnchld', 'electric']]
dep_var_iv = no_null_iv['agefbrth']
#
exog_constant = sm.add_constant(exog)
instr_constant = sm.add_constant(instr)
no_endog_results = IV2SLS(endog, exog_constant, instrument=instr_constant).fit()
#
no_endog_results.summary()
#
print_resids(no_endog_results.predict(), no_endog_results.resid)
#
print("the descriptive statistics for the errors and a histogram of them:\n\n",
      no_endog_results.resid.describe())
sns.distplot(no_endog_results.resid);

# Part 5: replicate using matrix algebra
x_mat_ols = np.matrix(x_const)
y_mat_ols = np.matrix(y)
y_mat_ols = np.reshape(y_mat_ols, (-1, 1))  # reshape so that it's a single column vector, not a row vector
b_ols = np.linalg.inv(x_mat_ols.T * x_mat_ols) * x_mat_ols.T * y_mat_ols
print(b_ols)
#
y_iv_mat = np.matrix(endog)
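# Sketch of the 2SLS estimator in matrix form, continuing Part 5 (an illustrative
# addition, not part of the original notebook). With P_Z = Z (Z'Z)^{-1} Z', the
# estimator is b_2sls = (X' P_Z X)^{-1} X' P_Z y, which should match
# no_endog_results.params from the IV2SLS fit above.
X_iv = np.asarray(exog_constant, dtype=float)
Z_iv = np.asarray(instr_constant, dtype=float)
y_iv = np.asarray(dep_var_iv, dtype=float).reshape(-1, 1)
P_Z = Z_iv @ np.linalg.inv(Z_iv.T @ Z_iv) @ Z_iv.T
b_2sls = np.linalg.inv(X_iv.T @ P_Z @ X_iv) @ (X_iv.T @ P_Z @ y_iv)
print(b_2sls)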
# model that *part* of the y_{t} - y_{t-1} is an independent endogenous variable
# To correct for this we would have to do the following
y_instrumented = macro_mod.wexog[0][:, 1]
whitened_ydiff = y_instrumented - y[:-1]
wexog = np.column_stack((macrodata['tbilrate'][1:], whitened_ydiff))
wexog = sm.add_constant(wexog, prepend=True)
correct_params = sm.GLS(macrodata['realinv'][1:], wexog).fit().params

print("If we correctly instrument everything, then these are the parameters")
print("for the second equation")
print(correct_params)
print("Compare to output of R script statsmodels/sandbox/tests/macrodata.s")

print('\nUsing IV2SLS')
from statsmodels.sandbox.regression.gmm import IV2SLS
miv = IV2SLS(macro_sys[0], macro_sys[1], instruments)
resiv = miv.fit()
print("equation 1")
print(resiv.params)
miv2 = IV2SLS(macro_sys[2], macro_sys[3], instruments)
resiv2 = miv2.fit()
print("equation 2")
print(resiv2.params)

### Below is the same example using Greene's data ###
run_greene = 0
if run_greene:
    try:
        data3 = np.genfromtxt('/home/skipper/school/MetricsII/Greene \
TableF5-1.txt',
def table8_9_ext5(df, name, GDP):
    df_8_9A = df[[
        f'{name}_C2', f'{name}_I', f'{name}_instrument_C2_thresh',
        'lnpopulation', 'lnGDP_pc', 'protestants', 'muslims', 'catholics',
        'latitude', 'LOEnglish', 'LOGerman', 'LOSocialist', 'LOScandin',
        'democ', 'mtnall', 'voice', 'PolStab', 'GovEffec', 'RegQual',
        'ConCorr', 'RulLaw'
    ]].dropna(axis=0)
    df_8_9B = df_8_9A[[
        f'{name}_C2', f'{name}_instrument_C2_thresh', f'{name}_I', 'voice',
        'PolStab', 'GovEffec', 'RegQual', 'ConCorr', 'RulLaw'
    ]]
    if GDP == 'democ':
        df_8_9C = df_8_9A[df_8_9A.democ >= 1]
    elif GDP == 'GDP':
        df_8_9C = df_8_9A[df_8_9A.lnGDP_pc >= 7]

    exoA = sm.add_constant(df_8_9A[[
        f'{name}_C2', f'{name}_I', 'lnpopulation', 'lnGDP_pc', 'protestants',
        'muslims', 'catholics', 'latitude', 'LOEnglish', 'LOGerman',
        'LOSocialist', 'LOScandin', 'democ', 'mtnall'
    ]])
    exoB = sm.add_constant(df_8_9B[[f'{name}_C2', f'{name}_I']])
    exoC = sm.add_constant(df_8_9C[[
        f'{name}_C2', f'{name}_I', 'lnpopulation', 'lnGDP_pc', 'protestants',
        'muslims', 'catholics', 'latitude', 'LOEnglish', 'LOGerman',
        'LOSocialist', 'LOScandin', 'democ', 'mtnall'
    ]])
    insA = sm.add_constant(df_8_9A[[
        f'{name}_instrument_C2_thresh', f'{name}_I', 'lnpopulation',
        'lnGDP_pc', 'protestants', 'muslims', 'catholics', 'latitude',
        'LOEnglish', 'LOGerman', 'LOSocialist', 'LOScandin', 'democ', 'mtnall'
    ]])
    insB = sm.add_constant(df_8_9B[[f'{name}_instrument_C2_thresh', f'{name}_I']])
    insC = sm.add_constant(df_8_9C[[
        f'{name}_instrument_C2_thresh', f'{name}_I', 'lnpopulation',
        'lnGDP_pc', 'protestants', 'muslims', 'catholics', 'latitude',
        'LOEnglish', 'LOGerman', 'LOSocialist', 'LOScandin', 'democ', 'mtnall'
    ]])

    df_8_9s = [df_8_9A, df_8_9B, df_8_9C]
    exos = [exoA, exoB, exoC]
    inss = [insA, insB, insC]
    y = [[f'y{idx}A', f'y{idx}B', f'y{idx}C'] for idx in range(1, 7)]
    est = [[f'est{idx}A', f'est{idx}B', f'est{idx}C'] for idx in range(1, 7)]
    star = ['starA', 'starB', 'starC']
    for idx, i in enumerate(['A', 'B', 'C']):
        y[0][idx] = df_8_9s[idx]['voice']
        y[1][idx] = df_8_9s[idx]['PolStab']
        y[2][idx] = df_8_9s[idx]['GovEffec']
        y[3][idx] = df_8_9s[idx]['RegQual']
        y[4][idx] = df_8_9s[idx]['RulLaw']
        y[5][idx] = df_8_9s[idx]['ConCorr']
        est[0][idx] = IV2SLS(y[0][idx], exos[idx], inss[idx]).fit()
        est[1][idx] = IV2SLS(y[1][idx], exos[idx], inss[idx]).fit()
        est[2][idx] = IV2SLS(y[2][idx], exos[idx], inss[idx]).fit()
        est[3][idx] = IV2SLS(y[3][idx], exos[idx], inss[idx]).fit()
        est[4][idx] = IV2SLS(y[4][idx], exos[idx], inss[idx]).fit()
        est[5][idx] = IV2SLS(y[5][idx], exos[idx], inss[idx]).fit()
        star[idx] = Stargazer([
            est[0][idx], est[1][idx], est[2][idx], est[3][idx], est[4][idx],
            est[5][idx]
        ])
    for i in range(3):
        star[i].covariate_order([f'{name}_C2', f'{name}_I'])
        star[i].rename_covariates({
            f'{name}_C2': 'Segregation $\hat{S}$ (' f'{name}' ')',
            f'{name}_I': 'Fractionalization $F$ (' f'{name}' ')'
        })
        star[i].show_model_numbers(False)
        star[i].custom_columns([
            'Voice', 'Political stability', 'Govern-t effectiv.',
            'Regul. quality', 'Rule of law', 'Control of corr'
        ], [1, 1, 1, 1, 1, 1])
    if GDP == 'democ':
        star[0].add_line('Controls', ['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes'])
        star[0].add_line('Sample', ['Full', 'Full', 'Full', 'Full', 'Full', 'Full'])
        star[1].add_line('Controls', ['No', 'No', 'No', 'No', 'No', 'No'])
        star[1].add_line('Sample', ['Full', 'Full', 'Full', 'Full', 'Full', 'Full'])
        star[2].add_line('Controls', ['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes'])
        star[2].add_line('Sample', ['Democ', 'Democ', 'Democ', 'Democ', 'Democ', 'Democ'])
        star[0].title('Panel A. Baseline: All controls and full sample')
        star[1].title('Panel B. No controls and full sample')
        star[2].title('Panel C. All controls; sample excludes dictatorship')
        return [star[0], star[1], star[2]]
    if GDP == 'GDP':
        if name == 'ethnicity':
            star[2].title('Panel A. Ethnicity: All controls; sample excludes poorest countries')
        elif name == 'language':
            star[2].title('Panel B. Language: All controls; sample excludes poorest countries')
        return star[2]