##############################################################################
########################## SECTION 2: FIRST STAGE and SECOND STAGE
########################## SECTION: 3.1.3 to 3.3.6 IN THE THESIS
##############################################################################

################################# with packages linear models (for corrected standard error on coeffs)

# Variable groups used by the 2SLS estimation
dep = ['success']
endog = ['duration']
instr = ['choc']
exog = IV_numeric_val + main_cat_vars + time_vars + location_vars
data = df[dep + endog + instr + exog]

# First stage: regress the endogenous variable on instrument + exogenous
# regressors (no endog/instruments passed, so this is a plain OLS fit)
res = IV2SLS(
    dependent=data[endog],
    exog=data[instr + exog],
    endog=None,
    instruments=None,
).fit()
print(res)

# Second stage: full 2SLS fit
res_2sls = IV2SLS(
    dependent=data[dep],
    exog=data[exog],
    endog=data[endog],
    instruments=data[instr],
).fit()

# Flatten the fitted values to a 1-D array of length df.shape[0]
fitted_values_ivreg = res_2sls.fitted_values.values.reshape(1, df.shape[0])[0]
# Threshold the fitted values at 0.5 to obtain 0/1 predictions
ivreg_y_hat = np.where(fitted_values_ivreg >= 0.5, 1, 0)
RMSE_ivreg = compute_RMSE(df['success'].values, ivreg_y_hat)
print(res_2sls.summary.as_latex())

################################# with simple statsmodels
def test_wu_hausman_smoke(data):
    """Smoke test: Wu-Hausman runs with default and explicit variable lists."""
    model = IV2SLS(
        dependent=data.dep,
        exog=data.exog,
        endog=data.endog,
        instruments=data.instr,
    )
    fitted = model.fit()
    # Default call (no variables specified)
    fitted.wu_hausman()
    # Restrict the test to the second endogenous column only
    second_endog = model.endog.cols[1]
    fitted.wu_hausman([second_endog])
def test_compare_single_single_parameter(data):
    """Compare a single result from a model with a single exogenous column."""
    first_column = data.exog[:, :1]
    single_result = IV2SLS(
        dependent=data.dep, exog=first_column, endog=None, instruments=None
    ).fit()
    comparison = compare([single_result])
    assert len(comparison.rsquared) == 1
    # Accessing the summary must not raise
    comparison.summary
def test_no_regressors(self, data):
    """Constructing a model with neither exog nor endog raises ValueError."""
    with pytest.raises(ValueError):
        IV2SLS(dependent=data.dep, exog=None, endog=None, instruments=None)
def test_2sls_ols_equiv(data):
    """Without endogenous regressors, 2SLS parameters equal the OLS solution."""
    fitted = IV2SLS(
        dependent=data.dep, exog=data.exog, endog=None, instruments=None
    ).fit()
    # Closed-form least-squares solution via the pseudo-inverse
    expected = (pinv(data.exog) @ data.dep).ravel()
    assert_allclose(fitted.params, expected)
def test_rank_deficient_exog(self, data):
    """Two identical (constant) exog columns must be rejected with ValueError."""
    deficient = data.exog.copy()
    # Overwrite the first two columns with ones so they are collinear
    deficient[:, :2] = 1
    with pytest.raises(ValueError):
        IV2SLS(
            dependent=data.dep,
            exog=deficient,
            endog=data.endog,
            instruments=data.instr,
        )
def monte_carlo(file, grid_points):
    """Estimate the ATE for increasing correlation between U1 and V.

    For every ``rho`` on an evenly spaced grid over [0, 0.99] the
    initialization file is read, the correlation structure is updated, a data
    set is simulated and the effect is computed with several strategies
    (true effect, grmpy, OLS, 2SLS, naive difference in means). The collected
    results are passed to ``create_plots``.

    Parameters
    ----------
    file : str
        Path of the initialization file handed to ``read``, ``create_data``
        and ``fit``.
    grid_points : int
        Number of ``rho`` values on the grid.

    Returns
    -------
    None
        The results are only forwarded to ``create_plots``.
    """
    # One list per estimation strategy / diagnostic
    effects = {
        key_: []
        for key_ in ["grmpy", "ols", "true", "random", "rho", "iv", "means"]
    }

    # Loop over different correlations between V and U_1
    for rho in np.linspace(0.00, 0.99, grid_points):
        effects["rho"] += [rho]

        # Readjust the initialization file values to add correlation
        model_spec = read(file)
        X = model_spec["TREATED"]["order"]
        update_correlation_structure(model_spec, rho)
        sim_spec = read(file)

        # Simulate a data set and pick outcome, covariates and instruments
        df_mc = create_data(file)
        wage = df_mc["wage"]
        exog = df_mc[X]
        exog_ols = df_mc[["state"] + X]
        instr = [i for i in sim_spec["CHOICE"]["order"] if i != "const"]

        # True average treatment effect
        effects["true"] += [np.mean(df_mc["wage1"] - df_mc["wage0"])]

        # Estimate via grmpy. Column means are requested explicitly:
        # np.mean(DataFrame) reduces over BOTH axes in pandas >= 2.0 and
        # would yield a scalar instead of one mean per covariate.
        rslt = fit(file)
        beta_diff = rslt["TREATED"]["params"] - rslt["UNTREATED"]["params"]
        effects["grmpy"] += [np.dot(exog.mean(axis=0), beta_diff)]

        # Estimate via OLS; "state" is the first regressor in exog_ols, and
        # label-based access avoids the deprecated positional Series lookup.
        ols = sm.OLS(wage, exog_ols).fit()
        effects["ols"] += [ols.params["state"]]

        # Estimate via 2SLS
        iv = IV2SLS(wage, exog, df_mc["state"], df_mc[instr]).fit()
        effects["iv"] += [iv.params["state"]]

        # Naive comparison of treated and untreated mean outcomes
        treated = df_mc[df_mc.state == 1]
        untreated = df_mc[df_mc.state == 0]
        mean_treated = np.mean(treated["wage"])
        mean_untreated = np.mean(untreated["wage"])
        effects["random"] += [mean_treated - mean_untreated]

        # Group means together with the group sizes (two entries per rho)
        effects["means"] += [
            [mean_treated, treated.shape[0]],
            [mean_untreated, untreated.shape[0]],
        ]

    create_plots(effects, effects["true"])
def test_invalid_weights(self, data):
    """All-zero weights must be rejected with ValueError."""
    zero_weights = np.zeros_like(data.dep)
    with pytest.raises(ValueError):
        IV2SLS(
            dependent=data.dep,
            exog=data.exog,
            endog=data.endog,
            instruments=data.instr,
            weights=zero_weights,
        )
def monte_carlo(file, which, grid_points=10):
    """
    Conduct a Monte Carlo simulation comparing true and estimated treatment
    parameters for increasing (absolute) correlation between U_1 and V
    (i.e. essential heterogeneity).

    In the example here, the correlation between U_1 and V becomes
    increasingly more negative. As we consider the absolute value of the
    correlation coefficient, values closer to -1 (or in the analogous case
    closer to +1) denote a higher degree of essential heterogeneity.

    The results can be used to evaluate the performance of different
    estimation strategies in the presence of essential heterogeneity.

    Options for *which* (case-insensitive):

        Comparison of ATE and TT
            - "conventional_average_effects"

        Different estimation strategies for ATE
            - "randomization" ("random")
            - "ordinary_least_squares" ("ols")
            - "instrumental_variables" ("iv")
            - "grmpy-par" ("grmpy")
            - "grmpy-semipar" ("grmpy-liv")

    Post-estimation: To plot the comparison between the true ATE
    and the respective parameter, use the function
        - plot_effects() for *which* = "conventional_average_effects", and
        - plot_estimates() else.

    The initialization file is rewritten while the simulation runs; the
    original correlation is written back (and the data re-simulated) even if
    an error occurs during the loop.

    Parameters
    ----------
    file: yaml
        grmpy initialization file, provides information for the simulation
        process.
    which: string
        String denoting whether conventional average effects shall be
        computed or, alternatively, which estimation approach shall be
        implemented for the ATE.
    grid_points: int, default 10
        Number of different values for rho, the correlation coefficient
        between U_1 and V, on the interval [0, -1), along which the parameters
        shall be evaluated.

    Returns
    -------
    effects: list
        If *which* = "conventional_average_effects", list of length
        *grid_points* whose entries are tuples (ATE, TT).
        Else, list of length *grid_points* containing an estimate of the ATE.

    Raises
    ------
    NotImplementedError
        If *which* is not one of the labels listed above.
    """
    # simulate a new data set with essential heterogeneity present
    model_dict = read(file)
    original_correlation = model_dict["DIST"]["params"][2]
    model_dict["DIST"]["params"][2] = -0.191
    print_dict(model_dict, file.replace(".grmpy.yml", ""))
    grmpy.simulate(file)

    effects = []

    try:
        label = which.lower()

        # Loop over different correlations between U_1 and V
        for rho in np.linspace(0.00, -0.99, grid_points):
            # Readjust the initialization file values to add correlation
            model_spec = read(file)
            X = model_spec["TREATED"]["order"]
            _update_correlation_structure(file, model_spec, rho)
            sim_spec = read(file)

            # Simulate a data set and specify exogenous and endogenous
            # variables
            df_mc = _create_data(file)
            treated = df_mc["D"] == 1
            Xvar = df_mc[X]
            instr = [i for i in sim_spec["CHOICE"]["order"] if i != "const"]

            # We calculate our parameter of interest
            if label == "conventional_average_effects":
                ATE = np.mean(df_mc["Y1"] - df_mc["Y0"])
                TT = np.mean(
                    df_mc["Y1"].loc[treated] - df_mc["Y0"].loc[treated]
                )
                stat = (ATE, TT)

            elif label in ["randomization", "random"]:
                stat = np.mean(df_mc[df_mc.D == 1]["Y"]) - np.mean(
                    df_mc[df_mc.D == 0]["Y"]
                )

            elif label in ["ordinary_least_squares", "ols"]:
                results = sm.OLS(df_mc["Y"], df_mc[["const", "D"]]).fit()
                # Label-based access: positional Series indexing with an
                # integer key is deprecated in pandas.
                stat = results.params["D"]

            elif label in ["instrumental_variables", "iv"]:
                iv = IV2SLS(df_mc["Y"], Xvar, df_mc["D"], df_mc[instr]).fit()
                stat = iv.params["D"]

            elif label in ["grmpy", "grmpy-par"]:
                rslt = grmpy.fit(file)
                beta_diff = (
                    rslt["TREATED"]["params"] - rslt["UNTREATED"]["params"]
                )
                # Column means are requested explicitly: np.mean(DataFrame)
                # reduces over both axes in pandas >= 2.0.
                stat = np.dot(Xvar.mean(axis=0), beta_diff)

            elif label in ["grmpy-semipar", "grmpy-liv"]:
                rslt = grmpy.fit(file, semipar=True)

                y0_fitted = np.dot(rslt["X"], rslt["b0"])
                y1_fitted = np.dot(rslt["X"], rslt["b1"])

                mte_x_ = y1_fitted - y0_fitted
                mte_u = rslt["mte_u"]

                # mte_mat[i, j] = mte_x_[i] + mte_u[j], built by broadcasting
                # instead of an explicit double loop
                mte_mat = np.asarray(mte_x_).reshape(-1, 1) + np.asarray(
                    mte_u
                ).reshape(1, -1)

                ate_tilde_p = np.mean(mte_mat, axis=1)
                stat = ate_tilde_p.mean()

            else:
                raise NotImplementedError

            effects += [stat]
    finally:
        # Restore original init file, also when the loop above failed
        model_dict = read(file)
        model_dict["DIST"]["params"][2] = original_correlation
        print_dict(model_dict, file.replace(".grmpy.yml", ""))
        grmpy.simulate(file)

    return effects
def test_wooldridge_smoke(data):
    """Both Wooldridge endogeneity tests return WaldTestStatistic objects."""
    fitted = IV2SLS(
        dependent=data.dep,
        exog=data.exog,
        endog=data.endog,
        instruments=data.instr,
    ).fit()
    regression_test = fitted.wooldridge_regression
    assert isinstance(regression_test, WaldTestStatistic)
    score_test = fitted.wooldridge_score
    assert isinstance(score_test, WaldTestStatistic)
def test_first_stage_summary(data):
    """The first-stage results expose a Summary instance."""
    model = IV2SLS(
        dependent=data.dep,
        exog=data.exog,
        endog=data.endog,
        instruments=data.instr,
    )
    fitted = model.fit()
    first_stage_summary = fitted.first_stage.summary
    assert isinstance(first_stage_summary, Summary)
def test_no_regressors_exception(data):
    """A model without any regressors cannot be constructed."""
    with pytest.raises(ValueError):
        IV2SLS(dependent=data.dep, exog=None, endog=None, instruments=None)
sns.boxplot(df_param[col]) plt.show() plt.figure(figsize=[11, 11]) sns.heatmap(df_param.corr(), annot=True, cmap="Oranges") #Corrélations des variables avec la marge 1 df_param.corr()['Marge_1'] # ## Estimation IV2SLS from sklearn import datasets, linear_model from sklearn.metrics import mean_squared_error, r2_score from linearmodels.iv import IV2SLS reg = IV2SLS.from_formula( ' l1 ~ 1 + sucres1 + sucres1_carre + [prix_1 ~ sucres2+ sucres2_carre]', base_reg) reg.fit() # # 300 Villes reg.fit() # ## 600 Villes reg.fit() # ## 1000 Villes reg.fit()
## First Stage

# Import and select the data
df4 = pd.read_stata(
    'https://github.com/QuantEcon/QuantEcon.lectures.code/raw/master/ols/maketable4.dta'
)
df4 = df4[df4['baseco'] == 1]
df4.head()

# add a constant variable
df4['const'] = 1

first_stage_model = sm.OLS(
    df4['avexpr'],
    df4[['const', 'logem4']],
    missing='drop',
)
results_fs = first_stage_model.fit()
print(results_fs.summary())

## second stage --> give unbiased and consistent estimates

# retrieve the predicted values of avexpr using .predict()
df4['predicted_avexpr'] = results_fs.predict()

second_stage_model = sm.OLS(
    df4['logpgp95'],
    df4[['const', 'predicted_avexpr']],
)
results_ss = second_stage_model.fit()
print(results_ss.summary())

### 2SLS Regression by IV2SLS
from linearmodels.iv import IV2SLS

iv_model = IV2SLS(
    dependent=df4['logpgp95'],
    exog=df4['const'],
    endog=df4['avexpr'],
    instruments=df4['logem4'],
)
iv = iv_model.fit(cov_type='unadjusted')
print(iv.summary)
def test_anderson_rubin(data):
    """Check nobs * (kappa - 1) against the stored reference value."""
    model = IV2SLS(
        dependent=data.dep,
        exog=data.exog,
        endog=data.endog[['x1']],
        instruments=data.instr,
    )
    fitted = model.fit(cov_type='unadjusted')
    statistic = fitted.nobs * (fitted._liml_kappa - 1)
    assert_allclose(statistic, .176587, rtol=1e-4)
def monte_carlo(file, which, grid_points=10):
    """Estimate effect parameters for increasing essential heterogeneity.

    Essential heterogeneity is reflected by increasing (absolute) correlation
    between U_1 and V. For every ``rho`` on an evenly spaced grid from 0 to
    -0.99 the initialization file is updated, a data set is simulated and the
    parameter selected by *which* is computed.

    The initialization file is rewritten while the simulation runs; the
    original correlation is written back (and the data re-simulated) even if
    an error occurs during the loop.

    Parameters
    ----------
    file : str
        Path of the grmpy initialization file (expected to end in
        ".grmpy.yml").
    which : str
        Case-insensitive label selecting the parameter:
        "conventional_average_effects", "random"/"randomization",
        "ols"/"ordinary_least_squares", "iv"/"instrumental_variables",
        "grmpy"/"grmpy-par", or "grmpy-semipar"/"grmpy-liv".
    grid_points : int, default 10
        Number of ``rho`` values on the grid.

    Returns
    -------
    effects : list
        One entry per grid point: a tuple (ATE, TT) for
        "conventional_average_effects", otherwise an estimate of the ATE.

    Raises
    ------
    NotImplementedError
        If *which* is not one of the labels listed above.
    """
    # simulate a new data set with essential heterogeneity present
    model_dict = read(file)
    original_correlation = model_dict["DIST"]["params"][2]
    model_dict["DIST"]["params"][2] = -0.191
    print_dict(model_dict, file.replace(".grmpy.yml", ""))
    grmpy.simulate(file)

    effects = []

    try:
        label = which.lower()

        # Loop over different correlations between V and U_1
        for rho in np.linspace(0.00, -0.99, grid_points):
            # Readjust the initialization file values to add correlation
            model_spec = read(file)
            X = model_spec["TREATED"]["order"]
            update_correlation_structure(file, model_spec, rho)
            sim_spec = read(file)

            # Simulate a data set and specify exogenous and endogenous
            # variables
            df_mc = create_data(file)
            treated = df_mc["D"] == 1
            Xvar = df_mc[X]
            instr = [i for i in sim_spec["CHOICE"]["order"] if i != "const"]

            # We calculate our parameter of interest
            if label == "conventional_average_effects":
                ATE = np.mean(df_mc["Y1"] - df_mc["Y0"])
                TT = np.mean(
                    df_mc["Y1"].loc[treated] - df_mc["Y0"].loc[treated]
                )
                stat = (ATE, TT)

            elif label in ["random", "randomization"]:
                stat = np.mean(df_mc[df_mc.D == 1]["Y"]) - np.mean(
                    df_mc[df_mc.D == 0]["Y"]
                )

            elif label in ["ordinary_least_squares", "ols"]:
                results = sm.OLS(df_mc["Y"], df_mc[["const", "D"]]).fit()
                # Label-based access: positional Series indexing with an
                # integer key is deprecated in pandas.
                stat = results.params["D"]

            elif label in ["instrumental_variables", "iv"]:
                iv = IV2SLS(df_mc["Y"], Xvar, df_mc["D"], df_mc[instr]).fit()
                stat = iv.params["D"]

            elif label in ["grmpy", "grmpy-par"]:
                rslt = grmpy.fit(file)
                beta_diff = (
                    rslt["TREATED"]["params"] - rslt["UNTREATED"]["params"]
                )
                # Column means are requested explicitly: np.mean(DataFrame)
                # reduces over both axes in pandas >= 2.0.
                stat = np.dot(Xvar.mean(axis=0), beta_diff)

            elif label in ["grmpy-semipar", "grmpy-liv"]:
                rslt = grmpy.fit(file, semipar=True)

                y0_fitted = np.dot(rslt["X"], rslt["b0"])
                y1_fitted = np.dot(rslt["X"], rslt["b1"])

                mte_x_ = y1_fitted - y0_fitted
                mte_u = rslt["mte_u"]

                # mte_mat[i, j] = mte_x_[i] + mte_u[j], built by broadcasting
                # instead of an explicit double loop
                mte_mat = np.asarray(mte_x_).reshape(-1, 1) + np.asarray(
                    mte_u
                ).reshape(1, -1)

                ate_tilde_p = np.mean(mte_mat, axis=1)
                stat = ate_tilde_p.mean()

            else:
                raise NotImplementedError

            effects += [stat]
    finally:
        # Restore original init file, also when the loop above failed
        model_dict = read(file)
        model_dict["DIST"]["params"][2] = original_correlation
        print_dict(model_dict, file.replace(".grmpy.yml", ""))
        grmpy.simulate(file)

    return effects
def test_basmann_f(data):
    """Basmann F statistic and p-value match the stored reference values."""
    model = IV2SLS(
        dependent=data.dep,
        exog=data.exog,
        endog=data.endog[['x1']],
        instruments=data.instr,
    )
    basmann = model.fit(cov_type='unadjusted').basmann_f
    assert_allclose(basmann.stat, .174821, rtol=1e-4)
    assert_allclose(basmann.pval, 0.6760, rtol=1e-3)
def test_firstdifference_ols(data):
    """Check FirstDifferenceOLS against IV2SLS fitted on hand-built differences.

    The dependent and exogenous data are differenced manually from the 3-d
    value arrays, rows with missing values are dropped, and an IV2SLS model
    without endogenous regressors is fitted on the result. The two sets of
    results are compared for the unadjusted, robust and clustered covariance
    estimators.
    """
    mod = FirstDifferenceOLS(data.y, data.x)
    res = mod.fit(debiased=False)

    y = mod.dependent.values3d
    x = mod.exog.values3d
    # Difference of consecutive slices along axis 1 of the first 3-d entry
    dy = np.array(y[0, 1:] - y[0, :-1])
    dy = pd.DataFrame(
        dy,
        index=mod.dependent.panel.major_axis[1:],
        columns=mod.dependent.panel.minor_axis,
    )
    # Reshape to a stacked Series and align it with the model's index
    dy = dy.T.stack()
    dy = dy.reindex(mod.dependent.index)

    # Same differencing for every exogenous variable
    dx = x[:, 1:] - x[:, :-1]
    _dx = {}
    for i, dxi in enumerate(dx):
        temp = pd.DataFrame(
            dxi,
            index=mod.dependent.panel.major_axis[1:],
            columns=mod.dependent.panel.minor_axis,
        )
        temp = temp.T.stack()
        temp = temp.reindex(mod.dependent.index)
        _dx[mod.exog.vars[i]] = temp
    # NOTE: ``i`` is the last loop index here; its Series supplies the index
    dx = pd.DataFrame(index=_dx[mod.exog.vars[i]].index)
    for key in _dx:
        dx[key] = _dx[key]
    # Put the columns in the model's variable order
    dx = dx[mod.exog.vars]

    # Drop every row with a missing value in dy or in any column of dx
    drop = dy.isnull() | np.any(dx.isnull(), 1)
    dy = dy.loc[~drop]
    dx = dx.loc[~drop]

    # Reference model: no endogenous regressors, no instruments
    ols_mod = IV2SLS(dy, dx, None, None)
    ols_res = ols_mod.fit(cov_type="unadjusted")
    assert_results_equal(res, ols_res)

    res = mod.fit(cov_type="robust", debiased=False)
    ols_res = ols_mod.fit(cov_type="robust")
    assert_results_equal(res, ols_res)

    # Clustered covariance with user-supplied clusters
    clusters = data.vc1
    ols_clusters = mod.reformat_clusters(data.vc1)
    fd = mod.dependent.first_difference()
    # Keep only the cluster rows that exist in the first-differenced data
    ols_clusters = ols_clusters.dataframe.loc[fd.index]
    res = mod.fit(cov_type="clustered", clusters=clusters, debiased=False)
    ols_res = ols_mod.fit(cov_type="clustered", clusters=ols_clusters)
    assert_results_equal(res, ols_res)

    # Clustered covariance by entity
    res = mod.fit(cov_type="clustered", cluster_entity=True, debiased=False)
    entity_clusters = mod.dependent.first_difference().entity_ids
    ols_res = ols_mod.fit(cov_type="clustered", clusters=entity_clusters)
    assert_results_equal(res, ols_res)

    # Entity clusters combined with the user-supplied clusters
    ols_clusters["entity.clusters"] = entity_clusters
    ols_clusters = ols_clusters.astype(np.int32)
    res = mod.fit(
        cov_type="clustered", cluster_entity=True, clusters=data.vc1, debiased=False)
    ols_res = ols_mod.fit(cov_type="clustered", clusters=ols_clusters)
    assert_results_equal(res, ols_res)
def diagnostics(self):
    """
    Post estimation diagnostics of first-stage fit

    Returns
    -------
    res : DataFrame
        One row per endogenous variable; the columns hold alternative
        measures of first-stage fit:

        * rsquared - R-squared taken from the individual first-stage
          result for the endogenous variable
        * partial.rsquared - R-squared from the regression of the
          endogenous variable on the instruments after both have been
          orthogonalized to the exogenous regressors in the model
        * shea.rsquared - Shea's r-squared, computed from the ratio of
          squared standard errors of an OLS fit and a 2SLS fit (both
          unadjusted) scaled by (1 - R2_2sls) / (1 - R2_ols)
        * f.stat - Quadratic form ``params' inv(cov) params`` from the
          partial regression, reported through a WaldTestStatistic whose
          degrees of freedom equal the number of estimated parameters
        * f.pval - P-value of that test statistic
    """
    from linearmodels.iv.model import _OLS, IV2SLS

    endog = self.endog
    exog = self.exog
    instr = self.instr
    weights = self.weights

    # Weighted instruments and exogenous regressors
    root_w = sqrt(weights.ndarray)
    weighted_instr = root_w * instr.ndarray
    weighted_exog = root_w * exog.ndarray
    # Projection onto the exogenous regressors and residualized instruments
    proj_exog = weighted_exog @ pinv(weighted_exog)
    resid_instr = weighted_instr - proj_exog @ weighted_instr

    per_variable = {}
    first_stage_results = self.individual
    for name in endog.pandas:
        first_stage_r2 = first_stage_results[name].rsquared

        # Residualize the (weighted) endogenous variable on the exog
        weighted_endog = root_w * endog.pandas[[name]].values
        resid_endog = weighted_endog - proj_exog @ weighted_endog
        partial_fit = _OLS(resid_endog, resid_instr).fit(
            self._cov_type, **self._cov_config
        )

        # Joint test that all coefficients of the partial regression are 0
        coefs = partial_fit.params.values[:, None]
        quad_form = coefs.T @ inv(partial_fit.cov) @ coefs
        joint_test = WaldTestStatistic(
            float(quad_form.squeeze()), null='', df=coefs.shape[0]
        )

        per_variable[name] = Series({
            'rsquared': first_stage_r2,
            'partial.rsquared': partial_fit.rsquared,
            'f.stat': joint_test.stat,
            'f.pval': joint_test.pval,
        })
    table = DataFrame(per_variable).T

    # Shea's r-squared from the unadjusted 2SLS and OLS fits
    dependent = self.dep
    fit_2sls = IV2SLS(
        dependent, exog, endog, instr, weights=weights
    ).fit('unadjusted')
    fit_ols = _OLS(dependent, self._reg, weights=weights).fit('unadjusted')
    se_ratio_sq = (fit_ols.std_errors / fit_2sls.std_errors)**2
    shea = se_ratio_sq * ((1 - fit_2sls.rsquared) / (1 - fit_ols.rsquared))
    table['shea.rsquared'] = shea[table.index]

    table = table[[
        'rsquared', 'partial.rsquared', 'shea.rsquared', 'f.stat', 'f.pval'
    ]]
    # Make sure every column is numeric
    for column in table:
        table[column] = to_numeric(table[column])

    return table
def test_firstdifference_ols_weighted(data):
    """Check weighted FirstDifferenceOLS against a weighted IV2SLS fit.

    The dependent data, exogenous data and weights are transformed manually
    from the 3-d value arrays, rows with missing values are dropped, and an
    IV2SLS model without endogenous regressors is fitted with the derived
    weights. Results are compared for the unadjusted, robust and clustered
    covariance estimators.
    """
    mod = FirstDifferenceOLS(data.y, data.x, weights=data.w)
    res = mod.fit(debiased=False)

    y = mod.dependent.values3d
    x = mod.exog.values3d
    # Difference of consecutive slices along axis 1 of the first 3-d entry
    dy = np.array(y[0, 1:] - y[0, :-1])
    dy = pd.DataFrame(
        dy,
        index=mod.dependent.panel.major_axis[1:],
        columns=mod.dependent.panel.minor_axis,
    )
    # Reshape to a stacked Series and align it with the model's index
    dy = dy.T.stack()
    dy = dy.reindex(mod.dependent.index)

    # Same differencing for every exogenous variable
    dx = x[:, 1:] - x[:, :-1]
    _dx = {}
    for i, dxi in enumerate(dx):
        temp = pd.DataFrame(
            dxi,
            index=mod.dependent.panel.major_axis[1:],
            columns=mod.dependent.panel.minor_axis,
        )
        temp = temp.T.stack()
        temp = temp.reindex(mod.dependent.index)
        _dx[mod.exog.vars[i]] = temp
    # NOTE: ``i`` is the last loop index here; its Series supplies the index
    dx = pd.DataFrame(index=_dx[mod.exog.vars[i]].index)
    for key in _dx:
        dx[key] = _dx[key]
    # Put the columns in the model's variable order
    dx = dx[mod.exog.vars]

    # Weights for the differenced data: invert, add the two adjacent
    # inverse weights, invert back, then normalize to mean one
    w = mod.weights.values3d
    w = 1.0 / w
    sw = w[0, 1:] + w[0, :-1]
    sw = pd.DataFrame(
        sw,
        index=mod.dependent.panel.major_axis[1:],
        columns=mod.dependent.panel.minor_axis,
    )
    sw = sw.T.stack()
    sw = sw.reindex(mod.dependent.index)
    sw = 1.0 / sw
    sw = sw / sw.mean()

    # Drop every row with a missing value in dy, any column of dx, or sw
    drop = dy.isnull() | np.any(dx.isnull(), 1) | sw.isnull()
    dy = dy.loc[~drop]
    dx = dx.loc[~drop]
    sw = sw.loc[~drop]

    # Reference model: no endogenous regressors, no instruments
    ols_mod = IV2SLS(dy, dx, None, None, weights=sw)
    ols_res = ols_mod.fit(cov_type="unadjusted")
    assert_results_equal(res, ols_res)

    res = mod.fit(cov_type="robust", debiased=False)
    ols_res = ols_mod.fit(cov_type="robust")
    assert_results_equal(res, ols_res)

    # Clustered covariance with user-supplied clusters
    clusters = data.vc1
    ols_clusters = mod.reformat_clusters(data.vc1)
    fd = mod.dependent.first_difference()
    # Keep only the cluster rows that exist in the first-differenced data
    ols_clusters = ols_clusters.dataframe.loc[fd.index]
    res = mod.fit(cov_type="clustered", clusters=clusters, debiased=False)
    ols_res = ols_mod.fit(cov_type="clustered", clusters=ols_clusters)
    assert_results_equal(res, ols_res)
sessions_AcceptedInvo['Total_Service_Duration'].mean()

# Number of sessions with outcome 1 or 2 per second of total queue time
(
    (sessions_AcceptedInvo[' outcome'] == 1).sum()
    + (sessions_AcceptedInvo[' outcome'] == 2).sum()
) / (sessions_AcceptedInvo[' queue_sec'].sum())

# check with linear regression what influences outcome Y -----
# and also is data for other treatments
dataForRegression = pd.read_csv('DataForRegression.csv', index_col=0)

# Full set of regressors for the first check
full_regressors = [
    'queue_sec', 'invite_type',
    'engagement_skill', 'target_skill', 'region', 'city', 'country',
    'continent', 'user_os',
    'browser', 'score', 'other_time', 'other_lines', 'other_number_words',
    'inner_wait', 'visitor_duration',
    'agent_duration', 'visitor_number_words', 'agent_number_words',
    'visitor_lines', 'agent_lines',
    'total_canned_lines', 'average_sent', 'min_sent', 'max_sent',
    'n_sent_pos', 'n_sent_neg', 'first_sent',
    'last_sent', 'id_rep_code',
    'Invitation_Acep_Day_of_week', 'Invitation_Acep_Hour',
    'NumberofAssigned', 'NumberofAssignedwhenAssigned',
    'Rho_atarrival',
]
regresion_check = IV2SLS(
    dataForRegression.Y,
    dataForRegression[full_regressors],
    None,
    None,
).fit(cov_type='unadjusted')
print(regresion_check)

# Reduced set of regressors for the second check
reduced_regressors = [
    'queue_sec', 'invite_type',
    'engagement_skill', 'target_skill', 'score', 'other_time',
    'agent_number_words',
    'visitor_lines', 'agent_lines',
]
regresion_check2 = IV2SLS(
    dataForRegression.Y,
    dataForRegression[reduced_regressors],
    None,
    None,
).fit(cov_type='unadjusted')
print(regresion_check2)
var_dependent = reputation_model[0][0] all_exogenous = reputation_model[1] all_endogeneous = reputation_model[2] all_instrumental = reputation_model[3] all_results = np.zeros((len(all_instrumental), len(all_endogeneous), 2)) for i, var_endogeneous in enumerate(all_endogeneous): print('*******' + var_endogeneous.upper() + '*******', file=open('Results/' + which_site + '_Results.txt', 'a')) print('OLS model with no control\n', file=open('Results/' + which_site + '_Results.txt', 'a')) res_ols = IV2SLS(df_covariates[var_dependent], df_covariates[[var_endogeneous, 'const']], None, None).fit(cov_type='unadjusted') print(res_ols, file=open('Results/' + which_site + '_Results.txt', 'a')) print( '*******************************************************************************\n', file=open('Results/' + which_site + '_Results.txt', 'a')) for j, var_instrumental in enumerate(all_instrumental): print('***' + var_endogeneous.upper() + ': ' + var_instrumental + '***', file=open('Results/' + which_site + '_Results.txt', 'a')) print('2SLS model with no control\n', file=open('Results/' + which_site + '_Results.txt', 'a')) # 2SLS function call: IV2SLS(dependent, exogeneous, endogeneous, instrumental) res_2sls = IV2SLS(
def test_too_few_instruments(self, data):
    """Endogenous regressors without any instruments raise ValueError."""
    with pytest.raises(ValueError):
        IV2SLS(
            dependent=data.dep,
            exog=data.exog,
            endog=data.endog,
            instruments=None,
        )
def test_wooldridge_score(data):
    """Wooldridge score statistic and p-value match the reference values."""
    model = IV2SLS(
        dependent=data.dep,
        exog=data.exog,
        endog=data.endog[['x1', 'x2']],
        instruments=data.instr,
    )
    score = model.fit(cov_type='robust').wooldridge_score
    assert_allclose(score.stat, 22.684, rtol=1e-4)
    assert_allclose(score.pval, 0.0000, atol=1e-4)
def test_durbin_smoke(data):
    """Smoke test: Durbin test runs with default and explicit variable lists."""
    model = IV2SLS(
        dependent=data.dep,
        exog=data.exog,
        endog=data.endog,
        instruments=data.instr,
    )
    fitted = model.fit()
    # Default call (no variables specified)
    fitted.durbin()
    # Restrict the test to the second endogenous column only
    second_endog = model.endog.cols[1]
    fitted.durbin([second_endog])
def test_wooldridge_regression(data):
    """Wooldridge regression statistic and p-value match the reference."""
    model = IV2SLS(
        dependent=data.dep,
        exog=data.exog,
        endog=data.endog[['x1', 'x2']],
        instruments=data.instr,
    )
    fitted = model.fit(cov_type='robust', debiased=True)
    regression_test = fitted.wooldridge_regression
    # Scale to correct for F vs Wald treatment
    expected_stat = 2 * 13.3461
    assert_allclose(regression_test.stat, expected_stat, rtol=1e-4)
    assert_allclose(regression_test.pval, 0.0000, atol=1e-4)
def test_wooldridge_smoke(data):
    """Smoke test: both Wooldridge endogeneity tests can be accessed."""
    fitted = IV2SLS(
        dependent=data.dep,
        exog=data.exog,
        endog=data.endog,
        instruments=data.instr,
    ).fit()
    # Attribute access alone must not raise
    fitted.wooldridge_regression
    fitted.wooldridge_score
def test_wooldridge_overid(data):
    """Wooldridge overidentification statistic and p-value match the reference."""
    model = IV2SLS(
        dependent=data.dep,
        exog=data.exog,
        endog=data.endog[['x1']],
        instruments=data.instr,
    )
    overid = model.fit(cov_type='robust').wooldridge_overid
    assert_allclose(overid.stat, 0.221648, rtol=1e-4)
    assert_allclose(overid.pval, 0.6378, rtol=1e-3)
def test_first_stage_summary(data):
    """Smoke test: the first-stage summary can be produced."""
    model = IV2SLS(
        dependent=data.dep,
        exog=data.exog,
        endog=data.endog,
        instruments=data.instr,
    )
    fitted = model.fit()
    # Attribute access alone must not raise
    fitted.first_stage.summary
dif=abs(row[:-1]-dummT0V[:,:-1]).sum(axis=1) #index #dif.argmin() #value distance=row[-1]-dummT0V[dif.argmin(),-1] ITE.append(distance) #Sum of the Treated/number of Treated ATE_Matchingbii = float(sum(ITE))/float(len(dataFirstTB)) # ---------matching--------- #-----------IV's------------- corralation=dataFirstTB.corr() res_second = IV2SLS(dataFirstTB.Y,dataFirstTB[['invite_type','engagement_skill','Rho_atarrival','region','city','country','continent','user_os','browser','score','Invitation_Acep_Hour']],\ dataFirstTB.WaitTreatment, dataFirstTB.Invitation_Acep_Day_of_week).fit(cov_type='unadjusted') print(res_second) covariance(dataFirstT.queue_sec,dataFirstT.Rho_atarrival) #-0.12 corralation=dataFirstT[['queue_sec','Rho_atarrival','invite_type',\ 'engagement_skill','target_skill']].corr() #-----------IV's------------- #FIRST TREATMENT ii -30 sec dataFirstT = pd.read_csv('DataForFirstTreatmentii.csv', index_col=0)