Example No. 1
from linearmodels.iv import IV2SLS


def test_ols_formula(data):
    # GH 185
    data = data.copy()
    fmla = "y ~ 1 + x1"
    mod = IV2SLS.from_formula(fmla, data)
    res = mod.fit()
    assert "OLS Estimation Summary" in str(res)
Example No. 2
def iv_2sls(data, formula, weights=None, cov="robust", clusters=None):
    """
    Fits a 2SLS model with the chosen covariance estimator.
    Endogenous terms are specified with bracket syntax: lwage ~ 1 + [educ ~ psem + educ_married] + age...
    Remember to include an intercept in the formula ('y ~ 1 + x1 + ...') and to assign the returned results to an object!

    :param data : DataFrame containing the variables referenced in the formula
    :param formula : patsy formula ('lwage ~ 1 + [educ ~ psem + educ_married] + age + agesq...')
    :param weights : N x 1 Series or vector of weights to be used in estimation; defaults to None
        Recommended when analyzing survey data, passing the weight provided by the survey
    :param cov : str
        unadjusted: conventional (homoskedastic) standard errors
        robust: heteroskedasticity-robust standard errors
        kernel: robust to heteroskedasticity AND serial correlation
        clustered: standard errors clustered by the chosen column(s)
    :param clusters : str or list with the names of the DataFrame columns to cluster by
        Should only be used when cov="clustered"
    :return : fitted linearmodels results instance
    """

    ## Creating model instance
    if weights is None:
        mod = IV2SLS.from_formula(formula=formula, data=data)
    else:
        mod = IV2SLS.from_formula(formula=formula, data=data, weights=weights)

    ## Fitting with desired covariance matrix
    if cov == 'clustered':
        mod = mod.fit(cov_type='clustered', clusters=data[clusters])
    else:
        mod = mod.fit(cov_type=cov)

    ## Summary
    print(mod.summary)

    # Helpful information
    print("To see 1st stage results (and if the instruments are relevant with Partial P-Value), call 'mod.first_stage")
    print("To check if the instrumentated variable is exogenous, call 'mod.wooldridge_regression'.")
    print("To test for the instruments exogeneity (when they are more numerous than the number of endogenous variables")
    print("- therefore, are overidentified restrictions), call 'mod.wooldridge_overid' (Ho: instruments are exogenous)")

    ## Returning the object
    return mod
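
A minimal usage sketch of this helper (the DataFrame `wage_df`, its input file, and the `region` cluster column are assumed here and are not part of the original snippet):

import pandas as pd

# Hypothetical survey data with the columns referenced in the formula (assumed, not from the source)
wage_df = pd.read_csv('wages.csv')

# Cluster-robust standard errors, clustered on the assumed 'region' column
res = iv_2sls(
    data=wage_df,
    formula='lwage ~ 1 + [educ ~ psem + educ_married] + age + agesq',
    cov='clustered',
    clusters='region',
)
print(res.params)  # point estimates from the fitted results object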
Example No. 3
    for _i in range(sim_N):

        # Draw correlated disturbances (u, v) and an independent instrument z
        draws = multivariate_normal([0, 0], cov_matrix, size=N)
        u = draws[:, 0]
        v = draws[:, 1]
        z = np.random.randn(N)

        # Data-generating process: first stage x = z*pi + v, structural equation y = x*beta + u
        x = z * pi + v
        y = x * beta + u
        data = pd.DataFrame({'x': x, 'y': y, 'z': z})

        # First-stage OLS of x on z (no constant) to gauge instrument strength
        first_stage = sm.OLS(x, z).fit()
        pi_sim = first_stage.params[0]
        pi_result.append(first_stage.tvalues[0])

        # 2SLS estimate of beta and its t-statistic relative to 1
        iv_res = IV2SLS.from_formula('y ~ [x ~ z]', data).fit()
        beta_t = (iv_res.params[0] - 1) / iv_res.std_errors[0]

        # Keep only draws where the first-stage F-statistic signals a strong instrument
        if np.abs(first_stage.fvalue) >= 10:
            beta_t_result.append(beta_t)
            beta_result.append(iv_res.params[0])

    # Share of simulations with a significant first-stage t-statistic and a significant IV t-statistic
    significant_pi = 100 - (percentileofscore(pi_result, 1.96) - percentileofscore(pi_result, -1.96))
    significant_beta = np.sum([1 for t in beta_t_result if np.abs(t) >= 1.96]) / len(beta_t_result)

    # Summary statistics of the 2SLS estimates for this value of pi
    beta_df[pi] = significant_beta
    beta_df.loc['median', pi] = np.median(beta_result)
    beta_df.loc['mean', pi] = np.mean(beta_result)
    beta_df.loc['min', pi] = stats.describe(beta_result)[1][0]
    beta_df.loc['max', pi] = stats.describe(beta_result)[1][1]
    beta_df.loc['variance', pi] = stats.describe(beta_result)[3]
    beta_df.loc['skewness', pi] = stats.describe(beta_result)[4]
Then, we get the fitted values $\hat{D}_r$ by running an ordinary least squares (OLS) regression and plug them into the equation:

$$Y = \beta_0+\rho \hat{D}_r+ \beta_1r+\epsilon$$

The logic is that the "purified" $\hat{D}_r$ is uncorrelated with the error term $\epsilon$. Now we can run an ordinary least squares (OLS) regression to get the isolated effect of $\hat{D}_r$ on $Y$; that is, $\rho$ will be the unbiased causal effect.
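
A minimal sketch of this manual two-stage procedure, assuming a DataFrame `df` with columns `Y`, `D`, `r`, and an instrument `Z` (all hypothetical names, not from the dataset used below):

import statsmodels.formula.api as smf

# First stage: regress the endogenous treatment D on the instrument Z and the running variable r
first_stage = smf.ols('D ~ Z + r', data=df).fit()
df['D_hat'] = first_stage.fittedvalues

# Second stage: regress Y on the fitted ("purified") treatment and r
second_stage = smf.ols('Y ~ D_hat + r', data=df).fit()
print(second_stage.params['D_hat'])  # rho, the causal effect of interest

Doing the two stages by hand recovers the 2SLS point estimate, but the second-stage standard errors are not correct; the IV2SLS estimator used below runs both stages and computes the proper covariance in one call.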

# Install library to run Instrumental Variable estimation
!pip install linearmodels

The computer automatically runs the two stages of the Instrumental Variable (IV) procedure. We indicate that the endogenous variable $D_r$ is "Share of Protestants" and that the instrumental variable $Z$ is "vaud". We also add the control variable "t_dist", the interaction between "vaud" and "Border Distance".

The result is that "Share of Protestants" decreases the preference for leisure by 13.4%.

from linearmodels.iv import IV2SLS
iv = 'Preference_for_Leisure ~ 1 + Border_Distance_in_Km + t_dist + [Share_of_Protestants ~ vaud]'
iv_result = IV2SLS.from_formula(iv, df5).fit(cov_type='robust')

print(iv_result)

We can also check the first stage to see whether the instrumental variable "vaud" is correlated with "Share of Protestants" after controlling for other factors such as "Border Distance" and "t_dist". Vaud increases the share of Protestants by 67%. The t-value of "vaud" is 20, that is, statistically significant beyond any reasonable doubt.

Therefore, we are confident that the second-stage result is more credible than the simple mean comparison: the Instrumental Variable impact of 13.4% versus the simple mean difference of 8.7%.

print(iv_result.first_stage)

The simple mean comparison of 8.7% is close to the 9% obtained from the naive Sharp Regression Discontinuity (SRD) below: the Vaud region has a 9% lower preference for leisure than Fribourg. We cannot conclude, however, that Protestants have a 9% lower preference for leisure than Catholics, because the Vaud region is not 100% Protestant, nor is the Fribourg region 100% Catholic.

The Fuzzy Regression Discontinuity (FRD), which uses the Instrumental Variable (IV) estimation, corrects this naive comparison. The FRD isolates the impact of being Protestant on the preference for leisure. Therefore, the most credible estimate is that Protestants have a 13.4% lower preference for leisure than Catholics.

# Naive SRD: the formula has no endogenous block, so this reduces to OLS of the outcome on the cutoff dummy and controls
naive_srd = 'Preference_for_Leisure ~ 1 + vaud + Border_Distance_in_Km + t_dist'
srd = IV2SLS.from_formula(naive_srd, df5).fit(cov_type='robust')
print(srd)
sns.boxplot(df_param[col])
plt.show()

plt.figure(figsize=[11, 11])

sns.heatmap(df_param.corr(), annot=True, cmap="Oranges")

# Correlations of the variables with margin 1
df_param.corr()['Marge_1']

# ## IV2SLS estimation

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from linearmodels.iv import IV2SLS

reg = IV2SLS.from_formula(
    'l1 ~ 1 + sucres1 + sucres1_carre + [prix_1 ~ sucres2 + sucres2_carre]',
    base_reg)
reg.fit()

# ## 300 Cities
reg.fit()

# ## 600 Cities
reg.fit()

# ## 1000 Cities
reg.fit()
Example No. 6
#### [9] IV / 2SLS (Two-stage least squares)
import numpy as np
from linearmodels.iv import IV2SLS
# [Attempt 1 - https://bashtage.github.io/linearmodels/doc/iv/introduction.html (DID NOT WORK)]
# dependent = df.Recidivates          # o/y (dependent variable)
# exog = df.Severity_Of_Crime         # controls
# endog = df.Months_In_Jail           # x (independent variable)
# instruments = df.Republican_Judge   # z (instrumental variable)
# mod = IV2SLS(dependent, exog, endog, instruments)
# res = mod.fit(cov_type='unadjusted')
# res
# [Attempt 2 - https://bashtage.github.io/linearmodels/doc/iv/methods.html (WORKED/SUCCESS!!)]
#
#mod = IV2SLS.from_formula('dependent_variable ~ C(control_variable) + [independent_variable ~ instrumental_variable]', data=df)
mod = IV2SLS.from_formula(
    'Recidivates ~ C(Severity_Of_Crime) + [Months_In_Jail ~ Republican_Judge]',
    data=df)
res = mod.fit()
res
rows_statistical_significance, number_of_observations, r_squared = build_basic_regression_outputs_linear_models(
    res)
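# Reorder the rows: move the last row to the front and drop the original first row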
rows_statistical_significance = [
    rows_statistical_significance[len(rows_statistical_significance) - 1]
] + rows_statistical_significance[1:len(rows_statistical_significance) - 1]
rows_of_interest = [[i.replace('_', ' ') for i in el]
                    for el in rows_statistical_significance]

import PyRTF
from PyRTF.Renderer import Renderer
from build_table_2 import basic_table_with_input