def run_regressions(dataa, datab, endog1, endog2, exog1, exog2, options=0):
    results = []
    print(endog1)
    for index, elem in enumerate(endog1):
        name = 'endog1' + '_' + str(index)
        if options == 0:
            mod = PanelOLS(dataa[elem], dataa[exog1], entity_effects=True, time_effects=True)
        if options == 1:
            mod = PanelOLS(dataa[elem], dataa[exog1], entity_effects=False, time_effects=True)
        results.append(mod.fit(cov_type='clustered', clusters=dataa.gvkey))
    for index, elem in enumerate(endog2):
        name = 'endog2' + '_' + str(index)
        if options == 0:
            mod = PanelOLS(datab[elem], datab[exog2], entity_effects=True, time_effects=True)
        if options == 1:
            mod = PanelOLS(datab[elem], datab[exog2], entity_effects=False, time_effects=True)
        results.append(mod.fit(cov_type='clustered', clusters=datab.gvkey))
    return results
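# A minimal, hypothetical usage sketch for run_regressions above: it builds a tiny
# synthetic panel indexed by (gvkey, year) with a duplicate 'gvkey' column for the
# clustered standard errors. The column names ('y1', 'y2', 'x1', 'x2') and the data
# are placeholders, not from the original code.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
idx = pd.MultiIndex.from_product([range(1, 21), range(2010, 2020)], names=["gvkey", "year"])
panel = pd.DataFrame(rng.standard_normal((len(idx), 4)), index=idx, columns=["y1", "y2", "x1", "x2"])
panel["gvkey"] = panel.index.get_level_values("gvkey")  # kept as a column for clusters=

results = run_regressions(panel, panel, endog1=["y1"], endog2=["y2"],
                          exog1=["x1", "x2"], exog2=["x1", "x2"], options=0)
for res in results:
    print(res.params)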
def __fitreg(self, dt, start_datetime, end_datetime, y, var_pit, var_norm, fix, cluster, c):
    # filter dates
    dt = dt.loc[(dt['date'] >= start_datetime) & (dt['date'] <= end_datetime)]
    # filter columns
    dt = dt[y + ['year', 'ticker'] + [col for col in dt.columns[c:]] + fix]
    # choose x
    x = '+'.join(dt.columns[3:])
    #print("Start filling NAs...")
    #dt = dt.fillna(dt.groupby('ticker').transform('mean'))
    #dt = dt.fillna(dt.transform('mean'))
    dt = dt.dropna()
    #print("Filling NAs done.")
    dt = dt.set_index(['ticker', 'year'])
    if len(fix) == 0 and len(cluster) == 0:
        mod = PanelOLS.from_formula(y[0] + '~1+' + x, data=dt)
        fit1 = mod.fit(cov_type='clustered', cluster_time=False, cluster_entity=False)
        return fit1
    if len(fix) == 1:
        mod = PanelOLS.from_formula(y[0] + '~1+' + x + '+' + fix[0], data=dt)
        if len(cluster) == 0:
            fit1 = mod.fit(cov_type='clustered', cluster_time=False, cluster_entity=False)
            return fit1
        elif cluster == ['year']:
            fit1 = mod.fit(cov_type='clustered', cluster_time=True, cluster_entity=False)
            return fit1
        elif cluster == ['ticker']:
            fit1 = mod.fit(cov_type='clustered', cluster_time=False, cluster_entity=True)
            return fit1
        elif cluster == ['year', 'ticker'] or cluster == ['ticker', 'year']:
            fit1 = mod.fit(cov_type='clustered', cluster_time=True, cluster_entity=True)
            return fit1
        else:
            raise KeyError("Please choose either year or ticker, or both.")
    if len(fix) > 1:
        raise KeyError("You have {} fixed effects! Please pick one.".format(len(fix)))
def run_regressions_3(data=[], endog=[], exog=[], options=0, clusterfirm=0):
    results = []
    print(endog)
    for index, elem in enumerate(data):
        # name = 'endog' + '_' + str(index)
        if options == 0:
            mod = PanelOLS(elem[endog], elem[exog], entity_effects=True, time_effects=True)
        if options == 1:
            mod = PanelOLS(elem[endog], elem[exog], entity_effects=False, time_effects=True)
        if options == 2:
            print(type(elem))
            mod = PooledOLS(elem[endog], elem[exog])
        if clusterfirm == 0:
            results.append(mod.fit(cov_type='clustered', clusters=elem.gvkey))
        if clusterfirm == 1:
            results.append(mod.fit(cov_type='clustered', cluster_entity=True))
        if clusterfirm == 2:
            results.append(mod.fit())
    return results
def regressions(data, endog, exog, options, clusterfirm, constant):
    #results = []
    if constant == 1:
        exog = sm.add_constant(data[exog])
    if constant == 0:
        exog = data[exog]
    if options == 0:
        mod = PanelOLS(data[endog], exog, entity_effects=True, time_effects=True)
    if options == 1:
        mod = PanelOLS(data[endog], exog, entity_effects=False, time_effects=True)
    if options == 2:
        #print(data[[endog]], exog)
        mod = PooledOLS(data[endog], exog)
    if clusterfirm == 0:
        results = mod.fit(cov_type='clustered', clusters=data.gvkey)
    if clusterfirm == 1:
        results = mod.fit(cov_type='clustered', cluster_entity=True)
    if clusterfirm == 2:
        results = mod.fit()
    return results
def preprocessing_regression(self):
    # Fill missing values with column means.
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    self.df.iloc[:, :9] = imputer.fit_transform(self.df.iloc[:, :9])
    data = self.df.iloc[:, :10]
    # Take the natural log of the variables that have outliers.
    data['mezun'] = np.log(self.df.iloc[:, 2])
    data['yogunluk'] = np.log(self.df.iloc[:, 3])
    data['dogum'] = np.log(self.df.iloc[:, 4])
    # Set the index to shape the data into panel form.
    data = data.set_index(['iller', 'yil'])
    # Regress to estimate the time effect on the relation between the regressand and the regressors.
    mod = PanelOLS(data.mezun, data.iloc[:, 1:9], time_effects=True)
    res = mod.fit(cov_type='clustered', cluster_entity=True)
    return res
def run_regressions_2(data, endog=[], exog=[], options=0):
    results = []
    print(endog)
    for index, elem in enumerate(endog):
        name = 'endog' + '_' + str(index)
        # The original nested a second loop over `endog` here and regressed data[elem] on data[e],
        # which left `e` undefined when options == 1; `exog` appears to be what was intended.
        if options == 0:
            mod = PanelOLS(data[elem], data[exog], entity_effects=True, time_effects=True)
        if options == 1:
            mod = PanelOLS(data[elem], data[exog], entity_effects=False, time_effects=True)
        results.append(mod.fit(cov_type='clustered', clusters=data.gvkey))
    return results
def cond_corr_e2_e1timesprize(df):
    """Correlation of e2 and the interaction of e1 and prize after partialing out other effects."""
    df_resid = pd.DataFrame(columns=["e2_resid", "e1timesprize_resid"])
    for label in ["e2", "e1timesprize"]:
        column, formula = (
            f"{label}_resid",
            f"{label}~e1+prize+tt2+tt3+tt4+tt5+tt6+tt7+tt8+tt9+tt10+EntityEffects",
        )
        df_resid.loc[:, column] = PanelOLS.from_formula(formula, data=df).fit().resids
    return df_resid["e2_resid"].corr(df_resid["e1timesprize_resid"])
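# A hedged illustration of the partialling-out pattern used above: residualize two
# variables on the same controls (plus entity effects) and correlate the residuals.
# The panel and the column names ('a', 'b', 'control') are synthetic placeholders,
# not the study's data.
import numpy as np
import pandas as pd
from linearmodels.panel import PanelOLS

rng = np.random.default_rng(1)
idx = pd.MultiIndex.from_product([range(30), range(10)], names=["subject", "period"])
toy = pd.DataFrame({"a": rng.standard_normal(300),
                    "b": rng.standard_normal(300),
                    "control": rng.standard_normal(300)}, index=idx)

resid = {}
for label in ["a", "b"]:
    fit = PanelOLS.from_formula(f"{label} ~ control + EntityEffects", data=toy).fit()
    resid[label] = fit.resids

# conditional correlation of a and b after partialling out 'control' and entity effects
print(resid["a"].corr(resid["b"]))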
def get_fe(
    regression_variables: List[Tuple],
    data: Dict[str, pd.DataFrame],
    datasets: Dict[pd.DataFrame, Any],
    entity_effects: bool = False,
    time_effects: bool = False,
) -> Tuple[DataFrame, Any, List[Any], Any]:
    """When effects are correlated with the regressors, the RE and BE estimators are not
    consistent. The usual solution is to use Fixed Effects, which are called entity_effects
    when applied to entities and time_effects when applied to the time dimension.
    [Source: LinearModels]

    Parameters
    ----------
    regression_variables : list
        The regression variables entered, where the first variable is the dependent variable.
    data : dict
        A dictionary containing the datasets.
    datasets: dict
        A dictionary containing the column and dataset names of each column/dataset combination.
    entity_effects : bool
        Whether to include entity effects
    time_effects : bool
        Whether to include time effects

    Returns
    -------
    The dataset used, the dependent variable, the independent variables and the OLS model.
    """
    regression_df, dependent_variable, independent_variables = get_regression_data(
        regression_variables, data, datasets, "FE")

    if regression_df.empty:
        model = None
    else:
        with warnings.catch_warnings(record=True) as warning_messages:
            exogenous = add_constant(regression_df[independent_variables])
            model = PanelOLS(
                regression_df[dependent_variable],
                exogenous,
                entity_effects=entity_effects,
                time_effects=time_effects,
            ).fit()
            console.print(model)

            if len(warning_messages) > 0:
                console.print("Warnings:")
                for warning in warning_messages:
                    console.print(f"[red]{warning.message}[/red]".replace("\n", ""))

    return regression_df, dependent_variable, independent_variables, model
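# A minimal sketch of the kind of fixed-effects estimation get_fe performs, run directly
# with PanelOLS on the wage_panel dataset that ships with linearmodels (not the data
# dictionaries the function above expects). Entity-only and entity+time variants shown.
import statsmodels.api as sm
from linearmodels.datasets import wage_panel
from linearmodels.panel import PanelOLS

wages = wage_panel.load().set_index(["nr", "year"])
exog = sm.add_constant(wages[["expersq", "married", "union"]])

fe_entity = PanelOLS(wages.lwage, exog, entity_effects=True).fit()
fe_both = PanelOLS(wages.lwage, exog, entity_effects=True, time_effects=True).fit()
print(fe_entity.params)
print(fe_both.params)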
def old_percentile_correlation(df):
    """66th percentile of the correlation of e2 and e1 after partialing out other effects."""
    df_resid = pd.DataFrame(columns=["e2_resid", "e1_resid"], index=df.index)
    for label in ["e2", "e1"]:
        column, formula = f"{label}_resid", f"{label}~prize+e1timesprize+TimeEffects"
        df_resid.loc[:, column] = PanelOLS.from_formula(formula, data=df).fit().resids
    dfs = dict()
    for sub in df_resid.index.get_level_values('subject').unique():
        dfs[f"{sub}"] = df_resid.query(f"subject == {sub}")
    cond_corr = list()
    for key in dfs:
        cond_corr.append(dfs[key]["e2_resid"].corr(dfs[key]["e1_resid"]))
    return np.percentile(cond_corr, 66)
def panel_regression(y, xs, years, country, list_x, prev=0, show=False, save=True, path="",
                     diff=False, constant=False, entity_effects=False):
    data = bdf.filter_origin_country_dataset(y, country, years, xs.index.levels[0].tolist(), xs, prev)
    if constant == False:
        exog = data[list_x]
    else:
        exog = sm.add_constant(data[list_x])
    #
    if diff == False:
        mod = PanelOLS(data.y, exog, entity_effects=entity_effects)
    else:
        mod = FirstDifferenceOLS(data.y, exog)
    res = mod.fit()
    #print("The R-squared of the regression model is %f." %res.rsquared)
    #print("Estimated parameters:")
    #print(pd.DataFrame(res.params))
    evaluation(data, res.fitted_values, constant, len(xs.columns.tolist()))
    if show == True:
        pmf.plot_real_VS_prediction(y, res.fitted_values, xs, years, country, 45,
                                    "Regression model", save, path)
    else:
        pass
    return (res.params, res.fitted_values)
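# A hedged side-by-side of the two estimators panel_regression can switch between:
# fixed effects in levels versus first differences. It uses the wage_panel example data
# rather than the migration dataset the function above assumes; FirstDifferenceOLS does
# not allow a constant, so none is added here.
import pandas as pd
from linearmodels.datasets import wage_panel
from linearmodels.panel import PanelOLS, FirstDifferenceOLS

wages = wage_panel.load().set_index(["nr", "year"])
exog = wages[["expersq", "married", "union"]]  # no constant for the first-difference model

fe_res = PanelOLS(wages.lwage, exog, entity_effects=True).fit()
fd_res = FirstDifferenceOLS(wages.lwage, exog).fit()
print(pd.concat({"FE": fe_res.params, "FD": fd_res.params}, axis=1))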
def balancing_tests_cantonal_results(df, exog):
    ## These are the conditional results
    ## between countries; AS = asylum seekers
    mod_balancing = PanelOLS(df.share_AS_between * 100, exog,
                             entity_effects=True, time_effects=True, singletons=False)
    result_balancing_canton = mod_balancing.fit(cov_type='clustered', clusters=df.id_e)

    mod_balancing2 = PanelOLS(df.share_AS_within * 100, exog,
                              entity_effects=True, time_effects=True, singletons=False)
    result_balancing_canton2 = mod_balancing2.fit(cov_type='clustered', clusters=df.id_e)

    mod_balancing3 = PanelOLS(df.sex_ratio_AS_ntc * 100, exog,
                              entity_effects=True, time_effects=True, singletons=False)
    result_balancing_canton3 = mod_balancing3.fit(cov_type='clustered', clusters=df.id_e)

    # NOTE: `singletons` is a PanelOLS constructor option, not a fit() option, so it is
    # passed only when the models are created above.
    return (compare(
        {
            'Between countries': result_balancing_canton,
            'Within countries': result_balancing_canton2,
            'Sex ratio': result_balancing_canton3
        },
        stars=True))
def process_data(tag, area_tag):
    """
    Process the data.
    :param area_tag:
    :return:
    """
    root_path = getRootPath()
    tif_file = os.path.join(root_path, "{0}/result/avg_data/avg_{1}.tif".format(tag, area_tag))
    bandArray = get_raster_band_array(tif_file)
    df = pd.DataFrame(bandArray, columns=["sday", "eday", "gsl", "gdd", "edd", "pre"])
    df.sday = df.sday.astype(np.int64)
    df.eday = df.eday.astype(np.int64)
    df = df.set_index(["eday", "sday"])
    df = df.dropna()  # the original called dropna() without assigning the result
    print("-------- use EntityEffects ---------")
    mod = PanelOLS.from_formula('gsl ~ 1 + gdd + edd + pre + EntityEffects', df)
    res = mod.fit(cov_type='unadjusted')
    print(res)
def process_data(tag, area_tag):
    """
    Process the prediction data.
    :param area_tag:
    :return:
    """
    print("process data area_tag: {}".format(area_tag))
    root_path = getRootPath()
    src_path = os.path.join(root_path, "{0}/process/merge/{1}".format(tag, area_tag))
    tif_files = walkDirFile(src_path, ext=".tif")
    bandArray = None
    flag = False
    for tif_file in tif_files:
        tempArr = get_raster_band_array(tif_file)
        if not flag:
            bandArray = tempArr
            flag = True
        else:
            bandArray = np.vstack((bandArray, tempArr))
    if not flag:
        return
    df = pd.DataFrame(bandArray, columns=["sday", "eday", "gsl", "year", "gdd", "edd", "pre"])
    df.sday = df.sday.astype(np.int64)
    df.eday = df.eday.astype(np.int64)
    df.year = df.year.astype(np.int64)
    df = df.set_index(["year", "eday"])
    df = df.dropna()  # the original called dropna() without assigning the result
    print("-------- use EntityEffects ---------")
    mod = PanelOLS.from_formula('gsl ~ 1 + gdd + edd + pre + EntityEffects', df)
    res = mod.fit(cov_type='unadjusted')
    print(res)
jtrain2 = jtrain
jtrain2[:5]

## Define the ID and Time column for Panel Regression
jtrain2 = jtrain2.set_index(['fcode', 'year'])
print(jtrain2.head(5))

exog_vars = ['d88', 'd89', 'grant', 'grant_1']
grant_vars = ['grant']
exog = sm.add_constant(jtrain2[exog_vars])
grant0 = sm.add_constant(jtrain2[grant_vars])

## Model Pooled OLS
model_pool = PooledOLS(jtrain2.lscrap, exog)
pooled_res = model_pool.fit()
print(pooled_res)

## Model Fixed Effects -- Entity Effects - True
model_fe = PanelOLS(jtrain2.lscrap, exog, entity_effects=True)
fe_res = model_fe.fit()
print(fe_res)

## Model Fixed Effects -- Entity and Time Effects - True
model_fe = PanelOLS(jtrain2.lscrap, exog, entity_effects=True, time_effects=True)
fe_res = model_fe.fit()
print(fe_res)

## Random Effects Model
model_re = RandomEffects(jtrain2.lscrap, exog)
re_res = model_re.fit()
print(re_res)  # the original printed fe_res here

#################################################
## Regress scrap~grant
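# A small follow-on sketch: linearmodels' compare() lines up the pooled, fixed-effects,
# and random-effects fits from above in a single table, which makes the walkthrough
# easier to read than three separate summaries. It reuses the result objects already
# defined above (fe_res holds the entity+time specification at this point).
from linearmodels.panel.results import compare

print(compare({"Pooled": pooled_res, "FE (entity+time)": fe_res, "RE": re_res}, stars=True))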
#mod = PanelOLS(temp.UE12M, temp[['activeWeight12M']], entity_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight3MSquared', 'activeWeight6MSquared', 'activeWeight12MSquared']], entity_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight3MSquared']], entity_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight6MSquared']], entity_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight12MSquared']], entity_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight3MSquared', 'activeWeight6MSquared', 'activeWeight12MSquared']], entity_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight3MSquared']], entity_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight6MSquared']], entity_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight12MSquared']], entity_effects = True)
mod = PanelOLS(temp.UE12M, temp[['activeWeight3MSquared', 'activeWeight6MSquared', 'activeWeight12MSquared']], entity_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight3MSquared']], entity_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight6MSquared']], entity_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight12MSquared']], entity_effects = True)

#Both entity and time effect
#mod = PanelOLS(temp.UE3M, temp[['activeWeight3M', 'activeWeight6M', 'activeWeight12M']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight3M']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight6M']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight12M']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight3M', 'activeWeight6M', 'activeWeight12M']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight3M']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight6M']], entity_effects = True, time_effects = True)
x = np.stack([calc_mat[:, 1], calc_mat[:, 2], calc_mat[:, 3], calc_mat[:, 4]])
ones = np.ones(len(x[0]))
X = sm.add_constant(np.column_stack((x[0], ones)))
for elem in x[1:]:
    X = sm.add_constant(np.column_stack((elem, X)))
res = sm.OLS(y, X).fit()
print(res.summary())

# FE model regression
company_codes = []
for each_file in file_list:
    company_code = each_file.split('.')[0]
    company_code = int(company_code)
    company_codes.append(company_code)
time = [2019] * 50
df = pd.DataFrame({
    'TDA': x[0],
    'CR5': x[1],
    'SIZE': x[2],
    'ROE': x[3],
    'REWARD': y,
    'YEAR': time,
    'CODE': company_codes
})
df.to_stata('Stock/res.dta')
df = df.set_index(['CODE', 'YEAR'])
# The original listed 'LDA' here, which is not a column of df; 'CR5' is assumed instead.
exog_vars = ['TDA', 'CR5', 'SIZE', 'ROE']
exog = sm.add_constant(df[exog_vars])
model = PanelOLS(df['REWARD'], exog, entity_effects=True)
fe = model.fit()
print(fe)
test['volume'] = test['volume'] / 1000000
test = test.loc[test['year'].isin(['2020', '2018', '2019'])]
test = test[[
    'year', 'ticker', 'assetclasslevel1', 'assetclasslevel2', 'assetclasslevel3',
    'cd', 'cdlag1', 'pd', 'volume', 'age'
]]
test = test.dropna()

# In[16]:

test0 = test.set_index(['ticker', 'year'])

# fix assetclasslevel1, cluster time + ticker
mod = PanelOLS.from_formula(
    'cd ~ 1 + cdlag1 + volume + pd + age + assetclasslevel1', data=test0)
fit01 = mod.fit(cov_type='clustered', cluster_time=True, cluster_entity=True)

# fix assetclasslevel2, cluster time + ticker
mod = PanelOLS.from_formula(
    'cd ~ 1 + cdlag1 + volume + pd + age + assetclasslevel2', data=test0)
fit02 = mod.fit(cov_type='clustered', cluster_time=True, cluster_entity=True)

# fix assetclasslevel3, cluster time + ticker
mod = PanelOLS.from_formula(
    'cd ~ 1 + cdlag1 + volume + pd + age + assetclasslevel3', data=test0)
fit03 = mod.fit(cov_type='clustered', cluster_time=True, cluster_entity=True)

# fix year, cluster time + ticker
mod = PanelOLS.from_formula('cd ~ 1 + cdlag1 + volume + pd + TimeEffects', data=test0)
# CLOs have a much more positive holding period return than corporate bonds

# In[41]:

# Part B
# 1. OLS without fixed effect
hpr_OLS = smf.ols(formula='lnhpr ~ clo+tmkt_rf+tsmb+thml+tterm+tdef+hp', data=ps5)
# I use panel data to regress holding period return on common risk factors (tmkt_rf, tsmb,
# thml, tterm, and tdef) and the holding period. CLO is an indicator that is 1 if the bond
# is a CLO. If CLO is significant and positive, CLOs have higher returns than corporate bonds.
res = hpr_OLS.fit()
print(res.summary())
# The significantly positive coefficient on CLO shows that CLOs have higher excess returns
# than corporate bonds.

# In[59]:

# 2. OLS with firm fixed effect
startyear = pd.Categorical(ps5.startyear)
ps5 = ps5.set_index(['entity_name', 'startyear'])

# In[67]:

exog_vars = ['clo', 'tmkt_rf', 'tsmb', 'thml', 'tterm', 'tdef', 'hp']
exog = sm.add_constant(ps5[exog_vars])
mod = PanelOLS(ps5.lnhpr, exog, entity_effects=True)
res = mod.fit()
print(res)
# After adding the firm fixed effect, the coefficient on CLO is still significantly positive
# and of similar magnitude. The argument that CLOs have higher excess returns than corporate
# bonds remains valid.
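# A hedged extension of the firm fixed-effect model above: re-fit the same model with
# standard errors clustered by entity, using the cov_type='clustered', cluster_entity=True
# pattern seen elsewhere in this collection. Point estimates are unchanged; only the
# inference becomes more conservative when returns are correlated within a firm.
res_clustered = mod.fit(cov_type='clustered', cluster_entity=True)
print(res_clustered.std_errors)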
def baseline_results(df):
    ## first column of baseline
    mi_data = df.set_index(["id_e_t", "id_a"])
    exog_vars = ["kid012_all"] + [f"all_exp_{i}" for i in range(13, 87)]
    exog_baseline = sm.add_constant(mi_data[exog_vars])
    mod = PanelOLS(mi_data.crime_rate_all_violent_p30, exog_baseline,
                   entity_effects=True, time_effects=True, singletons=False)
    res = mod.fit(cov_type='clustered', clusters=mi_data.id_e)

    ## second column of baseline results
    CPRT_baseline_maleage_sub = df[(df['allmk_periode'] == 1)]
    mi_data2 = CPRT_baseline_maleage_sub.set_index(["id_a", "id_e_t"])
    exog_vars2 = ["kid012_all"] + [f"all_exp_{i}" for i in range(13, 87)]
    exog2 = sm.add_constant(mi_data2[exog_vars2])
    mod2 = PanelOLS(mi_data2.crime_rate_all_violent_p30, exog2,
                    entity_effects=True, time_effects=True, singletons=False)
    res2 = mod2.fit(cov_type='clustered', clusters=mi_data2.id_e)

    ## third column of baseline results
    CPRT_baseline_maleage_sub_sub = df[(df['all_periode'] == 1)]
    CPRT_baseline_maleage_sub_sub = CPRT_baseline_maleage_sub_sub.drop(['kid012_all'], axis=1)
    CPRT_baseline_maleage_sub_sub = CPRT_baseline_maleage_sub_sub.rename(
        columns={"kid012": "kid012_all"})
    mi_data3 = CPRT_baseline_maleage_sub_sub.set_index(["id_a", "id_e_t"])
    exog_vars3 = ["kid012_all"] + [f"exp_all_{i}" for i in range(13, 87)]
    exog3 = sm.add_constant(mi_data3[exog_vars3])
    mod3 = PanelOLS(mi_data3.crime_rate_all_violent_p30, exog3,
                    entity_effects=True, time_effects=True, singletons=False)
    res3 = mod3.fit(cov_type='clustered', clusters=mi_data3.id_e)

    ## 4th column
    CPRT_baseline_maleage_sub4 = df[(df['mk_periode'] == 1)]
    mi_data4 = CPRT_baseline_maleage_sub4.set_index(["id_a", "id_e_t"])
    exog_vars4 = ["MK_kid012"] + [f"exp_mk_{i}" for i in range(13, 84)]
    exp_mk4 = [f"exp_mk_{i}" for i in range(13, 100)]  # defined in the original but not used
    exog4 = sm.add_constant(mi_data4[exog_vars4])
    mod4 = PanelOLS(mi_data4.crime_rate_all_violent_p30, exog4,
                    entity_effects=True, time_effects=True, singletons=False)
    res4 = mod4.fit(cov_type='clustered', clusters=mi_data4.id_e)

    ## presentation
    # NOTE: `singletons` is a PanelOLS constructor option, not a fit() option, so it is passed
    # only when the models are created; the long dummy lists are rebuilt with comprehensions.
    return (compare({
        'Full': res,
        'CC and MK': res2,
        'CC': res3,
        'MK': res4
    }, stars=True))
def baseline_results_women(df):
    CPRT_baseline_female = df.groupby(by=['sex'])
    CPRT_baseline_women = CPRT_baseline_female.get_group("F")
    CPRT_baseline_womenage = CPRT_baseline_women[~(CPRT_baseline_women['age'] <= 18)]
    mi_data_women = CPRT_baseline_womenage.set_index(["id_e_t", "id_a"])
    exog_vars = ["kid012_all"] + [f"all_exp_{i}" for i in range(13, 71)]
    exog_women = sm.add_constant(mi_data_women[exog_vars])
    CPRT_baseline_womenage.head()
    mod_women = PanelOLS(mi_data_women.crime_rate_all_violent_p30, exog_women,
                         entity_effects=True, time_effects=True,
                         drop_absorbed=True, singletons=False)
    res_women = mod_women.fit(cov_type='clustered', clusters=mi_data_women.id_e)

    CPRT_baseline_womenage_sub = CPRT_baseline_womenage[(
        CPRT_baseline_womenage['allmk_periode'] == 1)]
    mi_data2_women = CPRT_baseline_womenage_sub.set_index(["id_a", "id_e_t"])
    exog_vars2 = ["kid012_all"] + [f"all_exp_{i}" for i in range(13, 72)]
    exog2_women = sm.add_constant(mi_data2_women[exog_vars2])
    mod2_women = PanelOLS(mi_data2_women.crime_rate_all_violent_p30, exog2_women,
                          entity_effects=True, time_effects=True,
                          drop_absorbed=True, singletons=False)
    res2_women = mod2_women.fit(cov_type='clustered', clusters=mi_data2_women["id_e"])

    CPRT_baseline_womenage_sub_sub = CPRT_baseline_womenage[(
        CPRT_baseline_womenage['all_periode'] == 1)]
    mi_data3_women = CPRT_baseline_womenage_sub_sub.set_index(["id_a", "id_e_t"])
    exog_vars3 = ["kid012"] + [f"exp_all_{i}" for i in range(13, 72)]
    exog3_women = sm.add_constant(mi_data3_women[exog_vars3])
    mod3_women = PanelOLS(mi_data3_women.crime_rate_all_violent_p30, exog3_women,
                          entity_effects=True, time_effects=True,
                          drop_absorbed=True, singletons=False)
    res3_women = mod3_women.fit(cov_type='clustered', clusters=mi_data3_women["id_e"])

    ## Table 5 column 4 women
    CPRT_baseline_womenage_sub4 = CPRT_baseline_womenage[(
        CPRT_baseline_womenage['mk_periode'] == 1)]
    mi_data4_women = CPRT_baseline_womenage_sub4.set_index(["id_a", "id_e_t"])
    ## had to delete nr. 71-86
    exog_vars4 = ["MK_kid012"] + [f"exp_mk_{i}" for i in range(13, 71)]
    exog4_women = sm.add_constant(mi_data4_women[exog_vars4])
    mod4_women = PanelOLS(mi_data4_women.crime_rate_all_violent_p30, exog4_women,
                          entity_effects=True, time_effects=True,
                          drop_absorbed=True, singletons=False)
    # NOTE: fit() takes `clusters=` (the original passed `cluster=`) and has no `singletons`
    # option; the last model also originally clustered on the first subsample's id_e rather
    # than on its own estimation sample.
    res4_women = mod4_women.fit(cov_type='clustered', clusters=mi_data4_women["id_e"])

    return (compare(
        {
            'Full': res_women,
            'CC and MK': res2_women,
            'CC': res3_women,
            'MK': res4_women
        },
        stars=True))
autor["other"] = autor["rs_om"] + autor["rs_of"] autor["married"] = autor["marfem"] + autor["marmale"] # Create categorical for state autor["state_c"] = pd.Categorical(autor["state"]) # Set index for use with linearmodels autor = autor.set_index(["state", "year"], drop=False) # Diff-in-diff regression did = PanelOLS.from_formula( ("lnths ~" "1 +" "lnemp +" "admico_2 + admico_1 + admico0 + admico1 + admico2 + admico3 + mico4 +" "admppa_2 + admppa_1 + admppa0 + admppa1 + admppa2 + admppa3 + mppa4 +" "admgfa_2 + admgfa_1 + admgfa0 + admgfa1 + admgfa2 + admgfa3 + mgfa4 +" "state_c:t +" "EntityEffects + TimeEffects"), data=autor, drop_absorbed=True).fit(cov_type='clustered', cluster_entity=True) # Store results in a DataFrame for a plot results_did = pd.DataFrame({ "coef": did.params * 100, "ci": 1.96 * did.std_errors * 100 }) # Keep only the relevant coefficients results_did = results_did.filter(regex="admico|mico", axis=0).reset_index()
VIF[y] = 1 / (1 - res.rsquared)
with open('../result/VIF.txt', 'w') as f:
    print(VIF, file=f)

# Pooled regression
x = data[["MV", "RM", "BM", "ROE", "Inv"]]
y = data["Ret"]
results = sm.OLS(y, x).fit()
with open('../result/pooled_reg.txt', 'w') as f:
    print(results.summary(), file=f)

# Fixed-effects regression
data['Time'] = pd.to_datetime(data['Time'])
data = data.set_index(['Stkcd', 'Time'])
dependent = data.Ret
exog = sm.add_constant(data[['MV', 'BM', 'RM', 'ROE', 'Inv']])
mod = PanelOLS(dependent, exog, entity_effects=True)
res = mod.fit(cov_type='clustered')
with open('../result/fixed_effects.txt', 'w') as f:
    print(res, file=f)

# Regression controlling for industry
data = pd.read_csv("../data/data_all.csv")
data['Time'] = pd.to_datetime(data['Time'])
data = data.set_index(['Industry', 'Time'])
dependent = data.Ret
exog = sm.add_constant(data[['MV', 'BM', 'RM', 'ROE', 'Inv']])
mod = PanelOLS(dependent, exog, entity_effects=True)
res = mod.fit(cov_type='clustered')
with open('../result/industry_control.txt', 'w') as f:
    print(res, file=f)
                       index='treat',
                       columns='year',
                       values='defor',
                       aggfunc=np.sum)
count = pd.pivot_table(data=defor_df,
                       index='treat',
                       columns='year',
                       values='defor',
                       aggfunc="count")
defor_df = defor_df.set_index(['idx', 'year'])

# =============================================================================
# Run regression to estimate treatment effect
# =============================================================================

## Simple diff in diff
mod = PanelOLS.from_formula('defor ~ treat * post', defor_df)
res = mod.fit(cov_type='clustered', cluster_entity=True)
print(res)

## Generalized did using two-way fixed effects
# Outer is entity, inner is time
from linearmodels.panel import PanelOLS
defor_df['t'] = defor_df['treat'] * defor_df['post']
mod = PanelOLS.from_formula('defor ~ t + EntityEffects + TimeEffects', defor_df)
res = mod.fit(cov_type='clustered', cluster_entity=True)
print(res)

### KEY OBSERVATION: the FE estimator yields ~ (estimate of att) = diff + att, while
### the simple diff in diff yields ~ (estimate of att) = att
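# A hedged follow-on: put the simple diff-in-diff and the two-way fixed-effects estimates
# side by side so the treatment coefficients discussed in the note above can be read off
# one table. compare() is the same helper used elsewhere in this collection; the models
# are simply re-fit under new names.
from linearmodels.panel.results import compare

simple_did = PanelOLS.from_formula('defor ~ treat * post', defor_df).fit(
    cov_type='clustered', cluster_entity=True)
twoway_fe = PanelOLS.from_formula('defor ~ t + EntityEffects + TimeEffects', defor_df).fit(
    cov_type='clustered', cluster_entity=True)
print(compare({'Simple DiD': simple_did, 'Two-way FE': twoway_fe}, stars=True))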
print("1% :", orePriceRes_BDI_ADF[2]) print("5% :", orePriceRes_BDI_ADF[3]) print("10% :", orePriceRes_BDI_ADF[4]) # Setting up the DataFrame for PanelOLS and cluster effect by port freightCost_panel = freightCost_df.set_index(["port", "date"]) # Defining the Explanatory Variables freightCost_vars = [ "growth", "logd", "logf", "ore_price", "port_dummy1", "port_dummy2" ] freightCost_reg = sm.add_constant(freightCost_panel[freightCost_vars]) # Running a panel regression freightCost_results = PanelOLS(freightCost_panel["avefreight"], freightCost_reg, entity_effects=False).fit(cov_type="clustered", cluster_entity=True) # Setting up the DataFrame for PanelOLS and cluster effect by port freightCost_BDI_panel = freightCost_df.set_index(["port", "date"]) # Defining the Explanatory Variables freightCost_BDI_vars = [ "growth", "logd", "logf", "ore_price", "BDI", "port_dummy1", "port_dummy2" ] freightCost_BDI_reg = sm.add_constant( freightCost_BDI_panel[freightCost_BDI_vars]) # Running a panel regression freightCost_BDI_results = PanelOLS(freightCost_BDI_panel["avefreight"], freightCost_BDI_reg,
import sys
import os

import numpy as np   # needed for np.log10 below
import pandas as pd  # needed for pd.read_csv below
import statsmodels.api as sm
from linearmodels.panel import PanelOLS
from linearmodels.panel import PooledOLS

DATA_FILE = sys.argv[1]
OUTPUT_FILE = sys.argv[2]

change_df = pd.read_csv(DATA_FILE)
base = os.path.basename(OUTPUT_FILE)
incomegroup = base.split(".")[0].split("_")[-1]
select_df = change_df[change_df.IncomeGroup == incomegroup]

# filter out unbalanced data points
num_period = len(select_df.period.unique())
select_df['size'] = select_df.groupby('Code')['Code'].transform('size')
select_df = select_df[select_df['size'] == num_period]

select_df['Income_t0_log'] = np.log10(select_df['Income_t0'])
select_df = select_df.set_index(['Code', 'date'])

exog_vars = [
    'Income_t0_log', 'nm_change', 'shm_change', 'ne_change', 'sum_adv_t0'
]
exog = sm.add_constant(select_df[exog_vars])
mod = PanelOLS(select_df.growth_rate, exog, entity_effects=True)
fe_res = mod.fit()

with open(OUTPUT_FILE, 'w') as f:
    f.write(fe_res.summary.as_text())
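# A quick hedged check of the balancing filter used above, on a toy frame with placeholder
# data: entities that do not appear in every period are dropped, leaving a balanced panel.
import pandas as pd

toy = pd.DataFrame({
    "Code":   ["A", "A", "A", "B", "B", "C"],
    "period": [1, 2, 3, 1, 2, 1],
})
num_period = toy["period"].nunique()
toy["size"] = toy.groupby("Code")["Code"].transform("size")
balanced = toy[toy["size"] == num_period]
print(balanced)  # only entity "A" is observed in all 3 periods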
# print(data1)
d = pd.Categorical(data1['Date'])
data1 = data1.set_index(['ID', 'Date'])
data1['Date'] = d
# print(data1)

exog_vars = [
    'Kilo', 'Brakes', 'Range', 'Speed', 'RPM', 'Engine fuel rate', 'Date'
]
a = ['Kilo', 'Brakes', 'Range', 'Speed', 'RPM', 'Engine fuel rate']
print(data1[a])
exog = sm.add_constant(data1[exog_vars])
exog1 = sm.add_constant(data1[a])

mod = PanelOLS(data1['Accelerator pedal position'], exog,
               entity_effects=True, time_effects=False)
mod1 = PooledOLS(data1['Accelerator pedal position'], exog1)
mod2 = RandomEffects(data1['Accelerator pedal position'], exog1)
mod3 = BetweenOLS(data1['Accelerator pedal position'], exog1)

res = mod.fit()
pooled_res = mod1.fit()
re_res = mod2.fit()
be_res = mod3.fit()

print(res)
print(compare({'Pooled': pooled_res, 'RE': re_res, 'BE': be_res}))


if __name__ == '__main__':
    pass
def balancing_tests_cohort_results(df, exog):
    post_exposure1 = PanelOLS(df.adult, exog,
                              entity_effects=True, time_effects=True, singletons=False)
    result_balancing_canton1 = post_exposure1.fit(cov_type='clustered', clusters=df.id_e)

    post_exposure2 = PanelOLS(df.below_median_age_restr, exog,
                              entity_effects=True, time_effects=True, singletons=False)
    result_balancing_canton2 = post_exposure2.fit(cov_type='clustered', clusters=df.id_e)

    post_exposure3 = PanelOLS(df.sex_ratio, exog,
                              entity_effects=True, time_effects=True, singletons=False)
    result_balancing_canton3 = post_exposure3.fit(cov_type='clustered', clusters=df.id_e)

    post_exposure4 = PanelOLS(df.have_adults_patch, exog,
                              entity_effects=True, time_effects=True, singletons=False)
    result_balancing_canton4 = post_exposure4.fit(cov_type='clustered', clusters=df.id_e)

    # `singletons` is a PanelOLS constructor option only, so it is not repeated in the fit() calls.
    return (compare(
        {
            'Size of cohort': result_balancing_canton1,
            'Below median age': result_balancing_canton2,
            'Sex ratio': result_balancing_canton3,
            'Have families': result_balancing_canton4
        },
        stars=True))
def crime_by_type(df):
    mi_data = df.set_index(["id_e_t", "id_a"])
    exog_vars = ["kid012_all"] + [f"all_exp_{i}" for i in range(13, 87)]
    exog_baseline_type = sm.add_constant(mi_data[exog_vars])

    # fit() takes `clusters=`; the original passed `cluster=`
    result_6_1 = PanelOLS(mi_data.crime_rate_violent_p30, exog_baseline_type,
                          entity_effects=True, time_effects=True,
                          drop_absorbed=True, singletons=True)
    res_6_violent = result_6_1.fit(cov_type='clustered', clusters=mi_data["id_e"])

    result_6_2 = PanelOLS(mi_data.crime_rate_freedom_p30, exog_baseline_type,
                          entity_effects=True, time_effects=True,
                          drop_absorbed=True, singletons=True)
    res_6_freedom = result_6_2.fit(cov_type='clustered', clusters=mi_data["id_e"])

    result_6_3 = PanelOLS(mi_data.crime_rate_sexual_p30, exog_baseline_type,
                          entity_effects=True, time_effects=True,
                          drop_absorbed=True, singletons=True)
    res_6_sexual = result_6_3.fit(cov_type='clustered', clusters=mi_data["id_e"])

    result_6_4 = PanelOLS(mi_data.crime_rate_property_p30, exog_baseline_type,
                          entity_effects=True, time_effects=True,
                          drop_absorbed=True, singletons=True)
    res_6_property = result_6_4.fit(cov_type='clustered', clusters=mi_data["id_e"])

    return (compare(
        {
            'violent': res_6_violent,
            'freedom': res_6_freedom,
            'sexual': res_6_sexual,
            'property': res_6_property
        },
        stars=True))
import sys

import pandas as pd
import statsmodels.api as sm
from linearmodels.panel import PanelOLS

DATA_FILE = sys.argv[1]
OUTPUT_FILE = sys.argv[2]

change_df = pd.read_csv(DATA_FILE)
change_df = change_df.set_index(["Code", "date"])

exog_vars = ["Income_t0_log", "nm_change", "shm_change", "ne_change", "sum_adv_t0"]
exog = sm.add_constant(change_df[exog_vars])
mod = PanelOLS(change_df.growth_rate, exog)
fe_res = mod.fit()

with open(OUTPUT_FILE, "w") as f:
    f.write(fe_res.summary.as_text())
import numpy as np
import pandas as pd
import statsmodels.api as sm

import linearmodels as lm
lm.WARN_ON_MISSING = False
from linearmodels import utility
utility.missing_warning(np.array([True, True, False]))

from linearmodels.panel import PanelOLS, RandomEffects, PooledOLS
from linearmodels.panel.results import compare
from linearmodels.datasets import wage_panel

data = wage_panel.load()
data = data.set_index(['nr', 'year'])
dependent = data.lwage

exog = sm.add_constant(data[['expersq', 'married', 'union']])
mod = PanelOLS(dependent, exog, entity_effects=True, time_effects=True)
res = mod.fit(cov_type='unadjusted')
res2 = mod.fit(cov_type='robust')

exog = sm.add_constant(data[['exper', 'expersq', 'married', 'union']])
mod = PanelOLS(dependent, exog, entity_effects=True)
res3 = mod.fit(cov_type='clustered', cluster_entity=True)

mod = RandomEffects(dependent, exog)
res4 = mod.fit(cov_type='robust')

exog = sm.add_constant(data[['exper', 'expersq', 'married', 'union']].copy())
exog['year'] = pd.Categorical(data.reset_index()['year'])
mod = PooledOLS(dependent, exog)
res5 = mod.fit(cov_type='robust')

print(compare([res, res2, res3, res4, res5]))
print(data.columns)