def __fitreg(self, dt, start_datetime, end_datetime, y, var_pit, var_norm, fix, cluster, c):
    """Fit a panel OLS of ``y[0]`` on the selected regressor columns.

    The frame is restricted to rows with ``start_datetime <= date <=
    end_datetime``, reduced to the ``y`` columns, ``year``, ``ticker``, every
    column from position ``c`` onwards and the ``fix`` columns, stripped of
    rows with missing values and indexed by ``(ticker, year)`` before the
    model is built with ``PanelOLS.from_formula``.

    Args:
        dt: DataFrame with at least ``date``, ``year``, ``ticker``, the columns
            named in ``y`` and ``fix``, and the regressor columns.
        start_datetime: Inclusive lower bound compared against ``dt['date']``.
        end_datetime: Inclusive upper bound compared against ``dt['date']``.
        y: List of column names; ``y[0]`` is the dependent variable.
        var_pit: Not used by this method; kept for interface compatibility.
        var_norm: Not used by this method; kept for interface compatibility.
        fix: List with zero or one column name that is appended to the
            formula as an additional term.
        cluster: List drawn from ``'year'`` and ``'ticker'`` selecting the
            clustering of the covariance (``'year'`` -> ``cluster_time``,
            ``'ticker'`` -> ``cluster_entity``). May be empty.
        c: Positional offset; columns ``dt.columns[c:]`` are kept as regressors.

    Returns:
        The object returned by ``PanelOLS.fit`` with ``cov_type='clustered'``.

    Raises:
        KeyError: If more than one fixed effect is given, or if ``cluster``
            contains anything other than ``'year'`` / ``'ticker'``.
    """
    # Validate the request up front so bad arguments fail before any data work.
    if len(fix) > 1:
        raise KeyError(
            "You have {} fixed effects! Please pick one.".format(len(fix)))
    if any(name not in ('year', 'ticker') for name in cluster):
        raise KeyError("Please choose either year or ticker, or both.")
    cluster_time = 'year' in cluster
    cluster_entity = 'ticker' in cluster

    # filter dates
    in_window = (dt['date'] >= start_datetime) & (dt['date'] <= end_datetime)
    dt = dt.loc[in_window]

    # filter columns
    dt = dt[y + ['year', 'ticker'] + list(dt.columns[c:]) + fix]

    # choose x: everything after the first three selected columns.
    # NOTE(review): this assumes len(y) == 1 so that the first three columns
    # are y[0], 'year' and 'ticker' -- confirm against callers.
    x = '+'.join(dt.columns[3:])

    # Rows with any missing value are dropped rather than imputed.
    dt = dt.dropna()
    dt = dt.set_index(['ticker', 'year'])

    formula = y[0] + '~1+' + x
    if len(fix) == 1:
        # The fix column is selected last above, so it is normally part of x
        # already; the explicit term is kept so the formula string matches the
        # one this method has always produced.
        formula += '+' + fix[0]

    mod = PanelOLS.from_formula(formula, data=dt)
    # Previously a non-empty `cluster` combined with an empty `fix` matched no
    # branch and the method returned None; that case is now fitted as well.
    return mod.fit(cov_type='clustered',
                   cluster_time=cluster_time,
                   cluster_entity=cluster_entity)
def cond_corr_e2_e1timesprize(df):
    """Correlate ``e2`` with ``e1timesprize`` after partialing out other effects.

    Each of the two variables is regressed with ``PanelOLS`` on ``e1``,
    ``prize``, the ``tt2`` .. ``tt10`` terms and entity effects; the function
    returns the correlation between the two residual series.
    """
    residuals = pd.DataFrame(columns=["e2_resid", "e1timesprize_resid"])

    for label in ("e2", "e1timesprize"):
        formula = f"{label}~e1+prize+tt2+tt3+tt4+tt5+tt6+tt7+tt8+tt9+tt10+EntityEffects"
        fitted = PanelOLS.from_formula(formula, data=df).fit()
        residuals.loc[:, f"{label}_resid"] = fitted.resids

    e2_part = residuals["e2_resid"]
    interaction_part = residuals["e1timesprize_resid"]
    return e2_part.corr(interaction_part)
def old_percentile_correlation(df, percentile=66):
    """Percentile of the per-subject correlation of residualised e2 and e1.

    ``e2`` and ``e1`` are each regressed with ``PanelOLS`` on ``prize``,
    ``e1timesprize`` and time effects. The residuals are correlated separately
    for every value of the ``subject`` index level, and the requested
    percentile of those correlations is returned.

    Args:
        df: Panel DataFrame whose index has a level named ``subject`` and which
            holds the columns referenced by the formulas.
        percentile: Percentile (0-100) passed to ``np.percentile``. Defaults to
            66, the value that used to be hard-coded.

    Returns:
        The value of ``np.percentile`` over the per-subject correlations.
    """
    df_resid = pd.DataFrame(columns=["e2_resid", "e1_resid"], index=df.index)
    for label in ["e2", "e1"]:
        formula = f"{label}~prize+e1timesprize+TimeEffects"
        df_resid.loc[:, f"{label}_resid"] = (
            PanelOLS.from_formula(formula, data=df).fit().resids
        )

    # Group on the index level directly instead of interpolating each label
    # into a query string, which only worked for labels that are valid
    # expression literals (e.g. numbers) and broke for string subjects.
    cond_corr = [
        group["e2_resid"].corr(group["e1_resid"])
        for _, group in df_resid.groupby(level="subject", sort=False)
    ]
    return np.percentile(cond_corr, percentile)
def process_data(tag, area_tag):
    """
    Process the averaged raster data and print an entity-effects panel fit.

    Reads ``<root>/<tag>/result/avg_data/avg_<area_tag>.tif`` through
    ``get_raster_band_array``, loads it into a DataFrame with the columns
    ``sday, eday, gsl, gdd, edd, pre``, indexes it by ``(eday, sday)`` and
    regresses ``gsl`` on ``gdd``, ``edd`` and ``pre`` with entity effects.

    :param tag: directory name under the root path
    :param area_tag: area identifier used in the raster file name
    :return: None; the fit summary is printed
    """
    root_path = getRootPath()
    tif_file = os.path.join(
        root_path, "{0}/result/avg_data/avg_{1}.tif".format(tag, area_tag))
    bandArray = get_raster_band_array(tif_file)

    df = pd.DataFrame(bandArray,
                      columns=["sday", "eday", "gsl", "gdd", "edd", "pre"])
    # The original called df.dropna() without keeping the result, so nothing
    # was dropped. Drop incomplete rows before the integer casts, because
    # astype(np.int64) cannot convert NaN.
    df = df.dropna()
    df["sday"] = df["sday"].astype(np.int64)
    df["eday"] = df["eday"].astype(np.int64)
    df = df.set_index(["eday", "sday"])

    print("-------- use EntityEffects ---------")
    mod = PanelOLS.from_formula('gsl ~ 1 + gdd + edd + pre + EntityEffects', df)
    res = mod.fit(cov_type='unadjusted')
    print(res)
def process_data(tag, area_tag):
    """
    Process the prediction data and print an entity-effects panel fit.

    Every ``.tif`` file found under ``<root>/<tag>/process/merge/<area_tag>``
    is read with ``get_raster_band_array``; the arrays are stacked row-wise
    into one DataFrame with the columns ``sday, eday, gsl, year, gdd, edd,
    pre``, indexed by ``(year, eday)``, and ``gsl`` is regressed on ``gdd``,
    ``edd`` and ``pre`` with entity effects.

    :param tag: directory name under the root path
    :param area_tag: area identifier; sub-directory holding the rasters
    :return: None; the fit summary is printed (nothing happens when no
        raster file is found)
    """
    print("process data area_tag: {}".format(area_tag))
    root_path = getRootPath()
    src_path = os.path.join(root_path,
                            "{0}/process/merge/{1}".format(tag, area_tag))
    tif_files = walkDirFile(src_path, ext=".tif")

    band_arrays = [get_raster_band_array(tif_file) for tif_file in tif_files]
    if not band_arrays:
        return
    # Stack once instead of re-stacking inside the loop, which copied the
    # accumulated data again for every file. A single array is used as-is,
    # exactly as before.
    if len(band_arrays) == 1:
        bandArray = band_arrays[0]
    else:
        bandArray = np.vstack(band_arrays)

    df = pd.DataFrame(
        bandArray,
        columns=["sday", "eday", "gsl", "year", "gdd", "edd", "pre"])
    # The original called df.dropna() without keeping the result, so nothing
    # was dropped. Drop incomplete rows before the integer casts, because
    # astype(np.int64) cannot convert NaN.
    df = df.dropna()
    df["sday"] = df["sday"].astype(np.int64)
    df["eday"] = df["eday"].astype(np.int64)
    df["year"] = df["year"].astype(np.int64)
    df = df.set_index(["year", "eday"])

    print("-------- use EntityEffects ---------")
    mod = PanelOLS.from_formula('gsl ~ 1 + gdd + edd + pre + EntityEffects', df)
    res = mod.fit(cov_type='unadjusted')
    print(res)
# Rescale volume by 1e-6, then keep only the 2018-2020 rows and the columns
# needed by the regressions below, discarding rows with missing values.
test['volume'] = test['volume'] / 1000000
test = test.loc[
    test['year'].isin(['2020', '2018', '2019']),
    [
        'year', 'ticker', 'assetclasslevel1', 'assetclasslevel2',
        'assetclasslevel3', 'cd', 'cdlag1', 'pd', 'volume', 'age'
    ],
].dropna()

# In[16]:

# Panel index: ticker is the entity level, year the time level.
test0 = test.set_index(['ticker', 'year'])

# assetclasslevel1 as the extra term, covariance clustered by time and ticker
mod = PanelOLS.from_formula(
    'cd ~ 1 + cdlag1 + volume + pd + age + assetclasslevel1',
    data=test0,
)
fit01 = mod.fit(cov_type='clustered', cluster_entity=True, cluster_time=True)

# assetclasslevel2 as the extra term, covariance clustered by time and ticker
mod = PanelOLS.from_formula(
    'cd ~ 1 + cdlag1 + volume + pd + age + assetclasslevel2',
    data=test0,
)
fit02 = mod.fit(cov_type='clustered', cluster_entity=True, cluster_time=True)

# assetclasslevel3 as the extra term, covariance clustered by time and ticker
mod = PanelOLS.from_formula(
    'cd ~ 1 + cdlag1 + volume + pd + age + assetclasslevel3',
    data=test0,
)
fit03 = mod.fit(cov_type='clustered', cluster_entity=True, cluster_time=True)

# year handled through TimeEffects; this model is fitted further below
mod = PanelOLS.from_formula(
    'cd ~ 1 + cdlag1 + volume + pd + TimeEffects',
    data=test0,
)
index='treat', columns='year', values='defor', aggfunc=np.sum)  # tail of a call that begins above this excerpt
# Number of 'defor' observations for every (treat, year) cell.
count = pd.pivot_table(data=defor_df, index='treat', columns='year', values='defor', aggfunc="count")
# Two-level index required by the panel models below: 'idx' is the outer
# (entity) level and 'year' the inner (time) level.
defor_df = defor_df.set_index(['idx', 'year'])

# =============================================================================
# Run regression to estimate treatment effect
# =============================================================================

## Simple diff in diff
# 'treat * post' expands to both main effects plus their interaction.
# NOTE(review): the formula has no explicit '1' term -- confirm whether a
# constant is intended here.
mod = PanelOLS.from_formula('defor ~ treat * post', defor_df)
# Standard errors clustered by entity (the 'idx' index level).
res = mod.fit(cov_type='clustered', cluster_entity=True)
print(res)

## Generalized did using two-way fixed effects
# Outer is entity, inner is time
# NOTE(review): PanelOLS is already used above, so this import is presumably
# redundant at this point -- confirm it is also imported at the top of the file.
from linearmodels.panel import PanelOLS
# Treatment indicator: product of the 'treat' and 'post' columns.
defor_df['t'] = defor_df['treat'] * defor_df['post']
mod = PanelOLS.from_formula('defor ~ t + EntityEffects + TimeEffects', defor_df)
res = mod.fit(cov_type='clustered', cluster_entity=True)
print(res)

### KEY OBSERVATION: FE estimator yields ~ (estimate of att) = diff + att while
### simple diff in diff yields ~ (estimate of att) = att
# Aggregate columns: "other" = rs_om + rs_of, "married" = marfem + marmale
autor["other"] = autor["rs_om"] + autor["rs_of"]
autor["married"] = autor["marfem"] + autor["marmale"]

# Categorical version of state, used in the state_c:t term of the formula
autor["state_c"] = pd.Categorical(autor["state"])

# (state, year) index for linearmodels; drop=False keeps both as columns too
autor = autor.set_index(["state", "year"], drop=False)

# Diff-in-diff regression with entity and time effects, clustered by entity
did = PanelOLS.from_formula(
    (
        "lnths ~"
        "1 +"
        "lnemp +"
        "admico_2 + admico_1 + admico0 + admico1 + admico2 + admico3 + mico4 +"
        "admppa_2 + admppa_1 + admppa0 + admppa1 + admppa2 + admppa3 + mppa4 +"
        "admgfa_2 + admgfa_1 + admgfa0 + admgfa1 + admgfa2 + admgfa3 + mgfa4 +"
        "state_c:t +"
        "EntityEffects + TimeEffects"
    ),
    data=autor,
    drop_absorbed=True,
).fit(cov_type='clustered', cluster_entity=True)

# Coefficients and 1.96 * standard-error half-widths, both scaled by 100,
# restricted to the rows whose label matches "admico|mico" (for a plot)
results_did = (
    pd.DataFrame({
        "coef": did.params * 100,
        "ci": 1.96 * did.std_errors * 100
    })
    .filter(regex="admico|mico", axis=0)
    .reset_index()
)
def __fitreg(self, dt, start_datetime, end_datetime, y, var_pit, var_norm, fix, cluster, c):
    """Winsorise the selected columns and fit a panel OLS of ``y[0]`` on them.

    The frame is restricted to rows with ``start_datetime <= date <=
    end_datetime``, reduced to the ``y`` columns, ``year``, ``ticker``, every
    column from position ``self.c`` onwards and the ``fix`` columns, stripped
    of rows with missing values and indexed by ``(ticker, year)``. The unique
    rows of the ``fix`` columns are stored on ``self.assetclass``. Every
    column except the ``fix`` columns is then clipped to its own quantiles at
    the standard-normal probabilities of -3 and +3 sigma. The frame summary
    and the fit result are printed.

    Args:
        dt: DataFrame with at least ``date``, ``year``, ``ticker``, the columns
            named in ``y`` and ``fix``, and the regressor columns.
        start_datetime: Inclusive lower bound compared against ``dt['date']``.
        end_datetime: Inclusive upper bound compared against ``dt['date']``.
        y: List of column names; ``y[0]`` is the dependent variable.
        var_pit: Not used by this method; kept for interface compatibility.
        var_norm: Not used by this method; kept for interface compatibility.
        fix: List with zero or one column name that is appended to the
            formula as an additional term.
        cluster: List drawn from ``'year'`` and ``'ticker'`` selecting the
            clustering of the covariance (``'year'`` -> ``cluster_time``,
            ``'ticker'`` -> ``cluster_entity``). May be empty.
        c: Not used by this method (the column offset is read from
            ``self.c``); kept for interface compatibility.

    Returns:
        The object returned by ``PanelOLS.fit``. With neither a fixed effect
        nor clustering the covariance type is ``'heteroskedastic'``; in every
        other case it is ``'clustered'``.

    Raises:
        KeyError: If more than one fixed effect is given, or if ``cluster``
            contains anything other than ``'year'`` / ``'ticker'``.
    """
    # Validate the request up front so bad arguments fail before any data work.
    if len(fix) > 1:
        raise KeyError(
            "You have {} fixed effects! Please pick one.".format(len(fix)))
    if any(name not in ('year', 'ticker') for name in cluster):
        raise KeyError("Please choose either year or ticker, or both.")
    cluster_time = 'year' in cluster
    cluster_entity = 'ticker' in cluster

    # filter dates
    in_window = (dt['date'] >= start_datetime) & (dt['date'] <= end_datetime)
    dt = dt.loc[in_window]

    # filter columns
    dt = dt[y + ['year', 'ticker'] + list(dt.columns[self.c:]) + fix]

    # choose x: everything after the first three selected columns.
    # NOTE(review): this assumes len(y) == 1 so that the first three columns
    # are y[0], 'year' and 'ticker' -- confirm against callers.
    x = '+'.join(dt.columns[3:])

    # Rows with any missing value are dropped rather than imputed.
    dt = dt.dropna()
    dt = dt.set_index(['ticker', 'year'])

    self.assetclass = dt[fix].drop_duplicates().reset_index(drop=True)

    # Winsorise before running the regression: clip each column to its own
    # quantiles taken at the standard-normal probabilities of -/+ 3 sigma.
    # The fix column is skipped by name; the original skipped "the last
    # column", which left a regressor untouched whenever `fix` was empty.
    lower_q = stats.norm.cdf(-3)
    upper_q = stats.norm.cdf(3)
    for col in dt.columns:
        if col in fix:
            continue
        lb = dt[col].quantile(lower_q)
        ub = dt[col].quantile(upper_q)
        dt[col] = dt[col].clip(lower=lb, upper=ub)

    # DataFrame.info() prints by itself and returns None, so it must not be
    # wrapped in print().
    dt.info()

    formula = y[0] + '~1+' + x
    if len(fix) == 1:
        # The fix column is selected last above, so it is normally part of x
        # already; the explicit term is kept so the formula string matches the
        # one this method has always produced.
        formula += '+' + fix[0]
    mod = PanelOLS.from_formula(formula, data=dt)

    if len(fix) == 0 and len(cluster) == 0:
        # Cluster switches only apply to the clustered estimator, so they are
        # not forwarded here.
        fit1 = mod.fit(cov_type='heteroskedastic')
    else:
        # Previously a non-empty `cluster` combined with an empty `fix`
        # matched no branch and the method returned None; that case is now
        # fitted as well.
        fit1 = mod.fit(cov_type='clustered',
                       cluster_time=cluster_time,
                       cluster_entity=cluster_entity)
    print(fit1)
    return fit1