def build_crsp_m(permnos): crsp_m = conn.raw_sql(f""" select a.permno, a.permco, a.date, b.ticker, b.ncusip, b.shrcd, b.exchcd, b.siccd, a.prc, a.ret, a.retx, a.shrout, a.vol from crsp.msf as a left join crsp.msenames as b on a.permno=b.permno and b.namedt<=a.date and a.date<=b.nameendt where a.permno in {permnos} and b.exchcd between 1 and 3 and b.shrcd between 10 and 11 """) crsp_m['me'] = crsp_m['prc'].abs()*crsp_m['shrout'] crsp_m['prca'] = crsp_m['prc'].abs() crsp_m['lprc'] = crsp_m.groupby(['permno','permco'])['prca'].shift(1) crsp_m['lme'] = crsp_m.groupby(['permno','permco'])['me'].shift(1) crsp_m[['permco', 'permno', 'shrcd', 'exchcd']] = crsp_m[['permco', 'permno', 'shrcd', 'exchcd']].astype(int) # Line up date to be end of month crsp_m['date'] = pd.to_datetime(crsp_m['date']) crsp_m['jdate'] = crsp_m['date']+MonthEnd(0) return crsp_m
def build_compq(permnos): gvkeys = permnos_to_gvkeys(permnos) compq = conn.raw_sql(f""" select fyearq, fqtr, apdedateq, datadate, pdateq, fdateq, f.gvkey, REVTQ, REQ, EPSPIQ, ACTQ, INVTQ, LCTQ, CHQ, CSHOQ, PRCCQ, NIQ, ATQ, LTQ, GDWLQ from comp.fundq as f where f.gvkey in {gvkeys} and REVTQ != 'NaN' """) compq.fillna(value=np.nan, inplace=True) compq.dropna(axis='rows', how='any', subset=['fyearq', 'fqtr'], inplace=True) compq['fyearq'].astype(int) compq['fqtr'].astype(int) compq['datadate'] = pd.to_datetime(compq['datadate']) compq['permno'] = compq['gvkey'].apply(gvkey_to_permno) compq['quickq'] = (compq['actq'] - compq['invtq']) / compq['lctq'] compq['curratq'] = compq['actq'] / compq['lctq'] compq['cashrratq'] = compq['chq'] / compq['lctq'] compq['peq'] = (compq['cshoq'] * compq['prccq']) / compq['niq'] compq['roeq'] = compq['niq'] / (compq['prccq'] * compq['prccq']) compq['roaq'] = compq['niq'] / (compq['atq'] - compq['ltq']) return compq
def build_ccm_data(permnos, comp, crsp_jun): ccm = conn.raw_sql(f""" select gvkey, lpermno as permno, linktype, linkprim, linkdt, linkenddt from crsp.ccmxpf_linktable where lpermno in {permnos} and substr(linktype,1,1)='L' and linkprim in ('P', 'C') """) ccm['linkdt'] = pd.to_datetime(ccm['linkdt']) ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt']) # if linkenddt is missing then set to next month's june (Changed) ccm['linkenddt'] = ccm['linkenddt'].fillna(pd.to_datetime('today')+YearEnd(0)+MonthEnd(6)) ccm['linkenddt'] = ccm['linkenddt'].dt.date ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt']) ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey']) ccm1['yearend'] = ccm1['datadate']+YearEnd(0) ccm1['jdate'] = ccm1['yearend']+MonthEnd(6) ccm2 = ccm1[(ccm1['jdate'] >= ccm1['linkdt']) & (ccm1['jdate'] <= (ccm1['linkenddt']))] ccm2 = ccm2.drop(columns=['datadate_a','linktype','linkdt','linkenddt']) ccm_data = pd.merge(ccm2, crsp_jun, how='left', on=['permno', 'jdate']) ccm_data = ccm_data[ccm_data.dec_me != 0] ccm_data['beme'] = ccm_data['be']*1000/ccm_data['dec_me'] # drop duplicates ccm_data = ccm_data.sort_values(by=['permno', 'date']).drop_duplicates() ccm_data = ccm_data.sort_values(by=['gvkey', 'date']).drop_duplicates() # Note: Different from SAS, Python count start from zero, will see if I need to add 1 to better serve the need ccm_data['count'] = ccm_data.groupby(['gvkey']).cumcount() # Parallel to the cleaning step for 'dr' ccm_data['dr'] = np.where(ccm_data.drc.notna() & ccm_data.drlt.notna(), ccm_data.drc+ccm_data.drlt, None) ccm_data['dr'] = np.where(ccm_data.drc.notna() & ccm_data.drlt.isna(), ccm_data.drc, ccm_data['dr']) ccm_data['dr'] = np.where(ccm_data.drc.isna() & ccm_data.drlt.notna(), ccm_data.drlt, ccm_data['dr']) # Parallel to the cleaning step for 'dc' ccm_data.loc[(ccm_data['dcvt'].isna()) & (ccm_data['dcpstk'].notna()) & (ccm_data['pstk'].notna()) & (ccm_data['dcpstk'] > ccm_data['pstk']), 'dc'] = ccm_data['dcpstk'] - ccm_data['pstk'] ccm_data.loc[(ccm_data['dcvt'].isna()) & (ccm_data['dcpstk'].notna()) & (ccm_data['pstk'].isna()), 'dc'] = ccm_data['dcpstk'] ccm_data.loc[(ccm_data['dc'].isna()), 'dc'] = ccm_data['dcvt'] ccm_data['xint'] = ccm_data['xint'].fillna(0) ccm_data['xsga'] = ccm_data['xsga'].fillna(0) ccm_data = ccm_data.sort_values(by=['permno', 'date']).drop_duplicates() ccm_data.fillna(value=pd.np.nan, inplace=True) return ccm_data
def build_dlret(permnos): dlret = conn.raw_sql(f""" select permno, dlret, dlstdt from crsp.msedelist where permno in {permnos} """) dlret.permno = dlret.permno.astype(int) # Line up date to be end of month dlret['dlstdt'] = pd.to_datetime(dlret['dlstdt']) dlret['jdate'] = dlret['dlstdt']+MonthEnd(0) return dlret
def build_comp(permnos): gvkeys = permnos_to_gvkeys(permnos) comp = conn.raw_sql(f""" select fyear, apdedate, datadate, pdate, fdate, c.gvkey, f.cusip as cnum, datadate as datadate_a, c.cik, sic as sic2, sic, naics, sale, revt, cogs, xsga, xrd, xad, ib, ebitda, ebit, nopi, spi, pi, txp, ni, txfed, txfo, txt, xint, capx, oancf, dvt, ob, gdwlia, gdwlip, gwo, rect, act, che, ppegt, invt, at, aco, intan, ao, ppent, gdwl, fatb, fatl, lct, dlc, dltt, lt, dm, dcvt, cshrc, dcpstk, pstk, ap, lco, lo, drc, drlt, txdi, ceq, scstkc, emp, csho, /*addition*/ pstkrv, pstkl, txditc, datadate as year, /*market*/ abs(prcc_f) as prcc_f, csho*prcc_f as mve_f, /*HXZ*/ am, ajex, txdb, seq, dvc, dvp, dp, dvpsx_f, mib, ivao, ivst, sstk, prstkc, dv, dltis, dltr, dlcch, oibdp, dvpa, tstkp, oiadp, xpp, xacc, re, ppenb, ppenls, capxv, fopt, wcap from comp.names as c, comp.funda as f where c.gvkey in {gvkeys} and f.gvkey=c.gvkey /*get consolidated, standardized, industrial format statements*/ and f.indfmt='INDL' and f.datafmt='STD' and f.popsrc='D' and f.consol='C' """) comp.cnum = comp.cnum.replace(' ', '').str.slice(0, 6) comp.sic2 = comp.sic2 + '12' comp.apdedate = pd.to_datetime(comp.apdedate) comp.datadate = pd.to_datetime(comp.datadate) comp.pdate = pd.to_datetime(comp.pdate) comp.fdate = pd.to_datetime(comp.fdate) comp.year = comp['datadate'].dt.year comp = comp.dropna(subset=['at', 'prcc_f', 'ni']) comp['ps_beme'] = np.where(comp['pstkrv'].isnull(), comp['pstkl'], comp['pstkrv']) comp['ps_beme'] = np.where(comp['ps_beme'].isnull(), comp['pstk'], comp['ps_beme']) comp['ps_beme'] = np.where(comp['ps_beme'].isnull(), 0, comp['ps_beme']) comp['txditc'] = comp['txditc'].fillna(0) comp['be'] = comp['ceq']+comp['txditc']-comp['ps_beme'] comp['be'] = np.where(comp['be'] > 0, comp['be'], None) comp = comp.sort_values(by=['gvkey', 'datadate']).drop_duplicates(['gvkey', 'datadate']) comp['count'] = comp.groupby(['gvkey']).cumcount() return comp
def construct_daily(business_day, permno): assert isinstance(permno, (list, str)), 'invalid permno data type' if isinstance(permno, list): assert len(permno) != 0, 'zero permno list length' if isinstance(permno, str): permno = list([permno]) permno = tuple(permno) daily_df = conn.raw_sql(f""" select a.date, a.permno, b.ticker, b.shrcd, b.siccd, a.ret, abs(a.prc) as prc, a.shrout, a.cfacpr, a.cfacshr from crsp.dsf as a left join crsp.msenames as b on a.permno = b.permno and b.namedt <= a.date and a.date <= b.nameendt and a.date = '{business_day}' where b.permno in {permno if len(permno) > 1 else '(' + permno[0] + ')'} """) return daily_df
def build_compa(permnos): gvkeys = permnos_to_gvkeys(permnos) compa = conn.raw_sql(f""" select fyear, apdedate, datadate, pdate, fdate, f.gvkey, REVT, EBIT, EBITDA, RE, EPSPI, GP, OPINCAR, ACT, INVT, LCT, CH, OANCF, DVP, DVC, PRSTKC, NI, CSHO, PRCC_F, mkvalt, BKVLPS, AT, LT, DVT, ICAPT, XINT, DLCCH, DLTT, GDWL, GWO, CAPX, DLC, SEQ from comp.funda as f where f.gvkey in {gvkeys} and REVT != 'NaN' and f.indfmt='INDL' and f.datafmt='STD' and f.popsrc='D' and f.consol='C' """) compa.fillna(value=np.nan, inplace=True) compa['fyear'].astype(int) compa['fqtr'] = 4 compa['datadate'] = pd.to_datetime(compa['datadate']) compa['permno'] = compa['gvkey'].apply(gvkey_to_permno) compa['gma'] = compa['gp'] / compa['revt'] compa['operprof'] = compa['opincar'] / compa['revt'] compa['quick'] = (compa['act'] - compa['invt']) / compa['lct'] compa['currat'] = compa['act'] / compa['lct'] compa['cashrrat'] = compa['ch'] / compa['lct'] compa['cftrr'] = compa['oancf'] / compa['revt'] compa['dpr'] = (compa['dvp'] + compa['dvc'] + compa['prstkc']) / compa['ni'] compa['pe'] = (compa['csho'] * compa['prcc_f']) / compa['ni'] compa['pb'] = (compa['mkvalt']) / (compa['bkvlps']) compa['roe'] = compa['ni'] / (compa['csho'] * compa['prcc_f']) compa['roa'] = compa['ni'] / (compa['at'] - compa['lt']) compa['roic'] = (compa['ni'] - compa['dvt']) / compa['icapt'] compa['cod'] = compa['xint'] / (compa['dlcch'] + compa['dltt']) compa['capint'] = compa['capx'] / compa['at'] compa['lev'] = (compa['dltt'] + compa['dlc']) / compa['seq'] return compa
def build_ccm_jun(ccm_data): ccm_jun = ccm_data.copy() ccm_jun['mve6b'] = ccm_jun['dec_me'] ccm_jun['ep'] = ccm_jun.ib/ccm_jun.mve_f ccm_jun['cashpr'] = (ccm_jun.mve_f+ccm_jun.dltt-ccm_jun['at'])/ccm_jun.che ccm_jun['dy'] = ccm_jun.dvt/ccm_jun.mve_f ccm_jun['lev'] = ccm_jun['lt']/ccm_jun.mve_f ccm_jun['sp'] = ccm_jun.sale/ccm_jun.mve_f ccm_jun['roic'] = (ccm_jun.ebit-ccm_jun.nopi)/(ccm_jun.ceq+ccm_jun['lt']-ccm_jun.che) ccm_jun['rd_sale'] = ccm_jun.xrd/ccm_jun.sale ccm_jun['rd_mve'] = ccm_jun['xrd']/ccm_jun['mve_f'] ccm_jun['sp'] = ccm_jun.sale/ccm_jun.mve_f #duplicate? # treatment for lagged terms ccm_jun['lagat']=ccm_jun.groupby(['permno'])['at'].shift(1) ccm_jun['lat'] = ccm_jun['lagat'] ccm_jun['lagcsho']=ccm_jun.groupby(['permno'])['csho'].shift(1) ccm_jun['laglt']=ccm_jun.groupby(['permno'])['lt'].shift(1) ccm_jun['lagact']=ccm_jun.groupby(['permno'])['act'].shift(1) ccm_jun['lagche']=ccm_jun.groupby(['permno'])['che'].shift(1) ccm_jun['lagdlc']=ccm_jun.groupby(['permno'])['dlc'].shift(1) ccm_jun['lagtxp']=ccm_jun.groupby(['permno'])['txp'].shift(1) ccm_jun['laglct']=ccm_jun.groupby(['permno'])['lct'].shift(1) ccm_jun['laginvt']=ccm_jun.groupby(['permno'])['invt'].shift(1) ccm_jun['lagemp']=ccm_jun.groupby(['permno'])['emp'].shift(1) ccm_jun['lagsale']=ccm_jun.groupby(['permno'])['sale'].shift(1) ccm_jun['lagib']=ccm_jun.groupby(['permno'])['ib'].shift(1) ccm_jun['lag2at']=ccm_jun.groupby(['permno'])['at'].shift(2) ccm_jun['lagrect']=ccm_jun.groupby(['permno'])['rect'].shift(1) ccm_jun['lagcogs']=ccm_jun.groupby(['permno'])['cogs'].shift(1) ccm_jun['lagxsga']=ccm_jun.groupby(['permno'])['xsga'].shift(1) ccm_jun['lagppent']=ccm_jun.groupby(['permno'])['ppent'].shift(1) ccm_jun['lagdp']=ccm_jun.groupby(['permno'])['dp'].shift(1) ccm_jun['lagxad']=ccm_jun.groupby(['permno'])['xad'].shift(1) ccm_jun['lagppegt']=ccm_jun.groupby(['permno'])['ppegt'].shift(1) ccm_jun['lagceq']=ccm_jun.groupby(['permno'])['ceq'].shift(1) ccm_jun['lagcapx']=ccm_jun.groupby(['permno'])['capx'].shift(1) ccm_jun['lag2capx']=ccm_jun.groupby(['permno'])['capx'].shift(2) ccm_jun['laggdwl']=ccm_jun.groupby(['permno'])['gdwl'].shift(1) ccm_jun['lagdvt']=ccm_jun.groupby(['permno'])['dvt'].shift(1) ccm_jun['lagob']=ccm_jun.groupby(['permno'])['ob'].shift(1) ccm_jun['lagaco']=ccm_jun.groupby(['permno'])['aco'].shift(1) ccm_jun['lagintan']=ccm_jun.groupby(['permno'])['intan'].shift(1) ccm_jun['lagao']=ccm_jun.groupby(['permno'])['ao'].shift(1) ccm_jun['lagap']=ccm_jun.groupby(['permno'])['ap'].shift(1) ccm_jun['laglco']=ccm_jun.groupby(['permno'])['lco'].shift(1) ccm_jun['laglo']=ccm_jun.groupby(['permno'])['lo'].shift(1) ccm_jun['lagdr']=ccm_jun.groupby(['permno'])['dr'].shift(1) ccm_jun['lagxrd']=ccm_jun.groupby(['permno'])['xrd'].shift(1) ccm_jun['lagni']=ccm_jun.groupby(['permno'])['ni'].shift(1) ccm_jun['lagdltt']=ccm_jun.groupby(['permno'])['dltt'].shift(1) ccm_jun['agr']=np.where(ccm_jun['at'].isna() | ccm_jun.lagat.isna(), np.NaN, (ccm_jun.lagat-ccm_jun['at'])/ccm_jun.lagat) ccm_jun['gma']=(ccm_jun['revt']-ccm_jun['cogs'])/ccm_jun['lagat'] ccm_jun['chcsho']=ccm_jun.csho/ccm_jun.lagcsho -1 ccm_jun['lgr']=ccm_jun['lt']/ccm_jun.laglt -1 ccm_jun['acc']=(ccm_jun.ib-ccm_jun.oancf)/(ccm_jun['at']+ccm_jun.lagat) * 2 ccm_jun.loc[ccm_jun['oancf'].isna(), 'acc'] = (((ccm_jun['act'] - lag(ccm_jun, 'act')) - (ccm_jun['che'] - lag(ccm_jun, 'che'))) \ - (((ccm_jun['lct'] - lag(ccm_jun,'lct')) - (ccm_jun['dlc'] - lag(ccm_jun,'dlc')) - (ccm_jun['txp'] - lag(ccm_jun, 'txp'))) - ccm_jun['dp'])) \ / ((ccm_jun['at']+lag(ccm_jun,'at'))/2) ccm_jun.loc[ccm_jun['ib'] != 0, 'pctacc'] = (ccm_jun['ib'] - ccm_jun['oancf'])/np.abs(ccm_jun['ib']) ccm_jun.loc[ccm_jun['ib'] == 0, 'pctacc'] = (ccm_jun['ib'] - ccm_jun['oancf'])/(0.01) ccm_jun.loc[(ccm_jun['oancf'].isna()) & (ccm_jun['ib'] != 0), 'pctacc'] = (((ccm_jun['act'] - lag(ccm_jun,'act')) - (ccm_jun['che'] - lag(ccm_jun,'che'))) \ - ((ccm_jun['lct'] - lag(ccm_jun,'lct'))-(ccm_jun['dlc']-lag(ccm_jun,'dlc')) - (ccm_jun['txp'] - lag(ccm_jun,'txp')) - ccm_jun['dp'] ))/np.abs(ccm_jun['ib']) ccm_jun.loc[(ccm_jun['oancf'].isna()) & (ccm_jun['ib'] == 0), 'pctacc'] = (((ccm_jun['act'] - lag(ccm_jun,'act')) - (ccm_jun['che'] - lag(ccm_jun,'che'))) \ - ((ccm_jun['lct'] - lag(ccm_jun,'lct'))-(ccm_jun['dlc']-lag(ccm_jun,'dlc')) - (ccm_jun['txp'] - lag(ccm_jun,'txp')) - ccm_jun['dp'] ))/0.01 ccm_jun['cfp']= (ccm_jun['ib']-(ccm_jun['act']-ccm_jun['lagact']-(ccm_jun['che']-ccm_jun['lagche'])))\ -(ccm_jun['lct']-ccm_jun['laglct']-(ccm_jun['dlc']-ccm_jun['lagdlc'])\ -(ccm_jun['txp']-ccm_jun['lagtxp'])-ccm_jun['dp'])/ccm_jun['mve_f'] ccm_jun['cfp']=np.where(ccm_jun['oancf'].notna(),ccm_jun['oancf']/ccm_jun['mve_f'], ccm_jun['cfp']) ccm_jun['absacc']=ccm_jun['acc'].abs() ccm_jun['chinv']=2*(ccm_jun['invt']-ccm_jun['laginvt'])/(ccm_jun['at']+ccm_jun['lagat']) ccm_jun['spii']=np.where((ccm_jun['spi']!=0)&ccm_jun['spi'].notna(), 1, 0) ccm_jun['spi']=2*ccm_jun['spi']/(ccm_jun['at']+ccm_jun['lagat']) ccm_jun['cf']=2*ccm_jun['oancf']/(ccm_jun['at']+ccm_jun['lagat']) ccm_jun['cf']=np.where(ccm_jun['oancf'].isna(), (ccm_jun['ib']-(ccm_jun['act']-ccm_jun['lagact']-(ccm_jun['che']-ccm_jun['lagche'])))\ -(ccm_jun['lct']-ccm_jun['laglct']-(ccm_jun['dlc']-ccm_jun['lagdlc'])\ -(ccm_jun['txp']-ccm_jun['lagtxp'])-ccm_jun['dp'])/((ccm_jun['at']+ccm_jun['lagat'])/2),ccm_jun['cf']) ccm_jun['hire']=(ccm_jun['emp']-ccm_jun['lagemp'])/ccm_jun['lagemp'] ccm_jun['hire']=np.where(ccm_jun['emp'].isna() | ccm_jun['lagemp'].isna(), 0, ccm_jun['hire']) ccm_jun['sgr']=ccm_jun['sale']/ccm_jun['lagsale'] -1 ccm_jun['chpm']=ccm_jun['ib']/ccm_jun['sale']-ccm_jun['lagib']/ccm_jun['lagsale'] ccm_jun['chato']=(ccm_jun['sale']/((ccm_jun['at']+ccm_jun['lagat'])/2)) - (ccm_jun['lagsale']/((ccm_jun['lagat']+ccm_jun['lag2at'])/2)) ccm_jun['pchsale_pchinvt']=((ccm_jun['sale']-(ccm_jun['lagsale']))/(ccm_jun['lagsale']))-((ccm_jun['invt']-(ccm_jun['laginvt']))/(ccm_jun['laginvt'])) ccm_jun['pchsale_pchrect']=((ccm_jun['sale']-(ccm_jun['lagsale']))/(ccm_jun['lagsale']))-((ccm_jun['rect']-(ccm_jun['lagrect']))/(ccm_jun['lagrect'])) ccm_jun['pchgm_pchsale']=(((ccm_jun['sale']-ccm_jun['cogs'])-((ccm_jun['lagsale'])-(ccm_jun['lagcogs'])))/((ccm_jun['lagsale'])-(ccm_jun['lagcogs'])))-((ccm_jun['sale']-(ccm_jun['lagsale']))/(ccm_jun['lagsale'])) ccm_jun['pchsale_pchxsga']=((ccm_jun['sale']-(ccm_jun['lagsale']))/(ccm_jun['lagsale']) )-((ccm_jun['xsga']\ -(ccm_jun['lagxsga'])) /(ccm_jun['lagxsga']) ) ccm_jun['depr']=ccm_jun['dp']/ccm_jun['ppent'] ccm_jun['pchdepr']=((ccm_jun['dp']/ccm_jun['ppent'])-((ccm_jun['lagdp'])/(ccm_jun['lagppent'])))/((ccm_jun['lagdp'])/(ccm_jun['lagppent'])) ccm_jun['chadv']=np.log(1+ccm_jun['xad'])-np.log((1+(ccm_jun['lagxad']))) ccm_jun['invest']=((ccm_jun['ppegt']-(ccm_jun['lagppegt'])) + (ccm_jun['invt']-(ccm_jun['laginvt'])) ) / (ccm_jun['lagat']) ccm_jun['invest']=np.where(ccm_jun['ppegt'].isna(), ((ccm_jun['ppent']-(ccm_jun['lagppent'])) + (ccm_jun['invt']-(ccm_jun['laginvt'])) ) / (ccm_jun['lagat']), ccm_jun['invest']) ccm_jun['egr']=((ccm_jun['ceq']-(ccm_jun['lagceq']))/(ccm_jun['lagceq'])) ccm_jun['capx']=np.where(ccm_jun['capx'].isna() & ccm_jun['count']>=1,ccm_jun['ppent']-(ccm_jun['lagppent']), ccm_jun['capx']) ccm_jun['pchcapx']=(ccm_jun['capx']-ccm_jun['lagcapx'])/ccm_jun['lagcapx'] ccm_jun['grcapx']=(ccm_jun['capx']-ccm_jun['lag2capx'])/ccm_jun['lag2capx'] ccm_jun['grGW'] = np.nan ccm_jun['grGW']=(ccm_jun['gdwl']-lag(ccm_jun, 'gdwl'))/lag(ccm_jun, 'gdwl') ccm_jun['grGW']=np.where((ccm_jun['gdwl'].isna()) | (ccm_jun['gdwl']==0), 0, ccm_jun['grGW']) ccm_jun['grGW']=np.where((ccm_jun['gdwl'].notna()) & (ccm_jun['gdwl']!=0) & (ccm_jun['grGW'].isna()), 1, ccm_jun['grGW']) ccm_jun['woGW']=np.where((ccm_jun['gdwlia'].notna()&ccm_jun['gdwlia']!=0)|(ccm_jun['gdwlip'].notna()&(ccm_jun['gdwlip']!=0))|\ (ccm_jun['gwo'].notna()&ccm_jun['gwo']!=0) , 1, 0) ccm_jun['tang']=(ccm_jun['che']+ccm_jun['rect']*0.715+ccm_jun['invt']*0.547+ccm_jun['ppent']*0.535)/ccm_jun['at'] ccm_jun['sic']=ccm_jun['sic'].astype(int) ccm_jun['sin']=np.where(ccm_jun['sic'].between(2100,2199) | ccm_jun['sic'].between(2080,2085) | (ccm_jun['naics'].isin(['7132', '71312', \ '713210', '71329', '713290', '72112', '721120'])), 1, 0) ccm_jun['act']=np.where(ccm_jun['act'].isna(), ccm_jun['che']+ccm_jun['rect']+ccm_jun['invt'],ccm_jun['act']) ccm_jun['lct']=np.where(ccm_jun['lct'].isna(), ccm_jun['ap'], ccm_jun['lct']) ccm_jun['currat']=ccm_jun['act']/ccm_jun['lct'] ccm_jun['pchcurrat']= ((ccm_jun['act']/ccm_jun['lct']) - (lag(ccm_jun, 'act')/lag(ccm_jun, 'lct'))) / (lag(ccm_jun,'act')/lag(ccm_jun,'lct')) ccm_jun['quick']=(ccm_jun['act']-ccm_jun['invt'])/ccm_jun['lct'] ccm_jun['pchquick']=((ccm_jun['act']-ccm_jun['invt'])/ccm_jun['lct'] - ((ccm_jun['lagact'])-(ccm_jun['laginvt']))/(ccm_jun['laglct']) )/ (((ccm_jun['lagact'])-(ccm_jun['laginvt']))/(ccm_jun['laglct'])) ccm_jun['salecash']=ccm_jun['sale']/ccm_jun['che'] ccm_jun['salerec']=ccm_jun['sale']/ccm_jun['rect'] ccm_jun['saleinv']=ccm_jun['sale']/ccm_jun['invt'] ccm_jun['pchsaleinv']=((ccm_jun['sale']/ccm_jun['invt'])-((ccm_jun['lagsale'])/(ccm_jun['laginvt'])) ) / ((ccm_jun['lagsale'])/(ccm_jun['laginvt'])) ccm_jun['cashdebt']=(ccm_jun['ib']+ccm_jun['dp'])/((ccm_jun['lt']+(ccm_jun['laglt']))/2) ccm_jun['realestate']=(ccm_jun['fatb']+ccm_jun['fatl'])/ccm_jun['ppegt'] ccm_jun['realestate']=np.where(ccm_jun['ppegt'].isna(), (ccm_jun['fatb']+ccm_jun['fatl'])/ccm_jun['ppent'], ccm_jun['realestate']) ccm_jun['divi']=np.where((ccm_jun['dvt'].notna() & ccm_jun['dvt']>0) & ((ccm_jun['lagdvt'])==0 | (ccm_jun['lagdvt'].isna())),1,0) ccm_jun['divo']=np.where((ccm_jun['dvt'].isna() | ccm_jun['dvt']==0) & ((ccm_jun['lagdvt'])>0 & (ccm_jun['lagdvt'].notna())),1,0) ccm_jun['obklg']=ccm_jun['ob']/((ccm_jun['at']+(ccm_jun['lagat']))/2) ccm_jun['chobklg']=(ccm_jun['ob']-(ccm_jun['lagob']))/((ccm_jun['at']+(ccm_jun['lagat']))/2) ccm_jun['securedind']=np.where(ccm_jun['dm'].notna() &ccm_jun['dm']!=0, 1, 0) ccm_jun['secured']=ccm_jun['dm']/ccm_jun['dltt'] ccm_jun['convind']=np.where((ccm_jun['dc'].notna() & ccm_jun['dc']!=0) | (ccm_jun['cshrc'].notna() & ccm_jun['cshrc']!=0) , 1, 0) ccm_jun['dc']=ccm_jun['dc'].astype(float) ## There will be inf in the result ccm_jun['conv']=ccm_jun['dc']/ccm_jun['dltt'] ccm_jun['grltnoa']=((ccm_jun['rect']+ccm_jun['invt']+ccm_jun['ppent']+ccm_jun['aco']+ccm_jun['intan']+ccm_jun['ao']-ccm_jun['ap']-ccm_jun['lco']-ccm_jun['lo'])\ -((ccm_jun['lagrect'])+(ccm_jun['laginvt'])+(ccm_jun['lagppent'])+(ccm_jun['lagaco'])+(ccm_jun['lagintan'])+(ccm_jun['lagao'])-(ccm_jun['lagap'])\ -(ccm_jun['laglco'])-(ccm_jun['laglo'])) -(ccm_jun['rect']-(ccm_jun['lagrect'])+ccm_jun['invt']-(ccm_jun['laginvt'])+ccm_jun['aco']-(ccm_jun['lagaco'])\ -(ccm_jun['ap']-(ccm_jun['lagap'])+ccm_jun['lco']-(ccm_jun['laglco'])) -ccm_jun['dp']))/((ccm_jun['at']+(ccm_jun['lagat']))/2) ccm_jun['chdrc']=(ccm_jun['dr']-(ccm_jun['lagdr']))/((ccm_jun['at']+(ccm_jun['lagat']))/2) ccm_jun['xrd/lagat']=ccm_jun['xrd']/(ccm_jun['lagat']) ccm_jun['lag(xrd/lagat)']=ccm_jun.groupby(['permno'])['xrd/lagat'].shift(1) ccm_jun['rd']=np.where(((ccm_jun['xrd']/ccm_jun['at'])-ccm_jun['lag(xrd/lagat)'])/ccm_jun['lag(xrd/lagat)']>0.05, 1, 0) ccm_jun['rdbias']=(ccm_jun['xrd']/(ccm_jun['lagxrd']))-1-ccm_jun['ib']/(ccm_jun['lagceq']) ccm_jun['roe']=ccm_jun['ib']/(ccm_jun['lagceq']) ccm_jun['ps_beme']=np.where(ccm_jun['pstkrv'].isnull(), ccm_jun['pstkl'], ccm_jun['pstkrv']) ccm_jun['ps_beme']=np.where(ccm_jun['ps_beme'].isnull(),ccm_jun['pstk'], ccm_jun['ps_beme']) ccm_jun['ps_beme']=np.where(ccm_jun['ps_beme'].isnull(),0,ccm_jun['ps_beme']) ccm_jun['txditc']=ccm_jun['txditc'].fillna(0) ccm_jun['be']=ccm_jun['ceq']+ccm_jun['txditc']-ccm_jun['ps_beme'] ccm_jun['be']=np.where(ccm_jun['be']>0,ccm_jun['be'],np.NaN) ccm_jun['operprof']=np.where(ccm_jun['be'].notna() & ccm_jun['revt'].notna() & (ccm_jun['cogs'].notna() | ccm_jun['xsga'].notna() | ccm_jun['xint'].notna()),\ (ccm_jun['revt']-ccm_jun['cogs']-ccm_jun['xsga']-ccm_jun['xint'])/ccm_jun['be'], np.nan) ccm_jun['ps']=(ccm_jun['ni']>0).astype(int) +(ccm_jun['oancf']>0)+(ccm_jun['ni']/ccm_jun['at'] > (ccm_jun['lagni'])/(ccm_jun['lagat']))+(ccm_jun['oancf']>ccm_jun['ni'])+(ccm_jun['dltt']/ccm_jun['at'] < (ccm_jun['lagdltt'])/(ccm_jun['lagat']))\ +(ccm_jun['act']/ccm_jun['lct'] > (ccm_jun['lagact'])/(ccm_jun['laglct'])) +((ccm_jun['sale']-ccm_jun['cogs'])/ccm_jun['sale'] > ((ccm_jun['lagsale'])-(ccm_jun['lagcogs']))/(ccm_jun['lagsale']))\ + (ccm_jun['sale']/ccm_jun['at'] > (ccm_jun['lagsale'])/(ccm_jun['lagat']))+ (ccm_jun['scstkc']==0) def tr_fyear(row): if row['fyear']<=1978: value = 0.48 elif row['fyear']<=1986: value = 0.46 elif row['fyear']==1987: value = 0.4 elif row['fyear']>=1988 and row['fyear']<=1992: value = 0.34 elif row['fyear']>=1993: value = 0.35 else: value='' return value ccm_jun['tr']=ccm_jun.apply(tr_fyear, axis=1) ccm_jun['tb_1']=((ccm_jun['txfo']+ccm_jun['txfed'])/ccm_jun['tr'])/ccm_jun['ib'] ccm_jun['tb_1']=np.where(ccm_jun['txfo'].isna() | ccm_jun['txfed'].isna(),((ccm_jun['txt']+ccm_jun['txdi'])/ccm_jun['tr'])/ccm_jun['ib'], ccm_jun['tb_1']) # if (txfo+txfed>0 or txt>txdi) and ib<=0 then # tb_1=1; #!!! Caution that for condition, when using | and &, one must apply parenthesis ccm_jun['tb_1']=np.where(((ccm_jun['txfo']+ccm_jun['txfed'])>0 | (ccm_jun['txt']>ccm_jun['txdi'])) & (ccm_jun['ib']<=0), 1, ccm_jun['tb_1']) # *variables that will be used in subsequent steps to get to final RPS; # *--prep for for Mohanram (2005) score; # roa=ni/((at+lag(at))/2); # cfroa=oancf/((at+lag(at))/2); ccm_jun['roa']=ccm_jun['ni']/((ccm_jun['at']+(ccm_jun['lagat']))/2) ccm_jun['cfroa']=ccm_jun['oancf']/((ccm_jun['at']+(ccm_jun['lagat']))/2) # if missing(oancf) then # cfroa=(ib+dp)/((at+lag(at))/2); # xrdint=xrd/((at+lag(at))/2); # capxint=capx/((at+lag(at))/2); # xadint=xad/((at+lag(at))/2); ccm_jun['cfroa']=np.where(ccm_jun['oancf'].isna(),ccm_jun['ib']+ccm_jun['dp'] /((ccm_jun['at']+(ccm_jun['lagat']))/2), ccm_jun['cfroa']) ccm_jun['xrdint']=ccm_jun['xrd']/((ccm_jun['at']+(ccm_jun['lagat']))/2) ccm_jun['capxint']=ccm_jun['capx']/((ccm_jun['at']+(ccm_jun['lagat']))/2) ccm_jun['xadint']=ccm_jun['xad']/((ccm_jun['at']+(ccm_jun['lagat']))/2) # /*HXZ*/ # adm=xad/mve6b; # gad=(xad-lag(xad))/lag(xad); # rdm=xrd/mve6b; # rds=xrd/sale; # ol=(cogs+xsga)/at; # rc_1=xrd+0.8*lag(xrd)+0.6*lag2(xrd)+0.4*lag3(xrd)+0.2*lag4(xrd); #New lag terms for this section ccm_jun['lag2xrd']=ccm_jun.groupby(['permno'])['lagxrd'].shift(1) ccm_jun['lag3xrd']=ccm_jun.groupby(['permno'])['lag2xrd'].shift(1) ccm_jun['lag4xrd']=ccm_jun.groupby(['permno'])['lag3xrd'].shift(1) # Here I follow previous naming of mve6b as dec_me ccm_jun['adm']=ccm_jun['xad']/ccm_jun['dec_me'] ccm_jun['gad']=(ccm_jun['xad']-(ccm_jun['lagxad']))/(ccm_jun['lagxad']) ccm_jun['rdm']=ccm_jun['xrd']/ccm_jun['dec_me'] ccm_jun['rds']=ccm_jun['xrd']/ccm_jun['sale'] ccm_jun['ol']=(ccm_jun['cogs']+ccm_jun['xsga'])/ccm_jun['at'] ccm_jun['rc_1']=ccm_jun['xrd']+0.8*(ccm_jun['lagxrd'])+0.6*(ccm_jun['lag2xrd'])+0.4*(ccm_jun['lag3xrd'])+0.2*(ccm_jun['lag4xrd']) ccm_jun['rca'] = ccm_jun['rc_1']/ccm_jun['at'] ccm_jun['eps_1'] = ccm_jun['ajex']/ccm_jun['prcc_f'] ccm_jun['x_1']=ccm_jun['ppent']+ccm_jun['intan']+ccm_jun['ao']-ccm_jun['lo']+ccm_jun['dp'] ccm_jun['etr'] = (ccm_jun['x_1'] - (lag(ccm_jun, 'x_1') + lag(ccm_jun,'x_1',2)+lag(ccm_jun,'x_1',3))/3) * (ccm_jun['eps_1'] - lag(ccm_jun, 'eps_1')) ccm_jun['x_2'] = ccm_jun['sale']/ccm_jun['emp'] ccm_jun['lfe'] = (ccm_jun['x_2'] - lag(ccm_jun, 'x_2'))/lag(ccm_jun, 'x_2') ccm_jun['kz'] = -1.002*(ccm_jun['ib'] + ccm_jun['dp'])/lag(ccm_jun,'ppent') + 0.283*(ccm_jun['at']+ccm_jun['mve6b']-ccm_jun['ceq']-ccm_jun['txdb'])/ccm_jun['at'] + \ 3.139*(ccm_jun['dlc'] + ccm_jun['dltt'])/(ccm_jun['dlc'] + ccm_jun['dltt'] + ccm_jun['seq']) - 39.368*(ccm_jun['dvc']+ccm_jun['dvp'])/lag(ccm_jun,'ppent') -\ 1.315*(ccm_jun['che'])/lag(ccm_jun,'ppent') ccm_jun['cdd']=ccm_jun['dcvt']/(ccm_jun['dlc']+ccm_jun['dltt']) ccm_jun['roaq_a']=ccm_jun['ib']/(ccm_jun['lagat']) # must reindex in order to set back the value ccm_jun['roavol_1']=ccm_jun.groupby(['permno'])['roaq_a'].rolling(10).std(skipna=True).reset_index()['roaq_a'] ccm_jun['cs_1']=(ccm_jun['ib']-(ccm_jun['act']-(ccm_jun['lagact'])-(ccm_jun['lct']-(ccm_jun['laglct']))-(ccm_jun['che']-(ccm_jun['lagche']))+ccm_jun['dlc']-(ccm_jun['lagdlc'])))/(ccm_jun['lagat']) ccm_jun['roavol_2']=ccm_jun.groupby(['permno'])['cs_1'].rolling(10).std().reset_index()['cs_1'] ccm_jun['roavol_a']=ccm_jun['roavol_1']/ccm_jun['roavol_2'] ccm_jun['gdwl']=ccm_jun['gdwl'].fillna(0) ccm_jun['intan']=ccm_jun['intan'].fillna(0) ccm_jun['ala']=ccm_jun['che']+0.75*(ccm_jun['act']-ccm_jun['che'])-0.5*(ccm_jun['at']-ccm_jun['act']-ccm_jun['gdwl']-ccm_jun['intan']) ccm_jun['alm']=ccm_jun['ala']/(ccm_jun['at']+ccm_jun['prcc_f']*ccm_jun['csho']-ccm_jun['ceq']) ccm_jun['ob_a']=ccm_jun['ob']/(0.5*ccm_jun['at']+0.5*(ccm_jun['lagat'])) ccm_jun['x_3']=ccm_jun['capx']/ccm_jun['sale'] ccm_jun['lagx_3']=ccm_jun.groupby(['permno'])['x_3'].shift(1) ccm_jun['lag2x_3']=ccm_jun.groupby(['permno'])['lagx_3'].shift(1) ccm_jun['lag3x_3']=ccm_jun.groupby(['permno'])['lag2x_3'].shift(1) ccm_jun['cinvest_a']=ccm_jun['x_3']/(((ccm_jun['lagx_3'])+(ccm_jun['lag2x_3'])+(ccm_jun['lag3x_3']))/3)-1 ccm_jun['dpia'] = (ccm_jun['ppegt']-lag(ccm_jun,'ppegt') + ccm_jun['invt'] - lag(ccm_jun, 'invt'))/lag(ccm_jun, 'at') ccm_jun['dlc']=ccm_jun['dlc'].fillna(0) ccm_jun['dltt']=ccm_jun['dltt'].fillna(0) ccm_jun['mib']=ccm_jun['mib'].fillna(0) ccm_jun['pstk']=ccm_jun['pstk'].fillna(0) ccm_jun['ceq']=ccm_jun['ceq'].fillna(0) ccm_jun['noa']=((ccm_jun['at']-ccm_jun['che'])-(ccm_jun['at']-ccm_jun['dlc']-ccm_jun['dltt']-ccm_jun['mib']-ccm_jun['pstk']-ccm_jun['ceq']))/(ccm_jun['lagat']) ccm_jun['lagnoa']=ccm_jun.groupby(['permno'])['noa'].shift(1) ccm_jun['dnoa']=ccm_jun['noa']-(ccm_jun['lagnoa']) ccm_jun['lag3capx']=ccm_jun.groupby(['permno'])['capx'].shift(3) ccm_jun['pchcapx3']=ccm_jun['capx']/(ccm_jun['lag3capx'])-1 ccm_jun['x_4']=ccm_jun['dlc']+ccm_jun['dltt'] ccm_jun['lag5x_4']=ccm_jun.groupby(['permno'])['capx'].shift(5) ccm_jun['cdi']=np.log(ccm_jun['x_4']/(ccm_jun['lag5x_4'])) ccm_jun['ivg']=ccm_jun['invt']/(ccm_jun['laginvt'])-1 ccm_jun['dcoa']=(ccm_jun['act']-(ccm_jun['lagact'])-(ccm_jun['che']-(ccm_jun['lagche'])))/(ccm_jun['lagat']) ccm_jun['dcol']=(ccm_jun['lct']-(ccm_jun['laglct'])-(ccm_jun['dlc']-(ccm_jun['lagdlc'])))/(ccm_jun['lagat']) ccm_jun['dwc']=(ccm_jun['dcoa']-ccm_jun['dcol'])/(ccm_jun['lagat']) ccm_jun['lagivao']=ccm_jun.groupby(['permno'])['ivao'].shift(1) ccm_jun['dnca']=(ccm_jun['at']-ccm_jun['act']-ccm_jun['ivao']-((ccm_jun['lagat'])-(ccm_jun['lagact'])-(ccm_jun['lagivao'])))/(ccm_jun['lagat']) ccm_jun['dncl']=(ccm_jun['lt']-ccm_jun['lct']-ccm_jun['dltt']-((ccm_jun['laglt'])-(ccm_jun['laglct'])-(ccm_jun['lagdltt'])))/(ccm_jun['lagat']) ccm_jun['dnco']=(ccm_jun['dnca']-ccm_jun['dncl'])/(ccm_jun['lagat']) ccm_jun['lagivst']=ccm_jun.groupby(['permno'])['ivst'].shift(1) ccm_jun['lagpstk']=ccm_jun.groupby(['permno'])['pstk'].shift(1) ccm_jun['dfin']=(ccm_jun['ivst']+ccm_jun['ivao']-ccm_jun['dltt']-ccm_jun['dlc']-ccm_jun['pstk']-((ccm_jun['lagivst'])+(ccm_jun['lagivao'])-(ccm_jun['lagdltt'])-(ccm_jun['lagdlc'])-(ccm_jun['lagpstk'])))/(ccm_jun['lagat']) ccm_jun['ta']=(ccm_jun['dwc']+ccm_jun['dnco']+ccm_jun['dfin'])/(ccm_jun['lagat']) ccm_jun['dsti']=(ccm_jun['ivst']-(ccm_jun['lagivst']))/(ccm_jun['lagat']) ccm_jun['dfnl']=(ccm_jun['dltt']+ccm_jun['dlc']+ccm_jun['pstk']-((ccm_jun['lagdltt'])+(ccm_jun['lagdlc'])+(ccm_jun['lagpstk'])))/(ccm_jun['lagat']) ccm_jun['egr_hxz']=(ccm_jun['ceq']-(ccm_jun['lagceq']))/(ccm_jun['lagat']) ccm_jun['lagpstkrv']=ccm_jun.groupby(['permno'])['pstkrv'].shift(1) ccm_jun['txp']=ccm_jun['txp'].fillna(0) ccm_jun['poa']=(ccm_jun['act']-(ccm_jun['lagact'])-(ccm_jun['che']-(ccm_jun['lagche']))-(ccm_jun['lct']-(ccm_jun['laglct'])-(ccm_jun['dlc']-(ccm_jun['lagdlc']))-(ccm_jun['txp']-(ccm_jun['lagtxp'])))-ccm_jun['dp'])/(ccm_jun['ni'].abs()) ccm_jun['nef']=(ccm_jun['sstk']-ccm_jun['prstkc']-ccm_jun['dv'])/((ccm_jun['at']+(ccm_jun['lagat']))/2) ccm_jun['dlcch']=ccm_jun['dlcch'].fillna(0) ccm_jun['ndf']=(ccm_jun['dltis']-ccm_jun['dltr']+ccm_jun['dlcch'])/((ccm_jun['at']+(ccm_jun['lagat']))/2) ccm_jun['nxf'] = ccm_jun['ndf'] + ccm_jun['nef'] ccm_jun['atm']=ccm_jun['at']/ccm_jun['dec_me'] ccm_jun['cp']=(ccm_jun['ib']+ccm_jun['dp'])/ccm_jun['dec_me'] ccm_jun['op']=(ccm_jun['dvc']+ccm_jun['prstkc']-(ccm_jun['pstkrv']-(ccm_jun['lagpstkrv'])))/ccm_jun['dec_me'] ccm_jun['nop']=(ccm_jun['dvc']+ccm_jun['prstkc']-(ccm_jun['pstkrv']-(ccm_jun['lagpstkrv']))-ccm_jun['sstk']+ccm_jun['pstkrv']-(ccm_jun['lagpstkrv']))/ccm_jun['dec_me'] ccm_jun['em'] = (ccm_jun['mve6b'] + ccm_jun['dlc'] + ccm_jun['dltt'] + ccm_jun['pstkrv'] - ccm_jun['che'])/ccm_jun['oibdp'] ccm_jun['dvpa']=ccm_jun['dvpa'].fillna(0) ccm_jun['tstkp']=ccm_jun['tstkp'].fillna(0) ccm_jun['ndp']=ccm_jun['dltt']+ccm_jun['dlc']+ccm_jun['pstk']+ccm_jun['dvpa']-ccm_jun['tstkp']-ccm_jun['che'] ccm_jun['ebp']=(ccm_jun['ndp']+ccm_jun['ceq']+ccm_jun['tstkp']-ccm_jun['dvpa'])/(ccm_jun['ndp']+ccm_jun['dec_me']) ccm_jun['rna']=ccm_jun['oiadp']/(ccm_jun['lagnoa']) ccm_jun['pm']=ccm_jun['rna']/ccm_jun['sale'] ccm_jun['ato']=ccm_jun['sale']/(ccm_jun['lagnoa']) ccm_jun['cto']=ccm_jun['sale']/(ccm_jun['lagat']) ccm_jun['gpa']=(ccm_jun['revt']-ccm_jun['cogs'])/ccm_jun['at'] ccm_jun['rmw']=(ccm_jun['revt']-ccm_jun['cogs']-ccm_jun['xsga']-ccm_jun['xint'])/(ccm_jun['ceq']+ccm_jun['pstk']) ccm_jun['ole']=(ccm_jun['revt']-ccm_jun['cogs']-ccm_jun['xsga']-ccm_jun['xint'])/((ccm_jun['lagceq'])+(ccm_jun['lagpstk'])) ccm_jun['opa']=(ccm_jun['revt']-ccm_jun['cogs']-ccm_jun['xsga']+ccm_jun['xrd'])/ccm_jun['at'] ccm_jun['ola']=(ccm_jun['revt']-ccm_jun['cogs']-ccm_jun['xsga']+ccm_jun['xrd'])/(ccm_jun['lagat']) ccm_jun['lagxpp']=ccm_jun.groupby(['permno'])['xpp'].shift(1) ccm_jun['lagdrc']=ccm_jun.groupby(['permno'])['drc'].shift(1) ccm_jun['lagdrlt']=ccm_jun.groupby(['permno'])['drlt'].shift(1) ccm_jun['lagxacc']=ccm_jun.groupby(['permno'])['xacc'].shift(1) ccm_jun['cop']=(ccm_jun['revt']-ccm_jun['cogs']-ccm_jun['xsga']+ccm_jun['xrd']-(ccm_jun['rect']-(ccm_jun['lagrect']))-(ccm_jun['invt']-(ccm_jun['laginvt']))-\ (ccm_jun['xpp']-(ccm_jun['lagxpp']))+ccm_jun['drc']-(ccm_jun['lagdrc'])+ccm_jun['drlt']-(ccm_jun['lagdrlt'])+ccm_jun['ap']-(ccm_jun['lagap'])+ccm_jun['xacc']-(ccm_jun['lagxacc']))/ccm_jun['at'] ccm_jun['cla']=(ccm_jun['revt']-ccm_jun['cogs']-ccm_jun['xsga']+ccm_jun['xrd']-(ccm_jun['rect']-(ccm_jun['lagrect']))-(ccm_jun['invt']-(ccm_jun['laginvt']))-\ (ccm_jun['xpp']-(ccm_jun['lagxpp']))+ccm_jun['drc']-(ccm_jun['lagdrc'])+ccm_jun['drlt']-(ccm_jun['lagdrlt'])+ccm_jun['ap']-(ccm_jun['lagap'])+ccm_jun['xacc']-(ccm_jun['lagxacc']))/ccm_jun['lagat'] ccm_jun['i_1']=np.where(ccm_jun['lt']>ccm_jun['at'],1,0) ccm_jun['i_2']=np.where((ccm_jun['ni']<0) & (ccm_jun['lagni']<0),1,0) ccm_jun['os']=-1.32-0.407*np.log(ccm_jun['at'])+6.03*(ccm_jun['dlc']+ccm_jun['dltt'])/ccm_jun['at']-1.43*(ccm_jun['act']-ccm_jun['lct'])/ccm_jun['at']+0.076*(ccm_jun['lct']/ccm_jun['act'])-1.72*ccm_jun['i_1']-2.37*ccm_jun['ni']/ccm_jun['at']-1.83*(ccm_jun['pi']+ccm_jun['dp'])/ccm_jun['lt']+0.285*ccm_jun['i_2']-0.521*(ccm_jun['ni']+(ccm_jun['lagni']))/((ccm_jun['ni'].abs())+ccm_jun['lagni'].abs()) ccm_jun['zs']=1.2*(ccm_jun['act']-ccm_jun['lct'])/ccm_jun['at']+1.4*ccm_jun['re']/ccm_jun['at']+3.3*ccm_jun['oiadp']/ccm_jun['at']+0.6*ccm_jun['dec_me']/ccm_jun['lt']+ccm_jun['sale']/ccm_jun['at'] ccm_jun['bi']=np.where(ccm_jun['be']>0,ccm_jun['at']/ccm_jun['be'],np.NaN) ccm_jun['pchsale_pchinvt_hxz']=(ccm_jun['sale']-(ccm_jun['lagsale']))/(0.5*ccm_jun['sale']+0.5*(ccm_jun['lagsale']))-(ccm_jun['invt']-(ccm_jun['laginvt']))/(0.5*ccm_jun['invt']+0.5*(ccm_jun['laginvt'])) ccm_jun['pchsale_pchrect_hxz']=(ccm_jun['sale']-(ccm_jun['lagsale']))/(0.5*ccm_jun['sale']+0.5*(ccm_jun['lagsale']))-(ccm_jun['rect']-(ccm_jun['lagrect']))/(0.5*ccm_jun['rect']+0.5*(ccm_jun['lagrect'])) ccm_jun['gm_1']=ccm_jun['sale']-ccm_jun['cogs'] ccm_jun['laggm_1']=ccm_jun.groupby(['permno'])['gm_1'].shift(1) ccm_jun['pchgm_pchsale_hxz']=(ccm_jun['gm_1']-(ccm_jun['laggm_1']))/(0.5*(ccm_jun['gm_1']+(ccm_jun['laggm_1'])))-(ccm_jun['sale']-(ccm_jun['lagsale']))/(0.5*ccm_jun['sale']+0.5*(ccm_jun['lagsale'])) ccm_jun['pchsale_pchxsga_hxz']=(ccm_jun['sale']-(ccm_jun['lagsale']))/(0.5*ccm_jun['sale']+0.5*(ccm_jun['lagsale']))-(ccm_jun['xsga']-(ccm_jun['lagxsga']))/(0.5*ccm_jun['xsga']+0.5*(ccm_jun['lagxsga'])) ccm_jun['realestate_hxz']=(ccm_jun['fatb']+ccm_jun['fatl'])/ccm_jun['ppegt'] ccm_jun['secured_hxz']=ccm_jun['dm']/(ccm_jun['dltt']+ccm_jun['dlc']) ccm_jun['agr_hxz']=ccm_jun['at']/(ccm_jun['lagat'])-1 ccm_jun['lagx_1']=ccm_jun.groupby(['permno'])['x_1'].shift(1) ccm_jun['grltnoa_hxz']=(ccm_jun['x_1']-(ccm_jun['lagx_1']))/((ccm_jun['at']+(ccm_jun['lagat']))/2) ccm_jun['lagajex']=ccm_jun.groupby(['permno'])['ajex'].shift(1) ccm_jun['chcsho_hxz']=np.log(ccm_jun['csho']*ccm_jun['ajex'])-np.log((ccm_jun['lagcsho'])*(ccm_jun['lagajex'])) ccm_jun['lagcapxv']=ccm_jun.groupby(['permno'])['capxv'].shift(1) ccm_jun['lag2capxv']=ccm_jun.groupby(['permno'])['capxv'].shift(2) ccm_jun['pchcapx_hxz']=(ccm_jun['capxv']-0.5*(ccm_jun['lagcapxv'])-0.5*(ccm_jun['lag2capxv']))/(0.5*(ccm_jun['lagcapxv'])+0.5*(ccm_jun['lag2capxv'])) ccm_jun['txp']=ccm_jun['txp'].fillna(0) ccm_jun['acc_hxz']=(ccm_jun['act']-(ccm_jun['lagact'])-(ccm_jun['che']-(ccm_jun['lagche']))-(ccm_jun['lct']-(ccm_jun['laglct'])-(ccm_jun['dlc']-(ccm_jun['lagdlc']))-(ccm_jun['txp']-(ccm_jun['lagtxp'])))-ccm_jun['dp'])/(ccm_jun['lagat']) ccm_jun['pctacc_hxz']=(ccm_jun['dwc']+ccm_jun['dnco']+ccm_jun['dfin'])/(ccm_jun['ni'].abs()) ccm_jun['lev_hxz']=(ccm_jun['dlc']+ccm_jun['dltt'])/ccm_jun['dec_me'] ccm_jun['ep_hxz']=ccm_jun['ib']/ccm_jun['dec_me'] ccm_jun['lagwcap']=ccm_jun.groupby(['permno'])['wcap'].shift(1) ccm_jun['cfp_hxz']=(ccm_jun['fopt']-(ccm_jun['wcap']-(ccm_jun['lagwcap'])))/ccm_jun['dec_me'] ccm_jun['cfp_hxz']=np.where(ccm_jun['oancf'].notna(),(ccm_jun['fopt']-ccm_jun['oancf'])/ccm_jun['dec_me'],ccm_jun['cfp_hxz']) ccm_jun['tb_hxz']=ccm_jun['pi']/ccm_jun['ni'] ccm_jun = ccm_jun.sort_values(['sic2', 'fyear']) a=ccm_jun.groupby(['sic2','fyear'])['chpm'].mean() a=a.rename('meanchpm') ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear']) ccm_jun['chpmia']=ccm_jun['chpm']-ccm_jun['meanchpm'] a=ccm_jun.groupby(['sic2','fyear'])['chato'].mean() a=a.rename('meanchato') ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear']) ccm_jun['chatoia']=ccm_jun['chato']-ccm_jun['meanchato'] a=ccm_jun.groupby(['sic2','fyear'])['sale'].sum() a=a.rename('indsale') ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear']) a=ccm_jun.groupby(['sic2','fyear'])['hire'].mean() a=a.rename('meanhire') ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear']) ccm_jun['chempia']=ccm_jun['hire']-ccm_jun['meanhire'] ccm_jun['beme']=ccm_jun['beme'].astype(float) a=ccm_jun.groupby(['sic2','fyear'])['beme'].mean() a=a.rename('meanbeme') ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear']) ccm_jun['beme_ia']=ccm_jun['beme']-ccm_jun['meanbeme'] ccm_jun['bm_ia'] = ccm_jun['beme_ia'] a=ccm_jun.groupby(['sic2','fyear'])['pchcapx'].mean() a=a.rename('meanpchcapx') ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear']) ccm_jun['pchcapx_ia']=ccm_jun['pchcapx']-ccm_jun['meanpchcapx'] a=ccm_jun.groupby(['sic2','fyear'])['tb_1'].mean() a=a.rename('meantb_1') ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear']) ccm_jun['tb']=ccm_jun['tb_1']-ccm_jun['meantb_1'] a=ccm_jun.groupby(['sic2','fyear'])['cfp'].mean() a=a.rename('meancfp') ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear']) ccm_jun['cfp_ia']=ccm_jun['cfp']-ccm_jun['meancfp'] a=ccm_jun.groupby(['sic2','fyear'])['mve_f'].mean() a=a.rename('meanmve_f') ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear']) ccm_jun['mve_ia']=ccm_jun['mve_f']-ccm_jun['meanmve_f'] a=ccm_jun.groupby(['sic2','fyear'])['at'].sum() a=a.rename('indat') ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear']) a=ccm_jun.groupby(['sic2','fyear'])['be'].sum() a=a.rename('indbe') ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear']) a=ccm_jun.groupby(['sic2','fyear'])['pchcapx_hxz'].mean() a=a.rename('meanpchcapx_hxz') ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear']) ccm_jun['pchcapx_ia_hxz']=ccm_jun['pchcapx_hxz']-ccm_jun['meanpchcapx_hxz'] ccm_jun['herfraw']=(ccm_jun['sale']/ccm_jun['indsale'])*(ccm_jun['sale']/ccm_jun['indsale']) a=ccm_jun.groupby(['sic2','fyear'])['herfraw'].sum() a=a.rename('herf') ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear']) ccm_jun['haraw']=(ccm_jun['at']/ccm_jun['indat'])*(ccm_jun['at']/ccm_jun['indat']) a=ccm_jun.groupby(['sic2','fyear'])['haraw'].sum() a=a.rename('ha') ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear']) ccm_jun['heraw']=(ccm_jun['be']/ccm_jun['indbe'])*(ccm_jun['be']/ccm_jun['indbe']) a=ccm_jun.groupby(['sic2','fyear'])['heraw'].sum() a=a.rename('he') ccm_jun=pd.merge(ccm_jun, a, how='left', on=['sic2','fyear']) ccm_jun = ccm_jun.sort_values(['sic2', 'fyear']) indmd=pd.DataFrame() indmd = ccm_jun.groupby(['sic2', 'fyear'], as_index=False)['roa'].median() indmd = indmd.rename(columns={'roa': 'md_roa'}) indmd['md_cfroa'] = ccm_jun.groupby(['sic2', 'fyear'], as_index=False)['cfroa'].median()['cfroa'] indmd['md_xrdint'] = ccm_jun.groupby(['sic2', 'fyear'], as_index=False)['xrdint'].median()['xrdint'] indmd['md_capxint'] = ccm_jun.groupby(['sic2', 'fyear'], as_index=False)['capxint'].median()['capxint'] indmd['md_xadint'] = ccm_jun.groupby(['sic2', 'fyear'], as_index=False)['xadint'].median()['xadint'] ccm_jun=pd.merge(ccm_jun, indmd, how='left', on=['sic2','fyear']) ccm_jun = ccm_jun.sort_values(['gvkey', 'datadate']) ccm_jun['m1']=np.where(ccm_jun['roa']>ccm_jun['md_roa'], 1, 0) ccm_jun['m2']=np.where(ccm_jun['cfroa']>ccm_jun['md_cfroa'], 1, 0) ccm_jun['m3']=np.where(ccm_jun['oancf']>ccm_jun['ni'], 1, 0) ccm_jun['m4']=np.where(ccm_jun['xrdint']>ccm_jun['md_xrdint'], 1, 0) ccm_jun['m5']=np.where(ccm_jun['capxint']>ccm_jun['md_capxint'], 1, 0) ccm_jun['m6']=np.where(ccm_jun['xadint']>ccm_jun['md_xadint'], 1, 0) compr=conn.raw_sql(""" select splticrm, gvkey, datadate from comp.adsprate """) compr.datadate=pd.to_datetime(compr.datadate) ccm_jun['year']=ccm_jun['datadate'].dt.year compr['year']=compr['datadate'].dt.year ccm_jun=pd.merge(ccm_jun, compr[['splticrm','gvkey','year']], how='left', on=['gvkey','year']) ccm_jun = ccm_jun.sort_values(['gvkey', 'datadate']) cpi=pd.DataFrame() cpi['fyear'] = list(reversed(range(1924, 2017+1))) cpi['cpi'] = [246.19,242.23,236.53,229.91,229.17,229.594, 224.939,218.056,214.537,215.303,207.342,201.6,195.3,188.9, 183.96,179.88,177.1,172.2,166.6,163,160.5,156.9, 152.4,148.2,144.5,140.3,136.2,130.7,124,118.3, 113.6,109.6,107.6,103.9,99.6,96.5,90.9,82.4, 72.6,65.2,60.6,56.9,53.8,49.3,44.2,41.8, 40.6,38.9,36.7,34.8,33.4,32.5,31.6,31, 30.7,30.2,29.9,29.6,29.2,28.9,28.2,27.3, 26.8,26.9,26.8,26.7,25.9,24,23.7,24.4, 22.2,19.7,18.1,17.6,17.3,16.3,14.7,14,13.8,14.1, 14.4,13.9,13.6,13.3,13.1,13.6,15.1,16.6,17.2,17.1,17.2,17.5,17.7,17] ccm_jun = pd.merge(ccm_jun, cpi, how='left', on='fyear') ccm_jun = ccm_jun.sort_values(['gvkey', 'datadate']) ccm_jun = ccm_jun.drop_duplicates(['gvkey','datadate']) credit_mapping = {'D': 1, 'C': 2, 'CC': 3, 'CCC-':4, 'CCC':5, 'CCC+':6, 'B-': 7, 'B': 8, 'B+':9, 'BB-':10, 'BB':11, 'BB+': 12, 'BBB-':13, 'BBB':14, 'BBB+': 15, 'A-':16, 'A':17, 'A+':18, 'AA-':19,'AA':20,'AA+':21,'AAA':22} ccm_jun['credrat'] = ccm_jun['splticrm'].map(credit_mapping) ccm_jun['credrat'] = ccm_jun['credrat'] ccm_jun['credrat'] = 0 ccm_jun.loc[ccm_jun['credrat'] < lag(ccm_jun, 'credrat'), 'credrat_dwn'] = 1 ccm_jun.loc[ccm_jun['credrat'] >= lag(ccm_jun, 'credrat'), 'credrat_dwn'] = 0 ccm_jun = ccm_jun.sort_values(['gvkey','datadate']) ccm_jun['avgat'] = (ccm_jun['at']+ccm_jun['lagat'])/2 ccm_jun.loc[ccm_jun['count']==0, 'orgcap_1'] = (ccm_jun['xsga']/ccm_jun['cpi'])/(.1+.15) orgcap_1 = ccm_jun[['orgcap_1','xsga','cpi']] prev_row = None for i, row in orgcap_1.iterrows(): if (np.isnan(row['orgcap_1'])) and prev_row is not None: row['orgcap_1'] = prev_row['orgcap_1']*(1-0.15)+row['xsga']/row['cpi'] prev_row = row ccm_jun['orgcap_1'] = orgcap_1['orgcap_1'] ccm_jun['orgcap'] = ccm_jun['orgcap_1']/ccm_jun['avgat'] ccm_jun.loc[ccm_jun['count'] == 0, 'orgcap'] = np.nan ccm_jun.loc[ccm_jun['count']==0, 'oc_1'] = ccm_jun['xsga']/(.1+.15) oc_1 = ccm_jun[['oc_1','xsga','cpi']] prev_row = None for i, row in oc_1.iterrows(): if(np.isnan(row['oc_1'])) and prev_row is not None: row['oc_1'] = (1-0.15)*prev_row['oc_1'] + row['xsga']/row['cpi'] prev_row = row ccm_jun['oc_1'] = oc_1['oc_1'] ccm_jun['oca'] = ccm_jun['oc_1']/ccm_jun['at'] mean_orgcap = ccm_jun.rename(columns={'orgcap':'orgcap_mean'}).groupby(['sic2','fyear'])['orgcap_mean'].mean() std_orgcap = ccm_jun.rename(columns={'orgcap':'orgcap_std'}).groupby(['sic2','fyear'])['orgcap_std'].std() ccm_jun = pd.merge(ccm_jun, mean_orgcap, on=['sic2','fyear'], how='left') ccm_jun = pd.merge(ccm_jun, std_orgcap, on=['sic2','fyear'], how='left') ccm_jun['orgcap_ia'] = (ccm_jun['orgcap']-ccm_jun['orgcap_mean'])/ccm_jun['orgcap_std'] mean_oca = ccm_jun.rename(columns={'oca':'oca_mean'}).groupby(['sic2','fyear'])['oca_mean'].mean() std_oca = ccm_jun.rename(columns={'oca':'oca_std'}).groupby(['sic2','fyear'])['oca_std'].std() ccm_jun = pd.merge(ccm_jun, mean_oca, on=['sic2','fyear'], how='left') ccm_jun = pd.merge(ccm_jun, std_oca, on=['sic2','fyear'], how='left') ccm_jun['oca_ia'] = (ccm_jun['oca']-ccm_jun['oca_mean'])/ccm_jun['oca_std'] ccm_jun.loc[ccm_jun['dvpsx_f']>0, 'dvpsx_1'] = 1 ccm_jun.loc[ccm_jun['dvpsx_f']<=0, 'dvpsx_1'] = 0 ccm_jun['ww'] = -0.091*(ccm_jun['ib']+ccm_jun['dp'])/ccm_jun['at'] - 0.062*ccm_jun['dvpsx_1'] + 0.021*ccm_jun['dltt']/ccm_jun['at'] -0.044*np.log(ccm_jun['at']) + 0.102*(ccm_jun['indsale']/lag(ccm_jun,'indsale')-1)-0.035*(ccm_jun['sale']/lag(ccm_jun,'sale')-1) return ccm_jun
def build_compq6(permnos, ccm_jun): gvkeys = permnos_to_gvkeys(permnos) def lag(df, col, n=1, on='gvkey'): return df.groupby(on)[col].shift(n) lnk = conn.raw_sql(f""" select * from crsp.ccmxpf_linktable where lpermno in {permnos} """) lnk = lnk[lnk['linktype'].isin( ['LU', 'LC', 'LD', 'LF', 'LN', 'LO', 'LS', 'LX'])] lnk = lnk[(2018 >= lnk['linkdt'].astype(str).str[0:4].astype(int)) | (lnk['linkdt'] == '.B')] lnk = lnk[(lnk['linkenddt'].isna()) | ("1940" <= lnk['linkenddt'].astype(str).str[0:4])] lnk = lnk.sort_values(['gvkey', 'linkdt']) lnk['linkdt'] = pd.to_datetime(lnk['linkdt']) lnk['linkenddt'] = pd.to_datetime(lnk['linkenddt']) ccm_jun2 = pd.merge(lnk[['gvkey', 'linkdt', 'linkenddt', 'lpermno']], ccm_jun, on='gvkey', how='inner') ccm_jun2['datadate'] = pd.to_datetime(ccm_jun2['datadate']) ccm_jun2 = ccm_jun2[(ccm_jun2['linkdt'] <= ccm_jun2['datadate']) | (ccm_jun2['linkdt'] == '.B')] ccm_jun2 = ccm_jun2[(ccm_jun2['datadate'] <= ccm_jun2['linkenddt']) | (ccm_jun2['linkenddt'].isna())] ccm_jun2 = ccm_jun2[(ccm_jun2['lpermno'] != '.') & ccm_jun2['gvkey'].notna()] temp = ccm_jun2[[ 'gvkey', 'permno', 'datadate', 'fyear', 'sic2', 'cfp', 'ep', 'cashpr', 'dy', 'lev', 'sp', 'roic', 'rd_sale', 'chadv', 'agr', 'invest', 'gma', 'chcsho', 'lgr', 'egr', 'chpm', 'chato', 'chinv', 'hire', 'cf', 'acc', 'pctacc', 'absacc', 'spii', 'spi', 'sgr', 'pchsale_pchinvt', 'pchsale_pchrect', 'pchgm_pchsale', 'pchsale_pchxsga', 'pchcapx', 'ps', 'divi', 'divo', 'obklg', 'chobklg', 'securedind', 'secured', 'convind', 'conv', 'grltnoa', 'chdrc', 'rd', 'rdbias', 'chpmia', 'chatoia', 'chempia', 'pchcapx_ia', 'tb', 'cfp_ia', 'mve_ia', 'herf', 'credrat', 'credrat_dwn', 'orgcap', 'grcapx', 'depr', 'pchdepr', 'grGW', 'tang', 'woGW', 'sin', 'currat', 'pchcurrat', 'quick', 'pchquick', 'orgcap_ia', 'adm', 'gad', 'rdm', 'rds', 'ol', 'ww', 'cdd', 'roavol_a', 'ala', 'alm', 'ob_a', 'cinvest_a', 'noa', 'dnoa', 'pchcapx3', 'cdi', 'ivg', 'dcoa', 'dcol', 'dwc', 'dnca', 'dncl', 'dnco', 'dfin', 'ta', 'dsti', 'dfnl', 'poa', 'nef', 'ndf', 'atm', 'cp', 'op', 'nop', 'ndp', 'ebp', 'rna', 'pm', 'ato', 'cto', 'gpa', 'rmw', 'ole', 'opa', 'ola', 'cop', 'cla', 'os', 'zs', 'bi', 'oca', 'oca_ia', 'ha', 'he', 'pchsale_pchinvt_hxz', 'pchsale_pchrect_hxz', 'pchgm_pchsale_hxz', 'pchsale_pchxsga_hxz', 'realestate_hxz', 'secured_hxz', 'agr_hxz', 'grltnoa_hxz', 'chcsho_hxz', 'pchcapx_ia_hxz', 'acc_hxz', 'egr_hxz', 'pctacc_hxz', 'lev_hxz', 'ep_hxz', 'cfp_hxz', 'tb_hxz', 'salecash', 'salerec', 'pchsaleinv', 'cashdebt', 'realestate', 'roe', 'operprof', 'mve_f', 'm1', 'm2', 'm3', 'm4', 'm5', 'm6' ]] crsp_msf = conn.raw_sql(f""" select ret, retx, prc, shrout, vol, date, permno from crsp.msf where permno in {permnos} """) crsp_msf = crsp_msf[crsp_msf['permno'].isin(temp['permno'])] crsp_msf['date'] = pd.to_datetime(crsp_msf['date']) crsp_msf = crsp_msf.sort_values('date') z = temp[['datadate', 'permno']] z['date_l'] = temp['datadate'] + pd.TimedeltaIndex( [7] * len(z), 'M') + pd.TimedeltaIndex([-5] * len(z), 'd') z['date_u'] = temp['datadate'] + pd.TimedeltaIndex([20] * len(z), 'M') z = pd.merge(z, crsp_msf, on='permno', how='left') z['date'] = pd.to_datetime(z['date']) z = z[(z['date'] >= z['date_l']) & (z['date'] < z['date_u'])] temp2 = pd.merge(z, temp, on=['permno', 'datadate'], how='left') crsp_mseall = conn.raw_sql(f""" select date, permno, exchcd, shrcd, siccd from crsp.mseall where permno in {permnos} and exchcd in (1, 2, 3) and shrcd in (10, 11) """) crsp_mseall = crsp_mseall.sort_values(['permno', 'exchcd', 'date']) mseall_min = crsp_mseall.groupby([ 'permno', 'exchcd' ])['date'].min().reset_index().rename(columns={'date': 'exchstdt'}) mseall_max = crsp_mseall.groupby([ 'permno', 'exchcd' ])['date'].max().reset_index().rename(columns={'date': 'exchedt'}) crsp_mseall = pd.merge(crsp_mseall, mseall_min, on=['permno', 'exchcd']) crsp_mseall = pd.merge(crsp_mseall, mseall_max, on=['permno', 'exchcd']) crsp_mseall = crsp_mseall.rename(columns={'date': 'time_1'}) crsp_mseall = crsp_mseall.sort_values(['permno', 'exchcd']) crsp_mseall = crsp_mseall.drop_duplicates(['permno', 'exchcd']) crsp_mseall['exchstdt'] = pd.to_datetime(crsp_mseall['exchstdt']) crsp_mseall['exchedt'] = pd.to_datetime(crsp_mseall['exchedt']) temp2 = pd.merge(temp2, crsp_mseall, on='permno', how='left') temp2 = temp2[((temp2['date'] >= temp2['exchstdt']) & (temp2['date'] <= temp2['exchedt']))] crsp_mseall_dl = conn.raw_sql(f""" select dlret, dlstcd, exchcd, date, permno from crsp.mseall where permno in {permnos} """) crsp_mseall_dl['date'] = pd.to_datetime(crsp_mseall_dl['date']) temp2 = pd.merge(temp2, crsp_mseall_dl, on=['date', 'permno']) temp2['exchcd'] = temp2['exchcd_x'] temp2.loc[(temp2['dlret'].isna()) & ((temp2['dlstcd'] == 500) | ((temp2['dlstcd'] >= 520) & (temp2['dlstcd'] <= 584))) & (temp2['exchcd'].isin([1, 2])), 'dlret'] = -0.35 temp2.loc[(temp2['dlret'].isna()) & ((temp2['dlstcd'] == 500) | ((temp2['dlstcd'] >= 520) & (temp2['dlstcd'] <= 584))) & (temp2['exchcd'].isin([3])), 'dlret'] = -0.55 temp2.loc[(temp2['dlret'].notna()) & (temp2['dlret'] < -1), 'dlret'] = -1 temp2.loc[( temp2['dlret'].isna() ), 'dlret'] = 0 #TODO: wtf? this should not be 0... i think this should be not missing... temp2['ret'] = temp2['ret'] + temp2['dlret'] temp2 = temp2.sort_values(['permno', 'date', 'datadate'], ascending=[True, True, False]) temp2 = temp2.drop_duplicates(['permno', 'date']) temp2 = temp2.rename(columns={'datadate': 'time_2'}) temp2['mve0'] = np.abs(temp2['prc']) * temp2['shrout'] temp2['mvel1'] = lag(temp2, 'mve0') temp2['pps'] = lag(temp2, 'prc') comp_qtr = conn.raw_sql(f""" select fyearq, fqtr, apdedateq, datadate, pdateq, fdateq, c.gvkey, f.cusip as cnum, datadate as datadate_q, rdq, sic as sic2, ibq, saleq, txtq, revtq, cogsq, xsgaq, atq, actq, cheq, lctq, dlcq, ppentq, xrdq, rectq, invtq, ppegtq, txdbq, dlttq, dvpsxq, gdwlq, intanq, txditcq, dpq, oibdpq, cshprq, ajexq, oiadpq, ivaoq, mibq, xintq, drcq, drltq, apq, abs(prccq) as prccq, abs(prccq)*cshoq as mveq, ceqq, seqq, pstkq, atq, ltq, pstkrq from comp.names as c, comp.fundq as f where c.gvkey in {gvkeys} and f.gvkey = c.gvkey and f.indfmt='INDL' and f.datafmt='STD' and f.popsrc='D' and f.consol='C' """) comp_qtr.apdedateq = pd.to_datetime(comp_qtr.apdedateq) comp_qtr.datadate = pd.to_datetime(comp_qtr.datadate) comp_qtr.pdateq = pd.to_datetime(comp_qtr.pdateq) comp_qtr.fdateq = pd.to_datetime(comp_qtr.fdateq) comp_qtr = comp_qtr.loc[:, ~comp_qtr.columns.duplicated()] comp_qtr['cshoq'] = comp_qtr['mveq'] / abs(comp_qtr['prccq']) comp_qtr = comp_qtr.sort_values(['gvkey', 'datadate_q']) comp_qtr = comp_qtr.drop_duplicates(['gvkey', 'datadate_q']) def lag(df, col, n=1, on='gvkey'): z = df.groupby(on)[col].shift(n) z = z.reset_index() z = z.sort_values('index') z = z.set_index('index') return z[col] compq3 = comp_qtr compq3.loc[compq3['pstkrq'].notna(), 'pstk'] = compq3['pstkrq'] compq3.loc[compq3['pstkrq'].isna(), 'ptsk'] = compq3['pstkq'] compq3['scal'] = compq3['seqq'] compq3.loc[compq3['seqq'].isna(), 'scal'] = compq3['ceqq'] + compq3['pstk'] compq3.loc[(compq3['seqq'].isna()) & ((compq3['ceqq'].isna()) | (compq3['pstk'].isna())), 'scal'] = compq3['atq'] - compq3['ltq'] compq3['chtx'] = (compq3['txtq'] - lag(compq3, 'txtq', 4)) / lag( compq3, 'atq', 4) compq3['roaq'] = compq3['ibq'] / lag(compq3, 'atq') compq3['roeq'] = compq3['ibq'] / lag(compq3, 'scal') compq3['rsup'] = (compq3['saleq'] - lag(compq3, 'saleq', 4)) / compq3['mveq'] compq3['sacc'] = ( ((compq3['actq'] - lag(compq3, 'actq')) - (compq3['cheq'] - lag(compq3, 'cheq'))) - ((compq3['lctq'] - lag(compq3, 'lctq')) - (compq3['dlcq'] - lag(compq3, 'dlcq')))) / compq3['saleq'] compq3.loc[compq3['saleq'] <= 0, 'sacc'] = ((compq3['actq'] - lag(compq3, 'actq')) - (compq3['cheq'] - lag(compq3, 'cheq'))) def trailing_std(df, col, n=15, on='gvkey'): z = df.groupby(on)[col].rolling(n).std() z = z.reset_index() z = z.sort_values('level_1') z = z.set_index('level_1') return z[col] compq3['stdacc'] = trailing_std(compq3, 'sacc', 16) compq3['sgrvol'] = trailing_std(compq3, 'rsup', 15) compq3['roavol'] = trailing_std(compq3, 'roaq', 15) compq3['scf'] = compq3['ibq'] / compq3['saleq'] - compq3['sacc'] compq3.loc[compq3['saleq'] <= 0, 'scf'] = compq3['ibq'] / 0.01 - compq3['sacc'] compq3['stdcf'] = trailing_std(compq3, 'scf', 16) compq3['cash'] = compq3['cheq'] / compq3['atq'] compq3['cinvest'] = (compq3['ppentq'] - lag( compq3, 'ppentq')) / compq3['saleq'] - (1 / 3) * ( (lag(compq3, 'ppentq') - lag(compq3, 'ppentq', 2)) / lag(compq3, 'saleq')) - (1 / 3) * ( (lag(compq3, 'ppentq', 2) - lag(compq3, 'ppentq', 3)) / lag(compq3, 'saleq', 2)) - (1 / 3) * ( (lag(compq3, 'ppentq', 3) - lag(compq3, 'ppentq', 4)) / lag(compq3, 'saleq', 3)) compq3.loc[compq3['saleq'] <= 0, 'cinvest'] = ( compq3['ppentq'] - lag(compq3, 'ppentq')) / 0.01 - (1 / 3) * ( (lag(compq3, 'ppentq') - lag(compq3, 'ppentq', 2)) / 0.01) - (1 / 3) * ( (lag(compq3, 'ppentq', 2) - lag(compq3, 'ppentq', 3)) / 0.01 ) - (1 / 3) * ( (lag(compq3, 'ppentq', 3) - lag(compq3, 'ppentq', 4)) / 0.01) compq3['che'] = compq3['ibq'] = lag(compq3, 'ibq', 4) #compq3['nincr'] #TODO: nincr compq3['rdmq'] = compq3['xrdq'] / compq3['mveq'] compq3['rdsq'] = compq3['xrdq'] / compq3['saleq'] compq3['olq'] = (compq3['cogsq'] + compq3['xsgaq']) / compq3['atq'] compq3['tanq'] = (compq3['cheq'] + 0.715 * compq3['rectq'] + 0.54 * compq3['invtq'] + 0.535 * compq3['ppegtq']) / compq3['atq'] compq3['kzq'] = -1.002*((compq3['ibq'] + lag(compq3, 'ibq', 1) + lag(compq3, 'ibq', 2) + lag(compq3, 'ibq', 3) + compq3['dpq']) / lag(compq3,'ppentq')) \ + 0.283*(compq3['atq'] + compq3['mveq'] - compq3['ceqq'] - compq3['txdbq'])/compq3['atq'] \ - 3.139 * (compq3['dlcq'] + compq3['dlttq'])/(compq3['dlcq'] + compq3['dlttq'] + compq3['seqq']) \ + 39.368*(compq3['dvpsxq'] * compq3['cshoq'] + lag(compq3, 'dvpsxq')*lag(compq3, 'cshoq') + lag(compq3, 'dvpsxq',2)*lag(compq3, 'cshoq',2) + lag(compq3, 'dvpsxq',3)*lag(compq3, 'cshoq', 3))/lag(compq3, 'ppentq')\ - 1.315*compq3['cheq']/lag(compq3, 'ppentq') compq3.loc[compq3['gdwlq'].isna(), 'gdwlq'] = 0 compq3.loc[compq3['intanq'].isna(), 'intanq'] = 0 compq3['alaq'] = compq3['cheq'] + 0.75 * ( compq3['actq'] - compq3['cheq']) + 0.5 * (compq3['atq'] - compq3['actq'] - compq3['gdwlq'] - compq3['intanq']) compq3['almq'] = compq3['alaq'] / (compq3['atq'] + compq3['mveq'] - compq3['ceqq']) compq3['laq'] = compq3['atq'] / lag(compq3, 'atq') - 1 compq3.loc[compq3['seqq'].isna(), 'seqq'] = compq3['ceqq'] + compq3['pstkq'] - compq3['ltq'] compq3.loc[compq3['txditcq'].notna(), 'bmq'] = (compq3['seqq'] + compq3['txditcq'] - compq3['pstkq']) / compq3['mveq'] compq3['dmq'] = (compq3['dlcq'] + compq3['dlttq']) / compq3['mveq'] compq3['amq'] = compq3['atq'] / compq3['mveq'] compq3['epq'] = compq3['ibq'] / compq3['mveq'] compq3['cpq'] = (compq3['ibq'] + compq3['dpq']) / compq3['mveq'] compq3['emq'] = (compq3['mveq'] + compq3['dlcq'] + compq3['dlttq'] + compq3['pstkq'] - compq3['cheq']) / compq3['oibdpq'] compq3['spq'] = compq3['saleq'] / compq3['mveq'] compq3['ndpq'] = compq3['dlttq'] + compq3['dlcq'] + compq3[ 'pstkq'] - compq3['cheq'] compq3['ebpq'] = (compq3['ndpq'] + compq3['ceqq']) / (compq3['ndpq'] + compq3['mveq']) compq3['x_1'] = compq3['saleq'] / (compq3['cshprq'] * compq3['ajexq']) compq3['rs'] = (compq3['x_1'] - lag(compq3, 'x_1', 4)) / trailing_std( compq3, 'x_1', 6) compq3['droeq'] = compq3['roeq'] - lag(compq3, 'roeq', 4) compq3['droaq'] = compq3['roaq'] - lag(compq3, 'roaq', 4) compq3.loc[compq3['dlcq'].isna(), 'dlcq'] = 0 compq3.loc[compq3['ivaoq'].isna(), 'ivaoq'] = 0 compq3.loc[compq3['mibq'].isna(), 'mibq'] = 0 compq3.loc[compq3['pstkq'].isna(), 'pstkq'] = 0 compq3['noaq'] = (compq3['atq'] - compq3['cheq'] - compq3['ivaoq']) - ( compq3['dlcq'] - compq3['dlttq'] - compq3['mibq'] - compq3['pstkq'] - compq3['ceqq']) / lag(compq3, 'atq') compq3['rnaq'] = compq3['oiadpq'] / lag(compq3, 'noaq') compq3['pmq'] = compq3['oiadpq'] / compq3['saleq'] compq3['atoq'] = compq3['saleq'] / lag(compq3, 'noaq') compq3['ctoq'] = compq3['saleq'] / lag(compq3, 'atq') compq3['glaq'] = (compq3['revtq'] - compq3['cogsq']) / lag(compq3, 'atq') compq3['oleq'] = (compq3['revtq'] - compq3['cogsq'] - compq3['xsgaq'] - compq3['xintq']) / lag(compq3, 'bmq') compq3['olaq'] = (compq3['revtq'] - compq3['cogsq'] - compq3['xsgaq'] + compq3['xrdq']) / lag(compq3, 'atq') compq3['claq'] = ((compq3['revtq'] - compq3['cogsq'] - compq3['xsgaq'] + compq3['xrdq'] - (compq3['rectq']-lag(compq3, 'rectq')) - (compq3['invtq']-lag(compq3, 'invtq')) \ + compq3['drcq'] - lag(compq3, 'drcq') + compq3['drltq'] - lag(compq3, 'drltq') + compq3['apq'] - lag(compq3, 'apq'))) / lag(compq3, 'atq') compq3['blq'] = compq3['atq'] / compq3['bmq'] compq3['sgq'] = compq3['saleq'] / lag(compq3, 'saleq', 4) compq3.loc[compq3['dvpsxq'] > 0, 'dvpsxq_1'] = 1 compq3.loc[compq3['dvpsxq'] <= 0, 'dvpsxq_1'] = 0 temp_indsaleq = compq3.groupby(['sic2', 'fyearq'])['saleq'].sum().reset_index() temp_indsaleq = temp_indsaleq.rename(columns={'saleq': 'indsaleq'}) compq3 = pd.merge(compq3, temp_indsaleq, on=['fyearq', 'sic2']) compq3['wwq'] = -0.091*(compq3['ibq'] + compq3['dpq'])/compq3['atq'] - 0.062*compq3['dvpsxq_1'] + 0.021*compq3['dlttq']/compq3['atq'] \ - 0.044 * np.log(compq3['atq']) + 0.102*(compq3['indsaleq']/lag(compq3, 'indsaleq') - 1) - 0.035*(compq3['saleq']/lag(compq3, 'saleq') - 1) temp_md_roavol = compq3.groupby(['fyearq', 'fqtr', 'sic2'])['roavol'].median().reset_index() temp_md_roavol = temp_md_roavol.rename(columns={'roavol': 'md_roavol'}) compq3 = pd.merge(compq3, temp_md_roavol, on=['fyearq', 'fqtr', 'sic2']) temp_md_sgrvol = compq3.groupby(['fyearq', 'fqtr', 'sic2'])['sgrvol'].median().reset_index() temp_md_sgrvol = temp_md_sgrvol.rename(columns={'sgrvol': 'md_sgrvol'}) compq3 = pd.merge(compq3, temp_md_sgrvol, on=['fyearq', 'fqtr', 'sic2']) compq3.loc[compq3['roavol'] < compq3['md_roavol'], 'm7'] = 1 compq3.loc[compq3['roavol'] >= compq3['md_roavol'], 'm7'] = 0 compq3.loc[compq3['sgrvol'] < compq3['md_sgrvol'], 'm8'] = 1 compq3.loc[compq3['sgrvol'] >= compq3['md_sgrvol'], 'm8'] = 0 # ibessum = conn.raw_sql(f""" # select ticker, cusip, fpedats, statpers, ANNDATS_ACT, # numest, ANNTIMS_ACT, medest, actual, stdev # from ibes.statsum_epsus # where ticker in {tics} # and fpi='6' # and statpers<ANNDATS_ACT # and measure='EPS' # and (fpedats-statpers)>=0 # """) # ibessum = ibessum[(ibessum['medest'].notna()) & (ibessum['fpedats'].notna())] # ibessum = ibessum.sort_values(by=['cusip','fpedats','statpers'], ascending=[True,True,False]) # ibessum = ibessum.drop_duplicates(['cusip', 'fpedats']) # # crsp_msenames = conn.raw_sql("""select * from crsp.msenames""") # crsp_msenames = crsp_msenames[crsp_msenames['ncusip'].notna()] # crsp_msenames = crsp_msenames.sort_values(['permno','ncusip']) # crsp_msenames = crsp_msenames.drop_duplicates(['permno','ncusip']) # names = crsp_msenames.rename(columns={'cusip':'cusip6'}) # # ibessum2 = pd.merge(ibessum, names[['ncusip','cusip6']], left_on='cusip', right_on=['ncusip'], how='left') # ibessum2['cusip6'] = ibessum2['cusip6'].astype(str).str[0:6] compq3['cnum'] = compq3['cnum'].astype(str).str[0:6] # compq4 = pd.merge(compq3, ibessum2[['medest','actual','cusip6','fpedats']], left_on=['cnum','datadate_q'], right_on=['cusip6','fpedats'], how='left') compq4 = compq3 compq4 = compq4.sort_values(['gvkey', 'datadate_q']) compq4 = compq4.drop_duplicates(['gvkey', 'datadate_q']) # compq4.loc[(compq4['medest'].isna()) | (compq4['actual']).isna(), 'sue'] = compq4['che']/compq4['mveq'] # compq4.loc[(compq4['medest'].notna()) & (compq4['actual']).notna(), 'sue'] = (compq4['actual'] - compq4['medest'])/abs(compq4['prccq']) lnk = conn.raw_sql(f""" select * from crsp.ccmxpf_linktable where lpermno in {permnos} """) lnk = lnk[lnk['linktype'].isin( ['LU', 'LC', 'LD', 'LF', 'LN', 'LO', 'LS', 'LX'])] lnk = lnk[(2018 >= lnk['linkdt'].astype(str).str[0:4].astype(int)) | (lnk['linkdt'] == '.B')] lnk = lnk[(lnk['linkenddt'].isna()) | ("1940" <= lnk['linkenddt'].astype(str).str[0:4])] lnk = lnk.sort_values(['gvkey', 'linkdt']) compq5 = pd.merge(compq4, lnk[['gvkey', 'linkdt', 'linkenddt', 'lpermno']], on='gvkey', how='inner') compq5 = compq5[(compq5['linkdt'] <= compq5['datadate_q']) | (compq5['linkdt'] == '.B')] compq5 = compq5[(compq5['datadate_q'] <= compq5['linkenddt']) | (compq5['linkenddt'].isna())] compq5 = compq5[(compq5['lpermno'] != '.') & compq5['gvkey'].notna()] compq5 = compq5[(compq5['lpermno'].notna()) & (compq5['rdq'].notna())] crsp_dsf = conn.raw_sql(f""" select vol, ret, permno, date from crsp.dsf as d where d.permno in {permnos} """) crsp_dsf = crsp_dsf[crsp_dsf['permno'].isin(compq5['lpermno'])] crsp_dsf['date'] = pd.to_datetime(crsp_dsf['date']) crsp_dsf = crsp_dsf.sort_values('date') compq5['temp_rdq'] = np.busday_offset( compq5['rdq'].values.astype('datetime64[D]'), -10, roll='forward') crsp_dsf['avgvol'] = trailing_std(crsp_dsf, 'vol', n=21, on='permno') compq5 = pd.merge(compq5, crsp_dsf[['date', 'permno', 'avgvol']], how='left', left_on=['temp_rdq', 'lpermno'], right_on=['date', 'permno']) compq5['temp_rdq'] = np.busday_offset( compq5['rdq'].values.astype('datetime64[D]'), 1, roll='forward') crsp_dsf['aeavol'] = trailing_std(crsp_dsf, 'vol', n=3, on='permno') crsp_dsf['ear'] = lag(crsp_dsf, 'ret', 0, on='permno') + lag( crsp_dsf, 'ret', 1, on='permno') + lag(crsp_dsf, 'ret', 2, on='permno') compq6 = pd.merge(compq5, crsp_dsf[['date', 'permno', 'aeavol', 'avgvol', 'ear']], how='left', left_on=['temp_rdq', 'lpermno'], right_on=['date', 'permno']) compq6['avgvol'] = compq6['avgvol_x'] compq6['aeavol'] = (compq6['aeavol'] - compq6['avgvol']) / compq6['avgvol'] compq6 = compq6[[ 'fyearq', 'fqtr', 'apdedateq', 'datadate', 'pdateq', 'fdateq', 'gvkey', 'lpermno', 'datadate_q', 'rdq', 'chtx', 'roaq', 'rsup', 'stdacc', 'stdcf', 'sgrvol', 'rdmq', 'rdsq', 'olq', 'tanq', 'kzq', 'alaq', 'almq', 'laq', 'bmq', 'dmq', 'amq', 'epq', 'cpq', 'emq', 'spq', 'ndpq', 'ebpq', 'wwq', 'rs', 'droeq', 'droaq', 'noaq', 'rnaq', 'pmq', 'atoq', 'ctoq', 'glaq', 'oleq', 'olaq', 'claq', 'blq', 'sgq', 'roavol', 'cash', 'cinvest', 'm7', 'm8', 'prccq', 'roeq', 'aeavol', 'ear' ]] compq6 = compq6.drop_duplicates() return compq6, temp2