def _get_rf(freq):
    '''
    Parse the risk-free rate from the database.

    Args:
        freq: 'D' (daily), 'W' (weekly) or 'M' (monthly)

    Returns:
        pd.Series: risk-free rate named 'rf'+freq, indexed by datetime 't',
            expressed as an actual value (not a percentage).
    '''
    dic = {'D': 'Nrrdaydt', 'W': 'Nrrwkdt', 'M': 'Nrrmtdt'}
    tname = 'TRD_Nrrate'
    src = read_gta(tname)
    # NRI01 = 定期-整存整取-一年利率 (one-year fixed-term deposit rate);
    # TBC = treasury coupon rate. The annual risk-free rate is converted
    # to the target frequency below.
    src = src[src['Nrr1'] == 'NRI01']  # trick: choose the type of risk free rate
    src = src.set_index('Clsdt')
    rf = src[dic[freq]][2:]  # delete the first two rows
    rf.index.name = 't'
    rf.name = 'rf' + freq
    rf.index = pd.to_datetime(rf.index)
    if freq in ['W', 'M']:
        # Take the middle observation of each period. Use .iloc (the original
        # x[round(...)] relied on deprecated positional fallback for integer
        # keys on a datetime-indexed Series) and guard against empty bins,
        # which resample() can produce and which would raise IndexError.
        rf = rf.resample(freq).agg(
            lambda x: x.iloc[round(x.shape[0] / 2)] if x.shape[0] else np.nan)
    # trick: the unit of rf in the file is %, adjust it to the actual value
    return rf / 100.0
def parse_financial_report(tbname, varname, freq='Y', consolidated=True):
    '''
    Parse one indicator from a financial-report table.

    Args:
        tbname: name of the GTA table to read
        varname: column to extract from the table
        freq: {'Y','Q'}, 'Y' means yearly, 'Q' means quarterly
        consolidated: if True, keep only rows from consolidated financial
            statements (Typrep == 'A')

    Returns:
        DataFrame indexed by report date ('t'), columns are stock ids ('sid').

    Raises:
        ValueError: if freq is neither 'Y' nor 'Q' (the original silently
            returned None in that case).
    '''
    df = read_gta(tbname)
    if consolidated:
        df = df[df['Typrep'] == 'A']  # 合并报表 (consolidated statements)
    colname = 'Stkcd'
    indname = 'Accper'
    table = pd.pivot_table(df, varname, indname, colname)
    table.index.name = 't'
    table.index = pd.to_datetime(table.index)
    table.columns = table.columns.astype(str)
    table.columns.name = 'sid'
    if freq == 'Y':
        # yearly reports are dated at the end of December
        return table[table.index.month == 12]
    elif freq == 'Q':
        # quarterly reports are dated at quarter ends
        return table[table.index.month.isin([3, 6, 9, 12])]
    else:
        raise ValueError("freq must be 'Y' or 'Q', got {!r}".format(freq))
def get_amihud_illiq():
    '''
    Compute the Amihud illiquidity measure over rolling windows of several
    horizons and save both the raw and log versions under 'illiq'.
    '''
    df = read_gta('TRD_Dalyr')
    df = df[['Stkcd', 'Trddt', 'Dretwd', 'Dnvaltrd']]
    df.columns = ['sid', 't', 'ret', 'volume']
    df['t'] = freq_end(df['t'], 'D')
    df = df.set_index(['t', 'sid'])
    if not df.index.is_monotonic_increasing:
        # TODO: gta's data is not monotonic_increasing; add these two rows
        # to other scripts
        df = df.sort_index(level='t')
    # window label -> minimum number of valid daily observations
    # (renamed from `dict`, which shadowed the builtin)
    windows = OrderedDict({'1M': 15, '3M': 50, '6M': 100, '12M': 200})
    result = groupby_rolling(df, 'illiq', windows, _amihud)
    result.index.names = ['type', 't']
    ln_result = np.log(result)
    ln_result = ln_result.reset_index()
    ln_result['type'] = 'ln_' + ln_result['type'].astype(str)
    ln_result = ln_result.set_index(['type', 't'])
    illiq = pd.concat([result, ln_result], axis=0)
    # TODO: use valid observation for the whole project as page 276
    # adjust the format of the DataFrame
    illiq.columns = pd.Index(illiq.columns.astype(str), illiq.columns.name)
    illiq = illiq.reset_index()
    illiq['t'] = freq_end(illiq['t'], 'M')
    illiq = illiq.set_index(['type', 't'])
    illiq = illiq.stack().unstack(level='type')
    # TODO: The data is really noisy, refer to outliers figures for details
    save(illiq, 'illiq')
def get_ep():
    '''Earnings-to-price ratio (inverse of PE), resampled to month end
    and saved as 'ep'.'''
    raw = read_gta('STK_MKT_Dalyr', encoding='gbk')
    raw['t'] = pd.to_datetime(raw['TradingDate'])
    raw['sid'] = raw['Symbol'].astype(str)
    raw['ep'] = 1.0 / raw['PE']
    ep = raw.pivot_table(values='ep', index='t', columns='sid')
    ep = ep.sort_index()
    ep = ep.resample('M').last()
    save(ep, 'ep')
def get_tradingStatusD():
    '''Daily trading-status flags: True where the stock traded normally.

    Returns the (t x sid) DataFrame and also saves it as 'tradingStatusD'.
    '''
    raw = read_gta('TRD_Dalyr', encoding='gbk')
    # Trick: Trdsta == 1 means "正常交易" (normal trading)
    raw['is_normal'] = raw['Trdsta'] == 1.0
    raw['t'] = pd.to_datetime(raw['Trddt'])
    raw['sid'] = raw['Stkcd'].astype(str)
    status = raw.pivot_table(values='is_normal', index='t', columns='sid')
    save(status, 'tradingStatusD')
    return status
def get_ffcM():
    '''Carhart four factors (monthly), written to DATA_PATH/ffcM.csv.'''
    raw = read_gta('STK_MKT_CarhartFourFactors')
    # P9709 = 全部A股市场包含沪深A股和创业板 (all A-shares incl. ChiNext),
    # 流通市值加权 (weighted by tradable market cap)
    cols = ['TradingMonth', 'RiskPremium1', 'SMB1', 'HML1', 'UMD2']
    raw = raw.loc[raw['MarkettypeID'] == 'P9709', cols]
    raw.columns = ['t', 'rp', 'smb', 'hml', 'mom']
    raw.set_index('t').to_csv(os.path.join(DATA_PATH, 'ffcM.csv'))
def get_ff3D():
    '''Fama-French three factors (daily), written to DATA_PATH/ff3D.csv.'''
    raw = read_gta('STK_MKT_ThrfacDay')
    # NOTE(review): the filter uses 'P9707' while the adjacent comment
    # documents 'P9709' (全部A股市场包含沪深A股和创业板, 流通市值加权)
    # — confirm which market-type id is intended.
    mask = raw['MarkettypeID'] == 'P9707'
    raw = raw.loc[mask, ['TradingDate', 'RiskPremium1', 'SMB1', 'HML1']]
    raw.columns = ['t', 'rp', 'smb', 'hml']
    raw.set_index('t').to_csv(os.path.join(DATA_PATH, 'ff3D.csv'))
def get_ff5M():
    '''Fama-French five factors (monthly), written to DATA_PATH/ff5M.csv.'''
    raw = read_gta('STK_MKT_FivefacMonth')
    # P9709 = 全部A股市场包含沪深A股和创业板 (all A-shares incl. ChiNext),
    # 流通市值加权; Portfolios == 1 selects the 2x3 投资组合 sorts
    mask = (raw['MarkettypeID'] == 'P9709') & (raw['Portfolios'] == 1)
    cols = ['TradingMonth', 'RiskPremium1', 'SMB1', 'HML1', 'RMW1', 'CMA1']
    raw = raw.loc[mask, cols]
    raw.columns = ['t', 'rp', 'smb', 'hml', 'rmw', 'cma']
    raw.set_index('t').to_csv(os.path.join(DATA_PATH, 'ff5M.csv'))
def get_ff3D():
    '''Fama-French three factors (daily); saved as 'ff3D' and returned.'''
    raw = read_gta('STK_MKT_ThrfacDay')
    # NOTE(review): the filter uses 'P9707' while the adjacent comment
    # documents 'P9709' (全部A股市场包含沪深A股和创业板, 流通市值加权)
    # — confirm which market-type id is intended.
    raw = raw.loc[raw['MarkettypeID'] == 'P9707',
                  ['TradingDate', 'RiskPremium1', 'SMB1', 'HML1']]
    raw.columns = ['t', 'rp', 'smb', 'hml']
    raw.columns.name = 'type'
    raw = raw.set_index('t')
    raw.index = freq_end(raw.index, 'D')
    save(raw, 'ff3D')
    return raw
def get_ffcM():
    '''Carhart four factors (monthly), saved as 'ffcM'.'''
    raw = read_gta('STK_MKT_CarhartFourFactors')
    # trick: P9709 = 全部A股市场包含沪深A股和创业板 (all A-shares incl.
    # ChiNext), 流通市值加权 (weighted by tradable market cap)
    cols = ['TradingMonth', 'RiskPremium1', 'SMB1', 'HML1', 'UMD2']
    raw = raw.loc[raw['MarkettypeID'] == 'P9709', cols]
    raw.columns = ['t', 'rp', 'smb', 'hml', 'mom']
    raw.columns.name = 'type'
    raw = raw.set_index('t')
    raw.index = freq_end(raw.index, 'M')
    save(raw, 'ffcM')
def get_ff3M():
    '''Fama-French three factors (monthly); saved as 'ff3M' and returned.'''
    # fixme: there are some abnormal values
    raw = read_gta('STK_MKT_ThrfacMonth')
    # trick: P9709 = 全部A股市场包含沪深A股和创业板, 流通市值加权
    raw = raw.loc[raw['MarkettypeID'] == 'P9709',
                  ['TradingMonth', 'RiskPremium1', 'SMB1', 'HML1']]
    raw.columns = ['t', 'rp', 'smb', 'hml']
    raw = raw.set_index('t')
    raw.index = freq_end(raw.index, 'M')
    raw.columns.name = 'type'
    save(raw, 'ff3M')
    return raw
def get_ff5M():
    '''Fama-French five factors (monthly), saved as 'ff5M'.'''
    raw = read_gta('STK_MKT_FivefacMonth')
    # trick: P9709 = 全部A股市场包含沪深A股和创业板, 流通市值加权;
    # Portfolios == 1 selects the 2x3 投资组合 sorts
    mask = (raw['MarkettypeID'] == 'P9709') & (raw['Portfolios'] == 1)
    cols = ['TradingMonth', 'RiskPremium1', 'SMB1', 'HML1', 'RMW1', 'CMA1']
    raw = raw.loc[mask, cols]
    raw.columns = ['t', 'rp', 'smb', 'hml', 'rmw', 'cma']
    raw.columns.name = 'type'
    raw = raw.set_index('t')
    raw.index = freq_end(raw.index, 'M')
    save(raw, 'ff5M')
def get_mktRetM():
    '''Monthly aggregate market return, saved as 'mktRetM'.

    Uses Cmretwdos: 考虑现金红利再投资的综合月市场回报率(流通市值加权平均法)
    — return with cash dividends reinvested, float-cap weighted.
    '''
    raw = read_gta('TRD_Cnmont')
    raw = raw[raw['Markettype'] == 21]  # 21 = 综合A股和创业板 (A-shares + ChiNext)
    raw = raw.set_index('Trdmnt')
    raw.index = freq_end(raw.index, 'M')
    raw.index.name = 't'
    s = raw['Cmretwdos']
    s.name = 'mktRetM'
    save(s, 'mktRetM')
def get_mktRetD():
    '''Daily aggregate market return, saved as 'mktRetD'.

    Uses Cdretwdos: 考虑现金红利再投资的综合日市场回报率(流通市值加权平均法)
    — return with cash dividends reinvested, float-cap weighted.
    '''
    raw = read_gta('TRD_Cndalym')
    raw = raw[raw['Markettype'] == 21]  # trick: 21 = 综合A股和创业板
    raw = raw.set_index('Trddt')
    raw.index.name = 't'
    raw.index = pd.to_datetime(raw.index)
    s = raw['Cdretwdos']
    # TODO: put this line into check function or unify function?
    s.name = 'mktRetD'
    save(s, 'mktRetD')
def get_amihud_illiq():
    '''
    Compute the Amihud illiquidity measure over several rolling horizons and
    write the raw and log values to DATA_PATH/illiq.csv.
    '''
    df = read_gta('TRD_Dalyr')
    df = df[['Stkcd', 'Trddt', 'Dretwd', 'Dnvaltrd']]
    df.columns = ['sid', 't', 'ret', 'volume']
    df['t'] = convert_freq(df['t'], 'D')
    df = df.set_index(['t', 'sid'])
    if not df.index.is_monotonic_increasing:
        # gta's data is not monotonic_increasing; sort before rolling, for
        # consistency with the other get_amihud_illiq variants (their TODO
        # asks for exactly this to be propagated to the other scripts)
        df = df.sort_index(level='t')
    # window label -> minimum number of valid daily observations
    # (renamed from `dict`, which shadowed the builtin)
    windows = {'1M': 15, '3M': 50, '6M': 100, '12M': 200}
    result = groupby_rolling(df, 'illiq', windows, _amihud)
    result.index.names = ['type', 't']
    ln_result = np.log(result)
    ln_result = ln_result.reset_index()
    ln_result['type'] = 'ln_' + ln_result['type']
    ln_result = ln_result.set_index(['type', 't'])
    illiq = pd.concat([result, ln_result], axis=0)
    # TODO: use valid observation for the whole project as page 276
    illiq.to_csv(os.path.join(DATA_PATH, 'illiq.csv'))
def parse_all_financial_indicators():
    '''Extract every indicator from the FI_T1..FI_T11 tables, convert each
    to monthly frequency and pickle it under dirFI as '<table>__<var>.pkl'.
    '''
    meta_cols = {'Accper', 'Indcd', 'Stkcd', 'Typrep'}
    for i in range(1, 12):
        tbname = 'FI_T{}'.format(i)
        raw = read_gta(tbname)
        # the consolidated flag only applies when the table carries Typrep
        consolidated = 'Typrep' in raw.columns
        candidates = [c for c in raw.columns if c not in meta_cols]
        for varname in _filter_indicators(candidates):
            table = parse_financial_report(tbname, varname,
                                           consolidated=consolidated)
            table = quaterly2monthly(table)
            table.to_pickle(
                os.path.join(dirFI, '{}__{}.pkl'.format(tbname, varname)))
            print(tbname, varname)
def get_listInfo():
    '''Per-stock listing information: list date plus industry/exchange flags,
    saved as 'listInfo'.'''
    info = read_gta('IPO_Cobasic', encoding='gbk')
    info = info.set_index('Stkcd')
    info.index.name = 'sid'
    info.index = info.index.astype(str)
    info.columns.name = 'type'
    # TODO: refer to page 12 of 动量因子_164.pdf ' 1 代表剔除金融、保险、 ST 类股票'
    info['not_financial'] = info['Indcd'] != 1  # financial stocks
    # "cross" means the stock is listed on multiple stock markets
    info['not_cross'] = info['Crcd'].isnull()
    info['is_sh'] = info['Listexg'] == 1  # listed on Shanghai
    info['is_sz'] = info['Listexg'] == 2  # listed on Shenzhen
    # 'Listdt' is the listed date, 'Ipodt' the IPO date; some 'Listdt'
    # values are invalid placeholders
    info['Listdt'] = info['Listdt'].replace(['0000-00-00', '2100-01-01'],
                                            np.nan)
    info['listDate'] = pd.to_datetime(info['Listdt'])
    info = info[['listDate', 'not_financial', 'not_cross', 'is_sh', 'is_sz']]
    # drop every occurrence of duplicated ids such as '600018'
    info = info[~info.index.duplicated(False)]
    info = info.dropna()
    save(info, 'listInfo')
def get_amihud_illiq():
    '''
    Compute the Amihud illiquidity measure over rolling windows of several
    horizons and write the raw and log values to DATA_PATH/illiq.csv.
    '''
    df = read_gta('TRD_Dalyr')
    df = df[['Stkcd', 'Trddt', 'Dretwd', 'Dnvaltrd']]
    df.columns = ['sid', 't', 'ret', 'volume']
    df['t'] = freq_end(df['t'], 'D')
    df = df.set_index(['t', 'sid'])
    if not df.index.is_monotonic_increasing:
        # TODO: gta's data is not monotonic_increasing; add these two rows
        # to other scripts
        df = df.sort_index(level='t')
    # window label -> minimum number of valid daily observations
    # (renamed from `dict`, which shadowed the builtin)
    windows = OrderedDict({'1M': 15, '3M': 50, '6M': 100, '12M': 200})
    result = groupby_rolling(df, 'illiq', windows, _amihud)
    result.index.names = ['type', 't']
    ln_result = np.log(result)
    ln_result = ln_result.reset_index()
    ln_result['type'] = 'ln_' + ln_result['type'].astype(str)
    ln_result = ln_result.set_index(['type', 't'])
    illiq = pd.concat([result, ln_result], axis=0)
    # TODO: use valid observation for the whole project as page 276
    illiq.to_csv(os.path.join(DATA_PATH, 'illiq.csv'))
def get_liquidity_ps(): df = read_gta('Liq_PSM_M') #MarketType==21 综合A股和创业板 # 流通市值加权,but on the page 310,Bali use total market capilization condition1 = (df['MarketType'] == 21) condition2 = (df['ST'] == 1) #delete the ST stocks df = df[condition1 & condition2][['Trdmnt', 'AggPS_os']] df.columns = ['t', 'rm'] df = df.set_index('t') df.index = freq_end(df.index, 'M') df = df.sort_index() df['rm_ahead'] = df['rm'].shift(1) df['delta_rm'] = df['rm'] - df['rm'].shift(1) df['delta_rm_ahead'] = df['rm_ahead'] - df['rm_ahead'].shift(1) #df.groupby(lambda x:x.year).apply(lambda df:df.shape[0]) #TODO: we don't know the length of window to regress.In this place,we use the five years history def regr(df): if df.shape[0] > 30: return sm.ols(formula='delta_rm ~ delta_rm_ahead + rm_ahead', data=df).fit().resid[0] else: return np.NaN window = 60 # not exact 5 years lm = pd.Series( [regr(df.loc[:month][-window:].dropna()) for month in df.index], index=df.index) lm.name = 'lm' ret = read_df('stockRetM', freq='M') rf = read_df('rfM', freq='M') eret = ret.sub(rf['rf'], axis=0) eret = eret.stack() eret.index.names = ['t', 'sid'] eret.name = 'eret' ff3 = read_df('ff3_gta', 'M') factors = pd.concat([ff3, lm], axis=1) comb = eret.to_frame().join(factors) def _for_one_month(df): if df.shape[0] >= 30: return sm.ols(formula='eret ~ rp + smb + hml + lm', data=df).fit().params['lm'] else: return np.NaN def _get_result(df): thresh = 30 #30 month if df.shape[0] > thresh: values = [] sid = df.index[0][1] df = df.reset_index(level='sid', drop=True) months = df.index.tolist()[thresh:] for month in months: subdf = df.loc[:month][-60:] subdf = subdf.dropna() # df=df.reset_index(level='sid',drop=True).loc[:month].last(window) values.append(_for_one_month(subdf)) print(sid) return pd.Series(values, index=months) result = comb.groupby('sid').apply(_get_result) result.unstack('sid').to_csv(os.path.join(DATA_PATH, 'liqBeta.csv'))
def get_liquidity():
    '''Monthly liquidity measures (turnover, Amihud, Pastor-Stambaugh),
    combined into one wide (t, sid) DataFrame and saved as 'liquidity'.
    '''
    def _load(tbname, src_cols, new_cols):
        # read one GTA table, keep normal-trading rows only, and reshape
        # to a float frame indexed by (t, sid)
        frame = read_gta(tbname, index_col=0)
        frame = frame[frame['Status'] == 'A']  # A = 正常交易 (normal trading)
        frame = frame[['Stkcd', 'Trdmnt'] + src_cols]
        frame.columns = ['sid', 't'] + new_cols
        frame['t'] = freq_end(frame['t'], 'M')
        frame['sid'] = frame['sid'].astype(str)
        frame = frame.set_index(['t', 'sid'])
        return frame.astype(float)

    # Turnover rate
    turnover = _load('Liq_Tover_M',
                     ['ToverOsM', 'ToverTlM', 'ToverOsMAvg', 'ToverTlMAvg'],
                     ['turnover1', 'turnover2', 'turnover3', 'turnover4'])
    # Amihud
    amihud = _load('Liq_Amihud_M', ['ILLIQ_M'], ['amihud'])
    # The roll (Liq_Roll_M: roll1, roll2) and zeros (Liq_Zeros_M: zeros1,
    # zeros2) measures are excluded: they are not proper for portfolio
    # analysis, since there are a lot of zeros in the sample, which will
    # cause errors in the program.
    # Pastor Stambaugh
    ps = _load('Liq_PS_M', ['PSos', 'PStl'], ['ps1', 'ps2'])

    # combine them, dropping duplicated (t, sid) rows within each table
    combined = pd.concat(
        [frame[~frame.index.duplicated()] for frame in [turnover, amihud, ps]],
        axis=1)
    combined.columns.name = 'type'
    save(combined, 'liquidity')