def get_bm():
    """Compute book-to-market (bm) and log(bm) and save both to csv.

    A little different from the book: here be and me are per-share values,
    while the book uses totals over all floating shares; the ratio bm is
    unaffected. This function is also a handy snippet showing how to
    manipulate a date index in pandas.
    """
    be = read_df('bps', 'M')
    be = be[be.index.month == 12]  # keep only fiscal year-end (December) book equity
    me = read_df('stockCloseY', 'M')
    be, me = get_inter_frame([be, me])
    bm = be / me
    bm[bm <= 0] = np.nan  # discard samples with non-positive bm
    # December book equity only becomes usable 6 months later
    bm = bm.shift(1, freq='6M')
    full_index = pd.date_range(bm.index[0], bm.index[-1], freq='M')
    bm = bm.reindex(index=full_index)
    # carry each annual observation forward for at most 11 months
    bm = bm.fillna(method='ffill', limit=11)
    bm.to_csv(os.path.join(DATA_PATH, 'bm.csv'))
    logbm = np.log(bm)
    logbm.to_csv(os.path.join(DATA_PATH, 'logbm.csv'))
def get_eretD():
    """Compute daily excess returns and save them to eretD.csv."""
    stock_ret = read_df('stockRetD', 'D')
    rf = read_df('rfD', 'D')
    # stockRetD is indexed by business dates while rfD uses calendar dates
    excess = stock_ret.sub(rf['rfD'], axis=0)
    # dropping all-NaN rows shrinks the index from calendar dates back to
    # business dates
    excess = excess.dropna(axis=0, how='all')
    excess.to_csv(os.path.join(DATA_PATH, 'eretD.csv'))
def unify_value(self):
    """Stack bm and logbm into one long frame indexed by ('t', 'sid')."""
    stacked = []
    for name in ['bm', 'logbm']:
        long_form = read_df(name, 'M').stack()
        long_form.index.names = ['t', 'sid']
        stacked.append(long_form)
    comb = pd.concat(stacked, axis=1, keys=['bm', 'logbm'])
    return comb
def compare_wind_gta_bps():
    """Align bps from wind and gta on their common dates/stocks for comparison.

    The result is different a lot!!!

    Fix: the original computed the aligned frames and then discarded them
    (no return, no side effect), so the function did nothing observable.
    Returning the frames keeps existing callers working (they ignored the
    implicit None) and makes the comparison usable.

    :return: tuple (bps_wind, bps_gta), aligned on the intersection of
        their indexes and columns
    """
    bps_wind = read_df('bps_wind', 'M')
    bps_gta = read_df('bps', 'M')
    # wind columns apparently carry a 3-char exchange suffix (e.g. '.SZ');
    # strip it and the leading zeros to match gta's ids — TODO confirm format
    bps_wind.columns = [str(int(col[:-3])) for col in bps_wind.columns]
    bps_wind = bps_wind.sort_index(axis=1)
    bps_gta = bps_gta.sort_index(axis=1)
    bps_wind, bps_gta = get_inter_frame([bps_wind, bps_gta])
    return bps_wind, bps_gta
def unify_size(self):
    """Combine size, mktCap_ff and size_ff into one long ('t','sid') frame.

    Input frames are wide: index is t, columns are sid.

    Fix: the original returned the concatenated frame without naming its
    MultiIndex levels, inconsistent with the other unify_* methods (which
    all set ['t', 'sid']); downstream joins rely on those level names.

    :return: DataFrame indexed by ('t', 'sid') with columns
        size / mktCap_ff / size_ff
    """
    size = read_df('size', 'M')
    mktCap_ff = read_df('mktCap_ff', 'M')
    size_ff = read_df('size_ff', 'M')
    size = size.stack()
    size.name = 'size'
    mktCap_ff = mktCap_ff.stack()
    mktCap_ff.name = 'mktCap_ff'
    size_ff = size_ff.stack()
    size_ff.name = 'size_ff'
    comb = pd.concat([size, mktCap_ff, size_ff], axis=1)
    comb.index.names = ['t', 'sid']  # consistent with sibling unify_* methods
    return comb
def get_momentum():
    """Compute momentum indicators (in percent) and save them to momentum.csv.

    Builds lagged windows (skip month t) and non-lagged windows (include
    month t) of cumulated returns per stock. Relies on module-level helpers
    `_before` / `_upto` — presumably rolling cumulated-return helpers defined
    elsewhere in this module; TODO confirm they are in scope.
    """
    stockRetM = read_df('stockRetM', 'M')
    stk = stockRetM.stack()
    stk.index.names = ['t', 'sid']
    # lagged 1 month: since the window is 11 and we do not use the value of
    # time t, we set 12 rather than 11 (second element is min_periods)
    d_lag = {
        'mom': [12, 9],
        'r12': [13, 10],
        'r6': [7, 5]
    }
    # non-lagged windows: include the return of month t
    d_nonlag = {'R12M': [12, 10], 'R9M': [9, 7], 'R6M': [6, 5], 'R3M': [3, 3]}
    ss = []
    names = []
    for bn, bp in d_lag.items():
        # note: the lambda is evaluated immediately inside apply, so capturing
        # bp from the loop is safe here
        ser = stk.groupby('sid').apply(lambda s: _before(s, bp[0], bp[1]))
        ss.append(ser)
        names.append(bn)
    for un, up in d_nonlag.items():
        ser = stk.groupby('sid').apply(lambda s: _upto(s, up[0], up[1]))
        ss.append(ser)
        names.append(un)
    momentum = pd.concat(ss, axis=1, keys=names)
    momentum = momentum * 100  # express as percent
    momentum.to_csv(os.path.join(DATA_PATH, 'momentum.csv'))
def get_rev():
    """Save monthly stock returns (in percent) as the reversal indicator."""
    monthly_ret = read_df('stockRetM', 'M')
    rev = (monthly_ret * 100).stack().to_frame()
    rev.columns = ['reversal']
    rev.index.names = ['t', 'sid']
    rev.to_csv(os.path.join(DATA_PATH, 'reversal.csv'))
def unify_eretM(self):
    """Return monthly excess returns as a long frame indexed by ('t','sid')."""
    long_form = read_df('eretM', 'M').stack().to_frame()
    long_form.columns = ['eretM']
    long_form.index.names = ['t', 'sid']
    return long_form
def unify_size(self):
    """Combine capM, size, mktCap_ff and size_ff into one long frame.

    Input frames are wide (index: t, columns: sid); the result is indexed
    by ('t', 'sid') with one column per measure.
    """
    spec = [
        ('capM', 'mktCap'),
        ('size', 'size'),
        ('mktCap_ff', 'mktCap_ff'),
        ('size_ff', 'size_ff'),
    ]
    columns = []
    for fname, colname in spec:
        stacked = read_df(fname, 'M').stack()
        stacked.name = colname
        columns.append(stacked)
    comb = pd.concat(columns, axis=1)
    comb.index.names = ['t', 'sid']
    return comb
def cal_market_states():
    """Unfinished stub for computing market states.

    References (search for 'market state' in zotero):
    1. Cheema and Nartea, "Momentum Returns, Market States, and Market
       Dynamics," chapter 3.1: Following Chui et al. (2010), we set stocks
       with monthly returns greater (lower) than 100 (-95) percent equal to
       100 (-95) percent to avoid the influence of extreme returns and any
       possible data recording errors.

    :return: None
    """
    # TODO(review): implementation missing — upDown is loaded but never used
    upDown = read_df('upDown', 'M')
    pass
def _combine_all_data(self):
    """Join excess returns, all indicators (lagged one month) and mktRetM
    into one long ('t', 'sid') frame, truncated to years >= 1996.
    """
    ret = read_df('stockRetM', freq='M')
    rf = read_df('rfM', freq='M')
    # NOTE(review): assumes the rfM frame has a column named 'rf' — other
    # functions in this file use rf['rfM']; confirm the column name.
    eret = ret.sub(rf['rf'], axis=0)
    eret = eret.stack()
    eret.name = 'eret'
    # TODO: create a df to store eret
    all_indicators = [
        ind for l_values in self.information.values() for ind in l_values
    ]
    # TODO:
    '''
    All the data are shifted forward one month except for eret, so the index
    denotes time t+1: everything except eret is from time t, and only eret is
    from time t+1. The dataset is adjusted this way because:
    1. we sort the indicators at time t to construct portfolios and analyse
       the eret at time t+1;
    2. we need the index of eret to correspond to the time it was calculated.
       If we shifted eret back here (rather than shifting the other
       indicators forward), we would have to shift eret forward again when
       regressing portfolio eret on mktRetM in _alpha in template.py.
    '''
    dfs = [eret] + [
        read_df(ind, 'M').shift(1).stack() for ind in all_indicators
    ]
    data = pd.concat(dfs, axis=1, keys=['eret'] + all_indicators)
    data.index.names = ['t', 'sid']
    # add mktRetM (single-index frame joined onto the MultiIndex frame:
    # pandas broadcasts on the shared 't' level)
    mktRetM = read_df('mktRetM', freq='M')
    mktRetM.index.name = 't'
    data = data.join(
        mktRetM)  # combine multiIndex dataframe with single index dataframe
    # truncate the sample
    return data[data.index.get_level_values('t').year >= 1996]
def unify_liquidity(self):
    """Combine the illiquidity measures with liqBeta into one long frame.

    Bug fix: the original called .head() on two intermediate results
    (leftover debugging), silently truncating the output to 5 rows.

    :return: DataFrame indexed by ('t', 'sid') with one column per
        illiquidity measure plus 'liqBeta'
    """
    illiq = pd.read_csv(os.path.join(DATA_PATH, 'illiq.csv'),
                        index_col=[0, 1], parse_dates=True)
    # pivot the 'type' level into columns, one per illiquidity measure
    illiq = illiq.stack().unstack('type')
    illiq.index.names = ['t', 'sid']
    liqBeta = read_df('liqBeta', 'M')
    liqBeta = liqBeta.stack()
    liqBeta.index.names = ['t', 'sid']
    liqBeta.name = 'liqBeta'
    comb = pd.concat([illiq, liqBeta], axis=1)
    return comb
def unify_capM(self):
    """Return market capitalization as a long frame indexed by ('t','sid').

    Usually the market capitalization is used as a weight, and we use this
    value at time t.

    Fix: removed `capM.index.name = 't'` — after stack() the index is a
    MultiIndex, so assigning a single name is invalid/ineffective and was
    immediately overwritten by the `index.names` assignment below.

    :return: DataFrame with single column 'capM'
    """
    capM = read_df('capM', 'M')
    capM = capM.stack().to_frame()
    capM.columns = ['capM']
    capM.index.names = ['t', 'sid']
    return capM
def floor_price(df, clsPrice=5.0):
    """Filter out observations whose monthly close price is below clsPrice.

    Bug fix: the threshold was hard-coded as 5.0 inside the body, so the
    clsPrice parameter was silently ignored. The default keeps the original
    behavior for existing callers.

    :param df: DataFrame with a ('t', 'sid') MultiIndex to be filtered
    :param clsPrice: minimum acceptable close price (default 5.0)
    :return: df restricted to (t, sid) pairs whose close price >= clsPrice
    """
    stockCloseM = read_df('stockCloseM', 'M')
    stockCloseM.columns = stockCloseM.columns.astype(str)
    valid = stockCloseM[stockCloseM >= clsPrice].stack()
    df = filter_multiIndex(df, valid.index)
    return df
def get_upDown():
    """Compute lagged market-state flags (+1 up / -1 down) and save them.

    For each window (12/24/36 months) the rolling sum of market returns is
    lagged one month and mapped to +1 when positive, -1 when negative.

    Reference: Cooper Michael J., Gutierrez Roberto C., and Hameed
    Allaudeen, "Market States and Momentum."
    """
    mktRetM = read_df('mktRetM', 'M')
    windows = [12, 24, 36]
    states = []
    for w in windows:
        cum_ret = mktRetM['mktRetM'].rolling(window=w).sum().shift(1)
        cum_ret[cum_ret > 0] = 1
        cum_ret[cum_ret < 0] = -1
        states.append(cum_ret)
    upDown = pd.concat(states, axis=1, keys=windows)
    upDown.to_csv(os.path.join(DATA_PATH, 'upDown.csv'))
def get_momentum():
    """Compute momentum indicators (in percent) and save each to its own csv.

    Lagged windows (`d_lag`) exclude the return of month t via s[:-1];
    non-lagged windows (`d_nonlag`) include it. Each dict maps indicator
    name -> [window, min_periods].
    """
    stockRetM = read_df('stockRetM', 'M')
    stk = stockRetM.stack()
    stk.index.names = ['t', 'sid']
    # lagged 1 month (skip the most recent month inside the window)
    d_lag = {'mom': [11, 9], 'r12': [12, 10], 'r6': [6, 5]}
    # non-lagged (window runs up to and including month t)
    d_nonlag = {'R12M': [12, 10], 'R9M': [9, 7], 'R6M': [6, 5], 'R3M': [3, 3]}

    def _cal_cumulated_return(s):
        # geometric cumulation of simple returns over the window
        return np.cumprod(s + 1)[-1] - 1

    def _before(s, interval, min_periods):
        # for d_lag, do not include the return of time t (s[:-1])
        return s.rolling(interval, min_periods=min_periods).apply(
            lambda s: _cal_cumulated_return(s[:-1]))

    def _upto(s, interval, min_periods):
        # include the return of time t
        return s.rolling(interval,
                         min_periods=min_periods).apply(_cal_cumulated_return)

    ss = []
    names = []
    for bn, bp in d_lag.items():
        ser = stk.groupby('sid').apply(lambda s: _before(s, bp[0], bp[1]))
        ss.append(ser)
        names.append(bn)
    for un, up in d_nonlag.items():
        ser = stk.groupby('sid').apply(lambda s: _upto(s, up[0], up[1]))
        ss.append(ser)
        names.append(un)
    momentum = pd.concat(ss, axis=1, keys=names)
    momentum = momentum * 100  # express as percent
    # TODO: which type to save — stacked or with different files
    for col in momentum.columns:
        momentum[col].unstack().to_csv(os.path.join(DATA_PATH, col + '.csv'))
def get_hxz4M():
    r"""Assemble the monthly HXZ q-factor data and save it as hxz4M.csv.

    Related: get_hxz4Factors() in
    D:\app\python27\zht\researchTopics\assetPricing\calFactors.py
    """
    direc = r'E:\a\quantDb\researchTopics\assetPricing\hxz4\factor'
    factor_names = ['rsmb', 'ria', 'rroe']
    frames = []
    for name in factor_names:
        frame = pd.read_csv(os.path.join(direc, name + '.csv'), index_col=0)
        frame.index.name = 't'
        frame.columns = [name]
        frames.append(frame)
    comb = pd.concat(frames, axis=1)
    # normalize the raw index to month-end timestamps
    comb.index = pd.to_datetime(comb.index) + MonthEnd()
    # borrow the market risk premium from the FF3 monthly factors
    ff3 = read_df('ff3M', 'M')
    comb['rp'] = ff3['rp']
    comb.to_csv(os.path.join(DATA_PATH, 'hxz4M.csv'))
def unify_hxz4M(self):
    """Return the monthly HXZ q-factor frame as stored on disk."""
    return read_df('hxz4M', 'M')
def unify_ff5M(self):
    """Return the monthly Fama-French 5-factor frame as stored on disk."""
    return read_df('ff5M', 'M')
def unify_capM(self):
    """Return the wide monthly market-cap frame with its index named 't'."""
    frame = read_df('capM', 'M')
    frame.index.name = 't'
    return frame
def unify_rpM(self):
    """Return the monthly risk-premium frame as stored on disk."""
    return read_df('rpM', 'M')
def unify_hxz4M(self):
    """Return hxz4 monthly factors with column names prefixed by 'hxz4M'."""
    factors = read_df('hxz4M', 'M')
    return _add_prefix(factors, 'hxz4M')
def get_rpD():
    """Extract the daily risk premium from ff3D and save it to rpD.csv."""
    risk_premium = read_df('ff3D', 'D')[['rp']]
    risk_premium.to_csv(os.path.join(DATA_PATH, 'rpD.csv'))
def get_eretD():
    """Compute daily excess returns and save them to eretD.csv.

    Fix: stockRetD is indexed by business dates while rfD uses calendar
    dates, so the subtraction produces all-NaN rows on non-trading calendar
    dates. The other variant of this function in this file drops those rows
    for exactly that reason; this version omitted the drop — restored here
    for consistency.
    """
    stockRetD = read_df('stockRetD', 'D')
    rfD = read_df('rfD', 'D')
    eretD = stockRetD.sub(rfD['rfD'], axis=0)
    # adjust the index from calendar dates back to business dates
    eretD = eretD.dropna(axis=0, how='all')
    eretD.to_csv(os.path.join(DATA_PATH, 'eretD.csv'))
def get_eretM():
    """Compute monthly excess returns and save them to eretM.csv."""
    monthly_ret = read_df('stockRetM', 'M')
    rf = read_df('rfM', 'M')
    excess = monthly_ret.sub(rf['rfM'], axis=0)
    excess.to_csv(os.path.join(DATA_PATH, 'eretM.csv'))
def __init__(self):
    # load the factor data once at construction; the two reads are independent
    self.ff3 = read_df('ff3', 'M')
    self.mktRetM = read_df('mktRetM', 'M')
def get_rpM():
    """Extract the monthly risk premium from ff3M and save it to rpM.csv."""
    risk_premium = read_df('ff3M', 'M')[['rp']]
    risk_premium.to_csv(os.path.join(DATA_PATH, 'rpM.csv'))
def unify_rfM(self):
    """Return the monthly risk-free-rate frame as stored on disk."""
    return read_df('rfM', 'M')
def get_liquidity_ps(): df = read_gta('Liq_PSM_M') #MarketType==21 综合A股和创业板 # 流通市值加权,but on the page 310,Bali use total market capilization condition1 = (df['MarketType'] == 21) condition2 = (df['ST'] == 1) #delete the ST stocks df = df[condition1 & condition2][['Trdmnt', 'AggPS_os']] df.columns = ['t', 'rm'] df = df.set_index('t') df.index = freq_end(df.index, 'M') df = df.sort_index() df['rm_ahead'] = df['rm'].shift(1) df['delta_rm'] = df['rm'] - df['rm'].shift(1) df['delta_rm_ahead'] = df['rm_ahead'] - df['rm_ahead'].shift(1) #df.groupby(lambda x:x.year).apply(lambda df:df.shape[0]) #TODO: we don't know the length of window to regress.In this place,we use the five years history def regr(df): if df.shape[0] > 30: return sm.ols(formula='delta_rm ~ delta_rm_ahead + rm_ahead', data=df).fit().resid[0] else: return np.NaN window = 60 # not exact 5 years lm = pd.Series( [regr(df.loc[:month][-window:].dropna()) for month in df.index], index=df.index) lm.name = 'lm' ret = read_df('stockRetM', freq='M') rf = read_df('rfM', freq='M') eret = ret.sub(rf['rf'], axis=0) eret = eret.stack() eret.index.names = ['t', 'sid'] eret.name = 'eret' ff3 = read_df('ff3_gta', 'M') factors = pd.concat([ff3, lm], axis=1) comb = eret.to_frame().join(factors) def _for_one_month(df): if df.shape[0] >= 30: return sm.ols(formula='eret ~ rp + smb + hml + lm', data=df).fit().params['lm'] else: return np.NaN def _get_result(df): thresh = 30 #30 month if df.shape[0] > thresh: values = [] sid = df.index[0][1] df = df.reset_index(level='sid', drop=True) months = df.index.tolist()[thresh:] for month in months: subdf = df.loc[:month][-60:] subdf = subdf.dropna() # df=df.reset_index(level='sid',drop=True).loc[:month].last(window) values.append(_for_one_month(subdf)) print(sid) return pd.Series(values, index=months) result = comb.groupby('sid').apply(_get_result) result.unstack('sid').to_csv(os.path.join(DATA_PATH, 'liqBeta.csv'))
def unify_mktRetM(self):
    """Return the monthly market-return frame as stored on disk."""
    return read_df('mktRetM', 'M')