def __init__(self, sample_control=True):
    """Load the dataset and its indicator catalogue.

    Args:
        sample_control: when True, load 'data_controlled' (the sample after
            the conditions of apply_condition have been applied); otherwise
            load the unrestricted 'data'.
    """
    source = 'data_controlled' if sample_control else 'data'
    self._data = read_filtered(source)
    self.info = read_unfiltered('info')
    # Flatten the per-category indicator lists into one flat list.
    self.all_indicators = [ind for group in self.info.values() for ind in group]
def get_sd():
    """Monthly volatility: std of daily returns (in percent) per month.

    We cannot use ri.resample('M').std() directly because invalid samples
    must be filtered out first: a stock's month is kept only when it has at
    least MIN_SAMPLES valid daily observations, enforced here via
    dropna(axis=1, thresh=MIN_SAMPLES) inside each month group. groupby can
    be more flexible than resample for this kind of filtering.
    """
    ri = read_filtered('stockRetD', 'D') * 100  # daily returns, unit: percent

    def _to_month_end(dt):
        # map every daily timestamp onto its month-end label
        return dt + MonthEnd(0)

    def _monthly_std(frame):
        # drop stocks (columns) with too few valid observations this month
        return frame.dropna(axis=1, thresh=MIN_SAMPLES).std()

    sd = ri.groupby(_to_month_end).apply(_monthly_std)
    sd.index.names = ['t', 'sid']
    sd.name = 'sd'
    sd.to_frame().to_csv(os.path.join(PATH, 'sd.csv'))
def get_bkmt():
    """Dump the monthly book-to-market ratio as a long-format CSV."""
    panel = read_filtered('bm', 'M')  # monthly
    series = panel.stack()
    series.name = 'bkmt'
    series.index.names = ['t', 'sid']
    series.to_frame().to_csv(os.path.join(PATH, 'bkmt.csv'))
def get_beta_sw_dm():
    """Dimson-style beta estimated from daily data.

    Refer to page 5 of Cakici for details: for each stock-month, beta is the
    sum of the slopes on the contemporaneous and the one-day-lagged market
    excess return from the regression y ~ x1 + x2.

    Saves a Series named 'beta', indexed by ['t', 'sid'], via save(...).
    """
    rf = read_filtered('rfD')
    rm = read_filtered('mktRetD')
    ri = read_filtered('stockRetD')

    df = ri.stack().to_frame()
    df.columns = ['ri']
    df = df.join(pd.concat([rf, rm], axis=1))
    df.columns = ['ri', 'rf', 'rm']
    df.index.names = ['t', 'sid']
    df['y'] = df['ri'] - df['rf']                 # stock excess return
    df['x2'] = df['rm'] - df['rf']                # market excess return
    df['x1'] = df.groupby('sid')['x2'].shift(1)   # one-day lag, within stock

    def _to_month_end(dt):
        return dt + MonthEnd(0)

    def _cal_beta(x):
        # beta = slope(lagged market) + slope(contemporaneous market)
        params = sm.ols('y ~ x1 + x2', data=x).fit().params[['x1', 'x2']]
        return params.sum()

    def _for_one_sid(x):
        # x is a MultiIndex (t, sid) DataFrame for a single stock
        nx = x.reset_index('sid')
        # filter out months with fewer than MIN_SAMPLES complete observations
        nx = nx.groupby(_to_month_end).filter(
            lambda a: a.dropna().shape[0] >= MIN_SAMPLES)
        if nx.shape[0] > 0:
            return nx.groupby(_to_month_end).apply(_cal_beta)
        # implicit None: stocks with no qualifying month are dropped by apply

    # BUG FIX: removed the leftover debug print(sid) (and the fragile
    # positional lookup nx['sid'][0] that existed only to feed it).
    beta = df.groupby('sid').apply(_for_one_sid)
    beta.index.names = ['sid', 't']
    beta = beta.reorder_levels(['t', 'sid']).sort_index(level='t')
    beta.name = 'beta'
    save(beta, 'beta_sw_dm')
def refine_data():
    """Restrict the sample and persist the result.

    Loads 'data', applies the sample conditions (refer to the function
    apply_condition) and saves the restricted panel as 'data_controlled'.
    """
    # TODO: also filter out abnormal values (refine) before applying
    # conditions and re-saving 'data'.
    raw = read_filtered('data')
    controlled = apply_condition(raw)
    save_to_filtered(controlled, 'data_controlled')
def combine_all_benchmarks():
    """Concatenate every benchmark model's factors into one DataFrame.

    Returns:
        (benchmark, info): `benchmark` is the column-wise concatenation of
        all model factors with columns renamed 'model__factor'; `info` maps
        each model name to its list of renamed columns.
    """
    models = ['capmM', 'ff3M', 'ffcM', 'ff5M', 'hxz4M', 'ff6M']
    pieces = []
    info = {}
    for model in models:
        factors = read_filtered(model)
        if factors.ndim == 1:
            # single-factor model (such as capmM) comes back as a Series
            factors.name = '{}__{}'.format(model, factors.name)
            info[model] = [factors.name]
        else:
            factors.columns = pd.Index(
                ['{}__{}'.format(model, col) for col in factors.columns],
                name=factors.columns.name)
            info[model] = factors.columns.tolist()
        pieces.append(factors)
    benchmark = pd.concat(pieces, axis=1)
    return benchmark, info
def combine_all_indicators():
    """Concatenate every firm-characteristic indicator into one DataFrame.

    Returns:
        (indicators, info): `indicators` is the column-wise concatenation of
        all indicator tables with columns renamed 'fn__col'; `info` maps each
        indicator file name to its list of renamed columns.
    """
    fns = [
        'size', 'beta', 'value', 'momentum', 'reversal', 'liquidity',
        'skewness', 'idio', 'op', 'inv', 'roe'
    ]
    xs = []
    info = {}
    for fn in fns:
        x = read_filtered(fn)
        # stack those panels with only one indicator (such as reversal)
        if not isinstance(x.index, pd.MultiIndex):
            if x.columns.name == 'sid':
                x = x.stack().to_frame()
                x.columns = [fn]
        # BUG FIX: the column-index name must be passed as the `name=`
        # keyword; the second positional argument of pd.Index is `dtype`,
        # not `name` (combine_all_benchmarks already does this correctly).
        x.columns = pd.Index(['{}__{}'.format(fn, col) for col in x.columns],
                             name=x.columns.name)
        xs.append(x)
        info[fn] = x.columns.tolist()
    indicators = pd.concat(xs, axis=1)
    return indicators, info
def join_all():
    '''
    Build the master panel joining weight, indicators and returns.

    We use the indicators and weight at time T to predict the adjusted
    return at time T+1, so for time T we have:
        1. weight
        2. indicators
    and for time T+1:
        1. stock excess return
        2. rp
        3. benchmark

    All the indicators are shifted forward one month, except for eret, rf
    and the other base data; thus the index denotes time t+1, the
    indicators are from time t and the base data are from time t+1. We
    adjust the indicators rather than the base data for these reasons:

    1. we sort the indicators at time t to construct portfolios and analyse
       the eret at time t+1;
    2. the index for eret and benchmark must correspond to the time when
       they were calculated. If we shifted back the base data here (rather
       than shifting the indicators forward), we would have to shift eret
       forward again when regressing the portfolio eret on the benchmark
       model in the function _alpha in template.py.

    Put simply, we use the information at t to predict the eret of time
    t+1. In DATA.data the index denotes time t, and the values for eretM,
    the benchmark model and so on are from time t+1.

    Notice:
        To calculate value-weighted results we use the market capitalization
        of time t (the portfolio formation period) as weight, so capM must
        be shifted forward one step as well. For more details refer to
        page 40 of Bali.
    '''
    # --------------------time T-1 (Backward) ---------------------------------
    weight = read_filtered('size')['mktCap']
    weight.name = 'weight'

    indicators, info = combine_all_indicators()

    # -----------------------------time T--------------------------------------
    stockEretM = read_filtered('stockEretM')
    stockEretM = stockEretM.stack()
    stockEretM.name = 'stockEretM'

    rfM = read_filtered('rfM')
    mktRetM = read_filtered('mktRetM')
    rpM = read_filtered('rpM')

    # combine single-indexed series
    single = pd.concat([rfM, mktRetM, rpM], axis=1)
    # combine multi-indexed frames
    multi = pd.concat([weight, indicators, stockEretM], axis=1)
    data = multi.join(single, how='outer')
    # BUG FIX: a MultiIndex takes `names` (plural); `.name = [...]` is not
    # valid here and is inconsistent with the rest of this module.
    data.index.names = ['t', 'sid']
    data.columns.name = 'type'

    # BUG FIX: close the pickle file handle (was an unclosed open(...)).
    with open(os.path.join(PKL_UNFILTERED_PATH, 'info.pkl'), 'wb') as f:
        pickle.dump(info, f)

    # save info as df
    infoDf = pd.concat([pd.Series(v, name=k) for k, v in info.items()],
                       axis=1)
    # NOTE(review): written to the current working directory, unlike the
    # other outputs in this module — confirm this is intentional.
    infoDf.to_csv('info.csv')
    save_to_filtered(data, 'data')
def get_sd():
    """Monthly volatility proxy: 20-day rolling std of daily returns (in
    percent), sampled at each month end (at least 15 valid days required).

    NOTE(review): this redefines the earlier get_sd if both live in the same
    module — confirm which version is intended.
    """
    daily = read_filtered('stockRetD') * 100  # unit: percent
    rolling_std = daily.rolling(20, min_periods=15).std()
    sd = rolling_std.resample('M').last()
    save(sd, 'sd')
def get_ret():
    """Dump monthly stock returns as a long-format CSV."""
    monthly = read_filtered('stockRetM', 'M')
    stacked = monthly.stack()
    stacked.name = 'ret'
    stacked.index.names = ['t', 'sid']
    stacked.to_frame().to_csv(os.path.join(PATH, 'ret.csv'))
# -*-coding: utf-8 -*- # Python 3.6 # Author:Zhang Haitao # Email:[email protected] # TIME:2018-07-24 10:39 # NAME:assetPricing2-combine_data.py from data.dataTools import read_filtered size = read_filtered('size')['size'] price = read_filtered('stockCloseM').stack() beta = read_filtered('beta_sw_dm') sd = read_filtered('sd') see = read_filtered('see')