Example #1
def __init__(self, sample_control=True):
    if sample_control:
        self._data = read_filtered('data_controlled')
    else:
        self._data = read_filtered('data')
    self.info = read_unfiltered('info')
    self.all_indicators = [ele for l in self.info.values() for ele in l]
Example #2
def get_sd():
    # sd: standard deviation of daily returns within each month
    #TODO: bookmark this function and add it to my repository (pandas handbook); use this method to upgrade the relevant functions
    ri = read_filtered('stockRetD', 'D') * 100  #TODO: the unit is percent (%)

    #filter
    '''
    We cannot use

    sd = ri.resample('M').std()

    directly, since we need to filter out invalid samples before calculating the std:

    def _cal_std(x):
        if x.notnull().sum() >= MIN_SAMPLES:  #TODO: compare with x.dropna().shape[0]; which one is faster?
            return x.std()

    sd0 = ri.resample('M').agg(_cal_std)

    Sometimes groupby can be more flexible than resample.
    '''
    #TODO: use resampling
    _get_monthend = lambda x: x + MonthEnd(0)
    sd = ri.groupby(_get_monthend).apply(
        lambda df: df.dropna(axis=1, thresh=MIN_SAMPLES).std())

    sd.index.names = ['t', 'sid']
    sd.name = 'sd'
    sd.to_frame().to_csv(os.path.join(PATH, 'sd.csv'))
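The comment block above describes a resample-based alternative. Below is a minimal, self-contained sketch of that approach; the toy DataFrame ri, its column names and the MIN_SAMPLES threshold are illustrative assumptions, not part of the original project.

import numpy as np
import pandas as pd

MIN_SAMPLES = 15

# toy wide panel of daily returns in percent: one column per stock
dates = pd.date_range('2018-01-01', periods=60, freq='B')
ri = pd.DataFrame(np.random.randn(60, 3) * 100,
                  index=dates, columns=['s1', 's2', 's3'])

def _cal_std(x):
    # only report a std for months with enough valid daily observations
    if x.notnull().sum() >= MIN_SAMPLES:
        return x.std()

# monthly std per stock; months with too few observations would come back as NaN
sd0 = ri.resample('M').agg(_cal_std)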
Example #3
def get_bkmt():
    # monthly
    bkmt = read_filtered('bm', 'M')
    bkmt = bkmt.stack()
    bkmt.name = 'bkmt'
    bkmt.index.names = ['t', 'sid']
    bkmt.to_frame().to_csv(os.path.join(PATH, 'bkmt.csv'))
Example #4
def get_beta_sw_dm():
    '''
    Refer to page 5 of Cakici for details about this beta.

    Returns:
        None. The resulting beta series is saved as 'beta_sw_dm'.
    '''
    # beta
    rf = read_filtered('rfD')
    rm = read_filtered('mktRetD')
    ri = read_filtered('stockRetD')
    df = ri.stack().to_frame()
    df.columns = ['ri']
    df = df.join(pd.concat([rf, rm], axis=1))
    df.columns = ['ri', 'rf', 'rm']
    df.index.names = ['t', 'sid']

    df['y'] = df['ri'] - df['rf']   # stock excess return
    df['x2'] = df['rm'] - df['rf']  # market excess return
    df['x1'] = df.groupby('sid')['x2'].shift(1)  # lagged market excess return

    def _cal_beta(x):
        # sum of the slopes on the lagged and contemporaneous market excess return
        result = sm.ols('y ~ x1 + x2', data=x).fit().params[['x1', 'x2']]
        return result.sum()

    def _for_one_sid(x):
        # x is a MultiIndex DataFrame for a single stock
        nx = x.reset_index('sid')
        sid = nx['sid'].iloc[0]
        print(sid)
        _get_monthend = lambda dt: dt + MonthEnd(0)
        # filter out months with fewer than MIN_SAMPLES observations
        nx = nx.groupby(_get_monthend).filter(
            lambda a: a.dropna().shape[0] >= MIN_SAMPLES)
        if nx.shape[0] > 0:
            result = nx.groupby(_get_monthend).apply(_cal_beta)
            return result

    beta = df.groupby('sid').apply(_for_one_sid)
    beta.index.names = ['sid', 't']
    beta = beta.reorder_levels(['t', 'sid']).sort_index(level='t')
    beta.name = 'beta'

    save(beta, 'beta_sw_dm')
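The beta above sums the regression slopes on the contemporaneous and one-day-lagged market excess return. A minimal standalone sketch of that estimate is shown below; the simulated data, the coefficient values and the statsmodels import alias are illustrative assumptions.

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
x2 = pd.Series(rng.normal(size=250))                        # market excess return at t
x1 = x2.shift(1)                                            # market excess return at t-1
y = 0.8 * x2 + 0.2 * x1 + rng.normal(scale=0.1, size=250)   # toy stock excess return

df = pd.DataFrame({'y': y, 'x1': x1, 'x2': x2}).dropna()
params = smf.ols('y ~ x1 + x2', data=df).fit().params
beta = params[['x1', 'x2']].sum()  # summed-slope beta
print(beta)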
Example #5
def refine_data():
    '''
    Apply conditions to the sample; refer to the function apply_condition.

    Returns:
        None. The conditioned sample is saved as 'data_controlled'.
    '''
    data = read_filtered('data')
    # data=refine(data) #TODO: filter out the abnormal values
    # save_to_filtered(data,'data')

    data_controlled = apply_condition(data)
    save_to_filtered(data_controlled, 'data_controlled')
Example #6
def combine_all_benchmarks():
    models = ['capmM', 'ff3M', 'ffcM', 'ff5M', 'hxz4M', 'ff6M']
    xs = []
    info = {}
    for model in models:
        x = read_filtered(model)
        if x.ndim == 1:  # a model with only one column, such as capmM
            x.name = '{}__{}'.format(model, x.name)
            info[model] = [x.name]
        else:
            x.columns = pd.Index(
                ['{}__{}'.format(model, col) for col in x.columns],
                name=x.columns.name)
            info[model] = x.columns.tolist()
        xs.append(x)
    benchmark = pd.concat(xs, axis=1)
    return benchmark, info
Example #7
def combine_all_indicators():
    fns = [
        'size', 'beta', 'value', 'momentum', 'reversal', 'liquidity',
        'skewness', 'idio', 'op', 'inv', 'roe'
    ]

    xs = []
    info = {}
    for fn in fns:
        x = read_filtered(fn)
        # stack panels that hold only one indicator, such as reversal
        if not isinstance(x.index, pd.MultiIndex):
            if x.columns.name == 'sid':
                x = x.stack().to_frame()
                x.columns = [fn]

        x.columns = pd.Index(['{}__{}'.format(fn, col) for col in x.columns],
                             name=x.columns.name)
        xs.append(x)
        info[fn] = x.columns.tolist()

    indicators = pd.concat(xs, axis=1)
    return indicators, info
Example #8
def join_all():
    '''
        We use the indicators and the weight at time t to predict the adjusted return at time t+1,
    so for time T we have:
        1. weight
        2. indicators

    For time T+1:
        1. stock excess return
        2. rp
        3. benchmark

        All the indicators are shifted forward one month, except for eret, rf and the other base
    data. The index therefore denotes time t+1: the indicators come from time t and the base data
    from time t+1. We adjust the indicators rather than the base data for these reasons:
    1. We sort on the indicators at time t to construct portfolios and analyse the eret at time
        t+1.
    2. We need to make sure that the index for eret and benchmark corresponds to the time when
    they were calculated. If we shifted the base data backward here (rather than shifting the
    indicators forward), we would have to shift eret forward again when we regress the portfolio
    eret on the benchmark model in the function _alpha in template.py.

    Put simply, we use the information at time t to predict the eret at time t+1. In our DATA.data,
    the index denotes time t, and the values for eretM, the benchmark models and so on come from
    time t+1.

    Notice:
        To calculate value-weighted results, we use the market capitalization at time t (the
        portfolio formation period) as the weight. So, in this place, we should shift capM forward
        one step as well. For more details, refer to page 40 of Bali.

    '''

    # --------------------time T-1 (Backward) ---------------------------------
    weight = read_filtered('size')['mktCap']
    weight.name = 'weight'
    indicators, info = combine_all_indicators()

    # -----------------------------time T--------------------------------------
    stockEretM = read_filtered('stockEretM')
    stockEretM = stockEretM.stack()
    stockEretM.name = 'stockEretM'

    rfM = read_filtered('rfM')
    mktRetM = read_filtered('mktRetM')
    rpM = read_filtered('rpM')

    # combine the single-indexed series
    single = pd.concat([rfM, mktRetM, rpM], axis=1)

    # combine the multi-indexed series
    multi = pd.concat([weight, indicators, stockEretM], axis=1)
    data = multi.join(single, how='outer')
    data.index.names = ['t', 'sid']
    data.columns.name = 'type'

    pickle.dump(info, open(os.path.join(PKL_UNFILTERED_PATH, 'info.pkl'),
                           'wb'))

    # save info as df
    infoDf = pd.concat([pd.Series(v, name=k) for k, v in info.items()], axis=1)
    infoDf.to_csv('info.csv')

    save_to_filtered(data, 'data')
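The docstring above explains that the indicators observed at month t are lined up with the returns of month t+1. The shift itself is not visible in join_all, so the snippet below only illustrates the idea on a toy MultiIndex frame; the column names ('indicator', 'eretM', 'indicator_lag') and the values are illustrative assumptions, not part of the original project.

import pandas as pd

idx = pd.MultiIndex.from_product(
    [pd.date_range('2018-01-31', periods=3, freq='M'), ['s1', 's2']],
    names=['t', 'sid'])
panel = pd.DataFrame({'indicator': [1, 10, 2, 20, 3, 30],
                      'eretM': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06]}, index=idx)

# shift the indicator forward one month per stock; eretM stays at its own month,
# so each row now pairs last month's indicator with this month's return
panel['indicator_lag'] = panel.groupby('sid')['indicator'].shift(1)
print(panel)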
Example #9
def get_sd():
    ri = read_filtered('stockRetD') * 100  # daily returns in percent
    # 20-day rolling std (at least 15 valid observations), sampled at each month end
    sd = ri.rolling(20, min_periods=15).std().resample('M').last()
    save(sd, 'sd')
Example #10
def get_ret():
    ret = read_filtered('stockRetM', 'M')
    ret = ret.stack()
    ret.name = 'ret'
    ret.index.names = ['t', 'sid']
    ret.to_frame().to_csv(os.path.join(PATH, 'ret.csv'))
Example #11
# -*- coding: utf-8 -*-
# Python 3.6
# Author: Zhang Haitao
# Email: [email protected]
# TIME: 2018-07-24  10:39
# NAME: assetPricing2-combine_data.py
from data.dataTools import read_filtered

size = read_filtered('size')['size']
price = read_filtered('stockCloseM').stack()
beta = read_filtered('beta_sw_dm')
sd = read_filtered('sd')
see = read_filtered('see')