예제 #1
0
def _get_rf(freq):
    '''
    Parse the risk-free rate from the GTA table ``TRD_Nrrate``.

    Args:
        freq: 'D' (daily), 'W' (weekly) or 'M' (monthly). Selects the source
            column and, for 'W' and 'M', the resampling frequency.

    Returns:
        Series named ``'rf' + freq`` with a datetime index named ``t``.
        The values read from the table are divided by 100.

    Raises:
        KeyError: if ``freq`` is not one of 'D', 'W', 'M'.
    '''
    dic = {'D': 'Nrrdaydt', 'W': 'Nrrwkdt', 'M': 'Nrrmtdt'}

    tname = 'TRD_Nrrate'
    src = read_gta(tname)
    # NRI01 = one-year lump-sum time deposit rate; TBC = treasury bond coupon
    # rate. Per the original note, the annual risk-free rate is converted to
    # monthly data with the compound-interest method.
    src = src[src['Nrr1'] ==
              'NRI01']  # trick: choose the type of risk free rate
    src = src.set_index('Clsdt')

    rf = src[dic[freq]][2:]  # delete the first two rows
    rf.index.name = 't'
    rf.name = 'rf' + freq

    rf.index = pd.to_datetime(rf.index)

    def _middle_obs(x):
        # A resampling bin without any observation has no middle element;
        # report it as missing instead of raising IndexError.
        if x.shape[0] == 0:
            return float('nan')
        # Explicit positional access: plain ``x[i]`` on a date-indexed Series
        # depends on the deprecated integer-position fallback.
        return x.iloc[round(x.shape[0] / 2)]

    if freq in ['W', 'M']:
        # Represent each week/month by the observation in the middle of it.
        rf = rf.resample(freq).agg(_middle_obs)

    # trick: the unit of rf in the file is %, so adjust it to the actual value.
    return rf / 100.0
예제 #2
0
def parse_financial_report(tbname, varname, freq='Y', consolidated=True):
    '''
    Parse one indicator from a financial-report table into a wide table.

    Args:
        tbname: name of the GTA table, forwarded to ``read_gta``.
        varname: column of the table holding the indicator to extract.
        freq: {'Y','Q'}. 'Y' keeps only rows whose period falls in December,
            'Q' keeps rows whose period falls in March, June, September or
            December.
        consolidated: if True, keep only rows with ``Typrep == 'A'``
            (consolidated financial statements).

    Returns:
        DataFrame indexed by datetime ``t`` (from column ``Accper``) with one
        column per ``sid`` (from column ``Stkcd``, converted to str).

    Raises:
        ValueError: if ``freq`` is neither 'Y' nor 'Q'.
    '''
    # Fail fast on an unsupported frequency instead of silently returning
    # None after the table has already been read and pivoted.
    if freq not in ('Y', 'Q'):
        raise ValueError(
            "freq must be 'Y' or 'Q', got {!r}".format(freq))

    df = read_gta(tbname)
    if consolidated:
        df = df[df['Typrep'] == 'A']  # consolidated statements

    table = pd.pivot_table(df, values=varname, index='Accper',
                           columns='Stkcd')
    table.index.name = 't'
    table.index = pd.to_datetime(table.index)
    table.columns = table.columns.astype(str)
    table.columns.name = 'sid'

    if freq == 'Y':
        return table[table.index.month == 12]
    return table[table.index.month.isin([3, 6, 9, 12])]
예제 #3
0
def get_amihud_illiq():
    '''
    Build the Amihud illiquidity measures from daily trading data and save
    them under the name ``'illiq'``.

    The daily table ``TRD_Dalyr`` is reduced to return and trading value per
    (t, sid), passed to ``groupby_rolling`` with ``_amihud`` for the horizons
    '1M', '3M', '6M' and '12M', and the natural log of every measure is added
    with an ``ln_`` prefix. The saved frame is indexed by (t, sid) with one
    column per measure type.
    '''
    df = read_gta('TRD_Dalyr')
    df = df[['Stkcd', 'Trddt', 'Dretwd', 'Dnvaltrd']]
    df.columns = ['sid', 't', 'ret', 'volume']
    df['t'] = freq_end(df['t'], 'D')
    df = df.set_index(['t', 'sid'])
    if not df.index.is_monotonic_increasing:
        # TODO: gta's data is not monotonic increasing; add these two lines
        # to the other scripts as well.
        df = df.sort_index(level='t')

    # Horizon label -> integer forwarded to groupby_rolling
    # (presumably a minimum number of daily observations -- TODO confirm).
    windows = OrderedDict({'1M': 15, '3M': 50, '6M': 100, '12M': 200})

    result = groupby_rolling(df, 'illiq', windows, _amihud)
    result.index.names = ['type', 't']

    ln_result = np.log(result)
    ln_result = ln_result.reset_index()
    ln_result['type'] = 'ln_' + ln_result['type'].astype(str)
    ln_result = ln_result.set_index(['type', 't'])
    illiq = pd.concat([result, ln_result], axis=0)
    # TODO: use valid observation for the whole project as page 276

    # Adjust the format of the DataFrame. The columns name must be passed as
    # ``name=``: the second positional parameter of ``pd.Index`` is ``dtype``.
    illiq.columns = pd.Index(illiq.columns.astype(str),
                             name=illiq.columns.name)
    illiq = illiq.reset_index()
    illiq['t'] = freq_end(illiq['t'], 'M')
    illiq = illiq.set_index(['type', 't'])
    illiq = illiq.stack().unstack(level='type')

    # TODO: The data is really noisy, refer to outliers figures for details
    save(illiq, 'illiq')
예제 #4
0
def get_ep():
    '''
    Compute the earnings-to-price ratio (1 / PE) from ``STK_MKT_Dalyr`` and
    save the last available value of every calendar month as ``'ep'``.
    '''
    raw = read_gta('STK_MKT_Dalyr', encoding='gbk')
    raw['t'] = pd.to_datetime(raw['TradingDate'])
    raw['sid'] = raw['Symbol'].astype(str)
    raw['ep'] = 1.0 / raw['PE']

    # Wide layout: one row per date, one column per stock.
    daily_ep = pd.pivot_table(raw, values='ep', index='t', columns='sid')
    daily_ep = daily_ep.sort_index()
    monthly_ep = daily_ep.resample('M').last()
    save(monthly_ep, 'ep')
예제 #5
0
def get_tradingStatusD():
    '''
    Build the daily trading-status table from ``TRD_Dalyr``, save it as
    ``'tradingStatusD'`` and return it.

    Returns:
        DataFrame indexed by date ``t`` with one column per ``sid``, holding
        the pivoted ``Trdsta == 1`` flag.
    '''
    raw = read_gta('TRD_Dalyr', encoding='gbk')
    # Trick: Trdsta == 1 means "normal trading".
    raw['is_normal'] = raw['Trdsta'] == 1.0
    raw['t'] = pd.to_datetime(raw['Trddt'])
    raw['sid'] = raw['Stkcd'].astype(str)

    status = pd.pivot_table(raw,
                            values='is_normal',
                            index='t',
                            columns='sid')
    save(status, 'tradingStatusD')
    return status
예제 #6
0
def get_ffcM():
    '''
    Export the monthly Carhart four-factor data to ``ffcM.csv`` in
    ``DATA_PATH``, with columns rp, smb, hml and mom indexed by ``t``.
    '''
    raw = read_gta('STK_MKT_CarhartFourFactors')
    # P9709: the whole A-share market, including Shanghai/Shenzhen A-shares
    # and ChiNext. Factors are weighted by float market capitalisation.
    in_market = raw['MarkettypeID'] == 'P9709'
    source_cols = ['TradingMonth', 'RiskPremium1', 'SMB1', 'HML1', 'UMD2']

    factors = raw[in_market][source_cols]
    factors.columns = ['t', 'rp', 'smb', 'hml', 'mom']
    factors = factors.set_index('t')

    target = os.path.join(DATA_PATH, 'ffcM.csv')
    factors.to_csv(target)
예제 #7
0
def get_ff3D():
    '''
    Export the daily three-factor data to ``ff3D.csv`` in ``DATA_PATH``,
    with columns rp, smb and hml indexed by ``t``.
    '''
    raw = read_gta('STK_MKT_ThrfacDay')
    # NOTE(review): the original comment described P9709 (the whole A-share
    # market, including Shanghai/Shenzhen A-shares and ChiNext, weighted by
    # float market capitalisation), but the filter uses P9707 -- confirm
    # which market type is intended.
    in_market = raw['MarkettypeID'] == 'P9707'
    source_cols = ['TradingDate', 'RiskPremium1', 'SMB1', 'HML1']

    factors = raw[in_market][source_cols]
    factors.columns = ['t', 'rp', 'smb', 'hml']
    factors = factors.set_index('t')

    target = os.path.join(DATA_PATH, 'ff3D.csv')
    factors.to_csv(target)
예제 #8
0
def get_ff5M():
    '''
    Export the monthly five-factor data to ``ff5M.csv`` in ``DATA_PATH``,
    with columns rp, smb, hml, rmw and cma indexed by ``t``.
    '''
    raw = read_gta('STK_MKT_FivefacMonth')
    # P9709: the whole A-share market, including Shanghai/Shenzhen A-shares
    # and ChiNext. Factors are weighted by float market capitalisation.
    in_market = raw['MarkettypeID'] == 'P9709'
    # Portfolios == 1: the 2x3 portfolio construction.
    is_2x3 = raw['Portfolios'] == 1
    source_cols = [
        'TradingMonth', 'RiskPremium1', 'SMB1', 'HML1', 'RMW1', 'CMA1'
    ]

    factors = raw[in_market & is_2x3][source_cols]
    factors.columns = ['t', 'rp', 'smb', 'hml', 'rmw', 'cma']
    factors = factors.set_index('t')

    target = os.path.join(DATA_PATH, 'ff5M.csv')
    factors.to_csv(target)
예제 #9
0
def get_ff3D():
    '''
    Build the daily three-factor table, save it as ``'ff3D'`` and return it.

    Returns:
        DataFrame with columns rp, smb and hml (columns name ``type``),
        indexed by ``t`` after conversion with ``freq_end(..., 'D')``.
    '''
    raw = read_gta('STK_MKT_ThrfacDay')
    # NOTE(review): the original comment described P9709 (the whole A-share
    # market, including Shanghai/Shenzhen A-shares and ChiNext, weighted by
    # float market capitalisation), but the filter uses P9707 -- confirm
    # which market type is intended.
    in_market = raw['MarkettypeID'] == 'P9707'
    source_cols = ['TradingDate', 'RiskPremium1', 'SMB1', 'HML1']

    factors = raw[in_market][source_cols]
    factors.columns = ['t', 'rp', 'smb', 'hml']
    factors.columns.name = 'type'
    factors = factors.set_index('t')
    factors.index = freq_end(factors.index, 'D')

    save(factors, 'ff3D')
    return factors
예제 #10
0
def get_ffcM():
    '''
    Build the monthly Carhart four-factor table and save it as ``'ffcM'``.

    The saved frame has columns rp, smb, hml and mom (columns name ``type``)
    and is indexed by ``t`` after conversion with ``freq_end(..., 'M')``.
    '''
    raw = read_gta('STK_MKT_CarhartFourFactors')
    # trick: P9709 is the whole A-share market, including Shanghai/Shenzhen
    # A-shares and ChiNext.
    # trick: factors are weighted by float market capitalisation.
    in_market = raw['MarkettypeID'] == 'P9709'
    source_cols = ['TradingMonth', 'RiskPremium1', 'SMB1', 'HML1', 'UMD2']

    factors = raw[in_market][source_cols]
    factors.columns = ['t', 'rp', 'smb', 'hml', 'mom']
    factors.columns.name = 'type'
    factors = factors.set_index('t')
    factors.index = freq_end(factors.index, 'M')

    save(factors, 'ffcM')
예제 #11
0
def get_ff3M():
    '''
    Build the monthly three-factor table, save it as ``'ff3M'`` and return it.

    FIXME: there are some abnormal values in this data.

    Returns:
        DataFrame with columns rp, smb and hml (columns name ``type``),
        indexed by ``t`` after conversion with ``freq_end(..., 'M')``.
    '''
    raw = read_gta('STK_MKT_ThrfacMonth')
    # trick: P9709 is the whole A-share market, including Shanghai/Shenzhen
    # A-shares and ChiNext.
    # trick: factors are weighted by float market capitalisation.
    in_market = raw['MarkettypeID'] == 'P9709'
    source_cols = ['TradingMonth', 'RiskPremium1', 'SMB1', 'HML1']

    factors = raw[in_market][source_cols]
    factors.columns = ['t', 'rp', 'smb', 'hml']
    factors = factors.set_index('t')
    factors.index = freq_end(factors.index, 'M')
    factors.columns.name = 'type'

    save(factors, 'ff3M')
    return factors
예제 #12
0
def get_ff5M():
    '''
    Build the monthly five-factor table and save it as ``'ff5M'``.

    The saved frame has columns rp, smb, hml, rmw and cma (columns name
    ``type``) and is indexed by ``t`` after conversion with
    ``freq_end(..., 'M')``.
    '''
    raw = read_gta('STK_MKT_FivefacMonth')
    # trick: P9709 is the whole A-share market, including Shanghai/Shenzhen
    # A-shares and ChiNext.
    # trick: factors are weighted by float market capitalisation.
    in_market = raw['MarkettypeID'] == 'P9709'
    # trick: Portfolios == 1 selects the 2x3 portfolio construction.
    is_2x3 = raw['Portfolios'] == 1
    source_cols = [
        'TradingMonth', 'RiskPremium1', 'SMB1', 'HML1', 'RMW1', 'CMA1'
    ]

    factors = raw[in_market & is_2x3][source_cols]
    factors.columns = ['t', 'rp', 'smb', 'hml', 'rmw', 'cma']
    factors.columns.name = 'type'
    factors = factors.set_index('t')
    factors.index = freq_end(factors.index, 'M')

    save(factors, 'ff5M')
예제 #13
0
def get_mktRetM():
    '''
    Extract the monthly market return from ``TRD_Cnmont`` and save it as the
    Series ``'mktRetM'`` indexed by ``t``.
    '''
    date_col = 'Trdmnt'
    # trick: composite market return with cash dividends reinvested
    # (float-market-cap weighted average).
    # NOTE(review): the original comment called this the "daily" return even
    # though it is read from the monthly table -- confirm.
    return_col = 'Cmretwdos'

    raw = read_gta('TRD_Cnmont')
    # Markettype 21 = composite A-shares and ChiNext.
    market = raw[raw['Markettype'] == 21].set_index(date_col)
    market.index = freq_end(market.index, 'M')
    market.index.name = 't'

    mkt_ret = market[return_col]
    mkt_ret.name = 'mktRetM'

    save(mkt_ret, 'mktRetM')
예제 #14
0
def get_mktRetD():
    '''
    Extract the daily market return from ``TRD_Cndalym`` and save it as the
    Series ``'mktRetD'`` indexed by datetime ``t``.
    '''
    date_col = 'Trddt'
    # trick: composite daily market return with cash dividends reinvested
    # (float-market-cap weighted average).
    return_col = 'Cdretwdos'

    raw = read_gta('TRD_Cndalym')
    # trick: Markettype 21 = composite A-shares and ChiNext.
    market = raw[raw['Markettype'] == 21]

    market = market.set_index(date_col)
    market.index.name = 't'
    market.index = pd.to_datetime(market.index)

    mkt_ret = market[return_col]
    # TODO: put this line into check function or unify function?
    mkt_ret.name = 'mktRetD'
    save(mkt_ret, 'mktRetD')
예제 #15
0
def get_amihud_illiq():
    '''
    Build the Amihud illiquidity measures from daily trading data and write
    them to ``illiq.csv`` in ``DATA_PATH``.

    The daily table ``TRD_Dalyr`` is reduced to return and trading value per
    (t, sid), passed to ``groupby_rolling`` with ``_amihud`` for the horizons
    '1M', '3M', '6M' and '12M', and the natural log of every measure is
    appended with an ``ln_`` prefix on its ``type`` label.
    '''
    df = read_gta('TRD_Dalyr')
    df = df[['Stkcd', 'Trddt', 'Dretwd', 'Dnvaltrd']]
    df.columns = ['sid', 't', 'ret', 'volume']
    df['t'] = convert_freq(df['t'], 'D')
    df = df.set_index(['t', 'sid'])
    if not df.index.is_monotonic_increasing:
        # gta's data is not guaranteed to be in increasing date order, so
        # sort by date before handing it to the rolling computation.
        df = df.sort_index(level='t')

    # Horizon label -> integer forwarded to groupby_rolling
    # (presumably a minimum number of daily observations -- TODO confirm).
    # Named ``windows`` so the builtin ``dict`` is not shadowed.
    windows = {'1M': 15, '3M': 50, '6M': 100, '12M': 200}

    result = groupby_rolling(df, 'illiq', windows, _amihud)
    result.index.names = ['type', 't']

    ln_result = np.log(result)
    ln_result = ln_result.reset_index()
    # Cast to str first so the prefix concatenation does not depend on the
    # dtype groupby_rolling gives the 'type' level.
    ln_result['type'] = 'ln_' + ln_result['type'].astype(str)
    ln_result = ln_result.set_index(['type', 't'])
    illiq = pd.concat([result, ln_result], axis=0)
    # TODO: use valid observation for the whole project as page 276
    illiq.to_csv(os.path.join(DATA_PATH, 'illiq.csv'))
예제 #16
0
def parse_all_financial_indicators():
    '''
    Parse every indicator of the tables ``FI_T1`` .. ``FI_T11`` and pickle
    each one to ``dirFI`` as ``<table>__<indicator>.pkl``.

    For each table, all columns other than the identifier columns are passed
    through ``_filter_indicators``; every surviving indicator is parsed with
    ``parse_financial_report``, converted with ``quaterly2monthly`` and
    written to disk. Progress is printed per indicator.
    '''
    id_columns = ['Accper', 'Indcd', 'Stkcd', 'Typrep']
    table_names = ['FI_T{}'.format(i) for i in range(1, 12)]

    for tbname in table_names:
        raw = read_gta(tbname)
        candidates = [col for col in raw.columns if col not in id_columns]

        # Only tables carrying a 'Typrep' column can be restricted to
        # consolidated statements.
        consolidated = 'Typrep' in raw.columns

        for varname in _filter_indicators(candidates):
            # NOTE(review): parse_financial_report is called with its default
            # freq before quaterly2monthly -- confirm this is intended.
            parsed = parse_financial_report(tbname,
                                            varname,
                                            consolidated=consolidated)
            monthly = quaterly2monthly(parsed)
            target = os.path.join(dirFI,
                                  '{}__{}.pkl'.format(tbname, varname))
            monthly.to_pickle(target)
            print(tbname, varname)
예제 #17
0
def get_listInfo():
    '''
    Build the per-stock listing information table from ``IPO_Cobasic`` and
    save it as ``'listInfo'``.

    The saved frame is indexed by ``sid`` (str) and holds the columns
    listDate, not_financial, not_cross, is_sh and is_sz. Stock codes that
    occur more than once and rows with any missing value are dropped.
    '''
    info = read_gta('IPO_Cobasic', encoding='gbk')
    info = info.set_index('Stkcd')
    info.index.name = 'sid'
    info.index = info.index.astype(str)
    info.columns.name = 'type'

    # TODO: refer to page 12 of the momentum-factor report
    # (file 动量因子_164.pdf): "1 means excluding financial, insurance and
    # ST stocks".
    info['not_financial'] = info['Indcd'] != 1  # financial stocks
    # "cross" means stocks listed on multiple stock markets.
    info['not_cross'] = info['Crcd'].isnull()
    info['is_sh'] = info['Listexg'] == 1  # listed on shanghai
    info['is_sz'] = info['Listexg'] == 2  # listed on shenzhen

    # 'Listdt' denotes the listing date, 'Ipodt' denotes the IPO date.
    # There is some invalid data in column 'Listdt'; blank it out first.
    invalid_dates = ['0000-00-00', '2100-01-01']
    info['Listdt'] = info['Listdt'].replace(invalid_dates, np.nan)
    info['listDate'] = pd.to_datetime(info['Listdt'])

    kept = ['listDate', 'not_financial', 'not_cross', 'is_sh', 'is_sz']
    info = info[kept]
    # There are some duplicated items such as '600018'; drop every
    # occurrence of a duplicated code.
    info = info[~info.index.duplicated(keep=False)]
    info = info.dropna()
    save(info, 'listInfo')
예제 #18
0
def get_amihud_illiq():
    '''
    Build the Amihud illiquidity measures from daily trading data and write
    them to ``illiq.csv`` in ``DATA_PATH``.

    The daily table ``TRD_Dalyr`` is reduced to return and trading value per
    (t, sid), passed to ``groupby_rolling`` with ``_amihud`` for the horizons
    '1M', '3M', '6M' and '12M', and the natural log of every measure is
    appended with an ``ln_`` prefix on its ``type`` label.
    '''
    daily = read_gta('TRD_Dalyr')
    daily = daily[['Stkcd', 'Trddt', 'Dretwd', 'Dnvaltrd']]
    daily.columns = ['sid', 't', 'ret', 'volume']
    daily['t'] = freq_end(daily['t'], 'D')
    daily = daily.set_index(['t', 'sid'])
    if not daily.index.is_monotonic_increasing:
        # TODO: gta's data is not monotonic increasing; add these two lines
        # to the other scripts as well.
        daily = daily.sort_index(level='t')

    # Horizon label -> integer forwarded to groupby_rolling
    # (presumably a minimum number of daily observations -- TODO confirm).
    windows = OrderedDict({'1M': 15, '3M': 50, '6M': 100, '12M': 200})

    levels = groupby_rolling(daily, 'illiq', windows, _amihud)
    levels.index.names = ['type', 't']

    logs = np.log(levels).reset_index()
    logs['type'] = 'ln_' + logs['type'].astype(str)
    logs = logs.set_index(['type', 't'])

    illiq = pd.concat([levels, logs], axis=0)
    # TODO: use valid observation for the whole project as page 276
    target = os.path.join(DATA_PATH, 'illiq.csv')
    illiq.to_csv(target)
예제 #19
0
def get_liquidity_ps():
    '''
    Estimate each stock's exposure to a market-liquidity factor and write the
    result to ``liqBeta.csv`` in ``DATA_PATH``.

    Steps:
        1. Read the aggregate series ``AggPS_os`` from ``Liq_PSM_M`` and
           build its lag and first differences.
        2. For every month, regress ``delta_rm`` on ``delta_rm_ahead`` and
           ``rm_ahead`` over the trailing window and keep one residual as the
           liquidity factor ``lm``.
        3. For every stock and month, regress excess returns on rp, smb, hml
           and lm over the trailing window and keep the ``lm`` coefficient.

    The CSV holds one row per month and one column per ``sid``.
    '''
    df = read_gta('Liq_PSM_M')
    # MarketType == 21: composite A-shares and ChiNext.
    # Weighted by float market capitalisation, but on page 310 Bali uses
    # total market capitalisation.
    condition1 = (df['MarketType'] == 21)
    # Per the original note this deletes the ST stocks -- TODO confirm the
    # coding of the 'ST' column.
    condition2 = (df['ST'] == 1)

    df = df[condition1 & condition2][['Trdmnt', 'AggPS_os']]
    df.columns = ['t', 'rm']
    df = df.set_index('t')

    df.index = freq_end(df.index, 'M')
    df = df.sort_index()
    df['rm_ahead'] = df['rm'].shift(1)
    df['delta_rm'] = df['rm'] - df['rm'].shift(1)
    df['delta_rm_ahead'] = df['rm_ahead'] - df['rm_ahead'].shift(1)

    # TODO: we don't know the length of window to regress. In this place, we
    # use the five years history.
    window = 60  # not exact 5 years

    def regr(sample):
        # Require more than 30 usable months for the regression.
        if sample.shape[0] > 30:
            fitted = sm.ols(formula='delta_rm ~ delta_rm_ahead + rm_ahead',
                            data=sample).fit()
            # Explicit positional access to the first residual; plain
            # ``resid[0]`` relies on the deprecated integer fallback of a
            # date-indexed Series.
            # NOTE(review): this keeps the FIRST residual of the window --
            # confirm that the last one is not what was intended.
            return fitted.resid.iloc[0]
        return np.nan  # np.NaN was removed in NumPy 2.0

    lm = pd.Series(
        [regr(df.loc[:month][-window:].dropna()) for month in df.index],
        index=df.index)
    lm.name = 'lm'

    # Excess return of every stock: return minus the risk-free rate.
    ret = read_df('stockRetM', freq='M')
    rf = read_df('rfM', freq='M')
    eret = ret.sub(rf['rf'], axis=0)
    eret = eret.stack()
    eret.index.names = ['t', 'sid']
    eret.name = 'eret'

    ff3 = read_df('ff3_gta', 'M')
    factors = pd.concat([ff3, lm], axis=1)

    comb = eret.to_frame().join(factors)

    def _for_one_month(sample):
        # Liquidity beta: coefficient on lm, given at least 30 observations.
        if sample.shape[0] >= 30:
            return sm.ols(formula='eret ~ rp + smb + hml + lm',
                          data=sample).fit().params['lm']
        return np.nan

    def _get_result(stock):
        # Rolling betas for one stock; stocks with 30 or fewer rows yield
        # None.
        thresh = 30  # 30 months
        if stock.shape[0] > thresh:
            sid = stock.index[0][1]
            stock = stock.reset_index(level='sid', drop=True)
            months = stock.index.tolist()[thresh:]
            values = [
                _for_one_month(stock.loc[:month][-window:].dropna())
                for month in months
            ]
            print(sid)
            return pd.Series(values, index=months)

    result = comb.groupby('sid').apply(_get_result)
    result.unstack('sid').to_csv(os.path.join(DATA_PATH, 'liqBeta.csv'))
예제 #20
0
def get_liquidity():
    '''
    Combine the monthly liquidity measures from several GTA tables into one
    frame and save it as ``'liquidity'``.

    Sources and output columns:
        Liq_Tover_M  -> turnover1, turnover2, turnover3, turnover4
        Liq_Amihud_M -> amihud
        Liq_PS_M     -> ps1, ps2

    The saved frame is indexed by (t, sid), all values are float and the
    columns name is ``type``.
    '''

    def _load(tbname, source_cols, new_names):
        # Read one liquidity table, keep normally traded rows and return its
        # measures as floats indexed by (t, sid).
        raw = read_gta(tbname, index_col=0)
        raw = raw[raw['Status'] == 'A']  # A = normal trading
        table = raw[['Stkcd', 'Trdmnt'] + source_cols].copy()
        table.columns = ['sid', 't'] + new_names
        table['t'] = freq_end(table['t'], 'M')
        table['sid'] = table['sid'].astype(str)
        return table.set_index(['t', 'sid']).astype(float)

    # Turnover rate (presumably Os = tradable shares, Tl = total shares,
    # Avg = daily average within the month -- TODO confirm against GTA docs).
    turnover = _load(
        'Liq_Tover_M',
        ['ToverOsM', 'ToverTlM', 'ToverOsMAvg', 'ToverTlMAvg'],
        ['turnover1', 'turnover2', 'turnover3', 'turnover4'])

    # Amihud illiquidity
    amihud = _load('Liq_Amihud_M', ['ILLIQ_M'], ['amihud'])

    # The Roll measures (table Liq_Roll_M: Roll_M, Roll_Impact_M) and the
    # Zeros measures (table Liq_Zeros_M: Zeros_M, Zeros_Impact_M) are left
    # out on purpose: they are not proper for portfolio analysis, since
    # there are a lot of zeros in the sample, which will cause errors in the
    # program.

    # Pastor Stambaugh
    pastor_stambaugh = _load('Liq_PS_M', ['PSos', 'PStl'], ['ps1', 'ps2'])

    # Combine them, dropping repeated (t, sid) keys so the frames align.
    x = pd.concat([
        part[~part.index.duplicated()]
        for part in [turnover, amihud, pastor_stambaugh]
    ],
                  axis=1)
    x.columns.name = 'type'

    save(x, 'liquidity')