Пример #1
0
def get_bm():
    '''
    Build a monthly book-to-market (bm) panel and its natural log, and write
    both to CSV files under DATA_PATH ('bm.csv' and 'logbm.csv').

    This function can be bookmarked as a snippet of how to manipulate a date
    index in pandas.

    Slightly different from the book: here be and me are per-share values,
    whereas the book uses values for all floating shares. That does not
    affect the bm ratio.

    Steps:
        1. Keep only the December rows of 'bps'.
        2. Align them with 'stockCloseY' and divide (be / me).
        3. Replace non-positive ratios with NaN.
        4. Move the index forward by six month-ends.
        5. Expand to a monthly index and carry each value forward for at
           most 11 months.

    :return: None, the results are written to disk.
    '''
    be = read_df('bps', 'M')
    be = be[be.index.month == 12]  # December observations only
    me = read_df('stockCloseY', 'M')
    be, me = get_inter_frame([be, me])
    bm = be / me
    bm[bm <= 0] = np.nan  # drop samples with non-positive bm
    # Shift the index (not the values) forward by six month-ends. An offset
    # object is used instead of the '6M' string alias, which newer pandas
    # versions warn about; the resulting dates are the same.
    bm = bm.shift(1, freq=pd.offsets.MonthEnd(6))

    # Expand to a gap-free month-end index spanning the shifted range.
    newIndex = pd.date_range(bm.index[0], bm.index[-1],
                             freq=pd.offsets.MonthEnd())
    bm = bm.reindex(index=newIndex)
    # Carry each observation forward for up to 11 following months.
    # DataFrame.ffill replaces the deprecated fillna(method='ffill').
    bm = bm.ffill(limit=11)
    bm.to_csv(os.path.join(DATA_PATH, 'bm.csv'))

    logbm = np.log(bm)
    logbm.to_csv(os.path.join(DATA_PATH, 'logbm.csv'))
Пример #2
0
def get_op():
    '''
    Calculate operating profitability as in FF5 and store it under 'op'.

    Operating profit is divided by the equity attributable to owners of the
    parent company; non-positive equity values are blanked out first. The
    ratio is then passed through quaterly2monthly() and handed to save().

    Returns:
        None.
    '''
    # Numerator: operating profit (income statement table).
    # Other fields of that table, kept for reference:
    #   B001101000  operating revenue
    #   B001201000  operating cost
    #   B001209000  selling expenses
    #   B001210000  administrative expenses
    #   B001211000  financial expenses
    #   Bbd1102203  interest expense
    profit = parse_financial_report('FS_Comins', 'B001300000')

    # Denominator: total equity attributable to owners of the parent
    # (balance sheet table). A003000000 would be total owners' equity.
    equity = parse_financial_report('FS_Combas', 'A003100000')
    # Trick: delete those samples with a non-positive denominator.
    equity[equity <= 0] = np.nan

    profit, equity = get_inter_frame([profit, equity])
    ratio = profit / equity
    ratio.index.name = 't'
    ratio.columns.name = 'sid'

    save(quaterly2monthly(ratio), 'op')
Пример #3
0
def get_bm():
    '''
    Compute book-to-market (bm) and log(bm) in long format and save them
    together under the name 'value'.

    Handy as a reference snippet for date-index manipulation in pandas.

    Unlike the book, be and me here are per-share figures rather than
    figures for all floating shares; the resulting ratio is unaffected.

    The saved frame is indexed by ('t', 'sid') and has two columns, 'bm'
    and 'logbm', with the column axis named 'type'.

    :return: None
    '''
    book = read_unfiltered('bps')
    december_book = book[book.index.month == 12]  # December rows only
    price = read_unfiltered('stockCloseY')
    december_book, price = get_inter_frame([december_book, price])

    ratio = december_book / price
    ratio[ratio <= 0] = np.nan  # delete those samples with bm<=0
    ratio = quaterly2monthly(ratio, shift='6M')
    log_ratio = np.log(ratio)

    # Wide (date x stock) -> long, then put the two measures side by side.
    value = pd.concat(
        [ratio.stack(), log_ratio.stack()],
        axis=1,
        keys=['bm', 'logbm'],
    )
    value.index.names = ['t', 'sid']
    value.columns.name = 'type'

    save(value, 'value')
Пример #4
0
def get_predicted(history):
    '''
    Build predicted values per stock from smoothed parameters and pickle them.

    The parameter table read from `fn` is averaged over a rolling window of
    `history` rows (at least int(history / 2) observations required), then
    shifted down by one row. For every 'sid' group of the indicator data,
    the aligned parameters and indicators are multiplied element-wise and
    summed across columns. The result has one column per sid and is written
    to 'predicted_<history>.pkl' inside `directory`.

    :param history: rolling window length, in rows of the parameter table.
    :return: None
    '''
    raw_params = pd.read_csv(fn, index_col=0, parse_dates=True)
    #TODO:min_periods
    smoothed = raw_params.rolling(
        window=history, min_periods=int(history / 2)).mean()
    # we will use the parameters of time t to predict the
    # returns in time t,so shift forward params for 1 step.
    smoothed = smoothed.shift(1)

    indicators = load_data('data')[l]
    indicators['Intercept'] = 1.0
    # Same columns, in the same order, as the parameter table.
    indicators = indicators.reindex(columns=smoothed.columns)

    #TODO: predict return rather than eret,
    sids = []
    fitted = []
    for sid, chunk in indicators.groupby('sid'):
        chunk = chunk.reset_index(level='sid', drop=True)
        coef, obs = get_inter_frame([smoothed, chunk.dropna()])
        fitted.append((coef * obs).sum(axis=1))
        sids.append(sid)
        print(sid)

    predicted = pd.concat(fitted, axis=1, keys=sids)
    out_path = os.path.join(directory, 'predicted_{}.pkl'.format(history))
    predicted.to_pickle(out_path)
Пример #5
0
def compare_wind_gta_bps():
    '''
    Load the 'bps_wind' and 'bps' frames, normalise the column labels of the
    former, sort both by column and align them with get_inter_frame().

    Note from the original author: the two results differ a lot!

    The aligned frames are not returned or saved; the function is meant for
    interactive inspection.

    :return: None
    '''
    bps_wind = read_df('bps_wind', 'M')
    bps_gta = read_df('bps', 'M')
    # Drop the last three characters of each label and strip leading zeros
    # via int(). Wrapping the result in pd.Index keeps columns.name, which
    # assigning a plain list would silently reset to None.
    bps_wind.columns = pd.Index(
        [str(int(col[:-3])) for col in bps_wind.columns],
        name=bps_wind.columns.name)
    bps_wind = bps_wind.sort_index(axis=1)
    bps_gta = bps_gta.sort_index(axis=1)
    bps_wind, bps_gta = get_inter_frame([bps_wind, bps_gta])
Пример #6
0
def combine_condition(freq):
    '''
    Combine the individual sample filters into one boolean mask.

    Three cross-sectional filters are aligned and AND-ed together, then the
    mask is restricted to the dates from control_t() and the stock ids from
    control_sid(). Rows and columns left entirely empty are removed.

    :param freq: frequency passed through to the filter helpers.
    :return: DataFrame filled with True or False
    '''
    sids = control_sid(['not_financial'])
    dates = control_t(start='1997-01-01', freq=freq)

    price_ok, listed_ok, normal_ok = get_inter_frame([
        cross_closePrice_floor(freq=freq),
        cross_year_after_list(freq=freq),
        cross_is_normal(freq=freq),
    ])
    mask = price_ok & listed_ok & normal_ok

    mask = mask.reindex(
        index=pd.Index(dates, name='t'),
        columns=pd.Index(sids, name='sid'),
    )
    # Drop all-NaN rows first, then all-NaN columns.
    return mask.dropna(axis=0, how='all').dropna(axis=1, how='all')
Пример #7
0
def apply_condition(x):
    '''
    Apply the combined sample-control mask to x.

    The frequency is detected from x's index and used to build the mask
    with combine_condition(). An x with a plain index is aligned with the
    mask and filtered directly; an x with a MultiIndex is filtered with the
    stacked mask on the index entries both have in common.

    :param x: pandas object to filter.
    :return: x restricted by the combined condition.
    '''
    condition = combine_condition(detect_freq(x.index))

    if not isinstance(x.index, pd.MultiIndex):
        x, condition = get_inter_frame([x, condition])
        return x[condition]

    # Long format: compare against the stacked mask on the shared index.
    stacked = condition.stack()
    common = x.index.intersection(stacked.index)
    aligned_x = x.reindex(index=common)
    aligned_mask = stacked.reindex(index=common)
    return aligned_x[aligned_mask]
Пример #8
0
def regress_predicted_on_realized():
    '''
    Regress realized returns on predicted returns, month by month.

    The predictions pickled as 'predicted_<history>.pkl' in `directory` are
    aligned with 'stockEretM'. For every row (month) an OLS of
    'realized ~ predicted' is fitted across stocks, and the slope, R-squared
    and number of observations are collected. Three plots are shown (slope,
    R-squared, observation count), and finally the slope series is passed to
    newey_west() with a constant-only formula and 5 lags.

    NOTE(review): `history` and `directory` are not parameters; they are
    expected to exist at module level - confirm before reuse.

    :return: the object returned by newey_west() for the slope series.
    '''
    predicted = pd.read_pickle(
        os.path.join(directory, 'predicted_{}.pkl'.format(history)))
    stockEret = load_data('stockEretM')
    predicted, stockEret = get_inter_frame([predicted, stockEret])

    months = []
    slopes = []
    rsquareds = []
    counts = []
    for month, p in predicted.iterrows():
        # p denotes predicted return
        # r denotes realized return
        r = stockEret.loc[month]
        df = pd.concat([p, r], axis=1, keys=['predicted', 'realized'])
        df = df.dropna()
        model = sm.ols(formula='realized ~ predicted', data=df).fit(use_t=True)
        # Keep only the numbers needed later rather than every fitted model.
        months.append(month)
        slopes.append(model.params['predicted'])
        rsquareds.append(model.rsquared)
        counts.append(df.shape[0])
        print(month)

    slope = pd.Series(slopes, index=months)
    r2 = pd.Series(rsquareds, index=months)
    n = pd.Series(counts, index=months)

    plt.plot(slope.index, slope.values, 'o')
    plt.show()

    plt.plot(r2.index, r2.values, 'o')
    plt.show()

    plt.plot(n.index, n.values)
    plt.show()

    # Constant-only regression on the slope series; the result is returned
    # so that it is not computed and then lost.
    nw = newey_west(formula='predicted ~ 1',
                    df=pd.DataFrame(slope, columns=['predicted']),
                    lags=5)
    return nw
Пример #9
0
def compare_wind_gta_bps():
    '''
    Compare the unfiltered 'bps_wind' and 'bps' frames.

    The column labels of 'bps_wind' are normalised, both frames are sorted
    by column and aligned, and each one is passed to detect_outliers().

    Note from the original author: the two results differ a lot!

    :return: None
    '''
    bps_wind = read_unfiltered('bps_wind')
    bps = read_unfiltered('bps')

    # Drop the last three characters of every label and strip leading zeros
    # through int(). Assigning a plain list here would lose columns.name,
    # so the labels are wrapped in an Index carrying the existing name.
    axis_name = bps_wind.columns.name
    short_labels = [str(int(label[:-3])) for label in bps_wind.columns]
    bps_wind.columns = pd.Index(short_labels, name=axis_name)

    bps_wind, bps = get_inter_frame([
        bps_wind.sort_index(axis=1),
        bps.sort_index(axis=1),
    ])

    detect_outliers(bps_wind, 'a1')
    detect_outliers(bps, 'a2')