def get_bm():
    '''
    This function can be bookmarked as a snippet of how to manipulate a date index in Pandas.

    A little different from the book: here be and me are per-share values, while the book
    uses data for all floating shares. However, this does not affect the bm ratio.
    :return:
    '''
    be = read_df('bps', 'M')
    be = be[be.index.month == 12]
    me = read_df('stockCloseY', 'M')
    be, me = get_inter_frame([be, me])
    bm = be / me
    bm[bm <= 0] = np.nan  # mask samples with non-positive bm
    bm = bm.shift(1, freq='6M')
    newIndex = pd.date_range(bm.index[0], bm.index[-1], freq='M')
    bm = bm.reindex(index=newIndex)
    bm = bm.fillna(method='ffill', limit=11)
    bm.to_csv(os.path.join(DATA_PATH, 'bm.csv'))
    logbm = np.log(bm)
    logbm.to_csv(os.path.join(DATA_PATH, 'logbm.csv'))
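# A minimal, self-contained sketch (not part of the pipeline; the helper name and toy data
# are illustrative only) of the date-index trick used in get_bm: annual book equity observed
# in December is shifted forward by 6 months, reindexed to a monthly grid, and forward-filled
# for up to 11 months so that each annual value is used for exactly one year.
def _demo_shift_and_ffill():
    toy = pd.DataFrame({'000001': [1.0, 1.2]},
                       index=pd.to_datetime(['2010-12-31', '2011-12-31']))
    shifted = toy.shift(1, freq='6M')  # the 2010-12 value now sits at 2011-06
    monthly = pd.date_range(shifted.index[0], shifted.index[-1], freq='M')
    return shifted.reindex(monthly).ffill(limit=11)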
def get_op():
    '''
    Calculate operating profitability (OP) as in FF5.

    Returns:
    '''
    # --------------operating profitability---------------
    tbname = 'FS_Comins'
    # var1='B001101000' # 营业收入 (operating revenue)
    # var2='B001201000' # 营业成本 (operating cost)
    # var3='B001209000' # 销售费用 (selling expense)
    # var4='B001210000' # 管理费用 (administrative expense)
    # var5='B001211000' # 财务费用 (financial expense)
    var = 'B001300000'  # 营业利润 (operating profit)
    # var7='Bbd1102203' # 利息支出 (interest expense)
    OP = parse_financial_report(tbname, var)

    # ----------------book value---------------
    tbname = 'FS_Combas'
    # var1 = 'A003000000' # 所有者权益合计 (total owners' equity)
    var = 'A003100000'  # 归属于母公司所有者权益合计 (equity attributable to parent company)
    BV = parse_financial_report(tbname, var)
    BV[BV <= 0] = np.nan  # Trick: delete samples with a non-positive denominator
    OP, BV = get_inter_frame([OP, BV])
    op = OP / BV
    op.index.name = 't'
    op.columns.name = 'sid'
    op = quaterly2monthly(op)
    save(op, 'op')
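# For reference, a hedged sketch of the fuller FF5 operating-profitability definition,
# (revenue - COGS - selling expense - administrative expense - interest expense) / book equity,
# built from the commented-out CSMAR fields above. This is an illustration only, assuming
# get_inter_frame aligns an arbitrary list of frames; get_op itself uses operating profit
# (B001300000) directly.
def _demo_op_ff5():
    revenue = parse_financial_report('FS_Comins', 'B001101000')   # 营业收入 (operating revenue)
    cogs = parse_financial_report('FS_Comins', 'B001201000')      # 营业成本 (operating cost)
    selling = parse_financial_report('FS_Comins', 'B001209000')   # 销售费用 (selling expense)
    admin = parse_financial_report('FS_Comins', 'B001210000')     # 管理费用 (administrative expense)
    interest = parse_financial_report('FS_Comins', 'Bbd1102203')  # 利息支出 (interest expense)
    be = parse_financial_report('FS_Combas', 'A003100000')        # 归属于母公司所有者权益合计
    be[be <= 0] = np.nan  # non-positive denominators are treated as missing
    revenue, cogs, selling, admin, interest, be = get_inter_frame(
        [revenue, cogs, selling, admin, interest, be])
    return (revenue - cogs - selling - admin - interest) / be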
def get_bm():
    '''
    This function can be bookmarked as a snippet of how to manipulate a date index in Pandas.

    A little different from the book: here be and me are per-share values, while the book
    uses data for all floating shares. However, this does not affect the bm ratio.
    :return:
    '''
    # be=load_data('bps')
    be = read_unfiltered('bps')
    be = be[be.index.month == 12]
    me = read_unfiltered('stockCloseY')
    # me=load_data('stockCloseY')
    be, me = get_inter_frame([be, me])
    # me[me<=0]=np.nan
    bm = be / me
    bm[bm <= 0] = np.nan  # mask samples with non-positive bm
    bm = quaterly2monthly(bm, shift='6M')
    logbm = np.log(bm)
    bm = bm.stack()
    logbm = logbm.stack()
    x = pd.concat([bm, logbm], axis=1, keys=['bm', 'logbm'])
    x.index.names = ['t', 'sid']
    x.columns.name = 'type'
    save(x, 'value')
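# A minimal sketch (toy data, illustrative helper name) of the reshaping at the end of
# get_bm: two wide frames are stacked into long Series and concatenated side by side,
# giving a ('t', 'sid') MultiIndex with one column per variable.
def _demo_wide_to_long():
    idx = pd.to_datetime(['2011-01-31', '2011-02-28'])
    bm = pd.DataFrame({'000001': [0.5, 0.6], '000002': [1.1, 1.2]}, index=idx)
    x = pd.concat([bm.stack(), np.log(bm).stack()], axis=1, keys=['bm', 'logbm'])
    x.index.names = ['t', 'sid']
    x.columns.name = 'type'
    return x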
def get_predicted(history):
    params = pd.read_csv(fn, index_col=0, parse_dates=True)
    params = params.rolling(window=history,
                            min_periods=int(history / 2)).mean()  # TODO: min_periods
    # Use the parameters estimated in month t-1 to predict returns in month t,
    # so shift the params forward by one step.
    params = params.shift(1)
    indicators = load_data('data')[l]
    indicators['Intercept'] = 1.0
    cols = params.columns
    indicators = indicators.reindex(columns=cols)
    # TODO: predict return rather than eret
    groups = list(indicators.groupby('sid'))
    ss = []
    names = []
    for name, g in groups:
        g = g.reset_index(level='sid', drop=True)
        p, g = get_inter_frame([params, g.dropna()])
        s = (p * g).sum(axis=1)
        ss.append(s)
        names.append(name)
        print(name)

    predicted = pd.concat(ss, axis=1, keys=names)
    predicted.to_pickle(
        os.path.join(directory, 'predicted_{}.pkl'.format(history)))
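# A minimal sketch (toy data, hypothetical column names) of the prediction step above:
# for each month the lagged cross-sectional coefficients are multiplied element-wise with
# the firm's characteristics and summed across columns to get the predicted return.
def _demo_predict_one_stock():
    idx = pd.to_datetime(['2011-01-31', '2011-02-28'])
    params = pd.DataFrame({'Intercept': [0.01, 0.02], 'size': [-0.002, -0.001]}, index=idx)
    g = pd.DataFrame({'Intercept': [1.0, 1.0], 'size': [10.5, 10.6]}, index=idx)
    return (params * g).sum(axis=1)  # one predicted return per month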
def compare_wind_gta_bps():
    '''
    The results differ a lot!!!
    :return:
    '''
    bps_wind = read_df('bps_wind', 'M')
    bps_gta = read_df('bps', 'M')
    bps_wind.columns = [str(int(col[:-3])) for col in bps_wind.columns]
    bps_wind = bps_wind.sort_index(axis=1)
    bps_gta = bps_gta.sort_index(axis=1)
    bps_wind, bps_gta = get_inter_frame([bps_wind, bps_gta])
def combine_condition(freq):
    '''
    :param freq:
    :return: DataFrame filled with True or False
    '''
    sids = control_sid(['not_financial'])
    t = control_t(start='1997-01-01', freq=freq)
    cross1 = cross_closePrice_floor(freq=freq)
    cross2 = cross_year_after_list(freq=freq)
    cross3 = cross_is_normal(freq=freq)
    cross1, cross2, cross3 = get_inter_frame([cross1, cross2, cross3])
    comb = cross1 & cross2 & cross3
    comb = comb.reindex(index=pd.Index(t, name='t'),
                        columns=pd.Index(sids, name='sid'))
    comb = comb.dropna(axis=0, how='all')
    comb = comb.dropna(axis=1, how='all')
    return comb
def apply_condition(x):
    '''
    Combine all types of sample-controlling methods.
    :param x:
    :return:
    '''
    freq = detect_freq(x.index)
    condition = combine_condition(freq)
    if isinstance(x.index, pd.MultiIndex):
        stk = condition.stack()
        interIndex = x.index.intersection(stk.index)
        x = x.reindex(index=interIndex)
        stk = stk.reindex(index=interIndex)
        return x[stk]
    else:
        x, condition = get_inter_frame([x, condition])
        return x[condition]
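# A minimal sketch (toy data) of the two masking branches above: a wide DataFrame is masked
# with a same-shaped boolean frame via x[condition], while a long (MultiIndex) Series is
# masked with the stacked boolean condition.
def _demo_apply_condition():
    idx = pd.to_datetime(['2011-01-31', '2011-02-28'])
    x = pd.DataFrame({'000001': [0.1, 0.2], '000002': [0.3, 0.4]}, index=idx)
    cond = pd.DataFrame({'000001': [True, False], '000002': [False, True]}, index=idx)
    wide = x[cond]                   # entries where cond is False become NaN
    long = x.stack()[cond.stack()]   # keep only the rows where cond is True
    return wide, long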
def regress_predicted_on_realized():
    predicted = pd.read_pickle(
        os.path.join(directory, 'predicted_{}.pkl'.format(history)))
    stockEret = load_data('stockEretM')
    predicted, stockEret = get_inter_frame([predicted, stockEret])

    months = []
    models = []
    count = []
    for month, p in predicted.iterrows():
        # p denotes the predicted return
        # r denotes the realized return
        r = stockEret.loc[month]
        df = pd.concat([p, r], axis=1, keys=['predicted', 'realized'])
        df = df.dropna()
        model = sm.ols(formula='realized ~ predicted', data=df).fit(use_t=True)
        months.append(month)
        models.append(model)
        count.append(df.shape[0])
        print(month)

    slope = pd.Series([m.params['predicted'] for m in models], index=months)
    r2 = pd.Series([m.rsquared for m in models], index=months)
    n = pd.Series(count, index=months)

    plt.plot(slope.index, slope.values, 'o')
    plt.show()
    plt.plot(r2.index, r2.values, 'o')
    plt.show()
    plt.plot(n.index, n.values)
    plt.show()

    # exploratory checks
    slope.max().max()
    slope.min().min()
    slope.describe()
    r2.describe()

    nw = newey_west(formula='predicted ~ 1',
                    df=pd.DataFrame(slope, columns=['predicted']),
                    lags=5)
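# For reference, a hedged sketch of how the Newey-West adjusted mean test on the monthly
# slopes could be done with statsmodels directly (the function above uses the project's
# own newey_west helper); `slope` is the Series of monthly slope coefficients.
def _demo_newey_west(slope):
    import statsmodels.formula.api as smf
    df = pd.DataFrame({'predicted': slope.values})
    model = smf.ols('predicted ~ 1', data=df).fit(
        cov_type='HAC', cov_kwds={'maxlags': 5}, use_t=True)
    return model.params['Intercept'], model.tvalues['Intercept']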
def compare_wind_gta_bps():
    '''
    The results differ a lot!!!
    :return:
    '''
    bps_wind = read_unfiltered('bps_wind')
    # bps_wind=load_data('bps_wind')
    # bps=load_data('bps')
    bps = read_unfiltered('bps')
    # bps_wind.columns=[str(int(col[:-3])) for col in bps_wind.columns]
    # The line above would drop columns.name, so rebuild the Index explicitly.
    bps_wind.columns = pd.Index(
        [str(int(col[:-3])) for col in bps_wind.columns],
        name=bps_wind.columns.name)
    bps_wind = bps_wind.sort_index(axis=1)
    bps = bps.sort_index(axis=1)
    bps_wind, bps = get_inter_frame([bps_wind, bps])
    detect_outliers(bps_wind, 'a1')
    detect_outliers(bps, 'a2')