def load_train_test(self, indexed_df, enable_load_model, rebuild_model=False, enable_train_if_load_not_suss=True,
                    enable_train_even_load_succ=False, enable_test=False):
    """
    Load a saved model and/or (re)train it, then record its accuracy.

    :param indexed_df: market data indexed by trade date; the last index
        value is used as the current trade date
    :param enable_load_model: try ``self.load_model_if_exist`` for the
        current trade date
    :param rebuild_model: rebuild the model via ``self.get_model`` before
        anything else
    :param enable_train_if_load_not_suss: train when no model was loaded
    :param enable_train_even_load_succ: train even if a model was loaded
    :param enable_test: additionally run ``self.predict_test`` (internal
        testing only; ``self.valid_model_acc`` has superseded it)
    :return: the factor data that was used (element ``1`` of the
        ``get_factor`` result when training, otherwise the ``get_factor``
        result itself)
    """
    if rebuild_model:
        self.get_model(rebuild_model=True)

    trade_date = str_2_date(indexed_df.index[-1])

    # Try to load an existing model only when asked to.
    is_load = self.load_model_if_exist(trade_date) if enable_load_model else False

    need_training = enable_train_even_load_succ or (enable_train_if_load_not_suss and not is_load)
    if need_training:
        factor_df_dic = get_factor(
            indexed_df,
            ohlcav_col_name_list=self.ohlcav_col_name_list,
            trade_date_series=self.trade_date_series,
            delivery_date_series=self.delivery_date_series,
            do_multiple_factors=True,
        )
        factor_df = factor_df_dic[1]

        attempt = 0
        while True:
            attempt += 1
            if attempt > 1:
                # Every retry starts from a freshly built model.
                self.get_model(rebuild_model=True)

            # The attempt number doubles as the sampling random state.
            train_acc, val_acc = self.train(factor_df_dic, predict_test_random_state=attempt)

            overfit_limit = self.over_fitting_train_acc
            if overfit_limit is not None and train_acc > overfit_limit:
                self.logger.warning('第 %d 次训练,训练集精度 train_acc=%.2f%% 过高,可能存在过拟合,重新采样训练',
                                    attempt, train_acc * 100)
                continue

            # Accept the model unless validation accuracy is below the base
            # line.  (An extra rule - retrain when train/val accuracy differ
            # by more than 15% while val accuracy is under 75% - is disabled.)
            base_line = self.validation_accuracy_base_line
            if base_line is None or not (val_acc < base_line):
                break
            self.logger.warning('第 %d 次训练,训练结果不及预期,重新采样训练', attempt)

        self.save_model(trade_date)
        self.trade_date_last_train = trade_date
    else:
        factor_df = get_factor(
            indexed_df,
            ohlcav_col_name_list=self.ohlcav_col_name_list,
            trade_date_series=self.trade_date_series,
            delivery_date_series=self.delivery_date_series,
        )
        train_acc, val_acc = self.valid_model_acc(factor_df)

    self.trade_date_acc_list[trade_date] = [train_acc, val_acc]

    if enable_test:
        self.predict_test(factor_df)

    return factor_df
def _test_account():
    """
    Smoke-test ``Account``: build factor batches from ``RB.csv``, reset the
    environment, take one step with action ``1`` and check the shape of the
    observations returned by both ``reset`` and ``step``.
    """
    from ibats_common.example.data import load_data
    from ibats_common.backend.factor import get_factor, transfer_2_batch

    # Build the market data and the batched factor tensor.
    n_step = 60
    ohlcav_col_name_list = ["open", "high", "low", "close", "amount", "volume"]
    md_df = load_data('RB.csv').set_index('trade_date')[ohlcav_col_name_list]
    md_df.index = pd.DatetimeIndex(md_df.index)
    factors_df = get_factor(md_df, dropna=True)
    df_index, df_columns, data_arr_batch = transfer_2_batch(factors_df, n_step=n_step)
    md_df = md_df.loc[df_index, :]

    # Split the n_step axis into 5 groups of n_step // 5 and move the group
    # axis to the end.
    shape = [data_arr_batch.shape[0], 5, n_step // 5, data_arr_batch.shape[2]]
    data_factors = np.transpose(data_arr_batch.reshape(shape), [0, 2, 3, 1])
    print(data_arr_batch.shape, '->', shape, '->', data_factors.shape)

    # Build the Account environment.
    env = Account(md_df, data_factors)
    expected_shape = (1, 12, 78, 5)
    next_observation = env.reset()
    print('next_observation.shape:', next_observation.shape)
    assert next_observation.shape == expected_shape

    next_state, reward, done = env.step(1)
    # Check the state returned by step(); the original re-checked the stale
    # observation from reset(), which made this assertion vacuous.
    assert next_state.shape == expected_shape
    assert not done
def on_min1(self, md_df, context):
    """
    Handle a new bar: retrain the model if it is stale, predict the latest
    signal and adjust the position of the first instrument in ``context``.

    The prediction mark is compared against 0 (hold), 1 (buy) and 2 (sell).

    :param md_df: market data with ``trade_date``, ``instrument_type`` and
        ``close`` columns
    :param context: mapping that holds the instrument id list under
        ``ContextKey.instrument_id_list``
    :return: None
    """
    if self.do_nothing_on_min_bar:
        # Debugging aid only.
        return

    # Prepare the data: index by trade date, drop the instrument type.
    indexed_df = md_df.set_index('trade_date').drop('instrument_type', axis=1)
    indexed_df.index = pd.DatetimeIndex(indexed_df.index)

    # Latest trade date and the number of days since the last training.
    trade_date = str_2_date(indexed_df.index[-1])
    days_after_last_train = (trade_date - self.trade_date_last_train).days

    retrain_due = self.retrain_period is not None and 0 < self.retrain_period < days_after_last_train
    if retrain_due:
        self.logger.info('当前日期 %s 距离上一次训练 %s 已经过去 %d 天,重新训练',
                         trade_date, self.trade_date_last_train, days_after_last_train)
        factor_df = self.load_train_test(
            indexed_df, rebuild_model=True, enable_load_model=self.enable_load_model_if_exist)
    else:
        factor_df = get_factor(
            indexed_df,
            ohlcav_col_name_list=self.ohlcav_col_name_list,
            trade_date_series=self.trade_date_series,
            delivery_date_series=self.delivery_date_series,
        )

    # Predict the signal for the latest bar.
    pred_mark = self.predict_latest(factor_df)
    is_holding = pred_mark == 0
    is_buy = pred_mark == 1
    is_sell = pred_mark == 2

    close = md_df['close'].iloc[-1]
    instrument_id = context[ContextKey.instrument_id_list][0]

    if is_buy:
        # Close every short position; open a long one unless already long.
        positions = self.get_position(instrument_id)
        already_long = False
        if positions is not None:
            for pos_info in positions.values():
                side = pos_info.direction
                if side == Direction.Short:
                    self.close_short(instrument_id, close, pos_info.position)
                elif side == Direction.Long:
                    already_long = True
        if already_long:
            self.logger.debug("%s %s %.2f holding", self.trade_agent.curr_timestamp, instrument_id, close)
        else:
            self.open_long(instrument_id, close, self.unit)

    if is_sell:
        # Close every long position; open a short one unless already short.
        positions = self.get_position(instrument_id)
        already_short = False
        if positions is not None:
            for pos_info in positions.values():
                side = pos_info.direction
                if side == Direction.Long:
                    self.close_long(instrument_id, close, pos_info.position)
                elif side == Direction.Short:
                    already_short = True
        if already_short:
            self.logger.debug("%s %s %.2f holding", self.trade_agent.curr_timestamp, instrument_id, close)
        else:
            self.open_short(instrument_id, close, self.unit)

    if is_holding:
        self.logger.debug("%s %s * * %.2f holding", self.trade_agent.curr_timestamp, instrument_id, close)
def _test_factor_analysis():
    """Run ``factor_analysis`` on factors generated from the ``RB.csv`` sample data."""
    from ibats_common.example.data import load_data
    from ibats_common.backend.factor import get_factor

    raw_df = load_data("RB.csv", index_col='trade_date', parse_index_to_datetime=True)
    price_df = raw_df.drop(['instrument_type'], axis=1)
    # Generate factors with 'with_diff_n' switched off, then discard
    # incomplete rows before the analysis.
    factor_df = get_factor(price_df, price_factor_kwargs={'with_diff_n': False})
    factor_analysis(factor_df.dropna())
def _test_summary_md_2_docx(auto_open_file=True):
    """
    Generate the market-data summary docx for one instrument type.

    :param auto_open_file: open the generated file with the system's
        default application when True
    :return: None
    """
    from ibats_common.example.data import load_data
    from ibats_common.backend.factor import get_factor
    from ibats_common.example.data import get_trade_date_series
    from ibats_common.example.data import get_delivery_date_series

    instrument_type = 'RU'  # 'RB' 'RU'
    md_df = load_data(f"{instrument_type}.csv").set_index('trade_date').drop('instrument_type', axis=1)
    md_df.index = pd.DatetimeIndex(md_df.index)
    # Remember the raw columns before factor columns are added.
    original_columns = list(md_df.columns)

    trade_date_series = get_trade_date_series()
    delivery_date_series = get_delivery_date_series(instrument_type)
    factor_df = get_factor(
        md_df,
        ohlcav_col_name_list=["open", "high", "low", "close", "amount", "volume"],
        trade_date_series=trade_date_series,
        delivery_date_series=delivery_date_series,
    )

    # Per-section keyword arguments forwarded to summary_md_2_docx.
    func_kwargs_dic = {
        "hist": {
            "figure_4_each_col": False,
            "columns": original_columns,
            "col_transfer_dic": {'return': ['open', 'high', 'low', 'close', 'volume']},
        },
        "drawdown": {
            "col_name_list": ['close'],
        },
        "rr": {
            "col_name_list": ['close'],
        },
        "hist_future_n_rr": {
            'n_days': [3, 5],
            "columns": ['close'],
        },
        "rr_quantile": {
            'columns': ['close'],
        },
        "validation": {
            'trade_date_max_gap': 10,
        },
    }
    file_path = summary_md_2_docx(
        factor_df,
        enable_show_plot=False,
        enable_save_plot=True,
        close_key='close',
        name=instrument_type,
        func_kwargs_dic=func_kwargs_dic,
    )
    if auto_open_file:
        open_file_with_system_app(file_path)
def _test_quote_market():
    """
    Walk ``QuotesMarket`` through a fixed action sequence and check the
    position flag and reward after each step.

    With ``state_with_flag=True`` an observation has two elements; element 1
    is the position flag asserted below (0, 1 or -1).
    """
    from ibats_common.example.data import load_data
    from ibats_common.backend.factor import get_factor, transfer_2_batch

    n_step = 60
    columns = ["open", "high", "low", "close", "amount", "volume"]
    md_df = load_data('RB.csv').set_index('trade_date')[columns]
    md_df.index = pd.DatetimeIndex(md_df.index)
    factors_df = get_factor(md_df, dropna=True)
    df_index, df_columns, data_arr_batch = transfer_2_batch(factors_df, n_step=n_step)
    md_df = md_df.loc[df_index, :]

    # Build the QuotesMarket
    qm = QuotesMarket(md_df=md_df[['close', 'open']], data_factors=data_arr_batch, state_with_flag=True)

    obs = qm.reset()
    assert len(obs) == 2
    assert obs[0].shape[0] == n_step
    assert obs[1] == 0

    # action 1 -> flag becomes 1
    obs, reward, done = qm.step(1)
    assert len(obs) == 2
    assert obs[1] == 1
    assert not done

    # action 0 -> flag back to 0, with a non-zero reward
    obs, reward, done = qm.step(0)
    assert obs[1] == 0
    assert reward != 0

    # action 0 again while the flag is 0 -> zero reward
    obs, reward, done = qm.step(0)
    assert obs[1] == 0
    assert reward == 0

    # action 3 while the flag is 0 -> zero reward
    obs, reward, done = qm.step(3)
    assert obs[1] == 0
    assert reward == 0

    # action 2 -> flag becomes -1
    obs, reward, done = qm.step(2)
    assert obs[1] == -1
    assert not done

    # action 3 while the flag is -1 -> non-zero reward
    obs, reward, done = qm.step(3)
    assert obs[1] == -1
    assert reward != 0

    # Actions above 3 are expected to be rejected with ValueError.
    try:
        qm.step(4)
    except ValueError:
        print('is ok for not supporting action>3')
def _test_quote_market():
    """
    Walk ``QuotesMarket`` through a fixed action sequence using the
    ``ACTION_*`` / ``FLAG_*`` constants and check the position flag and
    reward after each step.

    With ``state_with_flag=True`` an observation has three elements here;
    element 1 is the position flag.
    """
    import os
    from ibats_common.example.data import load_data
    from ibats_common.backend.factor import get_factor, transfer_2_batch

    n_step = 60
    # Equivalent to r'..\..\..\example\data' on Windows.
    data_folder = os.path.join(os.pardir, os.pardir, os.pardir, 'example', 'data')
    md_df = load_data('RB.csv', folder_path=data_folder).set_index('trade_date')[DEFAULT_MD_OHLCVA_LABELS]
    md_df.index = pd.DatetimeIndex(md_df.index)
    factors_df = get_factor(md_df, dropna=True)
    df_index, df_columns, data_arr_batch = transfer_2_batch(factors_df, n_step=n_step)
    md_df = md_df.loc[df_index, :]

    # Build the QuotesMarket
    qm = QuotesMarket(md_df=md_df[['close', 'open']], data_factors=data_arr_batch, state_with_flag=True)

    obs = qm.reset()
    assert len(obs) == 3
    assert obs[0].shape[0] == n_step
    assert obs[1] == FLAG_EMPTY

    # Open a long position.
    obs, reward, done = qm.step(ACTION_LONG)
    assert len(obs) == 3
    assert obs[1] == FLAG_LONG
    assert not done

    # Closing it yields a non-zero reward.
    obs, reward, done = qm.step(ACTION_CLOSE)
    assert obs[1] == FLAG_EMPTY
    assert reward != 0

    # Closing with nothing held yields zero reward.
    obs, reward, done = qm.step(ACTION_CLOSE)
    assert obs[1] == FLAG_EMPTY
    assert reward == 0

    # Keeping an empty position yields zero reward.
    obs, reward, done = qm.step(ACTION_KEEP)
    assert obs[1] == FLAG_EMPTY
    assert reward == 0

    # Open a short position.
    obs, reward, done = qm.step(ACTION_SHORT)
    assert obs[1] == FLAG_SHORT
    assert not done

    # Keeping the short position yields a non-zero reward.
    obs, reward, done = qm.step(ACTION_KEEP)
    assert obs[1] == FLAG_SHORT
    assert reward != 0

    # Actions above 3 are expected to be rejected with ValueError.
    try:
        qm.step(4)
    except ValueError:
        print('is ok for not supporting action>3')
def get_xy(df: pd.DataFrame, target_n_bars=5, get_factor_kwargs=None, get_y_kwargs=None):
    """
    Build the X (factor) and Y (target) data, then filter and align them.

    :param df: historical bar data
    :param target_n_bars: number of trailing bars dropped from ``df`` and
        from the factor data; also passed to ``get_y`` as ``target_n_bars``
        (it overrides any value of that key in ``get_y_kwargs``)
    :param get_factor_kwargs: optional keyword arguments for ``get_factor``
    :param get_y_kwargs: optional keyword arguments for ``get_y``; the
        mapping is copied and never modified
    :return: ``(available_df, available_factor_df, x_arr, y_arr)`` - the
        bar rows, factor rows, factor values as a numpy array, and the
        matching entries of the ``get_y`` result, restricted to rows where
        both the factors and the target are finite
    :raises AssertionError: if the factor and target lengths differ
    """
    factor_kwargs = {} if get_factor_kwargs is None else get_factor_kwargs
    factor_df = get_factor(df, **factor_kwargs)

    # Copy before injecting target_n_bars so the caller's dict is not mutated.
    y_kwargs = {} if get_y_kwargs is None else dict(get_y_kwargs)
    y_kwargs["target_n_bars"] = target_n_bars
    # Fix: the original forwarded get_factor_kwargs here, so target_n_bars
    # and every other y option never reached get_y.
    y_s = get_y(df, **y_kwargs)

    # Drop the trailing bars on the X side.
    hist_bar_df, factor_df = df.iloc[: -target_n_bars], factor_df.iloc[: -target_n_bars]
    logger.info("hist_bar_df.shape=%s, factor_df.shape=%s", hist_bar_df.shape, factor_df.shape)
    logger.info("y_s.shape=%s", y_s.shape)
    assert factor_df.shape[0] == y_s.shape[0], \
        f"因子数据 x 长度 {factor_df.shape[0]} 要与训练目标 y 数据长度 {y_s.shape[0]} 一致"

    # Keep only rows whose target and every factor value are finite
    # (neither NaN nor +/-inf).
    is_available = np.isfinite(y_s.to_numpy()) & np.all(np.isfinite(factor_df.to_numpy()), axis=1)
    available_df = hist_bar_df[is_available]
    available_factor_df = factor_df[is_available]
    x_arr = available_factor_df.to_numpy()
    y_arr = y_s[is_available]
    assert x_arr.shape[0] == y_arr.shape[0], \
        f"因子数据 x 长度 {x_arr.shape[0]} 要与训练目标 y 数据长度 {y_arr.shape[0]} 一致"
    logger.info("x_arr.shape=%s, y_arr.shape=%s", x_arr.shape, y_arr.shape)
    return available_df, available_factor_df, x_arr, y_arr
def generate_factors(self):
    """
    Flush the buffered bars into ``self.hist_bar_df`` and regenerate
    ``self._factor_df`` from the full history.

    The buffer (``self._hist_bar_list``) and its day counter
    (``self._hist_bar_days``) are reset as a side effect.
    """
    # One row per buffered bar, keeping only the BAR_ATTRIBUTES fields.
    df = pd.DataFrame(
        [{key: getattr(bar, key) for key in BAR_ATTRIBUTES} for bar in self._hist_bar_list]
    ).set_index('datetime')
    df.index = pd.to_datetime(df.index)

    # Reset the buffer state.
    self._hist_bar_list = []
    self._hist_bar_days = 0

    # Extend hist_bar_df.  pd.concat replaces DataFrame.append, which was
    # deprecated in pandas 1.4 and removed in pandas 2.0.
    if self.hist_bar_df is None:
        self.hist_bar_df = df
    else:
        self.hist_bar_df = pd.concat([self.hist_bar_df, df]).sort_index()

    # Generate the factors; None is passed in the fifth slot of the
    # open/high/low/close/amount/volume column list.
    self._factor_df = get_factor(
        self.hist_bar_df,
        ohlcav_col_name_list=['open_price', 'high_price', 'low_price', 'close_price', None, 'volume'],
        dropna=False,
    )
def _test_account2():
    """
    Check that ``Account.plot_data`` returns the expected data: go short,
    hold for half of the rows, go long, hold until the episode is done and
    plot the resulting value series against the close price.
    """
    from datetime import datetime
    from ibats_common.example.data import load_data
    from ibats_common.backend.factor import get_factor, transfer_2_batch
    from ibats_utils.mess import datetime_2_str
    from ibats_common.analysis.plot import plot_twin

    n_step = 60
    columns = ["open", "high", "low", "close", "amount", "volume"]
    md_df = load_data('RB.csv').set_index('trade_date')[columns]
    md_df.index = pd.DatetimeIndex(md_df.index)
    factors_df = get_factor(md_df, dropna=True)
    df_index, df_columns, data_arr_batch = transfer_2_batch(factors_df, n_step=n_step)
    md_df = md_df.loc[df_index, :]

    # Split the n_step axis into 5 groups and move the group axis last.
    shape = [data_arr_batch.shape[0], 5, n_step // 5, data_arr_batch.shape[2]]
    data_factors = np.transpose(data_arr_batch.reshape(shape), [0, 2, 3, 1])
    print(data_arr_batch.shape, '->', shape, '->', data_factors.shape)

    # Build the Account environment.
    env = Account(md_df, data_factors)
    env.reset()

    # Go short, then hold for half of the available rows.
    env.step(2)
    for _ in range(md_df.shape[0] // 2):
        env.step(3)

    # Go long, then hold until the environment reports done.
    _, _, done = env.step(1)
    while not done:
        _, _, done = env.step(3)

    # Show the result.
    reward_df = env.plot_data()
    value_s = reward_df.iloc[:, 0]
    dt_str = datetime_2_str(datetime.now(), '%Y-%m-%d %H_%M_%S')
    plot_twin(value_s, md_df["close"], name=f'test_account_{dt_str}')
def on_min1_release(self, md_df):
    """
    Produce charts showing how well each saved model predicts the data that
    follows its training date, and bundle them into a docx report.

    For every saved model (ordered by training date) the model is loaded,
    all samples from its training date onward are predicted, and an accuracy
    chart is drawn.  The predictions falling between one model's training
    date and the next model's training date are concatenated and charted
    once more as an overall result.

    :param md_df: market data with ``trade_date``, ``instrument_type`` and
        ``close`` columns
    :return: None
    """
    if md_df is None or md_df.shape[0] == 0:
        self.logger.warning('md_df is None or shape[0] == 0')
        return
    else:
        self.logger.debug('md_df.shape= %s', md_df.shape)

    # Training date and file path of every saved model
    date_file_path_pair_list = self.get_date_file_path_pair_list()
    if len(date_file_path_pair_list) > 0:
        # Sort by date
        date_file_path_pair_list.sort(key=lambda x: x[0])
    # NOTE(review): with an empty list the loop below does not run and the
    # later `date_file_path_pair_list[0][0]` would raise IndexError - confirm
    # that at least one saved model is guaranteed here.

    # Build the data set
    indexed_df = md_df.set_index('trade_date').drop('instrument_type', axis=1)
    trade_date_end = indexed_df.index[-1]
    factor_df = get_factor(indexed_df,
                           ohlcav_col_name_list=self.ohlcav_col_name_list,
                           trade_date_series=self.trade_date_series,
                           delivery_date_series=self.delivery_date_series)
    xs, ys_onehot, trade_date_index = self.get_x_y(factor_df)
    # Convert one-hot targets to class indices
    ys = np.argmax(ys_onehot, axis=1)
    data_len = len(trade_date_index)
    if data_len == 0:
        self.logger.warning('ys 长度为0,请检查是否存在数据错误')
        return
    # Pair each model with the training date of the next model; the last
    # model is paired with None (no upper bound).
    trade_date2_list = [_[0] for _ in date_file_path_pair_list][1:]
    trade_date2_list.append(None)
    # Prediction results
    self.logger.info("按日期分段验证检验预测结果")
    # NOTE(review): real_ys_tot is never used after this line, and neither
    # is the `num` loop index below.
    pred_ys_tot, real_ys_tot, img_meta_dic_list = [], [], []
    # Predict segment by segment according to each model's
    # trade_date_last_train and collect the results in pred_ys_tot
    for num, ((trade_date_last_train, file_path, predict_test_random_state), trade_date_next) in enumerate(zip(
            date_file_path_pair_list, trade_date2_list)):
        # Using the model's training date as the base, the data after it is
        # the validation (out-of-sample) data.
        # Determine the valid date range from - to
        range_from_arr = trade_date_index >= pd.to_datetime(trade_date_last_train)
        range_from_len = len(range_from_arr)
        if range_from_len == 0:
            # range_from_len should equal the number of trade dates, so this
            # condition should never be met
            self.logger.error('总共%d条数据,%s 开始后面没有可验证数据', data_len, trade_date_last_train)
            continue
        true_count = sum(range_from_arr)
        self.logger.debug("len(range_from)=%d, True Count=%d", len(range_from_arr), true_count)
        if true_count == 0:
            self.logger.warning('总共%d条数据,%s 开始后面没有可验证数据', data_len, trade_date_last_train)
            continue
        # All valid dates from trade_date_last_train onward
        trade_date_list_sub = trade_date_index[range_from_arr]
        # in_range is the intersection of range_from and range_to
        if trade_date_next is None:
            in_range_arr = None
            in_range_count = true_count
        else:
            in_range_arr = trade_date_list_sub < pd.to_datetime(trade_date_next)
            in_range_count = sum(in_range_arr)
        if in_range_count == 0:
            self.logger.warning('总共%d条数据,[%s - %s) 之间没有可用数据',
                                data_len, trade_date_last_train, trade_date_next)
            continue
        else:
            self.logger.debug('总共%d条数据,[%s - %s) 之间有 %d 条数据将被验证 model path:%s',
                              data_len, trade_date_last_train, trade_date_next, in_range_count, file_path)
        # Get the xs of the current period.
        # For validation, everything from range_from onward is predicted;
        # range_to is only used as the dividing line for coloring the chart.
        xs_sub, real_ys = xs[range_from_arr, :, :], ys[range_from_arr]
        close_df = indexed_df.loc[trade_date_list_sub, 'close']
        # Load the model
        is_load = self.load_model_if_exist(trade_date_last_train, enable_load_model_if_exist=True)
        if not is_load:
            self.logger.error('%s 模型加载失败:%s', trade_date_last_train, file_path)
            # NOTE(review): skipping here leaves pred_ys_tot shorter than the
            # real_ys built after the loop - confirm show_dl_accuracy
            # tolerates that.
            continue
        # Predict
        pred_ys_one_hot = self.model.predict(xs_sub)
        pred_ys = np.argmax(pred_ys_one_hot, axis=1)
        # Only predictions inside [trade_date_last_train, trade_date_next)
        # go into the overall result
        if in_range_arr is not None and in_range_count > 0:
            pred_ys_tot.extend(pred_ys[in_range_arr])
        else:
            pred_ys_tot.extend(pred_ys)
        # Chart each period separately: with the current model as the base,
        # show the accuracy trend over all of the data that follows
        if trade_date_next is None:
            split_point_list = None
        else:
            split_point_list = [close_df.index[0], trade_date_next, close_df.index[-1]]
        base_line_list = self.trade_date_acc_list[trade_date_last_train]
        img_file_path = show_dl_accuracy(real_ys, pred_ys, close_df, split_point_list,
                                         base_line_list=base_line_list)
        img_meta_dic_list.append({
            'img_file_path': img_file_path,
            'trade_date_last_train': trade_date_last_train,
            'module_file_path': file_path,
            'predict_test_random_state': predict_test_random_state,
            'split_point_list': split_point_list,
            'in_range_count': in_range_count,
            'trade_date_end': trade_date_end,
        })

    # Overall chart: the concatenated per-segment predictions, starting at
    # the earliest model's training date
    pred_ys_tot = np.array(pred_ys_tot)
    trade_date_last_train_first = pd.to_datetime(date_file_path_pair_list[0][0])
    split_point_list = [_[0] for _ in date_file_path_pair_list]
    split_point_list.append(trade_date_index[-1])
    # Get real_ys
    real_ys = ys[trade_date_index >= trade_date_last_train_first]
    close_df = indexed_df.loc[trade_date_index[trade_date_index >= trade_date_last_train_first], 'close']
    img_file_path = show_dl_accuracy(real_ys, pred_ys_tot, close_df, split_point_list)
    img_meta_dic_list.append({
        'img_file_path': img_file_path,
        'trade_date_last_train': trade_date_last_train_first,
        'module_file_path': date_file_path_pair_list[0][1],
        'predict_test_random_state': date_file_path_pair_list[0][2],
        'split_point_list': split_point_list,
        'in_range_count': close_df.shape[0],
        'trade_date_end': trade_date_end,
    })
    # Hard-coded switch: the docx report is always generated
    is_output_docx = True
    if is_output_docx:
        title = f"[{self.stg_run_id}] predict accuracy trend report"
        file_path = summary_release_2_docx(title, img_meta_dic_list)
        copy_file_to(file_path, self.base_folder_path)