Пример #1
0
    def load_train_test(self, indexed_df, enable_load_model, rebuild_model=False, enable_train_if_load_not_suss=True,
                        enable_train_even_load_succ=False, enable_test=False):
        """
        Load and/or train the model for the latest trade date and record its accuracy.

        :param indexed_df: market data; its last index entry is converted to the trade date
        :param enable_load_model: when true, try ``self.load_model_if_exist(trade_date)``
        :param rebuild_model: when true, call ``self.get_model(rebuild_model=True)`` first
        :param enable_train_if_load_not_suss: train when no model has been loaded
        :param enable_train_even_load_succ: train regardless of the load result
        :param enable_test: additionally run ``self.predict_test`` (internal testing only;
            ``self.valid_model_acc`` has replaced it for normal use)
        :return: the factor data frame that was used for training / validation
        """
        if rebuild_model:
            self.get_model(rebuild_model=True)

        trade_date = str_2_date(indexed_df.index[-1])
        # Try to restore a previously saved model for this trade date.
        is_load = self.load_model_if_exist(trade_date) if enable_load_model else False

        need_train = enable_train_even_load_succ or (enable_train_if_load_not_suss and not is_load)
        if not need_train:
            # No training requested: only measure the accuracy of the current model.
            factor_df = get_factor(indexed_df, ohlcav_col_name_list=self.ohlcav_col_name_list,
                                   trade_date_series=self.trade_date_series,
                                   delivery_date_series=self.delivery_date_series)
            train_acc, val_acc = self.valid_model_acc(factor_df)
        else:
            factor_df_dic = get_factor(indexed_df, ohlcav_col_name_list=self.ohlcav_col_name_list,
                                       trade_date_series=self.trade_date_series,
                                       delivery_date_series=self.delivery_date_series, do_multiple_factors=True)
            factor_df = factor_df_dic[1]
            attempt = 0
            # Keep re-sampling and re-training until the accuracy criteria are met.
            while True:
                attempt += 1
                if attempt > 1:
                    # Every retry starts from a freshly built model.
                    self.get_model(rebuild_model=True)
                train_acc, val_acc = self.train(factor_df_dic, predict_test_random_state=attempt)
                if self.over_fitting_train_acc is not None and train_acc > self.over_fitting_train_acc:
                    # Training accuracy above the configured ceiling is treated as over-fitting.
                    self.logger.warning('第 %d 次训练,训练集精度 train_acc=%.2f%% 过高,可能存在过拟合,重新采样训练',
                                        attempt, train_acc * 100)
                    continue
                base_line = self.validation_accuracy_base_line
                # NOTE: a stricter check on the train/validation accuracy gap used to sit here
                # as commented-out code; only the base-line check is active.
                if base_line is None or not val_acc < base_line:
                    break
                self.logger.warning('第 %d 次训练,训练结果不及预期,重新采样训练', attempt)

            self.save_model(trade_date)
            self.trade_date_last_train = trade_date

        self.trade_date_acc_list[trade_date] = [train_acc, val_acc]

        if enable_test:
            self.predict_test(factor_df)

        return factor_df
Пример #2
0
def _test_account():
    """
    Smoke test: build an ``Account`` from the RB sample data and take one step.

    Checks the observation shape returned by ``reset()`` and by the first
    ``step()``, and that the episode is not finished after a single step.
    """
    # Prepare market data and factor batches.
    n_step = 60
    ohlcav_col_name_list = ["open", "high", "low", "close", "amount", "volume"]
    from ibats_common.example.data import load_data
    md_df = load_data('RB.csv').set_index('trade_date')[ohlcav_col_name_list]
    md_df.index = pd.DatetimeIndex(md_df.index)
    from ibats_common.backend.factor import get_factor, transfer_2_batch
    factors_df = get_factor(md_df, dropna=True)
    df_index, df_columns, data_arr_batch = transfer_2_batch(factors_df,
                                                            n_step=n_step)
    md_df = md_df.loc[df_index, :]
    # Split the n_step axis into 5 groups of n_step // 5, then move the
    # group axis to the end.
    shape = [
        data_arr_batch.shape[0], 5,
        n_step // 5, data_arr_batch.shape[2]
    ]
    data_factors = np.transpose(data_arr_batch.reshape(shape), [0, 2, 3, 1])
    print(data_arr_batch.shape, '->', shape, '->', data_factors.shape)
    # Build the Account environment.
    env = Account(md_df, data_factors)
    next_observation = env.reset()
    print('next_observation.shape:', next_observation.shape)
    assert next_observation.shape == (1, 12, 78, 5)
    next_state, reward, done = env.step(1)
    # Verify the state returned by step(); the original re-checked the
    # observation from reset(), leaving the step result untested.
    assert next_state.shape == (1, 12, 78, 5)
    assert not done
Пример #3
0
    def on_min1(self, md_df, context):
        """
        Handle a bar event: retrain if due, predict the latest signal and trade on it.

        :param md_df: market data with 'trade_date', 'instrument_type' and 'close' columns
        :param context: mapping; ``context[ContextKey.instrument_id_list][0]`` is the traded instrument
        :return: None
        """
        if self.do_nothing_on_min_bar:  # debugging switch
            return

        # Re-index the market data by trade date.
        indexed_df = md_df.set_index('trade_date').drop('instrument_type', axis=1)
        indexed_df.index = pd.DatetimeIndex(indexed_df.index)
        # Latest trade date and the time elapsed since the last training.
        trade_date = str_2_date(indexed_df.index[-1])
        days_after_last_train = (trade_date - self.trade_date_last_train).days
        retrain_due = self.retrain_period is not None and 0 < self.retrain_period < days_after_last_train
        if retrain_due:
            self.logger.info('当前日期 %s 距离上一次训练 %s 已经过去 %d 天,重新训练',
                             trade_date, self.trade_date_last_train, days_after_last_train)
            factor_df = self.load_train_test(indexed_df, rebuild_model=True,
                                             enable_load_model=self.enable_load_model_if_exist)
        else:
            factor_df = get_factor(indexed_df, ohlcav_col_name_list=self.ohlcav_col_name_list,
                                   trade_date_series=self.trade_date_series,
                                   delivery_date_series=self.delivery_date_series)

        # Prediction: 0 = hold, 1 = buy, 2 = sell.
        pred_mark = self.predict_latest(factor_df)
        is_holding = pred_mark == 0
        is_buy = pred_mark == 1
        is_sell = pred_mark == 2
        close = md_df['close'].iloc[-1]
        instrument_id = context[ContextKey.instrument_id_list][0]

        if is_buy:
            positions = self.get_position(instrument_id)
            already_long = False
            if positions is not None:
                for pos_info in positions.values():
                    direction = pos_info.direction
                    if direction == Direction.Short:
                        # Close the opposite side before going long.
                        self.close_short(instrument_id, close, pos_info.position)
                    elif direction == Direction.Long:
                        already_long = True
            if already_long:
                self.logger.debug("%s %s     %.2f holding", self.trade_agent.curr_timestamp, instrument_id, close)
            else:
                self.open_long(instrument_id, close, self.unit)

        if is_sell:
            positions = self.get_position(instrument_id)
            already_short = False
            if positions is not None:
                for pos_info in positions.values():
                    direction = pos_info.direction
                    if direction == Direction.Long:
                        # Close the opposite side before going short.
                        self.close_long(instrument_id, close, pos_info.position)
                    elif direction == Direction.Short:
                        already_short = True
            if already_short:
                self.logger.debug("%s %s     %.2f holding", self.trade_agent.curr_timestamp, instrument_id, close)
            else:
                self.open_short(instrument_id, close, self.unit)

        if is_holding:
            self.logger.debug("%s %s * * %.2f holding", self.trade_agent.curr_timestamp, instrument_id, close)
Пример #4
0
def _test_factor_analysis():
    """Run ``factor_analysis`` on factors computed from the RB sample data."""
    from ibats_common.example.data import load_data
    from ibats_common.backend.factor import get_factor
    raw_df = load_data("RB.csv", index_col='trade_date', parse_index_to_datetime=True)
    md_df = raw_df.drop(['instrument_type'], axis=1)
    # Factors are generated with 'with_diff_n' switched off; incomplete rows are dropped.
    price_factor_kwargs = {'with_diff_n': False}
    factor_df = get_factor(md_df, price_factor_kwargs=price_factor_kwargs)
    factor_df = factor_df.dropna()
    factor_analysis(factor_df)
Пример #5
0
def _test_summary_md_2_docx(auto_open_file=True):
    """
    Generate the market-data summary docx for one instrument's sample data.

    :param auto_open_file: open the generated file with the system application
    :return: None
    """
    from ibats_common.example.data import load_data
    instrument_type = 'RU'  # 'RB' 'RU'
    file_name = f"{instrument_type}.csv"

    md_df = load_data(file_name).set_index('trade_date').drop('instrument_type', axis=1)
    md_df.index = pd.DatetimeIndex(md_df.index)
    # Remember the raw columns before factor columns are added.
    original_columns = list(md_df.columns)
    ohlcav_col_name_list = ["open", "high", "low", "close", "amount", "volume"]

    from ibats_common.backend.factor import get_factor
    from ibats_common.example.data import get_trade_date_series
    from ibats_common.example.data import get_delivery_date_series
    factor_df = get_factor(
        md_df,
        ohlcav_col_name_list=ohlcav_col_name_list,
        trade_date_series=get_trade_date_series(),
        delivery_date_series=get_delivery_date_series(instrument_type))

    # Per-section keyword arguments handed to summary_md_2_docx.
    close_only = ['close']
    func_kwargs_dic = {
        "hist": {
            "figure_4_each_col": False,
            "columns": original_columns,
            "col_transfer_dic": {'return': ['open', 'high', 'low', 'close', 'volume']},
        },
        "drawdown": {"col_name_list": list(close_only)},
        "rr": {"col_name_list": list(close_only)},
        "hist_future_n_rr": {'n_days': [3, 5], "columns": list(close_only)},
        "rr_quantile": {'columns': list(close_only)},
        "validation": {'trade_date_max_gap': 10},
    }
    file_path = summary_md_2_docx(
        factor_df,
        enable_show_plot=False,
        enable_save_plot=True,
        close_key='close',
        name=instrument_type,
        func_kwargs_dic=func_kwargs_dic)
    if auto_open_file:
        open_file_with_system_app(file_path)
Пример #6
0
def _test_quote_market():
    """
    Walk a ``QuotesMarket`` through a fixed action sequence and check the
    position flag and reward after every step.
    """
    n_step = 60
    ohlcav_col_name_list = ["open", "high", "low", "close", "amount", "volume"]
    from ibats_common.example.data import load_data
    md_df = load_data('RB.csv').set_index('trade_date')[ohlcav_col_name_list]
    md_df.index = pd.DatetimeIndex(md_df.index)
    from ibats_common.backend.factor import get_factor, transfer_2_batch
    factors_df = get_factor(md_df, dropna=True)
    df_index, df_columns, data_arr_batch = transfer_2_batch(factors_df, n_step=n_step)
    md_df = md_df.loc[df_index, :]
    # Build the QuotesMarket environment.
    qm = QuotesMarket(md_df=md_df[['close', 'open']],
                      data_factors=data_arr_batch,
                      state_with_flag=True)

    # Initial state: (factor window, flag) with an empty position.
    observation = qm.reset()
    assert len(observation) == 2
    assert observation[0].shape[0] == n_step
    assert observation[1] == 0

    # Action 1 sets the flag to 1.
    observation, reward, done = qm.step(1)
    assert len(observation) == 2
    assert observation[1] == 1
    assert not done

    # (action, expected flag, whether the reward must be zero)
    for action, expected_flag, zero_reward in ((0, 0, False), (0, 0, True), (3, 0, True)):
        observation, reward, done = qm.step(action)
        assert observation[1] == expected_flag
        if zero_reward:
            assert reward == 0
        else:
            assert reward != 0

    # Action 2 sets the flag to -1.
    observation, reward, done = qm.step(2)
    assert observation[1] == -1
    assert not done

    # Action 3 leaves the flag at -1 and yields a non-zero reward.
    observation, reward, done = qm.step(3)
    assert observation[1] == -1
    assert reward != 0

    # Actions above 3 are not supported.
    try:
        qm.step(4)
    except ValueError:
        print('is ok for not supporting action>3')
Пример #7
0
def _test_quote_market():
    """
    Walk a ``QuotesMarket`` through long / close / keep / short actions and
    check the position flag and reward after every step.
    """
    import os
    n_step = 60
    from ibats_common.example.data import load_data
    # Sample data lives three directories up, under example/data.
    data_folder = os.path.join(os.pardir, os.pardir, os.pardir, 'example', 'data')
    raw_df = load_data('RB.csv', folder_path=data_folder)
    md_df = raw_df.set_index('trade_date')[DEFAULT_MD_OHLCVA_LABELS]
    md_df.index = pd.DatetimeIndex(md_df.index)
    from ibats_common.backend.factor import get_factor, transfer_2_batch
    factors_df = get_factor(md_df, dropna=True)
    df_index, df_columns, data_arr_batch = transfer_2_batch(factors_df, n_step=n_step)
    md_df = md_df.loc[df_index, :]
    # Build the QuotesMarket environment.
    price_df = md_df[['close', 'open']]
    qm = QuotesMarket(md_df=price_df, data_factors=data_arr_batch, state_with_flag=True)

    # Initial state has three components and an empty position.
    state = qm.reset()
    assert len(state) == 3
    assert state[0].shape[0] == n_step
    assert state[1] == FLAG_EMPTY

    # Open a long position.
    state, reward, done = qm.step(ACTION_LONG)
    assert len(state) == 3
    assert state[1] == FLAG_LONG
    assert not done

    # Closing the long position yields a non-zero reward.
    state, reward, done = qm.step(ACTION_CLOSE)
    assert state[1] == FLAG_EMPTY
    assert reward != 0

    # Closing or keeping with no position yields no reward.
    for idle_action in (ACTION_CLOSE, ACTION_KEEP):
        state, reward, done = qm.step(idle_action)
        assert state[1] == FLAG_EMPTY
        assert reward == 0

    # Open a short position.
    state, reward, done = qm.step(ACTION_SHORT)
    assert state[1] == FLAG_SHORT
    assert not done

    # Keeping the short position yields a non-zero reward.
    state, reward, done = qm.step(ACTION_KEEP)
    assert state[1] == FLAG_SHORT
    assert reward != 0

    # Actions above 3 are not supported.
    try:
        qm.step(4)
    except ValueError:
        print('is ok for not supporting action>3')
Пример #8
0
def get_xy(df: pd.DataFrame,
           target_n_bars=5,
           get_factor_kwargs=None,
           get_y_kwargs=None):
    """
    Build aligned X (factor) and Y (target) data and drop invalid rows.

    :param df: historical bar data
    :param target_n_bars: number of trailing bars cut from ``df`` and from the
        factor frame; also forwarded to ``get_y`` as ``target_n_bars``
    :param get_factor_kwargs: extra keyword arguments for ``get_factor``
    :param get_y_kwargs: extra keyword arguments for ``get_y``; the caller's
        dict is not modified, and ``target_n_bars`` overrides any same-named key
    :return: ``(available_df, available_factor_df, x_arr, y_arr)`` - the valid
        rows of the bar data, the valid rows of the factors, the factors as an
        array, and the matching targets
    """
    get_factor_kwargs = {} if get_factor_kwargs is None else get_factor_kwargs
    factor_df = get_factor(df, **get_factor_kwargs)
    # Copy instead of update() so the caller's dict is left untouched.
    get_y_kwargs = {**(get_y_kwargs or {}), "target_n_bars": target_n_bars}
    # The original passed get_factor_kwargs here, so get_y never received
    # target_n_bars or any of its own options.
    y_s = get_y(df, **get_y_kwargs)
    # Cut the trailing bars from both the bar data and the factors.
    hist_bar_df = df.iloc[:-target_n_bars]
    factor_df = factor_df.iloc[:-target_n_bars]
    logger.info("hist_bar_df.shape=%s, factor_df.shape=%s", hist_bar_df.shape,
                factor_df.shape)
    logger.info("y_s.shape=%s", y_s.shape)
    assert factor_df.shape[0] == y_s.shape[0], \
        f"因子数据 x 长度 {factor_df.shape[0]} 要与训练目标 y 数据长度 {y_s.shape[0]} 一致"
    # Keep only rows where the target and every factor are finite
    # (neither NaN nor +/-inf).
    is_available = (np.isfinite(y_s.to_numpy())
                    & np.all(np.isfinite(factor_df.to_numpy()), axis=1))
    available_df = hist_bar_df[is_available]
    available_factor_df = factor_df[is_available]
    x_arr = available_factor_df.to_numpy()
    y_arr = y_s[is_available]
    assert x_arr.shape[0] == y_arr.shape[0], \
        f"因子数据 x 长度 {x_arr.shape[0]} 要与训练目标 y 数据长度 {y_arr.shape[0]} 一致"
    logger.info("x_arr.shape=%s, y_arr.shape=%s", x_arr.shape, y_arr.shape)
    return available_df, available_factor_df, x_arr, y_arr
Пример #9
0
 def generate_factors(self):
     """
     Flush the buffered bars into ``hist_bar_df`` and recompute the factors.

     The bars in ``self._hist_bar_list`` become a frame indexed by
     'datetime', the buffer state is reset, the frame is merged into
     ``self.hist_bar_df`` (kept sorted by index), and ``self._factor_df`` is
     rebuilt from the full history.
     """
     df = pd.DataFrame([{key: getattr(bar, key)
                         for key in BAR_ATTRIBUTES}
                        for bar in self._hist_bar_list]).set_index('datetime')
     df.index = pd.to_datetime(df.index)
     # Reset the buffer state.
     self._hist_bar_list = []
     self._hist_bar_days = 0
     # Extend hist_bar_df. DataFrame.append was removed in pandas 2.0, so
     # pd.concat is used; it gives the same row-wise concatenation.
     if self.hist_bar_df is None:
         self.hist_bar_df = df
     else:
         self.hist_bar_df = pd.concat([self.hist_bar_df, df]).sort_index()
     # Generate the factors; the None entry leaves one slot of
     # ohlcav_col_name_list without a column.
     self._factor_df = get_factor(self.hist_bar_df,
                                  ohlcav_col_name_list=[
                                      'open_price', 'high_price',
                                      'low_price', 'close_price', None,
                                      'volume'
                                  ],
                                  dropna=False)
Пример #10
0
def _test_account2():
    """
    Check the data returned by ``plot_data``: go short for the first half of
    the data, then long until the episode ends, and plot value against close.
    """
    n_step = 60
    ohlcav_col_name_list = ["open", "high", "low", "close", "amount", "volume"]
    from ibats_common.example.data import load_data
    md_df = load_data('RB.csv').set_index('trade_date')[ohlcav_col_name_list]
    md_df.index = pd.DatetimeIndex(md_df.index)
    from ibats_common.backend.factor import get_factor, transfer_2_batch
    factors_df = get_factor(md_df, dropna=True)
    df_index, df_columns, data_arr_batch = transfer_2_batch(factors_df, n_step=n_step)
    md_df = md_df.loc[df_index, :]
    # Split the n_step axis into 5 groups, then move the group axis to the end.
    batch_count, _, factor_count = data_arr_batch.shape[0], data_arr_batch.shape[1], data_arr_batch.shape[2]
    shape = [batch_count, 5, int(n_step / 5), factor_count]
    data_factors = np.transpose(data_arr_batch.reshape(shape), [0, 2, 3, 1])
    print(data_arr_batch.shape, '->', shape, '->', data_factors.shape)
    # Build the Account environment.
    env = Account(md_df, data_factors)
    env.reset()
    # Go short (action 2), then repeat action 3 for half of the data.
    env.step(2)
    for _ in range(int(md_df.shape[0] / 2)):
        env.step(3)
    # Go long (action 1), then repeat action 3 until the episode is done.
    next_observation, reward, done = env.step(1)
    while not done:
        next_observation, reward, done = env.step(3)

    # Plot the first column of plot_data() against the close price.
    reward_df = env.plot_data()
    value_s = reward_df.iloc[:, 0]
    from ibats_utils.mess import datetime_2_str
    from datetime import datetime
    timestamp = datetime_2_str(datetime.now(), '%Y-%m-%d %H_%M_%S')
    title = f'test_account_{timestamp}'
    from ibats_common.analysis.plot import plot_twin
    plot_twin(value_s, md_df["close"], name=title)
Пример #11
0
    def on_min1_release(self, md_df):
        """
        Build a report showing how each saved model's prediction accuracy
        evolves on the data that follows its training date.

        For every saved model (ordered by training date) all samples from its
        training date onward are predicted and a chart is produced by
        ``show_dl_accuracy``. A final chart combines, per model, the
        predictions for its own [training date, next training date) segment.
        The charts are collected by ``summary_release_2_docx`` and the
        resulting file is copied into ``self.base_folder_path``.

        :param md_df: market data with 'trade_date', 'instrument_type' and 'close' columns
        :return: None
        """
        if md_df is None or md_df.shape[0] == 0:
            self.logger.warning('md_df is None or shape[0] == 0')
            return
        else:
            self.logger.debug('md_df.shape= %s', md_df.shape)

        # Training date / file path / random state of every saved model.
        date_file_path_pair_list = self.get_date_file_path_pair_list()
        if not date_file_path_pair_list:
            # Nothing to evaluate; the summary section below would otherwise
            # fail with IndexError on date_file_path_pair_list[0].
            self.logger.warning('no saved model found, accuracy trend report is skipped')
            return
        # Order by training date.
        date_file_path_pair_list.sort(key=lambda x: x[0])

        # Build the data set.
        indexed_df = md_df.set_index('trade_date').drop('instrument_type', axis=1)
        trade_date_end = indexed_df.index[-1]
        factor_df = get_factor(indexed_df, ohlcav_col_name_list=self.ohlcav_col_name_list,
                               trade_date_series=self.trade_date_series,
                               delivery_date_series=self.delivery_date_series)
        xs, ys_onehot, trade_date_index = self.get_x_y(factor_df)
        ys = np.argmax(ys_onehot, axis=1)
        data_len = len(trade_date_index)
        if data_len == 0:
            self.logger.warning('ys 长度为0,请检查是否存在数据错误')
            return
        # Pair every model with the training date of the next one; the last
        # model has no successor (None).
        trade_date2_list = [_[0] for _ in date_file_path_pair_list][1:]
        trade_date2_list.append(None)
        # Predictions.
        self.logger.info("按日期分段验证检验预测结果")
        pred_ys_tot, img_meta_dic_list = [], []
        # Predict segment by segment according to each model's training date
        # and collect the results in pred_ys_tot.
        for num, ((trade_date_last_train, file_path, predict_test_random_state), trade_date_next) in enumerate(zip(
                date_file_path_pair_list, trade_date2_list)):
            # Data after the model's training date serves as out-of-sample
            # validation data. Determine the valid date range from - to.
            range_from_arr = trade_date_index >= pd.to_datetime(trade_date_last_train)
            range_from_len = len(range_from_arr)
            if range_from_len == 0:  # same length as trade_date_index, so this should never be true
                self.logger.error('总共%d条数据,%s 开始后面没有可验证数据', data_len, trade_date_last_train)
                continue
            true_count = sum(range_from_arr)
            self.logger.debug("len(range_from)=%d, True Count=%d", len(range_from_arr), true_count)
            if true_count == 0:
                self.logger.warning('总共%d条数据,%s 开始后面没有可验证数据', data_len, trade_date_last_train)
                continue
            # All valid dates from trade_date_last_train onward.
            trade_date_list_sub = trade_date_index[range_from_arr]

            # in_range is the intersection of range_from and range_to.
            if trade_date_next is None:
                in_range_arr = None
                in_range_count = true_count
            else:
                in_range_arr = trade_date_list_sub < pd.to_datetime(trade_date_next)
                in_range_count = sum(in_range_arr)
                if in_range_count == 0:
                    self.logger.warning('总共%d条数据,[%s - %s) 之间没有可用数据',
                                        data_len, trade_date_last_train, trade_date_next)
                    continue
                else:
                    self.logger.debug('总共%d条数据,[%s - %s) 之间有 %d 条数据将被验证 model path:%s',
                                      data_len, trade_date_last_train, trade_date_next, in_range_count, file_path)

            # Samples for the current period. Everything from range_from
            # onward is predicted; range_to only marks where the chart is split.
            xs_sub, real_ys = xs[range_from_arr, :, :], ys[range_from_arr]
            close_df = indexed_df.loc[trade_date_list_sub, 'close']

            # Load the model.
            is_load = self.load_model_if_exist(trade_date_last_train, enable_load_model_if_exist=True)
            if not is_load:
                self.logger.error('%s 模型加载失败:%s', trade_date_last_train, file_path)
                continue
            # Predict.
            pred_ys_one_hot = self.model.predict(xs_sub)
            pred_ys = np.argmax(pred_ys_one_hot, axis=1)
            # Only this model's own segment goes into the combined result.
            if in_range_arr is not None and in_range_count > 0:
                pred_ys_tot.extend(pred_ys[in_range_arr])
            else:
                pred_ys_tot.extend(pred_ys)

            # Chart the accuracy of the current model over all later data.
            if trade_date_next is None:
                split_point_list = None
            else:
                split_point_list = [close_df.index[0], trade_date_next, close_df.index[-1]]
            base_line_list = self.trade_date_acc_list[trade_date_last_train]
            img_file_path = show_dl_accuracy(real_ys, pred_ys, close_df, split_point_list,
                                             base_line_list=base_line_list)
            img_meta_dic_list.append({
                'img_file_path': img_file_path,
                'trade_date_last_train': trade_date_last_train,
                'module_file_path': file_path,
                'predict_test_random_state': predict_test_random_state,
                'split_point_list': split_point_list,
                'in_range_count': in_range_count,
                'trade_date_end': trade_date_end,
            })

        # Combined chart over all segments, starting at the first model's
        # training date.
        pred_ys_tot = np.array(pred_ys_tot)
        trade_date_last_train_first = pd.to_datetime(date_file_path_pair_list[0][0])
        split_point_list = [_[0] for _ in date_file_path_pair_list]
        split_point_list.append(trade_date_index[-1])
        # Real labels for the same range.
        real_ys = ys[trade_date_index >= trade_date_last_train_first]
        close_df = indexed_df.loc[trade_date_index[trade_date_index >= trade_date_last_train_first], 'close']
        img_file_path = show_dl_accuracy(real_ys, pred_ys_tot, close_df, split_point_list)

        img_meta_dic_list.append({
            'img_file_path': img_file_path,
            'trade_date_last_train': trade_date_last_train_first,
            'module_file_path': date_file_path_pair_list[0][1],
            'predict_test_random_state': date_file_path_pair_list[0][2],
            'split_point_list': split_point_list,
            'in_range_count': close_df.shape[0],
            'trade_date_end': trade_date_end,
        })
        # Switch for writing the docx report.
        is_output_docx = True
        if is_output_docx:
            title = f"[{self.stg_run_id}] predict accuracy trend report"
            file_path = summary_release_2_docx(title, img_meta_dic_list)
            copy_file_to(file_path, self.base_folder_path)