Example #1
    def gen_preprocessed_data(self, data, batch_size):
        length = len(data)
        num_batch = int(math.ceil(length / batch_size))
        steps = int(num_batch * self.__sample_rate)

        print(f'\nstart generating preprocessed data ({steps} files) ... ')

        for i in range(steps):
            # show progress
            if i % 10 == 0:
                progress = float(i + 1) / steps * 100.
                print('\rprogress: %.2f%% ' % progress, end='')

            # get a batch
            index_of_batch = i % num_batch
            index_start = int(index_of_batch * batch_size)
            index_end = index_start + batch_size
            batch_src, batch_tar = list(zip(*data[index_start:index_end]))

            # preprocess data
            batch_x, batch_y, _, _ = utils.pipeline(
                self.__encoder_pl,
                batch_src,
                batch_tar, {
                    **self.__data_params, 'tokenizer': self.__tokenizer
                },
                verbose=i == 0)

            # save data to file
            file_path = os.path.join(self.__processed_dir_path,
                                     f'batch_{i}.pkl')
            write_pkl(file_path, [batch_x, batch_y])

        print('finish generating preprocessed data ')
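The snippet above relies on a write_pkl helper (and later examples use load_pkl) that is not shown. A minimal pickle-based sketch of what such helpers might look like, offered only as an assumption about the repo's utilities:

import os
import pickle

def write_pkl(file_path, data):
    """Serialize `data` to `file_path` with pickle (hypothetical helper)."""
    os.makedirs(os.path.dirname(file_path) or '.', exist_ok=True)
    with open(file_path, 'wb') as f:
        pickle.dump(data, f)

def load_pkl(file_path):
    """Load a pickled object back from `file_path` (hypothetical helper)."""
    with open(file_path, 'rb') as f:
        return pickle.load(f)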
Example #2
    def __convert(self, docs, pkl_path):
        """ convert the doc list to trainable data format """
        docs = list(
            map(
                lambda x: list((np.array(self.dict.doc2idx(x)) + self.voc_size)
                               % self.voc_size) if x else x, docs))
        X = []
        y = []

        for i, doc in enumerate(docs):
            len_doc = len(doc)
            len_data = len_doc - (2 * self.__window + 1) + 1
            if len_data <= 0:
                continue

            # CBOW-style pairs: the words within `window` positions on each
            # side of index j form the context X; the center word is the label y
            X += [
                doc[j:j + self.__window] +
                doc[j + self.__window + 1:j + 2 * self.__window + 1]
                for j in range(len_data)
            ]
            y += [doc[j + self.__window] for j in range(len_data)]

        X = np.array(X)
        y = np.array(y)

        X, y = self.__shuffle(X, y)

        # cache data for faster processing next time
        write_pkl(pkl_path, [X, y, self.dict, self.voc_size])

        return X, y
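To illustrate the `(doc2idx(x) + voc_size) % voc_size` trick in __convert: self.dict appears to be a gensim corpora.Dictionary (doc2idx is its API), and doc2idx returns -1 for out-of-vocabulary words, so the modulo folds unknowns onto the last index. A toy sketch, assuming voc_size reserves one extra slot for unknown tokens:

import numpy as np
from gensim import corpora

dictionary = corpora.Dictionary([['bond_a', 'bond_b', 'bond_c']])
voc_size = len(dictionary) + 1      # assumed: one extra slot for unknown tokens
doc = ['bond_a', 'bond_unknown', 'bond_c']
idx = (np.array(dictionary.doc2idx(doc)) + voc_size) % voc_size
print(idx)                          # the unknown word maps to voc_size - 1 == 3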
Example #3
    def __init__(self, input_days=20):
        self.__input_days = input_days
        self.__output_days = input_days // 10

        # get the path of the cache data
        data_subset = load.DATA_SUBSET_FOR_TEMPORAL_INPUT
        subset = os.path.split(data_subset)[1]
        year = load.YEAR_FOR_TEMPORAL_INPUT
        volume_level = load.VOLUME_LEVEL_FOR_TEMPORAL_INPUT
        no_below = load.NO_BELOW_FOR_TEMPORAL_INPUT
        data_index = load.DATA_INDEX_FOR_TEMPORAL_INPUT

        self.__data_pkl_path = os.path.join(
            path.PATH_TMP_DIR,
            f'temporal_input_interval_output_emb_{subset}_{year}_{volume_level}_{data_index}_no_below_{no_below}_input_days_{input_days}.pkl')

        self.__test_emb_pkl_path = os.path.join(
            path.PATH_TMP_DIR, 'emb_test_data_w_3_no_below_1000.pkl')

        if os.path.isfile(self.__data_pkl_path):
            self.__train_X, self.__train_y, self.__test_X, self.__test_y, \
            self.emb_dict, self.emb_voc_size, self.dict, self.voc_size = \
                utils.load_pkl(self.__data_pkl_path)

        else:
            print('\nStart loading embedding model ... ')
            self.__load_emb_model()
            print('Finish loading embedding model ')

            _, _, self.emb_dict, self.emb_voc_size = utils.load_pkl(self.__test_emb_pkl_path)

            data_root_dir = os.path.join(data_subset, year, volume_level, data_index)

            print(f'\nStart loading data from {data_root_dir} ...')

            # load doc list
            train_doc_list = self.__load_dir(os.path.join(data_root_dir, 'train'))
            test_doc_list = self.__load_dir(os.path.join(data_root_dir, 'test'))

            print('Finish loading \n\nStart generating dict for output ... ')

            # generate the dictionary which maps the bond_id to index
            self.dict, self.voc_size = self.__gen_dict(train_doc_list)

            print('Finish generating\n\nStart converting data ...')

            # convert doc list to trainable interval summed one-hot vector
            self.__train_X = self.__convert_input(train_doc_list)
            self.__train_y = self.__convert_output(train_doc_list)
            self.__test_X = self.__convert_input(test_doc_list)
            self.__test_y = self.__convert_output(test_doc_list)

            print('Finish processing ')

            # cache data for faster processing next time
            utils.write_pkl(self.__data_pkl_path, [self.__train_X, self.__train_y, self.__test_X, self.__test_y,
                                                   self.emb_dict, self.emb_voc_size, self.dict, self.voc_size])

        self.__statistic()
Example #4
    def __init__(self, input_days=20, force_refresh=False):
        self.__input_days = input_days
        self.__output_days = input_days // 10

        # get the path of the cache data
        data_subset = load.DATA_SUBSET_FOR_DEALER_PRED
        subset = os.path.split(data_subset)[1]
        year = load.YEAR_FOR_DEALER_PRED
        volume_level = load.VOLUME_LEVEL_FOR_DEALER_PRED
        no_below = load.NO_BELOW_FOR_DEALER_PRED
        data_index = load.DATA_INDEX_FOR_DEALER_PRED

        self.__data_pkl_path = os.path.join(
            path.PATH_TMP_DIR,
            f'temporal_input_interval_output_{subset}_{year}_{volume_level}_{data_index}_no_below_{no_below}_input_days_{input_days}.pkl'
        )

        if os.path.isfile(self.__data_pkl_path) and not force_refresh:
            self.__train_X, self.__train_y, self.__test_X, self.__test_y, self.dict, self.voc_size = \
                utils.load_pkl(self.__data_pkl_path)

        else:
            data_root_dir = os.path.join(data_subset, year, volume_level,
                                         data_index)

            print(f'\nStart loading data from {data_root_dir} ...')

            # load doc list
            train_doc_list = self.__load_dir(
                os.path.join(data_root_dir, 'train'))
            test_doc_list = self.__load_dir(os.path.join(
                data_root_dir, 'test'))

            print('Finish loading \n\nStart processing data ... ')

            # generate the dictionary which maps the bond_id to index
            self.dict, self.voc_size = self.__gen_dict(train_doc_list,
                                                       no_below)

            # convert doc list to trainable interval summed one-hot vector
            self.__train_X = self.__convert_input(train_doc_list)
            self.__train_y = self.__convert_output(train_doc_list)
            self.__test_X = self.__convert_input(test_doc_list)
            self.__test_y = self.__convert_output(test_doc_list)

            print('Finish processing ')

            # cache data for faster processing next time
            utils.write_pkl(self.__data_pkl_path, [
                self.__train_X, self.__train_y, self.__test_X, self.__test_y,
                self.dict, self.voc_size
            ])

        self.__gen_topics_mask()

        self.__statistic()
Example #5
    def get_tokenizer(self):
        print('\nstart training tokenizer ... ')

        self.__tokenizer = utils.pipeline(
            self.__tokenizer_pl, self.__tokenizer_src, self.__tokenizer_tar, self.__data_params,
        )

        del self.__tokenizer_src
        del self.__tokenizer_tar

        print('finish training tokenizer')

        # saving the tokenizer to file
        write_pkl(self.__tokenizer_path, self.__tokenizer)

        return self.__tokenizer
Example #6
    def __init__(self, input_days=20):
        self.__input_days = input_days
        self.__output_days = input_days // 10

        # get the path of the cache data
        subset = os.path.split(load.DATA_SUBSET)[1]
        self.__data_pkl_path = os.path.join(
            path.PATH_TMP_DIR,
            f'interval_input_output_{subset}_{load.YEAR}_{load.VOLUME_LEVEL}_{load.DATA_INDEX}_no_below_{load.NO_BELOW}_input_days_{input_days}.pkl'
        )

        if os.path.isfile(self.__data_pkl_path):
            self.__train_data, self.__test_data, self.dict, self.voc_size = utils.load_pkl(
                self.__data_pkl_path)

        else:

            data_root_dir = os.path.join(load.DATA_SUBSET, load.YEAR,
                                         load.VOLUME_LEVEL, load.DATA_INDEX)

            print(f'\nStart loading data from {data_root_dir} ...')

            # load doc list
            train_doc_list = self.__load_dir(
                os.path.join(data_root_dir, 'train'))
            test_doc_list = self.__load_dir(os.path.join(
                data_root_dir, 'test'))

            print('Finish loading \n\nStart processing data ... ')

            # generate the dictionary which maps the bond_id to index
            self.dict, self.voc_size = self.__gen_dict(train_doc_list)

            # convert doc list to trainable interval summed one-hot vector
            self.__train_data = self.__convert(train_doc_list)
            self.__test_data = self.__convert(test_doc_list)

            print('Finish processing ')

            # cache data for faster processing next time
            utils.write_pkl(self.__data_pkl_path, [
                self.__train_data, self.__test_data, self.dict, self.voc_size
            ])

        self.__gen_topics_mask()

        self.__statistic()
Example #7
    def gen_data(self, data, batch_size):
        length = len(data)
        num_batch = int(math.ceil(length / batch_size))

        print(f'\nstart generating preprocessed data ({num_batch} files) ... ')

        for i in range(num_batch):
            # show progress
            if i % 10 == 0:
                progress = float(i + 1) / num_batch * 100.
                print('\rprogress: %.2f%% ' % progress, end='')

            # get a batch
            index_of_batch = i % num_batch
            index_start = int(index_of_batch * batch_size)
            index_end = index_start + batch_size
            batch_src, batch_tar = list(zip(*data[index_start: index_end]))

            # save data to file
            file_path = os.path.join(self.__processed_dir_path, f'batch_{i}.pkl')
            write_pkl(file_path, [batch_src, batch_tar])

        print('finish generating preprocessed data ')
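A hedged sketch of reading one of the batch_{i}.pkl files written above; the [batch_src, batch_tar] layout follows from the write_pkl call, but the reader itself is not from the repo:

import pickle

def read_batch(file_path):
    # each file stores [batch_src, batch_tar] as written by write_pkl above
    with open(file_path, 'rb') as f:
        batch_src, batch_tar = pickle.load(f)
    return batch_src, batch_tar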
Example #8
def gen_group_according_to(file_path):
    print('loading data ...')
    dict_dealer_index_2_group = utils.load_json(file_path)

    data, d_dealers, total_volume, total_transaction_count, bound_timestamp, d_new_bonds = utils.load_pkl(
        os.path.join(path.ROOT_DIR, 'runtime', 'tmp123.pkl'))

    utils.write_pkl(
        os.path.join(path.ROOT_DIR, 'runtime', 'tmp_d_dealers.pkl'), d_dealers)
    # d_dealers = utils.load_pkl(os.path.join(path.ROOT_DIR, 'runtime', 'tmp_d_dealers.pkl'))

    labels = set(dict_dealer_index_2_group.values())
    group_list = [{} for _ in range(len(labels))]

    print('traversing data ...')

    length = len(d_dealers)
    cur = 0
    for dealer_index, trace_list in d_dealers.items():
        # show progress
        if cur % 5 == 0:
            progress = float(cur + 1) / length * 100.
            print('\rprogress: %.2f%% ' % progress, end='')
        cur += 1

        if dealer_index not in dict_dealer_index_2_group:
            continue

        group_index = dict_dealer_index_2_group[dealer_index]
        group_list[group_index][dealer_index] = trace_list

    print('\rprogress: 100.0%  \nsaving data ...')

    plan_name = os.path.splitext(os.path.split(file_path)[1])[0] + '.json'
    group_path = os.path.join(path.DATA_ROOT_DIR, 'groups', plan_name)
    utils.write_json(group_path, group_list)
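A toy illustration of the grouping loop above (made-up data, not from the repo): dealers are bucketed into one dict per group label.

dict_dealer_index_2_group = {'d1': 0, 'd2': 1, 'd3': 0}
d_dealers = {'d1': [('bond_a', 10)], 'd2': [('bond_b', 5)], 'd3': [('bond_c', 2)]}

group_list = [{} for _ in range(len(set(dict_dealer_index_2_group.values())))]
for dealer_index, trace_list in d_dealers.items():
    group_list[dict_dealer_index_2_group[dealer_index]][dealer_index] = trace_list
# group_list == [{'d1': [('bond_a', 10)], 'd3': [('bond_c', 2)]}, {'d2': [('bond_b', 5)]}]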
Example #9
            'total_transaction_count': tmp_trace_count,
            'total_volume': tmp_volume,
            'dictionary': tmp_dictionary,
            'no_below_transaction_count': new_l[-1][0],
            'no_below_volume': new_l[-1][1],
            'no_below_num_bonds': new_l[-1][2],
            'trace_list': tmp_trace_list,
        }

    if new_l[-1][0] == 0 or new_l[-1][2] <= 5:
        continue
    l_dealers.append(new_l)
l_dealers.sort(key=lambda x: -x[1])

print(f'len of d_dealer_for_gen_input: {len(d_dealer_for_gen_input)}')
utils.write_pkl(os.path.join(path.ROOT_DIR, 'runtime', 'd_dealer_for_gen_input_with_no_below_50_25_10.pkl'), d_dealer_for_gen_input)
# print('done')
# exit()


string = 'num_of_dealers,dealers_total_transaction_count,dealers_total_transaction_count(percentage),'
string += 'dealer_total_volume,dealer_total_volume(percentage),'
string += 'num_of_old_bonds,num_of_old_bonds(percentage),'
string += 'dealers_total_transaction_count_2,dealers_total_transaction_count_2(percentage),'
string += 'dealer_total_volume_2,dealer_total_volume_2(percentage),'
string += 'num_of_old_bonds_2,num_of_old_bonds_2(percentage)\n'

for num_of_dealers in range(20, 270, 20):

    tmp_dealers = l_dealers[:num_of_dealers]
    dict_first_num_dealers = {}
Example #10
    def __init__(self, input_days=20, force_refresh=False):
        self.__input_days = input_days
        self.__output_days = input_days // 10
        self.__input_mode = 0

        # get the path of the cache data
        data_subset = load.DATA_SUBSET_FOR_TEMPORAL_INPUT
        subset = os.path.split(data_subset)[1]
        year = load.YEAR_FOR_TEMPORAL_INPUT
        volume_level = load.VOLUME_LEVEL_FOR_TEMPORAL_INPUT
        no_below = load.NO_BELOW_FOR_TEMPORAL_INPUT
        data_index = load.DATA_INDEX_FOR_TEMPORAL_INPUT

        data_all_root_dir = os.path.join(data_subset, year, volume_level)
        all_level = os.path.split(data_all_root_dir)[1]

        self.__data_pkl_path = os.path.join(
            path.PATH_TMP_DIR,
            f'temporal_input_interval_output_for_pretrain_same_volume_{all_level}_{subset}_{year}_{volume_level}_{data_index}_no_below_{no_below}_input_days_{input_days}.pkl'
        )

        if os.path.isfile(self.__data_pkl_path) and not force_refresh:
            self.__train_X, self.__train_y, self.__test_X, self.__test_y, self.dict, self.voc_size = \
                utils.load_pkl(self.__data_pkl_path)

        else:
            print(f'\nStart loading data from {data_all_root_dir} ...')

            train_start_timestamp = utils.date_2_timestamp('2015-01-02')
            train_end_timestamp = utils.date_2_timestamp('2015-10-14', True)

            test_start_timestamp = utils.date_2_timestamp('2015-10-14')
            test_end_timestamp = utils.date_2_timestamp('2015-12-31', True)

            data_all_pkl_path = os.path.join(
                path.PATH_TMP_DIR,
                f'all_doc_list_for_pretrain_{subset}_{year}_{all_level}.pkl')

            if os.path.isfile(data_all_pkl_path):
                train_all_doc_list, test_all_doc_list = utils.load_pkl(
                    data_all_pkl_path)

            else:
                train_all_doc_list = self.__load_dir_all(
                    data_all_root_dir, train_start_timestamp,
                    train_end_timestamp, 'train')
                test_all_doc_list = self.__load_dir_all(
                    data_all_root_dir, test_start_timestamp,
                    test_end_timestamp, 'test')

                # train_all_doc_list = []
                # test_all_doc_list = []
                # for _volume in os.listdir(data_all_root_dir):
                #     sub_all_root_dir = os.path.join(data_all_root_dir, _volume)
                #     sub_train_all_doc_list = self.__load_dir_all(sub_all_root_dir, train_start_timestamp,
                #                                                  train_end_timestamp,
                #                                                  'train')
                #     sub_test_all_doc_list = self.__load_dir_all(sub_all_root_dir, test_start_timestamp,
                #                                                 test_end_timestamp,
                #                                                 'test')
                #
                #     train_all_doc_list += sub_train_all_doc_list
                #     test_all_doc_list += sub_test_all_doc_list

                utils.write_pkl(data_all_pkl_path,
                                [train_all_doc_list, test_all_doc_list])

            print('Finish loading \n\nStart processing data ... ')

            train_all_docs = []
            for v in train_all_doc_list:
                train_all_docs += v
            test_all_docs = []
            for v in test_all_doc_list:
                test_all_docs += v
            del train_all_doc_list
            del test_all_doc_list

            self.dict, self.voc_size = self.__gen_dict(train_all_docs, 150)

            # # generate the dictionary which maps the bond_id to index
            # self.dict, self.voc_size = self.__gen_dict(train_all_doc_list, no_below)

            print(self.voc_size)
            # print(self.voc_size_all)
            print(
                'Finish generating dict\n\nStart converting input output ...')

            # convert doc list to trainable interval summed one-hot vector
            self.__train_X = self.__convert_input(train_all_docs, self.dict,
                                                  self.voc_size,
                                                  'allow_unknown')
            self.__train_y = self.__convert_output(train_all_docs)
            self.__test_X = self.__convert_input(test_all_docs, self.dict,
                                                 self.voc_size,
                                                 'allow_unknown')
            self.__test_y = self.__convert_output(test_all_docs)

            # self.__train_X = 0.
            # self.__test_X = 0.
            # self.__train_y = 0.
            # self.__test_y = 0.
            #
            # for doc_list in train_all_doc_list:
            #     self.__train_X += self.__convert_input(doc_list, self.dict, self.voc_size, 'allow_unknown')
            #     self.__train_y += self.__convert_output(doc_list)
            # self.__train_X /= len(train_all_doc_list)
            #
            # for doc_list in test_all_doc_list:
            #     self.__test_X += self.__convert_input(doc_list, self.dict, self.voc_size, 'allow_unknown')
            #     self.__test_y += self.__convert_output(doc_list)
            # self.__test_X /= len(test_all_doc_list)

            # self.__train_all_X = np.array(
            #     list(map(lambda x: self.__convert_input(x, self.dict_all, self.voc_size_all, 'allow_unknown'),
            #              train_all_doc_list)))
            # self.__train_all_y = np.array(list(map(self.__convert_output, train_all_doc_list)))
            # self.__test_all_X = np.array(
            #     list(map(lambda x: self.__convert_input(x, self.dict_all, self.voc_size_all, 'allow_unknown'),
            #              test_all_doc_list)))
            # self.__test_all_y = np.array(list(map(self.__convert_output, test_all_doc_list)))

            # self.__train_all_X = np.mean(self.__train_all_X, axis=0)
            # self.__train_all_y = np.mean(self.__train_all_y, axis=0)
            # self.__test_all_X = np.mean(self.__test_all_X, axis=0)
            # self.__test_all_y = np.mean(self.__test_all_y, axis=0)

            # # convert doc list to trainable interval summed one-hot vector
            # self.__train_X = self.__convert_input(train_doc_list, self.dict, self.voc_size, 'allow_unknown')
            # self.__train_y = self.__convert_output(train_doc_list)
            # self.__test_X = self.__convert_input(test_doc_list, self.dict, self.voc_size, 'allow_unknown')
            # self.__test_y = self.__convert_output(test_doc_list)

            # self.__train_X = np.vstack([self.__train_X.transpose([2, 0, 1]), self.__train_all_X.transpose([2, 0, 1])])
            # self.__test_X = np.vstack([self.__test_X.transpose([2, 0, 1]), self.__test_all_X.transpose([2, 0, 1])])
            # self.__train_X = self.__train_X.transpose([1, 2, 0])
            # self.__test_X = self.__test_X.transpose([1, 2, 0])

            print('Finish processing ')

            # cache data for faster processing next time
            utils.write_pkl(self.__data_pkl_path, [
                self.__train_X, self.__train_y, self.__test_X, self.__test_y,
                self.dict, self.voc_size
            ])

        self.__gen_topics_mask()

        self.__statistic()
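The __init__ above calls utils.date_2_timestamp('2015-01-02') and utils.date_2_timestamp('2015-10-14', True), a helper that is not shown. A hypothetical stand-in, assuming the second argument means "end of that day":

from datetime import datetime, timedelta

def date_2_timestamp(date_str, end_of_day=False):
    # hypothetical re-implementation; the repo's actual helper may differ
    dt = datetime.strptime(date_str, '%Y-%m-%d')
    if end_of_day:
        dt += timedelta(days=1)
    return int(dt.timestamp())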
Example #11
def gen_inputs(group_file_path,
               group_index,
               input_time_steps_list,
               output_time_steps_list,
               with_day_off=True,
               buy_sell_plan=2,
               use_volume=False,
               save_path='',
               split_ratio=0.9,
               is_train=True):
    d_dealer_index_2_group_label = utils.load_json(group_file_path)
    # d_dealer_index_2_trace_list = utils.load_pkl(os.path.join(path.ROOT_DIR, 'runtime', 'tmp_d_dealers.pkl'))

    d_dealer_for_gen_input = utils.load_pkl(
        os.path.join(path.ROOT_DIR, 'runtime',
                     'd_dealer_for_gen_input_with_no_below_50_25_10.pkl'))

    tmp_list = [
        dealer_index
        for dealer_index, group_label in d_dealer_index_2_group_label.items()
        if group_label == group_index
    ]
    print(f'len_group_member: {len(tmp_list)}')

    # get total trace list
    train_trace_list = []
    test_trace_list = []
    for dealer_index, val in d_dealer_for_gen_input.items():
        if dealer_index not in d_dealer_index_2_group_label or \
                d_dealer_index_2_group_label[dealer_index] != group_index:
            continue
        trace_list = val['trace_list']
        trace_list.sort(key=lambda x: x[-1])

        num_samples = len(trace_list) - max(input_time_steps_list) - max(
            output_time_steps_list)
        split_index = int(num_samples * split_ratio +
                          max(input_time_steps_list))
        train_trace_list += trace_list[:split_index]
        test_trace_list += trace_list[split_index:]

        # print(dealer_index, len(trace_list), trace_list)

    train_trace_list.sort(key=lambda x: x[-1])

    # get dictionary
    train_doc_list = [list(map(lambda x: x[0], train_trace_list))]
    dictionary = corpora.Dictionary(train_doc_list)
    len_bonds = len(dictionary)
    print(f'total bond num (group {group_index}): {len_bonds}')

    X = []
    X_mask = []
    Y = []

    for dealer_index, val in d_dealer_for_gen_input.items():
        if dealer_index not in d_dealer_index_2_group_label or \
                d_dealer_index_2_group_label[dealer_index] != group_index:
            continue

        # filter bonds that only appear in test set

        trace_list = val['trace_list']
        num_samples = len(trace_list) - max(input_time_steps_list) - max(
            output_time_steps_list)
        split_index = int(num_samples * split_ratio +
                          max(input_time_steps_list))

        if is_train:
            trace_list = trace_list[:split_index]
        else:
            trace_list = trace_list[split_index:]
            trace_list = [
                v for v in trace_list if dictionary.doc2idx([v[0]])[0] != -1
            ]
        trace_list.sort(key=lambda x: x[-1])

        start_date = trace_list[0][-1]
        end_date = trace_list[-1][-1]

        # Format the data in date structure
        date_matrix, date_mask, dict_date_2_input_m_index = __generate_date_structure(
            len_bonds, start_date, end_date, with_day_off, buy_sell_plan)

        # according to the transaction history, fill the data into date structure
        for i, trace in enumerate(trace_list):
            bond_id = trace[0]
            volume = trace[1]
            _date = trace[-1]
            trace_type = trace[2]
            bond_index = dictionary.doc2idx([bond_id])[0]

            value = 1 if not use_volume else np.log10(volume)
            if _date not in dict_date_2_input_m_index:
                continue

            date_mask = __change_mask(buy_sell_plan, date_mask, bond_index,
                                      len_bonds, dict_date_2_input_m_index,
                                      _date, trace_type, value)

        # sample the data
        longest_input_length = max(input_time_steps_list) + 2
        for input_time_steps in input_time_steps_list:
            for output_time_steps in output_time_steps_list:
                # input_list = __sample_list(date_matrix, input_time_steps, 0, output_time_steps,
                #                            __token(date_matrix.shape[-1]), __token(date_matrix.shape[-1]))
                input_mask_list = __sample_list(
                    date_mask,
                    input_time_steps,
                    0,
                    output_time_steps,
                    __start_token_mask(date_mask.shape[1:], with_day_off),
                    __end_token_mask(date_mask.shape[1:], with_day_off),
                    longest_len=longest_input_length)

                convert_fn = __convert_2_zero_one if buy_sell_plan in [
                    0, 2
                ] else None
                output_list = __sample_list(date_mask,
                                            output_time_steps,
                                            input_time_steps,
                                            0,
                                            convert_fn=convert_fn)

                if len(input_mask_list) != len(output_list):
                    continue

                # ...
                # X += input_list
                X_mask += input_mask_list
                Y += output_list

        # d_dealer_index_2_trace_list_ordered[dealer_index] = [date_matrix, date_mask]

    if save_path:
        del d_dealer_for_gen_input
        del d_dealer_index_2_group_label
        del X

        X_mask = np.asarray(X_mask, dtype=np.int32)
        Y = np.asarray(Y, dtype=np.int32)

        print('\n------------------------------')
        # print(len(X))
        print(X_mask.shape)
        print(Y.shape)

        len_X = len(X_mask)
        num_files = int(np.ceil(len_X / 2000.))
        for i in range(num_files):
            start_index = i * 2000
            end_index = (i + 1) * 2000
            utils.write_pkl(
                save_path + f'_{i}.pkl',
                [X_mask[start_index:end_index], Y[start_index:end_index]])

    # return d_dealer_index_2_trace_list_ordered
    return X_mask, Y
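Worked example of the split logic in gen_inputs (illustrative numbers only): with 100 trades for a dealer, max(input_time_steps_list) = 20 and max(output_time_steps_list) = 5, there are 100 - 20 - 5 = 75 usable samples; split_ratio = 0.9 puts the cut at int(75 * 0.9 + 20) = 87, so trace_list[:87] feeds training and trace_list[87:] testing.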
Example #12
    def __init__(self, input_days=20, force_refresh=False):
        self.__input_days = input_days
        self.__output_days = input_days // 10

        # get the path of the cache data
        data_subset = load.DATA_SUBSET_FOR_TEMPORAL_INPUT
        subset = os.path.split(data_subset)[1]
        year = load.YEAR_FOR_TEMPORAL_INPUT
        volume_level = load.VOLUME_LEVEL_FOR_TEMPORAL_INPUT
        no_below = load.NO_BELOW_FOR_TEMPORAL_INPUT
        data_index = load.DATA_INDEX_FOR_TEMPORAL_INPUT

        self.__data_pkl_path = os.path.join(
            path.PATH_TMP_DIR,
            f'temporal_input_interval_output_has_pretrain_same_volume_{subset}_{year}_{volume_level}_{data_index}_no_below_{no_below}_input_days_{input_days}.pkl'
        )

        data_all_root_dir = os.path.join(data_subset, year, volume_level)
        all_level = os.path.split(data_all_root_dir)[1]

        self.__pretrain_pkl_path = os.path.join(
            path.PATH_TMP_DIR,
            f'temporal_input_interval_output_for_pretrain_same_volume_{all_level}_{subset}_{year}_{volume_level}_dealer_83_no_below_{no_below}_input_days_{input_days}.pkl'
        )

        _, _, _, _, self.dict_pretrain, self.voc_size_pretrain = utils.load_pkl(
            self.__pretrain_pkl_path)

        if os.path.isfile(self.__data_pkl_path) and not force_refresh:
            self.__train_X, self.__train_y, self.__test_X, self.__test_y, self.dict, self.voc_size = \
                utils.load_pkl(self.__data_pkl_path)

        else:
            data_root_dir = os.path.join(data_subset, year, volume_level,
                                         data_index)

            print(f'\nStart loading data from {data_root_dir} ...')

            # load doc list
            train_doc_list = self.__load_dir(
                os.path.join(data_root_dir, 'train'))
            test_doc_list = self.__load_dir(os.path.join(
                data_root_dir, 'test'))

            print('Finish loading \n\nStart processing data ... ')

            # generate the dictionary which maps the bond_id to index
            self.dict, self.voc_size = self.__gen_dict(train_doc_list,
                                                       no_below)

            # convert doc list to trainable interval summed one-hot vector
            self.__train_X = self.__convert_input(train_doc_list, self.dict,
                                                  self.voc_size)
            self.__train_y = self.__convert_output(train_doc_list)
            self.__test_X = self.__convert_input(test_doc_list, self.dict,
                                                 self.voc_size)
            self.__test_y = self.__convert_output(test_doc_list)

            self.__train_X_pretrain = self.__convert_input(
                train_doc_list, self.dict_pretrain, self.voc_size_pretrain)
            self.__test_X_pretrain = self.__convert_input(
                test_doc_list, self.dict_pretrain, self.voc_size_pretrain)

            o_lstm = LSTM('2020_01_12_18_49_46',
                          'lstm_for_pretrain_with_same_volume', 2007)
            o_lstm.compile(0.001)
            o_lstm.load_model(
                r'D:\Github\bond_prediction\runtime\models\lstm_for_pretrain_with_same_volume\2020_01_12_18_49_46\lstm_for_pretrain_with_same_volume.030-0.0596.hdf5',
                np.zeros([1, 20, 2007]), np.zeros([1, 20, 2007]))

            self.__train_X_pretrain = o_lstm.predict(self.__train_X_pretrain)
            self.__test_X_pretrain = o_lstm.predict(self.__test_X_pretrain)

            # tile the pretrained representation across the 20 input time steps
            # so it can later be concatenated with the per-day features
            self.__train_X_pretrain = np.array([
                self.__train_X_pretrain for _ in range(20)
            ]).transpose([1, 0, 2])
            self.__test_X_pretrain = np.array([
                self.__test_X_pretrain for _ in range(20)
            ]).transpose([1, 0, 2])

            # print(self.__train_X_pretrain.shape)
            # print(self.__test_X_pretrain.shape)

            # concatenate the pretrained features with the one-hot features along
            # the last axis: move the feature axis first, vstack, transpose back
            self.__train_X = np.vstack([
                self.__train_X.transpose([2, 0, 1]),
                self.__train_X_pretrain.transpose([2, 0, 1])
            ])
            self.__test_X = np.vstack([
                self.__test_X.transpose([2, 0, 1]),
                self.__test_X_pretrain.transpose([2, 0, 1])
            ])
            self.__train_X = self.__train_X.transpose([1, 2, 0])
            self.__test_X = self.__test_X.transpose([1, 2, 0])

            # print(self.__train_X.shape)
            # print(self.__test_X.shape)

            print('Finish processing ')

            # cache data for faster processing next time
            utils.write_pkl(self.__data_pkl_path, [
                self.__train_X, self.__train_y, self.__test_X, self.__test_y,
                self.dict, self.voc_size
            ])

        self.__gen_topics_mask()

        self.__statistic()
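A toy sketch of the transpose/vstack pattern used at the end of the last example, which concatenates two (batch, time, features) blocks along the feature axis (the shapes below are illustrative assumptions, not the repo's actual dimensions):

import numpy as np

a = np.zeros([8, 20, 100])     # e.g. per-bond one-hot features
b = np.zeros([8, 20, 64])      # e.g. tiled pretrained features (assumed width)
merged = np.vstack([a.transpose([2, 0, 1]), b.transpose([2, 0, 1])])
merged = merged.transpose([1, 2, 0])
print(merged.shape)            # (8, 20, 164)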