Example #1
    def __init__(self, input_days=20):
        self.__input_days = input_days
        self.__output_days = input_days // 10

        # get the path of the cache data
        data_subset = load.DATA_SUBSET_FOR_TEMPORAL_INPUT
        subset = os.path.split(data_subset)[1]
        year = load.YEAR_FOR_TEMPORAL_INPUT
        volume_level = load.VOLUME_LEVEL_FOR_TEMPORAL_INPUT
        no_below = load.NO_BELOW_FOR_TEMPORAL_INPUT
        data_index = load.DATA_INDEX_FOR_TEMPORAL_INPUT

        self.__data_pkl_path = os.path.join(
            path.PATH_TMP_DIR,
            f'temporal_input_interval_output_emb_{subset}_{year}_{volume_level}_{data_index}_no_below_{no_below}_input_days_{input_days}.pkl')

        self.__test_emb_pkl_path = os.path.join(
            path.PATH_TMP_DIR, 'emb_test_data_w_3_no_below_1000.pkl')

        if os.path.isfile(self.__data_pkl_path):
            self.__train_X, self.__train_y, self.__test_X, self.__test_y, \
            self.emb_dict, self.emb_voc_size, self.dict, self.voc_size = \
                utils.load_pkl(self.__data_pkl_path)

        else:
            print('\nStart loading embedding model ... ')
            self.__load_emb_model()
            print('Finish loading embedding model ')

            _, _, self.emb_dict, self.emb_voc_size = utils.load_pkl(self.__test_emb_pkl_path)

            data_root_dir = os.path.join(data_subset, year, volume_level, data_index)

            print(f'\nStart loading data from {data_root_dir} ...')

            # load doc list
            train_doc_list = self.__load_dir(os.path.join(data_root_dir, 'train'))
            test_doc_list = self.__load_dir(os.path.join(data_root_dir, 'test'))

            print('Finish loading \n\nStart generating dict for output ... ')

            # generate the dictionary which maps the bond_id to index
            self.dict, self.voc_size = self.__gen_dict(train_doc_list)

            print('Finish generating\n\nStart converting data ...')

            # convert doc list to trainable interval summed one-hot vector
            self.__train_X = self.__convert_input(train_doc_list)
            self.__train_y = self.__convert_output(train_doc_list)
            self.__test_X = self.__convert_input(test_doc_list)
            self.__test_y = self.__convert_output(test_doc_list)

            print('Finish processing ')

            # cache data for faster processing next time
            utils.write_pkl(self.__data_pkl_path, [self.__train_X, self.__train_y, self.__test_X, self.__test_y,
                                                   self.emb_dict, self.emb_voc_size, self.dict, self.voc_size])

        self.__statistic()
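The load-from-cache-or-rebuild pattern above (check for the pickle, otherwise build and write it) recurs in several of these loaders. A minimal self-contained sketch of that pattern; load_or_build, the build callable and the cache path are illustrative, not part of the original utils module:

import os
import pickle


def load_or_build(cache_path, build):
    # reuse the cached result if it exists; otherwise build it and cache it for next time
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    result = build()
    with open(cache_path, 'wb') as f:
        pickle.dump(result, f)
    return result


# illustrative usage:
# data = load_or_build('tmp/example_cache.pkl', lambda: expensive_preprocessing())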
Example #2
    def __load(self):
        data_queue = []
        max_queue_size = min(self.size(), self.queue_size)
        max_buffer_size = min(self.size(), self.buffer_size)

        while self.__running:
            while len(data_queue) < max_queue_size:
                file_path = self.__file_list[self.__cur_index]
                self.__cur_index = (self.__cur_index + 1) % self.__len_files

                batch_src, batch_tar = load_pkl(file_path)

                # preprocess data
                batch_x, batch_y, batch_lan_x, batch_lan_y, batch_pos_y = utils.pipeline(
                    self.__encoder_pl, batch_src, batch_tar, {**self.__data_params, 'tokenizer': self.__tokenizer},
                    verbose=False
                )

                data_queue += list(zip(batch_x, batch_y, batch_lan_x, batch_lan_y, batch_pos_y))

            if len(self.__data) < max_buffer_size:
                random.seed(42)
                random.shuffle(data_queue)

                self.__data += data_queue
                data_queue = []

            time.sleep(0.1)

        print('Stop thread for loading data ')
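This __load method (and the near-identical one in Example #8) is the producer half of a background-loading thread: it keeps a shared buffer topped up while training code consumes from it. A stripped-down sketch of that pattern; the class and attribute names here are illustrative, not the original Loader:

import threading
import time


class BufferedLoader:
    def __init__(self, items, buffer_size=256):
        self._items = items               # stand-in for the shuffled pickle file list
        self._buffer_size = buffer_size
        self._data = []
        self._running = True
        self._index = 0
        # daemon thread so it stops with the main process
        self._thread = threading.Thread(target=self._load, daemon=True)
        self._thread.start()

    def _load(self):
        while self._running:
            while len(self._data) < self._buffer_size:
                item = self._items[self._index]   # stand-in for load_pkl + preprocessing
                self._index = (self._index + 1) % len(self._items)
                self._data.append(item)
            time.sleep(0.1)

    def pop(self):
        # block until the producer thread has put something into the buffer
        while not self._data:
            time.sleep(0.01)
        return self._data.pop(0)

    def stop(self):
        self._running = False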
Example #3
    def __load_train(self):
        """ load train data """
        print('\nStart loading train data')

        if os.path.isfile(self.__emb_pkl_path):
            self.__train_X, self.__train_y, self.dict, self.voc_size = load_pkl(
                self.__emb_pkl_path)

        else:
            print('loading doc list ...')

            # load the doc_list
            emb_json_path = os.path.join(path.PATH_TMP_DIR, 'emb_data.json')
            if os.path.isfile(emb_json_path):
                docs = load_json(emb_json_path)
            else:
                path_list = self.__get_path_list()
                docs = self.__load_docs(path_list, emb_json_path)

            print('generating dictionary ...')

            # generate the dictionary which maps the bond_id to index
            self.dict, self.voc_size = self.__gen_dict(docs)

            print('converting docs to trainable data format ...')

            # convert the doc list to trainable data format
            self.__train_X, self.__train_y = self.__convert(
                docs, self.__emb_pkl_path)

        print('Finish loading train data')
Example #4
    def __load_test(self):
        """ load test data """
        print('\nStart loading test data ...')

        if os.path.isfile(self.__test_emb_pkl_path):
            self.__test_X, self.__test_y, _, _ = load_pkl(
                self.__test_emb_pkl_path)

        else:
            print('loading test doc list ...')

            # load the doc_list
            emb_json_path = os.path.join(path.PATH_TMP_DIR,
                                         'emb_test_data.json')
            if os.path.isfile(emb_json_path):
                docs = load_json(emb_json_path)
            else:
                path_list = self.__get_path_list('test')
                docs = self.__load_docs(path_list, emb_json_path)

            print('converting test docs to trainable test data format ...')

            # convert the doc list to trainable data format
            self.__test_X, self.__test_y = self.__convert(
                docs, self.__test_emb_pkl_path)

        print('Finish loading test data')
Example #5
    def __init__(self):
        # load the data
        self.train_loader = self.Loader(*self.train_preprocess_dirs)
        self.val_loader = self.Loader(*self.val_preprocess_dirs)

        # get the generator of the dataset
        self.train_data = self.train_loader.generator(
            self.M.pos_emb, self.M.train_params['batch_size'])
        self.val_data = self.val_loader.generator(
            self.M.pos_emb, self.M.train_params['batch_size'])

        # get the data size
        self.train_size = self.train_loader.size()
        self.val_size = self.val_loader.size()

        # get an example of a batch
        self.train_example_x, self.train_example_y = self.train_loader.batch_example(
            self.M.pos_emb)
        self.train_batch = self.train_loader.batch_data(self.M.pos_emb)
        self.val_batch = self.val_loader.batch_data(self.M.pos_emb)

        # load the tokenizer
        self.tokenizer = load_pkl(
            get_file_path(data_dir, 'tokenizer', self.tokenizer_dir,
                          'tokenizer.pkl'))
        self.vocab_size = self.tokenizer.vocab_size

        # show some statistics for dataset
        print(f'vocab_size: {self.vocab_size}\n')
        self.train_stats = self.train_loader.show_statistics(self.M.pos_emb)
        self.val_stats = self.val_loader.show_statistics(self.M.pos_emb)
Example #6
    def __init__(self,
                 start_ratio=0.0,
                 end_ratio=0.98,
                 _sample_rate=1.0,
                 data_params={},
                 tokenizer_pl=[],
                 encoder_pl=[],
                 _tokenizer_dir='cdlm',
                 _dataset='cdlm'):
        # initialize variables
        self.__data_params = data_params
        self.__tokenizer_pl = tokenizer_pl
        self.__encoder_pl = encoder_pl
        self.__sample_rate = _sample_rate

        self.__tokenizer_path = os.path.join(
            create_dir(data_dir, 'tokenizer', _tokenizer_dir), 'tokenizer.pkl')
        self.__processed_dir_path = create_dir(data_dir, 'preprocessed',
                                               _dataset)

        # load data from files
        # zh_en_dict = load_json(filtered_pos_union_en_zh_dict_path)
        zh_en_dict = load_json(filtered_pos_union_zh_en_dict_path)
        zh_en_list = list(
            filter(lambda x: 'translation' in x[1] and x[1]['translation'],
                   zh_en_dict.items()))
        zh_en_list = list(
            map(
                lambda x: [[x[0]] * len(x[1]['translation']), x[1][
                    'translation']], zh_en_list))
        # data = reduce(lambda x, y: [x[0] + y[0], x[1] + y[1]], zh_en_list)

        zh_data = []
        en_data = []
        length = len(zh_en_list)
        for i, val in enumerate(zh_en_list):
            if i % 50 == 0:
                progress = float(i + 1) / length * 100.
                print('\rprogress: %.2f%% ' % progress, end='')

            zh_data += val[0]
            en_data += val[1]

        data = list(zip(zh_data, en_data))

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        # get tokenizer
        if os.path.isfile(self.__tokenizer_path):
            self.__tokenizer = load_pkl(self.__tokenizer_path)
        else:
            self.__tokenizer_src, self.__tokenizer_tar = list(zip(*data))
            self.get_tokenizer()

        # get the data set (train or validation or test)
        data = self.__split_data(data, start_ratio, end_ratio)

        self.gen_preprocessed_data(data, self.BATCH_SIZE_PER_FILE)
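The loop above flattens a zh→en dictionary into parallel sentence pairs before shuffling them with a fixed seed. A compact sketch of that step, assuming the dictionary shape implied by the filter (each value carries a non-empty 'translation' list); the sample entries are made up:

import random

zh_en_dict = {
    '你好': {'translation': ['hello', 'hi']},
    '谢谢': {'translation': ['thanks']},
}

pairs = []
for zh, info in zh_en_dict.items():
    pairs += [(zh, en) for en in info.get('translation', [])]

random.seed(42)      # a fixed seed keeps the shuffle (and any later ratio split) reproducible
random.shuffle(pairs)
print(pairs)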
Example #7
    def __init__(self, tokenizer_dir, un_preprocess_dirs,
                 data_params={}, pretrain_params={}, encoder_pl=[]):
        # initialize variables
        self.__data_params = data_params
        self.__pretrain_params = pretrain_params
        self.__encoder_pl = encoder_pl
        self.__dirs = un_preprocess_dirs

        self.__running = True
        self.__cur_index = 0
        self.__data = []
        self.__file_list = []

        self.__tokenizer = load_pkl(get_file_path(data_dir, 'tokenizer', tokenizer_dir, 'tokenizer.pkl'))

        # get the list of all files
        for dir_name in self.__dirs:
            _dir_path = create_dir(data_dir, 'un_preprocessed', dir_name)
            self.__file_list += list(map(lambda x: os.path.join(_dir_path, x), os.listdir(_dir_path)))
        self.__len_files = len(self.__file_list)

        random.seed(self.RANDOM_STATE)
        random.shuffle(self.__file_list)

        self.start()
Example #8
    def __load(self):
        data_queue = []
        max_queue_size = min(self.size(), self.queue_size)
        max_buffer_size = min(self.size(), self.buffer_size)

        while self.__running:
            while len(data_queue) < max_queue_size:
                file_path = self.__file_list[self.__cur_index]
                self.__cur_index = (self.__cur_index + 1) % self.__len_files

                batch_x, batch_y, batch_lan_x, batch_lan_y, batch_pos_y = load_pkl(
                    file_path)
                data_queue += list(
                    zip(batch_x, batch_y, batch_lan_x, batch_lan_y,
                        batch_pos_y))

            if len(self.__data) < max_buffer_size:
                random.seed(42)
                random.shuffle(data_queue)

                self.__data += data_queue
                data_queue = []

            time.sleep(0.1)

        print('Stop thread for loading data ')
Example #9
    def __init__(self, input_days=20, force_refresh=False):
        self.__input_days = input_days
        self.__output_days = input_days // 10

        # get the path of the cache data
        data_subset = load.DATA_SUBSET_FOR_DEALER_PRED
        subset = os.path.split(data_subset)[1]
        year = load.YEAR_FOR_DEALER_PRED
        volume_level = load.VOLUME_LEVEL_FOR_DEALER_PRED
        no_below = load.NO_BELOW_FOR_DEALER_PRED
        data_index = load.DATA_INDEX_FOR_DEALER_PRED

        self.__data_pkl_path = os.path.join(
            path.PATH_TMP_DIR,
            f'temporal_input_interval_output_{subset}_{year}_{volume_level}_{data_index}_no_below_{no_below}_input_days_{input_days}.pkl'
        )

        if os.path.isfile(self.__data_pkl_path) and not force_refresh:
            self.__train_X, self.__train_y, self.__test_X, self.__test_y, self.dict, self.voc_size = \
                utils.load_pkl(self.__data_pkl_path)

        else:
            data_root_dir = os.path.join(data_subset, year, volume_level,
                                         data_index)

            print(f'\nStart loading data from {data_root_dir} ...')

            # load doc list
            train_doc_list = self.__load_dir(
                os.path.join(data_root_dir, 'train'))
            test_doc_list = self.__load_dir(os.path.join(
                data_root_dir, 'test'))

            print('Finish loading \n\nStart processing data ... ')

            # generate the dictionary which maps the bond_id to index
            self.dict, self.voc_size = self.__gen_dict(train_doc_list,
                                                       no_below)

            # convert doc list to trainable interval summed one-hot vector
            self.__train_X = self.__convert_input(train_doc_list)
            self.__train_y = self.__convert_output(train_doc_list)
            self.__test_X = self.__convert_input(test_doc_list)
            self.__test_y = self.__convert_output(test_doc_list)

            print('Finish processing ')

            # cache data for faster processing next time
            utils.write_pkl(self.__data_pkl_path, [
                self.__train_X, self.__train_y, self.__test_X, self.__test_y,
                self.dict, self.voc_size
            ])

        self.__gen_topics_mask()

        self.__statistic()
Example #10
    def __preprocess(self):
        """ preprocess the data to list of list token idx """
        print('\nProcessing data ... ')

        if self.tokenizer_dir:
            self.__src_tokenizer = load_pkl(
                get_file_path(data_dir, 'tokenizer', self.tokenizer_dir,
                              'tokenizer.pkl'))
            self.__tar_tokenizer = self.__src_tokenizer

        elif self.M.checkpoint_params['load_model']:
            load_model_params = self.M.checkpoint_params['load_model']

            tokenizer_path = create_dir_in_root('runtime', 'tokenizer',
                                                load_model_params[0],
                                                load_model_params[1],
                                                'tokenizer.pkl')
            self.__src_tokenizer = self.__tar_tokenizer = read_cache(
                tokenizer_path)

        else:
            self.__src_tokenizer = utils.pipeline(
                self.M.tokenizer_pl,
                self.__tokenizer_data_src,
                self.__tokenizer_data_tar,
                self.M.data_params,
            )
            self.__tar_tokenizer = self.__src_tokenizer
            del self.__tokenizer_data_src
            del self.__tokenizer_data_tar

        params = {
            **self.M.data_params,
            'tokenizer': self.__src_tokenizer,
            'src_tokenizer': self.__src_tokenizer,
            'tar_tokenizer': self.__tar_tokenizer,
        }

        self.__train_src_encode, self.__train_tar_encode, _, _ = utils.pipeline(
            self.M.encode_pipeline, self.__train_src, self.__train_tar, params)

        self.__val_src_encode, self.__val_tar_encode, _, _ = utils.pipeline(
            self.M.encode_pipeline, self.__val_src, self.__val_tar, params)

        self.__test_src_encode, self.__test_tar_encode, _, _ = utils.pipeline(
            self.M.encode_pipeline, self.__test_src, self.__test_tar, params)

        # get vocabulary size
        self.__src_vocab_size = self.__src_tokenizer.vocab_size
        self.__tar_vocab_size = self.__tar_tokenizer.vocab_size

        print('\nFinish preprocessing ')
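The tokenizer selection above falls back through three sources: a tokenizer cached under tokenizer_dir, one stored with a loaded checkpoint, or a freshly built one. A hedged sketch of that fallback order; resolve_tokenizer, the paths and the build_tokenizer callable are assumptions for illustration, not the original API:

import os
import pickle


def resolve_tokenizer(cached_path, checkpoint_path, build_tokenizer):
    # prefer a previously cached tokenizer, then the one saved with a checkpoint,
    # and only build a new tokenizer if neither file exists
    for candidate in (cached_path, checkpoint_path):
        if candidate and os.path.isfile(candidate):
            with open(candidate, 'rb') as f:
                return pickle.load(f)
    return build_tokenizer()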
Example #11
    def __init__(self,
                 _is_train,
                 _sample_rate=1.0,
                 data_params={},
                 tokenizer_pl=[],
                 encoder_pl=[],
                 _tokenizer_dir='cdlm',
                 _dataset='cdlm'):
        # initialize variables
        self.__data_params = data_params
        self.__tokenizer_pl = tokenizer_pl
        self.__encoder_pl = encoder_pl
        self.__sample_rate = _sample_rate

        self.__tokenizer_path = os.path.join(
            create_dir(data_dir, 'tokenizer', _tokenizer_dir), 'tokenizer.pkl')
        self.__processed_dir_path = create_dir(data_dir, 'preprocessed',
                                               _dataset)

        # initialize wmt news loader
        start_ratio = 0.0 if _is_train else zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO
        end_ratio = zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
        zh_en_wmt_loader = zh_en_wmt_news.Loader(start_ratio, end_ratio)

        # initialize news commentary loader
        start_ratio = 0.0 if _is_train else zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO
        end_ratio = zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
        zh_en_news_commentary_loader = zh_en_news_commentary.Loader(
            start_ratio, end_ratio, 0.2)

        # load the data
        zh_data, en_data = zh_en_wmt_loader.data()
        zh_data_2, en_data_2 = zh_en_news_commentary_loader.data()

        # combine data
        zh_data += zh_data_2
        en_data += en_data_2
        data = list(zip(zh_data, en_data))

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        # get tokenizer
        if os.path.isfile(self.__tokenizer_path):
            self.__tokenizer = load_pkl(self.__tokenizer_path)
        else:
            self.__tokenizer_src, self.__tokenizer_tar = list(zip(*data))
            self.get_tokenizer()

        self.gen_preprocessed_data(data, self.BATCH_SIZE_PER_FILE)
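The PRETRAIN_TRAIN_RATIO logic above picks either the first portion of each corpus (for training) or the remainder (for validation) via start/end ratios. A minimal sketch of that kind of ratio split; the function name and data are illustrative:

def split_by_ratio(data, start_ratio, end_ratio):
    # keep the slice between the two ratios of the full list
    start = int(len(data) * start_ratio)
    end = int(len(data) * end_ratio)
    return data[start:end]


samples = list(range(100))
train = split_by_ratio(samples, 0.0, 0.9)   # first 90% for pretraining
val = split_by_ratio(samples, 0.9, 1.0)     # remaining 10% for validation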
Example #12
    def __load(self):
        while self.__running:

            while len(self.__X) < self.__buffer_size:
                file_path = self.__file_list[self.__cur_index]
                self.__cur_index = (self.__cur_index + 1) % self.__len_files

                X_mask, Y = utils.load_pkl(file_path)

                self.__X = np.vstack([self.__X, X_mask]) if len(
                    self.__X) else X_mask
                self.__y = np.vstack([self.__y, Y]) if len(self.__y) else Y

            time.sleep(1)

        print('Stop thread for loading data ')
Example #13
    def __init__(self, input_days=20):
        self.__input_days = input_days
        self.__output_days = input_days // 10

        # get the path of the cache data
        subset = os.path.split(load.DATA_SUBSET)[1]
        self.__data_pkl_path = os.path.join(
            path.PATH_TMP_DIR,
            f'interval_input_output_{subset}_{load.YEAR}_{load.VOLUME_LEVEL}_{load.DATA_INDEX}_no_below_{load.NO_BELOW}_input_days_{input_days}.pkl'
        )

        if os.path.isfile(self.__data_pkl_path):
            self.__train_data, self.__test_data, self.dict, self.voc_size = utils.load_pkl(
                self.__data_pkl_path)

        else:

            data_root_dir = os.path.join(load.DATA_SUBSET, load.YEAR,
                                         load.VOLUME_LEVEL, load.DATA_INDEX)

            print(f'\nStart loading data from {data_root_dir} ...')

            # load doc list
            train_doc_list = self.__load_dir(
                os.path.join(data_root_dir, 'train'))
            test_doc_list = self.__load_dir(os.path.join(
                data_root_dir, 'test'))

            print('Finish loading \n\nStart processing data ... ')

            # generate the dictionary which maps the bond_id to index
            self.dict, self.voc_size = self.__gen_dict(train_doc_list)

            # convert doc list to trainable interval summed one-hot vector
            self.__train_data = self.__convert(train_doc_list)
            self.__test_data = self.__convert(test_doc_list)

            print('Finish processing ')

            # cache data for faster processing next time
            utils.write_pkl(self.__data_pkl_path, [
                self.__train_data, self.__test_data, self.dict, self.voc_size
            ])

        self.__gen_topics_mask()

        self.__statistic()
Example #14
    def __init__(self,
                 start_ratio=0.0,
                 end_ratio=0.98,
                 _sample_rate=1.0,
                 data_params={},
                 tokenizer_pl=[],
                 encoder_pl=[],
                 _tokenizer_dir='cdlm',
                 _dataset='cdlm'):
        # initialize variables
        self.__data_params = data_params
        self.__tokenizer_pl = tokenizer_pl
        self.__encoder_pl = encoder_pl
        self.__sample_rate = _sample_rate

        self.__tokenizer_path = os.path.join(
            create_dir(data_dir, 'tokenizer', _tokenizer_dir), 'tokenizer.pkl')
        self.__processed_dir_path = create_dir(data_dir, 'preprocessed',
                                               _dataset)

        # load data from files
        data = news_commentary.zh_en()

        data = self.__split_data(data, 0., self.NMT_TRAIN_RATIO)

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        # get tokenizer
        if os.path.isfile(self.__tokenizer_path):
            self.__tokenizer = load_pkl(self.__tokenizer_path)
        else:
            tmp_data = reduce(lambda x, y: x + y, data)
            self.__tokenizer_src, self.__tokenizer_tar = list(zip(*tmp_data))
            self.get_tokenizer()

        # get the data set (train or validation or test)
        data = self.__split_data(data, start_ratio, end_ratio)

        data = reduce(lambda x, y: x + y, data)

        self.gen_preprocessed_data(data, self.BATCH_SIZE_PER_FILE)
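The reduce calls above concatenate many per-document pair lists into one flat list. itertools.chain does the same without repeated list copies; a small equivalence check with made-up data:

from functools import reduce
from itertools import chain

nested = [[('zh1', 'en1'), ('zh2', 'en2')], [('zh3', 'en3')]]

flat_reduce = reduce(lambda x, y: x + y, nested)
flat_chain = list(chain.from_iterable(nested))
assert flat_reduce == flat_chain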
Example #15
    def all(self):
        if self.__has_load_all:
            return self.__X, self.__y

        print(f'Loading all data from {self.__dir_path} ...')

        for i, file_path in enumerate(self.__file_list):
            if i % 2 == 0:
                progress = float(i + 1) / self.__len_files * 100.
                print('\rprogress: %.2f%% ' % progress, end='')

            X_mask, Y = utils.load_pkl(file_path)
            self.__X = np.vstack([self.__X, X_mask]) if len(
                self.__X) else X_mask
            self.__y = np.vstack([self.__y, Y]) if len(self.__y) else Y

        print('\rprogress: 100.0%  \nFinish loading ')

        self.__has_load_all = True

        # add statistics to log
        self.__statistic()

        return self.__X, self.__y
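all() grows self.__X with np.vstack on every file, which re-copies the accumulated array each time. A common alternative (a sketch only, not taken from the original repo) is to collect the per-file arrays in a list and stack once at the end:

import numpy as np

blocks = [np.ones((2, 3)) * i for i in range(4)]   # stand-ins for the per-file X_mask arrays
X = np.vstack(blocks) if blocks else np.empty((0, 3))
print(X.shape)   # (8, 3)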
Example #16
    def __init__(self, input_days=20, force_refresh=False):
        self.__input_days = input_days
        self.__output_days = input_days // 10

        # get the path of the cache data
        data_subset = load.DATA_SUBSET_FOR_TEMPORAL_INPUT
        subset = os.path.split(data_subset)[1]
        year = load.YEAR_FOR_TEMPORAL_INPUT
        volume_level = load.VOLUME_LEVEL_FOR_TEMPORAL_INPUT
        no_below = load.NO_BELOW_FOR_TEMPORAL_INPUT
        data_index = load.DATA_INDEX_FOR_TEMPORAL_INPUT

        self.__data_pkl_path = os.path.join(
            path.PATH_TMP_DIR,
            f'temporal_input_interval_output_has_pretrain_same_volume_{subset}_{year}_{volume_level}_{data_index}_no_below_{no_below}_input_days_{input_days}.pkl'
        )

        data_all_root_dir = os.path.join(data_subset, year, volume_level)
        all_level = os.path.split(data_all_root_dir)[1]

        self.__pretrain_pkl_path = os.path.join(
            path.PATH_TMP_DIR,
            f'temporal_input_interval_output_for_pretrain_same_volume_{all_level}_{subset}_{year}_{volume_level}_dealer_83_no_below_{no_below}_input_days_{input_days}.pkl'
        )

        _, _, _, _, self.dict_pretrain, self.voc_size_pretrain = utils.load_pkl(
            self.__pretrain_pkl_path)

        if os.path.isfile(self.__data_pkl_path) and not force_refresh:
            self.__train_X, self.__train_y, self.__test_X, self.__test_y, self.dict, self.voc_size = \
                utils.load_pkl(self.__data_pkl_path)

        else:
            data_root_dir = os.path.join(data_subset, year, volume_level,
                                         data_index)

            print(f'\nStart loading data from {data_root_dir} ...')

            # load doc list
            train_doc_list = self.__load_dir(
                os.path.join(data_root_dir, 'train'))
            test_doc_list = self.__load_dir(os.path.join(
                data_root_dir, 'test'))

            print('Finish loading \n\nStart processing data ... ')

            # generate the dictionary which maps the bond_id to index
            self.dict, self.voc_size = self.__gen_dict(train_doc_list,
                                                       no_below)

            # convert doc list to trainable interval summed one-hot vector
            self.__train_X = self.__convert_input(train_doc_list, self.dict,
                                                  self.voc_size)
            self.__train_y = self.__convert_output(train_doc_list)
            self.__test_X = self.__convert_input(test_doc_list, self.dict,
                                                 self.voc_size)
            self.__test_y = self.__convert_output(test_doc_list)

            self.__train_X_pretrain = self.__convert_input(
                train_doc_list, self.dict_pretrain, self.voc_size_pretrain)
            self.__test_X_pretrain = self.__convert_input(
                test_doc_list, self.dict_pretrain, self.voc_size_pretrain)

            o_lstm = LSTM('2020_01_12_18_49_46',
                          'lstm_for_pretrain_with_same_volume', 2007)
            o_lstm.compile(0.001)
            o_lstm.load_model(
                r'D:\Github\bond_prediction\runtime\models\lstm_for_pretrain_with_same_volume\2020_01_12_18_49_46\lstm_for_pretrain_with_same_volume.030-0.0596.hdf5',
                np.zeros([1, 20, 2007]), np.zeros([1, 20, 2007]))

            self.__train_X_pretrain = o_lstm.predict(self.__train_X_pretrain)
            self.__test_X_pretrain = o_lstm.predict(self.__test_X_pretrain)

            # tile the single pretrain prediction across the 20 input days so it
            # matches the (samples, input_days, features) layout of the inputs
            self.__train_X_pretrain = np.array([
                self.__train_X_pretrain for i in range(20)
            ]).transpose([1, 0, 2])
            self.__test_X_pretrain = np.array([
                self.__test_X_pretrain for i in range(20)
            ]).transpose([1, 0, 2])

            # print(self.__train_X_pretrain.shape)
            # print(self.__test_X_pretrain.shape)

            # stack the pretrain features onto the one-hot features along the last
            # (feature) axis: move that axis to the front so np.vstack concatenates
            # it, then transpose back to (samples, days, features)
            self.__train_X = np.vstack([
                self.__train_X.transpose([2, 0, 1]),
                self.__train_X_pretrain.transpose([2, 0, 1])
            ])
            self.__test_X = np.vstack([
                self.__test_X.transpose([2, 0, 1]),
                self.__test_X_pretrain.transpose([2, 0, 1])
            ])
            self.__train_X = self.__train_X.transpose([1, 2, 0])
            self.__test_X = self.__test_X.transpose([1, 2, 0])

            # print(self.__train_X.shape)
            # print(self.__test_X.shape)

            print('Finish processing ')

            # cache data for faster processing next time
            utils.write_pkl(self.__data_pkl_path, [
                self.__train_X, self.__train_y, self.__test_X, self.__test_y,
                self.dict, self.voc_size
            ])

        self.__gen_topics_mask()

        self.__statistic()
Example #17
def gen_inputs(group_file_path,
               group_index,
               input_time_steps_list,
               output_time_steps_list,
               with_day_off=True,
               buy_sell_plan=2,
               use_volume=False,
               save_path='',
               split_ratio=0.9,
               is_train=True):
    d_dealer_index_2_group_label = utils.load_json(group_file_path)
    # d_dealer_index_2_trace_list = utils.load_pkl(os.path.join(path.ROOT_DIR, 'runtime', 'tmp_d_dealers.pkl'))

    d_dealer_for_gen_input = utils.load_pkl(
        os.path.join(path.ROOT_DIR, 'runtime',
                     'd_dealer_for_gen_input_with_no_below_50_25_10.pkl'))

    tmp_list = [
        dealer_index
        for dealer_index, group_label in d_dealer_index_2_group_label.items()
        if group_label == group_index
    ]
    print(f'len_group_member: {len(tmp_list)}')

    # get total trace list
    train_trace_list = []
    test_trace_list = []
    for dealer_index, val in d_dealer_for_gen_input.items():
        if dealer_index not in d_dealer_index_2_group_label or \
                d_dealer_index_2_group_label[dealer_index] != group_index:
            continue
        trace_list = val['trace_list']
        trace_list.sort(key=lambda x: x[-1])

        num_samples = len(trace_list) - max(input_time_steps_list) - max(
            output_time_steps_list)
        split_index = int(num_samples * split_ratio +
                          max(input_time_steps_list))
        train_trace_list += trace_list[:split_index]
        test_trace_list += trace_list[split_index:]

        # print(dealer_index, len(trace_list), trace_list)

    train_trace_list.sort(key=lambda x: x[-1])

    # get dictionary
    train_doc_list = [list(map(lambda x: x[0], train_trace_list))]
    dictionary = corpora.Dictionary(train_doc_list)
    len_bonds = len(dictionary)
    print(f'total bond num (group {group_index}): {len_bonds}')

    X = []
    X_mask = []
    Y = []

    for dealer_index, val in d_dealer_for_gen_input.items():
        if dealer_index not in d_dealer_index_2_group_label or \
                d_dealer_index_2_group_label[dealer_index] != group_index:
            continue

        # filter bonds that only appear in test set

        trace_list = val['trace_list']
        num_samples = len(trace_list) - max(input_time_steps_list) - max(
            output_time_steps_list)
        split_index = int(num_samples * split_ratio +
                          max(input_time_steps_list))

        if is_train:
            trace_list = trace_list[:split_index]
        else:
            trace_list = trace_list[split_index:]
            trace_list = [
                v for v in trace_list if dictionary.doc2idx([v[0]])[0] != -1
            ]
        trace_list.sort(key=lambda x: x[-1])

        start_date = trace_list[0][-1]
        end_date = trace_list[-1][-1]

        # Format the data in date structure
        date_matrix, date_mask, dict_date_2_input_m_index = __generate_date_structure(
            len_bonds, start_date, end_date, with_day_off, buy_sell_plan)

        # according to the transaction history, fill the data into date structure
        for i, trace in enumerate(trace_list):
            bond_id = trace[0]
            volume = trace[1]
            _date = trace[-1]
            trace_type = trace[2]
            bond_index = dictionary.doc2idx([bond_id])[0]

            value = 1 if not use_volume else np.log10(volume)
            if _date not in dict_date_2_input_m_index:
                continue

            date_mask = __change_mask(buy_sell_plan, date_mask, bond_index,
                                      len_bonds, dict_date_2_input_m_index,
                                      _date, trace_type, value)

        # sample the data
        longest_input_length = max(input_time_steps_list) + 2
        for input_time_steps in input_time_steps_list:
            for output_time_steps in output_time_steps_list:
                # input_list = __sample_list(date_matrix, input_time_steps, 0, output_time_steps,
                #                            __token(date_matrix.shape[-1]), __token(date_matrix.shape[-1]))
                input_mask_list = __sample_list(
                    date_mask,
                    input_time_steps,
                    0,
                    output_time_steps,
                    __start_token_mask(date_mask.shape[1:], with_day_off),
                    __end_token_mask(date_mask.shape[1:], with_day_off),
                    longest_len=longest_input_length)

                convert_fn = __convert_2_zero_one if buy_sell_plan in [
                    0, 2
                ] else None
                output_list = __sample_list(date_mask,
                                            output_time_steps,
                                            input_time_steps,
                                            0,
                                            convert_fn=convert_fn)

                if len(input_mask_list) != len(output_list):
                    continue

                # ...
                # X += input_list
                X_mask += input_mask_list
                Y += output_list

        # d_dealer_index_2_trace_list_ordered[dealer_index] = [date_matrix, date_mask]

    if save_path:
        del d_dealer_for_gen_input
        del d_dealer_index_2_group_label
        del X

        X_mask = np.asarray(X_mask, dtype=np.int32)
        Y = np.asarray(Y, dtype=np.int32)

        print('\n------------------------------')
        # print(len(X))
        print(X_mask.shape)
        print(Y.shape)

        len_X = len(X_mask)
        num_files = int(np.ceil(len_X / 2000.))
        for i in range(num_files):
            start_index = i * 2000
            end_index = (i + 1) * 2000
            utils.write_pkl(
                save_path + f'_{i}.pkl',
                [X_mask[start_index:end_index], Y[start_index:end_index]])

    # return d_dealer_index_2_trace_list_ordered
    return X_mask, Y
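The tail of gen_inputs writes X_mask and Y in chunks of 2000 samples per pickle file. The chunking on its own, as a sketch; write_in_chunks and the plain pickle calls are illustrative stand-ins for utils.write_pkl:

import math
import pickle


def write_in_chunks(save_prefix, X, Y, chunk_size=2000):
    # split the sample axis into chunks of at most `chunk_size` and pickle each chunk separately
    num_files = math.ceil(len(X) / chunk_size)
    for i in range(num_files):
        start, end = i * chunk_size, (i + 1) * chunk_size
        with open(f'{save_prefix}_{i}.pkl', 'wb') as f:
            pickle.dump([X[start:end], Y[start:end]], f)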
Example #18
def gen_group_according_to(file_path):
    print('loading data ...')
    dict_dealer_index_2_group = utils.load_json(file_path)

    data, d_dealers, total_volume, total_transaction_count, bound_timestamp, d_new_bonds = utils.load_pkl(
        os.path.join(path.ROOT_DIR, 'runtime', 'tmp123.pkl'))

    utils.write_pkl(
        os.path.join(path.ROOT_DIR, 'runtime', 'tmp_d_dealers.pkl'), d_dealers)
    # d_dealers = utils.load_pkl(os.path.join(path.ROOT_DIR, 'runtime', 'tmp_d_dealers.pkl'))

    labels = set(list(map(lambda x: x[1], dict_dealer_index_2_group.items())))
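    # assumes group labels are consecutive integers starting at 0, since they index group_list below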
    group_list = [{} for i in range(len(labels))]

    print('traversing data ...')

    length = len(d_dealers)
    cur = 0
    for dealer_index, trace_list in d_dealers.items():
        # show progress
        if cur % 5 == 0:
            progress = float(cur + 1) / length * 100.
            print('\rprogress: %.2f%% ' % progress, end='')
        cur += 1

        if dealer_index not in dict_dealer_index_2_group:
            continue

        group_index = dict_dealer_index_2_group[dealer_index]
        group_list[group_index][dealer_index] = trace_list

    print('\rprogress: 100.0%  \nsaving data ...')

    plan_name = os.path.splitext(os.path.split(file_path)[1])[0] + '.json'
    group_path = os.path.join(path.DATA_ROOT_DIR, 'groups', plan_name)
    utils.write_json(group_path, group_list)
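gen_group_according_to buckets each dealer's trace list under its group label. The core of that bucketing as a standalone sketch with made-up data:

dealer_to_group = {'d1': 0, 'd2': 1, 'd3': 0}
traces = {'d1': ['t1'], 'd2': ['t2'], 'd3': ['t3'], 'd4': ['t4']}

groups = [{} for _ in range(len(set(dealer_to_group.values())))]
for dealer, trace_list in traces.items():
    if dealer in dealer_to_group:                 # dealers without a label are skipped
        groups[dealer_to_group[dealer]][dealer] = trace_list

print(groups)   # [{'d1': ['t1'], 'd3': ['t3']}, {'d2': ['t2']}]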
Example #19
    def __init__(self, input_days=20, force_refresh=False):
        self.__input_days = input_days
        self.__output_days = input_days // 10
        self.__input_mode = 0

        # get the path of the cache data
        data_subset = load.DATA_SUBSET_FOR_TEMPORAL_INPUT
        subset = os.path.split(data_subset)[1]
        year = load.YEAR_FOR_TEMPORAL_INPUT
        volume_level = load.VOLUME_LEVEL_FOR_TEMPORAL_INPUT
        no_below = load.NO_BELOW_FOR_TEMPORAL_INPUT
        data_index = load.DATA_INDEX_FOR_TEMPORAL_INPUT

        data_all_root_dir = os.path.join(data_subset, year, volume_level)
        all_level = os.path.split(data_all_root_dir)[1]

        self.__data_pkl_path = os.path.join(
            path.PATH_TMP_DIR,
            f'temporal_input_interval_output_for_pretrain_same_volume_{all_level}_{subset}_{year}_{volume_level}_{data_index}_no_below_{no_below}_input_days_{input_days}.pkl'
        )

        if os.path.isfile(self.__data_pkl_path) and not force_refresh:
            self.__train_X, self.__train_y, self.__test_X, self.__test_y, self.dict, self.voc_size = \
                utils.load_pkl(self.__data_pkl_path)

        else:
            print(f'\nStart loading data from {data_all_root_dir} ...')

            train_start_timestamp = utils.date_2_timestamp('2015-01-02')
            train_end_timestamp = utils.date_2_timestamp('2015-10-14', True)

            test_start_timestamp = utils.date_2_timestamp('2015-10-14')
            test_end_timestamp = utils.date_2_timestamp('2015-12-31', True)

            data_all_pkl_path = os.path.join(
                path.PATH_TMP_DIR,
                f'all_doc_list_for_pretrain_{subset}_{year}_{all_level}.pkl')

            if os.path.isfile(data_all_pkl_path):
                train_all_doc_list, test_all_doc_list = utils.load_pkl(
                    data_all_pkl_path)

            else:
                train_all_doc_list = self.__load_dir_all(
                    data_all_root_dir, train_start_timestamp,
                    train_end_timestamp, 'train')
                test_all_doc_list = self.__load_dir_all(
                    data_all_root_dir, test_start_timestamp,
                    test_end_timestamp, 'test')

                # train_all_doc_list = []
                # test_all_doc_list = []
                # for _volume in os.listdir(data_all_root_dir):
                #     sub_all_root_dir = os.path.join(data_all_root_dir, _volume)
                #     sub_train_all_doc_list = self.__load_dir_all(sub_all_root_dir, train_start_timestamp,
                #                                                  train_end_timestamp,
                #                                                  'train')
                #     sub_test_all_doc_list = self.__load_dir_all(sub_all_root_dir, test_start_timestamp,
                #                                                 test_end_timestamp,
                #                                                 'test')
                #
                #     train_all_doc_list += sub_train_all_doc_list
                #     test_all_doc_list += sub_test_all_doc_list

                utils.write_pkl(data_all_pkl_path,
                                [train_all_doc_list, test_all_doc_list])

            print('Finish loading \n\nStart processing data ... ')

            train_all_docs = []
            for v in train_all_doc_list:
                train_all_docs += v
            test_all_docs = []
            for v in test_all_doc_list:
                test_all_docs += v
            del train_all_doc_list
            del test_all_doc_list

            self.dict, self.voc_size = self.__gen_dict(train_all_docs, 150)

            # # generate the dictionary which maps the bond_id to index
            # self.dict, self.voc_size = self.__gen_dict(train_all_doc_list, no_below)

            print(self.voc_size)
            # print(self.voc_size_all)
            print(
                'Finish generating dict\n\nStart converting input output ...')

            # convert doc list to trainable interval summed one-hot vector
            self.__train_X = self.__convert_input(train_all_docs, self.dict,
                                                  self.voc_size,
                                                  'allow_unknown')
            self.__train_y = self.__convert_output(train_all_docs)
            self.__test_X = self.__convert_input(test_all_docs, self.dict,
                                                 self.voc_size,
                                                 'allow_unknown')
            self.__test_y = self.__convert_output(test_all_docs)

            # self.__train_X = 0.
            # self.__test_X = 0.
            # self.__train_y = 0.
            # self.__test_y = 0.
            #
            # for doc_list in train_all_doc_list:
            #     self.__train_X += self.__convert_input(doc_list, self.dict, self.voc_size, 'allow_unknown')
            #     self.__train_y += self.__convert_output(doc_list)
            # self.__train_X /= len(train_all_doc_list)
            #
            # for doc_list in test_all_doc_list:
            #     self.__test_X += self.__convert_input(doc_list, self.dict, self.voc_size, 'allow_unknown')
            #     self.__test_y += self.__convert_output(doc_list)
            # self.__test_X /= len(test_all_doc_list)

            # self.__train_all_X = np.array(
            #     list(map(lambda x: self.__convert_input(x, self.dict_all, self.voc_size_all, 'allow_unknown'),
            #              train_all_doc_list)))
            # self.__train_all_y = np.array(list(map(self.__convert_output, train_all_doc_list)))
            # self.__test_all_X = np.array(
            #     list(map(lambda x: self.__convert_input(x, self.dict_all, self.voc_size_all, 'allow_unknown'),
            #              test_all_doc_list)))
            # self.__test_all_y = np.array(list(map(self.__convert_output, test_all_doc_list)))

            # self.__train_all_X = np.mean(self.__train_all_X, axis=0)
            # self.__train_all_y = np.mean(self.__train_all_y, axis=0)
            # self.__test_all_X = np.mean(self.__test_all_X, axis=0)
            # self.__test_all_y = np.mean(self.__test_all_y, axis=0)

            # # convert doc list to trainable interval summed one-hot vector
            # self.__train_X = self.__convert_input(train_doc_list, self.dict, self.voc_size, 'allow_unknown')
            # self.__train_y = self.__convert_output(train_doc_list)
            # self.__test_X = self.__convert_input(test_doc_list, self.dict, self.voc_size, 'allow_unknown')
            # self.__test_y = self.__convert_output(test_doc_list)

            # self.__train_X = np.vstack([self.__train_X.transpose([2, 0, 1]), self.__train_all_X.transpose([2, 0, 1])])
            # self.__test_X = np.vstack([self.__test_X.transpose([2, 0, 1]), self.__test_all_X.transpose([2, 0, 1])])
            # self.__train_X = self.__train_X.transpose([1, 2, 0])
            # self.__test_X = self.__test_X.transpose([1, 2, 0])

            print('Finish processing ')

            # cache data for faster processing next time
            utils.write_pkl(self.__data_pkl_path, [
                self.__train_X, self.__train_y, self.__test_X, self.__test_y,
                self.dict, self.voc_size
            ])

        self.__gen_topics_mask()

        self.__statistic()
Example #20
#         _idx = total_dictionary.doc2idx([bond_id])[0]
#         if tmp_date not in dict_date_2_input_index or _idx == -1:
#             continue
#         tmp_inputs[dict_date_2_input_index[tmp_date]][_idx] = 1
#     d_dealer_index_2_input[dealer_index] = tmp_inputs
#
# for v in origin_l_dealers:
#     dealer_index = v[0]
#     v.append(d_dealer_index_2_input[dealer_index])

# --------------------------------

print('Loading variables ...')

# utils.write_pkl(os.path.join(path.ROOT_DIR, 'runtime', 'tmp_origin_l_dealers.pkl'), origin_l_dealers)
origin_l_dealers = utils.load_pkl(
    os.path.join(path.ROOT_DIR, 'runtime', 'tmp_origin_l_dealers.pkl'))

print('Converting ...')

new_l_dealers = []
for v in origin_l_dealers:
    if v[-3] <= 5:
        continue
    new_l_dealers.append(v)

origin_l_dealers = new_l_dealers[:240]
l_dealers = list(
    map(
        lambda x:
        # [x[0], np.log(x[4]), np.log10(x[5] + 1.1), x[6], x[7], x[8]],
        [
Example #21
import os
import numpy as np
from config import path
from lib import utils

path_d_issue_id_offering_date = os.path.join(
    path.DATA_ROOT_DIR, 'dict_issue_id_offering_date.json')
dict_issue_id_offering_date = utils.load_json(path_d_issue_id_offering_date)

path_pkl_2015 = os.path.join(path.TRACE_DIR, 'finra_trace_2015.pkl')
data = utils.load_pkl(path_pkl_2015)

print('\nstart converting data ...')
data = np.array(data)
data = list(map(lambda x: {'bond_id': x[0], 'issue_id': x[16]}, data))
print('finish converting ')

dict_skip_bond = {}

dict_bond_id_offering_date = {}
for v in data:
    bond_id = v['bond_id']
    issue_id = str(int(v['issue_id']))

    if issue_id not in dict_issue_id_offering_date:
        dict_skip_bond[bond_id] = True
        print('------------------------')
        print(bond_id, issue_id)
        continue

    offering_date = dict_issue_id_offering_date[issue_id]