def gen_preprocessed_data(self, data, batch_size):
    length = len(data)
    num_batch = int(math.ceil(length / batch_size))
    steps = int(num_batch * self.__sample_rate)

    print(f'\nstart generating preprocessed data ({steps} files) ... ')

    for i in range(steps):
        # show progress
        if i % 10 == 0:
            progress = float(i + 1) / steps * 100.
            print('\rprogress: %.2f%% ' % progress, end='')

        # get a batch
        index_of_batch = i % num_batch
        index_start = int(index_of_batch * batch_size)
        index_end = index_start + batch_size
        batch_src, batch_tar = list(zip(*data[index_start:index_end]))

        # preprocess data
        batch_x, batch_y, _, _ = utils.pipeline(
            self.__encoder_pl, batch_src, batch_tar,
            {**self.__data_params, 'tokenizer': self.__tokenizer},
            verbose=i == 0)

        # save data to file
        file_path = os.path.join(self.__processed_dir_path, f'batch_{i}.pkl')
        write_pkl(file_path, [batch_x, batch_y])

    print('finish generating preprocessed data ')
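# The write_pkl / load_pkl helpers used throughout these snippets are not
# shown. Below is a minimal sketch of what they are assumed to do (thin
# wrappers around the standard pickle module); the project's real helpers
# may differ.
import os
import pickle


def write_pkl(file_path, data):
    # serialize `data` to `file_path`, creating the parent directory if needed
    os.makedirs(os.path.dirname(file_path) or '.', exist_ok=True)
    with open(file_path, 'wb') as f:
        pickle.dump(data, f)


def load_pkl(file_path):
    # deserialize and return whatever object was pickled at `file_path`
    with open(file_path, 'rb') as f:
        return pickle.load(f)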
def __convert(self, docs, pkl_path):
    """ Convert the doc list to a trainable data format. """
    # map tokens to indices; the modulo maps gensim's -1 (out-of-vocabulary)
    # to voc_size - 1 instead of leaving a negative index
    docs = list(map(
        lambda x: list((np.array(self.dict.doc2idx(x)) + self.voc_size) % self.voc_size)
        if x else x,
        docs))

    X = []
    y = []

    for doc in docs:
        len_doc = len(doc)
        len_data = len_doc - (2 * self.__window + 1) + 1
        if len_data <= 0:
            continue

        # sliding window: the input is the `window` tokens on each side of a
        # center token, and the label is the center token itself
        X += [
            doc[j: j + self.__window] +
            doc[j + self.__window + 1: j + 2 * self.__window + 1]
            for j in range(len_data)
        ]
        y += [doc[j + self.__window] for j in range(len_data)]

    X = np.array(X)
    y = np.array(y)
    X, y = self.__shuffle(X, y)

    # cache data for faster processing next time
    write_pkl(pkl_path, [X, y, self.dict, self.voc_size])
    return X, y
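# A toy illustration of the sliding-window extraction performed by __convert
# (not part of the original code): with window = 2, each sample's input is
# the 2 tokens on each side of a center token, and the label is the center
# token itself.
window = 2
doc = [10, 11, 12, 13, 14, 15]
len_data = len(doc) - (2 * window + 1) + 1  # number of full windows: 2

X = [doc[j:j + window] + doc[j + window + 1:j + 2 * window + 1]
     for j in range(len_data)]
y = [doc[j + window] for j in range(len_data)]

print(X)  # [[10, 11, 13, 14], [11, 12, 14, 15]]
print(y)  # [12, 13]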
def __init__(self, input_days=20):
    self.__input_days = input_days
    self.__output_days = input_days // 10

    # get the path of the cache data
    data_subset = load.DATA_SUBSET_FOR_TEMPORAL_INPUT
    subset = os.path.split(data_subset)[1]
    year = load.YEAR_FOR_TEMPORAL_INPUT
    volume_level = load.VOLUME_LEVEL_FOR_TEMPORAL_INPUT
    no_below = load.NO_BELOW_FOR_TEMPORAL_INPUT
    data_index = load.DATA_INDEX_FOR_TEMPORAL_INPUT

    self.__data_pkl_path = os.path.join(
        path.PATH_TMP_DIR,
        f'temporal_input_interval_output_emb_{subset}_{year}_{volume_level}_{data_index}'
        f'_no_below_{no_below}_input_days_{input_days}.pkl')
    self.__test_emb_pkl_path = os.path.join(
        path.PATH_TMP_DIR, 'emb_test_data_w_3_no_below_1000.pkl')

    if os.path.isfile(self.__data_pkl_path):
        self.__train_X, self.__train_y, self.__test_X, self.__test_y, \
            self.emb_dict, self.emb_voc_size, self.dict, self.voc_size = \
            utils.load_pkl(self.__data_pkl_path)
    else:
        print('\nStart loading embedding model ... ')
        self.__load_emb_model()
        print('Finish loading embedding model ')

        _, _, self.emb_dict, self.emb_voc_size = utils.load_pkl(self.__test_emb_pkl_path)

        data_root_dir = os.path.join(data_subset, year, volume_level, data_index)
        print(f'\nStart loading data from {data_root_dir} ...')

        # load doc list
        train_doc_list = self.__load_dir(os.path.join(data_root_dir, 'train'))
        test_doc_list = self.__load_dir(os.path.join(data_root_dir, 'test'))

        print('Finish loading \n\nStart generating dict for output ... ')

        # generate the dictionary which maps the bond_id to index
        self.dict, self.voc_size = self.__gen_dict(train_doc_list)

        print('Finish generating\n\nStart converting data ...')

        # convert doc list to trainable interval summed one-hot vectors
        self.__train_X = self.__convert_input(train_doc_list)
        self.__train_y = self.__convert_output(train_doc_list)
        self.__test_X = self.__convert_input(test_doc_list)
        self.__test_y = self.__convert_output(test_doc_list)

        print('Finish processing ')

        # cache data for faster processing next time
        utils.write_pkl(self.__data_pkl_path, [
            self.__train_X, self.__train_y, self.__test_X, self.__test_y,
            self.emb_dict, self.emb_voc_size, self.dict, self.voc_size])

    self.__statistic()
def __init__(self, input_days=20, force_refresh=False):
    self.__input_days = input_days
    self.__output_days = input_days // 10

    # get the path of the cache data
    data_subset = load.DATA_SUBSET_FOR_DEALER_PRED
    subset = os.path.split(data_subset)[1]
    year = load.YEAR_FOR_DEALER_PRED
    volume_level = load.VOLUME_LEVEL_FOR_DEALER_PRED
    no_below = load.NO_BELOW_FOR_DEALER_PRED
    data_index = load.DATA_INDEX_FOR_DEALER_PRED

    self.__data_pkl_path = os.path.join(
        path.PATH_TMP_DIR,
        f'temporal_input_interval_output_{subset}_{year}_{volume_level}_{data_index}'
        f'_no_below_{no_below}_input_days_{input_days}.pkl')

    if os.path.isfile(self.__data_pkl_path) and not force_refresh:
        self.__train_X, self.__train_y, self.__test_X, self.__test_y, self.dict, self.voc_size = \
            utils.load_pkl(self.__data_pkl_path)
    else:
        data_root_dir = os.path.join(data_subset, year, volume_level, data_index)
        print(f'\nStart loading data from {data_root_dir} ...')

        # load doc list
        train_doc_list = self.__load_dir(os.path.join(data_root_dir, 'train'))
        test_doc_list = self.__load_dir(os.path.join(data_root_dir, 'test'))

        print('Finish loading \n\nStart processing data ... ')

        # generate the dictionary which maps the bond_id to index
        self.dict, self.voc_size = self.__gen_dict(train_doc_list, no_below)

        # convert doc list to trainable interval summed one-hot vectors
        self.__train_X = self.__convert_input(train_doc_list)
        self.__train_y = self.__convert_output(train_doc_list)
        self.__test_X = self.__convert_input(test_doc_list)
        self.__test_y = self.__convert_output(test_doc_list)

        print('Finish processing ')

        # cache data for faster processing next time
        utils.write_pkl(self.__data_pkl_path, [
            self.__train_X, self.__train_y, self.__test_X, self.__test_y,
            self.dict, self.voc_size])

    self.__gen_topics_mask()
    self.__statistic()
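# __gen_dict is not shown in these snippets. A plausible sketch using gensim
# (an assumption; the real implementation may differ): build a Dictionary
# over the training docs and drop bonds that appear in fewer than `no_below`
# documents.
from gensim import corpora


def gen_dict(doc_list, no_below):
    # doc_list is a list of documents, each a list of bond_id tokens
    dictionary = corpora.Dictionary(doc_list)
    # keep_n=None keeps all tokens that survive the no_below filter
    dictionary.filter_extremes(no_below=no_below, no_above=1.0, keep_n=None)
    return dictionary, len(dictionary)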
def get_tokenizer(self):
    print('\nstart training tokenizer ... ')
    self.__tokenizer = utils.pipeline(
        self.__tokenizer_pl,
        self.__tokenizer_src,
        self.__tokenizer_tar,
        self.__data_params,
    )

    # free the training corpus once the tokenizer is fitted
    del self.__tokenizer_src
    del self.__tokenizer_tar
    print('finish training tokenizer')

    # save the tokenizer to file
    write_pkl(self.__tokenizer_path, self.__tokenizer)
    return self.__tokenizer
def __init__(self, input_days=20):
    self.__input_days = input_days
    self.__output_days = input_days // 10

    # get the path of the cache data
    subset = os.path.split(load.DATA_SUBSET)[1]
    self.__data_pkl_path = os.path.join(
        path.PATH_TMP_DIR,
        f'interval_input_output_{subset}_{load.YEAR}_{load.VOLUME_LEVEL}_{load.DATA_INDEX}'
        f'_no_below_{load.NO_BELOW}_input_days_{input_days}.pkl')

    if os.path.isfile(self.__data_pkl_path):
        self.__train_data, self.__test_data, self.dict, self.voc_size = \
            utils.load_pkl(self.__data_pkl_path)
    else:
        data_root_dir = os.path.join(load.DATA_SUBSET, load.YEAR,
                                     load.VOLUME_LEVEL, load.DATA_INDEX)
        print(f'\nStart loading data from {data_root_dir} ...')

        # load doc list
        train_doc_list = self.__load_dir(os.path.join(data_root_dir, 'train'))
        test_doc_list = self.__load_dir(os.path.join(data_root_dir, 'test'))

        print('Finish loading \n\nStart processing data ... ')

        # generate the dictionary which maps the bond_id to index
        self.dict, self.voc_size = self.__gen_dict(train_doc_list)

        # convert doc list to trainable interval summed one-hot vectors
        self.__train_data = self.__convert(train_doc_list)
        self.__test_data = self.__convert(test_doc_list)

        print('Finish processing ')

        # cache data for faster processing next time
        utils.write_pkl(self.__data_pkl_path, [
            self.__train_data, self.__test_data, self.dict, self.voc_size])

    self.__gen_topics_mask()
    self.__statistic()
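# The constructors above all follow the same cache-or-rebuild pattern. A
# minimal standalone sketch of that pattern (names here are illustrative,
# not from the original code):
import os
import pickle


def cached(pkl_path, build_fn, force_refresh=False):
    # return the pickled result if it exists, otherwise build and cache it
    if os.path.isfile(pkl_path) and not force_refresh:
        with open(pkl_path, 'rb') as f:
            return pickle.load(f)
    result = build_fn()
    with open(pkl_path, 'wb') as f:
        pickle.dump(result, f)
    return result


# usage: data = cached('/tmp/example.pkl', lambda: expensive_build())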
def gen_data(self, data, batch_size):
    length = len(data)
    num_batch = int(math.ceil(length / batch_size))

    print(f'\nstart generating preprocessed data ({num_batch} files) ... ')

    for i in range(num_batch):
        # show progress
        if i % 10 == 0:
            progress = float(i + 1) / num_batch * 100.
            print('\rprogress: %.2f%% ' % progress, end='')

        # get a batch (i % num_batch equals i here, since i ranges over num_batch)
        index_of_batch = i % num_batch
        index_start = int(index_of_batch * batch_size)
        index_end = index_start + batch_size
        batch_src, batch_tar = list(zip(*data[index_start:index_end]))

        # save data to file
        file_path = os.path.join(self.__processed_dir_path, f'batch_{i}.pkl')
        write_pkl(file_path, [batch_src, batch_tar])

    print('finish generating preprocessed data ')
def gen_group_according_to(file_path):
    print('loading data ...')
    dict_dealer_index_2_group = utils.load_json(file_path)
    data, d_dealers, total_volume, total_transaction_count, bound_timestamp, d_new_bonds = \
        utils.load_pkl(os.path.join(path.ROOT_DIR, 'runtime', 'tmp123.pkl'))
    utils.write_pkl(os.path.join(path.ROOT_DIR, 'runtime', 'tmp_d_dealers.pkl'), d_dealers)
    # d_dealers = utils.load_pkl(os.path.join(path.ROOT_DIR, 'runtime', 'tmp_d_dealers.pkl'))

    labels = set(map(lambda x: x[1], dict_dealer_index_2_group.items()))
    group_list = [{} for _ in range(len(labels))]

    print('traversing data ...')

    length = len(d_dealers)
    cur = 0
    for dealer_index, trace_list in d_dealers.items():
        # show progress
        if cur % 5 == 0:
            progress = float(cur + 1) / length * 100.
            print('\rprogress: %.2f%% ' % progress, end='')
        cur += 1

        if dealer_index not in dict_dealer_index_2_group:
            continue

        group_index = dict_dealer_index_2_group[dealer_index]
        group_list[group_index][dealer_index] = trace_list

    print('\rprogress: 100.0% \nsaving data ...')

    plan_name = os.path.splitext(os.path.split(file_path)[1])[0] + '.json'
    group_path = os.path.join(path.DATA_ROOT_DIR, 'groups', plan_name)
    utils.write_json(group_path, group_list)
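# A toy illustration of the grouping step above (illustrative data only):
# dealers are bucketed into one dict per group label, and dealers without a
# label are skipped.
dict_dealer_index_2_group = {'d1': 0, 'd2': 1, 'd3': 0}
d_dealers = {'d1': ['t1'], 'd2': ['t2'], 'd3': ['t3'], 'd4': ['t4']}

labels = set(dict_dealer_index_2_group.values())
group_list = [{} for _ in range(len(labels))]

for dealer_index, trace_list in d_dealers.items():
    if dealer_index not in dict_dealer_index_2_group:
        continue  # 'd4' has no group label and is skipped
    group_list[dict_dealer_index_2_group[dealer_index]][dealer_index] = trace_list

print(group_list)  # [{'d1': ['t1'], 'd3': ['t3']}, {'d2': ['t2']}]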
        'total_transaction_count': tmp_trace_count,
        'total_volume': tmp_volume,
        'dictionary': tmp_dictionary,
        'no_below_transaction_count': new_l[-1][0],
        'no_below_volume': new_l[-1][1],
        'no_below_num_bonds': new_l[-1][2],
        'trace_list': tmp_trace_list,
    }

    # skip dealers with no qualifying transactions or too few bonds
    if new_l[-1][0] == 0 or new_l[-1][2] <= 5:
        continue
    l_dealers.append(new_l)

l_dealers.sort(key=lambda x: -x[1])

print(f'len of d_dealer_for_gen_input: {len(d_dealer_for_gen_input)}')
utils.write_pkl(
    os.path.join(path.ROOT_DIR, 'runtime',
                 'd_dealer_for_gen_input_with_no_below_50_25_10.pkl'),
    d_dealer_for_gen_input)

# print('done')
# exit()

# CSV header for the per-dealer coverage statistics computed below
string = 'num_of_dealers,dealers_total_transaction_count,dealers_total_transaction_count(percentage),'
string += 'dealer_total_volume,dealer_total_volume(percentage),'
string += 'num_of_old_bonds,num_of_old_bonds(percentage),'
string += 'dealers_total_transaction_count_2,dealers_total_transaction_count_2(percentage),'
string += 'dealer_total_volume_2,dealer_total_volume_2(percentage),'
string += 'num_of_old_bonds_2,num_of_old_bonds_2(percentage)\n'

for num_of_dealers in range(20, 270, 20):
    tmp_dealers = l_dealers[:num_of_dealers]
    dict_first_num_dealers = {}
def __init__(self, input_days=20, force_refresh=False):
    self.__input_days = input_days
    self.__output_days = input_days // 10
    self.__input_mode = 0

    # get the path of the cache data
    data_subset = load.DATA_SUBSET_FOR_TEMPORAL_INPUT
    subset = os.path.split(data_subset)[1]
    year = load.YEAR_FOR_TEMPORAL_INPUT
    volume_level = load.VOLUME_LEVEL_FOR_TEMPORAL_INPUT
    no_below = load.NO_BELOW_FOR_TEMPORAL_INPUT
    data_index = load.DATA_INDEX_FOR_TEMPORAL_INPUT

    data_all_root_dir = os.path.join(data_subset, year, volume_level)
    all_level = os.path.split(data_all_root_dir)[1]

    self.__data_pkl_path = os.path.join(
        path.PATH_TMP_DIR,
        f'temporal_input_interval_output_for_pretrain_same_volume_{all_level}_{subset}_{year}'
        f'_{volume_level}_{data_index}_no_below_{no_below}_input_days_{input_days}.pkl')

    if os.path.isfile(self.__data_pkl_path) and not force_refresh:
        self.__train_X, self.__train_y, self.__test_X, self.__test_y, self.dict, self.voc_size = \
            utils.load_pkl(self.__data_pkl_path)
    else:
        print(f'\nStart loading data from {data_all_root_dir} ...')

        train_start_timestamp = utils.date_2_timestamp('2015-01-02')
        train_end_timestamp = utils.date_2_timestamp('2015-10-14', True)
        test_start_timestamp = utils.date_2_timestamp('2015-10-14')
        test_end_timestamp = utils.date_2_timestamp('2015-12-31', True)

        data_all_pkl_path = os.path.join(
            path.PATH_TMP_DIR,
            f'all_doc_list_for_pretrain_{subset}_{year}_{all_level}.pkl')
        if os.path.isfile(data_all_pkl_path):
            train_all_doc_list, test_all_doc_list = utils.load_pkl(data_all_pkl_path)
        else:
            train_all_doc_list = self.__load_dir_all(
                data_all_root_dir, train_start_timestamp, train_end_timestamp, 'train')
            test_all_doc_list = self.__load_dir_all(
                data_all_root_dir, test_start_timestamp, test_end_timestamp, 'test')
            utils.write_pkl(data_all_pkl_path, [train_all_doc_list, test_all_doc_list])

        print('Finish loading \n\nStart processing data ... ')

        # flatten the per-dealer doc lists into single doc lists
        train_all_docs = []
        for v in train_all_doc_list:
            train_all_docs += v
        test_all_docs = []
        for v in test_all_doc_list:
            test_all_docs += v

        del train_all_doc_list
        del test_all_doc_list

        # generate the dictionary which maps the bond_id to index
        # (note: the pooled dictionary uses a hard-coded no_below of 150)
        self.dict, self.voc_size = self.__gen_dict(train_all_docs, 150)
        print(self.voc_size)

        print('Finish generating dict\n\nStart converting input output ...')

        # convert doc list to trainable interval summed one-hot vectors
        self.__train_X = self.__convert_input(train_all_docs, self.dict,
                                              self.voc_size, 'allow_unknown')
        self.__train_y = self.__convert_output(train_all_docs)
        self.__test_X = self.__convert_input(test_all_docs, self.dict,
                                             self.voc_size, 'allow_unknown')
        self.__test_y = self.__convert_output(test_all_docs)

        print('Finish processing ')

        # cache data for faster processing next time
        utils.write_pkl(self.__data_pkl_path, [
            self.__train_X, self.__train_y, self.__test_X, self.__test_y,
            self.dict, self.voc_size])

    self.__gen_topics_mask()
    self.__statistic()
def gen_inputs(group_file_path, group_index, input_time_steps_list, output_time_steps_list,
               with_day_off=True, buy_sell_plan=2, use_volume=False, save_path='',
               split_ratio=0.9, is_train=True):
    d_dealer_index_2_group_label = utils.load_json(group_file_path)
    # d_dealer_index_2_trace_list = utils.load_pkl(os.path.join(path.ROOT_DIR, 'runtime', 'tmp_d_dealers.pkl'))
    d_dealer_for_gen_input = utils.load_pkl(
        os.path.join(path.ROOT_DIR, 'runtime',
                     'd_dealer_for_gen_input_with_no_below_50_25_10.pkl'))

    tmp_list = [
        dealer_index
        for dealer_index, group_label in d_dealer_index_2_group_label.items()
        if group_label == group_index
    ]
    print(f'len_group_member: {len(tmp_list)}')

    # get total trace list
    train_trace_list = []
    test_trace_list = []
    for dealer_index, val in d_dealer_for_gen_input.items():
        if dealer_index not in d_dealer_index_2_group_label or \
                d_dealer_index_2_group_label[dealer_index] != group_index:
            continue

        trace_list = val['trace_list']
        trace_list.sort(key=lambda x: x[-1])

        num_samples = len(trace_list) - max(input_time_steps_list) - max(output_time_steps_list)
        split_index = int(num_samples * split_ratio + max(input_time_steps_list))
        train_trace_list += trace_list[:split_index]
        test_trace_list += trace_list[split_index:]

    train_trace_list.sort(key=lambda x: x[-1])

    # get dictionary
    train_doc_list = [list(map(lambda x: x[0], train_trace_list))]
    dictionary = corpora.Dictionary(train_doc_list)
    len_bonds = len(dictionary)
    print(f'total bond num (group {group_index}): {len_bonds}')

    X = []
    X_mask = []
    Y = []

    for dealer_index, val in d_dealer_for_gen_input.items():
        if dealer_index not in d_dealer_index_2_group_label or \
                d_dealer_index_2_group_label[dealer_index] != group_index:
            continue

        trace_list = val['trace_list']
        num_samples = len(trace_list) - max(input_time_steps_list) - max(output_time_steps_list)
        split_index = int(num_samples * split_ratio + max(input_time_steps_list))
        if is_train:
            trace_list = trace_list[:split_index]
        else:
            trace_list = trace_list[split_index:]

        # filter bonds that only appear in the test set
        trace_list = [v for v in trace_list if dictionary.doc2idx([v[0]])[0] != -1]
        trace_list.sort(key=lambda x: x[-1])

        start_date = trace_list[0][-1]
        end_date = trace_list[-1][-1]

        # format the data in date structure
        date_matrix, date_mask, dict_date_2_input_m_index = __generate_date_structure(
            len_bonds, start_date, end_date, with_day_off, buy_sell_plan)

        # according to the transaction history, fill the data into the date structure
        for i, trace in enumerate(trace_list):
            bond_id = trace[0]
            volume = trace[1]
            _date = trace[-1]
            trace_type = trace[2]

            bond_index = dictionary.doc2idx([bond_id])[0]
            value = 1 if not use_volume else np.log10(volume)

            if _date not in dict_date_2_input_m_index:
                continue

            date_mask = __change_mask(buy_sell_plan, date_mask, bond_index, len_bonds,
                                      dict_date_2_input_m_index, _date, trace_type, value)

        # sample the data
        longest_input_length = max(input_time_steps_list) + 2

        for input_time_steps in input_time_steps_list:
            for output_time_steps in output_time_steps_list:
                # input_list = __sample_list(date_matrix, input_time_steps, 0, output_time_steps,
                #                            __token(date_matrix.shape[-1]), __token(date_matrix.shape[-1]))
                input_mask_list = __sample_list(
                    date_mask, input_time_steps, 0, output_time_steps,
                    __start_token_mask(date_mask.shape[1:], with_day_off),
                    __end_token_mask(date_mask.shape[1:], with_day_off),
                    longest_len=longest_input_length)

                convert_fn = __convert_2_zero_one if buy_sell_plan in [0, 2] else None
                output_list = __sample_list(
                    date_mask, output_time_steps, input_time_steps, 0,
                    convert_fn=convert_fn)

                if len(input_mask_list) != len(output_list):
                    continue

                # ...
                # X += input_list
                X_mask += input_mask_list
                Y += output_list

        # d_dealer_index_2_trace_list_ordered[dealer_index] = [date_matrix, date_mask]

    if save_path:
        del d_dealer_for_gen_input
        del d_dealer_index_2_group_label
        del X

        X_mask = np.asarray(X_mask, dtype=np.int32)
        Y = np.asarray(Y, dtype=np.int32)

        print('\n------------------------------')
        print(X_mask.shape)
        print(Y.shape)

        # write the samples to disk in chunks of 2000
        len_X = len(X_mask)
        num_files = int(np.ceil(len_X / 2000.))
        for i in range(num_files):
            start_index = i * 2000
            end_index = (i + 1) * 2000
            utils.write_pkl(save_path + f'_{i}.pkl',
                            [X_mask[start_index:end_index], Y[start_index:end_index]])

    # return d_dealer_index_2_trace_list_ordered
    return X_mask, Y
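# Hypothetical invocation of gen_inputs (the group file path and parameter
# values here are made up for illustration; __sample_list, __change_mask,
# __generate_date_structure, etc. must be defined in the same module):
#
# X_mask, Y = gen_inputs(
#     group_file_path=os.path.join(path.DATA_ROOT_DIR, 'groups', 'plan.json'),
#     group_index=0,
#     input_time_steps_list=[10],
#     output_time_steps_list=[2],
#     buy_sell_plan=2,
#     save_path=os.path.join(path.ROOT_DIR, 'runtime', 'group_0'),
#     is_train=True)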
def __init__(self, input_days=20, force_refresh=False):
    self.__input_days = input_days
    self.__output_days = input_days // 10

    # get the path of the cache data
    data_subset = load.DATA_SUBSET_FOR_TEMPORAL_INPUT
    subset = os.path.split(data_subset)[1]
    year = load.YEAR_FOR_TEMPORAL_INPUT
    volume_level = load.VOLUME_LEVEL_FOR_TEMPORAL_INPUT
    no_below = load.NO_BELOW_FOR_TEMPORAL_INPUT
    data_index = load.DATA_INDEX_FOR_TEMPORAL_INPUT

    self.__data_pkl_path = os.path.join(
        path.PATH_TMP_DIR,
        f'temporal_input_interval_output_has_pretrain_same_volume_{subset}_{year}'
        f'_{volume_level}_{data_index}_no_below_{no_below}_input_days_{input_days}.pkl')

    data_all_root_dir = os.path.join(data_subset, year, volume_level)
    all_level = os.path.split(data_all_root_dir)[1]
    self.__pretrain_pkl_path = os.path.join(
        path.PATH_TMP_DIR,
        f'temporal_input_interval_output_for_pretrain_same_volume_{all_level}_{subset}_{year}'
        f'_{volume_level}_dealer_83_no_below_{no_below}_input_days_{input_days}.pkl')

    _, _, _, _, self.dict_pretrain, self.voc_size_pretrain = utils.load_pkl(
        self.__pretrain_pkl_path)

    if os.path.isfile(self.__data_pkl_path) and not force_refresh:
        self.__train_X, self.__train_y, self.__test_X, self.__test_y, self.dict, self.voc_size = \
            utils.load_pkl(self.__data_pkl_path)
    else:
        data_root_dir = os.path.join(data_subset, year, volume_level, data_index)
        print(f'\nStart loading data from {data_root_dir} ...')

        # load doc list
        train_doc_list = self.__load_dir(os.path.join(data_root_dir, 'train'))
        test_doc_list = self.__load_dir(os.path.join(data_root_dir, 'test'))

        print('Finish loading \n\nStart processing data ... ')

        # generate the dictionary which maps the bond_id to index
        self.dict, self.voc_size = self.__gen_dict(train_doc_list, no_below)

        # convert doc list to trainable interval summed one-hot vectors
        self.__train_X = self.__convert_input(train_doc_list, self.dict, self.voc_size)
        self.__train_y = self.__convert_output(train_doc_list)
        self.__test_X = self.__convert_input(test_doc_list, self.dict, self.voc_size)
        self.__test_y = self.__convert_output(test_doc_list)

        # run the same docs through the pretrained vocabulary ...
        self.__train_X_pretrain = self.__convert_input(
            train_doc_list, self.dict_pretrain, self.voc_size_pretrain)
        self.__test_X_pretrain = self.__convert_input(
            test_doc_list, self.dict_pretrain, self.voc_size_pretrain)

        # ... and encode them with the pretrained LSTM
        o_lstm = LSTM('2020_01_12_18_49_46', 'lstm_for_pretrain_with_same_volume', 2007)
        o_lstm.compile(0.001)
        o_lstm.load_model(
            r'D:\Github\bond_prediction\runtime\models\lstm_for_pretrain_with_same_volume\2020_01_12_18_49_46\lstm_for_pretrain_with_same_volume.030-0.0596.hdf5',
            np.zeros([1, 20, 2007]), np.zeros([1, 20, 2007]))

        self.__train_X_pretrain = o_lstm.predict(self.__train_X_pretrain)
        self.__test_X_pretrain = o_lstm.predict(self.__test_X_pretrain)

        # repeat the pretrained encoding along the time axis so it can be
        # concatenated with the temporal features
        self.__train_X_pretrain = np.array(
            [self.__train_X_pretrain for i in range(20)]).transpose([1, 0, 2])
        self.__test_X_pretrain = np.array(
            [self.__test_X_pretrain for i in range(20)]).transpose([1, 0, 2])

        # concatenate the two inputs along the feature dimension
        self.__train_X = np.vstack([
            self.__train_X.transpose([2, 0, 1]),
            self.__train_X_pretrain.transpose([2, 0, 1])
        ])
        self.__test_X = np.vstack([
            self.__test_X.transpose([2, 0, 1]),
            self.__test_X_pretrain.transpose([2, 0, 1])
        ])
        self.__train_X = self.__train_X.transpose([1, 2, 0])
        self.__test_X = self.__test_X.transpose([1, 2, 0])

        print('Finish processing ')

        # cache data for faster processing next time
        utils.write_pkl(self.__data_pkl_path, [
            self.__train_X, self.__train_y, self.__test_X, self.__test_y,
            self.dict, self.voc_size])

    self.__gen_topics_mask()
    self.__statistic()
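# The transpose/vstack/transpose sequence above concatenates two
# (batch, time, features) arrays along the feature axis. A small numpy
# check of the trick (shapes here are illustrative):
import numpy as np

a = np.zeros([4, 20, 7])   # e.g. (batch, time, voc_size)
b = np.ones([4, 20, 3])    # e.g. (batch, time, pretrain_dim)

merged = np.vstack([a.transpose([2, 0, 1]), b.transpose([2, 0, 1])])
merged = merged.transpose([1, 2, 0])

print(merged.shape)  # (4, 20, 10)
assert np.array_equal(merged, np.concatenate([a, b], axis=-1))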