def preprocess(self):
    """ preprocess the data to list of lists of token idx """
    print('\nProcessing data ... ')

    # preprocess train data
    self.train_x, self.train_y, self.train_lan_x, self.train_lan_y = utils.pipeline(
        self.M.encode_pl,
        self.train_src,
        self.train_tar,
        {**self.M.data_params, 'tokenizer': self.tokenizer},
    )

    # preprocess test data
    self.test_x, self.test_y, self.test_lan_x, self.test_lan_y = utils.pipeline(
        self.M.encode_pl,
        self.test_src,
        self.test_tar,
        {**self.M.data_params, 'tokenizer': self.tokenizer},
    )

    # get vocabulary size
    self.vocab_size = self.tokenizer.vocab_size

    # release storage
    del self.train_src
    del self.train_tar
    del self.test_src
    del self.test_tar

    print('\nFinish preprocessing ')
def __preprocess(self):
    """ preprocess the data to list of lists of token idx """
    print('\nProcessing data ... ')

    # load the tokenizer from a given directory
    if self.tokenizer_dir:
        self.__src_tokenizer = load_pkl(
            get_file_path(data_dir, 'tokenizer', self.tokenizer_dir, 'tokenizer.pkl'))
        self.__tar_tokenizer = self.__src_tokenizer

    # load the tokenizer cached together with a checkpoint
    elif self.M.checkpoint_params['load_model']:
        load_model_params = self.M.checkpoint_params['load_model']
        tokenizer_path = create_dir_in_root('runtime', 'tokenizer',
                                            load_model_params[0], load_model_params[1],
                                            'tokenizer.pkl')
        self.__src_tokenizer = self.__tar_tokenizer = read_cache(tokenizer_path)

    # otherwise, train a new tokenizer
    else:
        self.__src_tokenizer = utils.pipeline(
            self.M.tokenizer_pl,
            self.__tokenizer_data_src,
            self.__tokenizer_data_tar,
            self.M.data_params,
        )
        self.__tar_tokenizer = self.__src_tokenizer

        del self.__tokenizer_data_src
        del self.__tokenizer_data_tar

    params = {
        **self.M.data_params,
        'tokenizer': self.__src_tokenizer,
        'src_tokenizer': self.__src_tokenizer,
        'tar_tokenizer': self.__tar_tokenizer,
    }

    # encode train, val and test data
    self.__train_src_encode, self.__train_tar_encode, _, _ = utils.pipeline(
        self.M.encode_pipeline, self.__train_src, self.__train_tar, params)
    self.__val_src_encode, self.__val_tar_encode, _, _ = utils.pipeline(
        self.M.encode_pipeline, self.__val_src, self.__val_tar, params)
    self.__test_src_encode, self.__test_tar_encode, _, _ = utils.pipeline(
        self.M.encode_pipeline, self.__test_src, self.__test_tar, params)

    # get vocabulary size
    self.__src_vocab_size = self.__src_tokenizer.vocab_size
    self.__tar_vocab_size = self.__tar_tokenizer.vocab_size

    print('\nFinish preprocessing ')
def get_attention_map(self, list_of_src_sentences, src_tokenizer, tar_tokenizer):
    """ translate a list of sentences and plot the attention weights of the result """
    encoded_data = utils.pipeline(
        self.encode_pipeline_for_src, list_of_src_sentences, None, {
            'tokenizer': src_tokenizer,
            'vocab_size': src_tokenizer.vocab_size,
            'max_src_seq_len': self.data_params['max_src_seq_len'],
        })

    pred_encoded, attentions = self.evaluate(encoded_data, True)
    pred_decoded = self.decode_tar_data(pred_encoded, tar_tokenizer, False)
    src_decoded = self.decode_src_data(encoded_data, src_tokenizer, False)

    pred_decoded = pred_decoded[0]
    src_decoded = src_decoded[0]
    attentions = attentions[0]

    print('start plotting ...')

    for _layer, attention in attentions.items():
        if _layer != 'decoder_layer6_block2':
            continue
        # if _layer[-1] != '2':
        #     continue

        print(f'plotting {_layer} ... ')
        self.plot_attention_weights(attention, src_decoded, pred_decoded, _layer)

    print('finish plotting ')
def gen_preprocessed_data(self, data, batch_size):
    length = len(data)
    num_batch = int(math.ceil(length / batch_size))
    steps = int(num_batch * self.__sample_rate)

    print(f'\nstart generating preprocessed data ({steps} files) ... ')

    for i in range(steps):
        # show progress
        if i % 10 == 0:
            progress = float(i + 1) / steps * 100.
            print('\rprogress: %.2f%% ' % progress, end='')

        # get a batch
        index_of_batch = i % num_batch
        index_start = int(index_of_batch * batch_size)
        index_end = index_start + batch_size
        batch_src, batch_tar = list(zip(*data[index_start:index_end]))

        # preprocess data
        batch_x, batch_y, _, _ = utils.pipeline(
            self.__encoder_pl, batch_src, batch_tar,
            {**self.__data_params, 'tokenizer': self.__tokenizer},
            verbose=i == 0)

        # save data to file
        file_path = os.path.join(self.__processed_dir_path, f'batch_{i}.pkl')
        write_pkl(file_path, [batch_x, batch_y])

    print('finish generating preprocessed data ')
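# A minimal, self-contained sketch (not part of the original code) of the batch
# indexing used in gen_preprocessed_data: when the sample rate is above 1.0 the
# loop wraps around the data via `i % num_batch`, so a batch can be written to
# more than one file. All numbers below are illustrative.
import math

data_len, batch_size, sample_rate = 10, 4, 2.0
num_batch = int(math.ceil(data_len / batch_size))   # 3 batches
steps = int(num_batch * sample_rate)                 # 6 output files

for i in range(steps):
    index_of_batch = i % num_batch
    index_start = index_of_batch * batch_size
    index_end = index_start + batch_size
    print(f'batch_{i}.pkl <- data[{index_start}:{index_end}]')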
def __load(self):
    data_queue = []
    max_queue_size = min(self.size(), self.queue_size)
    max_buffer_size = min(self.size(), self.buffer_size)

    while self.__running:
        # fill the local queue from the preprocessed batch files
        while len(data_queue) < max_queue_size:
            file_path = self.__file_list[self.__cur_index]
            self.__cur_index = (self.__cur_index + 1) % self.__len_files

            batch_src, batch_tar = load_pkl(file_path)

            # preprocess data
            batch_x, batch_y, batch_lan_x, batch_lan_y, batch_pos_y = utils.pipeline(
                self.__encoder_pl, batch_src, batch_tar,
                {**self.__data_params, 'tokenizer': self.__tokenizer},
                verbose=False
            )

            data_queue += list(zip(batch_x, batch_y, batch_lan_x, batch_lan_y, batch_pos_y))

        # flush the queue into the shared buffer when there is room
        if len(self.__data) < max_buffer_size:
            random.seed(42)
            random.shuffle(data_queue)

            self.__data += data_queue
            data_queue = []

        time.sleep(0.1)

    print('Stop thread for loading data ')
def translate_sentences(self, list_of_src_sentences, src_tokenizer, tar_tokenizer):
    """ translate list of sentences and decode the results """
    encoded_data = utils.pipeline(self.encode_pipeline_for_src, list_of_src_sentences, None, {
        'src_tokenizer': src_tokenizer,
        'max_src_seq_len': self.data_params['max_src_seq_len'],
    })

    pred_encoded = self.evaluate(encoded_data)
    return self.decode_tar_data(pred_encoded, tar_tokenizer)
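# A usage sketch, assuming `model` is an instance of the class above and the two
# tokenizers come from the preprocessing step; every name here is a placeholder,
# not something defined in the original code.
sentences = ['Das ist ein Test .', 'Guten Morgen .']
translations = model.translate_sentences(sentences, src_tokenizer, tar_tokenizer)
for src, pred in zip(sentences, translations):
    print(f'{src} -> {pred}')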
def __preprocess(self):
    """ preprocess the data to list of lists of token idx """
    print('\nProcessing data ... ')

    load_model_params = Model.checkpoint_params['load_model']

    # load the tokenizer cached with the checkpoint and encode with it
    if load_model_params:
        tokenizer_path = create_dir_in_root('runtime', 'tokenizer',
                                            load_model_params[0], load_model_params[1],
                                            'tokenizer.pkl')
        self.__src_tokenizer = self.__tar_tokenizer = read_cache(tokenizer_path)

        self.__train_src_encode, self.__train_tar_encode, _, _ = utils.pipeline(
            Model.encode_pipeline, self.__train_src, self.__train_tar, {
                **Model.data_params,
                'tokenizer': self.__src_tokenizer,
                'src_tokenizer': self.__src_tokenizer,
                'tar_tokenizer': self.__tar_tokenizer,
            })

    # otherwise, train the tokenizer while encoding the train data
    else:
        self.__train_src_encode, self.__train_tar_encode, self.__src_tokenizer, self.__tar_tokenizer = \
            utils.pipeline(
                Model.preprocess_pipeline,
                self.__train_src,
                self.__train_tar,
                Model.data_params,
            )

    params = {
        **Model.data_params,
        'tokenizer': self.__src_tokenizer,
        'src_tokenizer': self.__src_tokenizer,
        'tar_tokenizer': self.__tar_tokenizer,
    }

    self.__test_src_encode, self.__test_tar_encode, _, _ = utils.pipeline(
        Model.encode_pipeline, self.__test_src, self.__test_tar, params)

    # get vocabulary size
    self.__src_vocab_size = self.__src_tokenizer.vocab_size
    self.__tar_vocab_size = self.__tar_tokenizer.vocab_size

    print('\nFinish preprocessing ')
def get_tokenizer(self):
    print('\nstart training tokenizer ... ')

    self.__tokenizer = utils.pipeline(
        self.__tokenizer_pl,
        self.__tokenizer_src,
        self.__tokenizer_tar,
        self.__data_params,
    )

    del self.__tokenizer_src
    del self.__tokenizer_tar

    print('finish training tokenizer')

    # saving the tokenizer to file
    write_pkl(self.__tokenizer_path, self.__tokenizer)

    return self.__tokenizer
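# A reload sketch (assumption): `load_pkl` is used elsewhere in this repo as the
# counterpart of `write_pkl`, so a tokenizer cached by get_tokenizer can be read
# back from the same path; `tokenizer_path` below is a placeholder.
tokenizer = load_pkl(tokenizer_path)
print(tokenizer.vocab_size)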
def __init__(self, _is_train, _dataset='cdlm'):
    # initialize variables
    self.__processed_dir_path = create_dir(data_dir, 'un_preprocessed', _dataset)

    # initialize wmt news loader
    start_ratio = 0.0 if _is_train else zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO
    end_ratio = zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
    zh_en_wmt_loader = zh_en_wmt_news.Loader(start_ratio, end_ratio)

    # initialize news commentary loader
    start_ratio = 0.0 if _is_train else zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO
    end_ratio = zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
    zh_en_news_commentary_loader = zh_en_news_commentary.Loader(start_ratio, end_ratio)

    # load the data
    zh_data, en_data = zh_en_wmt_loader.data()
    zh_data_2, en_data_2 = zh_en_news_commentary_loader.data()

    # um corpus data is only for training
    if _is_train:
        zh_data_3, en_data_3 = um_corpus.zh_en(get_test=False)

        # combine data
        zh_data += tuple(zh_data_3)
        en_data += tuple(en_data_3)

    # combine data
    zh_data += zh_data_2
    en_data += en_data_2

    # word segmentation for zh_data
    zh_data = utils.pipeline(seg_zh_by_jieba_pipeline, zh_data)

    data = list(zip(zh_data, en_data))

    # shuffle the data
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    self.gen_data(data, self.BATCH_SIZE_PER_FILE)
def preprocess_tokenizer(self):
    print('\nProcessing tokenizer ... ')

    # get tokenizer
    load_model_params = self.M.checkpoint_params['load_model']
    if not load_model_params:
        self.tokenizer = utils.pipeline(
            self.M.tokenizer_pl,
            self.train_tokenizer_src,
            self.train_tokenizer_tar,
            self.M.data_params,
        )

        del self.train_tokenizer_src
        del self.train_tokenizer_tar

    # load tokenizer from cache
    else:
        tokenizer_path = get_relative_file_path('runtime', 'tokenizer',
                                                load_model_params[0], load_model_params[1],
                                                'tokenizer.pkl')
        self.tokenizer = read_cache(tokenizer_path)
# token_zh_data + list(origin_zh_data[:1000]), token_en_data + list(origin_en_data[:1000]), params)

pipeline = noise_pl.remove_noise + tfds_share_pl.train_tokenizer
# pipeline = zh_en.seg_zh_by_jieba_pipeline + noise_pl.remove_noise
pipeline += pl.sent_2_tokens + MLM_pl(0.2) + pl.CDLM_encode + [{
    'output_keys': [
        'input_1', 'ground_truth_1', 'lan_idx_for_input_1',
        'lan_idx_for_gt_1', 'pos_for_gt_1', 'tokenizer'
    ]
}]

print('\n------------------- Encoding -------------------------')
x, y, lan_x, lan_y, soft_pos_y, tokenizer = utils.pipeline(
    preprocess_pipeline=pipeline,
    lan_data_1=origin_ro_data,
    lan_data_2=origin_en_data,
    params={
        **params,
        # 'tokenizer': tokenizer
    })

print('\n----------------------------------------------')
print(x.shape)
print(y.shape)
print(lan_x.shape)
print(lan_y.shape)
print(soft_pos_y.shape)

print('\n------------------- Decoding -------------------------')
x = utils.pipeline(decode_pl(''), x[:2], None, {'tokenizer': tokenizer})
y = utils.pipeline(decode_pl(''), y[:2], None, {'tokenizer': tokenizer})
print(x[0])
def decode_tar_data(self, encoded_data, tokenizer, to_sentence=True):
    """ decode the list of list token idx to sentences """
    end_index = None if to_sentence else -1
    return utils.pipeline(self.decode_pipeline_for_tar[:end_index],
                          encoded_data, None, {'tokenizer': tokenizer}, False)
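# Usage sketch (placeholder names, not from the original code): decode predictions
# all the way to sentences, or pass to_sentence=False to skip the final pipeline
# step and keep token lists instead.
sentences = model.decode_tar_data(pred_encoded, tar_tokenizer)
token_lists = model.decode_tar_data(pred_encoded, tar_tokenizer, to_sentence=False)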
# origin_de_data, origin_en_data = wmt_news.de_en()
origin_de_data, origin_en_data = wmt_news.fr_en()

params = {
    'vocab_size': 40000,
    'src_vocab_size': 2 ** 13,
    'tar_vocab_size': 2 ** 13,
    'max_src_seq_len': 50,
    'max_tar_seq_len': 60,
}

print('\n------------------- Encoding -------------------------')
de_data, en_data, de_tokenizer, en_tokenizer = utils.pipeline(
    preprocess_pipeline=noise_pl.remove_noise + tfds_share_pl.train_tokenizer +
                        tfds_share_pl.encode_pipeline,
    lan_data_1=origin_de_data,
    lan_data_2=origin_en_data,
    params=params)

print('\n----------------------------------------------')
print(de_data.shape)
print(en_data.shape)
print(de_tokenizer.vocab_size)
print(en_tokenizer.vocab_size)

print('\n------------------- Decoding -------------------------')
de_data = utils.pipeline(tfds_share_pl.decode_pipeline, de_data, None,
                         {'tokenizer': de_tokenizer})

print('\n------------------- Decoding -------------------------')
en_data = utils.pipeline(tfds_share_pl.decode_pipeline, en_data, None,
                         {'tokenizer': en_tokenizer})
pipeline = zh_en.seg_zh_by_jieba_pipeline + noise_pl.remove_noise + tfds_share_pl.train_tokenizer
# pipeline = zh_en.seg_zh_by_jieba_pipeline + noise_pl.remove_noise
pipeline += pl.sent_2_tokens + sample_pl(2.0) + combine_pl(0.2) + pl.CDLM_encode + [{
    'output_keys': [
        'input_1', 'ground_truth_1', 'lan_idx_for_input_1',
        'lan_idx_for_gt_1', 'pos_for_gt_1', 'tokenizer'
    ]
}]

print('\n------------------- Encoding -------------------------')
x, y, lan_x, lan_y, soft_pos_y, tokenizer = utils.pipeline(
    preprocess_pipeline=pipeline,
    lan_data_1=origin_zh_data[:1000],
    lan_data_2=origin_en_data[:1000],
    params={
        **params,
        # 'tokenizer': tokenizer
    })

print('\n----------------------------------------------')
print(x.shape)
print(y.shape)
print(lan_x.shape)
print(lan_y.shape)
print(soft_pos_y.shape)

print('\n------------------- Decoding -------------------------')
x = utils.pipeline(decode_pl('ner'), x, None, {'tokenizer': tokenizer})
y = utils.pipeline(decode_pl('ner'), y, None, {'tokenizer': tokenizer})
def __preprocess_zh(self, zh_data):
    return utils.pipeline(self.__preprocess_zh_pl, zh_data, None, self.__data_params)
def preprocess(self):
    """ preprocess the data to list of lists of token idx """
    print('\nProcessing data ... ')

    # process before CDLM
    train_src_preprocessed, train_tar_preprocessed = utils.pipeline(
        self.M.before_encode_pl,
        self.train_src,
        self.train_tar,
        self.M.data_params,
    )
    test_src_preprocessed, test_tar_preprocessed = utils.pipeline(
        self.M.before_encode_pl,
        self.test_src,
        self.test_tar,
        self.M.data_params,
    )

    del self.train_src
    del self.train_tar
    del self.test_src
    del self.test_tar

    # params shared by all CDLM encode pipelines below
    params = {**self.M.data_params, 'tokenizer': self.tokenizer}

    # preprocess CDLM_translate
    train_x_t, train_y_t, train_lan_x_t, train_lan_y_t, train_pos_y_t = utils.pipeline(
        self.M.translate_encode_pl, train_src_preprocessed, train_tar_preprocessed, params)
    test_x_t, test_y_t, test_lan_x_t, test_lan_y_t, test_pos_y_t = utils.pipeline(
        self.M.translate_encode_pl, test_src_preprocessed, test_tar_preprocessed, params)

    # preprocess CDLM_pos
    train_x_pos, train_y_pos, train_lan_x_pos, train_lan_y_pos, train_pos_y_pos = utils.pipeline(
        self.M.pos_encode_pl, train_src_preprocessed, train_tar_preprocessed, params)
    test_x_pos, test_y_pos, test_lan_x_pos, test_lan_y_pos, test_pos_y_pos = utils.pipeline(
        self.M.pos_encode_pl, test_src_preprocessed, test_tar_preprocessed, params)

    # preprocess CDLM_ner
    train_x_ner, train_y_ner, train_lan_x_ner, train_lan_y_ner, train_pos_y_ner = utils.pipeline(
        self.M.ner_encode_pl, train_src_preprocessed, train_tar_preprocessed, params)
    test_x_ner, test_y_ner, test_lan_x_ner, test_lan_y_ner, test_pos_y_ner = utils.pipeline(
        self.M.ner_encode_pl, test_src_preprocessed, test_tar_preprocessed, params)

    # preprocess CDLM_synonym
    train_x_syn, train_y_syn, train_lan_x_syn, train_lan_y_syn, train_pos_y_syn = utils.pipeline(
        self.M.synonym_encode_pl, train_src_preprocessed, train_tar_preprocessed, params)
    test_x_syn, test_y_syn, test_lan_x_syn, test_lan_y_syn, test_pos_y_syn = utils.pipeline(
        self.M.synonym_encode_pl, test_src_preprocessed, test_tar_preprocessed, params)

    # preprocess CDLM_definition
    train_x_def, train_y_def, train_lan_x_def, train_lan_y_def, train_pos_y_def = utils.pipeline(
        self.M.def_encode_pl, train_src_preprocessed, train_tar_preprocessed, params)
    test_x_def, test_y_def, test_lan_x_def, test_lan_y_def, test_pos_y_def = utils.pipeline(
        self.M.def_encode_pl, test_src_preprocessed, test_tar_preprocessed, params)

    # release some storage
    del train_src_preprocessed
    del train_tar_preprocessed
    del test_src_preprocessed
    del test_tar_preprocessed

    # merge data
    self.train_x = np.vstack([train_x_t, train_x_pos, train_x_ner, train_x_syn, train_x_def])
    self.train_y = np.vstack([train_y_t, train_y_pos, train_y_ner, train_y_syn, train_y_def])
    self.train_lan_x = np.vstack(
        [train_lan_x_t, train_lan_x_pos, train_lan_x_ner, train_lan_x_syn, train_lan_x_def])
    self.train_lan_y = np.vstack(
        [train_lan_y_t, train_lan_y_pos, train_lan_y_ner, train_lan_y_syn, train_lan_y_def])
    self.train_pos_y = np.vstack(
        [train_pos_y_t, train_pos_y_pos, train_pos_y_ner, train_pos_y_syn, train_pos_y_def])

    self.test_x = np.vstack([test_x_t, test_x_pos, test_x_ner, test_x_syn, test_x_def])
    self.test_y = np.vstack([test_y_t, test_y_pos, test_y_ner, test_y_syn, test_y_def])
    self.test_lan_x = np.vstack(
        [test_lan_x_t, test_lan_x_pos, test_lan_x_ner, test_lan_x_syn, test_lan_x_def])
    self.test_lan_y = np.vstack(
        [test_lan_y_t, test_lan_y_pos, test_lan_y_ner, test_lan_y_syn, test_lan_y_def])
    self.test_pos_y = np.vstack(
        [test_pos_y_t, test_pos_y_pos, test_pos_y_ner, test_pos_y_syn, test_pos_y_def])

    # shuffle data
    train_data = list(zip(self.train_x, self.train_y, self.train_lan_x,
                          self.train_lan_y, self.train_pos_y))
    test_data = list(zip(self.test_x, self.test_y, self.test_lan_x,
                         self.test_lan_y, self.test_pos_y))

    random.seed(42)
    random.shuffle(train_data)
    random.seed(42)
    random.shuffle(test_data)

    self.train_x, self.train_y, self.train_lan_x, self.train_lan_y, self.train_pos_y = list(
        zip(*train_data))
    self.test_x, self.test_y, self.test_lan_x, self.test_lan_y, self.test_pos_y = list(
        zip(*test_data))

    # convert to array
    def convert_arr(*args):
        return list(map(lambda x: np.array(x), args))

    self.train_x, self.train_y, self.train_lan_x, self.train_lan_y, self.train_pos_y = convert_arr(
        self.train_x, self.train_y, self.train_lan_x, self.train_lan_y, self.train_pos_y)
    self.test_x, self.test_y, self.test_lan_x, self.test_lan_y, self.test_pos_y = convert_arr(
        self.test_x, self.test_y, self.test_lan_x, self.test_lan_y, self.test_pos_y)

    # get vocabulary size
    self.vocab_size = self.tokenizer.vocab_size

    print('\nFinish preprocessing ')
if __name__ == '__main__':
    from nmt.preprocess.corpus import wmt_news
    from nmt.preprocess.inputs.zh_en import seg_zh_by_jieba_pipeline, remove_space_pipeline

    zh_data, en_data = wmt_news.zh_en()

    params = {
        'src_vocab_size': 2 ** 13,
        'tar_vocab_size': 2 ** 13,
        'max_src_seq_len': 50,
        'max_tar_seq_len': 60,
    }

    print('\n------------------- Encoding -------------------------')
    zh_data, en_data, zh_tokenizer, en_tokenizer = utils.pipeline(
        preprocess_pipeline=seg_zh_by_jieba_pipeline + train_tokenizer_pipeline + encode_pipeline,
        lan_data_1=zh_data,
        lan_data_2=en_data,
        params=params)

    print('\n----------------------------------------------')
    print(zh_data.shape)
    print(en_data.shape)
    print(zh_tokenizer.vocab_size)
    print(en_tokenizer.vocab_size)

    print('\n------------------- Decoding -------------------------')
    zh_data = utils.pipeline(decode_pipeline + remove_space_pipeline, zh_data, None,
                             {'tokenizer': zh_tokenizer})

    print('\n------------------- Decoding -------------------------')
    en_data = utils.pipeline(decode_pipeline, en_data, None,
                             {'tokenizer': en_tokenizer})
from nmt.preprocess.corpus import um_corpus
from nmt.preprocess.inputs import noise_pl, tfds_share_pl

# origin_zh_data, origin_en_data = wmt_news.zh_en()
origin_zh_data, origin_en_data = um_corpus.zh_en()

params = {
    'vocab_size': 45000,
    'max_src_seq_len': 79,
    'max_tar_seq_len': 98,
}
seg_pipeline = seg_zh_by_jieba_pipeline

print('\n------------------- Encoding -------------------------')
zh_data, en_data, zh_tokenizer, en_tokenizer = utils.pipeline(
    preprocess_pipeline=seg_pipeline + noise_pl.remove_noise +
                        tfds_share_pl.train_tokenizer + tfds_share_pl.encode_pipeline,
    lan_data_1=origin_zh_data,
    lan_data_2=origin_en_data,
    params=params)

print('\n----------------------------------------------')
print(zh_data.shape)
print(en_data.shape)
print(zh_tokenizer.vocab_size)
print(en_tokenizer.vocab_size)

print('\n------------------- Decoding -------------------------')
zh_data = utils.pipeline(tfds_share_pl.decode_pipeline + remove_space_pipeline,
                         zh_data, None, {'tokenizer': zh_tokenizer})

print('\n------------------- Decoding -------------------------')
en_data = utils.pipeline(tfds_share_pl.decode_pipeline, en_data, None,
                         {'tokenizer': en_tokenizer})