Example #1
    def preprocess(self):
        """ preprocess the data to list of list token idx """
        print('\nProcessing data ... ')

        # preprocess train data
        self.train_x, self.train_y, self.train_lan_x, self.train_lan_y = utils.pipeline(
            self.M.encode_pl,
            self.train_src,
            self.train_tar,
            {
                **self.M.data_params, 'tokenizer': self.tokenizer
            },
        )

        # preprocess test data
        self.test_x, self.test_y, self.test_lan_x, self.test_lan_y = utils.pipeline(
            self.M.encode_pl,
            self.test_src,
            self.test_tar,
            {
                **self.M.data_params, 'tokenizer': self.tokenizer
            },
        )

        # get vocabulary size
        self.vocab_size = self.tokenizer.vocab_size

        # release storage
        del self.train_src
        del self.train_tar
        del self.test_src
        del self.test_tar

        print('\nFinish preprocessing ')
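
Note: every example on this page funnels through utils.pipeline(preprocess_pipeline, lan_data_1, lan_data_2, params), whose implementation is not reproduced here. Purely as a reading aid, the self-contained sketch below mimics the calling convention visible in these examples (a pipeline is a list of step functions plus optional config dicts such as {'output_keys': [...]}); the step signature and the return handling are assumptions, not the project's actual code.

def pipeline(preprocess_pipeline, lan_data_1, lan_data_2=None, params=None, verbose=True):
    """ illustrative stand-in: run each step in order over the two data streams """
    params = dict(params or {})
    for step in preprocess_pipeline:
        # config entries (plain dicts) just update the shared params
        if isinstance(step, dict):
            params.update(step)
            continue
        if verbose:
            print('running step %s ...' % getattr(step, '__name__', step))
        lan_data_1, lan_data_2, params = step(lan_data_1, lan_data_2, params)

    # if the pipeline configured output_keys, return those values from params
    keys = params.get('output_keys')
    if keys:
        return tuple(params[k] for k in keys)
    return lan_data_1, lan_data_2, params


if __name__ == '__main__':
    # toy usage: two trivial steps that lower-case and tokenize the first stream
    lower = lambda a, b, p: ([s.lower() for s in a], b, p)
    split = lambda a, b, p: ([s.split() for s in a], b, p)
    tokens, _, _ = pipeline([lower, split], ['Hello World'], None, {})
    print(tokens)  # [['hello', 'world']]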
Example #2
    def __preprocess(self):
        """ preprocess the data to list of list token idx """
        print('\nProcessing data ... ')

        if self.tokenizer_dir:
            self.__src_tokenizer = load_pkl(
                get_file_path(data_dir, 'tokenizer', self.tokenizer_dir,
                              'tokenizer.pkl'))
            self.__tar_tokenizer = self.__src_tokenizer

        elif self.M.checkpoint_params['load_model']:
            load_model_params = self.M.checkpoint_params['load_model']

            tokenizer_path = create_dir_in_root('runtime', 'tokenizer',
                                                load_model_params[0],
                                                load_model_params[1],
                                                'tokenizer.pkl')
            self.__src_tokenizer = self.__tar_tokenizer = read_cache(
                tokenizer_path)

        else:
            self.__src_tokenizer = utils.pipeline(
                self.M.tokenizer_pl,
                self.__tokenizer_data_src,
                self.__tokenizer_data_tar,
                self.M.data_params,
            )
            self.__tar_tokenizer = self.__src_tokenizer
            del self.__tokenizer_data_src
            del self.__tokenizer_data_tar

        params = {
            **self.M.data_params,
            'tokenizer': self.__src_tokenizer,
            'src_tokenizer': self.__src_tokenizer,
            'tar_tokenizer': self.__tar_tokenizer,
        }

        self.__train_src_encode, self.__train_tar_encode, _, _ = utils.pipeline(
            self.M.encode_pipeline, self.__train_src, self.__train_tar, params)

        self.__val_src_encode, self.__val_tar_encode, _, _ = utils.pipeline(
            self.M.encode_pipeline, self.__val_src, self.__val_tar, params)

        self.__test_src_encode, self.__test_tar_encode, _, _ = utils.pipeline(
            self.M.encode_pipeline, self.__test_src, self.__test_tar, params)

        # get vocabulary size
        self.__src_vocab_size = self.__src_tokenizer.vocab_size
        self.__tar_vocab_size = self.__tar_tokenizer.vocab_size

        print('\nFinish preprocessing ')
Example #3
    def get_attention_map(self, list_of_src_sentences, src_tokenizer,
                          tar_tokenizer):
        """ translate list of sentences and decode the results """
        encoded_data = utils.pipeline(
            self.encode_pipeline_for_src, list_of_src_sentences, None, {
                'tokenizer': src_tokenizer,
                'vocab_size': src_tokenizer.vocab_size,
                'max_src_seq_len': self.data_params['max_src_seq_len'],
            })

        pred_encoded, attentions = self.evaluate(encoded_data, True)

        pred_decoded = self.decode_tar_data(pred_encoded, tar_tokenizer, False)
        src_decoded = self.decode_src_data(encoded_data, src_tokenizer, False)

        pred_decoded = pred_decoded[0]
        src_decoded = src_decoded[0]
        attentions = attentions[0]

        print('start plotting ...')

        for _layer, attention in attentions.items():
            if _layer != 'decoder_layer6_block2':
                continue
            # if _layer[-1] != '2':
            #     continue
            print(f'plotting {_layer} ... ')
            self.plot_attention_weights(attention, src_decoded, pred_decoded,
                                        _layer)

        print('finish plotting ')
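
Note: plot_attention_weights is not included in this example. A minimal, self-contained sketch of a compatible plotting helper is shown below; it assumes the attention tensor for one layer has shape (num_heads, len(pred_tokens), len(src_tokens)), which is an assumption about the project, not code taken from it.

import numpy as np
import matplotlib.pyplot as plt


def plot_attention_weights(attention, src_tokens, pred_tokens, layer_name):
    """ draw one heatmap per attention head (illustrative implementation) """
    attention = np.asarray(attention)
    num_heads = attention.shape[0]
    fig = plt.figure(figsize=(16, 8))
    for head in range(num_heads):
        ax = fig.add_subplot(2, (num_heads + 1) // 2, head + 1)
        ax.matshow(attention[head][:len(pred_tokens), :len(src_tokens)], cmap='viridis')
        ax.set_xticks(range(len(src_tokens)))
        ax.set_yticks(range(len(pred_tokens)))
        ax.set_xticklabels(src_tokens, rotation=90, fontsize=7)
        ax.set_yticklabels(pred_tokens, fontsize=7)
        ax.set_xlabel('head %d' % (head + 1))
    plt.suptitle(layer_name)
    plt.show()


if __name__ == '__main__':
    # toy usage with random weights
    plot_attention_weights(np.random.rand(4, 3, 5),
                           ['<s>', 'ein', 'kleiner', 'Test', '</s>'],
                           ['a', 'small', 'test'],
                           'decoder_layer6_block2')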
Example #4
    def gen_preprocessed_data(self, data, batch_size):
        length = len(data)
        num_batch = int(math.ceil(length / batch_size))
        steps = int(num_batch * self.__sample_rate)

        print(f'\nstart generating preprocessed data ({steps} files) ... ')

        for i in range(steps):
            # show progress
            if i % 10 == 0:
                progress = float(i + 1) / steps * 100.
                print('\rprogress: %.2f%% ' % progress, end='')

            # get a batch
            index_of_batch = i % num_batch
            index_start = int(index_of_batch * batch_size)
            index_end = index_start + batch_size
            batch_src, batch_tar = list(zip(*data[index_start:index_end]))

            # preprocess data
            batch_x, batch_y, _, _ = utils.pipeline(
                self.__encoder_pl,
                batch_src,
                batch_tar, {
                    **self.__data_params, 'tokenizer': self.__tokenizer
                },
                verbose=i == 0)

            # save data to file
            file_path = os.path.join(self.__processed_dir_path,
                                     f'batch_{i}.pkl')
            write_pkl(file_path, [batch_x, batch_y])

        print('finish generating preprocessed data ')
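
Note: write_pkl above (and the matching load_pkl used in the next example) are small project helpers that are not shown on this page. Pickle-based stand-ins like the ones below would be compatible with how they are called; treat them as an assumption, not the project's definitions.

import os
import pickle


def write_pkl(file_path, data):
    """ serialize data to a pickle file, creating the parent directory if needed """
    dir_name = os.path.dirname(file_path)
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)
    with open(file_path, 'wb') as f:
        pickle.dump(data, f)


def load_pkl(file_path):
    """ deserialize data from a pickle file """
    with open(file_path, 'rb') as f:
        return pickle.load(f)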
Example #5
    def __load(self):
        data_queue = []
        max_queue_size = min(self.size(), self.queue_size)
        max_buffer_size = min(self.size(), self.buffer_size)

        while self.__running:
            while len(data_queue) < max_queue_size:
                file_path = self.__file_list[self.__cur_index]
                self.__cur_index = (self.__cur_index + 1) % self.__len_files

                batch_src, batch_tar = load_pkl(file_path)

                # preprocess data
                batch_x, batch_y, batch_lan_x, batch_lan_y, batch_pos_y = utils.pipeline(
                    self.__encoder_pl, batch_src, batch_tar, {**self.__data_params, 'tokenizer': self.__tokenizer},
                    verbose=False
                )

                data_queue += list(zip(batch_x, batch_y, batch_lan_x, batch_lan_y, batch_pos_y))

            if len(self.__data) < max_buffer_size:
                random.seed(42)
                random.shuffle(data_queue)

                self.__data += data_queue
                data_queue = []

            time.sleep(0.1)

        print('Stop thread for loading data ')
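
Note: the __load loop above is clearly meant to run on a background thread guarded by self.__running. The wiring is not part of this example, so the sketch below shows one conventional way to start and stop such a loop with threading; the class and method names are illustrative, not taken from the project.

import threading
import time


class BackgroundLoader:
    """ illustrative wrapper: run a __load-style loop on a daemon thread """

    def __init__(self):
        self.__running = False
        self.__thread = None
        self.__data = []

    def start(self):
        self.__running = True
        self.__thread = threading.Thread(target=self.__load, daemon=True)
        self.__thread.start()

    def stop(self):
        # flip the flag, then wait for the loop to notice it and exit
        self.__running = False
        self.__thread.join()

    def __load(self):
        while self.__running:
            # in the real loader this is where batches are read from disk,
            # preprocessed through utils.pipeline and appended to the buffer
            time.sleep(0.1)
        print('Stop thread for loading data ')


if __name__ == '__main__':
    loader = BackgroundLoader()
    loader.start()
    time.sleep(0.5)
    loader.stop()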
Example #6
    def translate_sentences(self, list_of_src_sentences, src_tokenizer, tar_tokenizer):
        """ translate list of sentences and decode the results """
        encoded_data = utils.pipeline(self.encode_pipeline_for_src, list_of_src_sentences, None, {
            'src_tokenizer': src_tokenizer,
            'max_src_seq_len': self.data_params['max_src_seq_len'],
        })

        pred_encoded = self.evaluate(encoded_data)
        return self.decode_tar_data(pred_encoded, tar_tokenizer)
Example #7
    def __preprocess(self):
        """ preprocess the data to list of list token idx """
        print('\nProcessing data ... ')

        load_model_params = Model.checkpoint_params['load_model']
        if load_model_params:
            tokenizer_path = create_dir_in_root('runtime', 'tokenizer',
                                                load_model_params[0],
                                                load_model_params[1],
                                                'tokenizer.pkl')
            self.__src_tokenizer = self.__tar_tokenizer = read_cache(
                tokenizer_path)

            self.__train_src_encode, self.__train_tar_encode, _, _ = utils.pipeline(
                Model.encode_pipeline, self.__train_src, self.__train_tar, {
                    **Model.data_params,
                    'tokenizer': self.__src_tokenizer,
                    'src_tokenizer': self.__src_tokenizer,
                    'tar_tokenizer': self.__tar_tokenizer,
                })

        else:
            self.__train_src_encode, self.__train_tar_encode, self.__src_tokenizer, self.__tar_tokenizer = utils.pipeline(
                Model.preprocess_pipeline,
                self.__train_src,
                self.__train_tar,
                Model.data_params,
            )

        params = {
            **Model.data_params,
            'tokenizer': self.__src_tokenizer,
            'src_tokenizer': self.__src_tokenizer,
            'tar_tokenizer': self.__tar_tokenizer,
        }

        self.__test_src_encode, self.__test_tar_encode, _, _ = utils.pipeline(
            Model.encode_pipeline, self.__test_src, self.__test_tar, params)

        # get vocabulary size
        self.__src_vocab_size = self.__src_tokenizer.vocab_size
        self.__tar_vocab_size = self.__tar_tokenizer.vocab_size

        print('\nFinish preprocessing ')
Example #8
    def get_tokenizer(self):
        print('\nstart training tokenizer ... ')

        self.__tokenizer = utils.pipeline(
            self.__tokenizer_pl, self.__tokenizer_src, self.__tokenizer_tar, self.__data_params,
        )

        del self.__tokenizer_src
        del self.__tokenizer_tar

        print('finish training tokenizer')

        # saving the tokenizer to file
        write_pkl(self.__tokenizer_path, self.__tokenizer)

        return self.__tokenizer
Example #9
    def __init__(self, _is_train, _dataset='cdlm'):
        # initialize variables
        self.__processed_dir_path = create_dir(data_dir, 'un_preprocessed', _dataset)

        # initialize wmt news loader
        start_ratio = 0.0 if _is_train else zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO
        end_ratio = zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
        zh_en_wmt_loader = zh_en_wmt_news.Loader(start_ratio, end_ratio)

        # initialize news commentary loader
        start_ratio = 0.0 if _is_train else zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO
        end_ratio = zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
        zh_en_news_commentary_loader = zh_en_news_commentary.Loader(start_ratio, end_ratio)

        # load the data
        zh_data, en_data = zh_en_wmt_loader.data()
        zh_data_2, en_data_2 = zh_en_news_commentary_loader.data()

        # um corpus data is only for training
        if _is_train:
            zh_data_3, en_data_3 = um_corpus.zh_en(get_test=False)

            # combine data
            zh_data += tuple(zh_data_3)
            en_data += tuple(en_data_3)

        # combine data
        zh_data += zh_data_2
        en_data += en_data_2

        # word segmentation for zh_data
        zh_data = utils.pipeline(seg_zh_by_jieba_pipeline, zh_data)

        data = list(zip(zh_data, en_data))

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        self.gen_data(data, self.BATCH_SIZE_PER_FILE)
Example #10
    def preprocess_tokenizer(self):
        print('\nProcessing tokenizer ... ')

        # get tokenizer
        load_model_params = self.M.checkpoint_params['load_model']
        if not load_model_params:
            self.tokenizer = utils.pipeline(
                self.M.tokenizer_pl,
                self.train_tokenizer_src,
                self.train_tokenizer_tar,
                self.M.data_params,
            )
            del self.train_tokenizer_src
            del self.train_tokenizer_tar

        # load tokenizer from cache
        else:
            tokenizer_path = get_relative_file_path('runtime', 'tokenizer',
                                                    load_model_params[0],
                                                    load_model_params[1],
                                                    'tokenizer.pkl')
            self.tokenizer = read_cache(tokenizer_path)
Example #11
    #                            token_zh_data + list(origin_zh_data[:1000]), token_en_data + list(origin_en_data[:1000]), params)

    pipeline = noise_pl.remove_noise + tfds_share_pl.train_tokenizer
    # pipeline = zh_en.seg_zh_by_jieba_pipeline + noise_pl.remove_noise
    pipeline += pl.sent_2_tokens + MLM_pl(0.2) + pl.CDLM_encode + [{
        'output_keys': [
            'input_1', 'ground_truth_1', 'lan_idx_for_input_1',
            'lan_idx_for_gt_1', 'pos_for_gt_1', 'tokenizer'
        ]
    }]

    print('\n------------------- Encoding -------------------------')
    x, y, lan_x, lan_y, soft_pos_y, tokenizer = utils.pipeline(
        preprocess_pipeline=pipeline,
        lan_data_1=origin_ro_data,
        lan_data_2=origin_en_data,
        params={
            **params,
            # 'tokenizer': tokenizer
        })

    print('\n----------------------------------------------')
    print(x.shape)
    print(y.shape)
    print(lan_x.shape)
    print(lan_y.shape)
    print(soft_pos_y.shape)

    print('\n------------------- Decoding -------------------------')
    x = utils.pipeline(decode_pl(''), x[:2], None, {'tokenizer': tokenizer})
    y = utils.pipeline(decode_pl(''), y[:2], None, {'tokenizer': tokenizer})
    print(x[0])
Example #12
 def decode_tar_data(self, encoded_data, tokenizer, to_sentence=True):
     """ decode the list of list token idx to sentences """
     end_index = None if to_sentence else -1
     return utils.pipeline(self.decode_pipeline_for_tar[:end_index],
                           encoded_data, None, {'tokenizer': tokenizer},
                           False)
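
Note: the [:end_index] slice works because a pipeline here is a plain Python list of steps (Examples #11, #13 and #14 build pipelines with list concatenation). With to_sentence=False the slice drops the final step, so the caller gets token lists rather than joined sentences. A tiny stand-alone illustration (the step names are made up):

# illustrative only: slicing a list-valued pipeline drops its trailing step
decode_pipeline_for_tar = ['idx_to_tokens', 'remove_special_tokens', 'join_to_sentence']

for to_sentence in (True, False):
    end_index = None if to_sentence else -1
    print(to_sentence, decode_pipeline_for_tar[:end_index])

# True  ['idx_to_tokens', 'remove_special_tokens', 'join_to_sentence']
# False ['idx_to_tokens', 'remove_special_tokens']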
Example #13
    # origin_de_data, origin_en_data = wmt_news.de_en()
    origin_de_data, origin_en_data = wmt_news.fr_en()

    params = {
        'vocab_size': 40000,
        'src_vocab_size': 2**13,
        'tar_vocab_size': 2**13,
        'max_src_seq_len': 50,
        'max_tar_seq_len': 60,
    }

    print('\n------------------- Encoding -------------------------')
    de_data, en_data, de_tokenizer, en_tokenizer = utils.pipeline(
        preprocess_pipeline=noise_pl.remove_noise +
        tfds_share_pl.train_tokenizer + tfds_share_pl.encode_pipeline,
        lan_data_1=origin_de_data,
        lan_data_2=origin_en_data,
        params=params)

    print('\n----------------------------------------------')
    print(de_data.shape)
    print(en_data.shape)
    print(de_tokenizer.vocab_size)
    print(en_tokenizer.vocab_size)

    print('\n------------------- Decoding -------------------------')
    de_data = utils.pipeline(tfds_share_pl.decode_pipeline, de_data, None,
                             {'tokenizer': de_tokenizer})

    print('\n------------------- Decoding -------------------------')
    en_data = utils.pipeline(tfds_share_pl.decode_pipeline, en_data, None,
                             {'tokenizer': en_tokenizer})
Example #14
File: CDLM_ner.py  Project: SamuelLAN/DLM
    pipeline = zh_en.seg_zh_by_jieba_pipeline + noise_pl.remove_noise + tfds_share_pl.train_tokenizer
    # pipeline = zh_en.seg_zh_by_jieba_pipeline + noise_pl.remove_noise
    pipeline += pl.sent_2_tokens + sample_pl(2.0) + combine_pl(
        0.2) + pl.CDLM_encode + [{
            'output_keys': [
                'input_1', 'ground_truth_1', 'lan_idx_for_input_1',
                'lan_idx_for_gt_1', 'pos_for_gt_1', 'tokenizer'
            ]
        }]

    print('\n------------------- Encoding -------------------------')
    x, y, lan_x, lan_y, soft_pos_y, tokenizer = utils.pipeline(
        preprocess_pipeline=pipeline,
        lan_data_1=origin_zh_data[:1000],
        lan_data_2=origin_en_data[:1000],
        params={
            **params,
            # 'tokenizer': tokenizer
        })

    print('\n----------------------------------------------')
    print(x.shape)
    print(y.shape)
    print(lan_x.shape)
    print(lan_y.shape)
    print(soft_pos_y.shape)

    print('\n------------------- Decoding -------------------------')
    x = utils.pipeline(decode_pl('ner'), x, None, {'tokenizer': tokenizer})
    y = utils.pipeline(decode_pl('ner'), y, None, {'tokenizer': tokenizer})
Example #15
 def __preprocess_zh(self, zh_data):
     return utils.pipeline(self.__preprocess_zh_pl, zh_data, None,
                           self.__data_params)
Example #16
    def preprocess(self):
        """ preprocess the data to list of list token idx """
        print('\nProcessing data ... ')

        # process before CDLM
        train_src_preprocessed, train_tar_preprocessed = utils.pipeline(
            self.M.before_encode_pl,
            self.train_src,
            self.train_tar,
            self.M.data_params,
        )

        test_src_preprocessed, test_tar_preprocessed = utils.pipeline(
            self.M.before_encode_pl,
            self.test_src,
            self.test_tar,
            self.M.data_params,
        )

        del self.train_src
        del self.train_tar
        del self.test_src
        del self.test_tar

        # preprocess CDLM_translate
        train_x_t, train_y_t, train_lan_x_t, train_lan_y_t, train_pos_y_t = utils.pipeline(
            self.M.translate_encode_pl,
            train_src_preprocessed,
            train_tar_preprocessed,
            {
                **self.M.data_params, 'tokenizer': self.tokenizer
            },
        )

        test_x_t, test_y_t, test_lan_x_t, test_lan_y_t, test_pos_y_t = utils.pipeline(
            self.M.translate_encode_pl,
            test_src_preprocessed,
            test_tar_preprocessed,
            {
                **self.M.data_params, 'tokenizer': self.tokenizer
            },
        )

        # preprocess CDLM_pos
        train_x_pos, train_y_pos, train_lan_x_pos, train_lan_y_pos, train_pos_y_pos = \
            utils.pipeline(self.M.pos_encode_pl, train_src_preprocessed, train_tar_preprocessed,
                           {**self.M.data_params, 'tokenizer': self.tokenizer})

        test_x_pos, test_y_pos, test_lan_x_pos, test_lan_y_pos, test_pos_y_pos = \
            utils.pipeline(self.M.pos_encode_pl, test_src_preprocessed, test_tar_preprocessed,
                           {**self.M.data_params, 'tokenizer': self.tokenizer})

        # preprocess CDLM_ner
        train_x_ner, train_y_ner, train_lan_x_ner, train_lan_y_ner, train_pos_y_ner = \
            utils.pipeline(self.M.ner_encode_pl, train_src_preprocessed, train_tar_preprocessed,
                           {**self.M.data_params, 'tokenizer': self.tokenizer})

        test_x_ner, test_y_ner, test_lan_x_ner, test_lan_y_ner, test_pos_y_ner = \
            utils.pipeline(self.M.ner_encode_pl, test_src_preprocessed, test_tar_preprocessed,
                           {**self.M.data_params, 'tokenizer': self.tokenizer})

        # preprocess CDLM_synonym
        train_x_syn, train_y_syn, train_lan_x_syn, train_lan_y_syn, train_pos_y_syn = \
            utils.pipeline(self.M.synonym_encode_pl, train_src_preprocessed, train_tar_preprocessed,
                           {**self.M.data_params, 'tokenizer': self.tokenizer})

        test_x_syn, test_y_syn, test_lan_x_syn, test_lan_y_syn, test_pos_y_syn = \
            utils.pipeline(self.M.synonym_encode_pl, test_src_preprocessed, test_tar_preprocessed,
                           {**self.M.data_params, 'tokenizer': self.tokenizer})

        # preprocess CDLM_definition
        train_x_def, train_y_def, train_lan_x_def, train_lan_y_def, train_pos_y_def = \
            utils.pipeline(self.M.def_encode_pl, train_src_preprocessed, train_tar_preprocessed,
                           {**self.M.data_params, 'tokenizer': self.tokenizer})

        test_x_def, test_y_def, test_lan_x_def, test_lan_y_def, test_pos_y_def = \
            utils.pipeline(self.M.def_encode_pl, test_src_preprocessed, test_tar_preprocessed,
                           {**self.M.data_params, 'tokenizer': self.tokenizer})

        # release some storage
        del train_src_preprocessed
        del train_tar_preprocessed
        del test_src_preprocessed
        del test_tar_preprocessed

        # merge data
        self.train_x = np.vstack(
            [train_x_t, train_x_pos, train_x_ner, train_x_syn, train_x_def])
        self.train_y = np.vstack(
            [train_y_t, train_y_pos, train_y_ner, train_y_syn, train_y_def])
        self.train_lan_x = np.vstack([
            train_lan_x_t, train_lan_x_pos, train_lan_x_ner, train_lan_x_syn,
            train_lan_x_def
        ])
        self.train_lan_y = np.vstack([
            train_lan_y_t, train_lan_y_pos, train_lan_y_ner, train_lan_y_syn,
            train_lan_y_def
        ])
        self.train_pos_y = np.vstack([
            train_pos_y_t, train_pos_y_pos, train_pos_y_ner, train_pos_y_syn,
            train_pos_y_def
        ])

        self.test_x = np.vstack(
            [test_x_t, test_x_pos, test_x_ner, test_x_syn, test_x_def])
        self.test_y = np.vstack(
            [test_y_t, test_y_pos, test_y_ner, test_y_syn, test_y_def])
        self.test_lan_x = np.vstack([
            test_lan_x_t, test_lan_x_pos, test_lan_x_ner, test_lan_x_syn,
            test_lan_x_def
        ])
        self.test_lan_y = np.vstack([
            test_lan_y_t, test_lan_y_pos, test_lan_y_ner, test_lan_y_syn,
            test_lan_y_def
        ])
        self.test_pos_y = np.vstack([
            test_pos_y_t, test_pos_y_pos, test_pos_y_ner, test_pos_y_syn,
            test_pos_y_def
        ])

        # shuffle data
        train_data = list(
            zip(self.train_x, self.train_y, self.train_lan_x, self.train_lan_y,
                self.train_pos_y))
        test_data = list(
            zip(self.test_x, self.test_y, self.test_lan_x, self.test_lan_y,
                self.test_pos_y))

        random.seed(42)
        random.shuffle(train_data)
        random.seed(42)
        random.shuffle(test_data)

        self.train_x, self.train_y, self.train_lan_x, self.train_lan_y, self.train_pos_y = list(
            zip(*train_data))
        self.test_x, self.test_y, self.test_lan_x, self.test_lan_y, self.test_pos_y = list(
            zip(*test_data))

        # convert to array
        def convert_arr(*args):
            return list(map(lambda x: np.array(x), args))

        self.train_x, self.train_y, self.train_lan_x, self.train_lan_y, self.train_pos_y = convert_arr(
            self.train_x, self.train_y, self.train_lan_x, self.train_lan_y,
            self.train_pos_y)
        self.test_x, self.test_y, self.test_lan_x, self.test_lan_y, self.test_pos_y = convert_arr(
            self.test_x, self.test_y, self.test_lan_x, self.test_lan_y,
            self.test_pos_y)

        # get vocabulary size
        self.vocab_size = self.tokenizer.vocab_size

        print('\nFinish preprocessing ')
Example #17
if __name__ == '__main__':
    from nmt.preprocess.corpus import wmt_news
    from nmt.preprocess.inputs.zh_en import seg_zh_by_jieba_pipeline, remove_space_pipeline

    zh_data, en_data = wmt_news.zh_en()
    params = {
        'src_vocab_size': 2**13,
        'tar_vocab_size': 2**13,
        'max_src_seq_len': 50,
        'max_tar_seq_len': 60,
    }

    print('\n------------------- Encoding -------------------------')
    zh_data, en_data, zh_tokenizer, en_tokenizer = utils.pipeline(
        preprocess_pipeline=seg_zh_by_jieba_pipeline +
        train_tokenizer_pipeline + encode_pipeline,
        lan_data_1=zh_data,
        lan_data_2=en_data,
        params=params)

    print('\n----------------------------------------------')
    print(zh_data.shape)
    print(en_data.shape)
    print(zh_tokenizer.vocab_size)
    print(en_tokenizer.vocab_size)

    print('\n------------------- Decoding -------------------------')
    zh_data = utils.pipeline(decode_pipeline + remove_space_pipeline, zh_data,
                             None, {'tokenizer': zh_tokenizer})

    print('\n------------------- Decoding -------------------------')
    en_data = utils.pipeline(decode_pipeline, en_data, None,
                             {'tokenizer': en_tokenizer})
Example #18
File: zh_en.py  Project: SamuelLAN/DLM
    from nmt.preprocess.corpus import um_corpus
    from nmt.preprocess.inputs import noise_pl, tfds_share_pl

    # origin_zh_data, origin_en_data = wmt_news.zh_en()
    origin_zh_data, origin_en_data = um_corpus.zh_en()
    params = {
        'vocab_size': 45000,
        'max_src_seq_len': 79,
        'max_tar_seq_len': 98,
    }

    seg_pipeline = seg_zh_by_jieba_pipeline

    print('\n------------------- Encoding -------------------------')
    zh_data, en_data, zh_tokenizer, en_tokenizer = utils.pipeline(
        preprocess_pipeline=seg_pipeline + noise_pl.remove_noise + tfds_share_pl.train_tokenizer + tfds_share_pl.encode_pipeline,
        lan_data_1=origin_zh_data, lan_data_2=origin_en_data, params=params)

    print('\n----------------------------------------------')
    print(zh_data.shape)
    print(en_data.shape)
    print(zh_tokenizer.vocab_size)
    print(en_tokenizer.vocab_size)

    print('\n------------------- Decoding -------------------------')
    zh_data = utils.pipeline(tfds_share_pl.decode_pipeline + remove_space_pipeline,
                             zh_data, None, {'tokenizer': zh_tokenizer})

    print('\n------------------- Decoding -------------------------')
    en_data = utils.pipeline(tfds_share_pl.decode_pipeline, en_data, None, {'tokenizer': en_tokenizer})