Example #1
  def export_inputs(self):
    """Inputs for exported model."""
    vocab_dict = load_vocab_dict(self.text_vocab_file_path)
    vocab_size = len(vocab_dict)
    self.config['data']['vocab_size'] = vocab_size

    input_sent_left = tf.placeholder(
        shape=(None,), dtype=tf.string, name="input_sent_left")
    input_sent_right = tf.placeholder(
        shape=(None,), dtype=tf.string, name="input_sent_right")
    input_pipeline_func = self.get_input_pipeline(for_export=True)

    token_ids_left = input_pipeline_func(input_sent_left)
    token_ids_right = input_pipeline_func(input_sent_right)
    token_ids_len_left = tf.map_fn(
        lambda x: compute_sen_lens(x, padding_token=0), token_ids_left)
    token_ids_len_right = tf.map_fn(
        lambda x: compute_sen_lens(x, padding_token=0), token_ids_right)

    export_data = {
        "export_inputs": {
            "input_sent_left": input_sent_left,
            "input_sent_right": input_sent_right,
        },
        "model_inputs": {
            "input_x_left": token_ids_left,
            "input_x_right": token_ids_right,
            "input_x_left_len": token_ids_len_left,
            "input_x_right_len": token_ids_len_right,
            "input_x_len": [token_ids_len_left, token_ids_len_right]
        }
    }
    return export_data
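
Note: export_inputs only declares the serving placeholders and the model-input tensors they feed; the dictionaries it returns still have to be turned into a SavedModel serving signature by a separate export step that is not shown in these examples. The sketch below is one plausible way to do that with the TF 1.x SavedModel utilities; the build_serving_signature helper, the output dictionary, and the scores_tensor name are illustrative assumptions, not part of the original code.

import tensorflow as tf

def build_serving_signature(export_inputs, model_outputs):
  """Wrap export placeholders and model output tensors in a predict signature."""
  input_infos = {
      name: tf.saved_model.utils.build_tensor_info(tensor)
      for name, tensor in export_inputs.items()
  }
  output_infos = {
      name: tf.saved_model.utils.build_tensor_info(tensor)
      for name, tensor in model_outputs.items()
  }
  return tf.saved_model.signature_def_utils.build_signature_def(
      inputs=input_infos,
      outputs=output_infos,
      method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)

# Hypothetical usage with the dictionary returned above:
#   signature = build_serving_signature(export_data["export_inputs"],
#                                       {"scores": scores_tensor})
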
Example #2
    def generate_data(self):
        """Generate data for offline training."""
        (text_left, text_right), label = load_match_raw_data(
            paths=self.paths_after_pre_process, mode=self.mode)

        text_left_placeholder = tf.placeholder(tf.string, name="text_left")
        text_right_placeholder = tf.placeholder(tf.string, name="text_right")
        label_placeholder = tf.placeholder(tf.string, name="label")
        self.init_feed_dict[text_left_placeholder] = text_left
        self.init_feed_dict[text_right_placeholder] = text_right
        self.init_feed_dict[label_placeholder] = label

        text_ds_left = tf.data.Dataset.from_tensor_slices(
            text_left_placeholder)
        text_ds_right = tf.data.Dataset.from_tensor_slices(
            text_right_placeholder)
        input_pipeline_func = self.get_input_pipeline(for_export=False)
        text_ds_left = text_ds_left.map(
            input_pipeline_func, num_parallel_calls=self.num_parallel_calls)
        text_ds_right = text_ds_right.map(
            input_pipeline_func, num_parallel_calls=self.num_parallel_calls)
        text_size_ds_left = text_ds_left.map(
            lambda x: compute_sen_lens(x, padding_token=0),
            num_parallel_calls=self.num_parallel_calls)
        text_size_ds_right = text_ds_right.map(
            lambda x: compute_sen_lens(x, padding_token=0),
            num_parallel_calls=self.num_parallel_calls)
        text_ds_left_right = tf.data.Dataset.zip((text_ds_left, text_ds_right))
        text_len_left_right = tf.data.Dataset.zip(
            (text_size_ds_left, text_size_ds_right))
        if self.infer_without_label:
            data_set_left_right = text_ds_left_right
        else:
            label_ds = load_one_label_dataset(label_placeholder, self.config)
            data_set_left_right = tf.data.Dataset.zip(
                (text_ds_left_right, label_ds))
        vocab_dict = load_vocab_dict(self.text_vocab_file_path)
        vocab_size = len(vocab_dict)
        data_size = len(text_left)
        if self.split_token != "":
            if self.split_token not in vocab_dict:
                raise ValueError(
                    "The Model uses split token: {}, not in corpus.".format(
                        self.split_token))
            self.config['data']['split_token'] = int(
                vocab_dict[self.split_token])
        self.config['data']['vocab_size'] = vocab_size
        self.config['data']['{}_data_size'.format(self.mode)] = data_size
        return data_set_left_right, text_len_left_right
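
Note: the placeholders registered in self.init_feed_dict point at the standard TF 1.x pattern of feeding raw in-memory data into a tf.data pipeline when the iterator is initialized, instead of baking large constants into the graph. A minimal, self-contained sketch of that pattern follows; the placeholder, dataset, and feed values are illustrative, not the framework's actual wiring.

import tensorflow as tf

# Build the pipeline from a placeholder so the raw strings are supplied later.
text_placeholder = tf.placeholder(tf.string, shape=(None,), name="text")
dataset = tf.data.Dataset.from_tensor_slices(text_placeholder).batch(2)

iterator = dataset.make_initializable_iterator()
next_batch = iterator.get_next()

with tf.Session() as sess:
  # init_feed_dict plays the same role as this feed_dict: it maps each
  # placeholder to the in-memory data loaded by generate_data().
  sess.run(iterator.initializer,
           feed_dict={text_placeholder: ["hello world", "foo bar", "baz"]})
  print(sess.run(next_batch))
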
Example #3
    def export_inputs(self):
        """Inputs for exported model."""
        vocab_dict = load_vocab_dict(self.text_vocab_file_path)
        vocab_size = len(vocab_dict)
        label_vocab_dict = load_vocab_dict(self.label_vocab_file_paths[0])
        label_vocab_size = len(label_vocab_dict)
        self.config['data']['vocab_size'] = vocab_size
        self.config['data']['label_vocab_size'] = label_vocab_size

        input_sentence = tf.placeholder(shape=(None, ),
                                        dtype=tf.string,
                                        name="input_sentence")

        input_pipeline_func = self.get_input_pipeline(for_export=True)

        token_ids = input_pipeline_func(input_sentence)
        token_ids_len = tf.map_fn(
            lambda x: compute_sen_lens(x, padding_token=0), token_ids)

        export_data = {
            "export_inputs": {
                "input_sentence": input_sentence
            },
            "model_inputs": {
                "input_enc_x": token_ids,
                "input_x_len": token_ids_len
            }
        }

        return export_data
Example #4
  def generate_data(self):
    """Generate data for offline training."""
    if self.infer_without_label:
      column_num = 1
      text_ds = load_textline_dataset(self.paths_after_pre_process, column_num)
    else:
      column_num = 3
      intent_label_ds, slots_label_ds, text_ds = load_textline_dataset(
          self.paths_after_pre_process, column_num)

    logging.info("Loading text dataset...")
    input_pipeline_func = self.get_input_pipeline(for_export=False)
    text_ds = text_ds.map(
        input_pipeline_func, num_parallel_calls=self.num_parallel_calls)
    text_size_ds = text_ds.map(
        lambda x: compute_sen_lens(x, padding_token=0),
        num_parallel_calls=self.num_parallel_calls)
    text_ds = tf.data.Dataset.zip((text_ds, text_size_ds))

    if self.infer_without_label:
      data_set = text_ds
    else:
      intent_label_ds = process_one_label_dataset(
          intent_label_ds, self.config, output_index=0)
      slots_label_ds = process_multi_label_dataset(
          slots_label_ds, self.config, output_index=1)
      data_set = tf.data.Dataset.zip((text_ds, intent_label_ds, slots_label_ds))

    self.config['data']['vocab_size'] = get_vocab_size(
        self.text_vocab_file_path)
    self.config['data']['{}_data_size'.format(self.mode)] = get_file_len(
        self.paths_after_pre_process)

    return data_set
Example #5
    def generate_data(self):
        """Generate data for offline training."""
        paths = self.paths
        if self.infer_without_label:
            self.column_num = 1
            text_ds = load_textline_dataset(paths, self.column_num)
        else:
            self.column_num = 2
            label_ds, text_ds = load_textline_dataset(paths, self.column_num)

        logging.info("process text ds...")
        input_pipeline_func = self.get_input_pipeline(for_export=False)
        text_ds = text_ds.map(input_pipeline_func,
                              num_parallel_calls=self.num_parallel_calls)
        text_size_ds = text_ds.map(
            lambda x: compute_sen_lens(x, padding_token=0),
            num_parallel_calls=self.num_parallel_calls)
        text_ds = tf.data.Dataset.zip((text_ds, text_size_ds))

        logging.info("process label ds...")
        if self.infer_without_label:
            data_set = text_ds
        else:
            label_ds = process_multi_label_dataset(label_ds, self.config)
            data_set = tf.data.Dataset.zip((text_ds, label_ds))

        self.config['data']['vocab_size'] = get_vocab_size(
            self.text_vocab_file_path)
        self.config['data']['{}_data_size'.format(self.mode)] = get_file_len(
            self.paths)

        return data_set
Example #6
  def export_inputs(self):
    """Inputs for exported model."""
    vocab_dict = load_vocab_dict(self.text_vocab_file_path)
    vocab_size = len(vocab_dict)
    if self.split_token != "":
      if self.split_token not in vocab_dict:
        raise ValueError(
            "The Model uses split token: {}, not in corpus.".format(
                self.split_token))
      self.config['data']['split_token'] = int(vocab_dict[self.split_token])
    self.config['data']['vocab_size'] = vocab_size

    input_sentence = tf.placeholder(
        shape=(None,), dtype=tf.string, name="input_sentence")

    input_pipeline_func = self.get_input_pipeline(for_export=True)

    token_ids = input_pipeline_func(input_sentence)
    token_ids_len = tf.map_fn(lambda x: compute_sen_lens(x, padding_token=0),
                              token_ids)

    export_data = {
        "export_inputs": {
            "input_sentence": input_sentence
        },
        "model_inputs": {
            "input_enc_x": token_ids,
            "input_x_len": token_ids_len
        }
    }

    return export_data
Example #7
  def call(self, inputs, training=None, mask=None):  # pylint: disable=too-many-locals
    input_x = inputs["input_x"]
    if self.use_dense_task:
      dense_input = inputs["input_dense"]

    if self.use_true_length:
      # [batch_size, max_doc_len, max_sen_len]
      input_hx = self.pad_to_hier_input_true_len(
          input_x,
          self.max_doc_len,
          self.max_sen_len,
          self.split_token,
          padding_token=self.padding_token)
    else:
      # [batch_size, max_doc_len, max_sen_len]
      input_hx = self.pad_to_hier_input(
          input_x,
          self.max_doc_len,
          self.max_sen_len,
          padding_token=self.padding_token)

    # [batch_size, max_doc_len]
    sen_lens = compute_sen_lens(input_hx, padding_token=self.padding_token)
    # [batch_size]
    doc_lens = compute_doc_lens(sen_lens)
    # [batch_size, max_doc_len, max_sen_len, 1]
    sen_mask = tf.expand_dims(
        tf.sequence_mask(sen_lens, self.max_sen_len, dtype=tf.float32), axis=-1)

    # [batch_size, max_doc_len, 1]
    doc_mask = tf.expand_dims(
        tf.sequence_mask(doc_lens, self.max_doc_len, dtype=tf.float32), axis=-1)

    # [batch_size, max_doc_len, max_sen_len, embed_len]
    out = self.embed(input_hx)
    if self.use_pretrained_model:
      input_px = self.get_pre_train_graph(input_x)
      input_px = tf.reshape(
          input_px,
          [-1, self.max_doc_len, self.max_sen_len, self.pretrained_model_dim])
      out = tf.concat([out, input_px], axis=-1)
    out = self.embed_d(out, training=training)
    all_sen_encoder = tf.keras.layers.TimeDistributed(self.sen_encoder)
    # [batch_size, max_doc_len, features]
    out = all_sen_encoder(out, training=training, mask=sen_mask)
    # [batch_size, features]
    out = self.doc_encoder(out, training=training, mask=doc_mask)

    if self.use_dense_input:
      dense_out = self.dense_input_linear(dense_input)
      if self.only_dense_input:
        out = dense_out
      else:
        out = tf.keras.layers.Concatenate()([out, dense_out])

    # [batch_size, class_num]
    scores = self.final_dense(out)

    return scores
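
Note: compute_doc_lens is not shown in these examples; from the shape comments above it reduces the [batch_size, max_doc_len] sentence lengths to [batch_size] document lengths. Below is a minimal sketch of one consistent implementation, assuming a document's length is its number of non-empty sentences; this is inferred from the shapes, not copied from the library.

import tensorflow as tf  # TF 1.x style, matching the examples above

def compute_doc_lens(sen_lens):
  """Count sentences with at least one real token.

  sen_lens: [batch_size, max_doc_len] int tensor of per-sentence lengths.
  Returns: [batch_size] int tensor of document lengths.
  """
  non_empty = tf.cast(tf.greater(sen_lens, 0), sen_lens.dtype)
  return tf.reduce_sum(non_empty, axis=-1)
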
Example #8
    def generate_data(self):
        """Generate data for offline training."""

        text, label = load_cls_raw_data(paths=self.paths_after_pre_process,
                                        mode=self.mode)

        text_placeholder = tf.placeholder(tf.string,
                                          shape=(None, ),
                                          name="text")
        label_placeholder = tf.placeholder(tf.string, name="label")
        self.init_feed_dict[text_placeholder] = text
        self.init_feed_dict[label_placeholder] = label
        # logging.debug("init_feed_dict: {}".format(self.init_feed_dict))

        text_ds = tf.data.Dataset.from_tensor_slices(text_placeholder)
        input_pipeline_func = self.get_input_pipeline(for_export=False)

        text_ds = text_ds.map(input_pipeline_func,
                              num_parallel_calls=self.num_parallel_calls)

        text_size_ds = text_ds.map(
            lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX),
            num_parallel_calls=self.num_parallel_calls)

        text_ds = tf.data.Dataset.zip((text_ds, text_size_ds))

        if self.use_dense:
            dense = load_npy(self.dense_npy)
            dense_ds = load_dense_dataset(dense)

        if self.infer_without_label:
            if self.use_dense:
                data_set = tf.data.Dataset.zip((text_ds, dense_ds))
            else:
                data_set = text_ds
        else:
            label_ds = load_one_label_dataset(label_placeholder, self.config)
            if self.use_dense:
                data_set = tf.data.Dataset.zip((text_ds, dense_ds, label_ds))
            else:
                data_set = tf.data.Dataset.zip((text_ds, label_ds))

        vocab_dict = load_vocab_dict(self.text_vocab_file_path)
        vocab_size = len(vocab_dict)
        data_size = len(text)
        if self.split_token != "":
            if self.split_token not in vocab_dict:
                raise ValueError(
                    "The Model uses split token: {}, not in corpus.".format(
                        self.split_token))
            self.config['data']['split_token'] = int(
                vocab_dict[self.split_token])
        self.config['data']['vocab_size'] = vocab_size
        self.config['data']['{}_data_size'.format(self.mode)] = data_size

        return data_set
Example #9
  def load_text_dataset(self, text_ds):
    """Load text data set."""
    logging.info("Loading text dataset...")
    input_pipeline_func = self.get_input_pipeline(for_export=False)
    text_ds = text_ds.map(
        input_pipeline_func, num_parallel_calls=self.num_parallel_calls)
    text_size_ds = text_ds.map(
        lambda x: compute_sen_lens(x, padding_token=0),
        num_parallel_calls=self.num_parallel_calls)
    text_ds = tf.data.Dataset.zip((text_ds, text_size_ds))

    return text_ds
Example #10
    def generate_data(self):
        """Generate data for offline training."""
        if self.infer_without_label:
            column_num = 1
            text_ds = load_textline_dataset(self.paths_after_pre_process,
                                            column_num)
        else:
            column_num = 2
            label_ds, text_ds = load_textline_dataset(
                self.paths_after_pre_process, column_num)

        input_pipeline_func = self.get_input_pipeline(for_export=False)

        text_ds = text_ds.map(input_pipeline_func,
                              num_parallel_calls=self.num_parallel_calls)

        text_size_ds = text_ds.map(
            lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX),
            num_parallel_calls=self.num_parallel_calls)

        text_ds = tf.data.Dataset.zip((text_ds, text_size_ds))

        if self.use_dense:
            dense = load_npy(self.dense_npy)
            dense_ds = load_dense_dataset(dense)

        if self.infer_without_label:
            if self.use_dense:
                data_set = tf.data.Dataset.zip((text_ds, dense_ds))
            else:
                data_set = text_ds
        else:
            label_ds = process_one_label_dataset(label_ds, self.config)
            if self.use_dense:
                data_set = tf.data.Dataset.zip((text_ds, dense_ds, label_ds))
            else:
                data_set = tf.data.Dataset.zip((text_ds, label_ds))

        vocab_dict = load_vocab_dict(self.text_vocab_file_path)
        vocab_size = len(vocab_dict)
        if self.split_token != "":
            if self.split_token not in vocab_dict:
                raise ValueError(
                    "The Model uses split token: {}, not in corpus.".format(
                        self.split_token))
            self.config['data']['split_token'] = int(
                vocab_dict[self.split_token])
        self.config['data']['vocab_size'] = vocab_size
        self.config['data']['{}_data_size'.format(self.mode)] = get_file_len(
            self.paths_after_pre_process)

        return data_set
Example #11
    def generate_data(self):
        """Generate data for offline training."""
        if self.infer_without_label:
            column_num = 2
            text_ds_left, text_ds_right = load_textline_dataset(
                self.paths_after_pre_process, column_num)
        else:
            column_num = 3
            label, text_ds_left, text_ds_right = load_textline_dataset(
                self.paths_after_pre_process, column_num)

        input_pipeline_func = self.get_input_pipeline(for_export=False)
        text_ds_left = text_ds_left.map(
            input_pipeline_func, num_parallel_calls=self.num_parallel_calls)
        text_ds_right = text_ds_right.map(
            input_pipeline_func, num_parallel_calls=self.num_parallel_calls)
        text_size_ds_left = text_ds_left.map(
            lambda x: compute_sen_lens(x, padding_token=0),
            num_parallel_calls=self.num_parallel_calls)
        text_size_ds_right = text_ds_right.map(
            lambda x: compute_sen_lens(x, padding_token=0),
            num_parallel_calls=self.num_parallel_calls)
        text_ds_left_right = tf.data.Dataset.zip((text_ds_left, text_ds_right))
        text_len_left_right = tf.data.Dataset.zip(
            (text_size_ds_left, text_size_ds_right))
        if self.infer_without_label:
            data_set_left_right = text_ds_left_right
        else:
            label_ds = process_one_label_dataset(label, self.config)
            data_set_left_right = tf.data.Dataset.zip(
                (text_ds_left_right, label_ds))
        vocab_dict = load_vocab_dict(self.text_vocab_file_path)
        vocab_size = len(vocab_dict)

        self.config['data']['vocab_size'] = vocab_size
        self.config['data']['{}_data_size'.format(self.mode)] = get_file_len(
            self.paths_after_pre_process)

        return data_set_left_right, text_len_left_right
Example #12
    def call(self, inputs, training=None, mask=None):
        enc_inputs = inputs["input_enc_x"]
        seq_enc_len = compute_sen_lens(enc_inputs,
                                       padding_token=self.padding_token)
        enc_mask = self.mask_layer(enc_inputs)
        enc_inputs = self.embed(enc_inputs)
        enc_inputs = self.embed_d(enc_inputs)
        enc_outputs, enc_state = self.encoder(enc_inputs,
                                              training=training,
                                              mask=enc_mask)
        if self.is_infer:
            dec_outputs = self.decoder([enc_outputs, enc_state, seq_enc_len],
                                       training=training)
            return dec_outputs

        else:
            dec_inputs = inputs["input_dec_x"]
            seq_dec_len = compute_sen_lens(dec_inputs,
                                           padding_token=self.padding_token)
            dec_outputs = self.decoder(
                [dec_inputs, seq_dec_len, enc_outputs, enc_state, seq_enc_len],
                training=training)
            return dec_outputs
Example #13
    def test_compute_sen_lens(self):
        sentences = tf.placeholder(dtype=tf.int32)
        lens = compute_sen_lens(sentences)

        with self.cached_session(use_gpu=False, force_gpu=False) as sess:
            # test for 1d
            res = sess.run(lens, feed_dict={sentences: [1, 2, 0, 0]})
            self.assertEqual(res, 2)

            # test for 2d
            res = sess.run(lens,
                           feed_dict={sentences: [[1, 2, 0, 0], [1, 2, 3, 4]]})
            self.assertAllEqual(res, [2, 4])

            # test for 3d
            res = sess.run(lens,
                           feed_dict={
                               sentences: [[[1, 2, 0, 0]], [[1, 2, 3, 4]],
                                           [[1, 0, 0, 0]]]
                           })
            self.assertAllEqual(res, [[2], [4], [1]])
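
Note: the test above pins down the contract of compute_sen_lens itself: for a [..., max_len] tensor of token ids it returns a [...] tensor counting the non-padding entries along the last axis. A reference sketch consistent with those assertions (inferred from the test, not copied from the library):

import tensorflow as tf

def compute_sen_lens(sentences, padding_token=0):
  """Sentence lengths as the count of non-padding tokens on the last axis."""
  is_token = tf.not_equal(sentences, padding_token)   # True for real tokens
  return tf.reduce_sum(tf.cast(is_token, tf.int32), axis=-1)
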
Example #14
  def call(self, inputs, training=None, mask=None):
    input_x = inputs["input_x"]
    # [batch_size, max_len]
    input_x_lens = compute_sen_lens(input_x, padding_token=self.padding_token)
    # [batch_size, max_len, 1]
    mask = tf.expand_dims(
        tf.sequence_mask(input_x_lens, self.max_len, dtype=tf.float32),
        axis=-1)
    # [batch_size, max_len, embed_len]
    out = self.embed(input_x)
    # [batch_size, features]
    out = self.embed_dropout(out, training=training)
    out = self.bi_rnn(out)
    intent_out = self.attention(out, mask=mask)
    intent_out = self.dropout(intent_out)
    intent_out = self.intent_dense(intent_out)
    intent_out = tf.identity(intent_out, name="intent_logits")
    slots_out = self.dropout(out)
    slots_out = self.slots_dense(slots_out)
    slots_out = tf.identity(slots_out, name="slots_logits")
    return intent_out, slots_out
Example #15
    def generate_data(self):
        """Generate data for offline training."""

        column_num = 1
        src_path = self.src_paths_after_pre_process
        target_path = self.tgt_paths_after_pre_process

        src_ds = load_textline_dataset([src_path], column_num)

        src_ds = src_ds[0]

        input_pipeline_func = self.get_input_pipeline(for_export=False)

        src_ds = src_ds.map(input_pipeline_func,
                            num_parallel_calls=self.num_parallel_calls)

        src_size_ds = src_ds.map(
            lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX),
            num_parallel_calls=self.num_parallel_calls)

        src_ds = src_ds.map(self.exclude_padding,
                            num_parallel_calls=self.num_parallel_calls)

        if self.infer_without_label:
            data_set = tf.data.Dataset.zip((src_ds, src_size_ds))

        else:
            tgt = load_textline_dataset([target_path], column_num)
            tgt = tgt[0]
            tgt_out_ds = tgt.map(lambda x: x + ' ' + self.END_TOKEN)
            tgt_in_ds = tgt.map(lambda x: self.START_TOKEN + ' ' + x)

            tgt_in_ds = tgt_in_ds.map(
                lambda batch: self.text_pipeline_func(
                    batch, self.max_dec_len, self.text_vocab_file_path),
                num_parallel_calls=self.num_parallel_calls)

            tgt_in_size_ds = tgt_in_ds.map(
                lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX),
                num_parallel_calls=self.num_parallel_calls)

            tgt_in_ds = tgt_in_ds.map(
                self.exclude_padding,
                num_parallel_calls=self.num_parallel_calls)

            inp_ds = tf.data.Dataset.zip(
                (src_ds, src_size_ds, tgt_in_ds, tgt_in_size_ds))

            if self.use_label_vocab:
                target_vocab_file_path = self.label_vocab_file_paths[0]
            else:
                target_vocab_file_path = self.text_vocab_file_path
            tgt_out_ds = tgt_out_ds.map(
                lambda batch: self.text_pipeline_func(batch, self.max_dec_len,
                                                      target_vocab_file_path),
                num_parallel_calls=self.num_parallel_calls)

            tgt_out_ds = tgt_out_ds.map(
                self.exclude_padding,
                num_parallel_calls=self.num_parallel_calls)
            data_set = tf.data.Dataset.zip((inp_ds, tgt_out_ds))

        vocab_dict = load_vocab_dict(self.text_vocab_file_path)
        vocab_size = len(vocab_dict)
        label_vocab_dict = load_vocab_dict(self.label_vocab_file_paths[0])
        label_vocab_size = len(label_vocab_dict)
        data_size = get_file_len(self.src_paths_after_pre_process)
        self.config['data']['vocab_size'] = vocab_size
        self.config['data']['label_vocab_size'] = label_vocab_size
        self.config['data']['{}_data_size'.format(self.mode)] = data_size

        return data_set
Example #16
  def generate_data(self):
    """Generate data for offline training."""

    src = load_seq2seq_raw_data(paths=self.src_paths_after_pre_process)
    tgt = load_seq2seq_raw_data(paths=self.tgt_paths_after_pre_process)

    tgt_out = [abs_ + ' ' + self.END_TOKEN for abs_ in tgt]
    tgt_in = [self.START_TOKEN + ' ' + abs_ for abs_ in tgt]

    assert len(src) == len(tgt_in)
    src_placeholder = tf.placeholder(tf.string, shape=(None,), name="src")
    tgt_out_placeholder = tf.placeholder(tf.string, name="tgt_out")
    tgt_in_placeholder = tf.placeholder(tf.string, name="tgt_in")
    self.init_feed_dict[src_placeholder] = src
    self.init_feed_dict[tgt_out_placeholder] = tgt_out
    self.init_feed_dict[tgt_in_placeholder] = tgt_in
    src_ds = tf.data.Dataset.from_tensor_slices(src_placeholder)
    tgt_in_ds = tf.data.Dataset.from_tensor_slices(tgt_in_placeholder)

    tgt_out_ds = tf.data.Dataset.from_tensor_slices(tgt_out_placeholder)

    input_pipeline_func = self.get_input_pipeline(for_export=False)

    src_ds = src_ds.map(
        input_pipeline_func, num_parallel_calls=self.num_parallel_calls)

    src_size_ds = src_ds.map(
        lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX),
        num_parallel_calls=self.num_parallel_calls)

    src_ds = src_ds.map(
        self.exclude_padding, num_parallel_calls=self.num_parallel_calls)

    tgt_in_ds = tgt_in_ds.map(
        lambda batch: self.text_pipeline_func(batch, self.max_dec_len,
                                              self.text_vocab_file_path),
        num_parallel_calls=self.num_parallel_calls)

    tgt_in_size_ds = tgt_in_ds.map(
        lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX),
        num_parallel_calls=self.num_parallel_calls)

    tgt_in_ds = tgt_in_ds.map(
        self.exclude_padding, num_parallel_calls=self.num_parallel_calls)

    inp_ds = tf.data.Dataset.zip(
        (src_ds, src_size_ds, tgt_in_ds, tgt_in_size_ds))

    if self.infer_without_label:
      data_set = inp_ds
    else:
      if self.use_label_vocab:
        target_vocab_file_path = self.label_vocab_file_paths[0]
      else:
        target_vocab_file_path = self.text_vocab_file_path
      tgt_out_ds = tgt_out_ds.map(
          lambda batch: self.text_pipeline_func(batch, self.max_dec_len,
                                                target_vocab_file_path),
          num_parallel_calls=self.num_parallel_calls)

      tgt_out_ds = tgt_out_ds.map(
          self.exclude_padding, num_parallel_calls=self.num_parallel_calls)
      data_set = tf.data.Dataset.zip((inp_ds, tgt_out_ds))

    vocab_dict = load_vocab_dict(self.text_vocab_file_path)
    vocab_size = len(vocab_dict)
    label_vocab_dict = load_vocab_dict(self.label_vocab_file_paths[0])
    label_vocab_size = len(label_vocab_dict)
    data_size = len(src)
    self.config['data']['vocab_size'] = vocab_size
    self.config['data']['label_vocab_size'] = label_vocab_size
    self.config['data']['{}_data_size'.format(self.mode)] = data_size

    return data_set
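
Note: both seq2seq examples stop at the zipped ((src, src_size, tgt_in, tgt_in_size), tgt_out) dataset; before training, the variable-length sequences typically still need to be padded and batched. A hedged sketch of that step follows; the batch_seq2seq helper, batch size, and padded shapes are illustrative and depend on the pipeline above.

import tensorflow as tf

def batch_seq2seq(data_set, batch_size=32):
  """Pad variable-length sequences and batch the zipped seq2seq dataset."""
  padded_shapes = (
      (tf.TensorShape([None]),   # src token ids
       tf.TensorShape([]),       # src length
       tf.TensorShape([None]),   # tgt_in token ids
       tf.TensorShape([])),      # tgt_in length
      tf.TensorShape([None]))    # tgt_out token ids
  return data_set.padded_batch(batch_size, padded_shapes=padded_shapes)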