Example #1
  def generate_data(self):
    """Generate data for offline training."""
    if self.infer_without_label:
      column_num = 1
      text_ds = load_textline_dataset(self.paths_after_pre_process, column_num)
    else:
      column_num = 3
      intent_label_ds, slots_label_ds, text_ds = load_textline_dataset(
          self.paths_after_pre_process, column_num)

    logging.info("Loading text dataset...")
    input_pipeline_func = self.get_input_pipeline(for_export=False)
    text_ds = text_ds.map(
        input_pipeline_func, num_parallel_calls=self.num_parallel_calls)
    text_size_ds = text_ds.map(
        lambda x: compute_sen_lens(x, padding_token=0),
        num_parallel_calls=self.num_parallel_calls)
    text_ds = tf.data.Dataset.zip((text_ds, text_size_ds))

    if self.infer_without_label:
      data_set = text_ds
    else:
      intent_label_ds = process_one_label_dataset(
          intent_label_ds, self.config, output_index=0)
      slots_label_ds = process_multi_label_dataset(
          slots_label_ds, self.config, output_index=1)
      data_set = tf.data.Dataset.zip((text_ds, intent_label_ds, slots_label_ds))

    self.config['data']['vocab_size'] = get_vocab_size(
        self.text_vocab_file_path)
    self.config['data']['{}_data_size'.format(self.mode)] = get_file_len(
        self.paths_after_pre_process)

    return data_set
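
All of the examples on this page derive sequence lengths with `compute_sen_lens`. The helper itself is not shown here; a minimal sketch of the assumed behaviour (counting non-padding tokens along the last axis) looks like this:

import tensorflow as tf

def compute_sen_lens(sentences, padding_token=0):
  # Sketch: count the tokens that differ from `padding_token` in each
  # (padded) id sequence; axis=-1 keeps any leading batch dimensions.
  non_padding = tf.not_equal(sentences, padding_token)
  return tf.reduce_sum(tf.cast(non_padding, tf.int32), axis=-1)

Note that Example #1 passes `padding_token=0` while Example #3 uses `utils.PAD_IDX`; the sketch works for either convention.
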
Example #2
    def generate_data(self):
        """Generate data for offline training."""
        paths = self.paths
        if self.infer_without_label:
            self.column_num = 1
            text_ds = load_textline_dataset(paths, self.column_num)
        else:
            self.column_num = 2
            label_ds, text_ds = load_textline_dataset(paths, self.column_num)

        logging.info("process text ds...")
        input_pipeline_func = self.get_input_pipeline(for_export=False)
        text_ds = text_ds.map(input_pipeline_func,
                              num_parallel_calls=self.num_parallel_calls)
        text_size_ds = text_ds.map(
            lambda x: compute_sen_lens(x, padding_token=0),
            num_parallel_calls=self.num_parallel_calls)
        text_ds = tf.data.Dataset.zip((text_ds, text_size_ds))

        logging.info("process label ds...")
        if self.infer_without_label:
            data_set = text_ds
        else:
            label_ds = process_multi_label_dataset(label_ds, self.config)
            data_set = tf.data.Dataset.zip((text_ds, label_ds))

        self.config['data']['vocab_size'] = get_vocab_size(
            self.text_vocab_file_path)
        self.config['data']['{}_data_size'.format(self.mode)] = get_file_len(
            self.paths)

        return data_set
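
`load_textline_dataset` is the shared entry point of these examples: it reads text files and returns one string dataset per column. A hedged sketch, assuming tab-separated columns (both the delimiter and the exact return convention are assumptions; the examples themselves differ on the single-column case, compare Example #1, which uses the result directly, with Example #7, which indexes into it):

import tensorflow as tf

def load_textline_dataset(paths, column_num):
  # Sketch: split each tab-separated line of the input files into
  # `column_num` columns and expose each column as its own dataset.
  lines = tf.data.TextLineDataset(paths)
  parsed = lines.map(
      lambda line: tf.io.decode_csv(
          line,
          record_defaults=[''] * column_num,
          field_delim='\t',
          use_quote_delim=False))
  return tuple(
      parsed.map(lambda *cols, i=i: cols[i]) for i in range(column_num))
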
Example #3
    def generate_data(self):
        """Generate data for offline training."""
        if self.infer_without_label:
            column_num = 1
            text_ds = load_textline_dataset(self.paths_after_pre_process,
                                            column_num)
        else:
            column_num = 2
            label_ds, text_ds = load_textline_dataset(
                self.paths_after_pre_process, column_num)

        input_pipeline_func = self.get_input_pipeline(for_export=False)

        text_ds = text_ds.map(input_pipeline_func,
                              num_parallel_calls=self.num_parallel_calls)

        text_size_ds = text_ds.map(
            lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX),
            num_parallel_calls=self.num_parallel_calls)

        text_ds = tf.data.Dataset.zip((text_ds, text_size_ds))

        if self.use_dense:
            dense = load_npy(self.dense_npy)
            dense_ds = load_dense_dataset(dense)

        if self.infer_without_label:
            if self.use_dense:
                data_set = tf.data.Dataset.zip((text_ds, dense_ds))
            else:
                data_set = text_ds
        else:
            label_ds = process_one_label_dataset(label_ds, self.config)
            if self.use_dense:
                data_set = tf.data.Dataset.zip((text_ds, dense_ds, label_ds))
            else:
                data_set = tf.data.Dataset.zip((text_ds, label_ds))

        vocab_dict = load_vocab_dict(self.text_vocab_file_path)
        vocab_size = len(vocab_dict)
        if self.split_token != "":
            if self.split_token not in vocab_dict:
                raise ValueError(
                    "The Model uses split token: {}, not in corpus.".format(
                        self.split_token))
            self.config['data']['split_token'] = int(
                vocab_dict[self.split_token])
        self.config['data']['vocab_size'] = vocab_size
        self.config['data']['{}_data_size'.format(self.mode)] = get_file_len(
            self.paths_after_pre_process)

        return data_set
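
Example #3 sizes the vocabulary with `load_vocab_dict` instead of `get_vocab_size`, and also resolves the id of an optional `split_token`. A minimal sketch of the loader, assuming a `token<TAB>id` file layout; keeping the ids as strings is consistent with the `int(vocab_dict[self.split_token])` cast above:

def load_vocab_dict(vocab_path):
  # Sketch: read one `token<TAB>id` pair per line into a dict.
  vocab = {}
  with open(vocab_path, encoding='utf-8') as vocab_file:
    for line in vocab_file:
      token, idx = line.rstrip('\n').split('\t')
      vocab[token] = idx
  return vocab
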
Example #4
  def load_a_raw_file(self, one_path, infer_without_label):
    """
    Load a raw file. Return text and label.
    For single text input, text: [sentence1, ...]
    For multiple text inputs, text: [[sentence1_1, ...], [sentence1_2, ...]]
    For single output, label: [label1, label2, ...]
    For multiple outputs, label: [[label1_1, ...], [label1_2, ...]]
    """
    column_num = 1
    text_path, target_path = one_path
    texts = load_textline_dataset([text_path], column_num)
    if not infer_without_label:
      target = load_textline_dataset([target_path], column_num)
      return texts + target, target
    return texts, []
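
A detail worth noting in this seq2seq variant: when labels are available, the target is returned both as part of the inputs (`texts + target`) and as the label, presumably so the decoder side of the pipeline can consume the target for teacher forcing.
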
Example #5
    def generate_data(self):
        """Generate data for offline training."""
        if self.infer_without_label:
            column_num = 2
            text_ds_left, text_ds_right = load_textline_dataset(
                self.paths_after_pre_process, column_num)
        else:
            column_num = 3
            label, text_ds_left, text_ds_right = load_textline_dataset(
                self.paths_after_pre_process, column_num)

        input_pipeline_func = self.get_input_pipeline(for_export=False)
        text_ds_left = text_ds_left.map(
            input_pipeline_func, num_parallel_calls=self.num_parallel_calls)
        text_ds_right = text_ds_right.map(
            input_pipeline_func, num_parallel_calls=self.num_parallel_calls)
        text_size_ds_left = text_ds_left.map(
            lambda x: compute_sen_lens(x, padding_token=0),
            num_parallel_calls=self.num_parallel_calls)
        text_size_ds_right = text_ds_right.map(
            lambda x: compute_sen_lens(x, padding_token=0),
            num_parallel_calls=self.num_parallel_calls)
        text_ds_left_right = tf.data.Dataset.zip((text_ds_left, text_ds_right))
        text_len_left_right = tf.data.Dataset.zip(
            (text_size_ds_left, text_size_ds_right))
        if self.infer_without_label:
            data_set_left_right = text_ds_left_right
        else:
            label_ds = process_one_label_dataset(label, self.config)
            data_set_left_right = tf.data.Dataset.zip(
                (text_ds_left_right, label_ds))
        vocab_dict = load_vocab_dict(self.text_vocab_file_path)
        vocab_size = len(vocab_dict)

        self.config['data']['vocab_size'] = vocab_size
        self.config['data']['{}_data_size'.format(self.mode)] = get_file_len(
            self.paths_after_pre_process)

        return data_set_left_right, text_len_left_right
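
Every `generate_data` variant records `{mode}_data_size` via `get_file_len`. Presumably this just counts lines across the input files; a sketch under that assumption:

def get_file_len(paths):
  # Sketch: total number of example lines across the given text files.
  total = 0
  for path in paths:
    with open(path, encoding='utf-8') as one_file:
      total += sum(1 for _ in one_file)
  return total
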
Example #6
 def load_a_raw_file(self, one_path, infer_without_label):
     """
 Load a raw file. Return text and label.
 For single text input, text: [sentence1, ...]
 For multiple text inputs, text: [[sentence1_1, ...], [sentence1_2, ...]]
 For single output, label: [label1, label2, ...]
 For multiple outputs, label: [[label1_1, ...], [label1_2, ...]]
 """
     if infer_without_label:
         column_num = 1
     else:
         column_num = 2
     ds_list = load_textline_dataset(one_path, column_num)
     if infer_without_label:
         text = ds_list
         label = []  # TODO: to modify
     else:
         text = ds_list[1:]
         label = ds_list[:1]
     return (text, label)
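
`process_one_label_dataset` and `process_multi_label_dataset` convert the raw label strings into model-ready tensors. The real implementations live in the surrounding library; a rough sketch of the single-label case, where the lookup-table approach and the config layout shown are assumptions:

import tensorflow as tf

def process_one_label_dataset(label_ds, config, output_index=None):
  # Sketch: map label strings to integer class ids via a static lookup
  # table. The config path below is an assumed layout, not the real one.
  classes = config['data']['task']['classes']['vocab']
  table = tf.lookup.StaticHashTable(
      tf.lookup.KeyValueTensorInitializer(
          keys=tf.constant(list(classes.keys())),
          values=tf.constant(list(classes.values()), dtype=tf.int64)),
      default_value=-1)
  return label_ds.map(table.lookup)
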
Example #7
    def generate_data(self):
        """Generate data for offline training."""

        column_num = 1
        src_path = self.src_paths_after_pre_process
        target_path = self.tgt_paths_after_pre_process

        src_ds = load_textline_dataset([src_path], column_num)

        src_ds = src_ds[0]

        input_pipeline_func = self.get_input_pipeline(for_export=False)

        src_ds = src_ds.map(input_pipeline_func,
                            num_parallel_calls=self.num_parallel_calls)

        src_size_ds = src_ds.map(
            lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX),
            num_parallel_calls=self.num_parallel_calls)

        src_ds = src_ds.map(self.exclude_padding,
                            num_parallel_calls=self.num_parallel_calls)

        if self.infer_without_label:
            data_set = tf.data.Dataset.zip((src_ds, src_size_ds))

        else:
            tgt = load_textline_dataset([target_path], column_num)
            tgt = tgt[0]
            tgt_out_ds = tgt.map(lambda x: x + ' ' + self.END_TOKEN)
            tgt_in_ds = tgt.map(lambda x: self.START_TOKEN + ' ' + x)

            tgt_in_ds = tgt_in_ds.map(
                lambda batch: self.text_pipeline_func(
                    batch, self.max_dec_len, self.text_vocab_file_path),
                num_parallel_calls=self.num_parallel_calls)

            tgt_in_size_ds = tgt_in_ds.map(
                lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX),
                num_parallel_calls=self.num_parallel_calls)

            tgt_in_ds = tgt_in_ds.map(
                self.exclude_padding,
                num_parallel_calls=self.num_parallel_calls)

            inp_ds = tf.data.Dataset.zip(
                (src_ds, src_size_ds, tgt_in_ds, tgt_in_size_ds))

            if self.use_label_vocab:
                target_vocab_file_path = self.label_vocab_file_paths[0]
            else:
                target_vocab_file_path = self.text_vocab_file_path
            tgt_out_ds = tgt_out_ds.map(
                lambda batch: self.text_pipeline_func(batch, self.max_dec_len,
                                                      target_vocab_file_path),
                num_parallel_calls=self.num_parallel_calls)

            tgt_out_ds = tgt_out_ds.map(
                self.exclude_padding,
                num_parallel_calls=self.num_parallel_calls)
            data_set = tf.data.Dataset.zip((inp_ds, tgt_out_ds))

        vocab_dict = load_vocab_dict(self.text_vocab_file_path)
        vocab_size = len(vocab_dict)
        label_vocab_dict = load_vocab_dict(self.label_vocab_file_paths[0])
        label_vocab_size = len(label_vocab_dict)
        data_size = get_file_len(self.src_paths_after_pre_process)
        self.config['data']['vocab_size'] = vocab_size
        self.config['data']['label_vocab_size'] = label_vocab_size
        self.config['data']['{}_data_size'.format(self.mode)] = data_size

        return data_set
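
For context, a hypothetical driver for any of these task classes; the class name `SomeTextTask` and its constructor signature are placeholders, not part of the library:

# Hypothetical usage sketch: the names below are placeholders.
task = SomeTextTask(config, mode='train')
data_set = task.generate_data()
data_set = data_set.shuffle(buffer_size=10000).batch(32)
for batch in data_set.take(1):
  print(batch)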