Example #1
    def export_inputs(self):
        """Inputs for exported model."""
        vocab_dict = load_vocab_dict(self.text_vocab_file_path)
        vocab_size = len(vocab_dict)
        label_vocab_dict = load_vocab_dict(self.label_vocab_file_paths[0])
        label_vocab_size = len(label_vocab_dict)
        self.config['data']['vocab_size'] = vocab_size
        self.config['data']['label_vocab_size'] = label_vocab_size

        input_sentence = tf.placeholder(shape=(None, ),
                                        dtype=tf.string,
                                        name="input_sentence")

        input_pipeline_func = self.get_input_pipeline(for_export=True)

        token_ids = input_pipeline_func(input_sentence)
        token_ids_len = tf.map_fn(
            lambda x: compute_sen_lens(x, padding_token=0), token_ids)

        export_data = {
            "export_inputs": {
                "input_sentence": input_sentence
            },
            "model_inputs": {
                "input_enc_x": token_ids,
                "input_x_len": token_ids_len
            }
        }

        return export_data
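Every example in this section relies on compute_sen_lens to recover the true sentence length from padded id sequences. A minimal sketch of such a helper, assuming it simply counts the ids that differ from the padding token (the actual implementation in the source project may differ):

import tensorflow as tf  # TF 1.x style API, matching the examples


def compute_sen_lens(sentences, padding_token=0):
    """Count non-padding tokens along the last axis (illustrative sketch)."""
    not_padding = tf.cast(tf.not_equal(sentences, padding_token), tf.int32)
    return tf.reduce_sum(not_padding, axis=-1)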
Example #2
  def export_inputs(self):
    """Inputs for exported model."""
    vocab_dict = load_vocab_dict(self.text_vocab_file_path)
    vocab_size = len(vocab_dict)
    self.config['data']['vocab_size'] = vocab_size

    input_sent_left = tf.placeholder(
        shape=(None,), dtype=tf.string, name="input_sent_left")
    input_sent_right = tf.placeholder(
        shape=(None,), dtype=tf.string, name="input_sent_right")
    input_pipeline_func = self.get_input_pipeline(for_export=True)

    token_ids_left = input_pipeline_func(input_sent_left)
    token_ids_right = input_pipeline_func(input_sent_right)
    token_ids_len_left = tf.map_fn(
        lambda x: compute_sen_lens(x, padding_token=0), token_ids_left)
    token_ids_len_right = tf.map_fn(
        lambda x: compute_sen_lens(x, padding_token=0), token_ids_right)

    export_data = {
        "export_inputs": {
            "input_sent_left": input_sent_left,
            "input_sent_right": input_sent_right,
        },
        "model_inputs": {
            "input_x_left": token_ids_left,
            "input_x_right": token_ids_right,
            "input_x_left_len": token_ids_len_left,
            "input_x_right_len": token_ids_len_right,
            "input_x_len": [token_ids_len_left, token_ids_len_right]
        }
    }
    return export_data
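The export_data dictionary returned by export_inputs is intended to feed a TF 1.x SavedModel export step. A hedged sketch of how those export inputs might be wrapped into a serving signature; the function name and the logits tensor are illustrative, not part of the original code:

import tensorflow as tf


def build_serving_signature(export_data, logits):
    # Wrap each exported placeholder and the model output as TensorInfo protos.
    inputs = {name: tf.saved_model.utils.build_tensor_info(tensor)
              for name, tensor in export_data["export_inputs"].items()}
    outputs = {"logits": tf.saved_model.utils.build_tensor_info(logits)}
    return tf.saved_model.signature_def_utils.build_signature_def(
        inputs=inputs,
        outputs=outputs,
        method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)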
Example #3
  def export_inputs(self):
    """Inputs for exported model."""
    vocab_dict = load_vocab_dict(self.text_vocab_file_path)
    vocab_size = len(vocab_dict)
    if self.split_token != "":
      if self.split_token not in vocab_dict:
        raise ValueError(
            "The Model uses split token: {}, not in corpus.".format(
                self.split_token))
      self.config['data']['split_token'] = int(vocab_dict[self.split_token])
    self.config['data']['vocab_size'] = vocab_size

    input_sentence = tf.placeholder(
        shape=(None,), dtype=tf.string, name="input_sentence")

    input_pipeline_func = self.get_input_pipeline(for_export=True)

    token_ids = input_pipeline_func(input_sentence)
    token_ids_len = tf.map_fn(lambda x: compute_sen_lens(x, padding_token=0),
                              token_ids)

    export_data = {
        "export_inputs": {
            "input_sentence": input_sentence
        },
        "model_inputs": {
            "input_enc_x": token_ids,
            "input_x_len": token_ids_len
        }
    }

    return export_data
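Example #3 additionally looks up split_token in the dictionary returned by load_vocab_dict. A minimal sketch of such a loader, assuming one token per line with the line index used as its id; the real vocabulary file may store explicit ids instead:

def load_vocab_dict(vocab_file_path):
    # Hypothetical format: one token per line; the line index becomes the id.
    vocab = {}
    with open(vocab_file_path, encoding="utf-8") as vocab_file:
        for idx, line in enumerate(vocab_file):
            token = line.strip()
            if token:
                vocab[token] = idx
    return vocab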
Example #4
    def generate_data(self):
        """Generate data for offline training."""

        text, label = load_cls_raw_data(paths=self.paths_after_pre_process,
                                        mode=self.mode)

        text_placeholder = tf.placeholder(tf.string,
                                          shape=(None, ),
                                          name="text")
        label_placeholder = tf.placeholder(tf.string, name="label")
        self.init_feed_dict[text_placeholder] = text
        self.init_feed_dict[label_placeholder] = label
        # logging.debug("init_feed_dict: {}".format(self.init_feed_dict))

        text_ds = tf.data.Dataset.from_tensor_slices(text_placeholder)
        input_pipeline_func = self.get_input_pipeline(for_export=False)

        text_ds = text_ds.map(input_pipeline_func,
                              num_parallel_calls=self.num_parallel_calls)

        text_size_ds = text_ds.map(
            lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX),
            num_parallel_calls=self.num_parallel_calls)

        text_ds = tf.data.Dataset.zip((text_ds, text_size_ds))

        if self.use_dense:
            dense = load_npy(self.dense_npy)
            dense_ds = load_dense_dataset(dense)

        if self.infer_without_label:
            if self.use_dense:
                data_set = tf.data.Dataset.zip((text_ds, dense_ds))
            else:
                data_set = text_ds
        else:
            label_ds = load_one_label_dataset(label_placeholder, self.config)
            if self.use_dense:
                data_set = tf.data.Dataset.zip((text_ds, dense_ds, label_ds))
            else:
                data_set = tf.data.Dataset.zip((text_ds, label_ds))

        vocab_dict = load_vocab_dict(self.text_vocab_file_path)
        vocab_size = len(vocab_dict)
        data_size = len(text)
        if self.split_token != "":
            if self.split_token not in vocab_dict:
                raise ValueError(
                    "The Model uses split token: {}, not in corpus.".format(
                        self.split_token))
            self.config['data']['split_token'] = int(
                vocab_dict[self.split_token])
        self.config['data']['vocab_size'] = vocab_size
        self.config['data']['{}_data_size'.format(self.mode)] = data_size

        return data_set
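Because this variant builds the dataset from placeholders, the raw text must be fed exactly once through init_feed_dict when the iterator is initialized. A hypothetical usage sketch, assuming task is an instance of the class above:

import tensorflow as tf

data_set = task.generate_data().batch(32)
iterator = data_set.make_initializable_iterator()
next_batch = iterator.get_next()

with tf.Session() as sess:
    # Feed the placeholders created in generate_data; also initialize lookup
    # tables in case the text pipeline uses them.
    sess.run(tf.tables_initializer())
    sess.run(iterator.initializer, feed_dict=task.init_feed_dict)
    while True:
        try:
            features = sess.run(next_batch)
        except tf.errors.OutOfRangeError:
            break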
Example #5
    def generate_data(self):
        """Generate data for offline training."""
        if self.infer_without_label:
            column_num = 1
            text_ds = load_textline_dataset(self.paths_after_pre_process,
                                            column_num)
        else:
            column_num = 2
            label_ds, text_ds = load_textline_dataset(
                self.paths_after_pre_process, column_num)

        input_pipeline_func = self.get_input_pipeline(for_export=False)

        text_ds = text_ds.map(input_pipeline_func,
                              num_parallel_calls=self.num_parallel_calls)

        text_size_ds = text_ds.map(
            lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX),
            num_parallel_calls=self.num_parallel_calls)

        text_ds = tf.data.Dataset.zip((text_ds, text_size_ds))

        if self.use_dense:
            dense = load_npy(self.dense_npy)
            dense_ds = load_dense_dataset(dense)

        if self.infer_without_label:
            if self.use_dense:
                data_set = tf.data.Dataset.zip((text_ds, dense_ds))
            else:
                data_set = text_ds
        else:
            label_ds = process_one_label_dataset(label_ds, self.config)
            if self.use_dense:
                data_set = tf.data.Dataset.zip((text_ds, dense_ds, label_ds))
            else:
                data_set = tf.data.Dataset.zip((text_ds, label_ds))

        vocab_dict = load_vocab_dict(self.text_vocab_file_path)
        vocab_size = len(vocab_dict)
        if self.split_token != "":
            if self.split_token not in vocab_dict:
                raise ValueError(
                    "The Model uses split token: {}, not in corpus.".format(
                        self.split_token))
            self.config['data']['split_token'] = int(
                vocab_dict[self.split_token])
        self.config['data']['vocab_size'] = vocab_size
        self.config['data']['{}_data_size'.format(self.mode)] = get_file_len(
            self.paths_after_pre_process)

        return data_set
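This textline-based variant records the dataset size with get_file_len instead of len(text). A plain-Python sketch of such a helper, assuming one example per line in the preprocessed files:

def get_file_len(paths):
    # Hypothetical helper: total number of lines, summed over all files.
    total = 0
    for path in paths:
        with open(path, encoding="utf-8") as in_file:
            total += sum(1 for _ in in_file)
    return total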
Example #6
    def generate_data(self):
        """Generate data for offline training."""
        (text_left, text_right), label = load_match_raw_data(
            paths=self.paths_after_pre_process, mode=self.mode)

        text_left_placeholder = tf.placeholder(tf.string, name="text_left")
        text_right_placeholder = tf.placeholder(tf.string, name="text_right")
        label_placeholder = tf.placeholder(tf.string, name="label")
        self.init_feed_dict[text_left_placeholder] = text_left
        self.init_feed_dict[text_right_placeholder] = text_right
        self.init_feed_dict[label_placeholder] = label

        text_ds_left = tf.data.Dataset.from_tensor_slices(
            text_left_placeholder)
        text_ds_right = tf.data.Dataset.from_tensor_slices(
            text_right_placeholder)
        input_pipeline_func = self.get_input_pipeline(for_export=False)
        text_ds_left = text_ds_left.map(
            input_pipeline_func, num_parallel_calls=self.num_parallel_calls)
        text_ds_right = text_ds_right.map(
            input_pipeline_func, num_parallel_calls=self.num_parallel_calls)
        text_size_ds_left = text_ds_left.map(
            lambda x: compute_sen_lens(x, padding_token=0),
            num_parallel_calls=self.num_parallel_calls)
        text_size_ds_right = text_ds_right.map(
            lambda x: compute_sen_lens(x, padding_token=0),
            num_parallel_calls=self.num_parallel_calls)
        text_ds_left_right = tf.data.Dataset.zip((text_ds_left, text_ds_right))
        text_len_left_right = tf.data.Dataset.zip(
            (text_size_ds_left, text_size_ds_right))
        if self.infer_without_label:
            data_set_left_right = text_ds_left_right
        else:
            label_ds = load_one_label_dataset(label_placeholder, self.config)
            data_set_left_right = tf.data.Dataset.zip(
                (text_ds_left_right, label_ds))
        vocab_dict = load_vocab_dict(self.text_vocab_file_path)
        vocab_size = len(vocab_dict)
        data_size = len(text_left)
        if self.split_token != "":
            if self.split_token not in vocab_dict:
                raise ValueError(
                    "The Model uses split token: {}, not in corpus.".format(
                        self.split_token))
            self.config['data']['split_token'] = int(
                vocab_dict[self.split_token])
        self.config['data']['vocab_size'] = vocab_size
        self.config['data']['{}_data_size'.format(self.mode)] = data_size
        return data_set_left_right, text_len_left_right
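The labelled branches above call load_one_label_dataset to turn raw label strings into trainable targets. A hedged sketch, assuming labels are numeric class ids encoded as strings; the config key used for the number of classes is illustrative only:

import tensorflow as tf


def load_one_label_dataset(label_placeholder, config):
    num_classes = config["data"]["num_classes"]  # illustrative key path
    label_ds = tf.data.Dataset.from_tensor_slices(label_placeholder)
    # "3" -> 3 -> one-hot vector of length num_classes.
    return label_ds.map(
        lambda label: tf.one_hot(
            tf.string_to_number(label, out_type=tf.int32), num_classes))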
Example #7
def ids_to_sentences(ids, vocab_file_path):
    """
  transform array of numbers to array of tags/words
  ids:  [[1,2],[3,4]...]
  """

    vocab_dict = load_vocab_dict(vocab_file_path)
    id_to_vocab = {int(v): k for k, v in vocab_dict.items()}

    sentences = []
    for sent in ids:
        sent_char = []
        for s_char in sent:
            if s_char not in id_to_vocab:
                logging.error("label not in vocabs")
            else:
                sent_char.append(id_to_vocab[s_char])
        sentences.append(sent_char)
    assert len(sentences) == len(ids)
    return sentences
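A short usage example for ids_to_sentences; the vocabulary path and the decoded tags are purely hypothetical:

# Suppose exp/label_vocab.txt maps ids 1..4 to B-PER, I-PER, B-LOC, I-LOC.
tags = ids_to_sentences([[1, 2], [3, 4]], "exp/label_vocab.txt")
# tags == [["B-PER", "I-PER"], ["B-LOC", "I-LOC"]]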
Example #8
    def generate_data(self):
        """Generate data for offline training."""
        if self.infer_without_label:
            column_num = 2
            text_ds_left, text_ds_right = load_textline_dataset(
                self.paths_after_pre_process, column_num)
        else:
            column_num = 3
            label, text_ds_left, text_ds_right = load_textline_dataset(
                self.paths_after_pre_process, column_num)

        input_pipeline_func = self.get_input_pipeline(for_export=False)
        text_ds_left = text_ds_left.map(
            input_pipeline_func, num_parallel_calls=self.num_parallel_calls)
        text_ds_right = text_ds_right.map(
            input_pipeline_func, num_parallel_calls=self.num_parallel_calls)
        text_size_ds_left = text_ds_left.map(
            lambda x: compute_sen_lens(x, padding_token=0),
            num_parallel_calls=self.num_parallel_calls)
        text_size_ds_right = text_ds_right.map(
            lambda x: compute_sen_lens(x, padding_token=0),
            num_parallel_calls=self.num_parallel_calls)
        text_ds_left_right = tf.data.Dataset.zip((text_ds_left, text_ds_right))
        text_len_left_right = tf.data.Dataset.zip(
            (text_size_ds_left, text_size_ds_right))
        if self.infer_without_label:
            data_set_left_right = text_ds_left_right
        else:
            label_ds = process_one_label_dataset(label, self.config)
            data_set_left_right = tf.data.Dataset.zip(
                (text_ds_left_right, label_ds))
        vocab_dict = load_vocab_dict(self.text_vocab_file_path)
        vocab_size = len(vocab_dict)

        self.config['data']['vocab_size'] = vocab_size
        self.config['data']['{}_data_size'.format(self.mode)] = get_file_len(
            self.paths_after_pre_process)

        return data_set_left_right, text_len_left_right
Example #9
    def generate_data(self):
        """Generate data for offline training."""

        column_num = 1
        src_path = self.src_paths_after_pre_process
        target_path = self.tgt_paths_after_pre_process

        src_ds = load_textline_dataset([src_path], column_num)

        src_ds = src_ds[0]

        input_pipeline_func = self.get_input_pipeline(for_export=False)

        src_ds = src_ds.map(input_pipeline_func,
                            num_parallel_calls=self.num_parallel_calls)

        src_size_ds = src_ds.map(
            lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX),
            num_parallel_calls=self.num_parallel_calls)

        src_ds = src_ds.map(self.exclude_padding,
                            num_parallel_calls=self.num_parallel_calls)

        if self.infer_without_label:
            data_set = tf.data.Dataset.zip((src_ds, src_size_ds))

        else:
            tgt = load_textline_dataset([target_path], column_num)
            tgt = tgt[0]
            tgt_out_ds = tgt.map(lambda x: x + ' ' + self.END_TOKEN)
            tgt_in_ds = tgt.map(lambda x: self.START_TOKEN + ' ' + x)

            tgt_in_ds = tgt_in_ds.map(
                lambda batch: self.text_pipeline_func(
                    batch, self.max_dec_len, self.text_vocab_file_path),
                num_parallel_calls=self.num_parallel_calls)

            tgt_in_size_ds = tgt_in_ds.map(
                lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX),
                num_parallel_calls=self.num_parallel_calls)

            tgt_in_ds = tgt_in_ds.map(
                self.exclude_padding,
                num_parallel_calls=self.num_parallel_calls)

            inp_ds = tf.data.Dataset.zip(
                (src_ds, src_size_ds, tgt_in_ds, tgt_in_size_ds))

            if self.use_label_vocab:
                target_vocab_file_path = self.label_vocab_file_paths[0]
            else:
                target_vocab_file_path = self.text_vocab_file_path
            tgt_out_ds = tgt_out_ds.map(
                lambda batch: self.text_pipeline_func(batch, self.max_dec_len,
                                                      target_vocab_file_path),
                num_parallel_calls=self.num_parallel_calls)

            tgt_out_ds = tgt_out_ds.map(
                self.exclude_padding,
                num_parallel_calls=self.num_parallel_calls)
            data_set = tf.data.Dataset.zip((inp_ds, tgt_out_ds))

        vocab_dict = load_vocab_dict(self.text_vocab_file_path)
        vocab_size = len(vocab_dict)
        label_vocab_dict = load_vocab_dict(self.label_vocab_file_paths[0])
        label_vocab_size = len(label_vocab_dict)
        data_size = get_file_len(self.src_paths_after_pre_process)
        self.config['data']['vocab_size'] = vocab_size
        self.config['data']['label_vocab_size'] = label_vocab_size
        self.config['data']['{}_data_size'.format(self.mode)] = data_size

        return data_set
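Both seq2seq examples strip padding with exclude_padding before zipping the datasets. A minimal sketch of such a method, assuming padding appears only at the tail of a sequence (written here as a standalone function with padding_token in place of utils.PAD_IDX):

import tensorflow as tf


def exclude_padding(batch, padding_token=0):
    # Keep only the leading non-padding ids of a single example.
    real_len = tf.reduce_sum(
        tf.cast(tf.not_equal(batch, padding_token), tf.int32))
    return batch[:real_len]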
Example #10
  def generate_data(self):
    """Generate data for offline training."""

    src = load_seq2seq_raw_data(paths=self.src_paths_after_pre_process)
    tgt = load_seq2seq_raw_data(paths=self.tgt_paths_after_pre_process)

    tgt_out = [abs_ + ' ' + self.END_TOKEN for abs_ in tgt]
    tgt_in = [self.START_TOKEN + ' ' + abs_ for abs_ in tgt]

    assert len(src) == len(tgt_in)
    src_placeholder = tf.placeholder(tf.string, shape=(None,), name="src")
    tgt_out_placeholder = tf.placeholder(tf.string, name="tgt_out")
    tgt_in_placeholder = tf.placeholder(tf.string, name="tgt_in")
    self.init_feed_dict[src_placeholder] = src
    self.init_feed_dict[tgt_out_placeholder] = tgt_out
    self.init_feed_dict[tgt_in_placeholder] = tgt_in
    src_ds = tf.data.Dataset.from_tensor_slices(src_placeholder)
    tgt_in_ds = tf.data.Dataset.from_tensor_slices(tgt_in_placeholder)

    tgt_out_ds = tf.data.Dataset.from_tensor_slices(tgt_out_placeholder)

    input_pipeline_func = self.get_input_pipeline(for_export=False)

    src_ds = src_ds.map(
        input_pipeline_func, num_parallel_calls=self.num_parallel_calls)

    src_size_ds = src_ds.map(
        lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX),
        num_parallel_calls=self.num_parallel_calls)

    src_ds = src_ds.map(self.exclude_padding, num_parallel_calls=self.num_parallel_calls)

    tgt_in_ds = tgt_in_ds.map(
        lambda batch: self.text_pipeline_func(batch, self.max_dec_len,
                                              self.text_vocab_file_path),
        num_parallel_calls=self.num_parallel_calls)

    tgt_in_size_ds = tgt_in_ds.map(
        lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX),
        num_parallel_calls=self.num_parallel_calls)

    tgt_in_ds = tgt_in_ds.map(self.exclude_padding, num_parallel_calls=self.num_parallel_calls)

    inp_ds = tf.data.Dataset.zip((src_ds, src_size_ds, tgt_in_ds, tgt_in_size_ds))

    if self.infer_without_label:
      data_set = inp_ds
    else:
      if self.use_label_vocab:
        target_vocab_file_path = self.label_vocab_file_paths[0]
      else:
        target_vocab_file_path = self.text_vocab_file_path
      tgt_out_ds = tgt_out_ds.map(
        lambda batch: self.text_pipeline_func(batch, self.max_dec_len,
                                              target_vocab_file_path),
        num_parallel_calls=self.num_parallel_calls)

      tgt_out_ds = tgt_out_ds.map(
        self.exclude_padding, num_parallel_calls=self.num_parallel_calls
      )
      data_set = tf.data.Dataset.zip((inp_ds, tgt_out_ds))

    vocab_dict = load_vocab_dict(self.text_vocab_file_path)
    vocab_size = len(vocab_dict)
    label_vocab_dict = load_vocab_dict(self.label_vocab_file_paths[0])
    label_vocab_size = len(label_vocab_dict)
    data_size = len(src)
    self.config['data']['vocab_size'] = vocab_size
    self.config['data']['label_vocab_size'] = label_vocab_size
    self.config['data']['{}_data_size'.format(self.mode)] = data_size

    return data_set
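After exclude_padding the sequences are variable-length, so batching for training needs padded_batch. A hypothetical sketch for the labelled branch above; the shapes mirror the ((src, src_len, tgt_in, tgt_in_len), tgt_out) structure and the batch size is arbitrary:

# data_set elements: ((src_ids, src_len, tgt_in_ids, tgt_in_len), tgt_out_ids)
data_set = data_set.padded_batch(
    batch_size=8,
    padded_shapes=(([None], [], [None], []), [None]))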