Example #1
  def _build(self, features, labels, params):
    # Pre-process features and labels
    features, labels = self._preprocess(features, labels)
    # features = tf.Print(features, [tf.shape(features), features], message="input features: ")
    encoder_output = self.encode(features, labels)
    decoder_output, _ = self.decode(encoder_output, features, labels)
    print ("build seq2seq model")
    print (encoder_output)
    print (decoder_output)
    print ("build done")
    if self.mode == tf.contrib.learn.ModeKeys.INFER:
      predictions = self.create_predictions(
          decoder_output=decoder_output, features=features, labels=labels)
      loss = None
      train_op = None
    else:
      losses, loss = self.compute_loss(decoder_output, features, labels)

      train_op = None
      if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
        train_op = self._build_train_op(loss)

      predictions = self.create_predictions(
          decoder_output=decoder_output,
          features=features,
          labels=labels,
          losses=losses)

    # We add "useful" tensors to the graph collection so that we
    # can easly find them in our hooks/monitors.
    graph_utils.add_dict_to_collection(predictions, "predictions")

    return predictions, loss, train_op
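
For reference, tensors registered this way are typically read back inside hooks. A minimal sketch, assuming the library also provides a companion graph_utils.get_dict_from_collection (a plausible implementation is sketched after Example #2):

  # e.g. inside a SessionRunHook:
  predictions_dict = graph_utils.get_dict_from_collection("predictions")
  # The keys match those added above; Examples #8 and #9 show a
  # "predicted_tokens" entry that a hook can fetch in before_run().
  fetches = {"predicted_tokens": predictions_dict["predicted_tokens"]}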
Example #2
  def create_lookup_table(self):

    # Create vocabulary lookup for source
    source_vocab_to_id, source_id_to_vocab, source_word_to_count, source_origin_vocab_size = \
      vocab.create_tensor_vocab(self._vocab_instance)

    # Create vocabulary lookup for target
    target_vocab_to_id, target_id_to_vocab, target_word_to_count, target_origin_vocab_size = \
      source_vocab_to_id, source_id_to_vocab, source_word_to_count, source_origin_vocab_size

    # Add vocab tables to the graph collection so that we can access them in
    # other places.
    graph_utils.add_dict_to_collection({
        "source_vocab_to_id": source_vocab_to_id,
        "source_id_to_vocab": source_id_to_vocab,
        "source_word_to_count": source_word_to_count,
        "target_vocab_to_id": target_vocab_to_id,
        "target_id_to_vocab": target_id_to_vocab,
        "target_word_to_count": target_word_to_count
    }, "vocab_tables")

    self._source_vocab_to_id = source_vocab_to_id
    self._source_id_to_vocab = source_id_to_vocab
    self._target_vocab_to_id = target_vocab_to_id
    self._target_id_to_vocab = target_id_to_vocab
    self._source_origin_vocab_size = source_origin_vocab_size
    self._target_origin_vocab_size = target_origin_vocab_size
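
For context, a plausible implementation of the add_dict_to_collection / get_dict_from_collection pair these examples rely on (an assumption inferred from how the collections are used on this page, not taken from it):

import tensorflow as tf

def add_dict_to_collection(dict_, collection_name):
  """Adds a dictionary to a graph collection.

  Keys and values are stored in two parallel collections so that the
  dict can be rebuilt later by get_dict_from_collection.
  """
  key_collection = collection_name + "_keys"
  value_collection = collection_name + "_values"
  for key, value in dict_.items():
    tf.add_to_collection(key_collection, key)
    tf.add_to_collection(value_collection, value)

def get_dict_from_collection(collection_name):
  """Rebuilds a dictionary stored by add_dict_to_collection."""
  keys = tf.get_collection(collection_name + "_keys")
  values = tf.get_collection(collection_name + "_values")
  return dict(zip(keys, values))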
Example #3
  def _build(self, features, labels, params):
    # Pre-process features and labels
    features, labels = self._preprocess(features, labels)

    encoder_output = self.encode(features, labels)
    decoder_output, _ = self.decode(encoder_output, features, labels)

    # Check whether we are in inference mode; predictions are created with
    # different arguments in each case.
    if self.mode == tf.contrib.learn.ModeKeys.INFER:
      predictions = self._create_predictions(
          decoder_output=decoder_output, features=features, labels=labels)
      loss = None
      train_op = None
    else:
      losses, loss = self.compute_loss(decoder_output, features, labels)

      train_op = None
      if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
        train_op = self._build_train_op(loss)

      predictions = self._create_predictions(
          decoder_output=decoder_output,
          features=features,
          labels=labels,
          losses=losses)

    # We add "useful" tensors to the graph collection so that we
    # can easly find them in our hooks/monitors.
    graph_utils.add_dict_to_collection(predictions, "predictions")

    return predictions, loss, train_op
Example #4
  def _build(self, features, labels, params):
    # Pre-process features and labels
    features, labels = self._preprocess(features, labels)

    encoder_output = self.encode(features, labels)
    decoder_output, _ = self.decode(encoder_output, features, labels)

    if self.mode == tf.contrib.learn.ModeKeys.INFER:
      predictions = self._create_predictions(
          decoder_output=decoder_output, features=features, labels=labels)
      loss = None
      train_op = None
    else:
      losses, loss = self.compute_loss(decoder_output, features, labels)

      train_op = None
      if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
        train_op = self._build_train_op(loss)

      predictions = self._create_predictions(
          decoder_output=decoder_output,
          features=features,
          labels=labels,
          losses=losses)

    # We add "useful" tensors to the graph collection so that we
    # can easly find them in our hooks/monitors.
    graph_utils.add_dict_to_collection(predictions, "predictions")

    return predictions, loss, train_op
Example #5
    def _build(self, features, labels, params):
        # Pre-process features and labels
        features, labels = self._preprocess(features, labels)

        encoder_output = self.encode(features, labels)
        decoder_output, _ = self.decode(encoder_output, features, labels)

        if self.mode == tf.contrib.learn.ModeKeys.INFER:
            predictions = self._create_predictions(
                decoder_output=decoder_output,
                features=features,
                labels=labels)
            loss = None
            train_op = None
        else:
            losses, loss = self.compute_loss(decoder_output, features, labels)

            train_op = None
            if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
                train_op = self._build_train_op(loss)

            predictions = self._create_predictions(
                decoder_output=decoder_output,
                features=features,
                labels=labels,
                losses=losses)

        # We add "useful" tensors to the graph collection so that we
        # can easly find them in our hooks/monitors.
        graph_utils.add_dict_to_collection(predictions, "predictions")

        # Returning 3 elements here is fine; the estimator automatically wraps
        # them into model_fn_lib.ModelFnOps.
        return predictions, loss, train_op
Example #6
  def setUp(self):
    super(TestTokenCounter, self).setUp()
    self.model_dir = tempfile.mkdtemp()
    graph_utils.add_dict_to_collection(
        {"source_len": tf.constant([[2, 3]])}, "features")
    graph_utils.add_dict_to_collection({"target_len": tf.constant([4, 6])},
                                       "labels")
Example #7
  def encode(self, features, labels):
    res = super(DiscriminatorSeq2Seq, self).encode(features, labels)
    # Add the encoder output to a graph collection so that we can find it.
    dict_items = collections.OrderedDict(zip(res._fields, res))
    dict_items.pop("final_state")
    graph_utils.add_dict_to_collection(dict_items, "encoder_output")

    return res
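
Note: since the encoder output here is a namedtuple, the zip over res._fields can also be written with the built-in helper, which reads more directly:

    # Equivalent one-liner (namedtuples provide _asdict()):
    dict_items = res._asdict()  # an OrderedDict keyed by field name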
Example #8
  def setUp(self):
    super(TestTrainSampleHook, self).setUp()
    self.sample_dir = tempfile.mkdtemp()

    # The hook expects these collections to be in the graph
    pred_dict = {}
    pred_dict["predicted_tokens"] = tf.constant([["Hello", "World", "笑w"]])
    pred_dict["labels.target_tokens"] = tf.constant([["Hello", "World", "笑w"]])
    pred_dict["labels.target_len"] = tf.constant(2),
    graph_utils.add_dict_to_collection(pred_dict, "predictions")
Example #9
  def setUp(self):
    super(TestTrainSampleHook, self).setUp()
    self.model_dir = tempfile.mkdtemp()
    self.sample_dir = os.path.join(self.model_dir, "samples")

    # The hook expects these collections to be in the graph
    pred_dict = {}
    pred_dict["predicted_tokens"] = tf.constant([["Hello", "World", "笑w"]])
    pred_dict["labels.target_tokens"] = tf.constant([["Hello", "World", "笑w"]])
    pred_dict["labels.target_len"] = tf.constant(2),
    graph_utils.add_dict_to_collection(pred_dict, "predictions")
Example #10
  def _build(self, features, labels):
    # Create vocabulary lookup for source
    source_vocab_to_id, source_id_to_vocab, _ = \
      vocab.create_vocabulary_lookup_table(self.source_vocab_info.path)

    # Create vocabulary lookup for target
    target_vocab_to_id, target_id_to_vocab, _ = \
      vocab.create_vocabulary_lookup_table(self.target_vocab_info.path)

    # Add vocab tables to the graph collection so that we can access them in
    # other places.
    graph_utils.add_dict_to_collection({
        "source_vocab_to_id": source_vocab_to_id,
        "source_id_to_vocab": source_id_to_vocab,
        "target_vocab_to_id": target_vocab_to_id,
        "target_id_to_vocab": target_id_to_vocab
    }, "vocab_tables")

    # Slice source to max_len
    if self.max_seq_len_source is not None:
      features["source_tokens"] = features[
          "source_tokens"][:, :self.max_seq_len_source]
      features["source_len"] = tf.minimum(
          features["source_len"], self.max_seq_len_source)

    # Look up the source ids in the vocabulary
    features["source_ids"] = source_vocab_to_id.lookup(features[
        "source_tokens"])

    features["source_len"] = tf.to_int32(features["source_len"])
    tf.summary.histogram("source_len", tf.to_float(features["source_len"]))

    if labels is None:
      return features, None

    labels = labels.copy()

    # Slices targets to max length
    if self.max_seq_len_target is not None:
      labels["target_tokens"] = labels[
          "target_tokens"][:, :self.max_seq_len_target]
      labels["target_len"] = tf.minimum(
          labels["target_len"], self.max_seq_len_target)

    # Look up the target ids in the vocabulary
    labels["target_ids"] = target_vocab_to_id.lookup(labels["target_tokens"])

    labels["target_len"] = tf.to_int32(labels["target_len"])
    tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

    return features, labels
Example #11
def topic_softmax(logits_message, logits_topic, batch_size):
    # (exp(Vi) + exp(Ki)) / (sum(exp(Vi)) + sum(exp(Ki))), where the Ki term
    # applies when the word is also a topic word.
    logits_message_exp = tf.exp(logits_message * 0.1)
    logits_topic_exp = tf.exp(logits_topic * 0.1)
    
    #logits_message_exp = tf.clip_by_norm(logits_message_exp,0.1)
    #logits_topic_exp = tf.clip_by_norm(logits_topic_exp,0.1)
    
    
    # Record where the exponentiation produced NaNs so the positions can be
    # inspected via the "logits_softmax" collection below.
    logits_message_exp_nan = tf.is_nan(logits_message_exp)
    logits_message_exp_nan = tf.where(logits_message_exp_nan)

    logits_topic_exp_nan = tf.is_nan(logits_topic_exp)
    logits_topic_exp_nan = tf.where(logits_topic_exp_nan)
            
    logits_exp_sum = logits_message_exp  # summed over the last dim below
    ###logits_exp_sum = tf.concat([logits_message_exp, logits_topic_exp], -1)
    ###logits_exp_sum = tf.add(logits_message_exp, topic_words_mask*logits_topic_exp)
    logits_exp_sum = tf.reduce_sum(logits_exp_sum, -1)
    logits_exp_sum = tf.expand_dims(logits_exp_sum, -1)
    
    vocab_size = logits_message.get_shape().as_list()[-1]
    
    # NOTE: ModeKeys.TRAIN is a non-empty string, so this condition is always
    # true; the tile only makes the broadcast shape explicit.
    if tf.contrib.learn.ModeKeys.TRAIN:
        logits_exp_sum = tf.tile(logits_exp_sum, [1, 1, vocab_size])  # optional either way
    
    ###logits_exp_sum = tf.clip_by_value(logits_exp_sum,10000,logits_exp_sum)
    ###logits_softmax_output = (logits_message_exp + topic_words_mask*logits_topic_exp)/logits_exp_sum
    ###logits_softmax_output = logits_topic_exp / logits_exp_sum
    logits_softmax_output = logits_message_exp / logits_exp_sum
    ###logits_softmax_output = (logits_message_exp + logits_topic_exp)/logits_exp_sum
    ###logits_softmax_output = tf.add(logits_message_exp, topic_words_mask*logits_topic_exp)/logits_exp_sum
    
    graph_utils.add_dict_to_collection({
        "logits_message_exp": logits_message_exp,
        "logits_topic_exp": logits_topic_exp,
        "logits_message_exp_nan": logits_message_exp_nan,
        "logits_topic_exp_nan": logits_topic_exp_nan,
        "logits_exp_sum": logits_exp_sum,
        "logits_softmax_output": logits_softmax_output,
    }, "logits_softmax")
        
    return logits_softmax_output
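
As written, only the message logits reach the active code path (every topic-aware variant is commented out), so the function reduces to a temperature-scaled softmax over logits_message. A minimal numerically stable sketch of that active path; tf.nn.softmax subtracts the per-row max before exponentiating, which avoids the overflow the NaN probes above guard against:

    # Stable equivalent of the active path (temperature 0.1, last-dim softmax):
    logits_softmax_output = tf.nn.softmax(logits_message * 0.1)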
Example #12
  def _build(self, features, labels, params):
    # Pre-process features and labels
    features, labels = self._preprocess(features, labels)

    encoder_output = self.encode(features, labels)
    decoder_output, _ = self.decode(encoder_output, features, labels)

    if self.mode == tf.contrib.learn.ModeKeys.INFER:
      loss = None
      train_op = None
      
      predictions = self._create_predictions(
          decoder_output=decoder_output, features=features, labels=labels)
    else:
      losses, loss = self.compute_loss(decoder_output, features, labels)

      train_op = None
      if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
        gradient_multipliers = {}
        # Multiply the gradient by 1.0 / (2 * number_of_attention_layers).
        for i in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/conv_seq2seq/encode'):
          if 'encode/W' in i.name or 'encode/pos' in i.name:
            continue
          tf.logging.info("tensor %s, name is %s", i, i.name)
          gradient_multipliers[i] = 1.0/(2*self.params["decoder.params"]["cnn.layers"])
        #tf.logging.info("gradient_multipliers %s",gradient_multipliers)
        train_op = self._build_train_op(loss, gradient_multipliers=gradient_multipliers)
      
      predictions = self._create_predictions(
          decoder_output=decoder_output,
          features=features,
          labels=labels,
          losses=losses)

    # We add "useful" tensors to the graph collection so that we
    # can easly find them in our hooks/monitors.
    graph_utils.add_dict_to_collection(predictions, "predictions")

    return predictions, loss, train_op
Example #13
    def _preprocess(self, features, labels):
        """Model-specific preprocessing for features and labels:

    - Creates vocabulary lookup tables for target vocab
    - Converts tokens into vocabulary ids
    - Prepends a special "SEQUENCE_START" token to the target
    - Appends a special "SEQUENCE_END" token to the target
    """

        # Create vocabulary lookup for target
        target_vocab_to_id, target_id_to_vocab, target_word_to_count, _ = \
          vocab.create_vocabulary_lookup_table(self.target_vocab_info.path)

        # Add vocab tables to the graph collection so that we can access
        # them in other places.
        graph_utils.add_dict_to_collection(
            {
                "target_vocab_to_id": target_vocab_to_id,
                "target_id_to_vocab": target_id_to_vocab,
                "target_word_to_count": target_word_to_count
            }, "vocab_tables")

        if labels is None:
            return features, None

        labels = labels.copy()

        # Slices targets to max length
        if self.params["target.max_seq_len"] is not None:
            labels["target_tokens"] = labels[
                "target_tokens"][:, :self.params["target.max_seq_len"]]
            labels["target_len"] = tf.minimum(
                labels["target_len"], self.params["target.max_seq_len"])

        # Look up the target ids in the vocabulary
        labels["target_ids"] = target_vocab_to_id.lookup(
            labels["target_tokens"])

        labels["target_len"] = tf.to_int32(labels["target_len"])
        tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

        # Add to graph collection for later use
        graph_utils.add_dict_to_collection(features, "features")
        if labels:
            graph_utils.add_dict_to_collection(labels, "labels")

        return features, labels
Example #14
  def _preprocess(self, features, labels):
    """Model-specific preprocessing for features and labels:

    - Creates vocabulary lookup tables for target vocab
    - Converts tokens into vocabulary ids
    - Prepends a special "SEQUENCE_START" token to the target
    - Appends a special "SEQUENCE_END" token to the target
    """

    # Create vocabulary lookup for target
    target_vocab_to_id, target_id_to_vocab, target_word_to_count, _ = \
      vocab.create_vocabulary_lookup_table(self.target_vocab_info.path)

    # Add vocab tables to the graph collection so that we can access them in
    # other places.
    graph_utils.add_dict_to_collection({
        "target_vocab_to_id": target_vocab_to_id,
        "target_id_to_vocab": target_id_to_vocab,
        "target_word_to_count": target_word_to_count
    }, "vocab_tables")

    if labels is None:
      return features, None

    labels = labels.copy()

    # Slices targets to max length
    if self.params["target.max_seq_len"] is not None:
      labels["target_tokens"] = labels["target_tokens"][:, :self.params[
          "target.max_seq_len"]]
      labels["target_len"] = tf.minimum(labels["target_len"],
                                        self.params["target.max_seq_len"])

    # Look up the target ids in the vocabulary
    labels["target_ids"] = target_vocab_to_id.lookup(labels["target_tokens"])

    labels["target_len"] = tf.to_int32(labels["target_len"])
    tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

    # Add to graph collection for later use
    graph_utils.add_dict_to_collection(features, "features")
    if labels:
      graph_utils.add_dict_to_collection(labels, "labels")

    return features, labels
Example #15
    def _build(self, features, labels, params, mode):
        # Pre-process features and labels
        features, labels = self.create_featurizer(mode)(features, labels)

        # Add to graph collection for later use
        graph_utils.add_dict_to_collection(features, "features")
        if labels:
            graph_utils.add_dict_to_collection(labels, "labels")

        source_ids = features["source_ids"]
        if self.params["source.reverse"] is True:
            source_ids = tf.reverse_sequence(
                input=features["source_ids"],
                seq_lengths=features["source_len"],
                seq_dim=1,
                batch_dim=0,
                name=None)

        # Create embeddings
        source_embedding = tf.get_variable(
            "source_embedding",
            [self.source_vocab_info.total_size, self.params["embedding.dim"]])
        target_embedding = tf.get_variable(
            "target_embedding",
            [self.target_vocab_info.total_size, self.params["embedding.dim"]])

        # Embed source
        source_embedded = tf.nn.embedding_lookup(source_embedding, source_ids)

        # Graph used for inference
        if mode == tf.contrib.learn.ModeKeys.INFER:
            target_start_id = self.target_vocab_info.special_vocab.SEQUENCE_START
            # Embed the "SEQUENCE_START" token
            initial_input = tf.nn.embedding_lookup(
                target_embedding,
                tf.ones_like(features["source_len"]) * target_start_id)

            def make_input_fn(predictions):
                """Use the embedded prediction as the input to the next time step
        """
                return tf.nn.embedding_lookup(target_embedding, predictions)

            def elements_finished_fn(_time_, predictions):
                """Returns true when a prediction is finished"""
                return tf.equal(
                    predictions,
                    tf.cast(self.target_vocab_info.special_vocab.SEQUENCE_END,
                            dtype=predictions.dtype))

            decoder_input_fn_infer = decoders.DynamicDecoderInputs(
                initial_inputs=initial_input,
                make_input_fn=make_input_fn,
                max_decode_length=self.params["inference.max_decode_length"],
                elements_finished_fn=elements_finished_fn)

            # Decode
            decoder_output = self.encode_decode(
                source=source_embedded,
                source_len=features["source_len"],
                decoder_input_fn=decoder_input_fn_infer,
                mode=mode)
            predictions = self._create_predictions(
                decoder_output=decoder_output,
                features=features,
                labels=labels)
            return predictions, None, None

        # Embed target
        target_embedded = tf.nn.embedding_lookup(target_embedding,
                                                 labels["target_ids"])

        # During training/eval, we have labels and use them for teacher forcing
        # We don't feed the last SEQUENCE_END token
        decoder_input_fn_train = decoders.FixedDecoderInputs(
            inputs=target_embedded[:, :-1],
            sequence_length=labels["target_len"] - 1)

        decoder_output = self.encode_decode(
            source=source_embedded,
            source_len=features["source_len"],
            decoder_input_fn=decoder_input_fn_train,
            mode=mode)

        # Calculate loss per example-timestep of shape [B, T]
        losses = seq2seq_losses.cross_entropy_sequence_loss(
            logits=decoder_output.logits[:, :, :],
            targets=tf.transpose(labels["target_ids"][:, 1:], [1, 0]),
            sequence_length=labels["target_len"] - 1)

        # Calculate the average log perplexity
        loss = tf.reduce_sum(losses) / tf.to_float(
            tf.reduce_sum(labels["target_len"] - 1))

        learning_rate_decay_fn = training_utils.create_learning_rate_decay_fn(
            decay_type=self.params["optimizer.lr_decay_type"] or None,
            decay_steps=self.params["optimizer.lr_decay_steps"],
            decay_rate=self.params["optimizer.lr_decay_rate"],
            start_decay_at=self.params["optimizer.lr_start_decay_at"],
            stop_decay_at=self.params["optimizer.lr_stop_decay_at"],
            min_learning_rate=self.params["optimizer.lr_min_learning_rate"],
            staircase=self.params["optimizer.lr_staircase"])

        train_op = tf.contrib.layers.optimize_loss(
            loss=loss,
            global_step=tf.contrib.framework.get_global_step(),
            learning_rate=self.params["optimizer.learning_rate"],
            learning_rate_decay_fn=learning_rate_decay_fn,
            clip_gradients=self.params["optimizer.clip_gradients"],
            optimizer=self.params["optimizer.name"],
            summaries=tf.contrib.layers.optimizers.OPTIMIZER_SUMMARIES)

        if mode == tf.contrib.learn.ModeKeys.EVAL:
            train_op = None

        predictions = self._create_predictions(decoder_output=decoder_output,
                                               features=features,
                                               labels=labels,
                                               losses=losses)

        # We add "useful" tensors to the graph collection so that we
        # can easly find them in our hooks/monitors.
        graph_utils.add_dict_to_collection(predictions, "predictions")

        return predictions, loss, train_op
Example #16
  def _preprocess(self, features, labels):
    """Model-specific preprocessing for features and labels:

    - Creates vocabulary lookup tables for source and target vocab
    - Converts tokens into vocabulary ids
    """
    self.create_lookup_table()

    # int64_keys = ["source_len", "source_ids", "extend_source_ids", "source_oov_nums", "target_ids", "extend_target_ids"]

    # Slice source to max_len
    ### NOTE: can't slice here
    if self.params["source.max_seq_len"] is not None:
      features["source_tokens"] = features["source_tokens"][:, :self.params[
          "source.max_seq_len"]]
      features["source_len"] = tf.minimum(features["source_len"],
                                          self.params["source.max_seq_len"])

    # Look up the source ids in the vocabulary

    # Every sequence contains a SEQUENCE_END token.
    graph_source_ids = self._source_vocab_to_id.lookup(features["source_tokens"])
    # NOTE: this assert op must be executed (e.g. via control_dependencies) to take effect.
    tf.assert_equal(graph_source_ids, features["source_ids"])

    features["source_oov_nums"] = tf.cast(features["source_oov_nums"], tf.int32)
    features["source_max_oov_num"] = tf.reduce_max(features["source_oov_nums"])

    # Maybe reverse the source
    if self.params["source.reverse"] is True:
      raise NotImplementedError("reverse func is not stable now")
      reverse_keys = ["source_ids", "extend_source_ids", "source_pos_ids", "source_tfidfs", "source_ner_ids"]
      for key in reverse_keys:
        features[key] = tf.reverse_sequence(
            input=features[key],
            seq_lengths=features["source_len"],
            seq_dim=1,
            batch_dim=0,
            name=None)

    features["source_len"] = tf.cast(features["source_len"], tf.int32)

    tf.summary.histogram("source_len", features["source_len"])
    tf.summary.histogram("source_oov_nums", features["source_oov_nums"])
    tf.summary.scalar("batch_max_oov_words", features["source_max_oov_num"])

    if labels is None:
      return features, None

    labels = labels.copy()

    # Slices targets to max length
    if self.params["target.max_seq_len"] is not None:
      labels["target_tokens"] = labels["target_tokens"][:, :self.params[
          "target.max_seq_len"]]
      labels["target_len"] = tf.minimum(labels["target_len"],
                                        self.params["target.max_seq_len"])

    # Look up the target ids in the vocabulary
    graph_target_ids = self._target_vocab_to_id.lookup(labels["target_tokens"])
    tf.assert_equal(graph_target_ids, labels["target_ids"])

    labels["target_len"] = tf.to_int32(labels["target_len"])
    tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

    # Keep track of the number of processed tokens
    num_tokens = tf.reduce_sum(labels["target_len"])
    num_tokens += tf.reduce_sum(features["source_len"])
    token_counter_var = tf.Variable(0, name="tokens_counter", dtype=tf.int32)  # name= keyword, not the trainable positional
    total_tokens = tf.assign_add(token_counter_var, num_tokens)
    tf.summary.scalar("num_tokens", total_tokens)

    with tf.control_dependencies([total_tokens]):
      features["source_tokens"] = tf.identity(features["source_tokens"])

    # Add to graph collection for later use
    graph_utils.add_dict_to_collection(features, "features")
    if labels:
      graph_utils.add_dict_to_collection(labels, "labels")

    return features, labels
Example #17
    def _preprocess(self, features, labels):
        """Model-specific preprocessing for features and labels:

    - Creates vocabulary lookup tables for source and target vocab
    - Converts tokens into vocabulary ids
    """

        # Create vocabulary lookup for source
        source_vocab_to_id, source_id_to_vocab, source_word_to_count, _ = \
          vocab.create_vocabulary_lookup_table(self.source_vocab_info.path)

        # Create vocabulary lookup for target
        target_vocab_to_id, target_id_to_vocab, target_word_to_count, _ = \
          vocab.create_vocabulary_lookup_table(self.target_vocab_info.path)

        # Add vocab tables to the graph collection so that we can access
        # them in other places.
        graph_utils.add_dict_to_collection(
            {
                "source_vocab_to_id": source_vocab_to_id,
                "source_id_to_vocab": source_id_to_vocab,
                "source_word_to_count": source_word_to_count,
                "target_vocab_to_id": target_vocab_to_id,
                "target_id_to_vocab": target_id_to_vocab,
                "target_word_to_count": target_word_to_count
            }, "vocab_tables")

        # Slice source to max_len
        if self.params["source.max_seq_len"] is not None:
            features["source_tokens"] = features[
                "source_tokens"][:, :self.params["source.max_seq_len"]]
            features["source_len"] = tf.minimum(
                features["source_len"], self.params["source.max_seq_len"])

        # Look up the source ids in the vocabulary
        features["source_ids"] = source_vocab_to_id.lookup(
            features["source_tokens"])

        # Maybe reverse the source
        if self.params["source.reverse"] is True:
            features["source_ids"] = tf.reverse_sequence(
                input=features["source_ids"],
                seq_lengths=features["source_len"],
                seq_dim=1,
                batch_dim=0,
                name=None)

        features["source_len"] = tf.to_int32(features["source_len"])
        tf.summary.histogram("source_len", tf.to_float(features["source_len"]))

        if labels is None:
            return features, None

        labels = labels.copy()

        # Slices targets to max length
        if self.params["target.max_seq_len"] is not None:
            labels["target_tokens"] = labels[
                "target_tokens"][:, :self.params["target.max_seq_len"]]
            labels["target_len"] = tf.minimum(
                labels["target_len"], self.params["target.max_seq_len"])

        # Look up the target ids in the vocabulary
        labels["target_ids"] = target_vocab_to_id.lookup(
            labels["target_tokens"])

        labels["target_len"] = tf.to_int32(labels["target_len"])
        tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

        # Keep track of the number of processed tokens
        num_tokens = tf.reduce_sum(labels["target_len"])
        num_tokens += tf.reduce_sum(features["source_len"])
        token_counter_var = tf.Variable(0, name="tokens_counter")
        total_tokens = tf.assign_add(token_counter_var, num_tokens)
        tf.summary.scalar("num_tokens", total_tokens)

        with tf.control_dependencies([total_tokens]):
            features["source_tokens"] = tf.identity(features["source_tokens"])

        # Add to graph collection for later use
        graph_utils.add_dict_to_collection(features, "features")
        if labels:
            graph_utils.add_dict_to_collection(labels, "labels")

        return features, labels
Example #18
    def _preprocess(self, features, labels):
        """Model-specific preprocessing for features and labels:

    - Creates vocabulary lookup tables for source and target vocab
    - Converts tokens into vocabulary ids
    - Appends a special "SEQUENCE_END" token to the source
    - Prepends a special "SEQUENCE_START" token to the target
    - Appends a special "SEQUENCE_END" token to the target
    """

        # Create vocabulary lookup for source
        source_vocab_to_id, source_id_to_vocab, source_word_to_count, _ = \
          vocab.create_vocabulary_lookup_table(self.source_vocab_info.path)

        # Create vocabulary lookup for target
        target_vocab_to_id, target_id_to_vocab, target_word_to_count, _ = \
          vocab.create_vocabulary_lookup_table(self.target_vocab_info.path)

        # Add vocab tables to the graph collection so that we can access
        # them in other places.
        graph_utils.add_dict_to_collection(
            {
                "source_vocab_to_id": source_vocab_to_id,
                "source_id_to_vocab": source_id_to_vocab,
                "source_word_to_count": source_word_to_count,
                "target_vocab_to_id": target_vocab_to_id,
                "target_id_to_vocab": target_id_to_vocab,
                "target_word_to_count": target_word_to_count
            }, "vocab_tables")

        # Slice source to max_len
        if self.params["source.max_seq_len"] is not None:
            features["source_tokens"] = features[
                "source_tokens"][:, :self.params["source.max_seq_len"]]
            features["source_len"] = tf.minimum(
                features["source_len"], self.params["source.max_seq_len"])

        # Look up the source ids in the vocabulary
        features["source_ids"] = source_vocab_to_id.lookup(
            features["source_tokens"])

        # Maybe reverse the source
        if self.params["source.reverse"] is True:
            features["source_ids"] = tf.reverse_sequence(
                input=features["source_ids"],
                seq_lengths=features["source_len"],
                seq_dim=1,
                batch_dim=0,
                name=None)

        features["source_len"] = tf.to_int32(features["source_len"])
        tf.summary.histogram("source_len", tf.to_float(features["source_len"]))

        if labels is None:
            return features, None

        labels = labels.copy()

        # Slices targets to max length
        if self.params["target.max_seq_len"] is not None:
            labels["target_tokens"] = labels[
                "target_tokens"][:, :self.params["target.max_seq_len"]]
            labels["target_len"] = tf.minimum(
                labels["target_len"], self.params["target.max_seq_len"])

        # Look up the target ids in the vocabulary
        labels["target_ids"] = target_vocab_to_id.lookup(
            labels["target_tokens"])

        labels["target_len"] = tf.to_int32(labels["target_len"])
        tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

        # Add to graph collection for later use
        graph_utils.add_dict_to_collection(features, "features")
        if labels:
            graph_utils.add_dict_to_collection(labels, "labels")

        return features, labels
Example #19
  def _preprocess(self, features, labels):
    """Model-specific preprocessing for features and labels:

    - Creates vocabulary lookup tables for source and target vocab
    - Converts tokens into vocabulary ids
    """

    # Create vocabulary lookup for source
    source_vocab_to_id, source_id_to_vocab, source_word_to_count, _ = \
      vocab.create_vocabulary_lookup_table(self.source_vocab_info.path)

    # Create vocabulary lookup for target
    target_vocab_to_id, target_id_to_vocab, target_word_to_count, _ = \
      vocab.create_vocabulary_lookup_table(self.target_vocab_info.path)

    # Add vocab tables to the graph collection so that we can access them in
    # other places.
    graph_utils.add_dict_to_collection({
        "source_vocab_to_id": source_vocab_to_id,
        "source_id_to_vocab": source_id_to_vocab,
        "source_word_to_count": source_word_to_count,
        "target_vocab_to_id": target_vocab_to_id,
        "target_id_to_vocab": target_id_to_vocab,
        "target_word_to_count": target_word_to_count
    }, "vocab_tables")

    # Slice source to max_len
    if self.params["source.max_seq_len"] is not None:
      features["source_tokens"] = features["source_tokens"][:, :self.params[
          "source.max_seq_len"]]
      features["source_len"] = tf.minimum(features["source_len"],
                                          self.params["source.max_seq_len"])

    # Look up the source ids in the vocabulary
    features["source_ids"] = source_vocab_to_id.lookup(features[
        "source_tokens"])

    # Maybe reverse the source
    if self.params["source.reverse"] is True:
      features["source_ids"] = tf.reverse_sequence(
          input=features["source_ids"],
          seq_lengths=features["source_len"],
          seq_dim=1,
          batch_dim=0,
          name=None)

    features["source_len"] = tf.to_int32(features["source_len"])
    tf.summary.histogram("source_len", tf.to_float(features["source_len"]))

    if labels is None:
      return features, None

    labels = labels.copy()

    # Slices targets to max length
    if self.params["target.max_seq_len"] is not None:
      labels["target_tokens"] = labels["target_tokens"][:, :self.params[
          "target.max_seq_len"]]
      labels["target_len"] = tf.minimum(labels["target_len"],
                                        self.params["target.max_seq_len"])

    # Look up the target ids in the vocabulary
    labels["target_ids"] = target_vocab_to_id.lookup(labels["target_tokens"])

    labels["target_len"] = tf.to_int32(labels["target_len"])
    tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

    # Keep track of the number of processed tokens
    num_tokens = tf.reduce_sum(labels["target_len"])
    num_tokens += tf.reduce_sum(features["source_len"])
    token_counter_var = tf.Variable(0, name="tokens_counter")
    total_tokens = tf.assign_add(token_counter_var, num_tokens)
    tf.summary.scalar("num_tokens", total_tokens)

    with tf.control_dependencies([total_tokens]):
      features["source_tokens"] = tf.identity(features["source_tokens"])

    # Add to graph collection for later use
    graph_utils.add_dict_to_collection(features, "features")
    if labels:
      graph_utils.add_dict_to_collection(labels, "labels")

    return features, labels
Example #20
    def _build_schema_lookup_tables(self):
        # May include: schema map, schema matrix, schema text.

        # Read in all the filenames from all schema_loc_files,
        # identifying unique filenames.
        schema_loc_files = self.params["schema_loc_files"]
        if len(schema_loc_files) < 1:
            return
        all_schema_locations = set()
        for loc_file in schema_loc_files:
            with open(loc_file, 'r') as f:
                locations = [l.strip() for l in f.readlines()]
            all_schema_locations.update(locations)
        all_schema_locations = list(all_schema_locations)  # fixed order

        # Build a lookup table of filename --> index
        # (Required for all models that use any schema representation)
        schema_file_lookup_table = tf.contrib.lookup.index_table_from_tensor(
            mapping=all_schema_locations, num_oov_buckets=0, default_value=-1)

        # For each filename, get its matrix from the npy file.
        # Note the length of the schema.
        schema_embeddings_matrices = []
        schema_lengths = []

        if self.params["build_schema_text_table"]:
            all_schema_strings = []
        if self.params["build_schema_map_table"]:
            schema_map_matrices = []
            schema_map_lengths = []

        def load_npy(matrix_list, length_list, file_location, fname):
            npy_file = os.path.join(file_location, fname)
            matrix_np = np.load(npy_file)
            matrix_list.append(matrix_np)
            length = matrix_np.shape[0]
            length_list.append(length)

        for schema_location in all_schema_locations:

            # Schema embeddings: required for all attn to schema models.
            load_npy(schema_embeddings_matrices, schema_lengths,
                     schema_location, "schema_embeddings.npy")

            if self.params["build_schema_map_table"]:
                load_npy(schema_map_matrices, schema_map_lengths,
                         schema_location, "schema_map.npy")

            # Schema strings: required for schema-copying models.
            if self.params["build_schema_text_table"]:
                schema_csv_file = os.path.join(schema_location, "schema.csv")
                schema_string = self.get_schema_strings(schema_csv_file)
                all_schema_strings.append(schema_string)

        max_emb_len = max(schema_lengths)
        schema_lengths = tf.constant(schema_lengths)

        # Pad matrices with zeros as needed.
        def pad_to_size(matrix, length):
            if matrix.shape[0] == length:
                return matrix
            padding_size = length - matrix.shape[0]
            padded = np.pad(matrix,
                            pad_width=((0, padding_size), (0, 0)),
                            mode='constant',
                            constant_values=0)
            return padded

        schema_embeddings_matrices = [
            pad_to_size(m, max_emb_len) for m in schema_embeddings_matrices
        ]
        # Assemble all the matrices into a big 3d tensor
        all_schema_embeddings = tf.convert_to_tensor(
            np.asarray(schema_embeddings_matrices), dtype=tf.float32)

        tables_dict = {
            "schema_file_lookup_table": schema_file_lookup_table,
            "all_schema_embeddings": all_schema_embeddings,
            "schema_lengths": schema_lengths,
        }

        # Assemble all the schema strings into a big lookup table.
        # (Required for schema-copying models)
        if self.params["build_schema_text_table"]:
            schema_strings_tbl = tf.contrib.lookup.index_to_string_table_from_tensor(
                all_schema_strings, name="schema_strings_lookup_table")
            tables_dict["all_schema_strings"] = schema_strings_tbl

        if self.params["build_schema_map_table"]:
            max_map_len = max(schema_map_lengths)
            schema_map_lengths = tf.constant(schema_map_lengths)
            schema_map_matrices = [
                pad_to_size(m, max_map_len) for m in schema_map_matrices
            ]
            all_schema_maps = tf.convert_to_tensor(
                np.asarray(schema_map_matrices), dtype=tf.float32)
            tables_dict["all_schema_maps"] = all_schema_maps
            tables_dict["schema_map_lengths"] = schema_map_lengths

        graph_utils.add_dict_to_collection(tables_dict, "schema_tables")
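
Downstream graph code can then recover the tables from the "schema_tables" collection. A sketch, assuming get_dict_from_collection as above and a hypothetical string tensor schema_locations holding one schema directory per batch example:

        schema_tables = graph_utils.get_dict_from_collection("schema_tables")
        # Map each example's schema directory to its row index, then gather
        # that schema's padded embedding matrix.
        file_ids = schema_tables["schema_file_lookup_table"].lookup(
            schema_locations)
        schema_embeddings = tf.nn.embedding_lookup(
            schema_tables["all_schema_embeddings"], file_ids)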