예제 #1
0
    def test_op(self):
        """Checks that the sequence loss masks positions past each length."""
        logits = np.random.randn(self.sequence_length, self.batch_size,
                                 self.vocab_size).astype(np.float32)
        sequence_length = np.array([1, 2, 3, 4])
        targets = np.random.randint(
            0, self.vocab_size, [self.sequence_length, self.batch_size])
        losses = seq2seq_losses.cross_entropy_sequence_loss(
            logits, targets, sequence_length)

        with self.test_session() as sess:
            loss_vals = sess.run(losses)

        # Within each sequence's length the loss must be strictly positive.
        for length, batch in [(1, 0), (2, 1), (3, 2)]:
            head = loss_vals[:length, batch]
            np.testing.assert_array_less(np.zeros_like(head), head)

        # Past each sequence's length the loss must be masked to zero.
        for length, batch in [(1, 0), (2, 1), (3, 2)]:
            tail = loss_vals[length:, batch]
            np.testing.assert_array_equal(tail, np.zeros_like(tail))
예제 #2
0
    def test_gradients(self):
        """Ensures the parameter gradients can be computed and are not NaN."""
        ex = self._create_example()
        target_len = tf.convert_to_tensor(ex.target_len, dtype=tf.int32)
        decoder_input_fn = FixedDecoderInputs(
            inputs=tf.convert_to_tensor(ex.target, dtype=tf.float32),
            sequence_length=target_len)

        model = self.create_model()
        decoder_output = model.encode_decode(
            source=tf.convert_to_tensor(ex.source, dtype=tf.float32),
            source_len=tf.convert_to_tensor(ex.source_len, dtype=tf.int32),
            decoder_input_fn=decoder_input_fn)

        # Build a scalar loss to differentiate.
        losses = seq2seq_losses.cross_entropy_sequence_loss(
            logits=decoder_output.logits,
            targets=tf.ones_like(decoder_output.predicted_ids),
            sequence_length=target_len)
        mean_loss = tf.reduce_mean(losses)

        optimizer = tf.train.AdamOptimizer()
        gv_pairs = optimizer.compute_gradients(mean_loss)
        train_op = optimizer.apply_gradients(gv_pairs)

        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            _, gv_values = sess.run([train_op, gv_pairs])

        # Every computed gradient must contain no NaN entries.
        for grad_value, _ in gv_values:
            self.assertFalse(np.isnan(grad_value).any())
예제 #3
0
  def compute_loss(self, decoder_output, _features, labels):
    """Computes the loss for this model.

    Returns a tuple `(losses, loss)`, where `losses` are the per
    example-timestep losses (time-major, matching the transposed targets)
    and `loss` is a single scalar tensor to minimize.
    """
    #pylint: disable=R0201
    # The decoder produces per-step final distributions as a TensorArray;
    # stack them into a dense tensor before computing the loss.
    final_dists = self._calc_final_dist(decoder_output, _features)
    final_dists = final_dists.stack()

    # Cross entropy per example-timestep. Targets drop the initial start
    # token and are transposed to time-major layout.
    losses = seq2seq_losses.cross_entropy_sequence_loss(
        logits=final_dists,
        targets=tf.transpose(labels["extend_target_ids"][:, 1:], [1, 0]),
        sequence_length=labels["target_len"] - 1)

    # Calculate the average log perplexity over all valid timesteps.
    loss = tf.reduce_sum(losses) / tf.to_float(
        tf.reduce_sum(labels["target_len"] - 1))

    return losses, loss
 def compute_loss(self, decoder_output, _features, labels):
     """Computes the cross-entropy sequence loss for this model.

     Returns the per example-timestep losses for the output sequence.
     Note: unlike sibling `compute_loss` implementations, this variant
     returns only the losses tensor, not a `(losses, loss)` tuple.
     """
     targets, seq_len = self._targets_and_seq_len(labels)
     # The [:, :, :] slice of the logits is a no-op kept from the original.
     seq_loss = seq2seq_losses.cross_entropy_sequence_loss(
         logits=decoder_output.logits[:, :, :],
         targets=targets,
         sequence_length=seq_len)
     return seq_loss
    def _copy_loss(self, targets, seq_len, attention_scores, copy_indices,
                   copy_id):
        """Returns the copy loss, zeroed wherever the target is not `copy_id`.

        Attention scores act as logits over source positions; copy indices
        drop their initial token and are transposed to time-major.
        """
        logits = attention_scores[:, :, :]
        time_major_targets = tf.transpose(copy_indices[:, 1:], [1, 0])

        per_step_loss = seq2seq_losses.cross_entropy_sequence_loss(
            logits=logits,
            targets=time_major_targets,
            sequence_length=seq_len)

        # Only positions whose target token equals the copy id contribute.
        is_copy = tf.to_float(
            tf.equal(targets, copy_id, "target_equals_copy_id"),
            "copy_mask_to_float")
        return per_step_loss * is_copy
예제 #6
0
def compute_loss(decoder_output, labels, labelLengths):
    """Computes the loss for this model.

    Returns a tuple `(losses, loss)`, where `losses` are the per
    example-timestep losses and `loss` is a single scalar tensor to
    minimize.
    """
    #pylint: disable=R0201
    # Drop the initial start token from the targets and switch them to
    # time-major layout to line up with the logits.
    shifted_targets = tf.transpose(labels[:, 1:], [1, 0])
    valid_lengths = labelLengths - 1

    losses = seq2seq_losses.cross_entropy_sequence_loss(
        logits=decoder_output.logits[:, :, :],
        targets=shifted_targets,
        sequence_length=valid_lengths)

    # Average log perplexity over all non-padding timesteps.
    total_steps = tf.to_float(tf.reduce_sum(valid_lengths))
    loss = tf.reduce_sum(losses) / total_steps

    return losses, loss
예제 #7
0
  def compute_loss(self, decoder_output, _features, labels):
    """Computes the loss for this model.

    Returns a tuple `(losses, loss)`, where `losses` are the per
    example-timestep losses and `loss` is a single scalar tensor to
    minimize.
    """
    #pylint: disable=R0201
    valid_len = labels["target_len"] - 1
    # Per example-timestep cross entropy. Targets drop the start token and
    # are transposed to time-major layout to match the logits.
    losses = seq2seq_losses.cross_entropy_sequence_loss(
        logits=decoder_output.logits[:, :, :],
        targets=tf.transpose(labels["target_ids"][:, 1:], [1, 0]),
        sequence_length=valid_len)

    # Average log perplexity over all valid (non-padding) timesteps.
    denom = tf.to_float(tf.reduce_sum(valid_len))
    loss = tf.reduce_sum(losses) / denom

    return losses, loss
예제 #8
0
  def test_op(self):
    """Loss is positive inside each sequence and exactly zero past its end."""
    logits = np.random.randn(
        self.sequence_length, self.batch_size, self.vocab_size
    ).astype(np.float32)
    seq_lens = np.array([1, 2, 3, 4])
    targets = np.random.randint(
        0, self.vocab_size, [self.sequence_length, self.batch_size])
    losses = seq2seq_losses.cross_entropy_sequence_loss(
        logits, targets, seq_lens)

    with self.test_session() as sess:
      loss_values = sess.run(losses)

    # Losses within the sequence length are strictly greater than zero.
    for length, batch in [(1, 0), (2, 1), (3, 2)]:
      head = loss_values[:length, batch]
      np.testing.assert_array_less(np.zeros_like(head), head)

    # Losses past the sequence length are masked to zero.
    for length, batch in [(1, 0), (2, 1), (3, 2)]:
      tail = loss_values[length:, batch]
      np.testing.assert_array_equal(tail, np.zeros_like(tail))
예제 #9
0
    def _build(self, features, labels, params, mode):
        """Builds the model graph for the given mode.

        Returns a tuple `(predictions, loss, train_op)`. In inference mode
        `loss` and `train_op` are None; in eval mode `train_op` is None.
        """
        # Pre-process features and labels
        features, labels = self.create_featurizer(mode)(features, labels)

        # Add to graph collection for later use (e.g. by hooks/monitors)
        graph_utils.add_dict_to_collection(features, "features")
        if labels:
            graph_utils.add_dict_to_collection(labels, "labels")

        # Optionally reverse each source sequence within its true length
        # before embedding.
        source_ids = features["source_ids"]
        if self.params["source.reverse"] is True:
            source_ids = tf.reverse_sequence(
                input=features["source_ids"],
                seq_lengths=features["source_len"],
                seq_dim=1,
                batch_dim=0,
                name=None)

        # Create embeddings for the source and target vocabularies
        source_embedding = tf.get_variable(
            "source_embedding",
            [self.source_vocab_info.total_size, self.params["embedding.dim"]])
        target_embedding = tf.get_variable(
            "target_embedding",
            [self.target_vocab_info.total_size, self.params["embedding.dim"]])

        # Embed source
        source_embedded = tf.nn.embedding_lookup(source_embedding, source_ids)

        # Graph used for inference
        if mode == tf.contrib.learn.ModeKeys.INFER:
            target_start_id = self.target_vocab_info.special_vocab.SEQUENCE_START
            # Embed the "SEQUENCE_START" token
            initial_input = tf.nn.embedding_lookup(
                target_embedding,
                tf.ones_like(features["source_len"]) * target_start_id)

            def make_input_fn(predictions):
                """Use the embedded prediction as the input to the next time step."""
                return tf.nn.embedding_lookup(target_embedding, predictions)

            def elements_finished_fn(_time_, predictions):
                """Returns true when a prediction is finished."""
                return tf.equal(
                    predictions,
                    tf.cast(self.target_vocab_info.special_vocab.SEQUENCE_END,
                            dtype=predictions.dtype))

            # Feed back each step's own prediction during decoding, stopping
            # at SEQUENCE_END or at the configured maximum length.
            decoder_input_fn_infer = decoders.DynamicDecoderInputs(
                initial_inputs=initial_input,
                make_input_fn=make_input_fn,
                max_decode_length=self.params["inference.max_decode_length"],
                elements_finished_fn=elements_finished_fn)

            # Decode
            decoder_output = self.encode_decode(
                source=source_embedded,
                source_len=features["source_len"],
                decoder_input_fn=decoder_input_fn_infer,
                mode=mode)
            predictions = self._create_predictions(
                decoder_output=decoder_output,
                features=features,
                labels=labels)
            # No loss or train op at inference time.
            return predictions, None, None

        # Embed target
        target_embedded = tf.nn.embedding_lookup(target_embedding,
                                                 labels["target_ids"])

        # During training/eval, we have labels and use them for teacher forcing
        # We don't feed the last SEQUENCE_END token
        decoder_input_fn_train = decoders.FixedDecoderInputs(
            inputs=target_embedded[:, :-1],
            sequence_length=labels["target_len"] - 1)

        decoder_output = self.encode_decode(
            source=source_embedded,
            source_len=features["source_len"],
            decoder_input_fn=decoder_input_fn_train,
            mode=mode)

        # Calculate loss per example-timestep of shape [B, T].
        # Targets drop the initial start token and are transposed to
        # time-major layout to match the logits.
        losses = seq2seq_losses.cross_entropy_sequence_loss(
            logits=decoder_output.logits[:, :, :],
            targets=tf.transpose(labels["target_ids"][:, 1:], [1, 0]),
            sequence_length=labels["target_len"] - 1)

        # Calculate the average log perplexity over valid timesteps
        loss = tf.reduce_sum(losses) / tf.to_float(
            tf.reduce_sum(labels["target_len"] - 1))

        # Learning rate decay schedule built from the optimizer.* params.
        learning_rate_decay_fn = training_utils.create_learning_rate_decay_fn(
            decay_type=self.params["optimizer.lr_decay_type"] or None,
            decay_steps=self.params["optimizer.lr_decay_steps"],
            decay_rate=self.params["optimizer.lr_decay_rate"],
            start_decay_at=self.params["optimizer.lr_start_decay_at"],
            stop_decay_at=self.params["optimizer.lr_stop_decay_at"],
            min_learning_rate=self.params["optimizer.lr_min_learning_rate"],
            staircase=self.params["optimizer.lr_staircase"])

        train_op = tf.contrib.layers.optimize_loss(
            loss=loss,
            global_step=tf.contrib.framework.get_global_step(),
            learning_rate=self.params["optimizer.learning_rate"],
            learning_rate_decay_fn=learning_rate_decay_fn,
            clip_gradients=self.params["optimizer.clip_gradients"],
            optimizer=self.params["optimizer.name"],
            summaries=tf.contrib.layers.optimizers.OPTIMIZER_SUMMARIES)

        # Eval shares the training graph but must not update parameters.
        if mode == tf.contrib.learn.ModeKeys.EVAL:
            train_op = None

        predictions = self._create_predictions(decoder_output=decoder_output,
                                               features=features,
                                               labels=labels,
                                               losses=losses)

        # We add "useful" tensors to the graph collection so that we
        # can easily find them in our hooks/monitors.
        graph_utils.add_dict_to_collection(predictions, "predictions")

        return predictions, loss, train_op
예제 #10
0
    def _build(self, features, labels, params, mode):
        """Builds the model graph for the given mode.

        Returns a tuple `(predictions, loss, train_op)`. In inference mode
        `loss` and `train_op` are None; in eval mode `train_op` is None.
        """
        # Create embeddings for the source and target vocabularies
        source_embedding = tf.get_variable(
            "source_embedding",
            [self.source_vocab_info.total_size, self.params["embedding.dim"]])
        target_embedding = tf.get_variable(
            "target_embedding",
            [self.target_vocab_info.total_size, self.params["embedding.dim"]])

        # Embed source
        source_embedded = tf.nn.embedding_lookup(source_embedding,
                                                 features["source_ids"])

        # Graph used for inference
        if mode == tf.contrib.learn.ModeKeys.INFER:
            target_start_id = self.target_vocab_info.special_vocab.SEQUENCE_START
            # Embed the "SEQUENCE_START" token
            initial_input = tf.nn.embedding_lookup(
                target_embedding,
                tf.ones_like(features["source_len"]) * target_start_id)
            # Use the embedded prediction as the input to the next time step
            decoder_input_fn_infer = decoders.DynamicDecoderInputs(
                initial_inputs=initial_input,
                make_input_fn=lambda x: tf.nn.embedding_lookup(
                    target_embedding, x.predictions))
            # Decode
            decoder_output, _ = self.encode_decode(
                source=source_embedded,
                source_len=features["source_len"],
                decoder_input_fn=decoder_input_fn_infer,
                target_len=self.params["target.max_seq_len"],
                mode=mode)
            # BUG FIX: was `labels=-labels`, which applies unary minus to the
            # labels structure and fails at graph-build time (matches the
            # train-path call below and the sibling `_build` implementation).
            predictions = self._create_predictions(
                features=features,
                labels=labels,
                decoder_output=decoder_output)
            # No loss or train op at inference time.
            return predictions, None, None

        # Embed target
        target_embedded = tf.nn.embedding_lookup(target_embedding,
                                                 labels["target_ids"])

        # During training/eval, we have labels and use them for teacher forcing
        # We don't feed the last SEQUENCE_END token
        decoder_input_fn_train = decoders.FixedDecoderInputs(
            inputs=target_embedded[:, :-1],
            sequence_length=labels["target_len"] - 1)

        decoder_output = self.encode_decode(
            source=source_embedded,
            source_len=features["source_len"],
            decoder_input_fn=decoder_input_fn_train,
            target_len=labels["target_len"],
            mode=mode)

        # TODO: For a long sequence  logits are a huge [B * T, vocab_size] matrix
        # which can lead to OOM errors on a GPU. Fixing this is TODO, maybe we
        # can use map_fn or slice the logits to max(sequence_length).
        # Should benchmark this.

        # Calculate loss per example-timestep of shape [B, T].
        # Logits drop the final step and targets drop the initial start token
        # so both are aligned on the predicted positions.
        losses = seq2seq_losses.cross_entropy_sequence_loss(
            logits=decoder_output.logits[:, :-1, :],
            targets=labels["target_ids"][:, 1:],
            sequence_length=labels["target_len"] - 1)

        # Calculate per-example log perplexities of shape [B]
        log_perplexities = tf.div(tf.reduce_sum(losses, reduction_indices=1),
                                  tf.to_float(labels["target_len"] - 1))

        loss = tf.reduce_mean(log_perplexities)

        train_op = tf.contrib.layers.optimize_loss(
            loss=loss,
            global_step=tf.contrib.framework.get_global_step(),
            learning_rate=self.params["optimizer.learning_rate"],
            clip_gradients=self.params["optimizer.clip_gradients"],
            optimizer=self.params["optimizer.name"],
            summaries=tf.contrib.layers.optimizers.OPTIMIZER_SUMMARIES)

        # Eval shares the training graph but must not update parameters.
        if mode == tf.contrib.learn.ModeKeys.EVAL:
            train_op = None

        predictions = self._create_predictions(
            features=features,
            labels=labels,
            decoder_output=decoder_output,
            log_perplexities=log_perplexities)

        # We add "useful" tensors to the graph collection so that we
        # can easily find them in our hooks/monitors.
        # TODO: Is there a cleaner way to do this?
        for key, tensor in predictions.items():
            tf.add_to_collection("model_output_keys", key)
            tf.add_to_collection("model_output_values", tensor)

        for key, tensor in features.items():
            tf.add_to_collection("features_keys", key)
            tf.add_to_collection("features_values", tensor)

        for key, tensor in labels.items():
            tf.add_to_collection("labels_keys", key)
            tf.add_to_collection("labels_values", tensor)

        # Summaries
        tf.summary.scalar("loss", loss)

        return predictions, loss, train_op