def test_op(self):
  logits = np.random.randn(self.sequence_length, self.batch_size,
                           self.vocab_size)
  logits = logits.astype(np.float32)
  sequence_length = np.array([1, 2, 3, 4])
  targets = np.random.randint(0, self.vocab_size,
                              [self.sequence_length, self.batch_size])

  losses = seq2seq_losses.cross_entropy_sequence_loss(logits, targets,
                                                      sequence_length)

  with self.test_session() as sess:
    losses_ = sess.run(losses)

  # Make sure all losses not past the sequence length are > 0
  np.testing.assert_array_less(np.zeros_like(losses_[:1, 0]), losses_[:1, 0])
  np.testing.assert_array_less(np.zeros_like(losses_[:2, 1]), losses_[:2, 1])
  np.testing.assert_array_less(np.zeros_like(losses_[:3, 2]), losses_[:3, 2])

  # Make sure all losses past the sequence length are 0
  np.testing.assert_array_equal(losses_[1:, 0], np.zeros_like(losses_[1:, 0]))
  np.testing.assert_array_equal(losses_[2:, 1], np.zeros_like(losses_[2:, 1]))
  np.testing.assert_array_equal(losses_[3:, 2], np.zeros_like(losses_[3:, 2]))
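# A minimal sketch of the masking behavior the test above verifies, assuming
# time-major [T, B, vocab] logits and [T, B] targets. This is an illustrative
# stand-in, not necessarily the library's exact implementation of
# cross_entropy_sequence_loss: per-step cross-entropy multiplied by a 0/1
# sequence mask, so losses past each example's length come out as zero.
import tensorflow as tf

def masked_sequence_loss_sketch(logits, targets, sequence_length):
  # Per example-timestep cross-entropy, shape [T, B]
  losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
      logits=logits, labels=targets)
  # [B, T] mask that is True before sequence_length and False after
  loss_mask = tf.sequence_mask(
      tf.to_int32(sequence_length), tf.to_int32(tf.shape(targets)[0]))
  # Zero out losses past each example's sequence length
  return losses * tf.transpose(tf.to_float(loss_mask), [1, 0])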
def test_gradients(self):
  """Ensures the parameter gradients can be computed and are not NaN."""
  ex = self._create_example()
  decoder_input_fn = FixedDecoderInputs(
      inputs=tf.convert_to_tensor(ex.target, dtype=tf.float32),
      sequence_length=tf.convert_to_tensor(ex.target_len, dtype=tf.int32))

  model = self.create_model()
  decoder_output = model.encode_decode(
      source=tf.convert_to_tensor(ex.source, dtype=tf.float32),
      source_len=tf.convert_to_tensor(ex.source_len, dtype=tf.int32),
      decoder_input_fn=decoder_input_fn)

  # Get a loss to optimize
  losses = seq2seq_losses.cross_entropy_sequence_loss(
      logits=decoder_output.logits,
      targets=tf.ones_like(decoder_output.predicted_ids),
      sequence_length=tf.convert_to_tensor(ex.target_len, dtype=tf.int32))
  mean_loss = tf.reduce_mean(losses)

  optimizer = tf.train.AdamOptimizer()
  grads_and_vars = optimizer.compute_gradients(mean_loss)
  train_op = optimizer.apply_gradients(grads_and_vars)

  with self.test_session() as sess:
    sess.run(tf.global_variables_initializer())
    _, grads_and_vars_ = sess.run([train_op, grads_and_vars])

  for grad, _ in grads_and_vars_:
    self.assertFalse(np.isnan(grad).any())
def compute_loss(self, decoder_output, _features, labels):
  """Computes the loss for this model.

  Returns a tuple `(losses, loss)`, where `losses` are the per-batch
  losses and `loss` is a single scalar tensor to minimize.
  """
  #pylint: disable=R0201
  # Calculate loss per example-timestep of shape [B, T]
  final_dists = self._calc_final_dist(decoder_output, _features)
  final_dists = final_dists.stack()

  # losses = seq2seq_losses.cross_entropy_sequence_loss(
  #     logits=decoder_output.logits[:, :, :],
  #     targets=tf.transpose(labels["target_ids"][:, 1:], [1, 0]),
  #     sequence_length=labels["target_len"] - 1)
  losses = seq2seq_losses.cross_entropy_sequence_loss(
      logits=final_dists,
      targets=tf.transpose(labels["extend_target_ids"][:, 1:], [1, 0]),
      sequence_length=labels["target_len"] - 1)

  # Calculate the average log perplexity
  loss = tf.reduce_sum(losses) / tf.to_float(
      tf.reduce_sum(labels["target_len"] - 1))

  return losses, loss
def compute_loss(self, decoder_output, _features, labels):
  """Computes the sequence loss for this model.

  Returns `seq_loss`, the per example-timestep cross-entropy losses
  for the output sequence.
  """
  targets, seq_len = self._targets_and_seq_len(labels)
  seq_loss = seq2seq_losses.cross_entropy_sequence_loss(
      logits=decoder_output.logits[:, :, :],
      targets=targets,
      sequence_length=seq_len)
  return seq_loss
def _copy_loss(self, targets, seq_len, attention_scores, copy_indices,
               copy_id):
  # Cross-entropy of the attention scores against the copy indices
  copy_logits = attention_scores[:, :, :]
  copy_targets = tf.transpose(copy_indices[:, 1:], [1, 0])
  copy_loss = seq2seq_losses.cross_entropy_sequence_loss(
      logits=copy_logits,
      targets=copy_targets,
      sequence_length=seq_len)
  # Only keep the copy loss at time steps whose target is the copy token
  copy_mask = tf.equal(targets, copy_id, "target_equals_copy_id")
  copy_mask = tf.to_float(copy_mask, "copy_mask_to_float")
  masked_loss = copy_loss * copy_mask
  return masked_loss
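# Tiny numeric illustration (hypothetical values) of the masking in _copy_loss
# above: the copy loss only survives at time steps whose target equals copy_id.
import numpy as np

COPY_ID = 1  # hypothetical id of the copy token
targets_ = np.array([[5, COPY_ID],
                     [COPY_ID, 3]])                     # target ids
copy_loss_ = np.array([[0.7, 0.2],
                       [0.4, 0.9]], dtype=np.float32)   # per-step copy losses
copy_mask_ = (targets_ == COPY_ID).astype(np.float32)   # 1.0 only at copy steps
masked_loss_ = copy_loss_ * copy_mask_                  # [[0.0, 0.2], [0.4, 0.0]]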
def compute_loss(decoder_output, labels, labelLengths):
  """Computes the loss for this model.

  Returns a tuple `(losses, loss)`, where `losses` are the per-batch
  losses and `loss` is a single scalar tensor to minimize.
  """
  #pylint: disable=R0201
  # Calculate loss per example-timestep of shape [B, T]
  losses = seq2seq_losses.cross_entropy_sequence_loss(
      logits=decoder_output.logits[:, :, :],
      targets=tf.transpose(labels[:, 1:], [1, 0]),
      sequence_length=labelLengths - 1)

  # Calculate the average log perplexity
  loss = tf.reduce_sum(losses) / tf.to_float(tf.reduce_sum(labelLengths - 1))

  return losses, loss
def compute_loss(self, decoder_output, _features, labels):
  """Computes the loss for this model.

  Returns a tuple `(losses, loss)`, where `losses` are the per-batch
  losses and `loss` is a single scalar tensor to minimize.
  """
  #pylint: disable=R0201
  # Calculate loss per example-timestep of shape [B, T]
  losses = seq2seq_losses.cross_entropy_sequence_loss(
      logits=decoder_output.logits[:, :, :],
      targets=tf.transpose(labels["target_ids"][:, 1:], [1, 0]),
      sequence_length=labels["target_len"] - 1)

  # Calculate the average log perplexity
  loss = tf.reduce_sum(losses) / tf.to_float(
      tf.reduce_sum(labels["target_len"] - 1))

  return losses, loss
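# Tiny numeric illustration (hypothetical values) of the length-normalized
# reduction used above: the scalar loss is the sum of all masked per-timestep
# losses divided by the total number of predicted tokens, i.e. the average
# log perplexity per target token.
import numpy as np

losses_ = np.array([[2.0, 1.0],
                    [0.5, 0.0]], dtype=np.float32)  # zeroed past each length
target_len_ = np.array([3, 2])                      # lengths incl. start token
loss_ = losses_.sum() / float((target_len_ - 1).sum())  # 3.5 / 3 ~= 1.17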
def _build(self, features, labels, params, mode):
  # Pre-process features and labels
  features, labels = self.create_featurizer(mode)(features, labels)

  # Add to graph collection for later use
  graph_utils.add_dict_to_collection(features, "features")
  if labels:
    graph_utils.add_dict_to_collection(labels, "labels")

  source_ids = features["source_ids"]
  if self.params["source.reverse"] is True:
    source_ids = tf.reverse_sequence(
        input=features["source_ids"],
        seq_lengths=features["source_len"],
        seq_dim=1,
        batch_dim=0,
        name=None)

  # Create embeddings
  source_embedding = tf.get_variable(
      "source_embedding",
      [self.source_vocab_info.total_size, self.params["embedding.dim"]])
  target_embedding = tf.get_variable(
      "target_embedding",
      [self.target_vocab_info.total_size, self.params["embedding.dim"]])

  # Embed source
  source_embedded = tf.nn.embedding_lookup(source_embedding, source_ids)

  # Graph used for inference
  if mode == tf.contrib.learn.ModeKeys.INFER:
    target_start_id = self.target_vocab_info.special_vocab.SEQUENCE_START
    # Embed the "SEQUENCE_START" token
    initial_input = tf.nn.embedding_lookup(
        target_embedding,
        tf.ones_like(features["source_len"]) * target_start_id)

    def make_input_fn(predictions):
      """Use the embedded prediction as the input to the next time step"""
      return tf.nn.embedding_lookup(target_embedding, predictions)

    def elements_finished_fn(_time_, predictions):
      """Returns true when a prediction is finished"""
      return tf.equal(
          predictions,
          tf.cast(self.target_vocab_info.special_vocab.SEQUENCE_END,
                  dtype=predictions.dtype))

    decoder_input_fn_infer = decoders.DynamicDecoderInputs(
        initial_inputs=initial_input,
        make_input_fn=make_input_fn,
        max_decode_length=self.params["inference.max_decode_length"],
        elements_finished_fn=elements_finished_fn)

    # Decode
    decoder_output = self.encode_decode(
        source=source_embedded,
        source_len=features["source_len"],
        decoder_input_fn=decoder_input_fn_infer,
        mode=mode)
    predictions = self._create_predictions(
        decoder_output=decoder_output, features=features, labels=labels)
    return predictions, None, None

  # Embed target
  target_embedded = tf.nn.embedding_lookup(target_embedding,
                                           labels["target_ids"])

  # During training/eval, we have labels and use them for teacher forcing
  # We don't feed the last SEQUENCE_END token
  decoder_input_fn_train = decoders.FixedDecoderInputs(
      inputs=target_embedded[:, :-1],
      sequence_length=labels["target_len"] - 1)

  decoder_output = self.encode_decode(
      source=source_embedded,
      source_len=features["source_len"],
      decoder_input_fn=decoder_input_fn_train,
      mode=mode)

  # Calculate loss per example-timestep of shape [B, T]
  losses = seq2seq_losses.cross_entropy_sequence_loss(
      logits=decoder_output.logits[:, :, :],
      targets=tf.transpose(labels["target_ids"][:, 1:], [1, 0]),
      sequence_length=labels["target_len"] - 1)

  # Calculate the average log perplexity
  loss = tf.reduce_sum(losses) / tf.to_float(
      tf.reduce_sum(labels["target_len"] - 1))

  learning_rate_decay_fn = training_utils.create_learning_rate_decay_fn(
      decay_type=self.params["optimizer.lr_decay_type"] or None,
      decay_steps=self.params["optimizer.lr_decay_steps"],
      decay_rate=self.params["optimizer.lr_decay_rate"],
      start_decay_at=self.params["optimizer.lr_start_decay_at"],
      stop_decay_at=self.params["optimizer.lr_stop_decay_at"],
      min_learning_rate=self.params["optimizer.lr_min_learning_rate"],
      staircase=self.params["optimizer.lr_staircase"])

  train_op = tf.contrib.layers.optimize_loss(
      loss=loss,
      global_step=tf.contrib.framework.get_global_step(),
      learning_rate=self.params["optimizer.learning_rate"],
      learning_rate_decay_fn=learning_rate_decay_fn,
      clip_gradients=self.params["optimizer.clip_gradients"],
      optimizer=self.params["optimizer.name"],
      summaries=tf.contrib.layers.optimizers.OPTIMIZER_SUMMARIES)

  if mode == tf.contrib.learn.ModeKeys.EVAL:
    train_op = None

  predictions = self._create_predictions(
      decoder_output=decoder_output,
      features=features,
      labels=labels,
      losses=losses)

  # We add "useful" tensors to the graph collection so that we
  # can easily find them in our hooks/monitors.
  graph_utils.add_dict_to_collection(predictions, "predictions")

  return predictions, loss, train_op
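# A tiny illustration (hypothetical ids) of the teacher-forcing shift used in
# the training branch above: the decoder is fed target_ids[:, :-1] and the
# loss compares its logits against target_ids[:, 1:], so both have length
# target_len - 1 and the final SEQUENCE_END token is never fed as input.
import numpy as np

SEQUENCE_START, SEQUENCE_END = 1, 2    # hypothetical special token ids
target_ids_ = np.array([[SEQUENCE_START, 7, 8, SEQUENCE_END]])
decoder_inputs_ = target_ids_[:, :-1]  # [START, 7, 8]  -> fed to the decoder
loss_targets_ = target_ids_[:, 1:]     # [7, 8, END]    -> compared with logits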
def _build(self, features, labels, params, mode):
  # Create embeddings
  source_embedding = tf.get_variable(
      "source_embedding",
      [self.source_vocab_info.total_size, self.params["embedding.dim"]])
  target_embedding = tf.get_variable(
      "target_embedding",
      [self.target_vocab_info.total_size, self.params["embedding.dim"]])

  # Embed source
  source_embedded = tf.nn.embedding_lookup(source_embedding,
                                           features["source_ids"])

  # Graph used for inference
  if mode == tf.contrib.learn.ModeKeys.INFER:
    target_start_id = self.target_vocab_info.special_vocab.SEQUENCE_START
    # Embed the "SEQUENCE_START" token
    initial_input = tf.nn.embedding_lookup(
        target_embedding,
        tf.ones_like(features["source_len"]) * target_start_id)
    # Use the embedded prediction as the input to the next time step
    decoder_input_fn_infer = decoders.DynamicDecoderInputs(
        initial_inputs=initial_input,
        make_input_fn=lambda x: tf.nn.embedding_lookup(target_embedding,
                                                       x.predictions))
    # Decode
    decoder_output, _ = self.encode_decode(
        source=source_embedded,
        source_len=features["source_len"],
        decoder_input_fn=decoder_input_fn_infer,
        target_len=self.params["target.max_seq_len"],
        mode=mode)
    predictions = self._create_predictions(
        features=features, labels=labels, decoder_output=decoder_output)
    return predictions, None, None

  # Embed target
  target_embedded = tf.nn.embedding_lookup(target_embedding,
                                           labels["target_ids"])

  # During training/eval, we have labels and use them for teacher forcing
  # We don't feed the last SEQUENCE_END token
  decoder_input_fn_train = decoders.FixedDecoderInputs(
      inputs=target_embedded[:, :-1],
      sequence_length=labels["target_len"] - 1)

  decoder_output = self.encode_decode(
      source=source_embedded,
      source_len=features["source_len"],
      decoder_input_fn=decoder_input_fn_train,
      target_len=labels["target_len"],
      mode=mode)

  # TODO: For a long sequence logits are a huge [B * T, vocab_size] matrix
  # which can lead to OOM errors on a GPU. Fixing this is TODO, maybe we
  # can use map_fn or slice the logits to max(sequence_length).
  # Should benchmark this.

  # Calculate loss per example-timestep of shape [B, T]
  losses = seq2seq_losses.cross_entropy_sequence_loss(
      logits=decoder_output.logits[:, :-1, :],
      targets=labels["target_ids"][:, 1:],
      sequence_length=labels["target_len"] - 1)

  # Calculate per-example losses of shape [B]
  log_perplexities = tf.div(tf.reduce_sum(losses, reduction_indices=1),
                            tf.to_float(labels["target_len"] - 1))
  loss = tf.reduce_mean(log_perplexities)

  train_op = tf.contrib.layers.optimize_loss(
      loss=loss,
      global_step=tf.contrib.framework.get_global_step(),
      learning_rate=self.params["optimizer.learning_rate"],
      clip_gradients=self.params["optimizer.clip_gradients"],
      optimizer=self.params["optimizer.name"],
      summaries=tf.contrib.layers.optimizers.OPTIMIZER_SUMMARIES)

  if mode == tf.contrib.learn.ModeKeys.EVAL:
    train_op = None

  predictions = self._create_predictions(
      features=features,
      labels=labels,
      decoder_output=decoder_output,
      log_perplexities=log_perplexities)

  # We add "useful" tensors to the graph collection so that we
  # can easily find them in our hooks/monitors.
  # TODO: Is there a cleaner way to do this?
  for key, tensor in predictions.items():
    tf.add_to_collection("model_output_keys", key)
    tf.add_to_collection("model_output_values", tensor)
  for key, tensor in features.items():
    tf.add_to_collection("features_keys", key)
    tf.add_to_collection("features_values", tensor)
  for key, tensor in labels.items():
    tf.add_to_collection("labels_keys", key)
    tf.add_to_collection("labels_values", tensor)

  # Summaries
  tf.summary.scalar("loss", loss)

  return predictions, loss, train_op
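# One possible realization of the OOM TODO above (an assumption, not the
# author's fix): slice logits and targets to the longest target length in the
# batch before computing the loss, so the softmax runs over [B, max_len, vocab]
# instead of the fully padded [B, T, vocab]. The arguments are the same
# tensors used inside _build (decoder_output.logits, labels["target_ids"],
# labels["target_len"]); the masked loss values are unchanged.
def sliced_sequence_loss_sketch(logits, target_ids, target_len):
  max_len = tf.reduce_max(target_len - 1)
  return seq2seq_losses.cross_entropy_sequence_loss(
      logits=logits[:, :max_len, :],
      targets=target_ids[:, 1:max_len + 1],
      sequence_length=target_len - 1)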