def _compute_loss(self, outputs, src_ids_out, src_length, tgt_ids_out, tgt_length,
                  mu_src, logvar_src, mu_tgt, logvar_tgt, params, mode):
    if mode == "Training":
        mode = tf.estimator.ModeKeys.TRAIN
    else:
        mode = tf.estimator.ModeKeys.EVAL
    if self.Loss_type == "Cross_Entropy":
        if isinstance(outputs, dict):
            logits_src_from_src = outputs["logits_src_from_src"]
            logits_src_from_tgt = outputs["logits_src_from_tgt"]
            logits_tgt_from_src = outputs["logits_tgt_from_src"]
            logits_tgt_from_tgt = outputs["logits_tgt_from_tgt"]
        loss_src_from_src, loss_normalizer_src_from_src, loss_token_normalizer_src_from_src = \
            cross_entropy_sequence_loss(
                logits_src_from_src, src_ids_out, src_length + 1,
                label_smoothing=params.get("label_smoothing", 0.0),
                average_in_time=params.get("average_loss_in_time", True),
                mode=mode)
        loss_src_from_tgt, loss_normalizer_src_from_tgt, loss_token_normalizer_src_from_tgt = \
            cross_entropy_sequence_loss(
                logits_src_from_tgt, src_ids_out, src_length + 1,
                label_smoothing=params.get("label_smoothing", 0.0),
                average_in_time=params.get("average_loss_in_time", True),
                mode=mode)
        loss_tgt_from_src, loss_normalizer_tgt_from_src, loss_token_normalizer_tgt_from_src = \
            cross_entropy_sequence_loss(
                logits_tgt_from_src, tgt_ids_out, tgt_length + 1,
                label_smoothing=params.get("label_smoothing", 0.0),
                average_in_time=params.get("average_loss_in_time", True),
                mode=mode)
        loss_tgt_from_tgt, loss_normalizer_tgt_from_tgt, loss_token_normalizer_tgt_from_tgt = \
            cross_entropy_sequence_loss(
                logits_tgt_from_tgt, tgt_ids_out, tgt_length + 1,
                label_smoothing=params.get("label_smoothing", 0.0),
                average_in_time=params.get("average_loss_in_time", True),
                mode=mode)
        # ----- Calculating the KL divergence -----
        kld_loss_src = -0.5 * tf.reduce_sum(
            logvar_src - tf.pow(mu_src, 2) - tf.exp(logvar_src) + 1, 1)
        kld_loss_tgt = -0.5 * tf.reduce_sum(
            logvar_tgt - tf.pow(mu_tgt, 2) - tf.exp(logvar_tgt) + 1, 1)
        return loss_src_from_src, loss_normalizer_src_from_src, loss_token_normalizer_src_from_src, \
            loss_src_from_tgt, loss_normalizer_src_from_tgt, loss_token_normalizer_src_from_tgt, \
            loss_tgt_from_src, loss_normalizer_tgt_from_src, loss_token_normalizer_tgt_from_src, \
            loss_tgt_from_tgt, loss_normalizer_tgt_from_tgt, loss_token_normalizer_tgt_from_tgt, \
            kld_loss_src, kld_loss_tgt
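# The two KL terms above are the standard closed-form KL divergence between the
# approximate posterior N(mu, exp(logvar)) and a standard normal prior:
# KL = -0.5 * sum(1 + logvar - mu^2 - exp(logvar)). A minimal standalone check
# of that identity (a sketch assuming TF2 eager execution; the toy values are
# illustrative only):
import tensorflow as tf

mu = tf.constant([[0.0, 1.0]])
logvar = tf.zeros([1, 2])  # log(1) = 0, i.e. unit variance
kld = -0.5 * tf.reduce_sum(logvar - tf.square(mu) - tf.exp(logvar) + 1, axis=1)
print(kld.numpy())  # [0.5], since KL(N(1, 1) || N(0, 1)) = 0.5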
def compute_loss(self, outputs, labels, training=True, params=None):
    outputs, mask = outputs
    if params is None:
        params = {}
    if self.crf_decoding:
        log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
            outputs,
            tf.cast(labels["tags_id"], tf.int32),
            labels["length"],
            transition_params=self.transition_params)
        loss = tf.reduce_sum(-log_likelihood)
        loss_normalizer = tf.cast(tf.shape(log_likelihood)[0], loss.dtype)
        return loss, loss_normalizer
    else:
        return cross_entropy_sequence_loss(
            outputs,
            labels["tags_id"],
            labels["length"],
            label_smoothing=params.get("label_smoothing", 0.0),
            average_in_time=params.get("average_loss_in_time", False),
            mask=mask,
            training=training)
def compute_loss(self, outputs, labels, training=True, params=None):
    if params is None:
        params = {}
    if isinstance(outputs, dict):
        logits = outputs["logits"]
        attention = outputs.get("attention")
    else:
        logits = outputs
        attention = None
    labels_lengths = self.labels_inputter.get_length(labels)
    loss, loss_normalizer, loss_token_normalizer = cross_entropy_sequence_loss(
        logits,
        labels["ids_out"],
        labels_lengths,
        label_smoothing=params.get("label_smoothing", 0.0),
        average_in_time=params.get("average_loss_in_time", False),
        training=training)
    if training:
        gold_alignments = labels.get("alignment")
        guided_alignment_type = params.get("guided_alignment_type")
        if gold_alignments is not None and guided_alignment_type is not None:
            if attention is None:
                tf.logging.warning(
                    "This model did not return attention vectors; "
                    "guided alignment will not be applied")
            else:
                loss += guided_alignment_cost(
                    attention[:, :-1],  # Do not constrain last timestep.
                    gold_alignments,
                    labels_lengths - 1,
                    guided_alignment_type,
                    guided_alignment_weight=params.get("guided_alignment_weight", 1))
    return loss, loss_normalizer, loss_token_normalizer
def testWeightedAndMaskedCrossEntropySequenceLoss(self):
    logits = tf.constant(
        [
            [[0.1, 0.2, 0.9], [-1.2, 2.1, 0], [0.6, 0.3, 0.4]],
            [[-2.2, -0.2, -1.2], [2.3, 0.2, -0.1], [0.0, 0.1, 0.7]],
        ]
    )
    labels = tf.constant([[2, 1, 0], [1, 0, 2]], dtype=tf.int32)
    lengths = tf.constant([3, 2], dtype=tf.int32)
    weights = tf.constant([0.6, 1.2])
    loss, train_norm, stats_norm = losses.cross_entropy_sequence_loss(
        logits,
        labels,
        sequence_length=lengths,
        sequence_weight=weights,
        training=True,
    )
    self.assertNear(loss, 1.77306, 1e-5)
    self.assertNear(train_norm, tf.reduce_sum(weights), 1e-5)
    self.assertNear(
        stats_norm,
        tf.reduce_sum(tf.cast(lengths, tf.float32) * weights),
        1e-5,
    )
def _compute_loss(self, features, labels, outputs, params, mode):
    return cross_entropy_sequence_loss(
        outputs,
        labels["ids_out"],
        self._get_labels_length(labels),
        label_smoothing=params.get("label_smoothing", 0.0),
        mode=mode)
def build_model(source, target, mode, reuse=False):
    # Encode the source.
    with tf.variable_scope("encoder", reuse=reuse):
        source_embedding = source_inputter.make_inputs(source, training=True)
        memory, _, _ = encoder.encode(source_embedding, source["length"], mode=mode)

    # Decode the target.
    with tf.variable_scope("decoder", reuse=reuse):
        target_embedding = target_inputter.make_inputs(target, training=True)
        logits, _, _ = decoder.decode(
            target_embedding,
            target["length"],
            vocab_size=target_inputter.vocabulary_size,
            mode=mode,
            memory=memory,
            memory_sequence_length=source["length"])

    #logits = tf.Print(logits, [tf.argmax(logits, -1)[0]], summarize=maximum_length)
    #logits = tf.Print(logits, [tr_target['ids_out'][0]], summarize=maximum_length)

    # Compute the loss.
    loss, normalizer, _ = losses.cross_entropy_sequence_loss(
        logits,
        target["ids_out"],
        target["length"],
        label_smoothing=0.1,
        average_in_time=True,
        mode=mode)
    loss /= normalizer
    return loss
def _compute_loss(self, features, labels, outputs, params, mode):
    if isinstance(outputs, dict):
        logits = outputs["logits"]
        attention = outputs.get("attention")
    else:
        logits = outputs
        attention = None
    labels_lengths = self.labels_inputter.get_length(labels)
    loss, loss_normalizer, loss_token_normalizer = cross_entropy_sequence_loss(
        logits,
        labels["ids_out"],
        labels_lengths,
        label_smoothing=params.get("label_smoothing", 0.0),
        average_in_time=params.get("average_loss_in_time", False),
        mode=mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        gold_alignments = labels.get("alignment")
        guided_alignment_type = params.get("guided_alignment_type")
        if gold_alignments is not None and guided_alignment_type is not None:
            if attention is None:
                tf.logging.warning(
                    "This model did not return attention vectors; "
                    "guided alignment will not be applied")
            else:
                # Note: the first decoder input is <s> for which we don't want any alignment.
                loss += guided_alignment_cost(
                    attention[:, 1:],
                    gold_alignments,
                    labels_lengths - 1,
                    guided_alignment_type,
                    guided_alignment_weight=params.get("guided_alignment_weight", 1))
    return loss, loss_normalizer, loss_token_normalizer
def _compute_loss(self, features, labels, outputs, params, mode):
    return cross_entropy_sequence_loss(
        outputs,
        labels["ids_out"],
        self._get_labels_length(labels),
        label_smoothing=params.get("label_smoothing", 0.0),
        average_in_time=params.get("average_loss_in_time", False),
        mode=mode)
def compute_loss(self, outputs, labels, training=True):
    return losses.cross_entropy_sequence_loss(
        outputs["logits"],
        labels["ids_out"],
        labels["length"],
        label_smoothing=self.params.get("label_smoothing", 0.0),
        average_in_time=self.params.get("average_loss_in_time", False),
        training=training)
def testCrossEntropySequenceLoss(self):
    logits = tf.constant([
        [[0.1, 0.2, 0.9], [-1.2, 2.1, 0], [0.6, 0.3, 0.4]],
        [[-2.2, -0.2, -1.2], [2.3, 0.2, -0.1], [0.0, 0.1, 0.7]],
    ])
    labels = tf.constant([[2, 1, 0], [1, 0, 2]], dtype=tf.int32)
    loss, training_norm, stats_norm = losses.cross_entropy_sequence_loss(
        logits, labels, training=True)
    self.assertNear(loss, 3.06985, 1e-5)
    self.assertEqual(training_norm, 2)
    self.assertEqual(stats_norm, 6)

    _, training_norm, stats_norm = losses.cross_entropy_sequence_loss(
        logits, labels, average_in_time=True, training=True)
    self.assertEqual(training_norm, 6)
    self.assertEqual(stats_norm, 6)
def testMaskedCrossEntropySequenceLoss(self):
    logits = tf.constant([
        [[0.1, 0.2, 0.9], [-1.2, 2.1, 0], [0.6, 0.3, 0.4]],
        [[-2.2, -0.2, -1.2], [2.3, 0.2, -0.1], [0.0, 0.1, 0.7]],
    ])
    labels = tf.constant([[2, 1, 0], [1, 0, 2]], dtype=tf.int32)
    lengths = tf.constant([2, 1], dtype=tf.int32)
    loss, _, stats_norm = losses.cross_entropy_sequence_loss(
        logits, labels, sequence_length=lengths, training=True)
    self.assertNear(loss, 1.22118, 1e-5)
    self.assertEqual(stats_norm, 3)
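# The masked tests above zero out timesteps beyond each sequence length; a
# minimal sketch of the equivalent mask construction (illustrative only, not
# the library's internal implementation):
lengths = tf.constant([2, 1], dtype=tf.int32)
mask = tf.sequence_mask(lengths, maxlen=3, dtype=tf.float32)
# mask == [[1., 1., 0.], [1., 0., 0.]]; masked positions contribute no loss
# and are excluded from the token count (hence stats_norm == 3 above).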
def compute_loss(self, outputs, labels, training=True):
    params = self.params
    if not isinstance(outputs, dict):
        outputs = dict(logits=outputs)
    logits = outputs["logits"]
    noisy_logits = outputs.get("noisy_logits")
    attention = outputs.get("attention")
    if noisy_logits is not None and params.get("contrastive_learning"):
        return losses.max_margin_loss(
            logits,
            labels["ids_out"],
            labels["length"],
            noisy_logits,
            labels["noisy_ids_out"],
            labels["noisy_length"],
            eta=params.get("max_margin_eta", 0.1),
        )
    (
        loss,
        loss_normalizer,
        loss_token_normalizer,
    ) = losses.cross_entropy_sequence_loss(
        logits,
        labels["ids_out"],
        sequence_length=labels["length"],
        sequence_weight=labels.get("weight"),
        label_smoothing=params.get("label_smoothing", 0.0),
        average_in_time=params.get("average_loss_in_time", False),
        training=training,
    )
    if training:
        gold_alignments = labels.get("alignment")
        guided_alignment_type = params.get("guided_alignment_type")
        if gold_alignments is not None and guided_alignment_type is not None:
            if attention is None:
                tf.get_logger().warning(
                    "This model did not return attention vectors; "
                    "guided alignment will not be applied"
                )
            else:
                loss += losses.guided_alignment_cost(
                    attention[:, :-1],  # Do not constrain last timestep.
                    gold_alignments,
                    sequence_length=self.labels_inputter.get_length(
                        labels, ignore_special_tokens=True
                    ),
                    cost_type=guided_alignment_type,
                    weight=params.get("guided_alignment_weight", 1),
                )
    return loss, loss_normalizer, loss_token_normalizer
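# Note on the contrastive branch above: max_margin_loss trains the model to
# score the gold sequence above its noisy counterpart (e.g. with words
# omitted) by a margin of eta. The exact formulation is the one implemented
# by the library's losses.max_margin_loss and is not reproduced here.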
def _compute_loss(self, features, labels, outputs, params, mode):
    length = self._get_features_length(features)
    if self.crf_decoding:
        with tf.variable_scope(tf.get_variable_scope(),
                               reuse=mode != tf.estimator.ModeKeys.TRAIN):
            log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
                outputs, tf.cast(labels["tags_id"], tf.int32), length)
        return tf.reduce_mean(-log_likelihood)
    else:
        return cross_entropy_sequence_loss(
            outputs,
            labels["tags_id"],
            length,
            label_smoothing=params.get("label_smoothing", 0.0),
            mode=mode)
def compute_loss(self, outputs, labels, training=True):
    if self.crf_decoding:
        log_likelihood, _ = tfa.text.crf_log_likelihood(
            outputs,
            tf.cast(labels["tags_id"], tf.int32),
            labels["length"],
            transition_params=self.transition_params)
        batch_size = tf.shape(log_likelihood)[0]
        return tf.reduce_sum(-log_likelihood) / tf.cast(
            batch_size, log_likelihood.dtype)
    else:
        return cross_entropy_sequence_loss(
            outputs,
            labels["tags_id"],
            labels["length"],
            label_smoothing=self.params.get("label_smoothing", 0.0),
            average_in_time=self.params.get("average_loss_in_time", False),
            training=training)
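# tfa.text.crf_log_likelihood, used above, takes unary potentials of shape
# [batch, time, num_tags] and a [num_tags, num_tags] transition matrix. A
# minimal sketch of the call with toy shapes (assumed, illustrative data):
import tensorflow as tf
import tensorflow_addons as tfa

potentials = tf.random.normal([2, 5, 4])     # [batch, time, num_tags]
tags = tf.zeros([2, 5], dtype=tf.int32)      # gold tag indices
lengths = tf.constant([5, 3], dtype=tf.int32)
transitions = tf.random.normal([4, 4])
log_likelihood, _ = tfa.text.crf_log_likelihood(
    potentials, tags, lengths, transition_params=transitions)
loss = -tf.reduce_mean(log_likelihood)  # average negative log-likelihood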
def _compute_loss(self, features, labels, outputs, params, mode):
    length = self._get_features_length(features)
    if self.crf_decoding:
        with tf.variable_scope(tf.get_variable_scope(),
                               reuse=mode != tf.estimator.ModeKeys.TRAIN):
            log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
                outputs, tf.cast(labels["tags_id"], tf.int32), length)
        loss = tf.reduce_sum(-log_likelihood)
        loss_normalizer = tf.shape(log_likelihood)[0]
        return loss, loss_normalizer
    else:
        return cross_entropy_sequence_loss(
            outputs,
            labels["tags_id"],
            length,
            label_smoothing=params.get("label_smoothing", 0.0),
            average_in_time=params.get("average_loss_in_time", False),
            mode=mode)
def denoise(x, embedding, encoder_outputs, generator, decoder, reuse=None):
    """Denoises from the noisy encoding.

    Args:
        x: The input data from the dataset.
        embedding: The embedding variable.
        encoder_outputs: A tuple with the encoder outputs.
        generator: A tf.layers.Dense instance for projecting the logits.
        decoder: The decoder to run over the noisy encoding.
        reuse: If True, reuse the decoder variables.

    Returns:
        The decoder loss.
    """
    with tf.variable_scope("decoder", reuse=reuse):
        logits, _, _ = decoder.decode(
            tf.nn.embedding_lookup(embedding, x["ids_in"]),
            x["length"] + 1,
            initial_state=encoder_outputs[1],
            output_layer=generator,
            memory=encoder_outputs[0],
            memory_sequence_length=encoder_outputs[2])
    cumulated_loss, _, normalizer = cross_entropy_sequence_loss(
        logits, x["ids_out"], x["length"] + 1)
    return cumulated_loss / normalizer
def compute_loss(self, outputs, labels, training=True, params=None):
    if params is None:
        params = {}
    if self.crf_decoding:
        with tf.variable_scope(tf.get_variable_scope(), reuse=not training):
            log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
                outputs, tf.cast(labels["tags_id"], tf.int32), labels["length"])
        loss = tf.reduce_sum(-log_likelihood)
        loss_normalizer = tf.cast(tf.shape(log_likelihood)[0], loss.dtype)
        return loss, loss_normalizer
    else:
        return cross_entropy_sequence_loss(
            outputs,
            labels["tags_id"],
            labels["length"],
            label_smoothing=params.get("label_smoothing", 0.0),
            average_in_time=params.get("average_loss_in_time", False),
            mode=tf.estimator.ModeKeys.TRAIN if training else tf.estimator.ModeKeys.EVAL)
def denoise(x, embedding, encoder_outputs, generator, reuse=None):
    """Denoises from the noisy encoding.

    Args:
        x: The input data from the dataset.
        embedding: The embedding variable.
        encoder_outputs: A tuple with the encoder outputs.
        generator: A tf.layers.Dense instance for projecting the logits.
        reuse: If True, reuse the decoder variables.

    Returns:
        The decoder loss.
    """
    with tf.variable_scope("decoder", reuse=reuse):
        logits, _, _ = decoder.decode(
            tf.nn.embedding_lookup(embedding, x["ids_in"]),
            x["length"] + 1,
            initial_state=encoder_outputs[1],
            output_layer=generator,
            memory=encoder_outputs[0],
            memory_sequence_length=encoder_outputs[2])
    cumulated_loss, _, normalizer = cross_entropy_sequence_loss(
        logits, x["ids_out"], x["length"] + 1)
    return cumulated_loss / normalizer
def train(model_dir,
          example_inputter,
          source_file,
          target_file,
          maximum_length=100,
          shuffle_buffer_size=1000000,
          gradients_accum=8,
          train_steps=100000,
          save_every=1000,
          report_every=50):
    """Runs the training loop.

    Args:
        model_dir: Directory where checkpoints are saved.
        example_inputter: The inputter instance that produces the training examples.
        source_file: The source training file.
        target_file: The target training file.
        maximum_length: Filter sequences longer than this.
        shuffle_buffer_size: How many examples to load for shuffling.
        gradients_accum: Accumulate gradients of this many iterations.
        train_steps: Train for this many iterations.
        save_every: Save a checkpoint every this many iterations.
        report_every: Report training progress every this many iterations.
    """
    mode = tf.estimator.ModeKeys.TRAIN

    # Create the dataset.
    dataset = example_inputter.make_training_dataset(
        source_file,
        target_file,
        batch_size=3072,
        batch_type="tokens",
        shuffle_buffer_size=shuffle_buffer_size,
        bucket_width=1,  # Bucketize sequences by the same length for efficiency.
        maximum_features_length=maximum_length,
        maximum_labels_length=maximum_length)
    iterator = dataset.make_initializable_iterator()
    source, target = iterator.get_next()

    # Encode the source.
    with tf.variable_scope("encoder"):
        source_embedding = source_inputter.make_inputs(source, training=True)
        memory, _, _ = encoder.encode(source_embedding, source["length"], mode=mode)

    # Decode the target.
    with tf.variable_scope("decoder"):
        target_embedding = target_inputter.make_inputs(target, training=True)
        logits, _, _ = decoder.decode(
            target_embedding,
            target["length"],
            vocab_size=target_inputter.vocabulary_size,
            mode=mode,
            memory=memory,
            memory_sequence_length=source["length"])

    # Compute the loss.
    loss, normalizer, _ = losses.cross_entropy_sequence_loss(
        logits,
        target["ids_out"],
        target["length"],
        label_smoothing=0.1,
        average_in_time=True,
        mode=mode)
    loss /= normalizer

    # Define the learning rate schedule.
    step = tf.train.create_global_step()
    learning_rate = decay.noam_decay_v2(2.0, step, model_dim=512, warmup_steps=4000)

    # Define the optimization op.
    optimizer = tf.train.AdamOptimizer(learning_rate)
    gradients = optimizer.compute_gradients(loss)
    train_op, optim_variables = optim.delayed_update(
        optimizer,
        gradients,
        step,
        accum_count=gradients_accum)

    # Run the training loop.
    saver = tf.train.Saver()
    checkpoint_path = None
    if os.path.exists(model_dir):
        checkpoint_path = tf.train.latest_checkpoint(model_dir)
    with tf.Session() as sess:
        if checkpoint_path is not None:
            print("Restoring parameters from %s" % checkpoint_path)
            saver.restore(sess, checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())
        sess.run(tf.variables_initializer(optim_variables))
        sess.run(tf.tables_initializer())
        sess.run(iterator.initializer)
        last_step = -1
        while True:
            step_, lr_, loss_, _ = sess.run([step, learning_rate, loss, train_op])
            if step_ != last_step:
                if step_ % report_every == 0:
                    print("Step = %d ; Learning rate = %f ; Loss = %f" % (step_, lr_, loss_))
                if step_ % save_every == 0:
                    print("Saving checkpoint for step %d" % step_)
                    saver.save(sess, "%s/model" % model_dir, global_step=step_)
                if step_ == train_steps:
                    break
            last_step = step_
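# The schedule created by decay.noam_decay_v2 above follows the standard
# Transformer ("Noam") learning rate; a hand-rolled sketch of that formula
# (an assumption about the schedule, not the library's implementation):
def noam_lr(step, scale=2.0, model_dim=512, warmup_steps=4000):
    step = max(step, 1)
    return scale * model_dim ** -0.5 * min(
        step ** -0.5, step * warmup_steps ** -1.5)

# e.g. noam_lr(4000) is the peak of the linear warmup, after which the rate
# decays proportionally to step ** -0.5.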