def __init__(self, model, optimizer, checkpoint=None, is_master=True):
    """Initializes the trainer.

    Args:
      model: A :class:`opennmt.models.Model` instance to train.
      optimizer: A ``tf.keras.optimizers.Optimizer`` instance.
      checkpoint: A :class:`opennmt.utils.checkpoint.Checkpoint` instance. If
        not set, no checkpoints will be saved.
      is_master: Whether this trainer instance is the master trainer.

    Raises:
      ValueError: if :obj:`optimizer` is ``None``.
    """
    # Validate the arguments before doing anything else, so that an invalid
    # call does not create a summary file writer as a side effect.
    if optimizer is None:
        raise ValueError("No optimizer is defined")

    self._checkpoint = checkpoint
    self._is_master = is_master
    self._model = model

    # Summaries are written to the checkpoint model directory when a
    # checkpoint is given, and discarded otherwise.
    if checkpoint is not None:
        self._summary_writer = tf.summary.create_file_writer(
            checkpoint.model_dir)
    else:
        self._summary_writer = tf.summary.create_noop_writer()

    self._training_stats = None
    self._gradient_accumulator = optimizer_util.GradientAccumulator()

    # Wrap the optimizer with dynamic loss scaling when the graph optimizer
    # reports "auto_mixed_precision" as enabled, unless the caller already
    # passed a loss scale optimizer.
    graph_optimizer_options = tf.config.optimizer.get_experimental_options()
    mixed_precision_enabled = graph_optimizer_options.get(
        "auto_mixed_precision")
    if (mixed_precision_enabled
            and not isinstance(
                optimizer,
                tf.keras.mixed_precision.experimental.LossScaleOptimizer)):
        optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
            optimizer, "dynamic")
    self._optimizer = optimizer
def __init__(self, checkpoint, devices=None, mixed_precision=False):
    """Initializes the trainer.

    Args:
      checkpoint: A :class:`opennmt.utils.checkpoint.Checkpoint` instance.
      devices: List of device strings to use for training.
      mixed_precision: Whether mixed precision is enabled or not.

    Raises:
      ValueError: if the checkpoint does not define an optimizer.
    """
    # Validate the checkpoint before creating the distribution strategy and
    # the summary file writer, so that an invalid call has no side effects.
    optimizer = checkpoint.optimizer
    if optimizer is None:
        raise ValueError("No optimizer is defined")

    if not devices:
        devices = misc.get_devices(count=1)  # Train with 1 device by default.

    self._checkpoint = checkpoint
    self._mixed_precision = mixed_precision
    self._model = checkpoint.model
    self._strategy = tf.distribute.MirroredStrategy(devices=devices)
    self._summary_writer = tf.summary.create_file_writer(checkpoint.model_dir)

    # With mixed precision, the optimizer is wrapped with dynamic loss scaling.
    if mixed_precision:
        optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
            optimizer, "dynamic")
    self._optimizer = optimizer

    with self._strategy.scope():
        # Create some variables under the strategy scope.
        _ = self._optimizer.iterations
        self._model.create_variables()
        self._gradient_accumulator = optimizer_util.GradientAccumulator()
def __init__(self, model, optimizer, checkpoint=None):
    """Initializes the trainer.

    Args:
      model: A :class:`opennmt.models.Model` instance to train.
      optimizer: A ``tf.keras.optimizers.Optimizer`` instance.
      checkpoint: A :class:`opennmt.utils.checkpoint.Checkpoint` instance. If
        not set, no checkpoints will be saved.

    Raises:
      ValueError: if :obj:`optimizer` is ``None``.
    """
    # Validate the arguments before doing anything else, so that an invalid
    # call does not create a summary file writer as a side effect.
    if optimizer is None:
        raise ValueError("No optimizer is defined")

    self._checkpoint = checkpoint
    self._model = model

    # Summaries are written to the checkpoint model directory when a
    # checkpoint is given, and discarded otherwise.
    if checkpoint is not None:
        self._summary_writer = tf.summary.create_file_writer(
            checkpoint.model_dir)
    else:
        self._summary_writer = tf.summary.create_noop_writer()

    self._training_stats = None
    self._gradient_accumulator = optimizer_util.GradientAccumulator()

    # The mixed precision state is read once and kept on the instance; when it
    # is enabled the optimizer goes through the mixed precision wrapper.
    self._mixed_precision = misc.mixed_precision_enabled()
    if self._mixed_precision:
        optimizer = _add_mixed_precision_wrapper(optimizer)
    self._optimizer = optimizer
def testGradientAccumulator(self):
    """Checks accumulation, input size validation and reset."""
    accumulator = utils.GradientAccumulator()

    # Accumulate three single-gradient updates.
    for gradient_values in ([1.0, 2.0], [-2.0, 1.0], [-1.0, 2.0]):
        accumulator([tf.constant(gradient_values)])

    # Passing a different number of gradients is expected to be rejected.
    with self.assertRaises(ValueError):
        accumulator([tf.constant([1.0, 1.0]), tf.constant([2.0, 2.0])])

    # The rejected call is not counted and the sums are element-wise.
    self.assertEqual(accumulator.step, 3)
    accumulated = accumulator.gradients
    self.assertEqual(len(accumulated), 1)
    self.assertAllEqual(accumulated[0], [-2.0, 5.0])

    # Resetting clears both the step counter and the accumulated values.
    accumulator.reset()
    self.assertEqual(accumulator.step, 0)
    self.assertAllEqual(accumulator.gradients[0], [0.0, 0.0])
def testGradientAccumulatorDistributionStrategy(self):
    """Checks per-replica accumulation under a 2-device MirroredStrategy."""
    cpu_devices = tf.config.list_logical_devices(device_type="CPU")
    strategy = tf.distribute.MirroredStrategy(devices=cpu_devices[:2])

    with strategy.scope():
        accumulator = utils.GradientAccumulator()
        variable = tf.Variable([4.0, 3.0])
        sgd = tf.keras.optimizers.SGD(1.0)
        # Non-trainable variable used to feed a different gradient per replica.
        gradient_placeholder = tf.Variable([0.0, 0.0], trainable=False)

    def accumulate_on_replica(gradient):
        accumulator([gradient])

    def apply_on_replica():
        grads_and_vars = list(zip(accumulator.gradients, [variable]))
        sgd.apply_gradients(grads_and_vars)

    @tf.function
    def accumulate(grad1, grad2):
        with strategy.scope():
            # Write one gradient in each local copy of the placeholder before
            # running the accumulation on the replicas.
            placeholder_copies = strategy.experimental_local_results(
                gradient_placeholder)
            placeholder_copies[0].assign(grad1)
            placeholder_copies[1].assign(grad2)
            strategy.run(accumulate_on_replica, args=(gradient_placeholder,))

    @tf.function
    def apply_grad():
        with strategy.scope():
            strategy.run(apply_on_replica)

    def _check_local_values(grad1, grad2):
        accumulated_copies = strategy.experimental_local_results(
            accumulator._gradients[0])
        self.assertAllEqual(accumulated_copies[0].value(), grad1)
        self.assertAllEqual(accumulated_copies[1].value(), grad2)

    accumulate([1.0, 2.0], [-1.0, 1.0])
    accumulate([3.0, -1.0], [-1.0, -1.0])
    accumulate([-2.0, 2.0], [3.0, -2.0])
    self.assertEqual(accumulator.step, 3)
    _check_local_values([2.0, 3.0], [1.0, -2.0])

    apply_grad()
    # [4.0 - (2.0 + 1.0), 3.0 - (3.0 - 2.0)]
    self.assertAllEqual(variable.value(), [1.0, 2.0])

    accumulator.reset()
    self.assertEqual(accumulator.step, 0)
    _check_local_values([0.0, 0.0], [0.0, 0.0])
def __init__(self, checkpoint, devices=None):
    """Initializes the trainer.

    Args:
      checkpoint: A :class:`opennmt.utils.checkpoint.Checkpoint` instance.
      devices: List of device strings to use for training.
    """
    super(DistributionStrategyTrainer, self).__init__(checkpoint)

    # Train with 1 device by default.
    training_devices = devices or misc.get_devices(count=1)
    self._strategy = tf.distribute.MirroredStrategy(devices=training_devices)
    self._words_counters = {}

    with self._strategy.scope():
        # Create some variables under the strategy scope.
        _ = self._optimizer.iterations
        self._gradient_accumulator = optimizer_util.GradientAccumulator()
def testGradientAccumulatorDistributionStrategy(self):
    """Checks per-replica accumulation on two virtual CPU devices."""
    # Split the first physical CPU into two virtual devices to get 2 replicas.
    physical_devices = tf.config.experimental.list_physical_devices("CPU")
    virtual_configurations = [
        tf.config.experimental.VirtualDeviceConfiguration(),
        tf.config.experimental.VirtualDeviceConfiguration(),
    ]
    tf.config.experimental.set_virtual_device_configuration(
        physical_devices[0], virtual_configurations)

    logical_devices = tf.config.experimental.list_logical_devices(
        device_type="CPU")
    device_names = [device.name for device in logical_devices]
    strategy = tf.distribute.MirroredStrategy(devices=device_names)

    with strategy.scope():
        accumulator = utils.GradientAccumulator()
        variable = tf.Variable([4.0, 3.0])
        sgd = tf.keras.optimizers.SGD(1.0)
        # Non-trainable variable used to feed a different gradient per replica.
        gradient_placeholder = tf.Variable([0.0, 0.0], trainable=False)

    def accumulate_on_replica(gradient):
        accumulator([gradient])

    def apply_on_replica():
        grads_and_vars = list(zip(accumulator.gradients, [variable]))
        sgd.apply_gradients(grads_and_vars)

    @tf.function
    def accumulate(grad1, grad2):
        with strategy.scope():
            gradient_placeholder.values[0].assign(grad1)
            gradient_placeholder.values[1].assign(grad2)
            strategy.experimental_run_v2(
                accumulate_on_replica, args=(gradient_placeholder,))

    @tf.function
    def apply_grad():
        with strategy.scope():
            strategy.experimental_run_v2(apply_on_replica)

    def _assert_replica_gradients(first_expected, second_expected):
        # Compare the accumulated gradient held by each of the two replicas.
        replica_values = accumulator._gradients[0].values
        self.assertAllEqual(replica_values[0].value(), first_expected)
        self.assertAllEqual(replica_values[1].value(), second_expected)

    accumulate([1.0, 2.0], [-1.0, 1.0])
    accumulate([3.0, -1.0], [-1.0, -1.0])
    accumulate([-2.0, 2.0], [3.0, -2.0])
    self.assertEqual(accumulator.step, 3)
    _assert_replica_gradients([2.0, 3.0], [1.0, -2.0])

    apply_grad()
    # [4.0 - (2.0 + 1.0), 3.0 - (3.0 - 2.0)]
    self.assertAllEqual(variable.value(), [1.0, 2.0])

    accumulator.reset()
    self.assertEqual(accumulator.step, 0)
    _assert_replica_gradients([0.0, 0.0], [0.0, 0.0])
def __call__(self,
             dataset,
             max_step=None,
             accum_steps=1,
             report_steps=100,
             save_steps=5000,
             evaluator=None,
             eval_steps=5000,
             export_on_best=None):
    """Runs the training.

    Args:
      dataset: A training dataset.
      max_step: The final training step.
      accum_steps: The number of gradient accumulation steps.
      report_steps: Report status every this many steps.
      save_steps: Save a checkpoint every this many steps.
      evaluator: A :class:`opennmt.evaluation.Evaluator` instance to call for
        evaluation.
      eval_steps: Evaluate every this many steps.
      export_on_best: Export a SavedModel when this evaluation metric has the
        best value so far.

    Raises:
      RuntimeError: if the loss becomes NaN during the training.
    """
    if (max_step is not None
            and self._optimizer.iterations.numpy() >= max_step):
        tf.get_logger().warning(
            "Model already reached max_step = %d. Exiting.", max_step)
        return
    if evaluator is not None and evaluator.should_stop():
        tf.get_logger().warning(
            "Early stopping conditions are already met. Exiting.")
        return

    with self._strategy.scope():
        self._model.create_variables(optimizer=self._optimizer)
        variables = self._model.trainable_variables
        base_dataset = dataset
        # We prefer not to use experimental_distribute_dataset here because it
        # sometimes fails to split the batches (noticed with tokens batch type).
        # We also assume for now that we are training with a single worker
        # otherwise we would need to correctly shard the input dataset.
        dataset = self._strategy.experimental_distribute_datasets_from_function(
            lambda _: base_dataset)
        gradient_accumulator = optimizer_util.GradientAccumulator()

    if self._mixed_precision:
        optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
            self._optimizer, "dynamic")
    else:
        optimizer = self._optimizer

    def _accumulate_gradients(source, target):
        # Runs the model on one replica batch and accumulates its gradients.
        outputs, _ = self._model(
            source,
            labels=target,
            training=True,
            step=self._optimizer.iterations)
        loss = self._model.compute_loss(outputs, target, training=True)
        if isinstance(loss, tuple):
            # A tuple loss carries separate normalizers for the training loss
            # and for the reported loss.
            training_loss = loss[0] / loss[1]
            reported_loss = loss[0] / loss[2]
        else:
            training_loss, reported_loss = loss, loss
        training_loss = self._model.regularize_loss(
            training_loss, variables=variables)
        gradients = optimizer.get_gradients(training_loss, variables)
        gradient_accumulator(gradients)
        tf.summary.scalar(
            "gradients/global_norm", tf.linalg.global_norm(gradients))
        num_words = {}
        if "length" in source:
            num_words["source"] = tf.reduce_sum(source["length"])
        if "length" in target:
            num_words["target"] = tf.reduce_sum(target["length"])
        return reported_loss, num_words

    def _apply_gradients():
        # Applies the accumulated gradients, then clears the accumulator.
        grads_and_vars = []
        for gradient, variable in zip(gradient_accumulator.gradients, variables):
            # optimizer.apply_gradients will sum the gradients across replicas.
            scaled_gradient = gradient / (
                self._strategy.num_replicas_in_sync * accum_steps)
            grads_and_vars.append((scaled_gradient, variable))
        optimizer.apply_gradients(grads_and_vars)
        gradient_accumulator.reset()

    @dataset_util.function_on_next(dataset)
    def _forward(next_fn):
        tf.summary.experimental.set_step(self._optimizer.iterations)
        # Only record summaries on reporting steps, and only for the first
        # batch of an accumulation cycle.
        should_record_summaries = tf.logical_and(
            tf.equal(self._optimizer.iterations % report_steps, 0),
            tf.equal(gradient_accumulator.step, 0))
        with tf.summary.record_if(should_record_summaries):
            with self._strategy.scope():
                per_replica_source, per_replica_target = next_fn()

                def _run():
                    per_replica_loss, per_replica_words = (
                        self._strategy.experimental_run_v2(
                            _accumulate_gradients,
                            args=(per_replica_source, per_replica_target)))
                    # TODO: these reductions could be delayed until _step is called.
                    loss = self._strategy.reduce(
                        tf.distribute.ReduceOp.MEAN, per_replica_loss, None)
                    num_words = {
                        k: self._strategy.reduce(
                            tf.distribute.ReduceOp.SUM, v, None)
                        for k, v in six.iteritems(per_replica_words)
                    }
                    return loss, num_words, False

                def _skip():
                    # Returns zero values with the same structure as _run and
                    # sets the "skipped" flag.
                    loss = tf.constant(0, dtype=tf.float32)
                    num_words = {}
                    if "length" in per_replica_source:
                        num_words["source"] = tf.constant(0, dtype=tf.int32)
                    if "length" in per_replica_target:
                        num_words["target"] = tf.constant(0, dtype=tf.int32)
                    return loss, num_words, True

                # We verify here that each replica receives a non empty batch. If not,
                # we skip this iteration. This typically happens at the last iteration
                # when training on a finite dataset.
                # TODO: is there a simpler way to handle this case?
                per_replica_non_empty_batch = self._strategy.experimental_run_v2(
                    lambda tensor: tf.math.count_nonzero(tf.shape(tensor)[0]),
                    args=(tf.nest.flatten(per_replica_source)[0],))
                non_empty_batch_count = self._strategy.reduce(
                    tf.distribute.ReduceOp.SUM, per_replica_non_empty_batch, None)
                return tf.cond(
                    tf.math.equal(
                        non_empty_batch_count,
                        self._strategy.num_replicas_in_sync),
                    true_fn=_run,
                    false_fn=_skip)

    @tf.function
    def _step():
        with self._strategy.scope():
            self._strategy.experimental_run_v2(_apply_gradients)

    accum_num_words = collections.defaultdict(int)
    last_report_time = time.time()
    last_step = 0
    # Initialize the step here so that the final evaluation and checkpoint
    # below still work when the loop produces no batch, or when the very first
    # batch is skipped.
    step = self._optimizer.iterations.numpy()

    with self._summary_writer.as_default():
        if self._optimizer.iterations.numpy() == 0:
            self._checkpoint.save(0)
        self._model.visualize(self._checkpoint.model_dir)

        for i, (loss, num_words, skipped) in enumerate(_forward()):  # pylint: disable=no-value-for-parameter
            if skipped:
                # We assume only the last partial batch can possibly be skipped.
                tf.get_logger().warning(
                    "Batch %d is partial, i.e. some training replicas "
                    "received an empty batch as input. Skipping.", i + 1)
                break
            if tf.math.is_nan(loss):
                raise RuntimeError("Model diverged with loss = NaN.")
            # Gradients are applied after the very first batch and then every
            # accum_steps batches.
            if i == 0 or (i + 1) % accum_steps == 0:
                _step()

            for key, value in six.iteritems(num_words):
                accum_num_words[key] += value.numpy()
            step = self._optimizer.iterations.numpy()
            if step == last_step:
                continue  # Do not process same step twice.
            last_step = step
            if step % report_steps == 0:
                last_report_time = _report_training_status(
                    step,
                    loss,
                    self._optimizer.learning_rate,
                    accum_num_words,
                    last_report_time)
            if save_steps is not None and step % save_steps == 0:
                self._checkpoint.save(step)
            if (evaluator is not None
                    and eval_steps is not None
                    and step % eval_steps == 0):
                self._evaluate(evaluator, step, export_on_best=export_on_best)
                if evaluator.should_stop():
                    tf.get_logger().warning(
                        "Early stopping conditions are met. Exiting.")
                    break
            if step == max_step:
                break

        # Always finish with an evaluation (unless this step was already
        # evaluated) and a checkpoint of the last step.
        if evaluator is not None and step != evaluator.last_evaluated_step:
            self._evaluate(evaluator, step, export_on_best=export_on_best)
        self._checkpoint.save(step)