def test_recovery_nan_error(self, distribution_strategy, flag_mode):
    """Checks that a NaN training loss makes run_experiment raise RuntimeError.

    Builds a mock experiment via flags, patches the task's loss to be NaN,
    and asserts the training loop surfaces this as a RuntimeError.
    """
    model_dir = self.get_temp_dir()
    flags_dict = dict(
        experiment='mock',
        mode=flag_mode,
        model_dir=model_dir,
        params_override=json.dumps(self._test_config))
    with flagsaver.flagsaver(**flags_dict):
        params = train_utils.parse_configuration(flags.FLAGS)
        train_utils.serialize_config(params, model_dir)
        with distribution_strategy.scope():
            task = mock_task.MockTask(params.task, logging_dir=model_dir)

        # Force the loss to NaN so the training loop's NaN check fires.
        def build_losses(labels, model_outputs, aux_losses=None):
            del labels, model_outputs
            return tf.constant([np.nan], tf.float32) + aux_losses

        task.build_losses = build_losses
        with self.assertRaises(RuntimeError):
            train_lib.run_experiment(
                distribution_strategy=distribution_strategy,
                task=task,
                mode=flag_mode,
                params=params,
                model_dir=model_dir)
def test_configure_optimizer(self, mixed_precision_dtype, loss_scale):
    """Checks optimizer selection under mixed-precision settings.

    A LossScaleOptimizer wrapper is expected only for float16 training with
    an explicit loss scale; every other combination should yield plain SGD.
    """
    config = cfg.ExperimentConfig(
        runtime=cfg.RuntimeConfig(
            mixed_precision_dtype=mixed_precision_dtype,
            loss_scale=loss_scale),
        trainer=cfg.TrainerConfig(
            optimizer_config=cfg.OptimizationConfig({
                'optimizer': {
                    'type': 'sgd'
                },
                'learning_rate': {
                    'type': 'constant'
                }
            })))
    task = mock_task.MockTask()
    trainer = trainer_lib.Trainer(config, task)
    # The original if/elif chain repeated the float16 test redundantly and
    # asserted SGD in two separate branches; one combined condition suffices.
    if mixed_precision_dtype == 'float16' and loss_scale is not None:
        self.assertIsInstance(
            trainer.optimizer,
            tf.keras.mixed_precision.experimental.LossScaleOptimizer)
    else:
        self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD)

    # A short train run must still produce a loss metric with either optimizer.
    metrics = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
    self.assertIn('training_loss', metrics)
def test_task_with_step_override(self, distribution):
    """Exercises compile_model with task-provided train/validation steps."""
    with distribution.scope():
        task = mock_task.MockTask()
        net = task.build_model()
        net = task.compile_model(
            net,
            optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),
            metrics=task.build_metrics(),
            train_step=task.train_step,
            validation_step=task.validation_step)
        ds = task.build_inputs(params=None)
    history = net.fit(ds, epochs=1, steps_per_epoch=2)
    self.assertIn('loss', history.history)
    self.assertIn('acc', history.history)

    # Compile again without `metrics=`; the metrics are instead bound into the
    # step functions via functools.partial.
    with distribution.scope():
        train_metrics = task.build_metrics(training=True)
        val_metrics = task.build_metrics(training=False)
        net = task.build_model()
        net = task.compile_model(
            net,
            optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),
            train_step=functools.partial(task.train_step, metrics=train_metrics),
            validation_step=functools.partial(
                task.validation_step, metrics=val_metrics))
    history = net.fit(ds, epochs=1, steps_per_epoch=2)
    self.assertIn('loss', history.history)
    self.assertIn('acc', history.history)
def test_export_best_ckpt(self, distribution):
    """Verifies that train+evaluate writes the best-checkpoint info file."""
    config = cfg.ExperimentConfig(
        trainer=cfg.TrainerConfig(
            best_checkpoint_export_subdir='best_ckpt',
            best_checkpoint_eval_metric='acc',
            optimizer_config=cfg.OptimizationConfig({
                'optimizer': {
                    'type': 'sgd'
                },
                'learning_rate': {
                    'type': 'constant'
                }
            })))
    model_dir = self.get_temp_dir()
    task = mock_task.MockTask(config.task, logging_dir=model_dir)
    exporter = train_lib.maybe_create_best_ckpt_exporter(config, model_dir)
    trainer = trainer_lib.Trainer(
        config, task, model=task.build_model(), checkpoint_exporter=exporter)
    one_step = tf.convert_to_tensor(1, dtype=tf.int32)
    trainer.train(one_step)
    trainer.evaluate(one_step)
    # The exporter records the best metric in info.json under the subdir.
    info_path = os.path.join(model_dir, 'best_ckpt', 'info.json')
    self.assertTrue(tf.io.gfile.exists(info_path))
def create_test_trainer(self, config, model_dir=None):
    """Builds a Trainer over a mock task, with optimizer and ckpt exporter."""
    task = mock_task.MockTask(config.task, logging_dir=model_dir)
    exporter = train_lib.maybe_create_best_ckpt_exporter(config, model_dir)
    optimizer = trainer_lib.create_optimizer(config.trainer, config.runtime)
    return trainer_lib.Trainer(
        config,
        task,
        model=task.build_model(),
        optimizer=optimizer,
        checkpoint_exporter=exporter)
def test_task_invalid_compile(self):
    """compile_model must reject a loss combined with a custom train_step."""
    task = mock_task.MockTask()
    net = task.build_model()
    with self.assertRaises(ValueError):
        task.compile_model(
            net,
            optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),
            loss=tf.keras.losses.CategoricalCrossentropy(),
            metrics=task.build_metrics(),
            train_step=task.train_step)
def test_recovery(self):
    """Covers loss-based recovery: weight restore and exhausted-retry error.

    First trains with a loss above `loss_upper_bound` and checks the model
    weights are restored from the checkpoint. Then, with zero recovery
    trials and a NaN loss, checks training raises RuntimeError.
    """
    config = cfg.ExperimentConfig(
        trainer=cfg.TrainerConfig(
            loss_upper_bound=0.5,
            recovery_max_trials=2,
            optimizer_config=cfg.OptimizationConfig({
                'optimizer': {
                    'type': 'sgd'
                },
                'learning_rate': {
                    'type': 'constant'
                }
            })))
    model_dir = self.get_temp_dir()
    trainer = self.create_test_trainer(config, model_dir=model_dir)
    checkpoint_manager = tf.train.CheckpointManager(
        trainer.checkpoint, self.get_temp_dir(), max_to_keep=2)
    checkpoint_manager.save()
    trainer.add_recovery(config.trainer, checkpoint_manager=checkpoint_manager)
    before_weights = trainer.model.get_weights()
    _ = trainer.train(tf.convert_to_tensor(1, dtype=tf.int32))
    # The training loss is 1.0 and upper_bound is 0.5, so the recover happens:
    # weights must be restored to their checkpointed (pre-train) values.
    after_weights = trainer.model.get_weights()
    for left, right in zip(before_weights, after_weights):
        self.assertAllEqual(left, right)

    # Make the loss NaN with recovery_max_trials = 0 to provoke RuntimeError.
    config = cfg.ExperimentConfig(
        trainer=cfg.TrainerConfig(
            recovery_max_trials=0,
            optimizer_config=cfg.OptimizationConfig({
                'optimizer': {
                    'type': 'sgd'
                },
                'learning_rate': {
                    'type': 'constant'
                }
            })))
    task = mock_task.MockTask(config.task, logging_dir=model_dir)

    def build_losses(labels, model_outputs, aux_losses=None):
        del labels, model_outputs
        return tf.constant([np.nan], tf.float32) + aux_losses

    task.build_losses = build_losses
    trainer = trainer_lib.Trainer(
        config,
        task,
        model=task.build_model(),
        optimizer=task.create_optimizer(config.trainer.optimizer_config,
                                        config.runtime))
    trainer.add_recovery(config.trainer, checkpoint_manager=checkpoint_manager)
    with self.assertRaises(RuntimeError):
        _ = trainer.train(tf.convert_to_tensor(2, dtype=tf.int32))
def test_model_with_compiled_loss(self):
    """Trainer should train using a loss compiled directly onto the model."""
    task = mock_task.MockTask()
    net = task.build_model()
    net.compile(loss=tf.keras.losses.CategoricalCrossentropy())
    opt = task.create_optimizer(self._config.trainer.optimizer_config)
    trainer = trainer_lib.Trainer(self._config, task, model=net, optimizer=opt)
    logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
    self.assertIn('training_loss', logs)
def test_task_with_fit(self):
    """End-to-end Keras fit/evaluate over the mock task's model and inputs."""
    task = mock_task.MockTask()
    net = task.build_model()
    net = task.compile_model(
        net,
        optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=task.build_metrics())
    ds = task.build_inputs(params=None)
    history = net.fit(ds, epochs=1, steps_per_epoch=2)
    self.assertIn('loss', history.history)
    self.assertIn('acc', history.history)
    # evaluate should return exactly two scalars here (loss plus one metric).
    self.assertLen(net.evaluate(ds, steps=1), 2)
def test_trainer_passing_datasets(self, distribution):
    """Trainer should accept externally constructed distributed datasets."""
    with distribution.scope():
        task = mock_task.MockTask(self._config)
        train_ds = orbit.utils.make_distributed_dataset(
            distribution, task.build_inputs, self._config.task.train_data)
        eval_ds = orbit.utils.make_distributed_dataset(
            distribution, task.build_inputs, self._config.task.validation_data)
        # Null out the data configs so the Trainer cannot build its own inputs
        # and must rely on the datasets passed in below.
        self._config.task.train_data = None
        self._config.task.validation_data = None
        trainer = trainer_lib.Trainer(
            self._config,
            task,
            model=task.build_model(),
            optimizer=task.create_optimizer(self._config.trainer.optimizer_config,
                                            self._config.runtime),
            train_dataset=train_ds,
            validation_dataset=eval_ds)
    logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
    self.assertIn('training_loss', logs)
    self.assertIn('learning_rate', logs)
    logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32))
    self.assertIn('validation_loss', logs)
def create_test_trainer(self):
    """Returns a Trainer built from the shared test config and a mock task."""
    return trainer_lib.Trainer(self._config, mock_task.MockTask())
def create_test_trainer(self, config):
    """Returns a Trainer with a freshly built model for the given config."""
    mock = mock_task.MockTask()
    return trainer_lib.Trainer(config, mock, model=mock.build_model())