Example #1
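Patches the task's build_losses to return NaN and asserts that train_lib.run_experiment fails with a RuntimeError.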
    def test_recovery_nan_error(self, distribution_strategy, flag_mode):
        model_dir = self.get_temp_dir()
        flags_dict = dict(experiment='mock',
                          mode=flag_mode,
                          model_dir=model_dir,
                          params_override=json.dumps(self._test_config))
        with flagsaver.flagsaver(**flags_dict):
            params = train_utils.parse_configuration(flags.FLAGS)
            train_utils.serialize_config(params, model_dir)
            with distribution_strategy.scope():
                # A real run would build the task via task_factory:
                # task = task_factory.get_task(params.task, logging_dir=model_dir)
                task = mock_task.MockTask(params.task, logging_dir=model_dir)

                # Set the loss to NaN to trigger a RuntimeError.
                def build_losses(labels, model_outputs, aux_losses=None):
                    del labels, model_outputs
                    return tf.constant([np.nan], tf.float32) + aux_losses

                task.build_losses = build_losses

            with self.assertRaises(RuntimeError):
                train_lib.run_experiment(
                    distribution_strategy=distribution_strategy,
                    task=task,
                    mode=flag_mode,
                    params=params,
                    model_dir=model_dir)
Example #2
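Checks that trainer_lib.Trainer configures a plain SGD optimizer unless float16 mixed precision with a loss scale is requested, in which case the optimizer is wrapped in a LossScaleOptimizer.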
    def test_configure_optimizer(self, mixed_precision_dtype, loss_scale):
        config = cfg.ExperimentConfig(
            runtime=cfg.RuntimeConfig(
                mixed_precision_dtype=mixed_precision_dtype,
                loss_scale=loss_scale),
            trainer=cfg.TrainerConfig(optimizer_config=cfg.OptimizationConfig({
                'optimizer': {
                    'type': 'sgd'
                },
                'learning_rate': {
                    'type': 'constant'
                }
            })))
        task = mock_task.MockTask()
        trainer = trainer_lib.Trainer(config, task)
        if mixed_precision_dtype != 'float16':
            self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD)
        elif mixed_precision_dtype == 'float16' and loss_scale is None:
            self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD)
        else:
            self.assertIsInstance(
                trainer.optimizer,
                tf.keras.mixed_precision.experimental.LossScaleOptimizer)

        metrics = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
        self.assertIn('training_loss', metrics)
Example #3
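Runs model.fit with custom train_step/validation_step overrides, first passing metrics through compile_model and then binding them with functools.partial.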
    def test_task_with_step_override(self, distribution):
        with distribution.scope():
            task = mock_task.MockTask()
            model = task.build_model()
            model = task.compile_model(
                model,
                optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),
                metrics=task.build_metrics(),
                train_step=task.train_step,
                validation_step=task.validation_step)

        dataset = task.build_inputs(params=None)
        logs = model.fit(dataset, epochs=1, steps_per_epoch=2)
        self.assertIn('loss', logs.history)
        self.assertIn('acc', logs.history)

        # Compile again without passing metrics; bind them into the step functions.
        with distribution.scope():
            train_metrics = task.build_metrics(training=True)
            val_metrics = task.build_metrics(training=False)
            model = task.build_model()
            model = task.compile_model(
                model,
                optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),
                train_step=functools.partial(task.train_step,
                                             metrics=train_metrics),
                validation_step=functools.partial(task.validation_step,
                                                  metrics=val_metrics))
        logs = model.fit(dataset, epochs=1, steps_per_epoch=2)
        self.assertIn('loss', logs.history)
        self.assertIn('acc', logs.history)
Example #4
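Trains and evaluates for one step with a best-checkpoint exporter attached, then verifies that info.json appears under the best_ckpt subdirectory.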
 def test_export_best_ckpt(self, distribution):
   config = cfg.ExperimentConfig(
       trainer=cfg.TrainerConfig(
           best_checkpoint_export_subdir='best_ckpt',
           best_checkpoint_eval_metric='acc',
           optimizer_config=cfg.OptimizationConfig({
               'optimizer': {
                   'type': 'sgd'
               },
               'learning_rate': {
                   'type': 'constant'
               }
           })))
   model_dir = self.get_temp_dir()
   task = mock_task.MockTask(config.task, logging_dir=model_dir)
   ckpt_exporter = train_lib.maybe_create_best_ckpt_exporter(config, model_dir)
   trainer = trainer_lib.Trainer(
       config,
       task,
       model=task.build_model(),
       checkpoint_exporter=ckpt_exporter)
   trainer.train(tf.convert_to_tensor(1, dtype=tf.int32))
   trainer.evaluate(tf.convert_to_tensor(1, dtype=tf.int32))
   self.assertTrue(
       tf.io.gfile.exists(os.path.join(model_dir, 'best_ckpt', 'info.json')))
Example #5
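Test helper that assembles a Trainer from a config, a MockTask, an optimizer, and an optional best-checkpoint exporter.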
 def create_test_trainer(self, config, model_dir=None):
   task = mock_task.MockTask(config.task, logging_dir=model_dir)
   ckpt_exporter = train_lib.maybe_create_best_ckpt_exporter(config, model_dir)
   trainer = trainer_lib.Trainer(
       config,
       task,
       model=task.build_model(),
       optimizer=trainer_lib.create_optimizer(config.trainer, config.runtime),
       checkpoint_exporter=ckpt_exporter)
   return trainer
Example #6
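Asserts that compile_model raises a ValueError when both a Keras loss and a custom train_step are supplied.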
 def test_task_invalid_compile(self):
     task = mock_task.MockTask()
     model = task.build_model()
     with self.assertRaises(ValueError):
         _ = task.compile_model(
             model,
             optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),
             loss=tf.keras.losses.CategoricalCrossentropy(),
             metrics=task.build_metrics(),
             train_step=task.train_step)
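Example #7
Covers the trainer's recovery logic: a loss above loss_upper_bound restores the weights from the last checkpoint, and a NaN loss with recovery_max_trials=0 raises a RuntimeError.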
    def test_recovery(self):
        config = cfg.ExperimentConfig(
            trainer=cfg.TrainerConfig(loss_upper_bound=0.5,
                                      recovery_max_trials=2,
                                      optimizer_config=cfg.OptimizationConfig({
                                          'optimizer': {
                                              'type': 'sgd'
                                          },
                                          'learning_rate': {
                                              'type': 'constant'
                                          }
                                      })))
        model_dir = self.get_temp_dir()
        trainer = self.create_test_trainer(config, model_dir=model_dir)
        checkpoint_manager = tf.train.CheckpointManager(trainer.checkpoint,
                                                        self.get_temp_dir(),
                                                        max_to_keep=2)
        checkpoint_manager.save()
        trainer.add_recovery(config.trainer,
                             checkpoint_manager=checkpoint_manager)
        before_weights = trainer.model.get_weights()
        _ = trainer.train(tf.convert_to_tensor(1, dtype=tf.int32))
        # The training loss is 1.0 and loss_upper_bound is 0.5, so recovery happens.
        after_weights = trainer.model.get_weights()
        for left, right in zip(before_weights, after_weights):
            self.assertAllEqual(left, right)

        # Let the loss be NaN and recovery_max_trials = 0 to trigger a RuntimeError.
        config = cfg.ExperimentConfig(
            trainer=cfg.TrainerConfig(recovery_max_trials=0,
                                      optimizer_config=cfg.OptimizationConfig({
                                          'optimizer': {
                                              'type': 'sgd'
                                          },
                                          'learning_rate': {
                                              'type': 'constant'
                                          }
                                      })))
        task = mock_task.MockTask(config.task, logging_dir=model_dir)

        def build_losses(labels, model_outputs, aux_losses=None):
            del labels, model_outputs
            return tf.constant([np.nan], tf.float32) + aux_losses

        task.build_losses = build_losses
        trainer = trainer_lib.Trainer(config,
                                      task,
                                      model=task.build_model(),
                                      optimizer=task.create_optimizer(
                                          config.trainer.optimizer_config,
                                          config.runtime))
        trainer.add_recovery(config.trainer,
                             checkpoint_manager=checkpoint_manager)
        with self.assertRaises(RuntimeError):
            _ = trainer.train(tf.convert_to_tensor(2, dtype=tf.int32))
Example #8
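Shows that a model compiled with its own Keras loss still works under trainer_lib.Trainer.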
 def test_model_with_compiled_loss(self):
   task = mock_task.MockTask()
   model = task.build_model()
   model.compile(loss=tf.keras.losses.CategoricalCrossentropy())
   trainer = trainer_lib.Trainer(
       self._config,
       task,
       model=model,
       optimizer=task.create_optimizer(self._config.trainer.optimizer_config))
   logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
   self.assertIn('training_loss', logs)
Example #9
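Uses the standard Keras fit/evaluate flow after compile_model with a built-in loss and the task's metrics.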
 def test_task_with_fit(self):
     task = mock_task.MockTask()
     model = task.build_model()
     model = task.compile_model(
         model,
         optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),
         loss=tf.keras.losses.CategoricalCrossentropy(),
         metrics=task.build_metrics())
     dataset = task.build_inputs(params=None)
     logs = model.fit(dataset, epochs=1, steps_per_epoch=2)
     self.assertIn('loss', logs.history)
     self.assertIn('acc', logs.history)
     self.assertLen(model.evaluate(dataset, steps=1), 2)
Example #10
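Builds distributed datasets explicitly via orbit.utils.make_distributed_dataset and hands them to the Trainer instead of letting it derive inputs from the config.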
 def test_trainer_passing_datasets(self, distribution):
   with distribution.scope():
     task = mock_task.MockTask(self._config)
     train_dataset = orbit.utils.make_distributed_dataset(
         distribution, task.build_inputs, self._config.task.train_data)
     validation_dataset = orbit.utils.make_distributed_dataset(
         distribution, task.build_inputs, self._config.task.validation_data)
     self._config.task.train_data = None
     self._config.task.validation_data = None
     trainer = trainer_lib.Trainer(
         self._config,
         task,
         model=task.build_model(),
         optimizer=task.create_optimizer(self._config.trainer.optimizer_config,
                                         self._config.runtime),
         train_dataset=train_dataset,
         validation_dataset=validation_dataset)
   logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
   self.assertIn('training_loss', logs)
   self.assertIn('learning_rate', logs)
   logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32))
   self.assertIn('validation_loss', logs)
Example #11
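Minimal helper that builds a Trainer from the stored config and a MockTask.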
 def create_test_trainer(self):
     task = mock_task.MockTask()
     trainer = trainer_lib.Trainer(self._config, task)
     return trainer
Example #12
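Helper variant that also constructs the model explicitly before creating the Trainer.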
 def create_test_trainer(self, config):
   task = mock_task.MockTask()
   trainer = trainer_lib.Trainer(config, task, model=task.build_model())
   return trainer