    def test_update_classy_model(self):
        """
        Tests that update_classy_model successfully updates a model from a
        checkpoint.
        """
        config = get_fast_test_task_config()
        task = build_task(config)
        trainer = LocalTrainer()
        trainer.train(task)
        for reset_heads in [False, True]:
            task_2 = build_task(config)
            # prepare task_2 for the right device
            task_2.prepare()
            update_classy_model(task_2.model,
                                task.model.get_classy_state(deep_copy=True),
                                reset_heads)
            self._compare_model_state(
                task.model.get_classy_state(),
                task_2.model.get_classy_state(),
                check_heads=not reset_heads,
            )
            if reset_heads:
                # the model head states should be different
                with self.assertRaises(Exception):
                    self._compare_model_state(
                        task.model.get_classy_state(),
                        task_2.model.get_classy_state(),
                        check_heads=True,
                    )
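The _compare_model_state helper is not shown in these examples. Below is a minimal sketch, assuming get_classy_state() returns a dict of the form {"model": {"trunk": {...}, "heads": {...}}} with tensors as leaves; the test suite's real helper may differ.

    def _compare_model_state(self, state_1, state_2, check_heads=True):
        # compare the trunk parameters
        for name, param in state_1["model"]["trunk"].items():
            self.assertTrue(torch.allclose(param, state_2["model"]["trunk"][name]))
        # optionally compare the head parameters as well
        if check_heads:
            for block, head_state in state_1["model"]["heads"].items():
                for name, param in head_state.items():
                    self.assertTrue(
                        torch.allclose(param, state_2["model"]["heads"][block][name])
                    )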
Example #2
    def test_checkpointing(self):
        """
        Tests checkpointing by running train_steps to make sure the train_steps
        run the same way after loading from a checkpoint.
        """
        config = get_fast_test_task_config()
        task = build_task(config).set_hooks([LossLrMeterLoggingHook()])
        task_2 = build_task(config).set_hooks([LossLrMeterLoggingHook()])

        task.set_use_gpu(torch.cuda.is_available())

        # prepare the tasks for the right device
        task.prepare()

        # test in both train and test mode
        for _ in range(2):
            task.advance_phase()

            # set task's state as task_2's checkpoint
            task_2.set_checkpoint(get_checkpoint_dict(task, {}, deep_copy=True))
            task_2.prepare()

            # task 2 should have the same state
            self._compare_states(task.get_classy_state(), task_2.get_classy_state())

            # this tests that both states' iterators return the same samples
            sample = next(task.get_data_iterator())
            sample_2 = next(task_2.get_data_iterator())
            self._compare_samples(sample, sample_2)

            # test that the train step runs the same way on both states
            # and the loss remains the same
            task.train_step()
            task_2.train_step()
            self._compare_states(task.get_classy_state(), task_2.get_classy_state())
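The _compare_states helper is likewise not shown. A minimal sketch, assuming the classy state is a nested structure of dicts, tensors, and plain values (an assumption, not the suite's actual implementation):

    def _compare_states(self, state_1, state_2):
        self.assertEqual(type(state_1), type(state_2))
        if isinstance(state_1, dict):
            # recurse into nested state dicts
            self.assertEqual(set(state_1.keys()), set(state_2.keys()))
            for key in state_1:
                self._compare_states(state_1[key], state_2[key])
        elif isinstance(state_1, torch.Tensor):
            self.assertTrue(torch.allclose(state_1, state_2))
        else:
            self.assertEqual(state_1, state_2)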
Example #3
    def test_training(self):
        config = get_fast_test_task_config()
        config["amp_args"] = {"opt_level": "O2"}
        task = build_task(config)
        task.set_use_gpu(True)
        trainer = LocalTrainer()
        trainer.train(task)

    def test_checkpointing(self):
        # make checkpoint directory
        checkpoint_folder = self.base_dir + "/checkpoint/"
        os.mkdir(checkpoint_folder)

        config = get_fast_test_task_config()
        cuda_available = torch.cuda.is_available()
        task = build_task(config)

        task.prepare(use_gpu=cuda_available)

        # create a checkpoint hook
        checkpoint_hook = CheckpointHook(checkpoint_folder, {},
                                         phase_types=["train"])

        # call the on end phase function
        checkpoint_hook.on_phase_end(task)

        # we should be able to train a task using the checkpoint on all available
        # devices
        for use_gpu in {False, cuda_available}:
            # load the checkpoint
            checkpoint = load_checkpoint(checkpoint_folder)

            # create a new task
            task = build_task(config)

            # set the checkpoint
            task.set_checkpoint(checkpoint)

            task.prepare(use_gpu=use_gpu)

            # we should be able to run the trainer using the checkpoint
            trainer = LocalTrainer(use_gpu=use_gpu)
            trainer.train(task)
    def test_test_only_task(self):
        """
        Tests the task in test mode by running train_steps
        to make sure the train_steps run as expected on a
        test_only task
        """
        test_config = get_fast_test_task_config()
        test_config["test_only"] = True

        # delete train dataset
        del test_config["dataset"]["train"]

        test_only_task = build_task(test_config).set_hooks(
            [LossLrMeterLoggingHook()])

        test_only_task.prepare()
        test_state = test_only_task.get_classy_state()

        # We expect the test_only task to be in test mode, regardless of the train state
        self.assertFalse(test_state["train"])

        # Num updates should be 0
        self.assertEqual(test_state["num_updates"], 0)

        # Verify task will run
        trainer = LocalTrainer()
        trainer.train(test_only_task)
    def test_final_train_checkpoint(self):
        """Test that a train phase checkpoint with a where of 1.0 can be loaded"""

        config = get_fast_test_task_config()
        task = build_task(config).set_hooks(
            [CheckpointHook(self.base_dir, {}, phase_types=["train"])])
        task_2 = build_task(config)

        use_gpu = torch.cuda.is_available()

        trainer = LocalTrainer(use_gpu=use_gpu)
        trainer.train(task)

        # load the final train checkpoint
        checkpoint = load_checkpoint(self.base_dir)

        # make sure that fetching task.where raises an exception, which means
        # that where is >= 1.0
        with self.assertRaises(Exception):
            task.where

        # set task_2's state as task's final train checkpoint
        task_2.set_checkpoint(checkpoint)
        task_2.prepare(use_gpu=use_gpu)

        # we should be able to train the task
        trainer.train(task_2)
Example #7
    def test_checkpointing(self):
        """
        Tests checkpointing by running train_steps to make sure the train_steps
        run the same way after loading from a checkpoint.
        """
        config = get_fast_test_task_config()
        task = build_task(config).set_hooks([LossLrMeterLoggingHook()])
        task_2 = build_task(config).set_hooks([LossLrMeterLoggingHook()])

        task.set_use_gpu(torch.cuda.is_available())

        # only train 1 phase at a time
        trainer = LimitedPhaseTrainer(num_phases=1)

        while not task.done_training():
            # set task's state as task_2's checkpoint
            task_2._set_checkpoint_dict(
                get_checkpoint_dict(task, {}, deep_copy=True))

            # task 2 should have the same state before training
            self._compare_states(task.get_classy_state(),
                                 task_2.get_classy_state())

            # train for one phase
            trainer.train(task)
            trainer.train(task_2)

            # task 2 should have the same state after training
            self._compare_states(task.get_classy_state(),
                                 task_2.get_classy_state())
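LimitedPhaseTrainer is a helper specific to these tests. A hedged sketch of how such a trainer could be built from the LocalTrainer and ClassyHook APIs used elsewhere in these examples; the suite's actual implementation may differ:

from classy_vision.hooks import ClassyHook
from classy_vision.trainer import LocalTrainer


class _StopAfterPhases(ClassyHook):
    # no-op for every event except on_phase_end
    on_start = ClassyHook._noop
    on_phase_start = ClassyHook._noop
    on_step = ClassyHook._noop
    on_end = ClassyHook._noop

    def __init__(self, num_phases):
        super().__init__()
        self.num_phases = num_phases
        self.phases_done = 0

    def on_phase_end(self, task):
        self.phases_done += 1
        if self.phases_done >= self.num_phases:
            raise StopIteration  # unwind out of the training loop


class LimitedPhaseTrainer(LocalTrainer):
    # trains a task for at most num_phases phases per train() call
    def __init__(self, num_phases):
        super().__init__()
        self.num_phases = num_phases

    def train(self, task):
        hook = _StopAfterPhases(self.num_phases)
        task.hooks = task.hooks + [hook]
        try:
            super().train(task)
        except StopIteration:
            pass
        finally:
            task.hooks = [h for h in task.hooks if h is not hook]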
    def test_test_only_checkpointing(self):
        """
        Tests checkpointing by running train_steps to make sure the
        train_steps run the same way after loading from a training
        task checkpoint on a test_only task.
        """
        train_config = get_fast_test_task_config()
        train_config["num_epochs"] = 10
        test_config = get_fast_test_task_config()
        test_config["test_only"] = True
        train_task = build_task(train_config).set_hooks(
            [LossLrMeterLoggingHook()])
        test_only_task = build_task(test_config).set_hooks(
            [LossLrMeterLoggingHook()])

        use_gpu = torch.cuda.is_available()

        # prepare the tasks for the right device
        train_task.prepare(use_gpu=use_gpu)

        # train the task; this exercises both train and test phases
        trainer = LocalTrainer(use_gpu=use_gpu)
        trainer.train(train_task)

        # use the trained task's state as the test_only task's checkpoint
        test_only_task.set_checkpoint(
            get_checkpoint_dict(train_task, {}, deep_copy=True))
        test_only_task.prepare(use_gpu=use_gpu)
        test_state = test_only_task.get_classy_state()

        # We expect the phase idx to be different for a test only task
        self.assertEqual(test_state["phase_idx"], -1)

        # We expect the test_only task to be in test mode, regardless of the train state
        self.assertFalse(test_state["train"])

        # Num updates should be 0
        self.assertEqual(test_state["num_updates"], 0)

        # train_phase_idx should be -1
        self.assertEqual(test_state["train_phase_idx"], -1)

        # Verify task will run
        trainer = LocalTrainer(use_gpu=use_gpu)
        trainer.train(test_only_task)
    def test_train_step(self):
        # test that the model can be run in a train step
        model = models.resnet34(pretrained=False)
        classy_model = ClassyModelWrapper(model)

        config = get_fast_test_task_config()
        task = build_task(config)
        task.set_model(classy_model)
        trainer = LocalTrainer()
        trainer.train(task)
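ClassyModelWrapper is not defined in this snippet. A hedged sketch of such an adapter, exposing a plain torch.nn.Module as a ClassyModel (ClassyVision also provides ClassyModel.from_model for this purpose; the wrapper below is an assumption, not the test's actual class):

from classy_vision.models import ClassyModel


class ClassyModelWrapper(ClassyModel):
    # adapt an arbitrary nn.Module (e.g. a torchvision model) to the
    # ClassyModel interface expected by task.set_model
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x):
        return self.model(x)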
Example #10
    def test_synchronize_losses_when_losses_empty(self):
        config = get_fast_test_task_config()
        task = build_task(config)
        task.prepare()

        task.set_use_gpu(torch.cuda.is_available())

        # Losses should be empty when creating task
        self.assertEqual(len(task.losses), 0)

        task.synchronize_losses()
Example #11
    def _get_task_config(self):
        config = get_fast_test_task_config()
        config["optimizer"] = {
            "name": "zero",
            "base_optimizer": {
                "name": "sgd",
                "momentum": 0.9
            },
        }

        return config
    def _get_fine_tuning_config(self,
                                head_num_classes=100,
                                pretrained_checkpoint=False):
        config = get_fast_test_task_config(head_num_classes=head_num_classes)
        config["name"] = "fine_tuning"
        config["num_epochs"] = 2

        if pretrained_checkpoint:
            config["pretrained_checkpoint"] = "/path/to/pretrained/checkpoint"

        return config
Example #13
    def test_synchronize_losses_non_distributed(self):
        """
        Tests that synchronize losses has no side effects in a non-distributed setting.
        """
        test_config = get_fast_test_task_config()
        task = build_task(test_config)
        task.prepare()

        old_losses = copy.deepcopy(task.losses)
        task.synchronize_losses()
        self.assertEqual(old_losses, task.losses)
Example #14
    def test_fp16_grad_compression(self):
        # there is no API defined to check that a DDP hook has been enabled, so we just
        # test that we set the right variables
        config = copy.deepcopy(get_fast_test_task_config())
        task = build_task(config)
        self.assertFalse(task.fp16_grad_compress)

        config.setdefault("distributed", {})
        config["distributed"]["fp16_grad_compress"] = True

        task = build_task(config)
        self.assertTrue(task.fp16_grad_compress)
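For reference, fp16 gradient compression in DDP is enabled through PyTorch's communication-hook API; presumably the task registers something like the following during prepare (this wiring is an assumption, but the hook itself is standard PyTorch):

import torch.nn as nn
from torch.distributed.algorithms.ddp_comm_hooks import default_hooks
from torch.nn.parallel import DistributedDataParallel

# assumes torch.distributed.init_process_group has already been called
ddp_model = DistributedDataParallel(nn.Linear(8, 2))
# gradients are cast to fp16 before all-reduce and restored afterwards
ddp_model.register_comm_hook(state=None, hook=default_hooks.fp16_compress_hook)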
Example #15
    def test_update_classy_state(self):
        """
        Tests that update_classy_state successfully updates a task from a
        checkpoint.
        """
        config = get_fast_test_task_config()
        task = build_task(config)
        task_2 = build_task(config)
        task_2.prepare()
        trainer = LocalTrainer()
        trainer.train(task)
        update_classy_state(task_2, task.get_classy_state(deep_copy=True))
        self._compare_states(task.get_classy_state(), task_2.get_classy_state())

    def test_checkpointing_different_device(self):
        config = get_fast_test_task_config()
        task = build_task(config)
        task_2 = build_task(config)

        for use_gpu in [True, False]:
            task.prepare(use_gpu=use_gpu)

            # set task's state as task_2's checkpoint
            task_2.set_checkpoint(get_checkpoint_dict(task, {}, deep_copy=True))

            # we should be able to run the trainer using state from a different device
            trainer = LocalTrainer(use_gpu=not use_gpu)
            trainer.train(task_2)
    def test_training(self):
        # Test Apex AMP training
        config = get_fast_test_task_config()
        config["amp_args"] = {"opt_level": "O2"}
        task = build_task(config)
        task.set_use_gpu(True)
        trainer = LocalTrainer()
        trainer.train(task)

        # Test PyTorch AMP training
        config["amp_args"] = {"amp_type": "pytorch"}
        task = build_task(config)
        task.set_use_gpu(True)
        trainer = LocalTrainer()
        trainer.train(task)
Example #18
    def test_train_only_task(self):
        """
        Tests that the task runs when only a train dataset is specified.
        """
        test_config = get_fast_test_task_config()

        # delete the test dataset from the config
        del test_config["dataset"]["test"]

        task = build_task(test_config).set_hooks([LossLrMeterLoggingHook()])
        task.prepare()

        # verify that the task can still be trained
        trainer = LocalTrainer()
        trainer.train(task)
Example #19
    def test_train(self) -> None:
        for use_gpu in {False, torch.cuda.is_available()}:
            folder = f"{self.base_dir}/train_test/{use_gpu}"
            os.makedirs(folder)

            task = build_task(get_fast_test_task_config(head_num_classes=2))

            csv_hook = OutputCSVHook(folder)
            task.set_hooks([csv_hook])
            task.set_use_gpu(use_gpu)

            trainer = LocalTrainer()
            trainer.train(task)

            self.assertEqual(parse_csv(csv_hook.output_path), 10)
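parse_csv is a test helper that is not shown here. A plausible sketch, assuming it returns the number of prediction rows that OutputCSVHook wrote (the hook's exact file format is an assumption):

import csv


def parse_csv(file_path):
    # count the data rows in the hook's tab-separated output, header excluded
    with open(file_path, "r") as f:
        rows = list(csv.reader(f, delimiter="\t"))
    return len(rows) - 1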
Example #20
        def train_with_clipped_gradients(amp_args=None):
            task = build_task(get_fast_test_task_config())
            task.set_num_epochs(1)
            task.set_model(SimpleModel())
            task.set_loss(SimpleLoss())
            task.set_meters([])
            task.set_use_gpu(torch.cuda.is_available())
            task.set_clip_grad_norm(0.5)
            task.set_amp_args(amp_args)

            task.set_optimizer(SGD(lr=1))

            trainer = LocalTrainer()
            trainer.train(task)

            return task.model.param.grad.norm()
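A possible way to use the helper above: with set_clip_grad_norm(0.5), the final gradient norm should not exceed the bound, with or without AMP (the tolerance below is an assumption):

        grad_norm = train_with_clipped_gradients(amp_args=None)
        self.assertLessEqual(grad_norm, 0.5 + 1e-6)

        if torch.cuda.is_available():
            grad_norm = train_with_clipped_gradients({"amp_type": "pytorch"})
            self.assertLessEqual(grad_norm, 0.5 + 1e-6)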
Example #21
    def test_clip_stateful_loss(self):
        config = get_fast_test_task_config()
        config["loss"] = {"name": "test_stateful_loss", "in_plane": 256}
        config["grad_norm_clip"] = grad_norm_clip = 1
        task = build_task(config)
        task.set_use_gpu(False)
        task.prepare()

        # set fake gradients with norm > grad_norm_clip
        for param in itertools.chain(task.base_model.parameters(),
                                     task.base_loss.parameters()):
            param.grad = 1.1 + torch.rand(param.shape)
            self.assertGreater(param.grad.norm(), grad_norm_clip)

        task._clip_gradients(grad_norm_clip)

        for param in itertools.chain(task.base_model.parameters(),
                                     task.base_loss.parameters()):
            self.assertLessEqual(param.grad.norm(), grad_norm_clip)
Example #22
    def test_final_train_checkpoint(self):
        """Test that a train phase checkpoint with a where of 1.0 can be loaded"""

        config = get_fast_test_task_config()
        task = build_task(config).set_hooks(
            [CheckpointHook(self.base_dir, {}, phase_types=["train"])])
        task_2 = build_task(config)

        task.set_use_gpu(torch.cuda.is_available())

        trainer = LocalTrainer()
        trainer.train(task)

        self.assertAlmostEqual(task.where, 1.0, delta=1e-3)

        # set task_2's state as task's final train checkpoint
        task_2.set_checkpoint(self.base_dir)
        task_2.prepare()

        # we should be able to train the task
        trainer.train(task_2)
Example #23
    def train_with_batch(self, simulated_bs, actual_bs, clip_grad_norm=None):
        config = copy.deepcopy(get_fast_test_task_config())
        config["dataset"]["train"]["num_samples"] = 12
        config["dataset"]["train"]["batchsize_per_replica"] = actual_bs
        del config["dataset"]["test"]

        task = build_task(config)
        task.set_num_epochs(1)
        task.set_model(SimpleModel())
        task.set_loss(SimpleLoss())
        task.set_meters([])
        task.set_use_gpu(torch.cuda.is_available())
        if simulated_bs is not None:
            task.set_simulated_global_batchsize(simulated_bs)
        if clip_grad_norm is not None:
            task.set_clip_grad_norm(clip_grad_norm)

        task.set_optimizer(SGD(lr=1))

        trainer = LocalTrainer()
        trainer.train(task)

        return task.model.param
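A possible use of this helper: with the same simulated global batch size, gradient accumulation over smaller actual batches should land on (approximately) the same parameter as training with the full batch size directly. The test method below is a sketch of that comparison, not the suite's actual test:

    def test_gradient_accumulation(self):
        # 12 train samples; accumulate two batches of 2 per optimizer step
        baseline = self.train_with_batch(simulated_bs=4, actual_bs=4)
        accumulated = self.train_with_batch(simulated_bs=4, actual_bs=2)
        self.assertTrue(torch.allclose(baseline, accumulated, rtol=1e-4))
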
    def test_training(self):
        config = get_fast_test_task_config()
        config["amp_opt_level"] = "O2"
        task = build_task(config)
        trainer = LocalTrainer(use_gpu=True)
        trainer.train(task)
Example #25
    def test_get_classy_state_on_loss(self):
        config = get_fast_test_task_config()
        config["loss"] = {"name": "test_stateful_loss", "in_plane": 256}
        task = build_task(config)
        task.prepare()
        self.assertIn("alpha", task.get_classy_state()["loss"])
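The "test_stateful_loss" used above (and in the gradient-clipping example) is a registered test loss. A hedged sketch of what such a loss might look like; register_loss and ClassyLoss are the standard ClassyVision APIs, but the internals of the real test loss are an assumption:

import torch
import torch.nn as nn

from classy_vision.losses import ClassyLoss, register_loss


@register_loss("stateful_loss_sketch")  # hypothetical name
class StatefulLossSketch(ClassyLoss):
    def __init__(self, in_plane):
        super().__init__()
        # learnable state; surfaces as "alpha" under the "loss" key of the
        # task's classy state
        self.alpha = nn.Parameter(torch.ones(in_plane))

    @classmethod
    def from_config(cls, config):
        return cls(in_plane=config["in_plane"])

    def forward(self, output, target):
        # toy loss that exercises the learnable state
        return (output * self.alpha).sum()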
Example #26
    def _get_fine_tuning_config(self, head_num_classes=1000):
        config = get_fast_test_task_config(head_num_classes=head_num_classes)
        config["name"] = "fine_tuning"
        config["num_epochs"] = 10
        return config

    def _get_pre_train_config(self, head_num_classes=100):
        config = get_fast_test_task_config(head_num_classes=head_num_classes)
        config["num_epochs"] = 2
        return config
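A possible way these config helpers get used: pre-train briefly, then hand the pre-trained state to a fine-tuning task. The _set_pretrained_checkpoint_dict call mirrors the set_checkpoint/get_checkpoint_dict pattern from the earlier examples and is an assumption about the fine-tuning task's API:

    def test_fine_tuning_from_pre_trained_task(self):
        pre_train_task = build_task(self._get_pre_train_config())
        trainer = LocalTrainer()
        trainer.train(pre_train_task)

        # assumed API: load the pre-trained state into the fine-tuning task
        fine_tuning_task = build_task(self._get_fine_tuning_config())
        fine_tuning_task._set_pretrained_checkpoint_dict(
            get_checkpoint_dict(pre_train_task, {}, deep_copy=True)
        )
        trainer.train(fine_tuning_task)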