示例#1
0
    def test_save_load(self):
        fd, checkpoint_path = tempfile.mkstemp()

        task1 = create_task("task1")
        task2 = create_task("task2")
        # Make task2's second linear layer have different weights
        task2.module_pool["linear2"] = nn.Linear(2, 2)

        model = MultitaskModel([task1])
        self.assertTrue(
            torch.eq(
                task1.module_pool["linear2"].weight,
                model.module_pool["linear2"].module.weight,
            ).all())
        model.save(checkpoint_path)
        model = MultitaskModel([task2])
        self.assertFalse(
            torch.eq(
                task1.module_pool["linear2"].weight,
                model.module_pool["linear2"].module.weight,
            ).all())
        model.load(checkpoint_path)
        self.assertTrue(
            torch.eq(
                task1.module_pool["linear2"].weight,
                model.module_pool["linear2"].module.weight,
            ).all())

        os.close(fd)
示例#2
0
    def checkpoint(
        self, iteration: float, model: MultitaskModel, metric_dict: Metrics
    ) -> None:
        """Check if iteration and current metrics necessitate a checkpoint.

        Parameters
        ----------
        iteration
            Current training iteration
        model
            Model to checkpoint
        metric_dict
            Current performance metrics for model
        """
        # Check if the checkpoint_runway condition is met
        if iteration < self.checkpoint_runway:
            return
        elif not self.checkpoint_condition_met and iteration >= self.checkpoint_runway:
            self.checkpoint_condition_met = True
            logging.info(
                "checkpoint_runway condition has been met. Start checkpointing."
            )

        checkpoint_path = f"{self.checkpoint_dir}/checkpoint_{iteration}.pth"
        model.save(checkpoint_path)
        logging.info(
            f"Save checkpoint at {iteration} {self.checkpoint_unit} "
            f"at {checkpoint_path}."
        )

        if not set(self.checkpoint_task_metrics.keys()).isdisjoint(
            set(metric_dict.keys())
        ):
            new_best_metrics = self._is_new_best(metric_dict)
            for metric in new_best_metrics:
                copyfile(
                    checkpoint_path,
                    f"{self.checkpoint_dir}/best_model_"
                    f"{metric.replace('/', '_')}.pth",
                )

                logging.info(
                    f"Save best model of metric {metric} at {self.checkpoint_dir}"
                    f"/best_model_{metric.replace('/', '_')}.pth"
                )