def test_save_load(self):
    fd, checkpoint_path = tempfile.mkstemp()
    task1 = create_task("task1")
    task2 = create_task("task2")
    # Make task2's second linear layer have different weights
    task2.module_pool["linear2"] = nn.Linear(2, 2)

    model = MultitaskModel([task1])
    self.assertTrue(
        torch.eq(
            task1.module_pool["linear2"].weight,
            model.module_pool["linear2"].module.weight,
        ).all()
    )
    model.save(checkpoint_path)

    model = MultitaskModel([task2])
    self.assertFalse(
        torch.eq(
            task1.module_pool["linear2"].weight,
            model.module_pool["linear2"].module.weight,
        ).all()
    )

    model.load(checkpoint_path)
    self.assertTrue(
        torch.eq(
            task1.module_pool["linear2"].weight,
            model.module_pool["linear2"].module.weight,
        ).all()
    )
    os.close(fd)
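# The test above relies on a `create_task` helper that is not shown in this
# excerpt. A minimal sketch of what it might look like, assuming a `Task`
# constructor that takes a name and a module pool; the exact Task signature,
# and any op-sequence or loss wiring the real helper does, are assumptions
# made here for illustration only:
from torch import nn


def create_task(task_name):
    # Two small linear layers; "linear2" is the layer the test replaces on
    # task2 so the two models start out with different weights.
    module_pool = nn.ModuleDict(
        {"linear1": nn.Linear(2, 2), "linear2": nn.Linear(2, 2)}
    )
    return Task(name=task_name, module_pool=module_pool)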
def checkpoint(
    self, iteration: float, model: MultitaskModel, metric_dict: Metrics
) -> None:
    """Check if iteration and current metrics necessitate a checkpoint.

    Parameters
    ----------
    iteration
        Current training iteration
    model
        Model to checkpoint
    metric_dict
        Current performance metrics for model
    """
    # Check if the checkpoint_runway condition is met
    if iteration < self.checkpoint_runway:
        return
    elif not self.checkpoint_condition_met and iteration >= self.checkpoint_runway:
        self.checkpoint_condition_met = True
        logging.info(
            "checkpoint_runway condition has been met. Start checkpointing."
        )

    checkpoint_path = f"{self.checkpoint_dir}/checkpoint_{iteration}.pth"
    model.save(checkpoint_path)
    logging.info(
        f"Save checkpoint at {iteration} {self.checkpoint_unit} "
        f"at {checkpoint_path}."
    )

    if not set(self.checkpoint_task_metrics.keys()).isdisjoint(
        set(metric_dict.keys())
    ):
        new_best_metrics = self._is_new_best(metric_dict)
        for metric in new_best_metrics:
            copyfile(
                checkpoint_path,
                f"{self.checkpoint_dir}/best_model_"
                f"{metric.replace('/', '_')}.pth",
            )
            logging.info(
                f"Save best model of metric {metric} at {self.checkpoint_dir}"
                f"/best_model_{metric.replace('/', '_')}.pth"
            )
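# The method above delegates the "is this a new best?" decision to
# self._is_new_best, which is not shown in this excerpt. A minimal sketch of
# that helper, assuming the checkpointer keeps the best value seen so far in
# a dict named self.best_metric_dict and stores the comparison mode ("max" or
# "min") as the values of self.checkpoint_task_metrics; both storage details
# are assumptions for illustration, not taken from this excerpt:
from typing import Set


def _is_new_best(self, metric_dict: Metrics) -> Set[str]:
    """Return the monitored metrics whose current value beats the best so far."""
    new_best = set()
    for metric, value in metric_dict.items():
        if metric not in self.checkpoint_task_metrics:
            continue  # Not a metric this checkpointer monitors
        mode = self.checkpoint_task_metrics[metric]  # "max" or "min"
        best_so_far = self.best_metric_dict.get(metric)
        if (
            best_so_far is None
            or (mode == "max" and value > best_so_far)
            or (mode == "min" and value < best_so_far)
        ):
            self.best_metric_dict[metric] = value
            new_best.add(metric)
    return new_best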