def test_get_checkpoints_to_test(test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    config.outputs_folder.mkdir()
    manage_recovery = get_default_checkpoint_handler(model_config=config,
                                                     project_root=test_output_dirs.root_dir)

    # Set a local_weights_path to get the checkpoint from. The model has not been trained and no run recovery
    # is provided, so the local weights should be used, ignoring any epochs_to_test.
    config.epochs_to_test = [1, 2]
    local_weights_path = test_output_dirs.root_dir / "exist.pth"
    stored_checkpoint = create_checkpoint_path(full_ml_test_data_path("checkpoints"), epoch=1)
    shutil.copyfile(str(stored_checkpoint), local_weights_path)
    config.local_weights_path = local_weights_path
    manage_recovery.discover_and_download_checkpoints_from_previous_runs()
    checkpoint_and_paths = manage_recovery.get_checkpoints_to_test()
    assert checkpoint_and_paths
    assert len(checkpoint_and_paths) == 1
    assert checkpoint_and_paths[0].epoch == 0
    assert checkpoint_and_paths[0].checkpoint_paths == [manage_recovery.model_config.outputs_folder / WEIGHTS_FILE]

    # Now set a run recovery object and set the start epoch to 1, so we get one epoch from
    # run recovery and one from the training checkpoints.
    manage_recovery.azure_config.run_recovery_id = DEFAULT_RUN_RECOVERY_ID
    config.start_epoch = 1
    manage_recovery.additional_training_done()
    manage_recovery.discover_and_download_checkpoints_from_previous_runs()

    # Copy a checkpoint into the checkpoint folder to make it seem like training has happened.
    stored_checkpoint = create_checkpoint_path(path=full_ml_test_data_path("checkpoints"), epoch=1)
    expected_checkpoint = create_checkpoint_path(path=config.checkpoint_folder, epoch=2)
    shutil.copyfile(str(stored_checkpoint), str(expected_checkpoint))

    checkpoint_and_paths = manage_recovery.get_checkpoints_to_test()
    assert checkpoint_and_paths
    assert len(checkpoint_and_paths) == 2
    assert checkpoint_and_paths[0].epoch == 1
    assert checkpoint_and_paths[0].checkpoint_paths == [
        create_checkpoint_path(path=config.checkpoint_folder / DEFAULT_RUN_RECOVERY_ID.split(":")[1], epoch=1)]
    assert checkpoint_and_paths[1].epoch == 2
    assert checkpoint_and_paths[1].checkpoint_paths == [create_checkpoint_path(path=config.checkpoint_folder, epoch=2)]

    # This epoch does not exist.
    config.epochs_to_test = [3]
    checkpoint_and_paths = manage_recovery.get_checkpoints_to_test()
    assert checkpoint_and_paths is None

def get_path_to_checkpoint(self, epoch: int) -> Path:
    """
    Returns the full path to a checkpoint for the given epoch.
    :param epoch: the epoch number
    :return: path to the checkpoint for that epoch
    """
    return create_checkpoint_path(self.checkpoint_folder, epoch=epoch)

def get_checkpoint_paths(self, epoch: int, for_mean_teacher_model: bool = False) -> List[Path]:
    """
    Returns the checkpoint paths for the given epoch, one per checkpoint root.
    """
    return [create_checkpoint_path(x, epoch, for_mean_teacher_model) for x in self.checkpoints_roots]

def get_path_to_checkpoint(self, epoch: int) -> Path:
    """
    Returns the full path to a checkpoint for the given epoch.
    :param epoch: the epoch number
    :return: path to the checkpoint for that epoch
    """
    return create_checkpoint_path(path=fixed_paths.repository_root_directory() / self.checkpoint_folder,
                                  epoch=epoch)

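# NOTE: Every snippet in this section builds paths via create_checkpoint_path. Below is only an illustrative
# sketch of the assumed behaviour (root folder + epoch, with an optional mean-teacher variant). The "_sketch"
# suffix and the exact filename convention are assumptions for illustration, not the repository's code.
from pathlib import Path


def create_checkpoint_path_sketch(path: Path, epoch: int, for_mean_teacher_model: bool = False) -> Path:
    """Builds the checkpoint file path for an epoch under the given root folder (illustration only)."""
    # Assumed naming scheme: "<epoch>_checkpoint.pth.tar", with a distinct name for the mean teacher model.
    suffix = "_mean_teacher_checkpoint.pth.tar" if for_mean_teacher_model else "_checkpoint.pth.tar"
    return path / f"{epoch}{suffix}"


# Example usage of the sketch: create_checkpoint_path_sketch(Path("outputs/checkpoints"), epoch=2)
# yields outputs/checkpoints/2_checkpoint.pth.tar under the assumed naming scheme.
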
def test_create_inference_pipeline(config: ModelConfigBase,
                                   checkpoint_folder: str,
                                   inference_type: type,
                                   ensemble_type: type,
                                   test_output_dirs: OutputFolderForTests) -> None:
    config.set_output_to(test_output_dirs.root_dir)
    # Mimic the behaviour that checkpoints are downloaded from blob storage into the checkpoints folder.
    stored_checkpoints = full_ml_test_data_path(checkpoint_folder)
    shutil.copytree(str(stored_checkpoints), str(config.checkpoint_folder))
    checkpoint_path = create_checkpoint_path(stored_checkpoints, epoch=1)
    assert isinstance(create_inference_pipeline(config, [checkpoint_path]), inference_type)
    assert isinstance(create_inference_pipeline(config, [checkpoint_path] * 2), ensemble_type)

def get_checkpoint_paths(self, epoch: int) -> List[Path]:
    return [create_checkpoint_path(x, epoch) for x in self.checkpoints_roots]

def test_recover_testing_from_run_recovery(mean_teacher_model: bool,
                                           test_output_dirs: OutputFolderForTests) -> None:
    """
    Checks that inference results are the same whether they come from a checkpoint of the same run,
    from a run recovery, or from a local_weights_path parameter.
    """
    # Train for 4 epochs
    config = DummyClassification()
    if mean_teacher_model:
        config.mean_teacher_alpha = 0.999
    config.set_output_to(test_output_dirs.root_dir / "original")
    os.makedirs(str(config.outputs_folder))

    config.save_start_epoch = 2
    config.save_step_epochs = 2

    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    train_results = model_train(config, checkpoint_handler=checkpoint_handler)
    assert len(train_results.learning_rates_per_epoch) == config.num_epochs

    # Run inference on this model
    test_results = model_test(config=config,
                              data_split=ModelExecutionMode.TEST,
                              checkpoint_handler=checkpoint_handler)
    assert isinstance(test_results, InferenceMetricsForClassification)
    assert list(test_results.epochs.keys()) == [config.num_epochs]

    # Mimic using a run recovery and check that the results are the same
    config_run_recovery = DummyClassification()
    if mean_teacher_model:
        config_run_recovery.mean_teacher_alpha = 0.999
    config_run_recovery.set_output_to(test_output_dirs.root_dir / "run_recovery")
    os.makedirs(str(config_run_recovery.outputs_folder))

    checkpoint_handler_run_recovery = get_default_checkpoint_handler(model_config=config_run_recovery,
                                                                     project_root=test_output_dirs.root_dir)
    # Make it seem like run recovery objects have been downloaded
    checkpoint_root = config_run_recovery.checkpoint_folder / "recovered"
    shutil.copytree(str(config.checkpoint_folder), str(checkpoint_root))
    checkpoint_handler_run_recovery.run_recovery = RunRecovery([checkpoint_root])

    test_results_run_recovery = model_test(config_run_recovery,
                                           data_split=ModelExecutionMode.TEST,
                                           checkpoint_handler=checkpoint_handler_run_recovery)
    assert isinstance(test_results_run_recovery, InferenceMetricsForClassification)
    assert list(test_results_run_recovery.epochs.keys()) == [config.num_epochs]
    assert test_results.epochs[config.num_epochs].values()[MetricType.CROSS_ENTROPY.value] == \
           test_results_run_recovery.epochs[config.num_epochs].values()[MetricType.CROSS_ENTROPY.value]

    # Run inference with the local checkpoints
    config_local_weights = DummyClassification()
    if mean_teacher_model:
        config_local_weights.mean_teacher_alpha = 0.999
    config_local_weights.set_output_to(test_output_dirs.root_dir / "local_weights_path")
    os.makedirs(str(config_local_weights.outputs_folder))

    local_weights_path = test_output_dirs.root_dir / "local_weights_file.pth"
    shutil.copyfile(str(create_checkpoint_path(config.checkpoint_folder, epoch=config.num_epochs)),
                    local_weights_path)
    config_local_weights.local_weights_path = local_weights_path

    checkpoint_handler_local_weights = get_default_checkpoint_handler(model_config=config_local_weights,
                                                                      project_root=test_output_dirs.root_dir)
    checkpoint_handler_local_weights.discover_and_download_checkpoints_from_previous_runs()
    test_results_local_weights = model_test(config_local_weights,
                                            data_split=ModelExecutionMode.TEST,
                                            checkpoint_handler=checkpoint_handler_local_weights)
    assert isinstance(test_results_local_weights, InferenceMetricsForClassification)
    assert list(test_results_local_weights.epochs.keys()) == [0]
    assert test_results.epochs[config.num_epochs].values()[MetricType.CROSS_ENTROPY.value] == \
           test_results_local_weights.epochs[0].values()[MetricType.CROSS_ENTROPY.value]

def test_get_recovery_path_train(test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    config.outputs_folder.mkdir()
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)

    assert checkpoint_handler.get_recovery_path_train() is None

    checkpoint_handler.azure_config.run_recovery_id = DEFAULT_RUN_RECOVERY_ID
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()

    # We have not set a start_epoch, but we are trying to use run_recovery: this should fail
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_recovery_path_train()
    assert "Run recovery set, but start epoch is 0" in ex.value.args[0]

    # Run recovery with a start epoch provided should succeed
    config.start_epoch = 20
    expected_path = create_checkpoint_path(path=config.checkpoint_folder / DEFAULT_RUN_RECOVERY_ID.split(":")[1],
                                           epoch=config.start_epoch)
    assert checkpoint_handler.get_recovery_path_train() == expected_path

    # Set an ensemble run as recovery - not supported
    checkpoint_handler.azure_config.run_recovery_id = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_recovery_path_train()
    assert "Found more than one checkpoint for epoch" in ex.value.args[0]

    # Weights from local_weights_path and weights_url are modified if needed and stored at this location
    expected_path = checkpoint_handler.model_config.outputs_folder / WEIGHTS_FILE

    # Set a weights_url to get the checkpoint from
    checkpoint_handler.azure_config.run_recovery_id = ""
    config.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()
    assert checkpoint_handler.local_weights_path == expected_path
    config.start_epoch = 0
    assert checkpoint_handler.get_recovery_path_train() == expected_path
    # Can't resume training from an external checkpoint
    config.start_epoch = 20
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_recovery_path_train()
    assert "Start epoch is > 0, but no run recovery object has been provided to resume training." in ex.value.args[0]

    # Set a local_weights_path to get the checkpoint from
    config.weights_url = ""
    local_weights_path = test_output_dirs.root_dir / "exist.pth"
    stored_checkpoint = create_checkpoint_path(full_ml_test_data_path("checkpoints"), epoch=1)
    shutil.copyfile(str(stored_checkpoint), local_weights_path)
    config.local_weights_path = local_weights_path
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()
    assert checkpoint_handler.local_weights_path == expected_path
    config.start_epoch = 0
    assert checkpoint_handler.get_recovery_path_train() == expected_path
    # Can't resume training from an external checkpoint
    config.start_epoch = 20
    with pytest.raises(ValueError) as ex:
        checkpoint_handler.get_recovery_path_train()
    assert "Start epoch is > 0, but no run recovery object has been provided to resume training." in ex.value.args[0]

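# NOTE: The test above pins down the precedence rules that get_recovery_path_train is expected to follow.
# The function below is a hypothetical sketch of that decision logic, reconstructed only from the assertions
# in the test; it is not the actual CheckpointHandler implementation, and the name
# recovery_path_for_training_sketch is invented for illustration.
from pathlib import Path
from typing import Optional


def recovery_path_for_training_sketch(start_epoch: int,
                                      run_recovery: Optional["RunRecovery"],
                                      local_weights_path: Optional[Path]) -> Optional[Path]:
    """Resolves the checkpoint to resume training from, mirroring the behaviour the test asserts."""
    if run_recovery is not None:
        # Run recovery is only usable when resuming from a later epoch, and only for non-ensemble runs.
        if start_epoch == 0:
            raise ValueError("Run recovery set, but start epoch is 0.")
        checkpoints = run_recovery.get_checkpoint_paths(start_epoch)
        if len(checkpoints) > 1:
            raise ValueError(f"Found more than one checkpoint for epoch {start_epoch}.")
        return checkpoints[0]
    if start_epoch > 0:
        raise ValueError("Start epoch is > 0, but no run recovery object has been provided to resume training.")
    # Weights downloaded from weights_url or copied from local_weights_path end up as a single local file.
    return local_weights_path
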
def test_discover_and_download_checkpoints_from_previous_runs(test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    config.outputs_folder.mkdir()

    # No checkpoint handling options set.
    checkpoint_handler = get_default_checkpoint_handler(model_config=config,
                                                        project_root=test_output_dirs.root_dir)
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()
    assert not checkpoint_handler.run_recovery
    assert not checkpoint_handler.local_weights_path

    # Set a run recovery object - non-ensemble
    checkpoint_handler.azure_config.run_recovery_id = DEFAULT_RUN_RECOVERY_ID
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()
    expected_checkpoint_root = config.checkpoint_folder / DEFAULT_RUN_RECOVERY_ID.split(":")[1]
    expected_paths = [create_checkpoint_path(path=expected_checkpoint_root, epoch=epoch)
                      for epoch in [1, 2, 3, 4, 20]]
    assert checkpoint_handler.run_recovery
    assert checkpoint_handler.run_recovery.checkpoints_roots == [expected_checkpoint_root]
    for path in expected_paths:
        assert path.is_file()

    # Set a run recovery object - ensemble
    checkpoint_handler.azure_config.run_recovery_id = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()
    expected_checkpoint_roots = [config.checkpoint_folder / OTHER_RUNS_SUBDIR_NAME / str(i) for i in range(3)]
    expected_path_lists = [[create_checkpoint_path(path=expected_checkpoint_root, epoch=epoch)
                            for epoch in [1, 2]]
                           for expected_checkpoint_root in expected_checkpoint_roots]
    assert set(checkpoint_handler.run_recovery.checkpoints_roots) == set(expected_checkpoint_roots)
    for path_list in expected_path_lists:
        for path in path_list:
            assert path.is_file()

    # Weights from local_weights_path and weights_url are modified if needed and stored at this location
    expected_path = checkpoint_handler.model_config.outputs_folder / WEIGHTS_FILE

    # Set a weights_url
    checkpoint_handler.azure_config.run_recovery_id = ""
    config.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()
    assert checkpoint_handler.local_weights_path == expected_path
    assert checkpoint_handler.local_weights_path.is_file()

    # Set a local_weights_path
    config.weights_url = ""
    local_weights_path = test_output_dirs.root_dir / "exist.pth"
    stored_checkpoint = create_checkpoint_path(path=full_ml_test_data_path("checkpoints"), epoch=1)
    shutil.copyfile(str(stored_checkpoint), local_weights_path)
    config.local_weights_path = local_weights_path
    checkpoint_handler.discover_and_download_checkpoints_from_previous_runs()
    assert checkpoint_handler.local_weights_path == expected_path

def test_get_checkpoint_from_epoch(test_output_dirs: OutputFolderForTests) -> None:
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(test_output_dirs.root_dir)
    config.outputs_folder.mkdir()
    manage_recovery = get_default_checkpoint_handler(model_config=config,
                                                     project_root=test_output_dirs.root_dir)

    # We have not set a run_recovery, nor have we trained, so this should fail to get a checkpoint
    with pytest.raises(ValueError) as ex:
        manage_recovery.get_checkpoint_from_epoch(1)
    assert "no run recovery object provided and no training has been done in this run" in ex.value.args[0]

    # We have set a run_recovery_id now, so this should work
    manage_recovery.azure_config.run_recovery_id = DEFAULT_RUN_RECOVERY_ID
    manage_recovery.discover_and_download_checkpoints_from_previous_runs()
    expected_checkpoint = create_checkpoint_path(path=config.checkpoint_folder / DEFAULT_RUN_RECOVERY_ID.split(":")[1],
                                                 epoch=1)
    checkpoint = manage_recovery.get_checkpoint_from_epoch(1)
    assert checkpoint
    assert len(checkpoint.checkpoint_paths) == 1
    assert expected_checkpoint == checkpoint.checkpoint_paths[0]
    assert checkpoint.epoch == 1

    # Ensemble run recovery
    manage_recovery.azure_config.run_recovery_id = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID
    manage_recovery.discover_and_download_checkpoints_from_previous_runs()
    expected_checkpoints = [create_checkpoint_path(path=config.checkpoint_folder / OTHER_RUNS_SUBDIR_NAME / str(i),
                                                   epoch=1)
                            for i in range(3)]
    checkpoint = manage_recovery.get_checkpoint_from_epoch(1)
    assert checkpoint
    assert len(checkpoint.checkpoint_paths) == 3
    assert set(expected_checkpoints) == set(checkpoint.checkpoint_paths)
    assert checkpoint.epoch == 1

    # From now on, the checkpoint handler will think that the run was started from epoch 1, i.e. we should use the
    # run recovery checkpoint for epoch 1 and the training run checkpoint for epoch 2
    manage_recovery.additional_training_done()
    # Go back to non-ensemble run recovery
    manage_recovery.azure_config.run_recovery_id = DEFAULT_RUN_RECOVERY_ID
    manage_recovery.discover_and_download_checkpoints_from_previous_runs()
    config.start_epoch = 1

    # We haven't actually done a training run, so the checkpoint for epoch 2 is missing - and we should not use the
    # one from run recovery
    assert manage_recovery.get_checkpoint_from_epoch(2) is None

    # Should work for epoch 1
    checkpoint = manage_recovery.get_checkpoint_from_epoch(1)
    expected_checkpoint = create_checkpoint_path(path=config.checkpoint_folder / DEFAULT_RUN_RECOVERY_ID.split(":")[1],
                                                 epoch=1)
    assert checkpoint
    assert len(checkpoint.checkpoint_paths) == 1
    assert checkpoint.checkpoint_paths[0] == expected_checkpoint
    assert checkpoint.epoch == 1

    # Copy over checkpoints to make it look like training has happened
    stored_checkpoint = create_checkpoint_path(path=full_ml_test_data_path("checkpoints"), epoch=1)
    expected_checkpoint = create_checkpoint_path(path=config.checkpoint_folder, epoch=2)
    shutil.copyfile(str(stored_checkpoint), str(expected_checkpoint))

    # Should now work for epoch 2
    checkpoint = manage_recovery.get_checkpoint_from_epoch(2)
    assert checkpoint
    assert len(checkpoint.checkpoint_paths) == 1
    assert expected_checkpoint == checkpoint.checkpoint_paths[0]
    assert checkpoint.epoch == 2
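
# NOTE: The tests only rely on two attributes of the objects returned by get_checkpoint_from_epoch and
# get_checkpoints_to_test: .epoch and .checkpoint_paths. A minimal stand-in consistent with that usage is
# sketched below; the real class in the codebase may carry a different name and additional fields.
from dataclasses import dataclass
from pathlib import Path
from typing import List


@dataclass
class CheckpointsForEpochSketch:
    epoch: int
    checkpoint_paths: List[Path]


# Example: an ensemble run recovery for epoch 1 would surface as one object holding three paths, matching the
# assertions in test_get_checkpoint_from_epoch (filenames here follow the assumed naming scheme above).
example = CheckpointsForEpochSketch(epoch=1,
                                    checkpoint_paths=[Path(f"other_runs/{i}/1_checkpoint.pth.tar") for i in range(3)])
assert example.epoch == 1 and len(example.checkpoint_paths) == 3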