def __init__(self, config: DeepLearningConfig, *args: Any, **kwargs: Any) -> None:
    super().__init__(*args, **kwargs)
    self.outputs_folder = config.outputs_folder
    self.checkpoint_folder = config.checkpoint_folder
    self.model: DeviceAwareModule = DeviceAwareModule()
    # These two will be set later in set_optimizer_and_scheduler.
    # The ddp_spawn accelerator only works if the model configuration object is not stored in here.
    # Hence, operations that require the full config must be done in a way that does not require storing it.
    self.optimizer: Optional[Optimizer] = None
    self.l_rate_scheduler: Optional[_LRScheduler] = None
    self.cross_validation_split_index = config.cross_validation_split_index
    self.effective_random_seed = config.get_effective_random_seed()
    # This should be re-assigned on the outside, to a logger that is hooked up with the Trainer object.
    self.storing_logger = StoringLogger()
    # This will be initialized correctly in epoch_start.
    self.random_state: Optional[RandomStateSnapshot] = None
    # Training loggers
    self.train_metrics_folder = self.outputs_folder / ModelExecutionMode.TRAIN.value
    self.val_metrics_folder = self.outputs_folder / ModelExecutionMode.VAL.value
    fixed_logger_columns = {LoggingColumns.CrossValidationSplitIndex.value: config.cross_validation_split_index}
    self.train_epoch_metrics_logger = DataframeLogger(self.train_metrics_folder / EPOCH_METRICS_FILE_NAME,
                                                      fixed_columns=fixed_logger_columns)
    self.val_epoch_metrics_logger = DataframeLogger(self.val_metrics_folder / EPOCH_METRICS_FILE_NAME,
                                                    fixed_columns=fixed_logger_columns)
    # Stores information about the checkpoint that created this model, if any.
    self.checkpoint_loading_message = ""
def on_train_start(self) -> None:
    """
    Initializes the per-rank logger objects that write to the file system.
    """
    # These loggers store the per-subject model outputs. They cannot be initialized in the constructor because
    # the trainer object will not yet be set, and we need to get the rank from there.
    fixed_logger_columns = {LoggingColumns.CrossValidationSplitIndex.value: self.cross_validation_split_index}
    subject_output_file = get_subject_output_file_per_rank(self.trainer.global_rank)
    self.train_subject_outputs_logger = DataframeLogger(self.train_metrics_folder / subject_output_file,
                                                        fixed_columns=fixed_logger_columns)
    self.val_subject_outputs_logger = DataframeLogger(self.val_metrics_folder / subject_output_file,
                                                      fixed_columns=fixed_logger_columns)
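# A minimal, hedged sketch of why on_train_start creates the loggers per rank: in distributed training,
# each process derives its own output file name from its global rank via get_subject_output_file_per_rank,
# so no two ranks write to the same CSV. The output folder and the world size of 2 below are illustrative
# only; this helper function is not part of the codebase.
def _example_per_rank_output_files() -> None:
    train_metrics_folder = Path("outputs") / ModelExecutionMode.TRAIN.value  # hypothetical output folder
    for rank in range(2):
        # One distinct file per rank, all under the TRAIN metrics folder.
        print(train_metrics_folder / get_subject_output_file_per_rank(rank))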
def test_load_metrics_from_df_with_hues(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test if we can re-create a MetricsDict object with model predictions and labels, when the data file
    contains a prediction target value.
    """
    df_str = """prediction_target,epoch,subject,model_output,label,cross_validation_split_index,data_split
01,1,2137.00005,0.54349,1.0,0,Val
01,1,2137.00125,0.54324,0.0,1,Val
01,1,3250.00005,0.50822,0.0,0,Val
01,1,3250.12345,0.47584,0.0,1,Val
02,1,2137.00005,0.55538,1.0,0,Val
02,1,2137.00125,0.55759,0.0,1,Val
02,1,3250.00005,0.47255,0.0,0,Val
02,1,3250.12345,0.46996,0.0,1,Val
03,1,2137.00005,0.56670,1.0,0,Val
03,1,2137.00125,0.57003,0.0,1,Val
03,1,3250.00005,0.46321,0.0,0,Val
03,1,3250.12345,0.47309,0.0,1,Val
"""
    df = pd.read_csv(StringIO(df_str), converters={LoggingColumns.Hue.value: lambda x: x})
    metrics = ScalarMetricsDict.load_execution_mode_metrics_from_df(df, is_classification_metrics=True)
    mode = ModelExecutionMode.VAL
    epoch = 1
    assert mode in metrics
    assert epoch in metrics[mode]
    metrics_dict = metrics[mode][epoch]
    expected_hues = ["01", "02", "03"]
    assert metrics_dict.get_hue_names(include_default=False) == expected_hues
    for hue in expected_hues:
        assert len(metrics_dict._get_hue(hue).get_predictions()) == 4
    logger_output_file = test_output_dirs.create_file_or_folder_path("output.csv")
    logger = DataframeLogger(csv_path=logger_output_file)
    ScalarMetricsDict.aggregate_and_save_execution_mode_metrics(metrics, logger)
    output = pd.read_csv(logger_output_file, dtype=str)
    assert LoggingColumns.Hue.value in output
    assert list(output[LoggingColumns.Hue.value]) == expected_hues
    assert LoggingColumns.DataSplit.value in output
    assert list(output[LoggingColumns.DataSplit.value].unique()) == [ModelExecutionMode.VAL.value]
    assert LoggingColumns.Epoch.value in output
    assert list(output[LoggingColumns.Epoch.value].unique()) == ["1"]
    assert LoggingColumns.AreaUnderPRCurve.value in output
    assert list(output[LoggingColumns.AreaUnderPRCurve.value]) == ['1.00000', '0.25000', '0.25000']
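# A minimal sketch (not part of the test above) of how the aggregated CSV written by
# aggregate_and_save_execution_mode_metrics could be inspected per prediction target. The column names
# come from LoggingColumns; the CSV path is whatever was passed to the DataframeLogger. This helper is
# illustrative and not part of the codebase.
def _example_read_aggregated_metrics(aggregated_csv: Path) -> None:
    aggregated = pd.read_csv(aggregated_csv, dtype=str)
    # Index the aggregate rows by prediction target ("hue") and look up the area under the PR curve.
    per_hue_auc = aggregated.set_index(LoggingColumns.Hue.value)[LoggingColumns.AreaUnderPRCurve.value]
    # For the data in the test above this prints {'01': '1.00000', '02': '0.25000', '03': '0.25000'}.
    print(per_hue_auc.to_dict())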
def test_dataframe_logger() -> None:
    """
    Test that DataframeLogger writes both the per-record values and the fixed columns to a CSV-formatted buffer.
    """
    fixed_columns = {"cross_validation_split_index": 1}
    records = [
        {"bar": math.pi, MetricType.LEARNING_RATE.value: 1e-5},
        {"bar": math.pi, MetricType.LEARNING_RATE.value: 1},
    ]
    out_buffer = StringIO()
    df = DataframeLogger(csv_path=out_buffer, fixed_columns=fixed_columns)
    for r in records:
        df.add_record(r)
    df.flush()
    assert out_buffer.getvalue().splitlines() == [
        'bar,LearningRate,cross_validation_split_index',
        '3.141593,1.000000e-05,1',
        '3.141593,1.000000e+00,1']
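# A minimal sketch of the same logger writing to a file on disk instead of a StringIO buffer. Only
# csv_path, fixed_columns, add_record and flush are taken from the code above; the temporary-directory
# handling is illustrative, and this helper is not part of the codebase.
def _example_dataframe_logger_to_file() -> None:
    import tempfile
    with tempfile.TemporaryDirectory() as tmp:
        csv_path = Path(tmp) / "metrics.csv"
        logger = DataframeLogger(csv_path=csv_path, fixed_columns={"cross_validation_split_index": 1})
        logger.add_record({"bar": 1.0, MetricType.LEARNING_RATE.value: 1e-4})
        # flush writes the accumulated records, plus the fixed columns, as a CSV file.
        logger.flush()
        print(csv_path.read_text())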
def classification_model_test(config: ScalarModelBase,
                              data_split: ModelExecutionMode,
                              checkpoint_paths: List[Path],
                              model_proc: ModelProcessing,
                              cross_val_split_index: int) -> InferenceMetricsForClassification:
    """
    The main testing loop for classification models. It runs a loop over all epochs for which testing should
    be done. It loads the model and datasets, then proceeds to test the model for all requested checkpoints.
    :param config: The model configuration.
    :param data_split: The name of the folder to store the results inside each epoch folder in the outputs_dir,
        used mainly in model evaluation using different dataset splits.
    :param checkpoint_paths: Checkpoint paths to initialize the model.
    :param model_proc: Whether we are testing an ensemble or single model.
    :param cross_val_split_index: The index of the cross validation split that produced this model.
    :return: An InferenceMetricsForClassification object that contains metrics for all of the checkpoint epochs.
    """
    pipeline = create_inference_pipeline(config=config, checkpoint_paths=checkpoint_paths)
    if pipeline is None:
        raise ValueError("Inference pipeline could not be created.")
    # for mypy
    assert isinstance(pipeline, ScalarInferencePipelineBase)
    ml_util.set_random_seed(config.get_effective_random_seed(), "Model Testing")
    ds = config.get_torch_dataset_for_inference(data_split).as_data_loader(shuffle=False,
                                                                           batch_size=1,
                                                                           num_dataload_workers=0)
    logging.info(f"Starting to evaluate model on {data_split.value} set.")
    results_folder = config.outputs_folder / get_best_epoch_results_path(data_split, model_proc)
    os.makedirs(str(results_folder), exist_ok=True)
    metrics_dict = create_metrics_dict_for_scalar_models(config)
    output_logger: Optional[DataframeLogger] = DataframeLogger(csv_path=results_folder / MODEL_OUTPUT_CSV)
    for sample in ds:
        result = pipeline.predict(sample)
        model_output = result.posteriors
        label = result.labels.to(device=model_output.device)
        sample_id = result.subject_ids[0]
        if output_logger:
            for i in range(len(config.target_names)):
                output_logger.add_record({LoggingColumns.Patient.value: sample_id,
                                          LoggingColumns.Hue.value: config.target_names[i],
                                          LoggingColumns.Label.value: label[0][i].item(),
                                          LoggingColumns.ModelOutput.value: model_output[0][i].item(),
                                          LoggingColumns.CrossValidationSplitIndex.value: cross_val_split_index})
        compute_scalar_metrics(metrics_dict,
                               subject_ids=[sample_id],
                               model_output=model_output,
                               labels=label,
                               loss_type=config.loss_type)
        logging.debug(f"Example {sample_id}: {metrics_dict.to_string()}")
    average = metrics_dict.average(across_hues=False)
    logging.info(average.to_string())
    if isinstance(metrics_dict, ScalarMetricsDict):
        csv_file = results_folder / SUBJECT_METRICS_FILE_NAME
        logging.info(f"Writing {data_split.value} metrics to file {str(csv_file)}")
        # If we are running inference after a training run, the validation set metrics may have been written
        # during train time. If this is not the case, or we are running on the test set, create the metrics
        # file.
        if not csv_file.exists():
            df_logger = DataframeLogger(csv_file)
            # For ensemble models, use the default split index; otherwise record which fold produced this prediction.
            cv_index = DEFAULT_CROSS_VALIDATION_SPLIT_INDEX if model_proc == ModelProcessing.ENSEMBLE_CREATION \
                else cross_val_split_index
            metrics_dict.store_metrics_per_subject(df_logger=df_logger,
                                                   mode=data_split,
                                                   cross_validation_split_index=cv_index,
                                                   epoch=BEST_EPOCH_FOLDER_NAME)
            # write to disk
            df_logger.flush()
    if output_logger:
        output_logger.flush()
    return InferenceMetricsForClassification(metrics=metrics_dict)
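# A minimal, hedged sketch of invoking classification_model_test above for the test split of a single
# (non-ensemble) model. The config object and checkpoint path are placeholders, and the single-model
# ModelProcessing member is assumed to be DEFAULT; the argument names match the signature above.
def _example_run_classification_model_test(config: ScalarModelBase) -> None:
    inference_metrics = classification_model_test(
        config=config,
        data_split=ModelExecutionMode.TEST,
        checkpoint_paths=[Path("outputs/checkpoints/best_checkpoint.ckpt")],  # hypothetical checkpoint path
        model_proc=ModelProcessing.DEFAULT,  # assumed member name for a single, non-ensemble model
        cross_val_split_index=DEFAULT_CROSS_VALIDATION_SPLIT_INDEX)
    # Log the per-target averages, as computed inside the testing loop.
    logging.info(inference_metrics.metrics.average(across_hues=False).to_string())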
def plot_cross_validation_from_files(config_and_files: OfflineCrossvalConfigAndFiles,
                                     root_folder: Path,
                                     is_ensemble_run: bool = False) -> None:
    """
    Runs various plots for the results of a cross validation run, and writes them to a given folder.
    :param config_and_files: The setup for plotting results and the set of data files to analyse.
    :param root_folder: The folder into which the results should be written.
    :param is_ensemble_run: If True, assume that this run of cross validation analysis is for an ensemble model
        and assert that there are N+1 data files available. If False, this analysis only concerns the cross
        validation runs, and check that the number of files is N.
    """
    config = config_and_files.config
    if config.number_of_cross_validation_splits > 1:
        check_result_file_counts(config_and_files, is_ensemble_run=is_ensemble_run)
    result_files = config_and_files.files
    metrics_dfs = load_dataframes(result_files, config)
    full_csv_file = root_folder / FULL_METRICS_DATAFRAME_FILE
    initial_metrics = pd.concat(list(metrics_dfs.values()))
    if config.model_category == ModelCategory.Segmentation:
        if config.create_plots:
            plot_metrics(config, metrics_dfs, root_folder)
        save_outliers(config, metrics_dfs, root_folder)
        all_metrics, focus_splits = add_comparison_data(config, initial_metrics)
        all_metrics.to_csv(full_csv_file, index=False)
        run_statistical_tests_on_file(root_folder, full_csv_file, config, focus_splits)
    else:
        # For classification runs, we also want to compute the aggregated training metrics for each fold.
        metrics = ScalarMetricsDict.load_execution_mode_metrics_from_df(
            initial_metrics,
            config.model_category == ModelCategory.Classification)
        ScalarMetricsDict.aggregate_and_save_execution_mode_metrics(
            metrics=metrics,
            data_frame_logger=DataframeLogger(csv_path=root_folder / METRICS_AGGREGATES_FILE))
        # The full metrics file saves the prediction for each individual subject. Do not include the training
        # results in this file (as in cross-validation a subject is used in several folds.)
        val_and_test_metrics = initial_metrics.loc[
            initial_metrics[LoggingColumns.DataSplit.value] != ModelExecutionMode.TRAIN.value]
        val_and_test_metrics.to_csv(full_csv_file, index=False)
    # Copy one instance of the dataset.csv files to the root of the results folder. It is possible
    # that the different CV folds run with different dataset files, but this is not expected for classification
    # models at the moment (could change with ensemble models).
    dataset_csv = None
    for file in result_files:
        if file.dataset_csv_file:
            dataset_csv = file.dataset_csv_file
            break
    if dataset_csv:
        shutil.copy(str(dataset_csv), str(root_folder))
    name_dct = config_and_files.config.short_names
    if name_dct:
        pairs = [(val, key) for key, val in name_dct.items()]
        with Path(root_folder / RUN_DICTIONARY_NAME).open("w") as out:
            max_len = max(len(short_name) for short_name, _ in pairs)
            for short_name, long_name in sorted(pairs):
                out.write(f"{short_name:{max_len}s} {long_name}\n")
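# A minimal sketch (with made-up run names) of the format that the run dictionary loop above produces:
# each line contains the short name, padded on the right to the length of the longest short name,
# followed by the full run name. This helper is illustrative and not part of the codebase.
def _example_run_dictionary_format() -> None:
    pairs = [("run_A", "HD_e5f0c3a1_0"), ("run_BB", "HD_e5f0c3a1_1")]  # hypothetical (short, long) names
    max_len = max(len(short_name) for short_name, _ in pairs)
    for short_name, long_name in sorted(pairs):
        print(f"{short_name:{max_len}s} {long_name}")
    # Prints:
    # run_A  HD_e5f0c3a1_0
    # run_BB HD_e5f0c3a1_1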
def classification_model_test(config: ScalarModelBase,
                              data_split: ModelExecutionMode,
                              checkpoint_handler: CheckpointHandler,
                              model_proc: ModelProcessing) -> InferenceMetricsForClassification:
    """
    The main testing loop for classification models. It runs a loop over all epochs for which testing should
    be done. It loads the model and datasets, then proceeds to test the model for all requested checkpoints.
    :param config: The model configuration.
    :param data_split: The name of the folder to store the results inside each epoch folder in the outputs_dir,
        used mainly in model evaluation using different dataset splits.
    :param checkpoint_handler: Checkpoint handler object to find checkpoint paths for model initialization.
    :param model_proc: Whether we are testing an ensemble or single model.
    :return: An InferenceMetricsForClassification object that contains metrics for all of the checkpoint epochs.
    """

    def test_epoch(checkpoint_paths: List[Path]) -> Optional[MetricsDict]:
        pipeline = create_inference_pipeline(config=config, checkpoint_paths=checkpoint_paths)
        if pipeline is None:
            return None
        # for mypy
        assert isinstance(pipeline, ScalarInferencePipelineBase)
        ml_util.set_random_seed(config.get_effective_random_seed(), "Model Testing")
        ds = config.get_torch_dataset_for_inference(data_split).as_data_loader(shuffle=False,
                                                                               batch_size=1,
                                                                               num_dataload_workers=0)
        logging.info(f"Starting to evaluate model on {data_split.value} set.")
        metrics_dict = create_metrics_dict_for_scalar_models(config)
        for sample in ds:
            result = pipeline.predict(sample)
            model_output = result.posteriors
            label = result.labels.to(device=model_output.device)
            sample_id = result.subject_ids[0]
            compute_scalar_metrics(metrics_dict,
                                   subject_ids=[sample_id],
                                   model_output=model_output,
                                   labels=label,
                                   loss_type=config.loss_type)
            logging.debug(f"Example {sample_id}: {metrics_dict.to_string()}")
        average = metrics_dict.average(across_hues=False)
        logging.info(average.to_string())
        return metrics_dict

    checkpoints_to_test = checkpoint_handler.get_checkpoints_to_test()
    if not checkpoints_to_test:
        raise ValueError("There were no checkpoints available for model testing.")
    result = test_epoch(checkpoint_paths=checkpoints_to_test)
    if result is None:
        raise ValueError("There was no single checkpoint file available for model testing.")
    else:
        if isinstance(result, ScalarMetricsDict):
            results_folder = config.outputs_folder / get_epoch_results_path(data_split, model_proc)
            csv_file = results_folder / SUBJECT_METRICS_FILE_NAME
            logging.info(f"Writing {data_split.value} metrics to file {str(csv_file)}")
            # If we are running inference after a training run, the validation set metrics may have been written
            # during train time. If this is not the case, or we are running on the test set, create the metrics
            # file.
            if not csv_file.exists():
                os.makedirs(str(results_folder), exist_ok=False)
                df_logger = DataframeLogger(csv_file)
                # The cross validation split index is not relevant during test time.
                result.store_metrics_per_subject(df_logger=df_logger, mode=data_split)
                # write to disk
                df_logger.flush()
    return InferenceMetricsForClassification(metrics=result)