Example #1
def aggregate_segmentation_metrics(metrics: MetricsDict) -> MetricsDict:
    """
    Computes aggregate metrics for segmentation models, from a metrics dictionary that contains the results for
    individual minibatches. Specifically, average Dice scores for only the foreground structures and proportions
    of foreground voxels are computed. All metrics for the background class will be removed.
    All other metrics that are already present in the input metrics will be averaged and available in the result.
    Diagnostic values present in the input will be passed through unchanged.
    :param metrics: A metrics dictionary that contains the per-minibatch results.
    """
    class_names_with_background = metrics.get_hue_names(include_default=False)
    has_background_class = class_names_with_background[0] == BACKGROUND_CLASS_NAME
    foreground_classes = class_names_with_background[1:] if has_background_class else class_names_with_background
    result = metrics.average(across_hues=False)
    result.diagnostics = metrics.diagnostics.copy()
    if has_background_class:
        result.delete_hue(BACKGROUND_CLASS_NAME)
    add_average_foreground_dice(result)
    # Total number of voxels per class, including the background class
    total_voxels = []
    voxel_count = MetricType.VOXEL_COUNT.value
    for g in class_names_with_background:
        values = metrics.values(hue=g)
        if voxel_count in values:
            total_voxels.append(sum(values[voxel_count]))
    if len(total_voxels) > 0:
        # Proportion of voxels in foreground classes only
        proportion_foreground = np.array(total_voxels[1:], dtype=float) / sum(total_voxels)
        for i, foreground_class in enumerate(foreground_classes):
            result.add_metric(MetricType.PROPORTION_FOREGROUND_VOXELS, proportion_foreground[i], hue=foreground_class)
        result.add_metric(MetricType.PROPORTION_FOREGROUND_VOXELS, np.sum(proportion_foreground).item())
    return result
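# Minimal usage sketch (not part of the original source; the class name "Liver" and all numbers
# are made up, chosen to be exactly representable). Per-minibatch Dice scores and voxel counts
# are added per class; aggregation then drops the background hue, averages per-class metrics,
# and adds the average foreground Dice and the foreground voxel proportions.
def _sketch_aggregate_segmentation_metrics_usage() -> None:
    per_batch = MetricsDict(hues=[BACKGROUND_CLASS_NAME, "Liver"])
    for background_dice, liver_dice in [(0.25, 0.5), (0.75, 1.0)]:
        per_batch.add_metric(MetricType.DICE, background_dice, hue=BACKGROUND_CLASS_NAME)
        per_batch.add_metric(MetricType.VOXEL_COUNT, 90, hue=BACKGROUND_CLASS_NAME)
        per_batch.add_metric(MetricType.DICE, liver_dice, hue="Liver")
        per_batch.add_metric(MetricType.VOXEL_COUNT, 10, hue="Liver")
    aggregated = aggregate_segmentation_metrics(per_batch)
    # The background hue is gone, and the per-class Dice values are averaged across minibatches.
    assert BACKGROUND_CLASS_NAME not in aggregated.get_hue_names(include_default=False)
    assert aggregated.get_single_metric(MetricType.DICE, hue="Liver") == 0.75
    # The average foreground Dice is stored in the default hue.
    assert aggregated.get_single_metric(MetricType.DICE) == 0.75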
def test_classification_metrics_avg() -> None:
    hue1 = "H1"
    hue2 = "H2"
    m = MetricsDict(hues=[hue1, hue2], is_classification_metrics=True)
    m.add_metric("foo", 1.0)
    m.add_metric("foo", 2.0)
    # Perfect predictions for hue1, should give AUC == 1.0
    m.add_predictions(["S1", "S2"], np.array([0.0, 1.0]), np.array([0.0, 1.0]), hue=hue1)
    expected_hue1_auc = 1.0
    # Worst possible predictions for hue2, should give AUC == 0.0
    m.add_predictions(["S1", "S2"], np.array([1.0, 0.0]), np.array([0.0, 1.0]), hue=hue2)
    expected_hue2_auc = 0.0
    averaged = m.average(across_hues=False)
    g1_averaged = averaged.values(hue=hue1)
    assert MetricType.AREA_UNDER_ROC_CURVE.value in g1_averaged
    assert g1_averaged[MetricType.AREA_UNDER_ROC_CURVE.value] == [expected_hue1_auc]
    assert MetricType.AREA_UNDER_PR_CURVE.value in g1_averaged
    assert MetricType.SUBJECT_COUNT.value in g1_averaged
    assert g1_averaged[MetricType.SUBJECT_COUNT.value] == [2.0]
    default_averaged = averaged.values()
    assert default_averaged == {"foo": [1.5]}
    can_enumerate = list(averaged.enumerate_single_values())
    assert len(can_enumerate) >= 8
    assert can_enumerate[0] == (hue1, MetricType.AREA_UNDER_ROC_CURVE.value, 1.0)
    assert can_enumerate[-1] == (MetricsDict.DEFAULT_HUE_KEY, "foo", 1.5)

    g2_averaged = averaged.values(hue=hue2)
    assert MetricType.AREA_UNDER_ROC_CURVE.value in g2_averaged
    assert g2_averaged[MetricType.AREA_UNDER_ROC_CURVE.value] == [expected_hue2_auc]

    averaged_across_hues = m.average(across_hues=True)
    assert averaged_across_hues.get_hue_names() == [MetricsDict.DEFAULT_HUE_KEY]
    assert MetricType.AREA_UNDER_ROC_CURVE.value in averaged_across_hues.values()
    expected_averaged_auc = 0.5 * (expected_hue1_auc + expected_hue2_auc)
    assert averaged_across_hues.values()[MetricType.AREA_UNDER_ROC_CURVE.value] == [expected_averaged_auc]
def test_add_foreground_dice() -> None:
    g1 = "Liver"
    g2 = "Lung"
    ground_truth_ids = [BACKGROUND_CLASS_NAME, g1, g2]
    dice = [0.85, 0.75, 0.55]
    m = MetricsDict(hues=ground_truth_ids)
    for j, ground_truth_id in enumerate(ground_truth_ids):
        m.add_metric(MetricType.DICE, dice[j], hue=ground_truth_id)
    metrics.add_average_foreground_dice(m)
    assert m.get_single_metric(MetricType.DICE) == 0.5 * (dice[1] + dice[2])
def test_diagnostics() -> None:
    """
    Test if we can store diagnostic values (no restrictions on data types) in the metrics dict.
    """
    name = "foo"
    value1 = "something"
    value2 = (1, 2, 3)
    m = MetricsDict()
    m.add_diagnostics(name, value1)
    m.add_diagnostics(name, value2)
    assert m.diagnostics == {name: [value1, value2]}
Example #5
def add_average_foreground_dice(metrics: MetricsDict) -> None:
    """
    If the given metrics dictionary contains an entry for Dice score, and only one value for the Dice score per class,
    then add an average Dice score for all foreground classes to the metrics dictionary (modified in place).
    :param metrics: The object that holds metrics. The average Dice score will be written back into this object.
    """
    all_dice = []
    for structure_name in metrics.get_hue_names(include_default=False):
        if structure_name != BACKGROUND_CLASS_NAME:
            all_dice.append(metrics.get_single_metric(MetricType.DICE, hue=structure_name))
    metrics.add_metric(MetricType.DICE, np.nanmean(all_dice).item())
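# Illustrative note (not from the original source): np.nanmean skips NaN entries, so structures
# whose Dice is NaN (e.g. missing in both prediction and ground truth, see
# calculate_metrics_per_class below) do not drag the foreground average to NaN:
#     np.nanmean([1.0, float("nan"), 0.5])  # -> 0.75, whereas np.mean would return nan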
def test_metrics_dict_to_string_with_hues() -> None:
    """
    Test that a MetricsDict with hues can be converted to a string correctly.
    """
    m = MetricsDict(hues=["G1"])
    m.add_metric("foo", 1.0)
    m.add_metric("bar", math.pi, hue="G1")
    m.add_metric("baz", 2.0, hue="G1")
    # Build the expected dataframe from a list of rows (DataFrame.append is removed in recent pandas).
    expected_rows = [{MetricsDict.DATAFRAME_COLUMNS[0]: "G1",
                      MetricsDict.DATAFRAME_COLUMNS[1]: "bar: 3.1416, baz: 2.0000"},
                     {MetricsDict.DATAFRAME_COLUMNS[0]: MetricsDict.DEFAULT_HUE_KEY,
                      MetricsDict.DATAFRAME_COLUMNS[1]: "foo: 1.0000"}]
    info_df = pd.DataFrame(expected_rows, columns=MetricsDict.DATAFRAME_COLUMNS)
    assert m.to_string() == tabulate_dataframe(info_df)
    assert m.to_string(tabulate=False) == info_df.to_string(index=False)
Example #7
def store_epoch_metrics(
        azure_and_tensorboard_logger: AzureAndTensorboardLogger,
        df_logger: DataframeLogger, epoch: int, metrics: MetricsDict,
        learning_rates: List[float], config: ModelConfigBase) -> None:
    """
    Writes the loss, Dice scores, and learning rates into a file for Tensorboard visualization,
    and into the AzureML run context.
    :param azure_and_tensorboard_logger: An instance of AzureAndTensorboardLogger.
    :param df_logger: An instance of DataframeLogger, for logging results to csv.
    :param epoch: The epoch corresponding to the results.
    :param metrics: The metrics of the specified epoch, averaged along its batches.
    :param learning_rates: The logged learning rates.
    :param config: The model configuration; must be either a segmentation or a scalar model configuration.
    """
    if config.is_segmentation_model:
        azure_and_tensorboard_logger.log_segmentation_epoch_metrics(
            metrics, learning_rates)
        logger_row = {
            LoggingColumns.Dice.value:
            metrics.get_single_metric(MetricType.DICE),
            LoggingColumns.Loss.value:
            metrics.get_single_metric(MetricType.LOSS),
            LoggingColumns.SecondsPerEpoch.value:
            metrics.get_single_metric(MetricType.SECONDS_PER_EPOCH)
        }

    elif config.is_scalar_model:
        assert isinstance(metrics, MetricsDict)
        azure_and_tensorboard_logger.log_classification_epoch_metrics(metrics)
        logger_row: Dict[str, float] = {}  # type: ignore
        for hue_name, metric_name, metric_value in metrics.enumerate_single_values():
            logging_column_name = get_column_name_for_logging(
                metric_name, hue_name=hue_name)
            logger_row[logging_column_name] = metric_value
    else:
        raise ValueError(
            "Model must be either classification, regression or segmentation model"
        )

    logger_row.update({
        LoggingColumns.Epoch.value:
        epoch,
        LoggingColumns.CrossValidationSplitIndex.value:
        config.cross_validation_split_index
    })

    df_logger.add_record(logger_row)
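# Illustration (not from the original source): for a segmentation model the logged csv row holds
# the Dice, Loss and SecondsPerEpoch columns; for a scalar model it holds one column per
# (hue, metric) pair, named via get_column_name_for_logging. In both cases the Epoch and
# CrossValidationSplitIndex columns are added before the row is written.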
def test_delete_hue() -> None:
    h1 = "a"
    h2 = "b"
    a = MetricsDict(hues=[h1, h2])
    a.add_metric("foo", 1.0, hue=h1)
    a.add_metric("bar", 2.0, hue=h2)
    a.delete_hue(h1)
    assert a.get_hue_names(include_default=False) == [h2]
    assert list(a.enumerate_single_values()) == [(h2, "bar", 2.0)]
def test_delete_metric() -> None:
    """
    Test that deleting a metric removes all of its stored values from the dictionary.
    """
    m = MetricsDict()
    m.add_metric(MetricType.LOSS, 1)
    assert m.values()[MetricType.LOSS.value] == [1.0]
    m.delete_metric(MetricType.LOSS)
    assert MetricType.LOSS.value not in m.values()
Example #10
    def log_classification_epoch_metrics(self, metrics: MetricsDict) -> None:
        """
        Writes all values from a MetricsDict object into a file for Tensorboard visualization,
        and into the AzureML run context.
        :param metrics: The dictionary containing the metrics to be logged, averaged over minibatches.
        """
        for hue_name, label, metric in metrics.enumerate_single_values():
            self.log_to_azure_and_tensorboard(get_metric_name_with_hue_prefix(label, hue_name), metric)
Example #11
def store_epoch_stats_for_segmentation(
        outputs_dir: Path, epoch: int, learning_rates: List[float],
        training_results: MetricsDict,
        validation_results: MetricsDict) -> None:
    """
    Writes a dictionary of statistics for a segmentation training run to a file. Successive calls to the function
    append another line of metrics. The first line of the file contains the column headers (names of the metrics).
    :param training_results: A MetricsDict object with all metrics that were achieved on the training set in the
    current epoch.
    :param validation_results: A MetricsDict object with all metrics that were achieved on the validation set in the
    current epoch.
    :param learning_rates: The learning rates that were used in the current epoch.
    :param epoch: The number of the current training epoch.
    :param outputs_dir: The directory in which the statistics file should be created.
    :return:
    """
    epoch_stats = {
        "Epoch":
        str(epoch),
        "LearningRate":
        format_learning_rates(learning_rates),
        "TrainLoss":
        metrics_util.format_metric(
            training_results.get_single_metric(MetricType.LOSS)),
        "TrainDice":
        metrics_util.format_metric(
            training_results.get_single_metric(MetricType.DICE)),
        "ValLoss":
        metrics_util.format_metric(
            validation_results.get_single_metric(MetricType.LOSS)),
        "ValDice":
        metrics_util.format_metric(
            validation_results.get_single_metric(MetricType.DICE)),
    }
    # Use a plain "\n" separator: writing os.linesep in text mode inserts extra newline characters
    # on Windows, whereas "\n" gives the expected behaviour on both Windows and Linux.
    line_sep = "\n"
    tab = "\t"
    full_file = outputs_dir / TRAIN_STATS_FILE
    if not full_file.exists():
        header = tab.join(epoch_stats.keys())
        full_file.write_text(header + line_sep)
    line = tab.join(epoch_stats.values())
    with full_file.open("a") as f:
        f.write(line + line_sep)
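# Illustration (not from the original source; all values are made up): after two epochs the stats
# file contains a tab-separated header plus one line per call, roughly:
#
#   Epoch   LearningRate    TrainLoss   TrainDice   ValLoss   ValDice
#   1       1e-03           0.345       0.612       0.401     0.588
#   2       9e-04           0.301       0.655       0.380     0.620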
Example #12
    def log_segmentation_epoch_metrics(self, metrics: MetricsDict,
                                       learning_rates: List[float]) -> None:
        """
        Logs segmentation metrics (e.g. loss, dice scores, learning rates) to an event file for TensorBoard
        visualization and to the AzureML run context
        :param learning_rates: The logged learning rates.
        :param metrics: The metrics of the specified epoch, averaged along its batches.
        """
        logging_fn = self.log_to_azure_and_tensorboard
        logging_fn(MetricType.LOSS.value,
                   metrics.get_single_metric(MetricType.LOSS))
        logging_fn("Dice/AverageExceptBackground",
                   metrics.get_single_metric(MetricType.DICE))
        logging_fn(
            "Voxels/ProportionForeground",
            metrics.get_single_metric(MetricType.PROPORTION_FOREGROUND_VOXELS))
        logging_fn("TimePerEpoch_Seconds",
                   metrics.get_single_metric(MetricType.SECONDS_PER_EPOCH))

        if learning_rates is not None:
            for i, lr in enumerate(learning_rates):
                logging_fn("LearningRate/Index_{}".format(i), lr)

        for class_name in metrics.get_hue_names(include_default=False):
            # TensorBoard groups metrics by the prefix before the slash: with metrics Dice/Foo
            # and Dice/Bar, it creates a section "Dice" containing graphs for Foo and Bar.
            get_label = lambda x, y: "{}/{}".format(x, y)
            logging_fn(
                get_label("Dice", class_name),
                metrics.get_single_metric(MetricType.DICE, hue=class_name))
            logging_fn(
                get_label("Voxels", class_name),
                metrics.get_single_metric(
                    MetricType.PROPORTION_FOREGROUND_VOXELS, hue=class_name))
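    # Illustration (not from the original source): for foreground classes ["Liver", "Lung"]
    # (hypothetical names), the calls above emit tags such as
    #   Dice/AverageExceptBackground, Dice/Liver, Dice/Lung,
    #   Voxels/ProportionForeground, Voxels/Liver, Voxels/Lung,
    #   LearningRate/Index_0
    # so TensorBoard groups them into "Dice", "Voxels" and "LearningRate" sections.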
def test_metrics_dict_get_hues() -> None:
    """
    Test that a MetricsDict is configured correctly both with and without hues.
    """
    m = MetricsDict()
    assert m.get_hue_names() == [MetricsDict.DEFAULT_HUE_KEY]
    assert m.get_hue_names(include_default=False) == []
    _hues = ["A", "B", "C"]
    m = MetricsDict(hues=_hues)
    assert m.get_hue_names() == _hues + [MetricsDict.DEFAULT_HUE_KEY]
    assert m.get_hue_names(include_default=False) == _hues
Example #14
    def __init__(self, model_config: SegmentationModelBase,
                 train_val_params: TrainValidateParameters[DeviceAwareModule]):
        """
        Creates a new instance of the class.
        :param model_config: The configuration of a segmentation model.
        :param train_val_params: The parameters for training the model, including the optimizer and the data loaders.
        """
        super().__init__(model_config, train_val_params)
        self.example_to_save = np.random.randint(
            0, len(train_val_params.data_loader))
        self.pipeline = SegmentationForwardPass(
            model=self.train_val_params.model,
            model_config=self.model_config,
            batch_size=self.model_config.train_batch_size,
            optimizer=self.train_val_params.optimizer,
            in_training_mode=self.train_val_params.in_training_mode,
            criterion=self.compute_loss,
            gradient_scaler=train_val_params.gradient_scaler)
        self.metrics = MetricsDict(hues=[BACKGROUND_CLASS_NAME] +
                                   model_config.ground_truth_ids)
def test_metrics_dict_average_additional_metrics() -> None:
    """
    Test if computing the ROC entries and metrics at optimal threshold with MetricsDict.average() works
    as expected and returns the correct values.
    """
    # Prepare a vector of predictions and labels.
    predictions = np.array([0.5, 0.6, 0.1, 0.8, 0.2, 0.9])
    labels = np.array([0, 1.0, 0, 0, 1, 1], dtype=float)
    split_length = [3, 2, 1]

    # Get MetricsDict
    assert sum(split_length) == len(predictions)
    summed = np.cumsum(split_length)
    # MetricsDict will get that supplied in 3 chunks.
    m = MetricsDict()
    for i, end in enumerate(summed):
        start = 0 if i == 0 else summed[i - 1]
        pred = predictions[start:end]
        label = labels[start:end]
        subject_ids = list(range(len(pred)))
        m.add_predictions(subject_ids, pred, label)
    assert m.has_prediction_entries

    # Compute average MetricsDict
    averaged = m.average()

    # Compute additional expected metrics for the averaged MetricsDict
    expected_auc = roc_auc_score(labels, predictions)
    expected_fpr, expected_tpr, thresholds = roc_curve(labels, predictions)
    expected_optimal_idx = np.argmax(expected_tpr - expected_fpr)
    expected_optimal_threshold = float(thresholds[expected_optimal_idx])
    expected_accuracy = np.mean((predictions > expected_optimal_threshold) == labels)

    # Check computed values against expected
    assert averaged.values()[MetricType.OPTIMAL_THRESHOLD.value][0] == pytest.approx(expected_optimal_threshold)
    assert averaged.values()[MetricType.ACCURACY_AT_OPTIMAL_THRESHOLD.value][0] == pytest.approx(expected_accuracy)
    assert averaged.values()[MetricType.FALSE_POSITIVE_RATE_AT_OPTIMAL_THRESHOLD.value][0] == \
           pytest.approx(expected_fpr[expected_optimal_idx])
    assert averaged.values()[MetricType.FALSE_NEGATIVE_RATE_AT_OPTIMAL_THRESHOLD.value][0] == \
           pytest.approx(1 - expected_tpr[expected_optimal_idx])
    assert averaged.values()[MetricType.AREA_UNDER_ROC_CURVE.value][0] == pytest.approx(expected_auc, 1e-6)
def test_metrics_store_mixed_hues() -> None:
    """
    Test that a MetricsDict handles both default and non-default hues.
    """
    m = MetricsDict(hues=["A", "B"])
    m.add_metric("foo", 1)
    m.add_metric("foo", 1, hue="B")
    m.add_metric("bar", 2, hue="A")
    assert list(m.enumerate_single_values()) == \
           [('A', 'bar', 2), ('B', 'foo', 1), (MetricsDict.DEFAULT_HUE_KEY, 'foo', 1)]
def test_metrics_dict_roc() -> None:
    """
    Test if adding ROC entries to a MetricsDict instance works, and returns the correct AUC.
    """
    # Prepare a vector of predictions and labels. We can compute AUC off those to compare.
    # MetricsDict will get that supplied in 3 chunks, and should return the same AUC value.
    predictions = np.array([0.5, 0.6, 0.1, 0.8, 0.2, 0.9])
    labels = np.array([0, 1.0, 0, 0, 1, 1], dtype=float)
    split_length = [3, 2, 1]
    assert sum(split_length) == len(predictions)
    summed = np.cumsum(split_length)
    m = MetricsDict()
    for i, end in enumerate(summed):
        start = 0 if i == 0 else summed[i - 1]
        pred = predictions[start:end]
        label = labels[start:end]
        subject_ids = list(range(len(pred)))
        m.add_predictions(subject_ids, pred, label)
    assert m.has_prediction_entries
    actual_auc = m.get_roc_auc()
    expected_auc = roc_auc_score(labels, predictions)
    assert actual_auc == pytest.approx(expected_auc, 1e-6)
    actual_pr_auc = m.get_pr_auc()
    expected_pr_auc = 0.7111111
    assert actual_pr_auc == pytest.approx(expected_pr_auc, 1e-6)
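def _sketch_expected_pr_auc() -> float:
    # Sketch (not part of the original source): shows where the 0.7111111 constant above comes
    # from, using the same precision-recall computation as get_metric (Example #22) below.
    from sklearn.metrics import auc, precision_recall_curve
    predictions = np.array([0.5, 0.6, 0.1, 0.8, 0.2, 0.9])
    labels = np.array([0, 1.0, 0, 0, 1, 1])
    precision, recall, _ = precision_recall_curve(labels, predictions)
    return auc(recall, precision)  # ~0.7111111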
def test_metrics_dict_add_integer() -> None:
    """
    Adding a scalar metric where the value is an integer by accident should still store the metric.
    """
    m = MetricsDict()
    m.add_metric("foo", 1)
    assert "foo" in m.values()
    assert m.values()["foo"] == [1.0]
@pytest.mark.parametrize("hues", [None, ["A", "B"]])
def test_metrics_dict_flatten(hues: Optional[List[str]]) -> None:
    m = MetricsDict(hues=hues)
    _hues = hues or [MetricsDict.DEFAULT_HUE_KEY] * 2
    m.add_metric("foo", 1.0, hue=_hues[0])
    m.add_metric("foo", 2.0, hue=_hues[1])
    m.add_metric("bar", 3.0, hue=_hues[0])
    m.add_metric("bar", 4.0, hue=_hues[1])

    if hues is None:
        average = m.average(across_hues=True)
        # We should be able to flatten out all the singleton values that the `average` operation returns
        all_values = list(average.enumerate_single_values())
        assert all_values == [(MetricsDict.DEFAULT_HUE_KEY, "foo", 1.5), (MetricsDict.DEFAULT_HUE_KEY, "bar", 3.5)]
        # When trying to flatten off a dictionary that has two values, this should fail:
        with pytest.raises(ValueError) as ex:
            list(m.enumerate_single_values())
        assert "only hold 1 item" in str(ex)
    else:
        average = m.average(across_hues=False)
        all_values = list(average.enumerate_single_values())
        assert all_values == [('A', 'foo', 1.0), ('A', 'bar', 3.0), ('B', 'foo', 2.0), ('B', 'bar', 4.0)]
Example #20
def get_correct_and_misclassified_examples(val_metrics_csv: Path,
                                           test_metrics_csv: Path) -> Results:
    """
    Given the paths to the metrics files for the validation and test sets, get a list of true positives,
    false positives, false negatives and true negatives.
    The threshold for classification is obtained by looking at the validation file, and applied to the test set to get
    label predictions.
    """
    df_val = pd.read_csv(val_metrics_csv)

    if not df_val[LoggingColumns.Patient.value].is_unique:
        raise ValueError(
            f"Subject IDs should be unique, but found duplicate entries "
            f"in column {LoggingColumns.Patient.value} in the csv file.")

    fpr, tpr, thresholds = roc_curve(df_val[LoggingColumns.Label.value],
                                     df_val[LoggingColumns.ModelOutput.value])
    optimal_idx = MetricsDict.get_optimal_idx(fpr=fpr, tpr=tpr)
    optimal_threshold = thresholds[optimal_idx]

    df_test = pd.read_csv(test_metrics_csv)

    if not df_test[LoggingColumns.Patient.value].is_unique:
        raise ValueError(
            f"Subject IDs should be unique, but found duplicate entries "
            f"in column {LoggingColumns.Patient.value} in the csv file.")

    df_test["predicted"] = df_test.apply(lambda x: int(x[
        LoggingColumns.ModelOutput.value] >= optimal_threshold),
                                         axis=1)

    true_positives = df_test[(df_test["predicted"] == 1)
                             & (df_test[LoggingColumns.Label.value] == 1)]
    false_positives = df_test[(df_test["predicted"] == 1)
                              & (df_test[LoggingColumns.Label.value] == 0)]
    false_negatives = df_test[(df_test["predicted"] == 0)
                              & (df_test[LoggingColumns.Label.value] == 1)]
    true_negatives = df_test[(df_test["predicted"] == 0)
                             & (df_test[LoggingColumns.Label.value] == 0)]

    return Results(true_positives=true_positives,
                   true_negatives=true_negatives,
                   false_positives=false_positives,
                   false_negatives=false_negatives)
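# Note (assumption based on test_metrics_dict_average_additional_metrics above, not on the
# implementation of MetricsDict.get_optimal_idx, which is not shown here): the optimal index
# appears to maximise Youden's J statistic, i.e. the standalone equivalent would be
#     optimal_idx = np.argmax(tpr - fpr)
# The threshold is chosen on the validation set only and then applied unchanged to the test set.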
def test_metrics_dict_roc_degenerate() -> None:
    """
    Test that adding ROC entries to a MetricsDict instance works when only one class is present.
    """
    # Prepare a vector of predictions where all labels are positive. In this degenerate case,
    # both ROC AUC and PR AUC should default to 1.0.
    predictions = np.array([0.5, 0.6, 0.1, 0.8, 0.2, 0.9])
    m = MetricsDict()
    subject_ids = list(range(len(predictions)))
    m.add_predictions(subject_ids, predictions, np.ones_like(predictions))
    assert m.has_prediction_entries
    assert m.get_roc_auc() == 1.0
    assert m.get_pr_auc() == 1.0
Example #22
def get_metric(val_metrics_csv: Path, test_metrics_csv: Path,
               metric: ReportedMetrics) -> float:
    """
    Given the validation and test metrics csv files, read the predicted values and ground truth labels and return
    the specified metric, using the classification threshold derived from the validation set where needed.
    """
    results_val = get_results(val_metrics_csv)
    fpr, tpr, thresholds = roc_curve(results_val.labels,
                                     results_val.model_outputs)
    optimal_idx = MetricsDict.get_optimal_idx(fpr=fpr, tpr=tpr)
    optimal_threshold = thresholds[optimal_idx]

    if metric is ReportedMetrics.OptimalThreshold:
        return optimal_threshold

    results_test = get_results(test_metrics_csv)

    if metric is ReportedMetrics.AUC_ROC:
        return roc_auc_score(results_test.labels, results_test.model_outputs)
    elif metric is ReportedMetrics.AUC_PR:
        precision, recall, _ = precision_recall_curve(
            results_test.labels, results_test.model_outputs)
        return auc(recall, precision)
    elif metric is ReportedMetrics.Accuracy:
        return binary_classification_accuracy(
            model_output=results_test.model_outputs,
            label=results_test.labels,
            threshold=optimal_threshold)
    elif metric is ReportedMetrics.FalsePositiveRate:
        tnr = recall_score(results_test.labels,
                           results_test.model_outputs >= optimal_threshold,
                           pos_label=0)
        return 1 - tnr
    elif metric is ReportedMetrics.FalseNegativeRate:
        return 1 - recall_score(
            results_test.labels,
            results_test.model_outputs >= optimal_threshold)
    else:
        raise ValueError("Unknown metric")
def train_or_validate_epoch(
        training_steps: ModelTrainingStepsBase
) -> ModelOutputsAndMetricsForEpoch:
    """
    Trains or validates the model for one epoch.
    :param training_steps: Training pipeline to use.
    :returns: The results for training or validation. Result type depends on the type of model that is trained.
    """
    epoch_start_time = time()
    training_random_state = None
    train_val_params = training_steps.train_val_params
    config = training_steps.model_config
    if not train_val_params.in_training_mode:
        # take the snapshot of the existing random state
        training_random_state = RandomStateSnapshot.snapshot_random_state()
        # reset the random state for validation
        ml_util.set_random_seed(config.get_effective_random_seed(),
                                "Model validation")

    status_string = "training" if train_val_params.in_training_mode else "validation"
    item_start_time = time()
    num_load_time_warnings = 0
    num_load_time_exceeded = 0
    num_batches = 0
    total_extra_load_time = 0.0
    total_load_time = 0.0
    model_outputs_epoch = []
    for batch_index, sample in enumerate(train_val_params.data_loader):
        item_finish_time = time()
        item_load_time = item_finish_time - item_start_time
        # Having slow minibatch loading is OK in the very first batch of every epoch, where processes
        # are spawned. Later, the load time should be zero.
        if batch_index == 0:
            logging.info(
                f"Loaded the first minibatch of {status_string} data in {item_load_time:0.2f} sec."
            )
        elif item_load_time > MAX_ITEM_LOAD_TIME_SEC:
            num_load_time_exceeded += 1
            total_extra_load_time += item_load_time
            if num_load_time_warnings < MAX_LOAD_TIME_WARNINGS:
                logging.warning(
                    f"Loading {status_string} minibatch {batch_index} took {item_load_time:0.2f} sec. "
                    f"This can mean that there are not enough data loader worker processes, or that there "
                    f"is a "
                    f"performance problem in loading. This warning will be printed at most "
                    f"{MAX_LOAD_TIME_WARNINGS} times.")
                num_load_time_warnings += 1
        model_outputs_minibatch = training_steps.forward_and_backward_minibatch(
            sample, batch_index, train_val_params.epoch)
        model_outputs_epoch.append(model_outputs_minibatch)
        train_finish_time = time()
        logging.debug(
            f"Epoch {train_val_params.epoch} {status_string} batch {batch_index}: "
            f"Loaded in {item_load_time:0.2f}sec, "
            f"{status_string} in {(train_finish_time - item_finish_time):0.2f}sec. "
            f"Loss = {model_outputs_minibatch.loss}")
        total_load_time += item_finish_time - item_start_time
        num_batches += 1
        item_start_time = time()

    # restore the training random state when validation has finished
    if training_random_state is not None:
        training_random_state.restore_random_state()

    epoch_time_seconds = time() - epoch_start_time
    logging.info(
        f"Epoch {train_val_params.epoch} {status_string} took {epoch_time_seconds:0.2f} sec, "
        f"of which waiting for next minibatch took {total_load_time:0.2f} sec total. {num_batches} "
        "minibatches in total.")
    if num_load_time_exceeded > 0:
        logging.warning(
            "The dataloaders were not fast enough to always supply the next batch in less than "
            f"{MAX_ITEM_LOAD_TIME_SEC}sec.")
        logging.warning(
            f"In this epoch, {num_load_time_exceeded} out of {num_batches} batches exceeded the load time "
            f"threshold. The total loading time for the slow batches was {total_extra_load_time:0.2f}sec."
        )

    _metrics = training_steps.get_epoch_results_and_store(epoch_time_seconds) \
        if train_val_params.save_metrics else MetricsDict()
    return ModelOutputsAndMetricsForEpoch(
        metrics=_metrics,
        model_outputs=model_outputs_epoch,
        is_train=train_val_params.in_training_mode)
def test_metrics_dict_average_metrics_averaging() -> None:
    """
    Test that averaging skips NaN values only for metrics added with skip_nan_when_averaging=True.
    """
    m = MetricsDict()
    metric1 = "foo"
    v1 = 1.0
    m.add_metric(metric1, v1)
    m.add_metric(metric1, np.nan, skip_nan_when_averaging=True)
    metric2 = "bar"
    v2 = 2.0
    m.add_metric(metric2, v2)
    m.add_metric(metric2, np.nan, skip_nan_when_averaging=False)
    average = m.average()
    assert average.values()[metric1] == [v1]
    assert np.isnan(average.values()[metric2])
Example #25
def calculate_metrics_per_class(
        segmentation: np.ndarray,
        ground_truth: np.ndarray,
        ground_truth_ids: List[str],
        voxel_spacing: TupleFloat3,
        patient_id: Optional[int] = None) -> MetricsDict:
    """
    Calculate the dice for all foreground structures (the background class is completely ignored).
    Returns a MetricsDict with metrics for each of the foreground
    structures. Metrics are NaN if both ground truth and prediction are all zero for a class.
    :param ground_truth_ids: The names of all foreground classes.
    :param segmentation: predictions multi-value array with dimensions: [Z x Y x X]
    :param ground_truth: ground truth binary array with dimensions: [C x Z x Y x X]
    :param voxel_spacing: voxel_spacing in 3D Z x Y x X
    :param patient_id: for logging
    """
    number_of_classes = ground_truth.shape[0]
    if len(ground_truth_ids) != (number_of_classes - 1):
        raise ValueError(
            f"Received {len(ground_truth_ids)} foreground class names, but "
            f"the label tensor indicates that there are {number_of_classes - 1} classes."
        )
    binaries = binaries_from_multi_label_array(segmentation, number_of_classes)

    all_classes_are_binary = [
        is_binary_array(ground_truth[label_id])
        for label_id in range(ground_truth.shape[0])
    ]
    if not np.all(all_classes_are_binary):
        raise ValueError("Ground truth values should be 0 or 1")
    overlap_measures_filter = sitk.LabelOverlapMeasuresImageFilter()
    hausdorff_distance_filter = sitk.HausdorffDistanceImageFilter()
    metrics = MetricsDict(hues=ground_truth_ids)
    for i, prediction in enumerate(binaries):
        if i == 0:
            continue
        check_size_matches(prediction,
                           ground_truth[i],
                           arg1_name="prediction",
                           arg2_name="ground_truth")
        if not is_binary_array(prediction):
            raise ValueError("Predictions values should be 0 or 1")
        # simpleitk returns a Dice score of 0 if both ground truth and prediction are all zeros.
        # We want to be able to fish out those cases, and treat them specially later.
        prediction_zero = np.all(prediction == 0)
        gt_zero = np.all(ground_truth[i] == 0)
        dice = mean_surface_distance = hausdorff_distance = math.nan
        if not (prediction_zero and gt_zero):
            prediction_image = sitk.GetImageFromArray(
                prediction.astype(np.uint8))
            prediction_image.SetSpacing(
                sitk.VectorDouble(reverse_tuple_float3(voxel_spacing)))
            ground_truth_image = sitk.GetImageFromArray(ground_truth[i].astype(
                np.uint8))
            ground_truth_image.SetSpacing(
                sitk.VectorDouble(reverse_tuple_float3(voxel_spacing)))
            overlap_measures_filter.Execute(prediction_image,
                                            ground_truth_image)
            dice = overlap_measures_filter.GetDiceCoefficient()
            if prediction_zero or gt_zero:
                hausdorff_distance = mean_surface_distance = math.inf
            else:
                try:
                    hausdorff_distance_filter.Execute(prediction_image,
                                                      ground_truth_image)
                    hausdorff_distance = hausdorff_distance_filter.GetHausdorffDistance()
                except Exception as e:
                    logging.warning(
                        "Cannot calculate Hausdorff distance for "
                        f"structure {i} of patient {patient_id}: {e}")
                try:
                    mean_surface_distance = surface_distance(
                        prediction_image, ground_truth_image)
                except Exception as e:
                    logging.warning(
                        f"Cannot calculate mean distance for structure {i} of patient {patient_id}: {e}"
                    )
            logging.debug(
                f"Patient {patient_id}, class {i} has Dice score {dice}")

        def add_metric(metric_type: MetricType, value: float) -> None:
            metrics.add_metric(metric_type,
                               value,
                               skip_nan_when_averaging=True,
                               hue=ground_truth_ids[i - 1])

        add_metric(MetricType.DICE, dice)
        add_metric(MetricType.HAUSDORFF_mm, hausdorff_distance)
        add_metric(MetricType.MEAN_SURFACE_DIST_mm, mean_surface_distance)
    return metrics
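# Note (assumption, not the implementation shown here): binaries_from_multi_label_array
# presumably expands the [Z x Y x X] multi-value segmentation into one binary mask per class,
# roughly equivalent to
#     binaries = [(segmentation == class_index).astype(np.uint8)
#                 for class_index in range(number_of_classes)]
# Index 0 then corresponds to the background class, which the loop above skips.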
def test_aggregate_segmentation_metrics() -> None:
    """
    Test how per-epoch segmentation metrics are aggregated to compute foreground Dice and voxel count proportions.
    """
    g1 = "Liver"
    g2 = "Lung"
    ground_truth_ids = [BACKGROUND_CLASS_NAME, g1, g2]
    dice = [0.85, 0.75, 0.55]
    voxels_proportion = [0.85, 0.10, 0.05]
    loss = 3.14
    other_metric = 2.71
    m = MetricsDict(hues=ground_truth_ids)
    voxel_count = 200
    # Add 3 values per metric, but such that the averages are back at the value given in dice[i]
    for i in range(3):
        delta = (i - 1) * 0.05
        for j, ground_truth_id in enumerate(ground_truth_ids):
            m.add_metric(MetricType.DICE, dice[j] + delta, hue=ground_truth_id)
            m.add_metric(MetricType.VOXEL_COUNT, int(voxels_proportion[j] * voxel_count), hue=ground_truth_id)
        m.add_metric(MetricType.LOSS, loss + delta)
        m.add_metric("foo", other_metric)
    m.add_diagnostics("foo", "bar")
    aggregate = metrics.aggregate_segmentation_metrics(m)
    assert aggregate.diagnostics == m.diagnostics
    enumerated = list((g, s, v) for g, s, v in aggregate.enumerate_single_values())
    expected = [
        # Dice and voxel count per foreground structure should be retained during averaging
        (g1, MetricType.DICE.value, dice[1]),
        (g1, MetricType.VOXEL_COUNT.value, voxels_proportion[1] * voxel_count),
        # Proportion of foreground voxels is computed during averaging
        (g1, MetricType.PROPORTION_FOREGROUND_VOXELS.value, voxels_proportion[1]),
        (g2, MetricType.DICE.value, dice[2]),
        (g2, MetricType.VOXEL_COUNT.value, voxels_proportion[2] * voxel_count),
        (g2, MetricType.PROPORTION_FOREGROUND_VOXELS.value, voxels_proportion[2]),
        # Loss is present in the default metrics group, and should be retained.
        (MetricsDict.DEFAULT_HUE_KEY, MetricType.LOSS.value, loss),
        (MetricsDict.DEFAULT_HUE_KEY, "foo", other_metric),
        # Dice averaged across the foreground structures is added during the function call, as is proportion of voxels
        (MetricsDict.DEFAULT_HUE_KEY, MetricType.DICE.value, 0.5 * (dice[1] + dice[2])),
        (MetricsDict.DEFAULT_HUE_KEY, MetricType.PROPORTION_FOREGROUND_VOXELS.value,
         voxels_proportion[1] + voxels_proportion[2]),
    ]
    assert len(enumerated) == len(expected)
    # Numbers won't match up precisely because of rounding during averaging
    for (actual, e) in zip(enumerated, expected):
        assert actual[0:2] == e[0:2]
        assert actual[2] == pytest.approx(e[2])
def test_get_single_metric() -> None:
    h1 = "a"
    m = MetricsDict(hues=[h1])
    m1, v1 = ("foo", 1.0)
    m2, v2 = (MetricType.LOSS, 2.0)
    m.add_metric(m1, v1, hue=h1)
    m.add_metric(m2, v2)
    assert m.get_single_metric(m1, h1) == v1
    assert m.get_single_metric(m2) == v2
    with pytest.raises(KeyError) as ex1:
        m.get_single_metric(m1, "no such hue")
    assert "no such hue" in str(ex1)
    with pytest.raises(KeyError) as ex2:
        m.get_single_metric("no such metric", h1)
    assert "no such metric" in str(ex2)
    m.add_metric(m2, v2)
    with pytest.raises(ValueError) as ex3:
        m.get_single_metric(m2)
    assert "Expected a single entry" in str(ex3)
Example #28
class ModelTrainingStepsForSegmentation(
        ModelTrainingStepsBase[SegmentationModelBase, DeviceAwareModule]):
    """
    This class implements all steps necessary for training an image segmentation model during a single epoch.
    """
    def __init__(self, model_config: SegmentationModelBase,
                 train_val_params: TrainValidateParameters[DeviceAwareModule]):
        """
        Creates a new instance of the class.
        :param model_config: The configuration of a segmentation model.
        :param train_val_params: The parameters for training the model, including the optimizer and the data loaders.
        """
        super().__init__(model_config, train_val_params)
        self.example_to_save = np.random.randint(
            0, len(train_val_params.data_loader))
        self.pipeline = SegmentationForwardPass(
            model=self.train_val_params.model,
            model_config=self.model_config,
            batch_size=self.model_config.train_batch_size,
            optimizer=self.train_val_params.optimizer,
            in_training_mode=self.train_val_params.in_training_mode,
            criterion=self.compute_loss,
            gradient_scaler=train_val_params.gradient_scaler)
        self.metrics = MetricsDict(hues=[BACKGROUND_CLASS_NAME] +
                                   model_config.ground_truth_ids)

    def create_loss_function(self) -> torch.nn.Module:
        """
        Returns a torch module that computes a loss function.
        """
        return self.construct_loss_function(self.model_config)

    @classmethod
    def construct_loss_function(
            cls, model_config: SegmentationModelBase
    ) -> SupervisedLearningCriterion:
        """
        Returns a loss function from the model config; mixture losses are constructed as weighted combinations of
        other loss functions.
        """
        if model_config.loss_type == SegmentationLoss.Mixture:
            components = model_config.mixture_loss_components
            assert components is not None
            sum_weights = sum(component.weight for component in components)
            weights_and_losses = []
            for component in components:
                normalized_weight = component.weight / sum_weights
                loss_function = cls.construct_non_mixture_loss_function(
                    model_config, component.loss_type,
                    component.class_weight_power)
                weights_and_losses.append((normalized_weight, loss_function))
            return MixtureLoss(weights_and_losses)
        return cls.construct_non_mixture_loss_function(
            model_config, model_config.loss_type,
            model_config.loss_class_weight_power)
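    # Illustration (not from the original source): two mixture components with weights 1.0 and
    # 3.0 are normalised to 0.25 and 0.75 before being passed to MixtureLoss, so the component
    # weights always sum to one regardless of how they are specified in the config.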

    @classmethod
    def construct_non_mixture_loss_function(
            cls, model_config: SegmentationModelBase,
            loss_type: SegmentationLoss,
            power: Optional[float]) -> SupervisedLearningCriterion:
        """
        :param model_config: model configuration to get some parameters from
        :param loss_type: type of loss function
        :param power: value for class_weight_power for the loss function
        :return: instance of loss function
        """
        if loss_type == SegmentationLoss.SoftDice:
            return SoftDiceLoss(class_weight_power=power)
        elif loss_type == SegmentationLoss.CrossEntropy:
            return CrossEntropyLoss(
                class_weight_power=power,
                smoothing_eps=model_config.label_smoothing_eps,
                focal_loss_gamma=None)
        elif loss_type == SegmentationLoss.Focal:
            return CrossEntropyLoss(
                class_weight_power=power,
                smoothing_eps=model_config.label_smoothing_eps,
                focal_loss_gamma=model_config.focal_loss_gamma)
        else:
            raise NotImplementedError(
                "Loss type {} is not implemented".format(loss_type))

    def forward_and_backward_minibatch(
            self, sample: Dict[str, Any], batch_index: int,
            epoch: int) -> ModelForwardAndBackwardsOutputs:
        """
        Runs training for a single minibatch of training data, and computes all metrics.
        :param sample: The batched sample on which the model should be trained.
        :param batch_index: The index of the present batch (supplied only for diagnostics).
        :param epoch: The number of the present epoch.
        """
        cropped_sample: CroppedSample = CroppedSample.from_dict(sample=sample)
        labels = self.model_config.get_gpu_tensor_if_possible(
            cropped_sample.labels_center_crop)

        mask = None if self.train_val_params.in_training_mode else cropped_sample.mask_center_crop
        forward_pass_result = self.pipeline.forward_pass_patches(
            patches=cropped_sample.image, labels=labels, mask=mask)
        # Clear the GPU cache between forward and backward passes to avoid possible out-of-memory
        torch.cuda.empty_cache()
        dice_for_all_classes = metrics.compute_dice_across_patches(
            segmentation=torch.tensor(
                forward_pass_result.segmentations).long(),
            ground_truth=labels,
            use_cuda=self.model_config.use_gpu,
            allow_multiple_classes_for_each_pixel=True).cpu().numpy()
        foreground_voxels = metrics_util.get_number_of_voxels_per_class(
            cropped_sample.labels)
        # loss is a scalar, even when running the forward pass over multiple crops.
        # dice_for_all_classes has one row per crop.
        if forward_pass_result.loss is None:
            raise ValueError(
                "During training, the loss should always be computed, but the value is None."
            )
        loss = forward_pass_result.loss

        # store metrics per batch
        self.metrics.add_metric(MetricType.LOSS, loss)
        for i, ground_truth_id in enumerate(
                self.metrics.get_hue_names(include_default=False)):
            for b in range(dice_for_all_classes.shape[0]):
                self.metrics.add_metric(MetricType.DICE,
                                        dice_for_all_classes[b, i].item(),
                                        hue=ground_truth_id,
                                        skip_nan_when_averaging=True)
            self.metrics.add_metric(MetricType.VOXEL_COUNT,
                                    foreground_voxels[i],
                                    hue=ground_truth_id)
        # store diagnostics per batch
        center_indices = cropped_sample.center_indices
        if isinstance(center_indices, torch.Tensor):
            center_indices = center_indices.cpu().numpy()
        self.metrics.add_diagnostics(MetricType.PATCH_CENTER.value,
                                     np.copy(center_indices))
        if self.train_val_params.in_training_mode:
            # store the sample train patch from this epoch for visualization
            if batch_index == self.example_to_save and self.model_config.store_dataset_sample:
                _store_dataset_sample(self.model_config,
                                      self.train_val_params.epoch,
                                      forward_pass_result, cropped_sample)

        return ModelForwardAndBackwardsOutputs(
            loss=loss,
            logits=forward_pass_result.posteriors,
            labels=forward_pass_result.segmentations)

    def get_epoch_results_and_store(self,
                                    epoch_time_seconds: float) -> MetricsDict:
        """
        Assembles all training results that were achieved over all minibatches, writes them to Tensorboard and
        AzureML, and returns them as a MetricsDict object.
        :param epoch_time_seconds: For diagnostics, this is the total time in seconds for training the present epoch.
        :return: A dictionary that holds all metrics averaged over the epoch.
        """
        self.metrics.add_metric(MetricType.SECONDS_PER_EPOCH,
                                epoch_time_seconds)
        assert len(self.train_val_params.epoch_learning_rate) == 1, \
            "Expected a single entry for learning rate."
        self.metrics.add_metric(MetricType.LEARNING_RATE,
                                self.train_val_params.epoch_learning_rate[0])
        result = metrics.aggregate_segmentation_metrics(self.metrics)
        metrics.store_epoch_metrics(self.azure_and_tensorboard_logger,
                                    self.df_logger,
                                    self.train_val_params.epoch, result,
                                    self.train_val_params.epoch_learning_rate,
                                    self.model_config)
        return result
def test_metrics_dict_with_default_hue() -> None:
    hue_name = "foo"
    metrics_dict = MetricsDict(hues=[hue_name, MetricsDict.DEFAULT_HUE_KEY])
    assert metrics_dict.get_hue_names(include_default=True) == [hue_name, MetricsDict.DEFAULT_HUE_KEY]
    assert metrics_dict.get_hue_names(include_default=False) == [hue_name]
def test_metrics_dict1() -> None:
    """
    Test insertion of scalar values into a MetricsDict.
    """
    m = MetricsDict()
    assert m.get_hue_names() == [MetricsDict.DEFAULT_HUE_KEY]
    name = "foo"
    v1 = 2.7
    v2 = 3.14
    m.add_metric(name, v1)
    m.add_metric(name, v2)
    assert m.values()[name] == [v1, v2]
    with pytest.raises(ValueError) as ex:
        # noinspection PyTypeChecker
        m.add_metric(name, [1.0])  # type: ignore
    assert "Expected the metric to be a scalar" in str(ex)
    assert m.skip_nan_when_averaging[name] is False
    v3 = 3.0
    name2 = "bar"
    m.add_metric(name2, v3, skip_nan_when_averaging=True)
    assert m.skip_nan_when_averaging[name2] is True
    # Expected average: Metric "foo" averages over two values v1 and v2. For "bar", we only inserted one value anyhow
    average = m.average()
    mean_v1_v2 = mean([v1, v2])
    assert average.values() == {name: [mean_v1_v2], name2: [v3]}
    num_entries = m.num_entries()
    assert num_entries == {name: 2, name2: 1}