Example #1
def measure(key: str, ground_truth: list, data: list, uncertainty: str, bins: int):
    """ Measure miscalibration (batched mode) """

    print("Measure: %s" % key)
    try:
        confidence = [x[key] for x in data]
    except KeyError:
        return np.nan

    ece = ECE(bins=bins, detection=False)
    miscalibration = []
    for conf, gt in zip(confidence, ground_truth):

        if conf.ndim == 3:
            if uncertainty == 'mean':
                conf = np.mean(conf, axis=0)
            elif uncertainty == 'flatten':
                gt = np.tile(gt, conf.shape[0]).flatten()
                conf = conf.flatten()
            else:
                raise AttributeError("Unknown type of uncertainty handling: %s." % uncertainty)

        miscalibration.append(ece.measure(conf, gt))

    return np.mean(miscalibration)
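A minimal usage sketch for the batched `measure` above, on synthetic data (assuming `numpy as np` and `netcal.metrics.ECE` are in scope, as the snippet requires). The stacked confidences mimic T stochastic forward passes; 'mean' averages them, 'flatten' unrolls them.

import numpy as np

rng = np.random.default_rng(0)
n_samples, n_classes, T = 100, 3, 5
gt = rng.integers(0, n_classes, size=n_samples)                 # synthetic labels
conf = rng.dirichlet(np.ones(n_classes), size=(T, n_samples))   # shape (T, n, n_classes)

data = [{"temperature": conf}]                                  # hypothetical method key
score = measure("temperature", [gt], data, uncertainty="mean", bins=10)
print("batched ECE:", score)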
Example #2
    def evaluation_metrics(self, n_bins=10, verbose=True):
        """
        Calculates proper losses, calibration error metrics, and the
        macro-averaged F1-score (to evaluate predictive performance).

        Expected Calibration Error: Discretize the probability interval into a
            fixed number of bins and assign each predicted probability to its bin.
            The calibration error is the gap between the fraction of correct
            predictions (accuracy) and the mean predicted probability (confidence)
            in each bin, averaged over all bins.
        Classwise ECE: The ECE computed separately for each class.
        Adaptive ECE: The Adaptive ECE focuses on those bins where predictions are
            made rather than weighting all bins equally. It spaces the bin
            intervals so that each bin contains an equal number of predictions.
        Brier Score: "The Brier score measures the mean squared difference between
            (1) the predicted probability assigned to the possible outcomes for item i,
            and (2) the actual outcome. Therefore, the lower the Brier score is for a
            set of predictions, the better the predictions are calibrated."
            (scikit-learn documentation)
        Negative Log-Likelihood: The NLL averages the per-instance error and is
            therefore also sensitive to miscalibration.
        F1-Macro: The harmonic mean of precision and recall per class; the
            per-class scores are averaged with equal weight (unweighted mean).

        Parameters:
        ----------------
            n_bins: int, default: 10
                Discretize the probability interval into a fixed number of bins
                and assign predicted probabilities to each bin.
            verbose: bool, default: True
                Print metrics as output.

        Returns:
            Dataframe with all evaluation metrics.
        """
        ece = ECE(n_bins)
        ece_score = ece.measure(self.calibrated, self.labels)
        classwise_ece = calc_classwise_ece(self.calibrated, self.labels)
        dd_ece = stats.ece(self.calibrated, one_hot(self.labels),
                           binning=binning.DataDependentBinning())
        brier = brier_multi(self.labels, self.calibrated)
        nll = log_loss(self.labels, self.calibrated)
        f1 = f1_score(self.labels, np.argmax(self.calibrated, axis=1), average="macro")

        df = pd.DataFrame(columns=['ECE', 'Classwise ECE', 'Adaptive ECE', 'Brier', 'Neg Log-Likelihood', 'F1-Macro'])
        df.loc[0] = ece_score, classwise_ece, dd_ece, brier, nll, f1

        if verbose:
            print(self.method + ' - Calibration Metrics')
            print('-'*50)
            print('ECE: ', round(ece_score, 4))
            print('Classwise/ Static ECE: ', round(classwise_ece, 4))
            print('Adaptive ECE: ', round(dd_ece, 4))
            print('Brier Multi Score: ', round(brier, 4))
            print('f1 - macro: ', round(f1, 4))
            print('Negative Log-Likelihood: ', round(nll, 4))
        return df
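The ECE described in the docstring can be written out by hand as the sum over bins of (n_b / N) * |accuracy_b - confidence_b| over equally spaced confidence bins. A minimal sketch on synthetic data; bin-edge conventions may differ slightly from netcal's, so the two values only roughly agree.

import numpy as np
from netcal.metrics import ECE

def manual_ece(probs, labels, n_bins=10):
    # top-label confidence and correctness per sample
    conf = probs.max(axis=1)
    correct = probs.argmax(axis=1) == labels
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        mask = (conf > lo) & (conf <= hi)
        if mask.any():
            # weight each bin's |accuracy - confidence| gap by its relative size
            ece += mask.mean() * abs(correct[mask].mean() - conf[mask].mean())
    return ece

rng = np.random.default_rng(0)
probs = rng.dirichlet(np.ones(3), size=1000)
labels = rng.integers(0, 3, size=1000)
print(manual_ece(probs, labels), ECE(10).measure(probs, labels))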
Example #3
 def predict_ece_logloss(self, X, y, bins=10, mode='map'):
     preds_probs = self.predict_proba(X, mode=mode)
     #print(preds_probs, preds_probs.shape)
     ece = ECE(bins)
     calibrated_score = ece.measure(preds_probs, y)
     #print(y, preds_probs)
     return calibrated_score, log_loss(y, preds_probs, labels=[0, 1])
def eval_cal(y_preds, y_true, bins=15):
    # Calibration Metrics
    ece = ECE(bins)
    ace = ACE(bins)
    mce = MCE(bins)
    ece_score = ece.measure(y_preds, y_true)
    ace_score = ace.measure(y_preds, y_true)
    mce_score = mce.measure(y_preds, y_true)
    return ece_score, ace_score, mce_score
Example #5
    def script():
        '''
                    LOAD DATA
        '''
        data_path = '../data/'
        train_data = pd.read_csv(data_path + 'v3.2.2_train.csv')
        test_data = pd.read_csv(data_path + 'v3.2.2_test.csv')

        y_train, y_test = train_data[['TOT_fast', 'TOT_med_fast', 'TOT_med', 'TOT_med_slow', 'TOT_slow']], \
                          test_data[['TOT_fast', 'TOT_med_fast', 'TOT_med', 'TOT_med_slow', 'TOT_slow']]
        X_train, X_test = train_data.drop(
            ['Unnamed: 0', 'TOT_fast', 'TOT_med_fast', 'TOT_med', 'TOT_med_slow', 'TOT_slow'], axis=1), \
                          test_data.drop(
                              ['Unnamed: 0', 'TOT_fast', 'TOT_med_fast', 'TOT_med', 'TOT_med_slow', 'TOT_slow'], axis=1)
        model = load_model("../network/models/v3.2.2/model.h5")
        print("# of train samples: ", len(y_train.index))
        print("# of test samples: ", len(y_test.index))

        ##Using NetCal package
        n_bins = 10
        confidences = model.predict(X_test.values)
        ece = ECE(n_bins)
        uncalibrated_score = ece.measure(confidences,
                                         y_test.values.argmax(axis=1))
        print("Calibration Error before calibration: ", uncalibrated_score)

        temperature = TemperatureScaling()
        temperature.fit(confidences, y_test.values.argmax(axis=1))
        calibrated = temperature.transform(confidences)
        ece = ECE(n_bins)
        calibrated_score = ece.measure(calibrated,
                                       y_test.values.argmax(axis=1))
        print("Calibration Error after calibration: ", calibrated_score)

        diagram = ReliabilityDiagram(n_bins)
        diagram.plot(confidences, y_test.values.argmax(
            axis=1))  # visualize miscalibration of uncalibrated

        diagram.plot(calibrated, y_test.values.argmax(
            axis=1))  # visualize miscalibration of calibrated

        np.savetxt('./calibration-data/test_calibrated_v3.2.2.csv',
                   calibrated,
                   delimiter=',')
def measure_miscalibration(bins: Union[tuple, list, int], data: dict,
                           methods0d: list, methods2d: list):
    """
    Measure miscalibration and write to stdout.

    Parameters
    ----------
    bins : iterable or int
        Number of bins used by ACE, ECE and MCE.
    data : dict
        Dictionary of calibration data.
    methods0d : list
        List with strings containing the keys for the calibration data (confidence only methods).
    methods2d : list
        List with strings containing the keys for the calibration data (2D methods).
    """

    # iterate over 0D and 2D methods
    for i, methods in enumerate([methods0d, methods2d]):

        # insert 'confidence' key to the first place in the list to keep track of default miscalibration
        if i == 1:
            methods = ['confidence'] + methods0d + methods2d
        else:
            methods = ['confidence'] + methods

        # on confidence only, use a single bin value (the first one);
        # keep the full bin tuple for the 2D pass
        n_bins = bins[0] if i == 0 and isinstance(bins, (tuple, list)) else bins

        # create instances for measuring miscalibration
        ace = ACE(bins=n_bins, detection=True)
        ece = ECE(bins=n_bins, detection=True)
        mce = MCE(bins=n_bins, detection=True)

        # initialize empty lists
        ace_list = []
        ece_list = []
        mce_list = []

        # iterate over all methods
        for method in methods:
            data_input = data[method] if i == 0 else np.stack(
                (data[method], data['cx'], data['cy']), axis=1)
            ace_list.append(ace.measure(data_input, data['matched']))
            ece_list.append(ece.measure(data_input, data['matched']))
            mce_list.append(mce.measure(data_input, data['matched']))

        # output formatted ECE
        names = [len(x) for x in methods]
        buffer = max(names)

        # write out all miscalibration results in a 'pretty' manner
        for j, method in enumerate(methods):
            fill = (buffer - len(method)) * " "
            print("%s%s ACE: %.5f - ECE: %.5f - MCE: %.5f" %
                  (method, fill, ace_list[j], ece_list[j], mce_list[j]))
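A hedged sketch of the `data` dictionary expected by `measure_miscalibration`, built from synthetic detections; the method keys 'histogram' and 'lr_dependent' are hypothetical, while 'confidence', 'cx', 'cy' and 'matched' follow the code above.

import numpy as np

rng = np.random.default_rng(0)
n = 500
data = {
    "confidence": rng.uniform(0, 1, n),    # raw detector scores
    "histogram": rng.uniform(0, 1, n),     # hypothetical 0D-calibrated scores
    "lr_dependent": rng.uniform(0, 1, n),  # hypothetical 2D-calibrated scores
    "cx": rng.uniform(0, 1, n),            # relative box center x
    "cy": rng.uniform(0, 1, n),            # relative box center y
    "matched": rng.integers(0, 2, n),      # 1 if the detection matched a ground-truth box
}

measure_miscalibration(bins=(15, 5, 5), data=data,
                       methods0d=["histogram"], methods2d=["lr_dependent"])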
def calibrate(model, valid_loader, test_loader, n_bins=15):
    """
    Calibrate the model via temperature scaling
    """
    confidence, labels = rollout_loader(model, valid_loader)
    test_confidence, test_labels = rollout_loader(model, test_loader)

    temperature = TemperatureScaling()
    temperature.fit(confidence, labels)
    calibrated = temperature.transform(test_confidence)

    ece = ECE(n_bins)
    calibrated_score = ece.measure(calibrated, test_labels)

    return calibrated_score
def cross_validate_temp_scaling(model, data_loader, batch_size, k=5, seed=0, num_workers=0, n_bins=15, pin_memory=False):
    """
    Perform temperature scaling on the model with k-fold cross validation
    """
    print("Computing model calibration", flush=True)
    test_dataset = data_loader.dataset
    num_test = len(test_dataset)
    indices = list(range(num_test))
    np.random.seed(seed)
    np.random.shuffle(indices)
    idxs = torch.tensor(indices).split(int(len(indices) / k))[:k]

    # get the uncalibrated ECE
    confidence, labels = rollout_loader(model, data_loader)
    ece = ECE(n_bins)
    unscaled_ece = ece.measure(confidence, labels)
    print(f'ECE: {unscaled_ece:.3f}')

    # compute the calibrated ECE
    scaled_eces = []
    # for each of the k folds
    for i in range(k):
        valid_idx = idxs[i]
        before = torch.cat(idxs[:i]) if i != 0 else torch.tensor([], dtype=torch.long)
        after = torch.cat(idxs[i + 1:]) if i + 1 != k else torch.tensor([], dtype=torch.long)
        test_idx = torch.cat([before, after])

        # create data loaders
        test_sampler = SubsetRandomSampler(test_idx)
        valid_sampler = SubsetRandomSampler(valid_idx)

        test_loader = DataLoader(
            test_dataset, batch_size=batch_size, sampler=test_sampler,
            num_workers=num_workers, pin_memory=pin_memory
        )
        valid_loader = DataLoader(
            test_dataset, batch_size=batch_size, sampler=valid_sampler,
            num_workers=num_workers, pin_memory=pin_memory
        )

        scaled_ece = calibrate(model, valid_loader, test_loader, n_bins)
        print(f'Cross validation fold {i}, temperature scaled ECE: {scaled_ece:.3f}')
        scaled_eces.append(scaled_ece)
    mean_scaled_ece = np.mean(scaled_eces)

    return unscaled_ece, mean_scaled_ece
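`rollout_loader` is not shown in the snippets above; a plausible sketch, assuming a standard classification model and a DataLoader yielding (x, y) batches, collects softmax confidences and labels as NumPy arrays.

import numpy as np
import torch
import torch.nn.functional as F

@torch.no_grad()
def rollout_loader(model, loader, device="cpu"):
    # run the model over the loader and gather softmax outputs plus labels
    model.eval()
    confs, labels = [], []
    for x, y in loader:
        logits = model(x.to(device))
        confs.append(F.softmax(logits, dim=1).cpu().numpy())
        labels.append(y.numpy())
    return np.concatenate(confs), np.concatenate(labels)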
Example #9
def single_example(models: list,
                   datafile: str,
                   bins: int,
                   diagram: str = None,
                   validation_split: float = 0.7,
                   save_models: bool = False,
                   domain: str = ".") -> int:
    """
    Measure miscalibration of given methods on specified dataset.

    Parameters
    ----------
    models : list
        List of tuples with [('<name>', <instance of CalibrationMethod>), ...].
    datafile : str
        Path to datafile which contains two NumPy arrays with keys 'ground_truth' and 'predictions'.
    bins : int
        Number of bins used by ECE, MCE and ReliabilityDiagram.
    diagram : str, optional, default: None
        Type of diagram which should be plotted. This could be 'diagram', 'curve', 'inference' or None.
    validation_split : float
        Split ratio between build set and validation set.
    save_models : bool
        True if instances of calibration methods should be stored.
    domain : str, optional, default: "."
        Domain/directory where to store the results.

    Returns
    -------
    int
        0 on success, -1 otherwise
    """

    if not os.path.exists(datafile):
        print("Dataset \'%s\' does not exist" % datafile)
        return -1

    # read NumPy input files
    try:
        with open(datafile, "rb") as open_file:
            npzfile = np.load(open_file)
            ground_truth = npzfile['ground_truth'].squeeze()
            predictions = npzfile['predictions'].squeeze()
    except KeyError:
        print(
            "Key \'ground_truth\' or \'predictions\' not found in file \'%s\'"
            % datafile)
        return -1

    # split data set into build set and validation set
    build_set_gt, validation_set_gt, build_set_sm, validation_set_sm = train_test_split(
        ground_truth,
        predictions,
        test_size=validation_split,
        stratify=ground_truth,
        random_state=None)

    # initialize error metrics
    ace = ACE(bins)
    ece = ECE(bins)
    mce = MCE(bins)

    predictions = []
    all_ace = [ace.measure(validation_set_sm, validation_set_gt)]
    all_ece = [ece.measure(validation_set_sm, validation_set_gt)]
    all_mce = [mce.measure(validation_set_sm, validation_set_gt)]

    # ------------------------------------------

    # build and save models
    for model in models:
        name, instance = model
        print("Build %s model" % name)
        instance.fit(build_set_sm, build_set_gt)

        if save_models:
            instance.save_model("%s/models/%s.pkl" % (domain, name))

    # ------------------------------------------

    # perform predictions
    for model in models:
        _, instance = model
        prediction = instance.transform(validation_set_sm)
        predictions.append(prediction)

        all_ace.append(ace.measure(prediction, validation_set_gt))
        all_ece.append(ece.measure(prediction, validation_set_gt))
        all_mce.append(mce.measure(prediction, validation_set_gt))

    # ------------------------------------------

    # output formatted ECE
    names = [len(x[0]) for x in models]
    buffer = max(names)

    fill = (buffer - len("Default")) * " "
    print("%s%s ACE: %.5f - ECE: %.5f - MCE: %.5f" %
          ("Default", fill, all_ace[0], all_ece[0], all_mce[0]))
    for i, model in enumerate(models, start=1):
        name, instance = model
        fill = (buffer - len(name)) * " "
        print("%s%s ACE: %.5f - ECE: %.5f - MCE: %.5f" %
              (name, fill, all_ace[i], all_ece[i], all_mce[i]))

    # ------------------------------------------

    if diagram == 'diagram':

        diagram = ReliabilityDiagram(bins=bins, title_suffix="default")
        diagram.plot(validation_set_sm, validation_set_gt, filename="test.png")
        for i, prediction in enumerate(predictions):
            diagram = ReliabilityDiagram(bins=bins, title_suffix=models[i][0])
            diagram.plot(prediction, validation_set_gt)

    elif diagram is None:
        pass
    else:
        print("Unknown diagram type \'%s\'" % diagram)
        return -1

    return 0
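A hedged sketch of how the inputs to `single_example` could be prepared: the .npz file must contain 'ground_truth' and 'predictions', and `models` pairs display names with unfitted netcal calibration methods. The file name, method names, and data below are synthetic/hypothetical.

import numpy as np
from netcal.binning import HistogramBinning
from netcal.scaling import TemperatureScaling

rng = np.random.default_rng(0)
with open("example_dataset.npz", "wb") as open_file:
    np.savez_compressed(open_file,
                        ground_truth=rng.integers(0, 10, size=5000),
                        predictions=rng.dirichlet(np.ones(10), size=5000))

models = [("histogram", HistogramBinning(bins=15)),
          ("temperature", TemperatureScaling())]
single_example(models, "example_dataset.npz", bins=15, diagram=None)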
Example #10
def transform(frames: List[Dict], dataset: str, network: str, subset: List,
              ious: List, test_ids: List[int]):
    """
    After calibration training, evaluate the trained models with several miscalibration metrics:
    D-ECE, Brier score, and NLL. The area under the precision-recall curve (AUPRC) is also captured.
    All results are stored at "./output/<network>".

    Parameters
    ----------
    frames : List[Dict]
        List of dictionaries holding the input data for each image frame.
    dataset : str
        String of the used dataset (see detectron2 registered datasets).
    network : str
        String describing the base neural network.
    subset : List[str]
        List with additional features used for calibration. Options are:
        - 'cx'
        - 'cy'
        - 'w'
        - 'h'
    ious : List[float]
        List with IoU scores used for evaluation.
    test_ids : List
        List of data frame ids used for calibration testing.
    """

    # get meta information and specify all relevant paths
    meta = MetadataCatalog.get(dataset)
    model_dir = os.path.join("calibration", network, "models")
    output_dir = os.path.join("output", network)
    diagram_path = os.path.join(
        output_dir, "diagrams",
        ''.join(subset) if len(subset) > 0 else "confidence")

    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(diagram_path, exist_ok=True)

    # calibration methods that have also been used for calibration training
    methods = [("histogram", HistogramBinning), ("lr", LogisticCalibration),
               ("lr_dependent", LogisticCalibrationDependent),
               ("betacal", BetaCalibration),
               ("betacal_dependent", BetaCalibrationDependent)]

    # reverse mapping of category ids to network class ids (e.g. for COCO dataset)
    if hasattr(meta, "thing_dataset_id_to_contiguous_id"):
        reverse_dictionary = {
            v: k
            for k, v in meta.thing_dataset_id_to_contiguous_id.items()
        }
    else:
        reverse_dictionary = None

    # lists and placeholders for evaluation metrics
    n_samples_total = 0
    n_samples_per_class = []
    dece_per_class = [[[] for _ in ious] for _ in range(len(methods) + 1)]

    brier_per_class = [[[] for _ in ious] for _ in range(len(methods) + 1)]
    nll_per_class = [[[] for _ in ious] for _ in range(len(methods) + 1)]
    average_precision = [[[] for _ in ious] for _ in range(len(methods) + 1)]

    # -----------------------------------------------------
    # visualization routine
    diagram0d = ReliabilityDiagram(bins=20, detection=True, sample_threshold=8)
    diagram1d = ReliabilityDiagram(bins=[5, 15],
                                   detection=True,
                                   sample_threshold=3,
                                   fmin=0,
                                   fmax=0.3)
    diagram2d = ReliabilityDiagram(bins=[6, 9, 9],
                                   detection=True,
                                   sample_threshold=2,
                                   fmin=0,
                                   fmax=0.3)

    def plot(f: np.ndarray, m: np.ndarray, title: str, formatter: str):
        # Define function for diagram output

        # plot baseline miscalibration
        figures = [
            diagram0d.plot(f[:, :1],
                           m,
                           tikz=False,
                           title_suffix=title,
                           filename=formatter % "0d")
        ]

        # plot all additional features in 1D miscalibration plots
        for i, fname in enumerate(['cx', 'cy', 'w', 'h']):
            figures.append(
                diagram1d.plot(f[:, (0, i + 1)],
                               m,
                               tikz=False,
                               feature_names=[fname],
                               title_suffix=title,
                               filename=formatter % ("1d_%s" % fname)))

        # finally, plot all feature combinations of size 2
        for (i, fname1), (j, fname2) in itertools.combinations(
                enumerate(['cx', 'cy', 'w', 'h']), 2):
            figures.append(
                diagram2d.plot(f[:, (0, i + 1, j + 1)],
                               m,
                               tikz=False,
                               feature_names=[fname1, fname2],
                               title_suffix=title,
                               filename=formatter % ("2d_%s_%s" %
                                                     (fname1, fname2))))

        # free memory space
        for fig in figures:
            plt.close(fig)

    # -----------------------------------------------------

    # iterate over all classes that are present in the current dataset
    for i, classname in enumerate(meta.thing_classes):

        # get calibration features for selected class
        category_id = reverse_dictionary[
            i] if reverse_dictionary is not None else i
        features, matched, img_ids = get_features(frames, category_id, subset,
                                                  ious, test_ids)
        all_features, _, _ = get_features(frames, category_id,
                                          ['cx', 'cy', 'w', 'h'], ious,
                                          test_ids)

        if features.size == 0:
            print("No samples for category %s found" % classname)
            continue

        # different binning schemes for different feature dimensions
        if features.shape[1] == 1:
            bins = 20
        elif features.shape[1] == 3:
            bins = 8
        elif features.shape[1] == 5:
            bins = 5
        else:
            raise ValueError("Unknown dimension: %d" % features.shape[1])

        # define D-ECE metric
        dece = ECE(bins=bins, detection=True, sample_threshold=8)
        n_samples_per_class.append(features.shape[0])
        n_samples_total += features.shape[0]

        # failed flag is required to optionally blank failed or non-present classes during evaluation
        # i.e., if a metric returns NaN
        failed = False

        # perform evaluation for each category separately
        print("Inference: category %d: %d samples" %
              (category_id, features.shape[0]))
        for j, (iou, m) in enumerate(zip(ious, matched)):

            score = average_precision_score(m, features[:, 0])
            if not np.isfinite(score) or np.isnan(score):
                brier_per_class[0][j].append(0.)
                nll_per_class[0][j].append(0.)
                dece_per_class[0][j].append(0.)
                average_precision[0][j].append(0.)
                failed = True

            # compute average precision, Brier, NLL and ECE
            else:
                brier_per_class[0][j].append(
                    np.mean(np.square(features[:, 0] - m)))
                nll_per_class[0][j].append(
                    -np.mean(m * np.log(features[:, 0]) +
                             (1. - m) * np.log(1. - features[:, 0])))
                dece_per_class[0][j].append(dece.measure(features, m))
                average_precision[0][j].append(score)

            diagramname = os.path.join(
                diagram_path,
                "default_cls-%02d_iou%.2f" % (i, iou) + "_%s.tex")
            plot(all_features, m, title="default", formatter=diagramname)

            # start calibration evaluation for each method separately
            for k, (name, method) in enumerate(methods, start=1):
                instance = method()

                try:
                    print("Load %s and transform" % name)
                    instance.load_model(
                        os.path.join(
                            model_dir, "%s_%s_iou%.2f_cls-%02d.pkl" %
                            (name, ''.join(subset), iou, i)))
                    calibrated = instance.transform(features)

                    # perform clipping
                    np.clip(calibrated,
                            np.finfo(np.float32).eps,
                            1. - np.finfo(np.float32).eps,
                            out=calibrated)
                    score = average_precision_score(m, calibrated)
                    if not np.isfinite(score) or np.isnan(score):
                        raise ValueError("Couldn't compute AUPRC score")

                    average_precision[k][j].append(score)

                    brier_per_class[k][j].append(
                        np.mean(np.square(calibrated - m)))
                    nll_per_class[k][j].append(
                        -np.mean(m * np.log(calibrated) +
                                 (1. - m) * np.log(1. - calibrated)))

                    input = np.concatenate(
                        (np.reshape(calibrated, (-1, 1)), features[:, 1:]),
                        axis=1)
                    dece_per_class[k][j].append(dece.measure(input, m))

                    diagramname = os.path.join(
                        diagram_path,
                        "%s_cls-%02d_iou%.2f" % (name, i, iou) + "_%s.tex")
                    input = np.concatenate(
                        (np.reshape(calibrated, (-1, 1)), all_features[:, 1:]),
                        axis=1)
                    plot(input, m, title=name, formatter=diagramname)

                except (FileNotFoundError, ValueError):
                    print(
                        "Could not find weight file ",
                        os.path.join(
                            model_dir, "%s_%s_iou%.2f_cls-%02d.pkl" %
                            (name, ''.join(subset), iou, i)))
                    print("Disable evaluation for class %d" % i)

                    brier_per_class[k][j].append(0.)
                    nll_per_class[k][j].append(0.)
                    dece_per_class[k][j].append(0.)
                    average_precision[k][j].append(0.)

                    failed = True

        if failed:
            n_samples_total -= n_samples_per_class[-1]
            n_samples_per_class[-1] = 0

    # convert all lists to NumPy arrays
    weights = np.array(n_samples_per_class) / n_samples_total
    brier_per_class = np.array(brier_per_class)
    nll_per_class = np.array(nll_per_class)
    dece_per_class = np.array(dece_per_class)
    average_precision = np.array(average_precision)

    # compute a plain (macro) average and a weighted counterpart
    brier_global = np.mean(brier_per_class, axis=2)
    weighted_brier_global = np.average(brier_per_class,
                                       weights=weights,
                                       axis=2)
    nll_global = np.mean(nll_per_class, axis=2)
    weighted_nll_global = np.average(nll_per_class, weights=weights, axis=2)
    dece_global = np.mean(dece_per_class, axis=2)
    weighted_dece_global = np.average(dece_per_class, weights=weights, axis=2)
    average_precision_macro = np.mean(average_precision, axis=2)
    average_precision_weighted = np.average(average_precision,
                                            weights=weights,
                                            axis=2)

    # use tabulate library to visualize the evaluation results
    header = []
    body = [['default']]
    body.extend([[name] for name, method in methods])
    for i, iou in enumerate(ious):

        header.extend([
            'D-ECE(w) @ IoU %.2f' % iou,
            'D-ECE @ IoU %.2f' % iou,
            'Brier(w) @ IoU %.2f' % iou,
            'Brier @ IoU %.2f' % iou,
            'NLL(w) @ IoU %.2f' % iou,
            'NLL @ IoU %.2f' % iou,
            'AP(w) @ IoU %.2f' % iou,
            'AP @ IoU %.2f' % iou
        ])
        body[0].extend([
            weighted_dece_global[0][i], dece_global[0][i],
            weighted_brier_global[0][i], brier_global[0][i],
            weighted_nll_global[0][i], nll_global[0][i],
            average_precision_weighted[0][i], average_precision_macro[0][i]
        ])
        for k, (name, method) in enumerate(methods):
            body[k + 1].extend([
                weighted_dece_global[k + 1][i], dece_global[k + 1][i],
                weighted_brier_global[k + 1][i], brier_global[k + 1][i],
                weighted_nll_global[k + 1][i], nll_global[k + 1][i],
                average_precision_weighted[k + 1][i],
                average_precision_macro[k + 1][i]
            ])

    results = [header, *body]

    # also write the evaluation results to CSV format
    print("\nEvaluation Results:")
    print(tabulate(results, headers="firstrow"))
    with open(os.path.join(output_dir, "results_%s.csv" % ''.join(subset)),
              "w") as open_file:
        writer = csv.writer(open_file)
        writer.writerow([
            "method",
        ] + results[0])
        writer.writerows(results[1:])
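The core D-ECE measurement used above, isolated as a minimal sketch on synthetic detections: in detection mode the first column of the input is the confidence and the remaining columns are additional box features (here cx and cy), binned jointly.

import numpy as np
from netcal.metrics import ECE

rng = np.random.default_rng(0)
n = 2000
X = np.stack([rng.uniform(0, 1, n),     # confidence
              rng.uniform(0, 1, n),     # cx (relative box center x)
              rng.uniform(0, 1, n)],    # cy (relative box center y)
             axis=1)
matched = rng.integers(0, 2, n)         # 1 if the detection matched a ground-truth box

dece = ECE(bins=[6, 9, 9], detection=True, sample_threshold=2)
print("D-ECE:", dece.measure(X, matched))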
Example #11
 def predict_ece(self, X, y, mode='map', bins=10):
     ece = ECE(bins)
     calibrated_score = ece.measure(self.predict_proba(X, mode=mode), y)
     return calibrated_score
 def predict_ece_logloss(self, X, y, bins=10):
     preds_probs = self.predict_proba(X)
     ece = ECE(bins)
     calibrated_score = ece.measure(preds_probs, y)
     #print(calibrated_score, y, preds_probs)
     return calibrated_score, log_loss(y, preds_probs, labels=[0, 1])
# (tail of a truncated plotting call: lw=1, color='red')

ax1.set_ylabel("Fraction of positives")
ax1.set_ylim([-0.05, 1.05])
ax1.legend(loc="upper left")
ax1.set_title('Calibration plots  (reliability curve)')

ax2.set_xlabel("Mean predicted value")
ax2.set_ylabel("Count")
ax2.legend(loc="upper center", ncol=2)
plt.tight_layout()

# Temperature scaling for probability calibration using the netcal package
from netcal.scaling import TemperatureScaling
temperature = TemperatureScaling()
temperature.fit(y_prob, y_all)
calibrated = temperature.transform(y_prob)

#Computing the expected calibration error
from netcal.metrics import ECE
from netcal.presentation import ReliabilityDiagram
n_bins = 10
ece = ECE(n_bins)
uncalibrated_score = ece.measure(y_new, y_test)
calibrated_score = ece.measure(calibrated, y_test)

diagram = ReliabilityDiagram(n_bins)
diagram.plot(y_new, y_test)  # visualize miscalibration of uncalibrated
diagram.plot(calibrated, y_test)  # visualize miscalibration of calibrated
Example #14
from netcal.metrics import ECE

from utils_constants import CORRECTNESS, A, B, C, D, LABEL
from utils_data import create_calibrated_df

random_state = 42
split = 'test'
n_bins = 10

for random_seed in [1, 2, 3, 4, 5]:
    df = create_calibrated_df(
        ['output_xlnet_seed_%d_%s.csv' % (random_seed, split)])
    ece = ECE(n_bins)
    uncalibrated_score = ece.measure(df[[A, B, C, D]].values, df[LABEL].values)
    print('XLNET %d: ECE = %.4f' % (random_seed, float(uncalibrated_score)))

for random_seed in [0, 1, 2, 3, 42]:
    df = create_calibrated_df(
        ['output_distilbert_seed%d_%s.csv' % (random_seed, split)])
    ece = ECE(n_bins)
    uncalibrated_score = ece.measure(df[[A, B, C, D]].values, df[LABEL].values)
    print('DistilBERT %d: ECE = %.4f' %
          (random_seed, float(uncalibrated_score)))

for random_seed in [0, 1, 2, 3, 42]:
    df = create_calibrated_df(
        ['output_bert_seed%d_%s.csv' % (random_seed, split)])
    ece = ECE(n_bins)
    uncalibrated_score = ece.measure(df[[A, B, C, D]].values, df[LABEL].values)
    print('BERT %d: ECE = %.4f' % (random_seed, float(uncalibrated_score)))
Example #15
                 accs[k].append(0)
 xs = []
 ys = []
 for k in bins.keys():
     xs.append(np.mean(bins[k]))
     ys.append(np.mean(accs[k]))
     ax1.annotate(str(len(accs[k])),
                  (np.mean(bins[k]), np.mean(accs[k])),
                  color=color)
 ax1.plot(xs, ys, label=title, color=color)
 if args.mutual_info:
     with open(title + '.pkl', 'rb') as fp:
         ys2 = pickle.load(fp)
     ax2.plot(list(range(1, len(ys2) * 3 + 1, 3)), ys2, color=color)
 ece = ECE(args.ece_bins)
 ece_score = ece.measure(X, y)
 acc = 100. * correct / total
 print('Testing LOG_LOSS:', np.mean(test_loss))
 print('Testing ACCURACY:', acc)
 print('Testing ECE:', ece_score)
 results_dict['logloss'] = np.mean(test_loss)
 results_dict['acc'] = acc
 results_dict['ece'] = ece_score
 results_df = results_df.append(results_dict, ignore_index=True)
 f = open(main_path + "evaluation_logs.txt", "a")
 f.write(
     '###################################################################\n'
     + 'P:' + p + '\n')
 f.write('Mean Testing LOG_LOSS:' + str(np.mean(test_loss)) + '\n' +
         'Testing ACCURACY:' + str(acc) + '\n' + 'Testing ECE:' +
         str(ece_score) + '\n')
Example #16
def ece_score(y_true, y_prob, n_bins=10):
    ece = ECE(n_bins)
    ece_val = ece.measure(y_prob, y_true)

    return ece_val
Example #17
            'min_beta': min_beta
        })

    predictions = unc_model.predict(
        [test_mu_predictions, stl10_x_test_resized])

    logger.error("Compute predictions")
    sess = K.get_session()
    logger.error("Compute mu pred. entropy")
    probs = sess.run(predict_probs(predictions))
    accuracy = accuracy_score(stl10_y_test, np.argmax(predictions, axis=1))
    logger.error("Resulting accuracy: {}".format(accuracy))

    n_bins = 10
    ground_truth = stl10_y_test
    confidences = test_mu_predictions

    temperature = TemperatureScaling()
    temperature.fit(confidences, ground_truth)
    calibrated = temperature.transform(confidences)
    n_bins = 10

    ece = ECE(n_bins)
    uncalibrated_score = ece.measure(confidences, ground_truth)
    calibrated_score = ece.measure(calibrated, ground_truth)
    wrapper_score = ece.measure(probs, ground_truth)
    logger.error("ECE scores: {}, {}, {}".format(uncalibrated_score,
                                                 calibrated_score,
                                                 wrapper_score))
    logger.error("Done")
Example #18
def cross_validation_5_2(models: list,
                         datafile: str,
                         bins: int,
                         save_models: bool = False,
                         domain: str = '.') -> int:
    """
    5x2 cross validation on given methods on specified dataset.

    Parameters
    ----------
    models : list
        List of tuples with [('<name>', <instance of CalibrationMethod>), ...].
    datafile : str
        Path to datafile which contains two NumPy arrays with keys 'ground_truth' and 'predictions'.
    bins : int
        Number of bins used by ECE, MCE and ReliabilityDiagram.
    save_models : bool, optional, default: False
        True if instances of calibration methods should be stored.
    domain : str, optional, default: "."
        Domain/directory where to store the results.

    Returns
    -------
    int
        0 on success, -1 otherwise
    """

    network = datafile[datafile.rfind("/") + 1:datafile.rfind(".npz")]
    seeds = [60932, 29571058, 127519, 23519410, 74198274]

    if not os.path.exists(datafile):
        print("Dataset \'%s\' does not exist" % datafile)
        return -1

    # read NumPy input files
    try:
        with open(datafile, "rb") as open_file:
            npzfile = np.load(open_file)
            ground_truth = npzfile['ground_truth'].squeeze()
            predictions = npzfile['predictions'].squeeze()
    except KeyError:
        print(
            "Key \'ground_truth\' or \'predictions\' not found in file \'%s\'"
            % datafile)
        return -1

    if len(predictions.shape) == 2:
        n_classes = predictions.shape[1]
    else:
        n_classes = 2

    # initialize error metrics
    ace = ACE(bins)
    ece = ECE(bins)
    mce = MCE(bins)

    all_accuracy = []

    all_ace = []
    all_ece = []
    all_mce = []

    it = 0
    for i, seed in enumerate(seeds):

        np.random.seed(seed)

        # split data set into build set and validation set
        build_set_gt, validation_set_gt, build_set_sm, validation_set_sm = train_test_split(
            ground_truth,
            predictions,
            random_state=seed,
            test_size=0.5,
            stratify=ground_truth)

        for j in range(2):

            calibrated_data = {}

            # 5x2 cross validation - flip build/val set after each iteration
            build_set_gt, validation_set_gt = validation_set_gt, build_set_gt
            build_set_sm, validation_set_sm = validation_set_sm, build_set_sm

            # lists for error metrics for current iteration (it)
            it_all_accuracy = []

            it_all_ace = []
            it_all_ece = []
            it_all_mce = []

            if n_classes > 2:
                labels = np.argmax(validation_set_sm, axis=1)
            else:
                labels = np.where(validation_set_sm > 0.5,
                                  np.ones_like(validation_set_gt),
                                  np.zeros_like(validation_set_gt))

            accuracy = np.mean(
                np.where(labels == validation_set_gt, np.ones_like(labels),
                         np.zeros_like(labels)))
            it_all_accuracy.append(accuracy)

            it_all_ace.append(ace.measure(validation_set_sm,
                                          validation_set_gt))
            it_all_ece.append(ece.measure(validation_set_sm,
                                          validation_set_gt))
            it_all_mce.append(mce.measure(validation_set_sm,
                                          validation_set_gt))

            # ------------------------------------------

            # build and save models
            for model in models:
                name, instance = model
                print("Build %s model" % name)

                instance.fit(build_set_sm, build_set_gt)
                if save_models:
                    instance.save_model("%s/models/%s-%s-%d.pkl" %
                                        (domain, network, name, i))

                prediction = instance.transform(validation_set_sm)
                calibrated_data[name] = prediction

                if n_classes > 2:
                    if prediction.ndim == 3:
                        prediction = np.mean(prediction, axis=0)

                    labels = np.argmax(prediction, axis=1)
                else:
                    if prediction.ndim == 2:
                        prediction = np.mean(prediction, axis=0)

                    labels = np.where(prediction > 0.5,
                                      np.ones_like(validation_set_gt),
                                      np.zeros_like(validation_set_gt))

                accuracy = np.mean(
                    np.where(labels == validation_set_gt, np.ones_like(labels),
                             np.zeros_like(labels)))
                it_all_accuracy.append(accuracy)

                it_all_ace.append(ace.measure(prediction, validation_set_gt))
                it_all_ece.append(ece.measure(prediction, validation_set_gt))
                it_all_mce.append(mce.measure(prediction, validation_set_gt))

            # append lists of current iterations
            all_accuracy.append(it_all_accuracy)
            all_ace.append(it_all_ace)
            all_ece.append(it_all_ece)
            all_mce.append(it_all_mce)

            filename = "%s/results/%s_%02d.npz" % (domain, network, it)
            with open(filename, "wb") as open_file:
                np.savez_compressed(open_file,
                                    train_gt=build_set_gt,
                                    test_gt=validation_set_gt,
                                    train_scores=build_set_sm,
                                    test_scores=validation_set_sm,
                                    **calibrated_data)

            it += 1

    # convert to NumPy arrays and reduce mean afterwards
    all_accuracy = np.array(all_accuracy)
    all_ace = np.array(all_ace)
    all_ece = np.array(all_ece)
    all_mce = np.array(all_mce)

    all_accuracy = np.mean(all_accuracy, axis=0)
    all_ace = np.mean(all_ace, axis=0)
    all_ece = np.mean(all_ece, axis=0)
    all_mce = np.mean(all_mce, axis=0)

    names = [len(x[0]) for x in models]
    buffer = max(names)

    # ---------------------------------------------------------
    # output formatted ECE
    fill = (buffer - len("Default")) * " "
    print(
        "%s%s Accuracy: %.5f - ACE: %.5f - ECE: %.5f - MCE: %.5f" %
        ("Default", fill, all_accuracy[0], all_ace[0], all_ece[0], all_mce[0]))

    # ---------------------------------------------------------
    for i, model in enumerate(models, start=1):
        name, instance = model
        fill = (buffer - len(name)) * " "
        print(
            "%s%s Accuracy: %.5f - ACE: %.5f - ECE: %.5f - MCE: %.5f" %
            (name, fill, all_accuracy[i], all_ace[i], all_ece[i], all_mce[i]))

    return 0
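A hedged sketch for reloading one of the per-iteration result files written above. The path pattern follows the code; the concrete file name is hypothetical, and each stored method key is assumed to hold 1-D or 2-D calibrated scores.

import numpy as np
from netcal.metrics import ECE

with open("./results/resnet50_00.npz", "rb") as open_file:    # hypothetical file
    results = np.load(open_file)
    test_gt = results["test_gt"]
    reserved = {"train_gt", "test_gt", "train_scores", "test_scores"}
    for key in results.files:
        if key not in reserved:
            # every remaining key holds the calibrated scores of one method
            print(key, ECE(15).measure(results[key], test_gt))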
Example #19
def cross_validation_5_2(models: list,
                         datafile: str,
                         bins: int,
                         save_models: bool = False) -> int:
    """
    5x2 cross validation on given methods on specified dataset.

    Parameters
    ----------
    models : list
        List of tuples with [('<name>', <instance of CalibrationMethod>), ...].
    datafile : str
        Path to datafile which contains two NumPy arrays with keys 'ground_truth' and 'predictions'.
    bins : int
        Number of bins used by ECE, MCE and ReliabilityDiagram.
    save_models : bool, optional, default: False
        True if instances of calibration methods should be stored.

    Returns
    -------
    int
        0 on success, -1 otherwise
    """

    if not os.path.exists(datafile):
        print("Dataset \'%s\' does not exist" % datafile)
        return -1

    # read NumPy input files
    try:
        with open(datafile, "rb") as open_file:
            npzfile = np.load(open_file)
            ground_truth = npzfile['ground_truth'].squeeze()
            predictions = npzfile['predictions'].squeeze()
    except KeyError:
        print(
            "Key \'ground_truth\' or \'predictions\' not found in file \'%s\'"
            % datafile)
        return -1

    if len(predictions.shape) == 2:
        n_classes = predictions.shape[1]
    else:
        n_classes = 2

    # initialize error metrics
    ace = ACE(bins)
    ece = ECE(bins)
    mce = MCE(bins)

    all_accuracy = []

    all_ace = []
    all_ece = []
    all_mce = []

    for i in range(5):
        # split data set into build set and validation set
        build_set_gt, validation_set_gt, build_set_sm, validation_set_sm = train_test_split(
            ground_truth, predictions, test_size=0.5, stratify=ground_truth)

        for _ in range(2):

            # 5x2 cross validation - flip build/val set after each iteration
            build_set_gt, validation_set_gt = validation_set_gt, build_set_gt
            build_set_sm, validation_set_sm = validation_set_sm, build_set_sm

            # lists for error metrics for current iteration (it)
            it_all_accuracy = []

            it_all_ace = []
            it_all_ece = []
            it_all_mce = []

            if n_classes > 2:
                labels = np.argmax(validation_set_sm, axis=1)
            else:
                labels = np.where(validation_set_sm > 0.5,
                                  np.ones_like(validation_set_gt),
                                  np.zeros_like(validation_set_gt))

            accuracy = np.mean(
                np.where(labels == validation_set_gt, np.ones_like(labels),
                         np.zeros_like(labels)))
            it_all_accuracy.append(accuracy)

            it_all_ace.append(ace.measure(validation_set_sm,
                                          validation_set_gt))
            it_all_ece.append(ece.measure(validation_set_sm,
                                          validation_set_gt))
            it_all_mce.append(mce.measure(validation_set_sm,
                                          validation_set_gt))

            # ------------------------------------------

            # build and save models
            for model in models:
                name, instance = model
                print("Build %s model" % name)

                instance.fit(build_set_sm, build_set_gt)
                if save_models:
                    instance.save_model("./models/%s_run_%d.pkl" % (name, i))

            # ------------------------------------------

            # perform predictions
            for model in models:
                _, instance = model
                prediction = instance.transform(validation_set_sm)

                if n_classes > 2:
                    labels = np.argmax(prediction, axis=1)
                else:
                    labels = np.where(prediction > 0.5,
                                      np.ones_like(validation_set_gt),
                                      np.zeros_like(validation_set_gt))

                accuracy = np.mean(
                    np.where(labels == validation_set_gt, np.ones_like(labels),
                             np.zeros_like(labels)))
                it_all_accuracy.append(accuracy)

                it_all_ace.append(ace.measure(prediction, validation_set_gt))
                it_all_ece.append(ece.measure(prediction, validation_set_gt))
                it_all_mce.append(mce.measure(prediction, validation_set_gt))

            # append lists of current iterations
            all_accuracy.append(it_all_accuracy)
            all_ace.append(it_all_ace)
            all_ece.append(it_all_ece)
            all_mce.append(it_all_mce)

    # convert to NumPy arrays and reduce mean afterwards
    all_accuracy = np.array(all_accuracy)
    all_ace = np.array(all_ace)
    all_ece = np.array(all_ece)
    all_mce = np.array(all_mce)

    all_accuracy = np.mean(all_accuracy, axis=0)
    all_ace = np.mean(all_ace, axis=0)
    all_ece = np.mean(all_ece, axis=0)
    all_mce = np.mean(all_mce, axis=0)

    names = [len(x[0]) for x in models]
    buffer = max(names)

    # ---------------------------------------------------------
    # output formatted ECE
    fill = (buffer - len("Default")) * " "
    print(
        "%s%s Accuracy: %.5f - ACE: %.5f - ECE: %.5f - MCE: %.5f" %
        ("Default", fill, all_accuracy[0], all_ace[0], all_ece[0], all_mce[0]))

    # ---------------------------------------------------------
    for i, model in enumerate(models, start=1):
        name, instance = model
        fill = (buffer - len(name)) * " "
        print(
            "%s%s Accuracy: %.5f - ACE: %.5f - ECE: %.5f - MCE: %.5f" %
            (name, fill, all_accuracy[i], all_ace[i], all_ece[i], all_mce[i]))

    return 0
Example #20
def evaluate(
    annotations,
    results,
    iou=0.75,
    iou_type="segm",
    dataset="lvis",
    n_bins=10,
    commercial_only=False,
    subset=1.0,
    seed=0.0,
    min_score=0.0,
    vis_dir=None,
    vis_per_class=False,
    max_dets=300,
    max_dets_per_class=-1,
):
    """
    Args:
        annotations (str, Path, or dict): Path to COCO/LVIS-style annotations, or
            dict containing the annotations.
        results (str, Path, or dict): Path to COCO/LVIS-style results, or dict
            containing the results.
        iou (float): IoU threshold to evaluate calibration at.
        iou_type (str): segm or bbox
        dataset (str): lvis or coco
        n_bins (int): Number of bins for calibration eval
        commercial_only (bool): Use only commercial images for COCO. Used to match
            Küppers et al. setting.
        subset (float): If <1.0, use a random subset of this portion for eval.
        seed (float): Used to seed the rng for subset selection.
        min_score (float): If specified, ignore detections below this threshold for
            calibration evaluation. This flag does not affect the AP calculation.
            This should generally be left at 0, but can be set to 0.3 to match the
            Küppers et al. setting.
        vis_dir (str, Path, or None): If specified, output reliability diagrams to this
            directory.
        vis_per_class (bool): If vis_dir is specified and vis_per_class is True, output
            a reliability diagram for each class.
        max_dets (int): Limit number of detections per image.
        max_dets_per_class (int): Limit number of detections per class.
    """
    if vis_dir is not None:
        vis_dir = Path(vis_dir)
        plotter = ReliabilityDiagram(bins=n_bins, detection=True, metric="ECE")
    else:
        plotter = None

    rng = random.Random(seed)
    eval_wrapper = EvalWrapper(
        annotations,
        results,
        dataset_type=dataset,
        ious=[iou],
        iou_type=iou_type,
        max_dets=max_dets,
        max_dets_per_class=max_dets_per_class,
    )
    eval_obj = eval_wrapper.construct_eval(use_cats=True)
    is_lvis = eval_wrapper.is_lvis()
    params = eval_obj.params
    gt = eval_obj.lvis_gt if is_lvis else eval_obj.cocoGt

    if commercial_only:
        # Licenses 1, 2, 3 are NonCommercial
        valid_licenses = {4, 5, 6, 7, 8}
        orig_img_ids = params.img_ids if is_lvis else params.imgIds
        img_ids = [
            i for i in orig_img_ids if gt.imgs[i]["license"] in valid_licenses
        ]
        logging.info(
            f"Selecting {len(img_ids)}/{len(orig_img_ids)} commercial images.")
        if is_lvis:
            params.img_ids = img_ids
        else:
            params.imgIds = img_ids

    if subset < 1.0:
        img_ids = params.img_ids if is_lvis else params.imgIds
        k = int(round(len(img_ids) * subset))
        logging.info(f"Selecting {k}/{len(img_ids)} images randomly.")
        rng.shuffle(img_ids)
        if is_lvis:
            params.img_ids = img_ids[:k]
        else:
            params.imgIds = img_ids[:k]

    eval_obj.evaluate()

    # True positive set
    true_positives, false_positives, missed_gt = load_tp_fp_fn(eval_obj)

    eval_obj.accumulate()
    eval_obj.summarize()

    # Map class id to list of (detection: dict, is_matched: bool)
    class_dets = defaultdict(list)
    for dt_id in true_positives:
        ann = eval_wrapper.results.anns[dt_id]
        class_dets[ann["category_id"]].append((ann, True))
    for dt_id in false_positives:
        ann = eval_wrapper.results.anns[dt_id]
        class_dets[ann["category_id"]].append((ann, False))

    if min_score > 0.0:
        class_dets = {
            c: [x for x in dets if x[0]["score"] > min_score]
            for c, dets in class_dets.items()
        }
        # Remove empty classes.
        class_dets = {c: v for c, v in class_dets.items() if v}

    # Map class id to tuple of (scores, is_matched)
    scores_matched = {
        c: (
            np.array([d["score"]
                      for d, _ in dets])[:, np.newaxis],  # scores, (n, 1)
            np.array([m for _, m in dets])[:,
                                           np.newaxis],  # is_matched, (n, 1)
        )
        for c, dets in class_dets.items()
    }
    classes = sorted(scores_matched.keys())

    all_scores = np.vstack([scores_matched[c][0] for c in classes])
    all_is_matched = np.vstack([scores_matched[c][1] for c in classes])

    ece = ECE([n_bins], detection=True)

    output_metrics = {}
    output_metrics["AP"] = eval_obj.results["AP"]
    if is_lvis:
        for f in ("f", "c", "r"):
            output_metrics[f"AP{f}"] = eval_obj.results[f"AP{f}"]
    output_metrics["ece-overall"] = ece.measure(all_scores, all_is_matched)
    if plotter:
        fig = plotter.plot(all_scores,
                           all_is_matched,
                           filename=vis_dir / f"overall.pdf")
        plt.close(fig)

    # NOTE: Skips classes with neither predictions nor groundtruth; classes with
    # groundtruth but no predictions are also excluded from the per-class ECE.
    per_class_eces = {}
    predicted_classes = set(scores_matched.keys())
    missed_classes = {gt.anns[g]["category_id"] for g in missed_gt}
    for cid in missed_classes | predicted_classes:
        if cid not in predicted_classes:  # Present but not predicted
            # Skip class from calibration error.
            continue
        else:
            scores, is_matched = scores_matched[cid]
            per_class_eces[cid] = ece.measure(scores, is_matched)
            if plotter and vis_per_class:
                cname = gt.cats[cid].get("synset", gt.cats[cid]["name"])
                fig = plotter.plot(scores,
                                   is_matched,
                                   filename=vis_dir /
                                   f"class-{cid}-{cname}.pdf")
                plt.close(fig)
    output_metrics["ece-per-class"] = np.mean(list(per_class_eces.values()))

    if eval_wrapper.is_lvis():
        # Map frequency to category ids (eval_obj.freq_groups maps to indices)
        for f, indices in enumerate(eval_obj.freq_groups):
            freq = eval_obj.params.img_count_lbl[f]
            cat_ids = [eval_obj.params.cat_ids[i] for i in indices]
            cat_ids = [c for c in cat_ids if c in scores_matched]
            freq_scores = np.vstack([scores_matched[c][0] for c in cat_ids])
            freq_matched = np.vstack([scores_matched[c][1] for c in cat_ids])
            output_metrics[f"ece-freq-{freq}"] = ece.measure(
                freq_scores, freq_matched)
            output_metrics[f"ece-per-class-{freq}"] = np.mean(
                [per_class_eces[c] for c in cat_ids if c in per_class_eces])
            if plotter:
                fig = plotter.plot(freq_scores,
                                   freq_matched,
                                   filename=vis_dir / f"freq-{freq}.pdf")
                plt.close(fig)

    return output_metrics
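A hedged usage sketch for `evaluate` with hypothetical annotation and result paths; it returns the AP and ECE metrics collected in `output_metrics`.

metrics = evaluate(
    annotations="annotations/lvis_v1_val.json",       # hypothetical path
    results="results/maskrcnn_lvis_val_segm.json",    # hypothetical path
    iou=0.75,
    iou_type="segm",
    dataset="lvis",
    n_bins=10,
)
print(metrics["ece-overall"], metrics["ece-per-class"])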
Example #21
def get_model_diagnosis(df,
                        strategy='quantile',
                        rps_col_prefix='model',
                        add_baseline=False):
    """
    Diagnosis Plots:
    Accepts a DataFrame containing columns:
    ordinal_result_1
    ordinal_result_2
    ordinal_result_3
    1
    2
    3
    The columns are paired as follows:
    "ordinal_result_1" represents a binary column defining
    whether a home win event occurred, and
    column named "1" contains the corresponding model probabilities
    Same for ordinal_result_2, and 2 and
    ordinal_result_3 and 3
    strategy: {'uniform', 'quantile'}, default='quantile'
    Strategy used to define the widths of the bins.
      uniform
        All bins have identical widths.
      quantile
        All bins have the same number of points.
    RPS Plots:
    Accepts a DataFrame containing columns:
    ordinal_result
    "rps_col_prefix"_rps
    and optional columns named
    rps_baseline_1
    2
    3
    The columns are paired as follows:
    "ordinal_result_1" represents a binary column defining
    whether a home win event occurred, and
    column named "1" contains the corresponding model probabilities
    Same for ordinal_result_2, and 2 and
    ordinal_result_3 and 3
    """
    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 10))
    ax1, ax2, ax3 = axes[:, 0]
    n_bins = 10
    mapper = {1: 'Home Win', 2: 'Draw', 3: 'Away Win'}
    for col, ax in zip([1, 2, 3], (ax1, ax2, ax3)):
        fop, mpv = calibration_curve(df['ordinal_result_' + str(col)],
                                     df[col],
                                     n_bins=n_bins,
                                     strategy=strategy)
        # plot perfectly calibrated
        ax.plot([0, 1], [0, 1], linestyle='--')
        # plot model reliability
        ax.plot(mpv, fop, marker='.')
        ax.set_title(mapper[col])

    ax4, ax5, ax6 = axes[:, 1]
    n_bins = 10
    mapper = {1: 'Home Win RPS', 2: 'Draw RPS', 3: 'Away Win RPS'}
    for col, ax in zip([1, 2, 3], (ax4, ax5, ax6)):
        rpss = df[df['ordinal_result'] == col][rps_col_prefix + '_rps']
        ax.hist(rpss, bins=n_bins)
        ax.set_xlim(0, 1.0)
        baseline_col_name = 'rps_baseline_' + str(col)
        if add_baseline and baseline_col_name in df.columns:
            ax.axvline(df[baseline_col_name].unique(), color='r')
        median = rpss.median()
        ax.axvline(median,
                   color='r',
                   linestyle='dashed',
                   label=f'Median: {median:.3f}')
        ax.set_title(mapper[col])
        ax.legend()
        ax.grid()

    pred_arr, act_arr = df[[1, 2, 3]].values, df['ordinal_result'].values

    ace = ACE(bins=n_bins)
    ace_val = ace.measure(pred_arr, act_arr)

    ece = ECE(bins=n_bins)
    ece_val = ece.measure(pred_arr, act_arr)

    mce = MCE(bins=n_bins)
    mce_val = mce.measure(pred_arr, act_arr)

    print(
        f'Average Calibration Error:  {ace_val:.3f}\nExpected Calibration Error: {ece_val:.3f}\nMaximum Calibration Error:  {mce_val:.3f}'
    )
    print(f"Number of Instances: {len(df)}")
    return fig, (ax1, ax2, ax3, ax4, ax5, ax6)
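A hedged sketch (synthetic data) of the DataFrame layout that `get_model_diagnosis` expects: binary outcome columns 'ordinal_result_1/2/3', matching probability columns 1/2/3, the combined 'ordinal_result', and a '<rps_col_prefix>_rps' column.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 500
probs = rng.dirichlet(np.ones(3), size=n)
outcome = rng.integers(1, 4, size=n)                 # 1: home win, 2: draw, 3: away win

df = pd.DataFrame({
    1: probs[:, 0], 2: probs[:, 1], 3: probs[:, 2],  # model probabilities
    'ordinal_result': outcome,
    'ordinal_result_1': (outcome == 1).astype(int),
    'ordinal_result_2': (outcome == 2).astype(int),
    'ordinal_result_3': (outcome == 3).astype(int),
    'model_rps': rng.uniform(0, 1, n),               # placeholder ranked probability scores
})

# df can then be passed to the function above:
# fig, axes = get_model_diagnosis(df)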