def compute_val():
    # Evaluate `model` on `validating_loader` (both assumed to be defined in the
    # enclosing scope) and report validation loss, F-scores, precision/recall and ECE.
    loss_function = nn.CrossEntropyLoss()
    with torch.no_grad():
        model.eval()
        y_pred = []
        output_prob_val = []
        output_logits_val = []
        y_val_hard = []
        for sent, label in validating_loader:
            y_val_hard.append(label.item())
            sent = sent.squeeze(0)
            if torch.cuda.is_available():
                sent = sent.cuda()
                label = label.cuda()
            output = model(sent)[0]
            logit, predicted = torch.max(output.data, 1)
            output_logits_val.append(output[0].cpu().tolist())
            output_prob_val.append(torch.sigmoid(output[0]).cpu().tolist())
            y_pred.append(predicted.item())
        loss_val = loss_function(torch.Tensor(output_logits_val),
                                 torch.LongTensor(y_val_hard)).item()
        model.train()
        # compute and plot ECE
        ece_val = ece_score(np.array(y_val_hard), np.array(output_prob_val))
        n_bins = 10
        title_suffix = ''
        diagram = ReliabilityDiagram(n_bins)
        diagram.plot(np.array(output_prob_val), np.array(y_val_hard),
                     title_suffix=title_suffix)
        # plt.savefig(title_suffix + '.pdf')

        # check if binary or multi class classification
        num_classes = len(set(y_val_hard))
        if num_classes == 2:
            average = 'binary'
        else:
            average = 'macro'
        pre_val, rec_val, f1_val, _ = precision_recall_fscore_support(
            y_val_hard, y_pred, average=average, beta=1)
        _, _, f01_val, _ = precision_recall_fscore_support(y_val_hard,
                                                           y_pred,
                                                           average=average,
                                                           beta=0.1)
        _, _, f10_val, _ = precision_recall_fscore_support(y_val_hard,
                                                           y_pred,
                                                           average=average,
                                                           beta=10)
        print(
            'Iteration: {}. Train Loss: {:1.5f}. Test Loss: {:1.5f}, F1: {:1.3f}, ECE: {:1.3f}, Precision: {:1.3f}, Recall: {:1.3f}'
            .format(i, loss.item(), loss_val, f1_val, ece_val, pre_val,
                    rec_val))
        # print to result file (note: mode 'w' truncates the file on every call)
        with open(res_path, 'w') as f:
            c = 'epoch, iter, loss_train, loss_test, pre_test, rec_test, f01_test, f1_test, f10_test, ece_test'
            f.write(c + '\n')
            res_i = '{}, {}, {}, {}, {}, {}, {}, {}, {}, {}\n'.format(
                epoch, i, loss.item(), loss_val, pre_val, rec_val, f01_val,
                f1_val, f10_val, ece_val)
            f.write(res_i)
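
The ece_score helper called in compute_val is not shown above. A minimal sketch of such a helper, assuming y_prob holds per-class scores of shape (n, n_classes) and y_true holds integer labels, could look like this:

import numpy as np

def ece_score(y_true: np.ndarray, y_prob: np.ndarray, n_bins: int = 10) -> float:
    # Hypothetical re-implementation of the Expected Calibration Error:
    # bin samples by their top confidence and average the |accuracy - confidence|
    # gap, weighted by the fraction of samples falling into each bin.
    confidences = y_prob.max(axis=1)
    predictions = y_prob.argmax(axis=1)
    accuracies = (predictions == y_true).astype(float)
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        mask = (confidences > lo) & (confidences <= hi)
        if mask.any():
            ece += mask.mean() * abs(accuracies[mask].mean() - confidences[mask].mean())
    return ece
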
def plot_results(bins: Union[tuple, list, int], data: dict, methods0d: list,
                 methods2d: list):
    """
    Plot results as reliability diagrams (either 0D or 2D).

    Parameters
    ----------
    bins : iterable or int
        Number of bins used by the reliability diagrams.
    data : dict
        Dictionary of calibration data.
    methods0d : list
        List with strings containing the keys for the calibration data (confidence only methods).
    methods2d : list
        List with strings containing the keys for the calibration data (2D methods).
    """

    for i, methods in enumerate([methods0d, methods2d]):

        # insert 'confidence' key to the first place in the list to keep track of default miscalibration
        methods = ['confidence'] + methods

        # confidence-only diagrams use a single bin value (the first one);
        # the 2D heatmaps below keep the full `bins` specification
        bins_0d = bins[0] if isinstance(bins, (tuple, list)) else bins

        # iterate over all calibration models and plot reliability diagram
        for method in methods:
            diagram = ReliabilityDiagram(bins_0d,
                                         detection=True,
                                         title_suffix=method)
            fig = diagram.plot(data[method], data['matched'])

        # --------------------------------------------
        # second, plot 2D reliability diagrams as heatmaps
        for method in methods:
            data_input = np.stack((data[method], data['cx'], data['cy']),
                                  axis=1)

            diagram = ReliabilityDiagram(bins,
                                         detection=True,
                                         feature_names=['cx', 'cy'],
                                         fmin=0.0,
                                         fmax=0.3,
                                         title_suffix=method)
            fig = diagram.plot(data_input, data['matched'])

    plt.show()
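
A hypothetical call with synthetic detection data; the dictionary keys and shapes below are assumptions based on how plot_results indexes data:

import numpy as np

rng = np.random.default_rng(0)
n = 1000
data = {
    'confidence': rng.uniform(size=n),      # uncalibrated detection confidences
    'histogram': rng.uniform(size=n),       # confidences after some calibrator
    'matched': rng.integers(0, 2, size=n),  # 1 if the detection matched a ground-truth box
    'cx': rng.uniform(size=n),              # relative box center x
    'cy': rng.uniform(size=n),              # relative box center y
}
plot_results(bins=[15, 8, 8], data=data, methods0d=['histogram'], methods2d=['histogram'])
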
Example #3
    def script():
        '''
                    LOAD DATA
        '''
        data_path = '../data/'
        train_data = pd.read_csv(data_path + 'v3.2.2_train.csv')
        test_data = pd.read_csv(data_path + 'v3.2.2_test.csv')

        y_train, y_test = train_data[['TOT_fast', 'TOT_med_fast', 'TOT_med', 'TOT_med_slow', 'TOT_slow']], \
                          test_data[['TOT_fast', 'TOT_med_fast', 'TOT_med', 'TOT_med_slow', 'TOT_slow']]
        X_train, X_test = train_data.drop(
            ['Unnamed: 0', 'TOT_fast', 'TOT_med_fast', 'TOT_med', 'TOT_med_slow', 'TOT_slow'], axis=1), \
                          test_data.drop(
                              ['Unnamed: 0', 'TOT_fast', 'TOT_med_fast', 'TOT_med', 'TOT_med_slow', 'TOT_slow'], axis=1)
        model = load_model("../network/models/v3.2.2/model.h5")
        print("# of train samples: ", len(y_train.index))
        print("# of test samples: ", len(y_test.index))

        # Using the netcal package
        n_bins = 10
        confidences = model.predict(X_test.values)
        ece = ECE(n_bins)
        uncalibrated_score = ece.measure(confidences,
                                         y_test.values.argmax(axis=1))
        print("Calibration Error before calibration: ", uncalibrated_score)

        temperature = TemperatureScaling()
        temperature.fit(confidences, y_test.values.argmax(axis=1))
        calibrated = temperature.transform(confidences)
        ece = ECE(n_bins)
        calibrated_score = ece.measure(calibrated,
                                       y_test.values.argmax(axis=1))
        print("Calibration Error after calibration: ", calibrated_score)

        diagram = ReliabilityDiagram(n_bins)
        # visualize miscalibration of the uncalibrated and the calibrated model
        diagram.plot(confidences, y_test.values.argmax(axis=1))
        diagram.plot(calibrated, y_test.values.argmax(axis=1))

        np.savetxt('./calibration-data/test_calibrated_v3.2.2.csv',
                   calibrated,
                   delimiter=',')
# plot the reliability curve; `mean_pred` and `frac_pos` are assumed to be the
# outputs of sklearn.calibration.calibration_curve (the start of this snippet is truncated)
ax1.plot(mean_pred, frac_pos, 's-',
         label='model',
         lw=1,
         color='red')

ax1.set_ylabel("Fraction of positives")
ax1.set_ylim([-0.05, 1.05])
ax1.legend(loc="upper left")
ax1.set_title('Calibration plots (reliability curve)')

ax2.set_xlabel("Mean predicted value")
ax2.set_ylabel("Count")
ax2.legend(loc="upper center", ncol=2)
plt.tight_layout()

# Temperature scaling for probability calibration using the netcal package
from netcal.scaling import TemperatureScaling
temperature = TemperatureScaling()
temperature.fit(y_prob, y_all)
calibrated = temperature.transform(y_prob)

# Compute the expected calibration error
from netcal.metrics import ECE
from netcal.presentation import ReliabilityDiagram
n_bins = 10
ece = ECE(n_bins)
uncalibrated_score = ece.measure(y_new, y_test)
calibrated_score = ece.measure(calibrated, y_test)

diagram = ReliabilityDiagram(n_bins)
diagram.plot(y_new, y_test)  # visualize miscalibration of uncalibrated
diagram.plot(calibrated, y_test)  # visualize miscalibration of calibrated
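
The TemperatureScaling object above hides the optimization; conceptually, temperature scaling learns a single scalar T and rescales the logits before squashing them back into probabilities. A minimal sketch for the binary case (an illustration under that assumption, not netcal's implementation):

import numpy as np
from scipy.optimize import minimize_scalar

def fit_temperature(probs, labels, eps=1e-7):
    # Recover logits from probabilities, then find the T that minimizes the NLL
    # of sigmoid(logits / T) on the held-out labels.
    p = np.clip(probs, eps, 1 - eps)
    logits = np.log(p) - np.log(1 - p)

    def nll(t):
        q = np.clip(1.0 / (1.0 + np.exp(-logits / t)), eps, 1 - eps)
        return -np.mean(labels * np.log(q) + (1 - labels) * np.log(1 - q))

    return minimize_scalar(nll, bounds=(0.05, 20.0), method='bounded').x
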
Example #5
def plot_reliability_diagram(y_true, y_prob, n_bins=10, title_suffix=''):
    diagram = ReliabilityDiagram(n_bins)
    diagram.plot(y_prob, y_true, title_suffix=title_suffix)
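
For example, with synthetic binary data (assuming numpy and netcal's ReliabilityDiagram are imported as above):

y_true = np.random.randint(0, 2, size=500)
y_prob = np.clip(y_true * 0.7 + np.random.rand(500) * 0.3, 0.0, 1.0)
plot_reliability_diagram(y_true, y_prob, n_bins=10, title_suffix='demo')
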
Example #6
def single_example(models: list,
                   datafile: str,
                   bins: int,
                   diagram: str = None,
                   validation_split: float = 0.7,
                   save_models: bool = False,
                   domain: str = ".") -> int:
    """
    Measure miscalibration of given methods on specified dataset.

    Parameters
    ----------
    models : list
        List of tuples with [('<name>', <instance of CalibrationMethod>), ...].
    datafile : str
        Path to datafile which contains two NumPy arrays with keys 'ground_truth' and 'predictions'.
    bins : int
        Number of bins used by ECE, MCE and ReliabilityDiagram.
    diagram : str, optional, default: None
        Type of diagram which should be plotted. This could be 'diagram', 'curve', 'inference' or None.
    validation_split : float
        Split ratio between build set and validation set.
    save_models : bool
        True if instances of calibration methods should be stored.
    domain : str, optional, default: "."
        Domain/directory where to store the results.

    Returns
    -------
    int
        0 on success, -1 otherwise
    """

    if not os.path.exists(datafile):
        print("Dataset \'%s\' does not exist" % datafile)
        return -1

    # read NumPy input files
    try:
        with open(datafile, "rb") as open_file:
            npzfile = np.load(open_file)
            ground_truth = npzfile['ground_truth'].squeeze()
            predictions = npzfile['predictions'].squeeze()
    except KeyError:
        print(
            "Key \'ground_truth\' or \'predictions\' not found in file \'%s\'"
            % datafile)
        return -1

    # split data set into build set and validation set
    build_set_gt, validation_set_gt, build_set_sm, validation_set_sm = train_test_split(
        ground_truth,
        predictions,
        test_size=validation_split,
        stratify=ground_truth,
        random_state=None)

    # initialize error metrics
    ace = ACE(bins)
    ece = ECE(bins)
    mce = MCE(bins)

    predictions = []
    all_ace = [ace.measure(validation_set_sm, validation_set_gt)]
    all_ece = [ece.measure(validation_set_sm, validation_set_gt)]
    all_mce = [mce.measure(validation_set_sm, validation_set_gt)]

    # ------------------------------------------

    # build and save models
    for model in models:
        name, instance = model
        print("Build %s model" % name)
        instance.fit(build_set_sm, build_set_gt)

        if save_models:
            instance.save_model("%s/models/%s.pkl" % (domain, name))

    # ------------------------------------------

    # perform predictions
    for model in models:
        _, instance = model
        prediction = instance.transform(validation_set_sm)
        predictions.append(prediction)

        all_ace.append(ace.measure(prediction, validation_set_gt))
        all_ece.append(ece.measure(prediction, validation_set_gt))
        all_mce.append(mce.measure(prediction, validation_set_gt))

    # ------------------------------------------

    # output formatted ECE
    names = [len(x[0]) for x in models]
    buffer = max(names)

    fill = (buffer - len("Default")) * " "
    print("%s%s ACE: %.5f - ECE: %.5f - MCE: %.5f" %
          ("Default", fill, all_ace[0], all_ece[0], all_mce[0]))
    for i, model in enumerate(models, start=1):
        name, instance = model
        fill = (buffer - len(name)) * " "
        print("%s%s ACE: %.5f - ECE: %.5f - MCE: %.5f" %
              (name, fill, all_ace[i], all_ece[i], all_mce[i]))

    # ------------------------------------------

    if diagram == 'diagram':

        diagram = ReliabilityDiagram(bins=bins, title_suffix="default")
        diagram.plot(validation_set_sm, validation_set_gt, filename="test.png")
        for i, prediction in enumerate(predictions):
            diagram = ReliabilityDiagram(bins=bins, title_suffix=models[i][0])
            diagram.plot(prediction, validation_set_gt)

    elif diagram is None:
        pass
    else:
        print("Unknown diagram type \'%s\'" % diagram)
        return -1

    return 0
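
A hypothetical invocation; 'predictions.npz' stands in for a real file that contains the 'ground_truth' and 'predictions' arrays expected above:

from netcal.scaling import TemperatureScaling, LogisticCalibration
from netcal.binning import HistogramBinning

methods = [('temperature', TemperatureScaling()),
           ('logistic', LogisticCalibration()),
           ('histogram', HistogramBinning(20))]
single_example(methods, 'predictions.npz', bins=15, diagram='diagram')
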
Example #7
def evaluate(
    annotations,
    results,
    iou=0.75,
    iou_type="segm",
    dataset="lvis",
    n_bins=10,
    commercial_only=False,
    subset=1.0,
    seed=0.0,
    min_score=0.0,
    vis_dir=None,
    vis_per_class=False,
    max_dets=300,
    max_dets_per_class=-1,
):
    """
    Args:
        annotations (str, Path, or dict): Path to COCO/LVIS-style annotations, or
            dict containing the annotations.
        results (str, Path, or dict): Path to COCO/LVIS-style results, or dict
            containing the results.
        iou (float): IoU threshold to evaluate calibration at.
        iou_type (str): segm or bbox
        dataset (str): lvis or coco
        n_bins (int): Number of bins for calibration eval
        commercial_only (bool): Use only commercial images for COCO. Used to match
            Küppers et al. setting.
        subset (float): If <1.0, use a random subset of this portion for eval.
        seed (float): Used to seed the rng for subset selection.
        min_score (float): If specified, ignore detections below this threshold for
            calibration evaluation. This flag does not affect the AP calculation.
            This should generally be left at 0, but can be set to 0.3 to match the
            Küppers et al. setting.
        vis_dir (str, Path, or None): If specified, output reliability diagrams to this
            directory.
        vis_per_class (bool): If vis_dir is specified and vis_per_class is True, output
            a reliability diagram for each class.
        max_dets (int): Limit number of detections per image.
        max_dets_per_class (int): Limit number of detections per class.
    """
    if vis_dir is not None:
        vis_dir = Path(vis_dir)
        plotter = ReliabilityDiagram(bins=n_bins, detection=True, metric="ECE")
    else:
        plotter = None

    rng = random.Random(seed)
    eval_wrapper = EvalWrapper(
        annotations,
        results,
        dataset_type=dataset,
        ious=[iou],
        iou_type=iou_type,
        max_dets=max_dets,
        max_dets_per_class=max_dets_per_class,
    )
    eval_obj = eval_wrapper.construct_eval(use_cats=True)
    is_lvis = eval_wrapper.is_lvis()
    params = eval_obj.params
    gt = eval_obj.lvis_gt if is_lvis else eval_obj.cocoGt

    if commercial_only:
        # Licenses 1, 2, 3 are NonCommercial
        valid_licenses = {4, 5, 6, 7, 8}
        orig_img_ids = params.img_ids if is_lvis else params.imgIds
        img_ids = [
            i for i in orig_img_ids if gt.imgs[i]["license"] in valid_licenses
        ]
        logging.info(
            f"Selecting {len(img_ids)}/{len(orig_img_ids)} commercial images.")
        if is_lvis:
            params.img_ids = img_ids
        else:
            params.imgIds = img_ids

    if subset < 1.0:
        img_ids = params.img_ids if is_lvis else params.imgIds
        k = int(round(len(img_ids) * subset))
        logging.info(f"Selecting {k}/{len(img_ids)} images randomly.")
        rng.shuffle(img_ids)
        if is_lvis:
            params.img_ids = img_ids[:k]
        else:
            params.imgIds = img_ids[:k]

    eval_obj.evaluate()

    # True positive set
    true_positives, false_positives, missed_gt = load_tp_fp_fn(eval_obj)

    eval_obj.accumulate()
    eval_obj.summarize()

    # Map class id to list of (detection: dict, is_matched: bool)
    class_dets = defaultdict(list)
    for dt_id in true_positives:
        ann = eval_wrapper.results.anns[dt_id]
        class_dets[ann["category_id"]].append((ann, True))
    for dt_id in false_positives:
        ann = eval_wrapper.results.anns[dt_id]
        class_dets[ann["category_id"]].append((ann, False))

    if min_score > 0.0:
        class_dets = {
            c: [x for x in dets if x[0]["score"] > min_score]
            for c, dets in class_dets.items()
        }
        # Remove empty classes.
        class_dets = {c: v for c, v in class_dets.items() if v}

    # Map class id to tuple of (scores, is_matched)
    scores_matched = {
        c: (
            np.array([d["score"] for d, _ in dets])[:, np.newaxis],  # scores, (n, 1)
            np.array([m for _, m in dets])[:, np.newaxis],           # is_matched, (n, 1)
        )
        for c, dets in class_dets.items()
    }
    classes = sorted(scores_matched.keys())

    all_scores = np.vstack([scores_matched[c][0] for c in classes])
    all_is_matched = np.vstack([scores_matched[c][1] for c in classes])

    ece = ECE([n_bins], detection=True)

    output_metrics = {}
    output_metrics["AP"] = eval_obj.results["AP"]
    if is_lvis:
        for f in ("f", "c", "r"):
            output_metrics[f"AP{f}"] = eval_obj.results[f"AP{f}"]
    output_metrics["ece-overall"] = ece.measure(all_scores, all_is_matched)
    if plotter:
        fig = plotter.plot(all_scores,
                           all_is_matched,
                           filename=vis_dir / f"overall.pdf")
        plt.close(fig)

    # NOTE: Classes with neither predictions nor groundtruth never appear here;
    # classes with groundtruth but no predictions are skipped in the per-class ECE.
    per_class_eces = {}
    predicted_classes = set(scores_matched.keys())
    missed_classes = {gt.anns[g]["category_id"] for g in missed_gt}
    for cid in missed_classes | predicted_classes:
        if cid not in predicted_classes:  # Present but not predicted
            # Skip class from calibration error.
            continue
        else:
            scores, is_matched = scores_matched[cid]
            per_class_eces[cid] = ece.measure(scores, is_matched)
            if plotter and vis_per_class:
                cname = gt.cats[cid].get("synset", gt.cats[cid]["name"])
                fig = plotter.plot(scores,
                                   is_matched,
                                   filename=vis_dir /
                                   f"class-{cid}-{cname}.pdf")
                plt.close(fig)
    output_metrics["ece-per-class"] = np.mean(list(per_class_eces.values()))

    if eval_wrapper.is_lvis():
        # Map frequency to category ids (eval_obj.freq_groups maps to indices)
        for f, indices in enumerate(eval_obj.freq_groups):
            freq = eval_obj.params.img_count_lbl[f]
            cat_ids = [eval_obj.params.cat_ids[i] for i in indices]
            cat_ids = [c for c in cat_ids if c in scores_matched]
            freq_scores = np.vstack([scores_matched[c][0] for c in cat_ids])
            freq_matched = np.vstack([scores_matched[c][1] for c in cat_ids])
            output_metrics[f"ece-freq-{freq}"] = ece.measure(
                freq_scores, freq_matched)
            output_metrics[f"ece-per-class-{freq}"] = np.mean(
                [per_class_eces[c] for c in cat_ids if c in per_class_eces])
            if plotter:
                fig = plotter.plot(freq_scores,
                                   freq_matched,
                                   filename=vis_dir / f"freq-{freq}.pdf")
                plt.close(fig)

    return output_metrics
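
A hypothetical invocation; the annotation and result paths below are placeholders for LVIS/COCO-style JSON files:

metrics = evaluate(
    annotations="data/lvis_v1_val.json",
    results="outputs/detections.segm.json",
    iou=0.75,
    iou_type="segm",
    dataset="lvis",
    n_bins=10,
    vis_dir="outputs/reliability",
)
print(metrics["AP"], metrics["ece-overall"])
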
Example #8
def transform(frames: List[Dict], dataset: str, network: str, subset: List,
              ious: List, test_ids: List[int]):
    """
    After calibration training, evaluate the trained models by several miscalibration metrics. These metrics are:
    D-ECE, Brier, NLL. Also capture area under precision-recall curve (AUPRC).
    All results are stored at "./output/<network>".

    Parameters
    ----------
    frames : List[Dict]
        List of dictionaries holding the input data for each image frame.
    dataset : str
        String of the used dataset (see detectron2 registered datasets).
    network : str
        String describing the base neural network.
    subset : List[str]
        List with additional features used for calibration. Options are:
        - 'cx'
        - 'cy'
        - 'w'
        - 'h'
    ious : List[float]
        List with IoU scores used for evaluation.
    test_ids : List[int]
        List of data frame ids used for calibration testing.
    """

    # get meta information and specify all relevant paths
    meta = MetadataCatalog.get(dataset)
    model_dir = os.path.join("calibration", network, "models")
    output_dir = os.path.join("output", network)
    diagram_path = os.path.join(
        output_dir, "diagrams",
        ''.join(subset) if len(subset) > 0 else "confidence")

    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(diagram_path, exist_ok=True)

    # calibration methods that have also been used for calibration training
    methods = [("histogram", HistogramBinning), ("lr", LogisticCalibration),
               ("lr_dependent", LogisticCalibrationDependent),
               ("betacal", BetaCalibration),
               ("betacal_dependent", BetaCalibrationDependent)]

    # reverse mapping of category ids to network class ids (e.g. for COCO dataset)
    if hasattr(meta, "thing_dataset_id_to_contiguous_id"):
        reverse_dictionary = {
            v: k
            for k, v in meta.thing_dataset_id_to_contiguous_id.items()
        }
    else:
        reverse_dictionary = None

    # lists and placeholders for evaluation metrics
    n_samples_total = 0
    n_samples_per_class = []
    dece_per_class = [[[] for _ in ious] for _ in range(len(methods) + 1)]

    brier_per_class = [[[] for _ in ious] for _ in range(len(methods) + 1)]
    nll_per_class = [[[] for _ in ious] for _ in range(len(methods) + 1)]
    average_precision = [[[] for _ in ious] for _ in range(len(methods) + 1)]

    # -----------------------------------------------------
    # visualization routine
    diagram0d = ReliabilityDiagram(bins=20, detection=True, sample_threshold=8)
    diagram1d = ReliabilityDiagram(bins=[5, 15],
                                   detection=True,
                                   sample_threshold=3,
                                   fmin=0,
                                   fmax=0.3)
    diagram2d = ReliabilityDiagram(bins=[6, 9, 9],
                                   detection=True,
                                   sample_threshold=2,
                                   fmin=0,
                                   fmax=0.3)

    def plot(f: np.ndarray, m: np.ndarray, title: str, formatter: str):
        """Render the 0D, 1D and 2D reliability diagrams for the given features."""

        # plot baseline miscalibration
        figures = [
            diagram0d.plot(f[:, :1],
                           m,
                           tikz=False,
                           title_suffix=title,
                           filename=formatter % "0d")
        ]

        # plot all additional features in 1D miscalibration plots
        for i, fname in enumerate(['cx', 'cy', 'w', 'h']):
            figures.append(
                diagram1d.plot(f[:, (0, i + 1)],
                               m,
                               tikz=False,
                               feature_names=[fname],
                               title_suffix=title,
                               filename=formatter % ("1d_%s" % fname)))

        # finally, plot all feature combinations of size 2
        for (i, fname1), (j, fname2) in itertools.combinations(
                enumerate(['cx', 'cy', 'w', 'h']), 2):
            figures.append(
                diagram2d.plot(f[:, (0, i + 1, j + 1)],
                               m,
                               tikz=False,
                               feature_names=[fname1, fname2],
                               title_suffix=title,
                               filename=formatter % ("2d_%s_%s" %
                                                     (fname1, fname2))))

        # free memory space
        for fig in figures:
            plt.close(fig)

    # -----------------------------------------------------

    # iterate over all classes that are present in the current dataset
    for i, classname in enumerate(meta.thing_classes):

        # get calibration features for selected class
        category_id = reverse_dictionary[
            i] if reverse_dictionary is not None else i
        features, matched, img_ids = get_features(frames, category_id, subset,
                                                  ious, test_ids)
        all_features, _, _ = get_features(frames, category_id,
                                          ['cx', 'cy', 'w', 'h'], ious,
                                          test_ids)

        if features.size == 0:
            print("No samples for category %s found" % classname)
            continue

        # different binning schemes for different feature dimensions
        if features.shape[1] == 1:
            bins = 20
        elif features.shape[1] == 3:
            bins = 8
        elif features.shape[1] == 5:
            bins = 5
        else:
            raise ValueError("Unknown dimension: %d" % features.shape[1])

        # define D-ECE metric
        dece = ECE(bins=bins, detection=True, sample_threshold=8)
        n_samples_per_class.append(features.shape[0])
        n_samples_total += features.shape[0]

        # failed flag is required to optionally blank failed or non-present classes during evaluation
        # i.e., if a metric returns NaN
        failed = False

        # perform evaluation for each category separately
        print("Inference: category %d: %d samples" %
              (category_id, features.shape[0]))
        for j, (iou, m) in enumerate(zip(ious, matched)):

            score = average_precision_score(m, features[:, 0])
            if not np.isfinite(score) or np.isnan(score):
                brier_per_class[0][j].append(0.)
                nll_per_class[0][j].append(0.)
                dece_per_class[0][j].append(0.)
                average_precision[0][j].append(0.)
                failed = True

            # compute average precision, Brier, NLL and ECE
            else:
                brier_per_class[0][j].append(
                    np.mean(np.square(features[:, 0] - m)))
                nll_per_class[0][j].append(
                    -np.mean(m * np.log(features[:, 0]) +
                             (1. - m) * np.log(1. - features[:, 0])))
                dece_per_class[0][j].append(dece.measure(features, m))
                average_precision[0][j].append(score)

            diagramname = os.path.join(
                diagram_path,
                "default_cls-%02d_iou%.2f" % (i, iou) + "_%s.tex")
            plot(all_features, m, title="default", formatter=diagramname)

            # start calibration evaluation for each method separately
            for k, (name, method) in enumerate(methods, start=1):
                instance = method()

                try:
                    print("Load %s and transform" % name)
                    instance.load_model(
                        os.path.join(
                            model_dir, "%s_%s_iou%.2f_cls-%02d.pkl" %
                            (name, ''.join(subset), iou, i)))
                    calibrated = instance.transform(features)

                    # perform clipping
                    np.clip(calibrated,
                            np.finfo(np.float32).eps,
                            1. - np.finfo(np.float32).eps,
                            out=calibrated)
                    score = average_precision_score(m, calibrated)
                    if not np.isfinite(score) or np.isnan(score):
                        raise ValueError("Couldn't compute AUPRC score")

                    average_precision[k][j].append(score)

                    brier_per_class[k][j].append(
                        np.mean(np.square(calibrated - m)))
                    nll_per_class[k][j].append(
                        -np.mean(m * np.log(calibrated) +
                                 (1. - m) * np.log(1. - calibrated)))

                    input = np.concatenate(
                        (np.reshape(calibrated, (-1, 1)), features[:, 1:]),
                        axis=1)
                    dece_per_class[k][j].append(dece.measure(input, m))

                    diagramname = os.path.join(
                        diagram_path,
                        "%s_cls-%02d_iou%.2f" % (name, i, iou) + "_%s.tex")
                    input = np.concatenate(
                        (np.reshape(calibrated, (-1, 1)), all_features[:, 1:]),
                        axis=1)
                    plot(input, m, title=name, formatter=diagramname)

                except (FileNotFoundError, ValueError):
                    print(
                        "Could not load weight file or evaluation failed for",
                        os.path.join(
                            model_dir, "%s_%s_iou%.2f_cls-%02d.pkl" %
                            (name, ''.join(subset), iou, i)))
                    print("Disable evaluation for class %d" % i)

                    brier_per_class[k][j].append(0.)
                    nll_per_class[k][j].append(0.)
                    dece_per_class[k][j].append(0.)
                    average_precision[k][j].append(0.)

                    failed = True

        if failed:
            n_samples_total -= n_samples_per_class[-1]
            n_samples_per_class[-1] = 0

    # convert all lists to NumPy arrays
    weights = np.array(n_samples_per_class) / n_samples_total
    brier_per_class = np.array(brier_per_class)
    nll_per_class = np.array(nll_per_class)
    dece_per_class = np.array(dece_per_class)
    average_precision = np.array(average_precision)

    # compute an unweighted (macro) average and a sample-weighted counterpart
    brier_global = np.mean(brier_per_class, axis=2)
    weighted_brier_global = np.average(brier_per_class,
                                       weights=weights,
                                       axis=2)
    nll_global = np.mean(nll_per_class, axis=2)
    weighted_nll_global = np.average(nll_per_class, weights=weights, axis=2)
    dece_global = np.mean(dece_per_class, axis=2)
    weighted_dece_global = np.average(dece_per_class, weights=weights, axis=2)
    average_precision_macro = np.mean(average_precision, axis=2)
    average_precision_weighted = np.average(average_precision,
                                            weights=weights,
                                            axis=2)

    # use tabulate library to visualize the evaluation results
    header = []
    body = [['default']]
    body.extend([[name] for name, method in methods])
    for i, iou in enumerate(ious):

        header.extend([
            'D-ECE(w) @ IoU %.2f' % iou,
            'D-ECE @ IoU %.2f' % iou,
            'Brier(w) @ IoU %.2f' % iou,
            'Brier @ IoU %.2f' % iou,
            'NLL(w) @ IoU %.2f' % iou,
            'NLL @ IoU %.2f' % iou,
            'AP(w) @ IoU %.2f' % iou,
            'AP @ IoU %.2f' % iou
        ])
        body[0].extend([
            weighted_dece_global[0][i], dece_global[0][i],
            weighted_brier_global[0][i], brier_global[0][i],
            weighted_nll_global[0][i], nll_global[0][i],
            average_precision_weighted[0][i], average_precision_macro[0][i]
        ])
        for k, (name, method) in enumerate(methods):
            body[k + 1].extend([
                weighted_dece_global[k + 1][i], dece_global[k + 1][i],
                weighted_brier_global[k + 1][i], brier_global[k + 1][i],
                weighted_nll_global[k + 1][i], nll_global[k + 1][i],
                average_precision_weighted[k + 1][i],
                average_precision_macro[k + 1][i]
            ])

    results = [header, *body]

    # also write the evaluation results to CSV format
    print("\nEvaluation Results:")
    print(tabulate(results, headers="firstrow"))
    with open(os.path.join(output_dir, "results_%s.csv" % ''.join(subset)),
              "w") as open_file:
        writer = csv.writer(open_file)
        writer.writerow([
            "method",
        ] + results[0])
        writer.writerows(results[1:])
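
A hypothetical invocation; it assumes frames and test_ids come from the matching data-preparation step and that the dataset name is registered in detectron2:

transform(frames,
          dataset="coco_2017_val",
          network="faster_rcnn_R_50_FPN",
          subset=['cx', 'cy'],
          ious=[0.50, 0.75],
          test_ids=test_ids)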