def evaluate_predictions(self, results_list, monitor_metrics=None):
        """
        Match predicted boxes against ground truth boxes and store the outcome in ``self.test_df``.

        Loops over all matching IoUs (``self.cf.ap_match_ious``) and all foreground classes
        (keys of ``self.cf.class_dict``). Detections scoring below ``self.cf.min_det_thresh`` are ignored.
        Each evaluated ROI becomes one row of ``self.test_df`` with the columns:
        pred_score: confidence score of the detection (0 for false negatives and patient dummies).
        class_label: 1 for rows backed by a ground truth box ('det_tp', 'det_fn'), else 0.
        pred_class: foreground class the row was evaluated for.
        pid: corresponding patient-id.
        det_type: 'det_tp', 'det_fp', 'det_fn' or 'patient_tn' (dummy row for a patient/sample that
        produced no row at all for the current class and matching IoU).
        fold: ``self.cf.fold``.
        match_iou: utilized IoU for matching.

        :param results_list: list of model predictions. Either from train/val_sampling (patch processing) with form:
        [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...]
        Or from val_patient/testing (patient processing), with form: [[results_0, pid_0], [results_1, pid_1], ...])
        :param monitor_metrics (optional): passed on to ``self.return_metrics`` if not None.
        :return: ``self.return_metrics(monitor_metrics)`` if monitor_metrics is provided, else None.
        """

        def _match_rois(tar_boxes, cand_boxes, cand_scores, match_iou):
            """Return [(pred_score, class_label, det_type), ...] for one image and one class."""
            has_tar = tar_boxes.size > 0
            has_cand = cand_boxes.size > 0

            if not has_cand:
                # only fn (or nothing at all if there is no ground truth either).
                return [(0, 1, 'det_fn')] * tar_boxes.shape[0] if has_tar else []
            if not has_tar:
                # only fp.
                return [(score, 0, 'det_fp') for score in cand_scores]

            overlaps = mutils.compute_overlaps(cand_boxes, tar_boxes)
            best_overlap = np.max(overlaps, 1)
            match_cand_ixs = np.argwhere(best_overlap > match_iou)[:, 0]
            non_match_cand_ixs = np.argwhere(best_overlap <= match_iou)[:, 0]
            if match_cand_ixs.shape[0] > 0:
                match_gt_ixs = np.argmax(overlaps[match_cand_ixs, :], 1)
            else:
                match_gt_ixs = np.array([])
            unique, counts = np.unique(match_gt_ixs, return_counts=True)
            # every ground truth box that is nobody's best match is a false negative.
            n_unmatched_gt = tar_boxes.shape[0] - unique.shape[0]

            roi_rows = []
            # check for double assignments, i.e. two predictions having been assigned to the same gt.
            # according to the COCO-metrics, only one prediction (here: the highest scoring one) counts as
            # true positive, the rest counts as false positive. This case is supposed to be avoided by the
            # model itself by, e.g. using a low enough NMS threshold.
            if np.any(counts > 1):
                keep = np.ones(match_cand_ixs.shape[0], dtype=bool)
                for dg in unique[counts > 1]:
                    members = np.flatnonzero(match_gt_ixs == dg)
                    best = members[np.argmax(cand_scores[match_cand_ixs[members]])]
                    keep[members] = False
                    keep[best] = True
                roi_rows += [(score, 0, 'det_fp')
                             for score in cand_scores[match_cand_ixs[~keep]]]
                match_cand_ixs = match_cand_ixs[keep]

            # matched:
            roi_rows += [(score, 1, 'det_tp') for score in cand_scores[match_cand_ixs]]
            # rest fp:
            roi_rows += [(score, 0, 'det_fp') for score in cand_scores[non_match_cand_ixs]]
            # rest fn:
            roi_rows += [(0, 1, 'det_fn')] * n_unmatched_gt
            return roi_rows

        # gets results_list = [[batch_instances_box_lists], [batch_instances_pids]]*n_batches
        # we want to evaluate one batch_instance (= 2D or 3D image) at a time.
        self.logger.info('evaluating in mode {}'.format(self.mode))

        if self.mode == 'train' or self.mode == 'val_sampling':
            # batch_size > 1, with varying patients across batch:
            # [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...]
            # -> [results_0, results_1, ..] , [pid_0, pid_1, ...]
            batch_elements_list = [[b_box_list] for item in results_list
                                   for b_box_list in item[0]]
            pid_list = [pid for item in results_list for pid in item[1]]
        else:
            # patient processing, one element per batch = one patient.
            # [[results_0, pid_0], [results_1, pid_1], ...] -> [results_0, results_1, ..] , [pid_0, pid_1, ...]
            batch_elements_list = [item[0] for item in results_list]
            pid_list = [item[1] for item in results_list]

        # The score threshold does not depend on class or matching IoU, so low-confidence detections
        # are dropped once up front instead of inside every (match_iou, class) iteration.
        min_det_thresh = self.cf.min_det_thresh
        n_skipped = 0
        filtered_elements = []
        for element in batch_elements_list:
            kept_element = []
            for b_boxes_list in element:
                kept_boxes = [
                    box for box in b_boxes_list
                    if not (box['box_type'] == 'det'
                            and box['box_score'] < min_det_thresh)
                ]
                n_skipped += len(b_boxes_list) - len(kept_boxes)
                kept_element.append(kept_boxes)
            filtered_elements.append(kept_element)
        if n_skipped:
            self.logger.debug('skipped %d detections below min_det_thresh %s',
                              n_skipped, min_det_thresh)

        # one tuple per dataframe row: (pred_score, class_label, pred_class, pid, det_type, match_iou)
        rows = []
        for match_iou in self.cf.ap_match_ious:
            self.logger.info(
                'evaluating with conf_thresh: {} match_iou: {}'.format(
                    self.cf.min_det_thresh, match_iou))
            for cl in list(self.cf.class_dict.keys()):
                for pix, pid in enumerate(pid_list):

                    n_rows_before_patient = len(rows)

                    # input of each batch element is a list of boxes, where each box is a dictionary.
                    for b_boxes_list in filtered_elements[pix]:
                        b_tar_boxes = np.array([
                            box['box_coords'] for box in b_boxes_list
                            if (box['box_type'] == 'gt'
                                and box['box_label'] == cl)
                        ])
                        cl_dets = [
                            box for box in b_boxes_list
                            if (box['box_type'] == 'det'
                                and box['box_pred_class_id'] == cl)
                        ]
                        b_cand_boxes = np.array(
                            [box['box_coords'] for box in cl_dets])
                        b_cand_scores = np.array(
                            [box['box_score'] for box in cl_dets])

                        rows += [
                            (score, label, cl, pid, det_type, match_iou)
                            for score, label, det_type in _match_rois(
                                b_tar_boxes, b_cand_boxes, b_cand_scores,
                                match_iou)
                        ]

                    # empty patient with 0 detections needs patient dummy score, in order to not disappear from stats.
                    # filtered out for roi-level evaluation later. During training (and val_sampling),
                    # tn are assigned per sample independently of associated patients.
                    if len(rows) == n_rows_before_patient:
                        # true negative: no ground truth boxes, no detections.
                        rows.append((0, 0, cl, pid, 'patient_tn', match_iou))

        self.test_df = pd.DataFrame()
        self.test_df['pred_score'] = [row[0] for row in rows]
        self.test_df['class_label'] = [row[1] for row in rows]
        self.test_df['pred_class'] = [row[2] for row in rows]
        self.test_df['pid'] = [row[3] for row in rows]
        self.test_df['det_type'] = [row[4] for row in rows]
        self.test_df['fold'] = self.cf.fold
        self.test_df['match_iou'] = [row[5] for row in rows]
        if monitor_metrics is not None:
            return self.return_metrics(monitor_metrics)
    def eval_boxes(self, batch_res_dicts, pid_list):
        """
        Match detections against ground truth boxes and store one row per ROI in ``self.test_df``.

        For every matching IoU in ``self.cf.ap_match_ious`` and every class key of ``self.cf.class_dict``,
        the boxes of each batch instance are split into ground truth ('gt') and detections ('det') of that
        class and classified as 'det_tp', 'det_fp' or 'det_fn'. A patient/sample yielding no row at all for
        a given class and IoU receives a single 'patient_tn' dummy row.

        :param batch_res_dicts: list of result dicts, each holding the box lists under the key 'boxes'.
        In 'train' / 'val_sampling' mode every entry of 'boxes' is treated as its own instance.
        :param pid_list: one patient-id per evaluated instance (length is asserted).
        """

        def classify_rois(gt_coords, det_coords, det_scores, iou_thresh):
            """Return the (pred_score, class_label, det_type) rows of one image for one class."""
            gt_present = gt_coords.size != 0
            det_present = det_coords.size != 0

            if det_present and not gt_present:
                # only fp
                return [(s, 0, 'det_fp') for s in det_scores]
            if gt_present and not det_present:
                # only fn
                return [(0, 1, 'det_fn')] * gt_coords.shape[0]
            if not gt_present:
                return []

            ious = mutils.compute_overlaps(det_coords, gt_coords)
            max_iou_per_det = np.max(ious, 1)
            hit_ixs = np.argwhere(max_iou_per_det > iou_thresh)[:, 0]
            miss_ixs = np.argwhere(max_iou_per_det <= iou_thresh)[:, 0]
            hit_gt_ixs = np.argmax(ious[hit_ixs, :], 1) if hit_ixs.shape[0] else np.array([])
            matched_gts, n_claims = np.unique(hit_gt_ixs, return_counts=True)

            out = []
            # several detections claiming the same gt: following the COCO-metrics only the best scoring
            # one stays a true positive, the others are reported as false positives (and come first).
            if np.any(n_claims > 1):
                is_tp = np.ones(hit_ixs.shape[0], dtype=bool)
                for contested_gt in matched_gts[n_claims > 1]:
                    rivals = np.flatnonzero(hit_gt_ixs == contested_gt)
                    winner = rivals[np.argmax(det_scores[hit_ixs[rivals]])]
                    is_tp[rivals] = False
                    is_tp[winner] = True
                out.extend((s, 0, 'det_fp') for s in det_scores[hit_ixs[~is_tp]])
                hit_ixs = hit_ixs[is_tp]

            out.extend((s, 1, 'det_tp') for s in det_scores[hit_ixs])
            out.extend((s, 0, 'det_fp') for s in det_scores[miss_ixs])
            # ground truth boxes which are no detection's best match.
            out.extend([(0, 1, 'det_fn')] * (gt_coords.shape[0] - matched_gts.shape[0]))
            return out

        per_batch_boxes = [res['boxes'] for res in batch_res_dicts]
        if self.mode in ('train', 'val_sampling'):
            # batch_size > 1 with varying patients across the batch: flatten to one instance per pid.
            batch_inst_boxes = [[inst_boxes]
                                for batch_boxes in per_batch_boxes
                                for inst_boxes in batch_boxes]
        else:
            # patient processing, one element per batch = one patient.
            batch_inst_boxes = per_batch_boxes

        assert len(batch_inst_boxes) == len(pid_list)

        # (pred_score, class_label, pred_class, pid, det_type, match_iou)
        records = []
        for match_iou in self.cf.ap_match_ious:
            self.logger.info('evaluating with match_iou: {}'.format(match_iou))
            for cl in list(self.cf.class_dict.keys()):
                for pid, inst_box_lists in zip(pid_list, batch_inst_boxes):
                    n_records_before = len(records)

                    # each instance is a list of box lists, each box being a dictionary.
                    for boxes in inst_box_lists:
                        gts = [b for b in boxes
                               if b['box_type'] == 'gt' and b['box_label'] == cl]
                        dets = [b for b in boxes
                                if b['box_type'] == 'det' and b['box_pred_class_id'] == cl]
                        roi_rows = classify_rois(
                            np.array([b['box_coords'] for b in gts]),
                            np.array([b['box_coords'] for b in dets]),
                            np.array([b['box_score'] for b in dets]),
                            match_iou)
                        for score, label, det_type in roi_rows:
                            records.append((score, label, cl, pid, det_type, match_iou))

                    # a patient without any gt or detection still needs a dummy row in order to not
                    # disappear from the stats (filtered out for roi-level evaluation later).
                    if len(records) == n_records_before:
                        records.append((0, 0, cl, pid, 'patient_tn', match_iou))

        self.test_df = pd.DataFrame()
        self.test_df['pred_score'] = [rec[0] for rec in records]
        self.test_df['class_label'] = [rec[1] for rec in records]
        self.test_df['pred_class'] = [rec[2] for rec in records]
        self.test_df['pid'] = [rec[3] for rec in records]
        self.test_df['det_type'] = [rec[4] for rec in records]
        self.test_df['fold'] = self.cf.fold
        self.test_df['match_iou'] = [rec[5] for rec in records]
Example #3
0
def gt_anchor_matching(cf,
                       anchors,
                       gt_boxes,
                       gt_class_ids=None,
                       gt_regressions=None):
    """Given the anchors and GT boxes, compute overlaps and identify positive
    anchors and deltas to refine them to match their corresponding GT boxes.

    cf: config object; reads rpn_train_anchors_per_image, dim, anchor_matching_iou, rpn_bbox_std_dev and,
    if gt_regressions is given, prediction_tasks and regression_n_features.
    anchors: [num_anchors, (y1, x1, y2, x2, (z1), (z2))]
    gt_boxes: [num_gt_boxes, (y1, x1, y2, x2, (z1), (z2))]. None or an empty sequence/array means
    "no ground truth": all anchors are then marked negative and the targets stay zero.
    gt_class_ids (optional): [num_gt_boxes] Integer class IDs for one stage detectors. in RPN case of Mask R-CNN,
    set all positive matches to 1 (foreground)
    gt_regressions: [num_gt_rgs, n_rg_feats], if None empty rg_targets are returned

    Returns:
    anchor_class_matches: [N] (int32) matches between anchors and GT boxes. class_id = positive anchor,
     -1 = negative anchor, 0 = neutral. i.e., positively matched anchors are marked by class_id (which is >0).
    anchor_delta_targets: [rpn_train_anchors_per_image, (dy, dx, (dz), log(dh), log(dw), (log(dd)))]
     Anchor bbox deltas; one row per positive anchor (in ascending anchor index), remaining rows are zero.
    anchor_rg_targets: [rpn_train_anchors_per_image, n_rg_feats] (1-D if 'regression_bin' is a prediction task),
     filled like anchor_delta_targets; empty array if gt_regressions is None.

    Raises:
    ValueError: if anchors have neither 4 (2D) nor 6 (3D) coordinates.
    """

    anchor_class_matches = np.zeros([anchors.shape[0]], dtype=np.int32)
    anchor_delta_targets = np.zeros(
        (cf.rpn_train_anchors_per_image, 2 * cf.dim))
    if gt_regressions is not None:
        if 'regression_bin' in cf.prediction_tasks:
            anchor_rg_targets = np.zeros((cf.rpn_train_anchors_per_image, ))
        else:
            anchor_rg_targets = np.zeros(
                (cf.rpn_train_anchors_per_image, cf.regression_n_features))
    else:
        anchor_rg_targets = np.array([])

    anchor_matching_iou = cf.anchor_matching_iou

    # Without any GT box every anchor is negative. An empty array/list is treated like None, since
    # the argmax over a zero-length GT axis below would raise. Filling in place keeps the int32 dtype.
    if gt_boxes is None or len(gt_boxes) == 0:
        anchor_class_matches[:] = -1
        return anchor_class_matches, anchor_delta_targets, anchor_rg_targets

    # for mrcnn: anchor matching is done for RPN loss, so positive labels are all 1 (foreground)
    if gt_class_ids is None:
        gt_class_ids = np.array([1] * len(gt_boxes))
    else:
        # array indexing is used below, so accept plain sequences as well.
        gt_class_ids = np.asarray(gt_class_ids)

    # Compute overlaps [num_anchors, num_gt_boxes]
    overlaps = mutils.compute_overlaps(anchors, gt_boxes)

    # Match anchors to GT Boxes
    # If an anchor overlaps a GT box with IoU >= anchor_matching_iou then it's positive.
    # If an anchor's best IoU is < 0.1 (2D) or < 0.01 (3D) then it's negative.
    # Neutral anchors are those that don't match the conditions above,
    # and they don't influence the loss function.
    # However, don't keep any GT box unmatched (rare, but happens). Instead,
    # match it to the closest anchor (even if its max IoU is below the negative threshold).

    # 1. Set negative anchors first. They get overwritten below if a GT box is
    # matched to them.
    anchor_iou_argmax = np.argmax(overlaps, axis=1)
    anchor_iou_max = overlaps[np.arange(overlaps.shape[0]), anchor_iou_argmax]
    if anchors.shape[1] == 4:
        anchor_class_matches[anchor_iou_max < 0.1] = -1
    elif anchors.shape[1] == 6:
        anchor_class_matches[anchor_iou_max < 0.01] = -1
    else:
        raise ValueError('anchor shape wrong {}'.format(anchors.shape))

    # 2. Set an anchor for each GT box (regardless of IoU value).
    gt_iou_argmax = np.argmax(overlaps, axis=0)
    for gt_ix, anchor_ix in enumerate(gt_iou_argmax):
        anchor_class_matches[anchor_ix] = gt_class_ids[gt_ix]

    # 3. Set anchors with high overlap as positive.
    above_thresh = anchor_iou_max >= anchor_matching_iou
    anchor_class_matches[above_thresh] = gt_class_ids[
        anchor_iou_argmax[above_thresh]]

    # Subsample to balance positive anchors.
    ids = np.where(anchor_class_matches > 0)[0]
    extra = len(ids) - (cf.rpn_train_anchors_per_image // 2)
    if extra > 0:
        # Reset the extra ones to neutral
        ids = np.random.choice(ids, extra, replace=False)
        anchor_class_matches[ids] = 0

    # Leave all negative proposals negative for now and sample from them later in online hard example mining.
    # For positive anchors, compute shift and scale needed to transform them to match the corresponding GT boxes.
    ids = np.where(anchor_class_matches > 0)[0]
    # ix indexes the rows of the target arrays; at most rpn_train_anchors_per_image // 2 positives remain.
    for ix, (i, a) in enumerate(zip(ids, anchors[ids])):
        # closest gt box (it might have IoU < anchor_matching_iou)
        gt = gt_boxes[anchor_iou_argmax[i]]

        # convert coordinates to center plus width/height.
        gt_h = gt[2] - gt[0]
        gt_w = gt[3] - gt[1]
        gt_center_y = gt[0] + 0.5 * gt_h
        gt_center_x = gt[1] + 0.5 * gt_w
        # Anchor
        a_h = a[2] - a[0]
        a_w = a[3] - a[1]
        a_center_y = a[0] + 0.5 * a_h
        a_center_x = a[1] + 0.5 * a_w

        if cf.dim == 2:
            anchor_delta_targets[ix] = [(gt_center_y - a_center_y) / a_h,
                                        (gt_center_x - a_center_x) / a_w,
                                        np.log(gt_h / a_h),
                                        np.log(gt_w / a_w)]
        else:
            gt_d = gt[5] - gt[4]
            gt_center_z = gt[4] + 0.5 * gt_d
            a_d = a[5] - a[4]
            a_center_z = a[4] + 0.5 * a_d
            anchor_delta_targets[ix] = [(gt_center_y - a_center_y) / a_h,
                                        (gt_center_x - a_center_x) / a_w,
                                        (gt_center_z - a_center_z) / a_d,
                                        np.log(gt_h / a_h),
                                        np.log(gt_w / a_w),
                                        np.log(gt_d / a_d)]

        # normalize.
        anchor_delta_targets[ix] /= cf.rpn_bbox_std_dev
        if gt_regressions is not None:
            anchor_rg_targets[ix] = gt_regressions[anchor_iou_argmax[i]]

    return anchor_class_matches, anchor_delta_targets, anchor_rg_targets
    def evaluate_predictions(self, results_list, epoch,cf,pth='./',flag = ''):
        """
        Performs the matching of predicted boxes and ground truth boxes. Loops over list of matching IoUs and foreground classes.
        Resulting info of each prediction is stored as one line in an internal dataframe, with the keys:
        det_type: 'tp' (true positive), 'fp' (false positive), 'fn' (false negative), 'tn' (true negative)
        pred_class: foreground class which the object predicts.
        pid: corresponding patient-id.
        pred_score: confidence score [0, 1]
        fold: corresponding fold of CV.
        match_iou: utilized IoU for matching.
        :param results_list: list of model predictions. Either from train/val_sampling (patch processing) for monitoring with form:
        [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...]
        Or from val_patient/testing (patient processing), with form: [[results_0, pid_0], [results_1, pid_1], ...])
        :param monitor_metrics (optional):  dict of dicts with all metrics of previous epochs.
        :return monitor_metrics: if provided (during training), return monitor_metrics now including results of current epoch.
        """
        # gets results_list = [[batch_instances_box_lists], [batch_instances_pids]]*n_batches
        # we want to evaluate one batch_instance (= 2D or 3D image) at a time.
        df_list_preds = []
        df_list_labels = []
        df_list_class_preds = []
        df_list_pids = []
        df_list_type = []
        df_list_match_iou = []
        df_list_pred_iou = []
        df_list_coords = []
        df_list_gt = []
        self.logger.info('evaluating in mode {}'.format(self.mode))


        if self.mode == 'train' or self.mode=='val_sampling':
            # batch_size > 1, with varying patients across batch:
            # [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...]
            # -> [results_0, results_1, ..] , [pid_0, pid_1, ...]
            #print('results_list',len(results_list))
            batch_elements_list = [[b_box_list] for item in results_list for b_box_list in item[0]]#this epoch all batch len = batchsize*batchnumber
            pid_list = [pid for item in results_list for pid in item[1]]
        else:#test or val_patient
            # patient processing, one element per batch = one patient.
            # [[results_0, pid_0], [results_1, pid_1], ...] -> [results_0, results_1, ..] , [pid_0, pid_1, ...]
            batch_elements_list = [item[0] for item in results_list]
            pid_list = [item[1] for item in results_list]

        total_num = []
        [total_num.append(i) for i in pid_list if not i in total_num] 
        total_num = len(total_num)#total patient number

        for match_iou in self.cf.ap_match_ious:
            self.logger.info('evaluating with match_iou: {}'.format(match_iou))
            TP_roi,FP_roi,FN_roi,TN_pat = 0,0,0,0
            for cl in list(self.cf.class_dict.keys()):
                for pix, pid in enumerate(pid_list):
                    for bix, b_boxes_list in enumerate(batch_elements_list[pix]):#len == 1
                        b_tar_boxes = np.array([box['box_coords'] for box in b_boxes_list if
                                                (box['box_type'] == 'gt' and box['box_label'] == cl)])
                        b_cand_boxes = np.array([box['box_coords'] for box in b_boxes_list if
                                                 (box['box_type'] == 'det' and
                                                  box['box_pred_class_id'] == cl)])#detected box
                        b_cand_scores = np.array([box['box_score'] for box in b_boxes_list if
                                                  (box['box_type'] == 'det' and
                                                   box['box_pred_class_id'] == cl)])#detected source
                        if len(b_tar_boxes) > 0:
                            gt = b_tar_boxes[0]
                        else:
                            gt = [0, 0, 0, 0, 0, 0]
                        # check if predictions and ground truth boxes exist and match them according to match_iou.
                        if not 0 in b_cand_boxes.shape and not 0 in b_tar_boxes.shape:#30,6 1,6 30,1
                            # pred is FP or TP
                            overlaps = mutils.compute_overlaps(b_cand_boxes, b_tar_boxes)#overlap between gt and pred
                            match_cand_ixs = np.argwhere(np.max(overlaps, 1) > match_iou)[:, 0]# 0.1 TP
                            non_match_cand_ixs = np.argwhere(np.max(overlaps, 1) <= match_iou)[:, 0]#FP

                            # matched:
                            if not 0 in match_cand_ixs.shape:
                                df_list_preds += [ii for ii in b_cand_scores[match_cand_ixs]]
                                df_list_coords += [ii for ii in b_cand_boxes[match_cand_ixs]]
                                df_list_pred_iou += [ii for ii in overlaps[match_cand_ixs]]
                                df_list_gt += [gt] * match_cand_ixs.shape[0]
                                df_list_labels += [1] * match_cand_ixs.shape[0]
                                df_list_class_preds += [cl] * match_cand_ixs.shape[0]
                                df_list_pids += [pid] * match_cand_ixs.shape[0]
                                df_list_type += ['det_tp'] * match_cand_ixs.shape[0]
                                TP_roi += match_cand_ixs.shape[0]
                            # rest fp:
                            if not 0 in non_match_cand_ixs.shape:
                                df_list_preds += [ii for ii in b_cand_scores[non_match_cand_ixs]]
                                df_list_coords += [ii for ii in b_cand_boxes[non_match_cand_ixs]]
                                df_list_pred_iou += [ii for ii in overlaps[non_match_cand_ixs]]
                                df_list_gt += [gt] * non_match_cand_ixs.shape[0]
                                df_list_labels += [0] * non_match_cand_ixs.shape[0]
                                df_list_class_preds += [cl] * non_match_cand_ixs.shape[0]
                                df_list_pids += [pid] * non_match_cand_ixs.shape[0]
                                df_list_type += ['det_fp'] * non_match_cand_ixs.shape[0]
                                FP_roi += non_match_cand_ixs.shape[0]
                        # only fp:
                        if not 0 in b_cand_boxes.shape and 0 in b_tar_boxes.shape:
                            df_list_preds += [ii for ii in b_cand_scores]
                            df_list_coords += [ii for ii in b_cand_boxes]
                            df_list_pred_iou += [0] * b_cand_scores.shape[0]
                            df_list_gt += [gt] * b_cand_scores.shape[0]
                            df_list_labels += [0] * b_cand_scores.shape[0]
                            df_list_class_preds += [cl] * b_cand_scores.shape[0]
                            df_list_pids += [pid] * b_cand_scores.shape[0]
                            df_list_type += ['det_fp'] * b_cand_scores.shape[0]
                            FP_roi += b_cand_scores.shape[0]
            df_list_match_iou += [match_iou] * (len(df_list_preds) - len(df_list_match_iou))
        self.test_df = pd.DataFrame()
        self.test_df['pred_score'] = df_list_preds
        self.test_df['pred_box'] = df_list_coords
        self.test_df['pred_iou'] = df_list_pred_iou
        self.test_df['gt'] = df_list_gt
        self.test_df['class_label'] = df_list_labels
        self.test_df['pred_class'] = df_list_class_preds
        self.test_df['pid'] = df_list_pids
        self.test_df['det_type'] = df_list_type
        self.test_df['fold'] = self.cf.fold
        self.test_df['match_iou'] = df_list_match_iou
        tp_patient = [] 
        for ii,det in enumerate(self.test_df['det_type']):
            if det == 'det_tp':
                tp_patient.append(self.test_df['pid'][ii])
        tp_patient_ = []
        [tp_patient_.append(i) for i in tp_patient if not i in tp_patient_] 
        tp_patient_num = len(tp_patient_)
        if flag == 'test':
            if cf.test_last_epoch == False:
                csvpth = pth + '/{}_epoch_{}.csv'.format(flag,epoch)
            else:
                csvpth = pth + '/{}_lastepoch_{}.csv'.format(flag,epoch)
            print('csvpth',csvpth)
            self.test_df.to_csv(csvpth)
        return (tp_patient_num,TP_roi,FP_roi,total_num)#,FN_roi,TN_pat)