def evaluate(self): """Computes evaluation result. Returns: A named tuple with the following fields - average_precision: a float number corresponding to average precision. precisions: an array of precisions. recalls: an array of recalls. recall@50: recall computed on 50 top-scoring samples. recall@100: recall computed on 100 top-scoring samples. median_rank@50: median rank computed on 50 top-scoring samples. median_rank@100: median rank computed on 100 top-scoring samples. """ if self._num_gt_instances == 0: logging.warn('No ground truth instances') if not self._scores: scores = np.array([], dtype=float) tp_fp_labels = np.array([], dtype=bool) else: scores = np.concatenate(self._scores) tp_fp_labels = np.concatenate(self._tp_fp_labels) relation_field_values = np.concatenate(self._relation_field_values) for relation_field_value, _ in ( self._num_gt_instances_per_relationship.iteritems()): precisions, recalls = metrics.compute_precision_recall( scores[relation_field_values == relation_field_value], tp_fp_labels[relation_field_values == relation_field_value], self._num_gt_instances_per_relationship[relation_field_value]) self._average_precisions[ relation_field_value] = metrics.compute_average_precision( precisions, recalls) self._mean_average_precision = np.mean( self._average_precisions.values()) self._precisions, self._recalls = metrics.compute_precision_recall( scores, tp_fp_labels, self._num_gt_instances) self._weighted_average_precision = metrics.compute_average_precision( self._precisions, self._recalls) self._recall_50 = (metrics.compute_recall_at_k(self._tp_fp_labels, self._num_gt_instances, 50)) self._median_rank_50 = (metrics.compute_median_rank_at_k( self._tp_fp_labels, 50)) self._recall_100 = (metrics.compute_recall_at_k( self._tp_fp_labels, self._num_gt_instances, 100)) self._median_rank_100 = (metrics.compute_median_rank_at_k( self._tp_fp_labels, 100)) return VRDDetectionEvalMetrics( self._weighted_average_precision, self._mean_average_precision, self._average_precisions, self._precisions, self._recalls, self._recall_50, self._recall_100, self._median_rank_50, self._median_rank_100)
def test_compute_precision_recall(self):
  num_gt = 10
  scores = np.array([0.4, 0.3, 0.6, 0.2, 0.7, 0.1], dtype=float)
  labels = np.array([0, 1, 1, 0, 0, 1], dtype=bool)
  labels_float_type = np.array([0, 1, 1, 0, 0, 1], dtype=float)
  accumulated_tp_count = np.array([0, 1, 1, 2, 2, 3], dtype=float)
  expected_precision = accumulated_tp_count / np.array([1, 2, 3, 4, 5, 6])
  expected_recall = accumulated_tp_count / num_gt

  precision, recall = metrics.compute_precision_recall(scores, labels, num_gt)
  precision_float_type, recall_float_type = metrics.compute_precision_recall(
      scores, labels_float_type, num_gt)

  self.assertAllClose(precision, expected_precision)
  self.assertAllClose(recall, expected_recall)
  self.assertAllClose(precision_float_type, expected_precision)
  self.assertAllClose(recall_float_type, expected_recall)
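
# To make the expected values above concrete: a minimal sketch of the
# precision/recall computation being exercised, assuming boolean labels and
# numpy imported as np as in the rest of this module (the helper name is
# hypothetical, for illustration only). Samples are visited in order of
# descending score; precision_i = cum_tp_i / (i + 1) and
# recall_i = cum_tp_i / num_gt.
def _precision_recall_sketch(scores, labels, num_gt):
  order = np.argsort(scores)[::-1]  # highest score first
  cum_tp = np.cumsum(labels[order].astype(float))
  precision = cum_tp / np.arange(1, len(scores) + 1)
  recall = cum_tp / num_gt
  return precision, recall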
def test_compute_precision_recall_and_ap_no_groundtruth(self):
  num_gt = 0
  scores = np.array([0.4, 0.3, 0.6, 0.2, 0.7, 0.1], dtype=float)
  labels = np.array([0, 0, 0, 0, 0, 0], dtype=bool)
  expected_precision = None
  expected_recall = None

  precision, recall = metrics.compute_precision_recall(scores, labels, num_gt)
  self.assertEqual(precision, expected_precision)
  self.assertEqual(recall, expected_recall)

  ap = metrics.compute_average_precision(precision, recall)
  self.assertTrue(np.isnan(ap))
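
# A sketch of why the test expects NaN: with num_gt == 0 the precision/recall
# curve is undefined (None, None), and average precision over an undefined
# curve is NaN. The VOC-style interpolated AP below is an illustration under
# that assumption (hypothetical helper, assuming numpy imported as np), not
# necessarily the exact library code.
def _average_precision_sketch(precision, recall):
  if precision is None or recall is None:
    return np.nan
  recall = np.concatenate([[0.], recall, [1.]])
  precision = np.concatenate([[0.], precision, [0.]])
  # Make the precision envelope monotonically non-increasing, right to left.
  for i in range(len(precision) - 2, -1, -1):
    precision[i] = max(precision[i], precision[i + 1])
  # Sum interpolated precision over the recall increments.
  indices = np.where(recall[1:] != recall[:-1])[0] + 1
  return np.sum((recall[indices] - recall[indices - 1]) * precision[indices])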
def test_compute_precision_recall_float(self):
  num_gt = 10
  scores = np.array([0.4, 0.3, 0.6, 0.2, 0.7, 0.1], dtype=float)
  labels_float = np.array([0, 1, 1, 0.5, 0, 1], dtype=float)
  expected_precision = np.array(
      [0., 0.5, 0.33333333, 0.5, 0.55555556, 0.63636364], dtype=float)
  expected_recall = np.array([0., 0.1, 0.1, 0.2, 0.25, 0.35], dtype=float)

  precision, recall = metrics.compute_precision_recall(
      scores, labels_float, num_gt)

  self.assertAllClose(precision, expected_precision)
  self.assertAllClose(recall, expected_recall)
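
# Where the float expectations above come from: a fractional label adds its
# value to the true-positive count, and a sample counts as one false positive
# only when its label is <= 0, so precision_i = cum_tp_i / (cum_tp_i +
# cum_fp_i) rather than cum_tp_i / (i + 1). A minimal sketch of that
# weighting (hypothetical helper for illustration; assumes numpy as np):
def _precision_recall_float_sketch(scores, labels, num_gt):
  order = np.argsort(scores)[::-1]
  sorted_labels = labels[order]
  cum_tp = np.cumsum(sorted_labels)
  cum_fp = np.cumsum((sorted_labels <= 0).astype(float))
  return cum_tp / (cum_tp + cum_fp), cum_tp / num_gt

# E.g. the fifth entry above: cum_tp = 2.5, cum_fp = 2, so
# precision = 2.5 / 4.5 ≈ 0.55555556 and recall = 2.5 / 10 = 0.25.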
def evaluate(self): """Compute evaluation result. Returns: A named tuple with the following fields - average_precision: float numpy array of average precision for each class. mean_ap: mean average precision of all classes, float scalar precisions: List of precisions, each precision is a float numpy array recalls: List of recalls, each recall is a float numpy array corloc: numpy float array mean_corloc: Mean CorLoc score for each class, float scalar """ if (self.num_gt_instances_per_class == 0).any(): logging.warn( 'The following classes have no ground truth examples: %s', np.squeeze(np.argwhere(self.num_gt_instances_per_class == 0)) + self.label_id_offset) if self.use_weighted_mean_ap: all_scores = np.array([], dtype=float) all_tp_fp_labels = np.array([], dtype=bool) for class_index in range(self.num_class): if self.num_gt_instances_per_class[class_index] == 0: continue if not self.scores_per_class[class_index]: scores = np.array([], dtype=float) tp_fp_labels = np.array([], dtype=float) else: scores = np.concatenate(self.scores_per_class[class_index]) tp_fp_labels = np.concatenate( self.tp_fp_labels_per_class[class_index]) if self.use_weighted_mean_ap: all_scores = np.append(all_scores, scores) all_tp_fp_labels = np.append(all_tp_fp_labels, tp_fp_labels) logging.info('Scores and tpfp per class label: %d', class_index) logging.info(tp_fp_labels) logging.info(scores) precision, recall = metrics.compute_precision_recall( scores, tp_fp_labels, self.num_gt_instances_per_class[class_index]) self.precisions_per_class.append(precision) self.recalls_per_class.append(recall) average_precision = metrics.compute_average_precision( precision, recall) self.average_precision_per_class[class_index] = average_precision self.corloc_per_class = metrics.compute_cor_loc( self.num_gt_imgs_per_class, self.num_images_correctly_detected_per_class) if self.use_weighted_mean_ap: num_gt_instances = np.sum(self.num_gt_instances_per_class) precision, recall = metrics.compute_precision_recall( all_scores, all_tp_fp_labels, num_gt_instances) mean_ap = metrics.compute_average_precision(precision, recall) else: mean_ap = np.nanmean(self.average_precision_per_class) mean_corloc = np.nanmean(self.corloc_per_class) return ObjectDetectionEvalMetrics(self.average_precision_per_class, mean_ap, self.precisions_per_class, self.recalls_per_class, self.corloc_per_class, mean_corloc)