def score(path_predictions, path_groundtruth, path_output, iou_threshold=.4):
    assert (iou_threshold < 1 and iou_threshold > 0)

    ttime = time.time()
    boxes_dict = {}
    pchips = []
    stclasses = []
    num_preds = 0

    for file in tqdm(os.listdir(path_predictions)):
        fname = file.split(".txt")[0]
        pchips.append(fname)

        with open(path_predictions + file, 'r') as f:
            arr = np.array(list(csv.reader(f, delimiter=" ")))
            if arr.shape[0] == 0:
                # If the file is empty, we fill it in with an array of zeros
                boxes_dict[fname] = np.array([[0, 0, 0, 0, 0, 0]])
                num_preds += 1
            else:
                arr = arr[:, :6].astype(np.float64)
                # filter detections by confidence score; this version reuses
                # iou_threshold as the score cutoff
                threshold = iou_threshold
                arr = arr[arr[:, 5] > threshold]
                stclasses += list(arr[:, 4])
                num_preds += arr.shape[0]

                if np.any(arr[:, :4] < 0):
                    raise ValueError('Bounding boxes cannot be negative.')
                if np.any(arr[:, 5] < 0) or np.any(arr[:, 5] > 1):
                    raise ValueError(
                        'Confidence scores should be between 0 and 1.')
                boxes_dict[fname] = arr[:, :6]

    pchips = sorted(pchips)
    stclasses = np.unique(stclasses).astype(np.int64)

    gt_coords, gt_chips, gt_classes = get_labels(path_groundtruth)

    # restrict the ground truth to the single chip '5.tif'
    gt_coords = gt_coords[gt_chips == '5.tif']
    gt_classes = gt_classes[gt_chips == '5.tif'].astype(np.int64)
    gt_chips = gt_chips[gt_chips == '5.tif']

    gt_unique = np.unique(gt_classes.astype(np.int64))
    print(gt_unique)
    max_gt_cls = 100

    if set(pchips).issubset(set(gt_unique)):
        raise ValueError(
            'The prediction files {%s} are not in the ground truth.'
            % str(set(pchips) - (set(gt_unique))))

    print("Number of Predictions: %d" % num_preds)
    print("Number of GT: %d" % np.sum(gt_classes.shape))

    per_file_class_data = {}
    for i in gt_unique:
        per_file_class_data[i] = [[], []]

    num_gt_per_cls = np.zeros((max_gt_cls))

    for file_ind in range(len(pchips)):
        print(pchips[file_ind])
        det_box = boxes_dict[pchips[file_ind]][:, :4]
        det_scores = boxes_dict[pchips[file_ind]][:, 5]
        det_cls = boxes_dict[pchips[file_ind]][:, 4]

        gt_box = gt_coords[(gt_chips == pchips[file_ind]).flatten()]
        gt_cls = gt_classes[(gt_chips == pchips[file_ind])]

        for i in gt_unique:
            s = det_scores[det_cls == i]
            ssort = np.argsort(s)[::-1]
            per_file_class_data[i][0] += s[ssort].tolist()
            gt_box_i_cls = gt_box[gt_cls == i].flatten().tolist()
            det_box_i_cls = det_box[det_cls == i]
            det_box_i_cls = det_box_i_cls[ssort].flatten().tolist()

            gt_rects = convert_to_rectangle_list(gt_box_i_cls)
            rects = convert_to_rectangle_list(det_box_i_cls)

            matching = Matching(gt_rects, rects)
            rects_matched, gt_matched = matching.greedy_match(iou_threshold)

            # we aggregate confidence scores, rectangles, and num_gt across classes
            #per_file_class_data[i][0] += det_scores[det_cls == i].tolist()
            per_file_class_data[i][1] += rects_matched
            num_gt_per_cls[i] += len(gt_matched)

    average_precision_per_class = np.ones(max_gt_cls) * float('nan')
    per_class_p = np.ones(max_gt_cls) * float('nan')
    per_class_r = np.ones(max_gt_cls) * float('nan')

    for i in gt_unique:
        scores = np.array(per_file_class_data[i][0])
        rects_matched = np.array(per_file_class_data[i][1])

        if num_gt_per_cls[i] != 0:
            sorted_indices = np.argsort(scores)[::-1]
            tp_sum = np.cumsum(rects_matched[sorted_indices])
            fp_sum = np.cumsum(np.logical_not(rects_matched[sorted_indices]))
            precision = tp_sum / (tp_sum + fp_sum + np.spacing(1))
            recall = tp_sum / num_gt_per_cls[i]
            per_class_p[i] = np.sum(rects_matched) / len(rects_matched)
            per_class_r[i] = np.sum(rects_matched) / num_gt_per_cls[i]
            ap = ap_from_pr(precision, recall)
        else:
            ap = float('nan')
        average_precision_per_class[i] = ap

    # metric splits
    metric_keys = [
        'map', 'map/small', 'map/medium', 'map/large', 'map/common', 'map/rare'
    ]

    splits = {
        'map/small': [
            17, 18, 19, 20, 21, 23, 24, 26, 27, 28, 32, 41, 60, 62, 63, 64, 65,
            66, 91
        ],
        'map/medium': [
            11, 12, 15, 25, 29, 33, 34, 35, 36, 37, 38, 42, 44, 47, 50, 53, 56,
            59, 61, 71, 72, 73, 76, 84, 86, 93, 94
        ],
        'map/large': [13, 40, 45, 49, 51, 52, 54, 55, 57, 74, 77, 79, 83, 89],
        'map/common': [
            13, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 34, 35, 41, 47, 60,
            63, 64, 71, 72, 73, 76, 77, 79, 83, 86, 89, 91
        ],
        'map/rare': [
            11, 12, 15, 29, 32, 33, 36, 37, 38, 40, 42, 44, 45, 49, 50, 51, 52,
            53, 54, 55, 56, 57, 59, 61, 62, 65, 66, 74, 84, 93, 94
        ]
    }

    vals = {}
    vals['map'] = np.nanmean(average_precision_per_class)
    vals['map_score'] = np.nanmean(per_class_p)
    vals['mar_score'] = np.nanmean(per_class_r)

    for i in splits.keys():
        vals[i] = np.nanmean(average_precision_per_class[splits[i]])

    for i in gt_unique:
        vals[int(i)] = average_precision_per_class[int(i)]

    vals['f1'] = 2 / ((1 / (np.spacing(1) + vals['map_score'])) +
                      (1 / (np.spacing(1) + vals['mar_score'])))

    print("mAP: %f | mAP score: %f | mAR: %f | F1: %f" %
          (vals['map'], vals['map_score'], vals['mar_score'], vals['f1']))

    with open(path_output + '/score.txt', 'w') as f:
        f.write(str("%.4f" % vals['map']))

    result = []
    with open(path_output + '/metrics.txt', 'w') as f:
        for key in vals.keys():
            f.write("%s %.4f\n" % (str(key), vals[key]))
            result.append(
                str(key) + " " + str(round(float(vals[key]), 4)) + "\n")

    result = sorted(result)
    print("Final time: %s" % str(time.time() - ttime))
    return result
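# The per-class AP above comes from ap_from_pr(precision, recall), which is not
# shown in this section. The sketch below is only an assumption about what it
# computes -- a PASCAL-VOC-style area under the monotonically interpolated
# precision/recall curve -- and is named _ap_from_pr_sketch so it does not
# shadow the real helper defined elsewhere in the module. It relies on the
# module's existing `numpy as np` import.
def _ap_from_pr_sketch(precision, recall):
    """Area under the interpolated PR curve (hypothetical reference only)."""
    # pad the curve so it spans recall 0..1
    r = np.concatenate(([0.0], np.asarray(recall, dtype=np.float64), [1.0]))
    p = np.concatenate(([0.0], np.asarray(precision, dtype=np.float64), [0.0]))
    # make precision monotonically non-increasing from right to left
    for k in range(p.size - 2, -1, -1):
        p[k] = max(p[k], p[k + 1])
    # sum precision over the recall steps where recall actually changes
    idx = np.where(r[1:] != r[:-1])[0] + 1
    return float(np.sum((r[idx] - r[idx - 1]) * p[idx]))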
def score(path_predictions, path_groundtruth, path_output, iou_threshold=.5):
    """
    Compute metrics on a number of prediction files, given a folder of
    prediction files and a ground truth.  Primary metric is mean average
    precision (mAP).

    Args:
        path_predictions: a folder path of prediction files.
            Prediction files should have filename format 'XYZ.tif.txt',
            where 'XYZ.tif' is the xView TIFF file being predicted on.
            Prediction files should be in space-delimited csv format, with
            each line like (xmin ymin xmax ymax class_prediction
            score_prediction).
        path_groundtruth: a file path to a single ground truth geojson
        path_output: a folder path for output scoring files
        iou_threshold: a float between 0 and 1 indicating the percentage
            iou required to count a prediction as a true positive

    Outputs:
        Writes two files to the 'path_output' parameter folder:
            'score.txt' and 'metrics.txt'
        'score.txt' contains a single floating point value output: mAP
        'metrics.txt' contains the remaining metrics in per-line format
            (metric/class_num: score_float)

    Raises:
        ValueError: if there are files in the prediction folder that are not
            in the ground truth geojson. EG a prediction file is titled
            '15.tif.txt', but the file '15.tif' is not in the ground truth.
    """
    assert (iou_threshold < 1 and iou_threshold > 0)

    ttime = time.time()
    boxes_dict = {}
    pchips = []
    stclasses = []
    num_preds = 0

    for file in tqdm(os.listdir(path_predictions)):
        fname = file.split(".txt")[0]
        pchips.append(fname)

        with open(path_predictions + file, 'r') as f:
            arr = np.array(list(csv.reader(f, delimiter=" ")))
            arr = arr[:, :6].astype(np.float64)
            threshold = 0
            arr = arr[arr[:, 5] > threshold]
            stclasses += list(arr[:, 4])
            num_preds += arr.shape[0]

            if np.any(arr[:, :4] < 0):
                raise ValueError('Bounding boxes cannot be negative.')
            boxes_dict[fname] = arr[:, :6]

    pchips = sorted(pchips)
    stclasses = np.unique(stclasses).astype(np.int64)

    gt_coords, gt_chips, gt_classes = get_labels(path_groundtruth)

    gt_unique = np.unique(gt_classes.astype(np.int64))
    max_gt_cls = 100

    if set(pchips).issubset(set(gt_unique)):
        raise ValueError(
            'The prediction files {%s} are not in the ground truth.'
            % str(set(pchips) - (set(gt_unique))))

    print("Number of Predictions: %d" % num_preds)
    print("Number of GT: %d" % np.sum(gt_classes.shape))

    per_file_class_data = {}
    for i in gt_unique:
        per_file_class_data[i] = [[], []]

    num_gt_per_cls = np.zeros((max_gt_cls))

    for file_ind in range(len(pchips)):
        print(pchips[file_ind])
        det_box = boxes_dict[pchips[file_ind]][:, :4]
        det_scores = boxes_dict[pchips[file_ind]][:, 5]
        det_cls = boxes_dict[pchips[file_ind]][:, 4]

        gt_box = gt_coords[(gt_chips == pchips[file_ind]).flatten()]
        gt_cls = gt_classes[(gt_chips == pchips[file_ind])]

        for i in gt_unique:
            gt_box_i_cls = gt_box[gt_cls == i].flatten().tolist()
            det_box_i_cls = det_box[det_cls == i].flatten().tolist()

            gt_rects = convert_to_rectangle_list(gt_box_i_cls)
            rects = convert_to_rectangle_list(det_box_i_cls)

            matching = Matching(gt_rects, rects)
            rects_matched, gt_matched = matching.greedy_match(iou_threshold)

            # we aggregate confidence scores, rectangles, and num_gt across classes
            per_file_class_data[i][0] += det_scores[det_cls == i].tolist()
            per_file_class_data[i][1] += rects_matched
            num_gt_per_cls[i] += len(gt_matched)

    average_precision_per_class = np.ones(max_gt_cls) * float('nan')
    per_class_p = np.ones(max_gt_cls) * float('nan')
    per_class_r = np.ones(max_gt_cls) * float('nan')

    for i in gt_unique:
        scores = np.array(per_file_class_data[i][0])
        rects_matched = np.array(per_file_class_data[i][1])

        if num_gt_per_cls[i] != 0:
            sorted_indices = np.argsort(scores)[::-1]
            tp_sum = np.cumsum(rects_matched[sorted_indices])
            fp_sum = np.cumsum(np.logical_not(rects_matched[sorted_indices]))
            precision = tp_sum / (tp_sum + fp_sum + np.spacing(1))
            recall = tp_sum / num_gt_per_cls[i]
            per_class_p[i] = np.sum(rects_matched) / len(rects_matched)
            per_class_r[i] = np.sum(rects_matched) / num_gt_per_cls[i]
            ap = ap_from_pr(precision, recall)
        else:
            ap = float('nan')
        average_precision_per_class[i] = ap

    # metric splits
    metric_keys = [
        'map', 'map/small', 'map/medium', 'map/large', 'map/common', 'map/rare'
    ]

    splits = {
        'map/small': [
            17, 18, 19, 20, 21, 23, 24, 26, 27, 28, 32, 41, 60, 62, 63, 64, 65,
            66, 91
        ],
        'map/medium': [
            11, 12, 15, 25, 29, 33, 34, 35, 36, 37, 38, 42, 44, 47, 50, 53, 56,
            59, 61, 71, 72, 73, 76, 84, 86, 93, 94
        ],
        'map/large': [13, 40, 45, 49, 51, 52, 54, 55, 57, 74, 77, 79, 83, 89],
        'map/common': [
            13, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 34, 35, 41, 47, 60,
            63, 64, 71, 72, 73, 76, 77, 79, 83, 86, 89, 91
        ],
        'map/rare': [
            11, 12, 15, 29, 32, 33, 36, 37, 38, 40, 42, 44, 45, 49, 50, 51, 52,
            53, 54, 55, 56, 57, 59, 61, 62, 65, 66, 74, 84, 93, 94
        ]
    }

    vals = {}
    vals['map'] = np.nanmean(average_precision_per_class)
    vals['map_score'] = np.nanmean(per_class_p)
    vals['mar_score'] = np.nanmean(per_class_r)

    for i in splits.keys():
        vals[i] = np.nanmean(average_precision_per_class[splits[i]])

    for i in gt_unique:
        vals[int(i)] = average_precision_per_class[int(i)]

    vals['f1'] = 2 / ((1 / (np.spacing(1) + vals['map_score'])) +
                      (1 / (np.spacing(1) + vals['mar_score'])))

    print("mAP: %f | mAP score: %f | mAR: %f | F1: %f" %
          (vals['map'], vals['map_score'], vals['mar_score'], vals['f1']))

    with open(path_output + '/score.txt', 'w') as f:
        f.write(str("%.8f" % vals['map']))

    with open(path_output + '/metrics.txt', 'w') as f:
        for key in vals.keys():
            f.write("%s %f\n" % (str(key), vals[key]))

    print("Final time: %s" % str(time.time() - ttime))
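# A minimal invocation sketch for the scorer above (the paths are hypothetical
# placeholders; note the trailing separator on the prediction folder, since the
# loop builds file paths by plain string concatenation):
#
#   score('predictions/', 'xView_train.geojson', 'output', iou_threshold=0.5)
#   # -> writes output/score.txt (mAP only) and output/metrics.txt (all metrics)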
def score(path_predictions, path_groundtruth, path_output, iou_threshold=.5):
    """
    Compute metrics on a number of prediction files, given a folder of
    prediction files and a ground truth.  Primary metric is mean average
    precision (mAP).

    Args:
        path_predictions: a folder path of prediction files.
            Prediction files should have filename format 'XYZ.tif.txt',
            where 'XYZ.tif' is the xView TIFF file being predicted on.
            Prediction files should be in space-delimited csv format, with
            each line like (xmin ymin xmax ymax class_prediction
            score_prediction).
        path_groundtruth: a file path to a single ground truth geojson
        path_output: a folder path for output scoring files
        iou_threshold: a float between 0 and 1 indicating the percentage
            iou required to count a prediction as a true positive

    Outputs:
        Writes two files to the 'path_output' parameter folder:
            'score.txt' and 'metrics.txt'
        'score.txt' contains a single floating point value output: mAP
        'metrics.txt' contains the remaining metrics in per-line format
            (metric/class_num: score_float)

    Raises:
        ValueError: if there are files in the prediction folder that are not
            in the ground truth geojson. EG a prediction file is titled
            '15.tif.txt', but the file '15.tif' is not in the ground truth.
    """
    assert (iou_threshold < 1 and iou_threshold > 0)

    ttime = time.time()
    boxes_dict = {}
    pchips = []
    stclasses = []
    num_preds = 0

    # pchips: prediction txt
    for file in tqdm(os.listdir(path_predictions)):
        fname = file.split(".txt")[0]
        pchips.append(fname)

        # debug
        with open(path_predictions + file, 'r') as f:
            #arr = np.array(list(csv.reader(f, delimiter=" ")))
            # maybe not needed
            predict_list = list(csv.reader(f, delimiter=" "))
            new_list = remove_invalid_predictions(predict_list)
            arr = np.array(new_list)
            if arr.shape[0] == 0:
                # If the file is empty, we fill it in with an array of zeros
                boxes_dict[fname] = np.array([[0, 0, 0, 0, 0, 0]])
                num_preds += 1
            else:
                arr = arr[:, :6].astype(np.float64)
                # TODO: may adjust the threshold of scores to be counted as
                # valid predictions (default = 0); there should be an nms mode
                threshold = 0.4
                arr = arr[arr[:, 5] > threshold]
                stclasses += list(arr[:, 4])
                num_preds += arr.shape[0]

                if np.any(arr[:, :4] < 0):
                    raise ValueError('Bounding boxes cannot be negative.')
                if np.any(arr[:, 5] < 0) or np.any(arr[:, 5] > 1):
                    raise ValueError(
                        'Confidence scores should be between 0 and 1.')
                boxes_dict[fname] = arr[:, :6]

    pchips = sorted(pchips)
    stclasses = np.unique(stclasses).astype(np.int64)

    # debug
    #gt_coords, gt_chips, gt_classes = get_labels(path_groundtruth)
    gt_coords, gt_chips, gt_classes, _ = get_labels_w_uid_nondamaged(
        path_groundtruth)
    # TODO: add removing bboxes over clouds manually, or the test images
    # should not contain any black chips

    gt_unique = np.unique(gt_classes.astype(np.int64))
    # debug
    print('gt_unique: ', gt_unique)
    max_gt_cls = 100  # max number of classes

    # debug
    # need to remove class 0 from evaluation
    ignored_classes = [0]
    gt_unique_ig = np.array(
        [i for i in gt_unique if int(i) not in ignored_classes],
        dtype=np.int64)

    # added
    # get statistics of ground truth
    num_gt_class = dict()
    for i in gt_unique:
        num_gt_class[i] = gt_classes[gt_classes == i].shape[0]

    if set(pchips).issubset(set(gt_unique_ig)):
        raise ValueError(
            'The prediction files {%s} are not in the ground truth.'
            % str(set(pchips) - (set(gt_unique))))

    #print("Number of Predictions: %d" % num_preds)
    #print("Number of GT: %d" % np.sum(gt_classes.shape))

    per_file_class_data = {}
    for i in gt_unique_ig:
        per_file_class_data[i] = [[], []]

    num_gt_per_cls = np.zeros((max_gt_cls))

    for file_ind in range(len(pchips)):
        print(pchips[file_ind])
        det_box = boxes_dict[pchips[file_ind]][:, :4]
        det_scores = boxes_dict[pchips[file_ind]][:, 5]
        det_cls = boxes_dict[pchips[file_ind]][:, 4]

        gt_box = gt_coords[(gt_chips == pchips[file_ind]).flatten()]
        gt_cls = gt_classes[(gt_chips == pchips[file_ind])]

        # iterate over the non-ignored classes only, matching the keys of
        # per_file_class_data initialized above
        for i in gt_unique_ig:
            s = det_scores[det_cls == i]
            ssort = np.argsort(s)[::-1]
            per_file_class_data[i][0] += s[ssort].tolist()
            gt_box_i_cls = gt_box[gt_cls == i].flatten().tolist()
            det_box_i_cls = det_box[det_cls == i]
            det_box_i_cls = det_box_i_cls[ssort].flatten().tolist()

            gt_rects = convert_to_rectangle_list(gt_box_i_cls)
            rects = convert_to_rectangle_list(det_box_i_cls)

            matching = Matching(gt_rects, rects)
            rects_matched, gt_matched = matching.greedy_match(iou_threshold)

            # debug
            print('len(gt_matched): ', len(gt_matched))
            print('len(rects_matched): ', len(rects_matched))
            #print('rects_matched: ', rects_matched)

            # we aggregate confidence scores, rectangles, and num_gt across classes
            #per_file_class_data[i][0] += det_scores[det_cls == i].tolist()
            per_file_class_data[i][1] += rects_matched
            num_gt_per_cls[i] += len(gt_matched)

    average_precision_per_class = np.ones(max_gt_cls) * float('nan')
    per_class_p = np.ones(max_gt_cls) * float('nan')
    per_class_r = np.ones(max_gt_cls) * float('nan')

    # debug
    # need to remove class 0 from evaluation
    ignored_classes = [0]
    gt_unique_ig = np.array(
        [i for i in gt_unique if int(i) not in ignored_classes],
        dtype=np.int64)

    for i in gt_unique_ig:
        scores = np.array(per_file_class_data[i][0])
        rects_matched = np.array(per_file_class_data[i][1])

        if num_gt_per_cls[i] != 0:
            sorted_indices = np.argsort(scores)[::-1]
            tp_sum = np.cumsum(rects_matched[sorted_indices])
            fp_sum = np.cumsum(np.logical_not(rects_matched[sorted_indices]))
            # calculated using confidence scores of the bboxes that have a
            # confidence score > 0.5 (or some other threshold)
            precision = tp_sum / (tp_sum + fp_sum + np.spacing(1))
            recall = tp_sum / num_gt_per_cls[i]
            # debug
            # per_class_precision: @IOU >= 0.5,
            # # of correctly identified bboxes / all predicted bboxes
            per_class_p[i] = np.sum(rects_matched) / len(rects_matched)
            per_class_r[i] = np.sum(rects_matched) / num_gt_per_cls[i]
            ap = ap_from_pr(precision, recall)
            # added
            print('for class: ', i)
            print('TP: ', tp_sum[-1])
            print('FP: ', fp_sum[-1])
        else:
            ap = float('nan')
        average_precision_per_class[i] = ap

    # debug
    # metric splits
    #metric_keys = ['map', 'map/small', 'map/medium', 'map/large',
    #               'map/common', 'map/rare']
    metric_keys = ['map']
    '''
    splits = {
        'map/small': [17, 18, 19, 20, 21, 23, 24, 26, 27, 28, 32, 41, 60, 62,
                      63, 64, 65, 66, 91],
        'map/medium': [11, 12, 15, 25, 29, 33, 34, 35, 36, 37, 38, 42, 44, 47,
                       50, 53, 56, 59, 61, 71, 72, 73, 76, 84, 86, 93, 94],
        'map/large': [13, 40, 45, 49, 51, 52, 54, 55, 57, 74, 77, 79, 83, 89],
        'map/common': [13, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 34, 35,
                       41, 47, 60, 63, 64, 71, 72, 73, 76, 77, 79, 83, 86, 89,
                       91],
        'map/rare': [11, 12, 15, 29, 32, 33, 36, 37, 38, 40, 42, 44, 45, 49,
                     50, 51, 52, 53, 54, 55, 56, 57, 59, 61, 62, 65, 66, 74,
                     84, 93, 94]
    }
    '''

    vals = {}
    vals['map'] = np.nanmean(average_precision_per_class)
    vals['map_score'] = np.nanmean(per_class_p)
    vals['mar_score'] = np.nanmean(per_class_r)

    '''
    for i in splits.keys():
        vals[i] = np.nanmean(average_precision_per_class[splits[i]])
    '''
    for i in gt_unique:
        vals[int(i)] = average_precision_per_class[int(i)]

    vals['f1'] = 2 / ((1 / (np.spacing(1) + vals['map_score'])) +
                      (1 / (np.spacing(1) + vals['mar_score'])))

    #print("mAP: %f | mAP score: %f | mAR: %f | F1: %f" %
    print("mAP: %f | mean precision: %f | mean recall: %f | F1: %f" %
          (vals['map'], vals['map_score'], vals['mar_score'], vals['f1']))

    with open(path_output + '/score.txt', 'w') as f:
        f.write(str("%.8f" % vals['map']))

    with open(path_output + '/metrics.txt', 'w') as f:
        for key in vals.keys():
            f.write("%s %f\n" % (str(key), vals[key]))

    # added
    print('counting predictions with score above %s as valid' % str(threshold))
    for k, v in num_gt_class.items():
        print('ground truth class: ', k)
        print('the count of GT labels: ', v)

    print("Number of Predictions: %d" % num_preds)
    print("Number of GT: %d" % np.sum(gt_classes.shape))

    print("Final time: %s" % str(time.time() - ttime))
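# remove_invalid_predictions() is called above but not defined in this section.
# The sketch below is only an assumption about its behaviour -- dropping rows
# that do not have at least six fields or that contain non-numeric values --
# and is named _remove_invalid_predictions_sketch so it does not shadow the
# real helper defined elsewhere.
def _remove_invalid_predictions_sketch(predict_list):
    """Keep only rows that look like (xmin ymin xmax ymax class score)."""
    valid_rows = []
    for row in predict_list:
        if len(row) < 6:
            continue
        try:
            # every field of a well-formed prediction line parses as a float
            [float(x) for x in row[:6]]
        except ValueError:
            continue
        valid_rows.append(row[:6])
    return valid_rows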