def do_evaluation(submission_dir, ground_truth_dir):
    """
    Evaluate a particular image sequence
    :param submission_dir: location of the detections .json files (one for each sequence)
    :param ground_truth_dir: location of the ground-truth folders (one for each sequence)
    :return: Dictionary containing a summary of all metrics (score, average spatial quality,
    average label quality, average overall quality (avg_pPDQ), true positives, false positives,
    and false negatives)
    """
    ground_truth = gt_loader.read_ground_truth(ground_truth_dir)
    detections = submission_loader.read_submission(
        submission_dir, expected_sequence_names=set(ground_truth.keys()))
    matches = gt_loader.match_sequences(ground_truth, detections)
    evaluator = PDQ()
    score = evaluator.score(matches)
    TP, FP, FN = evaluator.get_assignment_counts()
    avg_spatial_quality = evaluator.get_avg_spatial_score()
    avg_label_quality = evaluator.get_avg_label_score()
    avg_overall_quality = evaluator.get_avg_overall_quality_score()
    return {
        'score': score * 100,
        'avg_spatial': avg_spatial_quality,
        'avg_label': avg_label_quality,
        'avg_pPDQ': avg_overall_quality,
        'TPs': TP,
        'FPs': FP,
        'FNs': FN
    }
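
# A minimal usage sketch for do_evaluation above. The directory paths and the
# _demo_evaluation helper are hypothetical placeholders for illustration, not
# part of the evaluation code.
def _demo_evaluation():
    summary = do_evaluation('detections/', 'ground_truth/')
    # Print each leaderboard metric on its own line.
    for metric, value in sorted(summary.items()):
        print('{0}: {1}'.format(metric, value))
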
def main(method, n_classes):
    """
    Run a single evaluation pass.
    :param method: 1 to compute PDQ summary statistics, 0 to compute COCO mAP
    :param n_classes: number of object classes, passed through to coco_mAP
    :return: dictionary of PDQ statistics (method 1) or the mAP score (method 0)
    """
    if method == 1:
        print("Extracting GT and Detections")
        param_sequence, len_sequences = gen_param_sequence()

        print("Calculating PDQ")
        # Get summary statistics (PDQ, avg_qualities)
        evaluator = PDQ(filter_gts=True, segment_mode=False, greedy_mode=False)
        pdq = evaluator.score(param_sequence)
        TP, FP, FN = evaluator.get_assignment_counts()
        avg_spatial_quality = evaluator.get_avg_spatial_score()
        avg_label_quality = evaluator.get_avg_label_score()
        avg_overall_quality = evaluator.get_avg_overall_quality_score()
        avg_fg_quality = evaluator.get_avg_fg_quality_score()
        avg_bg_quality = evaluator.get_avg_bg_quality_score()

        # Get the detection-wise and ground-truth-wise qualities and matches for PDQ
        # (currently unused in this path)
        all_gt_eval_dicts = evaluator._gt_evals
        all_det_eval_dicts = evaluator._det_evals

        # Compile the PDQ summary statistics into a single dictionary
        result = {
            "PDQ": pdq,
            "avg_pPDQ": avg_overall_quality,
            "avg_spatial": avg_spatial_quality,
            "avg_fg": avg_fg_quality,
            "avg_bg": avg_bg_quality,
            "avg_label": avg_label_quality,
            "TP": TP,
            "FP": FP,
            "FN": FN
        }
        return result

    # Calculate mAP
    if method == 0:
        print("Calculating mAP")
        print("Extracting GT and Detections")
        param_sequence, len_sequences = gen_param_sequence()
        mAP = coco_mAP(param_sequence, n_classes, use_heatmap=False)
        return mAP
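
# A sketch of how main(method, n_classes) might be driven (assumptions for
# illustration: method 1 selects PDQ and method 0 selects mAP, as in the
# branches above; the class count of 80 is a hypothetical COCO-style value).
if __name__ == '__main__':
    pdq_stats = main(method=1, n_classes=80)
    print('PDQ: {0:.4f}'.format(pdq_stats['PDQ']))
    map_score = main(method=0, n_classes=80)
    print('mAP: {0:.4f}'.format(map_score))
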
def do_evaluation(submission_dir, ground_truth_dir, sequences=None, num_frames=-1, start_frame=0):
    """
    Evaluate a particular image sequence
    :param submission_dir: location of the detections .json files (one for each sequence)
    :param ground_truth_dir: location of the ground-truth folders (one for each sequence).
    Each ground-truth folder must contain mask images (.png format) and a matching labels.json file.
    :param sequences: A whitelist of sequence ids to include, as integers
    :param num_frames: The number of frames to read from each sequence, default is all available.
    :param start_frame: The index of the first frame to read
    :return: Dictionary containing a summary of all metrics used in the competition leaderboard
    (score, average spatial quality, average label quality, average overall quality (avg_pPDQ),
    average false positive quality, true positives, false positives, and false negatives)
    """
    ground_truth = gt_loader.read_ground_truth(ground_truth_dir, sequences,
                                               start_index=start_frame,
                                               end_index=start_frame + num_frames)
    detections = submission_loader.read_submission(
        submission_dir,
        expected_sequence_names=set(ground_truth.keys()),
        start_index=start_frame,
        end_index=start_frame + num_frames)
    matches = gt_loader.match_sequences(ground_truth, detections)
    evaluator = PDQ()
    score = evaluator.score(matches)
    TP, FP, FN = evaluator.get_assignment_counts()
    avg_spatial_quality = evaluator.get_avg_spatial_score()
    avg_label_quality = evaluator.get_avg_label_score()
    avg_overall_quality = evaluator.get_avg_overall_quality_score()
    avg_fp_quality = evaluator.get_avg_fp_score()
    return {
        'score': score * 100,
        'avg_spatial': avg_spatial_quality,
        'avg_label': avg_label_quality,
        'avg_pPDQ': avg_overall_quality,
        'avg_fp_quality': avg_fp_quality,
        'TPs': TP,
        'FPs': FP,
        'FNs': FN
    }
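
# A usage sketch for the windowed variant above (assumptions for illustration:
# the paths and sequence ids are hypothetical; sequences restricts evaluation to
# ids 0 and 1, and only frames [0, 100) of each sequence are read).
def _demo_windowed_evaluation():
    summary = do_evaluation('detections/', 'ground_truth/',
                            sequences=[0, 1], num_frames=100, start_frame=0)
    print('Leaderboard score: {0:.2f}'.format(summary['score']))
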
def main():
    if not os.path.isdir(args.save_folder):
        os.makedirs(args.save_folder)

    print("Extracting GT and Detections")
    param_sequence, len_sequences = gen_param_sequence()

    print("Calculating PDQ")
    # Get summary statistics (PDQ, avg_qualities)
    evaluator = PDQ(filter_gts=(args.test_set == 'rvc1'),
                    segment_mode=args.segment_mode,
                    greedy_mode=args.greedy_mode)
    pdq = evaluator.score(param_sequence)
    TP, FP, FN = evaluator.get_assignment_counts()
    avg_spatial_quality = evaluator.get_avg_spatial_score()
    avg_label_quality = evaluator.get_avg_label_score()
    avg_overall_quality = evaluator.get_avg_overall_quality_score()
    avg_fg_quality = evaluator.get_avg_fg_quality_score()
    avg_bg_quality = evaluator.get_avg_bg_quality_score()

    # Get the detection-wise and ground-truth-wise qualities and matches for PDQ
    # and save them to file (below)
    all_gt_eval_dicts = evaluator._gt_evals
    all_det_eval_dicts = evaluator._det_evals

    # Calculate mAP
    print("Calculating mAP")
    # Generate the parameter sequence again for new tests (the generator does not hold onto data once used)
    print("Extracting GT and Detections")
    param_sequence, len_sequences = gen_param_sequence()
    mAP = coco_mAP(param_sequence, use_heatmap=args.mAP_heatmap)
    print('mAP: {0}'.format(mAP))

    # Calculate LRP
    print("Calculating LRP")
    # Generate the parameter sequence again for new tests (the generator does not hold onto data once used)
    print("Extracting GT and Detections")
    param_sequence, len_sequences = gen_param_sequence()
    # Use the same BBox definition as was used for mAP
    # Extract all moLRP statistics
    LRP_dict = coco_LRP(param_sequence, use_heatmap=args.mAP_heatmap, full=True)

    # Compile evaluation statistics into a single dictionary
    result = {
        "PDQ": pdq,
        "avg_pPDQ": avg_overall_quality,
        "avg_spatial": avg_spatial_quality,
        "avg_fg": avg_fg_quality,
        "avg_bg": avg_bg_quality,
        "avg_label": avg_label_quality,
        "TP": TP,
        "FP": FP,
        "FN": FN,
        "mAP": mAP,
        "moLRP": LRP_dict['moLRP'],
        "moLRPLoc": LRP_dict['moLRPLoc'],
        "moLRPFP": LRP_dict['moLRPFP'],
        "moLRPFN": LRP_dict['moLRPFN']
    }

    print("PDQ: {0:.4f}\n"
          "mAP: {1:.4f}\n"
          "avg_pPDQ: {2:.4f}\n"
          "avg_spatial: {3:.4f}\n"
          "avg_label: {4:.4f}\n"
          "avg_foreground: {5:.4f}\n"
          "avg_background: {6:.4f}\n"
          "TP: {7}\nFP: {8}\nFN: {9}\n"
          "moLRP: {10:.4f}\n"
          "moLRPLoc: {11:.4f}\n"
          "moLRPFP: {12:.4f}\n"
          "moLRPFN: {13:.4f}\n".format(pdq, mAP, avg_overall_quality, avg_spatial_quality,
                                       avg_label_quality, avg_fg_quality, avg_bg_quality,
                                       TP, FP, FN, LRP_dict['moLRP'], LRP_dict['moLRPLoc'],
                                       LRP_dict['moLRPFP'], LRP_dict['moLRPFN']))

    # Save evaluation statistics to file
    with open(os.path.join(args.save_folder, 'scores.txt'), 'w') as output_file:
        output_file.write("\n".join("{0}:{1}".format(k, v) for k, v in sorted(result.items())))

    # Save pairwise PDQ statistics to file for use in visualisation code (separate file for each sequence)
    prev_idx = 0
    for idx, len_sequence in enumerate(len_sequences):
        seq_gt_eval_dicts = all_gt_eval_dicts[prev_idx:prev_idx + len_sequence]
        seq_det_eval_dicts = all_det_eval_dicts[prev_idx:prev_idx + len_sequence]
        prev_idx += len_sequence
        with open(os.path.join(args.save_folder, 'gt_eval_stats_{:02d}.json'.format(idx)), 'w') as f:
            json.dump(seq_gt_eval_dicts, f)
        with open(os.path.join(args.save_folder, 'det_eval_stats_{:02d}.json'.format(idx)), 'w') as f:
            json.dump(seq_det_eval_dicts, f)
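
# Standard script entry point (a sketch; it assumes `args` has been populated at
# module level by an argparse parser, as the references to args.save_folder,
# args.test_set, args.segment_mode, args.greedy_mode and args.mAP_heatmap imply).
if __name__ == '__main__':
    main()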