def main(args):
    # Setup config node
    cfg = setup_config(args, random_seed=args.random_seed)

    # For debugging only
    # cfg.defrost()
    # cfg.DATALOADER.NUM_WORKERS = 0
    # cfg.SOLVER.IMS_PER_BATCH = 1

    # Build Trainer from config node. Begin Training.
    if cfg.MODEL.META_ARCHITECTURE == 'ProbabilisticDetr':
        trainer = Detr_Trainer(cfg)
    else:
        trainer = Trainer(cfg)

    # Eval only mode to produce mAP results
    if args.eval_only:
        model = trainer.build_model(cfg)
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume)
        res = trainer.test(cfg, model)
        if comm.is_main_process():
            verify_results(cfg, res)
        return res

    trainer.resume_or_load(resume=args.resume)
    return trainer.train()
def get_mAP_results(config_names, configs_list, inference_configs_list):
    # Corruption level 0 is the coco validation set with no corruption;
    # level 10 is open images.
    image_corruption_levels = [0, 1, 2, 3, 4, 5, 10]

    test_dataset_coco = "coco_2017_custom_val"
    test_dataset_open_images = "openimages_val"

    arg_parser = setup_arg_parser()
    args = arg_parser.parse_args()

    # Initiate dataframe dict
    mAP_results = defaultdict(list)

    for config_name, config, inference_config_name in zip(
            config_names, configs_list, inference_configs_list):
        # Setup config
        args.config_file = config
        args.inference_config = inference_config_name
        args.test_dataset = test_dataset_coco
        cfg = setup_config(args, random_seed=args.random_seed, is_testing=True)
        cfg.defrost()

        # Read coco dataset results
        cfg.ACTUAL_TEST_DATASET = args.test_dataset

        for image_corruption_level in image_corruption_levels:
            # Build path to gt instances and inference output
            args.image_corruption_level = image_corruption_level

            if image_corruption_level == 0:
                image_corruption_level = 'Val'
            elif image_corruption_level == 10:
                image_corruption_level = 'OpenIm'
            else:
                image_corruption_level = 'C' + str(image_corruption_level)

            if 'OpenIm' not in image_corruption_level:
                inference_output_dir = get_inference_output_dir(
                    cfg['OUTPUT_DIR'],
                    args.test_dataset,
                    args.inference_config,
                    args.image_corruption_level)
            else:
                args.image_corruption_level = 0
                args.test_dataset = test_dataset_open_images
                inference_output_dir = get_inference_output_dir(
                    cfg['OUTPUT_DIR'],
                    args.test_dataset,
                    args.inference_config,
                    args.image_corruption_level)

            text_file_name = glob.glob(
                os.path.join(inference_output_dir, 'mAP_res.txt'))[0]
            with open(text_file_name, "r") as f:
                mAP = f.read().strip('][\n').split(', ')[0]
                mAP = float(mAP) * 100

            mAP_results['Method Name'].append(config_name)
            mAP_results['Image Corruption Level'].append(
                image_corruption_level)
            mAP_results['mAP'].append(mAP)

    return mAP_results
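# Illustrative sketch (not part of the original pipeline): the 'mAP_res.txt'
# file parsed above is simply a Python list printed to disk by the average
# precision script (the COCO summary stats followed by the optimal score
# threshold). Reading it back therefore amounts to stripping the square
# brackets and splitting on ', '. The file name below is hypothetical.
import os
import tempfile


def _round_trip_map_res_sketch():
    stats = [0.365, 0.551, 0.391] + [0.0] * 9   # stand-in COCO summary stats
    optimal_score_threshold = 0.4713            # stand-in F1-optimal threshold

    file_name = os.path.join(tempfile.gettempdir(), 'mAP_res_example.txt')
    with open(file_name, 'w') as text_file:
        print(stats + [optimal_score_threshold, ], file=text_file)

    with open(file_name, 'r') as f:
        values = f.read().strip('][\n').split(', ')

    mAP = float(values[0]) * 100                     # first entry: AP @ IoU 0.50:0.95
    min_allowed_score = round(float(values[-1]), 4)  # last entry: score threshold
    return mAP, min_allowed_score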
def main(args, cfg=None):
    # Setup config
    if cfg is None:
        cfg = setup_config(args, random_seed=args.random_seed, is_testing=True)

    # Build path to inference output
    inference_output_dir = os.path.join(
        cfg['OUTPUT_DIR'],
        'inference',
        args.test_dataset,
        os.path.split(args.inference_config)[-1][:-5])

    prediction_file_name = os.path.join(
        inference_output_dir,
        'coco_instances_results.json')

    meta_catalog = MetadataCatalog.get(args.test_dataset)

    # Evaluate detection results
    gt_coco_api = COCO(meta_catalog.json_file)
    res_coco_api = gt_coco_api.loadRes(prediction_file_name)
    results_api = COCOeval(gt_coco_api, res_coco_api, iouType='bbox')
    results_api.params.catIds = [
        1, 3]  # list(meta_catalog.thing_dataset_id_to_contiguous_id.keys())

    # Calculate and print aggregate results
    results_api.evaluate()
    results_api.accumulate()
    results_api.summarize()

    # Compute the optimal micro F1-score threshold. We compute the F1 score
    # for every class at every score threshold, find the score threshold that
    # maximizes the F1 score of each class, and average the resulting
    # thresholds over all classes.
    precisions = results_api.eval['precision'].mean(0)[:, :, 0, 2]
    recalls = np.expand_dims(results_api.params.recThrs, 1)
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
    optimal_f1_score = f1_scores.argmax(0)
    scores = results_api.eval['scores'].mean(0)[:, :, 0, 2]
    optimal_score_threshold = [
        scores[optimal_f1_score_i, i]
        for i, optimal_f1_score_i in enumerate(optimal_f1_score)]
    optimal_score_threshold = np.array(optimal_score_threshold)
    optimal_score_threshold = optimal_score_threshold[
        optimal_score_threshold != 0]
    optimal_score_threshold = optimal_score_threshold.mean()

    print("Classification Score at Optimal F-1 Score: {}".format(
        optimal_score_threshold))

    text_file_name = os.path.join(inference_output_dir, 'mAP_res.txt')
    with open(text_file_name, "w") as text_file:
        print(
            results_api.stats.tolist() + [optimal_score_threshold, ],
            file=text_file)
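# Illustrative sketch (assumes synthetic precision/score grids rather than
# COCOeval's 'precision' and 'scores' arrays). It isolates the idea above: for
# each class, compute F1 = 2*P*R/(P+R) over the recall grid, take the score
# threshold at the recall point that maximizes F1, then average over classes.
import numpy as np


def optimal_f1_threshold_sketch(precisions, scores, rec_thrs):
    # precisions, scores: arrays of shape (num_recall_thresholds, num_classes)
    # rec_thrs: recall thresholds of shape (num_recall_thresholds,)
    recalls = np.expand_dims(rec_thrs, 1)
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
    best_recall_idx = f1_scores.argmax(0)                 # per-class argmax
    thresholds = scores[best_recall_idx, np.arange(scores.shape[1])]
    thresholds = thresholds[thresholds != 0]              # drop empty classes
    return thresholds.mean()


# Example with a toy 3-point recall grid and 2 classes:
# rec = np.array([0.1, 0.5, 0.9])
# prec = np.array([[0.9, 0.8], [0.7, 0.6], [0.3, 0.2]])
# scr = np.array([[0.95, 0.90], [0.60, 0.55], [0.20, 0.15]])
# optimal_f1_threshold_sketch(prec, scr, rec)  # -> 0.575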
def main(args):
    # Setup config node
    cfg = setup_config(args, random_seed=args.random_seed)

    # Eval only mode to produce mAP results
    if args.eval_only:
        model = Trainer.build_model(cfg)
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume)
        res = Trainer.test(cfg, model)
        if comm.is_main_process():
            verify_results(cfg, res)
        return res

    # Build Trainer from config node. Begin Training.
    trainer = Trainer(cfg)
    trainer.resume_or_load(resume=args.resume)
    return trainer.train()
def main(args):
    # Setup config
    cfg = setup_config(args, random_seed=args.random_seed, is_testing=True)

    # Make sure only 1 data point is processed at a time. This simulates
    # deployment.
    cfg.defrost()
    cfg.DATALOADER.NUM_WORKERS = 32
    cfg.SOLVER.IMS_PER_BATCH = 1

    cfg.MODEL.DEVICE = device.type

    # Set up number of cpu threads
    torch.set_num_threads(cfg.DATALOADER.NUM_WORKERS)

    # Create inference output directory and copy inference config file to keep
    # track of experimental settings
    inference_output_dir = get_inference_output_dir(
        cfg['OUTPUT_DIR'],
        args.test_dataset,
        args.inference_config,
        args.image_corruption_level)
    os.makedirs(inference_output_dir, exist_ok=True)
    copyfile(
        args.inference_config,
        os.path.join(
            inference_output_dir,
            os.path.split(args.inference_config)[-1]))

    # Get category mapping dictionary:
    train_thing_dataset_id_to_contiguous_id = MetadataCatalog.get(
        cfg.DATASETS.TRAIN[0]).thing_dataset_id_to_contiguous_id
    test_thing_dataset_id_to_contiguous_id = MetadataCatalog.get(
        args.test_dataset).thing_dataset_id_to_contiguous_id

    # If both dicts are equal or if we are performing out of distribution
    # detection, just flip the test dict.
    cat_mapping_dict = get_train_contiguous_id_to_test_thing_dataset_id_dict(
        cfg,
        args,
        train_thing_dataset_id_to_contiguous_id,
        test_thing_dataset_id_to_contiguous_id)

    # Build predictor
    predictor = build_predictor(cfg)
    test_data_loader = build_detection_test_loader(
        cfg, dataset_name=args.test_dataset)

    final_output_list = []

    if not args.eval_only:
        with torch.no_grad():
            with tqdm.tqdm(total=len(test_data_loader)) as pbar:
                for idx, input_im in enumerate(test_data_loader):
                    # Apply corruption
                    outputs = predictor(input_im)
                    # predictor.visualize_inference(input_im, outputs)

                    final_output_list.extend(
                        instances_to_json(
                            outputs,
                            input_im[0]['image_id'],
                            cat_mapping_dict))
                    pbar.update(1)

        with open(
                os.path.join(
                    inference_output_dir,
                    'coco_instances_results.json'),
                'w') as fp:
            json.dump(final_output_list, fp, indent=4, separators=(',', ': '))

    if 'ood' in args.test_dataset:
        compute_ood_probabilistic_metrics.main(args, cfg)
    else:
        compute_average_precision.main(args, cfg)
        compute_probabilistic_metrics.main(args, cfg)
        compute_calibration_errors.main(args, cfg)
def main(args, cfg=None, iou_min=None, iou_correct=None, min_allowed_score=None): # Setup config if cfg is None: cfg = setup_config(args, random_seed=args.random_seed, is_testing=True) cfg.defrost() cfg.ACTUAL_TEST_DATASET = args.test_dataset # Build path to gt instances and inference output inference_output_dir = get_inference_output_dir( cfg['OUTPUT_DIR'], args.test_dataset, args.inference_config, args.image_corruption_level) # Get thresholds to perform evaluation on if iou_min is None: iou_min = args.iou_min if iou_correct is None: iou_correct = args.iou_correct if min_allowed_score is None: # Check if F-1 Score has been previously computed ON THE ORIGINAL # DATASET such as COCO even when evaluating on VOC. try: train_set_inference_output_dir = get_inference_output_dir( cfg['OUTPUT_DIR'], cfg.DATASETS.TEST[0], args.inference_config, 0) with open( os.path.join(train_set_inference_output_dir, "mAP_res.txt"), "r") as f: min_allowed_score = f.read().strip('][\n').split(', ')[-1] min_allowed_score = round(float(min_allowed_score), 4) except FileNotFoundError: # If not, process all detections. Not recommended as the results might be influenced by very low scoring # detections that would normally be removed in robotics/vision # applications. min_allowed_score = 0.0 # get preprocessed instances preprocessed_predicted_instances, preprocessed_gt_instances = evaluation_utils.get_per_frame_preprocessed_instances( cfg, inference_output_dir, min_allowed_score) # get metacatalog and image infos meta_catalog = MetadataCatalog.get(args.test_dataset) images_info = json.load(open(meta_catalog.json_file, 'r'))['images'] # Loop over all images and visualize errors for image_info in images_info: image_id = image_info['id'] image = cv2.imread( os.path.join(meta_catalog.image_root, image_info['file_name'])) image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) predicted_box_means = { image_id: preprocessed_predicted_instances['predicted_boxes'][image_id] } predicted_box_covariances = { image_id: preprocessed_predicted_instances['predicted_covar_mats'][image_id] } predicted_cls_probs = { image_id: preprocessed_predicted_instances['predicted_cls_probs'][image_id] } gt_box_means = { image_id: preprocessed_gt_instances['gt_boxes'][image_id] } gt_cat_idxs = { image_id: preprocessed_gt_instances['gt_cat_idxs'][image_id] } # Perform matching matched_results = evaluation_utils.match_predictions_to_groundtruth( predicted_box_means, predicted_cls_probs, predicted_box_covariances, gt_box_means, gt_cat_idxs, iou_min=iou_min, iou_correct=iou_correct) true_positives = matched_results['true_positives'] duplicates = matched_results['duplicates'] localization_errors = matched_results['localization_errors'] false_positives = matched_results['false_positives'] false_negatives = matched_results['false_negatives'] # Plot True Positive Detections In Blue v = Visualizer(image, meta_catalog, scale=2.0) gt_boxes = true_positives['gt_box_means'].cpu().numpy() true_positive_boxes = true_positives['predicted_box_means'].cpu( ).numpy() false_positives_boxes = false_positives['predicted_box_means'].cpu( ).numpy() duplicates_boxes = duplicates['predicted_box_means'].cpu().numpy() localization_errors_boxes = localization_errors[ 'predicted_box_means'].cpu().numpy() # Get category labels gt_cat_idxs = true_positives['gt_cat_idxs'].cpu().numpy() # Get category mapping dictionary: train_thing_dataset_id_to_contiguous_id = MetadataCatalog.get( cfg.DATASETS.TRAIN[0]).thing_dataset_id_to_contiguous_id test_thing_dataset_id_to_contiguous_id = 
MetadataCatalog.get( args.test_dataset).thing_dataset_id_to_contiguous_id thing_dataset_id_to_contiguous_id = evaluation_utils.get_test_thing_dataset_id_to_train_contiguous_id_dict( cfg, args, train_thing_dataset_id_to_contiguous_id, test_thing_dataset_id_to_contiguous_id) class_list = MetadataCatalog.get( cfg.DATASETS.TRAIN[0]).as_dict()['thing_classes'] if gt_cat_idxs.shape[0] > 0: gt_labels = [ class_list[thing_dataset_id_to_contiguous_id[gt_class]] for gt_class in gt_cat_idxs[:, 0] ] else: gt_labels = [] if cfg.MODEL.META_ARCHITECTURE != "ProbabilisticRetinaNet": if len(true_positives['predicted_cls_probs'] > 0): _, true_positive_classes = true_positives[ 'predicted_cls_probs'][:, :-1].max(1) else: true_positive_classes = np.array([]) if len(duplicates['predicted_cls_probs']) > 0: _, duplicates_classes = duplicates[ 'predicted_cls_probs'][:, :-1].max(1) else: duplicates_classes = np.array([]) if len(localization_errors['predicted_cls_probs']) > 0: _, localization_errors_classes = localization_errors[ 'predicted_cls_probs'][:, :-1].max(1) else: localization_errors_classes = np.array([]) if len(false_positives['predicted_cls_probs']) > 0: _, false_positives_classes = false_positives[ 'predicted_cls_probs'][:, :-1].max(1) else: false_positives_classes = np.array([]) else: if len(true_positives['predicted_cls_probs'] > 0): _, true_positive_classes = true_positives[ 'predicted_cls_probs'].max(1) else: true_positive_classes = np.array([]) if len(duplicates['predicted_cls_probs']) > 0: _, duplicates_classes = duplicates['predicted_cls_probs'].max( 1) else: duplicates_classes = np.array([]) if len(localization_errors['predicted_cls_probs']) > 0: _, localization_errors_classes = localization_errors[ 'predicted_cls_probs'].max(1) else: localization_errors_classes = np.array([]) if len(false_positives['predicted_cls_probs']) > 0: _, false_positives_classes = false_positives[ 'predicted_cls_probs'].max(1) else: false_positives_classes = np.array([]) if len(true_positives['predicted_cls_probs'] > 0): true_positive_classes = true_positive_classes.cpu().numpy() true_positive_labels = [ class_list[tp_class] for tp_class in true_positive_classes ] else: true_positive_labels = [] if len(duplicates['predicted_cls_probs']) > 0: duplicates_classes = duplicates_classes.cpu().numpy() duplicates_labels = [ class_list[d_class] for d_class in duplicates_classes ] else: duplicates_labels = [] if len(localization_errors['predicted_cls_probs']) > 0: localization_errors_classes = localization_errors_classes.cpu( ).numpy() localization_errors_labels = [ class_list[le_class] for le_class in localization_errors_classes ] else: localization_errors_labels = [] if len(false_positives['predicted_cls_probs']) > 0: false_positives_classes = false_positives_classes.cpu().numpy() false_positives_labels = [ class_list[fp_class] for fp_class in false_positives_classes ] else: false_positives_labels = [] # Overlay true positives in blue _ = v.overlay_instances(boxes=gt_boxes, assigned_colors=['lime' for _ in gt_boxes], labels=gt_labels, alpha=1.0) plotted_true_positive_boxes = v.overlay_instances( boxes=true_positive_boxes, assigned_colors=['dodgerblue' for _ in true_positive_boxes], alpha=1.0, labels=true_positive_labels) cv2.imshow( 'True positive detections with IOU greater than {}'.format( iou_correct), cv2.cvtColor(plotted_true_positive_boxes.get_image(), cv2.COLOR_RGB2BGR)) # Plot False Positive Detections In Red v = Visualizer(image, meta_catalog, scale=2.0) _ = v.overlay_instances(boxes=gt_boxes, assigned_colors=['lime' 
for _ in gt_boxes], labels=gt_labels, alpha=0.7) plotted_false_positive_boxes = v.overlay_instances( boxes=false_positives_boxes, assigned_colors=['red' for _ in false_positives_boxes], alpha=1.0, labels=false_positives_labels) cv2.imshow( 'False positive detections with IOU less than {}'.format(iou_min), cv2.cvtColor(plotted_false_positive_boxes.get_image(), cv2.COLOR_RGB2BGR)) # Plot Duplicates v = Visualizer(image, meta_catalog, scale=2.0) _ = v.overlay_instances(boxes=gt_boxes, assigned_colors=['lime' for _ in gt_boxes], labels=gt_labels, alpha=0.7) plotted_duplicates_boxes = v.overlay_instances( boxes=duplicates_boxes, assigned_colors=['magenta' for _ in duplicates_boxes], alpha=1.0, labels=duplicates_labels) cv2.imshow( 'Duplicate Detections', cv2.cvtColor(plotted_duplicates_boxes.get_image(), cv2.COLOR_RGB2BGR)) # Plot localization errors v = Visualizer(image, meta_catalog, scale=2.0) _ = v.overlay_instances(boxes=gt_boxes, assigned_colors=['lime' for _ in gt_boxes], labels=gt_labels, alpha=0.7) plotted_localization_errors_boxes = v.overlay_instances( boxes=localization_errors_boxes, assigned_colors=['aqua' for _ in localization_errors_boxes], alpha=1.0, labels=localization_errors_labels) cv2.imshow( 'Detections with localization errors between minimum IOU = {} and maximum IOU = {}' .format(iou_min, iou_correct), cv2.cvtColor(plotted_localization_errors_boxes.get_image(), cv2.COLOR_RGB2BGR)) # Plot False Negatives Detections In Brown if len(false_negatives['gt_box_means']) > 0: false_negatives_boxes = false_negatives['gt_box_means'].cpu( ).numpy() false_negatives_classes = false_negatives['gt_cat_idxs'].cpu( ).numpy() false_negatives_labels = [ class_list[thing_dataset_id_to_contiguous_id[gt_class[0]]] for gt_class in false_negatives_classes.tolist() ] else: false_negatives_boxes = np.array([]) false_negatives_labels = [] v = Visualizer(image, meta_catalog, scale=2.0) plotted_false_negative_boxes = v.overlay_instances( boxes=false_negatives_boxes, assigned_colors=['coral' for _ in false_negatives_boxes], alpha=1.0, labels=false_negatives_labels) cv2.imshow( 'False negative ground truth.', cv2.cvtColor(plotted_false_negative_boxes.get_image(), cv2.COLOR_RGB2BGR)) cv2.waitKey(0) cv2.destroyAllWindows()
def main( args, cfg=None, min_allowed_score=None): # Setup config if cfg is None: cfg = setup_config(args, random_seed=args.random_seed, is_testing=True) cfg.defrost() cfg.ACTUAL_TEST_DATASET = args.test_dataset # Build path to gt instances and inference output inference_output_dir = get_inference_output_dir( cfg['OUTPUT_DIR'], args.test_dataset, args.inference_config, args.image_corruption_level) # Get thresholds to perform evaluation on if min_allowed_score is None: # Check if F-1 Score has been previously computed. try: with open(os.path.join(inference_output_dir, "mAP_res.txt"), "r") as f: min_allowed_score = f.read().strip('][\n').split(', ')[-1] min_allowed_score = round(float(min_allowed_score), 4) except FileNotFoundError: # If not, process all detections. Not recommended as the results might be influenced by very low scoring # detections that would normally be removed in robotics/vision # applications. min_allowed_score = 0.0 # get preprocessed instances preprocessed_predicted_instances, preprocessed_gt_instances = evaluation_utils.get_per_frame_preprocessed_instances( cfg, inference_output_dir, min_allowed_score) # get metacatalog and image infos meta_catalog = MetadataCatalog.get(args.test_dataset) images_info = json.load(open(meta_catalog.json_file, 'r'))['images'] # Loop over all images and visualize errors for image_info in images_info: image_id = image_info['id'] image = cv2.imread( os.path.join( meta_catalog.image_root, image_info['file_name'])) image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) v = ProbabilisticVisualizer( image, meta_catalog, scale=1.5) class_list = v.metadata.as_dict()['thing_classes'] predicted_box_means = preprocessed_predicted_instances['predicted_boxes'][image_id].cpu( ).numpy() gt_box_means = preprocessed_gt_instances['gt_boxes'][image_id].cpu( ).numpy() predicted_box_covariances = preprocessed_predicted_instances[ 'predicted_covar_mats'][image_id].cpu( ).numpy() predicted_cls_probs = preprocessed_predicted_instances['predicted_cls_probs'][image_id] if predicted_cls_probs.shape[0] > 0: if cfg.MODEL.META_ARCHITECTURE == "ProbabilisticGeneralizedRCNN" or cfg.MODEL.META_ARCHITECTURE == "ProbabilisticDetr": predicted_scores, predicted_classes = predicted_cls_probs[:, :-1].max( 1) predicted_entropies = entropy( predicted_cls_probs.cpu().numpy(), base=2) else: predicted_scores, predicted_classes = predicted_cls_probs.max( 1) predicted_entropies = entropy( np.stack( (predicted_scores.cpu().numpy(), 1 - predicted_scores.cpu().numpy())), base=2) predicted_classes = predicted_classes.cpu( ).numpy() predicted_classes = [class_list[p_class] for p_class in predicted_classes] assigned_colors = cm.autumn(predicted_entropies) predicted_scores = predicted_scores.cpu().numpy() else: predicted_scores=np.array([]) predicted_classes = np.array([]) assigned_colors = [] gt_cat_idxs = preprocessed_gt_instances['gt_cat_idxs'][image_id].cpu( ).numpy() thing_dataset_id_to_contiguous_id = meta_catalog.thing_dataset_id_to_contiguous_id if gt_cat_idxs.shape[0] > 0: gt_labels = [class_list[thing_dataset_id_to_contiguous_id[gt_class]] for gt_class in gt_cat_idxs[:, 0]] else: gt_labels = [] # noinspection PyTypeChecker _ = v.overlay_covariance_instances( boxes=gt_box_means, assigned_colors=[ 'lightgreen' for _ in gt_box_means], labels=gt_labels, alpha=1.0) plotted_detections = v.overlay_covariance_instances( boxes=predicted_box_means, covariance_matrices=predicted_box_covariances, assigned_colors=assigned_colors, alpha=1.0, labels=predicted_classes) cv2.imshow( 'Detected Instances.', 
cv2.cvtColor( plotted_detections.get_image(), cv2.COLOR_RGB2BGR)) cv2.waitKey()
def main( args, cfg=None, iou_min=None, iou_correct=None, min_allowed_score=None): # Setup config if cfg is None: cfg = setup_config(args, random_seed=args.random_seed, is_testing=True) cfg.defrost() cfg.ACTUAL_TEST_DATASET = args.test_dataset # Setup torch device and num_threads torch.set_num_threads(cfg.DATALOADER.NUM_WORKERS) # Build path to gt instances and inference output inference_output_dir = os.path.join( cfg['OUTPUT_DIR'], 'inference', args.test_dataset, os.path.split(args.inference_config)[-1][:-5]) # Get thresholds to perform evaluation on if iou_min is None: iou_min = args.iou_min if iou_correct is None: iou_correct = args.iou_correct if min_allowed_score is None: # Check if F-1 Score has been previously computed ON THE ORIGINAL # DATASET such as COCO even when evaluating on VOC. try: train_set_inference_output_dir = os.path.join( cfg['OUTPUT_DIR'], 'inference', cfg.DATASETS.TEST[0], os.path.split(args.inference_config)[-1][:-5]) with open(os.path.join(train_set_inference_output_dir, "mAP_res.txt"), "r") as f: min_allowed_score = f.read().strip('][\n').split(', ')[-1] min_allowed_score = round(float(min_allowed_score), 4) except FileNotFoundError: # If not, process all detections. Not recommended as the results might be influenced by very low scoring # detections that would normally be removed in robotics/vision # applications. min_allowed_score = 0.0 # Get category mapping dictionary: train_thing_dataset_id_to_contiguous_id = MetadataCatalog.get( cfg.DATASETS.TRAIN[0]).thing_dataset_id_to_contiguous_id test_thing_dataset_id_to_contiguous_id = MetadataCatalog.get( args.test_dataset).thing_dataset_id_to_contiguous_id cat_mapping_dict = get_thing_dataset_id_to_contiguous_id_dict( cfg, args, train_thing_dataset_id_to_contiguous_id, test_thing_dataset_id_to_contiguous_id) # Get matched results by either generating them or loading from file. with torch.no_grad(): matched_results = evaluation_utils.get_matched_results( cfg, inference_output_dir, iou_min=iou_min, iou_correct=iou_correct, min_allowed_score=min_allowed_score) # Build preliminary dicts required for computing classification scores. for matched_results_key in matched_results.keys(): if 'gt_cat_idxs' in matched_results[matched_results_key].keys(): # First we convert the written things indices to contiguous # indices. gt_converted_cat_idxs = matched_results[matched_results_key]['gt_cat_idxs'].squeeze( 1) gt_converted_cat_idxs = torch.as_tensor([cat_mapping_dict[class_idx.cpu( ).tolist()] for class_idx in gt_converted_cat_idxs]).to(device) matched_results[matched_results_key]['gt_converted_cat_idxs'] = gt_converted_cat_idxs.to( device) if 'predicted_cls_probs' in matched_results[matched_results_key].keys( ): predicted_cls_probs = matched_results[matched_results_key]['predicted_cls_probs'] # This is required for evaluation of retinanet based # detections. matched_results[matched_results_key]['predicted_score_of_gt_category'] = torch.gather( predicted_cls_probs, 1, gt_converted_cat_idxs.unsqueeze(1)).squeeze(1) matched_results[matched_results_key]['gt_cat_idxs'] = gt_converted_cat_idxs else: # For false positives, the correct category is background. For retinanet, since no explicit # background category is available, this value is computed as 1.0 - score of the predicted # category. 
predicted_class_probs, predicted_class_idx = matched_results[matched_results_key]['predicted_cls_probs'].max( 1) matched_results[matched_results_key]['predicted_score_of_gt_category'] = 1.0 - \ predicted_class_probs matched_results[matched_results_key]['predicted_cat_idxs'] = predicted_class_idx # Load the different detection partitions true_positives = matched_results['true_positives'] false_negatives = matched_results['false_negatives'] false_positives = matched_results['false_positives'] # Get the number of elements in each partition num_true_positives = true_positives['predicted_box_means'].shape[0] num_false_negatives = false_negatives['gt_box_means'].shape[0] num_false_positives = false_positives['predicted_box_means'].shape[0] per_class_output_list = [] for class_idx in [1, 3]: true_positives_valid_idxs = true_positives['gt_converted_cat_idxs'] == class_idx false_positives_valid_idxs = false_positives['predicted_cat_idxs'] == class_idx # Compute classification metrics for every partition true_positives_cls_analysis = scoring_rules.retinanet_compute_cls_scores( true_positives, true_positives_valid_idxs) false_positives_cls_analysis = scoring_rules.retinanet_compute_cls_scores( false_positives, false_positives_valid_idxs) # Compute regression metrics for every partition true_positives_reg_analysis = scoring_rules.compute_reg_scores( true_positives, true_positives_valid_idxs) false_positives_reg_analysis = scoring_rules.compute_reg_scores_fn( false_positives, false_positives_valid_idxs) per_class_output_list.append( {'true_positives_cls_analysis': true_positives_cls_analysis, 'true_positives_reg_analysis': true_positives_reg_analysis, 'false_positives_cls_analysis': false_positives_cls_analysis, 'false_positives_reg_analysis': false_positives_reg_analysis}) final_accumulated_output_dict = dict() final_average_output_dict = dict() for key in per_class_output_list[0].keys(): average_output_dict = dict() for inner_key in per_class_output_list[0][key].keys(): collected_values = [per_class_output[key][inner_key] for per_class_output in per_class_output_list if per_class_output[key][inner_key] is not None] collected_values = np.array(collected_values) if key in average_output_dict.keys(): # Use nan mean since some classes do not have duplicates for # instance or has one duplicate for instance. torch.std returns nan in that case # so we handle those here. This should not have any effect on the final results, as # it only affects inter-class variance which we do not # report anyways. 
average_output_dict[key].update( {inner_key: np.nanmean(collected_values)}) final_accumulated_output_dict[key].update( {inner_key: collected_values}) else: average_output_dict.update( {key: {inner_key: np.nanmean(collected_values)}}) final_accumulated_output_dict.update( {key: {inner_key: collected_values}}) final_average_output_dict.update(average_output_dict) # Summarize and print all table = PrettyTable() table.field_names = (['Output Type', 'Number of Instances', 'Cls Ignorance Score', 'Reg Ignorance Score']) table.add_row( [ "True Positives:", num_true_positives, '{:.4f}'.format( final_average_output_dict['true_positives_cls_analysis']['ignorance_score_mean']), '{:.4f}'.format( final_average_output_dict['true_positives_reg_analysis']['ignorance_score_mean'])]) table.add_row( [ "False Positives:", num_false_positives, '{:.4f}'.format( final_average_output_dict['false_positives_cls_analysis']['ignorance_score_mean']), '{:.4f}'.format( final_average_output_dict['false_positives_reg_analysis']['total_entropy_mean'])]) table.add_row(["False Negatives:", num_false_negatives, '-', '-']) print(table)
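# Illustrative sketch (not the repo's scoring_rules implementation): the
# "ignorance score" reported in the table above is the negative log-likelihood
# of the ground truth under the prediction. For regression this is
# -log N(gt_box; mean, cov); for classification it is -log p(gt_category).
# The function name and argument layout below are hypothetical.
import torch


def ignorance_scores_sketch(pred_means, pred_covs, gt_boxes,
                            pred_cls_probs, gt_cat_idxs):
    # Regression ignorance: NLL of the gt corners under a multivariate Gaussian.
    dists = torch.distributions.MultivariateNormal(
        pred_means, covariance_matrix=pred_covs)
    reg_ignorance = -dists.log_prob(gt_boxes).mean()

    # Classification ignorance: NLL of the gt category under the predicted
    # categorical distribution.
    gt_probs = torch.gather(
        pred_cls_probs, 1, gt_cat_idxs.unsqueeze(1)).squeeze(1)
    cls_ignorance = -torch.log(gt_probs + 1e-12).mean()
    return cls_ignorance.item(), reg_ignorance.item()


# Example usage with toy tensors:
# n = 4
# means = torch.randn(n, 4)
# covs = torch.eye(4).repeat(n, 1, 1) * 0.5
# probs = torch.softmax(torch.randn(n, 3), dim=1)
# ignorance_scores_sketch(
#     means, covs, means + 0.1, probs, torch.zeros(n, dtype=torch.long))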
def main(args): cfg = setup_config(args, random_seed=args.random_seed, is_testing=True) inference_output_dir = get_inference_output_dir( cfg['OUTPUT_DIR'], args.test_dataset, args.inference_config, args.image_corruption_level) # Check if F-1 Score has been previously computed ON THE ORIGINAL # DATASET such as COCO even when evaluating on OpenImages. try: train_set_inference_output_dir = get_inference_output_dir( cfg['OUTPUT_DIR'], cfg.DATASETS.TEST[0], args.inference_config, 0) with open(os.path.join(train_set_inference_output_dir, "mAP_res.txt"), "r") as f: min_allowed_score = f.read().strip('][\n').split(', ')[-1] min_allowed_score = round(float(min_allowed_score), 4) except FileNotFoundError: # If not, process all detections. Not recommended as the results might be influenced by very low scoring # detections that would normally be removed in robotics/vision # applications. min_allowed_score = 0.0 iou_thresholds = np.arange(0.5, 1.0, 0.05).round(2) probabilistic_detection_dicts = [] calibration_dicts = [] for iou_correct in iou_thresholds: print("Processing detections at {} iou threshold...".format(iou_correct)) probabilistic_scores_file_name = os.path.join( inference_output_dir, 'probabilistic_scoring_res_{}_{}_{}.pkl'.format( args.iou_min, iou_correct, min_allowed_score)) calibration_file_name = os.path.join( inference_output_dir, 'calibration_errors_res_{}_{}_{}.pkl'.format( args.iou_min, iou_correct, min_allowed_score)) try: with open(probabilistic_scores_file_name, "rb") as f: probabilistic_scores = pickle.load(f) except FileNotFoundError: compute_probabilistic_metrics.main( args, cfg, iou_correct=iou_correct, print_results=False) with open(probabilistic_scores_file_name, "rb") as f: probabilistic_scores = pickle.load(f) try: with open(calibration_file_name, "rb") as f: calibration_errors = pickle.load(f) except FileNotFoundError: compute_calibration_errors.main( args, cfg, iou_correct=iou_correct, print_results=False) with open(calibration_file_name, "rb") as f: calibration_errors = pickle.load(f) probabilistic_detection_dicts.append(probabilistic_scores) calibration_dicts.append(calibration_errors) probabilistic_detection_final_dict = { key: {} for key in probabilistic_detection_dicts[0].keys()} for key in probabilistic_detection_dicts[0].keys(): for key_l2 in probabilistic_detection_dicts[0][key].keys(): accumulated_values = [ probabilistic_detection_dicts[i][key][key_l2] for i in range( len(probabilistic_detection_dicts))] probabilistic_detection_final_dict[key].update( {key_l2: np.nanmean(np.array(accumulated_values), 0)}) calibration_final_dict = {key: None for key in calibration_dicts[0].keys()} for key in calibration_dicts[0].keys(): accumulated_values = [ calibration_dicts[i][key] for i in range( len(calibration_dicts))] calibration_final_dict[key] = np.nanmean( np.array(accumulated_values), 0) dictionary_file_name = os.path.join( inference_output_dir, 'probabilistic_scoring_res_averaged_{}.pkl'.format(min_allowed_score)) with open(dictionary_file_name, "wb") as pickle_file: pickle.dump(probabilistic_detection_final_dict, pickle_file) dictionary_file_name = os.path.join( inference_output_dir, 'calibration_res_averaged_{}.pkl'.format( min_allowed_score)) with open(dictionary_file_name, "wb") as pickle_file: pickle.dump(calibration_final_dict, pickle_file) # Summarize and print all table = PrettyTable() table.field_names = (['Output Type', 'Cls Ignorance Score', 'Cls Brier/Probability Score', 'Reg Ignorance Score', 'Reg Energy Score']) table.add_row( [ "True Positives:", 
'{:.4f}'.format( np.nanmean(probabilistic_detection_final_dict['true_positives_cls_analysis']['ignorance_score_mean'])), '{:.4f}'.format( np.nanmean(probabilistic_detection_final_dict['true_positives_cls_analysis']['brier_score_mean'])), '{:.4f}'.format( np.nanmean(probabilistic_detection_final_dict['true_positives_reg_analysis']['ignorance_score_mean'])), '{:.4f}'.format( np.nanmean(probabilistic_detection_final_dict['true_positives_reg_analysis']['energy_score_mean']))]) table.add_row( [ "Duplicates:", '{:.4f}'.format( np.nanmean(probabilistic_detection_final_dict['duplicates_cls_analysis']['ignorance_score_mean'])), '{:.4f}'.format( np.nanmean(probabilistic_detection_final_dict['duplicates_cls_analysis']['brier_score_mean'])), '{:.4f}'.format( np.nanmean(probabilistic_detection_final_dict['duplicates_reg_analysis']['ignorance_score_mean'])), '{:.4f}'.format( np.nanmean(probabilistic_detection_final_dict['duplicates_reg_analysis']['energy_score_mean']))]) table.add_row( [ "Localization Errors:", '{:.4f}'.format( np.nanmean(probabilistic_detection_final_dict['localization_errors_cls_analysis']['ignorance_score_mean'])), '{:.4f}'.format( np.nanmean(probabilistic_detection_final_dict['localization_errors_cls_analysis']['brier_score_mean'])), '{:.4f}'.format( np.nanmean(probabilistic_detection_final_dict['localization_errors_reg_analysis']['ignorance_score_mean'])), '{:.4f}'.format( np.nanmean(probabilistic_detection_final_dict['localization_errors_reg_analysis']['energy_score_mean']))]) table.add_row( [ "False Positives:", '{:.4f}'.format( np.nanmean(probabilistic_detection_final_dict['false_positives_cls_analysis']['ignorance_score_mean'])), '{:.4f}'.format( np.nanmean(probabilistic_detection_final_dict['false_positives_cls_analysis']['brier_score_mean'])), '{:.4f}'.format( np.nanmean(probabilistic_detection_final_dict['false_positives_reg_analysis']['total_entropy_mean'])), '{:.4f}'.format( np.nanmean(probabilistic_detection_final_dict['false_positives_reg_analysis']['fp_energy_score_mean']))]) print(table) text_file_name = os.path.join( inference_output_dir, 'probabilistic_scoring_res_averaged_{}.txt'.format( min_allowed_score)) with open(text_file_name, "w") as text_file: print(table, file=text_file) table = PrettyTable() table.field_names = (['Cls Marginal Calibration Error', 'Reg Expected Calibration Error', 'Reg Maximum Calibration Error']) table.add_row( [ '{:.4f}'.format( calibration_final_dict['cls_marginal_calibration_error']), '{:.4f}'.format( calibration_final_dict['reg_expected_calibration_error']), '{:.4f}'.format( calibration_final_dict['reg_maximum_calibration_error'])]) text_file_name = os.path.join( inference_output_dir, 'calibration_res_averaged_{}.txt'.format( min_allowed_score)) with open(text_file_name, "w") as text_file: print(table, file=text_file) print(table)
def get_matched_results_dicts(config_names, configs_list, inference_configs_list, iou_min=0.1, iou_correct=0.5): # Level 0 is coco validation set with no corruption, level 10 is open # images, level 11 is open images ood image_corruption_levels = [0, 10, 11] test_dataset_coco = "coco_2017_custom_val" test_dataset_open_images = "openimages_val" test_dataset_open_images_odd = "openimages_odd_val" arg_parser = setup_arg_parser() args = arg_parser.parse_args() # Initiate dataframe dict res_dict_clean = defaultdict( lambda: defaultdict(lambda: defaultdict(list))) for config_name, config, inference_config_name in zip( config_names, configs_list, inference_configs_list): # Setup config args.config_file = config args.inference_config = inference_config_name args.test_dataset = test_dataset_coco cfg = setup_config(args, random_seed=args.random_seed, is_testing=True) cfg.defrost() # Read coco dataset results cfg.ACTUAL_TEST_DATASET = args.test_dataset for image_corruption_level in image_corruption_levels: # Build path to gt instances and inference output args.image_corruption_level = image_corruption_level if image_corruption_level == 0: image_corruption_level = 'Val' elif image_corruption_level == 10: image_corruption_level = 'OpenIm' elif image_corruption_level == 11: image_corruption_level = 'OpenIm OOD' else: image_corruption_level = 'C' + str(image_corruption_level) if 'OpenIm' not in image_corruption_level: inference_output_dir = get_inference_output_dir( cfg['OUTPUT_DIR'], args.test_dataset, args.inference_config, args.image_corruption_level) # Get matched results by either generating them or loading from # file. dictionary_file_name = glob.glob( os.path.join( inference_output_dir, "matched_results_{}_{}_*.pth".format( iou_min, iou_correct)))[0] matched_results = torch.load(dictionary_file_name, map_location='cuda') elif image_corruption_level == 'OpenIm': args.image_corruption_level = 0 args.test_dataset = test_dataset_open_images if image_corruption_level == 'OpenIm' else test_dataset_open_images_odd inference_output_dir = get_inference_output_dir( cfg['OUTPUT_DIR'], args.test_dataset, args.inference_config, args.image_corruption_level) dictionary_file_name = glob.glob( os.path.join( inference_output_dir, "matched_results_{}_{}_*.pth".format( iou_min, iou_correct)))[0] matched_results = torch.load(dictionary_file_name, map_location='cuda') else: args.image_corruption_level = 0 args.test_dataset = test_dataset_open_images if image_corruption_level == 'OpenIm' else test_dataset_open_images_odd inference_output_dir = get_inference_output_dir( cfg['OUTPUT_DIR'], args.test_dataset, args.inference_config, args.image_corruption_level) dictionary_file_name = glob.glob( os.path.join( inference_output_dir, "preprocessed_predicted_instances_odd_*.pth"))[0] preprocessed_predicted_instances = torch.load( dictionary_file_name, map_location='cuda') predicted_boxes = preprocessed_predicted_instances[ 'predicted_boxes'] predicted_cov_mats = preprocessed_predicted_instances[ 'predicted_covar_mats'] predicted_cls_probs = preprocessed_predicted_instances[ 'predicted_cls_probs'] predicted_boxes = list( itertools.chain.from_iterable([ predicted_boxes[key] for key in predicted_boxes.keys() ])) predicted_cov_mats = list( itertools.chain.from_iterable([ predicted_cov_mats[key] for key in predicted_cov_mats.keys() ])) predicted_cls_probs = list( itertools.chain.from_iterable([ predicted_cls_probs[key] for key in predicted_cls_probs.keys() ])) predicted_boxes = torch.stack(predicted_boxes, 1).transpose(0, 1) 
predicted_cov_mats = torch.stack(predicted_cov_mats, 1).transpose(0, 1) predicted_cls_probs = torch.stack(predicted_cls_probs, 1).transpose(0, 1) matched_results = { 'predicted_box_means': predicted_boxes, 'predicted_box_covariances': predicted_cov_mats, 'predicted_cls_probs': predicted_cls_probs } if image_corruption_level != 'OpenIm OOD': all_results_means = torch.cat(( matched_results['true_positives']['predicted_box_means'], matched_results['localization_errors'] ['predicted_box_means'], matched_results['duplicates']['predicted_box_means'], matched_results['false_positives']['predicted_box_means'])) all_results_covs = torch.cat(( matched_results['true_positives'] ['predicted_box_covariances'], matched_results['localization_errors'] ['predicted_box_covariances'], matched_results['duplicates']['predicted_box_covariances'], matched_results['false_positives'] ['predicted_box_covariances'])) all_gt_means = torch.cat(( matched_results['true_positives']['gt_box_means'], matched_results['localization_errors']['gt_box_means'], matched_results['duplicates']['gt_box_means'], matched_results['false_positives']['predicted_box_means'] * np.NaN)) predicted_multivariate_normal_dists = torch.distributions.multivariate_normal.MultivariateNormal( all_results_means.to('cpu'), all_results_covs.to('cpu') + 1e-2 * torch.eye(all_results_covs.shape[2]).to('cpu')) predicted_multivariate_normal_dists.loc = predicted_multivariate_normal_dists.loc.to( 'cuda') predicted_multivariate_normal_dists.scale_tril = predicted_multivariate_normal_dists.scale_tril.to( 'cuda') predicted_multivariate_normal_dists._unbroadcasted_scale_tril = predicted_multivariate_normal_dists._unbroadcasted_scale_tril.to( 'cuda') predicted_multivariate_normal_dists.covariance_matrix = predicted_multivariate_normal_dists.covariance_matrix.to( 'cuda') predicted_multivariate_normal_dists.precision_matrix = predicted_multivariate_normal_dists.precision_matrix.to( 'cuda') all_entropy = predicted_multivariate_normal_dists.entropy() all_log_prob = -predicted_multivariate_normal_dists.log_prob( all_gt_means) # Energy Score. 
sample_set = predicted_multivariate_normal_dists.sample( (3, )).to('cuda') sample_set_1 = sample_set[:-1] sample_set_2 = sample_set[1:] energy_score = torch.norm( (sample_set_1 - all_gt_means), dim=2).mean(0) - 0.5 * torch.norm( (sample_set_1 - sample_set_2), dim=2).mean(0) mse_loss = torch.nn.MSELoss(reduction='none') mse = mse_loss(all_gt_means, all_results_means).mean(1) res_dict_clean[config_name][image_corruption_level][ 'Entropy'].extend(all_entropy.cpu().numpy()) res_dict_clean[config_name][image_corruption_level][ 'MSE'].extend(mse.cpu().numpy()) res_dict_clean[config_name][image_corruption_level][ 'NLL'].extend(all_log_prob.cpu().numpy()) res_dict_clean[config_name][image_corruption_level][ 'ED'].extend(energy_score.cpu().numpy()) res_dict_clean[config_name][image_corruption_level][ 'IOU With GT'].extend( torch.cat( (matched_results['true_positives'] ['iou_with_ground_truth'], matched_results['localization_errors'] ['iou_with_ground_truth'][:, 0], matched_results['duplicates'] ['iou_with_ground_truth'], torch.zeros( matched_results['false_positives'] ['predicted_box_means'].shape[0]).to('cuda') * np.NaN)).cpu().numpy()) predicted_multivariate_normal_dists = torch.distributions.multivariate_normal.MultivariateNormal( matched_results['false_positives'] ['predicted_box_means'].to('cpu'), matched_results['false_positives'] ['predicted_box_covariances'].to('cpu') + 1e-2 * torch.eye(matched_results['false_positives'][ 'predicted_box_covariances'].shape[2]).to('cpu')) predicted_multivariate_normal_dists.loc = predicted_multivariate_normal_dists.loc.to( 'cuda') predicted_multivariate_normal_dists.scale_tril = predicted_multivariate_normal_dists.scale_tril.to( 'cuda') predicted_multivariate_normal_dists._unbroadcasted_scale_tril = predicted_multivariate_normal_dists._unbroadcasted_scale_tril.to( 'cuda') predicted_multivariate_normal_dists.covariance_matrix = predicted_multivariate_normal_dists.covariance_matrix.to( 'cuda') predicted_multivariate_normal_dists.precision_matrix = predicted_multivariate_normal_dists.precision_matrix.to( 'cuda') FP_Entropy = predicted_multivariate_normal_dists.entropy() res_dict_clean[config_name][image_corruption_level][ 'FP_Entropy'].extend(FP_Entropy.cpu().numpy()) predicted_cat_dists_fp = matched_results['false_positives'][ 'predicted_cls_probs'] if predicted_cat_dists_fp.shape[1] == 80: predicted_cat_dists_fp, _ = predicted_cat_dists_fp.max( dim=1) predicted_cat_dists_fp = 1 - predicted_cat_dists_fp predicted_categorical_dists = torch.distributions.Bernoulli( probs=predicted_cat_dists_fp) else: predicted_categorical_dists = torch.distributions.Categorical( probs=matched_results['false_positives'] ['predicted_cls_probs']) all_pred_ent = predicted_categorical_dists.entropy() res_dict_clean[config_name][image_corruption_level][ 'Cat_Entropy'].extend(all_pred_ent.cpu().numpy()) if image_corruption_level == 'OpenIm': res_dict_clean[config_name][image_corruption_level][ 'Truncated'].extend( torch.cat( (matched_results['true_positives'] ['is_truncated'], matched_results['localization_errors'] ['is_truncated'], matched_results['duplicates']['is_truncated'], torch.full( (matched_results['false_positives'] ['predicted_box_means'].shape[0], ), -1, dtype=torch.float32).to('cuda') * np.NaN)).cpu().numpy()) res_dict_clean[config_name][image_corruption_level][ 'Occluded'].extend( torch.cat( (matched_results['true_positives'] ['is_occluded'], matched_results['localization_errors'] ['is_occluded'], matched_results['duplicates']['is_occluded'], torch.full( 
(matched_results['false_positives'] ['predicted_box_means'].shape[0], ), -1, dtype=torch.float32).to('cuda') * np.NaN)).cpu().numpy()) else: res_dict_clean[config_name][image_corruption_level][ 'Truncated'].extend( torch.cat( (torch.full( (matched_results['true_positives'] ['predicted_box_means'].shape[0], ), -1, dtype=torch.float32).to('cuda') * np.NaN, torch.full( (matched_results['localization_errors'] ['predicted_box_means'].shape[0], ), -1, dtype=torch.float32).to('cuda'), torch.full( (matched_results['duplicates'] ['predicted_box_means'].shape[0], ), -1, dtype=torch.float32).to('cuda'), torch.full( (matched_results['false_positives'] ['predicted_box_means'].shape[0], ), -1, dtype=torch.float32).to('cuda') * np.NaN)).cpu().numpy()) res_dict_clean[config_name][image_corruption_level][ 'Occluded'].extend( torch.cat( (torch.full( (matched_results['true_positives'] ['predicted_box_means'].shape[0], ), -1, dtype=torch.float32).to('cuda') * np.NaN, torch.full( (matched_results['localization_errors'] ['predicted_box_means'].shape[0], ), -1, dtype=torch.float32).to('cuda') * np.NaN, torch.full( (matched_results['duplicates'] ['predicted_box_means'].shape[0], ), -1, dtype=torch.float32).to('cuda') * np.NaN, torch.full( (matched_results['false_positives'] ['predicted_box_means'].shape[0], ), -1, dtype=torch.float32).to('cuda') * np.NaN)).cpu().numpy()) else: predicted_multivariate_normal_dists = torch.distributions.multivariate_normal.MultivariateNormal( matched_results['predicted_box_means'].to('cpu'), matched_results['predicted_box_covariances'].to('cpu') + 1e-2 * torch.eye(matched_results['predicted_box_covariances']. shape[2]).to('cpu')) predicted_multivariate_normal_dists.loc = predicted_multivariate_normal_dists.loc.to( 'cuda') predicted_multivariate_normal_dists.scale_tril = predicted_multivariate_normal_dists.scale_tril.to( 'cuda') predicted_multivariate_normal_dists._unbroadcasted_scale_tril = predicted_multivariate_normal_dists._unbroadcasted_scale_tril.to( 'cuda') predicted_multivariate_normal_dists.covariance_matrix = predicted_multivariate_normal_dists.covariance_matrix.to( 'cuda') predicted_multivariate_normal_dists.precision_matrix = predicted_multivariate_normal_dists.precision_matrix.to( 'cuda') all_entropy = predicted_multivariate_normal_dists.entropy() res_dict_clean[config_name][image_corruption_level][ 'FP_Entropy'].extend(all_entropy.cpu().numpy()) res_dict_clean[config_name][image_corruption_level][ 'IOU With GT'].extend( torch.zeros(matched_results['predicted_box_means']. shape[0]).cpu().numpy()) res_dict_clean[config_name][image_corruption_level][ 'Truncated'].extend( torch.full(( matched_results['predicted_box_means'].shape[0], ), -1, dtype=torch.float32).cpu().numpy() * np.NaN) res_dict_clean[config_name][image_corruption_level][ 'Occluded'].extend( torch.full(( matched_results['predicted_box_means'].shape[0], ), -1, dtype=torch.float32).cpu().numpy() * np.NaN) all_results_cat = matched_results['predicted_cls_probs'] if all_results_cat.shape[1] == 80: predicted_cat_dists_fp, _ = all_results_cat.max(dim=1) predicted_cat_dists_fp = 1 - predicted_cat_dists_fp predicted_categorical_dists = torch.distributions.Bernoulli( probs=predicted_cat_dists_fp) else: predicted_categorical_dists = torch.distributions.Categorical( probs=all_results_cat) all_pred_ent = predicted_categorical_dists.entropy() res_dict_clean[config_name][image_corruption_level][ 'Cat_Entropy'].extend(all_pred_ent.cpu().numpy()) return res_dict_clean
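# Illustrative sketch (standalone, not part of the original file): the energy
# score estimated above is ES(F, y) = E||X - y|| - 0.5 * E||X - X'||, with X
# and X' independent samples from the predicted box distribution F. The code
# above estimates it with a handful of Monte Carlo samples; this version makes
# the estimator explicit. The default sample count is an assumption.
import torch


def energy_score_sketch(pred_means, pred_covs, gt_boxes, num_samples=3):
    dists = torch.distributions.MultivariateNormal(
        pred_means, covariance_matrix=pred_covs)
    samples = dists.sample((num_samples,))        # (num_samples, N, 4)
    samples_1 = samples[:-1]                      # X
    samples_2 = samples[1:]                       # X' (shifted copies)

    # First term: mean distance between samples and the ground truth box.
    term_1 = torch.norm(samples_1 - gt_boxes, dim=2).mean(0)
    # Second term: mean distance between sample pairs (spread of F).
    term_2 = torch.norm(samples_1 - samples_2, dim=2).mean(0)
    return term_1 - 0.5 * term_2                  # per-detection energy score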
def get_clean_results_dict(config_names, configs_list, inference_configs_list): # Level 0 is coco validation set with no corruption, level 10 is open # images, level 11 is open images ood image_corruption_levels = [0, 1, 3, 5, 10, 11] test_dataset_coco = "coco_2017_custom_val" test_dataset_open_images = "openimages_val" test_dataset_open_images_odd = "openimages_odd_val" arg_parser = setup_arg_parser() args = arg_parser.parse_args() # Initiate dataframe dict res_dict_clean = defaultdict(lambda: defaultdict(list)) for config_name, config, inference_config_name in zip( config_names, configs_list, inference_configs_list): # Setup config args.config_file = config args.inference_config = inference_config_name args.test_dataset = test_dataset_coco cfg = setup_config(args, random_seed=args.random_seed, is_testing=True) cfg.defrost() # Read coco dataset results cfg.ACTUAL_TEST_DATASET = args.test_dataset for image_corruption_level in image_corruption_levels: # Build path to gt instances and inference output args.image_corruption_level = image_corruption_level if image_corruption_level == 0: image_corruption_level = 'Val' elif image_corruption_level == 10: image_corruption_level = 'OpenIm' elif image_corruption_level == 11: image_corruption_level = 'OpenIm OOD' else: image_corruption_level = 'C' + str(image_corruption_level) if 'OpenIm' not in image_corruption_level: inference_output_dir = get_inference_output_dir( cfg['OUTPUT_DIR'], args.test_dataset, args.inference_config, args.image_corruption_level) dictionary_file_name = glob.glob( os.path.join( inference_output_dir, 'probabilistic_scoring_res_averaged_*.pkl'))[0] else: args.image_corruption_level = 0 args.test_dataset = test_dataset_open_images if image_corruption_level == 'OpenIm' else test_dataset_open_images_odd inference_output_dir = get_inference_output_dir( cfg['OUTPUT_DIR'], args.test_dataset, args.inference_config, args.image_corruption_level) prob_dict_name = 'probabilistic_scoring_res_averaged_*.pkl' if image_corruption_level == 'OpenIm' else 'probabilistic_scoring_res_odd_*.pkl' dictionary_file_name = glob.glob( os.path.join(inference_output_dir, prob_dict_name))[0] with open(dictionary_file_name, "rb") as pickle_file: res_dict = pickle.load(pickle_file) if image_corruption_level != 'OpenIm OOD': # True Positives Results res_dict_clean['True Positives'][ 'Negative Log Likelihood (Classification)'].extend( res_dict['true_positives_cls_analysis'] ['ignorance_score_mean']) res_dict_clean['True Positives']['Brier Score'].extend( res_dict['true_positives_cls_analysis'] ['brier_score_mean']) res_dict_clean['True Positives'][ 'Negative Log Likelihood (Regression)'].extend( res_dict['true_positives_reg_analysis'] ['ignorance_score_mean']) res_dict_clean['True Positives'][ 'Mean Squared Error'].extend( res_dict['true_positives_reg_analysis'] ['mean_squared_error']) res_dict_clean['True Positives']['Energy Score'].extend( res_dict['true_positives_reg_analysis'] ['energy_score_mean']) res_dict_clean['True Positives'][ 'Image Corruption Level'].extend( [image_corruption_level] * res_dict['true_positives_reg_analysis'] ['energy_score_mean'].shape[0]) res_dict_clean['True Positives']['Method Name'].extend( [config_name] * res_dict['true_positives_reg_analysis'] ['energy_score_mean'].shape[0]) # Duplicates Results res_dict_clean['Duplicates'][ 'Negative Log Likelihood (Classification)'].extend( res_dict['duplicates_cls_analysis'] ['ignorance_score_mean']) res_dict_clean['Duplicates']['Brier Score'].extend( res_dict['duplicates_cls_analysis'] 
['brier_score_mean']) res_dict_clean['Duplicates'][ 'Negative Log Likelihood (Regression)'].extend( res_dict['duplicates_reg_analysis'] ['ignorance_score_mean']) res_dict_clean['Duplicates']['Mean Squared Error'].extend( res_dict['duplicates_reg_analysis'] ['mean_squared_error']) res_dict_clean['Duplicates']['Energy Score'].extend( res_dict['duplicates_reg_analysis'] ['energy_score_mean']) res_dict_clean['Duplicates'][ 'Image Corruption Level'].extend( [image_corruption_level] * res_dict['duplicates_reg_analysis'] ['energy_score_mean'].shape[0]) res_dict_clean['Duplicates']['Method Name'].extend( [config_name] * res_dict['duplicates_reg_analysis'] ['energy_score_mean'].shape[0]) # Localization Error Results res_dict_clean['Localization Errors'][ 'Negative Log Likelihood (Classification)'].extend( res_dict['localization_errors_cls_analysis'] ['ignorance_score_mean']) res_dict_clean['Localization Errors'][ 'Brier Score'].extend( res_dict['localization_errors_cls_analysis'] ['brier_score_mean']) res_dict_clean['Localization Errors'][ 'Negative Log Likelihood (Regression)'].extend( res_dict['localization_errors_reg_analysis'] ['ignorance_score_mean']) res_dict_clean['Localization Errors'][ 'Mean Squared Error'].extend( res_dict['localization_errors_reg_analysis'] ['mean_squared_error']) res_dict_clean['Localization Errors'][ 'Energy Score'].extend( res_dict['localization_errors_reg_analysis'] ['energy_score_mean']) res_dict_clean['Localization Errors'][ 'Image Corruption Level'].extend( [image_corruption_level] * res_dict['localization_errors_reg_analysis'] ['energy_score_mean'].shape[0]) res_dict_clean['Localization Errors'][ 'Method Name'].extend( [config_name] * res_dict['localization_errors_reg_analysis'] ['energy_score_mean'].shape[0]) # False Positives Results res_dict_clean['False Positives'][ 'Negative Log Likelihood (Classification)'].extend( res_dict['false_positives_cls_analysis'] ['ignorance_score_mean']) res_dict_clean['False Positives']['Brier Score'].extend( res_dict['false_positives_cls_analysis'] ['brier_score_mean']) res_dict_clean['False Positives']['Entropy'].extend( res_dict['false_positives_reg_analysis'] ['total_entropy_mean']) res_dict_clean['False Positives'][ 'Image Corruption Level'].extend( [image_corruption_level] * res_dict['false_positives_reg_analysis'] ['total_entropy_mean'].shape[0]) res_dict_clean['False Positives']['Method Name'].extend( [config_name] * res_dict['false_positives_reg_analysis'] ['total_entropy_mean'].shape[0]) else: # False Positives Results res_dict_clean['False Positives'][ 'Negative Log Likelihood (Classification)'].append( res_dict['ignorance_score_mean']) res_dict_clean['False Positives']['Brier Score'].append( res_dict['brier_score_mean']) res_dict_clean['False Positives']['Entropy'].append( res_dict['total_entropy_mean']) res_dict_clean['False Positives'][ 'Image Corruption Level'].append( image_corruption_level) res_dict_clean['False Positives']['Method Name'].append( config_name) return res_dict_clean
def main(args, cfg=None, min_allowed_score=None): # Setup config if cfg is None: cfg = setup_config(args, random_seed=args.random_seed, is_testing=True) cfg.defrost() cfg.ACTUAL_TEST_DATASET = args.test_dataset # Setup torch device and num_threads torch.set_num_threads(cfg.DATALOADER.NUM_WORKERS) # Build path to gt instances and inference output inference_output_dir = get_inference_output_dir( cfg['OUTPUT_DIR'], args.test_dataset, args.inference_config, args.image_corruption_level) if min_allowed_score is None: # Check if F-1 Score has been previously computed ON THE ORIGINAL # DATASET, and not on VOC. try: train_set_inference_output_dir = get_inference_output_dir( cfg['OUTPUT_DIR'], cfg.DATASETS.TEST[0], args.inference_config, 0) with open( os.path.join(train_set_inference_output_dir, "mAP_res.txt"), "r") as f: min_allowed_score = f.read().strip('][\n').split(', ')[-1] min_allowed_score = round(float(min_allowed_score), 4) except FileNotFoundError: # If not, process all detections. Not recommended as the results might be influenced by very low scoring # detections that would normally be removed in robotics/vision # applications. min_allowed_score = 0.0 # Get matched results by either generating them or loading from file. with torch.no_grad(): try: preprocessed_predicted_instances = torch.load(os.path.join( inference_output_dir, "preprocessed_predicted_instances_odd_{}.pth".format( min_allowed_score)), map_location=device) # Process predictions except FileNotFoundError: prediction_file_name = os.path.join(inference_output_dir, 'coco_instances_results.json') predicted_instances = json.load(open(prediction_file_name, 'r')) preprocessed_predicted_instances = eval_predictions_preprocess( predicted_instances, min_allowed_score=min_allowed_score, is_odd=True) torch.save( preprocessed_predicted_instances, os.path.join( inference_output_dir, "preprocessed_predicted_instances_odd_{}.pth".format( min_allowed_score))) predicted_boxes = preprocessed_predicted_instances['predicted_boxes'] predicted_cov_mats = preprocessed_predicted_instances[ 'predicted_covar_mats'] predicted_cls_probs = preprocessed_predicted_instances[ 'predicted_cls_probs'] predicted_boxes = list( itertools.chain.from_iterable( [predicted_boxes[key] for key in predicted_boxes.keys()])) predicted_cov_mats = list( itertools.chain.from_iterable([ predicted_cov_mats[key] for key in predicted_cov_mats.keys() ])) predicted_cls_probs = list( itertools.chain.from_iterable([ predicted_cls_probs[key] for key in predicted_cls_probs.keys() ])) num_false_positives = len(predicted_boxes) valid_idxs = torch.as_tensor([i for i in range(num_false_positives) ]).to(device) predicted_boxes = torch.stack(predicted_boxes, 1).transpose(0, 1) predicted_cov_mats = torch.stack(predicted_cov_mats, 1).transpose(0, 1) predicted_cls_probs = torch.stack(predicted_cls_probs, 1).transpose(0, 1) false_positives_dict = { 'predicted_box_means': predicted_boxes, 'predicted_box_covariances': predicted_cov_mats, 'predicted_cls_probs': predicted_cls_probs } false_positives_reg_analysis = scoring_rules.compute_reg_scores_fn( false_positives_dict, valid_idxs) if cfg.MODEL.META_ARCHITECTURE == 'ProbabilisticRetinaNet': predicted_class_probs, predicted_class_idx = predicted_cls_probs.max( 1) false_positives_dict['predicted_score_of_gt_category'] = 1.0 - \ predicted_class_probs false_positives_cls_analysis = scoring_rules.sigmoid_compute_cls_scores( false_positives_dict, valid_idxs) else: false_positives_dict[ 'predicted_score_of_gt_category'] = predicted_cls_probs[:, -1] _, 
predicted_class_idx = predicted_cls_probs[:, :-1].max(1) false_positives_cls_analysis = scoring_rules.softmax_compute_cls_scores( false_positives_dict, valid_idxs) # Summarize and print all table = PrettyTable() table.field_names = ([ 'Output Type', 'Number of Instances', 'Cls Ignorance Score', 'Cls Brier/Probability Score', 'Reg Ignorance Score', 'Reg Energy Score' ]) table.add_row([ "False Positives:", num_false_positives, '{:.4f}'.format( false_positives_cls_analysis['ignorance_score_mean'], ), '{:.4f}'.format(false_positives_cls_analysis['brier_score_mean']), '{:.4f}'.format( false_positives_reg_analysis['total_entropy_mean']), '{:.4f}'.format( false_positives_reg_analysis['fp_energy_score_mean']) ]) print(table) text_file_name = os.path.join( inference_output_dir, 'probabilistic_scoring_res_odd_{}.txt'.format(min_allowed_score)) with open(text_file_name, "w") as text_file: print(table, file=text_file) dictionary_file_name = os.path.join( inference_output_dir, 'probabilistic_scoring_res_odd_{}.pkl'.format(min_allowed_score)) false_positives_reg_analysis.update(false_positives_cls_analysis) with open(dictionary_file_name, "wb") as pickle_file: pickle.dump(false_positives_reg_analysis, pickle_file)
def main(args):
    # Setup config
    cfg = setup_config(args, random_seed=args.random_seed, is_testing=True)

    # Make sure only 1 data point is processed at a time. This simulates
    # deployment.
    cfg.defrost()
    cfg.DATALOADER.NUM_WORKERS = 32
    cfg.SOLVER.IMS_PER_BATCH = 1

    cfg.MODEL.DEVICE = device.type

    # Set up number of cpu threads
    torch.set_num_threads(cfg.DATALOADER.NUM_WORKERS)

    # Create inference output directory and copy inference config file to keep
    # track of experimental settings
    inference_output_dir = os.path.join(
        cfg['OUTPUT_DIR'],
        'inference',
        args.test_dataset,
        os.path.split(args.inference_config)[-1][:-5])
    os.makedirs(inference_output_dir, exist_ok=True)
    copyfile(
        args.inference_config,
        os.path.join(inference_output_dir,
                     os.path.split(args.inference_config)[-1]))

    # Get category mapping dictionary:
    train_thing_dataset_id_to_contiguous_id = MetadataCatalog.get(
        cfg.DATASETS.TRAIN[0]).thing_dataset_id_to_contiguous_id
    test_thing_dataset_id_to_contiguous_id = MetadataCatalog.get(
        args.test_dataset).thing_dataset_id_to_contiguous_id

    # If both dicts are equal or if we are performing out of distribution
    # detection, just flip the test dict.
    if (train_thing_dataset_id_to_contiguous_id ==
            test_thing_dataset_id_to_contiguous_id) or (
            cfg.DATASETS.TRAIN[0] == 'coco_not_in_voc_2017_train'):
        cat_mapping_dict = dict(
            (v, k) for k, v in test_thing_dataset_id_to_contiguous_id.items())
    else:
        # If not equal, two situations: 1) BDD to KITTI and 2) COCO to PASCAL
        cat_mapping_dict = dict(
            (v, k) for k, v in test_thing_dataset_id_to_contiguous_id.items())
        if 'voc' in args.test_dataset and 'coco' in cfg.DATASETS.TRAIN[0]:
            dataset_mapping_dict = dict(
                (v, k) for k, v in metadata.COCO_TO_VOC_CONTIGUOUS_ID.items())
        elif 'kitti' in args.test_dataset and 'bdd' in cfg.DATASETS.TRAIN[0]:
            dataset_mapping_dict = dict(
                (v, k) for k, v in metadata.BDD_TO_KITTI_CONTIGUOUS_ID.items())
        else:
            raise ValueError(
                'Cannot generate category mapping dictionary. Please check if '
                'training and inference datasets are compatible.')
        cat_mapping_dict = dict(
            (dataset_mapping_dict[k], v) for k, v in cat_mapping_dict.items())

    # Build predictor
    predictor = build_predictor(cfg)
    test_data_loader = build_detection_test_loader(
        cfg, dataset_name=args.test_dataset)

    final_output_list = []
    if not args.eval_only:
        with torch.no_grad():
            with tqdm.tqdm(total=len(test_data_loader)) as pbar:
                for idx, input_im in enumerate(test_data_loader):
                    outputs = predictor(input_im)

                    final_output_list.extend(
                        instances_to_json(
                            outputs,
                            input_im[0]['image_id'],
                            cat_mapping_dict))
                    pbar.update(1)

        with open(
                os.path.join(inference_output_dir,
                             'coco_instances_results.json'), 'w') as fp:
            json.dump(final_output_list, fp, indent=4, separators=(',', ': '))

    # compute_average_precision.main(args, cfg)
    compute_probabilistic_metrics.main(args, cfg)
    compute_calibration_errors.main(args, cfg)
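# ---------------------------------------------------------------------------
# Hedged illustration (not part of the original scripts): a toy walkthrough of
# how the category mapping above is composed when the training and test
# datasets use different category sets. All ids below are made up, and the
# train->test correspondence stands in for tables such as
# metadata.COCO_TO_VOC_CONTIGUOUS_ID, assuming those map train-contiguous to
# test-contiguous indices as the flip in the code suggests. The end result is
# a dict from the model's contiguous training index to the raw category id
# expected by the test dataset's annotation format.


def _sketch_category_mapping_composition():
    # Hypothetical test-dataset mapping: raw annotation id -> contiguous index.
    test_thing_dataset_id_to_contiguous_id = {7: 0, 11: 1, 44: 2}

    # Hypothetical train-contiguous -> test-contiguous correspondence.
    train_to_test_contiguous_id = {2: 0, 5: 1, 14: 2}

    # Step 1: flip the test dict so it maps contiguous index -> raw id.
    cat_mapping_dict = dict(
        (v, k) for k, v in test_thing_dataset_id_to_contiguous_id.items())
    # {0: 7, 1: 11, 2: 44}

    # Step 2: flip the cross-dataset table so it maps
    # test-contiguous -> train-contiguous.
    dataset_mapping_dict = dict(
        (v, k) for k, v in train_to_test_contiguous_id.items())
    # {0: 2, 1: 5, 2: 14}

    # Step 3: re-key by the training-contiguous index, which is what the
    # predictor actually outputs.
    cat_mapping_dict = dict(
        (dataset_mapping_dict[k], v) for k, v in cat_mapping_dict.items())

    assert cat_mapping_dict == {2: 7, 5: 11, 14: 44}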
def main(args,
         cfg=None,
         iou_min=None,
         iou_correct=None,
         min_allowed_score=None):
    # Setup config
    if cfg is None:
        cfg = setup_config(args, random_seed=args.random_seed, is_testing=True)

    cfg.defrost()
    cfg.ACTUAL_TEST_DATASET = args.test_dataset

    # Setup torch device and num_threads
    torch.set_num_threads(cfg.DATALOADER.NUM_WORKERS)

    # Build path to gt instances and inference output
    inference_output_dir = os.path.join(
        cfg['OUTPUT_DIR'],
        'inference',
        args.test_dataset,
        os.path.split(args.inference_config)[-1][:-5])

    # Get thresholds to perform evaluation on
    if iou_min is None:
        iou_min = args.iou_min
    if iou_correct is None:
        iou_correct = args.iou_correct
    if min_allowed_score is None:
        # Check if F-1 Score has been previously computed ON THE ORIGINAL
        # DATASET such as COCO, even when evaluating on VOC.
        try:
            train_set_inference_output_dir = os.path.join(
                cfg['OUTPUT_DIR'],
                'inference',
                cfg.DATASETS.TEST[0],
                os.path.split(args.inference_config)[-1][:-5])
            with open(
                    os.path.join(train_set_inference_output_dir,
                                 "mAP_res.txt"), "r") as f:
                min_allowed_score = f.read().strip('][\n').split(', ')[-1]
                min_allowed_score = round(float(min_allowed_score), 4)
        except FileNotFoundError:
            # If not, process all detections. Not recommended as the results
            # might be influenced by very low scoring detections that would
            # normally be removed in robotics/vision applications.
            min_allowed_score = 0.0

    # Get category mapping dictionary:
    train_thing_dataset_id_to_contiguous_id = MetadataCatalog.get(
        cfg.DATASETS.TRAIN[0]).thing_dataset_id_to_contiguous_id
    test_thing_dataset_id_to_contiguous_id = MetadataCatalog.get(
        args.test_dataset).thing_dataset_id_to_contiguous_id

    cat_mapping_dict = get_thing_dataset_id_to_contiguous_id_dict(
        cfg,
        args,
        train_thing_dataset_id_to_contiguous_id,
        test_thing_dataset_id_to_contiguous_id)

    # Get matched results by either generating them or loading from file.
    with torch.no_grad():
        matched_results = evaluation_utils.get_matched_results(
            cfg,
            inference_output_dir,
            iou_min=iou_min,
            iou_correct=iou_correct,
            min_allowed_score=min_allowed_score)

        # Build preliminary dicts required for computing classification scores.
        for matched_results_key in matched_results.keys():
            if 'gt_cat_idxs' in matched_results[matched_results_key].keys():
                # First convert the raw dataset ('thing') category ids to
                # contiguous indices.
                gt_converted_cat_idxs = matched_results[matched_results_key][
                    'gt_cat_idxs'].squeeze(1)
                gt_converted_cat_idxs = torch.as_tensor([
                    cat_mapping_dict[class_idx.cpu().tolist()]
                    for class_idx in gt_converted_cat_idxs]).to(device)
                matched_results[matched_results_key][
                    'gt_converted_cat_idxs'] = gt_converted_cat_idxs.to(device)
                matched_results[matched_results_key][
                    'gt_cat_idxs'] = gt_converted_cat_idxs
            if 'predicted_cls_probs' in matched_results[
                    matched_results_key].keys():
                predicted_class_probs, predicted_cat_idxs = matched_results[
                    matched_results_key]['predicted_cls_probs'][:, :-1].max(1)
                matched_results[matched_results_key][
                    'predicted_cat_idxs'] = predicted_cat_idxs
                matched_results[matched_results_key][
                    'output_logits'] = predicted_class_probs

        # Load the different detection partitions
        true_positives = matched_results['true_positives']
        duplicates = matched_results['duplicates']
        false_positives = matched_results['false_positives']

        # Initialize per-class metric accumulators
        cls_min_uncertainty_error_list = []

        reg_maximum_calibration_error_list = []
        reg_expected_calibration_error_list = []
        reg_min_uncertainty_error_list = []

        all_predicted_scores = torch.cat(
            (true_positives['predicted_cls_probs'].flatten(),
             duplicates['predicted_cls_probs'].flatten(),
             false_positives['predicted_cls_probs'].flatten()), 0)
        all_gt_scores = torch.cat(
            (torch.nn.functional.one_hot(
                true_positives['gt_cat_idxs'],
                true_positives['predicted_cls_probs'].shape[1]).flatten().to(device),
             torch.nn.functional.one_hot(
                 duplicates['gt_cat_idxs'],
                 duplicates['predicted_cls_probs'].shape[1]).flatten().to(device),
             torch.zeros_like(
                 false_positives['predicted_cls_probs'].type(
                     torch.LongTensor).flatten()).to(device)), 0)

        # Compute classification calibration error using the calibration
        # library
        cls_marginal_calibration_error = cal.get_calibration_error(
            all_predicted_scores.cpu().numpy(), all_gt_scores.cpu().numpy())

        for class_idx in cat_mapping_dict.values():
            true_positives_valid_idxs = true_positives[
                'gt_converted_cat_idxs'] == class_idx
            duplicates_valid_idxs = duplicates[
                'gt_converted_cat_idxs'] == class_idx
            false_positives_valid_idxs = false_positives[
                'predicted_cat_idxs'] == class_idx

            # For the rest of the code, gt_scores need to be ones or zeros. All
            # processing is done on a per-class basis.
            all_gt_scores = torch.cat(
                (torch.ones_like(
                    true_positives['gt_converted_cat_idxs'][true_positives_valid_idxs]).to(device),
                 torch.zeros_like(
                     duplicates['gt_converted_cat_idxs'][duplicates_valid_idxs]).to(device),
                 torch.zeros_like(
                     false_positives['predicted_cat_idxs'][false_positives_valid_idxs]).to(device)),
                0).type(torch.DoubleTensor)

            # Compute classification minimum uncertainty error
            distribution_params = torch.cat(
                (true_positives['output_logits'][true_positives_valid_idxs],
                 duplicates['output_logits'][duplicates_valid_idxs],
                 false_positives['output_logits'][false_positives_valid_idxs]),
                0)
            all_predicted_cat_entropy = -torch.log(distribution_params)

            random_idxs = torch.randperm(all_predicted_cat_entropy.shape[0])

            all_predicted_cat_entropy = all_predicted_cat_entropy[random_idxs]
            all_gt_scores_cls = all_gt_scores[random_idxs]

            sorted_entropies, sorted_idxs = all_predicted_cat_entropy.sort()
            sorted_gt_idxs_tp = all_gt_scores_cls[sorted_idxs]
            sorted_gt_idxs_fp = 1.0 - sorted_gt_idxs_tp

            tp_cum_sum = torch.cumsum(sorted_gt_idxs_tp, 0)
            fp_cum_sum = torch.cumsum(sorted_gt_idxs_fp, 0)

            cls_u_errors = \
                0.5 * (sorted_gt_idxs_tp.sum(0) - tp_cum_sum) / sorted_gt_idxs_tp.sum(0) + \
                0.5 * fp_cum_sum / sorted_gt_idxs_fp.sum(0)
            cls_min_u_error = cls_u_errors.min()
            cls_min_uncertainty_error_list.append(cls_min_u_error)

            # Compute regression calibration errors. False negatives can't be
            # evaluated since those do not have ground truth.
            all_predicted_means = torch.cat(
                (true_positives['predicted_box_means'][true_positives_valid_idxs],
                 duplicates['predicted_box_means'][duplicates_valid_idxs]), 0)

            all_predicted_covariances = torch.cat(
                (true_positives['predicted_box_covariances'][true_positives_valid_idxs],
                 duplicates['predicted_box_covariances'][duplicates_valid_idxs]), 0)

            all_predicted_gt = torch.cat(
                (true_positives['gt_box_means'][true_positives_valid_idxs],
                 duplicates['gt_box_means'][duplicates_valid_idxs]), 0)

            all_predicted_covariances = torch.diagonal(
                all_predicted_covariances, dim1=1, dim2=2)

            # The assumption of uncorrelated components is not accurate,
            # especially when estimating full covariance matrices. However,
            # using scipy to compute multivariate cdfs is very time consuming
            # for such large amounts of data.
            reg_maximum_calibration_error = []
            reg_expected_calibration_error = []

            # Regression calibration is computed for every box dimension
            # separately, and averaged after.
            for box_dim in range(all_predicted_gt.shape[1]):
                all_predicted_means_current_dim = all_predicted_means[:, box_dim]
                all_predicted_gt_current_dim = all_predicted_gt[:, box_dim]
                all_predicted_covariances_current_dim = \
                    all_predicted_covariances[:, box_dim]
                normal_dists = torch.distributions.Normal(
                    all_predicted_means_current_dim,
                    scale=all_predicted_covariances_current_dim)
                all_predicted_scores = normal_dists.cdf(
                    all_predicted_gt_current_dim)

                reg_calibration_error = []
                histogram_bin_step_size = 1 / 15.0
                for i in torch.arange(
                        0.0,
                        1.0 - histogram_bin_step_size,
                        histogram_bin_step_size):
                    # Get number of elements in bin
                    elements_in_bin = (
                        all_predicted_scores < (i + histogram_bin_step_size))
                    num_elems_in_bin_i = elements_in_bin.type(
                        torch.FloatTensor).to(device).sum()

                    # Compute calibration error from the "Accurate
                    # uncertainties for deep learning using calibrated
                    # regression" paper.
                    reg_calibration_error.append(
                        (num_elems_in_bin_i / all_predicted_scores.shape[0] -
                         (i + histogram_bin_step_size)) ** 2)

                calibration_error = torch.stack(
                    reg_calibration_error).to(device)
                reg_maximum_calibration_error.append(calibration_error.max())
                reg_expected_calibration_error.append(calibration_error.mean())

            reg_maximum_calibration_error_list.append(
                reg_maximum_calibration_error)
            reg_expected_calibration_error_list.append(
                reg_expected_calibration_error)

            # Compute regression minimum uncertainty error
            all_predicted_covars = torch.cat(
                (true_positives['predicted_box_covariances'][true_positives_valid_idxs],
                 duplicates['predicted_box_covariances'][duplicates_valid_idxs],
                 false_positives['predicted_box_covariances'][false_positives_valid_idxs]),
                0)

            all_predicted_distributions = \
                torch.distributions.multivariate_normal.MultivariateNormal(
                    torch.zeros(all_predicted_covars.shape[0:2]).to(device),
                    all_predicted_covars +
                    1e-4 * torch.eye(all_predicted_covars.shape[2]).to(device))

            all_predicted_reg_entropy = all_predicted_distributions.entropy()

            random_idxs = torch.randperm(all_predicted_reg_entropy.shape[0])

            all_predicted_reg_entropy = all_predicted_reg_entropy[random_idxs]
            all_gt_scores_reg = all_gt_scores[random_idxs]

            sorted_entropies, sorted_idxs = all_predicted_reg_entropy.sort()
            sorted_gt_idxs_tp = all_gt_scores_reg[sorted_idxs]
            sorted_gt_idxs_fp = 1.0 - sorted_gt_idxs_tp

            tp_cum_sum = torch.cumsum(sorted_gt_idxs_tp, 0)
            fp_cum_sum = torch.cumsum(sorted_gt_idxs_fp, 0)

            reg_u_errors = \
                0.5 * ((sorted_gt_idxs_tp.sum(0) - tp_cum_sum) / sorted_gt_idxs_tp.sum(0)) + \
                0.5 * (fp_cum_sum / sorted_gt_idxs_fp.sum(0))
            reg_min_u_error = reg_u_errors.min()
            reg_min_uncertainty_error_list.append(reg_min_u_error)

        # Summarize and print all
        table = PrettyTable()
        table.field_names = ([
            'Cls Marginal Calibration Error',
            'Reg Expected Calibration Error',
            'Reg Maximum Calibration Error',
            'Cls Minimum Uncertainty Error',
            'Reg Minimum Uncertainty Error'])

        reg_expected_calibration_error = torch.stack([
            torch.stack(reg, 0)
            for reg in reg_expected_calibration_error_list], 0)
        reg_expected_calibration_error = reg_expected_calibration_error[
            ~torch.isnan(reg_expected_calibration_error)].mean()

        reg_maximum_calibration_error = torch.stack([
            torch.stack(reg, 0)
            for reg in reg_maximum_calibration_error_list], 0)
        reg_maximum_calibration_error = reg_maximum_calibration_error[
            ~torch.isnan(reg_maximum_calibration_error)].mean()

        cls_min_u_error = torch.stack(cls_min_uncertainty_error_list, 0)
        cls_min_u_error = cls_min_u_error[~torch.isnan(cls_min_u_error)].mean()

        reg_min_u_error = torch.stack(reg_min_uncertainty_error_list, 0)
        reg_min_u_error = reg_min_u_error[~torch.isnan(reg_min_u_error)].mean()

        table.add_row([
            '{:.4f}'.format(cls_marginal_calibration_error),
            '{:.4f}'.format(
                reg_expected_calibration_error.cpu().numpy().tolist()),
            '{:.4f}'.format(
                reg_maximum_calibration_error.cpu().numpy().tolist()),
            '{:.4f}'.format(cls_min_u_error.cpu().numpy().tolist()),
            '{:.4f}'.format(reg_min_u_error.cpu().numpy().tolist())])
        print(table)
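# ---------------------------------------------------------------------------
# Hedged illustration (not part of the original scripts): a condensed,
# standalone version of the per-dimension regression calibration loop above,
# run on synthetic 1-D data. Function and variable names here are ours; the
# binning mirrors the 15-bin histogram and squared-difference form used above,
# following "Accurate uncertainties for deep learning using calibrated
# regression".
import torch


def _sketch_regression_calibration_errors(means, stds, gt, num_bins=15):
    """Expected and maximum calibration error for 1-D Gaussian predictions."""
    cdf_scores = torch.distributions.Normal(means, scale=stds).cdf(gt)
    bin_step = 1.0 / num_bins
    errors = []
    for i in torch.arange(0.0, 1.0 - bin_step, bin_step):
        # Empirical frequency of CDF values below the bin's upper edge,
        # compared against the frequency a perfectly calibrated model yields.
        upper_edge = i + bin_step
        empirical_frequency = (cdf_scores < upper_edge).float().mean()
        errors.append((empirical_frequency - upper_edge) ** 2)
    errors = torch.stack(errors)
    return errors.mean(), errors.max()


def _sketch_calibrated_toy_example():
    # Ground truth sampled from the predicted Gaussians themselves, so both
    # errors should be close to zero.
    torch.manual_seed(0)
    means = torch.zeros(20000)
    stds = torch.ones(20000)
    gt = torch.normal(means, stds)
    ece, mce = _sketch_regression_calibration_errors(means, stds, gt)
    print('Expected: {:.5f}, Maximum: {:.5f}'.format(ece.item(), mce.item()))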
def main(args,
         cfg=None,
         iou_min=None,
         iou_correct=None,
         min_allowed_score=None,
         print_results=True):
    # Setup config
    if cfg is None:
        cfg = setup_config(args, random_seed=args.random_seed, is_testing=True)

    cfg.defrost()
    cfg.ACTUAL_TEST_DATASET = args.test_dataset

    # Setup torch device and num_threads
    torch.set_num_threads(cfg.DATALOADER.NUM_WORKERS)

    # Build path to gt instances and inference output
    inference_output_dir = get_inference_output_dir(
        cfg['OUTPUT_DIR'],
        args.test_dataset,
        args.inference_config,
        args.image_corruption_level)

    # Get thresholds to perform evaluation on
    if iou_min is None:
        iou_min = args.iou_min
    if iou_correct is None:
        iou_correct = args.iou_correct
    if min_allowed_score is None:
        # Check if F-1 Score has been previously computed ON THE ORIGINAL
        # DATASET such as COCO, even when evaluating on OpenImages.
        try:
            train_set_inference_output_dir = get_inference_output_dir(
                cfg['OUTPUT_DIR'],
                cfg.DATASETS.TEST[0],
                args.inference_config,
                0)
            with open(
                    os.path.join(train_set_inference_output_dir,
                                 "mAP_res.txt"), "r") as f:
                min_allowed_score = f.read().strip('][\n').split(', ')[-1]
                min_allowed_score = round(float(min_allowed_score), 4)
        except FileNotFoundError:
            # If not, process all detections. Not recommended as the results
            # might be influenced by very low scoring detections that would
            # normally be removed in robotics/vision applications.
            min_allowed_score = 0.0

    # Get category mapping dictionary:
    train_thing_dataset_id_to_contiguous_id = MetadataCatalog.get(
        cfg.DATASETS.TRAIN[0]).thing_dataset_id_to_contiguous_id
    test_thing_dataset_id_to_contiguous_id = MetadataCatalog.get(
        args.test_dataset).thing_dataset_id_to_contiguous_id

    cat_mapping_dict = get_test_thing_dataset_id_to_train_contiguous_id_dict(
        cfg,
        args,
        train_thing_dataset_id_to_contiguous_id,
        test_thing_dataset_id_to_contiguous_id)

    # Get matched results by either generating them or loading from file.
    with torch.no_grad():
        matched_results = evaluation_utils.get_matched_results(
            cfg,
            inference_output_dir,
            iou_min=iou_min,
            iou_correct=iou_correct,
            min_allowed_score=min_allowed_score)

        # Build preliminary dicts required for computing classification scores.
        for matched_results_key in matched_results.keys():
            if 'gt_cat_idxs' in matched_results[matched_results_key].keys():
                # First convert the raw dataset ('thing') category ids to
                # contiguous indices.
                gt_converted_cat_idxs = matched_results[matched_results_key][
                    'gt_cat_idxs'].squeeze(1)
                gt_converted_cat_idxs = torch.as_tensor([
                    cat_mapping_dict[class_idx.cpu().tolist()]
                    for class_idx in gt_converted_cat_idxs]).to(device)
                matched_results[matched_results_key][
                    'gt_converted_cat_idxs'] = gt_converted_cat_idxs.to(device)
                matched_results[matched_results_key][
                    'gt_cat_idxs'] = gt_converted_cat_idxs
            if 'predicted_cls_probs' in matched_results[
                    matched_results_key].keys():
                if cfg.MODEL.META_ARCHITECTURE == 'ProbabilisticRetinaNet':
                    # For false positives, the correct category is background.
                    # For RetinaNet, since no explicit background category is
                    # available, this value is computed as 1.0 - score of the
                    # predicted category.
                    predicted_class_probs, predicted_cat_idxs = matched_results[
                        matched_results_key]['predicted_cls_probs'].max(1)
                    matched_results[matched_results_key][
                        'output_logits'] = predicted_class_probs
                else:
                    predicted_class_probs, predicted_cat_idxs = matched_results[
                        matched_results_key][
                        'predicted_cls_probs'][:, :-1].max(1)
                matched_results[matched_results_key][
                    'predicted_cat_idxs'] = predicted_cat_idxs

        # Load the different detection partitions
        true_positives = matched_results['true_positives']
        duplicates = matched_results['duplicates']
        localization_errors = matched_results['localization_errors']
        false_positives = matched_results['false_positives']

        reg_maximum_calibration_error_list = []
        reg_expected_calibration_error_list = []

        if cfg.MODEL.META_ARCHITECTURE == 'ProbabilisticRetinaNet':
            all_predicted_scores = torch.cat(
                (true_positives['predicted_cls_probs'].flatten(),
                 duplicates['predicted_cls_probs'].flatten(),
                 localization_errors['predicted_cls_probs'].flatten(),
                 false_positives['predicted_cls_probs'].flatten()), 0)
            all_gt_scores = torch.cat(
                (torch.nn.functional.one_hot(
                    true_positives['gt_cat_idxs'],
                    true_positives['predicted_cls_probs'].shape[1]).flatten().to(device),
                 torch.nn.functional.one_hot(
                     duplicates['gt_cat_idxs'],
                     duplicates['predicted_cls_probs'].shape[1]).flatten().to(device),
                 torch.zeros_like(
                     localization_errors['predicted_cls_probs'].type(
                         torch.LongTensor).flatten()).to(device),
                 torch.zeros_like(
                     false_positives['predicted_cls_probs'].type(
                         torch.LongTensor).flatten()).to(device)), 0)
        else:
            # For RCNN based networks, a background category is
            # explicitly available.
            all_predicted_scores = torch.cat(
                (true_positives['predicted_cls_probs'],
                 duplicates['predicted_cls_probs'],
                 localization_errors['predicted_cls_probs'],
                 false_positives['predicted_cls_probs']), 0)
            all_gt_scores = torch.cat(
                (true_positives['gt_cat_idxs'],
                 duplicates['gt_cat_idxs'],
                 torch.ones_like(
                     localization_errors['predicted_cls_probs'][:, 0]).fill_(
                     80.0).type(torch.LongTensor).to(device),
                 torch.ones_like(
                     false_positives['predicted_cls_probs'][:, 0]).fill_(
                     80.0).type(torch.LongTensor).to(device)), 0)

        # Compute classification calibration error using the calibration
        # library
        cls_marginal_calibration_error = cal.get_calibration_error(
            all_predicted_scores.cpu().numpy(), all_gt_scores.cpu().numpy())

        for class_idx in cat_mapping_dict.values():
            true_positives_valid_idxs = true_positives[
                'gt_converted_cat_idxs'] == class_idx
            localization_errors_valid_idxs = localization_errors[
                'gt_converted_cat_idxs'] == class_idx
            duplicates_valid_idxs = duplicates[
                'gt_converted_cat_idxs'] == class_idx

            # Compute regression calibration errors. False negatives can't be
            # evaluated since those do not have ground truth.
            all_predicted_means = torch.cat(
                (true_positives['predicted_box_means'][true_positives_valid_idxs],
                 duplicates['predicted_box_means'][duplicates_valid_idxs],
                 localization_errors['predicted_box_means'][localization_errors_valid_idxs]),
                0)

            all_predicted_covariances = torch.cat(
                (true_positives['predicted_box_covariances'][true_positives_valid_idxs],
                 duplicates['predicted_box_covariances'][duplicates_valid_idxs],
                 localization_errors['predicted_box_covariances'][localization_errors_valid_idxs]),
                0)

            all_predicted_gt = torch.cat(
                (true_positives['gt_box_means'][true_positives_valid_idxs],
                 duplicates['gt_box_means'][duplicates_valid_idxs],
                 localization_errors['gt_box_means'][localization_errors_valid_idxs]),
                0)

            all_predicted_covariances = torch.diagonal(
                all_predicted_covariances, dim1=1, dim2=2)

            # The assumption of uncorrelated components is not accurate,
            # especially when estimating full covariance matrices. However,
            # using scipy to compute multivariate cdfs is very time consuming
            # for such large amounts of data.
            reg_maximum_calibration_error = []
            reg_expected_calibration_error = []

            # Regression calibration is computed for every box dimension
            # separately, and averaged after.
            for box_dim in range(all_predicted_gt.shape[1]):
                all_predicted_means_current_dim = all_predicted_means[:, box_dim]
                all_predicted_gt_current_dim = all_predicted_gt[:, box_dim]
                all_predicted_covariances_current_dim = \
                    all_predicted_covariances[:, box_dim]
                normal_dists = torch.distributions.Normal(
                    all_predicted_means_current_dim,
                    scale=all_predicted_covariances_current_dim)
                all_predicted_scores = normal_dists.cdf(
                    all_predicted_gt_current_dim)

                reg_calibration_error = []
                histogram_bin_step_size = 1 / 15.0
                for i in torch.arange(
                        0.0,
                        1.0 - histogram_bin_step_size,
                        histogram_bin_step_size):
                    # Get number of elements in bin
                    elements_in_bin = (
                        all_predicted_scores < (i + histogram_bin_step_size))
                    num_elems_in_bin_i = elements_in_bin.type(
                        torch.FloatTensor).to(device).sum()

                    # Compute calibration error from the "Accurate
                    # uncertainties for deep learning using calibrated
                    # regression" paper.
                    reg_calibration_error.append(
                        (num_elems_in_bin_i / all_predicted_scores.shape[0] -
                         (i + histogram_bin_step_size)) ** 2)

                calibration_error = torch.stack(
                    reg_calibration_error).to(device)
                reg_maximum_calibration_error.append(calibration_error.max())
                reg_expected_calibration_error.append(calibration_error.mean())

            reg_maximum_calibration_error_list.append(
                reg_maximum_calibration_error)
            reg_expected_calibration_error_list.append(
                reg_expected_calibration_error)

        # Summarize and print all
        reg_expected_calibration_error = torch.stack([
            torch.stack(reg, 0)
            for reg in reg_expected_calibration_error_list], 0)
        reg_expected_calibration_error = reg_expected_calibration_error[
            ~torch.isnan(reg_expected_calibration_error)].mean()

        reg_maximum_calibration_error = torch.stack([
            torch.stack(reg, 0)
            for reg in reg_maximum_calibration_error_list], 0)
        reg_maximum_calibration_error = reg_maximum_calibration_error[
            ~torch.isnan(reg_maximum_calibration_error)].mean()

        if print_results:
            table = PrettyTable()
            table.field_names = ([
                'Cls Marginal Calibration Error',
                'Reg Expected Calibration Error',
                'Reg Maximum Calibration Error'])

            table.add_row([
                cls_marginal_calibration_error,
                reg_expected_calibration_error.cpu().numpy().tolist(),
                reg_maximum_calibration_error.cpu().numpy().tolist()])
            print(table)

        text_file_name = os.path.join(
            inference_output_dir,
            'calibration_errors_{}_{}_{}.txt'.format(
                iou_min, iou_correct, min_allowed_score))

        with open(text_file_name, "w") as text_file:
            print([
                cls_marginal_calibration_error,
                reg_expected_calibration_error.cpu().numpy().tolist(),
                reg_maximum_calibration_error.cpu().numpy().tolist()],
                file=text_file)

        dictionary_file_name = os.path.join(
            inference_output_dir,
            'calibration_errors_res_{}_{}_{}.pkl'.format(
                iou_min, iou_correct, min_allowed_score))

        final_accumulated_output_dict = {
            'cls_marginal_calibration_error': cls_marginal_calibration_error,
            'reg_expected_calibration_error':
                reg_expected_calibration_error.cpu().numpy(),
            'reg_maximum_calibration_error':
                reg_maximum_calibration_error.cpu().numpy()}

        with open(dictionary_file_name, "wb") as pickle_file:
            pickle.dump(final_accumulated_output_dict, pickle_file)
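# ---------------------------------------------------------------------------
# Hedged illustration (not part of the original scripts): a small aggregation
# sketch that reads back the calibration_errors_res_*.pkl dictionaries written
# above and averages them over whatever inference output directories exist.
# The directory glob below is an assumption about the on-disk layout
# (<OUTPUT_DIR>/inference/<test_dataset>/<config_name>/); the dictionary keys
# match the ones saved above.
import glob
import os
import pickle

import numpy as np


def _sketch_collect_calibration_results(output_dir_glob):
    """Load every calibration_errors_res_*.pkl under the matched directories."""
    results = []
    for inference_output_dir in glob.glob(output_dir_glob):
        for pkl_name in glob.glob(
                os.path.join(inference_output_dir,
                             'calibration_errors_res_*.pkl')):
            with open(pkl_name, 'rb') as pickle_file:
                results.append(pickle.load(pickle_file))
    return results


def _sketch_average_calibration_results():
    # 'outputs/retinanet_baseline' is a placeholder OUTPUT_DIR for this sketch.
    results = _sketch_collect_calibration_results(
        'outputs/retinanet_baseline/inference/*/*')
    if results:
        mean_cls_error = np.mean(
            [r['cls_marginal_calibration_error'] for r in results])
        mean_reg_ece = np.mean(
            [float(r['reg_expected_calibration_error']) for r in results])
        print('Mean Cls Marginal Calibration Error: {:.4f}'.format(
            mean_cls_error))
        print('Mean Reg Expected Calibration Error: {:.4f}'.format(
            mean_reg_ece))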