def show_mined_patches(image_id, class_ids, dataloader, hardnegdata):
    # show mined patches
    image_to_show = get_image_from_dataloader(image_id, dataloader)

    # collect data
    boxes_one_image, labels_one_image, scores_one_image, anchor_boxes_one_image, transform_corners_one_image = [], [], [], [], []
    for a in hardnegdata:
        boxes_one_image.append(a["crop_position_xyxy"])
        anchor_boxes_one_image.append(a["anchor_position_xyxy"])
        scores_one_image.append(a["score"])
        labels_one_image.append(a["label_local"] * (-1 if a["role"] == "neg" else 1))
        transform_corners_one_image.append(a["transform_corners"])

    scores_one_image = torch.tensor(scores_one_image, dtype=torch.float)
    boxes_one_image = cat_boxlist(boxes_one_image)
    anchor_boxes_one_image = cat_boxlist(anchor_boxes_one_image)
    labels_one_image = torch.tensor(labels_one_image, dtype=torch.long)
    transform_corners_one_image = torch.stack(transform_corners_one_image, dim=0)

    # show
    show_annotated_image(img=image_to_show,
                         boxes=boxes_one_image,
                         labels=labels_one_image,
                         scores=scores_one_image,
                         default_boxes=anchor_boxes_one_image,
                         transform_corners=transform_corners_one_image,
                         class_ids=class_ids,
                         score_threshold=cfg.visualization.mining.score_threshold,
                         max_dets=cfg.visualization.mining.max_detections,
                         showfig=True)
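
# A minimal sketch (not part of the original code): it documents the structure of one `hardnegdata`
# entry that show_mined_patches consumes above. The helper name and its arguments are hypothetical;
# the keys mirror add_item() in mine_hard_patches below.
def _make_hardneg_item_example(crop_box, anchor_box, score, label_local, role, transform_corners):
    item = OrderedDict()
    item["crop_position_xyxy"] = crop_box          # BoxList with the patch to cut out
    item["anchor_position_xyxy"] = anchor_box      # BoxList with the corresponding anchor box
    item["score"] = score                          # float classification score of the anchor
    item["label_local"] = label_local              # label index local to the mined batch
    item["role"] = role                            # "neg", "pos" or "pos_loc"; "neg" is shown with a negated label
    item["transform_corners"] = transform_corners  # tensor with the 8 corner coordinates of the transformed box
    return item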
def evaluate(dataloader, detector, cfg_maskrcnn, retrievalnet, opt, cfg_eval, cfg_visualization,
             is_cuda=False, logger_prefix="detector-retrieval"):
    logger = logging.getLogger(f"{logger_prefix}.evaluate")
    dataset_name = dataloader.get_name()
    dataset_scale = dataloader.get_eval_scale()
    logger.info("Starting to eval on {0}, scale {1}".format(dataset_name, dataset_scale))
    t_start_eval = time.time()
    detector.eval()
    retrievalnet.eval()

    ## setup retrievalnet
    # setting up the multi-scale parameters
    ms = [1]
    msp = 1
    if opt.retrieval_multiscale:
        ms = [1, 1. / math.sqrt(2), 1. / 2]
        if retrievalnet.meta["pooling"] == "gem" and retrievalnet.whiten is None:
            msp = retrievalnet.pool.p.data.tolist()[0]

    # setup whitening
    if opt.retrieval_whitening_path is not None:
        logger.info("Whitening is precomputed, loading it from {0}".format(opt.retrieval_whitening_path))
        whitening_data = torch.load(opt.retrieval_whitening_path)

        if (opt.retrieval_multiscale and "ms" in whitening_data) or \
           (not opt.retrieval_multiscale and "ss" in whitening_data):
            if opt.retrieval_multiscale:
                Lw = copy.deepcopy(whitening_data["ms"])
            else:
                Lw = copy.deepcopy(whitening_data["ss"])
        else:
            raise RuntimeError("Whitening should be precomputed with the network")

        # convert whitening data to torch tensors
        Lw["m"], Lw["P"] = torch.from_numpy(Lw["m"]), torch.from_numpy(Lw["P"])
        if is_cuda:
            Lw["m"], Lw["P"] = Lw["m"].cuda(), Lw["P"].cuda()
    else:
        Lw = None

    with torch.no_grad():  # do evaluation in forward mode only (for speed and memory)
        # extract features from query images
        query_images, _, _ = dataloader.get_all_class_images(do_resize=False)
        if is_cuda:
            query_images = [img.cuda() for img in query_images]
        query_images = [img[0] for img in query_images]  # get rid of the batch dimension
        query_images = [resize_image_tensor(img, opt.retrieval_image_size) for img in query_images]
        query_images = [dataloader.unnorm_image(img) for img in query_images]

        query_images_with_aug = []
        for im in query_images:
            query_images_with_aug.append(im)
            if not cfg_eval.class_image_augmentation:
                num_class_views = 1
            elif cfg_eval.class_image_augmentation == "rotation90":
                im90 = im.rot90(1, [1, 2])
                im180 = im90.rot90(1, [1, 2])
                im270 = im180.rot90(1, [1, 2])
                query_images_with_aug.append(im90)
                query_images_with_aug.append(im180)
                query_images_with_aug.append(im270)
                num_class_views = 4
            elif cfg_eval.class_image_augmentation == "horflip":
                im_flipped = im.flip(2)
                query_images_with_aug.append(im_flipped)
                num_class_views = 2
            else:
                raise RuntimeError(f"Unknown value of class_image_augmentation: {cfg_eval.class_image_augmentation}")
        query_images = query_images_with_aug

        query_vectors = extract_vectors_from_images(retrievalnet, query_images, ms=ms, msp=msp)
        # apply whitening if defined
        if Lw is not None:
            query_vectors = whitenapply(query_vectors, Lw["m"], Lw["P"])
        query_vectors = torch.transpose(query_vectors, 0, 1)

        # prepare looping over all images
        iterator = make_iterator_extract_scores_from_images_batched(dataloader, detector, cfg_maskrcnn, logger,
                                                                    image_batch_size=cfg_eval.batch_size,
                                                                    is_cuda=is_cuda)

        boxes, labels, scores = [], [], []
        gt_boxes = []
        image_ids = []
        losses = OrderedDict()

        # loop over all dataset images
        num_evaluated_images = 0
        for data in iterator:
            image_id, boxes_one_image, image_pyramid, query_img_sizes, class_ids, initial_img_size = data
            image_ids.append(image_id)
            logger.info(f"Image {num_evaluated_images}: id {image_id}")
            num_evaluated_images += 1
            img_size_pyramid = [FeatureMapSize(img=img) for img in image_pyramid]

            gt_boxes_one_image = dataloader.get_image_annotation_for_imageid(image_id)
            gt_boxes.append(gt_boxes_one_image)

            # visualize GT for debug
            if cfg_visualization.show_gt_boxes:
                visualizer.show_gt_boxes(image_id, gt_boxes_one_image, class_ids, dataloader)

            # decode image predictions
            # merge boxes_one_image, labels_one_image, scores_one_image from different pyramid layers
            boxes_one_image = cat_boxlist(boxes_one_image)

            # do NMS
            good_indices = nms(boxes_one_image,
                               opt.nms_iou_threshold_detector_score,
                               nms_score_threshold=opt.nms_score_threshold_detector_score)
            boxes_one_image = boxes_one_image[good_indices]

            # extract feature vectors from the predictions
            image_original = dataloader._transform_image(image_id, do_augmentation=True, hflip=False, vflip=False)[0]
            if is_cuda:
                image_original = image_original.cuda()
            image_patches = crop_resize_image_patches(image_original,
                                                      boxes_one_image,
                                                      opt.retrieval_image_size,
                                                      logger,
                                                      unnorm_image=dataloader.unnorm_image,
                                                      is_cuda=is_cuda)
            # filter out cases when we failed to crop a box: outside of the image
            good_indices = [i for i, p in enumerate(image_patches) if p is not None]
            if good_indices:  # non-empty
                image_patches = [p for p in image_patches if p is not None]
                boxes_one_image = boxes_one_image[good_indices]

                image_vectors = extract_vectors_from_images(retrievalnet, image_patches, ms=ms, msp=msp)

                # compute class scores from image_vectors and query_vectors (already transposed)
                if Lw is not None:
                    # apply whitening if defined
                    image_vectors = whitenapply(image_vectors, Lw["m"], Lw["P"])

                scores_retrieval = torch.mm(query_vectors, image_vectors)
                num_queries = scores_retrieval.size(0)
                num_detections = scores_retrieval.size(1)
                list_of_active_label = torch.LongTensor(class_ids)
                if cfg_eval.class_image_augmentation:
                    list_of_active_label = torch.stack([list_of_active_label] * num_class_views, 1).view(-1)

                # take all labels for all boxes - will sort them by scores at eval
                scores_one_image = scores_retrieval.view(-1)
                boxes_one_image = cat_boxlist([boxes_one_image] * num_queries)
                labels_one_image = torch.stack([list_of_active_label] * num_detections, 1).contiguous().view(-1)

                # add scores and labels: overwrite if they existed
                boxes_one_image.add_field("labels", labels_one_image)
                boxes_one_image.add_field("scores", scores_one_image)

                # NMS using the retrieval scores
                good_indices = nms(boxes_one_image,
                                   cfg_eval.nms_iou_threshold,
                                   nms_score_threshold=cfg_eval.nms_score_threshold,
                                   do_separate_per_label=not cfg_eval.nms_across_classes)
                boxes_one_image = boxes_one_image[good_indices]
            else:
                boxes_one_image.add_field("labels",
                                          torch.zeros(0, dtype=torch.long, device=boxes_one_image.bbox_xyxy.device))
                boxes_one_image.add_field("scores",
                                          torch.zeros(0, dtype=torch.float, device=boxes_one_image.bbox_xyxy.device))

            boxes.append(boxes_one_image.cpu())

            if cfg_visualization.show_detections:
                # do not pass class_ids - this is already taken care of
                visualizer.show_detections(boxes_one_image, image_id, dataloader, cfg_visualization, class_ids=None)

        # normalize by the number of steps
        for k in losses:
            losses[k] /= num_evaluated_images

        # save detections if requested
        if cfg_visualization.path_to_save_detections:
            data = {"image_ids": image_ids,
                    "boxes_xyxy": [bb.bbox_xyxy for bb in boxes],
                    "labels": [bb.get_field("labels") for bb in boxes],
                    "scores": [bb.get_field("scores") for bb in boxes],
                    "gt_boxes_xyxy": [bb.bbox_xyxy for bb in gt_boxes],
                    "gt_labels": [bb.get_field("labels") for bb in gt_boxes],
                    "gt_difficults": [bb.get_field("difficult") for bb in gt_boxes]}
            dataset_name = dataloader.get_name()
            os.makedirs(cfg_visualization.path_to_save_detections, exist_ok=True)
            save_path = os.path.join(cfg_visualization.path_to_save_detections, dataset_name + "_detections.pth")
            torch.save(data, save_path)

        # compute mAP
        for mAP_iou_threshold in cfg_eval.mAP_iou_thresholds:
            logger.info("Evaluating at IoU th {:0.2f}".format(mAP_iou_threshold))
            ap_data = do_voc_evaluation(boxes, gt_boxes, iou_thresh=mAP_iou_threshold, use_07_metric=False)
            losses["mAP@{:0.2f}".format(mAP_iou_threshold)] = ap_data["map"]
            losses["mAPw@{:0.2f}".format(mAP_iou_threshold)] = ap_data["map_weighted"]
            losses["recall@{:0.2f}".format(mAP_iou_threshold)] = ap_data["recall"]
            losses["AP_joint_classes@{:0.2f}".format(mAP_iou_threshold)] = ap_data["ap_joint_classes"]

            # per-class AP information
            for i_class, (ap, recall, n_pos) in enumerate(zip(ap_data["ap_per_class"],
                                                              ap_data["recall_per_class"],
                                                              ap_data["n_pos"])):
                if not np.isnan(ap):
                    assert i_class in class_ids, "Could not find class_id in the list of ids"
                    logger.info("Class {0} (local {3}), AP {1:0.4f}, #obj {2}, recall {4:0.4f}".format(
                        i_class, ap, n_pos, class_ids.index(i_class), recall))

        # save timing
        losses["eval_time"] = (time.time() - t_start_eval)
        logger.info("Evaluated on {0}, scale {1}".format(dataset_name, dataset_scale))
        print_meters(losses, logger)
        return losses
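
# A minimal sketch (not part of the original code), assuming only torch: it shows how the detections
# file written by evaluate() above could be loaded back; the function name is hypothetical, but the
# file layout (keys and one list element per evaluated image) follows the `data` dict saved above.
def load_saved_detections_example(path_to_detections_file):
    data = torch.load(path_to_detections_file)
    # each value is a list with one entry per evaluated image, aligned with data["image_ids"]
    for image_id, boxes_xyxy, labels, scores in zip(data["image_ids"],
                                                    data["boxes_xyxy"],
                                                    data["labels"],
                                                    data["scores"]):
        print("image {0}: {1} detections".format(image_id, boxes_xyxy.shape[0]))
    return data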
def forward(self, feature_maps):
    """
    Args:
        feature_maps (Tensor[float], size b^A x d x h^A x w^A) - contains the feature map of the input image
            b^A - batch size
            d - feature dimensionality
            h^A - height of the feature map
            w^A - width of the feature map

    Returns:
            # here b^C is the class batch size, i.e., the number of class images contained in self.class_batch_size passed when creating this object
        output_localization (Tensor[float], size b^A x b^C x 4 x h^A x w^A) - the localization output w.r.t. the standard box encoding - computed by DetectionBoxCoder.build_loc_targets
        output_recognition (Tensor[float], size b^A x b^C x 1 x h^A x w^A) - the recognition output for each of the classes:
            the correlation, linearly converted to the [0, 1] segment; the higher the value, the better the match to the class
        output_recognition_transform_detached (Tensor[float], size b^A x b^C x 1 x h^A x w^A) - same as output_recognition, but with the computational graph detached from the transformation (for a backward pass that does not update the transformation - intended for the negatives)
        corner_coordinates (Tensor[float], size b^A x b^C x 8 x h^A x w^A) - the corners of the default boxes after the transformation, detached from the computational graph, for visualization only
    """
    # get dims
    batch_size = feature_maps.size(0)
    feature_dim = feature_maps.size(1)
    image_fm_size = FeatureMapSize(img=feature_maps)
    class_fm_size = FeatureMapSize(img=self.class_feature_maps)
    feature_dim_for_regression = class_fm_size.h * class_fm_size.w

    class_feature_dim = self.class_feature_maps.size(1)
    assert feature_dim == class_feature_dim, \
        "Feature dimensionality of input={0} and class={1} feature maps have to be equal".format(feature_dim, class_feature_dim)

    # L2-normalize the feature map
    feature_maps = normalize_feature_map_L2(feature_maps, 1e-5)

    # get correlations all to all
    corr_maps = torch.einsum("bfhw,afxy->abwhxy", self.class_feature_maps, feature_maps)
    # need to try to optimize this with opt_einsum: https://optimized-einsum.readthedocs.io/en/latest/
    # CAUTION: note the switch of dimensions hw to wh. This is done for compatibility with the FeatureCorrelation class by Ignacio Rocco https://github.com/ignacio-rocco/ncnet/blob/master/lib/model.py (to be able to load their models)

    # reshape to have the correlation map of dimensions similar to the standard tensor for image feature maps
    corr_maps = corr_maps.contiguous().view(batch_size * self.class_batch_size,
                                            feature_dim_for_regression,
                                            image_fm_size.h,
                                            image_fm_size.w)

    # compute the grids to resample corr maps
    resampling_grids_local_coord = self.aligner(corr_maps)

    # build classification outputs
    cor_maps_for_recognition = corr_maps.contiguous().view(batch_size,
                                                           self.class_batch_size,
                                                           feature_dim_for_regression,
                                                           image_fm_size.h,
                                                           image_fm_size.w)
    resampling_grids_local_coord = resampling_grids_local_coord.contiguous().view(batch_size,
                                                                                  self.class_batch_size,
                                                                                  image_fm_size.h,
                                                                                  image_fm_size.w,
                                                                                  self.aligner.out_grid_size.h,
                                                                                  self.aligner.out_grid_size.w,
                                                                                  2)

    # need to recompute resampling_grids to [-1, 1] coordinates w.r.t. the feature maps to sample points with F.grid_sample
    # first get the list of boxes that corresponds to the receptive fields of the parameter regression network:
    # box sizes are the receptive field sizes, stride is the network stride
    default_boxes_xyxy_wrt_fm = self.box_grid_generator_feature_map_level.create_strided_boxes_columnfirst(fm_size=image_fm_size)

    default_boxes_xyxy_wrt_fm = default_boxes_xyxy_wrt_fm.view(1, 1, image_fm_size.h, image_fm_size.w, 4)
    # 1 (to broadcast to batch_size) x 1 (to broadcast to class batch_size) x box_grid_height x box_grid_width x 4
    default_boxes_xyxy_wrt_fm = default_boxes_xyxy_wrt_fm.to(resampling_grids_local_coord.device)
    resampling_grids_fm_coord = convert_box_coordinates_local_to_global(resampling_grids_local_coord,
                                                                        default_boxes_xyxy_wrt_fm)

    # convert to coordinates normalized to [-1, 1] (to be compatible with torch.nn.functional.grid_sample)
    resampling_grids_fm_coord_x = resampling_grids_fm_coord.narrow(-1, 0, 1)
    resampling_grids_fm_coord_y = resampling_grids_fm_coord.narrow(-1, 1, 1)
    resampling_grids_fm_coord_unit = torch.cat([resampling_grids_fm_coord_x / (image_fm_size.w - 1) * 2 - 1,
                                                resampling_grids_fm_coord_y / (image_fm_size.h - 1) * 2 - 1], dim=-1)
    # clamp to fit the image plane
    resampling_grids_fm_coord_unit = resampling_grids_fm_coord_unit.clamp(-1, 1)

    # extract and pool matches
    # # slower code:
    # matches_summed = self.resample_of_correlation_map_simple(cor_maps_for_recognition,
    #                                                          resampling_grids_fm_coord_unit,
    #                                                          self.class_pool_mask)

    # we use a faster, but somewhat more obscure version
    matches_summed = self.resample_of_correlation_map_fast(cor_maps_for_recognition,
                                                           resampling_grids_fm_coord_unit,
                                                           self.class_pool_mask)
    if matches_summed.requires_grad:
        matches_summed_transform_detached = self.resample_of_correlation_map_fast(cor_maps_for_recognition,
                                                                                  resampling_grids_fm_coord_unit.detach(),
                                                                                  self.class_pool_mask)
    else:
        # optimization to make eval faster
        matches_summed_transform_detached = matches_summed

    # build localization targets
    default_boxes_xyxy_wrt_image = self.box_grid_generator_image_level.create_strided_boxes_columnfirst(fm_size=image_fm_size)

    default_boxes_xyxy_wrt_image = default_boxes_xyxy_wrt_image.view(1, 1, image_fm_size.h, image_fm_size.w, 4)
    # 1 (to broadcast to batch_size) x 1 (to broadcast to class batch_size) x box_grid_height x box_grid_width x 4
    default_boxes_xyxy_wrt_image = default_boxes_xyxy_wrt_image.to(resampling_grids_local_coord.device)
    resampling_grids_image_coord = convert_box_coordinates_local_to_global(resampling_grids_local_coord,
                                                                           default_boxes_xyxy_wrt_image)

    num_pooled_points = self.aligner.out_grid_size.w * self.aligner.out_grid_size.h
    resampling_grids_x = resampling_grids_image_coord.narrow(-1, 0, 1).contiguous().view(-1, num_pooled_points)
    resampling_grids_y = resampling_grids_image_coord.narrow(-1, 1, 1).contiguous().view(-1, num_pooled_points)
    class_boxes_xyxy = torch.stack([resampling_grids_x.min(dim=1)[0],
                                    resampling_grids_y.min(dim=1)[0],
                                    resampling_grids_x.max(dim=1)[0],
                                    resampling_grids_y.max(dim=1)[0]], 1)

    # extract rectangle borders to draw complete boxes
    corner_coordinates = resampling_grids_image_coord[:, :, :, :, [0, -1]][:, :, :, :, :, [0, -1]]  # only the corners
    corner_coordinates = corner_coordinates.detach_()
    corner_coordinates = corner_coordinates.view(batch_size,
                                                 self.class_batch_size,
                                                 image_fm_size.h,
                                                 image_fm_size.w,
                                                 8)  # batch_size x label_batch_size x fm_height x fm_width x 8
    corner_coordinates = corner_coordinates.transpose(3, 4).transpose(2, 3)  # batch_size x label_batch_size x 8 x fm_height x fm_width

    class_boxes = BoxList(class_boxes_xyxy.view(-1, 4), image_fm_size, mode="xyxy")
    default_boxes_wrt_image = BoxList(default_boxes_xyxy_wrt_image.view(-1, 4), image_fm_size, mode="xyxy")
    default_boxes_with_image_batches = cat_boxlist([default_boxes_wrt_image] * batch_size * self.class_batch_size)

    output_localization = Os2dBoxCoder.build_loc_targets(class_boxes, default_boxes_with_image_batches)  # num_boxes x 4
    output_localization = output_localization.view(batch_size,
                                                   self.class_batch_size,
                                                   image_fm_size.h,
                                                   image_fm_size.w,
                                                   4)  # batch_size x label_batch_size x fm_height x fm_width x 4
    output_localization = output_localization.transpose(3, 4).transpose(2, 3)  # batch_size x label_batch_size x 4 x fm_height x fm_width

    output_recognition = (matches_summed - 1.0) / 2.0
    output_recognition_transform_detached = (matches_summed_transform_detached - 1.0) / 2.0
    return output_localization, output_recognition, output_recognition_transform_detached, corner_coordinates
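
# A minimal sketch (not part of the original code): it spells out the tensor shapes promised by the
# docstring of forward() above. `head` and `feature_maps` are assumed to be a constructed head
# instance and an input feature map of size b^A x d x h^A x w^A; the function name is hypothetical.
def check_head_output_shapes_example(head, feature_maps):
    loc, recognition, recognition_detached, corners = head(feature_maps)
    b_A = feature_maps.size(0)                           # image batch size
    b_C = head.class_batch_size                          # class batch size
    h_A, w_A = feature_maps.size(-2), feature_maps.size(-1)  # spatial size of the input feature map
    assert loc.shape == (b_A, b_C, 4, h_A, w_A)                # localization output (box encoding)
    assert recognition.shape == (b_A, b_C, 1, h_A, w_A)        # recognition scores per class
    assert recognition_detached.shape == recognition.shape     # same, transform detached
    assert corners.shape == (b_A, b_C, 8, h_A, w_A)            # transformed box corners (for visualization)
    return loc, recognition, recognition_detached, corners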
def mine_hard_patches(dataloader, net, cfg, criterion):
    """Mine patches that are hard: classification false positives and negatives, and localization errors.
    At each level of the sampled image pyramid, we need to cut out a piece of size appropriate for training
    (levels are defined by cfg.train.mining.num_random_pyramid_scales, cfg.train.mining.num_random_negative_classes).

    Args:
        dataloader - dataloader to use (often the same as the one for training)
        net - the network to use
        cfg - config with all the parameters
        criterion - criterion (usually the same one as used for training)

    Returns:
        hardnegdata_per_imageid (OrderedDict) - mined data, keys are the image ids;
            further used in dataloader.set_hard_negative_data(hardnegdata_per_imageid) when preparing batches
    """
    logger = logging.getLogger("OS2D.mining_hard_patches")
    logger.info("Starting to mine hard patches")
    t_start_mining = time.time()
    net.eval()
    num_batches = len(dataloader)
    hardnegdata_per_imageid = OrderedDict()

    iterator = make_iterator_extract_scores_from_images_batched(dataloader, net, logger,
                                                                image_batch_size=cfg.eval.batch_size,
                                                                is_cuda=cfg.is_cuda,
                                                                num_random_pyramid_scales=cfg.train.mining.num_random_pyramid_scales,
                                                                num_random_negative_labels=cfg.train.mining.num_random_negative_classes)

    boxes = []
    gt_boxes = []
    losses = OrderedDict()

    # loop over all dataset images
    for data in iterator:
        t_item_start = time.time()

        image_id, image_loc_scores_pyramid, image_class_scores_pyramid, \
            image_pyramid, query_img_sizes, \
            batch_class_ids, box_reverse_transform_pyramid, image_fm_sizes_p, transform_corners_pyramid \
            = data

        img_size_pyramid = [FeatureMapSize(img=image) for image in image_pyramid]

        gt_boxes_one_image = dataloader.get_image_annotation_for_imageid(image_id)
        gt_boxes.append(gt_boxes_one_image)

        # compute losses
        # change labels to the ones local to the current image
        dataloader.update_box_labels_to_local(gt_boxes_one_image, batch_class_ids)
        num_labels = len(batch_class_ids)

        loc_targets_pyramid, class_targets_pyramid = \
            dataloader.box_coder.encode_pyramid(gt_boxes_one_image,
                                                img_size_pyramid, num_labels,
                                                default_box_transform_pyramid=box_reverse_transform_pyramid)

        # visualize GT for debug
        if cfg.visualization.mining.show_gt_boxes:
            visualizer.show_gt_boxes(image_id, gt_boxes_one_image, batch_class_ids, dataloader)

        # compute losses
        if cfg.is_cuda:
            loc_targets_pyramid = [loc_targets.cuda() for loc_targets in loc_targets_pyramid]
            class_targets_pyramid = [class_targets.cuda() for class_targets in class_targets_pyramid]

        add_batch_dim = lambda list_of_tensors: [t.unsqueeze(0) for t in list_of_tensors]
        loc_scores_pyramid = add_batch_dim(image_loc_scores_pyramid)

        cls_targets_remapped_pyramid = []
        for loc_scores, img_size, box_reverse_transform in zip(loc_scores_pyramid,
                                                               img_size_pyramid,
                                                               box_reverse_transform_pyramid):
            # loop over the pyramid levels
            cls_targets_remapped, ious_anchor, ious_anchor_corrected = \
                dataloader.box_coder.remap_anchor_targets(loc_scores, [img_size], query_img_sizes, [gt_boxes_one_image],
                                                          box_reverse_transform=[box_reverse_transform])
            cls_targets_remapped_pyramid.append(cls_targets_remapped)

        losses_iter, losses_per_anchor = criterion(loc_scores_pyramid,
                                                   add_batch_dim(loc_targets_pyramid),
                                                   add_batch_dim(image_class_scores_pyramid),
                                                   add_batch_dim(class_targets_pyramid),
                                                   cls_targets_remapped=cls_targets_remapped_pyramid,
                                                   patch_mining_mode=True)

        if cfg.visualization.mining.show_class_heatmaps:
            visualizer.show_class_heatmaps(image_id, batch_class_ids, image_fm_sizes_p,
                                           class_targets_pyramid, image_class_scores_pyramid,
                                           cfg_local=cfg.visualization.mining)

        assert dataloader.data_augmentation is not None, "Can mine hard patches only through data augmentation"
        crop_size = dataloader.data_augmentation.random_crop_size

        # convert to floats
        for l in losses_iter:
            losses_iter[l] = losses_iter[l].mean().item()
        # printing
        print_meters(losses_iter, logger)
        # update logs
        add_to_meters_in_dict(losses_iter, losses)

        # construct crop boxes for all the anchors and NMS them - NMS pos and neg anchors separately
        query_fm_sizes = [dataloader.box_coder._get_feature_map_size_per_image_size(sz) for sz in query_img_sizes]

        crops = []
        anchors = []
        labels_of_anchors = []
        pyramid_level_of_anchors = []
        losses_of_anchors = []
        corners_of_anchors = []
        losses_loc_of_anchors = []
        pos_mask_of_anchors = []
        pos_loc_mask_of_anchors = []
        neg_mask_of_anchors = []
        anchor_indices = []
        i_image_in_batch = 0  # only one image comes here
        for i_p, img_size in enumerate(img_size_pyramid):
            for i_label, query_fm_size in enumerate(query_fm_sizes):
                crop_position, anchor_position, anchor_index = \
                    dataloader.box_coder.output_box_grid_generator.get_box_to_cut_anchor(img_size,
                                                                                         crop_size,
                                                                                         image_fm_sizes_p[i_p],
                                                                                         box_reverse_transform_pyramid[i_p])
                cur_corners = transform_corners_pyramid[i_p][i_label].transpose(0, 1)
                cur_corners = dataloader.box_coder.apply_transform_to_corners(cur_corners,
                                                                              box_reverse_transform_pyramid[i_p],
                                                                              img_size)
                if cfg.is_cuda:
                    crop_position, anchor_position = crop_position.cuda(), anchor_position.cuda()
                crops.append(crop_position)
                anchors.append(anchor_position)

                device = crop_position.bbox_xyxy.device
                losses_of_anchors.append(losses_per_anchor["cls_loss"][i_p][i_image_in_batch, i_label].to(crop_position.bbox_xyxy))
                pos_mask_of_anchors.append(losses_per_anchor["pos_mask"][i_p][i_image_in_batch, i_label].to(device=device))
                neg_mask_of_anchors.append(losses_per_anchor["neg_mask"][i_p][i_image_in_batch, i_label].to(device=device))
                losses_loc_of_anchors.append(losses_per_anchor["loc_loss"][i_p][i_image_in_batch, i_label].to(crop_position.bbox_xyxy))
                pos_loc_mask_of_anchors.append(losses_per_anchor["pos_for_regression"][i_p][i_image_in_batch, i_label].to(device=device))
                corners_of_anchors.append(cur_corners.to(crop_position.bbox_xyxy))

                num_anchors = len(crop_position)
                labels_of_anchors.append(torch.full([num_anchors], i_label, dtype=torch.long))
                pyramid_level_of_anchors.append(torch.full([num_anchors], i_p, dtype=torch.long))
                anchor_indices.append(anchor_index)

        # stack all
        crops = cat_boxlist(crops)
        anchors = cat_boxlist(anchors)
        labels_of_anchors = torch.cat(labels_of_anchors, 0)
        pyramid_level_of_anchors = torch.cat(pyramid_level_of_anchors, 0)
        losses_of_anchors = torch.cat(losses_of_anchors, 0)
        losses_loc_of_anchors = torch.cat(losses_loc_of_anchors, 0)
        pos_mask_of_anchors = torch.cat(pos_mask_of_anchors, 0)
        pos_loc_mask_of_anchors = torch.cat(pos_loc_mask_of_anchors, 0)
        neg_mask_of_anchors = torch.cat(neg_mask_of_anchors, 0)
        anchor_indices = torch.cat(anchor_indices, 0)
        corners_of_anchors = torch.cat(corners_of_anchors, 0)

        def nms_masked_and_collect_data(mask, crops_xyxy, scores, nms_iou_threshold_in_mining, max_entries=None):
            mask_ids = torch.nonzero(mask).squeeze(1)
            boxes_selected = copy.deepcopy(crops_xyxy[mask])
            boxes_selected.add_field("scores", scores[mask])
            remaining_boxes = nms(boxes_selected, nms_iou_threshold_in_mining)
            remaining_boxes = mask_ids[remaining_boxes]

            # sort and take the top-k, because NMS is not sorting by default
            ids = torch.argsort(scores[remaining_boxes], descending=True)
            if max_entries is not None:
                ids = ids[:max_entries]
            remaining_boxes = remaining_boxes[ids]

            return remaining_boxes

        nms_iou_threshold_in_mining = cfg.train.mining.nms_iou_threshold_in_mining
        num_hard_patches_per_image = cfg.train.mining.num_hard_patches_per_image

        # hard negatives
        hard_negs = nms_masked_and_collect_data(neg_mask_of_anchors, crops, losses_of_anchors,
                                                nms_iou_threshold_in_mining,
                                                num_hard_patches_per_image)

        # hard positives for classification
        hard_pos = nms_masked_and_collect_data(pos_mask_of_anchors, crops, losses_of_anchors,
                                               nms_iou_threshold_in_mining,
                                               num_hard_patches_per_image)

        # hard positives for localization
        hard_pos_loc = nms_masked_and_collect_data(pos_loc_mask_of_anchors, crops, losses_loc_of_anchors,
                                                   nms_iou_threshold_in_mining,
                                                   num_hard_patches_per_image)

        # merge all together
        def standardize(v):
            return v.item() if type(v) == torch.Tensor else v

        def add_item(data, role, pyramid_level, label_local, anchor_index,
                     crop_position_xyxy, anchor_position_xyxy, transform_corners):
            new_item = OrderedDict()
            new_item["pyramid_level"] = standardize(pyramid_level)
            new_item["label_local"] = standardize(label_local)
            new_item["anchor_index"] = standardize(anchor_index)
            new_item["role"] = role
            new_item["crop_position_xyxy"] = crop_position_xyxy
            new_item["anchor_position_xyxy"] = anchor_position_xyxy
            new_item["transform_corners"] = transform_corners
            data.append(new_item)

        hardnegdata = []
        for i in hard_negs:
            add_item(hardnegdata, "neg", pyramid_level_of_anchors[i],
                     labels_of_anchors[i], anchor_indices[i],
                     crops[i].cpu(), anchors[i].cpu(), corners_of_anchors[i].cpu())
        for i in hard_pos:
            add_item(hardnegdata, "pos", pyramid_level_of_anchors[i],
                     labels_of_anchors[i], anchor_indices[i],
                     crops[i].cpu(), anchors[i].cpu(), corners_of_anchors[i].cpu())
        for i in hard_pos_loc:
            add_item(hardnegdata, "pos_loc", pyramid_level_of_anchors[i],
                     labels_of_anchors[i], anchor_indices[i],
                     crops[i].cpu(), anchors[i].cpu(), corners_of_anchors[i].cpu())

        # extract loss values and compute the box positions to crop
        for a in hardnegdata:
            a["label_global"] = standardize(batch_class_ids[a["label_local"]])
            a["loss"] = standardize(losses_per_anchor["cls_loss"][a["pyramid_level"]][i_image_in_batch, a["label_local"], a["anchor_index"]])
            a["loss_loc"] = standardize(losses_per_anchor["loc_loss"][a["pyramid_level"]][i_image_in_batch, a["label_local"], a["anchor_index"]])
            a["score"] = standardize(image_class_scores_pyramid[a["pyramid_level"]][a["label_local"], a["anchor_index"]])
            a["image_id"] = standardize(image_id)

        hardnegdata_per_imageid[image_id] = hardnegdata

        if cfg.visualization.mining.show_mined_patches:
            visualizer.show_mined_patches(image_id, batch_class_ids, dataloader, hardnegdata)

        logger.info("Item time: {0}, since mining start: {1}".format(time_since(t_item_start),
                                                                     time_since(t_start_mining)))

    logger.info("Hard negative mining finished in {0}".format(time_since(t_start_mining)))
    return hardnegdata_per_imageid
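
# A minimal sketch (not part of the original code): it illustrates the intended consumption of the
# mined data, as stated in the docstring of mine_hard_patches above. The wrapper name and the point
# in the training loop at which re-mining happens are assumptions.
def refresh_hard_patches_example(dataloader, net, cfg, criterion):
    hardnegdata_per_imageid = mine_hard_patches(dataloader, net, cfg, criterion)
    # hand the mined patches back to the dataloader so that subsequent batches sample them
    dataloader.set_hard_negative_data(hardnegdata_per_imageid)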
def save_cropped_boxes(dataset, tgt_image_path, extension=".jpg", num_random_crops_per_image=0):
    # crop all the boxes
    db = {"cids": [], "cluster": [], "gtbboxid": [], "classid": [], "imageid": [],
          "difficult": [], "type": [], "size": [], "bbox": []}

    for image_id in tqdm(dataset.image_ids):
        img = dataset._get_dataset_image_by_id(image_id)
        boxes = dataset.get_image_annotation_for_imageid(image_id)

        assert boxes.has_field("labels"), "GT boxes need a field 'labels'"
        # remove all fields except "labels" and "difficult"
        for f in boxes.fields():
            if f not in ["labels", "difficult"]:
                boxes.remove_field(f)
        if not boxes.has_field("difficult"):
            boxes.add_field("difficult", torch.zeros(len(boxes), dtype=torch.bool))

        num_gt_boxes = len(boxes)
        im_size = FeatureMapSize(img=img)
        assert im_size == boxes.image_size

        eval_scale = dataset.get_eval_scale()

        # sample random boxes if needed
        if num_random_crops_per_image > 0:
            boxes_random = torch.rand(num_random_crops_per_image, 4)
            x1 = torch.min(boxes_random[:, 0], boxes_random[:, 2]) * im_size.w
            x2 = torch.max(boxes_random[:, 0], boxes_random[:, 2]) * im_size.w
            y1 = torch.min(boxes_random[:, 1], boxes_random[:, 3]) * im_size.h
            y2 = torch.max(boxes_random[:, 1], boxes_random[:, 3]) * im_size.h
            boxes_random = torch.stack([x1, y1, x2, y2], 1).floor()

            # filter out boxes that are too small
            min_size = 10.0 / eval_scale * max(im_size.w, im_size.h)
            mask_bad_boxes = (boxes_random[:, 0] + min_size > boxes_random[:, 2]) | \
                             (boxes_random[:, 1] + min_size > boxes_random[:, 3])
            good_boxes = torch.nonzero(~mask_bad_boxes).view(-1)
            boxes_random = boxes_random[good_boxes]

            boxes_random = BoxList(boxes_random, im_size, mode="xyxy")
            boxes_random.add_field("labels", torch.full([len(boxes_random)], -1, dtype=torch.long))
            boxes_random.add_field("difficult", torch.zeros(len(boxes_random), dtype=torch.bool))
            boxes = cat_boxlist([boxes, boxes_random])

        if boxes is not None:
            for i_box in range(len(boxes)):
                # box format: left, top, right, bottom
                box = boxes[i_box].bbox_xyxy.view(-1)
                box = [b.item() for b in box]
                cropped_img = img.crop(box)

                if i_box < num_gt_boxes:
                    lbl = boxes[i_box].get_field("labels").item()
                    dif_flag = boxes[i_box].get_field("difficult").item()
                    box_id = i_box
                    box_type = "GT"
                else:
                    lbl = -1
                    dif_flag = 0
                    box_id = i_box
                    box_type = "RN"

                # create the file name to be used with cirtorch.datasets.datahelpers.cid2filename and their dataloader
                cid = "box{box_id:05d}_lbl{label:05d}_dif{dif:01d}_im{image_id:05d}{box_type}".format(box_id=box_id,
                                                                                                      image_id=image_id,
                                                                                                      label=lbl,
                                                                                                      dif=dif_flag,
                                                                                                      box_type=box_type)
                file_name = cid2filename(cid, prefix=tgt_image_path)

                # save the image
                image_path, _ = os.path.split(file_name)
                mkdir(image_path)
                if extension:
                    cropped_img.save("{}{}".format(file_name, extension))
                else:
                    # cirtorch uses files with an empty extension for training for some reason, need to support that
                    cropped_img.save("{}".format(file_name), format="jpeg")

                # add to the db structure
                db["cids"].append(cid)
                db["cluster"].append(lbl)  # use labels as clusters, not to sample negatives from the same object
                db["classid"].append(lbl)
                db["gtbboxid"].append(box_id)
                db["imageid"].append(image_id)
                db["difficult"].append(dif_flag)
                if i_box < num_gt_boxes:
                    db["type"].append("gtproposal")
                else:
                    db["type"].append("randomcrop")
                db["size"].append(cropped_img.size)
                db["bbox"].append(box)  # format (x1, y1, x2, y2)
    return db
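
# A minimal sketch (not part of the original code): it shows how the db returned by save_cropped_boxes
# could be persisted and how the crop file paths can be reconstructed from the stored cids via
# cid2filename (the same helper used above). The function name and the pickle file name are assumptions.
def save_db_example(db, tgt_image_path, db_path="cropped_boxes_db.pkl"):
    import pickle
    with open(db_path, "wb") as f:
        pickle.dump(db, f)
    # the image of the i-th crop lives at cid2filename(db["cids"][i], prefix=tgt_image_path) (plus the extension, if any)
    return [cid2filename(cid, prefix=tgt_image_path) for cid in db["cids"]]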
def evaluate_detections(self, all_boxes, output_dir, mAP_iou_threshold=0.5):
    predictions = []
    gt_boxes = []
    roidb = self.roidb
    for i_image, roi in enumerate(roidb):
        image_size = FeatureMapSize(w=roi["width"], h=roi["height"])
        if roi["boxes"].size > 0:
            roi_gt_boxes = BoxList(roi["boxes"], image_size, mode="xyxy")
        else:
            roi_gt_boxes = BoxList.create_empty(image_size)
        roi_gt_boxes.add_field("labels", torch.as_tensor(roi["gt_classes"], dtype=torch.int32))
        roi_gt_boxes.add_field("difficult", torch.as_tensor(roi["gt_ishard"], dtype=torch.int32))

        gt_boxes.append(roi_gt_boxes)

        roi_detections = []
        for i_class, class_boxes in enumerate(all_boxes):
            assert len(class_boxes) == len(roidb), \
                "Number of detections for class {0}, image {1} ({2}) is inconsistent with the length of roidb ({3})".format(
                    i_class, i_image, len(class_boxes), len(roidb))
            boxes = class_boxes[i_image]
            if len(boxes) > 0:
                assert boxes.shape[1] == 5, \
                    "Detections should be of shape (:,5), but are {0} for class {1}, image {2}".format(
                        boxes.shape, i_class, i_image)
                bbox = BoxList(boxes[:, :4], image_size, mode="xyxy")
                scores = boxes[:, -1]
                bbox.add_field("scores", torch.as_tensor(scores, dtype=torch.float32))
                bbox.add_field("labels", torch.full(scores.shape, i_class, dtype=torch.int32))
                roi_detections.append(bbox)

        if roi_detections:
            roi_detections = cat_boxlist(roi_detections)
        else:
            roi_detections = BoxList.create_empty(image_size)
            roi_detections.add_field("scores", torch.zeros((0,), dtype=torch.float32))
            roi_detections.add_field("labels", torch.zeros((0,), dtype=torch.int32))
        predictions.append(roi_detections)

        if False:
            self.visualize_detections(i_image, gt=roi_gt_boxes, dets=roi_detections)

    ap_data = do_voc_evaluation(predictions, gt_boxes,
                                iou_thresh=mAP_iou_threshold,
                                use_07_metric=False)
    print("mAP@{:0.2f}: {:0.4f}".format(mAP_iou_threshold, ap_data["map"]))
    print("mAPw@{:0.2f}: {:0.4f}".format(mAP_iou_threshold, ap_data["map_weighted"]))
    print("recall@{:0.2f}: {:0.4f}".format(mAP_iou_threshold, ap_data["recall"]))

    return ap_data["map"]
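
# A minimal sketch (not part of the original code): it shows the `all_boxes` layout that
# evaluate_detections expects, as implied by the assertions above - one list per class, and per image
# an (N, 5) array of [x1, y1, x2, y2, score] rows. The function name and the numbers are made up.
def make_all_boxes_example(num_classes, num_images):
    import numpy as np
    all_boxes = [[np.zeros((0, 5), dtype=np.float32) for _ in range(num_images)]
                 for _ in range(num_classes)]
    # one fake detection of class 1 in image 0: box (10, 20, 50, 80) with score 0.9
    all_boxes[1][0] = np.array([[10.0, 20.0, 50.0, 80.0, 0.9]], dtype=np.float32)
    return all_boxes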