def convert_to_coco_dict(dataset_name, dataset_dicts, metadata):
    """
    Convert a dataset in cvpods's standard format into COCO json format.

    COCO data format description can be found here:
    http://cocodataset.org/#format-data

    Args:
        dataset_name: name of the source dataset; must be registered in
            DatasetCatalog and in cvpods's standard format

    Returns:
        coco_dict: serializable dict in COCO json format
    """
    if dataset_name not in [
            "citypersons_train", "citypersons_val", "crowdhuman_train",
            "crowdhuman_val", "coco_2017_train", "coco_2017_val",
            "widerface_2019_train", "widerface_2019_val"
    ]:
        raise NotImplementedError(
            "Dataset name '{}' not supported".format(dataset_name))

    # unmap the category mapping ids for COCO
    if hasattr(metadata, "thing_dataset_id_to_contiguous_id"):
        reverse_id_mapping = {
            v: k
            for k, v in metadata.thing_dataset_id_to_contiguous_id.items()
        }

        def reverse_id_mapper(contiguous_id):
            return reverse_id_mapping[contiguous_id]  # noqa
    else:

        def reverse_id_mapper(contiguous_id):
            return contiguous_id  # noqa

    categories = [{
        "id": reverse_id_mapper(id),
        "name": name
    } for id, name in enumerate(metadata.thing_classes)]

    logger.info("Converting dataset dicts into COCO format")
    coco_images = []
    coco_annotations = []

    for image_id, image_dict in enumerate(dataset_dicts):
        coco_image = {
            "id": image_dict.get("image_id", image_id),
            "width": image_dict["width"],
            "height": image_dict["height"],
            "file_name": image_dict["file_name"],
        }
        coco_images.append(coco_image)

        anns_per_image = image_dict["annotations"]
        for annotation in anns_per_image:
            # create a new dict with only COCO fields
            coco_annotation = {}

            # COCO requirement: XYWH box format
            bbox = annotation["bbox"]
            bbox_mode = annotation["bbox_mode"]
            bbox = BoxMode.convert(bbox, bbox_mode, BoxMode.XYWH_ABS)

            # COCO requirement: instance area
            if "segmentation" in annotation:
                # Computing areas for instances by counting the pixels
                segmentation = annotation["segmentation"]
                # TODO: check segmentation type: RLE, BinaryMask or Polygon
                polygons = PolygonMasks([segmentation])
                area = polygons.area()[0].item()
            else:
                # Computing areas using bounding boxes
                bbox_xy = BoxMode.convert(bbox, BoxMode.XYWH_ABS,
                                          BoxMode.XYXY_ABS)
                area = Boxes([bbox_xy]).area()[0].item()

            if "keypoints" in annotation:
                keypoints = annotation["keypoints"]  # list[int]
                for idx, v in enumerate(keypoints):
                    if idx % 3 != 2:
                        # COCO's segmentation coordinates are floating points in [0, H or W],
                        # but keypoint coordinates are integers in [0, H-1 or W-1].
                        # For COCO format consistency we subtract 0.5
                        # https://github.com/facebookresearch/detectron2/pull/175#issuecomment-551202163
                        keypoints[idx] = v - 0.5
                if "num_keypoints" in annotation:
                    num_keypoints = annotation["num_keypoints"]
                else:
                    num_keypoints = sum(kp > 0 for kp in keypoints[2::3])

            # COCO requirement:
            #   linking annotations to images
            #   "id" field must start with 1
            coco_annotation["id"] = len(coco_annotations) + 1
            coco_annotation["image_id"] = coco_image["id"]
            coco_annotation["bbox"] = [round(float(x), 3) for x in bbox]
            coco_annotation["area"] = area
            coco_annotation["category_id"] = reverse_id_mapper(
                annotation["category_id"])
            coco_annotation["iscrowd"] = annotation.get("iscrowd", 0)

            # Add optional fields
            if "keypoints" in annotation:
                coco_annotation["keypoints"] = keypoints
                coco_annotation["num_keypoints"] = num_keypoints

            if "segmentation" in annotation:
                coco_annotation["segmentation"] = annotation["segmentation"]

            coco_annotations.append(coco_annotation)

    logger.info(
        "Conversion finished, "
        f"num images: {len(coco_images)}, num annotations: {len(coco_annotations)}"
    )

    info = {
        "date_created": str(datetime.datetime.now()),
        "description": "Automatically generated COCO json file for cvpods.",
    }
    coco_dict = {
        "info": info,
        "images": coco_images,
        "annotations": coco_annotations,
        "categories": categories,
        "licenses": None,
    }
    return coco_dict
def inference_single_image(self, pred_logits, pred_deltas, pred_masks, anchors, indexes, image_size): """ Single-image inference. Return bounding-box detection results by thresholding on scores and applying non-maximum suppression (NMS). Arguments: pred_logits (list[Tensor]): list of #feature levels. Each entry contains tensor of size (AxHxW, K) pred_deltas (list[Tensor]): Same shape as 'pred_logits' except that K becomes 4. pred_masks (list[list[Tensor]]): List of #feature levels, each is a list of #anchors. Each entry contains tensor of size (M_i*M_i, H, W). `None` if mask_on=False. anchors (list[Boxes]): list of #feature levels. Each entry contains a Boxes object, which contains all the anchors for that image in that feature level. image_size (tuple(H, W)): a tuple of the image height and width. Returns: Same as `inference`, but for only one image. """ pred_logits = pred_logits.flatten().sigmoid_() # We get top locations across all levels to accelerate the inference speed, # which does not seem to affect the accuracy. # First select values above the threshold logits_top_idxs = torch.where(pred_logits > self.score_threshold)[0] # Then get the top values num_topk = min(self.topk_candidates, logits_top_idxs.shape[0]) pred_prob, topk_idxs = pred_logits[logits_top_idxs].sort( descending=True) # Keep top k scoring values pred_prob = pred_prob[:num_topk] # Keep top k values top_idxs = logits_top_idxs[topk_idxs[:num_topk]] # class index cls_idxs = top_idxs % self.num_classes # HWA index top_idxs //= self.num_classes # predict boxes pred_boxes = self.box2box_transform.apply_deltas( pred_deltas[top_idxs], anchors[top_idxs].tensor) # apply nms keep = generalized_batched_nms(pred_boxes, pred_prob, cls_idxs, self.nms_threshold, nms_type=self.nms_type) # pick the top ones keep = keep[:self.detections_im] results = Instances(image_size) results.pred_boxes = Boxes(pred_boxes[keep]) results.scores = pred_prob[keep] results.pred_classes = cls_idxs[keep] # deal with masks result_masks, result_anchors = [], None if self.mask_on: # index and anchors, useful for masks top_indexes = indexes[top_idxs] top_anchors = anchors[top_idxs] result_indexes = top_indexes[keep] result_anchors = top_anchors[keep] # Get masks and do sigmoid for lvl, _, h, w, anc in result_indexes.tolist(): cur_size = self.mask_sizes[anc] * (2**lvl if self.bipyramid_on else 1) result_masks.append( torch.sigmoid(pred_masks[lvl][anc][:, h, w].view( 1, cur_size, cur_size))) return results, (result_masks, result_anchors)
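# Standalone sketch of the flat-index decoding used above: once per-location class
# scores of shape (HWA, K) are flattened to (HWA * K,), an index i maps to class
# i % K and location i // K. The sizes below are arbitrary example values.
import torch

K = 5                        # number of classes (example value)
flat_scores = torch.rand(12 * K)
top = flat_scores.topk(3).indices
cls_idxs = top % K           # class index
loc_idxs = top // K          # HWA (location / anchor) index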
def label_and_sample_proposals( self, proposals: List[Instances], targets: List[Instances]) -> List[Instances]: """ Prepare some proposals to be used to train the ROI heads. It performs box matching between `proposals` and `targets`, and assigns training labels to the proposals. It returns ``self.batch_size_per_image`` random samples from proposals and groundtruth boxes, with a fraction of positives that is no larger than ``self.positive_sample_fraction``. Args: See :meth:`ROIHeads.forward` Returns: list[Instances]: length `N` list of `Instances`s containing the proposals sampled for training. Each `Instances` has the following fields: - proposal_boxes: the proposal boxes - gt_boxes: the ground-truth box that the proposal is assigned to (this is only meaningful if the proposal has a label > 0; if label = 0 then the ground-truth box is random) Other fields such as "gt_classes", "gt_masks", that's included in `targets`. """ gt_boxes = [x.gt_boxes for x in targets] # Augment proposals with ground-truth boxes. # In the case of learned proposals (e.g., RPN), when training starts # the proposals will be low quality due to random initialization. # It's possible that none of these initial # proposals have high enough overlap with the gt objects to be used # as positive examples for the second stage components (box head, # cls head, mask head). Adding the gt boxes to the set of proposals # ensures that the second stage components will have some positive # examples from the start of training. For RPN, this augmentation improves # convergence and empirically improves box AP on COCO by about 0.5 # points (under one tested configuration). if self.proposal_append_gt: proposals = add_ground_truth_to_proposals(gt_boxes, proposals) proposals_with_gt = [] num_fg_samples = [] num_bg_samples = [] for proposals_per_image, targets_per_image in zip(proposals, targets): has_gt = len(targets_per_image) > 0 match_quality_matrix = pairwise_iou( targets_per_image.gt_boxes, proposals_per_image.proposal_boxes) match_quality_ignore = pairwise_ioa( targets_per_image.gt_boxes, proposals_per_image.proposal_boxes, targets_per_image.gt_classes) matched_idxs, matched_labels = self.proposal_matcher( match_quality_matrix, match_quality_ignore) sampled_idxs, gt_classes = self._sample_proposals( matched_idxs, matched_labels, targets_per_image.gt_classes) # Set target attributes of the sampled proposals: proposals_per_image = proposals_per_image[sampled_idxs] proposals_per_image.gt_classes = gt_classes # We index all the attributes of targets that start with "gt_" # and have not been added to proposals yet (="gt_classes"). if has_gt: sampled_targets = matched_idxs[sampled_idxs] # NOTE: here the indexing waste some compute, because heads # like masks, keypoints, etc, will filter the proposals again, # (by foreground/background, or number of keypoints in the image, etc) # so we essentially index the data twice. 
for (trg_name, trg_value) in targets_per_image.get_fields().items(): if trg_name.startswith( "gt_") and not proposals_per_image.has(trg_name): proposals_per_image.set(trg_name, trg_value[sampled_targets]) else: gt_boxes = Boxes( targets_per_image.gt_boxes.tensor.new_zeros( (len(sampled_idxs), 4))) proposals_per_image.gt_boxes = gt_boxes num_bg_samples.append( (gt_classes == self.num_classes).sum().item()) num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1]) proposals_with_gt.append(proposals_per_image) # Log the number of fg/bg samples that are selected for training ROI heads storage = get_event_storage() storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples)) storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples)) return proposals_with_gt
def _inference_one_image(self, inputs): augmented_inputs = self.tta_mapper(inputs) assert len({x["file_name"] for x in augmented_inputs }) == 1, "inference different images" heights = [k["height"] for k in augmented_inputs] widths = [k["width"] for k in augmented_inputs] assert ( len(set(heights)) == 1 and len(set(widths)) == 1 ), "Augmented version of the inputs should have the same original resolution!" height = heights[0] width = widths[0] # 1. Detect boxes from all augmented versions # TODO wangfeng02: use box structures instead of boxes, scores and classes all_boxes = [] all_scores = [] all_classes = [] factors = 2 if self.tta_mapper.flip else 1 if self.enable_scale_filter: assert len(augmented_inputs) == len(self.scale_ranges) * factors for i, single_input in enumerate(augmented_inputs): do_hflip = single_input.pop("horiz_flip", False) # 1.1: forward with single augmented image output = self.model._inference_for_ms_test([single_input]) # 1.2: union the results pred_boxes = output.get("pred_boxes").tensor if do_hflip: pred_boxes[:, [0, 2]] = width - pred_boxes[:, [2, 0]] pred_scores = output.get("scores") pred_classes = output.get("pred_classes") if self.enable_scale_filter: keep = filter_boxes(pred_boxes, *self.scale_ranges[i // factors]) pred_boxes = pred_boxes[keep] pred_scores = pred_scores[keep] pred_classes = pred_classes[keep] all_boxes.append(pred_boxes) all_scores.append(pred_scores) all_classes.append(pred_classes) boxes_all = torch.cat(all_boxes, dim=0) scores_all = torch.cat(all_scores, dim=0) class_idxs_all = torch.cat(all_classes, dim=0) boxes_all, scores_all, class_idxs_all = merge_result_from_multi_scales( boxes_all, scores_all, class_idxs_all, nms_type="soft_vote", vote_thresh=0.65, max_detection=self.max_detection) result = Instances((height, width)) result.pred_boxes = Boxes(boxes_all) result.scores = scores_all result.pred_classes = class_idxs_all return {"instances": result}
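# Quick standalone check of the horizontal-flip mapping used above: x-coordinates
# are mirrored about the image width, and the column swap keeps x1 <= x2.
# The width and box values are made up.
import torch

width = 100.0
boxes = torch.tensor([[10.0, 5.0, 30.0, 25.0]])   # (x1, y1, x2, y2)
boxes[:, [0, 2]] = width - boxes[:, [2, 0]]
# boxes is now tensor([[70., 5., 90., 25.]])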
def inference_single_image(self, box_cls, box_delta, shifts, image_size): """ Single-image inference. Return bounding-box detection results by thresholding on scores and applying non-maximum suppression (NMS). Arguments: box_cls (list[Tensor]): list of #feature levels. Each entry contains tensor of size (H x W, K) box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4. shifts (list[Tensor]): list of #feature levels. Each entry contains a tensor, which contains all the shifts for that image in that feature level. image_size (tuple(H, W)): a tuple of the image height and width. Returns: Same as `inference`, but for only one image. """ boxes_all = [] scores_all = [] class_idxs_all = [] # Iterate over every feature level for box_cls_i, box_reg_i, shifts_i in zip(box_cls, box_delta, shifts): # (HxWxK,) box_cls_i = box_cls_i.sigmoid_().flatten() # Keep top k top scoring indices only. num_topk = min(self.topk_candidates, box_reg_i.size(0)) # torch.sort is actually faster than .topk (at least on GPUs) predicted_prob, topk_idxs = box_cls_i.sort(descending=True) predicted_prob = predicted_prob[:num_topk] topk_idxs = topk_idxs[:num_topk] # filter out the proposals with low confidence score keep_idxs = predicted_prob > self.score_threshold predicted_prob = predicted_prob[keep_idxs] topk_idxs = topk_idxs[keep_idxs] shift_idxs = topk_idxs // self.num_classes classes_idxs = topk_idxs % self.num_classes box_reg_i = box_reg_i[shift_idxs] shifts_i = shifts_i[shift_idxs] # predict boxes predicted_boxes = self.shift2box_transform.apply_deltas( box_reg_i, shifts_i) boxes_all.append(predicted_boxes) scores_all.append(predicted_prob) class_idxs_all.append(classes_idxs) boxes_all, scores_all, class_idxs_all = [ cat(x) for x in [boxes_all, scores_all, class_idxs_all] ] if self.nms_type is None: # strategies above (e.g. topk_candidates and score_threshold) are # useless for POTO, just keep them for debug and analysis keep = scores_all.argsort(descending=True) else: keep = generalized_batched_nms(boxes_all, scores_all, class_idxs_all, self.nms_threshold, nms_type=self.nms_type) keep = keep[:self.max_detections_per_image] result = Instances(image_size) result.pred_boxes = Boxes(boxes_all[keep]) result.scores = scores_all[keep] result.pred_classes = class_idxs_all[keep] return result
def func(x):
    # Return per-box areas for a tensor of XYXY boxes.
    boxes = Boxes(x)
    return boxes.area()
def losses(self, shifts, gt_instances, box_cls, box_delta, box_center): box_cls_flattened = [ permute_to_N_HWA_K(x, self.num_classes) for x in box_cls ] box_delta_flattened = [permute_to_N_HWA_K(x, 4) for x in box_delta] box_center_flattened = [permute_to_N_HWA_K(x, 1) for x in box_center] pred_class_logits = cat(box_cls_flattened, dim=1) pred_shift_deltas = cat(box_delta_flattened, dim=1) pred_obj_logits = cat(box_center_flattened, dim=1) pred_class_probs = pred_class_logits.sigmoid() pred_obj_probs = pred_obj_logits.sigmoid() pred_box_probs = [] num_foreground = pred_class_logits.new_zeros(1) num_background = pred_class_logits.new_zeros(1) positive_losses = [] gaussian_norm_losses = [] for shifts_per_image, gt_instances_per_image, \ pred_class_probs_per_image, pred_shift_deltas_per_image, \ pred_obj_probs_per_image in zip( shifts, gt_instances, pred_class_probs, pred_shift_deltas, pred_obj_probs): locations = torch.cat(shifts_per_image, dim=0) labels = gt_instances_per_image.gt_classes gt_boxes = gt_instances_per_image.gt_boxes target_shift_deltas = self.shift2box_transform.get_deltas( locations, gt_boxes.tensor.unsqueeze(1)) is_in_boxes = target_shift_deltas.min(dim=-1).values > 0 foreground_idxs = torch.nonzero(is_in_boxes, as_tuple=True) with torch.no_grad(): # predicted_boxes_per_image: a_{j}^{loc}, shape: [j, 4] predicted_boxes_per_image = self.shift2box_transform.apply_deltas( pred_shift_deltas_per_image, locations) # gt_pred_iou: IoU_{ij}^{loc}, shape: [i, j] gt_pred_iou = pairwise_iou( gt_boxes, Boxes(predicted_boxes_per_image)).max( dim=0, keepdim=True).values.repeat( len(gt_instances_per_image), 1) # pred_box_prob_per_image: P{a_{j} \in A_{+}}, shape: [j, c] pred_box_prob_per_image = torch.zeros_like( pred_class_probs_per_image) box_prob = 1 / (1 - gt_pred_iou[foreground_idxs]).clamp_(1e-12) for i in range(len(gt_instances_per_image)): idxs = foreground_idxs[0] == i if idxs.sum() > 0: box_prob[idxs] = normalize(box_prob[idxs]) pred_box_prob_per_image[foreground_idxs[1], labels[foreground_idxs[0]]] = box_prob pred_box_probs.append(pred_box_prob_per_image) normal_probs = [] for stride, shifts_i in zip(self.fpn_strides, shifts_per_image): gt_shift_deltas = self.shift2box_transform.get_deltas( shifts_i, gt_boxes.tensor.unsqueeze(1)) distances = (gt_shift_deltas[..., :2] - gt_shift_deltas[..., 2:]) / 2 normal_probs.append( normal_distribution(distances / stride, self.mu[labels].unsqueeze(1), self.sigma[labels].unsqueeze(1))) normal_probs = torch.cat(normal_probs, dim=1).prod(dim=-1) composed_cls_prob = pred_class_probs_per_image[:, labels] * pred_obj_probs_per_image # matched_gt_shift_deltas: P_{ij}^{loc} loss_box_reg = iou_loss(pred_shift_deltas_per_image.unsqueeze(0), target_shift_deltas, box_mode="ltrb", loss_type=self.iou_loss_type, reduction="none") * self.reg_weight pred_reg_probs = (-loss_box_reg).exp() # positive_losses: { -log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) ) } positive_losses.append( positive_bag_loss( composed_cls_prob.transpose(1, 0) * pred_reg_probs, is_in_boxes.float(), normal_probs)) num_foreground += len(gt_instances_per_image) num_background += normal_probs[foreground_idxs].sum().item() gaussian_norm_losses.append( len(gt_instances_per_image) / normal_probs[foreground_idxs].sum().clamp_(1e-12)) if dist.is_initialized(): dist.all_reduce(num_foreground) num_foreground /= dist.get_world_size() dist.all_reduce(num_background) num_background /= dist.get_world_size() # positive_loss: \sum_{i}{ -log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) ) } / ||B|| positive_loss = 
torch.cat(positive_losses).sum() / max( 1, num_foreground) # pred_box_probs: P{a_{j} \in A_{+}} pred_box_probs = torch.stack(pred_box_probs, dim=0) # negative_loss: \sum_{j}{ FL( (1 - P{a_{j} \in A_{+}}) * (1 - P_{j}^{bg}) ) } / n||B|| negative_loss = negative_bag_loss( pred_class_probs * pred_obj_probs * (1 - pred_box_probs), self.focal_loss_gamma).sum() / max(1, num_background) loss_pos = positive_loss * self.focal_loss_alpha loss_neg = negative_loss * (1 - self.focal_loss_alpha) loss_norm = torch.stack(gaussian_norm_losses).mean() * ( 1 - self.focal_loss_alpha) return { "loss_pos": loss_pos, "loss_neg": loss_neg, "loss_norm": loss_norm, }
def inference_single_image(self, conf_pred_per_image, loc_pred_per_image, default_boxes, image_size): """ Single-image inference. Return bounding-box detection results by thresholding on scores and applying non-maximum suppression (NMS). Args: conf_pred_per_image (list[Tensor]): list of #feature levels. Each entry contains tensor of size [Hi x Wi x D, C]. loc_pred_per_image (list[Tensor]): same shape as 'conf_pred_per_image' except that C becomes 4. default_boxes (list['Boxes']): a list of 'Boxes' elements. The Boxes contains default boxes of one image on the specific feature level. image_size (tuple(H, W)): a tuple of the image height and width. Returns: Same as `inference`, but for only one image. """ # predict confidence conf_pred = torch.cat(conf_pred_per_image, dim=0) # [R, C] conf_pred = conf_pred.softmax(dim=1) # predict boxes loc_pred = torch.cat(loc_pred_per_image, dim=0) # [R, 4] default_boxes = Boxes.cat(default_boxes) # [R, 4] boxes_pred = self.box2box_transform.apply_deltas( loc_pred, default_boxes.tensor) num_boxes, num_classes = conf_pred.shape boxes_pred = boxes_pred.view(num_boxes, 1, 4).expand(num_boxes, num_classes, 4) # [R, C, 4] labels = torch.arange(num_classes, device=self.device) # [0, ..., C] labels = labels.view(1, num_classes).expand_as(conf_pred) # [R, C] # remove predictions with the background label boxes_pred = boxes_pred[:, :-1] conf_pred = conf_pred[:, :-1] labels = labels[:, :-1] # batch everything, by making every class prediction be a separate instance boxes_pred = boxes_pred.reshape(-1, 4) conf_pred = conf_pred.reshape(-1) labels = labels.reshape(-1) # remove low scoring boxes indices = torch.nonzero(conf_pred > self.score_threshold, as_tuple=False).squeeze(1) boxes_pred, conf_pred, labels = boxes_pred[indices], conf_pred[ indices], labels[indices] keep = generalized_batched_nms(boxes_pred, conf_pred, labels, self.nms_threshold, nms_type=self.nms_type) keep = keep[:self.max_detections_per_image] result = Instances(image_size) result.pred_boxes = Boxes(boxes_pred[keep]) result.scores = conf_pred[keep] result.pred_classes = labels[keep] return result
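# Standalone illustration of the expand/reshape step above: R box predictions are
# replicated per class so that NMS can treat every (box, class) pair as an
# independent detection. Sizes are arbitrary.
import torch

R, C = 4, 3
boxes = torch.rand(R, 4)
scores = torch.rand(R, C)
boxes_per_cls = boxes.view(R, 1, 4).expand(R, C, 4).reshape(-1, 4)   # (R*C, 4)
labels = torch.arange(C).view(1, C).expand(R, C).reshape(-1)         # (R*C,)
flat_scores = scores.reshape(-1)                                     # (R*C,)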
def find_top_rpn_proposals( proposals, pred_objectness_logits, images, nms_thresh, nms_type, pre_nms_topk, post_nms_topk, min_box_side_len, training, # pylint: disable=W0613 ): """ For each feature map, select the `pre_nms_topk` highest scoring proposals, apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk` highest scoring proposals among all the feature maps if `training` is True, otherwise, returns the highest `post_nms_topk` scoring proposals for each feature map. Args: proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 4). All proposal predictions on the feature maps. pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A). images (ImageList): Input images as an :class:`ImageList`. nms_thresh (float): IoU threshold to use for NMS pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS. When RPN is run on multiple feature maps (as in FPN) this number is per feature map. post_nms_topk (int): number of top k scoring proposals to keep after applying NMS. When RPN is run on multiple feature maps (as in FPN) this number is total, over all feature maps. min_box_side_len (float): minimum proposal box side length in pixels (absolute units wrt input images). training (bool): True if proposals are to be used in training, otherwise False. This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..." comment. Returns: proposals (list[Instances]): list of N Instances. The i-th Instances stores post_nms_topk object proposals for image i. """ image_sizes = images.image_sizes # in (h, w) order num_images = len(image_sizes) device = proposals[0].device # 1. Select top-k anchor for every level and every image topk_scores = [] # #lvl Tensor, each of shape N x topk topk_proposals = [] level_ids = [] # #lvl Tensor, each of shape (topk,) batch_idx = torch.arange(num_images, device=device) for level_id, proposals_i, logits_i in zip( itertools.count(), proposals, pred_objectness_logits ): Hi_Wi_A = logits_i.shape[1] num_proposals_i = min(pre_nms_topk, Hi_Wi_A) # sort is faster than topk (https://github.com/pytorch/pytorch/issues/22812) # topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1) logits_i, idx = logits_i.sort(descending=True, dim=1) topk_scores_i = logits_i[batch_idx, :num_proposals_i] topk_idx = idx[batch_idx, :num_proposals_i] # each is N x topk topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx] # N x topk x 4 topk_proposals.append(topk_proposals_i) topk_scores.append(topk_scores_i) level_ids.append(torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device)) # 2. Concat all levels together topk_scores = cat(topk_scores, dim=1) topk_proposals = cat(topk_proposals, dim=1) level_ids = cat(level_ids, dim=0) # 3. For each image, run a per-level NMS, and choose topk results. results = [] for n, image_size in enumerate(image_sizes): boxes = Boxes(topk_proposals[n]) scores_per_img = topk_scores[n] boxes.clip(image_size) # filter empty boxes keep = boxes.nonempty(threshold=min_box_side_len) lvl = level_ids if keep.sum().item() != len(boxes): boxes, scores_per_img, lvl = boxes[keep], scores_per_img[keep], level_ids[keep] keep = generalized_batched_nms(boxes.tensor, scores_per_img, lvl, nms_thresh, nms_type=nms_type) # In Detectron1, there was different behavior during training vs. testing. # (https://github.com/facebookresearch/Detectron/issues/459) # During training, topk is over the proposals from *all* images in the training batch. 
# During testing, it is over the proposals for each image separately. # As a result, the training behavior becomes batch-dependent, # and the configuration "POST_NMS_TOPK_TRAIN" ends up relying on the batch size. # This bug is addressed in cvpods to make the behavior independent of batch size. keep = keep[:post_nms_topk] res = Instances(image_size) res.proposal_boxes = boxes[keep] res.objectness_logits = scores_per_img[keep] results.append(res) return results
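# Toy version of the per-image top-k gather used above: every image in the batch
# selects its own highest-scoring proposals with advanced indexing. Shapes are
# arbitrary example values.
import torch

N, HWA, k = 2, 6, 3
logits = torch.rand(N, HWA)
proposals = torch.rand(N, HWA, 4)
scores, idx = logits.sort(descending=True, dim=1)
batch_idx = torch.arange(N)
topk_scores = scores[batch_idx, :k]                          # (N, k)
topk_proposals = proposals[batch_idx[:, None], idx[:, :k]]   # (N, k, 4)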
def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper` . Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: * image: Tensor, image in (C, H, W) format. * instances: Instances Other information that's included in the original dicts, such as: * "height", "width" (int): the output resolution of the model, used in inference. See :meth:`postprocess` for details. Returns: dict[str: Tensor]: mapping from a named loss to a tensor storing the loss. Used during training only. """ images = self.preprocess_image(batched_inputs) B, C, H, W = images.tensor.shape device = images.tensor.device mask = torch.ones((B, H, W), dtype=torch.bool, device=device) for img_shape, m in zip(images.image_sizes, mask): m[:img_shape[0], :img_shape[1]] = False src = self.backbone(images.tensor)["res5"] mask = F.interpolate(mask[None].float(), size=src.shape[-2:]).bool()[0] pos = self.position_embedding(src, mask) hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos)[0] outputs_class = self.class_embed(hs) outputs_coord = self.bbox_embed(hs).sigmoid() out = { "pred_logits": outputs_class[-1], "pred_boxes": outputs_coord[-1] } if self.training: targets = self.convert_anno_format(batched_inputs) if self.aux_loss: out["aux_outputs"] = [{ "pred_logits": a, "pred_boxes": b } for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] loss_dict = self.criterion(out, targets) for k, v in loss_dict.items(): loss_dict[k] = v * self.weight_dict[ k] if k in self.weight_dict else v return loss_dict else: target_sizes = torch.stack([ torch.tensor([ bi.get("height", img_size[0]), bi.get("width", img_size[1]) ], device=self.device) for bi, img_size in zip(batched_inputs, images.image_sizes) ]) res = self.post_processors["bbox"](out, target_sizes) processed_results = [] # for results_per_image, input_per_image, image_size in zip( for results_per_image, _, image_size in zip( res, batched_inputs, images.image_sizes): result = Instances(image_size) result.pred_boxes = Boxes(results_per_image["boxes"].float()) result.scores = results_per_image["scores"].float() result.pred_classes = results_per_image["labels"] processed_results.append({"instances": result}) return processed_results
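# Toy illustration of the padding mask built above: True marks padded pixels,
# False marks valid image content, and the mask is then resized to the spatial
# size of the backbone feature map. All shapes here are made up.
import torch
import torch.nn.functional as F

B, H, W = 1, 8, 8
mask = torch.ones((B, H, W), dtype=torch.bool)
mask[0, :6, :5] = False                       # the real image occupies 6x5 pixels
feat = torch.rand(B, 256, 2, 2)               # stand-in for the "res5" feature map
small = F.interpolate(mask[None].float(), size=feat.shape[-2:]).bool()[0]  # (B, 2, 2)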
def inference_single_image(self, box_cls, box_center, border_cls, border_delta, bd_based_box, image_size): """ Single-image inference. Return bounding-box detection results by thresholding on scores and applying non-maximum suppression (NMS). Arguments: box_cls (list[Tensor]): list of #feature levels. Each entry contains tensor of size (H x W, K) box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4. box_center (list[Tensor]): Same shape as 'box_cls' except that K becomes 1. shifts (list[Tensor]): list of #feature levels. Each entry contains a tensor, which contains all the shifts for that image in that feature level. image_size (tuple(H, W)): a tuple of the image height and width. Returns: Same as `inference`, but for only one image. """ boxes_all = [] scores_all = [] class_idxs_all = [] border_bbox_std = bd_based_box[0].new_tensor(self.border_bbox_std) # Iterate over every feature level for box_cls_i, box_ctr_i, bd_box_cls_i, bd_box_reg_i, bd_based_box_i in zip( box_cls, box_center, border_cls, border_delta, bd_based_box): # (HxWxK,) box_cls_i = box_cls_i.sigmoid_() box_ctr_i = box_ctr_i.sigmoid_() bd_box_cls_i = bd_box_cls_i.sigmoid_() predicted_prob = (box_cls_i * bd_box_cls_i).sqrt() # filter out the proposals with low confidence score keep_idxs = predicted_prob > self.score_threshold predicted_prob = predicted_prob * box_ctr_i predicted_prob = predicted_prob[keep_idxs] # Keep top k top scoring indices only. num_topk = min(self.topk_candidates, predicted_prob.size(0)) # torch.sort is actually faster than .topk (at least on GPUs) predicted_prob, topk_idxs = predicted_prob.sort(descending=True) topk_idxs = topk_idxs[:num_topk] keep_idxs = keep_idxs.nonzero() keep_idxs = keep_idxs[topk_idxs] keep_box_idxs = keep_idxs[:, 0] classes_idxs = keep_idxs[:, 1] predicted_prob = predicted_prob[:num_topk] bd_box_reg_i = bd_box_reg_i[keep_box_idxs] bd_based_box_i = bd_based_box_i[keep_box_idxs] det_wh = (bd_based_box_i[..., 2:4] - bd_based_box_i[..., :2]) det_wh = torch.cat([det_wh, det_wh], dim=1) predicted_boxes = bd_based_box_i + (bd_box_reg_i * border_bbox_std * det_wh) boxes_all.append(predicted_boxes) scores_all.append(predicted_prob.sqrt()) class_idxs_all.append(classes_idxs) boxes_all, scores_all, class_idxs_all = [ cat(x) for x in [boxes_all, scores_all, class_idxs_all] ] keep = generalized_batched_nms(boxes_all, scores_all, class_idxs_all, self.nms_threshold, nms_type=self.nms_type) boxes_all = boxes_all[keep] scores_all = scores_all[keep] class_idxs_all = class_idxs_all[keep] number_of_detections = len(keep) # Limit to max_per_image detections **over all classes** if number_of_detections > self.max_detections_per_image > 0: image_thresh, _ = torch.kthvalue( scores_all, number_of_detections - self.max_detections_per_image + 1) keep = scores_all >= image_thresh.item() keep = torch.nonzero(keep).squeeze(1) boxes_all = boxes_all[keep] scores_all = scores_all[keep] class_idxs_all = class_idxs_all[keep] result = Instances(image_size) result.pred_boxes = Boxes(boxes_all) result.scores = scores_all result.pred_classes = class_idxs_all return result
def get_ground_truth(self, shifts, targets, pre_boxes_list): """ Args: shifts (list[list[Tensor]]): a list of N=#image elements. Each is a list of #feature level tensors. The tensors contains shifts of this image on the specific feature level. targets (list[Instances]): a list of N `Instances`s. The i-th `Instances` contains the ground-truth per-instance annotations for the i-th input image. Specify `targets` during training only. Returns: gt_classes (Tensor): An integer tensor of shape (N, R) storing ground-truth labels for each shift. R is the total number of shifts, i.e. the sum of Hi x Wi for all levels. Shifts in the valid boxes are assigned their corresponding label in the [0, K-1] range. Shifts in the background are assigned the label "K". Shifts in the ignore areas are assigned a label "-1", i.e. ignore. gt_shifts_deltas (Tensor): Shape (N, R, 4). The last dimension represents ground-truth shift2box transform targets (dl, dt, dr, db) that map each shift to its matched ground-truth box. The values in the tensor are meaningful only when the corresponding shift is labeled as foreground. gt_centerness (Tensor): An float tensor (0, 1) of shape (N, R) whose values in [0, 1] storing ground-truth centerness for each shift. border_classes (Tensor): An integer tensor of shape (N, R) storing ground-truth labels for each shift. R is the total number of shifts, i.e. the sum of Hi x Wi for all levels. Shifts in the valid boxes are assigned their corresponding label in the [0, K-1] range. Shifts in the background are assigned the label "K". Shifts in the ignore areas are assigned a label "-1", i.e. ignore. border_shifts_deltas (Tensor): Shape (N, R, 4). The last dimension represents ground-truth shift2box transform targets (dl, dt, dr, db) that map each shift to its matched ground-truth box. The values in the tensor are meaningful only when the corresponding shift is labeled as foreground. 
""" gt_classes = [] gt_shifts_deltas = [] gt_centerness = [] border_classes = [] border_shifts_deltas = [] for shifts_per_image, targets_per_image, pre_boxes in zip( shifts, targets, pre_boxes_list): object_sizes_of_interest = torch.cat([ shifts_i.new_tensor(size).unsqueeze(0).expand( shifts_i.size(0), -1) for shifts_i, size in zip( shifts_per_image, self.object_sizes_of_interest) ], dim=0) shifts_over_all_feature_maps = torch.cat(shifts_per_image, dim=0) gt_boxes = targets_per_image.gt_boxes deltas = self.shift2box_transform.get_deltas( shifts_over_all_feature_maps, gt_boxes.tensor.unsqueeze(1)) if self.center_sampling_radius > 0: centers = gt_boxes.get_centers() is_in_boxes = [] for stride, shifts_i in zip(self.fpn_strides, shifts_per_image): radius = stride * self.center_sampling_radius center_boxes = torch.cat(( torch.max(centers - radius, gt_boxes.tensor[:, :2]), torch.min(centers + radius, gt_boxes.tensor[:, 2:]), ), dim=-1) center_deltas = self.shift2box_transform.get_deltas( shifts_i, center_boxes.unsqueeze(1)) is_in_boxes.append(center_deltas.min(dim=-1).values > 0) is_in_boxes = torch.cat(is_in_boxes, dim=1) else: # no center sampling, it will use all the locations within a ground-truth box is_in_boxes = deltas.min(dim=-1).values > 0 max_deltas = deltas.max(dim=-1).values # limit the regression range for each location is_cared_in_the_level = \ (max_deltas >= object_sizes_of_interest[None, :, 0]) & \ (max_deltas <= object_sizes_of_interest[None, :, 1]) gt_positions_area = gt_boxes.area().unsqueeze(1).repeat( 1, shifts_over_all_feature_maps.size(0)) gt_positions_area[~is_in_boxes] = math.inf gt_positions_area[~is_cared_in_the_level] = math.inf # if there are still more than one objects for a position, # we choose the one with minimal area positions_min_area, gt_matched_idxs = gt_positions_area.min(dim=0) # ground truth box regression gt_shifts_reg_deltas_i = self.shift2box_transform.get_deltas( shifts_over_all_feature_maps, gt_boxes[gt_matched_idxs].tensor) # ground truth classes has_gt = len(targets_per_image) > 0 if has_gt: gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs] # Shifts with area inf are treated as background. gt_classes_i[positions_min_area == math.inf] = self.num_classes else: gt_classes_i = torch.zeros_like( gt_matched_idxs) + self.num_classes # ground truth centerness left_right = gt_shifts_reg_deltas_i[:, [0, 2]] top_bottom = gt_shifts_reg_deltas_i[:, [1, 3]] gt_centerness_i = torch.sqrt( (left_right.min(dim=-1).values / left_right.max(dim=-1).values).clamp_(min=0) * (top_bottom.min(dim=-1).values / top_bottom.max(dim=-1).values).clamp_(min=0)) gt_classes.append(gt_classes_i) gt_shifts_deltas.append(gt_shifts_reg_deltas_i) gt_centerness.append(gt_centerness_i) # border iou = pairwise_iou(Boxes(pre_boxes), gt_boxes) (max_iou, argmax_iou) = iou.max(dim=1) invalid = max_iou < self.border_iou_thresh gt_target = gt_boxes[argmax_iou].tensor border_cls_target = targets_per_image.gt_classes[argmax_iou] border_cls_target[invalid] = self.num_classes border_bbox_std = pre_boxes.new_tensor(self.border_bbox_std) pre_boxes_wh = pre_boxes[:, 2:4] - pre_boxes[:, 0:2] pre_boxes_wh = torch.cat([pre_boxes_wh, pre_boxes_wh], dim=1) border_off_target = (gt_target - pre_boxes) / (pre_boxes_wh * border_bbox_std) border_classes.append(border_cls_target) border_shifts_deltas.append(border_off_target) return ( torch.stack(gt_classes), torch.stack(gt_shifts_deltas), torch.stack(gt_centerness), torch.stack(border_classes), torch.stack(border_shifts_deltas), )
def test_rpn(self): torch.manual_seed(121) cfg = RCNNConfig() # PROPOSAL_GENERATOR: "RPN" # ANCHOR_GENERATOR: "DefaultAnchorGenerator" cfg.MODEL.RESNETS.DEPTH = 50 cfg.MODEL.RPN.BBOX_REG_WEIGHTS = (1, 1, 1, 1) backbone = build_backbone(cfg) proposal_generator = RPN(cfg, backbone.output_shape()) num_images = 2 images_tensor = torch.rand(num_images, 20, 30) image_sizes = [(10, 10), (20, 30)] images = ImageList(images_tensor, image_sizes) image_shape = (15, 15) num_channels = 1024 features = {"res4": torch.rand(num_images, num_channels, 1, 2)} gt_boxes = torch.tensor([[1, 1, 3, 3], [2, 2, 6, 6]], dtype=torch.float32) gt_instances = Instances(image_shape) gt_instances.gt_boxes = Boxes(gt_boxes) with EventStorage(): # capture events in a new storage to discard them proposals, proposal_losses = proposal_generator( images, features, [gt_instances[0], gt_instances[1]]) expected_losses = { "loss_rpn_cls": torch.tensor(0.0804563984), "loss_rpn_loc": torch.tensor(0.0990132466), } for name in expected_losses.keys(): err_msg = "proposal_losses[{}] = {}, expected losses = {}".format( name, proposal_losses[name], expected_losses[name]) self.assertTrue( torch.allclose(proposal_losses[name], expected_losses[name]), err_msg) expected_proposal_boxes = [ Boxes(torch.tensor([[0, 0, 10, 10], [7.3365392685, 0, 10, 10]])), Boxes( torch.tensor([ [0, 0, 30, 20], [0, 0, 16.7862777710, 13.1362524033], [0, 0, 30, 13.3173446655], [0, 0, 10.8602609634, 20], [7.7165775299, 0, 27.3875980377, 20], ])), ] expected_objectness_logits = [ torch.tensor([0.1225359365, -0.0133192837]), torch.tensor([ 0.1415634006, 0.0989848152, 0.0565387346, -0.0072308783, -0.0428492837 ]), ] for proposal, expected_proposal_box, im_size, expected_objectness_logit in zip( proposals, expected_proposal_boxes, image_sizes, expected_objectness_logits): self.assertEqual(len(proposal), len(expected_proposal_box)) self.assertEqual(proposal.image_size, im_size) self.assertTrue( torch.allclose(proposal.proposal_boxes.tensor, expected_proposal_box.tensor)) self.assertTrue( torch.allclose(proposal.objectness_logits, expected_objectness_logit))
def _evaluate_box_proposals(dataset_predictions, lvis_api, thresholds=None, area="all", limit=None): """ Evaluate detection proposal recall metrics. This function is a much faster alternative to the official LVIS API recall evaluation code. However, it produces slightly different results. """ # Record max overlap value for each gt box # Return vector of overlap values areas = { "all": 0, "small": 1, "medium": 2, "large": 3, "96-128": 4, "128-256": 5, "256-512": 6, "512-inf": 7, } area_ranges = [ [0 ** 2, 1e5 ** 2], # all [0 ** 2, 32 ** 2], # small [32 ** 2, 96 ** 2], # medium [96 ** 2, 1e5 ** 2], # large [96 ** 2, 128 ** 2], # 96-128 [128 ** 2, 256 ** 2], # 128-256 [256 ** 2, 512 ** 2], # 256-512 [512 ** 2, 1e5 ** 2], ] # 512-inf assert area in areas, "Unknown area range: {}".format(area) area_range = area_ranges[areas[area]] gt_overlaps = [] num_pos = 0 for prediction_dict in dataset_predictions: predictions = prediction_dict["proposals"] # sort predictions in descending order # TODO maybe remove this and make it explicit in the documentation inds = predictions.objectness_logits.sort(descending=True)[1] predictions = predictions[inds] ann_ids = lvis_api.get_ann_ids(img_ids=[prediction_dict["image_id"]]) anno = lvis_api.load_anns(ann_ids) gt_boxes = [ BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) for obj in anno ] gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4) # guard against no boxes gt_boxes = Boxes(gt_boxes) gt_areas = torch.as_tensor([obj["area"] for obj in anno]) if len(gt_boxes) == 0 or len(predictions) == 0: continue valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1]) gt_boxes = gt_boxes[valid_gt_inds] num_pos += len(gt_boxes) if len(gt_boxes) == 0: continue if limit is not None and len(predictions) > limit: predictions = predictions[:limit] overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes) _gt_overlaps = torch.zeros(len(gt_boxes)) for j in range(min(len(predictions), len(gt_boxes))): # find which proposal box maximally covers each gt box # and get the iou amount of coverage for each gt box max_overlaps, argmax_overlaps = overlaps.max(dim=0) # find which gt box is 'best' covered (i.e. 'best' = most iou) gt_ovr, gt_ind = max_overlaps.max(dim=0) assert gt_ovr >= 0 # find the proposal box that covers the best covered gt box box_ind = argmax_overlaps[gt_ind] # record the iou coverage of this gt box _gt_overlaps[j] = overlaps[box_ind, gt_ind] assert _gt_overlaps[j] == gt_ovr # mark the proposal box and the gt box as used overlaps[box_ind, :] = -1 overlaps[:, gt_ind] = -1 # append recorded iou coverage level gt_overlaps.append(_gt_overlaps) gt_overlaps = torch.cat(gt_overlaps, dim=0) gt_overlaps, _ = torch.sort(gt_overlaps) if thresholds is None: step = 0.05 thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32) recalls = torch.zeros_like(thresholds) # compute recall for each iou threshold for i, t in enumerate(thresholds): recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos) # ar = 2 * np.trapz(recalls, thresholds) ar = recalls.mean() return { "ar": ar, "recalls": recalls, "thresholds": thresholds, "gt_overlaps": gt_overlaps, "num_pos": num_pos, }
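# Toy recall curve of the kind produced above: for each IoU threshold, recall is
# the fraction of ground-truth boxes whose best-covering proposal reaches that
# IoU, and AR is the mean over thresholds. The overlap values are made up.
import torch

gt_overlaps = torch.tensor([0.55, 0.72, 0.90, 0.40])
num_pos = 4
thresholds = torch.arange(0.5, 0.95 + 1e-5, 0.05, dtype=torch.float32)
recalls = torch.stack([(gt_overlaps >= t).float().sum() / num_pos for t in thresholds])
ar = recalls.mean()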
def get_ground_truth(self, shifts, targets): """ Args: shifts (list[list[Tensor]]): a list of N=#image elements. Each is a list of #feature level tensors. The tensors contains shifts of this image on the specific feature level. targets (list[Instances]): a list of N `Instances`s. The i-th `Instances` contains the ground-truth per-instance annotations for the i-th input image. Specify `targets` during training only. Returns: gt_classes (Tensor): An integer tensor of shape (N, R) storing ground-truth labels for each shift. R is the total number of shifts, i.e. the sum of Hi x Wi for all levels. Shifts in the valid boxes are assigned their corresponding label in the [0, K-1] range. Shifts in the background are assigned the label "K". Shifts in the ignore areas are assigned a label "-1", i.e. ignore. gt_shifts_deltas (Tensor): Shape (N, R, 4). The last dimension represents ground-truth shift2box transform targets (dl, dt, dr, db) that map each shift to its matched ground-truth box. The values in the tensor are meaningful only when the corresponding shift is labeled as foreground. gt_centerness (Tensor): An float tensor (0, 1) of shape (N, R) whose values in [0, 1] storing ground-truth centerness for each shift. """ gt_classes = [] gt_shifts_deltas = [] gt_centerness = [] for shifts_per_image, targets_per_image in zip(shifts, targets): shifts_over_all_feature_maps = torch.cat(shifts_per_image, dim=0) gt_boxes = targets_per_image.gt_boxes is_in_boxes = self.shift2box_transform.get_deltas( shifts_over_all_feature_maps, gt_boxes.tensor.unsqueeze(1)).min(dim=-1).values > 0 gt_positions_iou = [] candidate_idxs = [] base = 0 for stride, shifts_i in zip(self.fpn_strides, shifts_per_image): gt_positions_iou.append( pairwise_iou( gt_boxes, Boxes( torch.cat(( shifts_i - stride * self.anchor_scale / 2, shifts_i + stride * self.anchor_scale / 2, ), dim=1)))) distances = (gt_boxes.get_centers().unsqueeze(1) - shifts_i).pow_(2).sum(dim=-1).sqrt_() _, topk_idxs = distances.topk(self.atss_topk, dim=1, largest=False) candidate_idxs.append(base + topk_idxs) base += len(shifts_i) gt_positions_iou = torch.cat(gt_positions_iou, dim=1) candidate_idxs = torch.cat(candidate_idxs, dim=1) candidate_ious = gt_positions_iou.gather(1, candidate_idxs) ious_thr = (candidate_ious.mean(dim=1, keepdim=True) + candidate_ious.std(dim=1, keepdim=True)) is_foreground = torch.zeros_like(is_in_boxes).scatter_( 1, candidate_idxs, True) is_foreground &= gt_positions_iou >= ious_thr gt_positions_iou[~is_in_boxes] = -1 gt_positions_iou[~is_foreground] = -1 # if there are still more than one objects for a position, # we choose the one with maximum iou positions_max_iou, gt_matched_idxs = gt_positions_iou.max(dim=0) # ground truth box regression gt_shifts_reg_deltas_i = self.shift2box_transform.get_deltas( shifts_over_all_feature_maps, gt_boxes[gt_matched_idxs].tensor) # ground truth classes has_gt = len(targets_per_image) > 0 if has_gt: gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs] # Shifts with iou -1 are treated as background. 
gt_classes_i[positions_max_iou == -1] = self.num_classes else: gt_classes_i = torch.zeros_like( gt_matched_idxs) + self.num_classes # ground truth centerness left_right = gt_shifts_reg_deltas_i[:, [0, 2]] top_bottom = gt_shifts_reg_deltas_i[:, [1, 3]] gt_centerness_i = torch.sqrt( (left_right.min(dim=-1).values / left_right.max(dim=-1).values).clamp_(min=0) * (top_bottom.min(dim=-1).values / top_bottom.max(dim=-1).values).clamp_(min=0)) gt_classes.append(gt_classes_i) gt_shifts_deltas.append(gt_shifts_reg_deltas_i) gt_centerness.append(gt_centerness_i) return torch.stack(gt_classes), torch.stack( gt_shifts_deltas), torch.stack(gt_centerness)
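# Standalone sketch of the adaptive IoU threshold used above (ATSS-style): the
# per-level closest shifts form the candidate set, and a candidate counts as
# foreground when its IoU with the ground-truth box reaches mean + std of the
# candidate IoUs. The IoU values are made up.
import torch

candidate_ious = torch.tensor([[0.30, 0.55, 0.62, 0.41]])   # one gt, 4 candidates
iou_thr = candidate_ious.mean(dim=1, keepdim=True) + candidate_ious.std(dim=1, keepdim=True)
is_foreground = candidate_ious >= iou_thr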
def annotations_to_instances(annos, image_size, mask_format="polygon"): """ Create an :class:`Instances` object used by the models, from instance annotations in the dataset dict. Args: annos (list[dict]): a list of instance annotations in one image, each element for one instance. image_size (tuple): height, width Returns: Instances: It will contain fields "gt_boxes", "gt_classes", "gt_masks", "gt_keypoints", if they can be obtained from `annos`. This is the format that builtin models expect. """ boxes = [ BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos ] target = Instances(image_size) boxes = target.gt_boxes = Boxes(boxes) boxes.clip(image_size) classes = [obj["category_id"] for obj in annos] classes = torch.tensor(classes, dtype=torch.int64) target.gt_classes = classes if len(annos) and "segmentation" in annos[0]: segms = [obj["segmentation"] for obj in annos] if mask_format == "polygon": masks = PolygonMasks(segms) else: assert mask_format == "bitmask", mask_format masks = [] for segm in segms: if isinstance(segm, list): # polygon masks.append(polygons_to_bitmask(segm, *image_size)) elif isinstance(segm, dict): # COCO RLE masks.append(mask_util.decode(segm)) elif isinstance(segm, np.ndarray): assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format( segm.ndim) # mask array masks.append(segm) else: raise ValueError( "Cannot convert segmentation of type '{}' to BitMasks!" "Supported types are: polygons as list[list[float] or ndarray]," " COCO-style RLE as a dict, or a full-image segmentation mask " "as a 2D ndarray.".format(type(segm))) # torch.from_numpy does not support array with negative stride. masks = BitMasks( torch.stack([ torch.from_numpy(np.ascontiguousarray(x)) for x in masks ])) target.gt_masks = masks if len(annos) and "keypoints" in annos[0]: kpts = np.array([obj.get("keypoints", []) for obj in annos]) # (N, K, 3) # Set all out-of-boundary points to "unlabeled" kpts_xy = kpts[:, :, :2] inside = (kpts_xy >= np.array([0, 0])) & (kpts_xy <= np.array( image_size[::-1])) inside = inside.all(axis=2) kpts[:, :, :2] = kpts_xy kpts[:, :, 2][~inside] = 0 target.gt_keypoints = Keypoints(kpts) return target
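# Hypothetical minimal input for the function above; the field names follow the
# cvpods/detectron2 dataset-dict format, the values are made up, and BoxMode is
# assumed to be imported as elsewhere in this file.
annos = [{
    "bbox": [10.0, 20.0, 30.0, 40.0],   # XYWH, absolute pixels
    "bbox_mode": BoxMode.XYWH_ABS,
    "category_id": 0,
}]
instances = annotations_to_instances(annos, image_size=(480, 640))
# instances.gt_boxes / instances.gt_classes are now populated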
def get_ground_truth(self, shifts, targets, box_cls, box_delta, box_filter): """ Args: shifts (list[list[Tensor]]): a list of N=#image elements. Each is a list of #feature level tensors. The tensors contains shifts of this image on the specific feature level. targets (list[Instances]): a list of N `Instances`s. The i-th `Instances` contains the ground-truth per-instance annotations for the i-th input image. Specify `targets` during training only. Returns: gt_classes (Tensor): An integer tensor of shape (N, R) storing ground-truth labels for each shift. R is the total number of shifts, i.e. the sum of Hi x Wi for all levels. Shifts in the valid boxes are assigned their corresponding label in the [0, K-1] range. Shifts in the background are assigned the label "K". Shifts in the ignore areas are assigned a label "-1", i.e. ignore. gt_shifts_deltas (Tensor): Shape (N, R, 4). The last dimension represents ground-truth shift2box transform targets (dl, dt, dr, db) that map each shift to its matched ground-truth box. The values in the tensor are meaningful only when the corresponding shift is labeled as foreground. """ gt_classes = [] gt_shifts_deltas = [] box_cls = torch.cat([permute_to_N_HWA_K(x, self.num_classes) for x in box_cls], dim=1) box_delta = torch.cat([permute_to_N_HWA_K(x, 4) for x in box_delta], dim=1) box_filter = torch.cat([permute_to_N_HWA_K(x, 1) for x in box_filter], dim=1) box_cls = box_cls.sigmoid_() * box_filter.sigmoid_() num_fg = 0 num_gt = 0 for shifts_per_image, targets_per_image, box_cls_per_image, box_delta_per_image in zip( shifts, targets, box_cls, box_delta): shifts_over_all_feature_maps = torch.cat(shifts_per_image, dim=0) gt_boxes = targets_per_image.gt_boxes prob = box_cls_per_image[:, targets_per_image.gt_classes].t() boxes = self.shift2box_transform.apply_deltas( box_delta_per_image, shifts_over_all_feature_maps ) iou = pairwise_iou(gt_boxes, Boxes(boxes)) quality = prob ** (1 - self.poto_alpha) * iou ** self.poto_alpha deltas = self.shift2box_transform.get_deltas( shifts_over_all_feature_maps, gt_boxes.tensor.unsqueeze(1)) if self.center_sampling_radius > 0: centers = gt_boxes.get_centers() is_in_boxes = [] for stride, shifts_i in zip(self.fpn_strides, shifts_per_image): radius = stride * self.center_sampling_radius center_boxes = torch.cat(( torch.max(centers - radius, gt_boxes.tensor[:, :2]), torch.min(centers + radius, gt_boxes.tensor[:, 2:]), ), dim=-1) center_deltas = self.shift2box_transform.get_deltas( shifts_i, center_boxes.unsqueeze(1)) is_in_boxes.append(center_deltas.min(dim=-1).values > 0) is_in_boxes = torch.cat(is_in_boxes, dim=1) else: # no center sampling, it will use all the locations within a ground-truth box is_in_boxes = deltas.min(dim=-1).values > 0 quality[~is_in_boxes] = -1 gt_idxs, shift_idxs = linear_sum_assignment(quality.cpu().numpy(), maximize=True) num_fg += len(shift_idxs) num_gt += len(targets_per_image) gt_classes_i = shifts_over_all_feature_maps.new_full( (len(shifts_over_all_feature_maps),), self.num_classes, dtype=torch.long ) gt_shifts_reg_deltas_i = shifts_over_all_feature_maps.new_zeros( len(shifts_over_all_feature_maps), 4 ) if len(targets_per_image) > 0: # ground truth classes gt_classes_i[shift_idxs] = targets_per_image.gt_classes[gt_idxs] # ground truth box regression gt_shifts_reg_deltas_i[shift_idxs] = self.shift2box_transform.get_deltas( shifts_over_all_feature_maps[shift_idxs], gt_boxes[gt_idxs].tensor ) gt_classes.append(gt_classes_i) gt_shifts_deltas.append(gt_shifts_reg_deltas_i) 
get_event_storage().put_scalar("num_fg_per_gt", num_fg / num_gt) return torch.stack(gt_classes), torch.stack(gt_shifts_deltas)
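# Toy version of the one-to-one assignment above: quality mixes classification
# probability and IoU via the alpha exponent, and linear_sum_assignment picks
# exactly one location per ground-truth box. The numbers and alpha are made up.
import numpy as np
from scipy.optimize import linear_sum_assignment

prob = np.array([[0.2, 0.9, 0.4],
                 [0.6, 0.1, 0.7]])   # (num_gt, num_locations)
iou = np.array([[0.1, 0.8, 0.3],
                [0.5, 0.2, 0.6]])
alpha = 0.8
quality = prob ** (1 - alpha) * iou ** alpha
gt_idxs, loc_idxs = linear_sum_assignment(quality, maximize=True)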
def __call__(self): # compute box_size m = len(self.feature_map_size) - 1 size_stride = math.floor( (math.floor(self.s_max * 100) - math.floor(self.s_min * 100)) / (m - 1)) bbox_size = [self.conv4_3_scale * self.image_size] bbox_size += [(self.s_min + i * size_stride / 100) * self.image_size for i in range(m)] bbox_size += [1.05 * self.image_size] self.widths = [[] for _ in self.aspect_ratios] self.heights = [[] for _ in self.aspect_ratios] # each a_r denotes the aspect ratios of one feature map for i, a_rs in enumerate(self.aspect_ratios): # ratio = 1 a_r = 1 self.widths[i].append(bbox_size[i] * sqrt(a_r)) self.heights[i].append(bbox_size[i] / sqrt(a_r)) self.widths[i].append( sqrt(bbox_size[i] * bbox_size[i + 1]) * sqrt(a_r)) self.heights[i].append( sqrt(bbox_size[i] * bbox_size[i + 1]) / sqrt(a_r)) # other ratios for a_r in a_rs: self.widths[i].append(bbox_size[i] * sqrt(a_r)) self.heights[i].append(bbox_size[i] / sqrt(a_r)) a_r = 1 / a_r self.widths[i].append(bbox_size[i] * sqrt(a_r)) self.heights[i].append(bbox_size[i] / sqrt(a_r)) # compute center of default boxes self.center_xs = [[] for _ in self.feature_map_size] self.center_ys = [[] for _ in self.feature_map_size] for k, f_k in enumerate(self.feature_map_size): for i, j in product(range(f_k), repeat=2): # bbox center x, y cx = (j + 0.5) / f_k * self.image_size cy = (i + 0.5) / f_k * self.image_size self.center_xs[k].append(cx) self.center_ys[k].append(cy) default_boxes = [] for i, cxs, cys in zip(range(len(self.feature_map_size)), self.center_xs, self.center_ys): one_feature_map_boxes = [] widths = self.widths[i] heights = self.heights[i] for cx, cy in zip(cxs, cys): for w, h in zip(widths, heights): (xmin, ymin, xmax, ymax) = cx - 0.5 * \ w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h one_feature_map_boxes.append([xmin, ymin, xmax, ymax]) one_feature_map_boxes = torch.tensor(one_feature_map_boxes, device=self.device) if self.clip: one_feature_map_boxes = one_feature_map_boxes.clamp_( max=self.image_size, min=0) default_boxes.append(Boxes(one_feature_map_boxes)) return default_boxes
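# Rough standalone sketch of the per-level default-box sizes computed above
# (SSD-style): scales are (approximately) evenly spaced between s_min and s_max,
# and each aspect ratio a contributes a (w, h) = (s*sqrt(a), s/sqrt(a)) pair.
# The integer-percent flooring of the original is omitted here; values are made up.
from math import sqrt

image_size, s_min, s_max, m = 300, 0.2, 0.9, 5
scales = [s_min + i * (s_max - s_min) / (m - 1) for i in range(m)]
sizes = [(s * image_size * sqrt(a), s * image_size / sqrt(a))
         for s in scales for a in (1.0, 2.0, 0.5)]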
def get_aux_ground_truth(self, shifts, targets, box_cls, box_delta): """ Args: shifts (list[list[Tensor]]): a list of N=#image elements. Each is a list of #feature level tensors. The tensors contains shifts of this image on the specific feature level. targets (list[Instances]): a list of N `Instances`s. The i-th `Instances` contains the ground-truth per-instance annotations for the i-th input image. Specify `targets` during training only. Returns: gt_classes (Tensor): An integer tensor of shape (N, R) storing ground-truth labels for each shift. R is the total number of shifts, i.e. the sum of Hi x Wi for all levels. Shifts in the valid boxes are assigned their corresponding label in the [0, K-1] range. Shifts in the background are assigned the label "K". Shifts in the ignore areas are assigned a label "-1", i.e. ignore. """ gt_classes = [] box_cls = torch.cat([permute_to_N_HWA_K(x, self.num_classes) for x in box_cls], dim=1) box_delta = torch.cat([permute_to_N_HWA_K(x, 4) for x in box_delta], dim=1) box_cls = box_cls.sigmoid_() num_fg = 0 num_gt = 0 for shifts_per_image, targets_per_image, box_cls_per_image, box_delta_per_image in zip( shifts, targets, box_cls, box_delta): shifts_over_all_feature_maps = torch.cat(shifts_per_image, dim=0) gt_boxes = targets_per_image.gt_boxes prob = box_cls_per_image[:, targets_per_image.gt_classes].t() boxes = self.shift2box_transform.apply_deltas( box_delta_per_image, shifts_over_all_feature_maps ) iou = pairwise_iou(gt_boxes, Boxes(boxes)) quality = prob ** (1 - self.poto_alpha) * iou ** self.poto_alpha candidate_idxs = [] st, ed = 0, 0 for shifts_i in shifts_per_image: ed += len(shifts_i) _, topk_idxs = quality[:, st:ed].topk(self.poto_aux_topk, dim=1) candidate_idxs.append(st + topk_idxs) st = ed candidate_idxs = torch.cat(candidate_idxs, dim=1) is_in_boxes = self.shift2box_transform.get_deltas( shifts_over_all_feature_maps, gt_boxes.tensor.unsqueeze(1) ).min(dim=-1).values > 0 candidate_qualities = quality.gather(1, candidate_idxs) quality_thr = candidate_qualities.mean(dim=1, keepdim=True) + \ candidate_qualities.std(dim=1, keepdim=True) is_foreground = torch.zeros_like(is_in_boxes).scatter_(1, candidate_idxs, True) is_foreground &= quality >= quality_thr quality[~is_in_boxes] = -1 quality[~is_foreground] = -1 # if there are still more than one objects for a position, # we choose the one with maximum quality positions_max_quality, gt_matched_idxs = quality.max(dim=0) num_fg += (positions_max_quality != -1).sum().item() num_gt += len(targets_per_image) # ground truth classes has_gt = len(targets_per_image) > 0 if has_gt: gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs] # Shifts with quality -1 are treated as background. gt_classes_i[positions_max_quality == -1] = self.num_classes else: gt_classes_i = torch.zeros_like( gt_matched_idxs) + self.num_classes gt_classes.append(gt_classes_i) get_event_storage().put_scalar("num_fg_per_gt_aux", num_fg / num_gt) return torch.stack(gt_classes)
def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper` . Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: * image: Tensor, image in (C, H, W) format. * instances: Instances Other information that's included in the original dicts, such as: * "height", "width" (int): the output resolution of the model, used in inference. See :meth:`postprocess` for details. Returns: dict[str: Tensor]: mapping from a named loss to a tensor storing the loss. Used during training only. """ images, labels = self.preprocess_image(batched_inputs, self.training) # batched_inputs[0]['image'] = images.tensor[0].cpu() * 255 # self.visualize_data(batched_inputs[0]) x = images.tensor img_size = x.shape[-2:] def _branch(_embedding, _in): for i, e in enumerate(_embedding): _in = e(_in) if i == 4: out_branch = _in return _in, out_branch # backbone # x2, x1, x0 = self.backbone(x) out_features = self.backbone(x) features = [out_features[f] for f in self.in_features] [x2, x1, x0] = features # yolo branch 0 out0, out0_branch = _branch(self.out0, x0) # yolo branch 1 x1_in = self.out1_cbl(out0_branch) x1_in = self.out1_upsample(x1_in) x1_in = torch.cat([x1_in, x1], 1) out1, out1_branch = _branch(self.out1, x1_in) # yolo branch 2 x2_in = self.out2_cbl(out1_branch) x2_in = self.out2_upsample(x2_in) x2_in = torch.cat([x2_in, x2], 1) out2, out2_branch = _branch(self.out2, x2_in) outputs = [out0, out1, out2] if self.training: losses = [ loss_evaluator(out, labels, img_size) for out, loss_evaluator in zip( outputs, self.loss_evaluators) ] keys = ["loss_x", "loss_y", "loss_w", "loss_h", "loss_conf", "loss_cls"] losses_dict = {} for key in keys: losses_dict[key] = sum([loss[key] for loss in losses]) return losses_dict else: predictions_list = [loss_evaluator(out, labels, img_size) for out, loss_evaluator in zip(outputs, self.loss_evaluators)] predictions = torch.cat(predictions_list, 1) detections = postprocess(predictions, self.num_classes, self.conf_threshold, self.nms_threshold, nms_type=self.nms_type) results = [] for idx, out in enumerate(detections): if out is None: out = x.new_zeros((0, 7)) # image_size = images.image_sizes[idx] image_size = img_size result = Instances(image_size) result.pred_boxes = Boxes(out[:, :4]) result.scores = out[:, 5] * out[:, 4] result.pred_classes = out[:, -1] results.append(result) processed_results = [] for results_per_image, input_per_image, image_size in zip( results, batched_inputs, images.image_sizes): height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) r = detector_postprocess(results_per_image, height, width) processed_results.append({"instances": r}) return processed_results
def losses(self, anchors, gt_instances, box_cls, box_delta): anchors = [Boxes.cat(anchors_i) for anchors_i in anchors] box_cls_flattened = [ permute_to_N_HWA_K(x, self.num_classes) for x in box_cls ] box_delta_flattened = [permute_to_N_HWA_K(x, 4) for x in box_delta] pred_class_logits = cat(box_cls_flattened, dim=1) pred_anchor_deltas = cat(box_delta_flattened, dim=1) pred_class_probs = pred_class_logits.sigmoid() pred_box_probs = [] num_foreground = 0 positive_losses = [] for anchors_per_image, \ gt_instances_per_image, \ pred_class_probs_per_image, \ pred_anchor_deltas_per_image in zip( anchors, gt_instances, pred_class_probs, pred_anchor_deltas): gt_classes_per_image = gt_instances_per_image.gt_classes with torch.no_grad(): # predicted_boxes_per_image: a_{j}^{loc}, shape: [j, 4] predicted_boxes_per_image = self.box2box_transform.apply_deltas( pred_anchor_deltas_per_image, anchors_per_image.tensor) # gt_pred_iou: IoU_{ij}^{loc}, shape: [i, j] gt_pred_iou = pairwise_iou(gt_instances_per_image.gt_boxes, Boxes(predicted_boxes_per_image)) t1 = self.bbox_threshold t2 = gt_pred_iou.max(dim=1, keepdim=True).values.clamp_( min=t1 + torch.finfo(torch.float32).eps) # gt_pred_prob: P{a_{j} -> b_{i}}, shape: [i, j] gt_pred_prob = ((gt_pred_iou - t1) / (t2 - t1)).clamp_(min=0, max=1) # pred_box_prob_per_image: P{a_{j} \in A_{+}}, shape: [j, c] nonzero_idxs = torch.nonzero(gt_pred_prob, as_tuple=True) pred_box_prob_per_image = torch.zeros_like( pred_class_probs_per_image) pred_box_prob_per_image[nonzero_idxs[1], gt_classes_per_image[nonzero_idxs[0]]] \ = gt_pred_prob[nonzero_idxs] pred_box_probs.append(pred_box_prob_per_image) # construct bags for objects match_quality_matrix = pairwise_iou( gt_instances_per_image.gt_boxes, anchors_per_image) _, foreground_idxs = torch.topk(match_quality_matrix, self.pos_anchor_topk, dim=1, sorted=False) # matched_pred_class_probs_per_image: P_{ij}^{cls} matched_pred_class_probs_per_image = torch.gather( pred_class_probs_per_image[foreground_idxs], 2, gt_classes_per_image.view(-1, 1, 1).repeat(1, self.pos_anchor_topk, 1)).squeeze(2) # matched_gt_anchor_deltas_per_image: P_{ij}^{loc} matched_gt_anchor_deltas_per_image = self.box2box_transform.get_deltas( anchors_per_image.tensor[foreground_idxs], gt_instances_per_image.gt_boxes.tensor.unsqueeze(1)) loss_box_reg = smooth_l1_loss( pred_anchor_deltas_per_image[foreground_idxs], matched_gt_anchor_deltas_per_image, beta=self.smooth_l1_loss_beta, reduction="none").sum(dim=-1) * self.reg_weight matched_pred_reg_probs_per_image = (-loss_box_reg).exp() # positive_losses: { -log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) ) } num_foreground += len(gt_instances_per_image) positive_losses.append( positive_bag_loss(matched_pred_class_probs_per_image * matched_pred_reg_probs_per_image, dim=1)) # positive_loss: \sum_{i}{ -log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) ) } / ||B|| positive_loss = torch.cat(positive_losses).sum() / max( 1, num_foreground) # pred_box_probs: P{a_{j} \in A_{+}} pred_box_probs = torch.stack(pred_box_probs, dim=0) # negative_loss: \sum_{j}{ FL( (1 - P{a_{j} \in A_{+}}) * (1 - P_{j}^{bg}) ) } / n||B|| negative_loss = negative_bag_loss( pred_class_probs * (1 - pred_box_probs), self.focal_loss_gamma).sum() / max( 1, num_foreground * self.pos_anchor_topk) loss_pos = positive_loss * self.focal_loss_alpha loss_neg = negative_loss * (1 - self.focal_loss_alpha) return {"loss_pos": loss_pos, "loss_neg": loss_neg}
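# Standalone sketch of the saturated linear mapping used above:
# P(a_j -> b_i) = clamp((IoU_ij - t1) / (t2_i - t1), 0, 1), where t2_i is the best
# IoU achieved for ground-truth box i. The IoU values and t1 are made up.
import torch

gt_pred_iou = torch.tensor([[0.30, 0.55, 0.80]])   # one gt, 3 anchors
t1 = 0.5
t2 = gt_pred_iou.max(dim=1, keepdim=True).values.clamp_(min=t1 + 1e-7)
gt_pred_prob = ((gt_pred_iou - t1) / (t2 - t1)).clamp_(min=0, max=1)
# gt_pred_prob == tensor([[0.0000, 0.1667, 1.0000]])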