def annotations_to_instances(annos, image_size):
    """
    Create an :class:`Instances` object used by the models,
    from instance annotations in the dataset dict.

    Args:
        annos (list[dict]): a list of instance annotations in one image, each
            element for one instance.
        image_size (tuple): height, width

    Returns:
        Instances: It will contain fields "gt_boxes", "gt_classes",
            if they can be obtained from `annos`.
            This is the format that builtin models expect.
    """
    boxes = [
        BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos
    ]
    target = Instances(image_size)
    boxes = target.gt_boxes = Boxes(boxes)
    boxes.clip(image_size)

    classes = [obj["category_id"] for obj in annos]
    classes = torch.tensor(classes, dtype=torch.int64)
    target.gt_classes = classes

    return target
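
# --- Hedged usage sketch (not from the original source). It assumes the function
# above lives in a module where `torch`, `Boxes`, `BoxMode`, and `Instances` are
# already imported; the toy COCO-style annotation dicts below are illustrative.
import torch
from detectron2.structures import Boxes, BoxMode, Instances

annos = [
    {"bbox": [10.0, 20.0, 30.0, 40.0], "bbox_mode": BoxMode.XYWH_ABS, "category_id": 0},
    {"bbox": [50.0, 60.0, 20.0, 20.0], "bbox_mode": BoxMode.XYWH_ABS, "category_id": 2},
]
instances = annotations_to_instances(annos, image_size=(480, 640))
assert instances.gt_boxes.tensor.shape == (2, 4)
assert instances.gt_classes.tolist() == [0, 2]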
def get_ground_truth(self, anchors, targets):
    """
    Args:
        anchors (list[list[Boxes]]): a list of N=#image elements. Each is a list of
            #feature level Boxes. The Boxes contains anchors of this image on the
            specific feature level.
        targets (list[Instances]): a list of N `Instances`s. The i-th `Instances`
            contains the ground-truth per-instance annotations for the i-th input
            image. Specify `targets` during training only.

    Returns:
        gt_classes (Tensor): An integer tensor of shape (N, R) storing ground-truth
            labels for each anchor. R is the total number of anchors, i.e. the sum of
            Hi x Wi x A for all levels. Anchors with an IoU with some target higher
            than the foreground threshold are assigned their corresponding label in
            the [0, K-1] range. Anchors whose IoU is below the background threshold
            are assigned the label "K". Anchors whose IoU is between the foreground
            and background thresholds are assigned the label "-1", i.e. ignore.
        gt_anchors_deltas (Tensor): Shape (N, R, 4). The last dimension represents
            ground-truth box2box transform targets (dx, dy, dw, dh) that map each
            anchor to its matched ground-truth box. The values in the tensor are
            meaningful only when the corresponding anchor is labeled as foreground.
    """
    gt_classes = []
    gt_anchors_deltas = []
    anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]  # list[Tensor(R, 4)], one for each image

    for anchors_per_image, targets_per_image in zip(anchors, targets):
        match_quality_matrix = pairwise_iou(targets_per_image.gt_boxes, anchors_per_image)
        gt_matched_idxs, anchor_labels = self.matcher(match_quality_matrix)

        # ground truth box regression
        matched_gt_boxes = targets_per_image[gt_matched_idxs].gt_boxes
        gt_anchors_reg_deltas_i = self.box2box_transform.get_deltas(
            anchors_per_image.tensor, matched_gt_boxes.tensor
        )

        # ground truth classes
        has_gt = len(targets_per_image) > 0
        if has_gt:
            gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs]
            # Anchors with label 0 are treated as background.
            gt_classes_i[anchor_labels == 0] = self.num_classes
            # Anchors with label -1 are ignored.
            gt_classes_i[anchor_labels == -1] = -1
        else:
            gt_classes_i = torch.zeros_like(gt_matched_idxs) + self.num_classes

        gt_classes.append(gt_classes_i)
        gt_anchors_deltas.append(gt_anchors_reg_deltas_i)

    return torch.stack(gt_classes), torch.stack(gt_anchors_deltas)
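
# --- Hedged sketch (not from the original source) of the label-assignment pieces
# `get_ground_truth` relies on: `Matcher` turns a gt-vs-anchor IoU matrix into
# matched gt indices plus {-1, 0, 1} anchor labels, and `Box2BoxTransform.get_deltas`
# produces the (dx, dy, dw, dh) regression targets. The thresholds and weights below
# are illustrative assumptions, not values taken from this code.
import torch
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.matcher import Matcher
from detectron2.structures import Boxes, pairwise_iou

gt = Boxes(torch.tensor([[10.0, 10.0, 50.0, 50.0]]))
anchors_ex = Boxes(torch.tensor([[12.0, 12.0, 48.0, 48.0], [200.0, 200.0, 240.0, 240.0]]))

matcher = Matcher([0.4, 0.5], [0, -1, 1], allow_low_quality_matches=True)
matched_idxs, anchor_labels = matcher(pairwise_iou(gt, anchors_ex))  # labels: [1, 0]

box2box = Box2BoxTransform(weights=(1.0, 1.0, 1.0, 1.0))
deltas = box2box.get_deltas(anchors_ex.tensor, gt[matched_idxs].tensor)  # (2, 4) targets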
def transform_proposals(dataset_dict, image_shape, transforms, min_box_side_len, proposal_topk):
    """
    Apply transformations to the proposals in dataset_dict, if any.

    Args:
        dataset_dict (dict): a dict read from the dataset, possibly contains fields
            "proposal_boxes", "proposal_objectness_logits", "proposal_bbox_mode"
        image_shape (tuple): height, width
        transforms (TransformList):
        min_box_side_len (int): keep proposals with at least this size
        proposal_topk (int): only keep top-K scoring proposals

    The input dict is modified in-place, with abovementioned keys removed. A new
    key "proposals" will be added. Its value is an `Instances` object which contains
    the transformed proposals in its field "proposal_boxes" and "objectness_logits".
    """
    if "proposal_boxes" in dataset_dict:
        # Transform proposal boxes
        boxes = transforms.apply_box(
            BoxMode.convert(
                dataset_dict.pop("proposal_boxes"),
                dataset_dict.pop("proposal_bbox_mode"),
                BoxMode.XYXY_ABS,
            )
        )
        boxes = Boxes(boxes)
        objectness_logits = torch.as_tensor(
            dataset_dict.pop("proposal_objectness_logits").astype("float32")
        )

        boxes.clip(image_shape)
        keep = boxes.nonempty(threshold=min_box_side_len)
        boxes = boxes[keep]
        objectness_logits = objectness_logits[keep]

        proposals = Instances(image_shape)
        proposals.proposal_boxes = boxes[:proposal_topk]
        proposals.objectness_logits = objectness_logits[:proposal_topk]
        dataset_dict["proposals"] = proposals
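
# --- Hedged usage sketch (not from the original source). The precomputed-proposal
# keys follow the format documented above; `NoOpTransform` merely stands in for the
# TransformList a real augmentation pipeline would produce.
import numpy as np
from fvcore.transforms.transform import NoOpTransform, TransformList
from detectron2.structures import BoxMode

dataset_dict = {
    "proposal_boxes": np.array([[10.0, 10.0, 40.0, 40.0], [0.0, 0.0, 1.0, 1.0]]),
    "proposal_objectness_logits": np.array([2.5, -1.0]),
    "proposal_bbox_mode": BoxMode.XYXY_ABS,
}
transform_proposals(
    dataset_dict,
    image_shape=(480, 640),
    transforms=TransformList([NoOpTransform()]),
    min_box_side_len=5,  # drops the 1x1 box
    proposal_topk=100,
)
print(dataset_dict["proposals"])  # Instances with "proposal_boxes" and "objectness_logits"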
def _get_ground_truth(self):
    """
    Returns:
        gt_objectness_logits: list of N tensors. Tensor i is a vector whose length is
            the total number of anchors in image i (i.e., len(anchors[i])). Label values
            are in {-1, 0, 1}, with meanings: -1 = ignore; 0 = negative class; 1 = positive class.
        gt_anchor_deltas: list of N tensors. Tensor i has shape (len(anchors[i]), 4).
    """
    gt_objectness_logits = []
    gt_anchor_deltas = []
    # Concatenate anchors from all feature maps into a single Boxes per image
    anchors = [Boxes.cat(anchors_i) for anchors_i in self.anchors]
    for image_size_i, anchors_i, gt_boxes_i in zip(self.image_sizes, anchors, self.gt_boxes):
        """
        image_size_i: (h, w) for the i-th image
        anchors_i: anchors for i-th image
        gt_boxes_i: ground-truth boxes for i-th image
        """
        match_quality_matrix = pairwise_iou(gt_boxes_i, anchors_i)
        matched_idxs, gt_objectness_logits_i = self.anchor_matcher(match_quality_matrix)

        if self.boundary_threshold >= 0:
            # Discard anchors that go out of the boundaries of the image
            # NOTE: This is legacy functionality that is turned off by default in Detectron2
            anchors_inside_image = anchors_i.inside_box(image_size_i, self.boundary_threshold)
            gt_objectness_logits_i[~anchors_inside_image] = -1

        if len(gt_boxes_i) == 0:
            # These values won't be used anyway since the anchor is labeled as background
            gt_anchor_deltas_i = torch.zeros_like(anchors_i.tensor)
        else:
            # TODO wasted computation for ignored boxes
            matched_gt_boxes = gt_boxes_i[matched_idxs]
            gt_anchor_deltas_i = self.box2box_transform.get_deltas(
                anchors_i.tensor, matched_gt_boxes.tensor
            )

        gt_objectness_logits.append(gt_objectness_logits_i)
        gt_anchor_deltas.append(gt_anchor_deltas_i)

    return gt_objectness_logits, gt_anchor_deltas
def create_instances(predictions, image_size):
    ret = Instances(image_size)

    score = np.asarray([x["score"] for x in predictions])
    chosen = (score > args.conf_threshold).nonzero()[0]
    score = score[chosen]
    bbox = np.asarray([predictions[i]["bbox"] for i in chosen])
    bbox = BoxMode.convert(bbox, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)

    labels = np.asarray([dataset_id_map(predictions[i]["category_id"]) for i in chosen])

    ret.scores = score
    ret.pred_boxes = Boxes(bbox)
    ret.pred_classes = labels

    try:
        ret.pred_masks = [predictions[i]["segmentation"] for i in chosen]
    except KeyError:
        pass
    return ret
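
# --- Hedged sketch (not from the original source). `create_instances` reads the
# module-level `args.conf_threshold` and `dataset_id_map` globals; the stand-ins
# below are assumptions that make the call self-contained.
import argparse

args = argparse.Namespace(conf_threshold=0.5)

def dataset_id_map(ds_id):
    return ds_id  # identity stand-in; real scripts map dataset ids to contiguous ids

predictions = [
    {"score": 0.9, "bbox": [10.0, 10.0, 20.0, 30.0], "category_id": 1},
    {"score": 0.1, "bbox": [0.0, 0.0, 5.0, 5.0], "category_id": 2},  # below conf_threshold
]
inst = create_instances(predictions, image_size=(480, 640))
assert len(inst) == 1 and inst.pred_classes[0] == 1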
def forward(self, features):
    """
    Args:
        features (list[Tensor]): list of backbone feature maps on which to generate anchors.

    Returns:
        list[list[Boxes]]: a list of #image elements. Each is a list of #feature level Boxes.
            The Boxes contains anchors of this image on the specific feature level.
    """
    num_images = len(features[0])
    grid_sizes = [feature_map.shape[-2:] for feature_map in features]
    anchors_over_all_feature_maps = self.grid_anchors(grid_sizes)

    anchors_in_image = []
    for anchors_per_feature_map in anchors_over_all_feature_maps:
        boxes = Boxes(anchors_per_feature_map)
        anchors_in_image.append(boxes)

    anchors = [copy.deepcopy(anchors_in_image) for _ in range(num_images)]
    return anchors
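
# --- Hedged sketch (not from the original source): how downstream code typically
# consumes the nested list[list[Boxes]] that `forward` returns, i.e. one Boxes per
# feature level per image, flattened with `Boxes.cat` into a single Boxes per image.
# The random tensors below merely stand in for real grid anchors.
import torch
from detectron2.structures import Boxes

anchors = [
    [Boxes(torch.rand(6, 4)), Boxes(torch.rand(3, 4))]  # one image: 2 feature levels
    for _ in range(2)                                    # 2 images
]
anchors_per_image = [Boxes.cat(a) for a in anchors]
assert len(anchors_per_image) == 2 and len(anchors_per_image[0]) == 9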
def fast_rcnn_inference_single_image(boxes, scores, image_shape, score_thresh, nms_thresh, topk_per_image):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Args:
        Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes
        per image.

    Returns:
        Same as `fast_rcnn_inference`, but for only one image.
    """
    scores = scores[:, :-1]
    num_bbox_reg_classes = boxes.shape[1] // 4
    # Convert to Boxes to use the `clip` function ...
    boxes = Boxes(boxes.reshape(-1, 4))
    boxes.clip(image_shape)
    boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4)  # R x C x 4

    # Filter results based on detection scores
    filter_mask = scores > score_thresh  # R x K
    # R' x 2. First column contains indices of the R predictions;
    # Second column contains indices of classes.
    filter_inds = filter_mask.nonzero()
    if num_bbox_reg_classes == 1:
        boxes = boxes[filter_inds[:, 0], 0]
    else:
        boxes = boxes[filter_mask]
    scores = scores[filter_mask]

    # Apply per-class NMS
    keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh)
    if topk_per_image >= 0:
        keep = keep[:topk_per_image]
    boxes, scores, filter_inds = boxes[keep], scores[keep], filter_inds[keep]

    result = Instances(image_shape)
    result.pred_boxes = Boxes(boxes)
    result.scores = scores
    result.pred_classes = filter_inds[:, 1]
    return result, filter_inds[:, 0]
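
# --- Hedged sketch (not from the original source): calling the function above with
# random inputs for R = 5 proposals and K = 3 foreground classes (scores carry K + 1
# columns, the last one being background). Thresholds are arbitrary.
import torch

R, K = 5, 3
xy = torch.rand(R, 2) * 100
boxes_in = torch.cat([xy, xy + 50], dim=1).repeat(1, K)  # R x (K * 4) class-specific boxes
scores_in = torch.rand(R, K + 1)
result, kept_rows = fast_rcnn_inference_single_image(
    boxes_in, scores_in, image_shape=(200, 200),
    score_thresh=0.05, nms_thresh=0.5, topk_per_image=100,
)
print(len(result), result.pred_classes)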
def label_and_sample_proposals(self, proposals, targets):
    """
    Prepare some proposals to be used to train the ROI heads.
    It performs box matching between `proposals` and `targets`, and assigns
    training labels to the proposals.
    It returns `self.batch_size_per_image` random samples from proposals and
    ground-truth boxes, with a fraction of positives that is no larger than
    `self.positive_sample_fraction`.

    Args:
        See :meth:`ROIHeads.forward`

    Returns:
        list[Instances]: length `N` list of `Instances`s containing the proposals
            sampled for training. Each `Instances` has the following fields:
            - proposal_boxes: the proposal boxes
            - gt_boxes: the ground-truth box that the proposal is assigned to
              (this is only meaningful if the proposal has a label > 0; if label = 0
              then the ground-truth box is random)
            Other fields such as "gt_classes" that are included in `targets`.
    """
    gt_boxes = [x.gt_boxes for x in targets]
    # Augment proposals with ground-truth boxes.
    # In the case of learned proposals (e.g., RPN), when training starts
    # the proposals will be low quality due to random initialization.
    # It's possible that none of these initial
    # proposals have high enough overlap with the gt objects to be used
    # as positive examples for the second stage components (box head,
    # cls head). Adding the gt boxes to the set of proposals
    # ensures that the second stage components will have some positive
    # examples from the start of training. For RPN, this augmentation improves
    # convergence and empirically improves box AP on COCO by about 0.5
    # points (under one tested configuration).
    if self.proposal_append_gt:
        proposals = add_ground_truth_to_proposals(gt_boxes, proposals)

    proposals_with_gt = []

    num_fg_samples = []
    num_bg_samples = []
    for proposals_per_image, targets_per_image in zip(proposals, targets):
        has_gt = len(targets_per_image) > 0
        match_quality_matrix = pairwise_iou(
            targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
        )
        matched_idxs, matched_labels = self.proposal_matcher(match_quality_matrix)
        sampled_idxs, gt_classes = self._sample_proposals(
            matched_idxs, matched_labels, targets_per_image.gt_classes
        )

        # Set target attributes of the sampled proposals:
        proposals_per_image = proposals_per_image[sampled_idxs]
        proposals_per_image.gt_classes = gt_classes

        # We index all the attributes of targets that start with "gt_"
        # and have not been added to proposals yet (other than "gt_classes", set above).
        if has_gt:
            sampled_targets = matched_idxs[sampled_idxs]
            # NOTE: here the indexing wastes some compute, because heads
            # will filter the proposals again (by foreground/background,
            # etc), so we essentially index the data twice.
            for (trg_name, trg_value) in targets_per_image.get_fields().items():
                if trg_name.startswith("gt_") and not proposals_per_image.has(trg_name):
                    proposals_per_image.set(trg_name, trg_value[sampled_targets])
        else:
            gt_boxes = Boxes(
                targets_per_image.gt_boxes.tensor.new_zeros((len(sampled_idxs), 4))
            )
            proposals_per_image.gt_boxes = gt_boxes

        num_bg_samples.append((gt_classes == self.num_classes).sum().item())
        num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1])
        proposals_with_gt.append(proposals_per_image)

    # Log the number of fg/bg samples that are selected for training ROI heads
    storage = get_event_storage()
    storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples))
    storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples))

    return proposals_with_gt
def find_top_rpn_proposals(
    proposals,
    pred_objectness_logits,
    images,
    nms_thresh,
    pre_nms_topk,
    post_nms_topk,
    min_box_side_len,
    training,
):
    """
    For each feature map, select the `pre_nms_topk` highest scoring proposals,
    apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk`
    highest scoring proposals among all the feature maps if `training` is True,
    otherwise, returns the highest `post_nms_topk` scoring proposals for each
    feature map.

    Args:
        proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 4).
            All proposal predictions on the feature maps.
        pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A).
        images (ImageList): Input images as an :class:`ImageList`.
        nms_thresh (float): IoU threshold to use for NMS
        pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS.
            When RPN is run on multiple feature maps (as in FPN) this number is per
            feature map.
        post_nms_topk (int): number of top k scoring proposals to keep after applying NMS.
            When RPN is run on multiple feature maps (as in FPN) this number is total,
            over all feature maps.
        min_box_side_len (float): minimum proposal box side length in pixels
            (absolute units wrt input images).
        training (bool): True if proposals are to be used in training, otherwise False.
            This arg exists only to support a legacy bug; look for the
            "NB: Legacy bug ..." comment.

    Returns:
        proposals (list[Instances]): list of N Instances. The i-th Instances
            stores post_nms_topk object proposals for image i.
    """
    image_sizes = images.image_sizes  # in (h, w) order
    num_images = len(image_sizes)
    device = proposals[0].device

    # 1. Select top-k anchor for every level and every image
    topk_scores = []  # #lvl Tensor, each of shape N x topk
    topk_proposals = []
    level_ids = []  # #lvl Tensor, each of shape (topk,)
    batch_idx = torch.arange(num_images, device=device)
    for level_id, proposals_i, logits_i in zip(
        itertools.count(), proposals, pred_objectness_logits
    ):
        Hi_Wi_A = logits_i.shape[1]
        num_proposals_i = min(pre_nms_topk, Hi_Wi_A)

        # sort is faster than topk (https://github.com/pytorch/pytorch/issues/22812)
        # topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
        logits_i, idx = logits_i.sort(descending=True, dim=1)
        topk_scores_i = logits_i[batch_idx, :num_proposals_i]
        topk_idx = idx[batch_idx, :num_proposals_i]

        # each is N x topk
        topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx]  # N x topk x 4

        topk_proposals.append(topk_proposals_i)
        topk_scores.append(topk_scores_i)
        level_ids.append(
            torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device)
        )

    # 2. Concat all levels together
    topk_scores = cat(topk_scores, dim=1)
    topk_proposals = cat(topk_proposals, dim=1)
    level_ids = cat(level_ids, dim=0)

    # 3. For each image, run a per-level NMS, and choose topk results.
    results = []
    for n, image_size in enumerate(image_sizes):
        boxes = Boxes(topk_proposals[n])
        scores_per_img = topk_scores[n]
        boxes.clip(image_size)

        # filter empty boxes
        keep = boxes.nonempty(threshold=min_box_side_len)
        lvl = level_ids
        if keep.sum().item() != len(boxes):
            boxes, scores_per_img, lvl = boxes[keep], scores_per_img[keep], level_ids[keep]

        keep = batched_nms(boxes.tensor, scores_per_img, lvl, nms_thresh)
        # In Detectron1, there was different behavior during training vs. testing.
        # (https://github.com/facebookresearch/Detectron/issues/459)
        # During training, topk is over the proposals from *all* images in the training batch.
        # During testing, it is over the proposals for each image separately.
        # As a result, the training behavior becomes batch-dependent,
        # and the configuration "POST_NMS_TOPK_TRAIN" ends up relying on the batch size.
        # This bug is addressed in Detectron2 to make the behavior independent of batch size.
        keep = keep[:post_nms_topk]

        res = Instances(image_size)
        res.proposal_boxes = boxes[keep]
        res.objectness_logits = scores_per_img[keep]
        results.append(res)

    return results
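
# --- Hedged usage sketch (not from the original source): a single image and a single
# feature level with random, well-formed XYXY proposals. `ImageList.from_tensors` is
# used here only to supply the (h, w) image sizes the function reads.
import torch
from detectron2.structures import ImageList

images = ImageList.from_tensors([torch.zeros(3, 200, 300)])
xy = torch.rand(1, 50, 2) * 150
rpn_proposals = [torch.cat([xy, xy + 40], dim=-1)]  # N=1 image, Hi*Wi*A=50 boxes
rpn_logits = [torch.randn(1, 50)]
results = find_top_rpn_proposals(
    rpn_proposals, rpn_logits, images,
    nms_thresh=0.7, pre_nms_topk=30, post_nms_topk=10,
    min_box_side_len=0, training=False,
)
print(results[0])  # Instances with "proposal_boxes" and "objectness_logits"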
def inference_single_image(self, box_cls, box_delta, anchors, image_size):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Arguments:
        box_cls (list[Tensor]): list of #feature levels. Each entry contains
            tensor of size (H x W x A, K)
        box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4.
        anchors (list[Boxes]): list of #feature levels. Each entry contains
            a Boxes object, which contains all the anchors for that
            image in that feature level.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    boxes_all = []
    scores_all = []
    class_idxs_all = []

    # Iterate over every feature level
    for box_cls_i, box_reg_i, anchors_i in zip(box_cls, box_delta, anchors):
        # (HxWxAxK,)
        box_cls_i = box_cls_i.flatten().sigmoid_()

        # Keep the top k scoring indices only.
        num_topk = min(self.topk_candidates, box_reg_i.size(0))
        # torch.sort is actually faster than .topk (at least on GPUs)
        predicted_prob, topk_idxs = box_cls_i.sort(descending=True)
        predicted_prob = predicted_prob[:num_topk]
        topk_idxs = topk_idxs[:num_topk]

        # filter out the proposals with low confidence score
        keep_idxs = predicted_prob > self.score_threshold
        predicted_prob = predicted_prob[keep_idxs]
        topk_idxs = topk_idxs[keep_idxs]

        anchor_idxs = topk_idxs // self.num_classes
        classes_idxs = topk_idxs % self.num_classes

        box_reg_i = box_reg_i[anchor_idxs]
        anchors_i = anchors_i[anchor_idxs]
        # predict boxes
        predicted_boxes = self.box2box_transform.apply_deltas(box_reg_i, anchors_i.tensor)

        boxes_all.append(predicted_boxes)
        scores_all.append(predicted_prob)
        class_idxs_all.append(classes_idxs)

    boxes_all, scores_all, class_idxs_all = [
        cat(x) for x in [boxes_all, scores_all, class_idxs_all]
    ]
    keep = batched_nms(boxes_all, scores_all, class_idxs_all, self.nms_threshold)
    keep = keep[: self.max_detections_per_image]

    result = Instances(image_size)
    result.pred_boxes = Boxes(boxes_all[keep])
    result.scores = scores_all[keep]
    result.pred_classes = class_idxs_all[keep]
    return result
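
# --- Hedged sketch (not from the original source): driving `inference_single_image`
# with a stand-in `self` built from SimpleNamespace, one feature level, and K = 2
# classes. All thresholds, weights, and tensors below are illustrative assumptions.
import torch
from types import SimpleNamespace
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.structures import Boxes

K = 2
fake_self = SimpleNamespace(
    topk_candidates=100,
    score_threshold=0.05,
    num_classes=K,
    box2box_transform=Box2BoxTransform(weights=(1.0, 1.0, 1.0, 1.0)),
    nms_threshold=0.5,
    max_detections_per_image=10,
)
level_anchors = [Boxes(torch.tensor([[0.0, 0.0, 32.0, 32.0], [16.0, 16.0, 64.0, 64.0]]))]
level_cls = [torch.randn(2, K)]          # (H*W*A, K) classification logits for one level
level_delta = [torch.randn(2, 4) * 0.1]  # (H*W*A, 4) regression deltas
result = inference_single_image(fake_self, level_cls, level_delta, level_anchors, image_size=(128, 128))
print(result.pred_boxes, result.scores, result.pred_classes)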