def _log_accuracy(self):
    """
    Log the accuracy metrics to EventStorage.
    """
    num_instances = self.gt_classes.numel()
    pred_classes = self.pred_class_logits.argmax(dim=1)
    bg_class_ind = self.pred_class_logits.shape[1] - 1

    fg_inds = (self.gt_classes >= 0) & (self.gt_classes < bg_class_ind)
    num_fg = fg_inds.nonzero().numel()
    fg_gt_classes = self.gt_classes[fg_inds]
    fg_pred_classes = pred_classes[fg_inds]

    num_false_negative = (fg_pred_classes == bg_class_ind).nonzero().numel()
    num_accurate = (pred_classes == self.gt_classes).nonzero().numel()
    fg_num_accurate = (fg_pred_classes == fg_gt_classes).nonzero().numel()

    storage = get_event_storage()
    storage.put_scalar("fast_rcnn/cls_accuracy", num_accurate / num_instances)
    if num_fg > 0:
        storage.put_scalar("fast_rcnn/fg_cls_accuracy", fg_num_accurate / num_fg)
        storage.put_scalar("fast_rcnn/false_negative", num_false_negative / num_fg)
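
# A minimal, self-contained sketch (not part of the original code) showing what the
# accuracy metrics above work out to on a toy batch. The class count (3 foreground
# classes plus background at index 3) and the logits are made up for illustration.
def _demo_log_accuracy_math():
    import torch

    gt_classes = torch.tensor([0, 2, 3, 1])  # index 2 is a background proposal
    pred_class_logits = torch.tensor(
        [
            [4.0, 0.0, 0.0, 0.0],  # predicted class 0 (correct)
            [0.0, 0.0, 3.0, 0.0],  # predicted class 2 (correct)
            [0.0, 0.0, 0.0, 2.0],  # predicted background (correct)
            [0.0, 0.0, 0.0, 5.0],  # foreground box predicted as background -> false negative
        ]
    )
    pred_classes = pred_class_logits.argmax(dim=1)
    bg_class_ind = pred_class_logits.shape[1] - 1
    fg_inds = (gt_classes >= 0) & (gt_classes < bg_class_ind)
    num_fg = fg_inds.nonzero().numel()
    cls_accuracy = (pred_classes == gt_classes).sum().item() / gt_classes.numel()
    false_negative = (pred_classes[fg_inds] == bg_class_ind).sum().item() / num_fg
    return cls_accuracy, false_negative  # 0.75 and 1/3 on this toy batch
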
def label_and_sample_proposals(self, proposals, targets):
    """
    Prepare some proposals to be used to train the ROI heads.
    It performs box matching between `proposals` and `targets`, and assigns
    training labels to the proposals.
    It returns `self.batch_size_per_image` random samples from proposals and
    ground-truth boxes, with a fraction of positives that is no larger than
    `self.positive_sample_fraction`.

    Args:
        See :meth:`ROIHeads.forward`

    Returns:
        list[Instances]: length `N` list of `Instances`s containing the proposals
        sampled for training. Each `Instances` has the following fields:

        - proposal_boxes: the proposal boxes
        - gt_boxes: the ground-truth box that the proposal is assigned to
          (this is only meaningful if the proposal has a label > 0; if label = 0
          then the ground-truth box is random)

        Other fields such as "gt_classes" that are included in `targets`.
    """
    gt_boxes = [x.gt_boxes for x in targets]
    # Augment proposals with ground-truth boxes.
    # In the case of learned proposals (e.g., RPN), when training starts
    # the proposals will be low quality due to random initialization.
    # It's possible that none of these initial
    # proposals have high enough overlap with the gt objects to be used
    # as positive examples for the second stage components (box head,
    # cls head). Adding the gt boxes to the set of proposals
    # ensures that the second stage components will have some positive
    # examples from the start of training. For RPN, this augmentation improves
    # convergence and empirically improves box AP on COCO by about 0.5
    # points (under one tested configuration).
    if self.proposal_append_gt:
        proposals = add_ground_truth_to_proposals(gt_boxes, proposals)

    proposals_with_gt = []

    num_fg_samples = []
    num_bg_samples = []
    for proposals_per_image, targets_per_image in zip(proposals, targets):
        has_gt = len(targets_per_image) > 0
        match_quality_matrix = pairwise_iou(
            targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
        )
        matched_idxs, matched_labels = self.proposal_matcher(match_quality_matrix)
        sampled_idxs, gt_classes = self._sample_proposals(
            matched_idxs, matched_labels, targets_per_image.gt_classes
        )

        # Set target attributes of the sampled proposals:
        proposals_per_image = proposals_per_image[sampled_idxs]
        proposals_per_image.gt_classes = gt_classes

        # We index all the attributes of targets that start with "gt_" and have
        # not been added to proposals yet (everything except "gt_classes", which
        # was set above).
        if has_gt:
            sampled_targets = matched_idxs[sampled_idxs]
            # NOTE: here the indexing wastes some compute, because heads
            # will filter the proposals again (by foreground/background,
            # etc), so we essentially index the data twice.
            for (trg_name, trg_value) in targets_per_image.get_fields().items():
                if trg_name.startswith("gt_") and not proposals_per_image.has(trg_name):
                    proposals_per_image.set(trg_name, trg_value[sampled_targets])
        else:
            gt_boxes = Boxes(
                targets_per_image.gt_boxes.tensor.new_zeros((len(sampled_idxs), 4))
            )
            proposals_per_image.gt_boxes = gt_boxes

        num_bg_samples.append((gt_classes == self.num_classes).sum().item())
        num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1])
        proposals_with_gt.append(proposals_per_image)

    # Log the number of fg/bg samples that are selected for training ROI heads
    storage = get_event_storage()
    storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples))
    storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples))

    return proposals_with_gt
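
# A simplified, self-contained sketch (not the real Matcher / _sample_proposals logic)
# of the match-then-sample idea used above, on plain tensors: each proposal is labeled
# with its best-matching gt class (or background below an IoU threshold), then a fixed
# number of positives/negatives is kept. `iou_threshold`, `batch_size`, and
# `positive_fraction` are illustrative names, not the ROIHeads config fields.
# Example: iou = torch.rand(2, 100); keep, lbls = _demo_match_and_sample(iou, torch.tensor([5, 7]), 80)
def _demo_match_and_sample(iou_matrix, gt_classes, num_classes,
                           iou_threshold=0.5, batch_size=8, positive_fraction=0.25):
    import torch

    # For each proposal (column), pick the gt box (row) with the highest IoU.
    matched_vals, matched_idxs = iou_matrix.max(dim=0)
    labels = gt_classes[matched_idxs].clone()
    labels[matched_vals < iou_threshold] = num_classes  # below threshold -> background

    pos = (labels != num_classes).nonzero().flatten()
    neg = (labels == num_classes).nonzero().flatten()
    num_pos = min(pos.numel(), int(batch_size * positive_fraction))
    num_neg = min(neg.numel(), batch_size - num_pos)
    keep = torch.cat(
        [
            pos[torch.randperm(pos.numel())[:num_pos]],
            neg[torch.randperm(neg.numel())[:num_neg]],
        ]
    )
    return keep, labels[keep]
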
def losses(self):
    """
    Return the losses from a set of RPN predictions and their associated ground-truth.

    Returns:
        dict[loss name -> loss value]: A dict mapping from loss name to loss value.
            Loss names are: `loss_rpn_cls` for objectness classification and
            `loss_rpn_loc` for proposal localization.
    """

    def resample(label):
        """
        Randomly sample a subset of positive and negative examples by overwriting
        the label vector to the ignore value (-1) for all elements that are not
        included in the sample.
        """
        pos_idx, neg_idx = subsample_labels(
            label, self.batch_size_per_image, self.positive_fraction, 0
        )
        # Fill with the ignore label (-1), then set positive and negative labels
        label.fill_(-1)
        label.scatter_(0, pos_idx, 1)
        label.scatter_(0, neg_idx, 0)
        return label

    gt_objectness_logits, gt_anchor_deltas = self._get_ground_truth()
    """
    gt_objectness_logits: list of N tensors. Tensor i is a vector whose length is
        the total number of anchors in image i (i.e., len(anchors[i]))
    gt_anchor_deltas: list of N tensors. Tensor i has shape (len(anchors[i]), B),
        where B is the box dimension
    """
    # Collect all objectness labels and delta targets over feature maps and images
    # The final ordering is L, N, H, W, A from slowest to fastest axis.
    num_anchors_per_map = [np.prod(x.shape[1:]) for x in self.pred_objectness_logits]
    num_anchors_per_image = sum(num_anchors_per_map)

    # Stack to: (N, num_anchors_per_image)
    gt_objectness_logits = torch.stack(
        [resample(label) for label in gt_objectness_logits], dim=0
    )

    # Log the number of positive/negative anchors per-image that's used in training
    num_pos_anchors = (gt_objectness_logits == 1).sum().item()
    num_neg_anchors = (gt_objectness_logits == 0).sum().item()
    storage = get_event_storage()
    storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / self.num_images)
    storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / self.num_images)

    assert gt_objectness_logits.shape[1] == num_anchors_per_image
    # Split to tuple of L tensors, each with shape (N, num_anchors_per_map)
    gt_objectness_logits = torch.split(gt_objectness_logits, num_anchors_per_map, dim=1)
    # Concat from all feature maps
    gt_objectness_logits = cat([x.flatten() for x in gt_objectness_logits], dim=0)

    # Stack to: (N, num_anchors_per_image, B)
    gt_anchor_deltas = torch.stack(gt_anchor_deltas, dim=0)
    assert gt_anchor_deltas.shape[1] == num_anchors_per_image
    B = gt_anchor_deltas.shape[2]  # box dimension (4 or 5)
    # Split to tuple of L tensors, each with shape (N, num_anchors_per_map, B)
    gt_anchor_deltas = torch.split(gt_anchor_deltas, num_anchors_per_map, dim=1)
    # Concat from all feature maps
    gt_anchor_deltas = cat([x.reshape(-1, B) for x in gt_anchor_deltas], dim=0)

    # Collect all objectness logits and delta predictions over feature maps
    # and images to arrive at the same shape as the labels and targets
    # The final ordering is L, N, H, W, A from slowest to fastest axis.
    pred_objectness_logits = cat(
        [
            # Reshape: (N, A, Hi, Wi) -> (N, Hi, Wi, A) -> (N*Hi*Wi*A, )
            x.permute(0, 2, 3, 1).flatten()
            for x in self.pred_objectness_logits
        ],
        dim=0,
    )
    pred_anchor_deltas = cat(
        [
            # Reshape: (N, A*B, Hi, Wi) -> (N, A, B, Hi, Wi) -> (N, Hi, Wi, A, B)
            #          -> (N*Hi*Wi*A, B)
            x.view(x.shape[0], -1, B, x.shape[-2], x.shape[-1])
            .permute(0, 3, 4, 1, 2)
            .reshape(-1, B)
            for x in self.pred_anchor_deltas
        ],
        dim=0,
    )

    objectness_loss, localization_loss = rpn_losses(
        gt_objectness_logits,
        gt_anchor_deltas,
        pred_objectness_logits,
        pred_anchor_deltas,
        self.smooth_l1_beta,
    )
    normalizer = 1.0 / (self.batch_size_per_image * self.num_images)
    loss_cls = objectness_loss * normalizer  # cls: classification loss
    loss_loc = localization_loss * normalizer  # loc: localization loss
    losses = {"loss_rpn_cls": loss_cls, "loss_rpn_loc": loss_loc}

    return losses
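
# A minimal sketch (not part of the original code) checking the delta reshape used above
# on toy shapes. N, A, B, H, W are made-up sizes for a single feature map; the point is
# that the row ordering after the permute/reshape is N (slowest), then H, W, A (fastest),
# matching the ordering of the ground-truth labels and targets.
def _demo_delta_reshape():
    import torch

    N, A, B, H, W = 2, 3, 4, 5, 6
    x = torch.randn(N, A * B, H, W)      # raw head output for one feature map
    flat = (
        x.view(N, A, B, H, W)            # (N, A, B, H, W)
        .permute(0, 3, 4, 1, 2)          # (N, H, W, A, B)
        .reshape(-1, B)                  # (N*H*W*A, B)
    )
    assert flat.shape == (N * H * W * A, B)
    return flat
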