def visualize_training(self, batched_inputs, proposals):
    """
    A function used to visualize images and proposals. It shows ground truth
    bounding boxes on the original image and up to 20 predicted object
    proposals on the original image. Users can implement different
    visualization functions for different models.

    Args:
        batched_inputs (list): a list that contains input to the model.
        proposals (list): a list that contains predicted proposals. Both
            batched_inputs and proposals should have the same length.
    """
    from mydl.utils.visualizer import Visualizer

    storage = get_event_storage()
    max_vis_prop = 20

    for input, prop in zip(batched_inputs, proposals):
        img = input["image"].cpu().numpy()
        assert img.shape[0] == 3, "Images should have 3 channels."
        if self.input_format == "BGR":
            img = img[::-1, :, :]
        img = img.transpose(1, 2, 0)
        v_gt = Visualizer(img, None)
        v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
        anno_img = v_gt.get_image()
        box_size = min(len(prop.proposal_boxes), max_vis_prop)
        v_pred = Visualizer(img, None)
        v_pred = v_pred.overlay_instances(
            boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
        )
        prop_img = v_pred.get_image()
        vis_img = np.concatenate((anno_img, prop_img), axis=1)
        vis_img = vis_img.transpose(2, 0, 1)
        vis_name = "1. GT bounding boxes  2. Predicted proposals"
        storage.put_image(vis_name, vis_img)
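
# A minimal, self-contained sketch (not part of the model code) of the array plumbing
# used in visualize_training above: overlays are produced as HWC arrays, concatenated
# side by side, and transposed to CHW before being logged. Shapes and values here are
# made up for demonstration only.
def _example_side_by_side_visualization():
    import numpy as np

    anno_img = np.zeros((120, 160, 3), dtype=np.uint8)       # GT overlay, HWC
    prop_img = np.full((120, 160, 3), 255, dtype=np.uint8)   # proposal overlay, HWC

    vis_img = np.concatenate((anno_img, prop_img), axis=1)   # (120, 320, 3)
    vis_img = vis_img.transpose(2, 0, 1)                     # (3, 120, 320), CHW
    assert vis_img.shape == (3, 120, 320)

    # The BGR -> RGB step in visualize_training reverses the channel axis of a CHW array.
    chw_bgr = np.arange(3 * 2 * 2, dtype=np.uint8).reshape(3, 2, 2)
    chw_rgb = chw_bgr[::-1, :, :]
    assert chw_rgb[0, 0, 0] == chw_bgr[2, 0, 0]
    return vis_img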
def roi_mask_point_loss(mask_logits, instances, points_coord):
    """
    Compute the point-based loss for instance segmentation mask predictions.

    Args:
        mask_logits (Tensor): A tensor of shape (R, C, P) or (R, 1, P) for
            class-specific or class-agnostic, where R is the total number of
            predicted masks in all images, C is the number of foreground
            classes, and P is the number of points sampled for each mask.
            The values are logits.
        instances (list[Instances]): A list of N Instances, where N is the
            number of images in the batch. These instances are in 1:1
            correspondence with the `mask_logits`. So, the i-th element of
            the list contains R_i objects and R_1 + ... + R_N is equal to R.
            The ground-truth labels (class, box, mask, ...) associated with
            each instance are stored in fields.
        points_coord (Tensor): A tensor of shape (R, P, 2), where R is the
            total number of predicted masks and P is the number of points for
            each mask. The coordinates are in the image pixel coordinate
            space, i.e. [0, H] x [0, W].

    Returns:
        point_loss (Tensor): A scalar tensor containing the loss.
    """
    assert len(instances) == 0 or isinstance(
        instances[0].gt_masks, BitMasks
    ), "Point head works with GT in 'bitmask' format only. Set INPUT.MASK_FORMAT to 'bitmask'."

    with torch.no_grad():
        cls_agnostic_mask = mask_logits.size(1) == 1
        total_num_masks = mask_logits.size(0)

        gt_classes = []
        gt_mask_logits = []
        idx = 0
        for instances_per_image in instances:
            if not cls_agnostic_mask:
                gt_classes_per_image = instances_per_image.gt_classes.to(
                    dtype=torch.int64)
                gt_classes.append(gt_classes_per_image)

            gt_bit_masks = instances_per_image.gt_masks.tensor
            h, w = instances_per_image.gt_masks.image_size
            scale = torch.tensor([w, h],
                                 dtype=torch.float,
                                 device=gt_bit_masks.device)
            points_coord_grid_sample_format = (
                points_coord[idx:idx + len(instances_per_image)] / scale)
            idx += len(instances_per_image)
            gt_mask_logits.append(
                point_sample(
                    gt_bit_masks.to(torch.float32).unsqueeze(1),
                    points_coord_grid_sample_format,
                    align_corners=False,
                ).squeeze(1))

    gt_mask_logits = cat(gt_mask_logits)

    # torch.mean (in binary_cross_entropy_with_logits) doesn't
    # accept empty tensors, so handle it separately
    if gt_mask_logits.numel() == 0:
        return mask_logits.sum() * 0

    if cls_agnostic_mask:
        mask_logits = mask_logits[:, 0]
    else:
        indices = torch.arange(total_num_masks)
        gt_classes = cat(gt_classes, dim=0)
        mask_logits = mask_logits[indices, gt_classes]

    # Log the training accuracy (using gt classes and 0.0 threshold for the logits)
    mask_accurate = (mask_logits > 0.0) == gt_mask_logits.to(dtype=torch.uint8)
    mask_accuracy = mask_accurate.nonzero().size(0) / mask_accurate.numel()
    get_event_storage().put_scalar("point_rend/accuracy", mask_accuracy)

    point_loss = F.binary_cross_entropy_with_logits(
        mask_logits, gt_mask_logits.to(dtype=torch.float32), reduction="mean")
    return point_loss
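
# A minimal, self-contained sketch (not part of the library) of the coordinate
# normalization used above. It assumes point_sample follows the usual
# torch.nn.functional.grid_sample convention: image-space points in [0, W] x [0, H]
# are divided by (w, h) to land in [0, 1] and then rescaled to [-1, 1] before sampling.
# The mask, points, and sizes are made up for demonstration.
def _example_point_sample_normalization():
    import torch
    import torch.nn.functional as F

    # One 1-channel 4x6 "bit mask" (H=4, W=6) with a single foreground pixel.
    mask = torch.zeros(1, 1, 4, 6)
    mask[0, 0, 2, 3] = 1.0  # foreground at (y=2, x=3)

    # Two points in image pixel coordinates (x, y): one on the foreground pixel
    # center, one on a background pixel center.
    points_xy = torch.tensor([[3.5, 2.5], [0.5, 0.5]])

    # Normalize to [0, 1] as in roi_mask_point_loss, then to [-1, 1] for grid_sample.
    h, w = 4, 6
    scale = torch.tensor([w, h], dtype=torch.float)
    points_01 = points_xy / scale
    grid = 2.0 * points_01 - 1.0  # shape (2, 2), (x, y) order

    # grid_sample expects a (N, Hout, Wout, 2) grid; use Hout=1, Wout=2.
    sampled = F.grid_sample(mask, grid.view(1, 1, 2, 2), align_corners=False)
    return sampled.flatten()  # approximately [1., 0.]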
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        In training, dict[str: Tensor]: mapping from a named loss to a
        tensor storing the loss.
        In inference, list[dict]: one dict per image, each containing an
        "instances" key with the post-processed detections.
    """
    images = self.preprocess_image(batched_inputs)

    if "instances" in batched_inputs[0]:
        gt_instances = [
            x["instances"].to(self.device) for x in batched_inputs
        ]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [
            x["targets"].to(self.device) for x in batched_inputs
        ]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)
    features = [features[f] for f in self.in_features]
    box_cls, box_delta = self.head(features)
    anchors = self.anchor_generator(features)

    if self.training:
        gt_classes, gt_anchors_reg_deltas = self.get_ground_truth(
            anchors, gt_instances)
        losses = self.losses(gt_classes, gt_anchors_reg_deltas, box_cls,
                             box_delta)

        if self.vis_period > 0:
            storage = get_event_storage()
            if storage.iter % self.vis_period == 0:
                results = self.inference(box_cls, box_delta, anchors,
                                         images.image_sizes)
                self.visualize_training(batched_inputs, results)

        return losses
    else:
        results = self.inference(box_cls, box_delta, anchors,
                                 images.image_sizes)
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
                results, batched_inputs, images.image_sizes):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
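
# A minimal, self-contained sketch (not part of the model code) of the ``batched_inputs``
# structure that the forward methods in this file expect from :class:`DatasetMapper`:
# one dict per image with an (C, H, W) image tensor and an optional output resolution.
# The sizes, dtypes, and values are assumptions for illustration; "instances" is omitted,
# which corresponds to the branch where ``gt_instances`` becomes None.
def _example_batched_inputs_format():
    import torch

    batched_inputs = [
        {"image": torch.rand(3, 480, 640), "height": 720, "width": 960},
        {"image": torch.rand(3, 512, 512), "height": 1024, "width": 1024},
    ]
    assert "instances" not in batched_inputs[0]
    return batched_inputs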
def losses(self):
    """
    Return the losses from a set of RPN predictions and their associated
    ground-truth.

    Returns:
        dict[loss name -> loss value]: A dict mapping from loss name to loss
            value. Loss names are: `loss_rpn_cls` for objectness
            classification and `loss_rpn_loc` for proposal localization.
    """

    def resample(label):
        """
        Randomly sample a subset of positive and negative examples by
        overwriting the label vector to the ignore value (-1) for all
        elements that are not included in the sample.
        """
        pos_idx, neg_idx = subsample_labels(label,
                                            self.batch_size_per_image,
                                            self.positive_fraction, 0)
        # Fill with the ignore label (-1), then set positive and negative labels
        label.fill_(-1)
        label.scatter_(0, pos_idx, 1)
        label.scatter_(0, neg_idx, 0)
        return label

    gt_objectness_logits, gt_anchor_deltas = self._get_ground_truth()
    """
    gt_objectness_logits: list of N tensors. Tensor i is a vector whose length is
        the total number of anchors in image i (i.e., len(anchors[i]))
    gt_anchor_deltas: list of N tensors. Tensor i has shape (len(anchors[i]), B),
        where B is the box dimension
    """
    # Collect all objectness labels and delta targets over feature maps and images
    # The final ordering is L, N, H, W, A from slowest to fastest axis.
    num_anchors_per_map = [
        np.prod(x.shape[1:]) for x in self.pred_objectness_logits
    ]
    num_anchors_per_image = sum(num_anchors_per_map)

    # Stack to: (N, num_anchors_per_image)
    gt_objectness_logits = torch.stack(
        [resample(label) for label in gt_objectness_logits], dim=0)

    # Log the number of positive/negative anchors per-image that's used in training
    num_pos_anchors = (gt_objectness_logits == 1).sum().item()
    num_neg_anchors = (gt_objectness_logits == 0).sum().item()
    storage = get_event_storage()
    storage.put_scalar("rpn/num_pos_anchors",
                       num_pos_anchors / self.num_images)
    storage.put_scalar("rpn/num_neg_anchors",
                       num_neg_anchors / self.num_images)

    assert gt_objectness_logits.shape[1] == num_anchors_per_image
    # Split to tuple of L tensors, each with shape (N, num_anchors_per_map)
    gt_objectness_logits = torch.split(gt_objectness_logits,
                                       num_anchors_per_map,
                                       dim=1)
    # Concat from all feature maps
    gt_objectness_logits = cat([x.flatten() for x in gt_objectness_logits],
                               dim=0)

    # Stack to: (N, num_anchors_per_image, B)
    gt_anchor_deltas = torch.stack(gt_anchor_deltas, dim=0)
    assert gt_anchor_deltas.shape[1] == num_anchors_per_image
    B = gt_anchor_deltas.shape[2]  # box dimension (4 or 5)
    # Split to tuple of L tensors, each with shape (N, num_anchors_per_map, B)
    gt_anchor_deltas = torch.split(gt_anchor_deltas,
                                   num_anchors_per_map,
                                   dim=1)
    # Concat from all feature maps
    gt_anchor_deltas = cat([x.reshape(-1, B) for x in gt_anchor_deltas],
                           dim=0)

    # Collect all objectness logits and delta predictions over feature maps
    # and images to arrive at the same shape as the labels and targets
    # The final ordering is L, N, H, W, A from slowest to fastest axis.
    pred_objectness_logits = cat(
        [
            # Reshape: (N, A, Hi, Wi) -> (N, Hi, Wi, A) -> (N*Hi*Wi*A, )
            x.permute(0, 2, 3, 1).flatten()
            for x in self.pred_objectness_logits
        ],
        dim=0,
    )
    pred_anchor_deltas = cat(
        [
            # Reshape: (N, A*B, Hi, Wi) -> (N, A, B, Hi, Wi) -> (N, Hi, Wi, A, B)
            #          -> (N*Hi*Wi*A, B)
            x.view(x.shape[0], -1, B, x.shape[-2], x.shape[-1]).permute(
                0, 3, 4, 1, 2).reshape(-1, B)
            for x in self.pred_anchor_deltas
        ],
        dim=0,
    )

    objectness_loss, localization_loss = rpn_losses(
        gt_objectness_logits,
        gt_anchor_deltas,
        pred_objectness_logits,
        pred_anchor_deltas,
        self.smooth_l1_beta,
    )
    normalizer = 1.0 / (self.batch_size_per_image * self.num_images)
    loss_cls = objectness_loss * normalizer  # cls: classification loss
    loss_loc = localization_loss * normalizer  # loc: localization loss
    losses = {"loss_rpn_cls": loss_cls, "loss_rpn_loc": loss_loc}

    return losses
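
# A minimal, self-contained sketch (not part of the model code) that checks the reshape
# used above for pred_anchor_deltas: (N, A*B, Hi, Wi) -> (N, A, B, Hi, Wi)
# -> (N, Hi, Wi, A, B) -> (N*Hi*Wi*A, B), so that each row of the flattened tensor
# holds the B box deltas of one anchor at one spatial location. Sizes are made up.
def _example_anchor_delta_reshape():
    import torch

    N, A, B, H, W = 2, 3, 4, 5, 6
    x = torch.arange(N * A * B * H * W, dtype=torch.float32).view(N, A * B, H, W)

    flat = (
        x.view(N, A, B, H, W)
        .permute(0, 3, 4, 1, 2)
        .reshape(-1, B)
    )
    assert flat.shape == (N * H * W * A, B)

    # Row 0 corresponds to image 0, location (0, 0), anchor 0: its B deltas come from
    # channels 0..B-1 of the original tensor at that location.
    assert torch.equal(flat[0], x[0, 0:B, 0, 0])
    return flat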
def mask_rcnn_loss(pred_mask_logits, instances, vis_period=0):
    """
    Compute the mask prediction loss defined in the Mask R-CNN paper.

    Args:
        pred_mask_logits (Tensor): A tensor of shape (B, C, Hmask, Wmask) or
            (B, 1, Hmask, Wmask) for class-specific or class-agnostic, where
            B is the total number of predicted masks in all images, C is the
            number of foreground classes, and Hmask, Wmask are the height and
            width of the mask predictions. The values are logits.
        instances (list[Instances]): A list of N Instances, where N is the
            number of images in the batch. These instances are in 1:1
            correspondence with the pred_mask_logits. The ground-truth labels
            (class, box, mask, ...) associated with each instance are stored
            in fields.
        vis_period (int): the period (in steps) to dump visualization.

    Returns:
        mask_loss (Tensor): A scalar tensor containing the loss.
    """
    cls_agnostic_mask = pred_mask_logits.size(1) == 1
    total_num_masks = pred_mask_logits.size(0)
    mask_side_len = pred_mask_logits.size(2)
    assert pred_mask_logits.size(2) == pred_mask_logits.size(
        3), "Mask prediction must be square!"

    gt_classes = []
    gt_masks = []
    for instances_per_image in instances:
        if len(instances_per_image) == 0:
            continue
        if not cls_agnostic_mask:
            gt_classes_per_image = instances_per_image.gt_classes.to(
                dtype=torch.int64)
            gt_classes.append(gt_classes_per_image)

        gt_masks_per_image = instances_per_image.gt_masks.crop_and_resize(
            instances_per_image.proposal_boxes.tensor,
            mask_side_len).to(device=pred_mask_logits.device)
        # A tensor of shape (N, M, M), N=#instances in the image; M=mask_side_len
        gt_masks.append(gt_masks_per_image)

    if len(gt_masks) == 0:
        return pred_mask_logits.sum() * 0

    gt_masks = cat(gt_masks, dim=0)

    if cls_agnostic_mask:
        pred_mask_logits = pred_mask_logits[:, 0]
    else:
        indices = torch.arange(total_num_masks)
        gt_classes = cat(gt_classes, dim=0)
        pred_mask_logits = pred_mask_logits[indices, gt_classes]

    if gt_masks.dtype == torch.bool:
        gt_masks_bool = gt_masks
    else:
        # Here we allow gt_masks to be float as well
        # (depending on the implementation of rasterize())
        gt_masks_bool = gt_masks > 0.5
    gt_masks = gt_masks.to(dtype=torch.float32)

    # Log the training accuracy (using gt classes and 0.5 threshold)
    mask_incorrect = (pred_mask_logits > 0.0) != gt_masks_bool
    mask_accuracy = 1 - (mask_incorrect.sum().item() /
                         max(mask_incorrect.numel(), 1.0))
    num_positive = gt_masks_bool.sum().item()
    false_positive = (mask_incorrect & ~gt_masks_bool).sum().item() / max(
        gt_masks_bool.numel() - num_positive, 1.0)
    false_negative = (mask_incorrect & gt_masks_bool).sum().item() / max(
        num_positive, 1.0)

    storage = get_event_storage()
    storage.put_scalar("mask_rcnn/accuracy", mask_accuracy)
    storage.put_scalar("mask_rcnn/false_positive", false_positive)
    storage.put_scalar("mask_rcnn/false_negative", false_negative)
    if vis_period > 0 and storage.iter % vis_period == 0:
        pred_masks = pred_mask_logits.sigmoid()
        vis_masks = torch.cat([pred_masks, gt_masks], axis=2)
        name = "Left: mask prediction; Right: mask GT"
        for idx, vis_mask in enumerate(vis_masks):
            vis_mask = torch.stack([vis_mask] * 3, axis=0)
            storage.put_image(name + f" ({idx})", vis_mask)

    mask_loss = F.binary_cross_entropy_with_logits(pred_mask_logits,
                                                   gt_masks,
                                                   reduction="mean")
    return mask_loss
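
# A minimal, self-contained sketch (not part of the model code) of the indexing used
# above to pick, for each predicted instance, the mask logits of its ground-truth class:
# pred_mask_logits[indices, gt_classes] turns a (B, C, M, M) tensor into (B, M, M).
# Shapes and class ids are made up for demonstration.
def _example_class_specific_mask_selection():
    import torch

    B, C, M = 4, 3, 7  # 4 instances, 3 foreground classes, 7x7 masks
    pred_mask_logits = torch.randn(B, C, M, M)
    gt_classes = torch.tensor([2, 0, 1, 2])

    indices = torch.arange(B)
    per_class_logits = pred_mask_logits[indices, gt_classes]
    assert per_class_logits.shape == (B, M, M)
    # Instance 1 was assigned class 0, so its selected logits are channel 0.
    assert torch.equal(per_class_logits[1], pred_mask_logits[1, 0])
    return per_class_logits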
def label_and_sample_proposals(self, proposals, targets):
    """
    Prepare some proposals to be used to train the RROI heads.
    It performs box matching between `proposals` and `targets`, and assigns
    training labels to the proposals.
    It returns `self.batch_size_per_image` random samples from proposals and
    groundtruth boxes, with a fraction of positives that is no larger than
    `self.positive_sample_fraction`.

    Args:
        See :meth:`StandardROIHeads.forward`

    Returns:
        list[Instances]: length `N` list of `Instances`s containing the
            proposals sampled for training. Each `Instances` has the
            following fields:

            - proposal_boxes: the rotated proposal boxes
            - gt_boxes: the ground-truth rotated boxes that the proposal is
              assigned to (this is only meaningful if the proposal has a
              label > 0; if label = 0 then the ground-truth box is random)
            - gt_classes: the ground-truth classification label for each
              proposal
    """
    gt_boxes = [x.gt_boxes for x in targets]
    if self.proposal_append_gt:
        proposals = add_ground_truth_to_proposals(gt_boxes, proposals)

    proposals_with_gt = []

    num_fg_samples = []
    num_bg_samples = []
    for proposals_per_image, targets_per_image in zip(proposals, targets):
        has_gt = len(targets_per_image) > 0
        match_quality_matrix = pairwise_iou_rotated(
            targets_per_image.gt_boxes, proposals_per_image.proposal_boxes)
        matched_idxs, matched_labels = self.proposal_matcher(
            match_quality_matrix)
        sampled_idxs, gt_classes = self._sample_proposals(
            matched_idxs, matched_labels, targets_per_image.gt_classes)

        proposals_per_image = proposals_per_image[sampled_idxs]
        proposals_per_image.gt_classes = gt_classes

        if has_gt:
            sampled_targets = matched_idxs[sampled_idxs]
            proposals_per_image.gt_boxes = targets_per_image.gt_boxes[
                sampled_targets]
        else:
            gt_boxes = RotatedBoxes(
                targets_per_image.gt_boxes.tensor.new_zeros(
                    (len(sampled_idxs), 5)))
            proposals_per_image.gt_boxes = gt_boxes

        num_bg_samples.append(
            (gt_classes == self.num_classes).sum().item())
        num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1])
        proposals_with_gt.append(proposals_per_image)

    # Log the number of fg/bg samples that are selected for training ROI heads
    storage = get_event_storage()
    storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples))
    storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples))

    return proposals_with_gt
def label_and_sample_proposals(
        self, proposals: List[Instances],
        targets: List[Instances]) -> List[Instances]:
    """
    Prepare some proposals to be used to train the ROI heads.
    It performs box matching between `proposals` and `targets`, and assigns
    training labels to the proposals.
    It returns ``self.batch_size_per_image`` random samples from proposals
    and groundtruth boxes, with a fraction of positives that is no larger
    than ``self.positive_sample_fraction``.

    Args:
        See :meth:`ROIHeads.forward`

    Returns:
        list[Instances]: length `N` list of `Instances`s containing the
            proposals sampled for training. Each `Instances` has the
            following fields:

            - proposal_boxes: the proposal boxes
            - gt_boxes: the ground-truth box that the proposal is assigned to
              (this is only meaningful if the proposal has a label > 0;
              if label = 0 then the ground-truth box is random)

            Other fields such as "gt_classes", "gt_masks", that are included
            in `targets`.
    """
    gt_boxes = [x.gt_boxes for x in targets]
    # Augment proposals with ground-truth boxes.
    # In the case of learned proposals (e.g., RPN), when training starts
    # the proposals will be low quality due to random initialization.
    # It's possible that none of these initial
    # proposals have high enough overlap with the gt objects to be used
    # as positive examples for the second stage components (box head,
    # cls head, mask head). Adding the gt boxes to the set of proposals
    # ensures that the second stage components will have some positive
    # examples from the start of training. For RPN, this augmentation improves
    # convergence and empirically improves box AP on COCO by about 0.5
    # points (under one tested configuration).
    if self.proposal_append_gt:
        proposals = add_ground_truth_to_proposals(gt_boxes, proposals)

    proposals_with_gt = []

    num_fg_samples = []
    num_bg_samples = []
    for proposals_per_image, targets_per_image in zip(proposals, targets):
        has_gt = len(targets_per_image) > 0
        match_quality_matrix = pairwise_iou(
            targets_per_image.gt_boxes, proposals_per_image.proposal_boxes)
        matched_idxs, matched_labels = self.proposal_matcher(
            match_quality_matrix)
        sampled_idxs, gt_classes = self._sample_proposals(
            matched_idxs, matched_labels, targets_per_image.gt_classes)

        # Set target attributes of the sampled proposals:
        proposals_per_image = proposals_per_image[sampled_idxs]
        proposals_per_image.gt_classes = gt_classes

        # We index all the attributes of targets that start with "gt_"
        # and have not been added to proposals yet (the only one added so
        # far is "gt_classes").
        if has_gt:
            sampled_targets = matched_idxs[sampled_idxs]
            # NOTE: here the indexing wastes some compute, because heads
            # like masks, keypoints, etc, will filter the proposals again,
            # (by foreground/background, or number of keypoints in the image, etc)
            # so we essentially index the data twice.
            for (trg_name,
                 trg_value) in targets_per_image.get_fields().items():
                if trg_name.startswith(
                        "gt_") and not proposals_per_image.has(trg_name):
                    proposals_per_image.set(trg_name,
                                            trg_value[sampled_targets])
        else:
            gt_boxes = Boxes(
                targets_per_image.gt_boxes.tensor.new_zeros(
                    (len(sampled_idxs), 4)))
            proposals_per_image.gt_boxes = gt_boxes

        num_bg_samples.append(
            (gt_classes == self.num_classes).sum().item())
        num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1])
        proposals_with_gt.append(proposals_per_image)

    # Log the number of fg/bg samples that are selected for training ROI heads
    storage = get_event_storage()
    storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples))
    storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples))

    return proposals_with_gt
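
# A minimal, self-contained sketch (not part of the model code) of the fg/bg counting
# convention used in the sampling code above: background proposals are the ones whose
# gt_classes equal num_classes (foreground classes are 0..num_classes-1). The label
# vector below is made up for demonstration.
def _example_fg_bg_counting():
    import torch

    num_classes = 80
    gt_classes = torch.tensor([3, 80, 80, 17, 80, 0])  # 80 marks background here

    num_bg = (gt_classes == num_classes).sum().item()
    num_fg = gt_classes.numel() - num_bg
    assert (num_fg, num_bg) == (3, 3)
    return num_fg, num_bg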
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (C, H, W) format.
            * instances (optional): groundtruth :class:`Instances`
            * proposals (optional): :class:`Instances`, precomputed proposals.

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.

    Returns:
        In training, dict[str -> Tensor]: mapping from named losses to loss
        tensors.
        In inference, list[dict]: each dict is the output for one input
        image. The dict contains one key "instances" whose value is a
        :class:`Instances` with the following keys:
        "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints"
    """
    if not self.training:
        return self.inference(batched_inputs)

    images = self.preprocess_image(batched_inputs)
    if "instances" in batched_inputs[0]:
        gt_instances = [
            x["instances"].to(self.device) for x in batched_inputs
        ]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [
            x["targets"].to(self.device) for x in batched_inputs
        ]
    else:
        gt_instances = None

    features = self.backbone(images.tensor)

    if self.proposal_generator:
        proposals, proposal_losses = self.proposal_generator(
            images, features, gt_instances)
    else:
        assert "proposals" in batched_inputs[0]
        proposals = [
            x["proposals"].to(self.device) for x in batched_inputs
        ]
        proposal_losses = {}

    _, detector_losses = self.roi_heads(images, features, proposals,
                                        gt_instances)
    if self.vis_period > 0:
        storage = get_event_storage()
        if storage.iter % self.vis_period == 0:
            self.visualize_training(batched_inputs, proposals)

    losses = {}
    losses.update(detector_losses)
    losses.update(proposal_losses)
    return losses
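
# A minimal, self-contained sketch (not part of the model code): the training-mode
# forward above returns a dict of named loss tensors, and a typical training loop
# (assumed here, not shown in this file) sums them into one scalar before backprop.
# The loss names and values are made up for demonstration.
def _example_summing_named_losses():
    import torch

    losses = {
        "loss_rpn_cls": torch.tensor(0.7),
        "loss_rpn_loc": torch.tensor(0.3),
        "loss_cls": torch.tensor(1.2),
    }
    total_loss = sum(losses.values())
    assert torch.isclose(total_loss, torch.tensor(2.2))
    return total_loss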