def test_empty_inputs(self) -> None:
    box1 = torch.randn([0, 4], dtype=torch.float32).requires_grad_()
    box2 = torch.randn([0, 4], dtype=torch.float32).requires_grad_()
    loss = giou_loss(box1, box2, reduction="mean")
    loss.backward()

    self.assertEqual(loss.detach().numpy(), 0.0)
    self.assertIsNotNone(box1.grad)
    self.assertIsNotNone(box2.grad)

    loss = giou_loss(box1, box2, reduction="none")
    self.assertEqual(loss.numel(), 0)
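# The "mean" branch of this test is the interesting one: a naive mean over an
# empty tensor is NaN and would also break backward(). A minimal sketch of the
# guard a GIoU implementation typically uses (`reduce_loss` is a hypothetical
# helper, not the actual library internals):
import torch

def reduce_loss(loss: torch.Tensor, reduction: str = "none") -> torch.Tensor:
    if reduction == "mean":
        # 0.0 * loss.sum() keeps the autograd graph alive for empty inputs,
        # so (zero) gradients still flow back to the input boxes.
        return loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
    if reduction == "sum":
        return loss.sum()
    return loss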
def box_reg_loss(self, proposal_boxes, gt_boxes, pred_deltas, gt_classes):
    """
    Args:
        All boxes are tensors with the same shape Rx(4 or 5).
        gt_classes is a long tensor of shape R, the gt class label of each proposal.
        R shall be the number of proposals.
    """
    box_dim = proposal_boxes.shape[1]  # 4 or 5
    # Regression loss is only computed for foreground proposals (those matched to a GT)
    fg_inds = nonzero_tuple((gt_classes >= 0) & (gt_classes < self.num_classes))[0]
    if pred_deltas.shape[1] == box_dim:  # cls-agnostic regression
        fg_pred_deltas = pred_deltas[fg_inds]
    else:
        fg_pred_deltas = pred_deltas.view(-1, self.num_classes, box_dim)[
            fg_inds, gt_classes[fg_inds]
        ]

    if self.box_reg_loss_type == "smooth_l1":
        gt_pred_deltas = self.box2box_transform.get_deltas(
            proposal_boxes[fg_inds],
            gt_boxes[fg_inds],
        )
        loss_box_reg = smooth_l1_loss(
            fg_pred_deltas, gt_pred_deltas, self.smooth_l1_beta, reduction="sum"
        )
    elif self.box_reg_loss_type == "giou":
        fg_pred_boxes = self.box2box_transform.apply_deltas(
            fg_pred_deltas, proposal_boxes[fg_inds]
        )
        loss_box_reg = giou_loss(fg_pred_boxes, gt_boxes[fg_inds], reduction="sum")
    elif self.box_reg_loss_type == "diou":
        fg_pred_boxes = self.box2box_transform.apply_deltas(
            fg_pred_deltas, proposal_boxes[fg_inds]
        )
        loss_box_reg = diou_loss(fg_pred_boxes, gt_boxes[fg_inds], reduction="sum")
    elif self.box_reg_loss_type == "ciou":
        fg_pred_boxes = self.box2box_transform.apply_deltas(
            fg_pred_deltas, proposal_boxes[fg_inds]
        )
        loss_box_reg = ciou_loss(fg_pred_boxes, gt_boxes[fg_inds], reduction="sum")
    else:
        raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'")

    # The reg loss is normalized using the total number of regions (R), not the number
    # of foreground regions even though the box regression loss is only defined on
    # foreground regions. Why? Because doing so gives equal training influence to
    # each foreground example. To see how, consider two different minibatches:
    #  (1) Contains a single foreground region
    #  (2) Contains 100 foreground regions
    # If we normalize by the number of foreground regions, the single example in
    # minibatch (1) will be given 100 times as much influence as each foreground
    # example in minibatch (2). Normalizing by the total number of regions, R,
    # means that the single example in minibatch (1) and each of the 100 examples
    # in minibatch (2) are given equal influence.
    return loss_box_reg / max(gt_classes.numel(), 1.0)  # return 0 if empty
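# A quick numeric check of the normalization argument in the comment above,
# using hypothetical per-box losses: suppose every foreground box contributes
# a loss of 1.0 and both minibatches have R = 512 proposals.
#   normalize by #fg: batch (1): 1/1 = 1.0       batch (2): 100/100 = 1.0
#     -> the lone box in (1) carries 100x the per-box gradient of each box in (2)
#   normalize by R:   batch (1): 1/512 ~= 0.002  batch (2): 100/512 ~= 0.195
#     -> every foreground box contributes 1/512 of the loss either way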
def losses(self, anchors, pred_objectness_logits, gt_labels, pred_anchor_deltas,
           gt_boxes, loss_weights=None):
    num_images = len(gt_labels)
    gt_labels = torch.stack(gt_labels)  # (N, sum(Hi*Wi*Ai))

    # Log the number of positive/negative anchors per-image that's used in training
    pos_mask = gt_labels == 1
    num_pos_anchors = pos_mask.sum().item()
    num_neg_anchors = (gt_labels == 0).sum().item()
    storage = get_event_storage()
    storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / num_images)
    storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / num_images)

    reduction = "sum" if loss_weights is None else "none"
    if self.box_reg_loss_type == "smooth_l1":
        anchors = type(anchors[0]).cat(anchors).tensor  # Ax(4 or 5)
        gt_anchor_deltas = [self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
        gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, sum(Hi*Wi*Ai), 4 or 5)
        localization_loss = smooth_l1_loss(
            cat(pred_anchor_deltas, dim=1)[pos_mask],
            gt_anchor_deltas[pos_mask],
            self.smooth_l1_beta,
            reduction=reduction,
        )
    elif self.box_reg_loss_type == "giou":
        pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas)
        pred_proposals = cat(pred_proposals, dim=1)
        pred_proposals = pred_proposals.view(-1, pred_proposals.shape[-1])
        pos_mask = pos_mask.view(-1)
        localization_loss = giou_loss(
            pred_proposals[pos_mask], cat(gt_boxes)[pos_mask], reduction=reduction
        )
    else:
        raise ValueError(f"Invalid rpn box reg loss type '{self.box_reg_loss_type}'")

    valid_mask = gt_labels >= 0
    objectness_loss = F.binary_cross_entropy_with_logits(
        cat(pred_objectness_logits, dim=1)[valid_mask],
        gt_labels[valid_mask].to(torch.float32),
        reduction=reduction,
    )
    normalizer = self.batch_size_per_image * num_images
    losses = {
        "loss_rpn_cls": objectness_loss / normalizer,
        "loss_rpn_loc": localization_loss / normalizer,
    }
    losses = {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()}
    return losses
def _dense_box_regression_loss(
    anchors: List[Boxes],
    box2box_transform: Box2BoxTransform,
    pred_anchor_deltas: List[torch.Tensor],
    gt_boxes: List[torch.Tensor],
    fg_mask: torch.Tensor,
    box_reg_loss_type="smooth_l1",
    smooth_l1_beta=0.0,
):
    """
    Compute loss for dense multi-level box regression.
    Loss is accumulated over ``fg_mask``.

    Args:
        anchors: #lvl anchor boxes, each is (HixWixA, 4)
        pred_anchor_deltas: #lvl predictions, each is (N, HixWixA, 4)
        gt_boxes: N ground truth boxes, each has shape (R, 4) (R = sum(Hi * Wi * A))
        fg_mask: the foreground boolean mask of shape (N, R) to compute loss on
        box_reg_loss_type (str): Loss type to use. Supported losses: "smooth_l1",
            "giou", "diou", "ciou".
        smooth_l1_beta (float): beta parameter for the smooth L1 regression loss.
            Default to use L1 loss. Only used when `box_reg_loss_type` is "smooth_l1"
    """
    anchors = type(anchors[0]).cat(anchors).tensor  # (R, 4)
    if box_reg_loss_type == "smooth_l1":
        gt_anchor_deltas = [box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
        gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, R, 4)
        loss_box_reg = smooth_l1_loss(
            cat(pred_anchor_deltas, dim=1)[fg_mask],
            gt_anchor_deltas[fg_mask],
            beta=smooth_l1_beta,
            reduction="sum",
        )
    elif box_reg_loss_type == "giou":
        pred_boxes = [
            box2box_transform.apply_deltas(k, anchors)
            for k in cat(pred_anchor_deltas, dim=1)
        ]
        loss_box_reg = giou_loss(
            torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
        )
    elif box_reg_loss_type == "diou":
        pred_boxes = [
            box2box_transform.apply_deltas(k, anchors)
            for k in cat(pred_anchor_deltas, dim=1)
        ]
        loss_box_reg = diou_loss(
            torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
        )
    elif box_reg_loss_type == "ciou":
        pred_boxes = [
            box2box_transform.apply_deltas(k, anchors)
            for k in cat(pred_anchor_deltas, dim=1)
        ]
        loss_box_reg = ciou_loss(
            torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
        )
    else:
        raise ValueError(f"Invalid dense box regression loss type '{box_reg_loss_type}'")
    return loss_box_reg
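# A minimal usage sketch for `_dense_box_regression_loss` (defined above) with
# toy shapes: one feature level, N=2 images, A=3 anchors. All box values are
# made up for illustration; only the shapes matter.
import torch
from detectron2.structures import Boxes
from detectron2.modeling.box_regression import Box2BoxTransform

anchors = [Boxes(torch.tensor([[0., 0., 10., 10.],
                               [5., 5., 15., 15.],
                               [10., 10., 20., 20.]]))]      # one level, (A=3, 4)
transform = Box2BoxTransform(weights=(1.0, 1.0, 1.0, 1.0))
pred_anchor_deltas = [torch.zeros(2, 3, 4)]                  # (N, A, 4) per level
gt = torch.tensor([[1., 1., 9., 9.],
                   [6., 6., 14., 14.],
                   [11., 11., 19., 19.]])
gt_boxes = [gt, gt]                                          # one (R, 4) tensor per image
fg_mask = torch.tensor([[True, False, True],
                        [False, True, False]])               # (N, R) foreground mask
loss = _dense_box_regression_loss(
    anchors, transform, pred_anchor_deltas, gt_boxes, fg_mask,
    box_reg_loss_type="giou",
)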
def box_reg_loss(self):
    """
    Deprecated
    """
    if self._no_instances:
        return 0.0 * self.pred_proposal_deltas.sum()

    box_dim = self.proposals.tensor.size(1)  # 4 or 5
    cls_agnostic_bbox_reg = self.pred_proposal_deltas.size(1) == box_dim
    device = self.pred_proposal_deltas.device

    bg_class_ind = self.pred_class_logits.shape[1] - 1
    # Box delta loss is only computed between the prediction for the gt class k
    # (if 0 <= k < bg_class_ind) and the target; there is no loss defined on predictions
    # for non-gt classes and background.
    # Empty fg_inds should produce a valid loss of zero because reduction=sum.
    fg_inds = nonzero_tuple((self.gt_classes >= 0) & (self.gt_classes < bg_class_ind))[0]

    if cls_agnostic_bbox_reg:
        # pred_proposal_deltas only corresponds to foreground class for agnostic
        gt_class_cols = torch.arange(box_dim, device=device)
    else:
        # pred_proposal_deltas for class k are located in columns [b * k : b * k + b],
        # where b is the dimension of box representation (4 or 5)
        # Note that compared to Detectron1,
        # we do not perform bounding box regression for background classes.
        gt_class_cols = box_dim * self.gt_classes[fg_inds, None] + torch.arange(
            box_dim, device=device
        )

    if self.box_reg_loss_type == "smooth_l1":
        gt_proposal_deltas = self.box2box_transform.get_deltas(
            self.proposals.tensor, self.gt_boxes.tensor
        )
        loss_box_reg = smooth_l1_loss(
            self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols],
            gt_proposal_deltas[fg_inds],
            self.smooth_l1_beta,
            reduction="sum",
        )
    elif self.box_reg_loss_type == "giou":
        fg_pred_boxes = self.box2box_transform.apply_deltas(
            self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols],
            self.proposals.tensor[fg_inds],
        )
        loss_box_reg = giou_loss(
            fg_pred_boxes,
            self.gt_boxes.tensor[fg_inds],
            reduction="sum",
        )
    else:
        raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'")

    loss_box_reg = loss_box_reg / self.gt_classes.numel()
    return loss_box_reg
def loss_fn(preds: torch.Tensor, gt: torch.Tensor):
    """
    Args:
        preds: torch.Tensor of shape (nb_tracks, 4)
        gt: torch.Tensor of shape (nb_tracks, 4)
    Returns:
        loss
    """
    # return smooth_l1_loss(preds, gt, 0.05)
    return giou_loss(preds, gt)
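# Example call with two tracks in (x1, y1, x2, y2) format; the coordinates are
# illustrative. Note that giou_loss (as in fvcore/detectron2) defaults to
# reduction="none", so loss_fn returns one value per track rather than a
# scalar; callers that need a scalar should reduce it themselves.
import torch

preds = torch.tensor([[0., 0., 10., 10.], [5., 5., 15., 15.]])
gt = torch.tensor([[1., 1., 11., 11.], [5., 5., 15., 15.]])
per_track_loss = loss_fn(preds, gt)   # shape (2,); second entry is 0 (exact match)
scalar_loss = per_track_loss.mean()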
def test_giou_loss(self) -> None:
    # Identical boxes should have loss of 0
    box = torch.tensor([-1, -1, 1, 1], dtype=torch.float32)
    loss = giou_loss(box, box)
    self.assertTrue(np.allclose(loss, [0.0]))

    # quarter size box inside other box = IoU of 0.25
    box2 = torch.tensor([0, 0, 1, 1], dtype=torch.float32)
    loss = giou_loss(box, box2)
    self.assertTrue(np.allclose(loss, [0.75]))

    # Two side by side boxes, area=union
    # IoU=0 and GIoU=0 (loss 1.0)
    box3 = torch.tensor([0, 1, 1, 2], dtype=torch.float32)
    loss = giou_loss(box2, box3)
    self.assertTrue(np.allclose(loss, [1.0]))

    # Two diagonally adjacent boxes, area=2*union
    # IoU=0 and GIoU=-0.5 (loss 1.5)
    box4 = torch.tensor([1, 1, 2, 2], dtype=torch.float32)
    loss = giou_loss(box2, box4)
    self.assertTrue(np.allclose(loss, [1.5]))

    # Test batched loss and reductions
    box1s = torch.stack([box2, box2], dim=0)
    box2s = torch.stack([box3, box4], dim=0)
    loss = giou_loss(box1s, box2s, reduction="sum")
    self.assertTrue(np.allclose(loss, [2.5]))
    loss = giou_loss(box1s, box2s, reduction="mean")
    self.assertTrue(np.allclose(loss, [1.25]))
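# Working the diagonal case above out by hand from the GIoU definition
#   GIoU = IoU - |C \ (A u B)| / |C|,  loss = 1 - GIoU,
# where C is the smallest box enclosing both A and B:
#   A = [0, 0, 1, 1], B = [1, 1, 2, 2]  ->  intersection = 0, so IoU = 0
#   union |A u B| = 1 + 1 = 2;  enclosing box C = [0, 0, 2, 2], so |C| = 4
#   GIoU = 0 - (4 - 2) / 4 = -0.5  ->  loss = 1 - (-0.5) = 1.5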
def box_reg_loss(self):
    """
    Changes the `_no_instances` handling and the loss normalization
    (see `fix_norm_reg` below).
    """
    if self._no_instances:
        print("No instance in box reg loss")
        return self.pred_proposal_deltas.sum() * 0.0

    box_dim = self.gt_boxes.tensor.size(1)  # 4 or 5
    cls_agnostic_bbox_reg = self.pred_proposal_deltas.size(1) == box_dim
    device = self.pred_proposal_deltas.device

    bg_class_ind = self.pred_class_logits.shape[1] - 1
    fg_inds = nonzero_tuple((self.gt_classes >= 0) & (self.gt_classes < bg_class_ind))[0]

    if cls_agnostic_bbox_reg:
        gt_class_cols = torch.arange(box_dim, device=device)
    else:
        fg_gt_classes = self.gt_classes[fg_inds]
        gt_class_cols = box_dim * fg_gt_classes[:, None] + torch.arange(
            box_dim, device=device
        )

    if self.box_reg_loss_type == "smooth_l1":
        gt_proposal_deltas = self.box2box_transform.get_deltas(
            self.proposals.tensor, self.gt_boxes.tensor
        )
        loss_box_reg = smooth_l1_loss(
            self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols],
            gt_proposal_deltas[fg_inds],
            self.smooth_l1_beta,
            reduction="sum",
        )
    elif self.box_reg_loss_type == "giou":
        loss_box_reg = giou_loss(
            self._predict_boxes()[fg_inds[:, None], gt_class_cols],
            self.gt_boxes.tensor[fg_inds],
            reduction="sum",
        )
    else:
        raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'")

    if self.fix_norm_reg:
        loss_box_reg = loss_box_reg / self.box_batch_size
    else:
        loss_box_reg = loss_box_reg / self.gt_classes.numel()
    return loss_box_reg
def box_reg_loss(self):
    """
    Compute the box regression loss (smooth L1 or GIoU).
    Returns:
        scalar Tensor
    """
    if self._no_instances:
        return 0.0 * self.pred_proposal_deltas.sum()

    box_dim = self.gt_boxes.tensor.size(1)  # 4 or 5
    cls_agnostic_bbox_reg = self.pred_proposal_deltas.size(1) == box_dim
    device = self.pred_proposal_deltas.device

    bg_class_ind = self.pred_category_score.shape[1] - 1
    # Box delta loss is only computed between the prediction for the gt class k
    # (if 0 <= k < bg_class_ind) and the target; there is no loss defined on predictions
    # for non-gt classes and background.
    # Empty fg_inds produces a valid loss of zero as long as the size_average
    # arg to smooth_l1_loss is False (otherwise it uses torch.mean internally
    # and would produce a nan loss).
    fg_inds = nonzero_tuple((self.gt_classes >= 0) & (self.gt_classes < bg_class_ind))[0]

    if cls_agnostic_bbox_reg:
        # pred_proposal_deltas only corresponds to foreground class for agnostic
        gt_class_cols = torch.arange(box_dim, device=device)
    else:
        fg_gt_classes = self.gt_classes[fg_inds]
        # pred_proposal_deltas for class k are located in columns [b * k : b * k + b],
        # where b is the dimension of box representation (4 or 5)
        # Note that compared to Detectron1,
        # we do not perform bounding box regression for background classes.
        gt_class_cols = box_dim * fg_gt_classes[:, None] + torch.arange(
            box_dim, device=device
        )

    if self.box_reg_loss_type == "smooth_l1":
        gt_proposal_deltas = self.box2box_transform.get_deltas(
            self.proposals.tensor, self.gt_boxes.tensor
        )
        loss_box_reg = smooth_l1_loss(
            self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols],
            gt_proposal_deltas[fg_inds],
            self.smooth_l1_beta,
            reduction="sum",
        )
    elif self.box_reg_loss_type == "giou":
        loss_box_reg = giou_loss(
            self._predict_boxes()[fg_inds[:, None], gt_class_cols],
            self.gt_boxes.tensor[fg_inds],
            reduction="sum",
        )
    else:
        raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'")

    # The loss is normalized using the total number of regions (R), not the number
    # of foreground regions even though the box regression loss is only defined on
    # foreground regions. Why? Because doing so gives equal training influence to
    # each foreground example. To see how, consider two different minibatches:
    #  (1) Contains a single foreground region
    #  (2) Contains 100 foreground regions
    # If we normalize by the number of foreground regions, the single example in
    # minibatch (1) will be given 100 times as much influence as each foreground
    # example in minibatch (2). Normalizing by the total number of regions, R,
    # means that the single example in minibatch (1) and each of the 100 examples
    # in minibatch (2) are given equal influence.
    loss_box_reg = loss_box_reg * self.box_reg_loss_weight / self.gt_classes.numel()
    return loss_box_reg
def losses(self, anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes):
    """
    Args:
        anchors (list[Boxes]): a list of #feature level Boxes
        gt_labels, gt_boxes: see output of :meth:`RetinaNet.label_anchors`.
            Their shapes are (N, R) and (N, R, 4), respectively, where R is
            the total number of anchors across levels, i.e. sum(Hi x Wi x Ai)
        pred_logits, pred_anchor_deltas: both are list[Tensor]. Each element in the
            list corresponds to one level and has shape (N, Hi * Wi * Ai, K or 4),
            where K is the number of classes used in `pred_logits`.

    Returns:
        dict[str, Tensor]: mapping from a named loss to a scalar tensor storing the
            loss. Used during training only. The dict keys are: "loss_cls" and
            "loss_box_reg"
    """
    num_images = len(gt_labels)
    gt_labels = torch.stack(gt_labels)  # (N, R)
    anchors = type(anchors[0]).cat(anchors).tensor  # (R, 4)
    gt_anchor_deltas = [self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
    gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, R, 4)

    valid_mask = gt_labels >= 0
    pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes)
    num_pos_anchors = pos_mask.sum().item()
    get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images)
    self.loss_normalizer = self.loss_normalizer_momentum * self.loss_normalizer + (
        1 - self.loss_normalizer_momentum
    ) * max(num_pos_anchors, 1)

    # classification and regression loss
    gt_labels_target = F.one_hot(gt_labels[valid_mask], num_classes=self.num_classes + 1)[
        :, :-1
    ]  # no loss for the last (background) class
    loss_cls = sigmoid_focal_loss_jit(
        cat(pred_logits, dim=1)[valid_mask],
        gt_labels_target.to(pred_logits[0].dtype),
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    )

    if self.box_reg_loss_type == "smooth_l1":
        loss_box_reg = smooth_l1_loss(
            cat(pred_anchor_deltas, dim=1)[pos_mask],
            gt_anchor_deltas[pos_mask],
            beta=self.smooth_l1_beta,
            reduction="sum",
        )
    elif self.box_reg_loss_type == "giou":
        pred_boxes = [
            self.box2box_transform.apply_deltas(k, anchors)
            for k in cat(pred_anchor_deltas, dim=1)
        ]
        loss_box_reg = giou_loss(
            torch.stack(pred_boxes)[pos_mask], torch.stack(gt_boxes)[pos_mask], reduction="sum"
        )
    else:
        raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'")

    return {
        "loss_cls": loss_cls / self.loss_normalizer,
        "loss_box_reg": loss_box_reg / self.loss_normalizer,
    }
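# The loss normalizer above is an exponential moving average (EMA) of the
# per-batch foreground-anchor count; it smooths batches with very few (or no)
# positives. A standalone sketch with an assumed momentum of 0.9 and made-up
# per-batch counts:
loss_normalizer = 100.0   # initial estimate of foreground anchors per batch
momentum = 0.9
for num_pos_anchors in [120, 3, 0, 95]:
    loss_normalizer = momentum * loss_normalizer + (1 - momentum) * max(num_pos_anchors, 1)
# Dividing by this EMA instead of max(num_pos_anchors, 1) keeps the loss scale
# stable even on the batches with 3 or 0 positive anchors.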
def losses(
    self,
    anchors: List[Boxes],
    pred_objectness_logits: List[torch.Tensor],
    gt_labels: List[torch.Tensor],
    pred_anchor_deltas: List[torch.Tensor],
    gt_boxes: List[torch.Tensor],
) -> Dict[str, torch.Tensor]:
    """
    Return the losses from a set of RPN predictions and their associated ground-truth.

    Args:
        anchors (list[Boxes or RotatedBoxes]): anchors for each feature map, each
            has shape (Hi*Wi*A, B), where B is box dimension (4 or 5).
        pred_objectness_logits (list[Tensor]): A list of L elements.
            Element i is a tensor of shape (N, Hi*Wi*A) representing
            the predicted objectness logits for all anchors.
        gt_labels (list[Tensor]): Output of :meth:`label_and_sample_anchors`.
        pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is a tensor
            of shape (N, Hi*Wi*A, 4 or 5) representing the predicted "deltas" used to
            transform anchors to proposals.
        gt_boxes (list[Tensor]): Output of :meth:`label_and_sample_anchors`.

    Returns:
        dict[loss name -> loss value]: A dict mapping from loss name to loss value.
            Loss names are: `loss_rpn_cls` for objectness classification and
            `loss_rpn_loc` for proposal localization.
    """
    num_images = len(gt_labels)
    gt_labels = torch.stack(gt_labels)  # (N, sum(Hi*Wi*Ai))

    # Log the number of positive/negative anchors per-image that's used in training
    pos_mask = gt_labels == 1
    num_pos_anchors = pos_mask.sum().item()
    num_neg_anchors = (gt_labels == 0).sum().item()
    storage = get_event_storage()
    storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / num_images)
    storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / num_images)

    if self.box_reg_loss_type == "smooth_l1":
        anchors = type(anchors[0]).cat(anchors).tensor  # Ax(4 or 5)
        gt_anchor_deltas = [self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
        gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, sum(Hi*Wi*Ai), 4 or 5)
        localization_loss = smooth_l1_loss(
            cat(pred_anchor_deltas, dim=1)[pos_mask],
            gt_anchor_deltas[pos_mask],
            self.smooth_l1_beta,
            reduction="sum",
        )
    elif self.box_reg_loss_type == "giou":
        pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas)
        pred_proposals = cat(pred_proposals, dim=1)
        pred_proposals = pred_proposals.view(-1, pred_proposals.shape[-1])
        pos_mask = pos_mask.view(-1)
        localization_loss = giou_loss(
            pred_proposals[pos_mask], cat(gt_boxes)[pos_mask], reduction="sum"
        )
    elif self.box_reg_loss_type == "diou":
        anchors = type(anchors[0]).cat(anchors).tensor  # Ax(4 or 5)
        gt_anchor_deltas = [self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
        gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, sum(Hi*Wi*Ai), 4 or 5)
        localization_loss = compute_diou(
            cat(pred_anchor_deltas, dim=1)[pos_mask],
            gt_anchor_deltas[pos_mask],
            self.box2box_transform.weights,
            self.box2box_transform.scale_clamp,
        )
    # elif self.box_reg_loss_type == "diou_bbox":
    #     pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas)
    #     pred_proposals = cat(pred_proposals, dim=1)
    #     pred_proposals = pred_proposals.view(-1, pred_proposals.shape[-1])
    #     pos_mask = pos_mask.view(-1)
    #     localization_loss = giou_loss(
    #         pred_proposals[pos_mask], cat(gt_boxes)[pos_mask]
    #     )
    elif self.box_reg_loss_type == "diou_mmdet":
        pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas)
        pred_proposals = cat(pred_proposals, dim=1)
        pred_proposals = pred_proposals.view(-1, pred_proposals.shape[-1])
        pos_mask = pos_mask.view(-1)
        localization_loss = compute_diou_mmdet(
            pred_proposals[pos_mask], cat(gt_boxes)[pos_mask]
        )
    elif self.box_reg_loss_type == "ciou_mmdet":
        pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas)
        pred_proposals = cat(pred_proposals, dim=1)
        pred_proposals = pred_proposals.view(-1, pred_proposals.shape[-1])
        pos_mask = pos_mask.view(-1)
        localization_loss = compute_ciou_mmdet(
            pred_proposals[pos_mask], cat(gt_boxes)[pos_mask]
        )
    else:
        raise ValueError(f"Invalid rpn box reg loss type '{self.box_reg_loss_type}'")

    valid_mask = gt_labels >= 0
    objectness_loss = F.binary_cross_entropy_with_logits(
        cat(pred_objectness_logits, dim=1)[valid_mask],
        gt_labels[valid_mask].to(torch.float32),
        reduction="sum",
    )
    normalizer = self.batch_size_per_image * num_images
    losses = {
        "loss_rpn_cls": objectness_loss / normalizer,
        "loss_rpn_loc": localization_loss / normalizer,
    }
    losses = {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()}
    return losses
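# For reference, the DIoU/CIoU penalties used by the variants above (following
# Zheng et al., "Distance-IoU Loss"):
#   L_DIoU = 1 - IoU + rho^2(b, b_gt) / c^2
#     where rho is the distance between the two box centers and c is the
#     diagonal length of the smallest box enclosing both boxes;
#   L_CIoU = L_DIoU + alpha * v
#     where v = (4 / pi^2) * (arctan(w_gt / h_gt) - arctan(w / h))^2 measures
#     aspect-ratio consistency and alpha = v / ((1 - IoU) + v).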
def losses(self, anchors, pred_logits, pred_boxes_init, pred_anchor_deltas,
           gt_instances, point_centers, strides):
    """
    Args:
        anchors (list[Boxes]): a list of #feature level Boxes
        pred_logits, pred_anchor_deltas: both are list[Tensor]. Each element in the
            list corresponds to one level and has shape (N, Hi * Wi * Ai, K or 4),
            where K is the number of classes used in `pred_logits`.
        pred_boxes_init: initial box predictions at the point centers.
        gt_instances: ground-truth instances; per-anchor labels and boxes are derived
            from them via :meth:`label_anchors` and :meth:`get_ground_truth`. The
            derived shapes are (N, R) and (N, R, 4), where R is the total number of
            anchors across levels, i.e. sum(Hi x Wi x Ai).
        point_centers, strides: point locations and strides used for the init stage.

    Returns:
        dict[str, Tensor]: mapping from a named loss to a scalar tensor storing the
            loss. Used during training only. The dict keys are: "loss_cls",
            "loss_loc_init" and "loss_loc_refine"
    """
    gt_labels, gt_boxes = self.label_anchors(anchors, gt_instances)
    gt_labels_init, gt_boxes_init = self.get_ground_truth(
        point_centers, strides, gt_instances
    )

    # Transpose the Hi*Wi*A dimension to the middle:
    pred_logits = [permute_to_N_HWA_K(x, self.num_classes) for x in pred_logits]
    pred_anchor_deltas = [permute_to_N_HWA_K(x, 4) for x in pred_anchor_deltas]

    num_images = len(gt_labels)
    gt_labels = torch.stack(gt_labels)  # (N, R)
    anchors = type(anchors[0]).cat(anchors).tensor  # (R, 4)
    gt_anchor_deltas = [self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
    gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, R, 4)

    valid_mask = gt_labels >= 0
    pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes)
    num_pos_anchors = pos_mask.sum().item()
    get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images)
    self.loss_normalizer = self.loss_normalizer_momentum * self.loss_normalizer + (
        1 - self.loss_normalizer_momentum
    ) * max(num_pos_anchors, 1)

    # classification and regression loss
    gt_labels_target = F.one_hot(gt_labels[valid_mask], num_classes=self.num_classes + 1)[
        :, :-1
    ]  # no loss for the last (background) class
    loss_cls = sigmoid_focal_loss_jit(
        cat(pred_logits, dim=1)[valid_mask],
        gt_labels_target.to(pred_logits[0].dtype),
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    ) * self.loss_cls_weight

    init_foreground_idxs = gt_labels_init > 0
    strides = strides[None].repeat(pred_logits[0].shape[0], 1)
    coords_norm_init = strides[init_foreground_idxs].unsqueeze(-1) * 4
    loss_loc_init = smooth_l1_loss(
        pred_boxes_init[init_foreground_idxs] / coords_norm_init,
        gt_boxes_init[init_foreground_idxs] / coords_norm_init,
        beta=0.11,
        reduction="sum",
    ) / max(init_foreground_idxs.sum(), 1)

    if self.box_reg_loss_type == "smooth_l1":
        loss_loc_refine = smooth_l1_loss(
            cat(pred_anchor_deltas, dim=1)[pos_mask],
            gt_anchor_deltas[pos_mask],
            beta=0.11,
            reduction="sum",
        )
    elif self.box_reg_loss_type == "giou":
        pred_boxes = [
            self.box2box_transform.apply_deltas(k, anchors)
            for k in cat(pred_anchor_deltas, dim=1)
        ]
        loss_loc_refine = giou_loss(
            torch.stack(pred_boxes)[pos_mask], torch.stack(gt_boxes)[pos_mask], reduction="sum"
        )
    else:
        raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'")

    return {
        "loss_cls": loss_cls / self.loss_normalizer,
        "loss_loc_init": loss_loc_init * self.loss_loc_init_weight,
        "loss_loc_refine": loss_loc_refine / self.loss_normalizer * self.loss_loc_refine_weight,
    }
def forward(self, indices, gt_instances, anchors, pred_class_logits, pred_anchor_deltas):
    pred_class_logits = cat(pred_class_logits, dim=1).view(-1, self.num_classes)
    pred_anchor_deltas = cat(pred_anchor_deltas, dim=1).view(-1, 4)

    anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]  # list[Boxes(R, 4)], one per image
    N = len(anchors)
    all_anchors = Boxes.cat(anchors).tensor  # (N*R, 4)
    predicted_boxes = self.box2box_transform.apply_deltas(pred_anchor_deltas, all_anchors)
    predicted_boxes = predicted_boxes.reshape(N, -1, 4)

    ious = []
    pos_ious = []
    for i in range(N):
        src_idx, tgt_idx = indices[i]
        iou = box_iou(predicted_boxes[i, ...], gt_instances[i].gt_boxes.tensor)
        if iou.numel() == 0:
            max_iou = iou.new_full((iou.size(0),), 0)
        else:
            max_iou = iou.max(dim=1)[0]
        a_iou = box_iou(anchors[i].tensor, gt_instances[i].gt_boxes.tensor)
        if a_iou.numel() == 0:
            pos_iou = a_iou.new_full((0,), 0)
        else:
            pos_iou = a_iou[src_idx, tgt_idx]
        ious.append(max_iou)
        pos_ious.append(pos_iou)
    ious = torch.cat(ious)
    ignore_idx = ious > self.neg_ignore_thresh
    pos_ious = torch.cat(pos_ious)
    pos_ignore_idx = pos_ious < self.pos_ignore_thresh

    src_idx = torch.cat([
        src + idx * anchors[0].tensor.shape[0]
        for idx, (src, _) in enumerate(indices)
    ])
    gt_classes = torch.full(
        pred_class_logits.shape[:1],
        self.num_classes,
        dtype=torch.int64,
        device=pred_class_logits.device,
    )
    gt_classes[ignore_idx] = -1
    target_classes_o = torch.cat(
        [t.gt_classes[J] for t, (_, J) in zip(gt_instances, indices)]
    )
    target_classes_o[pos_ignore_idx] = -1
    gt_classes[src_idx] = target_classes_o

    valid_idxs = gt_classes >= 0
    foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
    num_foreground = foreground_idxs.sum()

    gt_classes_target = torch.zeros_like(pred_class_logits)
    gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1

    if comm.get_world_size() > 1:
        dist.all_reduce(num_foreground)
    num_foreground = num_foreground * 1.0 / comm.get_world_size()

    # cls loss
    loss_cls = sigmoid_focal_loss_jit(
        pred_class_logits[valid_idxs],
        gt_classes_target[valid_idxs],
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    )
    # reg loss
    target_boxes = torch.cat(
        [t.gt_boxes.tensor[i] for t, (_, i) in zip(gt_instances, indices)], dim=0
    )
    target_boxes = target_boxes[~pos_ignore_idx]
    matched_predicted_boxes = predicted_boxes.reshape(-1, 4)[src_idx[~pos_ignore_idx]]
    loss_box_reg = giou_loss(matched_predicted_boxes, target_boxes, reduction="sum")

    return {
        "loss_cls": loss_cls / max(1, num_foreground),
        "loss_box_reg": loss_box_reg / max(1, num_foreground),
    }
def losses(self, indices, gt_instances, anchors, pred_class_logits, pred_anchor_deltas):
    pred_class_logits = cat(pred_class_logits, dim=1).view(-1, self.num_classes)
    pred_anchor_deltas = cat(pred_anchor_deltas, dim=1).view(-1, 4)

    anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]  # list[Boxes(R, 4)], one per image
    N = len(anchors)
    all_anchors = Boxes.cat(anchors).tensor  # (N*R, 4)
    predicted_boxes = self.box2box_transform.apply_deltas(pred_anchor_deltas, all_anchors)
    predicted_boxes = predicted_boxes.reshape(N, -1, 4)

    # We obtain positive anchors by choosing each gt box's k nearest anchors
    # and leave the rest as negative anchors. However, there may exist
    # negative anchors at similar distances to the chosen positives. These
    # negatives may cause ambiguity for model training if we simply treat them
    # as negatives. Given that we want the model's predicted boxes on negative
    # anchors to have low IoU with gt boxes, we set a threshold on the IoU
    # between predicted boxes and gt boxes instead of the IoU between anchor
    # boxes and gt boxes.
    ious = []
    pos_ious = []
    for i in range(N):
        src_idx, tgt_idx = indices[i]
        iou = box_iou(predicted_boxes[i, ...], gt_instances[i].gt_boxes.tensor)
        if iou.numel() == 0:
            max_iou = iou.new_full((iou.size(0),), 0)
        else:
            max_iou = iou.max(dim=1)[0]
        a_iou = box_iou(anchors[i].tensor, gt_instances[i].gt_boxes.tensor)
        if a_iou.numel() == 0:
            pos_iou = a_iou.new_full((0,), 0)
        else:
            pos_iou = a_iou[src_idx, tgt_idx]
        ious.append(max_iou)
        pos_ious.append(pos_iou)
    ious = torch.cat(ious)
    ignore_idx = ious > self.neg_ignore_thresh
    pos_ious = torch.cat(pos_ious)
    pos_ignore_idx = pos_ious < self.pos_ignore_thresh

    src_idx = torch.cat([
        src + idx * anchors[0].tensor.shape[0]
        for idx, (src, _) in enumerate(indices)
    ])
    gt_classes = torch.full(
        pred_class_logits.shape[:1],
        self.num_classes,
        dtype=torch.int64,
        device=pred_class_logits.device,
    )
    gt_classes[ignore_idx] = -1
    target_classes_o = torch.cat(
        [t.gt_classes[J] for t, (_, J) in zip(gt_instances, indices)]
    )
    target_classes_o[pos_ignore_idx] = -1
    gt_classes[src_idx] = target_classes_o

    valid_idxs = gt_classes >= 0
    foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
    num_foreground = foreground_idxs.sum()

    gt_classes_target = torch.zeros_like(pred_class_logits)
    gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1

    if comm.get_world_size() > 1:
        dist.all_reduce(num_foreground)
    num_foreground = num_foreground * 1.0 / comm.get_world_size()

    # cls loss
    loss_cls = sigmoid_focal_loss_jit(
        pred_class_logits[valid_idxs],
        gt_classes_target[valid_idxs],
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    )
    # reg loss
    target_boxes = torch.cat(
        [t.gt_boxes.tensor[i] for t, (_, i) in zip(gt_instances, indices)], dim=0
    )
    target_boxes = target_boxes[~pos_ignore_idx]
    matched_predicted_boxes = predicted_boxes.reshape(-1, 4)[src_idx[~pos_ignore_idx]]
    loss_box_reg = giou_loss(matched_predicted_boxes, target_boxes, reduction="sum")

    return {
        "loss_cls": loss_cls / max(1, num_foreground),
        "loss_box_reg": loss_box_reg / max(1, num_foreground),
    }
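# A toy walk-through of the two ignore thresholds above (threshold values
# assumed for illustration, e.g. neg_ignore_thresh=0.7 and
# pos_ignore_thresh=0.15 as in the YOLOF configs):
#   ious     = [0.05, 0.72, 0.40]     # max IoU of each predicted box vs. any gt
#   ignore_idx = ious > 0.7           # -> [F, T, F]: box 1 is a "negative" whose
#                                     #    prediction already overlaps a gt well,
#                                     #    so it is excluded from the cls loss
#   pos_ious = [0.30, 0.10]           # anchor-vs-matched-gt IoU of each positive
#   pos_ignore_idx = pos_ious < 0.15  # -> [F, T]: match 1 is too poor to act as
#                                     #    supervision and is dropped from both
#                                     #    the cls and reg targets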
def losses(self, anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes,
           pred_mark_deltas, gt_marks, gt_marks_labels):
    """
    Args:
        For `gt_classes`, `gt_anchors_deltas`, `gt_landmarks_deltas` and
        `gt_landmarks_labels` parameters, see :meth:`RetinaNet.get_ground_truth`.
        Their shapes are (N, R), (N, R, 4), (N, R, num_landmark * 2) and (N, R),
        respectively, where R is the total number of anchors across levels,
        i.e. sum(Hi x Wi x A).
        For `pred_class_logits`, `pred_anchor_deltas` and `pred_landmark_deltas`,
        see :meth:`RetinaNetHead.forward`.

    Returns:
        dict[str: Tensor]: mapping from a named loss to a scalar tensor storing the
            loss. Used during training only. The dict keys are: "loss_cls",
            "loss_box_reg" and "loss_landmark_reg"
    """
    num_images = len(gt_labels)
    gt_labels = torch.stack(gt_labels)  # (N, R)
    anchors = type(anchors[0]).cat(anchors).tensor  # (R, 4)
    gt_anchor_deltas = [self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
    gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, R, 4)
    gt_landmark_deltas = [self.mark2mark_transform.get_deltas(anchors, k) for k in gt_marks]
    gt_landmark_deltas = torch.stack(gt_landmark_deltas)  # (N, R, Marks * 2)

    valid_mask = gt_labels >= 0
    pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes)
    num_pos_anchors = pos_mask.sum().item()
    get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images)
    self.loss_normalizer = self.loss_normalizer_momentum * self.loss_normalizer + (
        1 - self.loss_normalizer_momentum
    ) * max(num_pos_anchors, 1)

    # classification and regression loss
    gt_labels_target = F.one_hot(gt_labels[valid_mask], num_classes=self.num_classes + 1)[
        :, :-1
    ]  # no loss for the last (background) class
    loss_cls = sigmoid_focal_loss_jit(
        cat(pred_logits, dim=1)[valid_mask],
        gt_labels_target.to(pred_logits[0].dtype),
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    )

    if self.box_reg_loss_type == "smooth_l1":
        loss_box_reg = smooth_l1_loss(
            cat(pred_anchor_deltas, dim=1)[pos_mask],
            gt_anchor_deltas[pos_mask],
            beta=self.smooth_l1_beta,
            reduction="sum",
        )
    elif self.box_reg_loss_type == "giou":
        pred_boxes = [
            self.box2box_transform.apply_deltas(k, anchors)
            for k in cat(pred_anchor_deltas, dim=1)
        ]
        loss_box_reg = giou_loss(
            torch.stack(pred_boxes)[pos_mask], torch.stack(gt_boxes)[pos_mask], reduction="sum"
        )
    else:
        raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'")
    loss_box_reg = loss_box_reg * self.loc_weight

    # keypoints regression loss
    # NOTE: filter out invalid keypoints
    gt_marks_labels = torch.stack(gt_marks_labels)
    marks_pos_mask = pos_mask & (gt_marks_labels > 0)
    # NOTE: the loss normalizer for landmarks may not be consistent with the
    # score or bbox losses
    loss_marks_reg = smooth_l1_loss(
        cat(pred_mark_deltas, dim=1)[marks_pos_mask],
        gt_landmark_deltas[marks_pos_mask],
        beta=self.smooth_l1_beta,
        reduction="sum",
    )

    return {
        "loss_cls": loss_cls / self.loss_normalizer,
        "loss_box_reg": loss_box_reg / self.loss_normalizer,
        "loss_landmark_reg": loss_marks_reg / self.loss_normalizer,
    }
def losses(self,
           anchors,
           pred_objectness_logits: List[torch.Tensor],
           gt_labels: List[torch.Tensor],
           pred_anchor_deltas: List[torch.Tensor],
           gt_boxes: List[torch.Tensor],
           integral_sem_seg_targets=None):
    """
    Return the losses from a set of RPN predictions and their associated ground-truth.

    Args:
        anchors (list[Boxes or RotatedBoxes]): anchors for each feature map, each
            has shape (Hi*Wi*A, B), where B is box dimension (4 or 5).
        pred_objectness_logits (list[Tensor]): A list of L elements.
            Element i is a tensor of shape (N, Hi*Wi*A) representing
            the predicted objectness logits for all anchors.
        gt_labels (list[Tensor]): Output of :meth:`label_and_sample_anchors`.
        pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is a tensor
            of shape (N, Hi*Wi*A, 4 or 5) representing the predicted "deltas" used to
            transform anchors to proposals.
        gt_boxes (list[Tensor]): Output of :meth:`label_and_sample_anchors`.

    Returns:
        dict[loss name -> loss value]: A dict mapping from loss name to loss value.
            Loss names are: `loss_rpn_cls` for objectness classification and
            `loss_rpn_loc` for proposal localization.
    """
    num_images = len(gt_labels)
    original_gt_labels = gt_labels
    gt_labels = torch.stack(gt_labels)  # (N, sum(Hi*Wi*Ai))

    # Log the number of positive/negative anchors per-image that's used in training
    pos_mask = gt_labels == 1
    num_pos_anchors = pos_mask.sum().item()
    num_neg_anchors = (gt_labels == 0).sum().item()
    storage = get_event_storage()
    storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / num_images)
    storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / num_images)

    if self.box_reg_loss_type == "smooth_l1":
        anchors = type(anchors[0]).cat(anchors).tensor  # Ax(4 or 5)
        gt_anchor_deltas = [self.box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
        gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, sum(Hi*Wi*Ai), 4 or 5)
        localization_loss = smooth_l1_loss(
            cat(pred_anchor_deltas, dim=1)[pos_mask],
            gt_anchor_deltas[pos_mask],
            self.smooth_l1_beta,
            reduction="sum",
        )
    elif self.box_reg_loss_type == "giou":
        pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas)
        pred_proposals = cat(pred_proposals, dim=1)
        pred_proposals = pred_proposals.view(-1, pred_proposals.shape[-1])
        pos_mask = pos_mask.view(-1)
        localization_loss = giou_loss(
            pred_proposals[pos_mask], cat(gt_boxes)[pos_mask], reduction="sum"
        )
    else:
        raise ValueError(f"Invalid rpn box reg loss type '{self.box_reg_loss_type}'")

    valid_mask = gt_labels >= 0
    if integral_sem_seg_targets is not None:
        objectness_loss = 0
        objectness_logits = cat(pred_objectness_logits, dim=1)
        normalizer = 0
        for labels, obj_logits, integral_sem_seg in zip(
                original_gt_labels, objectness_logits, integral_sem_seg_targets):
            valid_mask = labels >= 0
            valid_bbox = anchors[valid_mask]
            valid_label = labels[valid_mask]
            valid_obj_logits = obj_logits[valid_mask]
            _, filtered_idx = add_unlabeled_class(
                valid_bbox, valid_label * 2, integral_sem_seg, bg=0
            )
            objectness_loss += F.binary_cross_entropy_with_logits(
                valid_obj_logits[filtered_idx],
                valid_label[filtered_idx].to(torch.float32),
                reduction="sum",
            )
            normalizer += sum(filtered_idx)
        if normalizer == 0:
            normalizer = self.batch_size_per_image * num_images
    else:
        objectness_loss = F.binary_cross_entropy_with_logits(
            cat(pred_objectness_logits, dim=1)[valid_mask],
            gt_labels[valid_mask].to(torch.float32),
            reduction="sum",
        )
        normalizer = self.batch_size_per_image * num_images

    loss_rpn_cls = objectness_loss / normalizer
    loss_rpn_loc = localization_loss / normalizer
    return {
        "loss_rpn_cls": loss_rpn_cls.clamp(max=10),
        "loss_rpn_loc": loss_rpn_loc.clamp(max=10),
    }