def get_ground_truth(self, anchors, bbox_preds, targets):
    """
    Match decoded box predictions (and their anchors) against the targets.

    Args:
        anchors: one entry per image; each entry is a sequence of `Boxes`
            that is merged with `Boxes.cat`.
        bbox_preds: sequence of predicted box deltas, concatenated along
            dim 1 before decoding.
        targets: forwarded unchanged to `self.matcher`.

    Returns:
        Whatever `self.matcher(box_pred, all_anchors, targets)` returns.
    """
    # One merged Boxes per image.
    merged_per_image = [Boxes.cat(level_boxes) for level_boxes in anchors]
    num_images = len(merged_per_image)

    # Flatten every image into one tensor, then regroup as (N, -1, 4).
    stacked_anchors = Boxes.cat(merged_per_image).tensor.reshape(
        num_images, -1, 4)

    deltas = cat(bbox_preds, dim=1)

    # Original author's note: box_pred is xyxy and targets are xyxy.
    decoded_boxes = self.box2box_transform.apply_deltas(
        deltas, stacked_anchors)
    return self.matcher(decoded_boxes, stacked_anchors, targets)
def inference_single_image(self, conf_pred_per_image, loc_pred_per_image,
                           default_boxes, image_size):
    """
    Run detection post-processing for one image.

    Scores are softmax-normalised, boxes are decoded from the default boxes,
    the last class column (background) is dropped, low-scoring candidates
    are filtered out and the rest go through batched NMS.

    Args:
        conf_pred_per_image (list[Tensor]): one tensor per feature level,
            each documented as [Hi x Wi x D, C].
        loc_pred_per_image (list[Tensor]): same layout as
            `conf_pred_per_image` with a last dimension of 4.
        default_boxes (list[Boxes]): default boxes of this image, one
            `Boxes` per feature level.
        image_size (tuple(H, W)): image height and width, passed to
            `Instances`.

    Returns:
        Instances: holds `pred_boxes`, `scores` and `pred_classes` for at
        most `self.max_detections_per_image` detections.
    """
    # Class confidences, [R, C].
    scores = torch.cat(conf_pred_per_image, dim=0).softmax(dim=1)

    # Decode the regression output against the default boxes, [R, 4].
    deltas = torch.cat(loc_pred_per_image, dim=0)
    priors = Boxes.cat(default_boxes)
    decoded = self.box2box_transform.apply_deltas(deltas, priors.tensor)

    num_boxes, num_classes = scores.shape

    # Give every (box, class) pair its own box copy and class id: [R, C, 4]
    # and [R, C] respectively.
    decoded = decoded.view(num_boxes, 1, 4).expand(num_boxes, num_classes, 4)
    class_ids = torch.arange(num_classes, device=self.device)
    class_ids = class_ids.view(1, num_classes).expand_as(scores)

    # Drop the trailing (background) class column, then flatten so that each
    # class prediction is an independent candidate.
    flat_boxes = decoded[:, :-1].reshape(-1, 4)
    flat_scores = scores[:, :-1].reshape(-1)
    flat_labels = class_ids[:, :-1].reshape(-1)

    # Keep only candidates above the score threshold.
    selected = torch.nonzero(flat_scores > self.score_threshold,
                             as_tuple=False).squeeze(1)
    flat_boxes = flat_boxes[selected]
    flat_scores = flat_scores[selected]
    flat_labels = flat_labels[selected]

    survivors = generalized_batched_nms(flat_boxes,
                                        flat_scores,
                                        flat_labels,
                                        self.nms_threshold,
                                        nms_type=self.nms_type)
    survivors = survivors[:self.max_detections_per_image]

    detections = Instances(image_size)
    detections.pred_boxes = Boxes(flat_boxes[survivors])
    detections.scores = flat_scores[survivors]
    detections.pred_classes = flat_labels[survivors]
    return detections
def get_ground_truth(self, anchors, targets):
    """
    Build per-anchor class labels and box regression targets.

    Args:
        anchors (list[list[Boxes]]): N = #images entries; each is a list
            with one `Boxes` per feature level holding that image's anchors.
        targets (list[Instances]): N `Instances`, the i-th one carrying the
            ground-truth annotations (`gt_boxes`, `gt_classes`) of image i.

    Returns:
        gt_classes (Tensor): shape (N, R), R being the number of anchors per
            image. Anchors the matcher labels 0 receive `self.num_classes`
            (background), anchors labelled -1 receive -1 (ignore), and the
            remaining anchors receive the class of their matched box.
        gt_anchors_deltas (Tensor): shape (N, R, 4), the box2box transform
            targets from each anchor to its matched ground-truth box. Only
            meaningful for foreground anchors; all zeros for an image
            without ground truth.
    """
    # Merge the feature levels so that each image owns a single Boxes.
    merged_anchors = [Boxes.cat(level_boxes) for level_boxes in anchors]

    class_targets = []
    delta_targets = []
    for image_anchors, image_targets in zip(merged_anchors, targets):
        iou_matrix = pairwise_iou(image_targets.gt_boxes, image_anchors)
        matched_gt_idx, match_labels = self.matcher(iou_matrix)

        if len(image_targets) == 0:
            # No annotations: every anchor is background, deltas are zero.
            labels_i = torch.zeros_like(matched_gt_idx) + self.num_classes
            deltas_i = torch.zeros_like(image_anchors.tensor)
        else:
            # Regression targets towards the matched ground-truth boxes.
            boxes_for_anchor = image_targets.gt_boxes[matched_gt_idx]
            deltas_i = self.box2box_transform.get_deltas(
                image_anchors.tensor, boxes_for_anchor.tensor)

            labels_i = image_targets.gt_classes[matched_gt_idx]
            # Matcher label 0 -> background class.
            labels_i[match_labels == 0] = self.num_classes
            # Matcher label -1 -> ignored.
            labels_i[match_labels == -1] = -1

        class_targets.append(labels_i)
        delta_targets.append(deltas_i)

    return torch.stack(class_targets), torch.stack(delta_targets)
def _get_ground_truth(self):
    """
    Compute objectness labels and regression targets for every image.

    Returns:
        gt_objectness_logits: list of N tensors. Tensor i is a vector with
            one entry per anchor of image i. Values are in {-1, 0, 1}:
            -1 = ignore; 0 = negative class; 1 = positive class.
        gt_anchor_deltas: list of N tensors. Tensor i has shape
            (len(anchors[i]), 4).
    """
    # Collapse the per-feature-map anchors into one Boxes per image.
    merged_anchors = [Boxes.cat(level_boxes) for level_boxes in self.anchors]

    objectness_per_image = []
    deltas_per_image = []
    # size_hw:       (h, w) of the image
    # image_anchors: anchors of the image
    # image_gt:      ground-truth boxes of the image
    for size_hw, image_anchors, image_gt in zip(self.image_sizes,
                                                merged_anchors,
                                                self.gt_boxes):
        safe_iou = retry_if_cuda_oom(pairwise_iou)
        iou_matrix = safe_iou(image_gt, image_anchors)
        safe_matcher = retry_if_cuda_oom(self.anchor_matcher)
        matched_idxs, objectness = safe_matcher(iou_matrix)

        # Matching is memory-hungry and may have fallen back to CPU tensors;
        # the labels are small, so move them back next to the gt boxes.
        objectness = objectness.to(device=image_gt.device)
        del iou_matrix

        if self.boundary_threshold >= 0:
            # Legacy behaviour (off by default in cvpods): ignore anchors
            # that stick out of the image by more than the threshold.
            inside = image_anchors.inside_box(size_hw,
                                              self.boundary_threshold)
            objectness[~inside] = -1

        if len(image_gt) != 0:
            # TODO wasted computation for ignored boxes
            boxes_for_anchor = image_gt[matched_idxs]
            deltas = self.box2box_transform.get_deltas(
                image_anchors.tensor, boxes_for_anchor.tensor)
        else:
            # Unused downstream: every anchor is background in this case.
            deltas = torch.zeros_like(image_anchors.tensor)

        objectness_per_image.append(objectness)
        deltas_per_image.append(deltas)

    return objectness_per_image, deltas_per_image
def point_sample_fine_grained_features(features_list, feature_scales, boxes,
                                       point_coords):
    """
    Sample feature vectors at points given in box-normalised coordinates.

    Args:
        features_list (list[Tensor]): feature maps to sample from.
        feature_scales (list[float]): one scale per tensor in
            `features_list`.
        boxes (list[Boxes]): I `Boxes` objects (one per image) holding
            R_1 + ... + R_I = R boxes in total.
        point_coords (Tensor): shape (R, P, 2), [0, 1] x [0, 1]
            box-normalised coordinates of the P points of each box.

    Returns:
        point_features (Tensor): shape (R, C, P), the features of all maps
            in `features_list` concatenated along the channel axis.
        point_coords_wrt_image (Tensor): shape (R, P, 2), the same points
            expressed in image-level coordinates.
    """
    all_boxes = Boxes.cat(boxes)
    boxes_per_image = [len(image_boxes) for image_boxes in boxes]

    point_coords_wrt_image = get_point_coords_wrt_image(
        all_boxes.tensor, point_coords)

    sampled_per_image = []
    # Split the image-level points back into one chunk per image.
    for image_idx, image_points in enumerate(
            torch.split(point_coords_wrt_image, boxes_per_image)):
        sampled_per_level = []
        for level_idx, level_features in enumerate(features_list):
            height, width = level_features.shape[-2:]
            # Divisor that maps image coordinates onto this feature map.
            extent = torch.tensor([width, height],
                                  device=level_features.device)
            normalizer = extent / feature_scales[level_idx]
            scaled_points = image_points / normalizer

            sampled = point_sample(
                level_features[image_idx].unsqueeze(0),
                scaled_points.unsqueeze(0),
                align_corners=False,
            )
            sampled_per_level.append(sampled.squeeze(0).transpose(1, 0))
        # Stack the levels along dim 1.
        sampled_per_image.append(cat(sampled_per_level, dim=1))

    return cat(sampled_per_image, dim=0), point_coords_wrt_image
def inference(self, pred_logits, pred_deltas, pred_masks, anchors, indexes,
              images):
    """
    Run single-image inference for every image of the batch.

    Arguments:
        pred_logits, pred_deltas, pred_masks: same as the output of
            :meth:`TensorMaskHead.forward`.
        anchors, indexes: same as the input of
            :meth:`TensorMask.get_ground_truth`.
        images (ImageList): the input images.

    Returns:
        results (List[Instances]): one element per image.
    """
    assert len(anchors) == len(images)

    # Flatten every level to (N, HWA, K) and join the levels along dim 1.
    logits_by_level = [
        permute_to_N_HWA_K(level, self.num_classes) for level in pred_logits
    ]
    deltas_by_level = [permute_to_N_HWA_K(level, 4) for level in pred_deltas]
    flat_logits = cat(logits_by_level, dim=1)
    flat_deltas = cat(deltas_by_level, dim=1)

    detections = []
    for image_idx, (image_anchors, image_indexes) in enumerate(
            zip(anchors, indexes)):
        if self.mask_on:
            # Pick this image's slice from every level / anchor mask tensor.
            image_masks = [[per_anchor[image_idx] for per_anchor in per_level]
                           for per_level in pred_masks]
        else:
            image_masks = [None] * self.num_levels

        detections.append(
            self.inference_single_image(
                flat_logits[image_idx],
                flat_deltas[image_idx],
                image_masks,
                Boxes.cat(image_anchors),
                cat(image_indexes),
                tuple(images.image_sizes[image_idx]),
            ))
    return detections
def get_ground_truth(self, anchors, unit_lengths, indexes, targets):
    """
    Build classification, box-regression and mask targets for a batch.

    Args:
        anchors (list[list[Boxes]]): a list of N=#image elements. Each is a
            list of #feature level Boxes. The Boxes contains anchors of
            this image on the specific feature level.
        unit_lengths (list[list[Tensor]]): a list of N=#image elements. Each is a
            list of #feature level Tensor. The tensor contains unit lengths for
            anchors of this image on the specific feature level.
        indexes (list[list[Tensor]]): a list of N=#image elements. Each is a
            list of #feature level Tensor. The tensor contains the 5D index of
            each anchor, the second dimension means (L, N, H, W, A), where L
            is level, N is image, H is height, W is width, and A is anchor.
        targets (list[Instances]): a list of N `Instances`s. The i-th
            `Instances` contains the ground-truth per-instance annotations
            for the i-th input image.  Specify `targets` during training only.

    Returns:
        gt_class_info (Tensor, Tensor): A pair of two tensors for classification.
            The first one is a float32 one-hot tensor of shape (R, #classes)
            storing ground-truth labels for each anchor. R is the total number
            of anchors in the batch.
            The second one is a mask of shape (R,) (computed as `label >= 0`),
            to indicate which anchors are valid for loss computation, which
            anchors are not.
        gt_delta_info (Tensor, Tensor): A pair of two tensors for boxes.
            The first one, of shape (F, 4). F=#foreground anchors.
            The last dimension represents ground-truth box2box transform
            targets (dx, dy, dw, dh) that map each anchor to its matched
            ground-truth box. Only foreground anchors have values in this
            tensor. It is `None` when no image in the batch has ground truth.
            The second one, of shape (R,), is a mask indicating which anchors
            are foreground ones used for box regression.
        gt_mask_info (list[list[Tensor]], list[list[Tensor]]): A pair of two
            lists for masks.
            The first one is a list of P=#feature level elements. Each is a
            list of A=#anchor tensors. Each tensor contains the ground truth
            masks of the same size and for the same feature level. An entry is
            `None` when nothing was collected for that level/anchor.
            The second one is a list of P=#feature level elements. Each is a
            list of A=#anchor tensors. Each tensor contains the location of
            the ground truth masks of the same size and for the same feature
            level. The second dimension means (N, H, W), where N is image,
            H is height, and W is width. An entry is `None` when nothing was
            collected for that level/anchor.
        num_fg (int): F=#foreground anchors, used later for loss normalization.
    """
    gt_classes = []
    gt_deltas = []
    # gt_masks[lvl][anc] / gt_mask_inds[lvl][anc] collect per-image pieces that
    # are concatenated at the end of the function.
    gt_masks = [[[] for _ in range(self.num_anchors)]
                for _ in range(self.num_levels)]
    gt_mask_inds = [[[] for _ in range(self.num_anchors)]
                    for _ in range(self.num_levels)]

    # Merge the feature levels so that each image has a single entry.
    anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]
    unit_lengths = [cat(unit_lengths_i) for unit_lengths_i in unit_lengths]
    indexes = [cat(indexes_i) for indexes_i in indexes]

    num_fg = 0
    for i, (anchors_im, unit_lengths_im, indexes_im, targets_im) in enumerate(
            zip(anchors, unit_lengths, indexes, targets)):
        # Initialize all anchors as background (label == self.num_classes)
        gt_classes_i = torch.full_like(unit_lengths_im,
                                       self.num_classes,
                                       dtype=torch.int64,
                                       device=self.device)
        # Ground truth classes
        has_gt = len(targets_im) > 0
        if has_gt:
            # Compute the pairwise matrix
            gt_matched_inds, anchor_labels = _assignment_rule(
                targets_im.gt_boxes,
                anchors_im,
                unit_lengths_im,
                self.min_anchor_size,
            )
            # Find the foreground instances
            fg_inds = anchor_labels == 1
            fg_anchors = anchors_im[fg_inds]
            num_fg += len(fg_anchors)
            # Find the ground truths for foreground instances
            gt_fg_matched_inds = gt_matched_inds[fg_inds]
            # Assign labels for foreground instances
            gt_classes_i[fg_inds] = targets_im.gt_classes[
                gt_fg_matched_inds]
            # Anchors with label -1 are ignored, others are left as negative
            gt_classes_i[anchor_labels == -1] = -1

            # Boxes
            # Ground truth box regression, only for foregrounds
            matched_gt_boxes = targets_im[gt_fg_matched_inds].gt_boxes
            # Compute box regression offsets for foregrounds only
            gt_deltas_i = self.box2box_transform.get_deltas(
                fg_anchors.tensor, matched_gt_boxes.tensor)
            gt_deltas.append(gt_deltas_i)

            # Masks
            if self.mask_on:
                # Compute masks for each level and each anchor
                matched_indexes = indexes_im[fg_inds, :]
                for lvl in range(self.num_levels):
                    # Column 0 of the index is the level
                    ids_lvl = matched_indexes[:, 0] == lvl
                    if torch.any(ids_lvl):
                        # With bipyramid on, the mask size doubles per level
                        cur_level_factor = 2**lvl if self.bipyramid_on else 1
                        for anc in range(self.num_anchors):
                            # Column 4 of the index is the anchor
                            ids_lvl_anchor = ids_lvl & (
                                matched_indexes[:, 4] == anc)
                            if torch.any(ids_lvl_anchor):
                                gt_masks[lvl][anc].append(
                                    targets_im[
                                        gt_fg_matched_inds[ids_lvl_anchor]]
                                    .gt_masks.crop_and_resize(
                                        fg_anchors[ids_lvl_anchor].tensor,
                                        self.mask_sizes[anc] *
                                        cur_level_factor,
                                    ))
                                # Select (N, H, W) dimensions
                                gt_mask_inds_lvl_anc = matched_indexes[
                                    ids_lvl_anchor, 1:4]
                                # Set the image index to the current image
                                gt_mask_inds_lvl_anc[:, 0] = i
                                gt_mask_inds[lvl][anc].append(
                                    gt_mask_inds_lvl_anc)
        gt_classes.append(gt_classes_i)

    # Classes and boxes
    gt_classes = cat(gt_classes)
    # Valid = not ignored; foreground = valid and not the background label
    gt_valid_inds = gt_classes >= 0
    gt_fg_inds = gt_valid_inds & (gt_classes < self.num_classes)
    # One-hot encode the foreground labels; background rows stay all-zero
    gt_classes_target = torch.zeros(
        (gt_classes.shape[0], self.num_classes),
        dtype=torch.float32,
        device=self.device,
    )
    gt_classes_target[gt_fg_inds, gt_classes[gt_fg_inds]] = 1
    # gt_deltas is an empty list only when no image had ground truth
    gt_deltas = cat(gt_deltas) if gt_deltas else None

    # Masks
    gt_masks = [[cat(mla) if mla else None for mla in ml]
                for ml in gt_masks]
    gt_mask_inds = [[cat(ila) if ila else None for ila in il]
                    for il in gt_mask_inds]
    return (
        (gt_classes_target, gt_valid_inds),
        (gt_deltas, gt_fg_inds),
        (gt_masks, gt_mask_inds),
        num_fg,
    )
def losses(self, indices, gt_instances, anchors, pred_class_logits,
           pred_anchor_deltas):
    """
    Compute the classification and box-regression losses from matched indices.

    Args:
        indices: one `(src_idx, tgt_idx)` pair per image. `src_idx` indexes
            the anchors/predictions of that image and `tgt_idx` indexes its
            ground-truth instances (presumably the output of the matcher used
            in `get_ground_truth` -- TODO confirm).
        gt_instances: one entry per image exposing `gt_boxes.tensor` and
            `gt_classes`.
        anchors: one entry per image; each is a sequence of `Boxes` merged
            with `Boxes.cat`.
        pred_class_logits: sequence of tensors, concatenated along dim 1 and
            flattened to (-1, self.num_classes).
        pred_anchor_deltas: sequence of tensors, concatenated along dim 1 and
            flattened to (-1, 4).

    Returns:
        dict with keys "loss_cls" and "loss_box_reg"; both are divided by
        `max(1, num_foreground)`.
    """
    pred_class_logits = cat(pred_class_logits,
                            dim=1).view(-1, self.num_classes)
    pred_anchor_deltas = cat(pred_anchor_deltas, dim=1).view(-1, 4)

    anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]
    N = len(anchors)
    # list[Tensor(R, 4)], one for each image
    all_anchors = Boxes.cat(anchors).tensor
    # Boxes(Tensor(N*R, 4))
    predicted_boxes = self.box2box_transform.apply_deltas(
        pred_anchor_deltas, all_anchors)
    predicted_boxes = predicted_boxes.reshape(N, -1, 4)

    ious = []
    pos_ious = []
    for i in range(N):
        src_idx, tgt_idx = indices[i]
        # Best IoU of every predicted box with any ground-truth box
        iou, _ = box_iou(predicted_boxes[i, ...],
                         gt_instances[i].gt_boxes.tensor)
        if iou.numel() == 0:
            max_iou = iou.new_full((iou.size(0), ), 0)
        else:
            max_iou = iou.max(dim=1)[0]
        # IoU between each matched anchor and its matched ground-truth box
        a_iou, _ = box_iou(anchors[i].tensor,
                           gt_instances[i].gt_boxes.tensor)
        if a_iou.numel() == 0:
            pos_iou = a_iou.new_full((0, ), 0)
        else:
            pos_iou = a_iou[src_idx, tgt_idx]
        ious.append(max_iou)
        pos_ious.append(pos_iou)
    ious = torch.cat(ious)
    # Predictions overlapping a ground-truth box too much are not negatives
    ignore_idx = ious > self.neg_ignore_thresh
    pos_ious = torch.cat(pos_ious)
    # Matches whose anchor overlaps its target too little are not positives
    pos_ignore_idx = pos_ious < self.pos_ignore_thresh

    # Turn per-image anchor indices into indices of the flattened batch.
    # The offset uses the anchor count of image 0 for every image.
    src_idx = torch.cat([
        src + idx * anchors[0].tensor.shape[0]
        for idx, (src, _) in enumerate(indices)
    ])
    # Start with everything as background, then mark ignored and matched
    # entries; the order of these in-place writes matters.
    gt_classes = torch.full(pred_class_logits.shape[:1],
                            self.num_classes,
                            dtype=torch.int64,
                            device=pred_class_logits.device)
    gt_classes[ignore_idx] = -1
    target_classes_o = torch.cat(
        [t.gt_classes[J] for t, (_, J) in zip(gt_instances, indices)])
    target_classes_o[pos_ignore_idx] = -1
    gt_classes[src_idx] = target_classes_o

    valid_idxs = gt_classes >= 0
    foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
    num_foreground = foreground_idxs.sum()

    # One-hot classification target; background rows stay all-zero
    gt_classes_target = torch.zeros_like(pred_class_logits)
    gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1

    # Average the foreground count over all workers
    if comm.get_world_size() > 1:
        dist.all_reduce(num_foreground)
    num_foreground = num_foreground * 1.0 / comm.get_world_size()

    # cls loss
    loss_cls = sigmoid_focal_loss_jit(
        pred_class_logits[valid_idxs],
        gt_classes_target[valid_idxs],
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    )
    # reg loss
    target_boxes = torch.cat(
        [t.gt_boxes.tensor[i] for t, (_, i) in zip(gt_instances, indices)],
        dim=0)
    target_boxes = target_boxes[~pos_ignore_idx]
    matched_predicted_boxes = predicted_boxes.reshape(
        -1, 4)[src_idx[~pos_ignore_idx]]
    # Only the diagonal (prediction k vs. its own target k) is used
    loss_box_reg = (1 - torch.diag(
        generalized_box_iou(matched_predicted_boxes, target_boxes))).sum()

    return {
        "loss_cls": loss_cls / max(1, num_foreground),
        "loss_box_reg": loss_box_reg / max(1, num_foreground),
    }
def losses(self, anchors, gt_instances, box_cls, box_delta):
    """
    Compute the positive-bag and negative-bag losses for a batch.

    Args:
        anchors: one entry per image; each is a sequence of `Boxes` merged
            with `Boxes.cat`.
        gt_instances: one entry per image exposing `gt_boxes` and
            `gt_classes`.
        box_cls: per-level classification outputs, each flattened with
            `permute_to_N_HWA_K(x, self.num_classes)`.
        box_delta: per-level regression outputs, each flattened with
            `permute_to_N_HWA_K(x, 4)`.

    Returns:
        dict with keys "loss_pos" (weighted by `self.focal_loss_alpha`) and
        "loss_neg" (weighted by `1 - self.focal_loss_alpha`).
    """
    anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]
    box_cls_flattened = [
        permute_to_N_HWA_K(x, self.num_classes) for x in box_cls
    ]
    box_delta_flattened = [permute_to_N_HWA_K(x, 4) for x in box_delta]
    pred_class_logits = cat(box_cls_flattened, dim=1)
    pred_anchor_deltas = cat(box_delta_flattened, dim=1)

    pred_class_probs = pred_class_logits.sigmoid()
    pred_box_probs = []
    num_foreground = 0
    positive_losses = []
    for anchors_per_image, \
        gt_instances_per_image, \
        pred_class_probs_per_image, \
        pred_anchor_deltas_per_image in zip(
            anchors, gt_instances, pred_class_probs, pred_anchor_deltas):
        gt_classes_per_image = gt_instances_per_image.gt_classes

        # The localization-based matching probability is treated as a
        # constant: no gradient flows through this section.
        with torch.no_grad():
            # predicted_boxes_per_image: a_{j}^{loc}, shape: [j, 4]
            predicted_boxes_per_image = self.box2box_transform.apply_deltas(
                pred_anchor_deltas_per_image, anchors_per_image.tensor)
            # gt_pred_iou: IoU_{ij}^{loc}, shape: [i, j]
            gt_pred_iou = pairwise_iou(gt_instances_per_image.gt_boxes,
                                       Boxes(predicted_boxes_per_image))

            # Rescale IoU linearly from [t1, t2] to [0, 1]; t2 is the best
            # IoU per ground-truth box, kept strictly above t1 so the
            # division below never hits zero.
            t1 = self.bbox_threshold
            t2 = gt_pred_iou.max(dim=1, keepdim=True).values.clamp_(
                min=t1 + torch.finfo(torch.float32).eps)
            # gt_pred_prob: P{a_{j} -> b_{i}}, shape: [i, j]
            gt_pred_prob = ((gt_pred_iou - t1) / (t2 - t1)).clamp_(min=0,
                                                                   max=1)

            # pred_box_prob_per_image: P{a_{j} \in A_{+}}, shape: [j, c]
            nonzero_idxs = torch.nonzero(gt_pred_prob, as_tuple=True)
            pred_box_prob_per_image = torch.zeros_like(
                pred_class_probs_per_image)
            # Scatter each non-zero probability to (anchor, class of the gt)
            pred_box_prob_per_image[nonzero_idxs[1],
                                    gt_classes_per_image[nonzero_idxs[0]]] \
                = gt_pred_prob[nonzero_idxs]
            pred_box_probs.append(pred_box_prob_per_image)

        # construct bags for objects
        match_quality_matrix = pairwise_iou(
            gt_instances_per_image.gt_boxes, anchors_per_image)
        # Each bag holds the pos_anchor_topk anchors with the highest IoU
        _, foreground_idxs = torch.topk(match_quality_matrix,
                                        self.pos_anchor_topk,
                                        dim=1,
                                        sorted=False)

        # matched_pred_class_probs_per_image: P_{ij}^{cls}
        matched_pred_class_probs_per_image = torch.gather(
            pred_class_probs_per_image[foreground_idxs], 2,
            gt_classes_per_image.view(-1, 1, 1).repeat(
                1, self.pos_anchor_topk, 1)).squeeze(2)

        # matched_gt_anchor_deltas_per_image: P_{ij}^{loc}
        matched_gt_anchor_deltas_per_image = self.box2box_transform.get_deltas(
            anchors_per_image.tensor[foreground_idxs],
            gt_instances_per_image.gt_boxes.tensor.unsqueeze(1))
        loss_box_reg = smooth_l1_loss(
            pred_anchor_deltas_per_image[foreground_idxs],
            matched_gt_anchor_deltas_per_image,
            beta=self.smooth_l1_loss_beta,
            reduction="none").sum(dim=-1) * self.reg_weight
        # Turn the regression loss into a probability-like score
        matched_pred_reg_probs_per_image = (-loss_box_reg).exp()

        # positive_losses: { -log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) ) }
        num_foreground += len(gt_instances_per_image)
        positive_losses.append(
            positive_bag_loss(matched_pred_class_probs_per_image *
                              matched_pred_reg_probs_per_image,
                              dim=1))

    # positive_loss: \sum_{i}{ -log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) ) } / ||B||
    positive_loss = torch.cat(positive_losses).sum() / max(
        1, num_foreground)

    # pred_box_probs: P{a_{j} \in A_{+}}
    pred_box_probs = torch.stack(pred_box_probs, dim=0)
    # negative_loss: \sum_{j}{ FL( (1 - P{a_{j} \in A_{+}}) * (1 - P_{j}^{bg}) ) } / n||B||
    negative_loss = negative_bag_loss(
        pred_class_probs * (1 - pred_box_probs),
        self.focal_loss_gamma).sum() / max(
            1, num_foreground * self.pos_anchor_topk)

    loss_pos = positive_loss * self.focal_loss_alpha
    loss_neg = negative_loss * (1 - self.focal_loss_alpha)

    return {"loss_pos": loss_pos, "loss_neg": loss_neg}
def get_ground_truth(self, default_boxes, targets):
    """
    Build per-default-box class labels and box regression targets.

    The same set of default boxes is matched against every image.

    Args:
        default_boxes (list[Boxes]): one `Boxes` per feature level holding
            the default boxes of an image on that level.
        targets (list[Instances]): N `Instances`, the i-th one carrying the
            ground-truth annotations (`gt_boxes`, `gt_classes`) of image i.

    Returns:
        gt_conf (Tensor): shape [N, R], R being the total number of default
            boxes. Boxes the matcher labels 0 receive `self.num_classes`
            (background), boxes labelled -1 receive -1 (ignore), and the
            remaining boxes receive the class of their matched ground truth.
        gt_default_boxes_deltas (Tensor): shape [N, R, 4], the box2box
            transform targets from each default box to its matched
            ground-truth box. Only meaningful for foreground boxes; all
            zeros for an image without ground truth.
    """
    # All feature levels merged into a single Boxes, shared by every image.
    priors = Boxes.cat(default_boxes)

    conf_targets = []
    delta_targets = []
    for image_targets in targets:
        # IoU between every ground-truth box and every default box (M x N).
        iou_matrix = pairwise_iou(image_targets.gt_boxes, priors)
        matched_gt_idx, match_labels = self.matcher(iou_matrix)

        if len(image_targets) == 0:
            # No annotations: everything is background, deltas are zero.
            conf_i = torch.zeros_like(matched_gt_idx) + self.num_classes
            deltas_i = torch.zeros_like(priors.tensor)
        else:
            boxes_for_prior = image_targets.gt_boxes[matched_gt_idx]
            # Only meaningful where the default box ends up foreground.
            deltas_i = self.box2box_transform.get_deltas(
                priors.tensor, boxes_for_prior.tensor)

            conf_i = image_targets.gt_classes[matched_gt_idx]
            # Matcher label 0 -> background class.
            conf_i[match_labels == 0] = self.num_classes
            # Matcher label -1 -> ignored.
            conf_i[match_labels == -1] = -1

        conf_targets.append(conf_i)
        delta_targets.append(deltas_i)

    return torch.stack(conf_targets), torch.stack(delta_targets)
def get_ground_truth(self, anchors, targets):
    """
    Assign class labels and box regression targets with top-k IoU matching.

    For every ground-truth box the `self.iou_topk` anchors with the highest
    IoU become candidates. An anchor that is a candidate for several boxes
    is given to the one with the highest IoU. Every other anchor is labelled
    background. This method never produces the "-1" (ignore) label.

    Images without any ground-truth box are handled explicitly: all their
    anchors are background and their regression targets are zero.

    Args:
        anchors (list[list[Boxes]]): a list of N=#image elements. Each is a
            list of #feature level Boxes. The Boxes contains anchors of
            this image on the specific feature level.
        targets (list[Instances]): a list of N `Instances`s. The i-th
            `Instances` contains the ground-truth per-instance annotations
            (`gt_boxes`, `gt_classes`) for the i-th input image.

    Returns:
        gt_classes (Tensor): shape (N, R), R being the number of anchors per
            image. Foreground anchors hold the class of their matched box;
            all others hold `self.num_classes` (background).
        gt_anchors_deltas (Tensor): shape (N, R, 4). The last dimension
            holds the box2box transform targets that map each anchor to its
            matched ground-truth box. Only meaningful for foreground anchors.

    Side effects:
        Logs the scalar "num_fg_per_gt" (foreground anchors per ground-truth
        box over the batch) to the current event storage. It is 0 when the
        batch contains no ground truth.
    """
    gt_classes = []
    gt_anchors_deltas = []

    num_fg = 0
    num_gt = 0

    for anchors_per_image, targets_per_image in zip(anchors, targets):
        anchors_per_image = Boxes.cat(anchors_per_image)
        num_gt += len(targets_per_image)

        if len(targets_per_image) == 0:
            # Reducing the (0, R) IoU matrix over the ground-truth axis is
            # an error in PyTorch, so bypass the matching entirely.
            anchor_tensor = anchors_per_image.tensor
            gt_classes.append(
                torch.full((anchor_tensor.shape[0], ),
                           self.num_classes,
                           dtype=torch.int64,
                           device=anchor_tensor.device))
            gt_anchors_deltas.append(torch.zeros_like(anchor_tensor))
            continue

        gt_boxes = targets_per_image.gt_boxes
        match_quality_matrix = pairwise_iou(gt_boxes, anchors_per_image)

        # Keep only the top-k anchors of every ground-truth box; all other
        # entries are overwritten with the sentinel -1.
        _, is_positive = match_quality_matrix.topk(self.iou_topk, dim=1)
        is_foreground = torch.zeros_like(match_quality_matrix,
                                         dtype=torch.bool).scatter_(
                                             1, is_positive, True)
        match_quality_matrix[~is_foreground] = -1

        # if there are still more than one objects for a position,
        # we choose the one with maximum quality
        anchor_labels, gt_matched_idxs = match_quality_matrix.max(dim=0)

        num_fg += (anchor_labels != -1).sum().item()

        # ground truth box regression
        gt_anchors_reg_deltas_i = self.box2box_transform.get_deltas(
            anchors_per_image.tensor, gt_boxes[gt_matched_idxs].tensor)

        # ground truth classes
        gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs]
        # Anchors with label -1 are treated as background.
        gt_classes_i[anchor_labels == -1] = self.num_classes

        gt_classes.append(gt_classes_i)
        gt_anchors_deltas.append(gt_anchors_reg_deltas_i)

    # Guard the ratio against a batch without any ground-truth box.
    get_event_storage().put_scalar("num_fg_per_gt", num_fg / max(num_gt, 1))

    return torch.stack(gt_classes), torch.stack(gt_anchors_deltas)
def losses(self, gt_classes, gt_anchors_deltas, pred_class_logits,
           pred_anchor_deltas, anchors):
    """
    Compute the focal classification loss and the IoU regression loss.

    Args:
        gt_classes, gt_anchors_deltas: see :meth:`FCOS.get_ground_truth`.
            Their shapes are (N, R) and (N, R, 4), respectively, where R is
            the total number of anchors across levels, i.e. sum(Hi x Wi).
        pred_class_logits, pred_anchor_deltas: see :meth:`FCOSHead.forward`.
        anchors: one entry per image; each is a sequence of `Boxes`.

    Returns:
        dict[str: Tensor]: mapping from a named loss to a scalar tensor
        storing the loss. Used during training only. The dict keys are
        "loss_cls" and "loss_box_reg".
    """
    # Flatten predictions to (N x R, K) and (N x R, 4).
    flat_logits, flat_deltas = \
        permute_all_cls_and_box_to_N_HWA_K_and_concat(
            pred_class_logits, pred_anchor_deltas, self.num_classes
        )

    labels = gt_classes.flatten()
    target_deltas = gt_anchors_deltas.view(-1, 4)

    # valid = not ignored; foreground = valid and not the background label.
    valid_idxs = labels >= 0
    foreground_idxs = valid_idxs & (labels != self.num_classes)
    num_foreground = foreground_idxs.sum()

    # One-hot classification target; background rows stay all-zero.
    one_hot_target = torch.zeros_like(flat_logits)
    one_hot_target[foreground_idxs, labels[foreground_idxs]] = 1

    # Average the foreground count over all workers.
    world_size = float(comm.get_world_size())
    num_foreground = comm.all_reduce(num_foreground) / world_size
    normalizer = max(1.0, num_foreground)

    # logits loss
    focal_sum = sigmoid_focal_loss_jit(
        flat_logits[valid_idxs],
        one_hot_target[valid_idxs],
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    )
    loss_cls = focal_sum / normalizer

    # Decode both prediction and target deltas with the same anchors so the
    # IoU loss compares boxes rather than deltas.
    per_image_anchors = [Boxes.cat(level_boxes) for level_boxes in anchors]
    all_anchors = Boxes.cat(per_image_anchors)
    predicted_boxes = self.box2box_transform.apply_deltas(
        flat_deltas, all_anchors.tensor)
    target_boxes = self.box2box_transform.apply_deltas(
        target_deltas, all_anchors.tensor)

    # regression loss
    iou_sum = iou_loss(
        predicted_boxes[foreground_idxs],
        target_boxes[foreground_idxs],
        box_mode="xyxy",
        loss_type=self.iou_loss_type,
        reduction="sum",
    )
    loss_box_reg = iou_sum / normalizer * self.reg_weight

    return {
        "loss_cls": loss_cls,
        "loss_box_reg": loss_box_reg,
    }
def test_empty_cat(self):
    """Concatenating an empty list of Boxes yields a (0, 4) tensor."""
    x = Boxes.cat([])
    # assertTrue(a, b) treats `b` as the failure message and only checks the
    # truthiness of `a`; assertEqual actually compares the shape.
    self.assertEqual(tuple(x.tensor.shape), (0, 4))