def get_ground_truth(self, anchors, targets):
    """
    Args:
        anchors (list[list[Boxes]]): a list of N=#image elements. Each is a
            list of #feature level Boxes. The Boxes contain the anchors of
            this image on the specific feature level.
        targets (list[Instances]): a list of N `Instances`. The i-th
            `Instances` contains the ground-truth per-instance annotations
            for the i-th input image. Specify `targets` during training only.

    Returns:
        gt_classes (Tensor): An integer tensor of shape (N, R) storing
            ground-truth labels for each anchor. R is the total number of
            anchors, i.e. the sum of Hi x Wi x A for all levels. Anchors
            whose IoU with some target is higher than the foreground
            threshold are assigned their corresponding label in the
            [0, K-1] range. Anchors whose IoU is below the background
            threshold are assigned the label "K". Anchors whose IoU is
            between the foreground and background thresholds are assigned
            the label "-1", i.e. ignore.
        gt_anchors_deltas (Tensor): Shape (N, R, 4). The last dimension
            represents ground-truth box2box transform targets
            (dx, dy, dw, dh) that map each anchor to its matched
            ground-truth box. The values in the tensor are meaningful only
            when the corresponding anchor is labeled as foreground.
    """
    gt_classes = []
    gt_anchors_deltas = []
    # list[Boxes], one Boxes of R concatenated anchors per image
    anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]

    for anchors_per_image, targets_per_image in zip(anchors, targets):
        match_quality_matrix = pairwise_iou(
            targets_per_image.gt_boxes, anchors_per_image)
        gt_matched_idxs, anchor_labels = self.matcher(match_quality_matrix)

        has_gt = len(targets_per_image) > 0
        if has_gt:
            # Ground truth box regression
            matched_gt_boxes = targets_per_image.gt_boxes[gt_matched_idxs]
            gt_anchors_reg_deltas_i = self.box2box_transform.get_deltas(
                anchors_per_image.tensor, matched_gt_boxes.tensor)

            gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs]
            # Anchors with label 0 are treated as background.
            gt_classes_i[anchor_labels == 0] = self.num_classes
            # Anchors with label -1 are ignored.
            gt_classes_i[anchor_labels == -1] = -1
        else:
            gt_classes_i = torch.zeros_like(gt_matched_idxs) + self.num_classes
            gt_anchors_reg_deltas_i = torch.zeros_like(anchors_per_image.tensor)

        gt_classes.append(gt_classes_i)
        gt_anchors_deltas.append(gt_anchors_reg_deltas_i)

    return torch.stack(gt_classes), torch.stack(gt_anchors_deltas)
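# --- Illustration (not part of the model) ---------------------------------
# A minimal, self-contained sketch of the label convention produced by
# `get_ground_truth` above: foreground anchors keep a class in [0, K-1],
# background anchors get the label K, and ignored anchors get -1. The toy
# tensors and `K` below are invented for illustration; a sigmoid-based
# classifier would typically one-hot the foreground labels like this.
import torch

K = 80  # hypothetical number of classes
gt_classes = torch.tensor([3, K, -1, 17])  # fg, bg, ignore, fg

valid_mask = gt_classes >= 0             # drop ignored anchors (-1)
fg_mask = valid_mask & (gt_classes < K)  # foreground anchors only
one_hot = torch.zeros(len(gt_classes), K)
one_hot[fg_mask, gt_classes[fg_mask]] = 1.0  # (R, K) targets for the loss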
def _get_ground_truth(self):
    """
    Returns:
        gt_objectness_logits: list of N tensors. Tensor i is a vector whose
            length is the total number of anchors in image i (i.e.,
            len(anchors[i])). Label values are in {-1, 0, 1}, with meanings:
            -1 = ignore; 0 = negative class; 1 = positive class.
        gt_anchor_deltas: list of N tensors. Tensor i has shape
            (len(anchors[i]), 4).
    """
    gt_objectness_logits = []
    gt_anchor_deltas = []
    # Concatenate anchors from all feature maps into a single Boxes per image
    anchors = [Boxes.cat(anchors_i) for anchors_i in self.anchors]
    for image_size_i, anchors_i, gt_boxes_i in zip(
            self.image_sizes, anchors, self.gt_boxes):
        # image_size_i: (h, w) for the i-th image
        # anchors_i: anchors for the i-th image
        # gt_boxes_i: ground-truth boxes for the i-th image
        match_quality_matrix = retry_if_cuda_oom(pairwise_iou)(gt_boxes_i, anchors_i)
        matched_idxs, gt_objectness_logits_i = retry_if_cuda_oom(
            self.anchor_matcher)(match_quality_matrix)
        # Matching is memory-expensive and may result in CPU tensors,
        # but the result is small.
        gt_objectness_logits_i = gt_objectness_logits_i.to(device=gt_boxes_i.device)
        del match_quality_matrix

        if self.boundary_threshold >= 0:
            # Discard anchors that go out of the boundaries of the image.
            # NOTE: This is legacy functionality that is turned off by default in mydl.
            anchors_inside_image = anchors_i.inside_box(
                image_size_i, self.boundary_threshold)
            gt_objectness_logits_i[~anchors_inside_image] = -1

        if len(gt_boxes_i) == 0:
            # These values won't be used anyway since the anchor is labeled as background
            gt_anchor_deltas_i = torch.zeros_like(anchors_i.tensor)
        else:
            # TODO: wasted computation for ignored boxes
            matched_gt_boxes = gt_boxes_i[matched_idxs]
            gt_anchor_deltas_i = self.box2box_transform.get_deltas(
                anchors_i.tensor, matched_gt_boxes.tensor)

        gt_objectness_logits.append(gt_objectness_logits_i)
        gt_anchor_deltas.append(gt_anchor_deltas_i)

    return gt_objectness_logits, gt_anchor_deltas
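# --- Illustration (not part of the model) ---------------------------------
# A minimal sketch of how the {-1, 0, 1} objectness labels returned above
# are typically consumed: anchors labeled -1 are masked out, and the
# remaining positives/negatives feed a binary classification loss. The
# tensors below are invented; the actual RPN loss also subsamples anchors.
import torch
import torch.nn.functional as F

gt_objectness = torch.tensor([1.0, 0.0, -1.0, 1.0])    # per-anchor labels
pred_objectness = torch.tensor([2.0, -1.5, 0.3, 0.8])  # raw logits

valid = gt_objectness >= 0  # keep positives (1) and negatives (0) only
loss = F.binary_cross_entropy_with_logits(
    pred_objectness[valid], gt_objectness[valid], reduction="sum")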
def point_sample_fine_grained_features(features_list, feature_scales, boxes, point_coords):
    """
    Get features from feature maps in `features_list` that correspond to
    specific point coordinates inside each bounding box from `boxes`.

    Args:
        features_list (list[Tensor]): A list of feature map tensors to get
            features from.
        feature_scales (list[float]): A list of scales for tensors in
            `features_list`.
        boxes (list[Boxes]): A list of I Boxes objects that contain
            R_1 + ... + R_I = R boxes all together.
        point_coords (Tensor): A tensor of shape (R, P, 2) that contains
            [0, 1] x [0, 1] box-normalized coordinates of the P sampled points.

    Returns:
        point_features (Tensor): A tensor of shape (R, C, P) that contains
            features sampled from all feature maps in `features_list` for
            P sampled points for all R boxes in `boxes`.
        point_coords_wrt_image (Tensor): A tensor of shape (R, P, 2) that
            contains image-level coordinates of the P points.
    """
    cat_boxes = Boxes.cat(boxes)
    num_boxes = [len(b) for b in boxes]

    point_coords_wrt_image = get_point_coords_wrt_image(cat_boxes.tensor, point_coords)
    split_point_coords_wrt_image = torch.split(point_coords_wrt_image, num_boxes)

    point_features = []
    for idx_img, point_coords_wrt_image_per_image in enumerate(split_point_coords_wrt_image):
        point_features_per_image = []
        for idx_feature, feature_map in enumerate(features_list):
            h, w = feature_map.shape[-2:]
            # Rescale image-level coordinates to [0, 1] coordinates on this feature map
            scale = torch.tensor([w, h], device=feature_map.device) / feature_scales[idx_feature]
            point_coords_scaled = point_coords_wrt_image_per_image / scale
            point_features_per_image.append(
                point_sample(
                    feature_map[idx_img].unsqueeze(0),
                    point_coords_scaled.unsqueeze(0),
                    align_corners=False,
                )
                .squeeze(0)
                .transpose(1, 0)
            )
        point_features.append(cat(point_features_per_image, dim=1))

    return cat(point_features, dim=0), point_coords_wrt_image
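# --- Illustration (not part of the model) ---------------------------------
# `point_sample` is assumed above but not defined here. A common way to
# implement it is as a thin wrapper around `F.grid_sample` that remaps
# [0, 1] x [0, 1] point coordinates to the [-1, 1] range grid_sample
# expects; the sketch below follows that assumption.
import torch.nn.functional as F

def point_sample_sketch(input, point_coords, **kwargs):
    """
    input: (N, C, H, W) feature map.
    point_coords: (N, P, 2) or (N, Hgrid, Wgrid, 2) coordinates in [0, 1].
    Returns (N, C, P) or (N, C, Hgrid, Wgrid) interpolated features.
    """
    add_dim = False
    if point_coords.dim() == 3:
        # grid_sample expects a 4D grid; add a dummy spatial dimension
        add_dim = True
        point_coords = point_coords.unsqueeze(2)
    output = F.grid_sample(input, 2.0 * point_coords - 1.0, **kwargs)
    if add_dim:
        output = output.squeeze(3)
    return output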
def inference(self, pred_logits, pred_deltas, pred_masks, anchors, indexes, images):
    """
    Arguments:
        pred_logits, pred_deltas, pred_masks: Same as the output of
            :meth:`TensorMaskHead.forward`.
        anchors, indexes: Same as the input of :meth:`TensorMask.get_ground_truth`.
        images (ImageList): the input images.

    Returns:
        results (list[Instances]): a list of #images elements.
    """
    assert len(anchors) == len(images)
    results = []

    pred_logits = [permute_to_N_HWA_K(x, self.num_classes) for x in pred_logits]
    pred_deltas = [permute_to_N_HWA_K(x, 4) for x in pred_deltas]

    pred_logits = cat(pred_logits, dim=1)
    pred_deltas = cat(pred_deltas, dim=1)

    for img_idx, (anchors_im, indexes_im) in enumerate(zip(anchors, indexes)):
        # Get the size of the current image
        image_size = images.image_sizes[img_idx]

        logits_im = pred_logits[img_idx]
        deltas_im = pred_deltas[img_idx]

        if self.mask_on:
            masks_im = [[mla[img_idx] for mla in ml] for ml in pred_masks]
        else:
            masks_im = [None] * self.num_levels

        results_im = self.inference_single_image(
            logits_im,
            deltas_im,
            masks_im,
            Boxes.cat(anchors_im),
            cat(indexes_im),
            tuple(image_size),
        )
        results.append(results_im)
    return results
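# --- Illustration (not part of the model) ---------------------------------
# `permute_to_N_HWA_K` is assumed above. A plausible implementation, shown
# here as a sketch, reshapes a per-level prediction map from (N, A*K, H, W)
# into (N, H*W*A, K) so that predictions from all levels can be
# concatenated along the anchor dimension, as done in `inference`.
def permute_to_N_HWA_K_sketch(tensor, K):
    """Transpose/reshape a tensor from (N, A*K, H, W) to (N, H*W*A, K)."""
    assert tensor.dim() == 4, tensor.shape
    N, _, H, W = tensor.shape
    tensor = tensor.view(N, -1, K, H, W)    # (N, A, K, H, W)
    tensor = tensor.permute(0, 3, 4, 1, 2)  # (N, H, W, A, K)
    return tensor.reshape(N, -1, K)         # (N, H*W*A, K)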
def get_ground_truth(self, anchors, unit_lengths, indexes, targets):
    """
    Args:
        anchors (list[list[Boxes]]): a list of N=#image elements. Each is a
            list of #feature level Boxes. The Boxes contain the anchors of
            this image on the specific feature level.
        unit_lengths (list[list[Tensor]]): a list of N=#image elements. Each
            is a list of #feature level Tensor. The tensor contains unit
            lengths for anchors of this image on the specific feature level.
        indexes (list[list[Tensor]]): a list of N=#image elements. Each is a
            list of #feature level Tensor. The tensor contains the 5D index
            of each anchor; the second dimension means (L, N, H, W, A),
            where L is level, N is image, H is height, W is width, and A is
            anchor.
        targets (list[Instances]): a list of N `Instances`. The i-th
            `Instances` contains the ground-truth per-instance annotations
            for the i-th input image. Specify `targets` during training only.

    Returns:
        gt_class_info (Tensor, Tensor): A pair of two tensors for
            classification. The first one is an integer tensor of shape
            (R, #classes) storing ground-truth labels for each anchor, where
            R is the total number of anchors in the batch. The second one is
            an integer tensor of shape (R,) indicating which anchors are
            valid for loss computation and which are not.
        gt_delta_info (Tensor, Tensor): A pair of two tensors for boxes.
            The first one has shape (F, 4), where F=#foreground anchors.
            The last dimension represents ground-truth box2box transform
            targets (dx, dy, dw, dh) that map each anchor to its matched
            ground-truth box. Only foreground anchors have values in this
            tensor. Could be `None` if F=0. The second one, of shape (R,),
            is an integer tensor indicating which anchors are foreground
            ones used for box regression. Could be `None` if F=0.
        gt_mask_info (list[list[Tensor]], list[list[Tensor]]): A pair of two
            lists for masks. The first one is a list of P=#feature level
            elements. Each is a list of A=#anchor tensors. Each tensor
            contains the ground truth masks of the same size and for the
            same feature level. Could be `None`. The second one is a list of
            P=#feature level elements. Each is a list of A=#anchor tensors.
            Each tensor contains the locations of the ground truth masks of
            the same size and for the same feature level; the second
            dimension means (N, H, W), where N is image, H is height, and
            W is width. Could be `None`.
        num_fg (int): F=#foreground anchors, used later for loss
            normalization.
    """
    gt_classes = []
    gt_deltas = []
    gt_masks = [[[] for _ in range(self.num_anchors)] for _ in range(self.num_levels)]
    gt_mask_inds = [[[] for _ in range(self.num_anchors)] for _ in range(self.num_levels)]

    anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]
    unit_lengths = [cat(unit_lengths_i) for unit_lengths_i in unit_lengths]
    indexes = [cat(indexes_i) for indexes_i in indexes]

    num_fg = 0
    for i, (anchors_im, unit_lengths_im, indexes_im, targets_im) in enumerate(
            zip(anchors, unit_lengths, indexes, targets)):
        # Initialize all anchors as background
        gt_classes_i = torch.full_like(
            unit_lengths_im, self.num_classes, dtype=torch.int64, device=self.device)
        # Ground truth classes
        has_gt = len(targets_im) > 0
        if has_gt:
            # Compute the pairwise matrix
            gt_matched_inds, anchor_labels = _assignment_rule(
                targets_im.gt_boxes, anchors_im, unit_lengths_im, self.min_anchor_size)
            # Find the foreground instances
            fg_inds = anchor_labels == 1
            fg_anchors = anchors_im[fg_inds]
            num_fg += len(fg_anchors)
            # Find the ground truths for foreground instances
            gt_fg_matched_inds = gt_matched_inds[fg_inds]
            # Assign labels for foreground instances
            gt_classes_i[fg_inds] = targets_im.gt_classes[gt_fg_matched_inds]
            # Anchors with label -1 are ignored; others are left as negative
            gt_classes_i[anchor_labels == -1] = -1

            # Boxes: ground truth box regression, only for foregrounds
            matched_gt_boxes = targets_im[gt_fg_matched_inds].gt_boxes
            # Compute box regression offsets for foregrounds only
            gt_deltas_i = self.box2box_transform.get_deltas(
                fg_anchors.tensor, matched_gt_boxes.tensor)
            gt_deltas.append(gt_deltas_i)

            # Masks
            if self.mask_on:
                # Compute masks for each level and each anchor
                matched_indexes = indexes_im[fg_inds, :]
                for lvl in range(self.num_levels):
                    ids_lvl = matched_indexes[:, 0] == lvl
                    if torch.any(ids_lvl):
                        cur_level_factor = 2 ** lvl if self.bipyramid_on else 1
                        for anc in range(self.num_anchors):
                            ids_lvl_anchor = ids_lvl & (matched_indexes[:, 4] == anc)
                            if torch.any(ids_lvl_anchor):
                                gt_masks[lvl][anc].append(
                                    targets_im[gt_fg_matched_inds[ids_lvl_anchor]]
                                    .gt_masks.crop_and_resize(
                                        fg_anchors[ids_lvl_anchor].tensor,
                                        self.mask_sizes[anc] * cur_level_factor,
                                    ))
                                # Select (N, H, W) dimensions
                                gt_mask_inds_lvl_anc = matched_indexes[ids_lvl_anchor, 1:4]
                                # Set the image index to the current image
                                gt_mask_inds_lvl_anc[:, 0] = i
                                gt_mask_inds[lvl][anc].append(gt_mask_inds_lvl_anc)
        gt_classes.append(gt_classes_i)

    # Classes and boxes
    gt_classes = cat(gt_classes)
    gt_valid_inds = gt_classes >= 0
    gt_fg_inds = gt_valid_inds & (gt_classes < self.num_classes)
    gt_classes_target = torch.zeros(
        (gt_classes.shape[0], self.num_classes),
        dtype=torch.float32, device=self.device)
    gt_classes_target[gt_fg_inds, gt_classes[gt_fg_inds]] = 1
    gt_deltas = cat(gt_deltas) if gt_deltas else None

    # Masks
    gt_masks = [[cat(mla) if mla else None for mla in ml] for ml in gt_masks]
    gt_mask_inds = [[cat(ila) if ila else None for ila in il] for il in gt_mask_inds]
    return (
        (gt_classes_target, gt_valid_inds),
        (gt_deltas, gt_fg_inds),
        (gt_masks, gt_mask_inds),
        num_fg,
    )