import math

import numpy as np
import torch
import pycocotools.mask as mask_util

# NOTE: assumes detectron2-style structures; adjust the import path for other
# codebases (e.g. cvpods) that ship the same classes.
from detectron2.structures import (
    BitMasks,
    Boxes,
    BoxMode,
    Instances,
    Keypoints,
    PolygonMasks,
    polygons_to_bitmask,
)


def annotations_to_instances(annos, image_size, mask_format="polygon"):
    """
    Create an :class:`Instances` object used by the models,
    from instance annotations in the dataset dict.

    Args:
        annos (list[dict]): a list of instance annotations in one image, each
            element for one instance.
        image_size (tuple): height, width
        mask_format (str): "polygon" or "bitmask"; the format of the returned gt_masks.

    Returns:
        Instances:
            It will contain fields "gt_boxes", "gt_classes",
            "gt_masks", "gt_keypoints", if they can be obtained from `annos`.
            This is the format that builtin models expect.
    """
    boxes = [
        BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS)
        for obj in annos
    ]
    target = Instances(image_size)
    boxes = target.gt_boxes = Boxes(boxes)
    boxes.clip(image_size)

    classes = [obj["category_id"] for obj in annos]
    classes = torch.tensor(classes, dtype=torch.int64)
    target.gt_classes = classes

    if len(annos) and "segmentation" in annos[0]:
        segms = [obj["segmentation"] for obj in annos]
        if mask_format == "polygon":
            masks = PolygonMasks(segms)
        else:
            assert mask_format == "bitmask", mask_format
            masks = []
            for segm in segms:
                if isinstance(segm, list):
                    # polygon
                    masks.append(polygons_to_bitmask(segm, *image_size))
                elif isinstance(segm, dict):
                    # COCO RLE
                    masks.append(mask_util.decode(segm))
                elif isinstance(segm, np.ndarray):
                    assert segm.ndim == 2, \
                        "Expect segmentation of 2 dimensions, got {}.".format(segm.ndim)
                    # mask array
                    masks.append(segm)
                else:
                    raise ValueError(
                        "Cannot convert segmentation of type '{}' to BitMasks! "
                        "Supported types are: polygons as list[list[float] or ndarray], "
                        "COCO-style RLE as a dict, or a full-image segmentation mask "
                        "as a 2D ndarray.".format(type(segm))
                    )
            # torch.from_numpy does not support arrays with negative strides.
            masks = BitMasks(
                torch.stack([
                    torch.from_numpy(np.ascontiguousarray(x)) for x in masks
                ])
            )
        target.gt_masks = masks

    if len(annos) and "keypoints" in annos[0]:
        kpts = np.array([obj.get("keypoints", []) for obj in annos])  # (N, K, 3)
        # Set all out-of-boundary points to "unlabeled"
        kpts_xy = kpts[:, :, :2]
        inside = (kpts_xy >= np.array([0, 0])) & (kpts_xy <= np.array(image_size[::-1]))
        inside = inside.all(axis=2)
        kpts[:, :, :2] = kpts_xy
        kpts[:, :, 2][~inside] = 0
        target.gt_keypoints = Keypoints(kpts)

    return target
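
# Example usage (a minimal sketch, not part of the original module): build an
# `Instances` object from one hypothetical annotation given in XYWH_ABS format.
# The dict keys follow the standard detectron2 "dataset dict" convention; the
# numbers are made up for illustration.
#
#   annos = [{
#       "bbox": [10.0, 20.0, 30.0, 40.0],   # x0, y0, w, h
#       "bbox_mode": BoxMode.XYWH_ABS,
#       "category_id": 0,
#       "segmentation": [[10.0, 20.0, 40.0, 20.0, 40.0, 60.0, 10.0, 60.0]],
#   }]
#   target = annotations_to_instances(annos, image_size=(480, 640))
#   # target.gt_boxes holds one XYXY_ABS box clipped to the image;
#   # target.gt_masks is a PolygonMasks with a single instance.
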
def get_ground_truth(self, shifts, targets, image_shape):
    gt_classes = []
    gt_shifts_deltas = []
    gt_centerness = []
    gt_instances_masks = []
    gt_inds = []
    fpn_levels = []
    im_inds = []
    num_targets = 0
    im_h, im_w = image_shape[-2:]
    nearest_offset = int(self.mask_out_stride // 2)

    for im_i, (shifts_per_image, targets_per_image) in enumerate(zip(shifts, targets)):
        object_sizes_of_interest = torch.cat([
            shifts_i.new_tensor(size).unsqueeze(0).expand(shifts_i.shape[0], -1)
            for shifts_i, size in zip(shifts_per_image, self.object_sizes_of_interest)
        ], dim=0)

        shifts_over_all_feature_maps = torch.cat(shifts_per_image, dim=0)

        gt_boxes = targets_per_image.gt_boxes
        deltas = self.shift2box_transform.get_deltas(
            shifts_over_all_feature_maps, gt_boxes.tensor.unsqueeze(1))

        # ground truth for instance masks
        polygons = targets_per_image.get("gt_masks").polygons
        gt_instances_masks_i = []
        is_in_boxes = []
        for ind in range(len(polygons)):
            # down-sample and build is_in_boxes per instance to save memory
            bitmask = polygons_to_bitmask(polygons[ind], im_h, im_w)
            bitmask = torch.from_numpy(bitmask).to(self.device).unsqueeze(0)
            # 1, len(shifts)
            is_in_boxes_i = self.generate_in_box_mask(
                gt_boxes[ind], bitmask, deltas[ind:ind + 1],
                im_h, im_w, shifts_per_image)
            # nearest sample to the supervised resolution
            bitmask = bitmask[:, nearest_offset::self.mask_out_stride,
                              nearest_offset::self.mask_out_stride]

            is_in_boxes.append(is_in_boxes_i)
            gt_instances_masks_i.append(bitmask)

        is_in_boxes = torch.cat(is_in_boxes, dim=0)  # len(GT), len(shifts)
        # len(GT), im_h / mask_out_stride, im_w / mask_out_stride
        gt_instances_masks_i = torch.cat(gt_instances_masks_i, dim=0)

        max_deltas = deltas.max(dim=-1).values
        # limit the regression range for each location
        is_cared_in_the_level = \
            (max_deltas >= object_sizes_of_interest[None, :, 0]) & \
            (max_deltas <= object_sizes_of_interest[None, :, 1])

        gt_positions_area = gt_boxes.area().unsqueeze(1).repeat(
            1, shifts_over_all_feature_maps.shape[0])
        gt_positions_area[~is_in_boxes] = math.inf
        gt_positions_area[~is_cared_in_the_level] = math.inf

        # if there is still more than one object for a position,
        # we choose the one with minimal area
        positions_min_area, gt_matched_idxs = gt_positions_area.min(dim=0)
        gt_ind_i = num_targets + gt_matched_idxs
        num_targets += len(targets_per_image)

        # ground truth box regression
        gt_shifts_reg_deltas_i = self.shift2box_transform.get_deltas(
            shifts_over_all_feature_maps, gt_boxes[gt_matched_idxs].tensor)

        # ground truth classes
        has_gt = len(targets_per_image) > 0
        if has_gt:
            gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs]
            # Shifts with area inf are treated as background.
            gt_classes_i[positions_min_area == math.inf] = self.num_classes
        else:
            gt_classes_i = torch.zeros_like(gt_matched_idxs) + self.num_classes

        # ground truth centerness
        left_right = gt_shifts_reg_deltas_i[:, [0, 2]]
        top_bottom = gt_shifts_reg_deltas_i[:, [1, 3]]
        gt_centerness_i = torch.sqrt(
            (left_right.min(dim=-1).values /
             left_right.max(dim=-1).values).clamp_(min=0) *
            (top_bottom.min(dim=-1).values /
             top_bottom.max(dim=-1).values).clamp_(min=0))

        fpn_level_i = torch.cat([
            loc.new_ones(len(loc), dtype=torch.long) * level
            for level, loc in enumerate(shifts_per_image)
        ])
        im_ind_i = fpn_level_i.new_ones(len(fpn_level_i)) * im_i

        gt_classes.append(gt_classes_i)
        gt_shifts_deltas.append(gt_shifts_reg_deltas_i)
        gt_centerness.append(gt_centerness_i)
        gt_instances_masks.append(gt_instances_masks_i)
        gt_inds.append(gt_ind_i)
        fpn_levels.append(fpn_level_i)
        im_inds.append(im_ind_i)

    return torch.stack(gt_classes), torch.stack(gt_shifts_deltas), \
        torch.stack(gt_centerness), torch.cat(gt_instances_masks), \
        torch.stack(gt_inds), torch.stack(im_inds), torch.stack(fpn_levels)
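
# Worked example (illustrative only): the centerness target above is the
# FCOS-style
#   sqrt(min(l, r) / max(l, r) * min(t, b) / max(t, b)).
# For a shift whose regression deltas to its matched box are
# (l, t, r, b) = (4, 2, 8, 6):
#   min(l, r) / max(l, r) = 4 / 8 = 0.5
#   min(t, b) / max(t, b) = 2 / 6 ~= 0.333
#   centerness            = sqrt(0.5 * 0.333) ~= 0.41
# Shifts assigned to background (matched area == inf) still receive deltas from
# the arg-min, but their class target is `self.num_classes`, so a typical
# FCOS-style loss would skip them for box and centerness regression.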