Example #1
# NOTE: imports assume the detectron2-style structures API; the same names are
# also exposed by forks such as cvpods under their own structures package.
import numpy as np
import torch
import pycocotools.mask as mask_util

from detectron2.structures import (
    BitMasks,
    Boxes,
    BoxMode,
    Instances,
    Keypoints,
    PolygonMasks,
    polygons_to_bitmask,
)


def annotations_to_instances(annos, image_size, mask_format="polygon"):
    """
    Create an :class:`Instances` object used by the models,
    from instance annotations in the dataset dict.

    Args:
        annos (list[dict]): a list of instance annotations in one image, each
            element for one instance.
        image_size (tuple): height, width
        mask_format (str): "polygon" or "bitmask"; determines whether
            "gt_masks" is stored as :class:`PolygonMasks` or :class:`BitMasks`.

    Returns:
        Instances:
            It will contain fields "gt_boxes", "gt_classes",
            "gt_masks", "gt_keypoints", if they can be obtained from `annos`.
            This is the format that builtin models expect.
    """
    boxes = [
        BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS)
        for obj in annos
    ]
    target = Instances(image_size)
    boxes = target.gt_boxes = Boxes(boxes)
    boxes.clip(image_size)

    classes = [obj["category_id"] for obj in annos]
    classes = torch.tensor(classes, dtype=torch.int64)
    target.gt_classes = classes

    if len(annos) and "segmentation" in annos[0]:
        segms = [obj["segmentation"] for obj in annos]
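        # each "segmentation" can be a list of polygons (list[list[float]] or
        # list[ndarray]), a COCO-style RLE dict, or a full-image 2D ndarray
        # mask; all three cases are handled below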
        if mask_format == "polygon":
            masks = PolygonMasks(segms)
        else:
            assert mask_format == "bitmask", mask_format
            masks = []
            for segm in segms:
                if isinstance(segm, list):
                    # polygon
                    masks.append(polygons_to_bitmask(segm, *image_size))
                elif isinstance(segm, dict):
                    # COCO RLE
                    masks.append(mask_util.decode(segm))
                elif isinstance(segm, np.ndarray):
                    assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
                        segm.ndim)
                    # mask array
                    masks.append(segm)
                else:
                    raise ValueError(
                        "Cannot convert segmentation of type '{}' to BitMasks! "
                        "Supported types are: polygons as list[list[float] or ndarray], "
                        "COCO-style RLE as a dict, or a full-image segmentation mask "
                        "as a 2D ndarray.".format(type(segm)))
            # torch.from_numpy does not support arrays with a negative stride.
            masks = BitMasks(
                torch.stack([
                    torch.from_numpy(np.ascontiguousarray(x)) for x in masks
                ]))
        target.gt_masks = masks

    if len(annos) and "keypoints" in annos[0]:
        # (N, K, 3); each "keypoints" entry is expected to already be a
        # (K, 3) array of (x, y, visibility) triplets
        kpts = np.array([obj.get("keypoints", []) for obj in annos])
        # Set the visibility flag of all out-of-boundary points to 0
        # ("unlabeled"); image_size is (h, w), so reverse it to compare
        # against (x, y) coordinates.
        kpts_xy = kpts[:, :, :2]
        inside = (kpts_xy >= np.array([0, 0])) & (
            kpts_xy <= np.array(image_size[::-1]))
        inside = inside.all(axis=2)
        kpts[:, :, 2][~inside] = 0
        target.gt_keypoints = Keypoints(kpts)

    return target
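
A minimal usage sketch for the function above; the annotation fields mirror what a detectron2/cvpods-style dataset dict provides, and the values are made up for illustration:

# Usage sketch: one instance with an XYXY_ABS box and a single polygon.
sample_annos = [
    {
        "bbox": [10.0, 20.0, 60.0, 80.0],
        "bbox_mode": BoxMode.XYXY_ABS,
        "category_id": 3,
        # one flattened (x, y, x, y, ...) polygon per connected component
        "segmentation": [[10.0, 20.0, 60.0, 20.0, 60.0, 80.0, 10.0, 80.0]],
    }
]
instances = annotations_to_instances(sample_annos, image_size=(100, 100),
                                     mask_format="polygon")
print(instances.gt_boxes, instances.gt_classes, instances.gt_masks)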
Example #2
    def get_ground_truth(self, shifts, targets, image_shape):
        """
        Compute per-location training targets for a batch of images.

        Args:
            shifts (list[list[Tensor]]): one list per image; each inner list
                holds the (N_l, 2) shift (point) locations of one FPN level.
            targets (list[Instances]): per-image ground-truth instances with
                fields "gt_boxes", "gt_classes" and "gt_masks".
            image_shape: shape of the padded input tensor; its last two
                entries are the image height and width.

        Returns:
            Per-shift ground-truth classes, box regression deltas, centerness,
            instance bitmasks (down-sampled by `mask_out_stride`), matched gt
            indices, image indices and FPN levels.
        """
        gt_classes = []
        gt_shifts_deltas = []
        gt_centerness = []
        gt_instances_masks = []
        gt_inds = []
        fpn_levels = []
        im_inds = []
        num_targets = 0
        im_h, im_w = image_shape[-2:]
        nearest_offset = int(self.mask_out_stride // 2)
        for im_i, (shifts_per_image, targets_per_image) in enumerate(
                zip(shifts, targets)):
            object_sizes_of_interest = torch.cat([
                shifts_i.new_tensor(size).unsqueeze(0).expand(shifts_i.shape[0], -1)
                for shifts_i, size in zip(shifts_per_image,
                                          self.object_sizes_of_interest)
            ], dim=0)

            shifts_over_all_feature_maps = torch.cat(shifts_per_image, dim=0)
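            # all shift locations of this image, concatenated over FPN levels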

            gt_boxes = targets_per_image.gt_boxes

            deltas = self.shift2box_transform.get_deltas(
                shifts_over_all_feature_maps, gt_boxes.tensor.unsqueeze(1))
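            # deltas: (len(GT), num_shifts, 4) distances (l, t, r, b) from each
            # shift location to the four edges of each gt box (assuming the
            # FCOS-style shift-to-box transform); all four are positive only
            # when the location lies inside the box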

            # ground truth for instances masks
            polygons = targets_per_image.get("gt_masks").polygons
            gt_instances_masks_i = []
            is_in_boxes = []
            for ind in range(len(polygons)):
                # down-sample and build is_in_boxes per instance to save memory
                bitmask = polygons_to_bitmask(polygons[ind], im_h, im_w)
                bitmask = torch.from_numpy(bitmask).to(
                    self.device).unsqueeze(0)
                # 1, len(shifts)
                is_in_boxes_i = self.generate_in_box_mask(
                    gt_boxes[ind], bitmask, deltas[ind:ind + 1], im_h, im_w,
                    shifts_per_image)
                # nearest sample to supervised resolution
                bitmask = bitmask[:, nearest_offset::self.mask_out_stride,
                                  nearest_offset::self.mask_out_stride]

                is_in_boxes.append(is_in_boxes_i)
                gt_instances_masks_i.append(bitmask)

            is_in_boxes = torch.cat(is_in_boxes, dim=0)  # len(GT), len(shifts)
            gt_instances_masks_i = torch.cat(
                gt_instances_masks_i, dim=0)  # len(GT), im_h/stride, im_w/stride

            max_deltas = deltas.max(dim=-1).values
            # limit the regression range for each location
            is_cared_in_the_level = \
                (max_deltas >= object_sizes_of_interest[None, :, 0]) & \
                (max_deltas <= object_sizes_of_interest[None, :, 1])

            gt_positions_area = gt_boxes.area().unsqueeze(1).repeat(
                1, shifts_over_all_feature_maps.shape[0])
            gt_positions_area[~is_in_boxes] = math.inf
            gt_positions_area[~is_cared_in_the_level] = math.inf

            # if there is still more than one object matched to a position,
            # we choose the one with the minimal area
            positions_min_area, gt_matched_idxs = gt_positions_area.min(dim=0)

            gt_ind_i = num_targets + gt_matched_idxs
            num_targets += len(targets_per_image)

            # ground truth box regression
            gt_shifts_reg_deltas_i = self.shift2box_transform.get_deltas(
                shifts_over_all_feature_maps, gt_boxes[gt_matched_idxs].tensor)

            # ground truth classes
            has_gt = len(targets_per_image) > 0
            if has_gt:
                gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs]
                # Shifts with area inf are treated as background.
                gt_classes_i[positions_min_area == math.inf] = self.num_classes
            else:
                gt_classes_i = torch.zeros_like(
                    gt_matched_idxs) + self.num_classes

            # ground truth centerness
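            # centerness = sqrt(min(l, r) / max(l, r) * min(t, b) / max(t, b))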
            left_right = gt_shifts_reg_deltas_i[:, [0, 2]]
            top_bottom = gt_shifts_reg_deltas_i[:, [1, 3]]
            gt_centerness_i = torch.sqrt(
                (left_right.min(dim=-1).values /
                 left_right.max(dim=-1).values).clamp_(min=0) *
                (top_bottom.min(dim=-1).values /
                 top_bottom.max(dim=-1).values).clamp_(min=0))

            fpn_level_i = torch.cat([
                loc.new_ones(len(loc), dtype=torch.long) * level
                for level, loc in enumerate(shifts_per_image)
            ])
            im_ind_i = fpn_level_i.new_ones(len(fpn_level_i)) * im_i

            gt_classes.append(gt_classes_i)
            gt_shifts_deltas.append(gt_shifts_reg_deltas_i)
            gt_centerness.append(gt_centerness_i)
            gt_instances_masks.append(gt_instances_masks_i)
            gt_inds.append(gt_ind_i)
            fpn_levels.append(fpn_level_i)
            im_inds.append(im_ind_i)

        return (
            torch.stack(gt_classes),
            torch.stack(gt_shifts_deltas),
            torch.stack(gt_centerness),
            torch.cat(gt_instances_masks),
            torch.stack(gt_inds),
            torch.stack(im_inds),
            torch.stack(fpn_levels),
        )
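
The matching above is driven by FCOS-style shift-to-box deltas and a centerness target. The following self-contained sketch re-implements just those two quantities for illustration; the function names here are hypothetical, and in the code above these roles are played by self.shift2box_transform and the centerness block:

import torch


def shift2box_deltas(shifts, boxes):
    # shifts: (num_shifts, 2) xy locations; boxes: (..., 4) XYXY boxes.
    # Returns (l, t, r, b) distances from each location to the box edges;
    # all four are positive exactly when the location lies inside the box.
    return torch.cat((shifts - boxes[..., :2], boxes[..., 2:] - shifts), dim=-1)


def centerness(deltas):
    # deltas: (..., 4) as (l, t, r, b)
    left_right = deltas[..., [0, 2]]
    top_bottom = deltas[..., [1, 3]]
    return torch.sqrt(
        (left_right.min(dim=-1).values / left_right.max(dim=-1).values).clamp(min=0)
        * (top_bottom.min(dim=-1).values / top_bottom.max(dim=-1).values).clamp(min=0))


shifts = torch.tensor([[16.0, 8.0], [28.0, 8.0]])
boxes = torch.tensor([[0.0, 0.0, 32.0, 16.0]])
deltas = shift2box_deltas(shifts, boxes.unsqueeze(1))  # (1, 2, 4)
print(centerness(deltas))  # ~1.0 at the box center, lower toward the edges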