Example #1
def create_instances(predictions, image_size):
    # Assemble COCO-format prediction dicts for one image into a detectron2
    # Instances object; relies on the module-level `args.conf_threshold` and
    # `dataset_id_map`.
    ret = Instances(image_size)

    score = np.asarray([x["score"] for x in predictions])
    chosen = (score > args.conf_threshold).nonzero()[0]
    score = score[chosen]
    bbox = np.asarray([predictions[i]["bbox"] for i in chosen])

    if score.shape[0] == 0:
        bbox = np.zeros((0, 4))
    else:
        bbox = BoxMode.convert(bbox, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)

    labels = np.asarray(
        [dataset_id_map(predictions[i]["category_id"]) for i in chosen])

    ret.scores = score
    ret.pred_boxes = Boxes(bbox)
    ret.pred_classes = labels

    try:
        ret.pred_masks = [predictions[i]["segmentation"] for i in chosen]
    except KeyError:
        pass
    return ret
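
A minimal usage sketch for create_instances, assuming the predictions were loaded from a COCO-style results JSON and grouped by image id; args, dataset_id_map, the file name, and the image size are placeholders introduced here, not part of the original snippet.

import argparse
import json
from collections import defaultdict

import numpy as np
from detectron2.structures import Boxes, BoxMode, Instances

# Hypothetical module-level objects the snippet expects to exist.
args = argparse.Namespace(conf_threshold=0.5)

def dataset_id_map(contiguous_id):
    # Stand-in for a dataset-specific category-id mapping.
    return contiguous_id

with open("coco_instances_results.json") as f:  # placeholder path
    coco_results = json.load(f)

# Group flat COCO results by image id, then build Instances for one image.
pred_by_image = defaultdict(list)
for p in coco_results:
    pred_by_image[p["image_id"]].append(p)

instances = create_instances(pred_by_image[42], image_size=(480, 640))
print(len(instances), "detections above the confidence threshold")
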
Example #2
def _postprocess(results,
                 result_mask_info,
                 output_height,
                 output_width,
                 mask_threshold=0.5):
    """
    Post-process the output boxes for TensorMask.
    The input images are often resized when entering an object detector.
    As a result, we often need the outputs of the detector in a different
    resolution from its inputs.
    This function will postprocess the raw outputs of TensorMask
    to produce outputs according to the desired output resolution.
    Args:
        results (Instances): the raw outputs from the detector.
            `results.image_size` contains the input image resolution the detector sees.
            This object might be modified in-place. Note that it does not contain the field
            `pred_masks`, which is provided by the separate input `result_mask_info`.
        result_mask_info (list[Tensor], Boxes): a pair of mask-related results.
            The first item is a list of #detection tensors, each holding the predicted masks.
            The second item is the anchors corresponding to the predicted masks.
        output_height, output_width: the desired output resolution.
    Returns:
        Instances: the postprocessed output from the model, based on the output resolution
    """
    scale_x, scale_y = (
        output_width / results.image_size[1],
        output_height / results.image_size[0],
    )
    results = Instances((output_height, output_width), **results.get_fields())

    output_boxes = results.pred_boxes
    output_boxes.tensor[:, 0::2] *= scale_x
    output_boxes.tensor[:, 1::2] *= scale_y
    output_boxes.clip(results.image_size)

    inds_nonempty = output_boxes.nonempty()
    results = results[inds_nonempty]
    result_masks, result_anchors = result_mask_info
    if result_masks:
        result_anchors.tensor[:, 0::2] *= scale_x
        result_anchors.tensor[:, 1::2] *= scale_y
        result_masks = [
            x for (i, x) in zip(inds_nonempty.tolist(), result_masks) if i
        ]
        results.pred_masks = _paste_mask_lists_in_image(
            result_masks,
            result_anchors[inds_nonempty],
            results.image_size,
            threshold=mask_threshold,
        )
    return results
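
A sketch of how _postprocess is typically driven from a TensorMask-style inference loop, mapping each per-image result back to the caller-requested resolution; the wrapper name, argument names, and the "height"/"width" keys follow detectron2 conventions and are assumptions, not part of this snippet.

def rescale_outputs(pred_instances, pred_mask_info, batched_inputs, image_sizes):
    # Map each per-image result from the resized input resolution back to the
    # resolution requested by the caller (usually the original image size).
    processed_results = []
    for results_im, mask_info, input_im, in_size in zip(
            pred_instances, pred_mask_info, batched_inputs, image_sizes):
        height = input_im.get("height", in_size[0])  # original image height, if given
        width = input_im.get("width", in_size[1])    # original image width, if given
        processed_results.append(
            {"instances": _postprocess(results_im, mask_info, height, width)})
    return processed_results
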
Example #3
def detector_postprocess(results,
                         output_height,
                         output_width,
                         mask_threshold=0.5):
    """
    Resize the output instances.
    The input images are often resized when entering an object detector.
    As a result, we often need the outputs of the detector in a different
    resolution from its inputs.

    This function will resize the raw outputs of an R-CNN detector
    to produce outputs according to the desired output resolution.

    Args:
        results (Instances): the raw outputs from the detector.
            `results.image_size` contains the input image resolution the detector sees.
            This object might be modified in-place.
        output_height, output_width: the desired output resolution.

    Returns:
        Instances: the resized output from the model, based on the output resolution
    """
    scale_x, scale_y = (output_width / results.image_size[1],
                        output_height / results.image_size[0])
    results = Instances((output_height, output_width), **results.get_fields())

    if results.has("pred_boxes"):
        output_boxes = results.pred_boxes
    elif results.has("proposal_boxes"):
        output_boxes = results.proposal_boxes
    else:
        raise ValueError("results must contain either pred_boxes or proposal_boxes")

    output_boxes.scale(scale_x, scale_y)
    output_boxes.clip(results.image_size)

    results = results[output_boxes.nonempty()]

    if results.has("pred_masks"):
        results.pred_masks = paste_masks_in_image(
            results.pred_masks[:, 0, :, :],  # N, 1, M, M
            results.pred_boxes,
            results.image_size,
            threshold=mask_threshold,
        )

    if results.has("pred_keypoints"):
        results.pred_keypoints[:, :, 0] *= scale_x
        results.pred_keypoints[:, :, 1] *= scale_y

    return results
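
A toy check of the rescaling behaviour of detector_postprocess; the box values and image sizes below are invented for illustration.

import torch
from detectron2.structures import Boxes, Instances

# Pretend the detector saw a 400x600 resized image and the original was 800x1200.
resized = Instances((400, 600))
resized.pred_boxes = Boxes(torch.tensor([[30.0, 40.0, 90.0, 160.0]]))
resized.scores = torch.tensor([0.9])
resized.pred_classes = torch.tensor([0])

restored = detector_postprocess(resized, output_height=800, output_width=1200)
print(restored.pred_boxes.tensor)  # both scale factors are 2: [[60., 80., 180., 320.]]
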
Example #4
    def postprocess(self, results, output_height, output_width, resized_in_h,
                    resized_in_w, padded_im_h, padded_im_w):
        scale_x, scale_y = (output_width / resized_in_w,
                            output_height / resized_in_h)
        # gather detection results into a new Instances object at the output resolution
        results = Instances((output_height, output_width),
                            **results.get_fields())
        # scale detection box results from resized_padded_image space to source image space and clip
        output_boxes = results.pred_boxes
        output_boxes.scale(scale_x, scale_y)
        output_boxes.clip(results.image_size)
        # filter out empty detections in source image space
        results = results[output_boxes.nonempty()]
        if results.has("pred_global_logits"):
            mask_h, mask_w = results.pred_global_logits.shape[-2:]
            factor_h = padded_im_h // mask_h
            factor_w = padded_im_w // mask_w
            assert factor_h == factor_w
            factor = factor_h
            # upsample instance masks (aligned bilinear) to the resized, padded image shape
            pred_global_masks = aligned_bilinear(
                results.pred_global_logits.sigmoid(), factor)
            pred_global_masks = pred_global_masks[:, :, :resized_in_h, :resized_in_w]
            # scale masks from the resized image shape back to the source image shape;
            # this is the inverse of an OpenCV/PIL-style interpolation with
            # align_corners=False
            pred_global_masks = F.interpolate(pred_global_masks,
                                              size=(output_height,
                                                    output_width),
                                              mode="bilinear",
                                              align_corners=False)
            pred_global_masks = pred_global_masks[:, 0, :, :]
            # filter out the pred masks with low confidence score
            results.pred_masks = pred_global_masks > self.infer_mask_threshold

        return results
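
A self-contained toy walk-through of the mask coordinate chain the comments above describe, with plain F.interpolate standing in for aligned_bilinear and every size invented: logits on a stride-8 grid of the padded image are upsampled to the padded size, cropped to the resized size, then resized to the source resolution.

import torch
import torch.nn.functional as F

padded_im_h, padded_im_w = 64, 96        # padded image fed to the network
resized_in_h, resized_in_w = 60, 90      # resized image before padding
output_height, output_width = 120, 180   # source (original) image size

mask_logits = torch.randn(3, 1, padded_im_h // 8, padded_im_w // 8)  # 3 instances
masks = F.interpolate(mask_logits.sigmoid(), scale_factor=8,
                      mode="bilinear", align_corners=False)          # to padded size
masks = masks[:, :, :resized_in_h, :resized_in_w]                    # drop the padding
masks = F.interpolate(masks, size=(output_height, output_width),
                      mode="bilinear", align_corners=False)[:, 0]    # to source size
binary_masks = masks > 0.5
print(binary_masks.shape)  # torch.Size([3, 120, 180])
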
Example #5
    def inference_single_image(self, cate_preds, seg_preds, featmap_size,
                               img_shape, ori_shape):
        """
        Args:
            cate_preds, seg_preds: see :meth:`inference`.
            featmap_size (tuple[int]): spatial size (height, width) of the mask feature map.
            img_shape (tuple): the size of the image fed into the model (height and width).
            ori_shape (tuple): original image shape (height and width).

        Returns:
            result (Instances): predicted results of single image after post-processing.
        """
        assert len(cate_preds) == len(seg_preds)
        result = Instances(ori_shape)

        # overall info.
        h, w = img_shape
        upsampled_size_out = (featmap_size[0] * 4, featmap_size[1] * 4)

        # process.
        inds = (cate_preds > self.score_threshold)
        # category scores.
        cate_scores = cate_preds[inds]
        if len(cate_scores) == 0:
            return result
        # category labels.
        inds = inds.nonzero(as_tuple=False)
        cate_labels = inds[:, 1]

        # strides.
        size_trans = cate_labels.new_tensor(self.seg_num_grids).pow(2).cumsum(0)  # [1600, 2896, 3472, 3728, 3872]
        strides = cate_scores.new_ones(size_trans[-1])
        n_stage = len(self.seg_num_grids)
        strides[:size_trans[0]] *= self.feature_strides[0]
        for ind_ in range(1, n_stage):
            strides[size_trans[ind_ - 1]:size_trans[ind_]] *= self.feature_strides[ind_]
        strides = strides[inds[:, 0]]

        # masks.
        seg_preds = seg_preds[inds[:, 0]]
        seg_masks = seg_preds > self.mask_threshold
        sum_masks = seg_masks.sum((1, 2)).float()

        # filter.
        keep = sum_masks > strides
        if keep.sum() == 0:
            return result

        seg_masks = seg_masks[keep, ...]
        seg_preds = seg_preds[keep, ...]
        sum_masks = sum_masks[keep]
        cate_scores = cate_scores[keep]
        cate_labels = cate_labels[keep]

        # mask scoring.
        seg_scores = (seg_preds * seg_masks.float()).sum((1, 2)) / sum_masks
        cate_scores *= seg_scores

        # sort and keep top nms_pre
        sort_inds = torch.argsort(cate_scores, descending=True)
        if len(sort_inds) > self.nms_per_image:
            sort_inds = sort_inds[:self.nms_per_image]
        seg_masks = seg_masks[sort_inds, :, :]
        seg_preds = seg_preds[sort_inds, :, :]
        sum_masks = sum_masks[sort_inds]
        cate_scores = cate_scores[sort_inds]
        cate_labels = cate_labels[sort_inds]

        # Matrix NMS
        cate_scores = matrix_nms(seg_masks,
                                 cate_labels,
                                 cate_scores,
                                 kernel=self.nms_kernel,
                                 sigma=self.nms_sigma,
                                 sum_masks=sum_masks)

        # filter.
        keep = cate_scores >= self.update_threshold
        if keep.sum() == 0:
            return result
        seg_preds = seg_preds[keep, :, :]
        cate_scores = cate_scores[keep]
        cate_labels = cate_labels[keep]

        # sort and keep top_k
        sort_inds = torch.argsort(cate_scores, descending=True)
        if len(sort_inds) > self.max_detections_per_image:
            sort_inds = sort_inds[:self.max_detections_per_image]
        seg_preds = seg_preds[sort_inds, :, :]
        cate_scores = cate_scores[sort_inds]
        cate_labels = cate_labels[sort_inds]

        seg_preds = F.interpolate(seg_preds.unsqueeze(0),
                                  size=upsampled_size_out,
                                  mode='bilinear')[:, :, :h, :w]
        seg_masks = F.interpolate(seg_preds, size=ori_shape,
                                  mode='bilinear').squeeze(0)
        seg_masks = seg_masks > self.mask_threshold

        seg_masks = BitMasks(seg_masks)
        result.pred_masks = seg_masks
        result.pred_boxes = seg_masks.get_bounding_boxes()
        result.scores = cate_scores
        result.pred_classes = cate_labels
        return result
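
The snippet relies on matrix_nms from SOLOv2; below is a minimal sketch of a Gaussian-kernel Matrix NMS written from the SOLOv2 paper's description, not taken from this repository, and it assumes the masks arrive sorted by descending score (as they do above).

import torch

def matrix_nms_sketch(seg_masks, cate_labels, cate_scores, sigma=2.0, sum_masks=None):
    # Decay every score by its mask IoU with higher-scoring masks of the same class.
    n = len(cate_labels)
    masks = seg_masks.reshape(n, -1).float()
    if sum_masks is None:
        sum_masks = masks.sum(1)
    inter = torch.mm(masks, masks.t())                               # pairwise intersections
    union = sum_masks[:, None] + sum_masks[None, :] - inter
    iou = (inter / union).triu(diagonal=1)                           # rows index higher-scoring masks
    same_class = (cate_labels[:, None] == cate_labels[None, :]).float().triu(diagonal=1)
    decay_iou = iou * same_class
    # compensate_iou[i, j]: the largest IoU that mask i itself suffered from masks above it.
    compensate_iou = decay_iou.max(dim=0).values.expand(n, n).t()
    decay = torch.exp(-sigma * (decay_iou ** 2 - compensate_iou ** 2)).min(dim=0).values
    return cate_scores * decay
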
Example #6
    def inference_single_image(self, cate_preds, seg_preds_x, seg_preds_y,
                               featmap_size, img_shape, ori_shape):
        """Decoupled SOLO single-image inference: each kept category-grid cell selects
        one x-branch and one y-branch mask channel; their product is the instance mask."""
        result = Instances(ori_shape)

        # overall info.
        h, w = img_shape
        upsampled_size_out = (featmap_size[0] * 4, featmap_size[1] * 4)

        # trans trans_diff.
        trans_size = torch.Tensor(self.seg_num_grids).pow(2).cumsum(0).long()
        trans_diff = torch.ones(trans_size[-1].item(),
                                device=self.device).long()
        num_grids = torch.ones(trans_size[-1].item(),
                               device=self.device).long()
        seg_size = torch.Tensor(self.seg_num_grids).cumsum(0).long()
        seg_diff = torch.ones(trans_size[-1].item(), device=self.device).long()
        strides = torch.ones(trans_size[-1].item(), device=self.device)

        n_stage = len(self.seg_num_grids)
        trans_diff[:trans_size[0]] *= 0
        seg_diff[:trans_size[0]] *= 0
        num_grids[:trans_size[0]] *= self.seg_num_grids[0]
        strides[:trans_size[0]] *= self.feature_strides[0]

        for ind_ in range(1, n_stage):
            trans_diff[trans_size[ind_ - 1]:trans_size[ind_]] *= trans_size[ind_ - 1]
            seg_diff[trans_size[ind_ - 1]:trans_size[ind_]] *= seg_size[ind_ - 1]
            num_grids[trans_size[ind_ - 1]:trans_size[ind_]] *= self.seg_num_grids[ind_]
            strides[trans_size[ind_ - 1]:trans_size[ind_]] *= self.feature_strides[ind_]

        # process.
        inds = (cate_preds > self.score_threshold)
        # category scores.
        cate_scores = cate_preds[inds]

        # category labels.
        inds = inds.nonzero(as_tuple=False)
        trans_diff = torch.index_select(trans_diff, dim=0, index=inds[:, 0])
        seg_diff = torch.index_select(seg_diff, dim=0, index=inds[:, 0])
        num_grids = torch.index_select(num_grids, dim=0, index=inds[:, 0])
        strides = torch.index_select(strides, dim=0, index=inds[:, 0])

        y_inds = (inds[:, 0] - trans_diff) // num_grids
        x_inds = (inds[:, 0] - trans_diff) % num_grids
        y_inds += seg_diff
        x_inds += seg_diff

        cate_labels = inds[:, 1]
        seg_masks_soft = seg_preds_x[x_inds, ...] * seg_preds_y[y_inds, ...]
        seg_masks = seg_masks_soft > self.mask_threshold
        sum_masks = seg_masks.sum((1, 2)).float()

        # filter.
        keep = sum_masks > strides
        if keep.sum() == 0:
            return result

        seg_masks_soft = seg_masks_soft[keep, ...]
        seg_masks = seg_masks[keep, ...]
        cate_scores = cate_scores[keep]
        sum_masks = sum_masks[keep]
        cate_labels = cate_labels[keep]

        # mask scoring
        seg_score = (seg_masks_soft * seg_masks.float()).sum(
            (1, 2)) / sum_masks
        cate_scores *= seg_score

        if len(cate_scores) == 0:
            return result

        # sort and keep top nms_pre
        sort_inds = torch.argsort(cate_scores, descending=True)
        if len(sort_inds) > self.nms_per_image:
            sort_inds = sort_inds[:self.nms_per_image]
        seg_masks_soft = seg_masks_soft[sort_inds, :, :]
        seg_masks = seg_masks[sort_inds, :, :]
        cate_scores = cate_scores[sort_inds]
        sum_masks = sum_masks[sort_inds]
        cate_labels = cate_labels[sort_inds]

        # Matrix NMS
        cate_scores = matrix_nms(seg_masks,
                                 cate_labels,
                                 cate_scores,
                                 kernel=self.nms_kernel,
                                 sigma=self.nms_sigma,
                                 sum_masks=sum_masks)

        # filter.
        keep = cate_scores >= self.update_threshold
        seg_masks_soft = seg_masks_soft[keep, :, :]
        cate_scores = cate_scores[keep]
        cate_labels = cate_labels[keep]

        # sort and keep top_k
        sort_inds = torch.argsort(cate_scores, descending=True)
        if len(sort_inds) > self.max_detections_per_image:
            sort_inds = sort_inds[:self.max_detections_per_image]
        seg_masks_soft = seg_masks_soft[sort_inds, :, :]
        cate_scores = cate_scores[sort_inds]
        cate_labels = cate_labels[sort_inds]

        seg_masks_soft = F.interpolate(seg_masks_soft.unsqueeze(0),
                                       size=upsampled_size_out,
                                       mode='bilinear')[:, :, :h, :w]
        seg_masks = F.interpolate(seg_masks_soft,
                                  size=ori_shape,
                                  mode='bilinear').squeeze(0)
        seg_masks = seg_masks > self.mask_threshold

        seg_masks = BitMasks(seg_masks)
        result.pred_masks = seg_masks
        result.pred_boxes = seg_masks.get_bounding_boxes()
        result.scores = cate_scores
        result.pred_classes = cate_labels
        return result
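
The block of index arithmetic at the top of this example maps each flattened category-grid cell to its row/column on the per-level S x S grid and to the channel offsets of the x/y mask branches; here is a toy, self-contained illustration with two made-up levels.

import torch

seg_num_grids = [4, 2]                                       # two levels: 4x4 and 2x2 grids
trans_size = torch.tensor(seg_num_grids).pow(2).cumsum(0)    # [16, 20] flattened cells
seg_size = torch.tensor(seg_num_grids).cumsum(0)             # [4, 6] mask-channel offsets

flat_idx = torch.tensor([5, 18])                             # one cell from each level
level = (flat_idx[:, None] >= trans_size[None, :]).sum(1)    # which level each cell lives on
trans_diff = torch.where(level == 0, torch.zeros_like(flat_idx), trans_size[level - 1])
seg_diff = torch.where(level == 0, torch.zeros_like(flat_idx), seg_size[level - 1])
num_grid = torch.tensor(seg_num_grids)[level]

y_inds = (flat_idx - trans_diff) // num_grid + seg_diff      # row -> seg_preds_y channel
x_inds = (flat_idx - trans_diff) % num_grid + seg_diff       # column -> seg_preds_x channel
print(y_inds.tolist(), x_inds.tolist())                      # [1, 5] [1, 4]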