def show_annotated_image(img,
                         boxes,
                         labels,
                         scores,
                         class_ids,
                         score_threshold=0.0,
                         default_boxes=None,
                         transform_corners=None,
                         max_dets=None,
                         showfig=False,
                         image_id=None):
    good_ids = torch.nonzero(scores.float() > score_threshold).view(-1)
    if good_ids.numel() > 0:
        if max_dets is not None:
            _, ids = scores[good_ids].sort(descending=False)
            good_ids = good_ids[ids[-max_dets:]]
        boxes = boxes[good_ids].cpu()
        labels = labels[good_ids].cpu()
        scores = scores[good_ids].cpu()
        label_names = ["Cl " + str(l.item()) for l in labels]
        box_colors = ["yellow"] * len(boxes)
    else:
        boxes = BoxList.create_empty(boxes.image_size)
        labels = torch.LongTensor(0)
        scores = torch.FloatTensor(0)
        label_names = []
        box_colors = []

    # create visualizations of default boxes
    if default_boxes is not None:
        default_boxes = default_boxes[good_ids].cpu()

        # append boxes
        boxes = torch.cat([default_boxes.bbox_xyxy, boxes.bbox_xyxy], 0)
        labels = torch.cat(
            [torch.Tensor(len(default_boxes)).to(labels).zero_(), labels], 0)
        scores = torch.cat([
            torch.Tensor(len(default_boxes)).to(scores).fill_(float("nan")),
            scores
        ], 0)
        label_names = [""] * len(default_boxes) + label_names
        box_colors = ["cyan"] * len(default_boxes) + box_colors
    else:
        boxes = boxes.bbox_xyxy

    if transform_corners is not None:
        # draw polygons representing the corners of a transformation
        transform_corners = transform_corners[good_ids].cpu()

    vis_image(img,
              showfig=showfig,
              boxes=boxes,
              scores=scores,
              label_names=label_names,
              colors=box_colors,
              image_id=image_id,
              polygons=transform_corners)
    return
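A minimal, self-contained sketch (not from the repository) of the score-threshold plus top-k filtering used above, with plain tensors standing in for BoxList; the helper name filter_detections is made up for illustration:

import torch

def filter_detections(boxes_xyxy, scores, score_threshold=0.0, max_dets=None):
    # keep detections scoring above the threshold
    good_ids = torch.nonzero(scores.float() > score_threshold).view(-1)
    if good_ids.numel() > 0 and max_dets is not None:
        # keep only the max_dets highest-scoring detections
        _, order = scores[good_ids].sort(descending=False)
        good_ids = good_ids[order[-max_dets:]]
    return boxes_xyxy[good_ids], scores[good_ids]

# usage on dummy data
boxes = torch.tensor([[0., 0., 10., 10.], [5., 5., 20., 20.], [1., 1., 2., 2.]])
scores = torch.tensor([0.9, 0.2, 0.6])
kept_boxes, kept_scores = filter_detections(boxes, scores, score_threshold=0.5, max_dets=1)
print(kept_boxes)  # only the highest-scoring box above 0.5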
Example #2
def apply_transform_to_corners(masked_transform_corners, transform, img_size):
    # the corners come as 8 numbers per box in the x,y,x,y,x,y,x,y format;
    # reshape them into pairs of points so that the BoxList transform can be applied
    masked_transform_corners = masked_transform_corners.contiguous().view(-1, 4)
    corners_as_boxes = BoxList(masked_transform_corners, img_size, mode="xyxy")
    corners_as_boxes = transform(corners_as_boxes)
    masked_transform_corners = corners_as_boxes.bbox_xyxy.contiguous().view(-1, 8)
    return masked_transform_corners
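A rough, repository-independent illustration of the reshape trick above: the 8 corner coordinates per box are viewed as two xyxy "boxes" so that an existing box transform can be reused, then viewed back; shift_boxes_xyxy is a toy stand-in for the BoxList transform:

import torch

def shift_boxes_xyxy(boxes, dx, dy):
    # toy stand-in for a BoxList transform: translate xyxy boxes
    shifted = boxes.clone()
    shifted[:, 0::2] += dx  # x coordinates
    shifted[:, 1::2] += dy  # y coordinates
    return shifted

corners = torch.arange(16, dtype=torch.float).view(2, 8)  # 2 boxes x 8 corner numbers
corners_as_boxes = corners.contiguous().view(-1, 4)       # each pair of corner points as a pseudo-box
corners_as_boxes = shift_boxes_xyxy(corners_as_boxes, dx=3.0, dy=-1.0)
corners = corners_as_boxes.contiguous().view(-1, 8)       # back to 8 numbers per box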
Example #3
    def _get_default_boxes(self, img_size):
        """Compute the default (anchor) bounding boxes given the image size.
        Not caching this because both self._get_feature_map_size_per_image_size and self.output_box_grid_generator.create_strided_boxes_columnfirst are cached.

        Args:
            img_size (FeatureMapSize)
        Return:
            boxes_xyxy (BoxList)
        """
        feature_map_size = self._get_feature_map_size_per_image_size(img_size)
        boxes_xyxy = self.output_box_grid_generator.create_strided_boxes_columnfirst(
            feature_map_size)
        boxes_xyxy = BoxList(boxes_xyxy, image_size=img_size, mode="xyxy")
        return boxes_xyxy
Example #4
    def build_boxes_from_loc_scores(self, loc_scores, default_boxes):
        """build_boxes_from_loc_scores is a wrapper for the torchvision implemetation of box decoding
        Cannot be static because the torchvision method for decode is not static (this can be easily fixed id needed).

        build_boxes_from_loc_scores and build_loc_targets implement inverse functionality:
        self.build_loc_targets(self.build_boxes_from_loc_scores(loc_scores, default_boxes), default_boxes)
        should be very close to loc_scores

        Ref: https://github.com/pytorch/vision/blob/master/torchvision/models/detection/_utils.py
        """
        box_preds = self.box_coder.decode_single(loc_scores,
                                                 default_boxes.bbox_xyxy)
        return BoxList(box_preds,
                       image_size=default_boxes.image_size,
                       mode="xyxy")
Example #5
def create_strided_boxes_columnfirst(grid_size, box_size, box_stride):
    """Create a list of boxes, shifted horizontally and vertically with some stride. The boxes are appearinf in the column-first (vertical shift first) order starting from the top left. The boxes are in the x1y1x2y2 format.
  
    Args:
      grid_size: (tuple of len 2) height and width of the grid, the number of boxes equals grid_size.w * grid_size.h
      box_size: (tuple of len 2) height and width of all the boxes
      box_stride: (tuple of len 2) vertical and horizontal strides, respectively

    Returns:
      (Tensor) tensor of boxes, size [grid_size.w * grid_size.h, 4]

    Comment: even vectorized this functions can be quite slow, thus I put it into functools.lru_cache decorator to cache the calls
    """
    # # slow code
    # boxes_cXcYWH = []
    # for h in range(grid_size.h):
    #     for w in range(grid_size.w):
    #         cx = (w + 0.5) * box_stride.w
    #         cy = (h + 0.5) * box_stride.h
    #         boxes_cXcYWH.append((cx, cy, box_size.w, box_size.h))
    # boxes_cXcYWH = torch.FloatTensor(boxes_cXcYWH)  # 'cx cy w h'

    # vectorized code

    # get center positions
    h = torch.arange(0, grid_size.h, dtype=torch.float)
    cy = (h + 0.5) * box_stride.h
    w = torch.arange(0, grid_size.w, dtype=torch.float)
    cx = (w + 0.5) * box_stride.w

    # make tuples of coordinates
    cx = cx.unsqueeze(0).expand(cy.size(0), -1).contiguous()
    cy = cy.unsqueeze(1).expand(-1, cx.size(1)).contiguous()
    cx = cx.view(-1)
    cy = cy.view(-1)

    # create sizes of appropriate length
    sx = torch.FloatTensor([box_size.w]).expand_as(cx)
    sy = torch.FloatTensor([box_size.h]).expand_as(cy)

    boxes_cXcYWH = torch.stack([cx, cy, sx, sy], dim=1)

    boxes_xyxy = BoxList.convert_bbox_format(boxes_cXcYWH, "cx_cy_w_h", "xyxy")
    return boxes_xyxy
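A self-contained sketch of the same anchor-grid construction on concrete numbers; GridSize below is a hypothetical stand-in for the repository's FeatureMapSize, and being a (hashable) namedtuple is what makes the functools.lru_cache caching mentioned in the docstring possible:

import torch
from collections import namedtuple
from functools import lru_cache

GridSize = namedtuple("GridSize", ["h", "w"])  # stand-in for FeatureMapSize

@lru_cache()
def strided_boxes_xyxy(grid_size, box_size, box_stride):
    # centers of the boxes, one per grid cell
    cy = (torch.arange(grid_size.h, dtype=torch.float) + 0.5) * box_stride.h
    cx = (torch.arange(grid_size.w, dtype=torch.float) + 0.5) * box_stride.w
    cx = cx.unsqueeze(0).expand(grid_size.h, -1).reshape(-1)
    cy = cy.unsqueeze(1).expand(-1, grid_size.w).reshape(-1)
    # convert cx, cy, w, h to x1, y1, x2, y2
    half_w, half_h = box_size.w / 2, box_size.h / 2
    return torch.stack([cx - half_w, cy - half_h, cx + half_w, cy + half_h], dim=1)

boxes = strided_boxes_xyxy(GridSize(h=2, w=3), GridSize(h=4, w=4), GridSize(h=2, w=2))
print(boxes.shape)  # torch.Size([6, 4])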
Example #6
    def get_bounding_boxes(self, boxes, score_threshold=0.6, max_dets=8):
        scores = boxes.get_field("scores").clone()

        good_ids = torch.nonzero(scores.float() > score_threshold).view(-1)
        if good_ids.numel() > 0:
            if max_dets is not None:
                _, ids = scores[good_ids].sort(descending=False)
                good_ids = good_ids[ids[-max_dets:]]
            boxes = boxes[good_ids].cpu()
        else:
            boxes = BoxList.create_empty(boxes.image_size)

        default_boxes = boxes.get_field("default_boxes") if boxes.has_field(
            "default_boxes") else None
        if default_boxes is not None:
            default_boxes = default_boxes[good_ids].cpu()

            # append boxes
            boxes = torch.cat([default_boxes.bbox_xyxy, boxes.bbox_xyxy], 0)
        else:
            boxes = boxes.bbox_xyxy

        return self.bounding_boxes(boxes)
Example #7
    def get_box_to_cut_anchor(self,
                              img_size,
                              crop_size,
                              fm_size,
                              default_box_transform=None):
        """For each anchor box, obtain the box of the size crop_size such that
            2) the anchor box is roughly in the middle of the crop
            2) it is aligned with the stride of the anchor boxes
        Need this function so make sure that after cropping the original image we get the same cropped feature map
        (problems are caused by the network stride).
        Used in train.mine_hard_patches.
        Args:
            img_size (FeatureMapSize) - size of the original image
            crop_size (FeatureMapSize) - size of the crop needed for training
            fm_size (FeatureMapSize) - size of the feature map from this image
            default_box_transform (TransformList) - transformation to convert the boxes to the img_size scale
        Returns:
            crop_boxes_xyxy, anchor_box (BoxList)
            anchor_index (tensor of indices)
        """

        # anchors are encoded in the column-first row-last order
        # converting position (anchor_index) to (anchor_y_index, anchor_x_index)
        anchor_index = torch.arange(fm_size.h * fm_size.w)
        anchor_y_index = anchor_index // fm_size.w
        anchor_x_index = anchor_index % fm_size.w

        # get the center of the anchor
        cx = (anchor_x_index.float() + 0.5) * self.box_stride.w
        cy = (anchor_y_index.float() + 0.5) * self.box_stride.h

        # get the top-left corner of the box to crop
        box_left = cx - crop_size.w / 2
        box_top = cy - crop_size.h / 2

        anchor_box = torch.stack([
            cx, cy,
            torch.full_like(cx, self.box_size.w),
            torch.full_like(cx, self.box_size.h)
        ], 1)
        anchor_box = BoxList.convert_bbox_format(anchor_box, "cx_cy_w_h",
                                                 "xyxy")

        # round down to strided positions in the image
        def floor_to_stride(pos, stride):
            return (torch.floor(pos) // stride) * stride

        def ceil_to_stride(pos, stride):
            return torch.floor(torch.ceil(torch.floor(pos) / stride)) * stride

        box_left = masked_select_or_fill_constant(
            floor_to_stride(box_left, self.box_stride.w), box_left > 0, 0)
        box_top = masked_select_or_fill_constant(
            floor_to_stride(box_top, self.box_stride.h), box_top > 0, 0)

        # get another corner
        box_right = box_left + crop_size.w
        box_bottom = box_top + crop_size.h

        # make sure the crop is in the image: this strategy should be compatible with the one used in augmentation.crop_image
        mask_have_to_move_right = box_left < 0
        box_right[mask_have_to_move_right] -= box_left[mask_have_to_move_right]
        box_left[mask_have_to_move_right] = 0

        mask = box_right > img_size.w
        shift_left = ceil_to_stride(box_right - img_size.w, self.box_stride.w)
        mask_good_fit = (box_left - shift_left >= 0)
        # can safely shift left
        box_left[mask & mask_good_fit] -= shift_left[mask & mask_good_fit]
        box_right[mask & mask_good_fit] -= shift_left[mask & mask_good_fit]
        # just output full width
        box_left[mask & ~mask_good_fit] = 0
        box_right[mask & ~mask_good_fit] = crop_size.w

        mask_have_to_move_down = box_top < 0
        box_bottom[mask_have_to_move_down] -= box_top[mask_have_to_move_down]
        box_top[mask_have_to_move_down] = 0

        mask = box_bottom > img_size.h
        shift_up = ceil_to_stride(box_bottom - img_size.h, self.box_stride.h)
        mask_good_fit = (box_top - shift_up >= 0)
        # can safely shift up
        box_top[mask & mask_good_fit] -= shift_up[mask & mask_good_fit]
        box_bottom[mask & mask_good_fit] -= shift_up[mask & mask_good_fit]
        # just output full height
        box_top[mask & ~mask_good_fit] = 0
        box_bottom[mask & ~mask_good_fit] = crop_size.h

        # assemble the box
        crop_boxes_xyxy = torch.stack(
            [box_left, box_top, box_right, box_bottom], 1)  # lx ty rx by

        # convert boxes to the original image coordinates
        crop_boxes_xyxy = BoxList(crop_boxes_xyxy, img_size, mode="xyxy")
        anchor_box = BoxList(anchor_box, img_size, mode="xyxy")
        if default_box_transform is not None:
            crop_boxes_xyxy = default_box_transform(crop_boxes_xyxy)
            anchor_box = default_box_transform(anchor_box)

        return crop_boxes_xyxy, anchor_box, anchor_index
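A small sketch, outside the repository, of the stride-rounding helpers used above (ceil_to_stride slightly simplified); it shows how crop corners get snapped down or up to multiples of the anchor stride:

import torch

def floor_to_stride(pos, stride):
    # snap positions down to the nearest multiple of the stride
    return (torch.floor(pos) // stride) * stride

def ceil_to_stride(pos, stride):
    # snap positions up to the nearest multiple of the stride
    return torch.ceil(torch.floor(pos) / stride) * stride

pos = torch.tensor([0.0, 5.7, 16.0, 33.2])
print(floor_to_stride(pos, 16))  # tensor([ 0.,  0., 16., 32.])
print(ceil_to_stride(pos, 16))   # tensor([ 0., 16., 16., 48.])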
Example #8
    def forward(self, feature_maps):
        """
        Args:
            feature_maps (Tensor[float], size b^A x d x h^A x w^A) - contains the feature map of the input image
                b^A - batch size
                d - feature dimensionality
                h^A - height of the feature map
                w^A - width of the feature map

        Returns:
            # here b^C is the class batch size, i.e., the number of class images contained in self.class_batch_size passed when creating this object
            output_localization (Tensor[float], size b^A x b^C x 4 x h^A x w^A) - the localization output w.r.t. the standard box encoding - computed by DetectionBoxCoder.build_loc_targets
            output_recognition (Tensor[float], size b^A x b^C x 1 x h^A x w^A) - the recognition output for each of the classes:
                in the [-1, 1] segment, the higher the better the match to the class
            output_recognition_transform_detached (Tensor[float], size b^A x b^C x 1 x h^A x w^A) - same as output_recognition,
                but with the computational graph detached from the transformation (for a backward pass that does not update
                the transformation - intended for the negatives)
            corner_coordinates (Tensor[float], size b^A x b^C x 8 x h^A x w^A) - the corners of the default boxes after
                the transformation, detached from the computational graph, for visualization only
        """
        # get dims
        batch_size = feature_maps.size(0)
        feature_dim = feature_maps.size(1)
        image_fm_size = FeatureMapSize(img=feature_maps)
        class_fm_size = FeatureMapSize(img=self.class_feature_maps)
        feature_dim_for_regression = class_fm_size.h * class_fm_size.w

        class_feature_dim = self.class_feature_maps.size(1)
        assert feature_dim == class_feature_dim, "Feature dimensionality of input={0} and class={1} feature maps has to be equal".format(
            feature_dim, class_feature_dim)

        # L2-normalize the feature map
        feature_maps = normalize_feature_map_L2(feature_maps, 1e-5)

        # get correlations all to all
        corr_maps = torch.einsum("bfhw,afxy->abwhxy", self.class_feature_maps,
                                 feature_maps)
        # need to try to optimize this with opt_einsum: https://optimized-einsum.readthedocs.io/en/latest/
        # CAUTION: note the switch of dimensions hw to wh. This is done for compatibility with the FeatureCorrelation class by Ignacio Rocco https://github.com/ignacio-rocco/ncnet/blob/master/lib/model.py (to be able to load their models)

        # reshape to have the correlation map of dimensions similar to the standard tensor for image feature maps
        corr_maps = corr_maps.contiguous().view(
            batch_size * self.class_batch_size, feature_dim_for_regression,
            image_fm_size.h, image_fm_size.w)

        # compute the grids to resample corr maps
        resampling_grids_local_coord = self.aligner(corr_maps)

        # build classifications outputs
        cor_maps_for_recognition = corr_maps.contiguous().view(
            batch_size, self.class_batch_size, feature_dim_for_regression,
            image_fm_size.h, image_fm_size.w)
        resampling_grids_local_coord = resampling_grids_local_coord.contiguous(
        ).view(batch_size, self.class_batch_size, image_fm_size.h,
               image_fm_size.w, self.aligner.out_grid_size.h,
               self.aligner.out_grid_size.w, 2)

        # need to recompute resampling_grids to [-1, 1] coordinates w.r.t. the feature maps to sample points with F.grid_sample
        # first get the list of boxes that corresponds to the receptive fields of the parameter regression network: box sizes are the receptive field sizes, stride is the network stride
        default_boxes_xyxy_wrt_fm = self.box_grid_generator_feature_map_level.create_strided_boxes_columnfirst(
            fm_size=image_fm_size)

        default_boxes_xyxy_wrt_fm = default_boxes_xyxy_wrt_fm.view(
            1, 1, image_fm_size.h, image_fm_size.w, 4)
        # 1 (to broadcast to batch_size) x 1 (to broadcast to class batch_size) x  box_grid_height x box_grid_width x 4
        default_boxes_xyxy_wrt_fm = default_boxes_xyxy_wrt_fm.to(
            resampling_grids_local_coord.device)
        resampling_grids_fm_coord = convert_box_coordinates_local_to_global(
            resampling_grids_local_coord, default_boxes_xyxy_wrt_fm)

        # convert to coordinates normalized to [-1, 1] (to be compatible with torch.nn.functional.grid_sample)
        resampling_grids_fm_coord_x = resampling_grids_fm_coord.narrow(
            -1, 0, 1)
        resampling_grids_fm_coord_y = resampling_grids_fm_coord.narrow(
            -1, 1, 1)
        resampling_grids_fm_coord_unit = torch.cat([
            resampling_grids_fm_coord_x / (image_fm_size.w - 1) * 2 - 1,
            resampling_grids_fm_coord_y / (image_fm_size.h - 1) * 2 - 1
        ],
                                                   dim=-1)
        # clamp to fit the image plane
        resampling_grids_fm_coord_unit = resampling_grids_fm_coord_unit.clamp(
            -1, 1)

        # extract and pool matches
        # # slower code:
        # output_recognition = self.resample_of_correlation_map_simple(cor_maps_for_recognition,
        #                                                          resampling_grids_fm_coord_unit,
        #                                                          self.class_pool_mask)

        # we use a faster, but somewhat more obscure, version
        output_recognition = self.resample_of_correlation_map_fast(
            cor_maps_for_recognition, resampling_grids_fm_coord_unit,
            self.class_pool_mask)
        if output_recognition.requires_grad:
            output_recognition_transform_detached = self.resample_of_correlation_map_fast(
                cor_maps_for_recognition,
                resampling_grids_fm_coord_unit.detach(), self.class_pool_mask)
        else:
            # Optimization to make eval faster
            output_recognition_transform_detached = output_recognition

        # build localization targets
        default_boxes_xyxy_wrt_image = self.box_grid_generator_image_level.create_strided_boxes_columnfirst(
            fm_size=image_fm_size)

        default_boxes_xyxy_wrt_image = default_boxes_xyxy_wrt_image.view(
            1, 1, image_fm_size.h, image_fm_size.w, 4)
        # 1 (to broadcast to batch_size) x 1 (to broadcast to class batch_size) x  box_grid_height x box_grid_width x 4
        default_boxes_xyxy_wrt_image = default_boxes_xyxy_wrt_image.to(
            resampling_grids_local_coord.device)
        resampling_grids_image_coord = convert_box_coordinates_local_to_global(
            resampling_grids_local_coord, default_boxes_xyxy_wrt_image)

        num_pooled_points = self.aligner.out_grid_size.w * self.aligner.out_grid_size.h
        resampling_grids_x = resampling_grids_image_coord.narrow(
            -1, 0, 1).contiguous().view(-1, num_pooled_points)
        resampling_grids_y = resampling_grids_image_coord.narrow(
            -1, 1, 1).contiguous().view(-1, num_pooled_points)
        class_boxes_xyxy = torch.stack([
            resampling_grids_x.min(dim=1)[0],
            resampling_grids_y.min(dim=1)[0],
            resampling_grids_x.max(dim=1)[0],
            resampling_grids_y.max(dim=1)[0]
        ], 1)

        # extract rectangle borders to draw complete boxes
        corner_coordinates = resampling_grids_image_coord[:, :, :, :, [
            0, -1
        ]][:, :, :, :, :, [0, -1]]  # only the corners
        corner_coordinates = corner_coordinates.detach_()
        corner_coordinates = corner_coordinates.view(
            batch_size, self.class_batch_size, image_fm_size.h,
            image_fm_size.w,
            8)  # batch_size x label_batch_size x fm_height x fm_width x 8
        corner_coordinates = corner_coordinates.transpose(3, 4).transpose(
            2, 3)  # batch_size x label_batch_size x 8 x fm_height x fm_width

        class_boxes = BoxList(class_boxes_xyxy.view(-1, 4),
                              image_fm_size,
                              mode="xyxy")
        default_boxes_wrt_image = BoxList(default_boxes_xyxy_wrt_image.view(
            -1, 4),
                                          image_fm_size,
                                          mode="xyxy")
        default_boxes_with_image_batches = cat_boxlist(
            [default_boxes_wrt_image] * batch_size * self.class_batch_size)

        output_localization = Os2dBoxCoder.build_loc_targets(
            class_boxes, default_boxes_with_image_batches)  # num_boxes x 4
        output_localization = output_localization.view(
            batch_size, self.class_batch_size, image_fm_size.h,
            image_fm_size.w,
            4)  # batch_size x label_batch_size x fm_height x fm_width x 4
        output_localization = output_localization.transpose(3, 4).transpose(
            2, 3)  # batch_size x label_batch_size x 4 x fm_height x fm_width

        return output_localization, output_recognition, output_recognition_transform_detached, corner_coordinates
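A shape-level sketch with made-up sizes (not the repository modules) of two building blocks of the forward pass above: the all-to-all feature correlation via torch.einsum and the conversion of pixel coordinates to the [-1, 1] range expected by F.grid_sample:

import torch
import torch.nn.functional as F

b_A, b_C, d = 2, 3, 8            # image batch, class batch, feature dimensionality
h_A, w_A, h_C, w_C = 6, 5, 4, 4  # image and class feature map sizes

image_fm = torch.randn(b_A, d, h_A, w_A)
class_fm = torch.randn(b_C, d, h_C, w_C)

# all-to-all correlation; note the hw -> wh switch of the class feature map dims
corr = torch.einsum("bfhw,afxy->abwhxy", class_fm, image_fm)
print(corr.shape)  # torch.Size([2, 3, 4, 4, 6, 5]) = b_A x b_C x w_C x h_C x h_A x w_A

# pixel coordinates -> [-1, 1] grid_sample coordinates (x first, then y)
grid_pix = torch.rand(b_A, h_C, w_C, 2) * torch.tensor([w_A - 1, h_A - 1], dtype=torch.float)
grid_unit = torch.cat([grid_pix[..., :1] / (w_A - 1) * 2 - 1,
                       grid_pix[..., 1:] / (h_A - 1) * 2 - 1], dim=-1).clamp(-1, 1)
sampled = F.grid_sample(image_fm, grid_unit, align_corners=True)
print(sampled.shape)  # torch.Size([2, 8, 4, 4])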