def nms(boxes: Tensor, scores: Tensor, iou_threshold: float) -> Tensor:
    """
    Performs non-maximum suppression (NMS) on the boxes according
    to their intersection-over-union (IoU).

    NMS iteratively removes lower scoring boxes which have an
    IoU greater than iou_threshold with another (higher scoring)
    box.

    If multiple boxes have the exact same score and satisfy the IoU
    criterion with respect to a reference box, the selected box is
    not guaranteed to be the same between CPU and GPU. This is similar
    to the behavior of argsort in PyTorch when repeated values are present.

    Args:
        boxes (Tensor[N, 4]): boxes to perform NMS on. They
            are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and
            ``0 <= y1 < y2``.
        scores (Tensor[N]): scores for each one of the boxes
        iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold

    Returns:
        Tensor: int64 tensor with the indices of the elements that have been kept
        by NMS, sorted in decreasing order of scores
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(nms)
    _assert_has_ops()
    return torch.ops.torchvision.nms(boxes, scores, iou_threshold)
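# Illustrative usage sketch (added here, not part of the original source). The helper
# name `_nms_usage_example` is ours; it assumes `torch` and the compiled torchvision
# ops are available, as the wrapper above already does. Box/score values are made up.
def _nms_usage_example() -> Tensor:
    # Three boxes in (x1, y1, x2, y2) format; the first two overlap heavily.
    boxes = torch.tensor(
        [
            [0.0, 0.0, 10.0, 10.0],
            [1.0, 1.0, 11.0, 11.0],
            [50.0, 50.0, 60.0, 60.0],
        ]
    )
    scores = torch.tensor([0.9, 0.8, 0.3])
    # IoU(box 0, box 1) ~= 0.68 > 0.5, so the lower-scoring box 1 is suppressed;
    # the expected result is tensor([0, 2]).
    return nms(boxes, scores, iou_threshold=0.5)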
def nms(boxes: Tensor, scores: Tensor, iou_threshold: float) -> Tensor:
    """
    Performs non-maximum suppression (NMS) on the boxes according
    to their intersection-over-union (IoU).

    NMS iteratively removes lower scoring boxes which have an
    IoU greater than iou_threshold with another (higher scoring)
    box.

    If multiple boxes have the exact same score and satisfy the IoU
    criterion with respect to a reference box, the selected box is
    not guaranteed to be the same between CPU and GPU. This is similar
    to the behavior of argsort in PyTorch when repeated values are present.

    Parameters
    ----------
    boxes : Tensor[N, 4]
        boxes to perform NMS on. They are expected to be in
        (x1, y1, x2, y2) format
    scores : Tensor[N]
        scores for each one of the boxes
    iou_threshold : float
        discards all overlapping boxes with IoU > iou_threshold

    Returns
    -------
    keep : Tensor
        int64 tensor with the indices of the elements that have been kept
        by NMS, sorted in decreasing order of scores
    """
    _assert_has_ops()
    return torch.ops.torchvision.nms(boxes, scores, iou_threshold)
def roi_pool(
    input: Tensor,
    boxes: Tensor,
    output_size: BroadcastingList2[int],
    spatial_scale: float = 1.0,
) -> Tensor:
    """
    Performs Region of Interest (RoI) Pool operator described in Fast R-CNN

    Arguments:
        input (Tensor[N, C, H, W]): input tensor
        boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
            format where the regions will be taken from. If a single Tensor is passed,
            then the first column should contain the batch index. If a list of Tensors
            is passed, then each Tensor will correspond to the boxes for an element i
            in a batch
        output_size (int or Tuple[int, int]): the size of the output after the cropping
            is performed, as (height, width)
        spatial_scale (float): a scaling factor that maps the box coordinates to
            the input coordinates. Default: 1.0

    Returns:
        output (Tensor[K, C, output_size[0], output_size[1]])
    """
    _assert_has_ops()
    check_roi_boxes_shape(boxes)
    rois = boxes
    output_size = _pair(output_size)
    if not isinstance(rois, torch.Tensor):
        rois = convert_boxes_to_roi_format(rois)
    output, _ = torch.ops.torchvision.roi_pool(input, rois, spatial_scale, output_size[0], output_size[1])
    return output
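# Illustrative usage sketch (added, not from the original source). `_roi_pool_usage_example`
# is a hypothetical helper name; it feeds a random feature map and one RoI in the
# Tensor[K, 5] format (batch index in the first column) into the roi_pool wrapper above.
def _roi_pool_usage_example() -> Tensor:
    feature_map = torch.rand(1, 8, 32, 32)  # N=1, C=8, H=W=32
    # One RoI: batch index 0, then (x1, y1, x2, y2). With spatial_scale=1.0 the box
    # is interpreted directly in feature-map coordinates.
    rois = torch.tensor([[0.0, 4.0, 4.0, 20.0, 20.0]])
    # Result has shape [K, C, 7, 7] = [1, 8, 7, 7].
    return roi_pool(feature_map, rois, output_size=(7, 7), spatial_scale=1.0)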
def nms(boxes: Tensor, scores: Tensor, iou_threshold: float) -> Tensor:
    """
    Performs non-maximum suppression (NMS) on the boxes according
    to their intersection-over-union (IoU).

    NMS iteratively removes lower scoring boxes which have an
    IoU greater than iou_threshold with another (higher scoring)
    box.

    If multiple boxes have the exact same score and satisfy the IoU
    criterion with respect to a reference box, the selected box is
    not guaranteed to be the same between CPU and GPU. This is similar
    to the behavior of argsort in PyTorch when repeated values are present.

    Args:
        boxes (Tensor[N, 4]): boxes to perform NMS on. They
            are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and
            ``0 <= y1 < y2``.
        scores (Tensor[N]): scores for each one of the boxes
        iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold

    Returns:
        Tensor: int64 tensor with the indices of the elements that have been kept
        by NMS, sorted in decreasing order of scores
    """
    # if scores.device == xm.xla_device():
    #     import torch_xla.core.functions as xf
    #     import torch_xla.core.xla_model as xm
    #     return xf.nms(boxes, scores, torch.tensor(0.00001).to(scores.device),
    #                   torch.tensor(iou_threshold, device=scores.device),
    #                   boxes.shape[0])[0].long()
    _assert_has_ops()
    return torch.ops.torchvision.nms(boxes, scores, iou_threshold)
def roi_align(
    input: Tensor,
    boxes: Union[Tensor, List[Tensor]],
    output_size: BroadcastingList2[int],
    spatial_scale: float = 1.0,
    sampling_ratio: int = -1,
    aligned: bool = False,
) -> Tensor:
    """
    Performs Region of Interest (RoI) Align operator with average pooling, as described in Mask R-CNN.

    Args:
        input (Tensor[N, C, H, W]): The input tensor, i.e. a batch with ``N`` elements. Each element
            contains ``C`` feature maps of dimensions ``H x W``.
            If the tensor is quantized, we expect a batch size of ``N == 1``.
        boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
            format where the regions will be taken from.
            The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
            If a single Tensor is passed, then the first column should
            contain the index of the corresponding element in the batch, i.e. a number in ``[0, N - 1]``.
            If a list of Tensors is passed, then each Tensor will correspond to the boxes for an element i
            in the batch.
        output_size (int or Tuple[int, int]): the size of the output (in bins or pixels) after the pooling
            is performed, as (height, width).
        spatial_scale (float): a scaling factor that maps the box coordinates to
            the input coordinates. For example, if your boxes are defined on the scale
            of a 224x224 image and your input is a 112x112 feature map (resulting from a 0.5x scaling of
            the original image), you'll want to set this to 0.5. Default: 1.0
        sampling_ratio (int): number of sampling points in the interpolation grid
            used to compute the output value of each pooled output bin. If > 0,
            then exactly ``sampling_ratio x sampling_ratio`` sampling points per bin are used. If
            <= 0, then an adaptive number of grid points are used (computed as
            ``ceil(roi_width / output_width)``, and likewise for height). Default: -1
        aligned (bool): If False, use the legacy implementation.
            If True, pixel shift the box coordinates by -0.5 for a better alignment with the two
            neighboring pixel indices. This version is used in Detectron2

    Returns:
        Tensor[K, C, output_size[0], output_size[1]]: The pooled RoIs.
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(roi_align)
    _assert_has_ops()
    check_roi_boxes_shape(boxes)
    rois = boxes
    output_size = _pair(output_size)
    if not isinstance(rois, torch.Tensor):
        rois = convert_boxes_to_roi_format(rois)
    return torch.ops.torchvision.roi_align(
        input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned
    )
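# Illustrative usage sketch (added, not from the original source). `_roi_align_usage_example`
# is a hypothetical helper; it shows the List[Tensor[L, 4]] box format and a spatial_scale
# of 0.5 for boxes defined on an image twice the size of the feature map.
def _roi_align_usage_example() -> Tensor:
    feature_map = torch.rand(2, 16, 56, 56)  # N=2, C=16, 56x56 feature maps
    # Boxes are given per batch element in the coordinates of a 112x112 image,
    # hence spatial_scale=0.5 to map them onto the 56x56 feature map.
    boxes = [
        torch.tensor([[10.0, 10.0, 60.0, 60.0]]),              # 1 box for element 0
        torch.tensor([[0.0, 0.0, 30.0, 40.0],
                      [20.0, 20.0, 100.0, 100.0]]),            # 2 boxes for element 1
    ]
    # Result has shape [K, C, 7, 7] with K = 1 + 2 = 3.
    return roi_align(feature_map, boxes, output_size=(7, 7), spatial_scale=0.5,
                     sampling_ratio=2, aligned=True)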
def ps_roi_align(
    input: Tensor,
    boxes: Tensor,
    output_size: int,
    spatial_scale: float = 1.0,
    sampling_ratio: int = -1,
) -> Tensor:
    """
    Performs Position-Sensitive Region of Interest (RoI) Align operator
    mentioned in Light-Head R-CNN.

    Args:
        input (Tensor[N, C, H, W]): The input tensor, i.e. a batch with ``N`` elements. Each element
            contains ``C`` feature maps of dimensions ``H x W``.
        boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
            format where the regions will be taken from.
            The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
            If a single Tensor is passed, then the first column should
            contain the index of the corresponding element in the batch, i.e. a number in ``[0, N - 1]``.
            If a list of Tensors is passed, then each Tensor will correspond to the boxes for an element i
            in the batch.
        output_size (int or Tuple[int, int]): the size of the output (in bins or pixels) after the pooling
            is performed, as (height, width).
        spatial_scale (float): a scaling factor that maps the box coordinates to
            the input coordinates. For example, if your boxes are defined on the scale
            of a 224x224 image and your input is a 112x112 feature map (resulting from a 0.5x scaling of
            the original image), you'll want to set this to 0.5. Default: 1.0
        sampling_ratio (int): number of sampling points in the interpolation grid
            used to compute the output value of each pooled output bin. If > 0,
            then exactly ``sampling_ratio x sampling_ratio`` sampling points per bin are used. If
            <= 0, then an adaptive number of grid points are used (computed as
            ``ceil(roi_width / output_width)``, and likewise for height). Default: -1

    Returns:
        Tensor[K, C / (output_size[0] * output_size[1]), output_size[0], output_size[1]]:
            The pooled RoIs
    """
    _assert_has_ops()
    check_roi_boxes_shape(boxes)
    rois = boxes
    output_size = _pair(output_size)
    if not isinstance(rois, torch.Tensor):
        rois = convert_boxes_to_roi_format(rois)
    output, _ = torch.ops.torchvision.ps_roi_align(
        input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio
    )
    return output
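# Illustrative usage sketch (added, not from the original source). `_ps_roi_align_usage_example`
# is a hypothetical helper. Note the position-sensitive constraint: the channel count C must be
# divisible by output_size[0] * output_size[1], and the output has C / (oh * ow) channels.
def _ps_roi_align_usage_example() -> Tensor:
    feature_map = torch.rand(1, 9 * 4, 28, 28)  # C = 36 = 3 * 3 * 4 channels
    rois = torch.tensor([[0.0, 2.0, 2.0, 20.0, 24.0]])  # batch index + (x1, y1, x2, y2)
    # With output_size=3, the result has shape [K, C / 9, 3, 3] = [1, 4, 3, 3].
    return ps_roi_align(feature_map, rois, output_size=3,
                        spatial_scale=1.0, sampling_ratio=2)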
def roi_align(
    input: Tensor,
    boxes: Tensor,
    output_size: BroadcastingList2[int],
    spatial_scale: float = 1.0,
    sampling_ratio: int = -1,
    aligned: bool = False,
) -> Tensor:
    """
    Performs Region of Interest (RoI) Align operator described in Mask R-CNN

    Args:
        input (Tensor[N, C, H, W]): input tensor
            If the tensor is quantized, we expect a batch size of ``N == 1``.
        boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
            format where the regions will be taken from.
            The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
            If a single Tensor is passed, then the first column should
            contain the batch index. If a list of Tensors is passed, then
            each Tensor will correspond to the boxes for an element i in a batch
        output_size (int or Tuple[int, int]): the size of the output after the cropping
            is performed, as (height, width)
        spatial_scale (float): a scaling factor that maps the box coordinates to
            the input coordinates. Default: 1.0
        sampling_ratio (int): number of sampling points in the interpolation grid
            used to compute the output value of each pooled output bin. If > 0,
            then exactly sampling_ratio x sampling_ratio grid points are used. If
            <= 0, then an adaptive number of grid points are used (computed as
            ceil(roi_width / pooled_w), and likewise for height). Default: -1
        aligned (bool): If False, use the legacy implementation.
            If True, pixel shift the box coordinates by -0.5 for a better alignment
            with the two neighboring pixel indices. This version is used in Detectron2

    Returns:
        Tensor[K, C, output_size[0], output_size[1]]: The pooled RoIs.
    """
    _assert_has_ops()
    check_roi_boxes_shape(boxes)
    rois = boxes
    output_size = _pair(output_size)
    if not isinstance(rois, torch.Tensor):
        rois = convert_boxes_to_roi_format(rois)
    return torch.ops.torchvision.roi_align(
        input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned
    )
def ps_roi_align(
    input: Tensor,
    boxes: Tensor,
    output_size: int,
    spatial_scale: float = 1.0,
    sampling_ratio: int = -1,
) -> Tensor:
    """
    Performs Position-Sensitive Region of Interest (RoI) Align operator
    mentioned in Light-Head R-CNN.

    Args:
        input (Tensor[N, C, H, W]): input tensor
        boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
            format where the regions will be taken from.
            The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
            If a single Tensor is passed, then the first column should
            contain the batch index. If a list of Tensors is passed, then
            each Tensor will correspond to the boxes for an element i in a batch
        output_size (int or Tuple[int, int]): the size of the output after the cropping
            is performed, as (height, width)
        spatial_scale (float): a scaling factor that maps the box coordinates to
            the input coordinates. Default: 1.0
        sampling_ratio (int): number of sampling points in the interpolation grid
            used to compute the output value of each pooled output bin. If > 0
            then exactly sampling_ratio x sampling_ratio grid points are used. If
            <= 0, then an adaptive number of grid points are used (computed as
            ceil(roi_width / pooled_w), and likewise for height). Default: -1

    Returns:
        Tensor[K, C / (output_size[0] * output_size[1]), output_size[0], output_size[1]]:
            The pooled RoIs
    """
    _assert_has_ops()
    check_roi_boxes_shape(boxes)
    rois = boxes
    output_size = _pair(output_size)
    if not isinstance(rois, torch.Tensor):
        rois = convert_boxes_to_roi_format(rois)
    output, _ = torch.ops.torchvision.ps_roi_align(
        input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio
    )
    return output
def ps_roi_pool(
    input: Tensor,
    boxes: Tensor,
    output_size: int,
    spatial_scale: float = 1.0,
) -> Tensor:
    """
    Performs Position-Sensitive Region of Interest (RoI) Pool operator
    described in R-FCN

    Args:
        input (Tensor[N, C, H, W]): The input tensor, i.e. a batch with ``N`` elements. Each element
            contains ``C`` feature maps of dimensions ``H x W``.
        boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
            format where the regions will be taken from.
            The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
            If a single Tensor is passed, then the first column should
            contain the index of the corresponding element in the batch, i.e. a number in ``[0, N - 1]``.
            If a list of Tensors is passed, then each Tensor will correspond to the boxes for an element i
            in the batch.
        output_size (int or Tuple[int, int]): the size of the output (in bins or pixels) after the pooling
            is performed, as (height, width).
        spatial_scale (float): a scaling factor that maps the box coordinates to
            the input coordinates. For example, if your boxes are defined on the scale
            of a 224x224 image and your input is a 112x112 feature map (resulting from a 0.5x scaling of
            the original image), you'll want to set this to 0.5. Default: 1.0

    Returns:
        Tensor[K, C / (output_size[0] * output_size[1]), output_size[0], output_size[1]]:
            The pooled RoIs.
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(ps_roi_pool)
    _assert_has_ops()
    check_roi_boxes_shape(boxes)
    rois = boxes
    output_size = _pair(output_size)
    if not isinstance(rois, torch.Tensor):
        rois = convert_boxes_to_roi_format(rois)
    output, _ = torch.ops.torchvision.ps_roi_pool(input, rois, spatial_scale, output_size[0], output_size[1])
    return output
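# Illustrative usage sketch (added, not from the original source). `_ps_roi_pool_usage_example`
# is a hypothetical helper; like ps_roi_align, the channel count must be a multiple of
# output_size[0] * output_size[1].
def _ps_roi_pool_usage_example() -> Tensor:
    feature_map = torch.rand(1, 49 * 2, 14, 14)  # C = 98 = 7 * 7 * 2
    rois = torch.tensor([[0.0, 1.0, 1.0, 10.0, 12.0]])  # batch index + (x1, y1, x2, y2)
    # Result has shape [K, C / 49, 7, 7] = [1, 2, 7, 7].
    return ps_roi_pool(feature_map, rois, output_size=7, spatial_scale=1.0)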
def roi_pool(
    input: Tensor,
    boxes: Union[Tensor, List[Tensor]],
    output_size: BroadcastingList2[int],
    spatial_scale: float = 1.0,
) -> Tensor:
    """
    Performs Region of Interest (RoI) Pool operator described in Fast R-CNN

    Args:
        input (Tensor[N, C, H, W]): The input tensor, i.e. a batch with ``N`` elements. Each element
            contains ``C`` feature maps of dimensions ``H x W``.
        boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
            format where the regions will be taken from.
            The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
            If a single Tensor is passed, then the first column should
            contain the index of the corresponding element in the batch, i.e. a number in ``[0, N - 1]``.
            If a list of Tensors is passed, then each Tensor will correspond to the boxes for an element i
            in the batch.
        output_size (int or Tuple[int, int]): the size of the output after the cropping
            is performed, as (height, width)
        spatial_scale (float): a scaling factor that maps the box coordinates to
            the input coordinates. Default: 1.0

    Returns:
        Tensor[K, C, output_size[0], output_size[1]]: The pooled RoIs.
    """
    _assert_has_ops()
    check_roi_boxes_shape(boxes)
    rois = boxes
    output_size = _pair(output_size)
    if not isinstance(rois, torch.Tensor):
        rois = convert_boxes_to_roi_format(rois)
    output, _ = torch.ops.torchvision.roi_pool(input, rois, spatial_scale, output_size[0], output_size[1])
    return output
def deform_conv2d(
    input: Tensor,
    offset: Tensor,
    weight: Tensor,
    bias: Optional[Tensor] = None,
    stride: Tuple[int, int] = (1, 1),
    padding: Tuple[int, int] = (0, 0),
    dilation: Tuple[int, int] = (1, 1),
    mask: Optional[Tensor] = None,
) -> Tensor:
    """
    Performs Deformable Convolution, described in Deformable Convolutional Networks

    Arguments:
        input (Tensor[batch_size, in_channels, in_height, in_width]): input tensor
        offset (Tensor[batch_size, 2 * offset_groups * kernel_height * kernel_width,
            out_height, out_width]): offsets to be applied for each position in the
            convolution kernel.
        weight (Tensor[out_channels, in_channels // groups, kernel_height, kernel_width]):
            convolution weights, split into groups of size (in_channels // groups)
        bias (Tensor[out_channels]): optional bias of shape (out_channels,). Default: None
        stride (int or Tuple[int, int]): distance between convolution centers. Default: 1
        padding (int or Tuple[int, int]): height/width of padding of zeroes around
            each image. Default: 0
        dilation (int or Tuple[int, int]): the spacing between kernel elements. Default: 1
        mask (Tensor[batch_size, offset_groups * kernel_height * kernel_width,
            out_height, out_width]): masks to be applied for each position in the
            convolution kernel.

    Returns:
        output (Tensor[batch_sz, out_channels, out_h, out_w]): result of convolution

    Examples::
        >>> input = torch.rand(4, 3, 10, 10)
        >>> kh, kw = 3, 3
        >>> weight = torch.rand(5, 3, kh, kw)
        >>> # offset and mask should have the same spatial size as the output
        >>> # of the convolution. In this case, for an input of 10, stride of 1
        >>> # and kernel size of 3, without padding, the output size is 8
        >>> offset = torch.rand(4, 2 * kh * kw, 8, 8)
        >>> mask = torch.rand(4, kh * kw, 8, 8)
        >>> out = deform_conv2d(input, offset, weight, mask=mask)
        >>> print(out.shape)
        >>> # returns
        >>> torch.Size([4, 5, 8, 8])
    """
    _assert_has_ops()
    out_channels = weight.shape[0]

    use_mask = mask is not None

    if mask is None:
        mask = torch.zeros((input.shape[0], 0), device=input.device, dtype=input.dtype)

    if bias is None:
        bias = torch.zeros(out_channels, device=input.device, dtype=input.dtype)

    stride_h, stride_w = _pair(stride)
    pad_h, pad_w = _pair(padding)
    dil_h, dil_w = _pair(dilation)
    weights_h, weights_w = weight.shape[-2:]
    _, n_in_channels, in_h, in_w = input.shape

    n_offset_grps = offset.shape[1] // (2 * weights_h * weights_w)
    n_weight_grps = n_in_channels // weight.shape[1]

    if n_offset_grps == 0:
        raise RuntimeError(
            "the shape of the offset tensor at dimension 1 is not valid. It should "
            "be a multiple of 2 * weight.size[2] * weight.size[3].\n"
            "Got offset.shape[1]={}, while 2 * weight.size[2] * weight.size[3]={}".format(
                offset.shape[1], 2 * weights_h * weights_w))

    return torch.ops.torchvision.deform_conv2d(
        input,
        weight,
        offset,
        mask,
        bias,
        stride_h,
        stride_w,
        pad_h,
        pad_w,
        dil_h,
        dil_w,
        n_weight_grps,
        n_offset_grps,
        use_mask,
    )
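# Illustrative addition (not from the original source): the docstring example above covers
# the modulated (mask) case; this hypothetical helper `_deform_conv2d_v1_example` shows the
# unmodulated call, where mask is simply omitted.
def _deform_conv2d_v1_example() -> Tensor:
    input = torch.rand(2, 3, 10, 10)
    kh, kw = 3, 3
    weight = torch.rand(6, 3, kh, kw)
    # With stride 1, no padding and a 3x3 kernel, the output spatial size is 8x8,
    # so the offset tensor must be [N, 2 * kh * kw, 8, 8].
    offset = torch.rand(2, 2 * kh * kw, 8, 8)
    # Returns a tensor of shape [2, 6, 8, 8].
    return deform_conv2d(input, offset, weight)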
def deform_conv3d(
    input: Tensor,
    weight: Tensor,
    offset: Tensor,
    mask: Optional[Tensor] = None,
    bias: Optional[Tensor] = None,
    stride: Tuple[int, int, int] = (1, 1, 1),
    padding: Tuple[int, int, int] = (0, 0, 0),
    dilation: Tuple[int, int, int] = (1, 1, 1)
) -> Tensor:
    r"""
    Performs 3D version of Deformable Convolution v2, described in
    `Deformable ConvNets v2: More Deformable, Better Results
    <https://arxiv.org/abs/1811.11168>`__ if :attr:`mask` is not ``None`` and
    performs 3D version of Deformable Convolution, described in
    `Deformable Convolutional Networks
    <https://arxiv.org/abs/1703.06211>`__ if :attr:`mask` is ``None``.

    Arguments:
        input (Tensor[batch_size, in_channels, in_depth, in_height, in_width]): input tensor
        weight (Tensor[out_channels, in_channels // groups, kernel_depth, kernel_height,
            kernel_width]): convolution weights, split into groups of size (in_channels // groups)
        offset (Tensor[batch_size, 3 * offset_groups * kernel_depth * kernel_height * kernel_width,
            out_depth, out_height, out_width]): offsets to be applied for each position in the
            convolution kernel.
        mask (Tensor[batch_size, offset_groups * kernel_depth * kernel_height * kernel_width,
            out_depth, out_height, out_width]): modulation masks to be multiplied with each
            output of the convolution kernel. Default: None
        bias (Tensor[out_channels]): optional bias of shape (out_channels,). Default: None
        stride (int or Tuple[int, int, int]): distance between convolution centers. Default: 1
        padding (int or Tuple[int, int, int]): height/width of padding of zeroes around
            each image. Default: 0
        dilation (int or Tuple[int, int, int]): the spacing between kernel elements. Default: 1

    Returns:
        output (Tensor[batch_sz, out_channels, out_d, out_h, out_w]): result of convolution

    Examples::
        >>> input = torch.rand(1, 3, 10, 10, 10)
        >>> kd, kh, kw = 3, 3, 3
        >>> weight = torch.rand(5, 3, kd, kh, kw)
        >>> # offset and mask should have the same spatial size as the output
        >>> # of the convolution. In this case, for an input of 10, stride of 1
        >>> # and kernel size of 3, without padding, the output size is 8
        >>> offset = torch.rand(1, 3 * kd * kh * kw, 8, 8, 8)
        >>> mask = torch.rand(1, kd * kh * kw, 8, 8, 8)
        >>> out = deform_conv3d(input, weight, offset, mask)
        >>> print(out.shape)
        >>> # returns
        >>> torch.Size([1, 5, 8, 8, 8])
    """
    _assert_has_ops()
    out_channels = weight.shape[0]

    modulated = mask is not None

    if mask is None:
        mask = torch.zeros((input.shape[0], 0), device=input.device, dtype=input.dtype)

    if bias is None:
        bias = torch.zeros(out_channels, device=input.device, dtype=input.dtype)

    stride_d, stride_h, stride_w = _triple(stride)
    pad_d, pad_h, pad_w = _triple(padding)
    dil_d, dil_h, dil_w = _triple(dilation)
    weights_d, weights_h, weights_w = weight.shape[-3:]
    _, n_in_channels, in_d, in_h, in_w = input.shape

    n_offset_grps = offset.shape[1] // (3 * weights_d * weights_h * weights_w)
    n_weight_grps = n_in_channels // weight.shape[1]

    if n_offset_grps == 0:
        raise RuntimeError(
            "the shape of the offset tensor at dimension 1 is not valid. It should "
            "be a multiple of 3 * weight.size[2] * weight.size[3] * weight.size[4].\n"
            "Got offset.shape[1]={}, while 3 * weight.size[2] * weight.size[3] * weight.size[4]={}"
            .format(offset.shape[1], 3 * weights_d * weights_h * weights_w))

    return torch.ops.tvdcn.deform_conv3d(
        input, weight, offset, mask, bias,
        stride_d, stride_h, stride_w,
        pad_d, pad_h, pad_w,
        dil_d, dil_h, dil_w,
        n_weight_grps, n_offset_grps,
        modulated)