def _simple_roialign_with_grad(self, img, box, resolution, device):
    """
    RoIAlign with scale 1.0 and 0 sample ratio, keeping the autograd graph
    so gradients w.r.t. the input can be checked.
    """
    if isinstance(resolution, int):
        resolution = (resolution, resolution)
    op = ROIAlign(resolution, 1.0, 0, aligned=True)
    input = torch.from_numpy(img[None, None, :, :].astype("float32"))

    rois = [0] + list(box)
    rois = torch.from_numpy(np.asarray(rois)[None, :].astype("float32"))
    input = input.to(device=device)
    rois = rois.to(device=device)
    input.requires_grad = True
    output = op.forward(input, rois)
    return input, output
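# For context, a minimal standalone sketch (not from the original test class) of
# how a helper like this is exercised: backprop a scalar through ROIAlign and
# inspect the input gradient. The image, box, and device values are made up.
import numpy as np
import torch
from detectron2.layers import ROIAlign

img = np.arange(25, dtype="float32").reshape(5, 5)
box = [1, 1, 3, 3]  # x1, y1, x2, y2
device = torch.device("cpu")

op = ROIAlign((4, 4), 1.0, 0, aligned=True)
input = torch.from_numpy(img[None, None, :, :]).to(device=device)
input.requires_grad = True
rois = torch.tensor([[0] + box], dtype=torch.float32, device=device)

output = op(input, rois)
output.sum().backward()
print(input.grad.shape)  # same shape as the input: torch.Size([1, 1, 5, 5])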
def crop_and_resize(self, boxes: torch.Tensor, mask_size: int) -> torch.Tensor:
    """
    Crop each bitmask by the given box, and resize results to (mask_size, mask_size).
    This can be used to prepare training targets for Mask R-CNN.
    It has less reconstruction error compared to rasterization with polygons.
    In practice we observe no difference in accuracy, but BitMasks requires
    more memory to store all the masks.

    Args:
        boxes (Tensor): Nx4 tensor storing the boxes for each mask
        mask_size (int): the size of the rasterized mask.

    Returns:
        Tensor:
            A bool tensor of shape (N, mask_size, mask_size), where
            N is the number of predicted boxes for this image.
    """
    assert len(boxes) == len(self), "{} != {}".format(len(boxes), len(self))
    device = self.tensor.device

    batch_inds = torch.arange(len(boxes), device=device).to(dtype=boxes.dtype)[:, None]
    rois = torch.cat([batch_inds, boxes], dim=1)  # Nx5

    bit_masks = self.tensor.to(dtype=torch.float32)
    rois = rois.to(device=device)
    output = (
        ROIAlign((mask_size, mask_size), 1.0, 0, aligned=True)
        .forward(bit_masks[:, None, :, :], rois)
        .squeeze(1)
    )
    output = output >= 0.5
    return output
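# A hedged usage sketch for the method above, assuming detectron2's BitMasks
# structure; the masks and boxes below are toy values, not from the original code.
import torch
from detectron2.structures import BitMasks

masks = torch.zeros(2, 32, 32, dtype=torch.bool)
masks[0, 4:12, 4:12] = True    # an 8x8 square
masks[1, 16:28, 16:28] = True  # a 12x12 square
bitmasks = BitMasks(masks)

boxes = torch.tensor([[4, 4, 12, 12], [16, 16, 28, 28]], dtype=torch.float32)
targets = bitmasks.crop_and_resize(boxes, mask_size=28)
print(targets.shape, targets.dtype)  # torch.Size([2, 28, 28]) torch.bool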
def _simple_roialign(self, img, box, resolution, aligned=True):
    """
    RoiAlign with scale 1.0 and 0 sample ratio.
    """
    if isinstance(resolution, int):
        resolution = (resolution, resolution)
    op = ROIAlign(resolution, 1.0, 0, aligned=aligned)
    input = torch.from_numpy(img[None, None, :, :].astype("float32"))

    rois = [0] + list(box)
    rois = torch.from_numpy(np.asarray(rois)[None, :].astype("float32"))
    output = op.forward(input, rois)
    if torch.cuda.is_available():
        output_cuda = op.forward(input.cuda(), rois.cuda()).cpu()
        self.assertTrue(torch.allclose(output, output_cuda))
    return output[0, 0]
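# A small sketch of what the `aligned` flag changes, with toy inputs: with
# aligned=True the half-pixel offset makes a box covering the whole image
# reproduce it exactly, while aligned=False (the legacy Detectron behavior)
# samples at shifted locations and generally does not.
import numpy as np
import torch
from detectron2.layers import ROIAlign

img = np.random.rand(8, 8).astype("float32")
input = torch.from_numpy(img[None, None])
rois = torch.tensor([[0, 0, 0, 8, 8]], dtype=torch.float32)  # the full image

out_aligned = ROIAlign((8, 8), 1.0, 0, aligned=True)(input, rois)
out_legacy = ROIAlign((8, 8), 1.0, 0, aligned=False)(input, rois)
print(torch.allclose(out_aligned[0, 0], input[0, 0]))  # True
print(torch.allclose(out_legacy[0, 0], input[0, 0]))   # False in general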
def test_roi_align_rotated_gradient_cuda(self):
    """
    Compute gradients for ROIAlignRotated with multiple bounding boxes
    on the GPU, and compare the result with ROIAlign.
    """
    # torch.manual_seed(123)
    dtype = torch.float64
    device = torch.device("cuda")
    pool_h, pool_w = (5, 5)

    roi_align = ROIAlign(output_size=(pool_h, pool_w), spatial_scale=1, sampling_ratio=2).to(
        device=device
    )

    roi_align_rotated = ROIAlignRotated(
        output_size=(pool_h, pool_w), spatial_scale=1, sampling_ratio=2
    ).to(device=device)

    x = torch.rand(1, 1, 10, 10, dtype=dtype, device=device, requires_grad=True)
    # x_rotated = x.clone() won't work (will lead to grad_fn=CloneBackward)!
    x_rotated = Variable(x.data.clone(), requires_grad=True)

    # roi_rotated format is (batch index, x_center, y_center, width, height, angle)
    rois_rotated = torch.tensor(
        [[0, 4.5, 4.5, 9, 9, 0], [0, 2, 7, 4, 4, 0], [0, 7, 7, 4, 4, 0]],
        dtype=dtype,
        device=device,
    )

    y_rotated = roi_align_rotated(x_rotated, rois_rotated)
    s_rotated = y_rotated.sum()
    s_rotated.backward()

    # roi format is (batch index, x1, y1, x2, y2)
    rois = torch.tensor(
        [[0, 0, 0, 9, 9], [0, 0, 5, 4, 9], [0, 5, 5, 9, 9]], dtype=dtype, device=device
    )

    y = roi_align(x, rois)
    s = y.sum()
    s.backward()

    assert torch.allclose(
        x.grad, x_rotated.grad
    ), "gradients for ROIAlign and ROIAlignRotated mismatch on CUDA"
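# Not part of the original test: the correspondence between the two ROI formats
# used above, which is what makes the gradient comparison apples-to-apples. An
# axis-aligned XYXY box maps to a rotated box with angle 0 whose center and
# size are derived from the corners.
def xyxy_to_rotated(roi):
    """Map (batch, x1, y1, x2, y2) to (batch, cx, cy, w, h, angle=0)."""
    b, x1, y1, x2, y2 = roi
    return [b, (x1 + x2) / 2, (y1 + y2) / 2, x2 - x1, y2 - y1, 0]

assert xyxy_to_rotated([0, 0, 0, 9, 9]) == [0, 4.5, 4.5, 9, 9, 0]
assert xyxy_to_rotated([0, 0, 5, 4, 9]) == [0, 2, 7, 4, 4, 0]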
def crop_resize_by_d2_roialign(
    img,
    center,
    scale,
    output_size,
    aligned=True,
    interpolation="bilinear",
    in_format="HWC",
    out_format="HWC",
    dtype="float32",
):
    """
    img: HWC
    output_size: int or (w, h)
    """
    import torch
    from detectron2.layers.roi_align import ROIAlign
    from torchvision.ops import RoIPool

    if isinstance(output_size, int):
        output_size = (output_size, output_size)
    # NOTE: different from the cv2 convention!
    output_size = (output_size[1], output_size[0])  # to (h, w)

    if interpolation == "bilinear":
        op = ROIAlign(output_size, 1.0, 0, aligned=aligned)
    elif interpolation == "nearest":
        op = RoIPool(output_size, 1.0)
    else:
        raise ValueError(f"Wrong interpolation type: {interpolation}")

    assert in_format in ["HW", "HWC", "CHW"]
    if in_format == "HW":
        img = img[None]
    elif in_format == "HWC":
        img = img.transpose(2, 0, 1)  # CHW
    img_tensor = torch.tensor(img[None].astype("float32"))

    cx, cy = center
    if isinstance(scale, (int, float)):
        scale = (scale, scale)
    bw, bh = scale
    rois = torch.tensor(
        np.array([0, cx - bw / 2, cy - bh / 2, cx + bw / 2, cy + bh / 2], dtype="float32")[None]
    )
    result = op(img_tensor, rois)[0].numpy().astype(dtype)
    if out_format == "HWC":
        result = result.transpose(1, 2, 0)
    return result
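# A hedged usage sketch: crop a 64x64 patch centered at (120, 80) out of a
# synthetic HWC uint8 image. detectron2 and torchvision must be installed
# for the function above to work.
import numpy as np

img = (np.random.rand(240, 320, 3) * 255).astype("uint8")
patch = crop_resize_by_d2_roialign(img, center=(120, 80), scale=64, output_size=64)
print(patch.shape, patch.dtype)  # (64, 64, 3) float32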
def test_empty_batch(self):
    input = torch.zeros(0, 3, 10, 10, dtype=torch.float32)
    rois = torch.zeros(0, 5, dtype=torch.float32)
    op = ROIAlign((7, 7), 1.0, 0, aligned=True)
    output = op.forward(input, rois)
    self.assertTrue(output.shape == (0, 3, 7, 7))
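# A sketch of the same check on GPU (assuming CUDA is available); zero-sized
# inputs should not crash the kernel launch either.
def test_empty_batch_cuda(self):
    if not torch.cuda.is_available():
        self.skipTest("CUDA not available")
    input = torch.zeros(0, 3, 10, 10, dtype=torch.float32, device="cuda")
    rois = torch.zeros(0, 5, dtype=torch.float32, device="cuda")
    op = ROIAlign((7, 7), 1.0, 0, aligned=True)
    output = op.forward(input, rois)
    self.assertTrue(output.shape == (0, 3, 7, 7))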
def measure_roialign_perf(input_shape, roi_shape, output_size, spatial_scale,
                          sampling_ratio=0, aligned=True):
    """
    Args:
        input: NCHW images
        rois: Bx5 boxes. First column is the index into N. The other 4 columns are xyxy.
        output_size (tuple): h, w
        spatial_scale (float): scale the input boxes by this number
        sampling_ratio (int): number of input samples to take for each output sample.
            0 to take samples densely.
        aligned (bool): if False, use the legacy implementation in Detectron.
            If True, align the results more perfectly.
    """
    assert roi_shape[1] == 5, "ERROR: ROI shape expected to be of form (m,5)"

    # Preparing inputs
    n = input_shape[0]
    b = roi_shape[0]
    inputbatch = torch.randn(input_shape, dtype=torch.float, requires_grad=True)

    # Creating the ROI tensor of shape (b, 5).
    # roi[:, 1:] contains the coordinates of the bounding boxes as xyxy.
    # The (100, 1200) range is chosen based on the COCO max image size.
    bboxes = torch.FloatTensor(roi_shape[0], 4).uniform_(100, 1200)
    # The first column of the ROI tensor maps each bounding box to an image in the batch.
    # Based on my observations, the boxes are ordered by image index in the batch,
    # i.e. all boxes corresponding to the first image come first, then those for the
    # second image, third image, and so on.
    boxToNMapping = torch.tensor(
        np.expand_dims(np.array([i * n // b for i in range(b)]), axis=1), dtype=torch.float
    )
    roi = torch.cat((boxToNMapping, bboxes), dim=1)
    roi.requires_grad = True

    # Defining the op
    roi_align = ROIAlign(output_size, spatial_scale, sampling_ratio, aligned)
    roi_align.cuda()
    inputbatch = inputbatch.cuda()
    roi = roi.cuda()

    # Forward pass
    # warmup - 2 iters
    roi_align.forward(inputbatch, roi)
    roi_align.forward(inputbatch, roi)
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(ITERATIONS):
        output = roi_align.forward(inputbatch, roi)
    torch.cuda.synchronize()
    end = time.time()
    fwd_time = (end - start) * 1000 / ITERATIONS

    # Backward pass
    # required hack to call backward()
    output_sum = output.sum()
    # warmup
    output_sum.backward(retain_graph=True)
    output_sum.backward(retain_graph=True)
    torch.cuda.synchronize()
    bwd_start = time.time()
    for _ in range(ITERATIONS):
        output_sum.backward(retain_graph=True)
    torch.cuda.synchronize()
    bwd_end = time.time()
    bwd_time = (bwd_end - bwd_start) * 1000 / ITERATIONS

    return fwd_time, bwd_time
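# A hedged usage sketch, assuming CUDA and a module-level ITERATIONS constant
# (which the function relies on but which is not defined in this snippet); the
# shapes below are roughly FPN/COCO-like and purely illustrative.
ITERATIONS = 100

fwd_ms, bwd_ms = measure_roialign_perf(
    input_shape=(2, 256, 200, 304),  # NCHW feature map
    roi_shape=(512, 5),              # 512 boxes of (batch_idx, x1, y1, x2, y2)
    output_size=(7, 7),
    spatial_scale=0.25,
    sampling_ratio=2,
    aligned=True,
)
print("forward: {:.3f} ms, backward: {:.3f} ms".format(fwd_ms, bwd_ms))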
                   os.path.join(save_dir, '{}.pth'.format(idx * b_s + i)))

batch_size = 32
box_num = 10
clips_len = 32
dataset = 'something'
cuda = True
path = 'data/{}/feats'.format(dataset)

# make directory for extracted features
mkdir(path)

# set up some layers
roi = ROIAlign((7, 7), 7.0 / 224.0, 0)
avg_pool2d = torch.nn.AdaptiveMaxPool2d((1, 1))  # note: an adaptive *max* pool despite the name
avg_pool3d = nn.AdaptiveAvgPool3d((1, 1, 1))

# set up base network
net = resnet.i3_res50_nl(num_classes=400, pretrained=True)
# net = resnet.i3_res50(num_classes=400, pretrained=True)
if cuda:
    net.cuda()
    net = nn.DataParallel(net)
net.eval()

# set up dataloader
testset = something.Something(root='data/{}'.format(dataset), split='val', clip_len=clips_len)
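# A hedged sketch of how the layers above would be wired together for one batch:
# `feats` and the boxes are made-up placeholders, not from the original script.
# The 7.0 / 224.0 spatial scale maps 224-pixel image coordinates onto the 7x7
# feature map produced by the backbone.
feats = torch.randn(batch_size, 2048, 7, 7).cuda()  # per-clip backbone features
xy1 = torch.rand(batch_size * box_num, 2) * 112
wh = torch.rand(batch_size * box_num, 2) * 100 + 10
boxes = torch.cat([xy1, xy1 + wh], dim=1).cuda()  # xyxy in 224-pixel image coordinates
batch_idx = torch.arange(batch_size).repeat_interleave(box_num).float().cuda()[:, None]
rois = torch.cat([batch_idx, boxes], dim=1)  # (batch_size * box_num, 5)

roi_feats = roi(feats, rois)                  # (batch_size * box_num, 2048, 7, 7)
roi_feats = avg_pool2d(roi_feats).flatten(1)  # (batch_size * box_num, 2048)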