def preprocess_data(self, images):
    """
    Normalize a clip of frames and build the two SlowFast pathway inputs.
    Args:
        images (list): frames of the clip, each in H x W x C layout.
    Returns:
        inputs (list): [slow_pathway, fast_pathway] tensors moved to self.device.
    """
    # Stack frames into T x H x W x C, reversing the channel order.
    images = [image[:, :, ::-1] for image in images]
    images = np.concatenate([image[np.newaxis] for image in images])
    images = torch.from_numpy(images).float()
    images = images / 255.
    images -= self.data_mean
    images /= self.data_std
    images = images.permute(3, 0, 1, 2)  # -> C x T x H x W
    images, _ = transform.random_short_side_scale_jitter(
        images, self.min_scale, self.max_scale)
    images, _ = transform.uniform_crop(images, self.crop_size, 0)
    images = images.unsqueeze(0)

    # Fast pathway: sample self.num_frames frames uniformly over the clip.
    index = torch.linspace(0, images.shape[2] - 1, self.num_frames).long()
    fast_pathway = torch.index_select(images, 2, index)

    # Slow pathway: keep every self.alpha-th frame of the fast pathway.
    index = torch.linspace(
        0, fast_pathway.shape[2] - 1, fast_pathway.shape[2] // self.alpha
    ).long()
    slow_pathway = torch.index_select(fast_pathway, 2, index)

    inputs = [slow_pathway, fast_pathway]
    for i in range(2):
        inputs[i] = inputs[i].to(self.device)
    return inputs
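
# Standalone sketch of the pathway construction used above, written as a small
# helper for illustration. It is an assumption/example, not part of the original
# codebase; the defaults num_frames=32 and alpha=4 are chosen only to make the
# shapes concrete, not read from any config.
def _sketch_build_pathways(clip, num_frames=32, alpha=4):
    """clip: tensor of shape 1 x C x T x H x W; returns [slow, fast]."""
    import torch

    # Fast pathway: num_frames frames sampled uniformly over the time axis.
    index = torch.linspace(0, clip.shape[2] - 1, num_frames).long()
    fast_pathway = torch.index_select(clip, 2, index)
    # Slow pathway: every alpha-th frame of the fast pathway.
    index = torch.linspace(0, fast_pathway.shape[2] - 1, num_frames // alpha).long()
    slow_pathway = torch.index_select(fast_pathway, 2, index)
    return [slow_pathway, fast_pathway]

# Hypothetical usage: a 64-frame RGB clip at 224x224 yields a 32-frame fast
# pathway and an 8-frame slow pathway:
#   clip = torch.rand(1, 3, 64, 224, 224)
#   slow, fast = _sketch_build_pathways(clip)
#   fast.shape -> (1, 3, 32, 224, 224); slow.shape -> (1, 3, 8, 224, 224)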
def spatial_sampling(
    frames,
    spatial_idx=-1,
    min_scale=256,
    max_scale=320,
    crop_size=224,
    random_horizontal_flip=True,
    inverse_uniform_sampling=False,
):
    """
    Perform spatial sampling on the given video frames. If spatial_idx is -1,
    perform random scale, random crop, and random flip on the given frames.
    If spatial_idx is 0, 1, or 2, perform spatial uniform sampling with the
    given spatial_idx.
    Args:
        frames (tensor): frames of images sampled from the video. The
            dimension is `num frames` x `height` x `width` x `channel`.
        spatial_idx (int): if -1, perform random spatial sampling. If 0, 1,
            or 2, perform left, center, right crop if width is larger than
            height, and perform top, center, bottom crop if height is larger
            than width.
        min_scale (int): the minimal size of scaling.
        max_scale (int): the maximal size of scaling.
        crop_size (int): the size of height and width used to crop the frames.
        random_horizontal_flip (bool): whether to apply a random horizontal
            flip (probability 0.5) when spatial_idx is -1.
        inverse_uniform_sampling (bool): if True, sample uniformly in
            [1 / max_scale, 1 / min_scale] and take a reciprocal to get the
            scale. If False, take a uniform sample from [min_scale, max_scale].
    Returns:
        frames (tensor): spatially sampled frames.
    """
    assert spatial_idx in [-1, 0, 1, 2]
    if spatial_idx == -1:
        frames, _ = transform.random_short_side_scale_jitter(
            images=frames,
            min_size=min_scale,
            max_size=max_scale,
            inverse_uniform_sampling=inverse_uniform_sampling,
        )
        frames, _ = transform.random_crop(frames, crop_size)
        if random_horizontal_flip:
            frames, _ = transform.horizontal_flip(0.5, frames)
    else:
        # The testing is deterministic and no jitter should be performed.
        # min_scale, max_scale, and crop_size are expected to be the same.
        assert len({min_scale, max_scale, crop_size}) == 1
        frames, _ = transform.random_short_side_scale_jitter(
            frames, min_scale, max_scale)
        frames, _ = transform.uniform_crop(frames, crop_size, spatial_idx)
    return frames
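
# Hedged usage sketch for spatial_sampling (an example, not code from the
# repository). The clip is laid out as C x T x H x W so that the transforms
# above see height and width on dims 2 and 3; that layout is an assumption
# based on how the transform calls index the tensor.
def _sketch_spatial_sampling_usage():
    import torch

    frames = torch.rand(3, 8, 256, 340)  # C x T x H x W

    # Training-style sampling: random short-side scale jitter in [256, 320],
    # random 224x224 crop, random horizontal flip.
    train_frames = spatial_sampling(
        frames, spatial_idx=-1, min_scale=256, max_scale=320, crop_size=224
    )

    # Testing-style sampling: deterministic center crop; min_scale, max_scale,
    # and crop_size must all be equal.
    test_frames = spatial_sampling(
        frames, spatial_idx=1, min_scale=256, max_scale=256, crop_size=256
    )
    return train_frames, test_frames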
def _prepare_im_res(self, im_path):
    # Prepare ResNet-style augmentation.
    im = self.load_image(im_path)
    # Train and test setups differ.
    train_size, test_size = (
        self.cfg.DATA.TRAIN_CROP_SIZE,
        self.cfg.DATA.TEST_CROP_SIZE,
    )
    if self.mode == "train":
        # For training use random_sized_crop, horizontal_flip, augment, lighting.
        im = transform.random_sized_crop_img(
            im,
            train_size,
            jitter_scale=self.cfg.DATA.TRAIN_JITTER_SCALES_RELATIVE,
            jitter_aspect=self.cfg.DATA.TRAIN_JITTER_ASPECT_RELATIVE,
        )
        im, _ = transform.horizontal_flip(prob=0.5, images=im)
        # im = transforms.augment(im, cfg.TRAIN.AUGMENT)
        im = transform.lighting_jitter(
            im,
            0.1,
            self.cfg.DATA.TRAIN_PCA_EIGVAL,
            self.cfg.DATA.TRAIN_PCA_EIGVEC,
        )
    else:
        # For testing use scale and center crop.
        im, _ = transform.uniform_crop(
            im, test_size, spatial_idx=1, scale_size=train_size)
    # For training and testing use color normalization.
    im = transform.color_normalization(im, self.cfg.DATA.MEAN, self.cfg.DATA.STD)
    # Convert HWC/RGB/float to CHW/BGR/float format.
    # im = np.ascontiguousarray(im[:, :, ::-1].transpose([2, 0, 1]))
    return im
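
# Small illustration of the (currently commented-out) HWC/RGB -> CHW/BGR
# conversion mentioned at the end of _prepare_im_res. This is an example only;
# the array values and shape are arbitrary.
def _sketch_hwc_rgb_to_chw_bgr():
    import numpy as np

    im = np.random.rand(224, 224, 3).astype(np.float32)  # H x W x C, RGB
    # Reverse the channel axis (RGB -> BGR), then move channels first (HWC -> CHW).
    im_chw_bgr = np.ascontiguousarray(im[:, :, ::-1].transpose([2, 0, 1]))
    assert im_chw_bgr.shape == (3, 224, 224)
    return im_chw_bgr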
def _images_and_boxes_preprocessing(self, imgs, boxes):
    """
    This function performs preprocessing for the input images and
    corresponding boxes for one clip.
    Args:
        imgs (tensor): the images.
        boxes (ndarray): the boxes for the current clip.
    Returns:
        imgs (tensor): preprocessed images.
        boxes (ndarray): preprocessed boxes.
    """
    # Image [0, 255] -> [0, 1].
    imgs = imgs.float()
    imgs = imgs / 255.0

    height, width = imgs.shape[2], imgs.shape[3]
    # The format of boxes is [x1, y1, x2, y2]. The input boxes are in the
    # range of [0, 1].
    boxes[:, [0, 2]] *= width
    boxes[:, [1, 3]] *= height
    boxes = transform.clip_boxes_to_image(boxes, height, width)

    if self._split == "train":
        # Train split.
        imgs, boxes = transform.random_short_side_scale_jitter(
            imgs,
            min_size=self._jitter_min_scale,
            max_size=self._jitter_max_scale,
            boxes=boxes,
        )
        imgs, boxes = transform.random_crop(imgs, self._crop_size, boxes=boxes)
        # Random flip.
        imgs, boxes = transform.horizontal_flip(0.5, imgs, boxes=boxes)
    elif self._split == "val":
        # Val split.
        # Resize short side to crop_size. Non-local and STRG use 256.
        imgs, boxes = transform.random_short_side_scale_jitter(
            imgs,
            min_size=self._crop_size,
            max_size=self._crop_size,
            boxes=boxes,
        )
        # Apply center crop for the val split.
        imgs, boxes = transform.uniform_crop(
            imgs, size=self._crop_size, spatial_idx=1, boxes=boxes)
        if self._test_force_flip:
            imgs, boxes = transform.horizontal_flip(1, imgs, boxes=boxes)
    elif self._split == "test":
        # Test split.
        # Resize short side to crop_size. Non-local and STRG use 256.
        imgs, boxes = transform.random_short_side_scale_jitter(
            imgs,
            min_size=self._crop_size,
            max_size=self._crop_size,
            boxes=boxes,
        )
        if self._test_force_flip:
            imgs, boxes = transform.horizontal_flip(1, imgs, boxes=boxes)
    else:
        raise NotImplementedError(
            "{} split not supported yet!".format(self._split))

    if self.cfg.AVA.MANUAL_ROUND:
        imgs = (imgs * 255).byte().float() / 255

    # Do color augmentation (after dividing by 255.0).
    if self._split == "train" and self._use_color_augmentation:
        if not self._pca_jitter_only:
            imgs = transform.color_jitter(
                imgs,
                img_brightness=0.4,
                img_contrast=0.4,
                img_saturation=0.4,
            )
        imgs = transform.lighting(
            imgs,
            alphastd=0.1,
            eigval=np.array(self._pca_eigval).astype(np.float32),
            eigvec=np.array(self._pca_eigvec).astype(np.float32),
        )

    # Normalize images by mean and std.
    imgs = transform.color_normalization(
        imgs,
        np.array(self._data_mean, dtype=np.float32),
        np.array(self._data_std, dtype=np.float32),
    )

    if not self._use_bgr:
        # Convert image format from BGR to RGB.
        # Note that Kinetics pre-training uses RGB!
        imgs = imgs[:, [2, 1, 0], ...]

    boxes = transform.clip_boxes_to_image(
        boxes, self._crop_size, self._crop_size)

    return imgs, boxes
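
# Worked example of the box rescaling step at the top of
# _images_and_boxes_preprocessing: boxes arrive normalized to [0, 1] in
# [x1, y1, x2, y2] order and are mapped to pixel coordinates before cropping.
# The height/width and box values below are made up for the example.
def _sketch_scale_boxes_to_pixels():
    import numpy as np

    height, width = 240, 320
    boxes = np.array([[0.1, 0.2, 0.5, 0.9]], dtype=np.float32)
    boxes[:, [0, 2]] *= width   # x1, x2 -> pixels
    boxes[:, [1, 3]] *= height  # y1, y2 -> pixels
    # boxes is now [[32., 48., 160., 216.]]
    return boxes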