def prepare_input(
    rgb: torch.Tensor,
    resize_res: int = 256,
    inp_res: int = 224,
    mean: torch.Tensor = 0.5 * torch.ones(3),
    std=1.0 * torch.ones(3),
):
    """Resize, center-crop and color-normalize a video clip.

    Steps:
        1) Resize every frame to [resize_res x resize_res]
        2) Center crop to [inp_res x inp_res]
        3) Color normalize using mean/std

    :param rgb: 4-D tensor unpacked as (channels, frames, height, width).
        After resizing, its values must not exceed 1 (checked by an assert).
    :param resize_res: side length every frame is resized to
    :param inp_res: side length of the square center crop
    :param mean: mean handed to ``color_normalize``
    :param std: standard deviation handed to ``color_normalize``
    :return: whatever ``color_normalize`` returns for the cropped float clip
    """
    num_channels, num_frames, _, _ = rgb.shape

    # Resize frame by frame into a (frames, height, width, channels) buffer.
    resized_frames = np.zeros((num_frames, resize_res, resize_res, num_channels))
    for frame_idx in range(num_frames):
        frame = im_to_numpy(rgb[:, frame_idx, :, :])
        resized_frames[frame_idx] = resize_generic(
            frame, resize_res, resize_res, interp="bilinear", is_flow=False
        )

    # Back to (channels, frames, height, width).
    clip = np.transpose(resized_frames, (3, 0, 1, 2))

    # The resized frames are square, so one offset serves both axes.
    offset = int((resize_res - inp_res) / 2)
    clip = clip[:, :, offset : offset + inp_res, offset : offset + inp_res]

    clip = to_torch(clip).float()
    assert clip.max() <= 1
    return color_normalize(clip, mean, std)
def prepare_image(self, image):
    """Convert an image tensor into a normalized float32 network input.

    Non-floating-point inputs are divided by 255. If the last two dimensions
    differ from ``self.input_shape`` the image is passed through ``fit`` with
    ``fit_mode='contain'``. Finally the image is color-normalized with the
    mean/stddev stored in ``self.data_info``.

    :param image: tensor holding the image (floating point or not)
    :return: the result of ``color_normalize`` on the prepared image
    """
    needs_rescale = not image.is_floating_point()

    # Work on a float32 copy so the in-place division below does not touch
    # the tensor that was passed in.
    prepared = torch.empty_like(image, dtype=torch.float32)
    prepared.copy_(image)
    if needs_rescale:
        prepared.div_(255.0)

    if prepared.shape[-2:] != self.input_shape:
        prepared = fit(prepared, self.input_shape, fit_mode='contain')

    return color_normalize(
        prepared, self.data_info.rgb_mean, self.data_info.rgb_stddev
    )
def get_training_image(
    img_path,
    bbox=None,
    inp_res=256,
    mean=(0.6419, 0.6292, 0.5994),
    std=(0.2311, 0.2304, 0.2379),
):
    """Load an image, crop it around a center/scale and color-normalize it.

    :param img_path: path handed to ``load_image``
    :param bbox: optional pair of corners ``[[x0, y0], [x1, y1]]``; when given,
        the crop center is the box midpoint and the scale is
        ``sqrt(box area) / 60``. Without a box the image center (taken from
        ``img.shape[2]`` and ``img.shape[1]``) and a fixed scale are used.
    :param inp_res: side length of the square crop requested from ``crop``
    :param mean: mean handed to ``color_normalize``
    :param std: standard deviation handed to ``color_normalize``
    :return: ``(inp, meta)`` where ``meta`` holds the 'center' and 'scale' used
    """
    image = load_image(img_path)

    if bbox is None:
        center = np.array([image.shape[2] / 2, image.shape[1] / 2])
        # NOTE: hard-coded fallback scale; flagged by the original author as
        # something that still has to be fixed.
        scale = 5.0
    else:
        first_corner, second_corner = bbox[0], bbox[1]
        x0, y0 = first_corner[0], first_corner[1]
        x1, y1 = second_corner[0], second_corner[1]
        center = np.array([x0 + x1, y0 + y1]) / 2
        scale = np.sqrt((y1 - y0) * (x1 - x0)) / 60.0

    rotation = 0
    cropped = crop(image, center, scale, [inp_res, inp_res], rot=rotation)
    normalized = color_normalize(cropped, mean, std)
    return normalized, {'center': center, 'scale': scale}
def _get_single_video(self, index, data_index, frame_ix):
    """Load one clip and return it with its label and bookkeeping fields.

    Three paths, tried in this order:
      1) Pose input (``self.input_type == "pose"``): the output of
         ``_get_pose`` is returned under the "rgb" key, no augmentation.
      2) GPU collation (``self.gpu_collation`` truthy): the loaded (and
         optionally masked) RGB clip is returned as-is.
      3) CPU pipeline: flip + color jitter (train only), optional bbox crop
         and resize, rescale so the shorter side equals the (train: randomly
         scaled) resolution, crop to ``inp_res`` (random for train, center
         otherwise), color normalize.

    :param index: Index wrt to the data loader
    :param data_index: Index wrt to train/valid list
    :param frame_ix: A list of frame indices to sample from the video
    :return data: Dictionary of input/output and other metadata
    """
    # Pose->Sign experiments: no image processing at all.
    if hasattr(self, "input_type") and self.input_type == "pose":
        return {
            "rgb": self._get_pose(data_index, frame_ix),
            "index": index,
            "data_index": data_index,
            "class": self._get_class(data_index, frame_ix),
            "class_names": self.class_names,
            "dataset": self.datasetname,
        }

    clip = self._load_rgb(data_index, frame_ix)
    if getattr(self, "mask_rgb", False):
        clip = self._mask_rgb(
            clip,
            data_index,
            frame_ix,
            region=self.mask_rgb,
            mask_type=self.mask_type,
        )

    # With GPU collation the raw clip is returned here, untouched.
    if getattr(self, "gpu_collation", False):
        return {
            "rgb": clip,
            "index": index,
            "data_index": data_index,
            "class": self._get_class(data_index, frame_ix),
            "class_names": self.class_names,
            "dataset": self.datasetname,
        }

    is_train = self.setname == "train"

    # Train-time augmentation. The flip happens before the bbox crop, so the
    # box x-range is mirrored further down to compensate.
    flipped = False
    if is_train:
        flipped = random.random() < self.hflip
        if flipped:
            clip = torch.flip(clip, dims=[2])
        clip = im_color_jitter(clip, num_in_frames=self.num_in_frames, thr=0.2)

    frames = im_to_numpy(clip)
    height, width, _ = frames.shape

    if self.use_bbox:
        # The box is scaled by the frame size and clamped to the frame.
        top, left, bottom, right = self._get_bbox(data_index)
        top = max(0, int(top * height))
        bottom = min(height, int(bottom * height))
        left = max(0, int(left * width))
        right = min(width, int(right * width))
        if flipped:
            left, right = width - right, width - left
        frames = frames[top:bottom, left:right, :]
        frames = resize_generic(
            frames,
            self.resize_res,
            self.resize_res,
            interp="bilinear",
            is_flow=False,
        )
        height, width, _ = frames.shape

    # Target length of the shorter side (e.g. 300 for 256, 130 for 112).
    target_res = self.resize_res
    if is_train:
        # Random scale in [1 - scale_factor, 1 + scale_factor).
        scale_sample = random.random()
        target_res = int(
            target_res
            * (1 - self.scale_factor + 2 * self.scale_factor * scale_sample)
        )

    if width > height:
        new_h, new_w = target_res, int(target_res * width / height)
    else:
        new_h, new_w = int(target_res * height / width), target_res
    frames = resize_generic(frames, new_h, new_w, interp="bilinear", is_flow=False)

    # Upper-left corner of the crop: random when training, centered otherwise.
    side = self.inp_res
    if is_train:
        crop_x = random.randint(0, new_w - side)
        crop_y = random.randint(0, new_h - side)
    else:
        crop_x = int((new_w - side) / 2)
        crop_y = int((new_h - side) / 2)
    frames = frames[crop_y : crop_y + side, crop_x : crop_x + side]

    clip = im_to_video(im_to_torch(frames))
    clip = color_normalize(clip, self.mean, self.std)

    return {
        "rgb": clip,
        "class": self._get_class(data_index, frame_ix),
        "index": index,
        "class_names": self.class_names,
        "dataset": self.datasetname,
    }
def gpu_collater(self, minibatch, concat_datasets=None):
    """Augment, crop and color-normalize a whole batch of clips on the GPU.

    Mimics the per-sample CPU pipeline: a sampling box is computed for every
    clip (as fractions of the frame, in y0/x0/y1/x1 order) and all clips are
    resampled to ``inp_res x inp_res`` with one ``grid_sample`` call.

    :param minibatch: dict with "rgb" (a CUDA tensor unpacked as
        batch x channels x frames x height x width), "data_index" and, when
        ``concat_datasets`` is given, "dataset".
    :param concat_datasets: optional container indexed by the entries of
        ``minibatch["dataset"]``; each entry supplies ``use_bbox`` and
        ``_get_bbox`` for its samples. When None, ``self`` is used instead.
    :return: the same ``minibatch`` dict with "rgb" replaced by the processed
        clips
    """
    clips = minibatch["rgb"]
    assert clips.is_cuda, "expected tensor to be on the GPU"

    is_train = self.setname == "train"
    flipped = False
    if is_train:
        flipped = random.random() < self.hflip
        if flipped:
            # the horizontal axis is the last one
            clips = torch.flip(clips, dims=[-1])
        clips = im_color_jitter(clips, num_in_frames=self.num_in_frames, thr=0.2)

    # For now this follows the original pipeline step by step. If it is still
    # a bottleneck, cropping/resizing could be collapsed into one sampling grid.
    batch_size, _, num_frames, height, width = clips.shape
    assert num_frames == self.num_in_frames, "unexpected number of frames per clip"

    # One box per clip; the default is the full frame.
    boxes = np.zeros((batch_size, 4), dtype=np.float32)
    for row, data_index in enumerate(minibatch["data_index"]):
        boxes[row] = np.array([0, 0, 1, 1])
        # Until ConcatDataset is patched, the owning dataset has to be looked
        # up explicitly. This also keeps mixing datasets with different
        # use_bbox settings working.
        if concat_datasets is None:
            owner = self
        else:
            owner = concat_datasets[minibatch["dataset"][row]]
        if owner.use_bbox:
            boxes[row] = owner._get_bbox(data_index)

    # require that the original boxes lie inside the image
    boxes[:, :2] = np.maximum(0, boxes[:, :2])
    boxes[:, 2:] = np.minimum(1, boxes[:, 2:])

    if is_train:
        if flipped:
            # Mirror the x-range so the boxes match the flipped clips.
            boxes[:, [1, 3]] = 1 - boxes[:, [3, 1]]

        # Random isotropic scale per clip, inverted to mimic the meaning of
        # scale used in the CPU pipeline.
        scale_samples = np.random.rand(batch_size, 1)
        rand_scale = 1 / (1 - self.scale_factor + 2 * self.scale_factor * scale_samples)
        boxes = scale_yxyx_bbox(boxes, scale=rand_scale)

    # Shrink the boxes to the crop proportions used in the original code
    # (the scaling is not quite identical, but close to it).
    crop_ratio = self.inp_res / self.resize_res
    crop_box_sc = crop_ratio * rand_scale if is_train else crop_ratio
    boxes = scale_yxyx_bbox(boxes, scale=crop_box_sc)

    if is_train:
        # Jitter the crop location so that it stays within the region defined
        # by the (optionally scaled) bounding box.
        boxes_cenhw = bbox_format(boxes, src="yxyx", dest="cenhw")
        crop_hw = boxes_cenhw[:, 2:]
        offset_range_hw = ((1 - crop_box_sc) / crop_box_sc) * crop_hw
        offset_samples = np.random.rand(batch_size, 2)
        offsets = (offset_samples - 0.5) * offset_range_hw
        # The same (y, x) offset is applied to both corners, in place.
        boxes += np.tile(offsets, (1, 2))

    # TODO(Samuel): go back over:
    # (1) the corner alignment logic to check we are doing
    #     the right thing here.
    # (2) whether zero padding is appropriate for out-of-bounds handling

    # Map the fractional coordinates from [0, 1] to [-1, 1].
    boxes = 2 * boxes - 1

    out_res = self.inp_res
    grids = torch.zeros(
        batch_size, out_res, out_res, 2, device=clips.device, dtype=clips.dtype
    )
    for row, box in enumerate(boxes):
        y_ticks = torch.linspace(start=box[0], end=box[2], steps=out_res)
        x_ticks = torch.linspace(start=box[1], end=box[3], steps=out_res)
        grid_y, grid_x = torch.meshgrid(y_ticks, x_ticks)
        # The grid expects the ordering to be x then y
        grids[row] = torch.stack((grid_x, grid_y), 2)

    # Merge the RGB and frame dimensions so the grid sampler sees 4-D input.
    clips = clips.view(clips.shape[0], 3 * self.num_in_frames, height, width)
    clips = torch.nn.functional.grid_sample(
        clips,
        grid=grids,
        mode="bilinear",
        align_corners=False,
        padding_mode="zeros",
    )
    # Split the channel and frame dimensions again.
    clips = clips.view(clips.shape[0], 3, self.num_in_frames, out_res, out_res)

    minibatch["rgb"] = color_normalize(clips, self.mean, self.std)
    return minibatch
def __getitem__(self, index):
    """Load, augment and encode one pose sample.

    The annotation is taken from ``self.train_list`` or ``self.valid_list``
    depending on ``self.is_train``. When training, random scale, rotation,
    horizontal flip and per-channel color scaling are applied. The image is
    then cropped around the (adjusted) center/scale and color-normalized, and
    one label map per joint is drawn at ``self.out_res``.

    :param index: position in the train/valid list
    :return: ``(inp, target, meta)`` where ``inp`` is the cropped and
        color-normalized image, ``target`` holds one label map per joint
        (``nparts x *self.out_res``) and ``meta`` is a dict with 'index',
        'center', 'scale', 'pts', 'tpts' and 'target_weight'. 'scale' is
        always a tensor.
    """
    sf = self.scale_factor
    rf = self.rot_factor
    if self.is_train:
        a = self.anno[self.train_list[index]]
    else:
        a = self.anno[self.valid_list[index]]

    img_path = os.path.join(self.img_folder, a['img_paths'])
    pts = torch.Tensor(a['joint_self'])
    # pts[:, 0:2] -= 1  # Convert pts to zero based

    # c = torch.Tensor(a['objpos']) - 1
    c = torch.Tensor(a['objpos'])
    s = a['scale_provided']

    # Adjust center/scale slightly to avoid cropping limbs
    if c[0] != -1:
        c[1] = c[1] + 15 * s
        s = s * 1.25

    # For single-person pose estimation with a centered/scaled figure
    nparts = pts.size(0)
    img = load_image(img_path)  # CxHxW

    r = 0
    if self.is_train:
        # Scale factor ~ N(1, sf), clamped to [1 - sf, 1 + sf].
        s = s * torch.randn(1).mul_(sf).add_(1).clamp(1 - sf, 1 + sf)[0]
        # Rotate with probability 0.6; angle ~ N(0, rf), clamped to +/- 2 * rf.
        r = torch.randn(1).mul_(rf).clamp(
            -2 * rf, 2 * rf)[0] if random.random() <= 0.6 else 0

        # Flip: joints and center are mirrored together with the image.
        if random.random() <= 0.5:
            img = fliplr(img)
            pts = shufflelr(pts, img.size(2), self.DATA_INFO.hflip_indices)
            c[0] = img.size(2) - c[0]

        # Color: scale every channel independently, keep values in [0, 1].
        img[0, :, :].mul_(random.uniform(0.8, 1.2)).clamp_(0, 1)
        img[1, :, :].mul_(random.uniform(0.8, 1.2)).clamp_(0, 1)
        img[2, :, :].mul_(random.uniform(0.8, 1.2)).clamp_(0, 1)

    # Prepare image and groundtruth map
    inp = crop(img, c, s, self.inp_res, rot=r)
    inp = color_normalize(inp, self.DATA_INFO.rgb_mean, self.DATA_INFO.rgb_stddev)

    # Generate ground truth
    tpts = pts.clone()
    target = torch.zeros(nparts, *self.out_res)
    target_weight = tpts[:, 2].clone().view(nparts, 1)

    for i in range(nparts):
        # if tpts[i, 2] > 0: # This is evil!!
        if tpts[i, 1] > 0:
            tpts[i, 0:2] = to_torch(
                transform(tpts[i, 0:2] + 1, c, s, self.out_res, rot=r))
            target[i], vis = draw_labelmap(
                target[i], tpts[i] - 1, self.sigma, type=self.label_type)
            target_weight[i, 0] *= vis

    # Meta info
    if not isinstance(s, torch.Tensor):
        # torch.Tensor(<float>) raises TypeError and torch.Tensor(<int>)
        # allocates an uninitialised tensor of that length. torch.tensor()
        # converts the value itself, using the same default dtype.
        s = torch.tensor(s, dtype=torch.get_default_dtype())
    meta = {
        'index': index,
        'center': c,
        'scale': s,
        'pts': pts,
        'tpts': tpts,
        'target_weight': target_weight,
    }

    return inp, target, meta