def __getitem__(self, idx):
    """Load one sample (image, label mask, optional XYZ map) by index.

    Args:
        idx: Index into ``self.image_files``.

    Returns:
        dict with the keys

        * ``'image_color'``: image read with ``cv2.imread``, divided by
          255, with ``self._pixel_mean`` subtracted, permuted with
          ``(2, 0, 1)``.
        * ``'image_color_bgr'``: the same image divided by 255 but
          without mean subtraction, permuted with ``(2, 0, 1)``.
        * ``'label'``: output of ``self.process_label`` with a leading
          singleton dimension added.
        * ``'filename'``: the part of the path that follows ``'OSD'``
          plus one separator character.
        * ``'depth'`` (only when ``cfg.INPUT`` is ``'DEPTH'`` or
          ``'RGBD'``): point cloud reshaped to
          ``(self._height, self._width, 3)`` and permuted with
          ``(2, 0, 1)``; NaN entries are replaced by 0.
    """
    # BGR image
    filename = self.image_files[idx]
    im = cv2.imread(filename)

    # Augmentation only runs in training mode; the random draw skips each
    # transform for roughly one sample in ten.
    is_training = cfg.MODE == 'TRAIN'
    if cfg.TRAIN.CHROMATIC and is_training and np.random.rand(1) > 0.1:
        im = chromatic_transform(im)
    if cfg.TRAIN.ADD_NOISE and is_training and np.random.rand(1) > 0.1:
        im = add_noise(im)

    im_tensor = torch.from_numpy(im) / 255.0
    # Keep an un-normalised copy before the mean is subtracted in place.
    im_tensor_bgr = im_tensor.clone().permute(2, 0, 1)
    im_tensor -= self._pixel_mean
    image_blob = im_tensor.permute(2, 0, 1)

    # Label
    labels_filename = filename.replace('image_color', 'annotation')
    foreground_labels = util_.imread_indexed(labels_filename)
    foreground_labels = self.process_label(foreground_labels)
    label_blob = torch.from_numpy(foreground_labels).unsqueeze(0)

    # NOTE(review): if 'OSD' is not in the path, find() returns -1 and the
    # slice below starts at offset 3 -- confirm all paths contain 'OSD'.
    index = filename.find('OSD')
    sample = {
        'image_color': image_blob,
        'image_color_bgr': im_tensor_bgr,
        'label': label_blob,
        'filename': filename[index + 4:]
    }

    # Depth image
    if cfg.INPUT == 'DEPTH' or cfg.INPUT == 'RGBD':
        pcd_filename = filename.replace('image_color', 'pcd')
        # Swap only the trailing extension; a blanket replace('png', 'pcd')
        # would also rewrite any "png" that appears inside a directory or
        # file stem.
        if pcd_filename.endswith('.png'):
            pcd_filename = pcd_filename[:-len('.png')] + '.pcd'
        pcloud = pcl.load(pcd_filename).to_array()
        pcloud[np.isnan(pcloud)] = 0
        xyz_img = pcloud.reshape((self._height, self._width, 3))
        depth_blob = torch.from_numpy(xyz_img).permute(2, 0, 1)
        sample['depth'] = depth_blob

    return sample
def __getitem__(self, idx):
    """Load one sample (image, label mask, optional XYZ map) by index.

    Args:
        idx: Index into ``self.image_paths``.

    Returns:
        dict with the keys

        * ``'image_color'``: image read with ``cv2.imread``, divided by
          255, with ``self._pixel_mean`` subtracted, permuted with
          ``(2, 0, 1)``.
        * ``'image_color_bgr'``: the same image divided by 255 but
          without mean subtraction, permuted with ``(2, 0, 1)``.
        * ``'label'``: processed label mask with a leading singleton
          dimension added.
        * ``'filename'``: the part of the path that follows ``'OCID'``
          plus one separator character.
        * ``'depth'`` (only when ``cfg.INPUT`` is ``'DEPTH'`` or
          ``'RGBD'``): point cloud reshaped to
          ``(self._height, self._width, 3)`` and permuted with
          ``(2, 0, 1)``; NaN entries are replaced by 0.
    """
    # BGR image
    filename = str(self.image_paths[idx])
    im = cv2.imread(filename)

    # Augmentation only runs in training mode; the random draw skips each
    # transform for roughly one sample in ten.
    is_training = cfg.MODE == 'TRAIN'
    if cfg.TRAIN.CHROMATIC and is_training and np.random.rand(1) > 0.1:
        im = chromatic_transform(im)
    if cfg.TRAIN.ADD_NOISE and is_training and np.random.rand(1) > 0.1:
        im = add_noise(im)

    im_tensor = torch.from_numpy(im) / 255.0
    # Keep an un-normalised copy before the mean is subtracted in place.
    im_tensor_bgr = im_tensor.clone().permute(2, 0, 1)
    im_tensor -= self._pixel_mean
    image_blob = im_tensor.permute(2, 0, 1)

    # Label
    labels_filename = filename.replace('rgb', 'label')
    foreground_labels = util_.imread_indexed(labels_filename)
    # mask table as background
    foreground_labels[foreground_labels == 1] = 0
    if 'table' in labels_filename:
        # For paths containing 'table', label 2 is cleared as well.
        foreground_labels[foreground_labels == 2] = 0
    foreground_labels = self.process_label(foreground_labels)
    label_blob = torch.from_numpy(foreground_labels).unsqueeze(0)

    # NOTE(review): if 'OCID' is not in the path, find() returns -1 and the
    # slice below starts at offset 4 -- confirm all paths contain 'OCID'.
    index = filename.find('OCID')
    sample = {
        'image_color': image_blob,
        'image_color_bgr': im_tensor_bgr,
        'label': label_blob,
        'filename': filename[index + 5:]
    }

    # Depth image
    if cfg.INPUT == 'DEPTH' or cfg.INPUT == 'RGBD':
        pcd_filename = filename.replace('rgb', 'pcd')
        # Swap only the trailing extension; a blanket replace('png', 'pcd')
        # would also rewrite any "png" that appears inside a directory or
        # file stem.
        if pcd_filename.endswith('.png'):
            pcd_filename = pcd_filename[:-len('.png')] + '.pcd'
        pcloud = pcl.load(pcd_filename).to_array()
        pcloud[np.isnan(pcloud)] = 0
        xyz_img = pcloud.reshape((self._height, self._width, 3))
        depth_blob = torch.from_numpy(xyz_img).permute(2, 0, 1)
        sample['depth'] = depth_blob

    return sample
def _get_image_blob(self, color_file, depth_file, scale_ind):
    """Read a colour/depth image pair and turn the colour image into a blob.

    Args:
        color_file: Path of the colour image (may carry an alpha channel).
        depth_file: Path of the depth image.
        scale_ind: Index into ``cfg.TRAIN.SCALES_BASE`` selecting the
            resize factor.

    Returns:
        Tuple ``(image_blob, im_depth, im_scale, height, width)`` where
        ``image_blob`` is the float tensor of the colour image (divided by
        255, ``self._pixel_mean`` subtracted, permuted with ``(2, 0, 1)``),
        ``im_depth`` is the depth array divided by 1000.0, and
        ``height``/``width`` are the colour image size after resizing.
    """
    # Colour image, padded by pad_im(..., 16).
    raw = pad_im(cv2.imread(color_file, cv2.IMREAD_UNCHANGED), 16)
    if raw.shape[2] != 4:
        im = raw
    else:
        # Zero out every pixel whose alpha value is 0.
        im = np.copy(raw[:, :, :3])
        rows, cols = np.where(raw[:, :, 3] == 0)
        im[rows, cols, :] = 0

    im_scale = cfg.TRAIN.SCALES_BASE[scale_ind]
    rescale = im_scale != 1.0
    if rescale:
        im = cv2.resize(im,
                        None,
                        None,
                        fx=im_scale,
                        fy=im_scale,
                        interpolation=cv2.INTER_LINEAR)
    height, width = im.shape[0], im.shape[1]

    # Training-only augmentation; each transform is skipped for roughly
    # one sample in ten.
    training = cfg.MODE == 'TRAIN'
    if cfg.TRAIN.CHROMATIC and training and np.random.rand(1) > 0.1:
        im = chromatic_transform(im)
    if cfg.TRAIN.ADD_NOISE and training and np.random.rand(1) > 0.1:
        im = add_noise(im)

    normalized = torch.from_numpy(im) / 255.0
    normalized -= self._pixel_mean
    image_blob = normalized.permute(2, 0, 1).float()

    # Depth image: nearest-neighbour resize so depth values are not blended.
    im_depth = pad_im(cv2.imread(depth_file, cv2.IMREAD_UNCHANGED), 16)
    if rescale:
        im_depth = cv2.resize(im_depth,
                              None,
                              None,
                              fx=im_scale,
                              fy=im_scale,
                              interpolation=cv2.INTER_NEAREST)
    # presumably converts millimetres to metres -- TODO confirm
    im_depth = im_depth.astype('float') / 1000.0

    return image_blob, im_depth, im_scale, height, width
def _get_image_blob(roidb, scale_ind, num_classes, backgrounds,
                    intrinsic_matrix, db_inds_syn, is_syn):
    """Builds an input blob from the images in the roidb at the specified scales.

    Args:
        roidb: Sequence of per-image dicts. ``'flipped'`` is always read;
            ``'depth'`` and ``'image'`` (file paths) are read when
            ``is_syn`` is false.
        scale_ind: Index into ``cfg.TRAIN.SCALES_BASE``.
        num_classes: Not used by this function; kept so existing callers
            keep working.
        backgrounds: Sequence of background image paths; one is drawn at
            random for every synthetic image.
        intrinsic_matrix: Camera matrix; entries ``[0, 0]``, ``[1, 1]``,
            ``[0, 2]`` and ``[1, 2]`` are used for normal estimation.
        db_inds_syn: Per-image indices used to build the synthetic file
            names under ``cfg.TRAIN.SYNROOT``.
        is_syn: When true, load synthetic RGBA/depth pairs and composite
            them over a random background; otherwise load from ``roidb``.

    Returns:
        Tuple ``(blob, blob_depth, blob_normal, im_scales)``.
        ``blob_normal`` is an empty list unless ``cfg.INPUT == 'NORMAL'``.
    """
    num_images = len(roidb)
    processed_ims = []
    processed_ims_depth = []
    processed_ims_normal = []
    im_scales = []
    # For these input modes the background replaces depth instead of colour.
    depth_like_input = cfg.INPUT == 'DEPTH' or cfg.INPUT == 'NORMAL'

    # range() and print() keep this block valid on both Python 2 and 3.
    for i in range(num_images):
        if is_syn:
            # depth raw
            filename = cfg.TRAIN.SYNROOT + '{:06d}-depth.png'.format(
                db_inds_syn[i])
            im_depth_raw = pad_im(cv2.imread(filename, cv2.IMREAD_UNCHANGED),
                                  16)

            # rgba
            filename = cfg.TRAIN.SYNROOT + '{:06d}-color.png'.format(
                db_inds_syn[i])
            rgba = pad_im(cv2.imread(filename, cv2.IMREAD_UNCHANGED), 16)

            # sample a background image
            ind = np.random.randint(len(backgrounds), size=1)[0]
            filename = backgrounds[ind]
            background = cv2.imread(filename, cv2.IMREAD_UNCHANGED)
            try:
                background = cv2.resize(background,
                                        (rgba.shape[1], rgba.shape[0]),
                                        interpolation=cv2.INTER_LINEAR)
            except Exception:
                # Best effort: an unreadable background falls back to an
                # all-zero image. Exception (not a bare except) so that
                # KeyboardInterrupt / SystemExit still propagate.
                if depth_like_input:
                    background = np.zeros((rgba.shape[0], rgba.shape[1]),
                                          dtype=np.uint16)
                else:
                    background = np.zeros((rgba.shape[0], rgba.shape[1], 3),
                                          dtype=np.uint8)
                print('bad background image')

            # A colour background must have a channel axis.
            if not depth_like_input and len(background.shape) != 3:
                background = np.zeros((rgba.shape[0], rgba.shape[1], 3),
                                      dtype=np.uint8)
                print('bad background image')

            # add background wherever the rendering is fully transparent
            im = np.copy(rgba[:, :, :3])
            alpha = rgba[:, :, 3]
            I = np.where(alpha == 0)
            if depth_like_input:
                im_depth_raw[I[0], I[1]] = background[I[0], I[1]] / 10
            else:
                im[I[0], I[1], :] = background[I[0], I[1], :3]
        else:
            # depth raw
            im_depth_raw = pad_im(
                cv2.imread(roidb[i]['depth'], cv2.IMREAD_UNCHANGED), 16)

            # rgba
            rgba = pad_im(cv2.imread(roidb[i]['image'], cv2.IMREAD_UNCHANGED),
                          16)
            if rgba.shape[2] == 4:
                im = np.copy(rgba[:, :, :3])
                alpha = rgba[:, :, 3]
                I = np.where(alpha == 0)
                im[I[0], I[1], :] = 0
            else:
                im = rgba

        # chromatic transform
        if cfg.TRAIN.CHROMATIC:
            im = chromatic_transform(im)
        if cfg.TRAIN.ADD_NOISE:
            im = add_noise(im)

        if roidb[i]['flipped']:
            im = im[:, ::-1, :]

        im_orig = im.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_scale = cfg.TRAIN.SCALES_BASE[scale_ind]
        im = cv2.resize(im_orig,
                        None,
                        None,
                        fx=im_scale,
                        fy=im_scale,
                        interpolation=cv2.INTER_LINEAR)
        im_scales.append(im_scale)
        processed_ims.append(im)

        # depth: rescale to [0, 255] and replicate to three channels so it
        # can go through the same processing as the colour image
        im_depth = im_depth_raw.astype(np.float32, copy=True) / float(
            im_depth_raw.max()) * 255
        im_depth = np.tile(im_depth[:, :, np.newaxis], (1, 1, 3))
        if cfg.TRAIN.ADD_NOISE:
            im_depth = add_noise(im_depth)

        if roidb[i]['flipped']:
            im_depth = im_depth[:, ::-1]

        im_orig = im_depth.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_depth = cv2.resize(im_orig,
                              None,
                              None,
                              fx=im_scale,
                              fy=im_scale,
                              interpolation=cv2.INTER_LINEAR)
        processed_ims_depth.append(im_depth)

        # normals
        if cfg.INPUT == 'NORMAL':
            depth = im_depth_raw.astype(np.float32, copy=True) / 1000.0
            fx = intrinsic_matrix[0, 0] * im_scale
            fy = intrinsic_matrix[1, 1] * im_scale
            cx = intrinsic_matrix[0, 2] * im_scale
            cy = intrinsic_matrix[1, 2] * im_scale
            nmap = gpu_normals.gpu_normals(depth, fx, fy, cx, cy, 20.0,
                                           cfg.GPU_ID)
            im_normal = 127.5 * nmap + 127.5
            im_normal = im_normal.astype(np.uint8)
            im_normal = im_normal[:, :, (2, 1, 0)]
            im_normal = cv2.bilateralFilter(im_normal, 9, 75, 75)
            if roidb[i]['flipped']:
                im_normal = im_normal[:, ::-1, :]
            im_orig = im_normal.astype(np.float32, copy=True)
            im_orig -= cfg.PIXEL_MEANS
            im_normal = cv2.resize(im_orig,
                                   None,
                                   None,
                                   fx=im_scale,
                                   fy=im_scale,
                                   interpolation=cv2.INTER_LINEAR)
            processed_ims_normal.append(im_normal)

    # Create a blob to hold the input images
    blob = im_list_to_blob(processed_ims, 3)
    blob_depth = im_list_to_blob(processed_ims_depth, 3)
    # Built once here rather than rebuilt from the whole list on every loop
    # iteration; the final value is the same.
    if cfg.INPUT == 'NORMAL':
        blob_normal = im_list_to_blob(processed_ims_normal, 3)
    else:
        blob_normal = []

    return blob, blob_depth, blob_normal, im_scales
def _get_image_blob(roidb, scale_ind):
    """Builds an input blob from the images in the roidb at the specified scales.

    Args:
        roidb: Sequence of per-image dicts with the keys ``'meta_data'``
            (path of a .mat file holding ``intrinsic_matrix`` and
            ``factor_depth``), ``'depth'``, ``'image'``, ``'flipped'`` and,
            when ``cfg.TRAIN.CHROMATIC`` is set, ``'label'``.
        scale_ind: Index into ``cfg.TRAIN.SCALES_BASE``.

    Returns:
        Tuple ``(blob, blob_rescale, blob_depth, blob_normal, im_scales)``.
        ``blob_rescale`` is an empty list unless ``cfg.TRAIN.GAN`` is set.
    """
    num_images = len(roidb)
    processed_ims = []
    processed_ims_depth = []
    processed_ims_normal = []
    processed_ims_rescale = []  # only filled when cfg.TRAIN.GAN is set
    im_scales = []
    # The scale does not depend on the image, so look it up once.
    im_scale = cfg.TRAIN.SCALES_BASE[scale_ind]

    for i in range(num_images):
        # meta data
        meta_data = scipy.io.loadmat(roidb[i]['meta_data'])
        K = meta_data['intrinsic_matrix'].astype(np.float32, copy=True)
        fx = K[0, 0]
        fy = K[1, 1]
        cx = K[0, 2]
        cy = K[1, 2]
        # loadmat returns scalars as arrays with ndim > 0; calling float()
        # on such an array is deprecated since NumPy 1.25, so extract the
        # single element explicitly.
        factor_depth = float(np.asarray(meta_data['factor_depth']).item())

        # depth raw
        im_depth_raw = pad_im(
            cv2.imread(roidb[i]['depth'], cv2.IMREAD_UNCHANGED), 16)

        # rgba
        rgba = pad_im(cv2.imread(roidb[i]['image'], cv2.IMREAD_UNCHANGED), 16)
        if rgba.shape[2] == 4:
            # Zero out every pixel whose alpha value is 0.
            im = np.copy(rgba[:, :, :3])
            alpha = rgba[:, :, 3]
            I = np.where(alpha == 0)
            im[I[0], I[1], :] = 0
        else:
            im = rgba

        # chromatic transform
        if cfg.TRAIN.CHROMATIC:
            label = pad_im(
                cv2.imread(roidb[i]['label'], cv2.IMREAD_UNCHANGED), 16)
            im = chromatic_transform(im, label)

        # mask the color image according to depth
        if cfg.EXP_DIR == 'rgbd_scene':
            I = np.where(im_depth_raw == 0)
            im[I[0], I[1], :] = 0

        if roidb[i]['flipped']:
            im = im[:, ::-1, :]

        if cfg.TRAIN.GAN:
            # Extra copy of the image mapped to [-1, 1].
            im_orig = im.astype(np.float32, copy=True) / 127.5 - 1
            im_rescale = cv2.resize(im_orig,
                                    None,
                                    None,
                                    fx=im_scale,
                                    fy=im_scale,
                                    interpolation=cv2.INTER_LINEAR)
            processed_ims_rescale.append(im_rescale)

        im_orig = im.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im = cv2.resize(im_orig,
                        None,
                        None,
                        fx=im_scale,
                        fy=im_scale,
                        interpolation=cv2.INTER_LINEAR)
        im_scales.append(im_scale)
        processed_ims.append(im)

        # depth: rescale to [0, 255] and replicate to three channels so it
        # can go through the same processing as the colour image
        im_depth = im_depth_raw.astype(np.float32, copy=True) / float(
            im_depth_raw.max()) * 255
        im_depth = np.tile(im_depth[:, :, np.newaxis], (1, 1, 3))

        if roidb[i]['flipped']:
            im_depth = im_depth[:, ::-1]

        im_orig = im_depth.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_depth = cv2.resize(im_orig,
                              None,
                              None,
                              fx=im_scale,
                              fy=im_scale,
                              interpolation=cv2.INTER_LINEAR)
        processed_ims_depth.append(im_depth)

        # normals
        depth = im_depth_raw.astype(np.float32, copy=True) / factor_depth
        nmap = gpu_normals.gpu_normals(depth, fx, fy, cx, cy, 20.0,
                                       cfg.GPU_ID)
        im_normal = 127.5 * nmap + 127.5
        im_normal = im_normal.astype(np.uint8)
        im_normal = im_normal[:, :, (2, 1, 0)]
        if roidb[i]['flipped']:
            im_normal = im_normal[:, ::-1, :]

        im_orig = im_normal.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_normal = cv2.resize(im_orig,
                               None,
                               None,
                               fx=im_scale,
                               fy=im_scale,
                               interpolation=cv2.INTER_LINEAR)
        processed_ims_normal.append(im_normal)

    # Create a blob to hold the input images
    blob = im_list_to_blob(processed_ims, 3)
    blob_depth = im_list_to_blob(processed_ims_depth, 3)
    blob_normal = im_list_to_blob(processed_ims_normal, 3)
    if cfg.TRAIN.GAN:
        blob_rescale = im_list_to_blob(processed_ims_rescale, 3)
    else:
        blob_rescale = []

    return blob, blob_rescale, blob_depth, blob_normal, im_scales
def _render_item(self):
    """Render one synthetic sample: a target object plus one occluder.

    A target class and pose are sampled and rendered alone to obtain the
    unoccluded target mask. An occluder of a different class is then
    sampled next to the target and the pair is re-rendered until less than
    half of the target mask is hidden.

    Uses ``torch.cuda.FloatTensor`` render buffers and ``cfg.renderer``.

    Returns:
        dict with the keys ``'image_color'``, ``'image_depth'``,
        ``'meta_data'``, ``'label_blob'``, ``'mask'``, ``'poses'``,
        ``'extents'``, ``'points'``, ``'gt_boxes'``, ``'poses_result'``
        and ``'im_info'``. Poses and boxes describe the target object
        only. The image tensors are built with ``torch.from_numpy`` and
        are not permuted to channels-first here.
    """
    height = cfg.TRAIN.SYN_HEIGHT
    width = cfg.TRAIN.SYN_WIDTH
    fx = self._intrinsic_matrix[0, 0]
    fy = self._intrinsic_matrix[1, 1]
    px = self._intrinsic_matrix[0, 2]
    py = self._intrinsic_matrix[1, 2]
    zfar = 6.0
    znear = 0.25
    bound = 0.1
    wants_depth = cfg.INPUT == 'DEPTH' or cfg.INPUT == 'RGBD'

    qt = np.zeros((7, ), dtype=np.float32)

    def next_rotation(cls_id):
        """Return a quaternion from the class's next Euler triple, jittered."""
        # Reshuffle the per-class pose order once it has been exhausted.
        if self.pose_indexes[cls_id] >= len(self.pose_lists[cls_id]):
            self.pose_indexes[cls_id] = 0
            self.pose_lists[cls_id] = np.random.permutation(
                np.arange(len(self.eulers)))
        base = self.eulers[self.pose_lists[cls_id][self.pose_indexes[cls_id]]]
        # Gaussian jitter with a standard deviation of 15 (degrees, given
        # the conversion to radians below).
        roll = base[0] + 15 * np.random.randn()
        pitch = base[1] + 15 * np.random.randn()
        yaw = base[2] + 15 * np.random.randn()
        quat = euler2quat(roll * math.pi / 180.0, pitch * math.pi / 180.0,
                          yaw * math.pi / 180.0)
        self.pose_indexes[cls_id] += 1
        return quat

    def randomize_lighting():
        """Give the renderer a random light position and colour."""
        cfg.renderer.set_light_pos(np.random.uniform(-0.5, 0.5, 3))
        intensity = np.random.uniform(0.8, 2)
        cfg.renderer.set_light_color(intensity *
                                     np.random.uniform(0.9, 1.1, 3))

    def place_beside(axis, anchor, extent):
        """Set qt[axis] to a random lateral offset from the anchor pose."""
        flag = np.random.randint(0, 2)
        if flag == 0:
            flag = -1
        qt[axis] = anchor[axis] + flag * extent * np.random.uniform(0.3, 0.5)
        # Try the opposite side if the first choice leaves the bound.
        if np.absolute(qt[axis]) > bound:
            qt[axis] = anchor[axis] - flag * extent * np.random.uniform(
                0.3, 0.5)

    # Render buffers.
    image_tensor = torch.cuda.FloatTensor(height, width, 4).detach()
    seg_tensor = torch.cuda.FloatTensor(height, width, 4).detach()
    if wants_depth:
        pc_tensor = torch.cuda.FloatTensor(height, width, 4).detach()
    else:
        pc_tensor = None
    cfg.renderer.set_projection_matrix(width, height, fx, fy, px, py, znear,
                                       zfar)
    classes = np.array(cfg.TRAIN.CLASSES)

    # sample target object
    cls_target = np.random.randint(len(cfg.TRAIN.CLASSES), size=1)[0]
    cls_indexes = [cfg.TRAIN.CLASSES[cls_target]]

    # sample target pose
    qt[3:] = next_rotation(int(cls_indexes[0]))
    qt[0] = np.random.uniform(-bound, bound)
    qt[1] = np.random.uniform(-bound, bound)
    qt[2] = np.random.uniform(cfg.TRAIN.SYN_TNEAR, cfg.TRAIN.SYN_TFAR)

    # render target alone to get its unoccluded mask
    poses_all = [qt.copy()]
    cfg.renderer.set_poses(poses_all)
    randomize_lighting()
    cfg.renderer.render(cls_indexes, image_tensor, seg_tensor)
    image_tensor = image_tensor.flip(0)
    seg_tensor = seg_tensor.flip(0)
    mask = (torch.sum(seg_tensor[:, :, :3], dim=2) != 0).cpu().numpy()

    # sample an occluder
    cls_indexes.append(0)
    poses_all.append(np.zeros((7, ), dtype=np.float32))
    target_pose = poses_all[0]
    while True:
        # The occluder must be of a different class than the target.
        while True:
            cls_occ = np.random.randint(len(self._classes_all), size=1)[0]
            if cls_occ != cls_indexes[0]:
                cls_indexes[1] = cls_occ
                break

        # sample poses
        cls = int(cls_indexes[1])
        qt[3:] = next_rotation(cls)

        # translation, sample an object nearby
        extent = np.mean(self._extents_all[cls, :])
        place_beside(0, target_pose, extent)
        place_beside(1, target_pose, extent)
        # Smaller z first; fall back to a larger z when that would go
        # below SYN_TNEAR.
        qt[2] = target_pose[2] - extent * np.random.uniform(1.0, 2.0)
        if qt[2] < cfg.TRAIN.SYN_TNEAR:
            qt[2] = target_pose[2] + extent * np.random.uniform(1.0, 2.0)
        poses_all[1] = qt
        cfg.renderer.set_poses(poses_all)

        # rendering
        randomize_lighting()
        cfg.renderer.render(cls_indexes,
                            image_tensor,
                            seg_tensor,
                            pc2_tensor=pc_tensor)
        seg_tensor = seg_tensor.flip(0)
        if pc_tensor is not None:
            pc_tensor = pc_tensor.flip(0)

        im_label = seg_tensor.cpu().numpy()
        im_label = im_label[:, :, (2, 1, 0)] * 255
        im_label = np.round(im_label).astype(np.uint8)
        im_label = np.clip(im_label, 0, 255)
        _, im_label = self.process_label_image(im_label)

        # compute occlusion percentage
        mask_target = (im_label == cls_indexes[0] + 1).astype(np.int32)
        per_occ = 1.0 - np.sum(mask & mask_target) / np.sum(mask)
        if per_occ < 0.5:
            break

    # RGB to BGR order
    image_tensor = image_tensor.flip(0)
    im = np.clip(image_tensor.cpu().numpy(), 0, 1)
    im = im[:, :, (2, 1, 0)] * 255
    im = im.astype(np.uint8)

    if wants_depth:
        # XYZ coordinates in camera frame
        im_depth = pc_tensor.cpu().numpy()[:, :, :3]

    # One binary channel per class.
    label_blob = np.zeros((self.num_classes, height, width),
                          dtype=np.float32)
    for i in range(self.num_classes):
        hits = np.where(im_label == classes[i] + 1)
        if len(hits[0]) > 0:
            label_blob[i, hits[0], hits[1]] = 1.0

    # foreground mask
    seg = (seg_tensor[:, :, 2] + 256 * seg_tensor[:, :, 1] +
           256 * 256 * seg_tensor[:, :, 0])
    mask = (seg != 0).unsqueeze(2).repeat((1, 1, 3)).float().cpu()

    # Training-only augmentation; each transform is skipped for roughly
    # one sample in ten.
    training = cfg.MODE == 'TRAIN'
    if cfg.TRAIN.CHROMATIC and training and np.random.rand(1) > 0.1:
        im = chromatic_transform(im)
    if cfg.TRAIN.ADD_NOISE and training and np.random.rand(1) > 0.1:
        im = add_noise(im)
    im_tensor = torch.from_numpy(im) / 255.0
    im_tensor -= self._pixel_mean

    if wants_depth:
        im_depth_tensor = torch.from_numpy(im_depth).float()
        if cfg.TRAIN.ADD_NOISE and training and np.random.rand(1) > 0.1:
            im_depth_tensor = add_noise_depth(im_depth_tensor).float()
    else:
        # Without depth input the colour image stands in for depth.
        im_depth_tensor = im_tensor.clone()

    # poses and boxes only for the target object
    pose_blob = np.zeros((1, 9), dtype=np.float32)
    gt_boxes = np.zeros((1, 5), dtype=np.float32)
    pose_blob[0, 0] = 1
    pose_blob[0, 1] = cls_target
    pose_blob[0, 2:6] = poses_all[0][3:]
    pose_blob[0, 6:] = poses_all[0][:3]

    # compute box: project the target's model points and take the extremes
    x3d = np.ones((4, self._points_all.shape[1]), dtype=np.float32)
    for axis in range(3):
        x3d[axis, :] = self._points_all[cls_target, :, axis]
    RT = np.zeros((3, 4), dtype=np.float32)
    RT[:3, :3] = quat2mat(pose_blob[0, 2:6])
    RT[:, 3] = pose_blob[0, 6:]
    x2d = np.matmul(self._intrinsic_matrix, np.matmul(RT, x3d))
    x2d[0, :] = np.divide(x2d[0, :], x2d[2, :])
    x2d[1, :] = np.divide(x2d[1, :], x2d[2, :])
    gt_boxes[0, 0] = np.min(x2d[0, :])
    gt_boxes[0, 1] = np.min(x2d[1, :])
    gt_boxes[0, 2] = np.max(x2d[0, :])
    gt_boxes[0, 3] = np.max(x2d[1, :])
    gt_boxes[0, 4] = cls_target

    # construct the meta data: flattened K followed by its pseudo-inverse
    K = self._intrinsic_matrix
    Kinv = np.linalg.pinv(K)
    meta_data_blob = np.zeros(18, dtype=np.float32)
    meta_data_blob[0:9] = K.flatten()
    meta_data_blob[9:18] = Kinv.flatten()

    is_syn = 1
    im_info = np.array(
        [im.shape[0], im.shape[1], cfg.TRAIN.SCALES_BASE[0], is_syn],
        dtype=np.float32)
    pose_result = pose_blob.copy()

    sample = {
        'image_color': im_tensor,
        'image_depth': im_depth_tensor,
        'meta_data': meta_data_blob,
        'label_blob': label_blob,
        'mask': mask,
        'poses': pose_blob,
        'extents': self._extents,
        'points': self._point_blob,
        'gt_boxes': gt_boxes,
        'poses_result': pose_result,
        'im_info': im_info
    }
    return sample
def __getitem__(self, idx):
    """Load one view of a scene: label mask, colour image, optional XYZ map.

    Args:
        idx: Flat sample index; ``idx // self.NUM_VIEWS_PER_SCENE`` selects
            the scene directory and ``idx % self.NUM_VIEWS_PER_SCENE`` the
            view inside it.

    Returns:
        dict with ``'label'``, ``'image_color'`` and, when ``cfg.INPUT`` is
        ``'DEPTH'`` or ``'RGBD'``, ``'depth'``.
    """
    views_per_scene = self.NUM_VIEWS_PER_SCENE
    wants_depth = cfg.INPUT == 'DEPTH' or cfg.INPUT == 'RGBD'

    # Get scene directory; crop does not use background
    scene_dir = self.scene_dirs[idx // views_per_scene]

    # Get view number
    view_num = idx % views_per_scene
    if cfg.TRAIN.SYN_CROP:
        # presumably skips the background-only views -- TODO confirm
        view_num += 2

    # Label
    label_path = os.path.join(scene_dir, 'segmentation_%05d.png' % view_num)
    foreground_labels = util_.imread_indexed(label_path)
    # mask table as background
    foreground_labels[foreground_labels == 1] = 0
    foreground_labels = self.process_label(foreground_labels)

    # BGR image
    im = cv2.imread(os.path.join(scene_dir, 'rgb_%05d.jpeg' % view_num))

    # Depth image
    xyz_img = None
    if wants_depth:
        depth_path = os.path.join(scene_dir, 'depth_%05d.png' % view_num)
        # This reads a 16-bit single-channel image. Shape: [H x W]
        depth_img = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH)
        xyz_img = self.process_depth(depth_img)

    # crop
    if cfg.TRAIN.SYN_CROP:
        im, foreground_labels, xyz_img = self.pad_crop_resize(
            im, foreground_labels, xyz_img)
        # Labels are processed a second time after cropping.
        foreground_labels = self.process_label(foreground_labels)

    # sample labels
    if cfg.TRAIN.EMBEDDING_SAMPLING:
        foreground_labels = self.sample_pixels(
            foreground_labels, cfg.TRAIN.EMBEDDING_SAMPLING_NUM)

    sample = {'label': torch.from_numpy(foreground_labels).unsqueeze(0)}

    # Training-only augmentation; each transform is skipped for roughly
    # one sample in ten.
    training = cfg.MODE == 'TRAIN'
    if cfg.TRAIN.CHROMATIC and training and np.random.rand(1) > 0.1:
        im = chromatic_transform(im)
    if cfg.TRAIN.ADD_NOISE and training and np.random.rand(1) > 0.1:
        im = add_noise(im)

    normalized = torch.from_numpy(im) / 255.0
    normalized -= self._pixel_mean
    sample['image_color'] = normalized.permute(2, 0, 1)

    if wants_depth:
        sample['depth'] = torch.from_numpy(xyz_img).permute(2, 0, 1)

    return sample
def load(self, filename_color, filename_depth, intrinsics):
    """Build a background sample (colour, optional XYZ depth, depth mask).

    Args:
        filename_color: Path of the colour background image.
        filename_depth: Path of the matching depth image, or ``None`` for
            colour-only backgrounds.
        intrinsics: Camera intrinsics, passed to ``self.backproject``
            together with ``self.depth_factor``.

    Returns:
        dict with

        * ``'background_color'``: float32 array transposed with
          ``(2, 0, 1)`` and divided by 255 (``self._pixel_mean`` is
          subtracted first when ``self.subtract_mean`` is set).
        * ``'background_depth'``: float32 array transposed with
          ``(2, 0, 1)``; all zeros when ``filename_depth`` is ``None``.
        * ``'mask_depth'``: float32 mask that is 1 where the third depth
          channel is positive; all zeros when ``filename_depth`` is
          ``None``.

    Unreadable or malformed images do not raise: they are replaced by
    all-zero images and a message is printed.
    """
    if filename_depth is None:
        background_depth = np.zeros((3, self._height, self._width),
                                    dtype=np.float32)
        mask_depth = np.zeros((self._height, self._width), dtype=np.float32)

    if filename_depth is None and np.random.rand(
            1) < cfg.TRAIN.SYN_BACKGROUND_CONSTANT_PROB:
        # only for rgb cases
        # constant background image
        background_color = np.ones((self._height, self._width, 3),
                                   dtype=np.uint8)
        color = np.random.randint(256, size=3)
        background_color[:, :, 0] = color[0]
        background_color[:, :, 1] = color[1]
        background_color[:, :, 2] = color[2]
    else:
        background_color = cv2.imread(filename_color, cv2.IMREAD_UNCHANGED)
        if filename_depth is not None:
            background_depth = cv2.imread(filename_depth,
                                          cv2.IMREAD_UNCHANGED)

        try:
            # randomly crop a region as background
            bw = background_color.shape[1]
            bh = background_color.shape[0]
            x1 = npr.randint(0, int(bw / 3))
            y1 = npr.randint(0, int(bh / 3))
            x2 = npr.randint(int(2 * bw / 3), bw)
            y2 = npr.randint(int(2 * bh / 3), bh)
            background_color = background_color[y1:y2, x1:x2]
            background_color = cv2.resize(background_color,
                                          (self._width, self._height),
                                          interpolation=cv2.INTER_LINEAR)
            if len(background_color.shape) != 3:
                background_color = cv2.cvtColor(background_color,
                                                cv2.COLOR_GRAY2RGB)

            if filename_depth is not None:
                # Same crop for depth; nearest-neighbour resize so depth
                # values are not blended.
                background_depth = background_depth[y1:y2, x1:x2]
                background_depth = cv2.resize(
                    background_depth, (self._width, self._height),
                    interpolation=cv2.INTER_NEAREST)
                background_depth = self.backproject(background_depth,
                                                    intrinsics,
                                                    self.depth_factor)
        except Exception:
            # Best effort: any failure above (for example imread returning
            # None) falls back to black images. Exception rather than a
            # bare except so that KeyboardInterrupt / SystemExit still
            # propagate.
            background_color = np.zeros((self._height, self._width, 3),
                                        dtype=np.uint8)
            print('bad background_color image', filename_color)
            if filename_depth is not None:
                background_depth = np.zeros((self._height, self._width, 3),
                                            dtype=np.float32)
                print('bad depth background image')

        if len(background_color.shape) != 3:
            background_color = np.zeros((self._height, self._width, 3),
                                        dtype=np.uint8)
            print('bad background_color image', filename_color)

        if filename_depth is not None:
            if len(background_depth.shape) != 3:
                background_depth = np.zeros((self._height, self._width, 3),
                                            dtype=np.float32)
                print('bad depth background image')

            # The mask is taken before noise is added to the depth.
            z_im = background_depth[:, :, 2]
            mask_depth = z_im > 0.0
            mask_depth = mask_depth.astype(np.float32)

            if np.random.rand(1) > 0.1:
                background_depth = add_noise_depth(background_depth)
            background_depth = background_depth.transpose(2, 0, 1).astype(
                np.float32)

        # NOTE(review): augmentation is applied to loaded images only, so
        # constant-colour backgrounds stay constant -- confirm this nesting
        # against the intended behaviour.
        if np.random.rand(1) > 0.1:
            background_color = chromatic_transform(background_color)
        if np.random.rand(1) > 0.1:
            background_color = add_noise(background_color)

    background_color = background_color.astype(np.float32)
    if self.subtract_mean:
        background_color -= self._pixel_mean
    background_color = background_color.transpose(2, 0, 1) / 255.0

    sample = {
        'background_color': background_color,
        'background_depth': background_depth,
        'mask_depth': mask_depth
    }
    return sample
def _compose_item(self):
    """Compose a synthetic sample by pasting masked object crops on a canvas.

    For every sampled target class, one stored image and its mask are read,
    warped by a random scale/translation/rotation, and pasted onto the
    canvas wherever no earlier object has been pasted.

    Uses CUDA tensors (``torch.cuda.FloatTensor`` and ``.cuda()``).

    Returns:
        dict with the keys ``'image_color'``, ``'image_depth'`` (the same
        tensor as ``'image_color'``), ``'label'``, ``'mask'``,
        ``'mask_depth'`` (all zeros), ``'meta_data'``, ``'poses'`` (all
        zeros), ``'extents'``, ``'points'``, ``'symmetry'``,
        ``'gt_boxes'`` and ``'im_info'``; plus all-zero
        ``'vertex_targets'`` / ``'vertex_weights'`` when
        ``cfg.TRAIN.VERTEX_REG`` is set.
    """
    height = cfg.TRAIN.SYN_HEIGHT
    width = cfg.TRAIN.SYN_WIDTH
    classes_all = np.array(range(len(self._classes_all)))
    mask_depth_cuda = torch.cuda.FloatTensor(1, height, width).fill_(0)

    def random_affine():
        """Return a random 3x3 affine: scale, shift, rotate, shift again."""
        # rescale the image
        rescale_factor = np.random.uniform(0.1, 0.3)
        affine_1 = np.eye(3, dtype=np.float32)
        affine_1[0, 0] = rescale_factor * affine_1[0, 0]
        affine_1[1, 1] = rescale_factor * affine_1[1, 1]

        # translation to center
        delta_x = np.random.uniform(0.25, 0.5)
        delta_y = np.random.uniform(0.25, 0.5)
        affine_2 = np.eye(3, dtype=np.float32)
        affine_2[:2, :] = np.float32([[1, 0, delta_x * width],
                                      [0, 1, delta_y * height]])

        # rotation
        degree = np.random.uniform(-180.0, 180.0)
        affine_3 = np.eye(3, dtype=np.float32)
        affine_3[:2, :] = cv2.getRotationMatrix2D((width / 2, height / 2),
                                                  degree, 1)

        # translation again
        delta_x = np.random.uniform(-0.4, 0.4)
        delta_y = np.random.uniform(-0.4, 0.4)
        affine_4 = np.eye(3, dtype=np.float32)
        affine_4[:2, :] = np.float32([[1, 0, delta_x * width],
                                      [0, 1, delta_y * height]])

        # all together
        return np.dot(affine_4, np.dot(affine_3, np.dot(affine_2, affine_1)))

    # sample target objects
    if cfg.TRAIN.SYN_SAMPLE_OBJECT:
        maxnum = np.minimum(self.num_classes - 1, cfg.TRAIN.SYN_MAX_OBJECT)
        num = np.random.randint(cfg.TRAIN.SYN_MIN_OBJECT, maxnum + 1)
        perm = np.random.permutation(np.arange(self.num_classes - 1))
        indexes_target = perm[:num] + 1
    else:
        num = self.num_classes - 1
        indexes_target = np.arange(num) + 1
    cls_indexes = [cfg.TRAIN.CLASSES[i] - 1 for i in indexes_target]

    # sample poses
    im_color = np.zeros((height, width, 3), dtype=np.uint8)
    im_label = np.zeros((height, width), dtype=np.uint8)
    im_label_all = np.zeros((height, width), dtype=np.uint8)
    gt_boxes = np.zeros((self.num_classes, 5), dtype=np.float32)
    for i in range(num):
        # select image; reshuffle the per-class order once exhausted
        cls = int(cls_indexes[i])
        if self.pose_indexes_real[cls] >= len(self.pose_lists_real[cls]):
            self.pose_indexes_real[cls] = 0
            self.pose_lists_real[cls] = np.random.permutation(
                np.arange(len(self.pose_lists_real[cls])))
        index_image = self.pose_lists_real[cls][self.pose_indexes_real[cls]]
        self.pose_indexes_real[cls] += 1

        # read image
        filename = self.pose_images[cls][index_image]
        im = cv2.imread(filename, cv2.IMREAD_UNCHANGED)

        # read mask: zero-valued pixels become 1, then the result is eroded
        # with a 20x20 kernel
        filename_mask = filename[:-4] + '_mask.pbm'
        mask = cv2.imread(filename_mask, cv2.IMREAD_UNCHANGED)
        mask = np.array(mask == 0).astype(np.uint8)
        kernel = np.ones((20, 20), np.uint8)
        mask = cv2.erode(mask, kernel, iterations=1)

        # Resample the transform until part of the mask lands on the canvas.
        while True:
            affine = random_affine()
            im_final = cv2.warpAffine(im, affine[:2, :], (width, height))
            mask_final = cv2.warpAffine(mask, affine[:2, :], (width, height))
            index_foreground = np.where(mask_final == 1)
            if len(index_foreground[0]) > 0:
                break

        # paste object and label only where nothing was pasted before
        index = np.where((mask_final == 1) & (im_label_all == 0))
        im_color[index[0], index[1], :] = im_final[index[0], index[1], :]
        cls_ind = np.where(np.array(cfg.TRAIN.CLASSES) == cls + 1)[0]
        im_label[index[0], index[1]] = cls_ind
        # The box is taken from the full warped mask, so it also covers
        # pixels hidden behind previously pasted objects.
        gt_boxes[i, 0] = np.min(index_foreground[1])
        gt_boxes[i, 1] = np.min(index_foreground[0])
        gt_boxes[i, 2] = np.max(index_foreground[1])
        gt_boxes[i, 3] = np.max(index_foreground[0])
        gt_boxes[i, 4] = cls_ind
        cls_ind = np.where(classes_all == cls + 1)[0]
        im_label_all[index[0], index[1]] = cls_ind

    # foreground mask
    seg = torch.from_numpy((im_label_all != 0).astype(np.float32))
    mask = seg.unsqueeze(0).repeat((3, 1, 1)).float().cuda()

    im = im_color
    # Training-only augmentation; each transform is skipped for roughly
    # one sample in ten.
    training = cfg.MODE == 'TRAIN'
    if cfg.TRAIN.CHROMATIC and training and np.random.rand(1) > 0.1:
        im = chromatic_transform(im)
    im_cuda = torch.from_numpy(im).cuda().float() / 255.0
    if cfg.TRAIN.ADD_NOISE and training and np.random.rand(1) > 0.1:
        im_cuda = add_noise_cuda(im_cuda)
    im_cuda -= self._pixel_mean
    im_cuda = im_cuda.permute(2, 0, 1)

    # label blob: channel 0 is set everywhere no other class is present
    classes = np.array(range(self.num_classes))
    label_blob = np.zeros((self.num_classes, self._height, self._width),
                          dtype=np.float32)
    label_blob[0, :, :] = 1.0
    for i in range(1, self.num_classes):
        I = np.where(im_label == classes[i])
        if len(I[0]) > 0:
            label_blob[i, I[0], I[1]] = 1.0
            label_blob[0, I[0], I[1]] = 0.0

    # construct the meta data: flattened K followed by its pseudo-inverse
    # K is not a copy, so this assignment also writes to
    # self._intrinsic_matrix.
    K = self._intrinsic_matrix
    K[2, 2] = 1
    Kinv = np.linalg.pinv(K)
    meta_data_blob = np.zeros(18, dtype=np.float32)
    meta_data_blob[0:9] = K.flatten()
    meta_data_blob[9:18] = Kinv.flatten()

    # no vertex regression target and poses
    pose_blob = np.zeros((self.num_classes, 9), dtype=np.float32)
    vertex_targets = np.zeros((3 * self.num_classes, height, width),
                              dtype=np.float32)
    vertex_weights = np.zeros((3 * self.num_classes, height, width),
                              dtype=np.float32)

    # `im` is still laid out (height, width, channels) here, so the image
    # size is shape[0] x shape[1]; shape[1], shape[2] would report
    # (width, 3).
    im_info = np.array(
        [im.shape[0], im.shape[1], cfg.TRAIN.SCALES_BASE[0], 1],
        dtype=np.float32)

    sample = {
        'image_color': im_cuda,
        'image_depth': im_cuda,
        'label': label_blob,
        'mask': mask,
        'mask_depth': mask_depth_cuda,
        'meta_data': meta_data_blob,
        'poses': pose_blob,
        'extents': self._extents,
        'points': self._point_blob,
        'symmetry': self._symmetry,
        'gt_boxes': gt_boxes,
        'im_info': im_info
    }
    if cfg.TRAIN.VERTEX_REG:
        sample['vertex_targets'] = vertex_targets
        sample['vertex_weights'] = vertex_weights

    return sample
def _render_item(self):
    """Render one synthetic training sample with ``cfg.renderer``.

    The method samples a set of target objects (and optionally distractor
    objects), draws a random pose for each, randomizes the lighting,
    renders color / segmentation / point-cloud images on the GPU and then
    assembles the training blobs.

    Side effects:
        * advances ``self.pose_indexes`` and reshuffles ``self.pose_lists``
          when a per-class pose list is exhausted;
        * writes ``1`` into ``self._intrinsic_matrix[2, 2]`` in place;
        * reconfigures ``cfg.renderer`` (poses, light, projection matrix);
        * consumes numbers from the global NumPy random generator.

    Returns:
        dict: with keys ``image_color``, ``image_depth``, ``im_depth``,
        ``label``, ``mask``, ``mask_depth``, ``meta_data``, ``poses``,
        ``extents``, ``points``, ``symmetry``, ``gt_boxes`` and ``im_info``.
        ``vertex_targets`` / ``vertex_weights`` are added when
        ``cfg.TRAIN.VERTEX_REG`` or ``cfg.TRAIN.VERTEX_REG_DELTA`` is set.
    """
    height = cfg.TRAIN.SYN_HEIGHT
    width = cfg.TRAIN.SYN_WIDTH
    fx = self._intrinsic_matrix[0, 0]
    fy = self._intrinsic_matrix[1, 1]
    px = self._intrinsic_matrix[0, 2]
    py = self._intrinsic_matrix[1, 2]
    zfar = 6.0
    znear = 0.01

    # sample target objects
    if cfg.TRAIN.SYN_SAMPLE_OBJECT:
        maxnum = np.minimum(self.num_classes - 1, cfg.TRAIN.SYN_MAX_OBJECT)
        num = np.random.randint(cfg.TRAIN.SYN_MIN_OBJECT, maxnum + 1)
        perm = np.random.permutation(np.arange(self.num_classes - 1))
        indexes_target = perm[:num] + 1
    else:
        num = self.num_classes - 1
        indexes_target = np.arange(num) + 1
    num_target = num
    cls_indexes = [cfg.TRAIN.CLASSES[i] - 1 for i in indexes_target]

    # sample other objects as distractors
    if cfg.TRAIN.SYN_SAMPLE_DISTRACTOR:
        num_other = min(5, self._num_classes_other)
        num_selected = np.random.randint(0, num_other + 1)
        perm = np.random.permutation(np.arange(self._num_classes_other))
        indexes = perm[:num_selected]
        for i in range(num_selected):
            cls_indexes.append(self._classes_other[indexes[i]] - 1)
    else:
        num_selected = 0

    # sample poses: each pose is [tx, ty, tz, quaternion(4)]
    num = num_target + num_selected
    poses_all = []
    for i in range(num):
        qt = np.zeros((7, ), dtype=np.float32)

        # rotation: walk through a shuffled list of euler angles per class,
        # reshuffling once the list is exhausted
        cls = int(cls_indexes[i])
        if self.pose_indexes[cls] >= len(self.pose_lists[cls]):
            self.pose_indexes[cls] = 0
            self.pose_lists[cls] = np.random.permutation(
                np.arange(len(self.eulers)))
        euler = self.eulers[self.pose_lists[cls][self.pose_indexes[cls]]]
        # angles are in degrees; jitter each with Gaussian noise (std 15)
        yaw = euler[0] + 15 * np.random.randn()
        pitch = euler[1] + 15 * np.random.randn()
        pitch = np.clip(pitch, -90, 90)
        roll = euler[2] + 15 * np.random.randn()
        qt[3:] = euler2quat(yaw * math.pi / 180.0,
                            pitch * math.pi / 180.0,
                            roll * math.pi / 180.0, 'syxz')
        self.pose_indexes[cls] += 1

        # translation
        bound = cfg.TRAIN.SYN_BOUND
        if i == 0 or i >= num_target or np.random.rand(1) > 0.5:
            # independent placement inside the sampling volume
            qt[0] = np.random.uniform(-bound, bound)
            qt[1] = np.random.uniform(-bound, bound)
            qt[2] = np.random.uniform(cfg.TRAIN.SYN_TNEAR,
                                      cfg.TRAIN.SYN_TFAR)
        else:
            # sample an object nearby
            object_id = np.random.randint(0, i, size=1)[0]
            extent = 2 * np.mean(self._extents_all[cls + 1, :])

            flag = np.random.randint(0, 2)
            if flag == 0:
                flag = -1
            qt[0] = poses_all[object_id][
                0] + flag * extent * np.random.uniform(1.0, 1.5)
            # out of bounds: try the opposite side, then fall back to uniform
            if np.absolute(qt[0]) > bound:
                qt[0] = poses_all[object_id][
                    0] - flag * extent * np.random.uniform(1.0, 1.5)
            if np.absolute(qt[0]) > bound:
                qt[0] = np.random.uniform(-bound, bound)

            flag = np.random.randint(0, 2)
            if flag == 0:
                flag = -1
            qt[1] = poses_all[object_id][
                1] + flag * extent * np.random.uniform(1.0, 1.5)
            if np.absolute(qt[1]) > bound:
                qt[1] = poses_all[object_id][
                    1] - flag * extent * np.random.uniform(1.0, 1.5)
            if np.absolute(qt[1]) > bound:
                qt[1] = np.random.uniform(-bound, bound)

            # place in front of the reference object, or behind it when that
            # would come closer than the near limit
            qt[2] = poses_all[object_id][2] - extent * np.random.uniform(
                2.0, 4.0)
            if qt[2] < cfg.TRAIN.SYN_TNEAR:
                qt[2] = poses_all[object_id][
                    2] + extent * np.random.uniform(2.0, 4.0)

        poses_all.append(qt)
    cfg.renderer.set_poses(poses_all)

    # sample lighting
    cfg.renderer.set_light_pos(np.random.uniform(-0.5, 0.5, 3))
    intensity = np.random.uniform(0.8, 2)
    light_color = intensity * np.random.uniform(0.9, 1.1, 3)
    cfg.renderer.set_light_color(light_color)

    # rendering
    cfg.renderer.set_projection_matrix(width, height, fx, fy, px, py, znear,
                                       zfar)
    image_tensor = torch.cuda.FloatTensor(height, width, 4).detach()
    seg_tensor = torch.cuda.FloatTensor(height, width, 4).detach()
    pc_tensor = torch.cuda.FloatTensor(height, width, 4).detach()
    cfg.renderer.render(cls_indexes,
                        image_tensor,
                        seg_tensor,
                        pc2_tensor=pc_tensor)
    # the rendered buffers are flipped along the first (row) axis
    image_tensor = image_tensor.flip(0)
    seg_tensor = seg_tensor.flip(0)
    pc_tensor = pc_tensor.flip(0)

    # foreground mask: any pixel with a non-zero segmentation color
    seg = seg_tensor[:, :, 2] + 256 * seg_tensor[:, :, 1] \
        + 256 * 256 * seg_tensor[:, :, 0]
    mask = (seg != 0).unsqueeze(0).repeat((3, 1, 1)).float()

    # RGB to BGR order
    im = image_tensor.cpu().numpy()
    im = np.clip(im, 0, 1)
    im = im[:, :, (2, 1, 0)] * 255
    im = im.astype(np.uint8)

    # XYZ coordinates in camera frame
    im_depth = pc_tensor.cpu().numpy()
    im_depth = im_depth[:, :, :3]
    im_depth_return = im_depth[:, :, 2].copy()

    im_label = seg_tensor.cpu().numpy()
    im_label = im_label[:, :, (2, 1, 0)] * 255
    im_label = np.round(im_label).astype(np.uint8)
    im_label = np.clip(im_label, 0, 255)
    im_label, im_label_all = self.process_label_image(im_label)

    # object centers: the renderer reports normalized (row, col) pairs, which
    # are scaled here to pixel (x, y); only target objects are kept
    centers = np.zeros((num, 2), dtype=np.float32)
    rcenters = cfg.renderer.get_centers()
    for i in range(num):
        centers[i, 0] = rcenters[i][1] * width
        centers[i, 1] = rcenters[i][0] * height
    centers = centers[:num_target, :]

    # chromatic transform
    if cfg.TRAIN.CHROMATIC and cfg.MODE == 'TRAIN' and np.random.rand(
            1) > 0.1:
        im = chromatic_transform(im)

    im_cuda = torch.from_numpy(im).cuda().float() / 255.0
    if cfg.TRAIN.ADD_NOISE and cfg.MODE == 'TRAIN' and np.random.rand(
            1) > 0.1:
        im_cuda = add_noise_cuda(im_cuda)
    im_cuda -= self._pixel_mean
    im_cuda = im_cuda.permute(2, 0, 1)

    if cfg.INPUT == 'DEPTH' or cfg.INPUT == 'RGBD':
        # depth mask: pixels with a positive z value
        z_im = im_depth[:, :, 2]
        mask_depth = z_im > 0.0
        mask_depth = mask_depth.astype('float')
        mask_depth_cuda = torch.from_numpy(mask_depth).cuda().float()
        mask_depth_cuda.unsqueeze_(0)

        im_cuda_depth = torch.from_numpy(im_depth).cuda().float()
        if cfg.TRAIN.ADD_NOISE and cfg.MODE == 'TRAIN' and np.random.rand(
                1) > 0.1:
            im_cuda_depth = add_noise_depth_cuda(im_cuda_depth)
        im_cuda_depth = im_cuda_depth.permute(2, 0, 1)
    else:
        # no depth input: reuse the color blob and an all-zero depth mask
        im_cuda_depth = im_cuda.clone()
        mask_depth_cuda = torch.cuda.FloatTensor(1, height, width).fill_(0)

    # label blob: one-hot per class, channel 0 is background
    classes = np.array(range(self.num_classes))
    label_blob = np.zeros((self.num_classes, self._height, self._width),
                          dtype=np.float32)
    label_blob[0, :, :] = 1.0
    for i in range(1, self.num_classes):
        I = np.where(im_label == classes[i])
        if len(I[0]) > 0:
            label_blob[i, I[0], I[1]] = 1.0
            label_blob[0, I[0], I[1]] = 0.0

    # poses and boxes
    pose_blob = np.zeros((self.num_classes, 9), dtype=np.float32)
    gt_boxes = np.zeros((self.num_classes, 5), dtype=np.float32)
    for i in range(num_target):
        cls = int(indexes_target[i])
        pose_blob[i, 0] = 1
        pose_blob[i, 1] = cls
        T = poses_all[i][:3]
        qt = poses_all[i][3:]

        # egocentric to allocentric; keep the first quaternion component >= 0
        qt_allocentric = egocentric2allocentric(qt, T)
        if qt_allocentric[0] < 0:
            qt_allocentric = -1 * qt_allocentric
        pose_blob[i, 2:6] = qt_allocentric
        pose_blob[i, 6:] = T

        # compute box: project the model points and take their 2D extent
        x3d = np.ones((4, self._points_all.shape[1]), dtype=np.float32)
        x3d[0, :] = self._points_all[cls, :, 0]
        x3d[1, :] = self._points_all[cls, :, 1]
        x3d[2, :] = self._points_all[cls, :, 2]
        RT = np.zeros((3, 4), dtype=np.float32)
        RT[:3, :3] = quat2mat(qt)
        RT[:, 3] = T
        x2d = np.matmul(self._intrinsic_matrix, np.matmul(RT, x3d))
        x2d[0, :] = np.divide(x2d[0, :], x2d[2, :])
        x2d[1, :] = np.divide(x2d[1, :], x2d[2, :])

        gt_boxes[i, 0] = np.min(x2d[0, :])
        gt_boxes[i, 1] = np.min(x2d[1, :])
        gt_boxes[i, 2] = np.max(x2d[0, :])
        gt_boxes[i, 3] = np.max(x2d[1, :])
        gt_boxes[i, 4] = cls

    # construct the meta data
    # format of the meta_data:
    #   intrinsic matrix:         meta_data[0 ~ 8]
    #   inverse intrinsic matrix: meta_data[9 ~ 17]
    # K is the same object as self._intrinsic_matrix, so the write below
    # happens in place on the instance attribute.
    K = self._intrinsic_matrix
    K[2, 2] = 1
    Kinv = np.linalg.pinv(K)
    meta_data_blob = np.zeros(18, dtype=np.float32)
    meta_data_blob[0:9] = K.flatten()
    meta_data_blob[9:18] = Kinv.flatten()

    # vertex regression target
    if cfg.TRAIN.VERTEX_REG:
        vertex_targets, vertex_weights = self._generate_vertex_targets(
            im_label, indexes_target, centers, poses_all, classes,
            self.num_classes)
    elif cfg.TRAIN.VERTEX_REG_DELTA and (cfg.INPUT == 'DEPTH'
                                         or cfg.INPUT == 'RGBD'):
        # The parentheses matter: ``and`` binds tighter than ``or``, so
        # without them the deltas were computed for every RGBD sample even
        # with VERTEX_REG_DELTA disabled (and then thrown away below).
        vertex_targets, vertex_weights = self._generate_vertex_deltas(
            im_label, indexes_target, centers, poses_all, classes,
            self.num_classes, im_depth)
    else:
        vertex_targets = []
        vertex_weights = []

    # NOTE(review): ``im`` looks like an (H, W, 3) array at this point, so
    # shape[1] / shape[2] would be the width and the channel count rather
    # than height and width -- confirm against the consumers of 'im_info'.
    im_info = np.array(
        [im.shape[1], im.shape[2], cfg.TRAIN.SCALES_BASE[0], 1],
        dtype=np.float32)

    sample = {
        'image_color': im_cuda,
        'image_depth': im_cuda_depth,
        'im_depth': im_depth_return,
        'label': label_blob,
        'mask': mask,
        'mask_depth': mask_depth_cuda,
        'meta_data': meta_data_blob,
        'poses': pose_blob,
        'extents': self._extents,
        'points': self._point_blob,
        'symmetry': self._symmetry,
        'gt_boxes': gt_boxes,
        'im_info': im_info
    }

    if cfg.TRAIN.VERTEX_REG or cfg.TRAIN.VERTEX_REG_DELTA:
        sample['vertex_targets'] = vertex_targets
        sample['vertex_weights'] = vertex_weights

    return sample
def __getitem__(self, idx):
    """Load the training sample stored at position ``idx``.

    ``self.data[idx]`` holds three paths in the order
    ``[rgb, depth, segmentation]``. The segmentation image is read as
    labels, the color image is read with OpenCV, and - when ``cfg.INPUT`` is
    ``'DEPTH'`` or ``'RGBD'`` - the depth image is converted through
    ``self.process_depth``.

    Returns:
        dict: ``'label'`` and ``'image_color'`` tensors, plus ``'depth'``
        when depth input is enabled.
    """
    entry = self.data[idx]  # (idx: [rgb, d, seg])
    rgb_path, depth_path, segmentation_path = entry[0], entry[1], entry[2]

    # Labels first; label value 1 (the table) is merged into the background.
    labels = util_.imread_indexed(segmentation_path)
    labels[labels == 1] = 0
    labels = self.process_label(labels)

    # BGR image
    bgr = cv2.imread(rgb_path)

    # Depth image (16-bit single-channel, shape [H x W]) -> processed depth
    use_depth = cfg.INPUT in ('DEPTH', 'RGBD')
    xyz_img = None
    if use_depth:
        raw_depth = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH)
        xyz_img = self.process_depth(raw_depth)

    # Optional crop; labels are re-processed after cropping.
    if cfg.TRAIN.SYN_CROP:
        bgr, labels, xyz_img = self.pad_crop_resize(bgr, labels, xyz_img)
        labels = self.process_label(labels)

    # Optional sub-sampling of labelled pixels.
    if cfg.TRAIN.EMBEDDING_SAMPLING:
        labels = self.sample_pixels(labels,
                                    cfg.TRAIN.EMBEDDING_SAMPLING_NUM)

    label_blob = torch.from_numpy(labels).unsqueeze(0)

    # Training-time augmentations, each skipped for roughly 10% of samples.
    if cfg.TRAIN.CHROMATIC and cfg.MODE == 'TRAIN' and np.random.rand(
            1) > 0.1:
        bgr = chromatic_transform(bgr)
    if cfg.TRAIN.ADD_NOISE and cfg.MODE == 'TRAIN' and np.random.rand(
            1) > 0.1:
        bgr = add_noise(bgr)

    # Scale to [0, 1], subtract the pixel mean, move channels first.
    normalized = torch.from_numpy(bgr) / 255.0
    normalized -= self._pixel_mean

    result = {
        'label': label_blob,
        'image_color': normalized.permute(2, 0, 1),
    }
    if use_depth:
        result['depth'] = torch.from_numpy(xyz_img).permute(2, 0, 1)
    return result
def _get_image_blob(roidb, scale_ind):
    """Builds an input blob from the images in the roidb at the specified
    scales.

    For every roidb entry the color image, a normalized depth image and a
    surface-normal image are loaded, mean-subtracted, optionally flipped and
    resized by ``cfg.TRAIN.SCALES_BASE[scale_ind]``.

    Args:
        roidb: sequence of entries providing the keys ``'meta_data'``,
            ``'depth'``, ``'image'`` and ``'flipped'``.
        scale_ind: index into ``cfg.TRAIN.SCALES_BASE``.

    Returns:
        tuple: ``(blob, blob_depth, blob_normal, im_scales)`` - the three
        blobs produced by ``im_list_to_blob`` and the list of scales used,
        one per image.
    """
    num_images = len(roidb)
    processed_ims = []
    processed_ims_depth = []
    processed_ims_normal = []
    im_scales = []
    # ``range`` instead of ``xrange``: the latter does not exist on Python 3.
    for i in range(num_images):
        entry = roidb[i]

        # meta data
        meta_data = scipy.io.loadmat(entry['meta_data'])
        K = meta_data['intrinsic_matrix'].astype(np.float32, copy=True)
        fx = K[0, 0]
        fy = K[1, 1]
        cx = K[0, 2]
        cy = K[1, 2]

        # depth raw
        im_depth_raw = pad_im(
            cv2.imread(entry['depth'], cv2.IMREAD_UNCHANGED), 16)

        # rgba: pixels with zero alpha are painted white
        rgba = pad_im(cv2.imread(entry['image'], cv2.IMREAD_UNCHANGED), 16)
        if rgba.shape[2] == 4:
            im = np.copy(rgba[:, :, :3])
            alpha = rgba[:, :, 3]
            I = np.where(alpha == 0)
            im[I[0], I[1], :] = 255
        else:
            im = rgba

        # chromatic transform
        if cfg.TRAIN.CHROMATIC:
            im = chromatic_transform(im)

        # mask the color image according to depth
        if cfg.EXP_DIR == 'rgbd_scene':
            I = np.where(im_depth_raw == 0)
            im[I[0], I[1], :] = 0

        if entry['flipped']:
            im = im[:, ::-1, :]

        im_orig = im.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_scale = cfg.TRAIN.SCALES_BASE[scale_ind]
        im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale,
                        interpolation=cv2.INTER_LINEAR)
        im_scales.append(im_scale)
        processed_ims.append(im)

        # depth: rescale to [0, 255] by the image maximum and replicate to
        # three channels
        # NOTE(review): an all-zero depth image makes the divisor 0 here.
        im_depth = im_depth_raw.astype(np.float32, copy=True) / float(
            im_depth_raw.max()) * 255
        im_depth = np.tile(im_depth[:, :, np.newaxis], (1, 1, 3))

        if entry['flipped']:
            im_depth = im_depth[:, ::-1]

        im_orig = im_depth.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_depth = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale,
                              interpolation=cv2.INTER_LINEAR)
        processed_ims_depth.append(im_depth)

        # normals: computed from the depth divided by 'factor_depth', then
        # mapped from nmap * 127.5 + 127.5 into uint8 and channel-reversed
        depth = im_depth_raw.astype(np.float32, copy=True) / float(
            meta_data['factor_depth'])
        nmap = gpu_normals.gpu_normals(depth, fx, fy, cx, cy, 20.0,
                                       cfg.GPU_ID)
        im_normal = 127.5 * nmap + 127.5
        im_normal = im_normal.astype(np.uint8)
        im_normal = im_normal[:, :, (2, 1, 0)]

        if entry['flipped']:
            im_normal = im_normal[:, ::-1, :]

        im_orig = im_normal.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS
        im_normal = cv2.resize(im_orig, None, None, fx=im_scale,
                               fy=im_scale, interpolation=cv2.INTER_LINEAR)
        processed_ims_normal.append(im_normal)

    # Create a blob to hold the input images
    blob = im_list_to_blob(processed_ims, 3)
    blob_depth = im_list_to_blob(processed_ims_depth, 3)
    blob_normal = im_list_to_blob(processed_ims_normal, 3)

    return blob, blob_depth, blob_normal, im_scales