def _process_request(self, cv_color, cv_depth, bbox):
    """
    Takes an RGB-D image pair and bounding box coordinates and infers the 3D keypoints.
    """
    # Parse the bounding box
    top_left, bottom_right = PixelCoord(), PixelCoord()
    top_left.x = bbox.x_offset
    top_left.y = bbox.y_offset
    bottom_right.x = bbox.x_offset + bbox.width
    bottom_right.y = bbox.y_offset + bbox.height

    # Perform the inference
    imgproc_out = inference.proc_input_img_raw(cv_color, cv_depth, top_left, bottom_right)
    keypointxy_depth_scaled = self._query_network(imgproc_out)
    keypointxy_depth_realunit = inference.get_keypoint_xy_depth_real_unit(
        keypointxy_depth_scaled)
    keypoint_xy_depth_img, camera_keypoint = inference.get_3d_prediction_K(
        keypointxy_depth_realunit, imgproc_out.bbox2patch, self.K_inv)
    return keypoint_xy_depth_img, camera_keypoint
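# A minimal sketch of how _process_request might be invoked, assuming a ROS
# environment (the bbox fields match sensor_msgs/RegionOfInterest). The image
# paths and the `detector` instance are hypothetical placeholders.
import cv2
from sensor_msgs.msg import RegionOfInterest

cv_color = cv2.imread('scene_rgb.png')                        # hypothetical path
cv_depth = cv2.imread('scene_depth.png', cv2.IMREAD_ANYDEPTH)  # hypothetical path

bbox = RegionOfInterest()
bbox.x_offset, bbox.y_offset = 100, 80
bbox.width, bbox.height = 160, 120

keypoint_xy_depth_img, camera_keypoint = detector._process_request(cv_color, cv_depth, bbox)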
def test_in_image(self):
    from mankey.utils.imgproc import PixelCoord, rectify_bbox_in_image

    # Some test of rectification
    topleft, bottomright = PixelCoord(), PixelCoord()
    topleft.x = 0
    topleft.y = 10
    bottomright.x = 20
    bottomright.y = 50

    # Test of in_image: the rectified box must be square
    rectified_topleft, rectified_bottomright = rectify_bbox_in_image(
        topleft, bottomright, 640, 480)
    self.assertEqual(
        rectified_bottomright.x - rectified_topleft.x,
        rectified_bottomright.y - rectified_topleft.y)
    self.assertEqual(rectified_bottomright.x - rectified_topleft.x, 40)
import numpy as np

from mankey.utils.imgproc import PixelCoord


class SupervisedKeypointDBEntry:
    # The path to the RGB image; required
    rgb_image_path = ''
    # The path to the depth image
    depth_image_path = ''
    # The path to the mask image
    binary_mask_path = ''

    # A zero-length path indicates no depth
    @property
    def has_depth(self):
        return len(self.depth_image_path) > 0

    @property
    def has_mask(self):
        return len(self.binary_mask_path) > 0

    # The bounding box is tight
    bbox_top_left = PixelCoord()
    bbox_bottom_right = PixelCoord()

    # The information related to keypoints.
    # Each of these arrays should have shape (3, n_keypoint):
    # the first axis iterates over x, y, z (or pixel_x, pixel_y, depth),
    # the second axis iterates over keypoints.

    # The position of each keypoint expressed in the camera frame, in meters
    keypoint_camera = None

    # (pixel_x, pixel_y, mm_depth) for each keypoint.
    # Note that the pixel might be outside the image space
    keypoint_pixelxy_depth = None

    # Each element indicates the validity of the corresponding keypoint
    # coordinate: 1 means valid, 0 means not valid
    keypoint_validity_weight = None
    on_boundary = False

    # The pose of the camera in the world, as a homogeneous transformation matrix
    camera_in_world = np.ndarray(shape=[4, 4])

    # xyzrot
    delta_rotation_matrix = np.ndarray(shape=[3, 3])
    delta_translation = np.ndarray(shape=[3, ])
    gripper_pose = np.ndarray(shape=[4, 4])
    step_size = np.ndarray(shape=[1, ])
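# One caveat: the class-level PixelCoord() and np.ndarray defaults above are
# created once and shared by every instance until overwritten, so mutating one
# entry's bbox in place can leak into another entry. A minimal sketch of a
# safer variant (an assumption, not the repository's actual definition) moves
# the mutable defaults into __init__:
class SupervisedKeypointDBEntrySafe(SupervisedKeypointDBEntry):
    def __init__(self):
        # Fresh objects per instance
        self.bbox_top_left = PixelCoord()
        self.bbox_bottom_right = PixelCoord()
        self.camera_in_world = np.eye(4)
        self.delta_rotation_matrix = np.eye(3)
        self.delta_translation = np.zeros(3)
        self.gripper_pose = np.eye(4)
        self.step_size = np.zeros(1)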
def test_center_aligned(self):
    from mankey.utils.imgproc import PixelCoord, rectify_bbox_center_align

    # Some test of rectification
    topleft, bottomright = PixelCoord(), PixelCoord()
    topleft.x = 0
    topleft.y = 10
    bottomright.x = 20
    bottomright.y = 50

    # Test of center-aligned: the rectified box must be square
    # and share the original box's center
    rectified_topleft, rectified_bottomright = rectify_bbox_center_align(
        topleft, bottomright)
    self.assertEqual(
        rectified_bottomright.x - rectified_topleft.x,
        rectified_bottomright.y - rectified_topleft.y)
    self.assertEqual(
        rectified_bottomright.x + rectified_topleft.x,
        topleft.x + bottomright.x)
    self.assertEqual(
        rectified_bottomright.y + rectified_topleft.y,
        topleft.y + bottomright.y)
def process_raw(
        self,
        cv_rgb,    # type: np.ndarray
        cv_depth,  # type: np.ndarray
        bbox,      # type: np.ndarray, [x_min, y_min, x_max, y_max]
):  # type: (np.ndarray, np.ndarray, np.ndarray) -> np.ndarray
    # Parse the bounding box
    top_left, bottom_right = PixelCoord(), PixelCoord()
    top_left.x = bbox[0]
    top_left.y = bbox[1]
    bottom_right.x = bbox[2]
    bottom_right.y = bbox[3]

    # Perform the inference
    imgproc_out = inference.proc_input_img_raw(cv_rgb, cv_depth, top_left, bottom_right)
    keypointxy_depth_scaled = inference.inference_resnet_nostage(
        self._network, imgproc_out)
    keypointxy_depth_realunit = inference.get_keypoint_xy_depth_real_unit(
        keypointxy_depth_scaled)
    _, camera_keypoint = inference.get_3d_prediction(
        keypointxy_depth_realunit, imgproc_out.bbox2patch)
    return camera_keypoint
def process_request_raw(
        self,
        cv_color,  # type: np.ndarray
        cv_depth,  # type: np.ndarray
        bbox,      # type: RegionOfInterest
):  # type: (np.ndarray, np.ndarray, RegionOfInterest) -> np.ndarray
    # Parse the bounding box
    top_left, bottom_right = PixelCoord(), PixelCoord()
    top_left.x = bbox.x_offset
    top_left.y = bbox.y_offset
    bottom_right.x = bbox.x_offset + bbox.width
    bottom_right.y = bbox.y_offset + bbox.height

    # Perform the inference
    imgproc_out = inference.proc_input_img_raw(
        cv_color, cv_depth, top_left, bottom_right)
    keypointxy_depth_scaled = inference.inference_resnet_nostage(
        self._network, imgproc_out)
    keypointxy_depth_realunit = inference.get_keypoint_xy_depth_real_unit(
        keypointxy_depth_scaled)
    _, camera_keypoint = inference.get_3d_prediction(
        keypointxy_depth_realunit, imgproc_out.bbox2patch)
    return camera_keypoint
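# The two entry points accept the same box in different encodings: process_raw
# takes a corner-format array, process_request_raw an offset-plus-size ROS
# RegionOfInterest. This conversion sketch assumes a hypothetical `roi` message
# and `estimator` instance.
import numpy as np

bbox_corners = np.array([
    roi.x_offset,               # x_min
    roi.y_offset,               # y_min
    roi.x_offset + roi.width,   # x_max
    roi.y_offset + roi.height,  # y_max
])
# camera_keypoint = estimator.process_raw(cv_rgb, cv_depth, bbox_corners)
# ...is then equivalent to:
# camera_keypoint = estimator.process_request_raw(cv_rgb, cv_depth, roi)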
from typing import Tuple

import numpy as np


def _get_transformed_keypoint(
        transform: np.ndarray,
        entry: SupervisedKeypointDBEntry,
        patch_width: int,
        patch_height: int) -> Tuple[np.ndarray, np.ndarray]:
    """
    Given the bounding-box-to-patch transform, compute the transformed
    keypoints and their validity. Note that a transformed pixel might not
    have integer coordinates.
    :param transform: 3x3 homogeneous transform matrix
    :param entry:
    :param patch_width:
    :param patch_height:
    :return: A tuple containing the transformed pixelxy_depth and validity
    """
    from mankey.utils.imgproc import transform_2d, PixelCoord, pixel_in_bbox

    # Allocate the space
    n_keypoint = entry.keypoint_pixelxy_depth.shape[1]
    transformed_pixelxy_depth = np.zeros((3, n_keypoint))
    transformed_validity_weight = np.ones((3, n_keypoint))

    # Construct the patch bounding box
    top_left = PixelCoord()
    top_left.x = 0
    top_left.y = 0
    bottom_right = PixelCoord()
    bottom_right.x = patch_width
    bottom_right.y = patch_height

    # Do the transform
    pixel = PixelCoord()
    for i in range(n_keypoint):
        transformed_pixelxy_depth[0:2, i] = transform_2d(
            entry.keypoint_pixelxy_depth[0:2, i], transform)
        transformed_pixelxy_depth[2, i] = entry.keypoint_pixelxy_depth[2, i]

        # Check validity: a keypoint outside the patch is invalid in all dimensions
        pixel.x = int(transformed_pixelxy_depth[0, i])
        pixel.y = int(transformed_pixelxy_depth[1, i])
        if not pixel_in_bbox(pixel, top_left, bottom_right):
            transformed_validity_weight[:, i] = 0

    # OK
    return transformed_pixelxy_depth, transformed_validity_weight
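# A minimal sketch of the kind of 3x3 homogeneous transform this function
# expects: an affine map taking bbox pixels to patch pixels. The scale-then-
# translate composition below is an assumption about the bbox2patch
# convention, not taken from the repository.
import numpy as np

scale_x, scale_y = 256.0 / 160.0, 256.0 / 120.0          # patch_size / bbox_size
offset_x, offset_y = -100.0 * scale_x, -80.0 * scale_y   # move bbox corner to origin
bbox2patch = np.array([
    [scale_x, 0.0,     offset_x],
    [0.0,     scale_y, offset_y],
    [0.0,     0.0,     1.0],
])

# Applying it to a pixel in homogeneous coordinates:
pixel_xy = np.array([150.0, 100.0, 1.0])
patch_xy = bbox2patch @ pixel_xy  # -> coordinates inside the 256x256 patch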
def _get_image_entry(self, image_map, scene_root: str) -> SupervisedKeypointDBEntry:
    entry = SupervisedKeypointDBEntry()

    # The paths for the rgb images; multi-view, so every view is kept
    rgb_path = []
    for rgb_name in image_map['rgb_image_filename']:
        rgb_path.append(os.path.join(scene_root, 'processed/images/' + rgb_name))
    entry.rgb_image_path = rgb_path

    # The paths for the depth images; Spartan must have depth images
    depth_path = []
    for depth_name in image_map['depth_image_filename']:
        path = os.path.join(scene_root, 'processed/images/' + depth_name)
        assert os.path.exists(path)
        depth_path.append(path)
    entry.depth_image_path = depth_path

    # The path for the pcd
    pcd_name = image_map['pcd']
    pcd_path = os.path.join(scene_root, 'processed/pcd_seg_heatmap_3kpt/' + pcd_name)
    assert os.path.exists(pcd_path)
    entry.pcd_path = pcd_path

    # pcd centroid & pcd mean
    entry.pcd_centroid = np.array(image_map['pcd_centroid'])
    entry.pcd_mean = np.array(image_map['pcd_mean'])

    # xyzrot
    entry.delta_rotation_matrix = np.array(image_map['delta_rotation_matrix']).reshape((3, 3))
    entry.delta_translation = np.array(image_map['delta_translation']).reshape((3,))
    entry.gripper_pose = np.array(image_map['gripper_pose']).reshape((4, 4))

    # The step size is the norm of the translation, converted from meters
    # to centimeters and clamped to [0, 1]
    step_size_value = np.linalg.norm(entry.delta_translation)
    if step_size_value == 0:
        entry.unit_delta_translation = entry.delta_translation
    else:
        entry.unit_delta_translation = entry.delta_translation / step_size_value
    step_size_value = step_size_value * 100
    if step_size_value >= 1.0:
        entry.step_size = np.array([1.0]).reshape((1,))
    else:
        entry.step_size = np.array([step_size_value]).reshape((1,))

    # The camera pose in world
    camera2world_map = image_map['camera_to_world']
    entry.camera_in_world = camera2world_from_map(camera2world_map)

    # The bounding box
    top_left = PixelCoord()
    bottom_right = PixelCoord()
    top_left.x, top_left.y = image_map['bbox_top_left_xy'][0], image_map['bbox_top_left_xy'][1]
    bottom_right.x, bottom_right.y = image_map['bbox_bottom_right_xy'][0], image_map['bbox_bottom_right_xy'][1]
    entry.bbox_top_left = top_left
    entry.bbox_bottom_right = bottom_right

    # The number of keypoints
    keypoint_camera_frame_list = image_map['3d_keypoint_camera_frame']
    n_keypoint = len(keypoint_camera_frame_list)
    if self._num_keypoint < 0:
        self._num_keypoint = n_keypoint
    else:
        assert self._num_keypoint == n_keypoint

    # The keypoints in camera frame
    entry.keypoint_camera = np.zeros((3, n_keypoint))
    for i in range(n_keypoint):
        for j in range(3):
            entry.keypoint_camera[j, i] = keypoint_camera_frame_list[i][j]

    # The pixel coordinate and depth of each keypoint
    keypoint_pixelxy_depth_list = image_map['keypoint_pixel_xy_depth']
    assert n_keypoint == len(keypoint_pixelxy_depth_list)
    entry.keypoint_pixelxy_depth = np.zeros((3, n_keypoint), dtype=int)
    for i in range(n_keypoint):
        for j in range(3):
            entry.keypoint_pixelxy_depth[j, i] = keypoint_pixelxy_depth_list[i][j]

    # Check the validity
    entry.keypoint_validity_weight = np.ones((3, n_keypoint))
    for i in range(n_keypoint):
        pixel = PixelCoord()
        pixel.x = entry.keypoint_pixelxy_depth[0, i]
        pixel.y = entry.keypoint_pixelxy_depth[1, i]
        depth_mm = entry.keypoint_pixelxy_depth[2, i]
        valid = True
        if depth_mm < 0:  # The depth cannot be negative
            valid = False
        # The pixel must be in the bounding box
        if not pixel_in_bbox(pixel, entry.bbox_top_left, entry.bbox_bottom_right):
            valid = False
        # Invalidate all the dimensions
        if not valid:
            entry.keypoint_validity_weight[:, i] = 0
            entry.on_boundary = True

    # OK
    return entry


def _check_image_entry(self, entry: SupervisedKeypointDBEntry) -> bool:
    # Check the bounding box
    if entry.bbox_top_left.x is None or entry.bbox_top_left.y is None:
        return False
    if entry.bbox_bottom_right.x is None or entry.bbox_bottom_right.y is None:
        return False
    # OK
    return True
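# Read together, the keys consumed by _get_image_entry implicitly define the
# per-image annotation schema. The keys below are the ones the loader actually
# reads; the values are hypothetical placeholders (filenames, coordinates, and
# the camera_to_world structure are not taken from the repository).
image_map = {
    'rgb_image_filename': ['000000_rgb.png', '000001_rgb.png'],
    'depth_image_filename': ['000000_depth.png', '000001_depth.png'],
    'pcd': '000000_pcd.npy',
    'pcd_centroid': [0.0, 0.0, 0.5],
    'pcd_mean': [0.0, 0.0, 0.0],
    'delta_rotation_matrix': [1, 0, 0, 0, 1, 0, 0, 0, 1],  # row-major 3x3
    'delta_translation': [0.0, 0.0, 0.005],                # meters
    'gripper_pose': [1, 0, 0, 0,
                     0, 1, 0, 0,
                     0, 0, 1, 0,
                     0, 0, 0, 1],                          # row-major 4x4
    'camera_to_world': {},  # structure consumed by camera2world_from_map (not shown)
    'bbox_top_left_xy': [100, 80],
    'bbox_bottom_right_xy': [260, 200],
    '3d_keypoint_camera_frame': [[0.01, 0.02, 0.5]],       # meters, one per keypoint
    'keypoint_pixel_xy_depth': [[150, 120, 500]],          # pixel x, pixel y, depth in mm
}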
def _get_image_entry(self, image_map, scene_root: str) -> SupervisedKeypointDBEntry:
    entry = SupervisedKeypointDBEntry()

    # The path for the rgb image
    rgb_name = image_map['rgb_image_filename']
    rgb_path = os.path.join(scene_root, 'processed/images/' + rgb_name)
    assert os.path.exists(rgb_path)
    entry.rgb_image_path = rgb_path

    # The path for the depth image; Spartan must have a depth image
    depth_name = image_map['depth_image_filename']
    depth_path = os.path.join(scene_root, 'processed/images/' + depth_name)
    assert os.path.exists(depth_path)
    entry.depth_image_path = depth_path

    '''
    # The path for the mask image
    mask_name = depth_name[0:6] + '_mask.png'
    mask_path = os.path.join(scene_root, 'processed/image_masks/' + mask_name)
    assert os.path.exists(mask_path)
    entry.binary_mask_path = mask_path
    '''

    # xyzrot
    entry.delta_rotation_matrix = np.array(image_map['delta_rotation_matrix']).reshape((3, 3))
    entry.delta_translation = np.array(image_map['delta_translation']).reshape((3,))
    entry.gripper_pose = np.array(image_map['gripper_pose']).reshape((4, 4))

    # The step size from the annotation, clamped to [0, 1]
    step_size_value = max(min(image_map['step_size'], 1.0), 0.0)
    entry.step_size = np.array([step_size_value]).reshape((1,))

    # The camera pose in world
    camera2world_map = image_map['camera_to_world']
    entry.camera_in_world = camera2world_from_map(camera2world_map)

    # The bounding box
    top_left = PixelCoord()
    bottom_right = PixelCoord()
    top_left.x, top_left.y = image_map['bbox_top_left_xy'][0], image_map['bbox_top_left_xy'][1]
    bottom_right.x, bottom_right.y = image_map['bbox_bottom_right_xy'][0], image_map['bbox_bottom_right_xy'][1]
    entry.bbox_top_left = top_left
    entry.bbox_bottom_right = bottom_right

    # The number of keypoints
    keypoint_camera_frame_list = image_map['3d_keypoint_camera_frame']
    n_keypoint = len(keypoint_camera_frame_list)
    if self._num_keypoint < 0:
        self._num_keypoint = n_keypoint
    else:
        assert self._num_keypoint == n_keypoint

    # The keypoints in camera frame
    entry.keypoint_camera = np.zeros((3, n_keypoint))
    for i in range(n_keypoint):
        for j in range(3):
            entry.keypoint_camera[j, i] = keypoint_camera_frame_list[i][j]

    # The pixel coordinate and depth of each keypoint
    keypoint_pixelxy_depth_list = image_map['keypoint_pixel_xy_depth']
    assert n_keypoint == len(keypoint_pixelxy_depth_list)
    entry.keypoint_pixelxy_depth = np.zeros((3, n_keypoint), dtype=int)
    for i in range(n_keypoint):
        for j in range(3):
            entry.keypoint_pixelxy_depth[j, i] = keypoint_pixelxy_depth_list[i][j]

    # Check the validity
    entry.keypoint_validity_weight = np.ones((3, n_keypoint))
    for i in range(n_keypoint):
        pixel = PixelCoord()
        pixel.x = entry.keypoint_pixelxy_depth[0, i]
        pixel.y = entry.keypoint_pixelxy_depth[1, i]
        depth_mm = entry.keypoint_pixelxy_depth[2, i]
        valid = True
        if depth_mm < 0:  # The depth cannot be negative
            valid = False
        # The pixel must be in the bounding box
        if not pixel_in_bbox(pixel, entry.bbox_top_left, entry.bbox_bottom_right):
            valid = False
        # Invalidate all the dimensions
        if not valid:
            entry.keypoint_validity_weight[:, i] = 0
            entry.on_boundary = True

    # OK
    return entry