# Imports assumed by the functions below. Repo-internal helpers (invert_T,
# tc / tensor_collection, Transform, euler2quat, expand_ids_for_symmetry,
# scatter_argmin, symmetric_distance_batched_fast, BulletSceneRenderer, Timer,
# logger, cosypose_cext, ...) come from the CosyPose codebase and are not
# re-listed here.
from collections import defaultdict
import time

import numpy as np
import pandas as pd
import torch
import transforms3d.euler


def estimate_camera_poses(TC1Oa, TC2Ob, labels_ab,
                          TC1Og, TC2Od, labels_gd, mesh_db):
    # Assume (TC1Oa and TC2Ob), (TC1Og and TC2Od) are the same objects.
    # Notation differs from the paper, paper(code):
    # we have 1(a), 2(b), a(alpha), b(beta), g(gamma), d(delta)
    bsz = TC1Oa.shape[0]
    assert TC1Oa.shape == (bsz, 4, 4)
    assert TC2Ob.shape == (bsz, 4, 4)
    assert TC1Og.shape == (bsz, 4, 4)
    assert TC2Od.shape == (bsz, 4, 4)
    assert len(labels_ab) == bsz
    assert len(labels_gd) == bsz

    TObC2 = invert_T(TC2Ob)
    meshes_ab = mesh_db.select(labels_ab)
    # Expand the batch so that every discrete symmetry of each object gets scored.
    ids_expand, sym_ids = expand_ids_for_symmetry(labels_ab, mesh_db.n_sym_mapping)
    sym_expand = meshes_ab.symmetries[ids_expand, sym_ids]
    dist_fn = symmetric_distance_batched_fast
    dists, _ = dist_fn(
        TC1Og[ids_expand],
        (TC1Oa[ids_expand] @ sym_expand @ TObC2[ids_expand]) @ TC2Od[ids_expand],
        labels_gd[ids_expand],
        mesh_db,
    )
    # Keep, per candidate pair, the symmetry that best explains the second pair.
    min_ids = scatter_argmin(dists, ids_expand)
    S_Oa_star = meshes_ab.symmetries[torch.arange(len(min_ids)), sym_ids[min_ids]]
    TC1C2 = TC1Oa @ S_Oa_star @ TObC2
    return TC1C2
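
# Sanity-check sketch (not from the repo): for an object without symmetries
# (S = identity), the relative camera pose follows directly from the two
# candidate object poses, TC1C2 = TC1Oa @ inv(TC2Ob). The hypothetical helper
# below verifies that identity on random rigid transforms with plain torch.
def _example_relative_pose_check():
    def rand_T():
        # Random rigid transform: QR yields an orthonormal basis; fix det=+1.
        Q, _ = torch.linalg.qr(torch.randn(3, 3))
        if torch.det(Q) < 0:
            Q[:, 0] = -Q[:, 0]
        T = torch.eye(4)
        T[:3, :3], T[:3, 3] = Q, torch.randn(3)
        return T

    TC1O, TC1C2 = rand_T(), rand_T()
    TC2O = torch.linalg.inv(TC1C2) @ TC1O       # same object seen from camera 2
    TC1C2_est = TC1O @ torch.linalg.inv(TC2O)   # what estimate_camera_poses computes (S = I)
    assert torch.allclose(TC1C2_est, TC1C2, atol=1e-5)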
def parse_obs_data(obs):
    data = defaultdict(list)
    frame_info = obs['frame_info']
    TWC = torch.as_tensor(obs['camera']['TWC']).float()
    for n, obj in enumerate(obs['objects']):
        info = dict(frame_obj_id=n,
                    label=obj['name'],
                    visib_fract=obj.get('visib_fract', 1),
                    scene_id=frame_info['scene_id'],
                    view_id=frame_info['view_id'])
        data['infos'].append(info)
        data['TWO'].append(obj['TWO'])
        data['bboxes'].append(obj['bbox'])

    for k, v in data.items():
        if k != 'infos':
            data[k] = torch.stack([torch.as_tensor(x).float() for x in v])
    data['infos'] = pd.DataFrame(data['infos'])

    # Object poses in the camera frame: TCO = TWC^-1 @ TWO.
    TCO = invert_T(TWC).unsqueeze(0) @ data['TWO']
    data = tc.PandasTensorCollection(
        infos=data['infos'],
        TCO=TCO,
        bboxes=data['bboxes'],
        poses=TCO,
    )
    return data
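
# Illustration (input layout inferred from the parser above, values hypothetical):
# parse_obs_data expects an observation dict shaped roughly like this.
_example_obs = dict(
    frame_info=dict(scene_id=0, view_id=0),
    camera=dict(TWC=np.eye(4)),
    objects=[
        dict(name='obj_000001', TWO=np.eye(4),
             bbox=[0., 0., 100., 100.], visib_fract=0.9),
    ],
)
# parse_obs_data(_example_obs) then yields a PandasTensorCollection with a
# single (4, 4) camera-frame pose in both its TCO and poses fields.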
def render(self, obj_infos, TCO, K, resolution=(240, 320), render_depth=False):
    TCO = torch.as_tensor(TCO).detach()
    TOC = invert_T(TCO).cpu().numpy()
    K = torch.as_tensor(K).cpu().numpy()
    bsz = len(TCO)
    assert TCO.shape == (bsz, 4, 4)
    assert K.shape == (bsz, 3, 3)

    # Each object is rendered in its own frame: the object sits at the identity
    # and the camera is placed at TWC = TOC, which is equivalent to placing the
    # object at TCO in the camera frame.
    # NOTE: Could be faster with Python 3.8's multiprocessing.shared_memory.
    for n in np.arange(bsz):
        obj_info = dict(name=obj_infos[n]['name'], TWO=np.eye(4))
        cam_info = dict(
            resolution=resolution,
            K=K[n],
            TWC=TOC[n],
        )
        kwargs = dict(cam_infos=[cam_info], obj_infos=[obj_info],
                      render_depth=render_depth)
        if self.n_workers > 0:
            kwargs['data_id'] = n
            self.in_queue.put(kwargs)
        else:
            cam_obs = self.plotters[0].render_scene(**kwargs)
            images = np.stack([d['rgb'] for d in cam_obs])
            depth = np.stack([d['depth'] for d in cam_obs]) if render_depth else None
            self.out_queue.put((n, images, depth))

    images = [None for _ in np.arange(bsz)]
    depths = [None for _ in np.arange(bsz)]
    for n in np.arange(bsz):
        data_id, im, depth = self.out_queue.get()
        images[data_id] = im[0]
        if render_depth:
            depths[data_id] = depth[0]

    images = torch.as_tensor(np.stack(images, axis=0)).pin_memory().cuda(non_blocking=True)
    images = images.float().permute(0, 3, 1, 2) / 255

    if render_depth:
        depths = torch.as_tensor(np.stack(depths, axis=0)).pin_memory().cuda(non_blocking=True)
        depths = depths.float()
        return images, depths
    else:
        return images
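
# Sketch (hypothetical values): the per-image pinhole intrinsics expected by
# `render`, one 3x3 matrix per element of the TCO batch.
_fx = _fy = 572.4                # focal lengths in pixels (example values)
_cx, _cy = 320 / 2, 240 / 2      # principal point at the image center
_K_example = np.array([[_fx, 0.,  _cx],
                       [0.,  _fy, _cy],
                       [0.,  0.,  1.]], dtype=np.float32)[None]  # shape (1, 3, 3)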
def make_scene_infos(self, TWO_9d, TCW_9d):
    TWO = compute_transform_from_pose9d(TWO_9d)
    TCW = compute_transform_from_pose9d(TCW_9d)
    TWC = invert_T(TCW)

    objects = tc.PandasTensorCollection(
        infos=self.obj_infos,
        TWO=TWO,
    )
    cameras = tc.PandasTensorCollection(
        infos=self.cam_infos,
        TWC=TWC,
        K=self.K,
    )
    return objects, cameras
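
# Sketch of the 9D pose parameterization assumed by compute_transform_from_pose9d
# and extract_pose9d: 6 numbers for a continuous rotation encoding (two 3-vectors
# made orthonormal by Gram-Schmidt, following Zhou et al., CVPR 2019) plus a 3D
# translation. The component order is this editor's assumption; the helper is
# illustrative, not the repo's exact implementation.
def _pose9d_to_T_sketch(pose9d):
    a1, a2, t = pose9d[..., 0:3], pose9d[..., 3:6], pose9d[..., 6:9]
    b1 = torch.nn.functional.normalize(a1, dim=-1)
    b2 = torch.nn.functional.normalize(
        a2 - (b1 * a2).sum(-1, keepdim=True) * b1, dim=-1)
    b3 = torch.cross(b1, b2, dim=-1)
    T = torch.eye(4, dtype=pose9d.dtype).repeat(*pose9d.shape[:-1], 1, 1)
    T[..., :3, 0], T[..., :3, 1], T[..., :3, 2], T[..., :3, 3] = b1, b2, b3, t
    return T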
def robust_initialization_TWO_TCW(self, n_init=1):
    TWO_9d_init = []
    TCW_9d_init = []
    dists = []
    for n in range(n_init):
        TWO, TWC = self.sample_initial_TWO_TWC(n)
        TCW = invert_T(TWC)
        TWO_9d, TCW_9d = self.extract_pose9d(TWO), self.extract_pose9d(TCW)
        dists_, _ = self.align_TCO_cand(TWO_9d, TCW_9d)
        TWO_9d_init.append(TWO_9d)
        TCW_9d_init.append(TCW_9d)
        dists.append(dists_.mean())
    # Keep the initialization whose mean alignment error w.r.t. the candidates is smallest.
    best_iter = torch.tensor(dists).argmin()
    return TWO_9d_init[best_iter], TCW_9d_init[best_iter]
def __init__(self, candidates, cameras, pairs_TC1C2, mesh_db):
    self.device, self.dtype = candidates.device, candidates.poses.dtype
    self.mesh_db = mesh_db
    cameras = cameras.to(self.device).to(self.dtype)
    pairs_TC1C2 = pairs_TC1C2.to(self.device).to(self.dtype)

    # Keep only the view pairs and cameras that involve the candidate views.
    view_ids = np.unique(candidates.infos['view_id'])
    keep_ids = np.logical_and(
        np.isin(pairs_TC1C2.infos['view1'], view_ids),
        np.isin(pairs_TC1C2.infos['view2'], view_ids),
    )
    pairs_TC1C2 = pairs_TC1C2[np.where(keep_ids)[0]]
    keep_ids = np.where(np.isin(cameras.infos['view_id'], view_ids))[0]
    cameras = cameras[keep_ids]

    self.cam_infos = cameras.infos
    self.view_to_id = {view_id: n for n, view_id in enumerate(self.cam_infos['view_id'])}
    self.K = cameras.K
    self.n_views = len(self.cam_infos)

    self.obj_infos = make_obj_infos(candidates)
    self.obj_to_id = {obj_id: n for n, obj_id in enumerate(self.obj_infos['obj_id'])}
    self.obj_points = self.mesh_db.select(self.obj_infos['label'].values).points
    self.n_points = self.obj_points.shape[1]
    self.n_objects = len(self.obj_infos)

    self.cand = candidates
    self.cand_TCO = candidates.poses
    self.cand_labels = candidates.infos['label']
    self.cand_view_ids = [self.view_to_id[view_id] for view_id in candidates.infos['view_id']]
    self.cand_obj_ids = [self.obj_to_id[obj_id] for obj_id in candidates.infos['obj_id']]
    self.n_candidates = len(self.cand_TCO)

    self.visibility_matrix = self.make_visibility_matrix(self.cand_view_ids, self.cand_obj_ids)
    # Lookup tables: relative camera pose per (view2, view1) pair, and candidate
    # pose per (object, view) pair.
    self.v2v1_TC2C1_map = {(self.view_to_id[v2], self.view_to_id[v1]): invert_T(TC1C2)
                           for (v1, v2, TC1C2) in zip(pairs_TC1C2.infos['view1'],
                                                      pairs_TC1C2.infos['view2'],
                                                      pairs_TC1C2.TC1C2)}
    self.ov_TCO_cand_map = {(o, v): TCO for (o, v, TCO) in zip(self.cand_obj_ids,
                                                               self.cand_view_ids,
                                                               self.cand_TCO)}
    self.residuals_ids = self.make_residuals_ids()
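
# Sketch (an assumption about make_visibility_matrix's contract, inferred from
# its inputs above): a binary object x view matrix marking which (object, view)
# pairs have a candidate detection.
def _visibility_matrix_sketch(cand_obj_ids, cand_view_ids, n_objects, n_views):
    M = torch.zeros(n_objects, n_views, dtype=torch.bool)
    M[cand_obj_ids, cand_view_ids] = True
    return M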
def reproject_scene(self, objects, cameras):
    # Express every refined object pose in every camera: poses = TWC^-1 @ TWO.
    TCO_data = []
    for o in range(len(objects)):
        for v in range(len(cameras)):
            obj = objects[[o]]
            cam = cameras[[v]]
            infos = dict(
                scene_id=cam.infos['scene_id'].values,
                view_id=cam.infos['view_id'].values,
                score=obj.infos['score'].values + 1.0,
                view_group=obj.infos['view_group'].values,
                label=obj.infos['label'].values,
                batch_im_id=cam.infos['batch_im_id'].values,
                obj_id=obj.infos['obj_id'].values,
                from_ba=[True],
            )
            data_ = tc.PandasTensorCollection(
                infos=pd.DataFrame(infos),
                poses=invert_T(cam.TWC) @ obj.TWO,
            )
            TCO_data.append(data_)
    return tc.concatenate(TCO_data)
def make_scene_renderings(objects, cameras, urdf_ds_name, distance=1.5,
                          theta=np.pi/4, angles=[0],
                          object_scale=1.0, camera_scale=1.5,
                          background_color=(242, 231, 191),
                          show_cameras=False, resolution=(640, 480),
                          colormap_rgb=None, object_id_ref=0,
                          gui=False, use_nms3d=True,
                          camera_color=(0.2, 0.2, 0.2, 1.0)):
    renderer = BulletSceneRenderer([urdf_ds_name, 'camera'],
                                   background_color=background_color,
                                   gui=gui)
    urdf_ds = renderer.body_cache.urdf_ds

    # Patch the scales for visualization
    is_camera = np.array(['camera' in label for label in urdf_ds.index['label']])
    urdf_ds.index.loc[~is_camera, 'scale'] = object_scale * 0.001
    urdf_ds.index.loc[is_camera, 'scale'] = camera_scale

    if use_nms3d:
        objects = nms3d(objects, poses_attr='TWO', th=0.04)
    objects = objects.cpu()
    objects.TWO = objects.poses

    if colormap_rgb is None:
        colormap_rgb, _ = make_colormaps(objects.infos['label'])
    objects.infos['color'] = objects.infos['label'].apply(lambda k: colormap_rgb[k])

    cameras = cameras.cpu()

    # Use the reference object as the world base; flip the base if the first
    # camera would otherwise end up below it.
    TWWB = objects.poses[object_id_ref]
    cam = cameras[[0]]
    TCWB = invert_T(cam.TWC.squeeze(0)) @ TWWB
    TWBC = invert_T(TCWB)
    if TWBC[2, -1] < 0:
        quat = euler2quat([np.pi, 0, 0])
        TWWB = Transform(TWWB.numpy()) * Transform(quat, np.zeros(3))
        TWWB = TWWB.toHomogeneousMatrix()
    TWWB = np.asarray(TWWB)

    list_objects = []
    for obj_id in range(len(objects)):
        TWO = np.linalg.inv(TWWB) @ objects.TWO[obj_id].numpy()
        TWO[:3, -1] *= object_scale
        obj = dict(
            name=objects.infos.loc[obj_id, 'label'],
            color=objects.infos.loc[obj_id, 'color'],
            TWO=TWO,
        )
        list_objects.append(obj)
    target = np.mean(np.stack([obj['TWO'][:3, -1] for obj in list_objects]), axis=0)

    if show_cameras:
        for cam_id in range(len(cameras)):
            obj = dict(
                name='camera',
                color=camera_color,
                TWO=np.linalg.inv(TWWB) @ cameras.TWC[cam_id].numpy(),
            )
            list_objects.append(obj)

    fx, fy = 515, 515
    w, h = resolution
    K = np.array([
        [fx, 0, w/2],
        [0, fy, h/2],
        [0, 0, 1],
    ])

    # Place the viewing cameras on a sphere of radius `distance` around the scene center.
    list_cameras = []
    for phi in angles:
        x = distance * np.sin(theta) * np.cos(phi)
        y = distance * np.sin(theta) * np.sin(phi)
        z = distance * np.cos(theta)
        t = np.array([x, y, z])
        R = transforms3d.euler.euler2mat(np.pi, theta, phi, axes='sxyz')
        R = R @ transforms3d.euler.euler2mat(0, 0, -np.pi/2, axes='sxyz')
        t += np.array(target)
        TWC = Transform(R, t).toHomogeneousMatrix()
        TWBC = TWWB @ TWC
        list_cameras.append(
            dict(K=K, TWC=TWC, resolution=(w, h))
        )

    renders = renderer.render_scene(list_objects, list_cameras)
    images = np.stack([render['rgb'] for render in renders])
    if gui:
        time.sleep(100)
    renderer.disconnect()
    return images
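
# Sketch (illustration with hypothetical values): the loop above samples camera
# positions on a sphere of radius `distance` around `target`; this quick check
# confirms the spherical parameterization before the target offset is added.
def _camera_sphere_check():
    theta, phi, dist = np.pi / 4, 0.3, 1.5
    t = dist * np.array([np.sin(theta) * np.cos(phi),
                         np.sin(theta) * np.sin(phi),
                         np.cos(theta)])
    assert np.isclose(np.linalg.norm(t), dist)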
def multiview_candidate_matching(candidates, mesh_db,
                                 model_bsz=1e3, score_bsz=1e5,
                                 dist_threshold=0.02, cameras=None,
                                 n_ransac_iter=20, n_min_inliers=3):
    timer_models = Timer()
    timer_score = Timer()
    timer_misc = Timer()

    known_poses = cameras is not None
    if known_poses:
        logger.debug('Using known camera poses.')
        n_ransac_iter = 1
    else:
        logger.debug('Estimating camera poses using RANSAC.')

    timer_misc.start()
    candidates.infos['cand_id'] = np.arange(len(candidates))
    timer_misc.pause()

    timer_models.start()
    seeds, tmatches = cosypose_cext.make_ransac_infos(
        candidates.infos['view_id'].values.tolist(),
        candidates.infos['label'].values.tolist(),
        n_ransac_iter, 0,
    )

    if not known_poses:
        TC1C2 = estimate_camera_poses_batch(candidates, seeds, mesh_db, bsz=model_bsz)
    else:
        cameras.infos['idx'] = np.arange(len(cameras))
        view_map = cameras.infos.set_index('view_id')
        TWC1 = cameras.TWC[view_map.loc[seeds['view1'], 'idx'].values]
        TWC2 = cameras.TWC[view_map.loc[seeds['view2'], 'idx'].values]
        TC1C2 = invert_T(TWC1) @ TWC2
    timer_models.pause()

    timer_score.start()
    dists = score_tmaches_batch(candidates, tmatches, TC1C2, mesh_db, bsz=score_bsz)
    inliers = cosypose_cext.find_ransac_inliers(
        seeds['view1'], seeds['view2'],
        tmatches['hypothesis_id'], tmatches['cand1'], tmatches['cand2'],
        dists.cpu().numpy(), dist_threshold, n_min_inliers,
    )
    timer_score.pause()

    timer_misc.start()
    pairs_TC1C2 = get_best_viewpair_pose_est(TC1C2, seeds, inliers)
    filtered_candidates = scene_level_matching(candidates, inliers)
    scene_infos = make_obj_infos(filtered_candidates)
    timer_misc.pause()

    outputs = dict(
        filtered_candidates=filtered_candidates,
        scene_infos=scene_infos,
        pairs_TC1C2=pairs_TC1C2,
        time_models=timer_models.stop(),
        time_score=timer_score.stop(),
        time_misc=timer_misc.stop(),
    )
    return outputs
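
# End-to-end usage sketch (the loader names are hypothetical; only
# multiview_candidate_matching itself is defined in this file):
if False:  # illustration only
    candidates = load_candidates()      # PandasTensorCollection: per-view 6D pose hypotheses
    mesh_db = load_batched_mesh_db()    # batched CosyPose mesh database on GPU
    out = multiview_candidate_matching(candidates, mesh_db)
    pairs_TC1C2 = out['pairs_TC1C2']    # best relative camera pose per view pair
    scene_infos = out['scene_infos']    # one row per matched physical object instance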