def make_scene_infos(self, TWO_9d, TCW_9d):
    TWO = compute_transform_from_pose9d(TWO_9d)
    TCW = compute_transform_from_pose9d(TCW_9d)
    TWC = invert_T(TCW)
    objects = tc.PandasTensorCollection(
        infos=self.obj_infos,
        TWO=TWO,
    )
    cameras = tc.PandasTensorCollection(
        infos=self.cam_infos,
        TWC=TWC,
        K=self.K,
    )
    return objects, cameras
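# A minimal sketch of what compute_transform_from_pose9d might look like; this
# is an assumption, not necessarily the project's implementation. It reads the
# 9D vector as a continuous 6D rotation parametrization (two 3D vectors
# orthonormalized by Gram-Schmidt, as in Zhou et al., CVPR 2019) followed by a
# 3D translation.
import torch

def compute_transform_from_pose9d_sketch(pose9d):
    # pose9d: (..., 9) = 6 rotation parameters + 3 translation parameters.
    a1, a2, t = pose9d[..., 0:3], pose9d[..., 3:6], pose9d[..., 6:9]
    b1 = torch.nn.functional.normalize(a1, dim=-1)
    # Remove the b1 component from a2, then normalize (Gram-Schmidt).
    b2 = torch.nn.functional.normalize(
        a2 - (b1 * a2).sum(-1, keepdim=True) * b1, dim=-1)
    b3 = torch.cross(b1, b2, dim=-1)
    R = torch.stack([b1, b2, b3], dim=-1)  # columns b1, b2, b3 -> (..., 3, 3)
    T = torch.eye(4, dtype=pose9d.dtype, device=pose9d.device)
    T = T.expand(*pose9d.shape[:-1], 4, 4).clone()
    T[..., :3, :3] = R
    T[..., :3, 3] = t
    return T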
def get_best_viewpair_pose_est(TC1C2, seeds, inliers):
    # Keep, for each view pair, the relative-pose hypothesis with the most inliers.
    best_hypotheses = inliers['best_hypotheses']
    TC1C2_best = TC1C2[best_hypotheses]
    view1 = seeds['view1'][best_hypotheses]
    view2 = seeds['view2'][best_hypotheses]
    infos = pd.DataFrame(dict(view1=view1, view2=view2))
    return tc.PandasTensorCollection(infos=infos, TC1C2=TC1C2_best)
def parse_obs_data(obs):
    data = defaultdict(list)
    frame_info = obs['frame_info']
    TWC = torch.as_tensor(obs['camera']['TWC']).float()
    for n, obj in enumerate(obs['objects']):
        info = dict(frame_obj_id=n,
                    label=obj['name'],
                    visib_fract=obj.get('visib_fract', 1),
                    scene_id=frame_info['scene_id'],
                    view_id=frame_info['view_id'])
        data['infos'].append(info)
        data['TWO'].append(obj['TWO'])
        data['bboxes'].append(obj['bbox'])

    for k, v in data.items():
        if k != 'infos':
            data[k] = torch.stack([torch.as_tensor(x).float() for x in v])
    data['infos'] = pd.DataFrame(data['infos'])

    # Object poses in the camera frame: TCO = TCW @ TWO = inv(TWC) @ TWO.
    TCO = invert_T(TWC).unsqueeze(0) @ data['TWO']
    data = tc.PandasTensorCollection(
        infos=data['infos'],
        TCO=TCO,
        bboxes=data['bboxes'],
        poses=TCO,
    )
    return data
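# A minimal sketch of invert_T, assuming T holds (batches of) rigid 4x4
# transforms, so the inverse is [R^T, -R^T t] rather than a generic matrix
# inverse. This is an assumption about the helper, not its actual source.
import torch

def invert_T_sketch(T):
    R = T[..., :3, :3]
    t = T[..., :3, 3:]               # (..., 3, 1) translation column
    R_inv = R.transpose(-2, -1)      # inverse of a rotation is its transpose
    T_inv = torch.zeros_like(T)
    T_inv[..., :3, :3] = R_inv
    T_inv[..., :3, 3:] = -R_inv @ t
    T_inv[..., 3, 3] = 1.0
    return T_inv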
def collate_fn(self, batch):
    cam_infos, K = [], []
    im_infos = []
    depth = []
    batch_im_id = -1
    for n, data in enumerate(batch):
        assert n == 0  # this loader expects a single (multi-view) scene per batch
        images, masks, obss = data
        for c, obs in enumerate(obss):
            batch_im_id += 1
            frame_info = obs['frame_info']
            im_info = {k: frame_info[k] for k in ('scene_id', 'view_id', 'group_id')}
            im_info.update(batch_im_id=batch_im_id)
            im_infos.append(im_info)
            cam_info = im_info.copy()
            K.append(obs['camera']['K'])
            cam_infos.append(cam_info)
            if self.load_depth:
                depth.append(torch.tensor(obs['camera']['depth']))

    cameras = tc.PandasTensorCollection(
        infos=pd.DataFrame(cam_infos),
        K=torch.as_tensor(np.stack(K)),
    )
    data = dict(
        cameras=cameras,
        images=images,
        im_infos=im_infos,
    )
    if self.load_depth:
        data['depth'] = torch.stack(depth)
    return data
def collate_fn(self, batch):
    batch_im_id = -1
    det_infos, bboxes = [], []
    images = []
    im_infos = []
    for n, data in enumerate(batch):
        rgb, masks, obs = data
        batch_im_id += 1
        frame_info = obs['frame_info']
        im_info = {k: frame_info[k] for k in ('scene_id', 'view_id')}
        im_info.update(batch_im_id=batch_im_id)
        im_infos.append(im_info)
        images.append(rgb)
        # Export ground-truth objects as detections with score 1.0.
        for o, obj in enumerate(obs['objects']):
            obj_info = dict(
                label=obj['name'],
                score=1.0,
            )
            obj_info.update(im_info)
            bboxes.append(obj['bbox'])
            det_infos.append(obj_info)

    gt_detections = tc.PandasTensorCollection(
        infos=pd.DataFrame(det_infos),
        bboxes=torch.as_tensor(np.stack(bboxes)).float(),
    )
    data = dict(
        images=torch.stack(images),
        gt_detections=gt_detections,
        im_infos=im_infos,
    )
    return data
def make_empty_predictions():
    # Note: the deprecated np.int / np.object / np.float aliases (removed in
    # NumPy 1.24) are replaced by the corresponding builtins.
    infos = dict(view_id=np.empty(0, dtype=int),
                 scene_id=np.empty(0, dtype=int),
                 label=np.empty(0, dtype=object),
                 score=np.empty(0, dtype=float))
    bboxes = torch.empty(0, 4, dtype=torch.float)
    return tc.PandasTensorCollection(infos=pd.DataFrame(infos), bboxes=bboxes)
def scene_level_matching(candidates, inliers):
    cand1 = inliers['inlier_matches_cand1']
    cand2 = inliers['inlier_matches_cand2']
    # Build a candidate-to-candidate graph: an edge links two candidates whose
    # poses are consistent (inlier match) across views.
    edges = np.ones(len(cand1), dtype=int)
    n_cand = len(candidates)
    graph = csr_matrix((edges, (cand1, cand2)), shape=(n_cand, n_cand))
    n_components, ids = connected_components(graph, directed=True, connection='strong')

    component_size = defaultdict(lambda: 0)
    for idx in ids:
        component_size[idx] += 1
    obj_n_cand = np.empty(len(ids), dtype=int)
    for n, idx in enumerate(ids):
        obj_n_cand[n] = component_size[idx]

    cand_infos = candidates.infos.copy()
    cand_infos['component_id'] = ids
    # An object must be supported by at least two candidates.
    keep_cand = obj_n_cand >= 2
    cand_infos = cand_infos[keep_cand].reset_index(drop=True)
    # Relabel the surviving components with contiguous object ids.
    for n, (comp_id, group) in enumerate(cand_infos.groupby('component_id')):
        cand_infos.loc[group.index, 'component_id'] = n
    cand_infos = cand_infos.rename(columns={'component_id': 'obj_id'})

    matched_candidates = tc.PandasTensorCollection(
        infos=cand_infos,
        poses=candidates.poses[cand_infos['cand_id'].values])
    return matched_candidates
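# A self-contained toy example of the grouping step above, with hypothetical
# match data: candidates 0, 1, 2 mutually agree across views and collapse into
# one object; candidate 3 has no partner and is dropped by the `>= 2` filter.
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components

cand1 = np.array([0, 1, 1, 2])  # inlier matches: (0,1), (1,0), (1,2), (2,1)
cand2 = np.array([1, 0, 2, 1])
graph = csr_matrix((np.ones(len(cand1), dtype=int), (cand1, cand2)), shape=(4, 4))
n_components, ids = connected_components(graph, directed=True, connection='strong')
print(n_components, ids)  # 2 components, e.g. [0 0 0 1]: {0, 1, 2} is one object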
def collate_fn(self, batch):
    batch_im_id = -1
    cam_infos, K = [], []
    det_infos, bboxes = [], []
    for n, data in enumerate(batch):
        assert n == 0  # one (multi-view) scene per batch
        images, masks, obss = data
        for c, obs in enumerate(obss):
            batch_im_id += 1
            frame_info = obs['frame_info']
            im_info = {k: frame_info[k] for k in ('scene_id', 'view_id', 'group_id')}
            im_info.update(batch_im_id=batch_im_id)
            cam_info = im_info.copy()
            K.append(obs['camera']['K'])
            cam_infos.append(cam_info)
            # Export ground-truth objects as detections with score 1.0.
            for o, obj in enumerate(obs['objects']):
                obj_info = dict(
                    label=obj['name'],
                    score=1.0,
                )
                obj_info.update(im_info)
                bboxes.append(obj['bbox'])
                det_infos.append(obj_info)

    gt_detections = tc.PandasTensorCollection(
        infos=pd.DataFrame(det_infos),
        bboxes=torch.as_tensor(np.stack(bboxes)),
    )
    cameras = tc.PandasTensorCollection(
        infos=pd.DataFrame(cam_infos),
        K=torch.as_tensor(np.stack(K)),
    )
    data = dict(
        images=images,
        cameras=cameras,
        gt_detections=gt_detections,
    )
    return data
def get_detections(self, images, detection_th=None, output_masks=False,
                   mask_th=0.8, one_instance_per_class=False):
    # Normalize input to float CHW images in [0, 1].
    images = self.cast(images).float()
    if images.shape[-1] == 3:
        images = images.permute(0, 3, 1, 2)
    if images.max() > 1:
        images = images / 255.
    images = images.float().cuda()

    outputs_ = self.model([image_n for image_n in images])

    infos = []
    bboxes = []
    masks = []
    for n, outputs_n in enumerate(outputs_):
        outputs_n['labels'] = [self.category_id_to_label[category_id.item()]
                               for category_id in outputs_n['labels']]
        for obj_id in range(len(outputs_n['boxes'])):
            bbox = outputs_n['boxes'][obj_id]
            info = dict(
                batch_im_id=n,
                label=outputs_n['labels'][obj_id],
                score=outputs_n['scores'][obj_id].item(),
            )
            mask = outputs_n['masks'][obj_id, 0] > mask_th
            bboxes.append(torch.as_tensor(bbox))
            masks.append(torch.as_tensor(mask))
            infos.append(info)

    if len(bboxes) > 0:
        bboxes = torch.stack(bboxes).cuda().float()
        masks = torch.stack(masks).cuda()
    else:
        infos = dict(score=[], label=[], batch_im_id=[])
        bboxes = torch.empty(0, 4).cuda().float()
        # images is (B, C, H, W) at this point, so the spatial dims are the
        # last two (the original indexed shape[1], shape[2], i.e. (C, H)).
        masks = torch.empty(0, images.shape[-2], images.shape[-1],
                            dtype=torch.bool).cuda()

    outputs = tc.PandasTensorCollection(
        infos=pd.DataFrame(infos),
        bboxes=bboxes,
    )
    if output_masks:
        outputs.register_tensor('masks', masks)
    if detection_th is not None:
        keep = np.where(outputs.infos['score'] > detection_th)[0]
        outputs = outputs[keep]

    if one_instance_per_class:
        # Keep only the highest-scoring detection of each object class.
        infos = outputs.infos
        infos['det_idx'] = np.arange(len(infos))
        keep_ids = (infos.sort_values('score', ascending=False)
                    .drop_duplicates('label')['det_idx'].values)
        outputs = outputs[keep_ids]
        outputs.infos = outputs.infos.drop('det_idx', axis=1)
    return outputs
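# A minimal usage sketch with a hypothetical `detector` instance of this class:
# per the preprocessing above, (B, H, W, 3) uint8 images in [0, 255] are
# converted to float CHW in [0, 1] before being fed to the torchvision-style
# instance-segmentation model.
import torch
images = torch.randint(0, 256, (1, 480, 640, 3), dtype=torch.uint8)
detections = detector.get_detections(images, detection_th=0.5, output_masks=True)
print(detections.infos[['label', 'score']])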
def read_cameras(json_path, view_ids):
    cameras = json.loads(Path(json_path).read_text())
    all_K = []
    for view_id in view_ids:
        cam_info = cameras[str(view_id)]
        K = np.array(cam_info['cam_K']).reshape(3, 3)
        all_K.append(K)
    K = torch.as_tensor(np.stack(all_K))
    cameras = tc.PandasTensorCollection(
        K=K, infos=pd.DataFrame(dict(view_id=view_ids)))
    return cameras
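# Usage sketch, assuming the standard BOP scene_camera.json layout, which maps
# view ids (as strings) to per-view intrinsics, e.g.:
#   {"0": {"cam_K": [fx, 0.0, cx, 0.0, fy, cy, 0.0, 0.0, 1.0],
#          "depth_scale": 0.1}, ...}
# The path below is illustrative, not a file shipped with this code.
cameras = read_cameras('bop_datasets/tless/test/000001/scene_camera.json',
                       view_ids=[0, 25])
print(cameras.K.shape)  # torch.Size([2, 3, 3])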
def make_TCO_init(self, detections, K):
    K = K[detections.infos['batch_im_id'].values]
    boxes = detections.bboxes
    if self.coarse_model.cfg.init_method == 'z-up+auto-depth':
        # Estimate the initial depth from the object's 3D extent (sampled
        # model points) and the size of the detected box.
        meshes = self.coarse_model.mesh_db.select(detections.infos['label'])
        points_3d = meshes.sample_points(2000, deterministic=True)
        TCO_init = TCO_init_from_boxes_zup_autodepth(boxes, points_3d, K)
    else:
        TCO_init = TCO_init_from_boxes(z_range=(1.0, 1.0), boxes=boxes, K=K)
    return tc.PandasTensorCollection(infos=detections.infos, poses=TCO_init)
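# A plausible sketch of TCO_init_from_boxes (an assumption about its behavior,
# not its actual source): place the object at a fixed depth z with identity
# rotation, choosing the translation so the camera ray through the detected
# box center hits it, i.e. x = (u - cx) * z / fx, y = (v - cy) * z / fy.
import torch

def TCO_init_from_boxes_sketch(z_range, boxes, K):
    bsz = boxes.shape[0]
    uv_centers = (boxes[:, :2] + boxes[:, 2:]) / 2        # (B, 2) box centers
    z = torch.full((bsz, 1), sum(z_range) / 2, dtype=boxes.dtype)
    fxfy = torch.stack([K[:, 0, 0], K[:, 1, 1]], dim=-1)  # (B, 2)
    cxcy = torch.stack([K[:, 0, 2], K[:, 1, 2]], dim=-1)  # (B, 2)
    xy = (uv_centers - cxcy) * z / fxfy
    TCO = torch.eye(4, dtype=boxes.dtype).unsqueeze(0).repeat(bsz, 1, 1)
    TCO[:, :2, 3] = xy
    TCO[:, 2, 3] = z.squeeze(-1)
    return TCO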
def batched_model_predictions(self, model, images, K, obj_data, n_iterations=1):
    timer = Timer()
    timer.start()
    ids = torch.arange(len(obj_data))
    ds = TensorDataset(ids)
    dl = DataLoader(ds, batch_size=self.bsz_objects)

    preds = defaultdict(list)
    for (batch_ids, ) in dl:
        timer.resume()
        obj_inputs = obj_data[batch_ids.numpy()]
        labels = obj_inputs.infos['label'].values
        im_ids = obj_inputs.infos['batch_im_id'].values
        images_ = images[im_ids]
        K_ = K[im_ids]
        TCO_input = obj_inputs.poses
        outputs = model(images=images_, K=K_, TCO=TCO_input,
                        n_iterations=n_iterations, labels=labels)
        timer.pause()
        # Keep the predictions of every refinement iteration, not only the last.
        for n in range(1, n_iterations + 1):
            iter_outputs = outputs[f'iteration={n}']
            infos = obj_inputs.infos
            batch_preds = tc.PandasTensorCollection(
                infos,
                poses=iter_outputs['TCO_output'],
                poses_input=iter_outputs['TCO_input'],
                K_crop=iter_outputs['K_crop'],
                boxes_rend=iter_outputs['boxes_rend'],
                boxes_crop=iter_outputs['boxes_crop'])
            preds[f'iteration={n}'].append(batch_preds)

    logger.debug(f'Pose prediction on {len(obj_data)} detections '
                 f'(n_iterations={n_iterations}): {timer.stop()}')

    preds = dict(preds)
    for k, v in preds.items():
        preds[k] = tc.concatenate(v)
    return preds
def load_posecnn_results():
    results_path = LOCAL_DATA_DIR / 'saved_detections' / 'ycbv_posecnn.pkl'
    results = pkl.loads(results_path.read_bytes())
    infos, poses, bboxes = [], [], []

    # Per-object offsets between the PoseCNN and BOP model frames,
    # converted from millimeters to meters.
    l_offsets = (LOCAL_DATA_DIR / 'bop_datasets/ycbv' /
                 'offsets.txt').read_text().strip().split('\n')
    ycb_offsets = dict()
    for l_n in l_offsets:
        obj_id, offset = l_n[:2], l_n[3:]
        obj_id = int(obj_id)
        offset = np.array(json.loads(offset)) * 0.001
        ycb_offsets[obj_id] = offset

    def mat_from_qt(qt):
        # PoseCNN stores quaternions as (w, x, y, z); Transform expects (x, y, z, w).
        wxyz = qt[:4].copy().tolist()
        xyzw = [*wxyz[1:], wxyz[0]]
        t = qt[4:].copy()
        return Transform(xyzw, t)

    for scene_view_str, result in results.items():
        scene_id, view_id = scene_view_str.split('/')
        scene_id, view_id = int(scene_id), int(view_id)
        n_dets = result['rois'].shape[0]
        for n in range(n_dets):
            obj_id = result['rois'][:, 1].astype(int)[n]
            label = f'obj_{obj_id:06d}'
            infos.append(dict(
                scene_id=scene_id,
                view_id=view_id,
                score=result['rois'][n, 1],
                label=label,
            ))
            bboxes.append(result['rois'][n, 2:6])
            pose = mat_from_qt(result['poses'][n])
            offset = ycb_offsets[obj_id]
            # Move the pose into the BOP model frame.
            pose = pose * Transform((0, 0, 0, 1), offset).inverse()
            poses.append(pose.toHomogeneousMatrix())

    data = tc.PandasTensorCollection(
        infos=pd.DataFrame(infos),
        poses=torch.as_tensor(np.stack(poses)).float(),
        bboxes=torch.as_tensor(np.stack(bboxes)).float(),
    ).cpu()
    return data
def read_csv_candidates(csv_path):
    df = pd.read_csv(csv_path)
    infos = df.loc[:, ['im_id', 'scene_id', 'score', 'obj_id']]
    infos['obj_id'] = infos['obj_id'].apply(lambda x: f'obj_{x:06d}')
    infos = infos.rename(dict(im_id='view_id', obj_id='label'), axis=1)
    # R is stored as 9 space-separated floats (row-major); t is in millimeters
    # and converted to meters.
    R = np.stack(df['R'].apply(
        lambda x: list(map(float, x.split(' '))))).reshape(-1, 3, 3)
    t = np.stack(df['t'].apply(
        lambda x: list(map(float, x.split(' '))))).reshape(-1, 3) * 1e-3
    R = torch.tensor(R, dtype=torch.float)
    t = torch.tensor(t, dtype=torch.float)
    TCO = torch.eye(4, dtype=torch.float).unsqueeze(0).repeat(len(R), 1, 1)
    TCO[:, :3, :3] = R
    TCO[:, :3, -1] = t
    candidates = tc.PandasTensorCollection(poses=TCO, infos=infos)
    return candidates
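# Usage sketch, assuming the standard BOP results CSV layout
# (scene_id,im_id,obj_id,score,R,t,time), e.g. a row such as:
#   1,25,5,0.87,"0.99 0.01 ... 0.98","-12.3 40.1 650.0",0.2
# The file name below is hypothetical.
candidates = read_csv_candidates('challenge_results.csv')
print(candidates.poses.shape)  # (N, 4, 4), translations in meters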
def load_pix2pose_results(all_detections=True, remove_incorrect_poses=False):
    if all_detections:
        results_path = LOCAL_DATA_DIR / 'saved_detections' / 'tless_pix2pose_retinanet_vivo_all.pkl'
    else:
        results_path = LOCAL_DATA_DIR / 'saved_detections' / 'tless_pix2pose_retinanet_siso_top1.pkl'
    pix2pose_results = pkl.loads(results_path.read_bytes())

    infos, poses, bboxes = [], [], []
    for key, result in pix2pose_results.items():
        scene_id, view_id = key.split('/')
        scene_id, view_id = int(scene_id), int(view_id)

        boxes = result['rois']
        scores = result['scores']
        poses_ = result['poses']
        labels = result['labels_txt']

        # Swap the box coordinate order, (y1, x1, y2, x2) -> (x1, y1, x2, y2).
        new_boxes = boxes.copy()
        new_boxes[:, 0] = boxes[:, 1]
        new_boxes[:, 1] = boxes[:, 0]
        new_boxes[:, 2] = boxes[:, 3]
        new_boxes[:, 3] = boxes[:, 2]

        for o, label in enumerate(labels):
            t = poses_[o][:3, -1]
            # Optionally drop degenerate poses (zero or implausibly large translation).
            if remove_incorrect_poses and (np.sum(t) == 0 or np.max(t) > 100):
                continue
            infos.append(dict(
                scene_id=scene_id,
                view_id=view_id,
                score=scores[o],
                label=label,
            ))
            bboxes.append(new_boxes[o])
            poses.append(poses_[o])

    data = tc.PandasTensorCollection(
        infos=pd.DataFrame(infos),
        poses=torch.as_tensor(np.stack(poses)),
        bboxes=torch.as_tensor(np.stack(bboxes)).float(),
    ).cpu()
    return data
def reproject_scene(self, objects, cameras):
    # Express every refined object pose in every camera frame:
    # TCO = TCW @ TWO = inv(TWC) @ TWO.
    TCO_data = []
    for o in range(len(objects)):
        for v in range(len(cameras)):
            obj = objects[[o]]
            cam = cameras[[v]]
            infos = dict(
                scene_id=cam.infos['scene_id'].values,
                view_id=cam.infos['view_id'].values,
                score=obj.infos['score'].values + 1.0,
                view_group=obj.infos['view_group'].values,
                label=obj.infos['label'].values,
                batch_im_id=cam.infos['batch_im_id'].values,
                obj_id=obj.infos['obj_id'].values,
                from_ba=[True],
            )
            data_ = tc.PandasTensorCollection(
                infos=pd.DataFrame(infos),
                poses=invert_T(cam.TWC) @ obj.TWO,
            )
            TCO_data.append(data_)
    return tc.concatenate(TCO_data)
def predict_scene_state(self, candidates, cameras,
                        score_th=0.3, use_known_camera_poses=False,
                        ransac_n_iter=2000, ransac_dist_threshold=0.02,
                        ba_n_iter=100):
    predictions = dict()
    cand_inputs = candidates

    assert len(np.unique(candidates.infos['scene_id'])) == 1
    scene_id = np.unique(candidates.infos['scene_id']).item()
    group_id = np.unique(candidates.infos['group_id']).item()

    # Stage 1: filter the single-view candidates by detection score.
    keep = np.where(candidates.infos['score'] >= score_th)[0]
    candidates = candidates[keep]
    predictions['cand_inputs'] = candidates

    logger.debug(f'Num candidates: {len(candidates)}')
    logger.debug(f'Num views: {len(cameras)}')

    # Stage 2: RANSAC-based multi-view candidate matching, yielding relative
    # camera poses (pairs_TC1C2) and candidates matched across views.
    matching_outputs = multiview_candidate_matching(
        candidates=candidates,
        mesh_db=self.mesh_db_ransac,
        n_ransac_iter=ransac_n_iter,
        dist_threshold=ransac_dist_threshold,
        cameras=cameras if use_known_camera_poses else None)

    pairs_TC1C2 = matching_outputs['pairs_TC1C2']
    candidates = matching_outputs['filtered_candidates']
    logger.debug(f'Matched candidates: {len(candidates)}')
    for k, v in matching_outputs.items():
        if 'time' in k:
            logger.debug(f'RANSAC {k}: {v}')
    predictions['cand_matched'] = candidates

    # Stage 3: group views connected by relative-pose estimates, then run
    # bundle adjustment within each view group.
    group_infos = make_view_groups(pairs_TC1C2)
    candidates = candidates.merge_df(group_infos, on='view_id').cuda()

    pred_objects, pred_cameras, pred_reproj = [], [], []
    pred_reproj_init = []
    for view_group, candidate_ids in candidates.infos.groupby('view_group').groups.items():
        candidates_n = candidates[candidate_ids]
        problem = MultiviewRefinement(candidates=candidates_n,
                                      cameras=cameras,
                                      pairs_TC1C2=pairs_TC1C2,
                                      mesh_db=self.mesh_db_ba)
        ba_outputs = problem.solve(
            n_iterations=ba_n_iter,
            optimize_cameras=not use_known_camera_poses,
        )
        pred_objects_, pred_cameras_ = ba_outputs['objects'], ba_outputs['cameras']
        for x in (pred_objects_, pred_cameras_):
            x.infos['view_group'] = view_group
            x.infos['group_id'] = group_id
            x.infos['scene_id'] = scene_id
        pred_reproj.append(self.reproject_scene(pred_objects_, pred_cameras_))
        pred_objects.append(pred_objects_)
        pred_cameras.append(pred_cameras_)

        # Keep the BA initialization as well, for comparison with the output.
        pred_objects_init, pred_cameras_init = (ba_outputs['objects_init'],
                                                ba_outputs['cameras_init'])
        for x in (pred_objects_init, pred_cameras_init):
            x.infos['view_group'] = view_group
            x.infos['group_id'] = group_id
            x.infos['scene_id'] = scene_id
        pred_reproj_init.append(self.reproject_scene(pred_objects_init, pred_cameras_init))

        for k, v in ba_outputs.items():
            if 'time' in k:
                logger.debug(f'BA {k}: {v}')

    predictions['scene/objects'] = tc.concatenate(pred_objects)
    predictions['scene/cameras'] = tc.concatenate(pred_cameras)

    predictions['ba_output'] = tc.concatenate(pred_reproj)
    predictions['ba_input'] = tc.concatenate(pred_reproj_init)

    cand_inputs = tc.PandasTensorCollection(
        infos=cand_inputs.infos,
        poses=cand_inputs.poses,
    )
    predictions['ba_output+all_cand'] = tc.concatenate(
        [predictions['ba_output'], cand_inputs],
    )
    return predictions
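# Hypothetical end-to-end call, assuming `scene_solver` is an instance of the
# class defining this method, `candidates` come from a single-view pose
# estimator (e.g. read_csv_candidates above), and `cameras` from read_cameras:
predictions = scene_solver.predict_scene_state(candidates, cameras,
                                               score_th=0.3,
                                               ransac_n_iter=2000,
                                               ransac_dist_threshold=0.02,
                                               ba_n_iter=100)
objects = predictions['scene/objects']      # refined world-frame object poses
cameras_out = predictions['scene/cameras']  # recovered camera poses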