def test_vis():
    dset_name = sys.argv[1]
    assert dset_name in DatasetCatalog.list()

    meta = MetadataCatalog.get(dset_name)
    dprint("MetadataCatalog: ", meta)
    objs = meta.objs

    t_start = time.perf_counter()
    dicts = DatasetCatalog.get(dset_name)
    logger.info("Done loading {} samples with {:.3f}s.".format(len(dicts), time.perf_counter() - t_start))

    dirname = "output/{}-data-vis".format(dset_name)
    os.makedirs(dirname, exist_ok=True)
    for d in dicts:
        img = read_image_cv2(d["file_name"], format="BGR")
        depth = mmcv.imread(d["depth_file"], "unchanged") / 1000.0

        anno = d["annotations"][0]  # only one instance per image
        imH, imW = img.shape[:2]
        mask = cocosegm2mask(anno["segmentation"], imH, imW)
        bbox = anno["bbox"]
        bbox_mode = anno["bbox_mode"]
        bbox_xyxy = np.array(BoxMode.convert(bbox, bbox_mode, BoxMode.XYXY_ABS))
        kpt3d = anno["bbox3d_and_center"]
        quat = anno["quat"]
        trans = anno["trans"]
        R = quat2mat(quat)
        # 0-based label
        cat_id = anno["category_id"]
        K = d["cam"]
        kpt_2d = misc.project_pts(kpt3d, K, R, trans)
        # TODO: visualize pose and keypoints
        label = objs[cat_id]
        # img_vis = vis_image_bboxes_cv2(img, bboxes=bboxes_xyxy, labels=labels)
        img_vis = vis_image_mask_bbox_cv2(img, [mask], bboxes=[bbox_xyxy], labels=[label])
        img_vis_kpt2d = img.copy()
        img_vis_kpt2d = misc.draw_projected_box3d(
            img_vis_kpt2d, kpt_2d, middle_color=None, bottom_color=(128, 128, 128)
        )

        xyz_info = mmcv.load(anno["xyz_path"])
        xyz = np.zeros((imH, imW, 3), dtype=np.float32)
        xyz_crop = xyz_info["xyz_crop"].astype(np.float32)
        x1, y1, x2, y2 = xyz_info["xyxy"]
        xyz[y1 : y2 + 1, x1 : x2 + 1, :] = xyz_crop
        xyz_show = get_emb_show(xyz)

        grid_show(
            [img[:, :, [2, 1, 0]], img_vis[:, :, [2, 1, 0]], img_vis_kpt2d[:, :, [2, 1, 0]], depth, xyz_show],
            ["img", "vis_img", "img_vis_kpts2d", "depth", "emb_show"],
            row=2,
            col=3,
        )
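# For reference, a minimal sketch of what cocosegm2mask above is expected to do:
# decode a COCO-style segmentation (polygon or RLE) into a binary (H, W) mask.
# This is an assumed reimplementation via pycocotools, not the project's own code.
import numpy as np
from pycocotools import mask as cocomask


def cocosegm2mask_sketch(segm, h, w):
    if isinstance(segm, list):  # polygon format: list of [x1, y1, x2, y2, ...]
        rles = cocomask.frPyObjects(segm, h, w)
        rle = cocomask.merge(rles)
    elif isinstance(segm["counts"], list):  # uncompressed RLE
        rle = cocomask.frPyObjects(segm, h, w)
    else:  # compressed RLE
        rle = segm
    return cocomask.decode(rle).astype(np.uint8)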
# NOTE: fragment of a symmetry-aware rotation test; est_rot, gt_rot, gt_pose,
# sym_info, rd_ori, t, obj_id, trans and renderer come from the surrounding scope.
closest_rot = get_closest_rot(est_rot, gt_rot, sym_info)
print("calculate closest rot {}s".format((time.perf_counter() - t) / 3000))
closest_pose = np.copy(gt_pose)
closest_pose[:, :3] = closest_rot
rd_closest = re(est_rot, closest_pose[:, :3])
print("rot_est: {}, rot_gt: {}, closest rot_gt: {}".format(
    mat2axangle(est_rot), mat2axangle(gt_rot), mat2axangle(closest_rot)))
print("original rot dist: {}, closest rot dist: {}".format(rd_ori, rd_closest))

est_img, _ = renderer.render(obj_id, est_rot, trans)
gt_img, _ = renderer.render(obj_id, gt_rot, trans)
closest_img, _ = renderer.render(obj_id, closest_rot, trans)
show_imgs = [est_img[:, :, [2, 1, 0]], gt_img[:, :, [2, 1, 0]], closest_img[:, :, [2, 1, 0]]]
show_titles = ["est", "gt_ori", "gt_closest"]
grid_show(show_imgs, show_titles, row=1, col=3)
# import cv2
# while (1):
#     est_img = render(renderer, est_rot, trans)
#     cv2.imshow('test', cv2.cvtColor(est_img, cv2.COLOR_RGB2BGR))
#     q = cv2.waitKey(16)
#     if q == ord('w'):
#         trans[1] += 0.05
#     elif q == ord('s'):
#         trans[1] -= 0.05
#     elif q == ord('a'):
#         trans[0] -= 0.1
#     elif q == ord('d'):
#         trans[0] += 0.1
#     elif q == ord('q'):
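# A minimal sketch of the rotation distance `re` used above, assuming it is the
# geodesic angle between two rotation matrices (the degree convention is an
# assumption; check the project's error utilities for the authoritative version).
import numpy as np


def re_sketch(R_est, R_gt):
    """Geodesic distance between two 3x3 rotation matrices, in degrees."""
    cos_theta = (np.trace(R_est @ R_gt.T) - 1.0) / 2.0
    cos_theta = np.clip(cos_theta, -1.0, 1.0)  # guard against numerical drift
    return np.degrees(np.arccos(cos_theta))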
def test_vis():
    dset_name = sys.argv[1]
    assert dset_name in DatasetCatalog.list()

    meta = MetadataCatalog.get(dset_name)
    dprint("MetadataCatalog: ", meta)
    objs = meta.objs

    t_start = time.perf_counter()
    dicts = DatasetCatalog.get(dset_name)
    logger.info("Done loading {} samples with {:.3f}s.".format(len(dicts), time.perf_counter() - t_start))

    dirname = "output/{}-data-vis".format(dset_name)
    os.makedirs(dirname, exist_ok=True)
    for d in dicts:
        img = read_image_cv2(d["file_name"], format="BGR")
        depth = mmcv.imread(d["depth_file"], "unchanged") / 1000.0

        imH, imW = img.shape[:2]
        annos = d["annotations"]
        masks = [cocosegm2mask(anno["segmentation"], imH, imW) for anno in annos]
        bboxes = [anno["bbox"] for anno in annos]
        bbox_modes = [anno["bbox_mode"] for anno in annos]
        bboxes_xyxy = np.array(
            [BoxMode.convert(box, box_mode, BoxMode.XYXY_ABS) for box, box_mode in zip(bboxes, bbox_modes)]
        )
        kpts_3d_list = [anno["bbox3d_and_center"] for anno in annos]
        quats = [anno["quat"] for anno in annos]
        transes = [anno["trans"] for anno in annos]
        Rs = [quat2mat(quat) for quat in quats]
        # 0-based label
        cat_ids = [anno["category_id"] for anno in annos]
        K = d["cam"]
        kpts_2d = [misc.project_pts(kpt3d, K, R, t) for kpt3d, R, t in zip(kpts_3d_list, Rs, transes)]
        # TODO: visualize pose and keypoints
        labels = [objs[cat_id] for cat_id in cat_ids]
        for _i in range(len(annos)):
            img_vis = vis_image_mask_bbox_cv2(
                img, masks[_i : _i + 1], bboxes=bboxes_xyxy[_i : _i + 1], labels=labels[_i : _i + 1]
            )
            img_vis_kpts2d = misc.draw_projected_box3d(img_vis.copy(), kpts_2d[_i])
            if "test" not in dset_name:
                xyz_path = annos[_i]["xyz_path"]
                xyz_info = mmcv.load(xyz_path)
                x1, y1, x2, y2 = xyz_info["xyxy"]
                xyz_crop = xyz_info["xyz_crop"].astype(np.float32)
                xyz = np.zeros((imH, imW, 3), dtype=np.float32)
                xyz[y1 : y2 + 1, x1 : x2 + 1, :] = xyz_crop
                xyz_show = get_emb_show(xyz)
                xyz_crop_show = get_emb_show(xyz_crop)
                img_xyz = img.copy() / 255.0
                mask_xyz = ((xyz[:, :, 0] != 0) | (xyz[:, :, 1] != 0) | (xyz[:, :, 2] != 0)).astype("uint8")
                fg_idx = np.where(mask_xyz != 0)
                img_xyz[fg_idx[0], fg_idx[1], :] = xyz_show[fg_idx[0], fg_idx[1], :3]
                img_xyz_crop = img_xyz[y1 : y2 + 1, x1 : x2 + 1, :]
                img_vis_crop = img_vis[y1 : y2 + 1, x1 : x2 + 1, :]
                # diff mask
                diff_mask_xyz = np.abs(masks[_i] - mask_xyz)[y1 : y2 + 1, x1 : x2 + 1]
                grid_show(
                    [
                        img[:, :, [2, 1, 0]],
                        img_vis[:, :, [2, 1, 0]],
                        img_vis_kpts2d[:, :, [2, 1, 0]],
                        depth,
                        # xyz_show,
                        diff_mask_xyz,
                        xyz_crop_show,
                        img_xyz[:, :, [2, 1, 0]],
                        img_xyz_crop[:, :, [2, 1, 0]],
                        img_vis_crop,
                    ],
                    [
                        "img",
                        "vis_img",
                        "img_vis_kpts2d",
                        "depth",
                        "diff_mask_xyz",
                        "xyz_crop_show",
                        "img_xyz",
                        "img_xyz_crop",
                        "img_vis_crop",
                    ],
                    row=3,
                    col=3,
                )
            else:
                grid_show(
                    [img[:, :, [2, 1, 0]], img_vis[:, :, [2, 1, 0]], img_vis_kpts2d[:, :, [2, 1, 0]], depth],
                    ["img", "vis_img", "img_vis_kpts2d", "depth"],
                    row=2,
                    col=2,
                )
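# Small usage example for the detectron2 BoxMode conversion above. COCO-style
# datasets commonly store boxes as XYWH_ABS (whether these datasets do is an
# assumption); the conversion normalizes them to corner format XYXY_ABS.
from detectron2.structures import BoxMode

box_xywh = [10.0, 20.0, 30.0, 40.0]  # x, y, width, height
box_xyxy = BoxMode.convert(box_xywh, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
assert box_xyxy == [10.0, 20.0, 40.0, 60.0]  # x1, y1, x2, y2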
def process(self, inputs, outputs, out_dict):
    """
    Args:
        inputs: the inputs to a model.
            It is a list of dicts. Each dict corresponds to an image and
            contains keys like "height", "width", "file_name", "image_id", "scene_id".
        outputs: stores time
    """
    cfg = self.cfg
    if cfg.TEST.USE_PNP:
        if cfg.TEST.PNP_TYPE.lower() == "ransac_pnp":
            return self.process_pnp_ransac(inputs, outputs, out_dict)
        elif cfg.TEST.PNP_TYPE.lower() == "net_iter_pnp":
            return self.process_net_and_pnp(inputs, outputs, out_dict, pnp_type="iter")
        elif cfg.TEST.PNP_TYPE.lower() == "net_ransac_pnp":
            return self.process_net_and_pnp(inputs, outputs, out_dict, pnp_type="ransac")
        elif cfg.TEST.PNP_TYPE.lower() == "net_ransac_pnp_rot":
            # use rot from PnP/RANSAC and translation from Net
            return self.process_net_and_pnp(inputs, outputs, out_dict, pnp_type="ransac_rot")
        else:
            raise NotImplementedError

    out_rots = out_dict["rot"].detach().to(self._cpu_device).numpy()
    out_transes = out_dict["trans"].detach().to(self._cpu_device).numpy()

    out_i = -1
    for i, (_input, output) in enumerate(zip(inputs, outputs)):
        start_process_time = time.perf_counter()
        for inst_i in range(len(_input["roi_img"])):
            out_i += 1
            file_name = _input["file_name"][inst_i]
            scene_im_id_split = _input["scene_im_id"][inst_i].split("/")
            K = _input["cam"][inst_i].cpu().numpy().copy()

            roi_label = _input["roi_cls"][inst_i]  # 0-based label
            score = _input["score"][inst_i]
            roi_label, cls_name = self._maybe_adapt_label_cls_name(roi_label)
            if cls_name is None:
                continue

            scene_id = scene_im_id_split[0]
            im_id = int(scene_im_id_split[1])

            # get pose (NOTE: index with out_i, the flattened instance index;
            # inst_i restarts for every image and would mix up predictions)
            rot_est = out_rots[out_i]
            trans_est = out_transes[out_i]

            if cfg.DEBUG:  # visualize pose
                pose_est = np.hstack([rot_est, trans_est.reshape(3, 1)])
                if f"{int(scene_id)}/{im_id}" != "9/499":  # only visualize one chosen image
                    continue
                im_ori = mmcv.imread(file_name, "color")

                bbox = _input["bbox_est"][inst_i].cpu().numpy().copy()
                x1, y1, x2, y2 = bbox
                # center = np.array([(x1 + x2) / 2, (y1 + y2) / 2])
                # scale = max(x2 - x1, y2 - y1) * 1.5

                test_label = _input["roi_cls"][inst_i]
                kpt_3d = self.kpts_3d[test_label]
                # kpt_3d = self.kpts_axis_3d[test_label]
                kpt_2d = misc.project_pts(kpt_3d, K, rot_est, trans_est)

                gt_dict = self.gts[cls_name][file_name]
                gt_rot = gt_dict["R"]
                gt_trans = gt_dict["t"]
                kpt_2d_gt = misc.project_pts(kpt_3d, K, gt_rot, gt_trans)

                # common bounding square of the est and gt keypoints
                # (the inner index is renamed from i to j to avoid shadowing the image loop)
                maxx, maxy, minx, miny = 0, 0, 1000, 1000
                for j in range(len(kpt_2d)):
                    maxx, maxy, minx, miny = (
                        max(maxx, kpt_2d[j][0]),
                        max(maxy, kpt_2d[j][1]),
                        min(minx, kpt_2d[j][0]),
                        min(miny, kpt_2d[j][1]),
                    )
                    maxx, maxy, minx, miny = (
                        max(maxx, kpt_2d_gt[j][0]),
                        max(maxy, kpt_2d_gt[j][1]),
                        min(minx, kpt_2d_gt[j][0]),
                        min(miny, kpt_2d_gt[j][1]),
                    )
                center = np.array([(minx + maxx) / 2, (miny + maxy) / 2])
                scale = max(maxx - minx, maxy - miny) + 5
                out_size = 256
                zoomed_im = crop_resize_by_warp_affine(im_ori, center, scale, out_size)
                save_path = osp.join(
                    cfg.OUTPUT_DIR, "vis", "{}_{}_{:06d}_no_bbox.png".format(cls_name, scene_id, im_id)
                )
                mmcv.mkdir_or_exist(osp.dirname(save_path))
                mmcv.imwrite(zoomed_im, save_path)

                # remap keypoints from image coordinates into the zoomed crop
                # yapf: disable
                kpt_2d = np.array([
                    [(x - (center[0] - scale / 2)) * out_size / scale,
                     (y - (center[1] - scale / 2)) * out_size / scale]
                    for [x, y] in kpt_2d
                ])
                kpt_2d_gt = np.array([
                    [(x - (center[0] - scale / 2)) * out_size / scale,
                     (y - (center[1] - scale / 2)) * out_size / scale]
                    for [x, y] in kpt_2d_gt
                ])
                # yapf: enable

                # draw est bbox
                linewidth = 3
                visualizer = MyVisualizer(zoomed_im[:, :, ::-1], self._metadata)
                # zoomed_im_vis = visualizer.draw_axis3d_and_center(
                #     kpt_2d, linewidth=linewidth,
                #     draw_center=True
                # )
                # visualizer.draw_bbox3d_and_center(
                #     kpt_2d_gt, top_color=_BLUE, bottom_color=_GREY, linewidth=linewidth, draw_center=True
                # )
                zoomed_im_vis = visualizer.draw_bbox3d_and_center(
                    kpt_2d, top_color=_GREEN, bottom_color=_GREY, linewidth=linewidth, draw_center=True
                )
                save_path = osp.join(
                    cfg.OUTPUT_DIR, "vis", "{}_{}_{:06d}_gt_est.png".format(cls_name, scene_id, im_id)
                )
                mmcv.mkdir_or_exist(osp.dirname(save_path))
                zoomed_im_vis.save(save_path)
                print("zoomed_im_vis saved to:", save_path)

                im_vis = vis_image_bboxes_cv2(im_ori, [bbox], [f"{cls_name}_{score}"])
                self.ren.clear()
                self.ren.draw_background(mmcv.bgr2gray(im_ori, keepdim=True))
                self.ren.draw_model(self.ren_models[self.data_ref.objects.index(cls_name)], pose_est)
                ren_im, _ = self.ren.finish()
                grid_show(
                    [ren_im[:, :, ::-1], im_vis[:, :, ::-1]],
                    [f"ren_im_{cls_name}", f"{scene_id}/{im_id}_{score}"],
                    row=1,
                    col=2,
                )

            output["time"] += time.perf_counter() - start_process_time

            if cls_name not in self._predictions:
                self._predictions[cls_name] = OrderedDict()
            result = {"score": score, "R": rot_est, "t": trans_est, "time": output["time"]}
            self._predictions[cls_name][file_name] = result
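# A standalone sketch of the keypoint remap performed in the DEBUG branch above:
# points in full-image coordinates are mapped into the out_size x out_size crop
# produced by crop_resize_by_warp_affine(im, center, scale, out_size). This
# assumes an axis-aligned crop (no rotation), matching the list comprehensions.
import numpy as np


def remap_pts_to_crop(pts_2d, center, scale, out_size):
    """Map (N, 2) image-space points into crop coordinates."""
    top_left = np.asarray(center, dtype=np.float64) - scale / 2.0
    return (np.asarray(pts_2d, dtype=np.float64) - top_left) * (out_size / scale)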
def main(self):
    split = self.split
    scene = self.scene  # "all" or a single scene
    sel_scene_ids = self.sel_scene_ids
    data_root = self.data_root
    # NOTE: xyz_root, idx2class, cls_indexes, K, IM_W, IM_H, near, far, VIS
    # and args are module-level constants/options of the original script.
    for scene_id in tqdm(sel_scene_ids, postfix=f"{split}_{scene}"):
        print("split: {} scene: {}".format(split, scene_id))
        scene_root = osp.join(data_root, f"{scene_id:06d}")

        gt_dict = mmcv.load(osp.join(scene_root, "scene_gt.json"))
        # gt_info_dict = mmcv.load(osp.join(scene_root, "scene_gt_info.json"))
        # cam_dict = mmcv.load(osp.join(scene_root, "scene_camera.json"))

        for str_im_id in tqdm(gt_dict, postfix=f"{scene_id}"):
            int_im_id = int(str_im_id)

            for anno_i, anno in enumerate(gt_dict[str_im_id]):
                obj_id = anno["obj_id"]
                if obj_id not in idx2class:
                    continue

                R = np.array(anno["cam_R_m2c"], dtype="float32").reshape(3, 3)
                t = np.array(anno["cam_t_m2c"], dtype="float32") / 1000.0
                # pose = np.hstack([R, t.reshape(3, 1)])

                save_path = osp.join(
                    xyz_root,
                    f"{scene_id:06d}/{int_im_id:06d}_{anno_i:06d}-xyz.pkl",
                )
                # if osp.exists(save_path) and osp.getsize(save_path) > 0:
                #     continue

                render_obj_id = cls_indexes.index(obj_id)  # 0-based
                bgr_gl, depth_gl = self.get_renderer().render(render_obj_id, IM_W, IM_H, K, R, t, near, far)
                mask = (depth_gl > 0).astype("uint8")

                if mask.sum() == 0:  # NOTE: this should be ignored at training phase
                    print(
                        f"not visible, split {split} scene {scene_id}, im {int_im_id} "
                        f"obj {idx2class[obj_id]} {obj_id}"
                    )
                    print(f"{save_path}")
                    xyz_info = {
                        "xyz_crop": np.zeros((IM_H, IM_W, 3), dtype=np.float16),
                        "xyxy": [0, 0, IM_W - 1, IM_H - 1],
                    }
                    if VIS:
                        im_path = osp.join(data_root, f"{scene_id:06d}/rgb/{int_im_id:06d}.jpg")
                        im = mmcv.imread(im_path)

                        mask_path = osp.join(data_root, f"{scene_id:06d}/mask/{int_im_id:06d}_{anno_i:06d}.png")
                        mask_visib_path = osp.join(
                            data_root, f"{scene_id:06d}/mask_visib/{int_im_id:06d}_{anno_i:06d}.png"
                        )
                        mask_gt = mmcv.imread(mask_path, "unchanged")
                        mask_visib_gt = mmcv.imread(mask_visib_path, "unchanged")

                        show_ims = [
                            bgr_gl[:, :, [2, 1, 0]],
                            im[:, :, [2, 1, 0]],
                            mask_gt,
                            mask_visib_gt,
                        ]
                        show_titles = ["bgr_gl", "im", "mask_gt", "mask_visib_gt"]
                        grid_show(show_ims, show_titles, row=2, col=2)
                        raise RuntimeError(f"split {split} scene {scene_id}, im {int_im_id}")
                else:
                    x1, y1, x2, y2 = mask2bbox_xyxy(mask)
                    xyz_np = misc.calc_xyz_bp_fast(depth_gl, R, t, K)
                    xyz_crop = xyz_np[y1 : y2 + 1, x1 : x2 + 1]
                    xyz_info = {
                        "xyz_crop": xyz_crop.astype("float16"),  # save disk space w/o performance drop
                        "xyxy": [x1, y1, x2, y2],
                    }

                    if VIS:
                        print(f"xyz_crop min {xyz_crop.min()} max {xyz_crop.max()}")
                        show_ims = [
                            bgr_gl[:, :, [2, 1, 0]],
                            get_emb_show(xyz_np),
                            get_emb_show(xyz_crop),
                        ]
                        show_titles = ["bgr_gl", "xyz", "xyz_crop"]
                        grid_show(show_ims, show_titles, row=1, col=3)

                if not args.no_save:
                    mmcv.mkdir_or_exist(osp.dirname(save_path))
                    mmcv.dump(xyz_info, save_path)
    if self.renderer is not None:
        self.renderer.close()
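# A minimal NumPy sketch of the semantics assumed for misc.calc_xyz_bp_fast:
# back-project the rendered depth through K into camera space, then transform
# into the object frame with (R, t), so each foreground pixel stores the model
# coordinate it images. The real helper is likely optimized; this states the math.
import numpy as np


def calc_xyz_backproject(depth, R, t, K):
    """Return an (H, W, 3) object-space coordinate map; zeros where depth == 0."""
    H, W = depth.shape
    u, v = np.meshgrid(np.arange(W), np.arange(H))
    pix = np.stack([u, v, np.ones_like(u)], axis=-1).astype(np.float64)  # (H, W, 3)
    cam_pts = depth[..., None] * (pix @ np.linalg.inv(K).T)  # X_cam = z * K^-1 [u, v, 1]^T
    obj_pts = (cam_pts - t.reshape(1, 1, 3)) @ R  # X_obj = R^T (X_cam - t)
    obj_pts[depth == 0] = 0.0
    return obj_pts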
def test_vis():
    dset_name = sys.argv[1]
    assert dset_name in DatasetCatalog.list()

    meta = MetadataCatalog.get(dset_name)
    dprint("MetadataCatalog: ", meta)
    objs = meta.objs

    t_start = time.perf_counter()
    dicts = DatasetCatalog.get(dset_name)
    logger.info("Done loading {} samples with {:.3f}s.".format(len(dicts), time.perf_counter() - t_start))

    dirname = "output/{}-data-vis".format(dset_name)
    os.makedirs(dirname, exist_ok=True)
    for d in dicts:
        img = read_image_cv2(d["file_name"], format="BGR")
        depth = mmcv.imread(d["depth_file"], "unchanged") / 1000.0

        imH, imW = img.shape[:2]
        annos = d["annotations"]
        masks = [cocosegm2mask(anno["segmentation"], imH, imW) for anno in annos]
        bboxes = [anno["bbox"] for anno in annos]
        bbox_modes = [anno["bbox_mode"] for anno in annos]
        bboxes_xyxy = np.array(
            [BoxMode.convert(box, box_mode, BoxMode.XYXY_ABS) for box, box_mode in zip(bboxes, bbox_modes)]
        )
        kpts_3d_list = [anno["bbox3d_and_center"] for anno in annos]
        quats = [anno["quat"] for anno in annos]
        transes = [anno["trans"] for anno in annos]
        Rs = [quat2mat(quat) for quat in quats]
        # 0-based label
        cat_ids = [anno["category_id"] for anno in annos]
        K = d["cam"]
        kpts_2d = [misc.project_pts(kpt3d, K, R, t) for kpt3d, R, t in zip(kpts_3d_list, Rs, transes)]
        # TODO: visualize pose and keypoints
        labels = [objs[cat_id] for cat_id in cat_ids]
        # img_vis = vis_image_bboxes_cv2(img, bboxes=bboxes_xyxy, labels=labels)
        img_vis = vis_image_mask_bbox_cv2(img, masks, bboxes=bboxes_xyxy, labels=labels)
        img_vis_kpts2d = img.copy()
        for anno_i in range(len(annos)):
            img_vis_kpts2d = misc.draw_projected_box3d(img_vis_kpts2d, kpts_2d[anno_i])
        grid_show(
            [img[:, :, [2, 1, 0]], img_vis[:, :, [2, 1, 0]], img_vis_kpts2d[:, :, [2, 1, 0]], depth],
            [f"img:{d['file_name']}", "vis_img", "img_vis_kpts2d", "depth"],
            row=2,
            col=2,
        )
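# A minimal sketch of the projection used throughout these visualizations,
# assuming misc.project_pts implements the standard pinhole model
# x ~ K (R X + t) with a final perspective divide.
import numpy as np


def project_pts_sketch(pts_3d, K, R, t):
    """Project (N, 3) object-space points to (N, 2) pixel coordinates."""
    pts_cam = pts_3d @ R.T + np.asarray(t).reshape(1, 3)  # into the camera frame
    pts_img = pts_cam @ K.T  # apply intrinsics
    return pts_img[:, :2] / pts_img[:, 2:3]  # perspective divide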
def process(self, inputs, outputs, out_dict):
    """
    Args:
        inputs: the inputs to a model.
            It is a list of dicts. Each dict corresponds to an image and
            contains keys like "height", "width", "file_name", "image_id", "scene_id".
        outputs: stores time
    """
    cfg = self.cfg
    if cfg.TEST.USE_PNP:
        if cfg.TEST.PNP_TYPE.lower() == "ransac_pnp":
            return self.process_pnp_ransac(inputs, outputs, out_dict)
        elif cfg.TEST.PNP_TYPE.lower() == "net_iter_pnp":
            return self.process_net_and_pnp(inputs, outputs, out_dict, pnp_type="iter")
        elif cfg.TEST.PNP_TYPE.lower() == "net_ransac_pnp":
            return self.process_net_and_pnp(inputs, outputs, out_dict, pnp_type="ransac")
        else:
            raise NotImplementedError

    out_rots = out_dict["rot"].detach().to(self._cpu_device).numpy()
    out_transes = out_dict["trans"].detach().to(self._cpu_device).numpy()

    out_i = -1
    for i, (_input, output) in enumerate(zip(inputs, outputs)):
        json_results = []
        start_process_time = time.perf_counter()
        for inst_i in range(len(_input["roi_img"])):
            out_i += 1  # the index in the flattened output

            scene_im_id_split = _input["scene_im_id"][inst_i].split("/")
            K = _input["cam"][inst_i].cpu().numpy().copy()

            roi_label = _input["roi_cls"][inst_i]  # 0-based label
            score = _input["score"][inst_i]
            roi_label, cls_name = self._maybe_adapt_label_cls_name(roi_label)
            if cls_name is None:
                continue

            # scene_id = int(scene_im_id_split[0])
            scene_id = scene_im_id_split[0]
            im_id = int(scene_im_id_split[1])
            obj_id = self.data_ref.obj2id[cls_name]

            # get pose
            rot_est = out_rots[out_i]
            trans_est = out_transes[out_i]
            pose_est = np.hstack([rot_est, trans_est.reshape(3, 1)])

            if cfg.DEBUG:  # visualize pose
                file_name = _input["file_name"][inst_i]
                if f"{int(scene_id)}/{im_id}" != "9/47":
                    continue
                im_ori = mmcv.imread(file_name, "color")

                bbox = _input["bbox_est"][inst_i].cpu().numpy().copy()
                im_vis = vis_image_bboxes_cv2(im_ori, [bbox], [f"{cls_name}_{score}"])

                self.ren.clear()
                self.ren.draw_background(mmcv.bgr2gray(im_ori, keepdim=True))
                self.ren.draw_model(self.ren_models[self.data_ref.objects.index(cls_name)], pose_est)
                ren_im, _ = self.ren.finish()
                grid_show(
                    [ren_im[:, :, ::-1], im_vis[:, :, ::-1]],
                    [f"ren_im_{cls_name}", f"{scene_id}/{im_id}_{score}"],
                    row=1,
                    col=2,
                )

            json_results.extend(
                self.pose_prediction_to_json(
                    pose_est, scene_id, im_id, obj_id=obj_id, score=score, pose_time=output["time"], K=K
                )
            )

        output["time"] += time.perf_counter() - start_process_time
        # process time for this image
        for item in json_results:
            item["time"] = output["time"]
        self._predictions.extend(json_results)
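# A hedged sketch of the per-instance record that pose_prediction_to_json
# plausibly emits, following the BOP results convention (rotation flattened
# row-major, translation in millimeters). The exact keys and units are
# assumptions; check the project's evaluator for the authoritative format.
def pose_to_json_entry_sketch(pose_est, scene_id, im_id, obj_id, score, pose_time):
    R = pose_est[:3, :3]
    t = pose_est[:3, 3]
    return {
        "scene_id": int(scene_id),
        "im_id": int(im_id),
        "obj_id": int(obj_id),
        "score": float(score),
        "R": R.flatten().tolist(),
        "t": (1000.0 * t).tolist(),  # meters -> millimeters, per BOP convention
        "time": pose_time,  # overwritten with the per-image time in process()
    }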