def load_data(self):
    if self.data_split == 'train':
        db = COCO(self.annot_path)
        with open(self.smpl_param_path) as f:
            smpl_params = json.load(f)
    else:
        print('Unknown data subset')
        assert 0

    datalist = []
    for iid in db.imgs.keys():
        img = db.imgs[iid]
        img_id = img['id']
        img_width, img_height = img['width'], img['height']
        imgname = img['file_name']
        img_path = osp.join(self.img_dir, imgname)
        focal = img['f']
        princpt = img['c']
        cam_param = {'focal': focal, 'princpt': princpt}

        # crop the person closest to the camera
        ann_ids = db.getAnnIds(img_id)
        anns = db.loadAnns(ann_ids)
        root_depths = [ann['keypoints_cam'][self.muco_root_joint_idx][2] for ann in anns]
        closest_pid = root_depths.index(min(root_depths))
        pid_list = [closest_pid]

        for pid in pid_list:
            joint_cam = np.array(anns[pid]['keypoints_cam'])
            joint_img = np.array(anns[pid]['keypoints_img'])
            joint_img = np.concatenate([joint_img, joint_cam[:, 2:]], 1)
            joint_valid = np.ones((self.muco_joint_num, 1))

            bbox = process_bbox(anns[pid]['bbox'], img_width, img_height)
            if bbox is None:
                continue

            # check whether the SMPL parameters exist for this annotation
            try:
                smpl_param = smpl_params[str(ann_ids[pid])]
            except KeyError:
                smpl_param = None

            datalist.append({
                'img_path': img_path,
                'img_shape': (img_height, img_width),
                'bbox': bbox,
                'joint_img': joint_img,
                'joint_cam': joint_cam,
                'joint_valid': joint_valid,
                'cam_param': cam_param,
                'smpl_param': smpl_param
            })

    return datalist
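# `process_bbox` is called by every loader in this section but never defined
# here. The sketch below is a guess at its behavior, based only on the comments
# elsewhere in this section ("bbox should be aspect ratio preserved-extended"):
# clip the box to the image, expand it to the network's input aspect ratio, and
# scale it up. The name `process_bbox_sketch`, the 1.25 factor, and the
# dependency on cfg.input_img_shape are assumptions, not the actual implementation.
def process_bbox_sketch(bbox, img_width, img_height, scale=1.25):
    x, y, w, h = bbox
    # sanitize: clip to image bounds, drop degenerate boxes
    x1, y1 = max(0, x), max(0, y)
    x2, y2 = min(img_width - 1, x + w), min(img_height - 1, y + h)
    if x2 <= x1 or y2 <= y1:
        return None
    w, h = x2 - x1, y2 - y1
    cx, cy = x1 + w / 2., y1 + h / 2.
    # grow the shorter side so the box matches the input aspect ratio
    aspect = cfg.input_img_shape[1] / cfg.input_img_shape[0]  # width / height
    if w > aspect * h:
        h = w / aspect
    else:
        w = h * aspect
    return np.array([cx - w * scale / 2., cy - h * scale / 2., w * scale, h * scale], dtype=np.float32)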
def load_data(self):
    db = COCO(osp.join(self.data_path, '3DPW_' + self.data_split + '.json'))
    if self.data_split == 'test' and not cfg.use_gt_info:
        print("Get bounding box and root from " + self.human_bbox_root_dir)
        bbox_root_result = {}
        with open(self.human_bbox_root_dir) as f:
            annot = json.load(f)
        for i in range(len(annot)):
            ann_id = str(annot[i]['ann_id'])
            bbox_root_result[ann_id] = {
                'bbox': np.array(annot[i]['bbox']),
                'root': np.array(annot[i]['root_cam'])
            }
    else:
        print("Get bounding box and root from groundtruth")

    datalist = []
    for aid in db.anns.keys():
        ann = db.anns[aid]
        image_id = ann['image_id']
        img = db.loadImgs(image_id)[0]
        img_width, img_height = img['width'], img['height']
        sequence_name = img['sequence']
        img_name = img['file_name']
        img_path = osp.join(self.data_path, 'imageFiles', sequence_name, img_name)
        cam_param = {k: np.array(v, dtype=np.float32) for k, v in img['cam_param'].items()}

        smpl_param = ann['smpl_param']
        if self.data_split == 'test' and not cfg.use_gt_info:
            # bbox should be extended while preserving the aspect ratio; RootNet already does this.
            bbox = bbox_root_result[str(aid)]['bbox']
            root_joint_depth = bbox_root_result[str(aid)]['root'][2]
        else:
            bbox = process_bbox(np.array(ann['bbox']), img_width, img_height)
            if bbox is None:
                continue
            root_joint_depth = None

        datalist.append({
            'img_path': img_path,
            'img_shape': (img_height, img_width),
            'bbox': bbox,
            'smpl_param': smpl_param,
            'cam_param': cam_param,
            'root_joint_depth': root_joint_depth
        })

    return datalist
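# A sketch of how the test-time fields above are typically consumed downstream
# (this mirrors the demo code near the end of this section; `mesh_img` is an
# assumed (V, 3) array of x, y in original-image pixels plus root-relative
# depth, and `pixel2cam` is the back-projection helper used in the demo):
def to_camera_space_sketch(mesh_img, data):
    mesh_img = mesh_img.copy()
    mesh_img[:, 2] += data['root_joint_depth']  # root-relative -> absolute depth
    cam = data['cam_param']
    return pixel2cam(mesh_img, cam['focal'], cam['princpt'])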
def __init__(self, transform, mode, annot_subset):
    self.mode = mode  # train, test, val
    self.annot_subset = annot_subset  # all, human_annot, machine_annot
    self.img_path = '../data/InterHand2.6M/images'
    self.annot_path = '../data/InterHand2.6M/annotations'
    if self.annot_subset == 'machine_annot' and self.mode == 'val':
        self.rootnet_output_path = '../data/InterHand2.6M/rootnet_output/rootnet_interhand2.6m_output_machine_annot_val.json'
    else:
        self.rootnet_output_path = '../data/InterHand2.6M/rootnet_output/rootnet_interhand2.6m_output_all_test.json'
    self.transform = transform
    self.joint_num = 21  # single hand
    self.root_joint_idx = {'right': 20, 'left': 41}
    self.joint_type = {'right': np.arange(0, self.joint_num), 'left': np.arange(self.joint_num, self.joint_num * 2)}
    self.skeleton = load_skeleton(osp.join(self.annot_path, 'skeleton.txt'), self.joint_num * 2)

    self.datalist = []
    self.datalist_sh = []
    self.datalist_ih = []
    self.sequence_names = []

    # load annotation
    print("Load annotation from " + osp.join(self.annot_path, self.annot_subset))
    db = COCO(osp.join(self.annot_path, self.annot_subset, 'InterHand2.6M_' + self.mode + '_data.json'))
    with open(osp.join(self.annot_path, self.annot_subset, 'InterHand2.6M_' + self.mode + '_camera.json')) as f:
        cameras = json.load(f)
    with open(osp.join(self.annot_path, self.annot_subset, 'InterHand2.6M_' + self.mode + '_joint_3d.json')) as f:
        joints = json.load(f)

    if (self.mode == 'val' or self.mode == 'test') and cfg.trans_test == 'rootnet':
        print("Get bbox and root depth from " + self.rootnet_output_path)
        rootnet_result = {}
        with open(self.rootnet_output_path) as f:
            annot = json.load(f)
        for i in range(len(annot)):
            rootnet_result[str(annot[i]['annot_id'])] = annot[i]
    else:
        print("Get bbox and root depth from groundtruth annotation")

    for aid in db.anns.keys():
        ann = db.anns[aid]
        image_id = ann['image_id']
        img = db.loadImgs(image_id)[0]

        capture_id = img['capture']
        seq_name = img['seq_name']
        cam = img['camera']
        frame_idx = img['frame_idx']
        img_path = osp.join(self.img_path, self.mode, img['file_name'])

        campos = np.array(cameras[str(capture_id)]['campos'][str(cam)], dtype=np.float32)
        camrot = np.array(cameras[str(capture_id)]['camrot'][str(cam)], dtype=np.float32)
        focal = np.array(cameras[str(capture_id)]['focal'][str(cam)], dtype=np.float32)
        princpt = np.array(cameras[str(capture_id)]['princpt'][str(cam)], dtype=np.float32)
        joint_world = np.array(joints[str(capture_id)][str(frame_idx)], dtype=np.float32)
        joint_cam = world2cam(joint_world.transpose(1, 0), camrot, campos.reshape(3, 1)).transpose(1, 0)
        joint_img = cam2pixel(joint_cam, focal, princpt)[:, :2]

        # if the root is not valid, the root-relative 3D pose is also not valid;
        # therefore, mark all joints as invalid
        joint_valid = np.array(ann['joint_valid'], dtype=np.float32).reshape(self.joint_num * 2)
        joint_valid[self.joint_type['right']] *= joint_valid[self.root_joint_idx['right']]
        joint_valid[self.joint_type['left']] *= joint_valid[self.root_joint_idx['left']]

        hand_type = ann['hand_type']
        hand_type_valid = np.array(ann['hand_type_valid'], dtype=np.float32)

        if (self.mode == 'val' or self.mode == 'test') and cfg.trans_test == 'rootnet':
            bbox = np.array(rootnet_result[str(aid)]['bbox'], dtype=np.float32)
            abs_depth = {'right': rootnet_result[str(aid)]['abs_depth'][0], 'left': rootnet_result[str(aid)]['abs_depth'][1]}
        else:
            img_width, img_height = img['width'], img['height']
            bbox = np.array(ann['bbox'], dtype=np.float32)  # x, y, w, h
            bbox = process_bbox(bbox, (img_height, img_width))
            abs_depth = {'right': joint_cam[self.root_joint_idx['right'], 2], 'left': joint_cam[self.root_joint_idx['left'], 2]}

        cam_param = {'focal': focal, 'princpt': princpt}
        joint = {'cam_coord': joint_cam, 'img_coord': joint_img, 'valid': joint_valid}
        data = {'img_path': img_path, 'seq_name': seq_name, 'cam_param': cam_param, 'bbox': bbox,
                'joint': joint, 'hand_type': hand_type, 'hand_type_valid': hand_type_valid,
                'abs_depth': abs_depth, 'file_name': img['file_name'], 'capture': capture_id,
                'cam': cam, 'frame': frame_idx}
        if hand_type == 'right' or hand_type == 'left':
            self.datalist_sh.append(data)
        else:
            self.datalist_ih.append(data)
        if seq_name not in self.sequence_names:
            self.sequence_names.append(seq_name)

    self.datalist = self.datalist_sh + self.datalist_ih
    print('Number of annotations in single hand sequences: ' + str(len(self.datalist_sh)))
    print('Number of annotations in interacting hand sequences: ' + str(len(self.datalist_ih)))
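# `world2cam` and `cam2pixel` are not defined in this section. Minimal
# pinhole-camera sketches consistent with how they are called above, where
# world2cam takes (3, N) world coordinates, a (3, 3) rotation, and the (3, 1)
# camera position. Note that the Human3.6M loader at the end of this section
# calls a different world2cam variant with (N, 3) inputs and a translation
# vector, so treat the exact signatures here as assumptions.
def world2cam_sketch(world_coord, R, T):
    # rotate world coordinates about the camera position
    return np.dot(R, world_coord - T)

def cam2pixel_sketch(cam_coord, f, c):
    # perspective projection of (N, 3) camera-space points
    x = cam_coord[:, 0] / cam_coord[:, 2] * f[0] + c[0]
    y = cam_coord[:, 1] / cam_coord[:, 2] * f[1] + c[1]
    return np.stack((x, y, cam_coord[:, 2]), 1)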
def load_data(self):
    if self.data_split == 'train':
        db = COCO(osp.join(self.data_path, 'freihand_train_coco.json'))
        with open(osp.join(self.data_path, 'freihand_train_data.json')) as f:
            data = json.load(f)
    else:
        db = COCO(osp.join(self.data_path, 'freihand_eval_coco.json'))
        with open(osp.join(self.data_path, 'freihand_eval_data.json')) as f:
            data = json.load(f)
        print("Get bounding box and root from " + self.human_bbox_root_dir)
        bbox_root_result = {}
        with open(self.human_bbox_root_dir) as f:
            annot = json.load(f)
        for i in range(len(annot)):
            bbox_root_result[str(annot[i]['image_id'])] = {
                'bbox': np.array(annot[i]['bbox']),
                'root': np.array(annot[i]['root_cam'])
            }

    datalist = []
    for aid in db.anns.keys():
        ann = db.anns[aid]
        image_id = ann['image_id']
        img = db.loadImgs(image_id)[0]
        img_path = osp.join(self.data_path, img['file_name'])
        img_shape = (img['height'], img['width'])
        db_idx = str(img['db_idx'])

        if self.data_split == 'train':
            cam_param = data[db_idx]['cam_param']
            mano_param = data[db_idx]['mano_param']
            joint_cam = np.array(data[db_idx]['joint_3d']).reshape(-1, 3)
            bbox = process_bbox(np.array(ann['bbox']), img['width'], img['height'])
            if bbox is None:
                continue
            root_joint_depth = joint_cam[self.root_joint_idx][2]
        else:
            cam_param = data[db_idx]['cam_param']
            scale = data[db_idx]['scale']
            joint_cam = np.ones((self.joint_num, 3), dtype=np.float32)  # dummy
            mano_param = {
                'pose': np.ones(48, dtype=np.float32),   # dummy
                'shape': np.ones(10, dtype=np.float32)   # dummy
            }
            # bbox should be extended while preserving the aspect ratio; RootNet already does this.
            bbox = bbox_root_result[str(image_id)]['bbox']
            root_joint_depth = bbox_root_result[str(image_id)]['root'][2]

        datalist.append({
            'img_path': img_path,
            'img_shape': img_shape,
            'bbox': bbox,
            'joint_cam': joint_cam,
            'cam_param': cam_param,
            'mano_param': mano_param,
            'root_joint_depth': root_joint_depth
        })

    return datalist
model = get_model(vertex_num, joint_num, 'test')
model = DataParallel(model).cuda()
ckpt = torch.load(model_path)
model.load_state_dict(ckpt['network'], strict=False)
model.eval()

# prepare input image
transform = transforms.ToTensor()
img_path = 'input.jpg'
original_img = cv2.imread(img_path)
original_img_height, original_img_width = original_img.shape[:2]

# prepare bbox
bbox = [139.41, 102.25, 222.39, 241.57]  # xmin, ymin, width, height
bbox = process_bbox(bbox, original_img_width, original_img_height)
img, img2bb_trans, bb2img_trans = generate_patch_image(original_img, bbox, 1.0, 0.0, False, cfg.input_img_shape)
img = transform(img.astype(np.float32)) / 255
img = img.cuda()[None, :, :, :]

# forward
inputs = {'img': img}
targets = {}
meta_info = {'bb2img_trans': bb2img_trans}
with torch.no_grad():
    out = model(inputs, targets, meta_info, 'test')
img = img[0].cpu().numpy().transpose(1, 2, 0)  # cfg.input_img_shape[1] x cfg.input_img_shape[0] x 3
mesh_lixel_img = out['mesh_coord_img'][0].cpu().numpy()
mesh_param_cam = out['mesh_coord_cam'][0].cpu().numpy()

# restore mesh_lixel_img to original image space and continuous depth space
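# The restoration step announced by the comment above is spelled out in the
# webcam demo later in this section; the same transform applies here:
# heatmap x, y -> input-patch pixels -> original-image pixels via bb2img_trans,
# and discretized heatmap depth -> continuous root-relative depth.
mesh_lixel_img[:, 0] = mesh_lixel_img[:, 0] / cfg.output_hm_shape[2] * cfg.input_img_shape[1]
mesh_lixel_img[:, 1] = mesh_lixel_img[:, 1] / cfg.output_hm_shape[1] * cfg.input_img_shape[0]
mesh_lixel_img[:, :2] = np.dot(
    bb2img_trans,
    np.concatenate((mesh_lixel_img[:, :2], np.ones_like(mesh_lixel_img[:, :1])), 1).transpose(1, 0)
).transpose(1, 0)
mesh_lixel_img[:, 2] = (mesh_lixel_img[:, 2] / cfg.output_hm_shape[0] * 2. - 1) * (cfg.bbox_3d_size / 2)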
def load_data(self):
    db = COCO(osp.join(self.annot_path, 'person_keypoints_' + self.data_split + '2017.json'))
    with open(osp.join(self.annot_path, 'coco_smplifyx_train.json')) as f:
        smplify_results = json.load(f)

    datalist = []
    if self.data_split == 'train':
        for aid in db.anns.keys():
            ann = db.anns[aid]
            img = db.loadImgs(ann['image_id'])[0]
            imgname = osp.join('train2017', img['file_name'])
            img_path = osp.join(self.img_path, imgname)
            width, height = img['width'], img['height']

            if ann['iscrowd'] or (ann['num_keypoints'] == 0):
                continue

            # bbox
            bbox = process_bbox(ann['bbox'], width, height)
            if bbox is None:
                continue

            # joint coordinates
            joint_img = np.array(ann['keypoints'], dtype=np.float32).reshape(-1, 3)
            joint_img = self.add_pelvis(joint_img)
            joint_valid = (joint_img[:, 2].copy().reshape(-1, 1) > 0).astype(np.float32)
            joint_img[:, 2] = 0

            if str(aid) in smplify_results:
                smplify_result = smplify_results[str(aid)]
            else:
                smplify_result = None

            datalist.append({
                'img_path': img_path,
                'img_shape': (height, width),
                'bbox': bbox,
                'joint_img': joint_img,
                'joint_valid': joint_valid,
                'smplify_result': smplify_result
            })
    else:
        with open(self.rootnet_output_path) as f:
            rootnet_output = json.load(f)
        print('Load RootNet output from ' + self.rootnet_output_path)
        for i in range(len(rootnet_output)):
            image_id = rootnet_output[i]['image_id']
            if image_id not in db.imgs:
                continue
            img = db.loadImgs(image_id)[0]
            imgname = osp.join('val2017', img['file_name'])
            img_path = osp.join(self.img_path, imgname)
            height, width = img['height'], img['width']

            fx, fy, cx, cy = 1500, 1500, width / 2, height / 2
            focal = np.array([fx, fy], dtype=np.float32)
            princpt = np.array([cx, cy], dtype=np.float32)
            root_joint_depth = np.array(rootnet_output[i]['root_cam'][2])
            bbox = np.array(rootnet_output[i]['bbox']).reshape(4)
            cam_param = {'focal': focal, 'princpt': princpt}

            datalist.append({
                'img_path': img_path,
                'img_shape': (height, width),
                'bbox': bbox,
                'root_joint_depth': root_joint_depth,
                'cam_param': cam_param
            })

    return datalist
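# `add_pelvis` is not defined in this section. A common implementation in
# similar codebases appends the midpoint of the two hips as a pelvis joint,
# valid only when both hips are annotated; the hip indices (11, 12) follow the
# COCO keypoint order and should be treated as an assumption here:
def add_pelvis_sketch(joint_img, lhip_idx=11, rhip_idx=12):
    pelvis = (joint_img[lhip_idx] + joint_img[rhip_idx]) * 0.5
    # the third column holds the annotation flag;
    # the pelvis is valid only if both hips are
    pelvis[2] = joint_img[lhip_idx, 2] * joint_img[rhip_idx, 2]
    return np.concatenate((joint_img, pelvis.reshape(1, 3)))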
def __init__(self, transform, mode, annot_subset):
    self.mode = mode
    self.root_path = '../data/RHD/data'
    self.rootnet_output_path = '../data/RHD/rootnet_output/rootnet_rhd_output.json'
    self.original_img_shape = (320, 320)  # height, width
    self.transform = transform
    self.joint_num = 21  # single hand
    self.joint_type = {'right': np.arange(self.joint_num, self.joint_num * 2), 'left': np.arange(0, self.joint_num)}
    self.root_joint_idx = {'right': 21, 'left': 0}
    self.skeleton = load_skeleton(osp.join(self.root_path, 'skeleton.txt'), self.joint_num * 2)
    self.datalist = []

    if self.mode == 'train':
        split_dir = 'training'
    else:
        split_dir = 'evaluation'
    self.annot_path = osp.join(self.root_path, 'RHD_' + split_dir + '.json')
    db = COCO(self.annot_path)

    if self.mode == 'test' and cfg.trans_test == 'rootnet':
        print("Get bbox and root depth from " + self.rootnet_output_path)
        rootnet_result = {}
        with open(self.rootnet_output_path) as f:
            annot = json.load(f)
        for i in range(len(annot)):
            rootnet_result[str(annot[i]['annot_id'])] = annot[i]
    else:
        print("Get bbox and root depth from groundtruth annotation")

    for aid in db.anns.keys():
        ann = db.anns[aid]
        image_id = ann['image_id']
        img = db.loadImgs(image_id)[0]
        img_path = osp.join(self.root_path, split_dir, 'color', img['file_name'])
        img_width, img_height = img['width'], img['height']
        cam_param = img['cam_param']
        focal = np.array(cam_param['focal'], dtype=np.float32)
        princpt = np.array(cam_param['princpt'], dtype=np.float32)
        joint_img = np.array(ann['joint_img'], dtype=np.float32)
        joint_cam = np.array(ann['joint_cam'], dtype=np.float32)
        joint_valid = np.array(ann['joint_valid'], dtype=np.float32)

        # transform single-hand data into the double-hand data structure
        hand_type = ann['hand_type']
        joint_img_dh = np.zeros((self.joint_num * 2, 2), dtype=np.float32)
        joint_cam_dh = np.zeros((self.joint_num * 2, 3), dtype=np.float32)
        joint_valid_dh = np.zeros((self.joint_num * 2), dtype=np.float32)
        joint_img_dh[self.joint_type[hand_type]] = joint_img
        joint_cam_dh[self.joint_type[hand_type]] = joint_cam
        joint_valid_dh[self.joint_type[hand_type]] = joint_valid
        joint_img, joint_cam, joint_valid = joint_img_dh, joint_cam_dh, joint_valid_dh

        if self.mode == 'test' and cfg.trans_test == 'rootnet':
            bbox = np.array(rootnet_result[str(aid)]['bbox'], dtype=np.float32)
            abs_depth = rootnet_result[str(aid)]['abs_depth']
        else:
            bbox = np.array(ann['bbox'], dtype=np.float32)  # x, y, w, h
            bbox = process_bbox(bbox, (img_height, img_width))
            abs_depth = joint_cam[self.root_joint_idx[hand_type], 2]  # single-hand absolute depth

        cam_param = {'focal': focal, 'princpt': princpt}
        joint = {'cam_coord': joint_cam, 'img_coord': joint_img, 'valid': joint_valid}
        data = {'img_path': img_path, 'bbox': bbox, 'cam_param': cam_param, 'joint': joint,
                'hand_type': hand_type, 'abs_depth': abs_depth}
        self.datalist.append(data)
def main():
    # input_size = 416
    # iou_threshold = 0.45
    # score_threshold = 0.3
    # Yolo = Load_Yolo_model()
    times = []
    output_path = "output"

    vid = cv2.VideoCapture(0)
    vid.set(3, 1280)   # CAP_PROP_FRAME_WIDTH
    vid.set(4, 1024)   # CAP_PROP_FRAME_HEIGHT
    # by default, VideoCapture returns float instead of int
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(vid.get(cv2.CAP_PROP_FPS))
    focal = (1500, 1500)
    princpt = (width / 2, height / 2)
    print(f"Width {width} Height {height}")

    bbox = [0, 0, width, height]
    bbox = process_bbox(bbox, width, height)
    root_depth = 11250.5732421875  # obtain this from RootNet (https://github.com/mks0601/3DMPPE_ROOTNET_RELEASE/tree/master/demo)
    root_depth /= 1000  # the output of RootNet is in millimeters; convert to meters

    with torch.no_grad():
        while True:
            ret, frame = vid.read()
            if not ret:
                break
            t1 = time.time()
            # convert once: OpenCV captures BGR, the network expects RGB
            # (converting twice would swap the channels back to BGR)
            original_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # image_data = image_preprocess(np.copy(original_frame), [input_size, input_size])
            # image_data = image_data[np.newaxis, ...].astype(np.float32)
            # if YOLO_FRAMEWORK == "tf":
            #     pred_bbox = Yolo.predict(image_data)
            # elif YOLO_FRAMEWORK == "trt":
            #     batched_input = tf.constant(image_data)
            #     result = Yolo(batched_input)
            #     pred_bbox = []
            #     for key, value in result.items():
            #         value = value.numpy()
            #         pred_bbox.append(value)
            # pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
            # pred_bbox = tf.concat(pred_bbox, axis=0)
            # bboxes = postprocess_boxes(pred_bbox, original_frame, input_size, score_threshold)
            # bboxes = nms(bboxes, iou_threshold, method='nms')
            # frame = draw_bbox(original_frame, bboxes)

            # ---------------------------------- I2L-MeshNet ----------------------------------
            # original_img_height, original_img_width = original_frame.shape[:2]
            # bbox = bboxes[0][:4]
            img, img2bb_trans, bb2img_trans = generate_patch_image(original_frame, bbox, 1.0, 0.0, False, cfg.input_img_shape)
            img = transform(img.astype(np.float32)) / 255
            img = img.cuda()[None, :, :, :]

            # forward
            inputs = {'img': img}
            targets = {}
            meta_info = {'bb2img_trans': bb2img_trans}
            out = model(inputs, targets, meta_info, 'test')
            img = img[0].cpu().numpy().transpose(1, 2, 0)  # cfg.input_img_shape[1] x cfg.input_img_shape[0] x 3
            mesh_lixel_img = out['mesh_coord_img'][0].cpu().numpy()
            mesh_param_cam = out['mesh_coord_cam'][0].cpu().numpy()

            # restore mesh_lixel_img to original image space and continuous depth space
            mesh_lixel_img[:, 0] = mesh_lixel_img[:, 0] / cfg.output_hm_shape[2] * cfg.input_img_shape[1]
            mesh_lixel_img[:, 1] = mesh_lixel_img[:, 1] / cfg.output_hm_shape[1] * cfg.input_img_shape[0]
            mesh_lixel_img[:, :2] = np.dot(
                bb2img_trans,
                np.concatenate((mesh_lixel_img[:, :2], np.ones_like(mesh_lixel_img[:, :1])), 1).transpose(1, 0)
            ).transpose(1, 0)
            mesh_lixel_img[:, 2] = (mesh_lixel_img[:, 2] / cfg.output_hm_shape[0] * 2. - 1) * (cfg.bbox_3d_size / 2)

            # root-relative 3D coordinates -> absolute 3D coordinates
            root_xy = np.dot(joint_regressor, mesh_lixel_img)[root_joint_idx, :2]
            root_img = np.array([root_xy[0], root_xy[1], root_depth])
            root_cam = pixel2cam(root_img[None, :], focal, princpt)
            mesh_lixel_img[:, 2] += root_depth
            mesh_lixel_cam = pixel2cam(mesh_lixel_img, focal, princpt)
            mesh_param_cam += root_cam.reshape(1, 3)

            # visualize lixel mesh in 2D space
            # vis_img = frame.copy()
            # vis_img = vis_mesh(vis_img, mesh_lixel_img)
            # cv2.imwrite('output_mesh_lixel.jpg', vis_img)

            # visualize param mesh in 2D space
            # vis_img = frame.copy()
            # mesh_param_img = cam2pixel(mesh_param_cam, focal, princpt)
            # vis_img = vis_mesh(vis_img, mesh_param_img)
            # cv2.imwrite('output_mesh_param.jpg', vis_img)

            # save mesh (obj)
            # save_obj(mesh_lixel_cam, face, 'output_mesh_lixel.obj')
            # save_obj(mesh_param_cam, face, 'output_mesh_param.obj')

            # render mesh from lixel
            vis_img = frame.copy()
            rendered_img = render_mesh(vis_img, mesh_lixel_cam, face, {'focal': focal, 'princpt': princpt})
            # cv2.imwrite('rendered_mesh_lixel.jpg', rendered_img)
            cv2.imshow('output', rendered_img / 255)
            if cv2.waitKey(25) & 0xFF == ord("q"):
                cv2.destroyAllWindows()
                break

            # render mesh from param
            # vis_img = frame.copy()
            # rendered_img = render_mesh(vis_img, mesh_param_cam, face, {'focal': focal, 'princpt': princpt})
            # cv2.imwrite('rendered_mesh_param.jpg', rendered_img)
            # ---------------------------------- I2L-MeshNet ----------------------------------

            t2 = time.time()
            times.append(t2 - t1)
            times = times[-20:]
            ms = sum(times) / len(times) * 1000
            fps = 1000 / ms
            print("Time: {:.2f}ms, {:.1f} FPS".format(ms, fps))
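# `pixel2cam` (used above) inverts the pinhole projection. A minimal sketch
# consistent with its call sites, assuming (N, 3) inputs of x, y in pixels and
# absolute depth z:
def pixel2cam_sketch(pixel_coord, f, c):
    x = (pixel_coord[:, 0] - c[0]) / f[0] * pixel_coord[:, 2]
    y = (pixel_coord[:, 1] - c[1]) / f[1] * pixel_coord[:, 2]
    return np.stack((x, y, pixel_coord[:, 2]), 1)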
print('Load checkpoint from {}'.format(model_path))
model = get_model(vertex_num, joint_num, 'test')
model = DataParallel(model).cuda()
ckpt = torch.load(model_path)
model.load_state_dict(ckpt['network'])
model.eval()

# prepare input image
transform = transforms.ToTensor()
img_path = 'input.jpg'
img = cv2.imread(img_path)

# prepare bbox
bbox = [164, 93, 222, 252]  # xmin, ymin, width, height
bbox = process_bbox(bbox, img.shape[1], img.shape[0])
# process_bbox can return None, in which case len() would raise before the
# assert message is shown; check for None explicitly
assert bbox is not None and len(bbox) == 4, 'Please set bbox'
img, img2bb_trans, bb2img_trans = generate_patch_image(img, bbox, 1.0, 0.0, False, cfg.input_img_shape)
img = transform(img.astype(np.float32)) / 255
img = img.cuda()[None, :, :, :]

# forward
inputs = {'img': img}
targets = {}
meta_info = {'bb2img_trans': bb2img_trans}
with torch.no_grad():
    out = model(inputs, targets, meta_info, 'test')
img = img[0].cpu().numpy()
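# The snippet above stops right after moving the input back to the CPU; the
# single-image demo earlier in this section continues with the same pattern,
# so presumably the intended continuation is:
img = img.transpose(1, 2, 0)  # CHW -> HWC, cfg.input_img_shape[0] x cfg.input_img_shape[1] x 3
mesh_lixel_img = out['mesh_coord_img'][0].cpu().numpy()
mesh_param_cam = out['mesh_coord_cam'][0].cpu().numpy()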
def load_data(self):
    subject_list = self.get_subject()
    sampling_ratio = self.get_subsampling_ratio()

    # aggregate annotations from each subject
    db = COCO()
    cameras = {}
    joints = {}
    smpl_params = {}
    for subject in subject_list:
        # annotation load
        with open(osp.join(self.annot_path, 'Human36M_subject' + str(subject) + '_data.json'), 'r') as f:
            annot = json.load(f)
        if len(db.dataset) == 0:
            for k, v in annot.items():
                db.dataset[k] = v
        else:
            for k, v in annot.items():
                db.dataset[k] += v
        # camera load
        with open(osp.join(self.annot_path, 'Human36M_subject' + str(subject) + '_camera.json'), 'r') as f:
            cameras[str(subject)] = json.load(f)
        # joint coordinate load
        with open(osp.join(self.annot_path, 'Human36M_subject' + str(subject) + '_joint_3d.json'), 'r') as f:
            joints[str(subject)] = json.load(f)
        # SMPL parameter load
        with open(osp.join(self.annot_path, 'Human36M_subject' + str(subject) + '_smpl_param.json'), 'r') as f:
            smpl_params[str(subject)] = json.load(f)
    db.createIndex()

    if self.data_split == 'test' and not cfg.use_gt_info:
        print("Get bounding box and root from " + self.human_bbox_root_dir)
        bbox_root_result = {}
        with open(self.human_bbox_root_dir) as f:
            annot = json.load(f)
        for i in range(len(annot)):
            bbox_root_result[str(annot[i]['image_id'])] = {
                'bbox': np.array(annot[i]['bbox']),
                'root': np.array(annot[i]['root_cam'])
            }
    else:
        print("Get bounding box and root from groundtruth")

    datalist = []
    for aid in db.anns.keys():
        ann = db.anns[aid]
        image_id = ann['image_id']
        img = db.loadImgs(image_id)[0]
        img_path = osp.join(self.img_dir, img['file_name'])
        img_shape = (img['height'], img['width'])

        # subsample frames
        frame_idx = img['frame_idx']
        if frame_idx % sampling_ratio != 0:
            continue

        # check whether the SMPL parameters exist
        subject = img['subject']
        action_idx = img['action_idx']
        subaction_idx = img['subaction_idx']
        try:
            smpl_param = smpl_params[str(subject)][str(action_idx)][str(subaction_idx)][str(frame_idx)]
        except KeyError:
            smpl_param = None

        # camera parameter
        cam_idx = img['cam_idx']
        cam_param = cameras[str(subject)][str(cam_idx)]
        R = np.array(cam_param['R'], dtype=np.float32)
        t = np.array(cam_param['t'], dtype=np.float32)
        f = np.array(cam_param['f'], dtype=np.float32)
        c = np.array(cam_param['c'], dtype=np.float32)
        cam_param = {'R': R, 't': t, 'focal': f, 'princpt': c}

        # only use the frontal camera at test time, following previous works (HMR and SPIN)
        if self.data_split == 'test' and str(cam_idx) != '4':
            continue

        # project world coordinates to camera and image coordinate spaces
        joint_world = np.array(joints[str(subject)][str(action_idx)][str(subaction_idx)][str(frame_idx)], dtype=np.float32)
        joint_cam = world2cam(joint_world, R, t)
        joint_img = cam2pixel(joint_cam, f, c)
        joint_valid = np.ones((self.h36m_joint_num, 1))

        if self.data_split == 'test' and not cfg.use_gt_info:
            # bbox should be extended while preserving the aspect ratio; RootNet already does this.
            bbox = bbox_root_result[str(image_id)]['bbox']
            root_joint_depth = bbox_root_result[str(image_id)]['root'][2]
        else:
            bbox = process_bbox(np.array(ann['bbox']), img['width'], img['height'])
            if bbox is None:
                continue
            root_joint_depth = joint_cam[self.h36m_root_joint_idx][2]

        datalist.append({
            'img_path': img_path,
            'img_id': image_id,
            'img_shape': img_shape,
            'bbox': bbox,
            'joint_img': joint_img,
            'joint_cam': joint_cam,
            'joint_valid': joint_valid,
            'smpl_param': smpl_param,
            'root_joint_depth': root_joint_depth,
            'cam_param': cam_param
        })

    return datalist
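# A minimal consumption sketch with hypothetical names: each dict produced by
# these load_data methods is what a Dataset __getitem__ would turn into a
# network input. `Human36M` is an assumed wrapper class that calls load_data in
# its __init__; generate_patch_image and cfg are reused from the demo code above.
dataset = Human36M(transforms.ToTensor(), 'train')
data = dataset.datalist[0]
original_img = cv2.imread(data['img_path'])
patch, img2bb_trans, bb2img_trans = generate_patch_image(
    original_img, data['bbox'], 1.0, 0.0, False, cfg.input_img_shape)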