model = DataParallel(model).cuda()
ckpt = torch.load(model_path)
model.load_state_dict(ckpt['network'], strict=False)
model.eval()

# prepare input image
transform = transforms.ToTensor()
img_path = 'input.jpg'
original_img = cv2.imread(img_path)
original_img_height, original_img_width = original_img.shape[:2]

# prepare bbox
bbox = [139.41, 102.25, 222.39, 241.57]  # xmin, ymin, width, height
bbox = process_bbox(bbox, original_img_width, original_img_height)
img, img2bb_trans, bb2img_trans = generate_patch_image(original_img, bbox, 1.0, 0.0, False, cfg.input_img_shape)
img = transform(img.astype(np.float32)) / 255
img = img.cuda()[None, :, :, :]

# forward
inputs = {'img': img}
targets = {}
meta_info = {'bb2img_trans': bb2img_trans}
with torch.no_grad():
    out = model(inputs, targets, meta_info, 'test')
img = img[0].cpu().numpy().transpose(1, 2, 0)  # cfg.input_img_shape[1], cfg.input_img_shape[0], 3
mesh_lixel_img = out['mesh_coord_img'][0].cpu().numpy()
mesh_param_cam = out['mesh_coord_cam'][0].cpu().numpy()

# restore mesh_lixel_img to original image space and continuous depth space
mesh_lixel_img[:, 0] = mesh_lixel_img[:, 0] / cfg.output_hm_shape[2] * cfg.input_img_shape[1]
mesh_lixel_img[:, 1] = mesh_lixel_img[:, 1] / cfg.output_hm_shape[1] * cfg.input_img_shape[0]
mesh_lixel_img[:, :2] = np.dot(
    bb2img_trans,
    np.concatenate((mesh_lixel_img[:, :2], np.ones_like(mesh_lixel_img[:, :1])), 1).transpose(1, 0)
).transpose(1, 0)
mesh_lixel_img[:, 2] = (mesh_lixel_img[:, 2] / cfg.output_hm_shape[0] * 2. - 1) * (cfg.bbox_3d_size / 2)
def main():
    # input_size = 416
    # iou_threshold = 0.45
    # score_threshold = 0.3
    # Yolo = Load_Yolo_model()
    times = []
    output_path = "output"

    vid = cv2.VideoCapture(0)
    vid.set(3, 1280)
    vid.set(4, 1024)

    # by default VideoCapture returns float instead of int
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(vid.get(cv2.CAP_PROP_FPS))

    focal = (1500, 1500)
    princpt = (width / 2, height / 2)
    print(f"Width {width} Height {height}")

    bbox = [0, 0, width, height]
    bbox = process_bbox(bbox, width, height)

    root_depth = 11250.5732421875  # obtain this from RootNet (https://github.com/mks0601/3DMPPE_ROOTNET_RELEASE/tree/master/demo)
    root_depth /= 1000  # output of RootNet is millimeter. change it to meter

    with torch.no_grad():
        while True:
            _, frame = vid.read()

            t1 = time.time()
            try:
                original_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            except:
                break

            # image_data = image_preprocess(np.copy(original_frame), [input_size, input_size])
            # image_data = image_data[np.newaxis, ...].astype(np.float32)

            # if YOLO_FRAMEWORK == "tf":
            #     pred_bbox = Yolo.predict(image_data)
            # elif YOLO_FRAMEWORK == "trt":
            #     batched_input = tf.constant(image_data)
            #     result = Yolo(batched_input)
            #     pred_bbox = []
            #     for key, value in result.items():
            #         value = value.numpy()
            #         pred_bbox.append(value)

            # pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
            # pred_bbox = tf.concat(pred_bbox, axis=0)

            # bboxes = postprocess_boxes(pred_bbox, original_frame, input_size, score_threshold)
            # bboxes = nms(bboxes, iou_threshold, method='nms')
            # frame = draw_bbox(original_frame, bboxes)

            # ---------------------------------------- i2l meshnet ----------------------------------------
            # original_img_height, original_img_width = original_frame.shape[:2]
            # bbox = bboxes[0][:4]
            img, img2bb_trans, bb2img_trans = generate_patch_image(
                original_frame, bbox, 1.0, 0.0, False, cfg.input_img_shape)
            img = transform(img.astype(np.float32)) / 255
            img = img.cuda()[None, :, :, :]

            # forward
            inputs = {'img': img}
            targets = {}
            meta_info = {'bb2img_trans': bb2img_trans}
            out = model(inputs, targets, meta_info, 'test')

            img = img[0].cpu().numpy().transpose(1, 2, 0)  # cfg.input_img_shape[1], cfg.input_img_shape[0], 3
            mesh_lixel_img = out['mesh_coord_img'][0].cpu().numpy()
            mesh_param_cam = out['mesh_coord_cam'][0].cpu().numpy()

            # restore mesh_lixel_img to original image space and continuous depth space
            mesh_lixel_img[:, 0] = mesh_lixel_img[:, 0] / cfg.output_hm_shape[2] * cfg.input_img_shape[1]
            mesh_lixel_img[:, 1] = mesh_lixel_img[:, 1] / cfg.output_hm_shape[1] * cfg.input_img_shape[0]
            mesh_lixel_img[:, :2] = np.dot(
                bb2img_trans,
                np.concatenate((mesh_lixel_img[:, :2], np.ones_like(mesh_lixel_img[:, :1])), 1).transpose(1, 0)
            ).transpose(1, 0)
            mesh_lixel_img[:, 2] = (mesh_lixel_img[:, 2] / cfg.output_hm_shape[0] * 2. - 1) * (cfg.bbox_3d_size / 2)

            # root-relative 3D coordinates -> absolute 3D coordinates
            root_xy = np.dot(joint_regressor, mesh_lixel_img)[root_joint_idx, :2]
            root_img = np.array([root_xy[0], root_xy[1], root_depth])
            root_cam = pixel2cam(root_img[None, :], focal, princpt)
            mesh_lixel_img[:, 2] += root_depth
            mesh_lixel_cam = pixel2cam(mesh_lixel_img, focal, princpt)
            mesh_param_cam += root_cam.reshape(1, 3)

            # visualize lixel mesh in 2D space
            # vis_img = frame.copy()
            # vis_img = vis_mesh(vis_img, mesh_lixel_img)
            # cv2.imwrite('output_mesh_lixel.jpg', vis_img)

            # visualize param mesh in 2D space
            # vis_img = frame.copy()
            # mesh_param_img = cam2pixel(mesh_param_cam, focal, princpt)
            # vis_img = vis_mesh(vis_img, mesh_param_img)
            # cv2.imwrite('output_mesh_param.jpg', vis_img)

            # save mesh (obj)
            # save_obj(mesh_lixel_cam, face, 'output_mesh_lixel.obj')
            # save_obj(mesh_param_cam, face, 'output_mesh_param.obj')

            # render mesh from lixel
            vis_img = frame.copy()
            rendered_img = render_mesh(vis_img, mesh_lixel_cam, face, {'focal': focal, 'princpt': princpt})
            # cv2.imwrite('rendered_mesh_lixel.jpg', rendered_img)
            cv2.imshow('output', rendered_img / 255)
            if cv2.waitKey(25) & 0xFF == ord("q"):
                cv2.destroyAllWindows()
                break

            # render mesh from param
            # vis_img = frame.copy()
            # rendered_img = render_mesh(vis_img, mesh_param_cam, face, {'focal': focal, 'princpt': princpt})
            # cv2.imwrite('rendered_mesh_param.jpg', rendered_img)
            # ---------------------------------------- i2l meshnet ----------------------------------------

            t2 = time.time()
            times.append(t2 - t1)
            times = times[-20:]
            ms = sum(times) / len(times) * 1000
            fps = 1000 / ms
            print("Time: {:.2f}ms, {:.1f} FPS".format(ms, fps))
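
# Assumed script entry point; not shown in the original snippet.
if __name__ == "__main__":
    main()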
model = DataParallel(model).cuda()
ckpt = torch.load(model_path)
model.load_state_dict(ckpt['network'], strict=False)
model.eval()

# prepare input image
transform = transforms.ToTensor()
img_path = 'input.jpg'
original_img = cv2.imread(img_path)
original_img_height, original_img_width = original_img.shape[:2]

# prepare bbox
bbox = [69, 137, 165, 153]  # xmin, ymin, width, height
bbox = process_bbox(bbox, (original_img_height, original_img_width, original_img_height))
img, trans, inv_trans = generate_patch_image(original_img, bbox, False, 1.0, 0.0, cfg.input_img_shape)
img = transform(img.astype(np.float32)) / 255
img = img.cuda()[None, :, :, :]

# forward
inputs = {'img': img}
targets = {}
meta_info = {}
with torch.no_grad():
    out = model(inputs, targets, meta_info, 'test')
img = img[0].cpu().numpy().transpose(1, 2, 0)  # cfg.input_img_shape[1], cfg.input_img_shape[0], 3
joint_coord = out['joint_coord'][0].cpu().numpy()  # x,y pixel, z root-relative discretized depth
rel_root_depth = out['rel_root_depth'][0].cpu().numpy()  # discretized depth
hand_type = out['hand_type'][0].cpu().numpy()  # handedness probability
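
# A hedged sketch (not part of the original snippet): restoring the discretized joint
# outputs to the original image space and continuous depth space, following the same
# heatmap-to-pixel conversion used for the mesh coordinates above. It assumes
# joint_coord lives in cfg.output_hm_shape voxel space, that inv_trans maps the
# cropped patch back to the original image, and that cfg.bbox_3d_size is the depth
# range of the 3D bounding box (names reused from the snippets above, not confirmed
# for this model's config).
joint_coord[:, 0] = joint_coord[:, 0] / cfg.output_hm_shape[2] * cfg.input_img_shape[1]
joint_coord[:, 1] = joint_coord[:, 1] / cfg.output_hm_shape[1] * cfg.input_img_shape[0]
joint_coord[:, :2] = np.dot(
    inv_trans,
    np.concatenate((joint_coord[:, :2], np.ones_like(joint_coord[:, :1])), 1).transpose(1, 0)
).transpose(1, 0)
joint_coord[:, 2] = (joint_coord[:, 2] / cfg.output_hm_shape[0] * 2. - 1) * (cfg.bbox_3d_size / 2)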
def __getitem__(self, idx):
    frame = self.framelist[idx]
    seq_name, cam, frame_idx, joint = frame['seq_name'], frame['cam'], frame['frame_idx'], frame['joint']
    joint_coord, joint_valid = joint['world_coord'], joint['valid']

    # input data
    # bbox calculate
    bbox = get_bbox(joint_coord, joint_valid, self.camrot[cam], self.campos[cam], self.focal[cam], self.princpt[cam])
    xmin, ymin, xmax, ymax = bbox
    xmin = max(xmin, 0)
    ymin = max(ymin, 0)
    xmax = min(xmax, self.original_img_shape[1] - 1)
    ymax = min(ymax, self.original_img_shape[0] - 1)
    bbox = np.array([xmin, ymin, xmax, ymax])

    # image read
    img_path = osp.join(self.root_path, seq_name, 'images', 'cam' + cam, 'image' + "{:04d}".format(frame_idx) + '.png')
    img = load_img(img_path)
    xmin, ymin, xmax, ymax = bbox
    xmin, xmax = np.array([xmin, xmax]) / self.original_img_shape[1] * img.shape[1]
    ymin, ymax = np.array([ymin, ymax]) / self.original_img_shape[0] * img.shape[0]
    bbox_img = np.array([xmin, ymin, xmax - xmin + 1, ymax - ymin + 1])
    img = generate_patch_image(img, bbox_img, False, 1.0, 0.0, cfg.input_img_shape)
    input_img = self.transform(img) / 255.

    target_depthmaps = []
    cam_params = []
    affine_transes = []
    for cam in random.sample(self.selected_cameras, cfg.render_view_num):
        # bbox calculate
        bbox = get_bbox(joint_coord, joint_valid, self.camrot[cam], self.campos[cam], self.focal[cam], self.princpt[cam])
        xmin, ymin, xmax, ymax = bbox
        xmin = max(xmin, 0)
        ymin = max(ymin, 0)
        xmax = min(xmax, self.original_img_shape[1] - 1)
        ymax = min(ymax, self.original_img_shape[0] - 1)
        bbox = np.array([xmin, ymin, xmax, ymax])

        # depthmap read
        depthmap_path = osp.join(self.depthmap_root_path, "{:06d}".format(frame_idx), 'depthmap' + cam + '.pkl')
        with open(depthmap_path, 'rb') as f:
            depthmap = pickle.load(f).astype(np.float32)
        xmin, ymin, xmax, ymax = bbox
        xmin, xmax = np.array([xmin, xmax]) / self.original_img_shape[1] * depthmap.shape[1]
        ymin, ymax = np.array([ymin, ymax]) / self.original_img_shape[0] * depthmap.shape[0]
        bbox_depthmap = np.array([xmin, ymin, xmax - xmin + 1, ymax - ymin + 1])
        depthmap = generate_patch_image(depthmap[:, :, None], bbox_depthmap, False, 1.0, 0.0, cfg.rendered_img_shape)
        target_depthmaps.append(self.transform(depthmap))

        xmin, ymin, xmax, ymax = bbox
        affine_transes.append(
            gen_trans_from_patch_cv((xmin + xmax + 1) / 2., (ymin + ymax + 1) / 2.,
                                    xmax - xmin + 1, ymax - ymin + 1,
                                    cfg.rendered_img_shape[1], cfg.rendered_img_shape[0],
                                    1.0, 0.0).astype(np.float32))
        cam_params.append({'camrot': self.camrot[cam], 'campos': self.campos[cam],
                           'focal': self.focal[cam], 'princpt': self.princpt[cam]})

    inputs = {'img': input_img}
    targets = {'depthmap': target_depthmaps, 'joint': joint}
    meta_info = {'cam_param': cam_params, 'affine_trans': affine_transes}
    return inputs, targets, meta_info
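
# Hypothetical usage sketch (not from the original code): wrapping the dataset in a
# standard PyTorch DataLoader. The class name "HandDataset", its constructor
# arguments, and the batch size are placeholders, not names from the original code.
from torch.utils.data import DataLoader

dataset = HandDataset(transform=transforms.ToTensor())  # hypothetical constructor
loader = DataLoader(dataset, batch_size=16, shuffle=True, num_workers=4, pin_memory=True)
for inputs, targets, meta_info in loader:
    imgs = inputs['img'].cuda()  # (batch, 3, cfg.input_img_shape[0], cfg.input_img_shape[1])
    # ... feed the batch to the model / loss here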