def get_demo_vibe_model():
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')
    return model
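# --- Usage sketch (not part of the original scripts) ---------------------------
# A minimal example of calling the helper above and splitting the 85-dim SMPL
# parameter vector the same way the demo mains below slice `theta`. It assumes the
# surrounding imports (torch, VIBE_Demo, download_ckpt) are in scope and that the
# model accepts (batch, seqlen, 3, 224, 224) crops and returns a list whose last
# element is a dict with 'theta', 'verts' and 'kp_3d'.
model = get_demo_vibe_model()
device = next(model.parameters()).device
dummy = torch.randn(1, 16, 3, 224, 224, device=device)  # 1 tracklet, 16 frames
with torch.no_grad():
    output = model(dummy)[-1]
theta = output['theta']        # (1, 16, 85)
pred_cam = theta[:, :, :3]     # weak-perspective camera (s, tx, ty)
pred_pose = theta[:, :, 3:75]  # 72 SMPL pose params (24 joints x 3 axis-angle)
pred_betas = theta[:, :, 75:]  # 10 SMPL shape params
print(pred_cam.shape, pred_pose.shape, pred_betas.shape)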
def get_put(model_1_ready, queue_img, queue_dettra, total_time):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    mot = MPT(device=device, batch_size=1, output_format='dict')
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    model_1_ready.value += 1
    total_time.value = time.time()
    i = 0
    while True:
        try:
            # Time each frame with a local variable so the shared `total_time`
            # Value passed in by the parent process is not overwritten.
            frame_time = time.time()
            # With block=True and timeout=None (the default), get() waits forever on an
            # empty queue; with a timeout it raises Queue.Empty after `timeout` seconds.
            # With block=False it raises Queue.Empty immediately when the queue is empty.
            frame_orig = queue_img.get(True, 50)
            i += 1
            print('dettra22222', i)
            vibe_results = detect_track_vibe(frame_orig, mot, model, device)
            queue_dettra.put((frame_orig, vibe_results))
            frame_time = time.time() - frame_time
            print(f'Detection+Tracking FPS : {1 / frame_time:.2f}.')
        except QueueEmpty:  # queue.Empty, assumed to be imported as QueueEmpty elsewhere
            break
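# --- Hypothetical wiring sketch (not from the repo) -----------------------------
# get_put() expects shared multiprocessing objects: a readiness counter, an input
# frame queue, an output queue and a shared start time. `read_frames` is a made-up
# producer; only the multiprocessing plumbing is illustrated here.
import multiprocessing as mp
import queue as _queue

QueueEmpty = _queue.Empty  # the exception get_put() catches


def read_frames(queue_img, video_path='test.mp4'):
    import cv2
    cap = cv2.VideoCapture(video_path)
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        queue_img.put(frame)
    cap.release()


if __name__ == '__main__':
    model_1_ready = mp.Value('i', 0)
    total_time = mp.Value('d', 0.0)
    queue_img = mp.Queue(maxsize=64)
    queue_dettra = mp.Queue(maxsize=64)

    producer = mp.Process(target=read_frames, args=(queue_img,))
    worker = mp.Process(target=get_put,
                        args=(model_1_ready, queue_img, queue_dettra, total_time))
    producer.start()
    worker.start()
    producer.join()
    worker.join()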
def main(args): torch.cuda.set_device(args.gpu_id) device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') print(f'Loading video list {args.video_list}') video_list = [l.strip() for l in open(args.video_list, 'r').readlines()] if len(video_list) < 1: print('No files were found in video list') return print('Loading VIBE model') # ========= Define VIBE model ========= # model = VIBE_Demo( seqlen=16, n_layers=2, hidden_size=1024, add_linear=True, use_residual=True, ).to(device) # ========= Load VIBE pretrained weights ========= # pretrained_file = download_ckpt(use_3dpw=False) ckpt = torch.load(pretrained_file) print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}') ckpt = ckpt['gen_state_dict'] model.load_state_dict(ckpt, strict=False) model.eval() print(f'Loaded pretrained weights from \"{pretrained_file}\"') num_videos = len(video_list) print(f'Processing {num_videos} videos.') for video_idx, video_file in enumerate(video_list, start=1): if not osp.isfile(video_file): print(f'Input video \"{video_file}\" does not exist! Moving on to next file.') continue filename = osp.splitext(osp.basename(video_file))[0] output_path = osp.join(args.output_folder, filename) os.makedirs(output_path, exist_ok=True) image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True) print(f'[{video_idx}/{num_videos}] Processing {num_frames} frames') orig_height, orig_width = img_shape[:2] # ========= Run tracking ========= # bbox_scale = 1.1 if args.tracking_method == 'pose': if not osp.isabs(video_file): video_file = osp.join(os.getcwd(), video_file) tracking_results = run_posetracker(video_file, staf_folder=args.staf_dir, display=args.display) else: # run multi object tracker mot = MPT( device=device, batch_size=args.tracker_batch_size, display=args.display, detector_type=args.detector, output_format='dict', yolo_img_size=args.yolo_img_size, ) tracking_results = mot(image_folder) # remove tracklets if num_frames is less than MIN_NUM_FRAMES for person_id in list(tracking_results.keys()): if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES: del tracking_results[person_id] # ========= Run VIBE on each person ========= # print(f'Running VIBE on each tracklet...') vibe_results = {} for person_id in tqdm(list(tracking_results.keys())): bboxes = joints2d = None if args.tracking_method == 'bbox': bboxes = tracking_results[person_id]['bbox'] elif args.tracking_method == 'pose': joints2d = tracking_results[person_id]['joints2d'] frames = tracking_results[person_id]['frames'] dataset = Inference( image_folder=image_folder, frames=frames, bboxes=bboxes, joints2d=joints2d, scale=bbox_scale, ) bboxes = dataset.bboxes frames = dataset.frames has_keypoints = True if joints2d is not None else False dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=16) with torch.no_grad(): pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], [] for batch in dataloader: if has_keypoints: batch, nj2d = batch norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3)) batch = batch.unsqueeze(0) batch = batch.to(device) batch_size, seqlen = batch.shape[:2] output = model(batch)[-1] pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1)) pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3)) pred_pose.append(output['theta'][:,:,3:75].reshape(batch_size * seqlen, -1)) pred_betas.append(output['theta'][:, :,75:].reshape(batch_size * seqlen, -1)) 
pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3)) pred_cam = torch.cat(pred_cam, dim=0) pred_verts = torch.cat(pred_verts, dim=0) pred_pose = torch.cat(pred_pose, dim=0) pred_betas = torch.cat(pred_betas, dim=0) pred_joints3d = torch.cat(pred_joints3d, dim=0) del batch # ========= [Optional] run Temporal SMPLify to refine the results ========= # if args.run_smplify and args.tracking_method == 'pose': norm_joints2d = np.concatenate(norm_joints2d, axis=0) norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin') norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device) # Run Temporal SMPLify update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \ new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner( pred_rotmat=pred_pose, pred_betas=pred_betas, pred_cam=pred_cam, j2d=norm_joints2d, device=device, batch_size=norm_joints2d.shape[0], pose2aa=False, ) # update the parameters after refinement print(f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}') pred_verts = pred_verts.cpu() pred_cam = pred_cam.cpu() pred_pose = pred_pose.cpu() pred_betas = pred_betas.cpu() pred_joints3d = pred_joints3d.cpu() pred_verts[update] = new_opt_vertices[update] pred_cam[update] = new_opt_cam[update] pred_pose[update] = new_opt_pose[update] pred_betas[update] = new_opt_betas[update] pred_joints3d[update] = new_opt_joints3d[update] elif args.run_smplify and args.tracking_method == 'bbox': print('[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!') print('[WARNING] Continuing without running Temporal SMPLify!..') # ========= Save results to a pickle file ========= # pred_cam = pred_cam.cpu().numpy() pred_verts = pred_verts.cpu().numpy() pred_pose = pred_pose.cpu().numpy() pred_betas = pred_betas.cpu().numpy() pred_joints3d = pred_joints3d.cpu().numpy() orig_cam = convert_crop_cam_to_orig_img( cam=pred_cam, bbox=bboxes, img_width=orig_width, img_height=orig_height ) output_dict = { 'pred_cam': pred_cam, 'orig_cam': orig_cam, 'verts': pred_verts, 'pose': pred_pose, 'betas': pred_betas, 'joints3d': pred_joints3d, 'joints2d': joints2d, 'bboxes': bboxes, 'frame_ids': frames, } vibe_results[person_id] = output_dict # Clean-up the temporal folder # Save the outputs to joblib pkl file. 
File is loaded through joblib.load(pkl_path) output_pkl_path = osp.join(args.output_folder, f'{filename}.pkl') print(f'Saving output results to \"{output_pkl_path}\".') joblib.dump(vibe_results, output_pkl_path) if not args.no_render: # ========= Render results as a single video ========= # renderer = Renderer(resolution=(orig_width, orig_height), orig_img=True, wireframe=args.wireframe) output_img_folder = f'{image_folder}_output' os.makedirs(output_img_folder, exist_ok=True) print(f'Rendering output video, writing frames to {output_img_folder}') # prepare results for rendering frame_results = prepare_rendering_results(vibe_results, num_frames) mesh_color = {k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0) for k in vibe_results.keys()} image_file_names = sorted([ os.path.join(image_folder, x) for x in os.listdir(image_folder) if x.endswith('.png') or x.endswith('.jpg') ]) for frame_idx in tqdm(range(len(image_file_names))): img_fname = image_file_names[frame_idx] img = cv2.imread(img_fname) if args.sideview: side_img = np.zeros_like(img) for person_id, person_data in frame_results[frame_idx].items(): frame_verts = person_data['verts'] frame_cam = person_data['cam'] mc = mesh_color[person_id] mesh_filename = None if args.save_obj: mesh_folder = os.path.join(output_path, 'meshes', f'{person_id:04d}') os.makedirs(mesh_folder, exist_ok=True) mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj') img = renderer.render( img, frame_verts, cam=frame_cam, color=mc, mesh_filename=mesh_filename, ) if args.sideview: side_img = renderer.render( side_img, frame_verts, cam=frame_cam, color=mc, angle=270, axis=[0,1,0], ) if args.sideview: img = np.concatenate([img, side_img], axis=1) cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img) if args.display: cv2.imshow('Video', img) if cv2.waitKey(1) & 0xFF == ord('q'): break if args.display: cv2.destroyAllWindows() # ========= Save rendered video ========= # vid_name = os.path.basename(video_file) save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4' save_name = os.path.join(output_path, save_name) print(f'Saving result video to {save_name}') images_to_video(img_folder=output_img_folder, output_vid_file=save_name) shutil.rmtree(output_img_folder) # Clean-up after processing del model shutil.rmtree(image_folder) print('================= END =================')
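# --- Sketch: inspecting the saved results (path is hypothetical) ----------------
# The pickle written above is a plain dict keyed by person id; the keys mirror the
# `output_dict` built in the loop. Shapes are per tracked frame along axis 0; the
# joint count and vertex count comments reflect the usual SMPL/SPIN conventions and
# should be verified against the actual output.
import joblib

results = joblib.load('output/sample_video.pkl')  # hypothetical path from args.output_folder
for person_id, data in results.items():
    print(f'person {person_id}:')
    print('  frame_ids:', data['frame_ids'].shape)  # (T,)
    print('  bboxes  :', data['bboxes'].shape)      # (T, 4)
    print('  orig_cam:', data['orig_cam'].shape)    # (T, 4) sx, sy, tx, ty in image coords
    print('  pose    :', data['pose'].shape)        # (T, 72) SMPL axis-angle
    print('  betas   :', data['betas'].shape)       # (T, 10)
    print('  verts   :', data['verts'].shape)       # (T, 6890, 3) SMPL mesh vertices
    print('  joints3d:', data['joints3d'].shape)    # (T, n_joints, 3)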
def main(args):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    video_file = 'test.mp4'
    output_folder = 'output/'
    output_path = os.path.join(output_folder, os.path.basename(video_file).replace('.mp4', ''))
    os.makedirs(output_path, exist_ok=True)
    print(f'The output path is: {output_path}')

    image_folder, num_frames, img_shape = generative()
    print(f'image_folder: {image_folder}, num_frames: {num_frames}, img_shape: {img_shape}')
    orig_height, orig_width = img_shape[:2]
    total_time = time.time()

    # ========= Run tracking ========= #
    # tracking_method = bbox
    bbox_scale = 1.1
    # run multi object tracker
    mot = MPT(
        device=device,
        batch_size=12,         # 12
        display=False,         # true
        detector_type='yolo',  # yolo
        output_format='dict',
        yolo_img_size=416,     # 416 * 416
    )
    tracking_results = mot(image_folder)
    print(f'Tracking results: {tracking_results}')
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    # load the pretrained checkpoint
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    # ========= Run VIBE on each person ========= #
    print(f'Running VIBE on each tracklet...')
    vibe_time = time.time()
    vibe_results = {}
    for person_id in tqdm(list(tracking_results.keys())):
        bboxes = joints2d = None
        # bboxes: n x 4
        bboxes = tracking_results[person_id]['bbox']
        # frames: n x 1
        frames = tracking_results[person_id]['frames']
        # a dataset of normalized image crops; overall size T x 224 x 224 x 3
        dataset = Inference(
            image_folder=image_folder,
            frames=frames,
            bboxes=bboxes,
            joints2d=joints2d,  # None
            scale=bbox_scale,
        )
        bboxes = dataset.bboxes
        frames = dataset.frames
        has_keypoints = True if joints2d is not None else False
        dataloader = DataLoader(dataset, batch_size=64, num_workers=16)

        with torch.no_grad():
            pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []
            for batch in dataloader:
                if has_keypoints:
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))
                # add a batch dimension at dim 0: 1 x T x 224 x 224 x 3
                batch = batch.unsqueeze(0)
                batch = batch.to(device)
                # seqlen is the length of the image sequence in this batch
                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]
                pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1))     # 1*T x 3
                pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:, :, 3:75].reshape(batch_size * seqlen, -1))  # 1*T x 72
                pred_betas.append(output['theta'][:, :, 75:].reshape(batch_size * seqlen, -1))  # 1*T x 10
                pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3))

            pred_cam = torch.cat(pred_cam, dim=0)            # (T, 3)
            pred_verts = torch.cat(pred_verts, dim=0)        # (T, , 3)
            pred_pose = torch.cat(pred_pose, dim=0)          # (T, 72)
            pred_betas = torch.cat(pred_betas, dim=0)        # (T, 10)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)  # (T, , 3)
            del batch

        # ========= Save results to a pickle file ========= #
        # convert tensors to numpy arrays
        pred_cam = pred_cam.cpu().numpy()            # (T, 3)
        pred_verts = pred_verts.cpu().numpy()        # (T, , 3)
        pred_pose = pred_pose.cpu().numpy()          # (T, 72)
        pred_betas = pred_betas.cpu().numpy()        # (T, 10)
        pred_joints3d = pred_joints3d.cpu().numpy()  # (T, , 3)

        # result shape: T x 4
        orig_cam = convert_crop_cam_to_orig_img(
            cam=pred_cam,            # predicted camera, T x 3
            bbox=bboxes,             # detection/tracking bboxes, T x 4
            img_width=orig_width,    # original image width
            img_height=orig_height,  # original image height
        )

        output_dict = {
            'pred_cam': pred_cam,
            'orig_cam': orig_cam,
            'verts': pred_verts,
            'pose': pred_pose,
            'betas': pred_betas,
            'joints3d': pred_joints3d,
            'joints2d': joints2d,
            'bboxes': bboxes,
            'frame_ids': frames,
        }
        vibe_results[person_id] = output_dict

    del model

    end = time.time()
    fps = num_frames / (end - vibe_time)
    print(f'VIBE FPS: {fps:.2f}')
    total_time = time.time() - total_time
    print(f'Total time spent: {total_time:.2f} seconds (including model loading time).')
    print(f'Total FPS (including model loading time): {num_frames / total_time:.2f}.')
    print(f'Saving output results to \"{os.path.join(output_path, "vibe_output.pkl")}\".')
    joblib.dump(vibe_results, os.path.join(output_path, "vibe_output.pkl"))

    if not args.no_render:
        # ========= Render results as a single video ========= #
        # resolution is the original image width and height; the renderer is a utility
        # that only needs the right arguments
        renderer = Renderer(resolution=(orig_width, orig_height), orig_img=True, wireframe=args.wireframe)
        output_img_folder = f'{image_folder}_output'
        os.makedirs(output_img_folder, exist_ok=True)
        print(f'Rendering output video, writing frames to {output_img_folder}')
        frame_results = prepare_rendering_results(vibe_results, num_frames)
        mesh_color = {
            k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0)
            for k in vibe_results.keys()
        }
        image_file_names = sorted([
            os.path.join(image_folder, x) for x in os.listdir(image_folder)
            if x.endswith('.png') or x.endswith('.jpg')
        ])
        for frame_idx in tqdm(range(len(image_file_names))):
            img_fname = image_file_names[frame_idx]
            # read the frame image
            img = cv2.imread(img_fname)
            # args.sideview is expected to be True in this script
            if args.sideview:
                side_img = np.zeros_like(img)
            for person_id, person_data in frame_results[frame_idx].items():
                frame_verts = person_data['verts']
                frame_cam = person_data['cam']
                mc = mesh_color[person_id]
                # 3D mesh output path
                mesh_folder = os.path.join(output_path, 'meshes', f'{person_id:04d}')
                os.makedirs(mesh_folder, exist_ok=True)
                mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj')
                # the render output is an image
                img = renderer.render(
                    img,
                    frame_verts,
                    cam=frame_cam,
                    color=mc,
                    mesh_filename=mesh_filename,
                )
                side_img = renderer.render(
                    side_img,
                    frame_verts,
                    cam=frame_cam,
                    color=mc,
                    angle=270,
                    axis=[0, 1, 0],
                )
            img = np.concatenate([img, side_img], axis=1)
            cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img)
            if args.display:
                cv2.imshow('Video', img)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        if args.display:
            cv2.destroyAllWindows()

    print('================= END =================')
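# --- Hedged reconstruction (not copied from the repo) ---------------------------
# A sketch of the kind of mapping convert_crop_cam_to_orig_img performs: the
# network predicts a weak-perspective camera (s, tx, ty) in the 224x224 crop, and
# rendering needs it expressed relative to the full frame. Shapes match the
# "T x 3 in / T x 4 out" comments above. It assumes a square crop whose side
# length is bbox[:, 2]; treat this as an approximation and consult the repository's
# demo utilities for the authoritative formula.
import numpy as np

def convert_crop_cam_to_orig_img_sketch(cam, bbox, img_width, img_height):
    cx, cy, h = bbox[:, 0], bbox[:, 1], bbox[:, 2]   # crop centre and (assumed square) size
    hw, hh = img_width / 2., img_height / 2.
    sx = cam[:, 0] * (1. / (img_width / h))          # rescale crop-space scale to image space
    sy = cam[:, 0] * (1. / (img_height / h))
    tx = ((cx - hw) / hw / sx) + cam[:, 1]           # shift crop centre into normalized image coords
    ty = ((cy - hh) / hh / sy) + cam[:, 2]
    return np.stack([sx, sy, tx, ty]).T              # (T, 4)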
def main(args): if args.device == 'cpu': device = torch.device('cpu') print('Running on CPU') else: device = torch.device('cuda') print('Running on GPU') if args.vid_file: video_file = args.vid_file if not os.path.isfile(video_file): exit(f'Input video \"{video_file}\" does not exist!') else: image_file = args.img_file if not os.path.isfile(image_file): exit(f'Input video \"{image_file}\" does not exist!') output_path = os.path.join( args.output_folder, os.path.basename(video_file).replace('.mp4', '')) # output_path = os.path.join(args.output_folder, os.path.basename(video_file).split('.')[0]) os.makedirs(output_path, exist_ok=True) image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True) print(f'Input video number of frames {num_frames}') orig_height, orig_width = img_shape[:2] total_time = time.time() # resize video if too big # ffmpeg -i input.avi -filter:v scale=720:-1 -c:a copy output.mkv # ========= Run tracking ========= # bbox_scale = 1.1 if args.tracking_method == 'pose': if not os.path.isabs(video_file): video_file = os.path.join(os.getcwd(), video_file) tracking_results = run_posetracker(video_file, staf_folder=args.staf_dir, display=args.display) else: # run multi object tracker mot = MPT( device=device, batch_size=args.tracker_batch_size, display=args.display, detector_type=args.detector, output_format='dict', yolo_img_size=args.yolo_img_size, ) tracking_results = mot(image_folder) import pdb pdb.set_trace # remove tracklets if num_frames is less than MIN_NUM_FRAMES for person_id in list(tracking_results.keys()): if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES: del tracking_results[person_id] # ========= Define VIBE model ========= # model = VIBE_Demo( seqlen=16, n_layers=2, hidden_size=1024, add_linear=True, use_residual=True, ).to(device) # ========= Load pretrained weights ========= # pretrained_file = download_ckpt(use_3dpw=True) ckpt = torch.load(pretrained_file) print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}') ckpt = ckpt['gen_state_dict'] model.load_state_dict(ckpt, strict=False) model.eval() print(f'Loaded pretrained weights from \"{pretrained_file}\"') # ========= Run VIBE on each person ========= # print(f'Running VIBE on each tracklet...') vibe_time = time.time() vibe_results = {} for person_id in tqdm(list(tracking_results.keys())): bboxes = joints2d = None if args.tracking_method == 'bbox': bboxes = tracking_results[person_id]['bbox'] elif args.tracking_method == 'pose': joints2d = tracking_results[person_id]['joints2d'] frames = tracking_results[person_id]['frames'] dataset = Inference( image_folder=image_folder, frames=frames, bboxes=bboxes, joints2d=joints2d, scale=bbox_scale, ) bboxes = dataset.bboxes frames = dataset.frames has_keypoints = True if joints2d is not None else False dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=16) with torch.no_grad(): pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], [] for batch in dataloader: if has_keypoints: batch, nj2d = batch norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3)) batch = batch.unsqueeze(0) batch = batch.to(device) batch_size, seqlen = batch.shape[:2] output = model(batch)[-1] pred_cam.append(output['theta'][:, :, :3].reshape( batch_size * seqlen, -1)) pred_verts.append(output['verts'].reshape( batch_size * seqlen, -1, 3)) pred_pose.append(output['theta'][:, :, 3:75].reshape( batch_size * seqlen, -1)) pred_betas.append(output['theta'][:, :, 75:].reshape( 
batch_size * seqlen, -1)) pred_joints3d.append(output['kp_3d'].reshape( batch_size * seqlen, -1, 3)) pred_cam = torch.cat(pred_cam, dim=0) pred_verts = torch.cat(pred_verts, dim=0) pred_pose = torch.cat(pred_pose, dim=0) pred_betas = torch.cat(pred_betas, dim=0) pred_joints3d = torch.cat(pred_joints3d, dim=0) del batch # ========= [Optional] run Temporal SMPLify to refine the results ========= # if args.run_smplify and args.tracking_method == 'pose': norm_joints2d = np.concatenate(norm_joints2d, axis=0) norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin') norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device) # Run Temporal SMPLify update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \ new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner( pred_rotmat=pred_pose, pred_betas=pred_betas, pred_cam=pred_cam, j2d=norm_joints2d, device=device, batch_size=norm_joints2d.shape[0], pose2aa=False, ) # update the parameters after refinement print( f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}' ) pred_verts = pred_verts.cpu() pred_cam = pred_cam.cpu() pred_pose = pred_pose.cpu() pred_betas = pred_betas.cpu() pred_joints3d = pred_joints3d.cpu() pred_verts[update] = new_opt_vertices[update] pred_cam[update] = new_opt_cam[update] pred_pose[update] = new_opt_pose[update] pred_betas[update] = new_opt_betas[update] pred_joints3d[update] = new_opt_joints3d[update] elif args.run_smplify and args.tracking_method == 'bbox': print( '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!' ) print('[WARNING] Continuing without running Temporal SMPLify!..') # ========= Save results to a pickle file ========= # pred_cam = pred_cam.cpu().numpy() pred_verts = pred_verts.cpu().numpy() pred_pose = pred_pose.cpu().numpy() pred_betas = pred_betas.cpu().numpy() pred_joints3d = pred_joints3d.cpu().numpy() orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam, bbox=bboxes, img_width=orig_width, img_height=orig_height) output_dict = { 'pred_cam': pred_cam, 'orig_cam': orig_cam, 'verts': pred_verts, 'pose': pred_pose, 'betas': pred_betas, 'joints3d': pred_joints3d, 'joints2d': joints2d, 'bboxes': bboxes, 'frame_ids': frames, } vibe_results[person_id] = output_dict del model end = time.time() fps = num_frames / (end - vibe_time) print(f'VIBE FPS: {fps:.2f}') total_time = time.time() - total_time print( f'Total time spent: {total_time:.2f} seconds (including model loading time).' ) print( f'Total FPS (including model loading time): {num_frames / total_time:.2f}.' ) print( f'Saving output results to \"{os.path.join(output_path, "vibe_output.pkl")}\".' 
) joblib.dump(vibe_results, os.path.join(output_path, "vibe_output.pkl")) if not args.no_render: # ========= Render results as a single video ========= # renderer = Renderer(resolution=(orig_width, orig_height), orig_img=True, wireframe=args.wireframe) output_img_folder = f'{image_folder}_images' os.makedirs(output_img_folder, exist_ok=True) print(f'Rendering output video, writing frames to {output_img_folder}') output_pose_folder = f'{image_folder}_poses' os.makedirs(output_pose_folder, exist_ok=True) print(f'Saving poses to {output_pose_folder}') # prepare results for rendering from numpy import save save(f'{os.path.basename(video_file)}_poses.npy', vibe_results[1]['joints3d'][:, :25, :]) print('Saving numpy poses file to' + f'{video_file}_poses.npy') frame_results = prepare_rendering_results( vibe_results, num_frames) # returns a list of dicts (one dict for each person) mesh_color = { k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0) for k in vibe_results.keys() } image_file_names = sorted([ os.path.join(image_folder, x) for x in os.listdir(image_folder) if x.endswith('.png') or x.endswith('.jpg') ]) for frame_idx in tqdm(range(len(image_file_names))): img_fname = image_file_names[frame_idx] img = cv2.imread(img_fname) if args.sideview: side_img = np.zeros_like(img) for person_id, person_data in frame_results[frame_idx].items(): frame_verts = person_data['verts'] frame_cam = person_data['cam'] frame_pose = person_data['joints3d'][:25] mc = mesh_color[person_id] mesh_filename = None if args.save_obj: mesh_folder = os.path.join(output_path, 'meshes', f'{person_id:04d}') os.makedirs(mesh_folder, exist_ok=True) mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj') # bgr image (opencv format) img = renderer.render( img, frame_verts, cam=frame_cam, color=mc, mesh_filename=mesh_filename, ) # import pdb; pdb.set_trace() # Create a 3D projection and save as img # pose is mirrored # plot_skeleton(output_pose_folder, frame_idx, frame_pose) if args.sideview: side_img = renderer.render( side_img, frame_verts, cam=frame_cam, color=mc, angle=270, axis=[0, 1, 0], ) if args.sideview: img = np.concatenate([img, side_img], axis=1) # concatenate pose img with this image before writing cv2.imwrite( os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img) if args.display: cv2.imshow('Video', img) if cv2.waitKey(1) & 0xFF == ord('q'): break if args.display: cv2.destroyAllWindows() # ========= Save rendered video ========= # vid_name = os.path.basename(video_file) save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4' save_name = os.path.join(output_path, save_name) print(f'Saving result video to {save_name}') images_to_video(img_folder=output_img_folder, output_vid_file=save_name) # shutil.rmtree(output_img_folder) shutil.rmtree(image_folder) print('================= END =================')
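# --- Sketch: reading back the saved .npy poses (file name is hypothetical) ------
# This variant additionally saves the first 25 joints of person 1's joints3d with
# numpy.save. The snippet below loads that file and scatters one frame in 3D; the
# exact joint ordering is not asserted here.
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401  (registers the '3d' projection)

poses = np.load('my_video.mp4_poses.npy')  # hypothetical file produced by the save() call above
print('poses shape:', poses.shape)         # expected (num_frames, 25, 3)

frame = poses[0]
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(frame[:, 0], frame[:, 1], frame[:, 2])
ax.set_title('frame 0, first 25 predicted joints')
plt.savefig('frame0_joints.png')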
def main(args): device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') video_file = args.vid_file # ========= [Optional] download the youtube video ========= # if video_file.startswith('https://www.youtube.com'): print(f'Donwloading YouTube video \"{video_file}\"') video_file = download_youtube_clip(video_file, '/tmp') if video_file is None: exit('Youtube url is not valid!') print(f'YouTube Video has been downloaded to {video_file}...') if not os.path.isfile(video_file): exit(f'Input video \"{video_file}\" does not exist!') output_path = os.path.join(args.output_folder, os.path.basename(video_file).replace('.mp4', '')) os.makedirs(output_path, exist_ok=True) image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True) print(f'Input video number of frames {num_frames}') orig_height, orig_width = img_shape[:2] total_time = time.time() # ========= Run tracking ========= # bbox_scale = 1.1 if args.tracking_method == 'pose': if not os.path.isabs(video_file): video_file = os.path.join(os.getcwd(), video_file) tracking_results = run_posetracker(video_file, staf_folder=args.staf_dir, display=args.display) else: # run multi object tracker mot = MPT( device=device, batch_size=args.tracker_batch_size, display=args.display, detector_type=args.detector, output_format='dict', yolo_img_size=args.yolo_img_size, ) tracking_results = mot(image_folder) # remove tracklets if num_frames is less than MIN_NUM_FRAMES for person_id in list(tracking_results.keys()): if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES: del tracking_results[person_id] # ========= Define VIBE model ========= # model = VIBE_Demo( seqlen=16, n_layers=2, hidden_size=1024, add_linear=True, use_residual=True, ).to(device) # ========= Load pretrained weights ========= # pretrained_file = download_ckpt(use_3dpw=False) ckpt = torch.load(pretrained_file) print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}') ckpt = ckpt['gen_state_dict'] model.load_state_dict(ckpt, strict=False) model.eval() print(f'Loaded pretrained weights from \"{pretrained_file}\"') # ========= Run VIBE on each person ========= # print(f'Running VIBE on each tracklet...') vibe_time = time.time() vibe_results = {} for person_id in tqdm(list(tracking_results.keys())): bboxes = joints2d = None if args.tracking_method == 'bbox': bboxes = tracking_results[person_id]['bbox'] elif args.tracking_method == 'pose': joints2d = tracking_results[person_id]['joints2d'] frames = tracking_results[person_id]['frames'] dataset = Inference( image_folder=image_folder, frames=frames, bboxes=bboxes, joints2d=joints2d, scale=bbox_scale, ) bboxes = dataset.bboxes frames = dataset.frames has_keypoints = True if joints2d is not None else False dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=16) with torch.no_grad(): pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, smpl_joints2d, norm_joints2d = [], [], [], [], [], [], [] for batch in dataloader: if has_keypoints: batch, nj2d = batch norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3)) batch = batch.unsqueeze(0) batch = batch.to(device) batch_size, seqlen = batch.shape[:2] output = model(batch)[-1] pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1)) pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3)) pred_pose.append(output['theta'][:,:,3:75].reshape(batch_size * seqlen, -1)) pred_betas.append(output['theta'][:, :,75:].reshape(batch_size * seqlen, -1)) 
                pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3))
                smpl_joints2d.append(output['kp_2d'].reshape(batch_size * seqlen, -1, 2))  # (T, n_joints, 2)

            pred_cam = torch.cat(pred_cam, dim=0)
            pred_verts = torch.cat(pred_verts, dim=0)
            pred_pose = torch.cat(pred_pose, dim=0)
            pred_betas = torch.cat(pred_betas, dim=0)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)
            smpl_joints2d = torch.cat(smpl_joints2d, dim=0)
            del batch

        # ========= [Optional] run Temporal SMPLify to refine the results ========= #
        if args.run_smplify and args.tracking_method == 'pose':
            norm_joints2d = np.concatenate(norm_joints2d, axis=0)
            norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
            norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

            # Run Temporal SMPLify
            update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
            new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                pred_rotmat=pred_pose,
                pred_betas=pred_betas,
                pred_cam=pred_cam,
                j2d=norm_joints2d,
                device=device,
                batch_size=norm_joints2d.shape[0],
                pose2aa=False,
            )
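# --- Conceptual sketch (not the repository's projection code) --------------------
# This is the only variant that also collects 'kp_2d'. With a weak-perspective
# camera (s, tx, ty), projecting 3D joints to normalized 2D crop coordinates is
# just a scale-and-translate on x/y, roughly as illustrated below.
import torch

def weak_perspective_project(joints3d, cam):
    """joints3d: (N, J, 3), cam: (N, 3) as (s, tx, ty) -> (N, J, 2)."""
    s = cam[:, 0:1].unsqueeze(-1)        # (N, 1, 1)
    t = cam[:, 1:3].unsqueeze(1)         # (N, 1, 2)
    return s * (joints3d[:, :, :2] + t)  # broadcast over joints

joints3d = torch.randn(4, 49, 3)
cam = torch.tensor([[0.9, 0.0, 0.1]]).repeat(4, 1)
print(weak_perspective_project(joints3d, cam).shape)  # torch.Size([4, 49, 2])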
def main(args): # check GPU availability device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') # get input video video_file = args.input_video # save hand info save_hand_csv = args.save_hand_csv # ========= [Optional] download the youtube video ========= # if video_file.startswith('https://www.youtube.com'): print(f'Donwloading YouTube video \"{video_file}\"') video_file = download_youtube_clip(video_file, '/tmp') if video_file is None: exit('Youtube url is not valid!') print(f'YouTube Video has been downloaded to {video_file}...') # check video existence if not os.path.isfile(video_file): exit(f'Input video \"{video_file}\" does not exist!') # set output flles output_path = args.output_folder image_folder, num_frames, img_shape = video_to_images(video_file, \ "/tmp/" + output_path.split("/")[-1], return_info=True) print(f'Input video number of frames {num_frames}') orig_height, orig_width = img_shape[:2] # get the frame rate (frames per second) of the input video video = cv2.VideoCapture(video_file) fps = video.get(cv2.CAP_PROP_FPS) # ========= Run tracking ========= # bbox_scale = 1.1 # run multi object tracker mot = MPT( device=device, batch_size=args.tracker_batch_size, detector_type=args.detector, output_format='dict', yolo_img_size=args.yolo_img_size, ) tracking_results = mot(image_folder) # only focus on the frame with the first person largest_num_frames = 0 largest_person = None for person_id in list(tracking_results.keys()): num_frames = tracking_results[person_id]['frames'].shape[0] if num_frames <= largest_num_frames: del tracking_results[person_id] else: largest_num_frames = tracking_results[person_id]['frames'].shape[0] if largest_person != None: del tracking_results[largest_person] largest_person = person_id # ========= Define VIBE model ========= # model = VIBE_Demo( seqlen=16, n_layers=2, hidden_size=1024, add_linear=True, use_residual=True, ).to(device) # ========= Load pretrained weights ========= # pretrained_file = download_ckpt(use_3dpw=False) ckpt = torch.load(pretrained_file) print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}') ckpt = ckpt['gen_state_dict'] model.load_state_dict(ckpt, strict=False) model.eval() print(f'Loaded pretrained weights from \"{pretrained_file}\"') # ========= Run VIBE on each person ========= # print(f'Running VIBE on each tracklet...') vibe_results = {} for person_id in tqdm(list(tracking_results.keys())): bboxes = tracking_results[person_id]['bbox'] frames = tracking_results[person_id]['frames'] np.save(output_path + "/frames", frames) # inference data of each person dataset = Inference( image_folder=image_folder, frames=frames, bboxes=bboxes, scale=bbox_scale, ) bboxes = dataset.bboxes frames = dataset.frames # load data dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=16) # extract data with torch.no_grad(): pred_cam, pred_verts, pred_pose = [], [], [] for batch in dataloader: batch = batch.unsqueeze(0) batch = batch.to(device) batch_size, seqlen = batch.shape[:2] output = model(batch)[-1] pred_cam.append(output['theta'][:, :, :3].reshape( batch_size * seqlen, -1)) pred_verts.append(output['verts'].reshape( batch_size * seqlen, -1, 3)) pred_pose.append(output['theta'][:, :, 3:75].reshape( batch_size * seqlen, -1)) pred_cam = torch.cat(pred_cam, dim=0) pred_verts = torch.cat(pred_verts, dim=0) pred_pose = torch.cat(pred_pose, dim=0) del batch # ========= Save results to a pickle file ========= # pred_cam = pred_cam.cpu().numpy() pred_verts = 
pred_verts.cpu().numpy() pred_pose = pred_pose.cpu().numpy() # get camera pose orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam, bbox=bboxes, img_width=orig_width, img_height=orig_height) # get result information output_dict = { 'pred_cam': pred_cam, 'orig_cam': orig_cam, 'verts': pred_verts, 'pose': pred_pose, 'bboxes': bboxes, 'frame_ids': frames, } np.savetxt(output_path + "/pred_cam.csv", pred_cam, delimiter=",") np.savetxt(output_path + "/orig_cam.csv", orig_cam, delimiter=",") vibe_results[person_id] = output_dict del model frame_results = prepare_rendering_results(vibe_results, len(frames)) np.save(output_path + "/frame_results", frame_results) np.save(output_path + "/image_folder", image_folder) np.save(output_path + "/orig_width", orig_width) np.save(output_path + "/orig_height", orig_height)
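# --- Sketch: loading the camera CSVs saved above (paths assumed) -----------------
# np.savetxt wrote comma-separated rows for pred_cam and orig_cam, and np.save
# appended ".npy" to the frames file; this just loads them back and checks shapes.
import numpy as np

output_path = 'output'  # whatever args.output_folder was
pred_cam = np.loadtxt(output_path + '/pred_cam.csv', delimiter=',')  # (T, 3) crop-space camera
orig_cam = np.loadtxt(output_path + '/orig_cam.csv', delimiter=',')  # (T, 4) full-image camera
frames = np.load(output_path + '/frames.npy')                        # frame indices for this person
print(pred_cam.shape, orig_cam.shape, frames.shape)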
def main(args): device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') dict = { 'bridge': 1, 'childs': 2, 'downwarddog': 3, 'mountain': 4, 'plank': 5, 'seatedforwardbend': 6, 'tree': 7, 'trianglepose': 8, 'warrior1': 9, 'warrior2': 10 } dir_path = '/home/ubuntu/PoseEstimation/VIBE/InputData/input_test_set/' output_folder = '/home/ubuntu/PoseEstimation/VIBE/OutputData/test_set/' joints3D_csv = open('output_joints3d_dog.csv', 'a') pose_csv = open('output_pose.csv_dog', 'a') # ========= Define VIBE model ========= # model = VIBE_Demo( seqlen=16, n_layers=2, hidden_size=1024, add_linear=True, use_residual=True, ).to(device) # ========= Load pretrained weights ========= # pretrained_file = download_ckpt(use_3dpw=False) ckpt = torch.load(pretrained_file) print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}') ckpt = ckpt['gen_state_dict'] model.load_state_dict(ckpt, strict=False) model.eval() print(f'Loaded pretrained weights from \"{pretrained_file}\"') video_file = '/home/ubuntu/PoseEstimation/VIBE/DogVideo.mp4' video_label = dict['bridge'] if not os.path.isfile(video_file): exit(f'Input video \"{video_file}\" does not exist!') image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True) print(f'Input video number of frames {num_frames}') orig_height, orig_width = img_shape[:2] total_time = time.time() # ========= Run tracking ========= # bbox_scale = 1.1 if args.tracking_method == 'pose': if not os.path.isabs(video_file): video_file = os.path.join(os.getcwd(), video_file) tracking_results = run_posetracker(video_file, staf_folder=args.staf_dir, display=args.display) else: # run multi object tracker mot = MPT( device=device, batch_size=args.tracker_batch_size, display=args.display, detector_type=args.detector, output_format='dict', yolo_img_size=args.yolo_img_size, ) tracking_results = mot(image_folder) # remove tracklets if num_frames is less than MIN_NUM_FRAMES for person_id in list(tracking_results.keys()): if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES: del tracking_results[person_id] # ========= Run VIBE on each person ========= # print(f'Running VIBE on each tracklet...') vibe_time = time.time() vibe_results = {} for person_id in tqdm(list(tracking_results.keys())): bboxes = joints2d = None if args.tracking_method == 'bbox': bboxes = tracking_results[person_id]['bbox'] elif args.tracking_method == 'pose': joints2d = tracking_results[person_id]['joints2d'] frames = tracking_results[person_id]['frames'] dataset = Inference( image_folder=image_folder, frames=frames, bboxes=bboxes, joints2d=joints2d, scale=bbox_scale, ) bboxes = dataset.bboxes frames = dataset.frames has_keypoints = True if joints2d is not None else False dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=16) with torch.no_grad(): pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], [] for batch in dataloader: if has_keypoints: batch, nj2d = batch norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3)) batch = batch.unsqueeze(0) batch = batch.to(device) batch_size, seqlen = batch.shape[:2] output = model(batch)[-1] pred_cam.append(output['theta'][:, :, :3].reshape( batch_size * seqlen, -1)) pred_verts.append(output['verts'].reshape( batch_size * seqlen, -1, 3)) pred_pose.append(output['theta'][:, :, 3:75].reshape( batch_size * seqlen, -1)) pred_betas.append(output['theta'][:, :, 75:].reshape( batch_size * seqlen, -1)) pred_joints3d.append(output['kp_3d'].reshape( 
batch_size * seqlen, -1, 3)) pred_cam = torch.cat(pred_cam, dim=0) pred_verts = torch.cat(pred_verts, dim=0) pred_pose = torch.cat(pred_pose, dim=0) pred_betas = torch.cat(pred_betas, dim=0) pred_joints3d = torch.cat(pred_joints3d, dim=0) del batch # ========= [Optional] run Temporal SMPLify to refine the results ========= # if args.run_smplify and args.tracking_method == 'pose': norm_joints2d = np.concatenate(norm_joints2d, axis=0) norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin') norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device) # Run Temporal SMPLify update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \ new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner( pred_rotmat=pred_pose, pred_betas=pred_betas, pred_cam=pred_cam, j2d=norm_joints2d, device=device, batch_size=norm_joints2d.shape[0], pose2aa=False, ) # update the parameters after refinement print( f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}' ) pred_verts = pred_verts.cpu() pred_cam = pred_cam.cpu() pred_pose = pred_pose.cpu() pred_betas = pred_betas.cpu() pred_joints3d = pred_joints3d.cpu() pred_verts[update] = new_opt_vertices[update] pred_cam[update] = new_opt_cam[update] pred_pose[update] = new_opt_pose[update] pred_betas[update] = new_opt_betas[update] pred_joints3d[update] = new_opt_joints3d[update] elif args.run_smplify and args.tracking_method == 'bbox': print( '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!' ) print('[WARNING] Continuing without running Temporal SMPLify!..') # ========= Save results to a pickle file ========= # pred_cam = pred_cam.cpu().numpy() pred_verts = pred_verts.cpu().numpy() pred_pose = pred_pose.cpu().numpy() pred_betas = pred_betas.cpu().numpy() pred_joints3d = pred_joints3d.cpu().numpy() # Runs 1 Euro Filter to smooth out the results if args.smooth: min_cutoff = args.smooth_min_cutoff # 0.004 beta = args.smooth_beta # 1.5 print( f'Running smoothing on person {person_id}, min_cutoff: {min_cutoff}, beta: {beta}' ) pred_verts, pred_pose, pred_joints3d = smooth_pose( pred_pose, pred_betas, min_cutoff=min_cutoff, beta=beta) orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam, bbox=bboxes, img_width=orig_width, img_height=orig_height) output_dict = { 'pred_cam': pred_cam, 'orig_cam': orig_cam, 'verts': pred_verts, 'pose': pred_pose, 'betas': pred_betas, 'joints3d': pred_joints3d, 'joints2d': joints2d, 'bboxes': bboxes, 'frame_ids': frames, } for i in range(len(output_dict['joints3d'])): if (i % 5 == 0): flat_arr = output_dict['joints3d'][i].flatten() len_N = len(flat_arr) np.savetxt(joints3D_csv, [np.append(flat_arr, [video_label])], delimiter=',', fmt=' '.join(['%f'] * len_N + ['%i'])) for i in range(len(output_dict['pose'])): if (i % 5 == 0): pose_arr = output_dict['pose'][i].flatten() len_M = len(pose_arr) np.savetxt(pose_csv, [np.append(pose_arr, [video_label])], delimiter=',', fmt=' '.join(['%f'] * len_M + ['%i'])) end = time.time() fps = num_frames / (end - vibe_time) print(f'VIBE FPS: {fps:.2f}') total_time = time.time() - total_time print( f'Total time spent: {total_time:.2f} seconds (including model loading time).' ) print( f'Total FPS (including model loading time): {num_frames / total_time:.2f}.' )
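# --- Sketch: reading the labelled joint rows written above ------------------------
# Each saved row is a flattened joints3d (or pose) vector for every 5th frame,
# followed by the integer class id from the pose dictionary. Note that the
# np.savetxt fmt string joins fields with spaces, so despite delimiter=',' the
# rows come out space-separated; np.loadtxt's default whitespace handling reads them.
import numpy as np

data = np.loadtxt('output_joints3d_dog.csv')  # (rows, n_joints*3 + 1)
features = data[:, :-1]                       # flattened 3D joints per sampled frame
labels = data[:, -1].astype(int)              # pose class id (1..10 per the dict above)
print(features.shape, labels.shape, np.unique(labels))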
def main(args): device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') SEQ_LENGTH = args.sequence_length MIN_NUM_FRAMES = 1 # Don't change this TRACKER_BATCH_SIZE = MIN_NUM_FRAMES images_to_eval = [] yolo_img_size = args.yolo_img_size image_folder = 'live_rendered_images' output_path = args.output_folder os.makedirs(image_folder, exist_ok=True) os.makedirs(output_path, exist_ok=True) os.makedirs('live_imgs', exist_ok=True) model = VIBE_Demo(seqlen=SEQ_LENGTH, n_layers=2, hidden_size=1024, add_linear=True, use_residual=True, live_inference=True).to(device) pretrained_file = download_ckpt(use_3dpw=False) ckpt = torch.load(pretrained_file) ckpt = ckpt['gen_state_dict'] model.load_state_dict(ckpt, strict=False) model.eval() print(f'Loaded pretrained weights from \"{pretrained_file}\"') mot = MPT( device=device, batch_size=TRACKER_BATCH_SIZE, display=False, detector_type=args.detector, output_format='dict', yolo_img_size=yolo_img_size, ) # An asynchronous camera implementation to run cv2 camera in background while model is running cap = AsyncCamera(0, display=args.live_display) bbox_scale = 1.1 i = 0 bbox_lis, frame_lis, images_lis, joints2d_lis = [], [], [], [] pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], [] while (True): # If q is pressed cap.stop will turn True if (cap.stop): break ret, captured_frames = cap.read() if (not ret): continue if (len(captured_frames) < MIN_NUM_FRAMES): continue images = get_images_from_captures(captured_frames, MIN_NUM_FRAMES) cap.del_frame_lis() orig_height, orig_width = images[0].shape[:2] orig_dim = (orig_height, orig_width) saveToDir(images) if args.tracking_method == 'pose': images_to_video('./live_imgs', './live_imgs/pose_video.mp4') tracking_results = run_posetracker('live_imgs/pose_video.mp4', staf_folder=args.staf_dir, display=args.display) else: tracking_results = mot('./live_imgs') if args.live_display: cap.set_display_image(images[-1]) if (len(tracking_results.keys()) == 0): print('Unable to detect any person') for image in images: images_lis.append(image) if len(tracking_results.keys()) != 0: person_id = (list)(tracking_results.keys())[0] print(person_id) frames = tracking_results[person_id]['frames'] bboxes, joints2d = None, None if args.tracking_method == 'pose': joints2d = tracking_results[person_id]['joints2d'] if (joints2d_lis == []): joints2d_lis = joints2d else: joints2d_lis = np.vstack([joints2d_lis, joints2d]) else: bboxes = tracking_results[person_id]['bbox'] if (bbox_lis == []): bbox_lis = bboxes else: bbox_lis = np.vstack([bbox_lis, bboxes]) for x in (1 + i + frames - MIN_NUM_FRAMES): frame_lis.append(x) dataset = LiveInference( images=images_lis[-SEQ_LENGTH:], frames=frame_lis[-SEQ_LENGTH:], bboxes=bbox_lis[-SEQ_LENGTH:], joints2d=joints2d_lis[-SEQ_LENGTH:] if joints2d is not None else None, scale=bbox_scale, ) bboxes = dataset.bboxes if args.tracking_method == 'pose': if (bbox_lis == []): bbox_lis = bboxes else: bbox_lis = np.vstack([bbox_lis, bboxes[-1:]]) cap.set_bounding_box(bbox_lis[-1]) has_keypoints = True if joints2d is not None else False norm_joints2d = [] with torch.no_grad(): # A manual implementation for getting data since dataloader is slow for few inputs tup = [ dataset.__getitem__(x) for x in range(dataset.__len__()) ] if has_keypoints: for j, batch in enumerate(tup): tup[j], nj2d = batch norm_joints2d.append(nj2d[:21, :].reshape(-1, 21, 3)) for j, x in enumerate(tup): tup[j] = x.unsqueeze(0) tup = tuple(tup) batch = torch.cat((tup), 0) batch 
= batch.unsqueeze(0) batch = batch.to(device) batch_size, seqlen = batch.shape[:2] # Send only latest image to hmr for faster inferencing output = model(batch[:, -1:, :, :, :])[-1] pred_cam.append( output['theta'][:, -MIN_NUM_FRAMES:, :3].reshape( batch_size * MIN_NUM_FRAMES, -1)) pred_verts.append( output['verts'][:, -MIN_NUM_FRAMES:, ].reshape( batch_size * MIN_NUM_FRAMES, -1, 3)) pred_pose.append( output['theta'][:, -MIN_NUM_FRAMES:, ][:, :, 3:75].reshape( batch_size * MIN_NUM_FRAMES, -1)) pred_betas.append( output['theta'][:, -MIN_NUM_FRAMES:, ][:, :, 75:].reshape( batch_size * MIN_NUM_FRAMES, -1)) pred_joints3d.append( output['kp_3d'][:, -MIN_NUM_FRAMES:, ].reshape( batch_size * MIN_NUM_FRAMES, -1, 3)) del batch pred_verts[-MIN_NUM_FRAMES:], pred_cam[ -MIN_NUM_FRAMES:], pred_pose[-MIN_NUM_FRAMES:], pred_betas[ -MIN_NUM_FRAMES:], pred_joints3d[ -MIN_NUM_FRAMES:], norm_joints2d[ -MIN_NUM_FRAMES:] = temporal_simplify( pred_verts[-MIN_NUM_FRAMES:], pred_cam[-MIN_NUM_FRAMES:], pred_pose[-MIN_NUM_FRAMES:], pred_betas[-MIN_NUM_FRAMES:], pred_joints3d[-MIN_NUM_FRAMES:], norm_joints2d[-MIN_NUM_FRAMES:], device, args) get_vibe_results( pred_cam[-MIN_NUM_FRAMES:], pred_verts[-MIN_NUM_FRAMES:], pred_pose[-MIN_NUM_FRAMES:], pred_betas[-MIN_NUM_FRAMES:], pred_joints3d[-MIN_NUM_FRAMES:], joints2d_lis[-MIN_NUM_FRAMES:], bbox_lis[-MIN_NUM_FRAMES:], frame_lis[-MIN_NUM_FRAMES], orig_dim, 0) images = [] i = i + 1 if (i == args.max_frames): break del model vibe_results = get_vibe_results(pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, joints2d_lis, bbox_lis, frame_lis, orig_dim, 0) if not args.no_render: for i, image in enumerate(images_lis): cv2.imwrite(f'{image_folder}/{(i):06d}.jpg', image) print(frame_lis) render(orig_dim, frame_lis, vibe_results, image_folder, output_path, len(images_lis), args) shutil.rmtree('live_imgs') print('================= END =================')
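# --- Hypothetical minimal stand-in (not the repo's AsyncCamera) ------------------
# The live demo above relies on an asynchronous camera so inference never blocks on
# I/O. A bare-bones version of that idea: grab frames from cv2.VideoCapture on a
# background thread and hand accumulated frames to the caller on read().
import threading
import cv2


class SimpleAsyncCamera:
    def __init__(self, src=0):
        self.cap = cv2.VideoCapture(src)
        self.frames = []
        self.stop = False
        self.lock = threading.Lock()
        threading.Thread(target=self._loop, daemon=True).start()

    def _loop(self):
        while not self.stop:
            ok, frame = self.cap.read()
            if not ok:
                continue
            with self.lock:
                self.frames.append(frame)

    def read(self):
        # return all frames captured since the last call
        with self.lock:
            frames, self.frames = self.frames, []
        return len(frames) > 0, frames

    def release(self):
        self.stop = True
        self.cap.release()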
# ========= Define VIBE model ========= #
model = VIBE_Demo(
    seqlen=16,
    n_layers=2,
    hidden_size=1024,
    add_linear=True,
    use_residual=True,
).to(device)

# ========= Load pretrained weights ========= #
pretrained_file = download_ckpt(use_3dpw=False)
ckpt = torch.load(pretrained_file)
print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
ckpt = ckpt['gen_state_dict']
model.load_state_dict(ckpt, strict=False)
model.eval()
print(f'Loaded pretrained weights from \"{pretrained_file}\"')

for person_id in tqdm(sorted(list(ped.keys()))):
    bboxes = joints2d = None
    bboxes = ped[person_id]['bbox']
    frames = ped[person_id]['frames']
    for frame_one in frames:
        if frame_one not in frames_ped_veh:
            frames_ped_veh[frame_one] = {'ped_id': set(), 'veh_id': set()}
        frames_ped_veh[frame_one]['ped_id'].add(person_id)
    dataset = Inference(
        image_folder=image_folder,
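# --- Small usage sketch (frame number is hypothetical) ---------------------------
# frames_ped_veh built above maps each frame index to the set of pedestrian (and,
# presumably elsewhere in this script, vehicle) track ids visible in it, so
# co-occurrence queries are a single dictionary lookup.
frame_no = 100
if frame_no in frames_ped_veh:
    info = frames_ped_veh[frame_no]
    print(f"frame {frame_no}: pedestrians={sorted(info['ped_id'])}, vehicles={sorted(info['veh_id'])}")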
def main(args): device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') video_file = args.vid_file # ========= [Optional] download the youtube video ========= # if video_file.startswith('https://www.youtube.com'): print(f'Donwloading YouTube video \"{video_file}\"') video_file = download_youtube_clip(video_file, '/tmp') if video_file is None: exit('Youtube url is not valid!') print(f'YouTube Video has been downloaded to {video_file}...') if not os.path.isfile(video_file): exit(f'Input video \"{video_file}\" does not exist!') output_path = os.path.join(args.output_folder, os.path.basename(video_file).replace('.mp4', '')) os.makedirs(output_path, exist_ok=True) image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True) print(f'Input video number of frames {num_frames}') orig_height, orig_width = img_shape[:2] total_time = time.time() # ========= Run tracking ========= # bbox_scale = 1.1 if args.tracking_method == 'pose': if not os.path.isabs(video_file): video_file = os.path.join(os.getcwd(), video_file) tracking_results = run_posetracker(video_file, staf_folder=args.staf_dir, display=args.display) else: # run multi object tracker mot = MPT( device=device, batch_size=args.tracker_batch_size, display=args.display, detector_type=args.detector, output_format='dict', yolo_img_size=args.yolo_img_size, ) tracking_results = mot(image_folder) # remove tracklets if num_frames is less than MIN_NUM_FRAMES for person_id in list(tracking_results.keys()): if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES: del tracking_results[person_id] # ========= Define VIBE model ========= # model = VIBE_Demo( seqlen=16, n_layers=2, hidden_size=1024, add_linear=True, use_residual=True, ).to(device) # ========= Load pretrained weights ========= # pretrained_file = download_ckpt(use_3dpw=False) ckpt = torch.load(pretrained_file) print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}') ckpt = ckpt['gen_state_dict'] model.load_state_dict(ckpt, strict=False) model.eval() print(f'Loaded pretrained weights from \"{pretrained_file}\"') # ========= Run VIBE on each person ========= # print(f'Running VIBE on each tracklet...') vibe_time = time.time() vibe_results = {} for person_id in tqdm(list(tracking_results.keys())): bboxes = joints2d = None if args.tracking_method == 'bbox': bboxes = tracking_results[person_id]['bbox'] elif args.tracking_method == 'pose': joints2d = tracking_results[person_id]['joints2d'] frames = tracking_results[person_id]['frames'] dataset = Inference( image_folder=image_folder, frames=frames, bboxes=bboxes, joints2d=joints2d, scale=bbox_scale, ) bboxes = dataset.bboxes frames = dataset.frames has_keypoints = True if joints2d is not None else False # reduce the num of worker if you encountered the error: DLL load failed: The paging file is too small for this operation to complete dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=8) with torch.no_grad(): pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], [] for batch in dataloader: if has_keypoints: batch, nj2d = batch norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3)) batch = batch.unsqueeze(0) batch = batch.to(device) batch_size, seqlen = batch.shape[:2] output = model(batch)[-1] pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1)) pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3)) pred_pose.append(output['theta'][:,:,3:75].reshape(batch_size * seqlen, 
-1)) pred_betas.append(output['theta'][:, :,75:].reshape(batch_size * seqlen, -1)) pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3)) pred_cam = torch.cat(pred_cam, dim=0) pred_verts = torch.cat(pred_verts, dim=0) pred_pose = torch.cat(pred_pose, dim=0) pred_betas = torch.cat(pred_betas, dim=0) pred_joints3d = torch.cat(pred_joints3d, dim=0) del batch # ========= [Optional] run Temporal SMPLify to refine the results ========= # if args.run_smplify and args.tracking_method == 'pose': norm_joints2d = np.concatenate(norm_joints2d, axis=0) norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin') norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device) # Run Temporal SMPLify update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \ new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner( pred_rotmat=pred_pose, pred_betas=pred_betas, pred_cam=pred_cam, j2d=norm_joints2d, device=device, batch_size=norm_joints2d.shape[0], pose2aa=False, ) # update the parameters after refinement print(f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}') pred_verts = pred_verts.cpu() pred_cam = pred_cam.cpu() pred_pose = pred_pose.cpu() pred_betas = pred_betas.cpu() pred_joints3d = pred_joints3d.cpu() pred_verts[update] = new_opt_vertices[update] pred_cam[update] = new_opt_cam[update] pred_pose[update] = new_opt_pose[update] pred_betas[update] = new_opt_betas[update] pred_joints3d[update] = new_opt_joints3d[update] elif args.run_smplify and args.tracking_method == 'bbox': print('[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!') print('[WARNING] Continuing without running Temporal SMPLify!..') # ========= Save results to a pickle file ========= # pred_cam = pred_cam.cpu().numpy() pred_verts = pred_verts.cpu().numpy() pred_pose = pred_pose.cpu().numpy() pred_betas = pred_betas.cpu().numpy() pred_joints3d = pred_joints3d.cpu().numpy() orig_cam = convert_crop_cam_to_orig_img( cam=pred_cam, bbox=bboxes, img_width=orig_width, img_height=orig_height ) output_dict = { 'pred_cam': pred_cam, 'orig_cam': orig_cam, 'verts': pred_verts, 'pose': pred_pose, 'betas': pred_betas, 'joints3d': pred_joints3d, 'joints2d': joints2d, 'bboxes': bboxes, 'frame_ids': frames, } vibe_results[person_id] = output_dict del model end = time.time() fps = num_frames / (end - vibe_time) print(f'VIBE FPS: {fps:.2f}') total_time = time.time() - total_time print(f'Total time spent: {total_time:.2f} seconds (including model loading time).') print(f'Total FPS (including model loading time): {num_frames / total_time:.2f}.') print(f'Saving output results to \"{os.path.join(output_path, "vibe_output.pkl")}\".') joblib.dump(vibe_results, os.path.join(output_path, "vibe_output.pkl")) if not args.no_render: # ========= Render results as a single video ========= # renderer = Renderer(resolution=(orig_width, orig_height), orig_img=True, wireframe=args.wireframe) output_img_folder = f'{image_folder}_output' os.makedirs(output_img_folder, exist_ok=True) print(f'Rendering output video, writing frames to {output_img_folder}') # prepare results for rendering frame_results = prepare_rendering_results(vibe_results, num_frames) mesh_color = {k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0) for k in vibe_results.keys()} image_file_names = sorted([ os.path.join(image_folder, x) for x in os.listdir(image_folder) if x.endswith('.png') or x.endswith('.jpg') ]) for frame_idx in tqdm(range(len(image_file_names))): img_fname = 
image_file_names[frame_idx] img = cv2.imread(img_fname) if args.sideview: side_img = np.zeros_like(img) for person_id, person_data in frame_results[frame_idx].items(): frame_verts = person_data['verts'] frame_cam = person_data['cam'] mc = mesh_color[person_id] mesh_filename = None if args.save_obj: mesh_folder = os.path.join(output_path, 'meshes', f'{person_id:04d}') os.makedirs(mesh_folder, exist_ok=True) mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj') img = renderer.render( img, frame_verts, cam=frame_cam, color=mc, mesh_filename=mesh_filename, ) if args.sideview: side_img = renderer.render( side_img, frame_verts, cam=frame_cam, color=mc, angle=270, axis=[0,1,0], ) if args.sideview: img = np.concatenate([img, side_img], axis=1) font = cv2.FONT_HERSHEY_SIMPLEX x = 10 #position of text y = 20 #position of text cv2.putText(img, str(frame_idx), (x,y), font ,0.55,(0,255,0),1) cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img) if args.display: cv2.imshow('Video', img) if cv2.waitKey(1) & 0xFF == ord('q'): break if args.display: cv2.destroyAllWindows() # ========= Save rendered video ========= # vid_name = os.path.basename(video_file) save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4' save_name = os.path.join(output_path, save_name) print(f'Saving result video to {save_name}') images_to_video(img_folder=output_img_folder, output_vid_file=save_name) shutil.rmtree(output_img_folder) shutil.rmtree(image_folder) # generate and save the joints csv file for animating avatars later output = joblib.load(os.path.join(output_path, "vibe_output.pkl")) for i in output.keys(): print('Track ids:', i , end='\n\n') num_ppl = len(output.keys()) print('VIBE output file content:', end='\n\n') vid_name = os.path.basename(video_file) vibe_result_folder = output_path # output the pose result as csv # format: v_personId_numFrames pose_filename_list = [] for i in output.keys(): pose_filename = vibe_result_folder + "/" + vid_name + "_"+ str(i) + "_" + str(output[i]['pose'].shape[0]) + ".csv" pose_filename_list.append(pose_filename) field_names = [] for idx in range(73): # 72 -> 73 (+ frame_id at 0) field_names.append(str(idx)) with open(pose_filename, 'w', newline='') as file: writer = csv.writer(file) writer.writerow(field_names) for frame_id in range(len(output[i]['pose'])): output_data = [output[i]['frame_ids'][frame_id]] output_data.extend(output[i]['pose'][frame_id]) #print(output_data) writer.writerow(output_data) print('================= END =================')
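# --- Sketch: reading one of the per-person pose CSVs written above ---------------
# Column 0 is the frame id; columns 1..72 are SMPL pose parameters, i.e. 24 joints
# x 3 axis-angle values. The path is hypothetical and follows the
# videoName_personId_numFrames naming used by this script.
import numpy as np

rows = np.genfromtxt('output/myvid/myvid.mp4_1_300.csv', delimiter=',', skip_header=1)
frame_ids = rows[:, 0].astype(int)
pose = rows[:, 1:].reshape(len(rows), 24, 3)  # per-frame axis-angle for the 24 SMPL joints
print(frame_ids[:5], pose.shape)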
def main(args): device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) map_vals = { 'bridge': 1, 'childs': 2, 'downwarddog': 3, 'mountain': 4, 'plank': 5, 'seatedforwardbend': 6, 'tree': 7, 'trianglepose': 8, 'warrior1': 9, 'warrior2': 10 } inverse_map = { 1: 'bridge', 2: 'childs', 3: 'downwarddog', 4: 'mountain', 5: 'plank', 6: 'seatedforwardbend', 7: 'tree', 8: 'trianglepose', 9: 'warrior1', 10: 'warrior2' } video_file = args.vid_file # ========= [Optional] download the youtube video ========= # if video_file.startswith('https://www.youtube.com'): print(f'Donwloading YouTube video \"{video_file}\"') video_file = download_youtube_clip(video_file, '/tmp') if video_file is None: exit('Youtube url is not valid!') print(f'YouTube Video has been downloaded to {video_file}...') if not os.path.isfile(video_file): exit(f'Input video \"{video_file}\" does not exist!') dir_path = '/home/ubuntu/PoseEstimation/VIBE/InputData/input_test_set/' output_folder = '/home/ubuntu/PoseEstimation/VIBE/OutputData/' # ========= Define VIBE model ========= # model = VIBE_Demo( seqlen=16, n_layers=2, hidden_size=1024, add_linear=True, use_residual=True, ).to(device) # ========= Load Classification Model ========= # classification_model = pickle.load( open('view_classification_model.pkl', 'rb')) # ========= Load pretrained weights ========= # pretrained_file = download_ckpt(use_3dpw=False) ckpt = torch.load(pretrained_file) #print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}') ckpt = ckpt['gen_state_dict'] model.load_state_dict(ckpt, strict=False) model.eval() #print(f'Loaded pretrained weights from \"{pretrained_file}\"') image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True) print(f'Input video number of frames {num_frames}') orig_height, orig_width = img_shape[:2] total_time = time.time() # ========= Run tracking ========= # bbox_scale = 1.1 if args.tracking_method == 'pose': if not os.path.isabs(video_file): video_file = os.path.join(os.getcwd(), video_file) tracking_results = run_posetracker(video_file, staf_folder=args.staf_dir, display=args.display) else: # run multi object tracker mot = MPT( device=device, batch_size=args.tracker_batch_size, display=args.display, detector_type=args.detector, output_format='dict', yolo_img_size=args.yolo_img_size, ) tracking_results = mot(image_folder) # remove tracklets if num_frames is less than MIN_NUM_FRAMES for person_id in list(tracking_results.keys()): if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES: del tracking_results[person_id] # ========= Run VIBE on each person ========= # #print(f'Running VIBE on each tracklet...') vibe_time = time.time() vibe_results = {} for person_id in list(tracking_results.keys()): bboxes = joints2d = None if args.tracking_method == 'bbox': bboxes = tracking_results[person_id]['bbox'] elif args.tracking_method == 'pose': joints2d = tracking_results[person_id]['joints2d'] frames = tracking_results[person_id]['frames'] dataset = Inference( image_folder=image_folder, frames=frames, bboxes=bboxes, joints2d=joints2d, scale=bbox_scale, ) bboxes = dataset.bboxes frames = dataset.frames has_keypoints = True if joints2d is not None else False dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=16) with torch.no_grad(): pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], [] for batch in dataloader: if has_keypoints: batch, 
nj2d = batch norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3)) batch = batch.unsqueeze(0) batch = batch.to(device) batch_size, seqlen = batch.shape[:2] output = model(batch)[-1] pred_cam.append(output['theta'][:, :, :3].reshape( batch_size * seqlen, -1)) pred_verts.append(output['verts'].reshape( batch_size * seqlen, -1, 3)) pred_pose.append(output['theta'][:, :, 3:75].reshape( batch_size * seqlen, -1)) pred_betas.append(output['theta'][:, :, 75:].reshape( batch_size * seqlen, -1)) pred_joints3d.append(output['kp_3d'].reshape( batch_size * seqlen, -1, 3)) pred_cam = torch.cat(pred_cam, dim=0) pred_verts = torch.cat(pred_verts, dim=0) pred_pose = torch.cat(pred_pose, dim=0) pred_betas = torch.cat(pred_betas, dim=0) pred_joints3d = torch.cat(pred_joints3d, dim=0) del batch # ========= [Optional] run Temporal SMPLify to refine the results ========= # if args.run_smplify and args.tracking_method == 'pose': norm_joints2d = np.concatenate(norm_joints2d, axis=0) norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin') norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device) # Run Temporal SMPLify update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \ new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner( pred_rotmat=pred_pose, pred_betas=pred_betas, pred_cam=pred_cam, j2d=norm_joints2d, device=device, batch_size=norm_joints2d.shape[0], pose2aa=False, ) # update the parameters after refinement print( f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}' ) pred_verts = pred_verts.cpu() pred_cam = pred_cam.cpu() pred_pose = pred_pose.cpu() pred_betas = pred_betas.cpu() pred_joints3d = pred_joints3d.cpu() pred_verts[update] = new_opt_vertices[update] pred_cam[update] = new_opt_cam[update] pred_pose[update] = new_opt_pose[update] pred_betas[update] = new_opt_betas[update] pred_joints3d[update] = new_opt_joints3d[update] elif args.run_smplify and args.tracking_method == 'bbox': print( '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!' 
) print('[WARNING] Continuing without running Temporal SMPLify!..') # ========= Save results to a pickle file ========= # pred_cam = pred_cam.cpu().numpy() pred_verts = pred_verts.cpu().numpy() pred_pose = pred_pose.cpu().numpy() pred_betas = pred_betas.cpu().numpy() pred_joints3d = pred_joints3d.cpu().numpy() # Runs 1 Euro Filter to smooth out the results if args.smooth: min_cutoff = args.smooth_min_cutoff # 0.004 beta = args.smooth_beta # 1.5 print( f'Running smoothing on person {person_id}, min_cutoff: {min_cutoff}, beta: {beta}' ) pred_verts, pred_pose, pred_joints3d = smooth_pose( pred_pose, pred_betas, min_cutoff=min_cutoff, beta=beta) orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam, bbox=bboxes, img_width=orig_width, img_height=orig_height) output_dict = { 'pred_cam': pred_cam, 'orig_cam': orig_cam, 'verts': pred_verts, 'pose': pred_pose, 'betas': pred_betas, 'joints3d': pred_joints3d, 'joints2d': joints2d, 'bboxes': bboxes, 'frame_ids': frames, } # ========= Extract 3D joint feature for each frame ========= # list_val = [] for i in range(len(output_dict['joints3d'])): list_val.append(output_dict['joints3d'][i].flatten().reshape( 1, -1)) input_df = pd.DataFrame(np.concatenate(list_val)) input_df = input_df.round(2) predicted_classes = classification_model.predict_classes(input_df) output_df = pd.DataFrame(predicted_classes) # ========= Printing all possible poses detected for the video ========= # total_frames = len(output_df) print( '\nPrinting probabilities for yoga poses predicted in different frames.' ) for i, v in output_df.value_counts().items(): val = round((v / total_frames) * 100, 2) print('Probability of the yoga pose being ' + inverse_map[i[0]].capitalize() + " is: " + str(val)) print('\nThe yoga pose in the given video is: ' + inverse_map[output_df[0].value_counts().idxmax()].capitalize())
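# ---- Editor's sketch (assumption, not in the original script): the per-frame
# feature construction used by the classifier above, factored into a helper.
# Each frame's predicted 3D joints are flattened into one row and rounded to two
# decimals, matching the input fed to the pre-trained view/pose classifier.
import numpy as np
import pandas as pd

def joints3d_to_features(joints3d):
    """joints3d: array-like of shape (num_frames, num_joints, 3) from a VIBE output dict."""
    flat = np.asarray(joints3d).reshape(len(joints3d), -1)   # one row per frame
    return pd.DataFrame(flat).round(2)

# e.g. features = joints3d_to_features(output_dict['joints3d'])
#      predicted_classes = classification_model.predict_classes(features)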
def main(args): torch.cuda.set_device(args.gpu_id) device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') print(f'Loading video list {args.video_list}') video_list = [l.strip() for l in open(args.video_list, 'r').readlines()] if len(video_list) < 1: print('No files were found in video list') return print('Loading VIBE model') # ========= Define VIBE model ========= # model = VIBE_Demo( seqlen=16, n_layers=2, hidden_size=1024, add_linear=True, use_residual=True, ).to(device) # ========= Load VIBE pretrained weights ========= # pretrained_file = download_ckpt(use_3dpw=False) ckpt = torch.load(pretrained_file) print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}') ckpt = ckpt['gen_state_dict'] model.load_state_dict(ckpt, strict=False) model.eval() print(f'Loaded pretrained weights from \"{pretrained_file}\"') num_videos = len(video_list) print(f'Processing {num_videos} videos.') for video_idx, video_file in enumerate(video_list, start=1): if not osp.isfile(video_file): print( f'Input video \"{video_file}\" does not exist! Moving on to next file.' ) continue filename = osp.splitext(osp.basename(video_file))[0] output_path = osp.join(args.output_folder, filename) os.makedirs(output_path, exist_ok=True) image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True) print(f'[{video_idx}/{num_videos}] Processing {num_frames} frames') orig_height, orig_width = img_shape[:2] # ========= Run tracking ========= # bbox_scale = 1.1 if args.tracking_method == 'pose': if not osp.isabs(video_file): video_file = osp.join(os.getcwd(), video_file) tracking_results = run_posetracker(video_file, staf_folder=args.staf_dir, display=args.display) else: # run multi object tracker mot = MPT( device=device, batch_size=args.tracker_batch_size, display=args.display, detector_type=args.detector, output_format='dict', yolo_img_size=args.yolo_img_size, ) tracking_results = mot(image_folder) # remove tracklets if num_frames is less than MIN_NUM_FRAMES for person_id in list(tracking_results.keys()): if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES: del tracking_results[person_id] # ========= Run VIBE on each person ========= # print(f'Running VIBE on each tracklet...') vibe_results = {} for person_id in tqdm(list(tracking_results.keys())): bboxes = joints2d = None if args.tracking_method == 'bbox': bboxes = tracking_results[person_id]['bbox'] elif args.tracking_method == 'pose': joints2d = tracking_results[person_id]['joints2d'] frames = tracking_results[person_id]['frames'] dataset = Inference( image_folder=image_folder, frames=frames, bboxes=bboxes, joints2d=joints2d, scale=bbox_scale, ) bboxes = dataset.bboxes frames = dataset.frames has_keypoints = True if joints2d is not None else False dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=16) with torch.no_grad(): pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], [] for batch in dataloader: if has_keypoints: batch, nj2d = batch norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3)) batch = batch.unsqueeze(0) batch = batch.to(device) batch_size, seqlen = batch.shape[:2] output = model(batch)[-1] pred_cam.append(output['theta'][:, :, :3].reshape( batch_size * seqlen, -1)) pred_verts.append(output['verts'].reshape( batch_size * seqlen, -1, 3)) pred_pose.append(output['theta'][:, :, 3:75].reshape( batch_size * seqlen, -1)) pred_betas.append(output['theta'][:, :, 75:].reshape( batch_size * seqlen, -1)) 
pred_joints3d.append(output['kp_3d'].reshape( batch_size * seqlen, -1, 3)) pred_cam = torch.cat(pred_cam, dim=0) pred_verts = torch.cat(pred_verts, dim=0) pred_pose = torch.cat(pred_pose, dim=0) pred_betas = torch.cat(pred_betas, dim=0) pred_joints3d = torch.cat(pred_joints3d, dim=0) del batch # ========= [Optional] run Temporal SMPLify to refine the results ========= # if args.run_smplify and args.tracking_method == 'pose': norm_joints2d = np.concatenate(norm_joints2d, axis=0) norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin') norm_joints2d = torch.from_numpy(norm_joints2d).float().to( device) # Run Temporal SMPLify update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \ new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner( pred_rotmat=pred_pose, pred_betas=pred_betas, pred_cam=pred_cam, j2d=norm_joints2d, device=device, batch_size=norm_joints2d.shape[0], pose2aa=False, ) # update the parameters after refinement print( f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}' ) pred_verts = pred_verts.cpu() pred_cam = pred_cam.cpu() pred_pose = pred_pose.cpu() pred_betas = pred_betas.cpu() pred_joints3d = pred_joints3d.cpu() pred_verts[update] = new_opt_vertices[update] pred_cam[update] = new_opt_cam[update] pred_pose[update] = new_opt_pose[update] pred_betas[update] = new_opt_betas[update] pred_joints3d[update] = new_opt_joints3d[update] elif args.run_smplify and args.tracking_method == 'bbox': print( '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!' ) print( '[WARNING] Continuing without running Temporal SMPLify!..') # ========= Save results to a pickle file ========= # pred_cam = pred_cam.cpu().numpy() pred_verts = pred_verts.cpu().numpy() pred_pose = pred_pose.cpu().numpy() pred_betas = pred_betas.cpu().numpy() pred_joints3d = pred_joints3d.cpu().numpy() orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam, bbox=bboxes, img_width=orig_width, img_height=orig_height) output_dict = { 'pred_cam': pred_cam, 'orig_cam': orig_cam, 'verts': pred_verts, 'pose': pred_pose, 'betas': pred_betas, 'joints3d': pred_joints3d, 'joints2d': joints2d, 'bboxes': bboxes, 'frame_ids': frames, } vibe_results[person_id] = output_dict # Clean-up the temporal folder shutil.rmtree(image_folder) # Save the outputs to joblib pkl file. File is loaded through joblib.load(pkl_path) output_pkl_path = osp.join(args.output_folder, f'{filename}.pkl') print(f'Saving output results to \"{output_pkl_path}\".') joblib.dump(vibe_results, output_pkl_path) # Clean-up after processing del model print('================= END =================')
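# ---- Editor's sketch (assumption): inspecting one of the per-video pickle files
# written above.  Each file maps a tracked person id to the output_dict built in
# the loop (pred_cam, orig_cam, verts, pose, betas, joints3d, joints2d, bboxes,
# frame_ids).
import joblib

vibe_results = joblib.load('output/sample_video.pkl')   # hypothetical path
for person_id, person_data in vibe_results.items():
    print(person_id, person_data['pose'].shape, len(person_data['frame_ids']))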
def run_vibe(video_file, args): device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') # Make output dirs output_path = os.path.join( args.output_folder, os.path.basename(video_file).replace('.mp4', '')) os.makedirs(output_path, exist_ok=True) # Convert video to images image_folder, num_frames, img_shape = video_to_images( video_file, return_info=True) print(f'Input video number of frames {num_frames}') orig_height, orig_width = img_shape[:2] total_time = time.time() # ========= Run tracking ========= # if not os.path.isabs(video_file): video_file = os.path.join(os.getcwd(), video_file) tracking_results = run_posetracker( video_file, staf_folder=args.staf_dir, display=args.display, smoothen=args.smoothen, smoothen_method=args.smoothen_method) # remove tracklets if num_frames is less than MIN_NUM_FRAMES for person_id in list(tracking_results.keys()): if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES: del tracking_results[person_id] # ========= Define VIBE model ========= # model = VIBE_Demo( seqlen=16, n_layers=2, hidden_size=1024, add_linear=True, use_residual=True, ).to(device) # ========= Load pretrained weights ========= # pretrained_file = download_ckpt(use_3dpw=False) ckpt = torch.load(pretrained_file) print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}') ckpt = ckpt['gen_state_dict'] model.load_state_dict(ckpt, strict=False) model.eval() print(f'Loaded pretrained weights from \"{pretrained_file}\"') # ========= Run VIBE on each person ========= # print(f'Running VIBE on each tracklet...') vibe_time = time.time() vibe_results = {} for person_id in tqdm(list(tracking_results.keys())): joints2d = tracking_results[person_id]['joints2d'] frames = tracking_results[person_id]['frames'] dataset = Inference( image_folder=image_folder, frames=frames, bboxes=None, joints2d=joints2d ) bboxes = dataset.bboxes frames = dataset.frames has_keypoints = True if joints2d is not None else False dataloader = DataLoader( dataset, batch_size=args.vibe_batch_size, num_workers=16) with torch.no_grad(): pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [ ], [], [], [], [], [] for batch in dataloader: if has_keypoints: batch, nj2d = batch norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3)) batch = batch.unsqueeze(0) batch = batch.to(device) batch_size, seqlen = batch.shape[:2] output = model(batch)[-1] pred_cam.append(output['theta'][:, :, :3].reshape( batch_size * seqlen, -1)) pred_verts.append(output['verts'].reshape( batch_size * seqlen, -1, 3)) pred_pose.append(output['theta'][:, :, 3:75].reshape( batch_size * seqlen, -1)) pred_betas.append(output['theta'][:, :, 75:].reshape( batch_size * seqlen, -1)) pred_joints3d.append(output['kp_3d'].reshape( batch_size * seqlen, -1, 3)) pred_cam = torch.cat(pred_cam, dim=0) pred_verts = torch.cat(pred_verts, dim=0) pred_pose = torch.cat(pred_pose, dim=0) pred_betas = torch.cat(pred_betas, dim=0) pred_joints3d = torch.cat(pred_joints3d, dim=0) del batch # ========= [Optional] run Temporal SMPLify to refine the results ========= # norm_joints2d = np.concatenate(norm_joints2d, axis=0) norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin') norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device) # Run Temporal SMPLify update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \ new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner( pred_rotmat=pred_pose, pred_betas=pred_betas, pred_cam=pred_cam, j2d=norm_joints2d, device=device, 
batch_size=norm_joints2d.shape[0], pose2aa=False, ) # update the parameters after refinement print( f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}') pred_verts = pred_verts.cpu() pred_cam = pred_cam.cpu() pred_pose = pred_pose.cpu() pred_betas = pred_betas.cpu() pred_joints3d = pred_joints3d.cpu() pred_verts[update] = new_opt_vertices[update] pred_cam[update] = new_opt_cam[update] pred_pose[update] = new_opt_pose[update] pred_betas[update] = new_opt_betas[update] pred_joints3d[update] = new_opt_joints3d[update] # ========= Save results to a pickle file ========= # pred_cam = pred_cam.cpu().numpy() pred_verts = pred_verts.cpu().numpy() pred_pose = pred_pose.cpu().numpy() pred_betas = pred_betas.cpu().numpy() pred_joints3d = pred_joints3d.cpu().numpy() orig_cam = convert_crop_cam_to_orig_img( cam=pred_cam, bbox=bboxes, img_width=orig_width, img_height=orig_height ) output_dict = { 'pred_cam': pred_cam, 'orig_cam': orig_cam, 'verts': pred_verts, 'pose': pred_pose, 'betas': pred_betas, 'joints3d': pred_joints3d, 'joints2d': joints2d, 'bboxes': bboxes, 'frame_ids': frames, } vibe_results[person_id] = output_dict del model end = time.time() fps = num_frames / (end - vibe_time) print(f'VIBE FPS: {fps:.2f}') total_time = time.time() - total_time print( f'Total time spent: {total_time:.2f} seconds (including model loading time).') print( f'Total FPS (including model loading time): {num_frames / total_time:.2f}.') print( f'Saving output results to \"{os.path.join(output_path, "vibe_output.pkl")}\".') # joblib.dump(vibe_results, os.path.join(output_path, "vibe_output.pkl")) for person in vibe_results.keys(): dump_path = os.path.join(output_path, "vibe_output_%s.pkl" % person) os.makedirs(os.path.dirname(dump_path), exist_ok=True) pickle.dump(vibe_results[person], open(dump_path, 'wb')) # if not args.no_render: # # ========= Render results as a single video ========= # # renderer = Renderer(resolution=(orig_width, orig_height), # orig_img=True, wireframe=args.wireframe) # output_img_folder = f'{image_folder}_output' # os.makedirs(output_img_folder, exist_ok=True) # print(f'Rendering output video, writing frames to {output_img_folder}') # # prepare results for rendering # frame_results = prepare_rendering_results(vibe_results, num_frames) # mesh_color = {k: colorsys.hsv_to_rgb( # np.random.rand(), 0.5, 1.0) for k in vibe_results.keys()} # image_file_names = sorted([ # os.path.join(image_folder, x) # for x in os.listdir(image_folder) # if x.endswith('.png') or x.endswith('.jpg') # ]) # for frame_idx in tqdm(range(len(image_file_names))): # img_fname = image_file_names[frame_idx] # img = cv2.imread(img_fname) # for person_id, person_data in frame_results[frame_idx].items(): # frame_verts = person_data['verts'] # frame_cam = person_data['cam'] # mc = mesh_color[person_id] # mesh_filename = None # img = renderer.render( # img, # frame_verts, # cam=frame_cam, # color=mc, # mesh_filename=mesh_filename, # ) # cv2.imwrite(os.path.join(output_img_folder, # f'{frame_idx:06d}.png'), img) # if args.display: # cv2.imshow('Video', img) # if cv2.waitKey(1) & 0xFF == ord('q'): # break # if args.display: # cv2.destroyAllWindows() # # ========= Save rendered video ========= # # vid_name = os.path.basename(video_file) # save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4' # save_name = os.path.join(output_path, save_name) # print(f'Saving result video to {save_name}') # images_to_video(img_folder=output_img_folder, # output_vid_file=save_name) # shutil.rmtree(output_img_folder) 
    shutil.rmtree(image_folder)
    print('================= END =================')
def main(args): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Configure depth and color streams pipeline = rs.pipeline() config = rs.config() config.enable_stream(rs.stream.depth, 640, 480, rs.format.z16, 30) config.enable_stream(rs.stream.color, 640, 480, rs.format.bgr8, 30) # Start streaming pipeline.start(config) # ========= Run tracking ========= # bbox_scale = 1.1 # run multi object tracker mot = MPT( device=device, batch_size=args.tracker_batch_size, display=args.display, detector_type=args.detector, output_format='dict', yolo_img_size=args.yolo_img_size, ) # ========= Define VIBE model ========= # model = VIBE_Demo( seqlen=16, n_layers=2, hidden_size=1024, add_linear=True, use_residual=True, ).to(device) # ========= Load pretrained weights ========= # pretrained_file = download_ckpt(use_3dpw=False) ckpt = torch.load(pretrained_file) print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}') ckpt = ckpt['gen_state_dict'] model.load_state_dict(ckpt, strict=False) model.eval() print(f'Loaded pretrained weights from \"{pretrained_file}\"') # ========= main loop ======================= # fourcc = cv2.VideoWriter_fourcc(*'XVID') #cap = cv2.VideoCapture('test.avi') #out = cv2.VideoWriter('output/test_5people.avi', fourcc, 30.0, (640, 360), True) #cap = cv2.VideoCapture('sample_video.mp4') out = cv2.VideoWriter('output/test_realsense_2.avi', fourcc, 10.0, (640, 480), True) # load renderer renderer = Renderer(resolution=(640, 480), orig_img=True, wireframe=args.wireframe) i = 0 time_acc = 0.0 while (True): # Capture frame-by-frame total_time = time.time() frames = pipeline.wait_for_frames() frame_orig = frames.get_color_frame() # Convert images to numpy arrays frame_orig = np.asanyarray(frame_orig.get_data()) #ret, frame_orig = cap.read() if frame_orig is None: break #for i in range(1,300): # total_time = time.time() # path = os.path.join('tmp/sample_video/',f'{i:06d}.png') # frame_orig = cv2.imread(path) orig_height, orig_width = frame_orig.shape[:2] frame = cv2.cvtColor(frame_orig, cv2.COLOR_BGR2RGB) frame = frame / 255. 
frame = frame.transpose((2, 0, 1)) frame = torch.from_numpy(frame) frame = frame.unsqueeze(0) tracking_results = mot(frame) #print('1111111111111tracking result',tracking_results) #print(f'Running VIBE on each tracklet...') vibe_time = time.time() vibe_results = {} for person_id in list(tracking_results.keys()): bboxes = joints2d = None bboxes = tracking_results[person_id]['bbox'] # shape(1,4) #print('bboxes: ',bboxes) #相同 frames = tracking_results[person_id]['frames'] #print('22222222',bboxes) dataset = Inference(frame=frame_orig, bboxes=bboxes, scale=bbox_scale) with torch.no_grad(): pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], [] batch = dataset batch = batch.unsqueeze(0).unsqueeze(0) batch = batch.to(device) #print(batch.shape) batch_size, seqlen = batch.shape[:2] output = model(batch)[-1] pred_cam.append(output['theta'][:, :, :3].reshape( batch_size * seqlen, -1)) pred_verts.append(output['verts'].reshape( batch_size * seqlen, -1, 3)) pred_pose.append(output['theta'][:, :, 3:75].reshape( batch_size * seqlen, -1)) pred_betas.append(output['theta'][:, :, 75:].reshape( batch_size * seqlen, -1)) pred_joints3d.append(output['kp_3d'].reshape( batch_size * seqlen, -1, 3)) pred_cam = torch.cat(pred_cam, dim=0) pred_verts = torch.cat(pred_verts, dim=0) pred_pose = torch.cat(pred_pose, dim=0) pred_betas = torch.cat(pred_betas, dim=0) pred_joints3d = torch.cat(pred_joints3d, dim=0) del batch pred_cam = pred_cam.cpu().numpy() #print('pred_cam: ',pred_cam) #不同 pred_verts = pred_verts.cpu().numpy() pred_pose = pred_pose.cpu().numpy() pred_betas = pred_betas.cpu().numpy() pred_joints3d = pred_joints3d.cpu().numpy() #print('3333333333',pred_cam) orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam, bbox=bboxes, img_width=orig_width, img_height=orig_height) #print('orig_cam',orig_cam.shape) output_dict = { 'pred_cam': pred_cam, 'orig_cam': orig_cam, 'verts': pred_verts, 'pose': pred_pose, 'betas': pred_betas, 'joints3d': pred_joints3d, 'joints2d': joints2d, 'bboxes': bboxes, 'frame_ids': frames, } vibe_results[person_id] = output_dict #print('vibe_results orig_cam: ',vibe_results[1]['orig_cam']) #print('vibe_results pose: ', vibe_results[1]['pose']) end = time.time() fps = 1 / (end - vibe_time) print(f'VIBE FPS: {fps:.2f}') if not args.no_render: render_time = time.time() # load renderer #renderer = Renderer(resolution=(orig_width, orig_height), orig_img=True, wireframe=args.wireframe) # prepare results for rendering num_frames = 1 #print('vibe_results1111: ',vibe_results) #vibe_results[1]['orig_cam'] = vibe_results[1]['orig_cam'][np.newaxis,:] #print('orig_cam: ',vibe_results[1]['orig_cam'].shape) frame_results = prepare_rendering_results(vibe_results, num_frames) #print('frame_results',frame_results) img = frame_orig mesh_color = { k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0) for k in vibe_results.keys() } #img = frame if args.sideview: side_img = np.zeros_like(img) for person_id, person_data in frame_results[0].items(): frame_verts = person_data['verts'] frame_cam = person_data['cam'] #print('4444444444frame_cam',frame_cam) mc = mesh_color[person_id] mesh_filename = None img = renderer.render( img, frame_verts, cam=frame_cam, color=mc, mesh_filename=mesh_filename, ) fps = 1 / (time.time() - render_time) print(f'RENDER FPS: {fps:.2f}') #img = img.numpy() out.write(img) if args.display: cv2.imshow('Video', img) if cv2.waitKey(1) & 0xFF == ord('q'): break if args.display: cv2.destroyAllWindows() total_time = time.time() - total_time i += 1 
        time_acc += total_time
        print(f'Frame {i}: total time spent {total_time:.2f} seconds (detect+track+vibe+render).')
        print(f'FPS: {1 / total_time:.2f}.')
        print(f'Average FPS so far: {i / time_acc:.2f}.')

    # ========= Save rendered video ========= #
    print('================= END =================')
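# ---- Editor's note (sketch, not in the original script): the streaming loop
# above exits via `break` without releasing its resources.  The standard cleanup
# for the objects it creates would be the calls below, placed after the loop.
import cv2

def release_capture_resources(pipeline, video_writer):
    """Stop the RealSense stream and close the OpenCV writer/windows."""
    video_writer.release()      # cv2.VideoWriter.release()
    pipeline.stop()             # rs.pipeline.stop()
    cv2.destroyAllWindows()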
def main(args): device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') video_file = args.vid_file # ========= [Optional] download the youtube video ========= # if video_file.startswith('https://www.youtube.com'): print(f'Donwloading YouTube video \"{video_file}\"') video_file = download_youtube_clip(video_file, '/tmp') if video_file is None: exit('Youtube url is not valid!') print(f'YouTube Video has been downloaded to {video_file}...') if not os.path.isfile(video_file): exit(f'Input video \"{video_file}\" does not exist!') output_path = os.path.join( args.output_folder, os.path.basename(video_file).replace('.mp4', '')) os.makedirs(output_path, exist_ok=True) image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True) print(f'Input video number of frames {num_frames}') orig_height, orig_width = img_shape[:2] total_time = time.time() # ========= Run tracking ========= # bbox_scale = 1.1 if args.tracking_method == 'pose': if not os.path.isabs(video_file): video_file = os.path.join(os.getcwd(), video_file) tracking_results = run_posetracker(video_file, staf_folder=args.staf_dir, display=args.display) else: # run multi object tracker mot = MPT( device=device, batch_size=args.tracker_batch_size, display=args.display, detector_type=args.detector, output_format='dict', yolo_img_size=args.yolo_img_size, ) tracking_results = mot(image_folder) # remove tracklets if num_frames is less than MIN_NUM_FRAMES for person_id in list(tracking_results.keys()): if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES: del tracking_results[person_id] # ========= Define VIBE model ========= # model = VIBE_Demo( seqlen=16, n_layers=2, hidden_size=1024, add_linear=True, use_residual=True, ).to(device) # ========= Load pretrained weights ========= # pretrained_file = download_ckpt(use_3dpw=False) ckpt = torch.load(pretrained_file) print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}') ckpt = ckpt['gen_state_dict'] model.load_state_dict(ckpt, strict=False) model.eval() print(f'Loaded pretrained weights from \"{pretrained_file}\"') # ========= Run VIBE on each person ========= # print(f'Running VIBE on each tracklet...') vibe_time = time.time() vibe_results = {} for person_id in tqdm(list(tracking_results.keys())): bboxes = joints2d = None if args.tracking_method == 'bbox': bboxes = tracking_results[person_id]['bbox'] elif args.tracking_method == 'pose': joints2d = tracking_results[person_id]['joints2d'] frames = tracking_results[person_id]['frames'] dataset = Inference( image_folder=image_folder, frames=frames, bboxes=bboxes, joints2d=joints2d, scale=bbox_scale, ) bboxes = dataset.bboxes frames = dataset.frames has_keypoints = True if joints2d is not None else False dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=16) with torch.no_grad(): pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], [] for batch in dataloader: if has_keypoints: batch, nj2d = batch norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3)) batch = batch.unsqueeze(0) batch = batch.to(device) batch_size, seqlen = batch.shape[:2] output = model(batch)[-1] pred_cam.append(output['theta'][:, :, :3].reshape( batch_size * seqlen, -1)) pred_verts.append(output['verts'].reshape( batch_size * seqlen, -1, 3)) pred_pose.append(output['theta'][:, :, 3:75].reshape( batch_size * seqlen, -1)) pred_betas.append(output['theta'][:, :, 75:].reshape( batch_size * seqlen, -1)) 
pred_joints3d.append(output['kp_3d'].reshape( batch_size * seqlen, -1, 3)) pred_cam = torch.cat(pred_cam, dim=0) pred_verts = torch.cat(pred_verts, dim=0) pred_pose = torch.cat(pred_pose, dim=0) pred_betas = torch.cat(pred_betas, dim=0) pred_joints3d = torch.cat(pred_joints3d, dim=0) del batch # ========= [Optional] run Temporal SMPLify to refine the results ========= # if args.run_smplify and args.tracking_method == 'pose': norm_joints2d = np.concatenate(norm_joints2d, axis=0) norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin') norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device) # Run Temporal SMPLify update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \ new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner( pred_rotmat=pred_pose, pred_betas=pred_betas, pred_cam=pred_cam, j2d=norm_joints2d, device=device, batch_size=norm_joints2d.shape[0], pose2aa=False, ) # update the parameters after refinement print( f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}' ) pred_verts = pred_verts.cpu() pred_cam = pred_cam.cpu() pred_pose = pred_pose.cpu() pred_betas = pred_betas.cpu() pred_joints3d = pred_joints3d.cpu() pred_verts[update] = new_opt_vertices[update] pred_cam[update] = new_opt_cam[update] pred_pose[update] = new_opt_pose[update] pred_betas[update] = new_opt_betas[update] pred_joints3d[update] = new_opt_joints3d[update] elif args.run_smplify and args.tracking_method == 'bbox': print( '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!' ) print('[WARNING] Continuing without running Temporal SMPLify!..') # ========= Save results to a pickle file ========= # pred_cam = pred_cam.cpu().numpy() pred_verts = pred_verts.cpu().numpy() pred_pose = pred_pose.cpu().numpy() pred_betas = pred_betas.cpu().numpy() pred_joints3d = pred_joints3d.cpu().numpy() # Runs 1 Euro Filter to smooth out the results if args.smooth: min_cutoff = args.smooth_min_cutoff # 0.004 beta = args.smooth_beta # 1.5 print( f'Running smoothing on person {person_id}, min_cutoff: {min_cutoff}, beta: {beta}' ) pred_verts, pred_pose, pred_joints3d = smooth_pose( pred_pose, pred_betas, min_cutoff=min_cutoff, beta=beta) orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam, bbox=bboxes, img_width=orig_width, img_height=orig_height) output_dict = { 'pred_cam': pred_cam, 'orig_cam': orig_cam, 'verts': pred_verts, 'pose': pred_pose, 'betas': pred_betas, 'joints3d': pred_joints3d, 'joints2d': joints2d, 'bboxes': bboxes, 'frame_ids': frames, } vibe_results[person_id] = output_dict del model end = time.time() fps = num_frames / (end - vibe_time) print(f'VIBE FPS: {fps:.2f}') total_time = time.time() - total_time print( f'Total time spent: {total_time:.2f} seconds (including model loading time).' ) print( f'Total FPS (including model loading time): {num_frames / total_time:.2f}.' ) print( f'Saving output results to \"{os.path.join(output_path, "vibe_output.pkl")}\".' 
) joblib.dump(vibe_results, os.path.join(output_path, "vibe_output.pkl")) if not args.no_render: # ========= Render results as a single video ========= # renderer = Renderer(resolution=(orig_width, orig_height), orig_img=True, wireframe=args.wireframe) output_img_folder = f'{image_folder}_output' os.makedirs(output_img_folder, exist_ok=True) if args.joints3dview: output_img_raw_folder = f'{image_folder}_raw_output' os.makedirs(output_img_raw_folder, exist_ok=True) output_img_joints3d_folder = f'{image_folder}_joints3d_output' os.makedirs(output_img_joints3d_folder, exist_ok=True) output_img_mesh_folder = f'{image_folder}_mesh_output' os.makedirs(output_img_mesh_folder, exist_ok=True) output_img_meshside_folder = f'{image_folder}_meshside_output' os.makedirs(output_img_meshside_folder, exist_ok=True) output_img_all_folder = f'{image_folder}_all_output' os.makedirs(output_img_all_folder, exist_ok=True) print(f'Rendering output video, writing frames to {output_img_folder}') # prepare results for rendering frame_results = prepare_rendering_results(vibe_results, num_frames) mesh_color = { k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0) for k in vibe_results.keys() } image_file_names = sorted([ os.path.join(image_folder, x) for x in os.listdir(image_folder) if x.endswith('.png') or x.endswith('.jpg') ]) length_image_files = len(image_file_names) #length_image_files = 100 for frame_idx in tqdm(range(length_image_files)): img_fname = image_file_names[frame_idx] img = cv2.imread(img_fname) if args.sideview: side_img = np.zeros_like(img) if args.joints3dview: img_raw = img.copy() img_joints3d = np.zeros_like(img) joints3d_list = [] for person_id, person_data in frame_results[frame_idx].items(): frame_verts = person_data['verts'] frame_cam = person_data['cam'] joints3d = person_data['joints3d'] #print('frame_verts.shape = {}\nframe_cam.shape ={}\njoints3d.shape = {}'.format( # frame_verts.shape, frame_cam.shape, joints3d.shape)) mc = mesh_color[person_id] if args.joints3dview: joints3d_list.append(joints3d) # img_joints3d = render_joints3d(joints3d, img_raw.shape) mesh_filename = None if args.save_obj: mesh_folder = os.path.join(output_path, 'meshes', f'{person_id:04d}') os.makedirs(mesh_folder, exist_ok=True) mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj') img = renderer.render( img, frame_verts, cam=frame_cam, color=mc, mesh_filename=mesh_filename, ) if args.sideview: side_img = renderer.render( side_img, frame_verts, cam=frame_cam, color=mc, angle=270, axis=[0, 1, 0], ) if args.sideview: img_mesh = img.copy() img = np.concatenate([img, side_img], axis=1) cv2.imwrite( os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img) if args.joints3dview: #img_joints3d = np.zeros_like(img_raw) if len(joints3d_list) == 0: img_joints3d = np.zeros_like(img_raw) else: joints3d = np.concatenate(joints3d_list) img_joints3d = render_joints3d(joints3d, img_raw.shape) if args.joints3dview: img_up = np.concatenate([img_raw, img_joints3d], axis=1) img_down = np.concatenate([img_mesh, side_img], axis=1) img_all = np.concatenate([img_up, img_down], axis=0) cv2.imwrite( os.path.join(output_img_raw_folder, f'{frame_idx:06d}.png'), img_raw) cv2.imwrite( os.path.join(output_img_joints3d_folder, f'{frame_idx:06d}.png'), img_joints3d) cv2.imwrite( os.path.join(output_img_mesh_folder, f'{frame_idx:06d}.png'), img_mesh) cv2.imwrite( os.path.join(output_img_meshside_folder, f'{frame_idx:06d}.png'), side_img) cv2.imwrite( os.path.join(output_img_all_folder, f'{frame_idx:06d}.png'), img_all) if 
args.display: cv2.imshow('Video', img) if cv2.waitKey(1) & 0xFF == ord('q'): break if args.display: cv2.destroyAllWindows() # ========= Save rendered video ========= # vid_name = os.path.basename(video_file) save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4' save_name = os.path.join(output_path, save_name) print(f'Saving result video to {save_name}') images_to_video(img_folder=output_img_folder, output_vid_file=save_name) shutil.rmtree(output_img_folder) if args.joints3dview: ''' save_name_raw = f'{vid_name.replace(".mp4", "")}_raw.mp4' save_name_raw = os.path.join(output_path, save_name_raw) images_to_video(img_folder=output_img_raw_folder, output_vid_file=save_name_raw) shutil.rmtree(output_img_raw_folder) save_name_joints3d = f'{vid_name.replace(".mp4", "")}_joints3d.mp4' save_name_joints3d = os.path.join(output_path, save_name_joints3d) images_to_video(img_folder=output_img_joints3d_folder, output_vid_file=save_name_joints3d) shutil.rmtree(output_img_joints3d_folder) save_name_mesh = f'{vid_name.replace(".mp4", "")}_mesh.mp4' save_name_mesh = os.path.join(output_path, save_name_mesh) images_to_video(img_folder=output_img_mesh_folder, output_vid_file=save_name_mesh) shutil.rmtree(output_img_mesh_folder) save_name_meshside = f'{vid_name.replace(".mp4", "")}_meshside.mp4' save_name_meshside = os.path.join(output_path, save_name_meshside) images_to_video(img_folder=output_img_meshside_folder, output_vid_file=save_name_meshside) shutil.rmtree(output_img_meshside_folder) ''' save_name_all = f'{vid_name.replace(".mp4", "")}_all.mp4' save_name_all = os.path.join(output_path, save_name_all) images_to_video(img_folder=output_img_all_folder, output_vid_file=save_name_all) shutil.rmtree(output_img_all_folder) shutil.rmtree(image_folder) print('================= END =================')
def main(args): device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') # ========= Define VIBE model ========= # model = VIBE_Demo( seqlen=16, device=device, n_layers=2, hidden_size=1024, add_linear=True, use_residual=True, ).to(device) # ========= Load pretrained weights ========= # pretrained_file = download_ckpt(use_3dpw=False) ckpt = torch.load(pretrained_file, map_location=device) print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}') ckpt = ckpt['gen_state_dict'] model.load_state_dict(ckpt, strict=False) model.eval() print(f'Loaded pretrained weights from \"{pretrained_file}\"') total_time = time.time() # ========= Run VIBE on crops ========= # print(f'Running VIBE on crops...') vibe_time = time.time() image_folder = args.input_folder dataset = InferenceFromCrops(image_folder=image_folder) orig_height = orig_width = 512 dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=0) with torch.no_grad(): pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], [] for batch_num, batch in enumerate(dataloader): print("BATCH:", batch_num) batch = batch.unsqueeze(0) batch = batch.to(device) batch_size, seqlen = batch.shape[:2] output = model(batch)[-1] pred_cam.append(output['theta'][:, :, :3].reshape( batch_size * seqlen, -1)) pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3)) pred_pose.append(output['theta'][:, :, 3:75].reshape( batch_size * seqlen, -1)) pred_betas.append(output['theta'][:, :, 75:].reshape( batch_size * seqlen, -1)) pred_joints3d.append(output['kp_3d'].reshape( batch_size * seqlen, -1, 3)) pred_cam = torch.cat(pred_cam, dim=0) pred_verts = torch.cat(pred_verts, dim=0) pred_pose = torch.cat(pred_pose, dim=0) pred_betas = torch.cat(pred_betas, dim=0) pred_joints3d = torch.cat(pred_joints3d, dim=0) del batch # ========= [Optional] run Temporal SMPLify to refine the results ========= # if args.run_smplify and args.tracking_method == 'pose': norm_joints2d = np.concatenate(norm_joints2d, axis=0) norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin') norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device) # Run Temporal SMPLify update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \ new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner( pred_rotmat=pred_pose, pred_betas=pred_betas, pred_cam=pred_cam, j2d=norm_joints2d, device=device, batch_size=norm_joints2d.shape[0], pose2aa=False, ) # update the parameters after refinement print( f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}' ) pred_verts = pred_verts.cpu() pred_cam = pred_cam.cpu() pred_pose = pred_pose.cpu() pred_betas = pred_betas.cpu() pred_joints3d = pred_joints3d.cpu() pred_verts[update] = new_opt_vertices[update] pred_cam[update] = new_opt_cam[update] pred_pose[update] = new_opt_pose[update] pred_betas[update] = new_opt_betas[update] pred_joints3d[update] = new_opt_joints3d[update] elif args.run_smplify and args.tracking_method == 'bbox': print( '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!' 
) print('[WARNING] Continuing without running Temporal SMPLify!..') # ========= Save results to a pickle file ========= # output_path = image_folder.replace('cropped_frames', 'vibe_results') os.makedirs(output_path, exist_ok=True) pred_cam = pred_cam.cpu().numpy() pred_verts = pred_verts.cpu().numpy() pred_pose = pred_pose.cpu().numpy() pred_betas = pred_betas.cpu().numpy() pred_joints3d = pred_joints3d.cpu().numpy() vibe_results = { 'pred_cam': pred_cam, 'verts': pred_verts, 'pose': pred_pose, 'betas': pred_betas, 'joints3d': pred_joints3d, } del model end = time.time() fps = len(dataset) / (end - vibe_time) print(f'VIBE FPS: {fps:.2f}') total_time = time.time() - total_time print( f'Total time spent: {total_time:.2f} seconds (including model loading time).' ) print( f'Total FPS (including model loading time): {len(dataset) / total_time:.2f}.' ) print( f'Saving vibe results to \"{os.path.join(output_path, "vibe_results.pkl")}\".' ) with open(os.path.join(output_path, "vibe_results.pkl"), 'wb') as f_save: pickle.dump(vibe_results, f_save) if not args.no_render: # ========= Render results as a single video ========= # renderer = Renderer(resolution=(orig_width, orig_height), orig_img=True, wireframe=args.wireframe) output_img_folder = os.path.join(output_path, 'vibe_images') os.makedirs(output_img_folder, exist_ok=True) print(f'Rendering output video, writing frames to {output_img_folder}') image_file_names = sorted([ os.path.join(image_folder, x) for x in os.listdir(image_folder) if x.endswith('.png') or x.endswith('.jpg') ]) for frame_idx in tqdm(range(len(image_file_names))): img_fname = image_file_names[frame_idx] img = cv2.imread(img_fname) frame_verts = vibe_results['verts'][frame_idx] frame_cam = vibe_results['pred_cam'][frame_idx] mesh_filename = None if args.save_obj: mesh_folder = os.path.join(output_path, 'vibe_meshes') os.makedirs(mesh_folder, exist_ok=True) mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj') rend_img = renderer.render( img, frame_verts, cam=frame_cam, mesh_filename=mesh_filename, ) whole_img = rend_img if args.sideview: side_img_bg = np.zeros_like(img) side_rend_img90 = renderer.render( side_img_bg, frame_verts, cam=frame_cam, angle=90, axis=[0, 1, 0], ) side_rend_img270 = renderer.render( side_img_bg, frame_verts, cam=frame_cam, angle=270, axis=[0, 1, 0], ) if args.reposed_render: smpl = SMPL('data/vibe_data', batch_size=1) zero_pose = torch.from_numpy( np.zeros((1, pred_pose.shape[-1]))).float() zero_pose[:, 0] = np.pi pred_frame_betas = torch.from_numpy( pred_betas[frame_idx][None, :]).float() with torch.no_grad(): reposed_smpl_output = smpl( betas=pred_frame_betas, body_pose=zero_pose[:, 3:], global_orient=zero_pose[:, :3]) reposed_verts = reposed_smpl_output.vertices reposed_verts = reposed_verts.cpu().detach().numpy() reposed_cam = np.array([0.9, 0, 0]) reposed_rend_img = renderer.render(side_img_bg, reposed_verts[0], cam=reposed_cam) reposed_rend_img90 = renderer.render(side_img_bg, reposed_verts[0], cam=reposed_cam, angle=90, axis=[0, 1, 0]) top_row = np.concatenate( [img, reposed_rend_img, reposed_rend_img90], axis=1) bot_row = np.concatenate( [rend_img, side_rend_img90, side_rend_img270], axis=1) whole_img = np.concatenate([top_row, bot_row], axis=0) else: top_row = np.concatenate([img, side_img_bg, side_img_bg], axis=1) bot_row = np.concatenate( [rend_img, side_rend_img90, side_rend_img270], axis=1) whole_img = np.concatenate([top_row, bot_row], axis=0) # cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.png'), whole_img) 
            cv2.imwrite(os.path.join(output_img_folder, os.path.basename(img_fname)), whole_img)

        # ========= Save rendered video ========= #
        save_vid_path = os.path.join(output_path, 'vibe_video.mp4')
        print(f'Saving result video to {save_vid_path}')
        images_to_video(img_folder=output_img_folder, output_vid_file=save_vid_path)

    print('================= END =================')
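# ---- Editor's sketch (assumption): a quick shape check on the "vibe_results.pkl"
# written above.  This crop-based run is single-person, so every array is indexed
# by frame.  Expected shapes, inferred from the reshapes above: pred_cam (N, 3),
# pose (N, 72), betas (N, 10), verts (N, 6890, 3), joints3d (N, num_joints, 3).
import pickle

with open('vibe_results/vibe_results.pkl', 'rb') as f:   # hypothetical path
    vibe_results = pickle.load(f)
for key, value in vibe_results.items():
    print(key, getattr(value, 'shape', type(value)))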