def temporal_simplify(pred_verts, pred_cam, pred_pose, pred_betas,
                      pred_joints3d, norm_joints2d, device, args):
    """Optionally refine per-chunk predictions with Temporal SMPLify.

    The refinement only runs when ``args.run_smplify`` is set and
    ``args.tracking_method == 'pose'``. With ``'bbox'`` tracking a warning is
    printed and the inputs are returned untouched; with ``run_smplify`` unset
    the inputs are returned silently.

    Args:
        pred_verts, pred_cam, pred_pose, pred_betas, pred_joints3d: lists of
            tensors (one per chunk). They are concatenated along dim 0 before
            being handed to ``smplify_runner``. The lists are modified in
            place: refined rows are written into each chunk and each chunk is
            then replaced by its CPU copy.
        norm_joints2d: list of numpy arrays, concatenated along axis 0 and
            converted from the 'staf' to the 'spin' keypoint layout.
        device: torch device used for the 2D keypoints and the refined values.
        args: namespace providing ``run_smplify`` and ``tracking_method``.

    Returns:
        Tuple ``(pred_verts, pred_cam, pred_pose, pred_betas, pred_joints3d,
        norm_joints2d)``. When the refinement ran, ``norm_joints2d`` is the
        converted tensor on ``device`` instead of the input list.
    """
    if args.run_smplify and args.tracking_method == 'pose':
        norm_joints2d = np.concatenate(norm_joints2d, axis=0)
        norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
        norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

        # Run Temporal SMPLify
        update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
            new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                pred_rotmat=torch.cat(pred_pose, dim=0),
                pred_betas=torch.cat(pred_betas, dim=0),
                pred_cam=torch.cat(pred_cam, dim=0),
                j2d=norm_joints2d,
                device=device,
                batch_size=norm_joints2d.shape[0],
                pose2aa=False,
            )

        # `update` and the refined tensors are aligned with the concatenation
        # of all chunks, so each chunk must be updated with its own slice.
        # Writing everything into element 0 only works for a single chunk.
        # NOTE(review): assumes `update` is a per-row boolean mask -- confirm
        # against smplify_runner.
        refined_pairs = (
            (pred_verts, new_opt_vertices),
            (pred_cam, new_opt_cam),
            (pred_pose, new_opt_pose),
            (pred_betas, new_opt_betas),
            (pred_joints3d, new_opt_joints3d),
        )
        for chunks, refined in refined_pairs:
            start = 0
            for chunk_idx, chunk in enumerate(chunks):
                end = start + chunk.shape[0]
                chunk_update = update[start:end]
                chunk[chunk_update] = refined[start:end][chunk_update].to(device)
                chunks[chunk_idx] = chunk.cpu()
                start = end

    elif args.run_smplify and args.tracking_method == 'bbox':
        print(
            '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!'
        )
        print('[WARNING] Continuing without running Temporal SMPLify!..')

    return pred_verts, pred_cam, pred_pose, pred_betas, pred_joints3d, norm_joints2d
def main(args):
    """Run the VIBE demo on one video: track, predict, save and render.

    Steps, in order:
      1. Optionally download the video when ``args.vid_file`` is a YouTube URL.
      2. Extract frames with ``video_to_images``.
      3. Track people (``run_posetracker`` for 'pose', ``MPT`` otherwise) and
         drop tracklets shorter than ``MIN_NUM_FRAMES``.
      4. Run ``VIBE_Demo`` on every tracklet, optionally refine with Temporal
         SMPLify (pose tracking only) and optionally smooth the result.
      5. Dump all results to ``<output_path>/vibe_output.pkl`` via joblib.
      6. Unless ``args.no_render`` is set, render every frame and encode the
         result video(s).
      7. Remove the extracted frame folder.

    Args:
        args: namespace providing ``vid_file``, ``output_folder``,
            ``tracking_method``, ``staf_dir``, ``display``,
            ``tracker_batch_size``, ``detector``, ``yolo_img_size``,
            ``vibe_batch_size``, ``run_smplify``, ``smooth``,
            ``smooth_min_cutoff``, ``smooth_beta``, ``no_render``,
            ``wireframe``, ``joints3dview``, ``sideview`` and ``save_obj``.

    The process exits (via ``exit``) when the YouTube download fails or the
    input video file does not exist.
    """
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    video_file = args.vid_file

    # ========= [Optional] download the youtube video ========= #
    if video_file.startswith('https://www.youtube.com'):
        print(f'Donwloading YouTube video \"{video_file}\"')
        video_file = download_youtube_clip(video_file, '/tmp')

        if video_file is None:
            exit('Youtube url is not valid!')

        print(f'YouTube Video has been downloaded to {video_file}...')

    if not os.path.isfile(video_file):
        exit(f'Input video \"{video_file}\" does not exist!')

    output_path = os.path.join(
        args.output_folder, os.path.basename(video_file).replace('.mp4', ''))
    os.makedirs(output_path, exist_ok=True)

    image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True)

    print(f'Input video number of frames {num_frames}')
    orig_height, orig_width = img_shape[:2]

    total_time = time.time()

    # ========= Run tracking ========= #
    bbox_scale = 1.1
    if args.tracking_method == 'pose':
        if not os.path.isabs(video_file):
            video_file = os.path.join(os.getcwd(), video_file)
        tracking_results = run_posetracker(
            video_file, staf_folder=args.staf_dir, display=args.display)
    else:
        # run multi object tracker
        mot = MPT(
            device=device,
            batch_size=args.tracker_batch_size,
            display=args.display,
            detector_type=args.detector,
            output_format='dict',
            yolo_img_size=args.yolo_img_size,
        )
        tracking_results = mot(image_folder)

    # remove tracklets if num_frames is less than MIN_NUM_FRAMES
    # (iterate over a copy of the keys because entries are deleted in the loop)
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    # map_location keeps the checkpoint loadable when we fell back to CPU.
    ckpt = torch.load(pretrained_file, map_location=device)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    # ========= Run VIBE on each person ========= #
    print('Running VIBE on each tracklet...')
    vibe_time = time.time()
    vibe_results = {}
    for person_id in tqdm(list(tracking_results.keys())):
        bboxes = joints2d = None

        if args.tracking_method == 'bbox':
            bboxes = tracking_results[person_id]['bbox']
        elif args.tracking_method == 'pose':
            joints2d = tracking_results[person_id]['joints2d']

        frames = tracking_results[person_id]['frames']

        dataset = Inference(
            image_folder=image_folder,
            frames=frames,
            bboxes=bboxes,
            joints2d=joints2d,
            scale=bbox_scale,
        )

        # The dataset exposes the bboxes/frames it actually uses.
        bboxes = dataset.bboxes
        frames = dataset.frames
        has_keypoints = joints2d is not None

        dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=16)

        with torch.no_grad():
            pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = \
                [], [], [], [], [], []

            for batch in dataloader:
                if has_keypoints:
                    # With keypoints the loader yields (images, normalized 2D joints).
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                # Add a leading dimension: the loader batch becomes one sequence.
                batch = batch.unsqueeze(0)
                batch = batch.to(device)

                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]

                # theta is sliced into camera (:3), pose (3:75) and betas (75:).
                pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1))
                pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:, :, 3:75].reshape(batch_size * seqlen, -1))
                pred_betas.append(output['theta'][:, :, 75:].reshape(batch_size * seqlen, -1))
                pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3))

            pred_cam = torch.cat(pred_cam, dim=0)
            pred_verts = torch.cat(pred_verts, dim=0)
            pred_pose = torch.cat(pred_pose, dim=0)
            pred_betas = torch.cat(pred_betas, dim=0)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)

            del batch

        # ========= [Optional] run Temporal SMPLify to refine the results ========= #
        # Kept outside the no_grad block above.
        # NOTE(review): presumably smplify_runner optimizes with gradients -- confirm.
        if args.run_smplify and args.tracking_method == 'pose':
            norm_joints2d = np.concatenate(norm_joints2d, axis=0)
            norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
            norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

            # Run Temporal SMPLify
            update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
                new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                    pred_rotmat=pred_pose,
                    pred_betas=pred_betas,
                    pred_cam=pred_cam,
                    j2d=norm_joints2d,
                    device=device,
                    batch_size=norm_joints2d.shape[0],
                    pose2aa=False,
                )

            # update the parameters after refinement
            print(
                f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}'
            )
            pred_verts = pred_verts.cpu()
            pred_cam = pred_cam.cpu()
            pred_pose = pred_pose.cpu()
            pred_betas = pred_betas.cpu()
            pred_joints3d = pred_joints3d.cpu()
            pred_verts[update] = new_opt_vertices[update]
            pred_cam[update] = new_opt_cam[update]
            pred_pose[update] = new_opt_pose[update]
            pred_betas[update] = new_opt_betas[update]
            pred_joints3d[update] = new_opt_joints3d[update]

        elif args.run_smplify and args.tracking_method == 'bbox':
            print(
                '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!'
            )
            print('[WARNING] Continuing without running Temporal SMPLify!..')

        # ========= Save results to a pickle file ========= #
        pred_cam = pred_cam.cpu().numpy()
        pred_verts = pred_verts.cpu().numpy()
        pred_pose = pred_pose.cpu().numpy()
        pred_betas = pred_betas.cpu().numpy()
        pred_joints3d = pred_joints3d.cpu().numpy()

        # Runs 1 Euro Filter to smooth out the results
        if args.smooth:
            min_cutoff = args.smooth_min_cutoff  # 0.004
            beta = args.smooth_beta  # 1.5
            print(
                f'Running smoothing on person {person_id}, min_cutoff: {min_cutoff}, beta: {beta}'
            )
            pred_verts, pred_pose, pred_joints3d = smooth_pose(
                pred_pose, pred_betas, min_cutoff=min_cutoff, beta=beta)

        orig_cam = convert_crop_cam_to_orig_img(
            cam=pred_cam,
            bbox=bboxes,
            img_width=orig_width,
            img_height=orig_height,
        )

        vibe_results[person_id] = {
            'pred_cam': pred_cam,
            'orig_cam': orig_cam,
            'verts': pred_verts,
            'pose': pred_pose,
            'betas': pred_betas,
            'joints3d': pred_joints3d,
            'joints2d': joints2d,
            'bboxes': bboxes,
            'frame_ids': frames,
        }

    del model

    end = time.time()
    fps = num_frames / (end - vibe_time)

    print(f'VIBE FPS: {fps:.2f}')
    total_time = time.time() - total_time
    print(
        f'Total time spent: {total_time:.2f} seconds (including model loading time).'
    )
    print(
        f'Total FPS (including model loading time): {num_frames / total_time:.2f}.'
    )

    output_pkl_path = os.path.join(output_path, "vibe_output.pkl")
    print(f'Saving output results to \"{output_pkl_path}\".')
    joblib.dump(vibe_results, output_pkl_path)

    if not args.no_render:
        # ========= Render results as a single video ========= #
        renderer = Renderer(resolution=(orig_width, orig_height),
                            orig_img=True,
                            wireframe=args.wireframe)

        output_img_folder = f'{image_folder}_output'
        os.makedirs(output_img_folder, exist_ok=True)

        if args.joints3dview:
            output_img_raw_folder = f'{image_folder}_raw_output'
            os.makedirs(output_img_raw_folder, exist_ok=True)
            output_img_joints3d_folder = f'{image_folder}_joints3d_output'
            os.makedirs(output_img_joints3d_folder, exist_ok=True)
            output_img_mesh_folder = f'{image_folder}_mesh_output'
            os.makedirs(output_img_mesh_folder, exist_ok=True)
            output_img_meshside_folder = f'{image_folder}_meshside_output'
            os.makedirs(output_img_meshside_folder, exist_ok=True)
            output_img_all_folder = f'{image_folder}_all_output'
            os.makedirs(output_img_all_folder, exist_ok=True)

        print(f'Rendering output video, writing frames to {output_img_folder}')

        # prepare results for rendering
        frame_results = prepare_rendering_results(vibe_results, num_frames)
        mesh_color = {
            k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0)
            for k in vibe_results.keys()
        }

        image_file_names = sorted([
            os.path.join(image_folder, x) for x in os.listdir(image_folder)
            if x.endswith('.png') or x.endswith('.jpg')
        ])

        # The combined joints3d layout needs the side view as one of its four
        # panels, so render it even when --sideview itself was not requested
        # (previously that combination failed on undefined img_mesh/side_img).
        render_side = args.sideview or args.joints3dview

        for frame_idx, img_fname in enumerate(tqdm(image_file_names)):
            img = cv2.imread(img_fname)

            if render_side:
                side_img = np.zeros_like(img)

            if args.joints3dview:
                img_raw = img.copy()
                joints3d_list = []

            for person_id, person_data in frame_results[frame_idx].items():
                frame_verts = person_data['verts']
                frame_cam = person_data['cam']
                joints3d = person_data['joints3d']

                mc = mesh_color[person_id]

                if args.joints3dview:
                    joints3d_list.append(joints3d)

                mesh_filename = None

                if args.save_obj:
                    mesh_folder = os.path.join(output_path, 'meshes', f'{person_id:04d}')
                    os.makedirs(mesh_folder, exist_ok=True)
                    mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj')

                img = renderer.render(
                    img,
                    frame_verts,
                    cam=frame_cam,
                    color=mc,
                    mesh_filename=mesh_filename,
                )

                if render_side:
                    side_img = renderer.render(
                        side_img,
                        frame_verts,
                        cam=frame_cam,
                        color=mc,
                        angle=270,
                        axis=[0, 1, 0],
                    )

            if args.joints3dview:
                # Snapshot the mesh overlay before the side view is appended.
                img_mesh = img.copy()

            if args.sideview:
                img = np.concatenate([img, side_img], axis=1)

            cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img)

            if args.joints3dview:
                if len(joints3d_list) == 0:
                    # Nobody tracked in this frame: use a black joints panel.
                    img_joints3d = np.zeros_like(img_raw)
                else:
                    joints3d = np.concatenate(joints3d_list)
                    img_joints3d = render_joints3d(joints3d, img_raw.shape)

                # 2x2 layout: raw | joints3d on top, mesh | side view below.
                img_up = np.concatenate([img_raw, img_joints3d], axis=1)
                img_down = np.concatenate([img_mesh, side_img], axis=1)
                img_all = np.concatenate([img_up, img_down], axis=0)

                frame_name = f'{frame_idx:06d}.png'
                cv2.imwrite(os.path.join(output_img_raw_folder, frame_name), img_raw)
                cv2.imwrite(os.path.join(output_img_joints3d_folder, frame_name), img_joints3d)
                cv2.imwrite(os.path.join(output_img_mesh_folder, frame_name), img_mesh)
                cv2.imwrite(os.path.join(output_img_meshside_folder, frame_name), side_img)
                cv2.imwrite(os.path.join(output_img_all_folder, frame_name), img_all)

            if args.display:
                cv2.imshow('Video', img)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

        if args.display:
            cv2.destroyAllWindows()

        # ========= Save rendered video ========= #
        vid_name = os.path.basename(video_file)
        save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4'
        save_name = os.path.join(output_path, save_name)
        print(f'Saving result video to {save_name}')
        images_to_video(img_folder=output_img_folder, output_vid_file=save_name)
        shutil.rmtree(output_img_folder)

        if args.joints3dview:
            # Only the combined 2x2 frames are encoded into a video. The raw,
            # joints3d, mesh and meshside frame folders are neither encoded
            # nor removed here.
            save_name_all = f'{vid_name.replace(".mp4", "")}_all.mp4'
            save_name_all = os.path.join(output_path, save_name_all)
            images_to_video(img_folder=output_img_all_folder,
                            output_vid_file=save_name_all)
            shutil.rmtree(output_img_all_folder)

    shutil.rmtree(image_folder)
    print('================= END =================')
def run_vibe(video_file, args):
    """Run pose tracking, VIBE and Temporal SMPLify on one video.

    Frames are extracted with ``video_to_images``, people are tracked with
    ``run_posetracker``, and tracklets shorter than ``MIN_NUM_FRAMES`` are
    dropped. Every remaining tracklet is passed through ``VIBE_Demo`` and
    always refined with ``smplify_runner``. Results are pickled to one file
    per person, ``<output_path>/vibe_output_<person>.pkl``, and the extracted
    frame folder is removed at the end.

    Args:
        video_file: path of the input video; made absolute before tracking.
        args: namespace providing ``output_folder``, ``staf_dir``,
            ``display``, ``smoothen``, ``smoothen_method`` and
            ``vibe_batch_size``.
    """
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # Make output dirs
    output_path = os.path.join(
        args.output_folder, os.path.basename(video_file).replace('.mp4', ''))
    os.makedirs(output_path, exist_ok=True)

    # Convert video to images
    image_folder, num_frames, img_shape = video_to_images(
        video_file, return_info=True)

    print(f'Input video number of frames {num_frames}')
    orig_height, orig_width = img_shape[:2]

    total_time = time.time()

    # ========= Run tracking ========= #
    if not os.path.isabs(video_file):
        video_file = os.path.join(os.getcwd(), video_file)
    tracking_results = run_posetracker(
        video_file,
        staf_folder=args.staf_dir,
        display=args.display,
        smoothen=args.smoothen,
        smoothen_method=args.smoothen_method,
    )

    # remove tracklets if num_frames is less than MIN_NUM_FRAMES
    # (iterate over a copy of the keys because entries are deleted in the loop)
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    # map_location keeps the checkpoint loadable when we fell back to CPU.
    ckpt = torch.load(pretrained_file, map_location=device)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    # ========= Run VIBE on each person ========= #
    print('Running VIBE on each tracklet...')
    vibe_time = time.time()
    vibe_results = {}
    for person_id in tqdm(list(tracking_results.keys())):
        joints2d = tracking_results[person_id]['joints2d']
        frames = tracking_results[person_id]['frames']

        dataset = Inference(
            image_folder=image_folder,
            frames=frames,
            bboxes=None,
            joints2d=joints2d,
        )

        # The dataset exposes the bboxes/frames it actually uses.
        bboxes = dataset.bboxes
        frames = dataset.frames
        has_keypoints = joints2d is not None

        dataloader = DataLoader(
            dataset, batch_size=args.vibe_batch_size, num_workers=16)

        with torch.no_grad():
            pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = \
                [], [], [], [], [], []

            for batch in dataloader:
                if has_keypoints:
                    # With keypoints the loader yields (images, normalized 2D joints).
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                # Add a leading dimension: the loader batch becomes one sequence.
                batch = batch.unsqueeze(0)
                batch = batch.to(device)

                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]

                # theta is sliced into camera (:3), pose (3:75) and betas (75:).
                pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1))
                pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:, :, 3:75].reshape(batch_size * seqlen, -1))
                pred_betas.append(output['theta'][:, :, 75:].reshape(batch_size * seqlen, -1))
                pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3))

            pred_cam = torch.cat(pred_cam, dim=0)
            pred_verts = torch.cat(pred_verts, dim=0)
            pred_pose = torch.cat(pred_pose, dim=0)
            pred_betas = torch.cat(pred_betas, dim=0)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)

            del batch

        # ========= Run Temporal SMPLify to refine the results ========= #
        # Kept outside the no_grad block above.
        # NOTE(review): presumably smplify_runner optimizes with gradients -- confirm.
        norm_joints2d = np.concatenate(norm_joints2d, axis=0)
        norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
        norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

        # Run Temporal SMPLify
        update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
            new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                pred_rotmat=pred_pose,
                pred_betas=pred_betas,
                pred_cam=pred_cam,
                j2d=norm_joints2d,
                device=device,
                batch_size=norm_joints2d.shape[0],
                pose2aa=False,
            )

        # update the parameters after refinement
        print(
            f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}')
        pred_verts = pred_verts.cpu()
        pred_cam = pred_cam.cpu()
        pred_pose = pred_pose.cpu()
        pred_betas = pred_betas.cpu()
        pred_joints3d = pred_joints3d.cpu()
        pred_verts[update] = new_opt_vertices[update]
        pred_cam[update] = new_opt_cam[update]
        pred_pose[update] = new_opt_pose[update]
        pred_betas[update] = new_opt_betas[update]
        pred_joints3d[update] = new_opt_joints3d[update]

        # ========= Collect results for this person ========= #
        pred_cam = pred_cam.cpu().numpy()
        pred_verts = pred_verts.cpu().numpy()
        pred_pose = pred_pose.cpu().numpy()
        pred_betas = pred_betas.cpu().numpy()
        pred_joints3d = pred_joints3d.cpu().numpy()

        orig_cam = convert_crop_cam_to_orig_img(
            cam=pred_cam,
            bbox=bboxes,
            img_width=orig_width,
            img_height=orig_height,
        )

        vibe_results[person_id] = {
            'pred_cam': pred_cam,
            'orig_cam': orig_cam,
            'verts': pred_verts,
            'pose': pred_pose,
            'betas': pred_betas,
            'joints3d': pred_joints3d,
            'joints2d': joints2d,
            'bboxes': bboxes,
            'frame_ids': frames,
        }

    del model

    end = time.time()
    fps = num_frames / (end - vibe_time)

    print(f'VIBE FPS: {fps:.2f}')
    total_time = time.time() - total_time
    print(
        f'Total time spent: {total_time:.2f} seconds (including model loading time).')
    print(
        f'Total FPS (including model loading time): {num_frames / total_time:.2f}.')

    # ========= Save results: one pickle file per tracked person ========= #
    for person, person_results in vibe_results.items():
        dump_path = os.path.join(output_path, "vibe_output_%s.pkl" % person)
        os.makedirs(os.path.dirname(dump_path), exist_ok=True)
        # Report the file that is really written (one per person).
        print(f'Saving output results to \"{dump_path}\".')
        # Context manager so the handle is flushed and closed deterministically.
        with open(dump_path, 'wb') as dump_file:
            pickle.dump(person_results, dump_file)

    shutil.rmtree(image_folder)
    print('================= END =================')
def main(args):
    """Run the VIBE demo over every video listed in ``args.video_list``.

    The model is built and its weights loaded once, then each listed video
    is processed in turn: frames are extracted, people are tracked, VIBE
    (plus optional Temporal SMPLify for pose tracking) runs on every
    tracklet, the results are dumped with joblib to
    ``<output_folder>/<video name without extension>.pkl`` and, unless
    ``args.no_render`` is set, a result video is rendered. Missing video
    files are reported and skipped.

    Args:
        args: namespace providing ``gpu_id``, ``video_list``,
            ``output_folder``, ``tracking_method``, ``staf_dir``,
            ``display``, ``tracker_batch_size``, ``detector``,
            ``yolo_img_size``, ``vibe_batch_size``, ``run_smplify``,
            ``no_render``, ``wireframe``, ``sideview`` and ``save_obj``.
    """
    # Only select a GPU when CUDA exists, so the CPU fallback is reachable.
    if torch.cuda.is_available():
        torch.cuda.set_device(args.gpu_id)
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    print(f'Loading video list {args.video_list}')
    with open(args.video_list, 'r') as list_file:
        # Blank lines are not video paths; skip them.
        video_list = [line.strip() for line in list_file if line.strip()]
    if not video_list:
        print('No files were found in video list')
        return

    print('Loading VIBE model')
    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load VIBE pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    # map_location keeps the checkpoint loadable when we fell back to CPU.
    ckpt = torch.load(pretrained_file, map_location=device)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    num_videos = len(video_list)
    print(f'Processing {num_videos} videos.')
    for video_idx, video_file in enumerate(video_list, start=1):
        if not osp.isfile(video_file):
            print(f'Input video \"{video_file}\" does not exist! Moving on to next file.')
            continue

        filename = osp.splitext(osp.basename(video_file))[0]
        output_path = osp.join(args.output_folder, filename)
        os.makedirs(output_path, exist_ok=True)

        image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True)

        print(f'[{video_idx}/{num_videos}] Processing {num_frames} frames')
        orig_height, orig_width = img_shape[:2]

        # ========= Run tracking ========= #
        bbox_scale = 1.1
        if args.tracking_method == 'pose':
            if not osp.isabs(video_file):
                video_file = osp.join(os.getcwd(), video_file)
            tracking_results = run_posetracker(
                video_file, staf_folder=args.staf_dir, display=args.display)
        else:
            # run multi object tracker
            mot = MPT(
                device=device,
                batch_size=args.tracker_batch_size,
                display=args.display,
                detector_type=args.detector,
                output_format='dict',
                yolo_img_size=args.yolo_img_size,
            )
            tracking_results = mot(image_folder)

        # remove tracklets if num_frames is less than MIN_NUM_FRAMES
        # (iterate over a copy of the keys because entries are deleted in the loop)
        for person_id in list(tracking_results.keys()):
            if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
                del tracking_results[person_id]

        # ========= Run VIBE on each person ========= #
        print('Running VIBE on each tracklet...')
        vibe_results = {}
        for person_id in tqdm(list(tracking_results.keys())):
            bboxes = joints2d = None

            if args.tracking_method == 'bbox':
                bboxes = tracking_results[person_id]['bbox']
            elif args.tracking_method == 'pose':
                joints2d = tracking_results[person_id]['joints2d']

            frames = tracking_results[person_id]['frames']

            dataset = Inference(
                image_folder=image_folder,
                frames=frames,
                bboxes=bboxes,
                joints2d=joints2d,
                scale=bbox_scale,
            )

            # The dataset exposes the bboxes/frames it actually uses.
            bboxes = dataset.bboxes
            frames = dataset.frames
            has_keypoints = joints2d is not None

            dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=16)

            with torch.no_grad():
                pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = \
                    [], [], [], [], [], []

                for batch in dataloader:
                    if has_keypoints:
                        # With keypoints the loader yields (images, normalized 2D joints).
                        batch, nj2d = batch
                        norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                    # Add a leading dimension: the loader batch becomes one sequence.
                    batch = batch.unsqueeze(0)
                    batch = batch.to(device)

                    batch_size, seqlen = batch.shape[:2]
                    output = model(batch)[-1]

                    # theta is sliced into camera (:3), pose (3:75) and betas (75:).
                    pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1))
                    pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3))
                    pred_pose.append(output['theta'][:, :, 3:75].reshape(batch_size * seqlen, -1))
                    pred_betas.append(output['theta'][:, :, 75:].reshape(batch_size * seqlen, -1))
                    pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3))

                pred_cam = torch.cat(pred_cam, dim=0)
                pred_verts = torch.cat(pred_verts, dim=0)
                pred_pose = torch.cat(pred_pose, dim=0)
                pred_betas = torch.cat(pred_betas, dim=0)
                pred_joints3d = torch.cat(pred_joints3d, dim=0)

                del batch

            # ========= [Optional] run Temporal SMPLify to refine the results ========= #
            # Kept outside the no_grad block above.
            # NOTE(review): presumably smplify_runner optimizes with gradients -- confirm.
            if args.run_smplify and args.tracking_method == 'pose':
                norm_joints2d = np.concatenate(norm_joints2d, axis=0)
                norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
                norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

                # Run Temporal SMPLify
                update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
                    new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                        pred_rotmat=pred_pose,
                        pred_betas=pred_betas,
                        pred_cam=pred_cam,
                        j2d=norm_joints2d,
                        device=device,
                        batch_size=norm_joints2d.shape[0],
                        pose2aa=False,
                    )

                # update the parameters after refinement
                print(f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}')
                pred_verts = pred_verts.cpu()
                pred_cam = pred_cam.cpu()
                pred_pose = pred_pose.cpu()
                pred_betas = pred_betas.cpu()
                pred_joints3d = pred_joints3d.cpu()
                pred_verts[update] = new_opt_vertices[update]
                pred_cam[update] = new_opt_cam[update]
                pred_pose[update] = new_opt_pose[update]
                pred_betas[update] = new_opt_betas[update]
                pred_joints3d[update] = new_opt_joints3d[update]

            elif args.run_smplify and args.tracking_method == 'bbox':
                print('[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!')
                print('[WARNING] Continuing without running Temporal SMPLify!..')

            # ========= Collect results for this person ========= #
            pred_cam = pred_cam.cpu().numpy()
            pred_verts = pred_verts.cpu().numpy()
            pred_pose = pred_pose.cpu().numpy()
            pred_betas = pred_betas.cpu().numpy()
            pred_joints3d = pred_joints3d.cpu().numpy()

            orig_cam = convert_crop_cam_to_orig_img(
                cam=pred_cam,
                bbox=bboxes,
                img_width=orig_width,
                img_height=orig_height,
            )

            vibe_results[person_id] = {
                'pred_cam': pred_cam,
                'orig_cam': orig_cam,
                'verts': pred_verts,
                'pose': pred_pose,
                'betas': pred_betas,
                'joints3d': pred_joints3d,
                'joints2d': joints2d,
                'bboxes': bboxes,
                'frame_ids': frames,
            }

        # Save the outputs to joblib pkl file. File is loaded through joblib.load(pkl_path)
        output_pkl_path = osp.join(args.output_folder, f'{filename}.pkl')
        print(f'Saving output results to \"{output_pkl_path}\".')
        joblib.dump(vibe_results, output_pkl_path)

        if not args.no_render:
            # ========= Render results as a single video ========= #
            renderer = Renderer(resolution=(orig_width, orig_height),
                                orig_img=True, wireframe=args.wireframe)

            output_img_folder = f'{image_folder}_output'
            os.makedirs(output_img_folder, exist_ok=True)

            print(f'Rendering output video, writing frames to {output_img_folder}')

            # prepare results for rendering
            frame_results = prepare_rendering_results(vibe_results, num_frames)
            mesh_color = {
                k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0)
                for k in vibe_results.keys()
            }

            image_file_names = sorted([
                os.path.join(image_folder, x)
                for x in os.listdir(image_folder)
                if x.endswith('.png') or x.endswith('.jpg')
            ])

            for frame_idx, img_fname in enumerate(tqdm(image_file_names)):
                img = cv2.imread(img_fname)

                if args.sideview:
                    side_img = np.zeros_like(img)

                for person_id, person_data in frame_results[frame_idx].items():
                    frame_verts = person_data['verts']
                    frame_cam = person_data['cam']

                    mc = mesh_color[person_id]

                    mesh_filename = None

                    if args.save_obj:
                        mesh_folder = os.path.join(output_path, 'meshes', f'{person_id:04d}')
                        os.makedirs(mesh_folder, exist_ok=True)
                        mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj')

                    img = renderer.render(
                        img,
                        frame_verts,
                        cam=frame_cam,
                        color=mc,
                        mesh_filename=mesh_filename,
                    )

                    if args.sideview:
                        side_img = renderer.render(
                            side_img,
                            frame_verts,
                            cam=frame_cam,
                            color=mc,
                            angle=270,
                            axis=[0, 1, 0],
                        )

                if args.sideview:
                    img = np.concatenate([img, side_img], axis=1)

                cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img)

                if args.display:
                    cv2.imshow('Video', img)
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        break

            if args.display:
                cv2.destroyAllWindows()

            # ========= Save rendered video ========= #
            vid_name = os.path.basename(video_file)
            save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4'
            save_name = os.path.join(output_path, save_name)
            print(f'Saving result video to {save_name}')
            images_to_video(img_folder=output_img_folder, output_vid_file=save_name)
            shutil.rmtree(output_img_folder)

        # Clean-up the temporary frame folder of this video before moving on,
        # so frames of earlier videos do not pile up on disk.
        shutil.rmtree(image_folder)

    # Clean-up after processing: the model is shared by all videos, so it is
    # only released once every video has been handled.
    del model
    print('================= END =================')
def main(args):
    """Run VIBE on a single video, save the per-person results and render them.

    The pipeline extracts the video frames, tracks every person (pose tracker
    when ``args.tracking_method == 'pose'``, multi-object bbox tracker
    otherwise), runs the VIBE model on each tracklet, optionally refines the
    estimates with Temporal SMPLify, dumps everything to ``vibe_output.pkl``
    and, unless ``args.no_render`` is set, renders the meshes into a video.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``device``, ``vid_file``, ``img_file``, ``output_folder``,
            ``tracking_method``, ``staf_dir``, ``display``,
            ``tracker_batch_size``, ``detector``, ``yolo_img_size``,
            ``vibe_batch_size``, ``run_smplify``, ``no_render``,
            ``wireframe``, ``sideview`` and ``save_obj``.
    """
    if args.device == 'cpu':
        device = torch.device('cpu')
        print('Running on CPU')
    else:
        device = torch.device('cuda')
        print('Running on GPU')

    if args.vid_file:
        video_file = args.vid_file
        if not os.path.isfile(video_file):
            exit(f'Input video \"{video_file}\" does not exist!')
    else:
        # Everything below works on frames extracted from a video. The image
        # branch used to fall through and crash on the unbound `video_file`,
        # so stop here with an explicit message instead.
        image_file = args.img_file
        if not image_file or not os.path.isfile(image_file):
            exit(f'Input image \"{image_file}\" does not exist!')
        exit('Image input is not supported by this pipeline; '
             'please provide a video file.')

    output_path = os.path.join(
        args.output_folder,
        os.path.basename(video_file).replace('.mp4', ''))
    os.makedirs(output_path, exist_ok=True)

    image_folder, num_frames, img_shape = video_to_images(video_file,
                                                          return_info=True)

    print(f'Input video number of frames {num_frames}')
    orig_height, orig_width = img_shape[:2]

    total_time = time.time()

    # Tip: resize the video first if it is too big, e.g.
    #   ffmpeg -i input.avi -filter:v scale=720:-1 -c:a copy output.mkv

    # ========= Run tracking ========= #
    bbox_scale = 1.1
    if args.tracking_method == 'pose':
        if not os.path.isabs(video_file):
            video_file = os.path.join(os.getcwd(), video_file)
        tracking_results = run_posetracker(video_file,
                                           staf_folder=args.staf_dir,
                                           display=args.display)
    else:
        # run multi object tracker
        mot = MPT(
            device=device,
            batch_size=args.tracker_batch_size,
            display=args.display,
            detector_type=args.detector,
            output_format='dict',
            yolo_img_size=args.yolo_img_size,
        )
        tracking_results = mot(image_folder)

    # remove tracklets if num_frames is less than MIN_NUM_FRAMES
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=True)
    # map_location keeps the load working on the CPU device option even when
    # the checkpoint tensors were saved from a GPU.
    ckpt = torch.load(pretrained_file, map_location=device)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    # ========= Run VIBE on each person ========= #
    print('Running VIBE on each tracklet...')
    vibe_time = time.time()
    vibe_results = {}
    for person_id in tqdm(list(tracking_results.keys())):
        bboxes = joints2d = None

        if args.tracking_method == 'bbox':
            bboxes = tracking_results[person_id]['bbox']
        elif args.tracking_method == 'pose':
            joints2d = tracking_results[person_id]['joints2d']

        frames = tracking_results[person_id]['frames']

        dataset = Inference(
            image_folder=image_folder,
            frames=frames,
            bboxes=bboxes,
            joints2d=joints2d,
            scale=bbox_scale,
        )

        # The dataset exposes the boxes/frames it actually uses.
        bboxes = dataset.bboxes
        frames = dataset.frames
        has_keypoints = joints2d is not None

        dataloader = DataLoader(dataset,
                                batch_size=args.vibe_batch_size,
                                num_workers=16)

        with torch.no_grad():
            pred_cam, pred_verts, pred_pose = [], [], []
            pred_betas, pred_joints3d, norm_joints2d = [], [], []

            for batch in dataloader:
                if has_keypoints:
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                # Add a leading batch dimension of 1 before the model call.
                batch = batch.unsqueeze(0)
                batch = batch.to(device)

                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]

                # theta is sliced as [0:3] cam, [3:75] pose, [75:] betas.
                pred_cam.append(output['theta'][:, :, :3].reshape(
                    batch_size * seqlen, -1))
                pred_verts.append(output['verts'].reshape(
                    batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:, :, 3:75].reshape(
                    batch_size * seqlen, -1))
                pred_betas.append(output['theta'][:, :, 75:].reshape(
                    batch_size * seqlen, -1))
                pred_joints3d.append(output['kp_3d'].reshape(
                    batch_size * seqlen, -1, 3))

            pred_cam = torch.cat(pred_cam, dim=0)
            pred_verts = torch.cat(pred_verts, dim=0)
            pred_pose = torch.cat(pred_pose, dim=0)
            pred_betas = torch.cat(pred_betas, dim=0)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)

            del batch

        # ========= [Optional] run Temporal SMPLify to refine the results ========= #
        if args.run_smplify and args.tracking_method == 'pose':
            norm_joints2d = np.concatenate(norm_joints2d, axis=0)
            norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
            norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

            # Run Temporal SMPLify
            update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
                new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                    pred_rotmat=pred_pose,
                    pred_betas=pred_betas,
                    pred_cam=pred_cam,
                    j2d=norm_joints2d,
                    device=device,
                    batch_size=norm_joints2d.shape[0],
                    pose2aa=False,
                )

            # update the parameters after refinement
            print(
                f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}'
            )
            pred_verts = pred_verts.cpu()
            pred_cam = pred_cam.cpu()
            pred_pose = pred_pose.cpu()
            pred_betas = pred_betas.cpu()
            pred_joints3d = pred_joints3d.cpu()
            # Only the entries selected by `update` take the refined values.
            pred_verts[update] = new_opt_vertices[update]
            pred_cam[update] = new_opt_cam[update]
            pred_pose[update] = new_opt_pose[update]
            pred_betas[update] = new_opt_betas[update]
            pred_joints3d[update] = new_opt_joints3d[update]

        elif args.run_smplify and args.tracking_method == 'bbox':
            print(
                '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!'
            )
            print('[WARNING] Continuing without running Temporal SMPLify!..')

        # ========= Save results to a pickle file ========= #
        pred_cam = pred_cam.cpu().numpy()
        pred_verts = pred_verts.cpu().numpy()
        pred_pose = pred_pose.cpu().numpy()
        pred_betas = pred_betas.cpu().numpy()
        pred_joints3d = pred_joints3d.cpu().numpy()

        orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam,
                                                bbox=bboxes,
                                                img_width=orig_width,
                                                img_height=orig_height)

        vibe_results[person_id] = {
            'pred_cam': pred_cam,
            'orig_cam': orig_cam,
            'verts': pred_verts,
            'pose': pred_pose,
            'betas': pred_betas,
            'joints3d': pred_joints3d,
            'joints2d': joints2d,
            'bboxes': bboxes,
            'frame_ids': frames,
        }

    del model

    end = time.time()
    fps = num_frames / (end - vibe_time)

    print(f'VIBE FPS: {fps:.2f}')
    total_time = time.time() - total_time
    print(
        f'Total time spent: {total_time:.2f} seconds (including model loading time).'
    )
    print(
        f'Total FPS (including model loading time): {num_frames / total_time:.2f}.'
    )

    output_pkl_path = os.path.join(output_path, "vibe_output.pkl")
    print(f'Saving output results to \"{output_pkl_path}\".')
    joblib.dump(vibe_results, output_pkl_path)

    if not args.no_render:
        # ========= Render results as a single video ========= #
        renderer = Renderer(resolution=(orig_width, orig_height),
                            orig_img=True,
                            wireframe=args.wireframe)

        output_img_folder = f'{image_folder}_images'
        os.makedirs(output_img_folder, exist_ok=True)
        print(f'Rendering output video, writing frames to {output_img_folder}')

        # Folder for per-frame skeleton plots; the plotting call itself is
        # currently disabled, so the folder stays empty.
        output_pose_folder = f'{image_folder}_poses'
        os.makedirs(output_pose_folder, exist_ok=True)
        print(f'Saving poses to {output_pose_folder}')

        # Export the first 25 3D joints of person 1 as a numpy file in the
        # current working directory. Track id 1 may be missing (never
        # detected, or dropped as too short), so guard the lookup.
        poses_file = f'{os.path.basename(video_file)}_poses.npy'
        if 1 in vibe_results:
            np.save(poses_file, vibe_results[1]['joints3d'][:, :25, :])
            print(f'Saving numpy poses file to {poses_file}')
        else:
            print('[WARNING] Person id 1 was not tracked, '
                  'skipping the numpy poses file.')

        # prepare results for rendering; indexed by frame, then by person id
        frame_results = prepare_rendering_results(vibe_results, num_frames)
        mesh_color = {
            k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0)
            for k in vibe_results.keys()
        }

        image_file_names = sorted([
            os.path.join(image_folder, x) for x in os.listdir(image_folder)
            if x.endswith('.png') or x.endswith('.jpg')
        ])

        for frame_idx, img_fname in enumerate(tqdm(image_file_names)):
            img = cv2.imread(img_fname)

            if args.sideview:
                side_img = np.zeros_like(img)

            for person_id, person_data in frame_results[frame_idx].items():
                frame_verts = person_data['verts']
                frame_cam = person_data['cam']

                mc = mesh_color[person_id]

                mesh_filename = None

                if args.save_obj:
                    mesh_folder = os.path.join(output_path, 'meshes',
                                               f'{person_id:04d}')
                    os.makedirs(mesh_folder, exist_ok=True)
                    mesh_filename = os.path.join(mesh_folder,
                                                 f'{frame_idx:06d}.obj')

                # bgr image (opencv format)
                img = renderer.render(
                    img,
                    frame_verts,
                    cam=frame_cam,
                    color=mc,
                    mesh_filename=mesh_filename,
                )

                if args.sideview:
                    side_img = renderer.render(
                        side_img,
                        frame_verts,
                        cam=frame_cam,
                        color=mc,
                        angle=270,
                        axis=[0, 1, 0],
                    )

            if args.sideview:
                img = np.concatenate([img, side_img], axis=1)

            cv2.imwrite(
                os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img)

            if args.display:
                cv2.imshow('Video', img)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

        if args.display:
            cv2.destroyAllWindows()

        # ========= Save rendered video ========= #
        vid_name = os.path.basename(video_file)
        save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4'
        save_name = os.path.join(output_path, save_name)
        print(f'Saving result video to {save_name}')
        # The rendered frames in output_img_folder are intentionally kept.
        images_to_video(img_folder=output_img_folder,
                        output_vid_file=save_name)

    # Remove the extracted input frames.
    shutil.rmtree(image_folder)
    print('================= END =================')
def main(args):
    """Run VIBE on a fixed video and append labelled features to CSV files.

    The hard-coded video is split into frames, people are tracked, VIBE is
    run on every tracklet (optionally followed by Temporal SMPLify and 1 Euro
    smoothing), and every fifth frame's flattened 3D joints and pose
    parameters are appended, together with the video's class label, to
    ``output_joints3d_dog.csv`` and ``output_pose.csv_dog``.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``tracking_method``, ``staf_dir``, ``display``,
            ``tracker_batch_size``, ``detector``, ``yolo_img_size``,
            ``vibe_batch_size``, ``run_smplify``, ``smooth``,
            ``smooth_min_cutoff`` and ``smooth_beta``.
    """
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    # Class-name -> integer label written as the last column of every row.
    # (Named `pose_labels` so the builtin `dict` is not shadowed.)
    pose_labels = {
        'bridge': 1,
        'childs': 2,
        'downwarddog': 3,
        'mountain': 4,
        'plank': 5,
        'seatedforwardbend': 6,
        'tree': 7,
        'trianglepose': 8,
        'warrior1': 9,
        'warrior2': 10
    }

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    # map_location keeps the CPU fallback working when the checkpoint
    # tensors were saved from a GPU.
    ckpt = torch.load(pretrained_file, map_location=device)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    video_file = '/home/ubuntu/PoseEstimation/VIBE/DogVideo.mp4'
    video_label = pose_labels['bridge']

    if not os.path.isfile(video_file):
        exit(f'Input video \"{video_file}\" does not exist!')

    image_folder, num_frames, img_shape = video_to_images(video_file,
                                                          return_info=True)

    print(f'Input video number of frames {num_frames}')
    orig_height, orig_width = img_shape[:2]

    total_time = time.time()

    # ========= Run tracking ========= #
    bbox_scale = 1.1
    if args.tracking_method == 'pose':
        if not os.path.isabs(video_file):
            video_file = os.path.join(os.getcwd(), video_file)
        tracking_results = run_posetracker(video_file,
                                           staf_folder=args.staf_dir,
                                           display=args.display)
    else:
        # run multi object tracker
        mot = MPT(
            device=device,
            batch_size=args.tracker_batch_size,
            display=args.display,
            detector_type=args.detector,
            output_format='dict',
            yolo_img_size=args.yolo_img_size,
        )
        tracking_results = mot(image_folder)

    # remove tracklets if num_frames is less than MIN_NUM_FRAMES
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]

    # ========= Run VIBE on each person ========= #
    print('Running VIBE on each tracklet...')
    vibe_time = time.time()

    # Both feature files are opened in append mode and are closed on every
    # exit path by the context manager.
    with open('output_joints3d_dog.csv', 'a') as joints3D_csv, \
            open('output_pose.csv_dog', 'a') as pose_csv:
        for person_id in tqdm(list(tracking_results.keys())):
            bboxes = joints2d = None

            if args.tracking_method == 'bbox':
                bboxes = tracking_results[person_id]['bbox']
            elif args.tracking_method == 'pose':
                joints2d = tracking_results[person_id]['joints2d']

            frames = tracking_results[person_id]['frames']

            dataset = Inference(
                image_folder=image_folder,
                frames=frames,
                bboxes=bboxes,
                joints2d=joints2d,
                scale=bbox_scale,
            )

            # The dataset exposes the boxes/frames it actually uses.
            bboxes = dataset.bboxes
            frames = dataset.frames
            has_keypoints = joints2d is not None

            dataloader = DataLoader(dataset,
                                    batch_size=args.vibe_batch_size,
                                    num_workers=16)

            with torch.no_grad():
                pred_cam, pred_verts, pred_pose = [], [], []
                pred_betas, pred_joints3d, norm_joints2d = [], [], []

                for batch in dataloader:
                    if has_keypoints:
                        batch, nj2d = batch
                        norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                    # Add a leading batch dimension of 1 before the model call.
                    batch = batch.unsqueeze(0)
                    batch = batch.to(device)

                    batch_size, seqlen = batch.shape[:2]
                    output = model(batch)[-1]

                    # theta is sliced as [0:3] cam, [3:75] pose, [75:] betas.
                    pred_cam.append(output['theta'][:, :, :3].reshape(
                        batch_size * seqlen, -1))
                    pred_verts.append(output['verts'].reshape(
                        batch_size * seqlen, -1, 3))
                    pred_pose.append(output['theta'][:, :, 3:75].reshape(
                        batch_size * seqlen, -1))
                    pred_betas.append(output['theta'][:, :, 75:].reshape(
                        batch_size * seqlen, -1))
                    pred_joints3d.append(output['kp_3d'].reshape(
                        batch_size * seqlen, -1, 3))

                pred_cam = torch.cat(pred_cam, dim=0)
                pred_verts = torch.cat(pred_verts, dim=0)
                pred_pose = torch.cat(pred_pose, dim=0)
                pred_betas = torch.cat(pred_betas, dim=0)
                pred_joints3d = torch.cat(pred_joints3d, dim=0)

                del batch

            # ========= [Optional] run Temporal SMPLify to refine the results ========= #
            if args.run_smplify and args.tracking_method == 'pose':
                norm_joints2d = np.concatenate(norm_joints2d, axis=0)
                norm_joints2d = convert_kps(norm_joints2d,
                                            src='staf',
                                            dst='spin')
                norm_joints2d = torch.from_numpy(norm_joints2d).float().to(
                    device)

                # Run Temporal SMPLify
                update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
                    new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                        pred_rotmat=pred_pose,
                        pred_betas=pred_betas,
                        pred_cam=pred_cam,
                        j2d=norm_joints2d,
                        device=device,
                        batch_size=norm_joints2d.shape[0],
                        pose2aa=False,
                    )

                # update the parameters after refinement
                print(
                    f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}'
                )
                pred_verts = pred_verts.cpu()
                pred_cam = pred_cam.cpu()
                pred_pose = pred_pose.cpu()
                pred_betas = pred_betas.cpu()
                pred_joints3d = pred_joints3d.cpu()
                # Only the entries selected by `update` take the refined values.
                pred_verts[update] = new_opt_vertices[update]
                pred_cam[update] = new_opt_cam[update]
                pred_pose[update] = new_opt_pose[update]
                pred_betas[update] = new_opt_betas[update]
                pred_joints3d[update] = new_opt_joints3d[update]

            elif args.run_smplify and args.tracking_method == 'bbox':
                print(
                    '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!'
                )
                print(
                    '[WARNING] Continuing without running Temporal SMPLify!..')

            # ========= Convert the predictions to numpy ========= #
            pred_cam = pred_cam.cpu().numpy()
            pred_verts = pred_verts.cpu().numpy()
            pred_pose = pred_pose.cpu().numpy()
            pred_betas = pred_betas.cpu().numpy()
            pred_joints3d = pred_joints3d.cpu().numpy()

            # Runs 1 Euro Filter to smooth out the results
            if args.smooth:
                min_cutoff = args.smooth_min_cutoff  # 0.004
                beta = args.smooth_beta  # 1.5
                print(
                    f'Running smoothing on person {person_id}, min_cutoff: {min_cutoff}, beta: {beta}'
                )
                pred_verts, pred_pose, pred_joints3d = smooth_pose(
                    pred_pose, pred_betas, min_cutoff=min_cutoff, beta=beta)

            orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam,
                                                    bbox=bboxes,
                                                    img_width=orig_width,
                                                    img_height=orig_height)

            output_dict = {
                'pred_cam': pred_cam,
                'orig_cam': orig_cam,
                'verts': pred_verts,
                'pose': pred_pose,
                'betas': pred_betas,
                'joints3d': pred_joints3d,
                'joints2d': joints2d,
                'bboxes': bboxes,
                'frame_ids': frames,
            }

            # Append every fifth frame as one row: flattened features
            # followed by the integer class label.
            # NOTE(review): `fmt` is a single multi-format string, so numpy
            # ignores `delimiter` and the columns come out space-separated.
            # Left unchanged because these files are appended to and
            # existing rows/readers presumably use this layout; confirm
            # before switching to real commas.
            for frame_joints in output_dict['joints3d'][::5]:
                flat_arr = frame_joints.flatten()
                np.savetxt(joints3D_csv,
                           [np.append(flat_arr, [video_label])],
                           delimiter=',',
                           fmt=' '.join(['%f'] * len(flat_arr) + ['%i']))

            for frame_pose in output_dict['pose'][::5]:
                pose_arr = frame_pose.flatten()
                np.savetxt(pose_csv,
                           [np.append(pose_arr, [video_label])],
                           delimiter=',',
                           fmt=' '.join(['%f'] * len(pose_arr) + ['%i']))

    end = time.time()
    fps = num_frames / (end - vibe_time)

    print(f'VIBE FPS: {fps:.2f}')
    total_time = time.time() - total_time
    print(
        f'Total time spent: {total_time:.2f} seconds (including model loading time).'
    )
    print(
        f'Total FPS (including model loading time): {num_frames / total_time:.2f}.'
    )
def main(args):
    """Run VIBE on a video, render the result and export per-person pose CSVs.

    The video (optionally downloaded from YouTube) is split into frames,
    people are tracked, VIBE is run on every tracklet (optionally refined by
    Temporal SMPLify) and the results are dumped to ``vibe_output.pkl``.
    Unless ``args.no_render`` is set, the meshes are rendered over the frames
    with the frame index stamped in the corner and assembled into a video.
    Finally one CSV per tracked person is written to the output folder, each
    row holding the frame id followed by that frame's pose parameters.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``vid_file``, ``output_folder``, ``tracking_method``,
            ``staf_dir``, ``display``, ``tracker_batch_size``, ``detector``,
            ``yolo_img_size``, ``vibe_batch_size``, ``run_smplify``,
            ``no_render``, ``wireframe``, ``sideview`` and ``save_obj``.
    """
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    video_file = args.vid_file

    # ========= [Optional] download the youtube video ========= #
    if video_file.startswith('https://www.youtube.com'):
        print(f'Donwloading YouTube video \"{video_file}\"')
        video_file = download_youtube_clip(video_file, '/tmp')

        if video_file is None:
            exit('Youtube url is not valid!')

        print(f'YouTube Video has been downloaded to {video_file}...')

    if not os.path.isfile(video_file):
        exit(f'Input video \"{video_file}\" does not exist!')

    output_path = os.path.join(args.output_folder, os.path.basename(video_file).replace('.mp4', ''))
    os.makedirs(output_path, exist_ok=True)

    image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True)

    print(f'Input video number of frames {num_frames}')
    orig_height, orig_width = img_shape[:2]

    total_time = time.time()

    # ========= Run tracking ========= #
    bbox_scale = 1.1
    if args.tracking_method == 'pose':
        if not os.path.isabs(video_file):
            video_file = os.path.join(os.getcwd(), video_file)
        tracking_results = run_posetracker(video_file, staf_folder=args.staf_dir, display=args.display)
    else:
        # run multi object tracker
        mot = MPT(
            device=device,
            batch_size=args.tracker_batch_size,
            display=args.display,
            detector_type=args.detector,
            output_format='dict',
            yolo_img_size=args.yolo_img_size,
        )
        tracking_results = mot(image_folder)

    # remove tracklets if num_frames is less than MIN_NUM_FRAMES
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    # map_location keeps the CPU fallback working when the checkpoint
    # tensors were saved from a GPU.
    ckpt = torch.load(pretrained_file, map_location=device)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    # ========= Run VIBE on each person ========= #
    print('Running VIBE on each tracklet...')
    vibe_time = time.time()
    vibe_results = {}
    for person_id in tqdm(list(tracking_results.keys())):
        bboxes = joints2d = None

        if args.tracking_method == 'bbox':
            bboxes = tracking_results[person_id]['bbox']
        elif args.tracking_method == 'pose':
            joints2d = tracking_results[person_id]['joints2d']

        frames = tracking_results[person_id]['frames']

        dataset = Inference(
            image_folder=image_folder,
            frames=frames,
            bboxes=bboxes,
            joints2d=joints2d,
            scale=bbox_scale,
        )

        # The dataset exposes the boxes/frames it actually uses.
        bboxes = dataset.bboxes
        frames = dataset.frames
        has_keypoints = joints2d is not None

        # reduce the num of worker if you encountered the error:
        # DLL load failed: The paging file is too small for this operation to complete
        dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=8)

        with torch.no_grad():
            pred_cam, pred_verts, pred_pose = [], [], []
            pred_betas, pred_joints3d, norm_joints2d = [], [], []

            for batch in dataloader:
                if has_keypoints:
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                # Add a leading batch dimension of 1 before the model call.
                batch = batch.unsqueeze(0)
                batch = batch.to(device)

                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]

                # theta is sliced as [0:3] cam, [3:75] pose, [75:] betas.
                pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1))
                pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:, :, 3:75].reshape(batch_size * seqlen, -1))
                pred_betas.append(output['theta'][:, :, 75:].reshape(batch_size * seqlen, -1))
                pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3))

            pred_cam = torch.cat(pred_cam, dim=0)
            pred_verts = torch.cat(pred_verts, dim=0)
            pred_pose = torch.cat(pred_pose, dim=0)
            pred_betas = torch.cat(pred_betas, dim=0)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)

            del batch

        # ========= [Optional] run Temporal SMPLify to refine the results ========= #
        if args.run_smplify and args.tracking_method == 'pose':
            norm_joints2d = np.concatenate(norm_joints2d, axis=0)
            norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
            norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

            # Run Temporal SMPLify
            update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
                new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                    pred_rotmat=pred_pose,
                    pred_betas=pred_betas,
                    pred_cam=pred_cam,
                    j2d=norm_joints2d,
                    device=device,
                    batch_size=norm_joints2d.shape[0],
                    pose2aa=False,
                )

            # update the parameters after refinement
            print(f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}')
            pred_verts = pred_verts.cpu()
            pred_cam = pred_cam.cpu()
            pred_pose = pred_pose.cpu()
            pred_betas = pred_betas.cpu()
            pred_joints3d = pred_joints3d.cpu()
            # Only the entries selected by `update` take the refined values.
            pred_verts[update] = new_opt_vertices[update]
            pred_cam[update] = new_opt_cam[update]
            pred_pose[update] = new_opt_pose[update]
            pred_betas[update] = new_opt_betas[update]
            pred_joints3d[update] = new_opt_joints3d[update]

        elif args.run_smplify and args.tracking_method == 'bbox':
            print('[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!')
            print('[WARNING] Continuing without running Temporal SMPLify!..')

        # ========= Save results to a pickle file ========= #
        pred_cam = pred_cam.cpu().numpy()
        pred_verts = pred_verts.cpu().numpy()
        pred_pose = pred_pose.cpu().numpy()
        pred_betas = pred_betas.cpu().numpy()
        pred_joints3d = pred_joints3d.cpu().numpy()

        orig_cam = convert_crop_cam_to_orig_img(
            cam=pred_cam,
            bbox=bboxes,
            img_width=orig_width,
            img_height=orig_height
        )

        vibe_results[person_id] = {
            'pred_cam': pred_cam,
            'orig_cam': orig_cam,
            'verts': pred_verts,
            'pose': pred_pose,
            'betas': pred_betas,
            'joints3d': pred_joints3d,
            'joints2d': joints2d,
            'bboxes': bboxes,
            'frame_ids': frames,
        }

    del model

    end = time.time()
    fps = num_frames / (end - vibe_time)

    print(f'VIBE FPS: {fps:.2f}')
    total_time = time.time() - total_time
    print(f'Total time spent: {total_time:.2f} seconds (including model loading time).')
    print(f'Total FPS (including model loading time): {num_frames / total_time:.2f}.')

    output_pkl_path = os.path.join(output_path, "vibe_output.pkl")
    print(f'Saving output results to \"{output_pkl_path}\".')
    joblib.dump(vibe_results, output_pkl_path)

    if not args.no_render:
        # ========= Render results as a single video ========= #
        renderer = Renderer(resolution=(orig_width, orig_height), orig_img=True, wireframe=args.wireframe)

        output_img_folder = f'{image_folder}_output'
        os.makedirs(output_img_folder, exist_ok=True)

        print(f'Rendering output video, writing frames to {output_img_folder}')

        # prepare results for rendering; indexed by frame, then by person id
        frame_results = prepare_rendering_results(vibe_results, num_frames)
        mesh_color = {k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0) for k in vibe_results.keys()}

        image_file_names = sorted([
            os.path.join(image_folder, x)
            for x in os.listdir(image_folder)
            if x.endswith('.png') or x.endswith('.jpg')
        ])

        # Style of the frame-index label stamped on every frame.
        font = cv2.FONT_HERSHEY_SIMPLEX
        label_origin = (10, 20)  # position of text (x, y)

        for frame_idx, img_fname in enumerate(tqdm(image_file_names)):
            img = cv2.imread(img_fname)

            if args.sideview:
                side_img = np.zeros_like(img)

            for person_id, person_data in frame_results[frame_idx].items():
                frame_verts = person_data['verts']
                frame_cam = person_data['cam']

                mc = mesh_color[person_id]

                mesh_filename = None

                if args.save_obj:
                    mesh_folder = os.path.join(output_path, 'meshes', f'{person_id:04d}')
                    os.makedirs(mesh_folder, exist_ok=True)
                    mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj')

                img = renderer.render(
                    img,
                    frame_verts,
                    cam=frame_cam,
                    color=mc,
                    mesh_filename=mesh_filename,
                )

                if args.sideview:
                    side_img = renderer.render(
                        side_img,
                        frame_verts,
                        cam=frame_cam,
                        color=mc,
                        angle=270,
                        axis=[0, 1, 0],
                    )

            if args.sideview:
                img = np.concatenate([img, side_img], axis=1)

            cv2.putText(img, str(frame_idx), label_origin, font, 0.55, (0, 255, 0), 1)

            cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img)

            if args.display:
                cv2.imshow('Video', img)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

        if args.display:
            cv2.destroyAllWindows()

        # ========= Save rendered video ========= #
        vid_name = os.path.basename(video_file)
        save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4'
        save_name = os.path.join(output_path, save_name)
        print(f'Saving result video to {save_name}')
        images_to_video(img_folder=output_img_folder, output_vid_file=save_name)
        shutil.rmtree(output_img_folder)

    # Remove the extracted input frames.
    shutil.rmtree(image_folder)

    # ========= Save the per-person pose CSV files ========= #
    # Generate the joints csv files for animating avatars later. The
    # in-memory results are used directly; they are the same data that was
    # just written to vibe_output.pkl.
    for track_id in vibe_results:
        print('Track ids:', track_id, end='\n\n')

    print('VIBE output file content:', end='\n\n')
    vid_name = os.path.basename(video_file)

    # Header: 73 columns named "0".."72" (frame id at 0, then 72 pose values).
    field_names = [str(idx) for idx in range(73)]

    # File name format: <video>_<personId>_<numFrames>.csv
    for track_id, person in vibe_results.items():
        pose = person['pose']
        pose_filename = os.path.join(
            output_path, f'{vid_name}_{track_id}_{pose.shape[0]}.csv')

        with open(pose_filename, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(field_names)
            for row_idx, frame_pose in enumerate(pose):
                writer.writerow([person['frame_ids'][row_idx], *frame_pose])

    print('================= END =================')
def main(args):
    """Run VIBE on a video and print the yoga pose predicted from its 3D joints.

    The video (optionally downloaded from YouTube) is split into frames,
    people are tracked and VIBE is run on every tracklet (optionally followed
    by Temporal SMPLify and 1 Euro smoothing). Each frame's flattened 3D
    joints are then fed to the pickled classification model, and the share of
    frames assigned to every pose plus the most frequent pose are printed.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``vid_file``, ``tracking_method``, ``staf_dir``, ``display``,
            ``tracker_batch_size``, ``detector``, ``yolo_img_size``,
            ``vibe_batch_size``, ``run_smplify``, ``smooth``,
            ``smooth_min_cutoff`` and ``smooth_beta``.
    """
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

    # Integer class id produced by the classifier -> pose name.
    inverse_map = {
        1: 'bridge',
        2: 'childs',
        3: 'downwarddog',
        4: 'mountain',
        5: 'plank',
        6: 'seatedforwardbend',
        7: 'tree',
        8: 'trianglepose',
        9: 'warrior1',
        10: 'warrior2'
    }

    video_file = args.vid_file

    # ========= [Optional] download the youtube video ========= #
    if video_file.startswith('https://www.youtube.com'):
        print(f'Donwloading YouTube video \"{video_file}\"')
        video_file = download_youtube_clip(video_file, '/tmp')

        if video_file is None:
            exit('Youtube url is not valid!')

        print(f'YouTube Video has been downloaded to {video_file}...')

    if not os.path.isfile(video_file):
        exit(f'Input video \"{video_file}\" does not exist!')

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load Classification Model ========= #
    # NOTE: unpickling executes code embedded in the file, so this must only
    # ever point at a trusted, locally produced model file.
    with open('view_classification_model.pkl', 'rb') as model_file:
        classification_model = pickle.load(model_file)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    # map_location keeps the CPU fallback working when the checkpoint
    # tensors were saved from a GPU.
    ckpt = torch.load(pretrained_file, map_location=device)
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()

    image_folder, num_frames, img_shape = video_to_images(video_file,
                                                          return_info=True)

    print(f'Input video number of frames {num_frames}')
    orig_height, orig_width = img_shape[:2]

    # ========= Run tracking ========= #
    bbox_scale = 1.1
    if args.tracking_method == 'pose':
        if not os.path.isabs(video_file):
            video_file = os.path.join(os.getcwd(), video_file)
        tracking_results = run_posetracker(video_file,
                                           staf_folder=args.staf_dir,
                                           display=args.display)
    else:
        # run multi object tracker
        mot = MPT(
            device=device,
            batch_size=args.tracker_batch_size,
            display=args.display,
            detector_type=args.detector,
            output_format='dict',
            yolo_img_size=args.yolo_img_size,
        )
        tracking_results = mot(image_folder)

    # remove tracklets if num_frames is less than MIN_NUM_FRAMES
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]

    # ========= Run VIBE on each person ========= #
    for person_id in list(tracking_results.keys()):
        bboxes = joints2d = None

        if args.tracking_method == 'bbox':
            bboxes = tracking_results[person_id]['bbox']
        elif args.tracking_method == 'pose':
            joints2d = tracking_results[person_id]['joints2d']

        frames = tracking_results[person_id]['frames']

        dataset = Inference(
            image_folder=image_folder,
            frames=frames,
            bboxes=bboxes,
            joints2d=joints2d,
            scale=bbox_scale,
        )

        # The dataset exposes the boxes/frames it actually uses.
        bboxes = dataset.bboxes
        frames = dataset.frames
        has_keypoints = joints2d is not None

        dataloader = DataLoader(dataset,
                                batch_size=args.vibe_batch_size,
                                num_workers=16)

        with torch.no_grad():
            pred_cam, pred_verts, pred_pose = [], [], []
            pred_betas, pred_joints3d, norm_joints2d = [], [], []

            for batch in dataloader:
                if has_keypoints:
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                # Add a leading batch dimension of 1 before the model call.
                batch = batch.unsqueeze(0)
                batch = batch.to(device)

                batch_size, seqlen = batch.shape[:2]
                output = model(batch)[-1]

                # theta is sliced as [0:3] cam, [3:75] pose, [75:] betas.
                pred_cam.append(output['theta'][:, :, :3].reshape(
                    batch_size * seqlen, -1))
                pred_verts.append(output['verts'].reshape(
                    batch_size * seqlen, -1, 3))
                pred_pose.append(output['theta'][:, :, 3:75].reshape(
                    batch_size * seqlen, -1))
                pred_betas.append(output['theta'][:, :, 75:].reshape(
                    batch_size * seqlen, -1))
                pred_joints3d.append(output['kp_3d'].reshape(
                    batch_size * seqlen, -1, 3))

            pred_cam = torch.cat(pred_cam, dim=0)
            pred_verts = torch.cat(pred_verts, dim=0)
            pred_pose = torch.cat(pred_pose, dim=0)
            pred_betas = torch.cat(pred_betas, dim=0)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)

            del batch

        # ========= [Optional] run Temporal SMPLify to refine the results ========= #
        if args.run_smplify and args.tracking_method == 'pose':
            norm_joints2d = np.concatenate(norm_joints2d, axis=0)
            norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
            norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

            # Run Temporal SMPLify
            update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
                new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                    pred_rotmat=pred_pose,
                    pred_betas=pred_betas,
                    pred_cam=pred_cam,
                    j2d=norm_joints2d,
                    device=device,
                    batch_size=norm_joints2d.shape[0],
                    pose2aa=False,
                )

            # update the parameters after refinement
            print(
                f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}'
            )
            pred_verts = pred_verts.cpu()
            pred_cam = pred_cam.cpu()
            pred_pose = pred_pose.cpu()
            pred_betas = pred_betas.cpu()
            pred_joints3d = pred_joints3d.cpu()
            # Only the entries selected by `update` take the refined values.
            pred_verts[update] = new_opt_vertices[update]
            pred_cam[update] = new_opt_cam[update]
            pred_pose[update] = new_opt_pose[update]
            pred_betas[update] = new_opt_betas[update]
            pred_joints3d[update] = new_opt_joints3d[update]

        elif args.run_smplify and args.tracking_method == 'bbox':
            print(
                '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!'
            )
            print('[WARNING] Continuing without running Temporal SMPLify!..')

        # ========= Convert the predictions to numpy ========= #
        pred_cam = pred_cam.cpu().numpy()
        pred_verts = pred_verts.cpu().numpy()
        pred_pose = pred_pose.cpu().numpy()
        pred_betas = pred_betas.cpu().numpy()
        pred_joints3d = pred_joints3d.cpu().numpy()

        # Runs 1 Euro Filter to smooth out the results
        if args.smooth:
            min_cutoff = args.smooth_min_cutoff  # 0.004
            beta = args.smooth_beta  # 1.5
            print(
                f'Running smoothing on person {person_id}, min_cutoff: {min_cutoff}, beta: {beta}'
            )
            pred_verts, pred_pose, pred_joints3d = smooth_pose(
                pred_pose, pred_betas, min_cutoff=min_cutoff, beta=beta)

        orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam,
                                                bbox=bboxes,
                                                img_width=orig_width,
                                                img_height=orig_height)

        output_dict = {
            'pred_cam': pred_cam,
            'orig_cam': orig_cam,
            'verts': pred_verts,
            'pose': pred_pose,
            'betas': pred_betas,
            'joints3d': pred_joints3d,
            'joints2d': joints2d,
            'bboxes': bboxes,
            'frame_ids': frames,
        }

        # ========= Extract 3D joint feature for each frame ========= #
        # One row per frame: that frame's joints flattened into a vector.
        frame_joints = output_dict['joints3d']
        input_df = pd.DataFrame(
            frame_joints.reshape(frame_joints.shape[0], -1))
        input_df = input_df.round(2)

        predicted_classes = classification_model.predict_classes(input_df)
        output_df = pd.DataFrame(predicted_classes)

        # ========= Printing all possible poses detected for the video ========= #
        total_frames = len(output_df)
        print(
            '\nPrinting probabilities for yoga poses predicted in different frames.'
        )
        # DataFrame.value_counts() is keyed by tuples of column values, so
        # the class id is the first (only) tuple element.
        for class_key, frame_count in output_df.value_counts().items():
            val = round((frame_count / total_frames) * 100, 2)
            print('Probability of the yoga pose being ' +
                  inverse_map[class_key[0]].capitalize() + " is: " + str(val))

        print('\nThe yoga pose in the given video is: ' +
              inverse_map[output_df[0].value_counts().idxmax()].capitalize())
def main(args):
    """Run VIBE on every video named in ``args.video_list`` and save one pickle per video.

    For each existing video the frames are extracted, people are tracked
    (pose tracker when ``args.tracking_method == 'pose'``, multi-object
    tracker otherwise), VIBE is run on each remaining tracklet and the
    per-person outputs are written with ``joblib.dump`` to
    ``<args.output_folder>/<video basename>.pkl``.

    Args:
        args: Parsed command-line namespace. Attributes read here:
            ``gpu_id``, ``video_list``, ``output_folder``,
            ``tracking_method``, ``staf_dir``, ``display``,
            ``tracker_batch_size``, ``detector``, ``yolo_img_size``,
            ``vibe_batch_size`` and ``run_smplify``.

    Returns:
        None. Returns early when the video list contains no entries.
    """
    # Only pin a CUDA device when CUDA exists; calling set_device on a
    # CPU-only host would raise before the CPU fallback could be used.
    if torch.cuda.is_available():
        torch.cuda.set_device(args.gpu_id)
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    print(f'Loading video list {args.video_list}')
    # Close the list file deterministically and ignore blank lines so they
    # are not later reported as missing videos.
    with open(args.video_list, 'r') as list_file:
        video_list = [line.strip() for line in list_file if line.strip()]

    if len(video_list) < 1:
        print('No files were found in video list')
        return

    print('Loading VIBE model')

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load VIBE pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    # map_location lets a checkpoint saved on GPU load on the selected device.
    ckpt = torch.load(pretrained_file, map_location=device)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from "{pretrained_file}"')

    num_videos = len(video_list)
    print(f'Processing {num_videos} videos.')

    for video_idx, video_file in enumerate(video_list, start=1):
        if not osp.isfile(video_file):
            print(
                f'Input video "{video_file}" does not exist! Moving on to next file.'
            )
            continue

        filename = osp.splitext(osp.basename(video_file))[0]
        output_path = osp.join(args.output_folder, filename)
        os.makedirs(output_path, exist_ok=True)

        image_folder, num_frames, img_shape = video_to_images(
            video_file, return_info=True)
        print(f'[{video_idx}/{num_videos}] Processing {num_frames} frames')
        orig_height, orig_width = img_shape[:2]

        # ========= Run tracking ========= #
        bbox_scale = 1.1
        if args.tracking_method == 'pose':
            # The pose tracker is handed an absolute path to the video.
            if not osp.isabs(video_file):
                video_file = osp.join(os.getcwd(), video_file)
            tracking_results = run_posetracker(video_file,
                                               staf_folder=args.staf_dir,
                                               display=args.display)
        else:
            # run multi object tracker
            mot = MPT(
                device=device,
                batch_size=args.tracker_batch_size,
                display=args.display,
                detector_type=args.detector,
                output_format='dict',
                yolo_img_size=args.yolo_img_size,
            )
            tracking_results = mot(image_folder)

        # remove tracklets if num_frames is less than MIN_NUM_FRAMES
        for person_id in list(tracking_results.keys()):
            if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
                del tracking_results[person_id]

        # ========= Run VIBE on each person ========= #
        print('Running VIBE on each tracklet...')
        vibe_results = {}
        for person_id in tqdm(list(tracking_results.keys())):
            bboxes = joints2d = None

            if args.tracking_method == 'bbox':
                bboxes = tracking_results[person_id]['bbox']
            elif args.tracking_method == 'pose':
                joints2d = tracking_results[person_id]['joints2d']

            frames = tracking_results[person_id]['frames']

            dataset = Inference(
                image_folder=image_folder,
                frames=frames,
                bboxes=bboxes,
                joints2d=joints2d,
                scale=bbox_scale,
            )

            # Use the boxes/frames exposed by the dataset from here on.
            bboxes = dataset.bboxes
            frames = dataset.frames
            has_keypoints = joints2d is not None

            dataloader = DataLoader(dataset,
                                    batch_size=args.vibe_batch_size,
                                    num_workers=16)

            with torch.no_grad():
                pred_cam, pred_verts, pred_pose = [], [], []
                pred_betas, pred_joints3d, norm_joints2d = [], [], []

                for batch in dataloader:
                    if has_keypoints:
                        batch, nj2d = batch
                        norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                    # Add a leading dimension so the loader batch is fed to
                    # the model as a single sequence.
                    batch = batch.unsqueeze(0)
                    batch = batch.to(device)

                    batch_size, seqlen = batch.shape[:2]
                    output = model(batch)[-1]

                    # 'theta' is split as [:3] camera, [3:75] pose, [75:] betas.
                    pred_cam.append(output['theta'][:, :, :3].reshape(
                        batch_size * seqlen, -1))
                    pred_verts.append(output['verts'].reshape(
                        batch_size * seqlen, -1, 3))
                    pred_pose.append(output['theta'][:, :, 3:75].reshape(
                        batch_size * seqlen, -1))
                    pred_betas.append(output['theta'][:, :, 75:].reshape(
                        batch_size * seqlen, -1))
                    pred_joints3d.append(output['kp_3d'].reshape(
                        batch_size * seqlen, -1, 3))

                pred_cam = torch.cat(pred_cam, dim=0)
                pred_verts = torch.cat(pred_verts, dim=0)
                pred_pose = torch.cat(pred_pose, dim=0)
                pred_betas = torch.cat(pred_betas, dim=0)
                pred_joints3d = torch.cat(pred_joints3d, dim=0)

                del batch

            # ========= [Optional] run Temporal SMPLify to refine the results ========= #
            if args.run_smplify and args.tracking_method == 'pose':
                norm_joints2d = np.concatenate(norm_joints2d, axis=0)
                norm_joints2d = convert_kps(norm_joints2d,
                                            src='staf',
                                            dst='spin')
                norm_joints2d = torch.from_numpy(norm_joints2d).float().to(
                    device)

                # Run Temporal SMPLify
                (update, new_opt_vertices, new_opt_cam, new_opt_pose,
                 new_opt_betas, new_opt_joints3d, new_opt_joint_loss,
                 opt_joint_loss) = smplify_runner(
                     pred_rotmat=pred_pose,
                     pred_betas=pred_betas,
                     pred_cam=pred_cam,
                     j2d=norm_joints2d,
                     device=device,
                     batch_size=norm_joints2d.shape[0],
                     pose2aa=False,
                 )

                # update the parameters after refinement
                print(
                    f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}'
                )
                pred_verts = pred_verts.cpu()
                pred_cam = pred_cam.cpu()
                pred_pose = pred_pose.cpu()
                pred_betas = pred_betas.cpu()
                pred_joints3d = pred_joints3d.cpu()
                # Overwrite only the entries selected by `update`.
                pred_verts[update] = new_opt_vertices[update]
                pred_cam[update] = new_opt_cam[update]
                pred_pose[update] = new_opt_pose[update]
                pred_betas[update] = new_opt_betas[update]
                pred_joints3d[update] = new_opt_joints3d[update]

            elif args.run_smplify and args.tracking_method == 'bbox':
                print(
                    '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!'
                )
                print(
                    '[WARNING] Continuing without running Temporal SMPLify!..')

            # ========= Collect this person's results as numpy arrays ========= #
            pred_cam = pred_cam.cpu().numpy()
            pred_verts = pred_verts.cpu().numpy()
            pred_pose = pred_pose.cpu().numpy()
            pred_betas = pred_betas.cpu().numpy()
            pred_joints3d = pred_joints3d.cpu().numpy()

            orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam,
                                                    bbox=bboxes,
                                                    img_width=orig_width,
                                                    img_height=orig_height)

            output_dict = {
                'pred_cam': pred_cam,
                'orig_cam': orig_cam,
                'verts': pred_verts,
                'pose': pred_pose,
                'betas': pred_betas,
                'joints3d': pred_joints3d,
                'joints2d': joints2d,
                'bboxes': bboxes,
                'frame_ids': frames,
            }

            vibe_results[person_id] = output_dict

        # Clean-up the temporary folder of extracted frames
        shutil.rmtree(image_folder)

        # Save the outputs to joblib pkl file. File is loaded through joblib.load(pkl_path)
        output_pkl_path = osp.join(args.output_folder, f'{filename}.pkl')
        print(f'Saving output results to "{output_pkl_path}".')
        joblib.dump(vibe_results, output_pkl_path)

    # Clean-up after processing (kept outside the loop: the model is reused
    # for every video in the list).
    del model

    print('================= END =================')
def main(args):
    """Run VIBE on a folder of pre-cropped frames, save the results and optionally render them.

    The predictions are pickled to ``vibe_results.pkl`` inside an output
    folder derived from ``args.input_folder`` by replacing
    ``'cropped_frames'`` with ``'vibe_results'``. Unless ``args.no_render``
    is set, each input image is rendered with the predicted mesh (optionally
    with side views and a reposed render) and the frames are assembled into
    ``vibe_video.mp4``.

    Args:
        args: Parsed command-line namespace. Attributes read here:
            ``input_folder``, ``vibe_batch_size``, ``run_smplify``,
            ``tracking_method`` (only when ``run_smplify`` is set),
            ``no_render``, ``wireframe``, ``save_obj``, ``sideview`` and
            ``reposed_render``.

    Returns:
        None.
    """
    # Start the clock before the model is built so the "including model
    # loading time" figures printed below are actually true.
    start_time = time.time()

    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        device=device,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file, map_location=device)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from "{pretrained_file}"')

    # ========= Run VIBE on crops ========= #
    print('Running VIBE on crops...')
    vibe_time = time.time()

    image_folder = args.input_folder
    dataset = InferenceFromCrops(image_folder=image_folder)
    # Render resolution is fixed here rather than read from the images.
    orig_height = orig_width = 512

    dataloader = DataLoader(dataset,
                            batch_size=args.vibe_batch_size,
                            num_workers=0)

    with torch.no_grad():
        pred_cam, pred_verts, pred_pose = [], [], []
        pred_betas, pred_joints3d, norm_joints2d = [], [], []

        for batch_num, batch in enumerate(dataloader):
            print("BATCH:", batch_num)
            # Add a leading dimension so the loader batch is fed to the
            # model as a single sequence.
            batch = batch.unsqueeze(0)
            batch = batch.to(device)

            batch_size, seqlen = batch.shape[:2]
            output = model(batch)[-1]

            # 'theta' is split as [:3] camera, [3:75] pose, [75:] betas.
            pred_cam.append(output['theta'][:, :, :3].reshape(
                batch_size * seqlen, -1))
            pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1,
                                                      3))
            pred_pose.append(output['theta'][:, :, 3:75].reshape(
                batch_size * seqlen, -1))
            pred_betas.append(output['theta'][:, :, 75:].reshape(
                batch_size * seqlen, -1))
            pred_joints3d.append(output['kp_3d'].reshape(
                batch_size * seqlen, -1, 3))

        pred_cam = torch.cat(pred_cam, dim=0)
        pred_verts = torch.cat(pred_verts, dim=0)
        pred_pose = torch.cat(pred_pose, dim=0)
        pred_betas = torch.cat(pred_betas, dim=0)
        pred_joints3d = torch.cat(pred_joints3d, dim=0)

        del batch

    # ========= [Optional] run Temporal SMPLify to refine the results ========= #
    if args.run_smplify and args.tracking_method == 'pose':
        if not norm_joints2d:
            # The loop above never collects 2D keypoints, so concatenating
            # the empty list would raise ValueError. Warn and carry on with
            # the unrefined predictions instead of crashing.
            print(
                '[WARNING] No 2D keypoints are available for cropped inputs!'
            )
            print('[WARNING] Continuing without running Temporal SMPLify!..')
        else:
            norm_joints2d = np.concatenate(norm_joints2d, axis=0)
            norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
            norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

            # Run Temporal SMPLify
            (update, new_opt_vertices, new_opt_cam, new_opt_pose,
             new_opt_betas, new_opt_joints3d, new_opt_joint_loss,
             opt_joint_loss) = smplify_runner(
                 pred_rotmat=pred_pose,
                 pred_betas=pred_betas,
                 pred_cam=pred_cam,
                 j2d=norm_joints2d,
                 device=device,
                 batch_size=norm_joints2d.shape[0],
                 pose2aa=False,
             )

            # update the parameters after refinement
            print(
                f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}'
            )
            pred_verts = pred_verts.cpu()
            pred_cam = pred_cam.cpu()
            pred_pose = pred_pose.cpu()
            pred_betas = pred_betas.cpu()
            pred_joints3d = pred_joints3d.cpu()
            # Overwrite only the entries selected by `update`.
            pred_verts[update] = new_opt_vertices[update]
            pred_cam[update] = new_opt_cam[update]
            pred_pose[update] = new_opt_pose[update]
            pred_betas[update] = new_opt_betas[update]
            pred_joints3d[update] = new_opt_joints3d[update]

    elif args.run_smplify and args.tracking_method == 'bbox':
        print(
            '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!'
        )
        print('[WARNING] Continuing without running Temporal SMPLify!..')

    # ========= Save results to a pickle file ========= #
    output_path = image_folder.replace('cropped_frames', 'vibe_results')
    os.makedirs(output_path, exist_ok=True)

    pred_cam = pred_cam.cpu().numpy()
    pred_verts = pred_verts.cpu().numpy()
    pred_pose = pred_pose.cpu().numpy()
    pred_betas = pred_betas.cpu().numpy()
    pred_joints3d = pred_joints3d.cpu().numpy()

    vibe_results = {
        'pred_cam': pred_cam,
        'verts': pred_verts,
        'pose': pred_pose,
        'betas': pred_betas,
        'joints3d': pred_joints3d,
    }

    del model

    end = time.time()
    fps = len(dataset) / (end - vibe_time)

    print(f'VIBE FPS: {fps:.2f}')
    total_time = time.time() - start_time
    print(
        f'Total time spent: {total_time:.2f} seconds (including model loading time).'
    )
    print(
        f'Total FPS (including model loading time): {len(dataset) / total_time:.2f}.'
    )

    results_pkl_path = os.path.join(output_path, 'vibe_results.pkl')
    print(f'Saving vibe results to "{results_pkl_path}".')

    with open(results_pkl_path, 'wb') as f_save:
        pickle.dump(vibe_results, f_save)

    if not args.no_render:
        # ========= Render results as a single video ========= #
        renderer = Renderer(resolution=(orig_width, orig_height),
                            orig_img=True,
                            wireframe=args.wireframe)

        output_img_folder = os.path.join(output_path, 'vibe_images')
        os.makedirs(output_img_folder, exist_ok=True)

        print(f'Rendering output video, writing frames to {output_img_folder}')

        image_file_names = sorted(
            os.path.join(image_folder, x) for x in os.listdir(image_folder)
            if x.endswith(('.png', '.jpg')))

        # Frame-independent setup is done once instead of once per frame.
        mesh_folder = None
        if args.save_obj:
            mesh_folder = os.path.join(output_path, 'vibe_meshes')
            os.makedirs(mesh_folder, exist_ok=True)

        smpl = zero_pose = None
        if args.sideview and args.reposed_render:
            smpl = SMPL('data/vibe_data', batch_size=1)
            # All-zero pose except the first component, which is set to pi.
            zero_pose = torch.from_numpy(np.zeros(
                (1, pred_pose.shape[-1]))).float()
            zero_pose[:, 0] = np.pi

        for frame_idx, img_fname in enumerate(tqdm(image_file_names)):
            img = cv2.imread(img_fname)

            frame_verts = vibe_results['verts'][frame_idx]
            frame_cam = vibe_results['pred_cam'][frame_idx]

            mesh_filename = None
            if mesh_folder is not None:
                mesh_filename = os.path.join(mesh_folder,
                                             f'{frame_idx:06d}.obj')

            rend_img = renderer.render(
                img,
                frame_verts,
                cam=frame_cam,
                mesh_filename=mesh_filename,
            )

            whole_img = rend_img

            if args.sideview:
                side_img_bg = np.zeros_like(img)
                side_rend_img90 = renderer.render(
                    side_img_bg,
                    frame_verts,
                    cam=frame_cam,
                    angle=90,
                    axis=[0, 1, 0],
                )
                side_rend_img270 = renderer.render(
                    side_img_bg,
                    frame_verts,
                    cam=frame_cam,
                    angle=270,
                    axis=[0, 1, 0],
                )

                if args.reposed_render:
                    # Render this frame's body shape in the fixed zero pose.
                    pred_frame_betas = torch.from_numpy(
                        pred_betas[frame_idx][None, :]).float()
                    with torch.no_grad():
                        reposed_smpl_output = smpl(
                            betas=pred_frame_betas,
                            body_pose=zero_pose[:, 3:],
                            global_orient=zero_pose[:, :3])
                        reposed_verts = reposed_smpl_output.vertices
                        reposed_verts = reposed_verts.cpu().detach().numpy()

                    reposed_cam = np.array([0.9, 0, 0])
                    reposed_rend_img = renderer.render(side_img_bg,
                                                       reposed_verts[0],
                                                       cam=reposed_cam)
                    reposed_rend_img90 = renderer.render(side_img_bg,
                                                         reposed_verts[0],
                                                         cam=reposed_cam,
                                                         angle=90,
                                                         axis=[0, 1, 0])

                    top_row = np.concatenate(
                        [img, reposed_rend_img, reposed_rend_img90], axis=1)
                else:
                    top_row = np.concatenate([img, side_img_bg, side_img_bg],
                                             axis=1)

                # 2x3 grid: input/reposed views on top, rendered views below.
                bot_row = np.concatenate(
                    [rend_img, side_rend_img90, side_rend_img270], axis=1)
                whole_img = np.concatenate([top_row, bot_row], axis=0)

            # Keep the input file name so outputs line up with the inputs.
            cv2.imwrite(
                os.path.join(output_img_folder, os.path.basename(img_fname)),
                whole_img)

        # ========= Save rendered video ========= #
        save_vid_path = os.path.join(output_path, 'vibe_video.mp4')
        print(f'Saving result video to {save_vid_path}')
        images_to_video(img_folder=output_img_folder,
                        output_vid_file=save_vid_path)

    print('================= END =================')