def get_single_item(self, index):
    start_index, end_index = self.vid_indices[index]

    with h5py.File(self.h5_file, 'r') as db:
        self.db = db

        kp_2d = self.db['joints2D'][start_index:end_index + 1]
        kp_2d = convert_kps(kp_2d, src='insta', dst='spin')
        kp_2d_tensor = np.ones((self.seqlen, 49, 3), dtype=np.float16)

        input = torch.from_numpy(self.db['features'][start_index:end_index + 1]).float()

        vid_name = self.db['vid_name'][start_index:end_index + 1]
        frame_id = self.db['frame_id'][start_index:end_index + 1].astype(str)
        instance_id = np.array([v.decode('ascii') + f for v, f in zip(vid_name, frame_id)])

    for idx in range(self.seqlen):
        kp_2d[idx, :, :2] = normalize_2d_kp(kp_2d[idx, :, :2], 224)
        kp_2d_tensor[idx] = kp_2d[idx]

    target = {
        'features': input,
        'kp_2d': torch.from_numpy(kp_2d_tensor).float(),  # 2D keypoints transformed according to bbox cropping
        # 'instance_id': instance_id
    }
    return target
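# ---------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): normalize_2d_kp is assumed here to
# rescale pixel coordinates of the 224x224 crop into the [-1, 1] range before they are fed
# to the model. A minimal stand-in under that assumption could look like this:
def _normalize_2d_kp_sketch(kp_2d, crop_size=224):
    # Map pixel coordinates in [0, crop_size] to [-1, 1] (hypothetical helper, for illustration only).
    return 2.0 * kp_2d / crop_size - 1.0
# ---------------------------------------------------------------------------------------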
def temporal_simplify(pred_verts, pred_cam, pred_pose, pred_betas, pred_joints3d,
                      norm_joints2d, device, args):
    if args.run_smplify and args.tracking_method == 'pose':
        norm_joints2d = np.concatenate(norm_joints2d, axis=0)
        norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
        norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

        print('pred_verts is ', pred_verts)

        # Run Temporal SMPLify
        update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
        new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
            pred_rotmat=torch.cat(pred_pose, dim=0),
            pred_betas=torch.cat(pred_betas, dim=0),
            pred_cam=torch.cat(pred_cam, dim=0),
            j2d=norm_joints2d,
            device=device,
            batch_size=norm_joints2d.shape[0],
            pose2aa=False,
        )

        pred_verts[0][update] = new_opt_vertices[update].to(device)
        pred_cam[0][update] = new_opt_cam[update].to(device)
        pred_pose[0][update] = new_opt_pose[update].to(device)
        pred_betas[0][update] = new_opt_betas[update].to(device)
        pred_joints3d[0][update] = new_opt_joints3d[update].to(device)

        pred_verts[0] = pred_verts[0].cpu()
        pred_cam[0] = pred_cam[0].cpu()
        pred_pose[0] = pred_pose[0].cpu()
        pred_betas[0] = pred_betas[0].cpu()
        pred_joints3d[0] = pred_joints3d[0].cpu()

    elif args.run_smplify and args.tracking_method == 'bbox':
        print('[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!')
        print('[WARNING] Continuing without running Temporal SMPLify!..')

    return pred_verts, pred_cam, pred_pose, pred_betas, pred_joints3d, norm_joints2d
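# ---------------------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original file): temporal_simplify expects the
# per-batch prediction *lists* built in the demo loop (it concatenates them internally when
# SMPLify runs), so it would typically be called right after the dataloader loop, e.g.:
#
#   pred_verts, pred_cam, pred_pose, pred_betas, pred_joints3d, norm_joints2d = \
#       temporal_simplify(pred_verts, pred_cam, pred_pose, pred_betas,
#                         pred_joints3d, norm_joints2d, device, args)
# ---------------------------------------------------------------------------------------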
def main(args): torch.cuda.set_device(args.gpu_id) device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') print(f'Loading video list {args.video_list}') video_list = [l.strip() for l in open(args.video_list, 'r').readlines()] if len(video_list) < 1: print('No files were found in video list') return print('Loading VIBE model') # ========= Define VIBE model ========= # model = VIBE_Demo( seqlen=16, n_layers=2, hidden_size=1024, add_linear=True, use_residual=True, ).to(device) # ========= Load VIBE pretrained weights ========= # pretrained_file = download_ckpt(use_3dpw=False) ckpt = torch.load(pretrained_file) print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}') ckpt = ckpt['gen_state_dict'] model.load_state_dict(ckpt, strict=False) model.eval() print(f'Loaded pretrained weights from \"{pretrained_file}\"') num_videos = len(video_list) print(f'Processing {num_videos} videos.') for video_idx, video_file in enumerate(video_list, start=1): if not osp.isfile(video_file): print(f'Input video \"{video_file}\" does not exist! Moving on to next file.') continue filename = osp.splitext(osp.basename(video_file))[0] output_path = osp.join(args.output_folder, filename) os.makedirs(output_path, exist_ok=True) image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True) print(f'[{video_idx}/{num_videos}] Processing {num_frames} frames') orig_height, orig_width = img_shape[:2] # ========= Run tracking ========= # bbox_scale = 1.1 if args.tracking_method == 'pose': if not osp.isabs(video_file): video_file = osp.join(os.getcwd(), video_file) tracking_results = run_posetracker(video_file, staf_folder=args.staf_dir, display=args.display) else: # run multi object tracker mot = MPT( device=device, batch_size=args.tracker_batch_size, display=args.display, detector_type=args.detector, output_format='dict', yolo_img_size=args.yolo_img_size, ) tracking_results = mot(image_folder) # remove tracklets if num_frames is less than MIN_NUM_FRAMES for person_id in list(tracking_results.keys()): if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES: del tracking_results[person_id] # ========= Run VIBE on each person ========= # print(f'Running VIBE on each tracklet...') vibe_results = {} for person_id in tqdm(list(tracking_results.keys())): bboxes = joints2d = None if args.tracking_method == 'bbox': bboxes = tracking_results[person_id]['bbox'] elif args.tracking_method == 'pose': joints2d = tracking_results[person_id]['joints2d'] frames = tracking_results[person_id]['frames'] dataset = Inference( image_folder=image_folder, frames=frames, bboxes=bboxes, joints2d=joints2d, scale=bbox_scale, ) bboxes = dataset.bboxes frames = dataset.frames has_keypoints = True if joints2d is not None else False dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=16) with torch.no_grad(): pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], [] for batch in dataloader: if has_keypoints: batch, nj2d = batch norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3)) batch = batch.unsqueeze(0) batch = batch.to(device) batch_size, seqlen = batch.shape[:2] output = model(batch)[-1] pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1)) pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3)) pred_pose.append(output['theta'][:,:,3:75].reshape(batch_size * seqlen, -1)) pred_betas.append(output['theta'][:, :,75:].reshape(batch_size * seqlen, -1)) 
pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3)) pred_cam = torch.cat(pred_cam, dim=0) pred_verts = torch.cat(pred_verts, dim=0) pred_pose = torch.cat(pred_pose, dim=0) pred_betas = torch.cat(pred_betas, dim=0) pred_joints3d = torch.cat(pred_joints3d, dim=0) del batch # ========= [Optional] run Temporal SMPLify to refine the results ========= # if args.run_smplify and args.tracking_method == 'pose': norm_joints2d = np.concatenate(norm_joints2d, axis=0) norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin') norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device) # Run Temporal SMPLify update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \ new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner( pred_rotmat=pred_pose, pred_betas=pred_betas, pred_cam=pred_cam, j2d=norm_joints2d, device=device, batch_size=norm_joints2d.shape[0], pose2aa=False, ) # update the parameters after refinement print(f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}') pred_verts = pred_verts.cpu() pred_cam = pred_cam.cpu() pred_pose = pred_pose.cpu() pred_betas = pred_betas.cpu() pred_joints3d = pred_joints3d.cpu() pred_verts[update] = new_opt_vertices[update] pred_cam[update] = new_opt_cam[update] pred_pose[update] = new_opt_pose[update] pred_betas[update] = new_opt_betas[update] pred_joints3d[update] = new_opt_joints3d[update] elif args.run_smplify and args.tracking_method == 'bbox': print('[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!') print('[WARNING] Continuing without running Temporal SMPLify!..') # ========= Save results to a pickle file ========= # pred_cam = pred_cam.cpu().numpy() pred_verts = pred_verts.cpu().numpy() pred_pose = pred_pose.cpu().numpy() pred_betas = pred_betas.cpu().numpy() pred_joints3d = pred_joints3d.cpu().numpy() orig_cam = convert_crop_cam_to_orig_img( cam=pred_cam, bbox=bboxes, img_width=orig_width, img_height=orig_height ) output_dict = { 'pred_cam': pred_cam, 'orig_cam': orig_cam, 'verts': pred_verts, 'pose': pred_pose, 'betas': pred_betas, 'joints3d': pred_joints3d, 'joints2d': joints2d, 'bboxes': bboxes, 'frame_ids': frames, } vibe_results[person_id] = output_dict # Clean-up the temporal folder # Save the outputs to joblib pkl file. 
File is loaded through joblib.load(pkl_path) output_pkl_path = osp.join(args.output_folder, f'{filename}.pkl') print(f'Saving output results to \"{output_pkl_path}\".') joblib.dump(vibe_results, output_pkl_path) if not args.no_render: # ========= Render results as a single video ========= # renderer = Renderer(resolution=(orig_width, orig_height), orig_img=True, wireframe=args.wireframe) output_img_folder = f'{image_folder}_output' os.makedirs(output_img_folder, exist_ok=True) print(f'Rendering output video, writing frames to {output_img_folder}') # prepare results for rendering frame_results = prepare_rendering_results(vibe_results, num_frames) mesh_color = {k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0) for k in vibe_results.keys()} image_file_names = sorted([ os.path.join(image_folder, x) for x in os.listdir(image_folder) if x.endswith('.png') or x.endswith('.jpg') ]) for frame_idx in tqdm(range(len(image_file_names))): img_fname = image_file_names[frame_idx] img = cv2.imread(img_fname) if args.sideview: side_img = np.zeros_like(img) for person_id, person_data in frame_results[frame_idx].items(): frame_verts = person_data['verts'] frame_cam = person_data['cam'] mc = mesh_color[person_id] mesh_filename = None if args.save_obj: mesh_folder = os.path.join(output_path, 'meshes', f'{person_id:04d}') os.makedirs(mesh_folder, exist_ok=True) mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj') img = renderer.render( img, frame_verts, cam=frame_cam, color=mc, mesh_filename=mesh_filename, ) if args.sideview: side_img = renderer.render( side_img, frame_verts, cam=frame_cam, color=mc, angle=270, axis=[0,1,0], ) if args.sideview: img = np.concatenate([img, side_img], axis=1) cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img) if args.display: cv2.imshow('Video', img) if cv2.waitKey(1) & 0xFF == ord('q'): break if args.display: cv2.destroyAllWindows() # ========= Save rendered video ========= # vid_name = os.path.basename(video_file) save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4' save_name = os.path.join(output_path, save_name) print(f'Saving result video to {save_name}') images_to_video(img_folder=output_img_folder, output_vid_file=save_name) shutil.rmtree(output_img_folder) # Clean-up after processing del model shutil.rmtree(image_folder) print('================= END =================')
def read_data_train(dataset_path, debug=False):
    h, w = 2048, 2048
    dataset = {
        'vid_name': [],
        'frame_id': [],
        'joints3D': [],
        'joints2D': [],
        'bbox': [],
        'img_name': [],
        'features': [],
    }

    model = spin.get_pretrained_hmr()

    # training data
    user_list = range(1, 9)
    seq_list = range(1, 3)
    vid_list = list(range(3)) + list(range(4, 9))

    # product = product(user_list, seq_list, vid_list)
    # user_i, seq_i, vid_i = product[process_id]

    for user_i in user_list:
        for seq_i in seq_list:
            seq_path = os.path.join(dataset_path, 'S' + str(user_i), 'Seq' + str(seq_i))
            # mat file with annotations
            annot_file = os.path.join(seq_path, 'annot.mat')
            annot2 = sio.loadmat(annot_file)['annot2']
            annot3 = sio.loadmat(annot_file)['annot3']
            # calibration file and camera parameters
            for j, vid_i in enumerate(vid_list):
                # image folder
                imgs_path = os.path.join(seq_path, 'video_' + str(vid_i))
                # per frame
                pattern = os.path.join(imgs_path, '*.jpg')
                img_list = sorted(glob.glob(pattern))

                vid_used_frames = []
                vid_used_joints = []
                vid_used_bbox = []
                vid_segments = []
                vid_uniq_id = "subj" + str(user_i) + '_seq' + str(seq_i) + "_vid" + str(vid_i) + "_seg0"

                for i, img_i in tqdm_enumerate(img_list):
                    # for each image we store the relevant annotations
                    img_name = img_i.split('/')[-1]

                    joints_2d_raw = np.reshape(annot2[vid_i][0][i], (1, 28, 2))
                    joints_2d_raw = np.append(joints_2d_raw, np.ones((1, 28, 1)), axis=2)
                    joints_2d = convert_kps(joints_2d_raw, "mpii3d", "spin").reshape((-1, 3))

                    # visualize = True
                    # if visualize == True and i == 500:
                    #     import matplotlib.pyplot as plt
                    #
                    #     frame = cv2.cvtColor(cv2.imread(img_i), cv2.COLOR_BGR2RGB)
                    #
                    #     for k in range(49):
                    #         kp = joints_2d[k]
                    #
                    #         frame = cv2.circle(
                    #             frame.copy(),
                    #             (int(kp[0]), int(kp[1])),
                    #             thickness=3,
                    #             color=(255, 0, 0),
                    #             radius=5,
                    #         )
                    #
                    #         cv2.putText(frame, f'{k}', (int(kp[0]), int(kp[1]) + 1), cv2.FONT_HERSHEY_SIMPLEX,
                    #                     1.5, (0, 255, 0), thickness=3)
                    #
                    #     plt.imshow(frame)
                    #     plt.show()

                    joints_3d_raw = np.reshape(annot3[vid_i][0][i], (1, 28, 3)) / 1000
                    joints_3d = convert_kps(joints_3d_raw, "mpii3d", "spin").reshape((-1, 3))

                    bbox = get_bbox_from_kp2d(joints_2d[~np.all(joints_2d == 0, axis=1)]).reshape(4)

                    joints_3d = joints_3d - joints_3d[39]  # joint 39 (pelvis, MPII) is the root in SPIN format

                    # check that all joints are visible
                    x_in = np.logical_and(joints_2d[:, 0] < w, joints_2d[:, 0] >= 0)
                    y_in = np.logical_and(joints_2d[:, 1] < h, joints_2d[:, 1] >= 0)
                    ok_pts = np.logical_and(x_in, y_in)
                    if np.sum(ok_pts) < joints_2d.shape[0]:
                        vid_uniq_id = "_".join(vid_uniq_id.split("_")[:-1]) + "_seg" + \
                                      str(int(dataset['vid_name'][-1].split("_")[-1][3:]) + 1)
                        continue

                    dataset['vid_name'].append(vid_uniq_id)
                    dataset['frame_id'].append(img_name.split(".")[0])
                    dataset['img_name'].append(img_i)
                    dataset['joints2D'].append(joints_2d)
                    dataset['joints3D'].append(joints_3d)
                    dataset['bbox'].append(bbox)
                    vid_segments.append(vid_uniq_id)
                    vid_used_frames.append(img_i)
                    vid_used_joints.append(joints_2d)
                    vid_used_bbox.append(bbox)

                vid_segments = np.array(vid_segments)
                ids = np.zeros((len(set(vid_segments)) + 1))
                ids[-1] = len(vid_used_frames) + 1
                if (np.where(vid_segments[:-1] != vid_segments[1:])[0]).size != 0:
                    ids[1:-1] = (np.where(vid_segments[:-1] != vid_segments[1:])[0]) + 1

                # for i in tqdm(range(len(set(vid_segments)))):
                #     features = extract_features(model, np.array(vid_used_frames)[int(ids[i]):int(ids[i+1])],
                #                                 vid_used_bbox[int(ids[i]):int((ids[i+1]))],
                #                                 kp_2d=np.array(vid_used_joints)[int(ids[i]):int(ids[i+1])],
                #                                 dataset='spin', debug=False)
                #     dataset['features'].append(features)

    for k in dataset.keys():
        dataset[k] = np.array(dataset[k])
    # dataset['features'] = np.concatenate(dataset['features'])

    return dataset
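# ---------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the dict returned by read_data_train()
# is typically serialized once and re-used by the training dataloaders. The output file name
# below is an assumption, not a path taken from this code:
def _save_train_db_sketch(dataset_path, out_file='mpii3d_train_db.pt'):
    import joblib
    db = read_data_train(dataset_path)
    joblib.dump(db, out_file)  # hypothetical output file name
# ---------------------------------------------------------------------------------------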
def read_test_data(dataset_path): dataset = { 'vid_name': [], 'frame_id': [], 'joints3D': [], 'joints2D': [], 'bbox': [], 'img_name': [], 'features': [], "valid_i": [] } model = spin.get_pretrained_hmr() user_list = range(1, 7) for user_i in user_list: print('Subject', user_i) seq_path = os.path.join(dataset_path, 'mpi_inf_3dhp_test_set', 'TS' + str(user_i)) # mat file with annotations annot_file = os.path.join(seq_path, 'annot_data.mat') mat_as_h5 = h5py.File(annot_file, 'r') annot2 = np.array(mat_as_h5['annot2']) annot3 = np.array(mat_as_h5['univ_annot3']) valid = np.array(mat_as_h5['valid_frame']) vid_used_frames = [] vid_used_joints = [] vid_used_bbox = [] vid_segments = [] vid_uniq_id = "subj" + str(user_i) + "_seg0" for frame_i, valid_i in tqdm(enumerate(valid)): img_i = os.path.join('mpi_inf_3dhp_test_set', 'TS' + str(user_i), 'imageSequence', 'img_' + str(frame_i + 1).zfill(6) + '.jpg') joints_2d_raw = np.expand_dims(annot2[frame_i, 0, :, :], axis=0) joints_2d_raw = np.append(joints_2d_raw, np.ones((1, 17, 1)), axis=2) joints_2d = convert_kps(joints_2d_raw, src="mpii3d_test", dst="spin").reshape((-1, 3)) # visualize = True # if visualize == True: # import matplotlib.pyplot as plt # # frame = cv2.cvtColor(cv2.imread(os.path.join(dataset_path, img_i)), cv2.COLOR_BGR2RGB) # # for k in range(49): # kp = joints_2d[k] # # frame = cv2.circle( # frame.copy(), # (int(kp[0]), int(kp[1])), # thickness=3, # color=(255, 0, 0), # radius=5, # ) # # cv2.putText(frame, f'{k}', (int(kp[0]), int(kp[1]) + 1), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 0), # thickness=3) # # plt.imshow(frame) # plt.show() joints_3d_raw = np.reshape(annot3[frame_i, 0, :, :], (1, 17, 3)) / 1000 joints_3d = convert_kps(joints_3d_raw, "mpii3d_test", "spin").reshape((-1, 3)) joints_3d = joints_3d - joints_3d[ 39] # substract pelvis zero is the root for test bbox = get_bbox_from_kp2d( joints_2d[~np.all(joints_2d == 0, axis=1)]).reshape(4) # check that all joints are visible img_file = os.path.join(dataset_path, img_i) I = cv2.imread(img_file) h, w, _ = I.shape x_in = np.logical_and(joints_2d[:, 0] < w, joints_2d[:, 0] >= 0) y_in = np.logical_and(joints_2d[:, 1] < h, joints_2d[:, 1] >= 0) ok_pts = np.logical_and(x_in, y_in) if np.sum(ok_pts) < joints_2d.shape[0]: vid_uniq_id = "_".join(vid_uniq_id.split("_")[:-1]) + "_seg" + \ str(int(dataset['vid_name'][-1].split("_")[-1][3:]) + 1) continue dataset['vid_name'].append(vid_uniq_id) dataset['frame_id'].append(img_file.split("/")[-1].split(".")[0]) dataset['img_name'].append(img_file) dataset['joints2D'].append(joints_2d) dataset['joints3D'].append(joints_3d) dataset['bbox'].append(bbox) dataset['valid_i'].append(valid_i) vid_segments.append(vid_uniq_id) vid_used_frames.append(img_file) vid_used_joints.append(joints_2d) vid_used_bbox.append(bbox) vid_segments = np.array(vid_segments) ids = np.zeros((len(set(vid_segments)) + 1)) ids[-1] = len(vid_used_frames) + 1 if (np.where(vid_segments[:-1] != vid_segments[1:])[0]).size != 0: ids[1:-1] = (np.where( vid_segments[:-1] != vid_segments[1:])[0]) + 1 # for i in tqdm(range(len(set(vid_segments)))): # features = extract_features(model, np.array(vid_used_frames)[int(ids[i]):int(ids[i + 1])], # vid_used_bbox[int(ids[i]):int(ids[i + 1])], # kp_2d=np.array(vid_used_joints)[int(ids[i]):int(ids[i + 1])], # dataset='spin', debug=False) # dataset['features'].append(features) for k in dataset.keys(): dataset[k] = np.array(dataset[k]) # dataset['features'] = np.concatenate(dataset['features']) return dataset
def main(args): if args.device == 'cpu': device = torch.device('cpu') print('Running on CPU') else: device = torch.device('cuda') print('Running on GPU') if args.vid_file: video_file = args.vid_file if not os.path.isfile(video_file): exit(f'Input video \"{video_file}\" does not exist!') else: image_file = args.img_file if not os.path.isfile(image_file): exit(f'Input video \"{image_file}\" does not exist!') output_path = os.path.join( args.output_folder, os.path.basename(video_file).replace('.mp4', '')) # output_path = os.path.join(args.output_folder, os.path.basename(video_file).split('.')[0]) os.makedirs(output_path, exist_ok=True) image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True) print(f'Input video number of frames {num_frames}') orig_height, orig_width = img_shape[:2] total_time = time.time() # resize video if too big # ffmpeg -i input.avi -filter:v scale=720:-1 -c:a copy output.mkv # ========= Run tracking ========= # bbox_scale = 1.1 if args.tracking_method == 'pose': if not os.path.isabs(video_file): video_file = os.path.join(os.getcwd(), video_file) tracking_results = run_posetracker(video_file, staf_folder=args.staf_dir, display=args.display) else: # run multi object tracker mot = MPT( device=device, batch_size=args.tracker_batch_size, display=args.display, detector_type=args.detector, output_format='dict', yolo_img_size=args.yolo_img_size, ) tracking_results = mot(image_folder) import pdb pdb.set_trace # remove tracklets if num_frames is less than MIN_NUM_FRAMES for person_id in list(tracking_results.keys()): if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES: del tracking_results[person_id] # ========= Define VIBE model ========= # model = VIBE_Demo( seqlen=16, n_layers=2, hidden_size=1024, add_linear=True, use_residual=True, ).to(device) # ========= Load pretrained weights ========= # pretrained_file = download_ckpt(use_3dpw=True) ckpt = torch.load(pretrained_file) print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}') ckpt = ckpt['gen_state_dict'] model.load_state_dict(ckpt, strict=False) model.eval() print(f'Loaded pretrained weights from \"{pretrained_file}\"') # ========= Run VIBE on each person ========= # print(f'Running VIBE on each tracklet...') vibe_time = time.time() vibe_results = {} for person_id in tqdm(list(tracking_results.keys())): bboxes = joints2d = None if args.tracking_method == 'bbox': bboxes = tracking_results[person_id]['bbox'] elif args.tracking_method == 'pose': joints2d = tracking_results[person_id]['joints2d'] frames = tracking_results[person_id]['frames'] dataset = Inference( image_folder=image_folder, frames=frames, bboxes=bboxes, joints2d=joints2d, scale=bbox_scale, ) bboxes = dataset.bboxes frames = dataset.frames has_keypoints = True if joints2d is not None else False dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=16) with torch.no_grad(): pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], [] for batch in dataloader: if has_keypoints: batch, nj2d = batch norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3)) batch = batch.unsqueeze(0) batch = batch.to(device) batch_size, seqlen = batch.shape[:2] output = model(batch)[-1] pred_cam.append(output['theta'][:, :, :3].reshape( batch_size * seqlen, -1)) pred_verts.append(output['verts'].reshape( batch_size * seqlen, -1, 3)) pred_pose.append(output['theta'][:, :, 3:75].reshape( batch_size * seqlen, -1)) pred_betas.append(output['theta'][:, :, 75:].reshape( 
batch_size * seqlen, -1)) pred_joints3d.append(output['kp_3d'].reshape( batch_size * seqlen, -1, 3)) pred_cam = torch.cat(pred_cam, dim=0) pred_verts = torch.cat(pred_verts, dim=0) pred_pose = torch.cat(pred_pose, dim=0) pred_betas = torch.cat(pred_betas, dim=0) pred_joints3d = torch.cat(pred_joints3d, dim=0) del batch # ========= [Optional] run Temporal SMPLify to refine the results ========= # if args.run_smplify and args.tracking_method == 'pose': norm_joints2d = np.concatenate(norm_joints2d, axis=0) norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin') norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device) # Run Temporal SMPLify update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \ new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner( pred_rotmat=pred_pose, pred_betas=pred_betas, pred_cam=pred_cam, j2d=norm_joints2d, device=device, batch_size=norm_joints2d.shape[0], pose2aa=False, ) # update the parameters after refinement print( f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}' ) pred_verts = pred_verts.cpu() pred_cam = pred_cam.cpu() pred_pose = pred_pose.cpu() pred_betas = pred_betas.cpu() pred_joints3d = pred_joints3d.cpu() pred_verts[update] = new_opt_vertices[update] pred_cam[update] = new_opt_cam[update] pred_pose[update] = new_opt_pose[update] pred_betas[update] = new_opt_betas[update] pred_joints3d[update] = new_opt_joints3d[update] elif args.run_smplify and args.tracking_method == 'bbox': print( '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!' ) print('[WARNING] Continuing without running Temporal SMPLify!..') # ========= Save results to a pickle file ========= # pred_cam = pred_cam.cpu().numpy() pred_verts = pred_verts.cpu().numpy() pred_pose = pred_pose.cpu().numpy() pred_betas = pred_betas.cpu().numpy() pred_joints3d = pred_joints3d.cpu().numpy() orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam, bbox=bboxes, img_width=orig_width, img_height=orig_height) output_dict = { 'pred_cam': pred_cam, 'orig_cam': orig_cam, 'verts': pred_verts, 'pose': pred_pose, 'betas': pred_betas, 'joints3d': pred_joints3d, 'joints2d': joints2d, 'bboxes': bboxes, 'frame_ids': frames, } vibe_results[person_id] = output_dict del model end = time.time() fps = num_frames / (end - vibe_time) print(f'VIBE FPS: {fps:.2f}') total_time = time.time() - total_time print( f'Total time spent: {total_time:.2f} seconds (including model loading time).' ) print( f'Total FPS (including model loading time): {num_frames / total_time:.2f}.' ) print( f'Saving output results to \"{os.path.join(output_path, "vibe_output.pkl")}\".' 
) joblib.dump(vibe_results, os.path.join(output_path, "vibe_output.pkl")) if not args.no_render: # ========= Render results as a single video ========= # renderer = Renderer(resolution=(orig_width, orig_height), orig_img=True, wireframe=args.wireframe) output_img_folder = f'{image_folder}_images' os.makedirs(output_img_folder, exist_ok=True) print(f'Rendering output video, writing frames to {output_img_folder}') output_pose_folder = f'{image_folder}_poses' os.makedirs(output_pose_folder, exist_ok=True) print(f'Saving poses to {output_pose_folder}') # prepare results for rendering from numpy import save save(f'{os.path.basename(video_file)}_poses.npy', vibe_results[1]['joints3d'][:, :25, :]) print('Saving numpy poses file to' + f'{video_file}_poses.npy') frame_results = prepare_rendering_results( vibe_results, num_frames) # returns a list of dicts (one dict for each person) mesh_color = { k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0) for k in vibe_results.keys() } image_file_names = sorted([ os.path.join(image_folder, x) for x in os.listdir(image_folder) if x.endswith('.png') or x.endswith('.jpg') ]) for frame_idx in tqdm(range(len(image_file_names))): img_fname = image_file_names[frame_idx] img = cv2.imread(img_fname) if args.sideview: side_img = np.zeros_like(img) for person_id, person_data in frame_results[frame_idx].items(): frame_verts = person_data['verts'] frame_cam = person_data['cam'] frame_pose = person_data['joints3d'][:25] mc = mesh_color[person_id] mesh_filename = None if args.save_obj: mesh_folder = os.path.join(output_path, 'meshes', f'{person_id:04d}') os.makedirs(mesh_folder, exist_ok=True) mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj') # bgr image (opencv format) img = renderer.render( img, frame_verts, cam=frame_cam, color=mc, mesh_filename=mesh_filename, ) # import pdb; pdb.set_trace() # Create a 3D projection and save as img # pose is mirrored # plot_skeleton(output_pose_folder, frame_idx, frame_pose) if args.sideview: side_img = renderer.render( side_img, frame_verts, cam=frame_cam, color=mc, angle=270, axis=[0, 1, 0], ) if args.sideview: img = np.concatenate([img, side_img], axis=1) # concatenate pose img with this image before writing cv2.imwrite( os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img) if args.display: cv2.imshow('Video', img) if cv2.waitKey(1) & 0xFF == ord('q'): break if args.display: cv2.destroyAllWindows() # ========= Save rendered video ========= # vid_name = os.path.basename(video_file) save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4' save_name = os.path.join(output_path, save_name) print(f'Saving result video to {save_name}') images_to_video(img_folder=output_img_folder, output_vid_file=save_name) # shutil.rmtree(output_img_folder) shutil.rmtree(image_folder) print('================= END =================')
def read_data(folder, set): dataset = { 'img_name': [], 'joints2D': [], 'bbox': [], 'vid_name': [], 'features': [], } model = spin.get_pretrained_hmr() file_names = glob.glob( osp.join(folder, 'posetrack_data/annotations/', f'{set}/*.json')) file_names = sorted(file_names) nn_corrupted = 0 tot_frames = 0 min_frame_number = 8 for fid, fname in tqdm_enumerate(file_names): if fname == osp.join(folder, 'annotations/train/021133_mpii_train.json'): continue with open(fname, 'r') as entry: anns = json.load(entry) # num_frames = anns['images'][0]['nframes'] anns['images'] = [ item for item in anns['images'] if item['is_labeled'] ] num_frames = len(anns['images']) frame2imgname = dict() for el in anns['images']: frame2imgname[el['frame_id']] = el['file_name'] num_people = -1 for x in anns['annotations']: if num_people < x['track_id']: num_people = x['track_id'] num_people += 1 posetrack_joints = get_posetrack_original_kp_names() idxs = [ anns['categories'][0]['keypoints'].index(h) for h in posetrack_joints if h in anns['categories'][0]['keypoints'] ] for x in anns['annotations']: kps = np.array(x['keypoints']).reshape((17, 3)) kps = kps[idxs, :] x['keypoints'] = list(kps.flatten()) tot_frames += num_people * num_frames for p_id in range(num_people): annot_pid = [(item['keypoints'], item['bbox'], item['image_id']) for item in anns['annotations'] if item['track_id'] == p_id and not (np.count_nonzero(item['keypoints']) == 0)] if len(annot_pid) < min_frame_number: nn_corrupted += len(annot_pid) continue bbox = np.zeros((len(annot_pid), 4)) # perm_idxs = get_perm_idxs('posetrack', 'common') kp_2d = np.zeros((len(annot_pid), len(annot_pid[0][0]) // 3, 3)) img_paths = np.zeros((len(annot_pid))) for i, (key2djnts, bbox_p, image_id) in enumerate(annot_pid): if (bbox_p[2] == 0 or bbox_p[3] == 0): nn_corrupted += 1 continue img_paths[i] = image_id key2djnts[2::3] = len(key2djnts[2::3]) * [1] kp_2d[i, :] = np.array(key2djnts).reshape( int(len(key2djnts) / 3), 3) # [perm_idxs, :] for kp_loc in kp_2d[i, :]: if kp_loc[0] == 0 and kp_loc[1] == 0: kp_loc[2] = 0 x_tl = bbox_p[0] y_tl = bbox_p[1] w = bbox_p[2] h = bbox_p[3] bbox_p[0] = x_tl + w / 2 bbox_p[1] = y_tl + h / 2 # w = h = np.where(w / h > 1, w, h) w = h = h * 0.8 bbox_p[2] = w bbox_p[3] = h bbox[i, :] = bbox_p img_paths = list(img_paths) img_paths = [ osp.join(folder, frame2imgname[item]) if item != 0 else 0 for item in img_paths ] bbx_idxs = [] for bbx_id, bbx in enumerate(bbox): if np.count_nonzero(bbx) == 0: bbx_idxs += [bbx_id] kp_2d = np.delete(kp_2d, bbx_idxs, 0) img_paths = np.delete(np.array(img_paths), bbx_idxs, 0) bbox = np.delete(bbox, np.where(~bbox.any(axis=1))[0], axis=0) # Convert to common 2d keypoint format if bbox.size == 0 or bbox.shape[0] < min_frame_number: nn_corrupted += 1 continue kp_2d = convert_kps(kp_2d, src='posetrack', dst='spin') dataset['vid_name'].append( np.array([f'{fname}_{p_id}'] * img_paths.shape[0])) dataset['img_name'].append(np.array(img_paths)) dataset['joints2D'].append(kp_2d) dataset['bbox'].append(np.array(bbox)) # compute_features features = extract_features( model, np.array(img_paths), bbox, kp_2d=kp_2d, dataset='spin', debug=False, ) assert kp_2d.shape[0] == img_paths.shape[0] == bbox.shape[0] dataset['features'].append(features) print(nn_corrupted, tot_frames) for k in dataset.keys(): dataset[k] = np.array(dataset[k]) for k in dataset.keys(): dataset[k] = np.concatenate(dataset[k]) for k, v in dataset.items(): print(k, v.shape) return dataset
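# ---------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the loop above rewrites each PoseTrack
# bbox from top-left [x_tl, y_tl, w, h] into a center-based square box with side 0.8 * h.
# Equivalent stand-alone logic, for a single box:
def _posetrack_bbox_to_center_square_sketch(bbox_p):
    # bbox_p: [x_tl, y_tl, w, h]  ->  [center_x, center_y, side, side]
    x_tl, y_tl, w, h = bbox_p
    cx, cy = x_tl + w / 2, y_tl + h / 2
    side = h * 0.8
    return [cx, cy, side, side]
# ---------------------------------------------------------------------------------------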
def get_single_item(self, index):
    start_index, end_index = self.vid_indices[index]

    kp_2d = self.db['joints2D'][start_index:end_index + 1]
    if self.dataset_name != 'posetrack':
        kp_2d = convert_kps(kp_2d, src=self.dataset_name, dst='spin')
    kp_2d_tensor = np.ones((self.seqlen, 49, 3), dtype=np.float16)

    bbox = self.db['bbox'][start_index:end_index + 1]

    input = torch.from_numpy(self.db['features'][start_index:end_index + 1]).float()

    for idx in range(self.seqlen):
        # crop image and transform 2d keypoints
        kp_2d[idx, :, :2], trans = transfrom_keypoints(
            kp_2d=kp_2d[idx, :, :2],
            center_x=bbox[idx, 0],
            center_y=bbox[idx, 1],
            width=bbox[idx, 2],
            height=bbox[idx, 3],
            patch_width=224,
            patch_height=224,
            do_augment=False,
        )

        kp_2d[idx, :, :2] = normalize_2d_kp(kp_2d[idx, :, :2], 224)
        kp_2d_tensor[idx] = kp_2d[idx]

    vid_name = self.db['vid_name'][start_index:end_index + 1]
    frame_id = self.db['img_name'][start_index:end_index + 1].astype(str)
    instance_id = np.array([v + f for v, f in zip(vid_name, frame_id)])

    target = {
        'features': input,
        'kp_2d': torch.from_numpy(kp_2d_tensor).float(),  # 2D keypoints transformed according to bbox cropping
        # 'instance_id': instance_id,
    }

    if self.debug:
        from lib.data_utils.img_utils import get_single_image_crop

        vid_name = self.db['vid_name'][start_index]

        if self.dataset_name == 'pennaction':
            vid_folder = "frames"
            vid_name = vid_name.split('/')[-1].split('.')[0]
            img_id = "img_name"
        elif self.dataset_name == 'posetrack':
            vid_folder = osp.join('images', vid_name.split('/')[-2])
            vid_name = vid_name.split('/')[-1].split('.')[0]
            img_id = "img_name"
        else:
            vid_name = '_'.join(vid_name.split('_')[:-1])
            vid_folder = 'imageFiles'
            img_id = 'frame_id'

        f = osp.join(self.folder, vid_folder, vid_name)
        video_file_list = [osp.join(f, x) for x in sorted(os.listdir(f)) if x.endswith('.jpg')]
        frame_idxs = self.db[img_id][start_index:end_index + 1]
        if self.dataset_name == 'pennaction' or self.dataset_name == 'posetrack':
            video = frame_idxs
        else:
            video = [video_file_list[i] for i in frame_idxs]

        video = torch.cat(
            [get_single_image_crop(image, bbox).unsqueeze(0) for image, bbox in zip(video, bbox)], dim=0
        )

        target['video'] = video

    return target
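# ---------------------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original file): each item packs a window of
# `seqlen` frames, so a standard DataLoader yields batches shaped (B, seqlen, ...). The
# feature size of 2048 below is an assumption about the precomputed HMR/SPIN features.
def _dataloader_usage_sketch(dataset, batch_size=32):
    from torch.utils.data import DataLoader
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    features = kp_2d = None
    for target in loader:
        features = target['features']  # (B, seqlen, 2048) -- assumed feature size
        kp_2d = target['kp_2d']        # (B, seqlen, 49, 3), normalized to [-1, 1]
        break
    return features, kp_2d
# ---------------------------------------------------------------------------------------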
def main(args):
    torch.cuda.set_device(args.gpu_id)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    print(f'Loading video list {args.video_list}')
    video_list = [l.strip() for l in open(args.video_list, 'r').readlines()]
    if len(video_list) < 1:
        print('No files were found in video list')
        return

    print('Loading VIBE model')

    # ========= Define VIBE model ========= #
    model = VIBE_Demo(
        seqlen=16,
        n_layers=2,
        hidden_size=1024,
        add_linear=True,
        use_residual=True,
    ).to(device)

    # ========= Load VIBE pretrained weights ========= #
    pretrained_file = download_ckpt(use_3dpw=False)
    ckpt = torch.load(pretrained_file)
    print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}')
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)
    model.eval()
    print(f'Loaded pretrained weights from \"{pretrained_file}\"')

    num_videos = len(video_list)
    print(f'Processing {num_videos} videos.')

    for video_idx, video_file in enumerate(video_list, start=1):
        if not osp.isfile(video_file):
            print(f'Input video \"{video_file}\" does not exist! Moving on to next file.')
            continue

        filename = osp.splitext(osp.basename(video_file))[0]
        output_path = osp.join(args.output_folder, filename)
        os.makedirs(output_path, exist_ok=True)

        image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True)

        print(f'[{video_idx}/{num_videos}] Processing {num_frames} frames')
        orig_height, orig_width = img_shape[:2]

        # ========= Run tracking ========= #
        bbox_scale = 1.1
        if args.tracking_method == 'pose':
            if not osp.isabs(video_file):
                video_file = osp.join(os.getcwd(), video_file)
            tracking_results = run_posetracker(video_file, staf_folder=args.staf_dir, display=args.display)
        else:
            # run multi object tracker
            mot = MPT(
                device=device,
                batch_size=args.tracker_batch_size,
                display=args.display,
                detector_type=args.detector,
                output_format='dict',
                yolo_img_size=args.yolo_img_size,
            )
            tracking_results = mot(image_folder)

        # remove tracklets if num_frames is less than MIN_NUM_FRAMES
        for person_id in list(tracking_results.keys()):
            if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
                del tracking_results[person_id]

        # ========= Run VIBE on each person ========= #
        print(f'Running VIBE on each tracklet...')
        vibe_results = {}
        for person_id in tqdm(list(tracking_results.keys())):
            bboxes = joints2d = None

            if args.tracking_method == 'bbox':
                bboxes = tracking_results[person_id]['bbox']
            elif args.tracking_method == 'pose':
                joints2d = tracking_results[person_id]['joints2d']

            frames = tracking_results[person_id]['frames']

            dataset = Inference(
                image_folder=image_folder,
                frames=frames,
                bboxes=bboxes,
                joints2d=joints2d,
                scale=bbox_scale,
            )

            bboxes = dataset.bboxes
            frames = dataset.frames
            has_keypoints = True if joints2d is not None else False

            dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=16)

            with torch.no_grad():
                pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

                for batch in dataloader:
                    if has_keypoints:
                        batch, nj2d = batch
                        norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                    batch = batch.unsqueeze(0)
                    batch = batch.to(device)

                    batch_size, seqlen = batch.shape[:2]
                    output = model(batch)[-1]

                    pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1))
                    pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3))
                    pred_pose.append(output['theta'][:, :, 3:75].reshape(batch_size * seqlen, -1))
                    pred_betas.append(output['theta'][:, :, 75:].reshape(batch_size * seqlen, -1))
                    pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3))

                pred_cam = torch.cat(pred_cam, dim=0)
                pred_verts = torch.cat(pred_verts, dim=0)
                pred_pose = torch.cat(pred_pose, dim=0)
                pred_betas = torch.cat(pred_betas, dim=0)
                pred_joints3d = torch.cat(pred_joints3d, dim=0)

                del batch

            # ========= [Optional] run Temporal SMPLify to refine the results ========= #
            if args.run_smplify and args.tracking_method == 'pose':
                norm_joints2d = np.concatenate(norm_joints2d, axis=0)
                norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
                norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

                # Run Temporal SMPLify
                update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
                new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                    pred_rotmat=pred_pose,
                    pred_betas=pred_betas,
                    pred_cam=pred_cam,
                    j2d=norm_joints2d,
                    device=device,
                    batch_size=norm_joints2d.shape[0],
                    pose2aa=False,
                )

                # update the parameters after refinement
                print(f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}')
                pred_verts = pred_verts.cpu()
                pred_cam = pred_cam.cpu()
                pred_pose = pred_pose.cpu()
                pred_betas = pred_betas.cpu()
                pred_joints3d = pred_joints3d.cpu()
                pred_verts[update] = new_opt_vertices[update]
                pred_cam[update] = new_opt_cam[update]
                pred_pose[update] = new_opt_pose[update]
                pred_betas[update] = new_opt_betas[update]
                pred_joints3d[update] = new_opt_joints3d[update]

            elif args.run_smplify and args.tracking_method == 'bbox':
                print('[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!')
                print('[WARNING] Continuing without running Temporal SMPLify!..')

            # ========= Save results to a pickle file ========= #
            pred_cam = pred_cam.cpu().numpy()
            pred_verts = pred_verts.cpu().numpy()
            pred_pose = pred_pose.cpu().numpy()
            pred_betas = pred_betas.cpu().numpy()
            pred_joints3d = pred_joints3d.cpu().numpy()

            orig_cam = convert_crop_cam_to_orig_img(
                cam=pred_cam,
                bbox=bboxes,
                img_width=orig_width,
                img_height=orig_height
            )

            output_dict = {
                'pred_cam': pred_cam,
                'orig_cam': orig_cam,
                'verts': pred_verts,
                'pose': pred_pose,
                'betas': pred_betas,
                'joints3d': pred_joints3d,
                'joints2d': joints2d,
                'bboxes': bboxes,
                'frame_ids': frames,
            }

            vibe_results[person_id] = output_dict

        # Clean-up the temporal folder
        shutil.rmtree(image_folder)

        # Save the outputs to joblib pkl file. File is loaded through joblib.load(pkl_path)
        output_pkl_path = osp.join(args.output_folder, f'{filename}.pkl')
        print(f'Saving output results to \"{output_pkl_path}\".')
        joblib.dump(vibe_results, output_pkl_path)

    # Clean-up after processing
    del model

    print('================= END =================')
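# ---------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): as the comment above notes, the saved
# pickle is read back with joblib.load; it maps person ids to per-track result dicts. The
# shapes in the comments are assumptions about the VIBE outputs:
def _load_vibe_results_sketch(pkl_path):
    import joblib
    vibe_results = joblib.load(pkl_path)
    for person_id, res in vibe_results.items():
        # Assumed shapes: res['pose'] (num_frames, 72), res['betas'] (num_frames, 10),
        # res['joints3d'] (num_frames, 49, 3); frame indices are in res['frame_ids'].
        print(person_id, res['pose'].shape, res['frame_ids'].shape)
    return vibe_results
# ---------------------------------------------------------------------------------------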
def get_single_item(self, index):
    start_index, end_index = self.vid_indices[index]

    is_train = self.set == 'train'

    if self.dataset_name == '3dpw':
        kp_2d = convert_kps(self.db['joints2D'][start_index:end_index + 1], src='common', dst='spin')
        kp_3d = self.db['joints3D'][start_index:end_index + 1]
    elif self.dataset_name == 'mpii3d':
        kp_2d = self.db['joints2D'][start_index:end_index + 1]
        if is_train:
            kp_3d = self.db['joints3D'][start_index:end_index + 1]
        else:
            kp_3d = convert_kps(self.db['joints3D'][start_index:end_index + 1], src='spin', dst='common')
    elif self.dataset_name == 'h36m':
        kp_2d = self.db['joints2D'][start_index:end_index + 1]
        if is_train:
            kp_3d = self.db['joints3D'][start_index:end_index + 1]
        else:
            kp_3d = convert_kps(self.db['joints3D'][start_index:end_index + 1], src='spin', dst='common')

    kp_2d_tensor = np.ones((self.seqlen, 49, 3), dtype=np.float16)
    nj = 14 if not is_train else 49
    kp_3d_tensor = np.zeros((self.seqlen, nj, 3), dtype=np.float16)

    if self.dataset_name == '3dpw':
        pose = self.db['pose'][start_index:end_index + 1]
        shape = self.db['shape'][start_index:end_index + 1]
        w_smpl = torch.ones(self.seqlen).float()
        w_3d = torch.ones(self.seqlen).float()
    elif self.dataset_name == 'h36m':
        if not is_train:
            pose = np.zeros((kp_2d.shape[0], 72))
            shape = np.zeros((kp_2d.shape[0], 10))
            w_smpl = torch.zeros(self.seqlen).float()
            w_3d = torch.ones(self.seqlen).float()
        else:
            pose = self.db['pose'][start_index:end_index + 1]
            shape = self.db['shape'][start_index:end_index + 1]
            w_smpl = torch.ones(self.seqlen).float()
            w_3d = torch.ones(self.seqlen).float()
    elif self.dataset_name == 'mpii3d':
        pose = np.zeros((kp_2d.shape[0], 72))
        shape = np.zeros((kp_2d.shape[0], 10))
        w_smpl = torch.zeros(self.seqlen).float()
        w_3d = torch.ones(self.seqlen).float()

    bbox = self.db['bbox'][start_index:end_index + 1]
    bbox_orig = bbox  # note: no copy is made, so the width scaling below also modifies bbox in place
    bbox_orig[:, 2] = bbox[:, 2] * 0.5
    input = torch.from_numpy(self.db['features'][start_index:end_index + 1]).float()

    theta_tensor = np.zeros((self.seqlen, 85), dtype=np.float16)

    for idx in range(self.seqlen):
        # crop image and transform 2d keypoints
        kp_2d[idx, :, :2], trans = transfrom_keypoints(
            kp_2d=kp_2d[idx, :, :2],
            center_x=bbox[idx, 0],
            center_y=bbox[idx, 1],
            width=bbox[idx, 2],
            height=bbox[idx, 3],
            patch_width=224,
            patch_height=224,
            do_augment=False,
        )

        kp_2d[idx, :, :2] = normalize_2d_kp(kp_2d[idx, :, :2], 224)

        # theta shape (85,)
        theta = np.concatenate((np.array([1., 0., 0.]), pose[idx], shape[idx]), axis=0)

        kp_2d_tensor[idx] = kp_2d[idx]
        theta_tensor[idx] = theta
        kp_3d_tensor[idx] = kp_3d[idx]

    target = {
        'features': input,
        'theta': torch.from_numpy(theta_tensor).float(),  # camera, pose and shape
        'kp_2d': torch.from_numpy(kp_2d_tensor).float(),  # 2D keypoints transformed according to bbox cropping
        'kp_3d': torch.from_numpy(kp_3d_tensor).float(),  # 3D keypoints
        'w_smpl': w_smpl,
        'w_3d': w_3d,
    }

    if self.dataset_name == 'mpii3d' and not is_train:
        target['valid'] = self.db['valid_i'][start_index:end_index + 1]

    if self.dataset_name == '3dpw' and not is_train:
        vn = self.db['vid_name'][start_index:end_index + 1]
        fi = self.db['frame_id'][start_index:end_index + 1]
        target['instance_id'] = [f'{v}/{f}' for v, f in zip(vn, fi)]

    # if self.dataset_name == '3dpw' and not self.is_train:
    #     target['imgname'] = self.db['img_name'][start_index:end_index+1].tolist()
    #     target['imgname'] = np.array(target['imgname'])
    #     print(target['imgname'].dtype)
    #     target['center'] = self.db['bbox'][start_index:end_index+1, :2]
    #     target['valid'] = torch.from_numpy(self.db['valid'][start_index:end_index+1])

    # if self.debug: from lib.data_utils.img_utils import get_single_image_crop

    if self.dataset_name == 'mpii3d':
        video_names = self.db['img_name'][start_index:end_index + 1]
        # print(video)
    elif self.dataset_name == 'h36m':
        video_names = self.db['img_name'][start_index:end_index + 1]
    else:
        vid_name = self.db['vid_name'][start_index]
        vid_name = '_'.join(vid_name.split('_')[:-1])
        f = osp.join(self.folder, 'imageFiles', vid_name)
        video_file_list = [osp.join(f, x) for x in sorted(os.listdir(f)) if x.endswith('.jpg')]
        frame_idxs = self.db['frame_id'][start_index:end_index + 1]
        # print(f, frame_idxs)
        video_names = [video_file_list[i] for i in frame_idxs]

    count = 0
    for image_name, tmp_bbox_orig in zip(video_names, bbox_orig):
        image_yolo, image_big, bbox_orig_yolo, bbox_orig_big = get_single_image_full(image_name, tmp_bbox_orig)
        if count == 0:
            bbox_orig_big_all = [bbox_orig_big]
            bbox_orig_yolo_all = [bbox_orig_yolo]
            video_big = image_big.unsqueeze(0)
            video_yolo = image_yolo.unsqueeze(0)
        else:
            bbox_orig_big_all = np.append(bbox_orig_big_all, [bbox_orig_big], axis=0)
            bbox_orig_yolo_all = np.append(bbox_orig_yolo_all, [bbox_orig_yolo], axis=0)
            video_big = torch.cat([video_big, image_big.unsqueeze(0)])
            video_yolo = torch.cat([video_yolo, image_yolo.unsqueeze(0)])
        count += 1

    target['video_big'] = video_big
    target['video_yolo'] = video_yolo
    target['bbox_orig_yolo'] = bbox_orig_yolo_all
    target['bbox_orig_big'] = bbox_orig_big_all

    return target
def get_single_item(self, index): curr_key = self.data_keys[index] curr_length = self.vid_lengths[curr_key] vid_start = self.vid_start[curr_key] start_index = (torch.randint(curr_length - self.seqlen, (1, )) + vid_start if curr_length - self.seqlen != 0 else vid_start).long() end_index = (start_index + self.seqlen - 1).long() is_train = self.set == 'train' if self.dataset_name == '3dpw' or self.dataset_name == 'amass_rend_take3': kp_2d = convert_kps(self.db['joints2D'][start_index:end_index + 1], src='common', dst='spin') kp_3d = self.db['joints3D'][start_index:end_index + 1] elif self.dataset_name == 'mpii3d': kp_2d = self.db['joints2D'][start_index:end_index + 1] if is_train: kp_3d = self.db['joints3D'][start_index:end_index + 1] else: kp_3d = convert_kps(self.db['joints3D'][start_index:end_index + 1], src='spin', dst='common') elif self.dataset_name == 'h36m': kp_2d = self.db['joints2D'][start_index:end_index + 1] if is_train: kp_3d = self.db['joints3D'][start_index:end_index + 1] else: kp_3d = convert_kps(self.db['joints3D'][start_index:end_index + 1], src='spin', dst='common') kp_2d_tensor = np.ones((self.seqlen, 49, 3), dtype=np.float16) nj = 14 if not is_train else 49 kp_3d_tensor = np.zeros((self.seqlen, nj, 3), dtype=np.float16) if self.dataset_name == '3dpw' or self.dataset_name == 'amass_rend_take3': pose = self.db['pose'][start_index:end_index + 1] shape = self.db['shape'][start_index:end_index + 1] w_smpl = torch.ones(self.seqlen).float() w_3d = torch.ones(self.seqlen).float() elif self.dataset_name == 'h36m': if not is_train: pose = np.zeros((kp_2d.shape[0], 72)) shape = np.zeros((kp_2d.shape[0], 10)) w_smpl = torch.zeros(self.seqlen).float() w_3d = torch.ones(self.seqlen).float() else: pose = self.db['pose'][start_index:end_index + 1] shape = self.db['shape'][start_index:end_index + 1] w_smpl = torch.ones(self.seqlen).float() w_3d = torch.ones(self.seqlen).float() elif self.dataset_name == 'mpii3d': pose = np.zeros((kp_2d.shape[0], 72)) shape = np.zeros((kp_2d.shape[0], 10)) w_smpl = torch.zeros(self.seqlen).float() w_3d = torch.ones(self.seqlen).float() bbox = self.db['bbox'][start_index:end_index + 1] input = torch.from_numpy(self.db['features'][start_index:end_index + 1]).float() theta_tensor = np.zeros((self.seqlen, 85), dtype=np.float16) for idx in range(self.seqlen): # crop image and transform 2d keypoints kp_2d[idx, :, :2], trans = transfrom_keypoints( kp_2d=kp_2d[idx, :, :2], center_x=bbox[idx, 0], center_y=bbox[idx, 1], width=bbox[idx, 2], height=bbox[idx, 3], patch_width=224, patch_height=224, do_augment=False, ) kp_2d[idx, :, :2] = normalize_2d_kp(kp_2d[idx, :, :2], 224) # theta shape (85,) theta = np.concatenate( (np.array([1., 0., 0.]), pose[idx], shape[idx]), axis=0) kp_2d_tensor[idx] = kp_2d[idx] theta_tensor[idx] = theta kp_3d_tensor[idx] = kp_3d[idx] target = { 'features': input, 'theta': torch.from_numpy(theta_tensor).float(), # camera, pose and shape 'kp_2d': torch.from_numpy(kp_2d_tensor).float( ), # 2D keypoints transformed according to bbox cropping 'kp_3d': torch.from_numpy(kp_3d_tensor).float(), # 3D keypoints 'w_smpl': w_smpl, 'w_3d': w_3d, } if self.dataset_name == 'mpii3d' and not is_train: target['valid'] = self.db['valid_i'][start_index:end_index + 1] if (self.dataset_name == '3dpw' or self.dataset_name == 'amass_rend_take3') and not is_train: vn = self.db['vid_name'][start_index:end_index + 1] fi = self.db['frame_id'][start_index:end_index + 1] target['instance_id'] = [f'{v}/{f}' for v, f in zip(vn, fi)] # if self.dataset_name == '3dpw' and not 
self.is_train: # target['imgname'] = self.db['img_name'][start_index:end_index+1].tolist() # target['imgname'] = np.array(target['imgname']) # print(target['imgname'].dtype) # target['center'] = self.db['bbox'][start_index:end_index+1, :2] # target['valid'] = torch.from_numpy(self.db['valid'][start_index:end_index+1]) if self.debug: from lib.data_utils.img_utils import get_single_image_crop if self.dataset_name == 'mpii3d': video = self.db['img_name'][start_index:end_index + 1] # print(video) elif self.dataset_name == 'h36m': video = self.db['img_name'][start_index:end_index + 1] else: vid_name = self.db['vid_name'][start_index] vid_name = '_'.join(vid_name.split('_')[:-1]) f = osp.join(self.folder, 'imageFiles', vid_name) video_file_list = [ osp.join(f, x) for x in sorted(os.listdir(f)) if x.endswith('.jpg') ] frame_idxs = self.db['frame_id'][start_index:end_index + 1] # print(f, frame_idxs) video = [video_file_list[i] for i in frame_idxs] video = torch.cat([ get_single_image_crop(image, bbox).unsqueeze(0) for image, bbox in zip(video, bbox) ], dim=0) target['video'] = video return target
def main(args): device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') dict = { 'bridge': 1, 'childs': 2, 'downwarddog': 3, 'mountain': 4, 'plank': 5, 'seatedforwardbend': 6, 'tree': 7, 'trianglepose': 8, 'warrior1': 9, 'warrior2': 10 } dir_path = '/home/ubuntu/PoseEstimation/VIBE/InputData/input_test_set/' output_folder = '/home/ubuntu/PoseEstimation/VIBE/OutputData/test_set/' joints3D_csv = open('output_joints3d_dog.csv', 'a') pose_csv = open('output_pose.csv_dog', 'a') # ========= Define VIBE model ========= # model = VIBE_Demo( seqlen=16, n_layers=2, hidden_size=1024, add_linear=True, use_residual=True, ).to(device) # ========= Load pretrained weights ========= # pretrained_file = download_ckpt(use_3dpw=False) ckpt = torch.load(pretrained_file) print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}') ckpt = ckpt['gen_state_dict'] model.load_state_dict(ckpt, strict=False) model.eval() print(f'Loaded pretrained weights from \"{pretrained_file}\"') video_file = '/home/ubuntu/PoseEstimation/VIBE/DogVideo.mp4' video_label = dict['bridge'] if not os.path.isfile(video_file): exit(f'Input video \"{video_file}\" does not exist!') image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True) print(f'Input video number of frames {num_frames}') orig_height, orig_width = img_shape[:2] total_time = time.time() # ========= Run tracking ========= # bbox_scale = 1.1 if args.tracking_method == 'pose': if not os.path.isabs(video_file): video_file = os.path.join(os.getcwd(), video_file) tracking_results = run_posetracker(video_file, staf_folder=args.staf_dir, display=args.display) else: # run multi object tracker mot = MPT( device=device, batch_size=args.tracker_batch_size, display=args.display, detector_type=args.detector, output_format='dict', yolo_img_size=args.yolo_img_size, ) tracking_results = mot(image_folder) # remove tracklets if num_frames is less than MIN_NUM_FRAMES for person_id in list(tracking_results.keys()): if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES: del tracking_results[person_id] # ========= Run VIBE on each person ========= # print(f'Running VIBE on each tracklet...') vibe_time = time.time() vibe_results = {} for person_id in tqdm(list(tracking_results.keys())): bboxes = joints2d = None if args.tracking_method == 'bbox': bboxes = tracking_results[person_id]['bbox'] elif args.tracking_method == 'pose': joints2d = tracking_results[person_id]['joints2d'] frames = tracking_results[person_id]['frames'] dataset = Inference( image_folder=image_folder, frames=frames, bboxes=bboxes, joints2d=joints2d, scale=bbox_scale, ) bboxes = dataset.bboxes frames = dataset.frames has_keypoints = True if joints2d is not None else False dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=16) with torch.no_grad(): pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], [] for batch in dataloader: if has_keypoints: batch, nj2d = batch norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3)) batch = batch.unsqueeze(0) batch = batch.to(device) batch_size, seqlen = batch.shape[:2] output = model(batch)[-1] pred_cam.append(output['theta'][:, :, :3].reshape( batch_size * seqlen, -1)) pred_verts.append(output['verts'].reshape( batch_size * seqlen, -1, 3)) pred_pose.append(output['theta'][:, :, 3:75].reshape( batch_size * seqlen, -1)) pred_betas.append(output['theta'][:, :, 75:].reshape( batch_size * seqlen, -1)) pred_joints3d.append(output['kp_3d'].reshape( 
batch_size * seqlen, -1, 3)) pred_cam = torch.cat(pred_cam, dim=0) pred_verts = torch.cat(pred_verts, dim=0) pred_pose = torch.cat(pred_pose, dim=0) pred_betas = torch.cat(pred_betas, dim=0) pred_joints3d = torch.cat(pred_joints3d, dim=0) del batch # ========= [Optional] run Temporal SMPLify to refine the results ========= # if args.run_smplify and args.tracking_method == 'pose': norm_joints2d = np.concatenate(norm_joints2d, axis=0) norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin') norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device) # Run Temporal SMPLify update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \ new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner( pred_rotmat=pred_pose, pred_betas=pred_betas, pred_cam=pred_cam, j2d=norm_joints2d, device=device, batch_size=norm_joints2d.shape[0], pose2aa=False, ) # update the parameters after refinement print( f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}' ) pred_verts = pred_verts.cpu() pred_cam = pred_cam.cpu() pred_pose = pred_pose.cpu() pred_betas = pred_betas.cpu() pred_joints3d = pred_joints3d.cpu() pred_verts[update] = new_opt_vertices[update] pred_cam[update] = new_opt_cam[update] pred_pose[update] = new_opt_pose[update] pred_betas[update] = new_opt_betas[update] pred_joints3d[update] = new_opt_joints3d[update] elif args.run_smplify and args.tracking_method == 'bbox': print( '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!' ) print('[WARNING] Continuing without running Temporal SMPLify!..') # ========= Save results to a pickle file ========= # pred_cam = pred_cam.cpu().numpy() pred_verts = pred_verts.cpu().numpy() pred_pose = pred_pose.cpu().numpy() pred_betas = pred_betas.cpu().numpy() pred_joints3d = pred_joints3d.cpu().numpy() # Runs 1 Euro Filter to smooth out the results if args.smooth: min_cutoff = args.smooth_min_cutoff # 0.004 beta = args.smooth_beta # 1.5 print( f'Running smoothing on person {person_id}, min_cutoff: {min_cutoff}, beta: {beta}' ) pred_verts, pred_pose, pred_joints3d = smooth_pose( pred_pose, pred_betas, min_cutoff=min_cutoff, beta=beta) orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam, bbox=bboxes, img_width=orig_width, img_height=orig_height) output_dict = { 'pred_cam': pred_cam, 'orig_cam': orig_cam, 'verts': pred_verts, 'pose': pred_pose, 'betas': pred_betas, 'joints3d': pred_joints3d, 'joints2d': joints2d, 'bboxes': bboxes, 'frame_ids': frames, } for i in range(len(output_dict['joints3d'])): if (i % 5 == 0): flat_arr = output_dict['joints3d'][i].flatten() len_N = len(flat_arr) np.savetxt(joints3D_csv, [np.append(flat_arr, [video_label])], delimiter=',', fmt=' '.join(['%f'] * len_N + ['%i'])) for i in range(len(output_dict['pose'])): if (i % 5 == 0): pose_arr = output_dict['pose'][i].flatten() len_M = len(pose_arr) np.savetxt(pose_csv, [np.append(pose_arr, [video_label])], delimiter=',', fmt=' '.join(['%f'] * len_M + ['%i'])) end = time.time() fps = num_frames / (end - vibe_time) print(f'VIBE FPS: {fps:.2f}') total_time = time.time() - total_time print( f'Total time spent: {total_time:.2f} seconds (including model loading time).' ) print( f'Total FPS (including model loading time): {num_frames / total_time:.2f}.' )
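# ---------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): because np.savetxt is given a
# multi-column fmt string joined with spaces, the rows written above end up
# whitespace-separated (the delimiter=',' argument is ignored in that case), with the class
# label as the last column. They can be read back for training a classifier, e.g.:
def _load_joints3d_csv_sketch(csv_path):
    import numpy as np
    data = np.loadtxt(csv_path, ndmin=2)   # whitespace-separated rows
    feats = data[:, :-1]                    # flattened joints3d values
    labels = data[:, -1].astype(int)        # pose class label
    return feats, labels
# ---------------------------------------------------------------------------------------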
def main(args): device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') video_file = args.vid_file # ========= [Optional] download the youtube video ========= # if video_file.startswith('https://www.youtube.com'): print(f'Donwloading YouTube video \"{video_file}\"') video_file = download_youtube_clip(video_file, '/tmp') if video_file is None: exit('Youtube url is not valid!') print(f'YouTube Video has been downloaded to {video_file}...') if not os.path.isfile(video_file): exit(f'Input video \"{video_file}\" does not exist!') output_path = os.path.join(args.output_folder, os.path.basename(video_file).replace('.mp4', '')) os.makedirs(output_path, exist_ok=True) image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True) print(f'Input video number of frames {num_frames}') orig_height, orig_width = img_shape[:2] total_time = time.time() # ========= Run tracking ========= # bbox_scale = 1.1 if args.tracking_method == 'pose': if not os.path.isabs(video_file): video_file = os.path.join(os.getcwd(), video_file) tracking_results = run_posetracker(video_file, staf_folder=args.staf_dir, display=args.display) else: # run multi object tracker mot = MPT( device=device, batch_size=args.tracker_batch_size, display=args.display, detector_type=args.detector, output_format='dict', yolo_img_size=args.yolo_img_size, ) tracking_results = mot(image_folder) # remove tracklets if num_frames is less than MIN_NUM_FRAMES for person_id in list(tracking_results.keys()): if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES: del tracking_results[person_id] # ========= Define VIBE model ========= # model = VIBE_Demo( seqlen=16, n_layers=2, hidden_size=1024, add_linear=True, use_residual=True, ).to(device) # ========= Load pretrained weights ========= # pretrained_file = download_ckpt(use_3dpw=False) ckpt = torch.load(pretrained_file) print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}') ckpt = ckpt['gen_state_dict'] model.load_state_dict(ckpt, strict=False) model.eval() print(f'Loaded pretrained weights from \"{pretrained_file}\"') # ========= Run VIBE on each person ========= # print(f'Running VIBE on each tracklet...') vibe_time = time.time() vibe_results = {} for person_id in tqdm(list(tracking_results.keys())): bboxes = joints2d = None if args.tracking_method == 'bbox': bboxes = tracking_results[person_id]['bbox'] elif args.tracking_method == 'pose': joints2d = tracking_results[person_id]['joints2d'] frames = tracking_results[person_id]['frames'] dataset = Inference( image_folder=image_folder, frames=frames, bboxes=bboxes, joints2d=joints2d, scale=bbox_scale, ) bboxes = dataset.bboxes frames = dataset.frames has_keypoints = True if joints2d is not None else False # reduce the num of worker if you encountered the error: DLL load failed: The paging file is too small for this operation to complete dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=8) with torch.no_grad(): pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], [] for batch in dataloader: if has_keypoints: batch, nj2d = batch norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3)) batch = batch.unsqueeze(0) batch = batch.to(device) batch_size, seqlen = batch.shape[:2] output = model(batch)[-1] pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1)) pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3)) pred_pose.append(output['theta'][:,:,3:75].reshape(batch_size * seqlen, 
-1)) pred_betas.append(output['theta'][:, :,75:].reshape(batch_size * seqlen, -1)) pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3)) pred_cam = torch.cat(pred_cam, dim=0) pred_verts = torch.cat(pred_verts, dim=0) pred_pose = torch.cat(pred_pose, dim=0) pred_betas = torch.cat(pred_betas, dim=0) pred_joints3d = torch.cat(pred_joints3d, dim=0) del batch # ========= [Optional] run Temporal SMPLify to refine the results ========= # if args.run_smplify and args.tracking_method == 'pose': norm_joints2d = np.concatenate(norm_joints2d, axis=0) norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin') norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device) # Run Temporal SMPLify update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \ new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner( pred_rotmat=pred_pose, pred_betas=pred_betas, pred_cam=pred_cam, j2d=norm_joints2d, device=device, batch_size=norm_joints2d.shape[0], pose2aa=False, ) # update the parameters after refinement print(f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}') pred_verts = pred_verts.cpu() pred_cam = pred_cam.cpu() pred_pose = pred_pose.cpu() pred_betas = pred_betas.cpu() pred_joints3d = pred_joints3d.cpu() pred_verts[update] = new_opt_vertices[update] pred_cam[update] = new_opt_cam[update] pred_pose[update] = new_opt_pose[update] pred_betas[update] = new_opt_betas[update] pred_joints3d[update] = new_opt_joints3d[update] elif args.run_smplify and args.tracking_method == 'bbox': print('[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!') print('[WARNING] Continuing without running Temporal SMPLify!..') # ========= Save results to a pickle file ========= # pred_cam = pred_cam.cpu().numpy() pred_verts = pred_verts.cpu().numpy() pred_pose = pred_pose.cpu().numpy() pred_betas = pred_betas.cpu().numpy() pred_joints3d = pred_joints3d.cpu().numpy() orig_cam = convert_crop_cam_to_orig_img( cam=pred_cam, bbox=bboxes, img_width=orig_width, img_height=orig_height ) output_dict = { 'pred_cam': pred_cam, 'orig_cam': orig_cam, 'verts': pred_verts, 'pose': pred_pose, 'betas': pred_betas, 'joints3d': pred_joints3d, 'joints2d': joints2d, 'bboxes': bboxes, 'frame_ids': frames, } vibe_results[person_id] = output_dict del model end = time.time() fps = num_frames / (end - vibe_time) print(f'VIBE FPS: {fps:.2f}') total_time = time.time() - total_time print(f'Total time spent: {total_time:.2f} seconds (including model loading time).') print(f'Total FPS (including model loading time): {num_frames / total_time:.2f}.') print(f'Saving output results to \"{os.path.join(output_path, "vibe_output.pkl")}\".') joblib.dump(vibe_results, os.path.join(output_path, "vibe_output.pkl")) if not args.no_render: # ========= Render results as a single video ========= # renderer = Renderer(resolution=(orig_width, orig_height), orig_img=True, wireframe=args.wireframe) output_img_folder = f'{image_folder}_output' os.makedirs(output_img_folder, exist_ok=True) print(f'Rendering output video, writing frames to {output_img_folder}') # prepare results for rendering frame_results = prepare_rendering_results(vibe_results, num_frames) mesh_color = {k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0) for k in vibe_results.keys()} image_file_names = sorted([ os.path.join(image_folder, x) for x in os.listdir(image_folder) if x.endswith('.png') or x.endswith('.jpg') ]) for frame_idx in tqdm(range(len(image_file_names))): img_fname = 
image_file_names[frame_idx] img = cv2.imread(img_fname) if args.sideview: side_img = np.zeros_like(img) for person_id, person_data in frame_results[frame_idx].items(): frame_verts = person_data['verts'] frame_cam = person_data['cam'] mc = mesh_color[person_id] mesh_filename = None if args.save_obj: mesh_folder = os.path.join(output_path, 'meshes', f'{person_id:04d}') os.makedirs(mesh_folder, exist_ok=True) mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj') img = renderer.render( img, frame_verts, cam=frame_cam, color=mc, mesh_filename=mesh_filename, ) if args.sideview: side_img = renderer.render( side_img, frame_verts, cam=frame_cam, color=mc, angle=270, axis=[0,1,0], ) if args.sideview: img = np.concatenate([img, side_img], axis=1) font = cv2.FONT_HERSHEY_SIMPLEX x = 10 #position of text y = 20 #position of text cv2.putText(img, str(frame_idx), (x,y), font ,0.55,(0,255,0),1) cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img) if args.display: cv2.imshow('Video', img) if cv2.waitKey(1) & 0xFF == ord('q'): break if args.display: cv2.destroyAllWindows() # ========= Save rendered video ========= # vid_name = os.path.basename(video_file) save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4' save_name = os.path.join(output_path, save_name) print(f'Saving result video to {save_name}') images_to_video(img_folder=output_img_folder, output_vid_file=save_name) shutil.rmtree(output_img_folder) shutil.rmtree(image_folder) # generate and save the joints csv file for animating avatars later output = joblib.load(os.path.join(output_path, "vibe_output.pkl")) for i in output.keys(): print('Track ids:', i , end='\n\n') num_ppl = len(output.keys()) print('VIBE output file content:', end='\n\n') vid_name = os.path.basename(video_file) vibe_result_folder = output_path # output the pose result as csv # format: v_personId_numFrames pose_filename_list = [] for i in output.keys(): pose_filename = vibe_result_folder + "/" + vid_name + "_"+ str(i) + "_" + str(output[i]['pose'].shape[0]) + ".csv" pose_filename_list.append(pose_filename) field_names = [] for idx in range(73): # 72 -> 73 (+ frame_id at 0) field_names.append(str(idx)) with open(pose_filename, 'w', newline='') as file: writer = csv.writer(file) writer.writerow(field_names) for frame_id in range(len(output[i]['pose'])): output_data = [output[i]['frame_ids'][frame_id]] output_data.extend(output[i]['pose'][frame_id]) #print(output_data) writer.writerow(output_data) print('================= END =================')
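# --- Illustrative helper (not part of the demo above): reading the per-person
# pose CSVs written by the export loop back into arrays. This assumes the layout
# produced above: a header row '0'..'72', column 0 = frame_id, columns 1..72 =
# the 72 SMPL axis-angle pose values; the name `load_pose_csv` is hypothetical.
import numpy as np

def load_pose_csv(csv_path):
    data = np.loadtxt(csv_path, delimiter=',', skiprows=1)  # skip the header row
    if data.ndim == 1:                        # file containing a single frame
        data = data[None, :]
    frame_ids = data[:, 0].astype(int)
    poses = data[:, 1:].reshape(-1, 24, 3)    # 24 joints x 3 axis-angle components
    return frame_ids, poses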
def main(args): device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) map_vals = { 'bridge': 1, 'childs': 2, 'downwarddog': 3, 'mountain': 4, 'plank': 5, 'seatedforwardbend': 6, 'tree': 7, 'trianglepose': 8, 'warrior1': 9, 'warrior2': 10 } inverse_map = { 1: 'bridge', 2: 'childs', 3: 'downwarddog', 4: 'mountain', 5: 'plank', 6: 'seatedforwardbend', 7: 'tree', 8: 'trianglepose', 9: 'warrior1', 10: 'warrior2' } video_file = args.vid_file # ========= [Optional] download the youtube video ========= # if video_file.startswith('https://www.youtube.com'): print(f'Donwloading YouTube video \"{video_file}\"') video_file = download_youtube_clip(video_file, '/tmp') if video_file is None: exit('Youtube url is not valid!') print(f'YouTube Video has been downloaded to {video_file}...') if not os.path.isfile(video_file): exit(f'Input video \"{video_file}\" does not exist!') dir_path = '/home/ubuntu/PoseEstimation/VIBE/InputData/input_test_set/' output_folder = '/home/ubuntu/PoseEstimation/VIBE/OutputData/' # ========= Define VIBE model ========= # model = VIBE_Demo( seqlen=16, n_layers=2, hidden_size=1024, add_linear=True, use_residual=True, ).to(device) # ========= Load Classification Model ========= # classification_model = pickle.load( open('view_classification_model.pkl', 'rb')) # ========= Load pretrained weights ========= # pretrained_file = download_ckpt(use_3dpw=False) ckpt = torch.load(pretrained_file) #print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}') ckpt = ckpt['gen_state_dict'] model.load_state_dict(ckpt, strict=False) model.eval() #print(f'Loaded pretrained weights from \"{pretrained_file}\"') image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True) print(f'Input video number of frames {num_frames}') orig_height, orig_width = img_shape[:2] total_time = time.time() # ========= Run tracking ========= # bbox_scale = 1.1 if args.tracking_method == 'pose': if not os.path.isabs(video_file): video_file = os.path.join(os.getcwd(), video_file) tracking_results = run_posetracker(video_file, staf_folder=args.staf_dir, display=args.display) else: # run multi object tracker mot = MPT( device=device, batch_size=args.tracker_batch_size, display=args.display, detector_type=args.detector, output_format='dict', yolo_img_size=args.yolo_img_size, ) tracking_results = mot(image_folder) # remove tracklets if num_frames is less than MIN_NUM_FRAMES for person_id in list(tracking_results.keys()): if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES: del tracking_results[person_id] # ========= Run VIBE on each person ========= # #print(f'Running VIBE on each tracklet...') vibe_time = time.time() vibe_results = {} for person_id in list(tracking_results.keys()): bboxes = joints2d = None if args.tracking_method == 'bbox': bboxes = tracking_results[person_id]['bbox'] elif args.tracking_method == 'pose': joints2d = tracking_results[person_id]['joints2d'] frames = tracking_results[person_id]['frames'] dataset = Inference( image_folder=image_folder, frames=frames, bboxes=bboxes, joints2d=joints2d, scale=bbox_scale, ) bboxes = dataset.bboxes frames = dataset.frames has_keypoints = True if joints2d is not None else False dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=16) with torch.no_grad(): pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], [] for batch in dataloader: if has_keypoints: batch, 
nj2d = batch norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3)) batch = batch.unsqueeze(0) batch = batch.to(device) batch_size, seqlen = batch.shape[:2] output = model(batch)[-1] pred_cam.append(output['theta'][:, :, :3].reshape( batch_size * seqlen, -1)) pred_verts.append(output['verts'].reshape( batch_size * seqlen, -1, 3)) pred_pose.append(output['theta'][:, :, 3:75].reshape( batch_size * seqlen, -1)) pred_betas.append(output['theta'][:, :, 75:].reshape( batch_size * seqlen, -1)) pred_joints3d.append(output['kp_3d'].reshape( batch_size * seqlen, -1, 3)) pred_cam = torch.cat(pred_cam, dim=0) pred_verts = torch.cat(pred_verts, dim=0) pred_pose = torch.cat(pred_pose, dim=0) pred_betas = torch.cat(pred_betas, dim=0) pred_joints3d = torch.cat(pred_joints3d, dim=0) del batch # ========= [Optional] run Temporal SMPLify to refine the results ========= # if args.run_smplify and args.tracking_method == 'pose': norm_joints2d = np.concatenate(norm_joints2d, axis=0) norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin') norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device) # Run Temporal SMPLify update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \ new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner( pred_rotmat=pred_pose, pred_betas=pred_betas, pred_cam=pred_cam, j2d=norm_joints2d, device=device, batch_size=norm_joints2d.shape[0], pose2aa=False, ) # update the parameters after refinement print( f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}' ) pred_verts = pred_verts.cpu() pred_cam = pred_cam.cpu() pred_pose = pred_pose.cpu() pred_betas = pred_betas.cpu() pred_joints3d = pred_joints3d.cpu() pred_verts[update] = new_opt_vertices[update] pred_cam[update] = new_opt_cam[update] pred_pose[update] = new_opt_pose[update] pred_betas[update] = new_opt_betas[update] pred_joints3d[update] = new_opt_joints3d[update] elif args.run_smplify and args.tracking_method == 'bbox': print( '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!' 
) print('[WARNING] Continuing without running Temporal SMPLify!..') # ========= Save results to a pickle file ========= # pred_cam = pred_cam.cpu().numpy() pred_verts = pred_verts.cpu().numpy() pred_pose = pred_pose.cpu().numpy() pred_betas = pred_betas.cpu().numpy() pred_joints3d = pred_joints3d.cpu().numpy() # Runs 1 Euro Filter to smooth out the results if args.smooth: min_cutoff = args.smooth_min_cutoff # 0.004 beta = args.smooth_beta # 1.5 print( f'Running smoothing on person {person_id}, min_cutoff: {min_cutoff}, beta: {beta}' ) pred_verts, pred_pose, pred_joints3d = smooth_pose( pred_pose, pred_betas, min_cutoff=min_cutoff, beta=beta) orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam, bbox=bboxes, img_width=orig_width, img_height=orig_height) output_dict = { 'pred_cam': pred_cam, 'orig_cam': orig_cam, 'verts': pred_verts, 'pose': pred_pose, 'betas': pred_betas, 'joints3d': pred_joints3d, 'joints2d': joints2d, 'bboxes': bboxes, 'frame_ids': frames, } # ========= Extract 3D joint feature for each frame ========= # list_val = [] for i in range(len(output_dict['joints3d'])): list_val.append(output_dict['joints3d'][i].flatten().reshape( 1, -1)) input_df = pd.DataFrame(np.concatenate(list_val)) input_df = input_df.round(2) predicted_classes = classification_model.predict_classes(input_df) output_df = pd.DataFrame(predicted_classes) # ========= Printing all possible poses detected for the video ========= # total_frames = len(output_df) print( '\nPrinting probabilities for yoga poses predicted in different frames.' ) for i, v in output_df.value_counts().items(): val = round((v / total_frames) * 100, 2) print('Probability of the yoga pose being ' + inverse_map[i[0]].capitalize() + " is: " + str(val)) print('\nThe yoga pose in the given video is: ' + inverse_map[output_df[0].value_counts().idxmax()].capitalize())
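# --- Illustrative sketch (not from the repo): the per-video decision above is a
# majority vote over per-frame classifier outputs, and the printed "probability"
# is the fraction of frames assigned to each class. `summarize_predictions` is a
# hypothetical helper that reproduces that summary from a 1-D array of labels.
import numpy as np

def summarize_predictions(frame_labels, inverse_map):
    labels, counts = np.unique(frame_labels, return_counts=True)
    fractions = counts / counts.sum()
    for lbl, frac in zip(labels, fractions):
        print('Probability of the yoga pose being '
              + inverse_map[int(lbl)].capitalize() + ' is: ' + str(round(frac * 100, 2)))
    return inverse_map[int(labels[np.argmax(counts)])]   # most frequent class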
def run_vibe(video_file, args): device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') # Make output dirs output_path = os.path.join( args.output_folder, os.path.basename(video_file).replace('.mp4', '')) os.makedirs(output_path, exist_ok=True) # Convert video to images image_folder, num_frames, img_shape = video_to_images( video_file, return_info=True) print(f'Input video number of frames {num_frames}') orig_height, orig_width = img_shape[:2] total_time = time.time() # ========= Run tracking ========= # if not os.path.isabs(video_file): video_file = os.path.join(os.getcwd(), video_file) tracking_results = run_posetracker( video_file, staf_folder=args.staf_dir, display=args.display, smoothen=args.smoothen, smoothen_method=args.smoothen_method) # remove tracklets if num_frames is less than MIN_NUM_FRAMES for person_id in list(tracking_results.keys()): if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES: del tracking_results[person_id] # ========= Define VIBE model ========= # model = VIBE_Demo( seqlen=16, n_layers=2, hidden_size=1024, add_linear=True, use_residual=True, ).to(device) # ========= Load pretrained weights ========= # pretrained_file = download_ckpt(use_3dpw=False) ckpt = torch.load(pretrained_file) print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}') ckpt = ckpt['gen_state_dict'] model.load_state_dict(ckpt, strict=False) model.eval() print(f'Loaded pretrained weights from \"{pretrained_file}\"') # ========= Run VIBE on each person ========= # print(f'Running VIBE on each tracklet...') vibe_time = time.time() vibe_results = {} for person_id in tqdm(list(tracking_results.keys())): joints2d = tracking_results[person_id]['joints2d'] frames = tracking_results[person_id]['frames'] dataset = Inference( image_folder=image_folder, frames=frames, bboxes=None, joints2d=joints2d ) bboxes = dataset.bboxes frames = dataset.frames has_keypoints = True if joints2d is not None else False dataloader = DataLoader( dataset, batch_size=args.vibe_batch_size, num_workers=16) with torch.no_grad(): pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [ ], [], [], [], [], [] for batch in dataloader: if has_keypoints: batch, nj2d = batch norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3)) batch = batch.unsqueeze(0) batch = batch.to(device) batch_size, seqlen = batch.shape[:2] output = model(batch)[-1] pred_cam.append(output['theta'][:, :, :3].reshape( batch_size * seqlen, -1)) pred_verts.append(output['verts'].reshape( batch_size * seqlen, -1, 3)) pred_pose.append(output['theta'][:, :, 3:75].reshape( batch_size * seqlen, -1)) pred_betas.append(output['theta'][:, :, 75:].reshape( batch_size * seqlen, -1)) pred_joints3d.append(output['kp_3d'].reshape( batch_size * seqlen, -1, 3)) pred_cam = torch.cat(pred_cam, dim=0) pred_verts = torch.cat(pred_verts, dim=0) pred_pose = torch.cat(pred_pose, dim=0) pred_betas = torch.cat(pred_betas, dim=0) pred_joints3d = torch.cat(pred_joints3d, dim=0) del batch # ========= [Optional] run Temporal SMPLify to refine the results ========= # norm_joints2d = np.concatenate(norm_joints2d, axis=0) norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin') norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device) # Run Temporal SMPLify update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \ new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner( pred_rotmat=pred_pose, pred_betas=pred_betas, pred_cam=pred_cam, j2d=norm_joints2d, device=device, 
batch_size=norm_joints2d.shape[0], pose2aa=False, ) # update the parameters after refinement print( f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}') pred_verts = pred_verts.cpu() pred_cam = pred_cam.cpu() pred_pose = pred_pose.cpu() pred_betas = pred_betas.cpu() pred_joints3d = pred_joints3d.cpu() pred_verts[update] = new_opt_vertices[update] pred_cam[update] = new_opt_cam[update] pred_pose[update] = new_opt_pose[update] pred_betas[update] = new_opt_betas[update] pred_joints3d[update] = new_opt_joints3d[update] # ========= Save results to a pickle file ========= # pred_cam = pred_cam.cpu().numpy() pred_verts = pred_verts.cpu().numpy() pred_pose = pred_pose.cpu().numpy() pred_betas = pred_betas.cpu().numpy() pred_joints3d = pred_joints3d.cpu().numpy() orig_cam = convert_crop_cam_to_orig_img( cam=pred_cam, bbox=bboxes, img_width=orig_width, img_height=orig_height ) output_dict = { 'pred_cam': pred_cam, 'orig_cam': orig_cam, 'verts': pred_verts, 'pose': pred_pose, 'betas': pred_betas, 'joints3d': pred_joints3d, 'joints2d': joints2d, 'bboxes': bboxes, 'frame_ids': frames, } vibe_results[person_id] = output_dict del model end = time.time() fps = num_frames / (end - vibe_time) print(f'VIBE FPS: {fps:.2f}') total_time = time.time() - total_time print( f'Total time spent: {total_time:.2f} seconds (including model loading time).') print( f'Total FPS (including model loading time): {num_frames / total_time:.2f}.') print( f'Saving output results to \"{os.path.join(output_path, "vibe_output.pkl")}\".') # joblib.dump(vibe_results, os.path.join(output_path, "vibe_output.pkl")) for person in vibe_results.keys(): dump_path = os.path.join(output_path, "vibe_output_%s.pkl" % person) os.makedirs(os.path.dirname(dump_path), exist_ok=True) pickle.dump(vibe_results[person], open(dump_path, 'wb')) # if not args.no_render: # # ========= Render results as a single video ========= # # renderer = Renderer(resolution=(orig_width, orig_height), # orig_img=True, wireframe=args.wireframe) # output_img_folder = f'{image_folder}_output' # os.makedirs(output_img_folder, exist_ok=True) # print(f'Rendering output video, writing frames to {output_img_folder}') # # prepare results for rendering # frame_results = prepare_rendering_results(vibe_results, num_frames) # mesh_color = {k: colorsys.hsv_to_rgb( # np.random.rand(), 0.5, 1.0) for k in vibe_results.keys()} # image_file_names = sorted([ # os.path.join(image_folder, x) # for x in os.listdir(image_folder) # if x.endswith('.png') or x.endswith('.jpg') # ]) # for frame_idx in tqdm(range(len(image_file_names))): # img_fname = image_file_names[frame_idx] # img = cv2.imread(img_fname) # for person_id, person_data in frame_results[frame_idx].items(): # frame_verts = person_data['verts'] # frame_cam = person_data['cam'] # mc = mesh_color[person_id] # mesh_filename = None # img = renderer.render( # img, # frame_verts, # cam=frame_cam, # color=mc, # mesh_filename=mesh_filename, # ) # cv2.imwrite(os.path.join(output_img_folder, # f'{frame_idx:06d}.png'), img) # if args.display: # cv2.imshow('Video', img) # if cv2.waitKey(1) & 0xFF == ord('q'): # break # if args.display: # cv2.destroyAllWindows() # # ========= Save rendered video ========= # # vid_name = os.path.basename(video_file) # save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4' # save_name = os.path.join(output_path, save_name) # print(f'Saving result video to {save_name}') # images_to_video(img_folder=output_img_folder, # output_vid_file=save_name) # shutil.rmtree(output_img_folder) 
    shutil.rmtree(image_folder)

    print('================= END =================')
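# --- Illustrative sketch: run_vibe() above writes one pickle per tracked person
# (vibe_output_<person_id>.pkl). A downstream consumer could collect them like
# this; `load_vibe_results` is a hypothetical helper, not repo API.
import glob
import os
import pickle

def load_vibe_results(output_path):
    results = {}
    for path in sorted(glob.glob(os.path.join(output_path, 'vibe_output_*.pkl'))):
        person_id = os.path.splitext(os.path.basename(path))[0].split('_')[-1]
        with open(path, 'rb') as f:
            results[person_id] = pickle.load(f)  # dict with 'pose', 'verts', 'betas', ...
    return results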
def main(args): device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') video_file = args.vid_file # ========= [Optional] download the youtube video ========= # if video_file.startswith('https://www.youtube.com'): print(f'Donwloading YouTube video \"{video_file}\"') video_file = download_youtube_clip(video_file, '/tmp') if video_file is None: exit('Youtube url is not valid!') print(f'YouTube Video has been downloaded to {video_file}...') if not os.path.isfile(video_file): exit(f'Input video \"{video_file}\" does not exist!') output_path = os.path.join(args.output_folder, os.path.basename(video_file).replace('.mp4', '')) os.makedirs(output_path, exist_ok=True) image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True) print(f'Input video number of frames {num_frames}') orig_height, orig_width = img_shape[:2] total_time = time.time() # ========= Run tracking ========= # bbox_scale = 1.1 if args.tracking_method == 'pose': if not os.path.isabs(video_file): video_file = os.path.join(os.getcwd(), video_file) tracking_results = run_posetracker(video_file, staf_folder=args.staf_dir, display=args.display) else: # run multi object tracker mot = MPT( device=device, batch_size=args.tracker_batch_size, display=args.display, detector_type=args.detector, output_format='dict', yolo_img_size=args.yolo_img_size, ) tracking_results = mot(image_folder) # remove tracklets if num_frames is less than MIN_NUM_FRAMES for person_id in list(tracking_results.keys()): if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES: del tracking_results[person_id] # ========= Define VIBE model ========= # model = VIBE_Demo( seqlen=16, n_layers=2, hidden_size=1024, add_linear=True, use_residual=True, ).to(device) # ========= Load pretrained weights ========= # pretrained_file = download_ckpt(use_3dpw=False) ckpt = torch.load(pretrained_file) print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}') ckpt = ckpt['gen_state_dict'] model.load_state_dict(ckpt, strict=False) model.eval() print(f'Loaded pretrained weights from \"{pretrained_file}\"') # ========= Run VIBE on each person ========= # print(f'Running VIBE on each tracklet...') vibe_time = time.time() vibe_results = {} for person_id in tqdm(list(tracking_results.keys())): bboxes = joints2d = None if args.tracking_method == 'bbox': bboxes = tracking_results[person_id]['bbox'] elif args.tracking_method == 'pose': joints2d = tracking_results[person_id]['joints2d'] frames = tracking_results[person_id]['frames'] dataset = Inference( image_folder=image_folder, frames=frames, bboxes=bboxes, joints2d=joints2d, scale=bbox_scale, ) bboxes = dataset.bboxes frames = dataset.frames has_keypoints = True if joints2d is not None else False dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=16) with torch.no_grad(): pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, smpl_joints2d, norm_joints2d = [], [], [], [], [], [], [] for batch in dataloader: if has_keypoints: batch, nj2d = batch norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3)) batch = batch.unsqueeze(0) batch = batch.to(device) batch_size, seqlen = batch.shape[:2] output = model(batch)[-1] pred_cam.append(output['theta'][:, :, :3].reshape(batch_size * seqlen, -1)) pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3)) pred_pose.append(output['theta'][:,:,3:75].reshape(batch_size * seqlen, -1)) pred_betas.append(output['theta'][:, :,75:].reshape(batch_size * seqlen, -1)) 
                pred_joints3d.append(output['kp_3d'].reshape(batch_size * seqlen, -1, 3))
                smpl_joints2d.append(output['kp_2d'].reshape(batch_size * seqlen, -1, 2))

            pred_cam = torch.cat(pred_cam, dim=0)
            pred_verts = torch.cat(pred_verts, dim=0)
            pred_pose = torch.cat(pred_pose, dim=0)
            pred_betas = torch.cat(pred_betas, dim=0)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)
            smpl_joints2d = torch.cat(smpl_joints2d, dim=0)

            del batch

        # ========= [Optional] run Temporal SMPLify to refine the results ========= #
        if args.run_smplify and args.tracking_method == 'pose':
            norm_joints2d = np.concatenate(norm_joints2d, axis=0)
            norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin')
            norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device)

            # Run Temporal SMPLify
            update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \
            new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner(
                pred_rotmat=pred_pose,
                pred_betas=pred_betas,
                pred_cam=pred_cam,
                j2d=norm_joints2d,
                device=device,
                batch_size=norm_joints2d.shape[0],
                pose2aa=False,
            )
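# --- Illustrative sketch of how the slicing above interprets VIBE's 85-D
# 'theta' output: [0:3] is the weak-perspective camera, [3:75] the 72 SMPL pose
# parameters, and [75:85] the 10 shape (betas) parameters. `split_theta` is a
# hypothetical helper, shown only to document the layout.
import torch

def split_theta(theta):
    # theta: (batch, seqlen, 85) tensor returned by the VIBE generator
    cam = theta[:, :, :3]
    pose = theta[:, :, 3:75]
    betas = theta[:, :, 75:]
    return cam, pose, betas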
def main(args): device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') video_file = args.vid_file # ========= [Optional] download the youtube video ========= # if video_file.startswith('https://www.youtube.com'): print(f'Donwloading YouTube video \"{video_file}\"') video_file = download_youtube_clip(video_file, '/tmp') if video_file is None: exit('Youtube url is not valid!') print(f'YouTube Video has been downloaded to {video_file}...') if not os.path.isfile(video_file): exit(f'Input video \"{video_file}\" does not exist!') output_path = os.path.join( args.output_folder, os.path.basename(video_file).replace('.mp4', '')) os.makedirs(output_path, exist_ok=True) image_folder, num_frames, img_shape = video_to_images(video_file, return_info=True) print(f'Input video number of frames {num_frames}') orig_height, orig_width = img_shape[:2] total_time = time.time() # ========= Run tracking ========= # bbox_scale = 1.1 if args.tracking_method == 'pose': if not os.path.isabs(video_file): video_file = os.path.join(os.getcwd(), video_file) tracking_results = run_posetracker(video_file, staf_folder=args.staf_dir, display=args.display) else: # run multi object tracker mot = MPT( device=device, batch_size=args.tracker_batch_size, display=args.display, detector_type=args.detector, output_format='dict', yolo_img_size=args.yolo_img_size, ) tracking_results = mot(image_folder) # remove tracklets if num_frames is less than MIN_NUM_FRAMES for person_id in list(tracking_results.keys()): if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES: del tracking_results[person_id] # ========= Define VIBE model ========= # model = VIBE_Demo( seqlen=16, n_layers=2, hidden_size=1024, add_linear=True, use_residual=True, ).to(device) # ========= Load pretrained weights ========= # pretrained_file = download_ckpt(use_3dpw=False) ckpt = torch.load(pretrained_file) print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}') ckpt = ckpt['gen_state_dict'] model.load_state_dict(ckpt, strict=False) model.eval() print(f'Loaded pretrained weights from \"{pretrained_file}\"') # ========= Run VIBE on each person ========= # print(f'Running VIBE on each tracklet...') vibe_time = time.time() vibe_results = {} for person_id in tqdm(list(tracking_results.keys())): bboxes = joints2d = None if args.tracking_method == 'bbox': bboxes = tracking_results[person_id]['bbox'] elif args.tracking_method == 'pose': joints2d = tracking_results[person_id]['joints2d'] frames = tracking_results[person_id]['frames'] dataset = Inference( image_folder=image_folder, frames=frames, bboxes=bboxes, joints2d=joints2d, scale=bbox_scale, ) bboxes = dataset.bboxes frames = dataset.frames has_keypoints = True if joints2d is not None else False dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=16) with torch.no_grad(): pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], [] for batch in dataloader: if has_keypoints: batch, nj2d = batch norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3)) batch = batch.unsqueeze(0) batch = batch.to(device) batch_size, seqlen = batch.shape[:2] output = model(batch)[-1] pred_cam.append(output['theta'][:, :, :3].reshape( batch_size * seqlen, -1)) pred_verts.append(output['verts'].reshape( batch_size * seqlen, -1, 3)) pred_pose.append(output['theta'][:, :, 3:75].reshape( batch_size * seqlen, -1)) pred_betas.append(output['theta'][:, :, 75:].reshape( batch_size * seqlen, -1)) 
pred_joints3d.append(output['kp_3d'].reshape( batch_size * seqlen, -1, 3)) pred_cam = torch.cat(pred_cam, dim=0) pred_verts = torch.cat(pred_verts, dim=0) pred_pose = torch.cat(pred_pose, dim=0) pred_betas = torch.cat(pred_betas, dim=0) pred_joints3d = torch.cat(pred_joints3d, dim=0) del batch # ========= [Optional] run Temporal SMPLify to refine the results ========= # if args.run_smplify and args.tracking_method == 'pose': norm_joints2d = np.concatenate(norm_joints2d, axis=0) norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin') norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device) # Run Temporal SMPLify update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \ new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner( pred_rotmat=pred_pose, pred_betas=pred_betas, pred_cam=pred_cam, j2d=norm_joints2d, device=device, batch_size=norm_joints2d.shape[0], pose2aa=False, ) # update the parameters after refinement print( f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}' ) pred_verts = pred_verts.cpu() pred_cam = pred_cam.cpu() pred_pose = pred_pose.cpu() pred_betas = pred_betas.cpu() pred_joints3d = pred_joints3d.cpu() pred_verts[update] = new_opt_vertices[update] pred_cam[update] = new_opt_cam[update] pred_pose[update] = new_opt_pose[update] pred_betas[update] = new_opt_betas[update] pred_joints3d[update] = new_opt_joints3d[update] elif args.run_smplify and args.tracking_method == 'bbox': print( '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!' ) print('[WARNING] Continuing without running Temporal SMPLify!..') # ========= Save results to a pickle file ========= # pred_cam = pred_cam.cpu().numpy() pred_verts = pred_verts.cpu().numpy() pred_pose = pred_pose.cpu().numpy() pred_betas = pred_betas.cpu().numpy() pred_joints3d = pred_joints3d.cpu().numpy() # Runs 1 Euro Filter to smooth out the results if args.smooth: min_cutoff = args.smooth_min_cutoff # 0.004 beta = args.smooth_beta # 1.5 print( f'Running smoothing on person {person_id}, min_cutoff: {min_cutoff}, beta: {beta}' ) pred_verts, pred_pose, pred_joints3d = smooth_pose( pred_pose, pred_betas, min_cutoff=min_cutoff, beta=beta) orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam, bbox=bboxes, img_width=orig_width, img_height=orig_height) output_dict = { 'pred_cam': pred_cam, 'orig_cam': orig_cam, 'verts': pred_verts, 'pose': pred_pose, 'betas': pred_betas, 'joints3d': pred_joints3d, 'joints2d': joints2d, 'bboxes': bboxes, 'frame_ids': frames, } vibe_results[person_id] = output_dict del model end = time.time() fps = num_frames / (end - vibe_time) print(f'VIBE FPS: {fps:.2f}') total_time = time.time() - total_time print( f'Total time spent: {total_time:.2f} seconds (including model loading time).' ) print( f'Total FPS (including model loading time): {num_frames / total_time:.2f}.' ) print( f'Saving output results to \"{os.path.join(output_path, "vibe_output.pkl")}\".' 
) joblib.dump(vibe_results, os.path.join(output_path, "vibe_output.pkl")) if not args.no_render: # ========= Render results as a single video ========= # renderer = Renderer(resolution=(orig_width, orig_height), orig_img=True, wireframe=args.wireframe) output_img_folder = f'{image_folder}_output' os.makedirs(output_img_folder, exist_ok=True) if args.joints3dview: output_img_raw_folder = f'{image_folder}_raw_output' os.makedirs(output_img_raw_folder, exist_ok=True) output_img_joints3d_folder = f'{image_folder}_joints3d_output' os.makedirs(output_img_joints3d_folder, exist_ok=True) output_img_mesh_folder = f'{image_folder}_mesh_output' os.makedirs(output_img_mesh_folder, exist_ok=True) output_img_meshside_folder = f'{image_folder}_meshside_output' os.makedirs(output_img_meshside_folder, exist_ok=True) output_img_all_folder = f'{image_folder}_all_output' os.makedirs(output_img_all_folder, exist_ok=True) print(f'Rendering output video, writing frames to {output_img_folder}') # prepare results for rendering frame_results = prepare_rendering_results(vibe_results, num_frames) mesh_color = { k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0) for k in vibe_results.keys() } image_file_names = sorted([ os.path.join(image_folder, x) for x in os.listdir(image_folder) if x.endswith('.png') or x.endswith('.jpg') ]) length_image_files = len(image_file_names) #length_image_files = 100 for frame_idx in tqdm(range(length_image_files)): img_fname = image_file_names[frame_idx] img = cv2.imread(img_fname) if args.sideview: side_img = np.zeros_like(img) if args.joints3dview: img_raw = img.copy() img_joints3d = np.zeros_like(img) joints3d_list = [] for person_id, person_data in frame_results[frame_idx].items(): frame_verts = person_data['verts'] frame_cam = person_data['cam'] joints3d = person_data['joints3d'] #print('frame_verts.shape = {}\nframe_cam.shape ={}\njoints3d.shape = {}'.format( # frame_verts.shape, frame_cam.shape, joints3d.shape)) mc = mesh_color[person_id] if args.joints3dview: joints3d_list.append(joints3d) # img_joints3d = render_joints3d(joints3d, img_raw.shape) mesh_filename = None if args.save_obj: mesh_folder = os.path.join(output_path, 'meshes', f'{person_id:04d}') os.makedirs(mesh_folder, exist_ok=True) mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj') img = renderer.render( img, frame_verts, cam=frame_cam, color=mc, mesh_filename=mesh_filename, ) if args.sideview: side_img = renderer.render( side_img, frame_verts, cam=frame_cam, color=mc, angle=270, axis=[0, 1, 0], ) if args.sideview: img_mesh = img.copy() img = np.concatenate([img, side_img], axis=1) cv2.imwrite( os.path.join(output_img_folder, f'{frame_idx:06d}.png'), img) if args.joints3dview: #img_joints3d = np.zeros_like(img_raw) if len(joints3d_list) == 0: img_joints3d = np.zeros_like(img_raw) else: joints3d = np.concatenate(joints3d_list) img_joints3d = render_joints3d(joints3d, img_raw.shape) if args.joints3dview: img_up = np.concatenate([img_raw, img_joints3d], axis=1) img_down = np.concatenate([img_mesh, side_img], axis=1) img_all = np.concatenate([img_up, img_down], axis=0) cv2.imwrite( os.path.join(output_img_raw_folder, f'{frame_idx:06d}.png'), img_raw) cv2.imwrite( os.path.join(output_img_joints3d_folder, f'{frame_idx:06d}.png'), img_joints3d) cv2.imwrite( os.path.join(output_img_mesh_folder, f'{frame_idx:06d}.png'), img_mesh) cv2.imwrite( os.path.join(output_img_meshside_folder, f'{frame_idx:06d}.png'), side_img) cv2.imwrite( os.path.join(output_img_all_folder, f'{frame_idx:06d}.png'), img_all) if 
args.display: cv2.imshow('Video', img) if cv2.waitKey(1) & 0xFF == ord('q'): break if args.display: cv2.destroyAllWindows() # ========= Save rendered video ========= # vid_name = os.path.basename(video_file) save_name = f'{vid_name.replace(".mp4", "")}_vibe_result.mp4' save_name = os.path.join(output_path, save_name) print(f'Saving result video to {save_name}') images_to_video(img_folder=output_img_folder, output_vid_file=save_name) shutil.rmtree(output_img_folder) if args.joints3dview: ''' save_name_raw = f'{vid_name.replace(".mp4", "")}_raw.mp4' save_name_raw = os.path.join(output_path, save_name_raw) images_to_video(img_folder=output_img_raw_folder, output_vid_file=save_name_raw) shutil.rmtree(output_img_raw_folder) save_name_joints3d = f'{vid_name.replace(".mp4", "")}_joints3d.mp4' save_name_joints3d = os.path.join(output_path, save_name_joints3d) images_to_video(img_folder=output_img_joints3d_folder, output_vid_file=save_name_joints3d) shutil.rmtree(output_img_joints3d_folder) save_name_mesh = f'{vid_name.replace(".mp4", "")}_mesh.mp4' save_name_mesh = os.path.join(output_path, save_name_mesh) images_to_video(img_folder=output_img_mesh_folder, output_vid_file=save_name_mesh) shutil.rmtree(output_img_mesh_folder) save_name_meshside = f'{vid_name.replace(".mp4", "")}_meshside.mp4' save_name_meshside = os.path.join(output_path, save_name_meshside) images_to_video(img_folder=output_img_meshside_folder, output_vid_file=save_name_meshside) shutil.rmtree(output_img_meshside_folder) ''' save_name_all = f'{vid_name.replace(".mp4", "")}_all.mp4' save_name_all = os.path.join(output_path, save_name_all) images_to_video(img_folder=output_img_all_folder, output_vid_file=save_name_all) shutil.rmtree(output_img_all_folder) shutil.rmtree(image_folder) print('================= END =================')
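# --- Illustrative sketch: the --joints3dview branch above tiles four
# equally-sized panels (raw frame | 3D joints on top, mesh | side view on the
# bottom) into a single frame before writing it out. `tile_2x2` is a
# hypothetical helper showing that composition.
import numpy as np

def tile_2x2(top_left, top_right, bottom_left, bottom_right):
    top = np.concatenate([top_left, top_right], axis=1)          # side by side
    bottom = np.concatenate([bottom_left, bottom_right], axis=1)
    return np.concatenate([top, bottom], axis=0)                  # stack rows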
def main(args): device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') # ========= Define VIBE model ========= # model = VIBE_Demo( seqlen=16, device=device, n_layers=2, hidden_size=1024, add_linear=True, use_residual=True, ).to(device) # ========= Load pretrained weights ========= # pretrained_file = download_ckpt(use_3dpw=False) ckpt = torch.load(pretrained_file, map_location=device) print(f'Performance of pretrained model on 3DPW: {ckpt["performance"]}') ckpt = ckpt['gen_state_dict'] model.load_state_dict(ckpt, strict=False) model.eval() print(f'Loaded pretrained weights from \"{pretrained_file}\"') total_time = time.time() # ========= Run VIBE on crops ========= # print(f'Running VIBE on crops...') vibe_time = time.time() image_folder = args.input_folder dataset = InferenceFromCrops(image_folder=image_folder) orig_height = orig_width = 512 dataloader = DataLoader(dataset, batch_size=args.vibe_batch_size, num_workers=0) with torch.no_grad(): pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], [] for batch_num, batch in enumerate(dataloader): print("BATCH:", batch_num) batch = batch.unsqueeze(0) batch = batch.to(device) batch_size, seqlen = batch.shape[:2] output = model(batch)[-1] pred_cam.append(output['theta'][:, :, :3].reshape( batch_size * seqlen, -1)) pred_verts.append(output['verts'].reshape(batch_size * seqlen, -1, 3)) pred_pose.append(output['theta'][:, :, 3:75].reshape( batch_size * seqlen, -1)) pred_betas.append(output['theta'][:, :, 75:].reshape( batch_size * seqlen, -1)) pred_joints3d.append(output['kp_3d'].reshape( batch_size * seqlen, -1, 3)) pred_cam = torch.cat(pred_cam, dim=0) pred_verts = torch.cat(pred_verts, dim=0) pred_pose = torch.cat(pred_pose, dim=0) pred_betas = torch.cat(pred_betas, dim=0) pred_joints3d = torch.cat(pred_joints3d, dim=0) del batch # ========= [Optional] run Temporal SMPLify to refine the results ========= # if args.run_smplify and args.tracking_method == 'pose': norm_joints2d = np.concatenate(norm_joints2d, axis=0) norm_joints2d = convert_kps(norm_joints2d, src='staf', dst='spin') norm_joints2d = torch.from_numpy(norm_joints2d).float().to(device) # Run Temporal SMPLify update, new_opt_vertices, new_opt_cam, new_opt_pose, new_opt_betas, \ new_opt_joints3d, new_opt_joint_loss, opt_joint_loss = smplify_runner( pred_rotmat=pred_pose, pred_betas=pred_betas, pred_cam=pred_cam, j2d=norm_joints2d, device=device, batch_size=norm_joints2d.shape[0], pose2aa=False, ) # update the parameters after refinement print( f'Update ratio after Temporal SMPLify: {update.sum()} / {norm_joints2d.shape[0]}' ) pred_verts = pred_verts.cpu() pred_cam = pred_cam.cpu() pred_pose = pred_pose.cpu() pred_betas = pred_betas.cpu() pred_joints3d = pred_joints3d.cpu() pred_verts[update] = new_opt_vertices[update] pred_cam[update] = new_opt_cam[update] pred_pose[update] = new_opt_pose[update] pred_betas[update] = new_opt_betas[update] pred_joints3d[update] = new_opt_joints3d[update] elif args.run_smplify and args.tracking_method == 'bbox': print( '[WARNING] You need to enable pose tracking to run Temporal SMPLify algorithm!' 
) print('[WARNING] Continuing without running Temporal SMPLify!..') # ========= Save results to a pickle file ========= # output_path = image_folder.replace('cropped_frames', 'vibe_results') os.makedirs(output_path, exist_ok=True) pred_cam = pred_cam.cpu().numpy() pred_verts = pred_verts.cpu().numpy() pred_pose = pred_pose.cpu().numpy() pred_betas = pred_betas.cpu().numpy() pred_joints3d = pred_joints3d.cpu().numpy() vibe_results = { 'pred_cam': pred_cam, 'verts': pred_verts, 'pose': pred_pose, 'betas': pred_betas, 'joints3d': pred_joints3d, } del model end = time.time() fps = len(dataset) / (end - vibe_time) print(f'VIBE FPS: {fps:.2f}') total_time = time.time() - total_time print( f'Total time spent: {total_time:.2f} seconds (including model loading time).' ) print( f'Total FPS (including model loading time): {len(dataset) / total_time:.2f}.' ) print( f'Saving vibe results to \"{os.path.join(output_path, "vibe_results.pkl")}\".' ) with open(os.path.join(output_path, "vibe_results.pkl"), 'wb') as f_save: pickle.dump(vibe_results, f_save) if not args.no_render: # ========= Render results as a single video ========= # renderer = Renderer(resolution=(orig_width, orig_height), orig_img=True, wireframe=args.wireframe) output_img_folder = os.path.join(output_path, 'vibe_images') os.makedirs(output_img_folder, exist_ok=True) print(f'Rendering output video, writing frames to {output_img_folder}') image_file_names = sorted([ os.path.join(image_folder, x) for x in os.listdir(image_folder) if x.endswith('.png') or x.endswith('.jpg') ]) for frame_idx in tqdm(range(len(image_file_names))): img_fname = image_file_names[frame_idx] img = cv2.imread(img_fname) frame_verts = vibe_results['verts'][frame_idx] frame_cam = vibe_results['pred_cam'][frame_idx] mesh_filename = None if args.save_obj: mesh_folder = os.path.join(output_path, 'vibe_meshes') os.makedirs(mesh_folder, exist_ok=True) mesh_filename = os.path.join(mesh_folder, f'{frame_idx:06d}.obj') rend_img = renderer.render( img, frame_verts, cam=frame_cam, mesh_filename=mesh_filename, ) whole_img = rend_img if args.sideview: side_img_bg = np.zeros_like(img) side_rend_img90 = renderer.render( side_img_bg, frame_verts, cam=frame_cam, angle=90, axis=[0, 1, 0], ) side_rend_img270 = renderer.render( side_img_bg, frame_verts, cam=frame_cam, angle=270, axis=[0, 1, 0], ) if args.reposed_render: smpl = SMPL('data/vibe_data', batch_size=1) zero_pose = torch.from_numpy( np.zeros((1, pred_pose.shape[-1]))).float() zero_pose[:, 0] = np.pi pred_frame_betas = torch.from_numpy( pred_betas[frame_idx][None, :]).float() with torch.no_grad(): reposed_smpl_output = smpl( betas=pred_frame_betas, body_pose=zero_pose[:, 3:], global_orient=zero_pose[:, :3]) reposed_verts = reposed_smpl_output.vertices reposed_verts = reposed_verts.cpu().detach().numpy() reposed_cam = np.array([0.9, 0, 0]) reposed_rend_img = renderer.render(side_img_bg, reposed_verts[0], cam=reposed_cam) reposed_rend_img90 = renderer.render(side_img_bg, reposed_verts[0], cam=reposed_cam, angle=90, axis=[0, 1, 0]) top_row = np.concatenate( [img, reposed_rend_img, reposed_rend_img90], axis=1) bot_row = np.concatenate( [rend_img, side_rend_img90, side_rend_img270], axis=1) whole_img = np.concatenate([top_row, bot_row], axis=0) else: top_row = np.concatenate([img, side_img_bg, side_img_bg], axis=1) bot_row = np.concatenate( [rend_img, side_rend_img90, side_rend_img270], axis=1) whole_img = np.concatenate([top_row, bot_row], axis=0) # cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.png'), whole_img) 
            cv2.imwrite(os.path.join(output_img_folder, os.path.basename(img_fname)), whole_img)

        # ========= Save rendered video ========= #
        save_vid_path = os.path.join(output_path, 'vibe_video.mp4')
        print(f'Saving result video to {save_vid_path}')
        images_to_video(img_folder=output_img_folder, output_vid_file=save_vid_path)

    print('================= END =================')
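# --- Illustrative sketch (not repo API): the 72-D 'pose' vectors saved in the
# result dictionaries are 24 axis-angle rotations. Converting them to rotation
# matrices, e.g. for retargeting to another skeleton, can be done with SciPy;
# `pose_to_rotmats` is a hypothetical helper and SciPy is an added dependency.
import numpy as np
from scipy.spatial.transform import Rotation as R

def pose_to_rotmats(pose_vec):
    # pose_vec: (72,) axis-angle parameters -> (24, 3, 3) rotation matrices
    return R.from_rotvec(pose_vec.reshape(24, 3)).as_matrix()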