def __init__(self, step_size=1e-2, batch_size=66, num_iters=100,
             focal_length=5000, use_lbfgs=True,
             device=torch.device('cuda'), max_iter=20):
    # Store options
    self.device = device
    self.focal_length = focal_length
    self.step_size = step_size
    self.max_iter = max_iter

    # Ignore the following joints for the fitting process
    ign_joints = ['OP Neck', 'OP RHip', 'OP LHip', 'Right Hip', 'Left Hip']
    self.ign_joints = [JOINT_IDS[i] for i in ign_joints]
    self.num_iters = num_iters

    # GMM pose prior
    self.pose_prior = MaxMixturePrior(prior_folder=VIBE_DATA_DIR,
                                      num_gaussians=8,
                                      dtype=torch.float32).to(device)
    self.use_lbfgs = use_lbfgs

    # Load SMPL model
    self.smpl = SMPL(SMPL_MODEL_DIR,
                     batch_size=batch_size,
                     create_transl=False).to(self.device)
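# A minimal sketch of how the L-BFGS options stored above are typically consumed
# in a fitting step (assumed usage: a toy quadratic stands in for the real
# SMPLify objective of reprojection error plus the GMM pose prior):
import torch

params = [torch.zeros(1, 72, requires_grad=True)]
optimizer = torch.optim.LBFGS(params, lr=1e-2, max_iter=20,
                              line_search_fn='strong_wolfe')

def closure():
    optimizer.zero_grad()
    loss = (params[0] ** 2).sum()  # stand-in for the data + prior terms
    loss.backward()
    return loss

for _ in range(100):  # num_iters
    optimizer.step(closure)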
def __init__(self, smpl_mean_params=SMPL_MEAN_PARAMS):
    super(Regressor, self).__init__()

    npose = 24 * 6  # 24 joints, 6D rotation representation each
    # Input: 2048-D image feature + current pose (npose), shape (10), cam (3)
    self.fc1 = nn.Linear(512 * 4 + npose + 13, 1024)
    self.drop1 = nn.Dropout()
    self.fc2 = nn.Linear(1024, 1024)
    self.drop2 = nn.Dropout()
    self.decpose = nn.Linear(1024, npose)
    self.decshape = nn.Linear(1024, 10)
    self.deccam = nn.Linear(1024, 3)
    nn.init.xavier_uniform_(self.decpose.weight, gain=0.01)
    nn.init.xavier_uniform_(self.decshape.weight, gain=0.01)
    nn.init.xavier_uniform_(self.deccam.weight, gain=0.01)

    self.smpl = SMPL(SMPL_MODEL_DIR, batch_size=64, create_transl=False)

    # Mean SMPL parameters used to initialize the iterative regression
    mean_params = np.load(smpl_mean_params)
    init_pose = torch.from_numpy(mean_params['pose'][:]).unsqueeze(0)
    init_shape = torch.from_numpy(
        mean_params['shape'][:].astype('float32')).unsqueeze(0)
    init_cam = torch.from_numpy(mean_params['cam']).unsqueeze(0)
    self.register_buffer('init_pose', init_pose)
    self.register_buffer('init_shape', init_shape)
    self.register_buffer('init_cam', init_cam)
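# For reference, a minimal sketch of the 6D-to-rotation-matrix mapping that the
# npose = 24 * 6 parameterization above assumes (Zhou et al., CVPR 2019). The
# codebase's own rot6d_to_rotmat may lay out dimensions differently.
import torch
import torch.nn.functional as F

def rot6d_to_rotmat_sketch(x):
    # x: (B, 24 * 6) -> (B * 24, 3, 3)
    x = x.reshape(-1, 3, 2)
    a1, a2 = x[..., 0], x[..., 1]
    b1 = F.normalize(a1, dim=-1)
    # Gram-Schmidt: remove the b1 component from a2, then normalize
    b2 = F.normalize(a2 - (b1 * a2).sum(dim=-1, keepdim=True) * b1, dim=-1)
    b3 = torch.cross(b1, b2, dim=-1)
    return torch.stack([b1, b2, b3], dim=-1)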
def compute_error_verts(pred_verts, target_verts=None, target_theta=None):
    """
    Computes per-vertex Euclidean error over the 6890 SMPL surface vertices.

    Args:
        pred_verts (Nx6890x3): predicted vertices.
        target_verts (Nx6890x3, optional): ground-truth vertices. If None,
            they are regressed from target_theta (Nx85: cam 3 | pose 72 | shape 10).
    Returns:
        error_verts (N): mean per-vertex error for each frame.
    """
    if target_verts is None:
        from lib.models.smpl import SMPL_MODEL_DIR
        from lib.models.smpl import SMPL
        device = 'cpu'
        smpl = SMPL(
            SMPL_MODEL_DIR,
            batch_size=1,  # target_theta.shape[0],
        ).to(device)

        betas = torch.from_numpy(target_theta[:, 75:]).to(device)
        pose = torch.from_numpy(target_theta[:, 3:75]).to(device)

        target_verts = []
        b_ = torch.split(betas, 5000)
        p_ = torch.split(pose, 5000)

        # Regress ground-truth vertices in chunks of 5000 frames to bound memory
        for b, p in zip(b_, p_):
            output = smpl(betas=b,
                          body_pose=p[:, 3:],
                          global_orient=p[:, :3],
                          pose2rot=True)
            target_verts.append(output.vertices.detach().cpu().numpy())

        target_verts = np.concatenate(target_verts, axis=0)

    assert len(pred_verts) == len(target_verts)
    error_per_vert = np.sqrt(np.sum((target_verts - pred_verts) ** 2, axis=2))
    return np.mean(error_per_vert, axis=1)
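# Minimal usage sketch for compute_error_verts; the random arrays stand in for
# real (N, 6890, 3) predicted and ground-truth vertices.
import numpy as np

pred_verts = np.random.randn(8, 6890, 3).astype(np.float32)
target_verts = np.random.randn(8, 6890, 3).astype(np.float32)
per_frame_error = compute_error_verts(pred_verts, target_verts=target_verts)
print(per_frame_error.shape)  # (8,)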
def __init__(self, block, layers, smpl_mean_params):
    self.inplanes = 64
    super(HMR, self).__init__()
    npose = 24 * 6

    # ResNet backbone
    self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                           bias=False)
    self.bn1 = nn.BatchNorm2d(64)
    self.relu = nn.ReLU(inplace=True)
    self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
    self.layer1 = self._make_layer(block, 64, layers[0])
    self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
    self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
    self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
    self.avgpool = nn.AvgPool2d(7, stride=1)

    # Iterative SMPL parameter regressor head
    self.fc1 = nn.Linear(512 * block.expansion + npose + 13, 1024)
    self.drop1 = nn.Dropout()
    self.fc2 = nn.Linear(1024, 1024)
    self.drop2 = nn.Dropout()
    self.decpose = nn.Linear(1024, npose)
    self.decshape = nn.Linear(1024, 10)
    self.deccam = nn.Linear(1024, 3)
    nn.init.xavier_uniform_(self.decpose.weight, gain=0.01)
    nn.init.xavier_uniform_(self.decshape.weight, gain=0.01)
    nn.init.xavier_uniform_(self.deccam.weight, gain=0.01)

    self.smpl = SMPL(SMPL_MODEL_DIR, batch_size=64,
                     create_transl=False).to('cpu')

    # He initialization for conv layers, constant init for batch norm
    for m in self.modules():
        if isinstance(m, nn.Conv2d):
            n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            m.weight.data.normal_(0, math.sqrt(2. / n))
        elif isinstance(m, nn.BatchNorm2d):
            m.weight.data.fill_(1)
            m.bias.data.zero_()

    # Mean SMPL parameters used to initialize the iterative regression
    mean_params = np.load(smpl_mean_params)
    init_pose = torch.from_numpy(mean_params['pose'][:]).unsqueeze(0)
    init_shape = torch.from_numpy(
        mean_params['shape'][:].astype('float32')).unsqueeze(0)
    init_cam = torch.from_numpy(mean_params['cam']).unsqueeze(0)
    self.register_buffer('init_pose', init_pose)
    self.register_buffer('init_shape', init_shape)
    self.register_buffer('init_cam', init_cam)
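# Hypothetical construction of HMR, mirroring the usual ResNet-50 configuration
# (Bottleneck blocks with [3, 4, 6, 3] layers, so 512 * block.expansion = 2048
# matches fc1). SMPL_MEAN_PARAMS is assumed to point at the mean-parameter .npz.
from torchvision.models.resnet import Bottleneck

model = HMR(Bottleneck, [3, 4, 6, 3], SMPL_MEAN_PARAMS)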
def smooth_pose(pred_pose, pred_betas, min_cutoff=0.004, beta=0.7):
    # min_cutoff: decreasing the minimum cutoff frequency decreases slow-speed jitter
    # beta: increasing the speed coefficient (beta) decreases speed lag.
    # The indexing below implies per-frame pose arranged as (24, ...) with the
    # global orientation first, e.g. (24, 3) axis-angle; pass pose2rot=False to
    # the SMPL calls if rotation matrices are used instead.
    one_euro_filter = OneEuroFilter(
        np.zeros_like(pred_pose[0]),
        pred_pose[0],
        min_cutoff=min_cutoff,
        beta=beta,
    )

    smpl = SMPL(model_path=SMPL_MODEL_DIR)

    pred_pose_hat = np.zeros_like(pred_pose)

    # initialize with the first (unfiltered) frame
    pred_pose_hat[0] = pred_pose[0]

    pred_verts_hat = []
    pred_joints3d_hat = []

    smpl_output = smpl(
        betas=torch.from_numpy(pred_betas[0]).unsqueeze(0),
        body_pose=torch.from_numpy(pred_pose[0, 1:]).unsqueeze(0),
        global_orient=torch.from_numpy(pred_pose[0, 0:1]).unsqueeze(0),
    )
    pred_verts_hat.append(smpl_output.vertices.detach().cpu().numpy())
    pred_joints3d_hat.append(smpl_output.joints.detach().cpu().numpy())

    # Filter each subsequent frame with the One Euro filter, then re-run SMPL
    for idx, pose in enumerate(pred_pose[1:]):
        idx += 1
        t = np.ones_like(pose) * idx
        pose = one_euro_filter(t, pose)
        pred_pose_hat[idx] = pose

        smpl_output = smpl(
            betas=torch.from_numpy(pred_betas[idx]).unsqueeze(0),
            body_pose=torch.from_numpy(pred_pose_hat[idx, 1:]).unsqueeze(0),
            global_orient=torch.from_numpy(pred_pose_hat[idx, 0:1]).unsqueeze(0),
        )
        pred_verts_hat.append(smpl_output.vertices.detach().cpu().numpy())
        pred_joints3d_hat.append(smpl_output.joints.detach().cpu().numpy())

    return np.vstack(pred_verts_hat), pred_pose_hat, np.vstack(pred_joints3d_hat)
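# Hypothetical usage of smooth_pose. Shapes follow the hedged reading above:
# per-frame pose as (24, 3) axis-angle with global orientation first; zeros
# give the rest pose, so this only exercises the plumbing.
import numpy as np

T = 30
pred_pose = np.zeros((T, 24, 3), dtype=np.float32)
pred_betas = np.zeros((T, 10), dtype=np.float32)
verts, pose_hat, joints3d = smooth_pose(pred_pose, pred_betas)
print(verts.shape, pose_hat.shape)  # (30, 6890, 3), (30, 24, 3)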
def __init__(self, vibe, cfg="zen_rec_23"):
    super(REFINERV2, self).__init__()
    vae_cfg = Config(cfg)
    self.vae_model, _, _ = get_models(vae_cfg, iter=-1)

    self.vibe = vibe
    self.vibe.eval()

    self.smpl = SMPL(
        SMPL_MODEL_DIR,
        batch_size=64,
        create_transl=False
    )
def get_regressor_output(features):
    from lib.models.spin import Regressor

    batch_size, seqlen = features.shape[:2]
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model = Regressor().to(device)
    smpl = SMPL(SMPL_MODEL_DIR).to(device)

    pretrained = torch.load('models/model_best.pth.tar',
                            map_location=torch.device('cpu'))['gen_state_dict']

    new_pretrained_dict = {}
    for k, v in pretrained.items():
        if 'regressor' in k:
            new_pretrained_dict[k[10:]] = v  # strip the 'regressor.' prefix
            # adapt mean theta to new batch size
            if 'mean_theta' in k:
                del new_pretrained_dict[k[10:]]
    model.load_state_dict(new_pretrained_dict, strict=False)

    features = features.reshape(batch_size * seqlen, -1)
    features = features.to(device)

    theta = model(features)[-1]

    cam = theta[:, 0:3].contiguous()
    pose = theta[:, 3:75].contiguous()
    shape = theta[:, 75:].contiguous()

    pred_output = smpl(betas=shape,
                       body_pose=pose[:, 3:],
                       global_orient=pose[:, :3],
                       pose2rot=True)
    verts = pred_output.vertices  # , _, _ = smpl(pose, shape)
    verts = verts.reshape(batch_size, seqlen, -1, 3)
    cam = cam.reshape(batch_size, seqlen, -1)

    return verts, cam
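# Hypothetical call, assuming the hard-coded checkpoint 'models/model_best.pth.tar'
# exists. The 2048-D feature size matches the SPIN/ResNet-50 backbone features
# used elsewhere in this codebase; output shapes follow the reshapes above.
features = torch.randn(2, 16, 2048)
verts, cam = get_regressor_output(features)
print(verts.shape, cam.shape)  # (2, 16, 6890, 3), (2, 16, 3)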
def read_data(folder, set, debug=False):
    dataset = {
        'vid_name': [],
        'frame_id': [],
        'joints3D': [],
        'joints2D': [],
        'shape': [],
        'pose': [],
        'bbox': [],
        'img_name': [],
        'features': [],
        'valid': [],
    }

    model = spin.get_pretrained_hmr()

    if set == 'val':
        set = 'test'

    sequences = [
        x.split('.')[0]
        for x in os.listdir(osp.join(folder, 'sequenceFiles', set))
    ]

    J_regressor = None

    smpl = SMPL(SMPL_MODEL_DIR, batch_size=1, create_transl=False)
    if set == 'test':
        J_regressor = torch.from_numpy(
            np.load(osp.join(VIBE_DATA_DIR, 'J_regressor_h36m.npy'))).float()

    for i, seq in tqdm(enumerate(sequences)):
        data_file = osp.join(folder, 'sequenceFiles', set, seq + '.pkl')
        data = pkl.load(open(data_file, 'rb'), encoding='latin1')

        img_dir = osp.join(folder, 'imageFiles', seq)

        num_people = len(data['poses'])
        num_frames = len(data['img_frame_ids'])
        assert (data['poses2d'][0].shape[0] == num_frames)

        for p_id in range(num_people):
            pose = torch.from_numpy(data['poses'][p_id]).float()
            shape = torch.from_numpy(data['betas'][p_id][:10]).float().repeat(
                pose.size(0), 1)
            trans = torch.from_numpy(data['trans'][p_id]).float()
            j2d = data['poses2d'][p_id].transpose(0, 2, 1)
            cam_pose = data['cam_poses']
            campose_valid = data['campose_valid'][p_id]

            # ======== Align the mesh params ======== #
            # Rotate the global orientation into each frame's camera frame
            rot = pose[:, :3]
            rot_mat = batch_rodrigues(rot)

            Rc = torch.from_numpy(cam_pose[:, :3, :3]).float()
            Rs = torch.bmm(Rc, rot_mat.reshape(-1, 3, 3))
            rot = rotation_matrix_to_angle_axis(Rs)
            pose[:, :3] = rot
            # ======== Align the mesh params ======== #

            output = smpl(betas=shape,
                          body_pose=pose[:, 3:],
                          global_orient=pose[:, :3],
                          transl=trans)
            # verts = output.vertices
            j3d = output.joints

            if J_regressor is not None:
                vertices = output.vertices
                J_regressor_batch = J_regressor[None, :].expand(
                    vertices.shape[0], -1, -1).to(vertices.device)
                j3d = torch.matmul(J_regressor_batch, vertices)
                j3d = j3d[:, H36M_TO_J14, :]

            img_paths = []
            for i_frame in range(num_frames):
                img_path = osp.join(img_dir,
                                    'image_{:05d}.jpg'.format(i_frame))
                img_paths.append(img_path)

            bbox_params, time_pt1, time_pt2 = get_smooth_bbox_params(
                j2d, vis_thresh=VIS_THRESH, sigma=8)

            # process bbox_params
            c_x = bbox_params[:, 0]
            c_y = bbox_params[:, 1]
            scale = bbox_params[:, 2]
            w = h = 150. / scale
            w = h = h * 1.1
            bbox = np.vstack([c_x, c_y, w, h]).T

            # process keypoints
            j2d[:, :, 2] = j2d[:, :, 2] > 0.3  # set the visibility flags
            # Convert to common 2d keypoint format
            perm_idxs = get_perm_idxs('3dpw', 'common')
            perm_idxs += [0, 0]  # no neck, top head
            j2d = j2d[:, perm_idxs]
            j2d[:, 12:, 2] = 0.0

            # print('j2d', j2d[time_pt1:time_pt2].shape)
            # print('campose', campose_valid[time_pt1:time_pt2].shape)

            img_paths_array = np.array(img_paths)[time_pt1:time_pt2]
            dataset['vid_name'].append(
                np.array([f'{seq}_{p_id}'] * num_frames)[time_pt1:time_pt2])
            dataset['frame_id'].append(
                np.arange(0, num_frames)[time_pt1:time_pt2])
            dataset['img_name'].append(img_paths_array)
            dataset['joints3D'].append(j3d.numpy()[time_pt1:time_pt2])
            dataset['joints2D'].append(j2d[time_pt1:time_pt2])
            dataset['shape'].append(shape.numpy()[time_pt1:time_pt2])
            dataset['pose'].append(pose.numpy()[time_pt1:time_pt2])
            dataset['bbox'].append(bbox)
            dataset['valid'].append(campose_valid[time_pt1:time_pt2])

            features = extract_features(model,
                                        img_paths_array,
                                        bbox,
                                        kp_2d=j2d[time_pt1:time_pt2],
                                        debug=debug,
                                        dataset='3dpw',
                                        scale=1.2)
            dataset['features'].append(features)

    for k in dataset.keys():
        dataset[k] = np.concatenate(dataset[k])
        print(k, dataset[k].shape)

    # Filter out frames with too few visible keypoints
    indices_to_use = np.where(
        (dataset['joints2D'][:, :, 2] > VIS_THRESH).sum(-1) > MIN_KP)[0]
    for k in dataset.keys():
        dataset[k] = dataset[k][indices_to_use]

    return dataset
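# Hypothetical invocation: `folder` is a 3DPW root containing sequenceFiles/
# and imageFiles/. Dumping with joblib mirrors how a 3dpw_test_db.pt file is
# read back elsewhere in this codebase.
import joblib

db = read_data('/path/to/3dpw', 'test')
joblib.dump(db, '3dpw_test_db.pt')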
def main(args):
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    """ Prepare input video (images) """
    video_file = args.vid_file
    if video_file.startswith('https://www.youtube.com'):
        print(f"Downloading YouTube video \'{video_file}\'")
        video_file = download_youtube_clip(video_file, '/tmp')
        if video_file is None:
            exit('Youtube url is not valid!')
        print(f"YouTube video has been downloaded to {video_file}...")

    if not os.path.isfile(video_file):
        exit(f"Input video \'{video_file}\' does not exist!")

    output_path = osp.join('./output/demo_output',
                           os.path.basename(video_file).replace('.mp4', ''))
    Path(output_path).mkdir(parents=True, exist_ok=True)

    image_folder, num_frames, img_shape = video_to_images(video_file,
                                                          return_info=True)
    print(f"Input video number of frames {num_frames}\n")
    orig_height, orig_width = img_shape[:2]

    """ Run tracking """
    total_time = time.time()
    bbox_scale = 1.2

    # run multi object tracker
    mot = MPT(
        device=device,
        batch_size=args.tracker_batch_size,
        display=args.display,
        detector_type=args.detector,
        output_format='dict',
        yolo_img_size=args.yolo_img_size,
    )
    tracking_results = mot(image_folder)

    # remove tracklets if num_frames is less than MIN_NUM_FRAMES
    for person_id in list(tracking_results.keys()):
        if tracking_results[person_id]['frames'].shape[0] < MIN_NUM_FRAMES:
            del tracking_results[person_id]

    """ Get TCMR model """
    seq_len = 16
    model = TCMR(seqlen=seq_len, n_layers=2, hidden_size=1024).to(device)

    # Load pretrained weights
    pretrained_file = args.model
    ckpt = torch.load(pretrained_file)
    print(f"Load pretrained weights from \'{pretrained_file}\'")
    ckpt = ckpt['gen_state_dict']
    model.load_state_dict(ckpt, strict=False)

    # Change mesh gender
    gender = args.gender  # 'neutral', 'male', 'female'
    model.regressor.smpl = SMPL(SMPL_MODEL_DIR,
                                batch_size=64,
                                create_transl=False,
                                gender=gender).to(device)
    model.eval()

    # Get feature_extractor
    from lib.models.spin import hmr
    hmr = hmr().to(device)
    checkpoint = torch.load(
        osp.join(BASE_DATA_DIR, 'spin_model_checkpoint.pth.tar'))
    hmr.load_state_dict(checkpoint['model'], strict=False)
    hmr.eval()

    """ Run TCMR on each person """
    print("\nRunning TCMR on each person tracklet...")
    tcmr_time = time.time()
    tcmr_results = {}
    for person_id in tqdm(list(tracking_results.keys())):
        bboxes = joints2d = None
        bboxes = tracking_results[person_id]['bbox']
        frames = tracking_results[person_id]['frames']

        # Prepare static image features
        dataset = CropDataset(
            image_folder=image_folder,
            frames=frames,
            bboxes=bboxes,
            joints2d=joints2d,
            scale=bbox_scale,
        )

        bboxes = dataset.bboxes
        frames = dataset.frames
        has_keypoints = True if joints2d is not None else False

        norm_joints2d = []
        crop_dataloader = DataLoader(dataset, batch_size=256, num_workers=16)

        with torch.no_grad():
            feature_list = []
            for i, batch in enumerate(crop_dataloader):
                if has_keypoints:
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                batch = batch.to(device)
                feature = hmr.feature_extractor(batch.reshape(-1, 3, 224, 224))
                feature_list.append(feature.cpu())

            del batch

            feature_list = torch.cat(feature_list, dim=0)

        # Encode temporal features and estimate 3D human mesh
        dataset = FeatureDataset(
            image_folder=image_folder,
            frames=frames,
            seq_len=seq_len,
        )
        dataset.feature_list = feature_list

        dataloader = DataLoader(dataset, batch_size=64, num_workers=32)

        with torch.no_grad():
            pred_cam, pred_verts, pred_pose, pred_betas, pred_joints3d, norm_joints2d = [], [], [], [], [], []

            for i, batch in enumerate(dataloader):
                if has_keypoints:
                    batch, nj2d = batch
                    norm_joints2d.append(nj2d.numpy().reshape(-1, 21, 3))

                batch = batch.to(device)
                output = model(batch)[0][-1]

                pred_cam.append(output['theta'][:, :3])
                pred_verts.append(output['verts'])
                pred_pose.append(output['theta'][:, 3:75])
                pred_betas.append(output['theta'][:, 75:])
                pred_joints3d.append(output['kp_3d'])

            pred_cam = torch.cat(pred_cam, dim=0)
            pred_verts = torch.cat(pred_verts, dim=0)
            pred_pose = torch.cat(pred_pose, dim=0)
            pred_betas = torch.cat(pred_betas, dim=0)
            pred_joints3d = torch.cat(pred_joints3d, dim=0)

            del batch

        # ========= Save results to a pickle file ========= #
        pred_cam = pred_cam.cpu().numpy()
        pred_verts = pred_verts.cpu().numpy()
        pred_pose = pred_pose.cpu().numpy()
        pred_betas = pred_betas.cpu().numpy()
        pred_joints3d = pred_joints3d.cpu().numpy()

        bboxes[:, 2:] = bboxes[:, 2:] * 1.2
        if args.render_plain:
            pred_cam[:, 0], pred_cam[:, 1:] = 1, 0  # np.array([[1, 0, 0]])
        orig_cam = convert_crop_cam_to_orig_img(cam=pred_cam,
                                                bbox=bboxes,
                                                img_width=orig_width,
                                                img_height=orig_height)

        output_dict = {
            'pred_cam': pred_cam,
            'orig_cam': orig_cam,
            'verts': pred_verts,
            'pose': pred_pose,
            'betas': pred_betas,
            'joints3d': pred_joints3d,
            'joints2d': joints2d,
            'bboxes': bboxes,
            'frame_ids': frames,
        }

        tcmr_results[person_id] = output_dict

    del model

    end = time.time()
    fps = num_frames / (end - tcmr_time)
    print(f'TCMR FPS: {fps:.2f}')
    total_time = time.time() - total_time
    print(
        f'Total time spent: {total_time:.2f} seconds (including model loading time).'
    )
    print(
        f'Total FPS (including model loading time): {num_frames / total_time:.2f}.'
    )

    if args.save_pkl:
        print(
            f"Saving output results to \'{os.path.join(output_path, 'tcmr_output.pkl')}\'."
        )
        joblib.dump(tcmr_results, os.path.join(output_path, "tcmr_output.pkl"))

    """ Render results as a single video """
    renderer = Renderer(resolution=(orig_width, orig_height),
                        orig_img=True,
                        wireframe=args.wireframe)

    output_img_folder = f'{image_folder}_output'
    input_img_folder = f'{image_folder}_input'
    os.makedirs(output_img_folder, exist_ok=True)
    os.makedirs(input_img_folder, exist_ok=True)

    print(f"\nRendering output video, writing frames to {output_img_folder}")

    # prepare results for rendering
    frame_results = prepare_rendering_results(tcmr_results, num_frames)
    mesh_color = {
        k: colorsys.hsv_to_rgb(np.random.rand(), 0.5, 1.0)
        for k in tcmr_results.keys()
    }

    image_file_names = sorted([
        os.path.join(image_folder, x) for x in os.listdir(image_folder)
        if x.endswith('.png') or x.endswith('.jpg')
    ])

    for frame_idx in tqdm(range(len(image_file_names))):
        img_fname = image_file_names[frame_idx]
        img = cv2.imread(img_fname)
        input_img = img.copy()
        if args.render_plain:
            img[:] = 0

        if args.sideview:
            side_img = np.zeros_like(img)

        for person_id, person_data in frame_results[frame_idx].items():
            frame_verts = person_data['verts']
            frame_cam = person_data['cam']

            mesh_filename = None
            if args.save_obj:
                mesh_folder = os.path.join(output_path, 'meshes',
                                           f'{person_id:04d}')
                Path(mesh_folder).mkdir(parents=True, exist_ok=True)
                mesh_filename = os.path.join(mesh_folder,
                                             f'{frame_idx:06d}.obj')

            mc = mesh_color[person_id]

            img = renderer.render(
                img,
                frame_verts,
                cam=frame_cam,
                color=mc,
                mesh_filename=mesh_filename,
            )
            if args.sideview:
                side_img = renderer.render(
                    side_img,
                    frame_verts,
                    cam=frame_cam,
                    color=mc,
                    angle=270,
                    axis=[0, 1, 0],
                )

        if args.sideview:
            img = np.concatenate([img, side_img], axis=1)

        # save output frames
        cv2.imwrite(os.path.join(output_img_folder, f'{frame_idx:06d}.jpg'),
                    img)
        cv2.imwrite(os.path.join(input_img_folder, f'{frame_idx:06d}.jpg'),
                    input_img)

        if args.display:
            cv2.imshow('Video', img)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

    if args.display:
        cv2.destroyAllWindows()

    """ Save rendered video """
    vid_name = os.path.basename(video_file)
    save_name = f'tcmr_{vid_name.replace(".mp4", "")}_output.mp4'
    save_path = os.path.join(output_path, save_name)

    images_to_video(img_folder=output_img_folder, output_vid_file=save_path)
    images_to_video(img_folder=input_img_folder,
                    output_vid_file=os.path.join(output_path, vid_name))
    print(f"Saving result video to {os.path.abspath(save_path)}")
    shutil.rmtree(output_img_folder)
    shutil.rmtree(input_img_folder)
    shutil.rmtree(image_folder)
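# Hypothetical entry point for the demo above. These flag names simply mirror
# the attributes read in main() (args.vid_file, args.model, ...); defaults are
# illustrative, not necessarily the repo's actual ones.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--vid_file', type=str, required=True, help='input video path or YouTube URL')
    parser.add_argument('--model', type=str, required=True, help='pretrained TCMR checkpoint (.pth.tar)')
    parser.add_argument('--gender', type=str, default='neutral', choices=['neutral', 'male', 'female'])
    parser.add_argument('--detector', type=str, default='yolo')
    parser.add_argument('--yolo_img_size', type=int, default=416)
    parser.add_argument('--tracker_batch_size', type=int, default=12)
    parser.add_argument('--display', action='store_true')
    parser.add_argument('--sideview', action='store_true')
    parser.add_argument('--wireframe', action='store_true')
    parser.add_argument('--save_obj', action='store_true')
    parser.add_argument('--save_pkl', action='store_true')
    parser.add_argument('--render_plain', action='store_true')
    main(parser.parse_args())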
from lib.utils.vis import batch_draw_skeleton, batch_visualize_preds

dataset = 'MPII3D'
seqlen = 16
DEBUG = True

db = eval(dataset)(set='val', seqlen=seqlen, debug=DEBUG)

dataloader = DataLoader(
    dataset=db,
    batch_size=4,
    shuffle=True,
    num_workers=1,
)

smpl = SMPL(SMPL_MODEL_DIR)

start = time.time()
for i, target in enumerate(dataloader):
    data_time = time.time() - start
    start = time.time()
    print(f'Data loading time {data_time:.4f}')
    for k, v in target.items():
        print(k, v.shape)

    if DEBUG:
        input = target['video'][0]
        single_target = {k: v[0] for k, v in target.items()}

        if dataset == 'MPII3D':
).to(cfg.DEVICE)

if cfg.TRAIN.PRETRAINED != '' and os.path.isfile(cfg.TRAIN.PRETRAINED):
    checkpoint = torch.load(cfg.TRAIN.PRETRAINED)
    best_performance = checkpoint['performance']
    model.load_state_dict(checkpoint['gen_state_dict'])
    print(f'==> Loaded pretrained model from {cfg.TRAIN.PRETRAINED}...')
    print(f'Performance on 3DPW test set {best_performance}')
else:
    print(f'{cfg.TRAIN.PRETRAINED} is not a pretrained model!!!!')
    exit()

dtype = torch.float
smpl = SMPL(SMPL_MODEL_DIR, batch_size=50, create_transl=False,
            dtype=dtype).to(cfg.DEVICE)
J_regressor = torch.from_numpy(
    np.load(osp.join(VIBE_DATA_DIR, 'J_regressor_h36m.npy'))).float()

t_total = 16

################## 3dpw ##################
dataset_setting = "test"
dataset_3dpw = joblib.load(
    "/hdd/zen/data/video_pose/vibe_db/3dpw_{}_db.pt".format(dataset_setting))
vid_names = dataset_3dpw['vid_name']
thetas = dataset_3dpw['pose']
features = dataset_3dpw['features']
j3ds = dataset_3dpw['joints3D']
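# Hedged sketch (not from the source): applying the H36M joint regressor loaded
# above to SMPL output to obtain 17 H36M-format joints for evaluation. Zeros
# give the rest pose; J_regressor is assumed to be (17, 6890).
betas = torch.zeros(1, 10, dtype=dtype, device=cfg.DEVICE)
pose = torch.zeros(1, 72, dtype=dtype, device=cfg.DEVICE)
out = smpl(betas=betas, body_pose=pose[:, 3:], global_orient=pose[:, :3])
Jreg = J_regressor[None].expand(out.vertices.shape[0], -1, -1).to(out.vertices.device)
j3d_h36m = torch.matmul(Jreg, out.vertices)  # (1, 17, 3)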