def init_model(self):
    self.model = TemporalModel(self.valid_poses[0].shape[-2], self.valid_poses[0].shape[-1],
                               self.dataset.skeleton().num_joints(),
                               filter_widths=[3, 3, 3, 3, 3], causal=False,
                               dropout=0.25, channels=1024, dense=False)
    self.model.load_state_dict(self.checkpoint['model_pos'])
def videopose_model_load():
    # load trained model
    from common.model import TemporalModel
    chk_filename = main_path + '/../checkpoint/cpn-pt-243.bin'
    checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
    model_pos = TemporalModel(17, 2, 17, filter_widths=[3, 3, 3, 3, 3], causal=False,
                              dropout=False, channels=1024, dense=False)
    # bypass CUDA for now to run only on CPU
    # model_pos = model_pos.cuda()
    model_pos.load_state_dict(checkpoint['model_pos'])

    # Print model's state_dict
    print("Model's state_dict:")
    for param_tensor in model_pos.state_dict():
        print(param_tensor, "\t", model_pos.state_dict()[param_tensor].size())

    receptive_field = model_pos.receptive_field()
    return model_pos
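As a side note, the receptive field reported by this model should be 243 frames for filter widths [3, 3, 3, 3, 3], which is what the "243" in the checkpoint name refers to. A quick sanity-check sketch, assuming only that the receptive field is the product of the filter widths:

# Sanity check (assumption): with filter widths [3, 3, 3, 3, 3] the temporal
# receptive field is 3**5 = 243 frames, matching the "cpn-pt-243.bin" name above.
filter_widths = [3, 3, 3, 3, 3]
receptive_field = 1
for w in filter_widths:
    receptive_field *= w
assert receptive_field == 243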
def create_model():
    # Load the model
    filter_widths = [int(x) for x in args.architecture.split(',')]
    model_eval = TemporalModel(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1],
                               dataset.skeleton().num_joints(), filter_widths=filter_widths,
                               causal=args.causal, dropout=args.dropout,
                               channels=args.channels, dense=args.dense)

    receptive_field = model_eval.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2  # Padding on each side
    if args.causal:
        print('INFO: Using causal convolutions')
        causal_shift = pad
    else:
        causal_shift = 0

    model_params = 0
    for parameter in model_eval.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    model_eval.to(device)
    return model_eval, causal_shift, pad
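A short usage sketch; poses_valid_2d, dataset, args, and device are assumed to be set up beforehand, as in the neighboring snippets:

# Hypothetical call; the globals used inside create_model() must already exist.
model_eval, causal_shift, pad = create_model()
print('Padding per side:', pad, '| causal shift:', causal_shift)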
def get_pose3d_predictor(ckpt_dir, ckpt_name, filter_widths, causal=False, channels=1024):
    """
    Load the 3D joint-coordinate predictor.

    Args:
        ckpt_dir: directory containing the checkpoint
        ckpt_name: checkpoint file name
        filter_widths: temporal convolution filter widths
        causal: whether to use causal convolutions
        channels: number of convolution channels

    Returns:
        pose3d_predictor
    """
    ckpt_path = os.path.join(ckpt_dir, ckpt_name)
    print('Loading checkpoint', ckpt_path)
    checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    print('This model was trained for {} epochs'.format(checkpoint['epoch']))

    pose3d_predictor = TemporalModel(17, 2, 17, filter_widths=filter_widths,
                                     causal=causal, channels=channels)
    receptive_field = pose3d_predictor.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))

    pose3d_predictor.load_state_dict(checkpoint['model_pos'])
    return pose3d_predictor.to(device).eval()
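A minimal call sketch, assuming a checkpoint laid out like the ones used elsewhere in these snippets; the directory and file name below are placeholders, not paths taken from the function above:

# Hypothetical usage of get_pose3d_predictor(); paths are placeholders.
pose3d_predictor = get_pose3d_predictor(
    ckpt_dir='checkpoint',
    ckpt_name='pretrained_h36m_cpn.bin',
    filter_widths=[3, 3, 3, 3, 3],  # 243-frame receptive field
    causal=False,
    channels=1024,
)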
def videopose_model_load():
    # load trained model
    from common.model import TemporalModel
    chk_filename = main_path + '/checkpoint/cpn-pt-243.bin'
    checkpoint = torch.load(chk_filename,
                            map_location=lambda storage, loc: storage)  # map loc to storage (load on CPU)
    model_pos = TemporalModel(17, 2, 17, filter_widths=[3, 3, 3, 3, 3], causal=False,
                              dropout=False, channels=1024, dense=False)
    model_pos = model_pos.cuda()
    model_pos.load_state_dict(checkpoint['model_pos'])
    receptive_field = model_pos.receptive_field()
    return model_pos
cameras_valid, poses_valid, poses_valid_2d = fetch(subjects_test, None)

filter_widths = [int(x) for x in args['architecture'].split(',')]
model_pos_train = TemporalModelOptimized1f(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1],
                                           dataset.skeleton().num_joints(),
                                           filter_widths=filter_widths, causal=args['causal'],
                                           dropout=args['dropout'], channels=args['channels'])
model_pos = TemporalModel(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1],
                          dataset.skeleton().num_joints(),
                          filter_widths=filter_widths, causal=args['causal'],
                          dropout=args['dropout'], channels=args['channels'],
                          dense=args['dense'])

causal_shift = 0

if torch.cuda.is_available():
    model_pos = model_pos.cuda()
    model_pos_train = model_pos_train.cuda()

if args['resume'] or args['evaluate']:
    chk_filename = os.path.join(
        args['checkpoint'], args['resume'] if args['resume'] else args['evaluate'])
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(chk_filename,
                            map_location=lambda storage, loc: storage)
    elif stride > 1:
        # Downsample as requested
        for i in range(len(out_poses_2d)):
            out_poses_2d[i] = out_poses_2d[i][::stride]
            if out_poses_3d is not None:
                out_poses_3d[i] = out_poses_3d[i][::stride]

    return out_camera_params, out_poses_3d, out_poses_2d

cameras_valid, poses_valid, poses_valid_2d = fetch(['detectron2'], None)

model_pos = TemporalModel(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1],
                          dataset.skeleton().num_joints(),
                          filter_widths=[3, 3, 3, 3, 3], causal=False,
                          dropout=0.25, channels=1024, dense=False)

receptive_field = model_pos.receptive_field()
pad = (receptive_field - 1) // 2
causal_shift = 0

if torch.cuda.is_available():
    model_pos = model_pos.cuda()

checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
model_pos.load_state_dict(checkpoint['model_pos'])

test_generator = UnchunkedGenerator(cameras_valid,
def analyze_frame(h, frame):
    boxes, keypoints = infer.inference_on_frame(h['predictor'], frame)

    # step 4: prepare data -- take the 2d keypoints, that's it.
    # the first element is an empty array; the second is our actual frame data: a 3d numpy array
    # with first dimension 1, and the second and third holding the 17 joints of 3 doubles each.
    kp = keypoints[1][0][:2, :].T  # extract (x, y) just like in the prepare_data_2d_custom code
    # what to do if kp is NaN or has missing data? I guess just ignore it.

    # they do this at the end of step 4, but we keep it simple and take the data from step 2 directly into a variable:
    # output[canonical_name]['custom'] = [data[0]['keypoints'].astype('float32')]
    # output_custom_canonical_bullshit = kp.astype('float32')

    # this is what happens at the end of step 4, producing a file that is loaded at the beginning of step 5:
    # np.savez_compressed(os.path.join(args.dataoutputdir, output_prefix_2d + args.output), positions_2d=output, metadata=metadata)

    # this is what the original script does; confusingly, keypoints is actually just data
    # until it is set to keypoints['positions_2d']:
    # keypoints = np.load('data/data_2d_' + args.dataset + '_' + args.keypoints + '.npz', allow_pickle=True)

    # step 5: everything else -- starting to copy stuff over from run.py

    # extract dataset from the init dictionary
    dataset = h['dataset']
    keypoints_metadata = h['keypoints_metadata']
    keypoints_symmetry = h['keypoints_symmetry']

    kps_left = h['kps_left']
    kps_right = h['kps_right']
    joints_left = h['joints_left']
    joints_right = h['joints_right']

    # normalize
    for i in range(len(kp)):
        koord = kp[i]
        kp[i] = normalize_screen_coordinates(koord, h['frame_metadata']['w'], h['frame_metadata']['h'])
    # for kps in enumerate(keypoints):
    #     kps[..., :2] = normalize_screen_coordinates(kps[..., :2], frame_metadata['w'], frame_metadata['h'])

    # taken from args.architecture and run.py, just hardcoded here to skip the argument parsing
    filter_widths = [int(x) for x in "3,3,3,3,3".split(',')]
    skeleton_num_joints = dataset.skeleton().num_joints()
    # skeleton_num_joints = 17

    causal = True
    dropout = 0.25
    channels = 1024
    dense = False

    model_pos_train = TemporalModelOptimized1f(kp.shape[-2], kp.shape[-1], skeleton_num_joints,
                                               filter_widths=filter_widths, causal=causal,
                                               dropout=dropout, channels=channels)
    model_pos = TemporalModel(kp.shape[-2], kp.shape[-1], skeleton_num_joints,
                              filter_widths=filter_widths, causal=causal, dropout=dropout,
                              channels=channels, dense=dense)

    receptive_field = model_pos.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2  # Padding on each side
    # if args.causal:
    #     print('INFO: Using causal convolutions')
    #     causal_shift = pad
    # else:
    #     causal_shift = 0
    causal_shift = pad

    model_params = 0
    for parameter in model_pos.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()
        model_pos_train = model_pos_train.cuda()

    # if args.resume or args.evaluate:
    if True:
        chk_filename = "checkpoint/pretrained_h36m_detectron_coco.bin"
        print('Loading checkpoint', chk_filename)
        checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
        print('This model was trained for {} epochs'.format(checkpoint['epoch']))
        model_pos_train.load_state_dict(checkpoint['model_pos'])
        model_pos.load_state_dict(checkpoint['model_pos'])

    # false in our particular case...
    # we might benefit from getting rid of model_traj, unless it's super fast --
    # then we should just keep it in case we ever upgrade
    if 'model_traj' in checkpoint:
        # Load trajectory model if it is contained in the checkpoint (e.g. for inference in the wild)
        model_traj = TemporalModel(kp.shape[-2], kp.shape[-1], 1,
                                   filter_widths=filter_widths, causal=causal,
                                   dropout=dropout, channels=channels, dense=dense)
        if torch.cuda.is_available():
            model_traj = model_traj.cuda()
        model_traj.load_state_dict(checkpoint['model_traj'])
    else:
        model_traj = None

    test_generator = UnchunkedGenerator(None, None, kp,
                                        pad=pad, causal_shift=causal_shift, augment=False,
                                        kps_left=kps_left, kps_right=kps_right,
                                        joints_left=joints_left, joints_right=joints_right)
    print('INFO: Testing on {} frames'.format(test_generator.num_frames()))

    # Evaluate
    def evaluate(eval_generator, action=None, return_predictions=False, use_trajectory_model=False):
        epoch_loss_3d_pos = 0
        epoch_loss_3d_pos_procrustes = 0
        epoch_loss_3d_pos_scale = 0
        epoch_loss_3d_vel = 0
        with torch.no_grad():
            if not use_trajectory_model:
                model_pos.eval()
            else:
                model_traj.eval()
            N = 0
            for _, batch, batch_2d in eval_generator.next_epoch():
                inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
                if torch.cuda.is_available():
                    inputs_2d = inputs_2d.cuda()

                # Positional model
                if not use_trajectory_model:
                    predicted_3d_pos = model_pos(inputs_2d)
                else:
                    predicted_3d_pos = model_traj(inputs_2d)

                # Test-time augmentation (if enabled)
                if eval_generator.augment_enabled():
                    # Undo flipping and take average with non-flipped version
                    predicted_3d_pos[1, :, :, 0] *= -1
                    if not use_trajectory_model:
                        predicted_3d_pos[1, :, joints_left + joints_right] = predicted_3d_pos[1, :, joints_right + joints_left]
                    predicted_3d_pos = torch.mean(predicted_3d_pos, dim=0, keepdim=True)

                if return_predictions:
                    return predicted_3d_pos.squeeze(0).cpu().numpy()

                inputs_3d = torch.from_numpy(batch.astype('float32'))
                if torch.cuda.is_available():
                    inputs_3d = inputs_3d.cuda()
                inputs_3d[:, :, 0] = 0
                if eval_generator.augment_enabled():
                    inputs_3d = inputs_3d[:1]

                error = mpjpe(predicted_3d_pos, inputs_3d)
                epoch_loss_3d_pos_scale += inputs_3d.shape[0]*inputs_3d.shape[1] * n_mpjpe(predicted_3d_pos, inputs_3d).item()
                epoch_loss_3d_pos += inputs_3d.shape[0]*inputs_3d.shape[1] * error.item()
                N += inputs_3d.shape[0] * inputs_3d.shape[1]

                inputs = inputs_3d.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])
                predicted_3d_pos = predicted_3d_pos.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])
                epoch_loss_3d_pos_procrustes += inputs_3d.shape[0]*inputs_3d.shape[1] * p_mpjpe(predicted_3d_pos, inputs)

                # Compute velocity error
                epoch_loss_3d_vel += inputs_3d.shape[0]*inputs_3d.shape[1] * mean_velocity_error(predicted_3d_pos, inputs)

        if action is None:
            print('----------')
        else:
            print('----' + action + '----')
        e1 = (epoch_loss_3d_pos / N)*1000
        e2 = (epoch_loss_3d_pos_procrustes / N)*1000
        e3 = (epoch_loss_3d_pos_scale / N)*1000
        ev = (epoch_loss_3d_vel / N)*1000
        print('Test time augmentation:', eval_generator.augment_enabled())
        print('Protocol #1 Error (MPJPE):', e1, 'mm')
        print('Protocol #2 Error (P-MPJPE):', e2, 'mm')
        print('Protocol #3 Error (N-MPJPE):', e3, 'mm')
        print('Velocity Error (MPJVE):', ev, 'mm')
        print('----------')

        return e1, e2, e3, ev

    image_keypoints2d = kp
    gen = UnchunkedGenerator(None, None, [[image_keypoints2d]],
                             pad=pad, causal_shift=causal_shift, augment=False,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen,
                          return_predictions=True)

    # here is the data format
    # public enum VideoPose3dJointOrder
    # {
    #     HIP = 0,
    #     R_HIP = 1,
    #     R_KNEE = 2,
    #     R_FOOT = 3,
    #     L_HIP = 4,
    #     L_KNEE = 5,
    #     L_FOOT = 6,
    #     SPINE = 7,
    #     THORAX = 8,
    #     NOSE = 9,
    #     HEAD = 10,
    #     L_SHOULDER = 11,
    #     L_ELBOW = 12,
    #     L_WRIST = 13,
    #     R_SHOULDER = 14,
    #     R_ELBOW = 15,
    #     R_WRIST = 16
    # }

    # this bugs out; unclear what was intended here.
    # anyway, we can fix it by just getting width/height some other way.

    # Invert camera transformation
    cam = dataset.cameras()

    width = cam['frame'][0]['res_w']
    height = cam['frame'][0]['res_h']

    image_keypoints2d = image_coordinates(image_keypoints2d[..., :2], w=width, h=height)

    viz_camera = 0

    # If the ground truth is not available, take the camera extrinsic params from a random subject.
    # They are almost the same, and anyway, we only need this for visualization purposes.
    for subject in dataset.cameras():
        if 'orientation' in dataset.cameras()[subject][viz_camera]:
            rot = dataset.cameras()[subject][viz_camera]['orientation']
            break
    prediction = camera_to_world(prediction, R=rot, t=0)

    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])

    # because the algorithm was meant for a list of frames, we take the first frame (our only frame)
    prediction3d = prediction[0]

    return prediction3d, image_keypoints2d

    # do we want to visualize? this code used to write to json and create a video for visualization
    # if args.viz_output is not None:
    if True:
        anim_output = {'Reconstruction': prediction}

        # format the data in the same format as mediapipe, so we can load it in unity with the same script;
        # we need a list (frames) of lists of 3d landmarks.
        unity_landmarks = prediction.tolist()

        # how to send or display the data? maybe draw it on the webcam feed?
        # with open(args.output_json, "w") as json_file:
        #     json.dump(unity_landmarks, json_file)

        # if args.rendervideo == "yes":
        #     from common.visualization import render_animation
        #     render_animation(input_keypoints, keypoints_metadata, anim_output,
        #                      dataset.skeleton(), dataset.fps(), args.viz_bitrate, cam['azimuth'], args.viz_output,
        #                      limit=args.viz_limit, downsample=args.viz_downsample, size=args.viz_size,
        #                      input_video_path=args.viz_video, viewport=(cam['res_w'], cam['res_h']),
        #                      input_video_skip=args.viz_skip)

    we_re_done_here = 1
class Predictor:
    def __init__(self, dataset_path, checkpoint_path, input_video_path=None,
                 export_path=None, output_path=None, with_cuda=False):
        self.with_cuda = with_cuda
        self.dataset_path = dataset_path
        self.export_path = export_path
        self.output_path = output_path
        self.input_video_path = input_video_path
        self.dataset = CustomDataset(self.dataset_path)
        self.keypoints = None
        self.keypoints_left = None
        self.keypoints_right = None
        self.joints_left = None
        self.joints_right = None
        self.checkpoint = torch.load(checkpoint_path,
                                     map_location=lambda storage, loc: storage)
        self.model = None
        self.init_keypoints()
        self.valid_poses = self.keypoints["detectron2"]["custom"]
        self.init_model()
        self.test_generator = None
        self.init_generator()
        self.prediction = None
        self.make_prediction()

    def export_prediction(self):
        if self.export_path is not None:
            np.save(self.export_path, self.prediction)

    def init_model(self):
        self.model = TemporalModel(self.valid_poses[0].shape[-2],
                                   self.valid_poses[0].shape[-1],
                                   self.dataset.skeleton().num_joints(),
                                   filter_widths=[3, 3, 3, 3, 3], causal=False,
                                   dropout=0.25, channels=1024, dense=False)
        self.model.load_state_dict(self.checkpoint['model_pos'])

    def init_keypoints(self):
        self.keypoints = np.load(self.dataset_path, allow_pickle=True)
        keypoints_metadata = self.keypoints['metadata'].item()
        keypoints_symmetry = keypoints_metadata['keypoints_symmetry']
        self.keypoints_left, self.keypoints_right = list(
            keypoints_symmetry[0]), list(keypoints_symmetry[1])
        self.joints_left, self.joints_right = list(
            self.dataset.skeleton().joints_left()), list(
            self.dataset.skeleton().joints_right())
        self.keypoints = self.keypoints['positions_2d'].item()

        for subject in self.keypoints.keys():
            for action in self.keypoints[subject]:
                for cam_idx, kps in enumerate(self.keypoints[subject][action]):
                    # Normalize camera frame
                    cam = self.dataset.cameras()[subject][cam_idx]
                    kps[..., :2] = normalize_screen_coordinates(kps[..., :2],
                                                                w=cam['res_w'], h=cam['res_h'])
                    self.keypoints[subject][action][cam_idx] = kps

    def init_generator(self):
        receptive_field = self.model.receptive_field()
        pad = (receptive_field - 1) // 2
        causal_shift = 0
        self.test_generator = UnchunkedGenerator(
            None, None, self.valid_poses, pad=pad, causal_shift=causal_shift,
            augment=False, kps_left=self.keypoints_left,
            kps_right=self.keypoints_right, joints_left=self.joints_left,
            joints_right=self.joints_right)

    def make_prediction(self):
        if self.with_cuda:
            self.model = self.model.cuda()
        with torch.no_grad():
            self.model.eval()
            for _, batch, batch_2d in self.test_generator.next_epoch():
                inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
                if self.with_cuda:
                    inputs_2d = inputs_2d.cuda()

                predicted_3d_pos = self.model(inputs_2d)

                if self.test_generator.augment_enabled():
                    predicted_3d_pos[1, :, :, 0] *= -1
                    predicted_3d_pos[1, :, self.joints_left + self.joints_right] = \
                        predicted_3d_pos[1, :, self.joints_right + self.joints_left]
                    predicted_3d_pos = torch.mean(predicted_3d_pos, dim=0, keepdim=True)

                predicted_3d_pos = predicted_3d_pos.squeeze(0).cpu().numpy()
                rot = self.dataset.cameras()['detectron2'][0]['orientation']
                predicted_3d_pos = camera_to_world(predicted_3d_pos, R=rot, t=0)
                predicted_3d_pos[:, :, 2] -= np.min(predicted_3d_pos[:, :, 2])
                self.prediction = predicted_3d_pos

    def plot_pose(self, pose_index=0):
        pose = make_pose(self.prediction.tolist()[pose_index])
        pose.prepare_plot()
        pose.plot()
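For context, instantiating the class might look like the sketch below; the .npz and checkpoint paths are placeholders rather than files referenced above, and the constructor runs the whole pipeline (keypoint loading, model init, prediction) eagerly:

# Hypothetical usage of the Predictor class; both paths are placeholders.
predictor = Predictor(dataset_path='data_2d_custom_video.npz',
                      checkpoint_path='checkpoint/pretrained_h36m_detectron_coco.bin',
                      export_path='output_3d.npy',
                      with_cuda=torch.cuda.is_available())
predictor.export_prediction()  # saves the (N, 17, 3) prediction to export_path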
def videpose_infer(args):
    from common.camera import normalize_screen_coordinates, camera_to_world, image_coordinates
    from common.generators import UnchunkedGenerator
    from common.model import TemporalModel
    from common.utils import Timer, evaluate, add_path
    from videopose import get_detector_2d, ckpt_time, metadata, time0
    import gene_npz

    gene_npz.args.outputpath = str(args.viz_output / "alpha_pose_kunkun_cut")
    print(gene_npz.args)
    # detector_2d = get_detector_2d(args.detector_2d)
    detector_2d = gene_npz.generate_kpts(args.detector_2d)

    assert detector_2d, 'detector_2d should be in ({alpha, hr, open}_pose)'

    # load or generate 2D keypoints
    if not args.input_npz:
        video_name = args.viz_video
        keypoints = detector_2d(video_name)
    else:
        npz = np.load(args.input_npz)
        keypoints = npz['kpts']  # (N, 17, 2)

    keypoints_symmetry = metadata['keypoints_symmetry']
    kps_left, kps_right = list(
        keypoints_symmetry[0]), list(keypoints_symmetry[1])
    joints_left, joints_right = list(
        [4, 5, 6, 11, 12, 13]), list([1, 2, 3, 14, 15, 16])

    # normalize keypoints, assuming fixed camera parameters
    keypoints = normalize_screen_coordinates(
        keypoints[..., :2], w=1000, h=1002)

    model_pos = TemporalModel(17, 2, 17, filter_widths=[3, 3, 3, 3, 3], causal=args.causal,
                              dropout=args.dropout, channels=args.channels, dense=args.dense)
    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    ckpt, time1 = ckpt_time(time0)
    print('-------------- load data spends {:.2f} seconds'.format(ckpt))

    # load trained model
    chk_filename = os.path.join(
        args.checkpoint, args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(
        chk_filename, map_location=lambda storage, loc: storage)  # map loc to storage (load on CPU)
    model_pos.load_state_dict(checkpoint['model_pos'])

    ckpt, time2 = ckpt_time(time1)
    print('-------------- load 3D model spends {:.2f} seconds'.format(ckpt))

    # Receptive field: 243 frames for args.arc [3, 3, 3, 3, 3]
    receptive_field = model_pos.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    print('Rendering...')
    input_keypoints = keypoints.copy()
    gen = UnchunkedGenerator(None, None, [input_keypoints],
                             pad=pad, causal_shift=causal_shift,
                             augment=args.test_time_augmentation,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)

    # save 3D joint points
    np.save(args.viz_output / "test_3d_output.npy", prediction, allow_pickle=True)

    rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32)
    prediction = camera_to_world(prediction, R=rot, t=0)

    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    anim_output = {'Reconstruction': prediction}
    input_keypoints = image_coordinates(
        input_keypoints[..., :2], w=1000, h=1002)

    ckpt, time3 = ckpt_time(time2)
    print(
        '-------------- generate reconstruction 3D data spends {:.2f} seconds'.format(ckpt))

    ckpt, time4 = ckpt_time(time3)
    print('total spend {:.2f} second'.format(ckpt))
def main(input_args):
    vp3d_dir = input_args.vp3d_dir
    sys.path.append(vp3d_dir)

    from common.camera import normalize_screen_coordinates
    from common.model import TemporalModel
    from common.generators import UnchunkedGenerator
    from common.arguments import parse_args

    args = parse_args()
    print(args)

    kps_left = [4, 5, 6, 11, 12, 13]
    kps_right = [1, 2, 3, 14, 15, 16]
    joints_left = [4, 5, 6, 11, 12, 13]
    joints_right = [1, 2, 3, 14, 15, 16]

    filter_widths = [int(x) for x in args.architecture.split(',')]

    num_joints_in = 17
    in_features = 2
    num_joints_out = 17

    model_pos = TemporalModel(num_joints_in, in_features, num_joints_out,
                              filter_widths=filter_widths, causal=args.causal,
                              dropout=args.dropout, channels=args.channels,
                              dense=args.dense)

    receptive_field = model_pos.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2  # Padding on each side
    if args.causal:
        print('INFO: Using causal convolutions')
        causal_shift = pad
    else:
        causal_shift = 0

    model_params = 0
    for parameter in model_pos.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    if args.resume or args.evaluate:
        chk_filename = os.path.join(vp3d_dir, args.checkpoint,
                                    args.resume if args.resume else args.evaluate)
        print('Loading checkpoint', chk_filename)
        checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
        print('This model was trained for {} epochs'.format(checkpoint['epoch']))
        model_pos.load_state_dict(checkpoint['model_pos'])

    # Evaluate
    def evaluate(test_generator, action=None, return_predictions=False):
        epoch_loss_3d_pos = 0
        epoch_loss_3d_pos_procrustes = 0
        epoch_loss_3d_pos_scale = 0
        epoch_loss_3d_vel = 0
        with torch.no_grad():
            model_pos.eval()
            N = 0
            for _, batch, batch_2d in test_generator.next_epoch():
                inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
                if torch.cuda.is_available():
                    inputs_2d = inputs_2d.cuda()

                # Positional model
                predicted_3d_pos = model_pos(inputs_2d)

                # Test-time augmentation (if enabled)
                if test_generator.augment_enabled():
                    # Undo flipping and take average with non-flipped version
                    predicted_3d_pos[1, :, :, 0] *= -1
                    predicted_3d_pos[1, :, joints_left + joints_right] = predicted_3d_pos[1, :, joints_right + joints_left]
                    predicted_3d_pos = torch.mean(predicted_3d_pos, dim=0, keepdim=True)

                if return_predictions:
                    return predicted_3d_pos.squeeze(0).cpu().numpy()

                inputs_3d = torch.from_numpy(batch.astype('float32'))
                if torch.cuda.is_available():
                    inputs_3d = inputs_3d.cuda()
                inputs_3d[:, :, 0] = 0
                if test_generator.augment_enabled():
                    inputs_3d = inputs_3d[:1]

                error = mpjpe(predicted_3d_pos, inputs_3d)
                epoch_loss_3d_pos_scale += inputs_3d.shape[0]*inputs_3d.shape[1] * n_mpjpe(predicted_3d_pos, inputs_3d).item()
                epoch_loss_3d_pos += inputs_3d.shape[0]*inputs_3d.shape[1] * error.item()
                N += inputs_3d.shape[0] * inputs_3d.shape[1]

                inputs = inputs_3d.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])
                predicted_3d_pos = predicted_3d_pos.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])
                epoch_loss_3d_pos_procrustes += inputs_3d.shape[0]*inputs_3d.shape[1] * p_mpjpe(predicted_3d_pos, inputs)

                # Compute velocity error
                epoch_loss_3d_vel += inputs_3d.shape[0]*inputs_3d.shape[1] * mean_velocity_error(predicted_3d_pos, inputs)

        if action is None:
            print('----------')
        else:
            print('----' + action + '----')
        e1 = (epoch_loss_3d_pos / N)*1000
        e2 = (epoch_loss_3d_pos_procrustes / N)*1000
        e3 = (epoch_loss_3d_pos_scale /
              N)*1000
        ev = (epoch_loss_3d_vel / N)*1000
        print('Test time augmentation:', test_generator.augment_enabled())
        print('Protocol #1 Error (MPJPE):', e1, 'mm')
        print('Protocol #2 Error (P-MPJPE):', e2, 'mm')
        print('Protocol #3 Error (N-MPJPE):', e3, 'mm')
        print('Velocity Error (MPJVE):', ev, 'mm')
        print('----------')

        return e1, e2, e3, ev

    def get_gt_dirs(input_path, camera_id='dev3'):
        """Get all directories with ground-truth 2D human pose annotations."""
        gt_path_list = []
        category_path_list = get_subdirs(input_path)
        for category in category_path_list:
            if os.path.basename(category) != 'Calibration':
                category_scans = get_subdirs(category)
                for category_scan in category_scans:
                    device_list = get_subdirs(category_scan)
                    for device_path in device_list:
                        if camera_id in device_path:
                            if os.path.exists(os.path.join(device_path, 'pose2d')):  # 2D annotations exist
                                gt_path_list.append(device_path)  # e.g. <root>/Lack_TV_Bench/0007_white_floor_08_04_2019_08_28_10_47/dev3
        return gt_path_list

    def get_subdirs(input_path):
        """
        Get a list of subdirectories in the input_path directory.

        :param input_path: parent directory (in which to get the subdirectories)
        :return: subdirs: list of subdirectories in input_path
        """
        subdirs = [os.path.join(input_path, dir_i)
                   for dir_i in os.listdir(input_path)
                   if os.path.isdir(os.path.join(input_path, dir_i))]
        subdirs.sort()
        return subdirs

    fps = 30
    frame_width = 1920.0
    frame_height = 1080.0

    h36m_joint_names = get_h36m_joint_names()
    h36m_joint_names_dict = {name: i for i, name in enumerate(h36m_joint_names)}
    joint_names = get_body25_joint_names()
    joint_names_dict = {name: i for i, name in enumerate(joint_names)}

    dataset_dir = input_args.dataset_dir
    camera_id = input_args.camera_id

    gt_dirs = get_gt_dirs(dataset_dir, camera_id)
    for i, gt_dir in enumerate(gt_dirs):
        print(f"\nProcessing {i} of {len(gt_dirs)}: {' '.join(gt_dir.split('/')[-3:-1])}")

        input_dir = os.path.join(gt_dir, 'predictions', 'pose2d', 'openpose')
        output_dir = os.path.join(gt_dir, 'predictions', 'pose3d', 'vp3d')
        os.makedirs(output_dir, exist_ok=True)

        json_mask = os.path.join(input_dir, 'scan_video_00000000????_keypoints.json')
        json_files = sorted(glob(json_mask))

        input_keypoints = []
        for json_file in json_files:
            with open(json_file, 'r') as f:
                pose2d = json.load(f)
            if len(pose2d["people"]) == 0:
                keypoints_op = np.zeros((19, 3))
            else:
                keypoints_op = np.array(pose2d["people"][0]["pose_keypoints_2d"]).reshape(-1, 3)  # Takes the first detected person every time...
            keypoints = np.zeros((17, 3))
            for i, joint_name in enumerate(h36m_joint_names):
                if joint_name == 'spine' or joint_name == 'head':
                    continue
                joint_id = joint_names_dict[joint_name]
                keypoints[i, :] = keypoints_op[joint_id, :]
            keypoints[h36m_joint_names_dict['mid hip'], :] = np.mean((keypoints[h36m_joint_names_dict['left hip'], :],
                                                                      keypoints[h36m_joint_names_dict['right hip'], :]), axis=0)  # mid hip = mean(left hip, right hip)
            keypoints[h36m_joint_names_dict['spine'], :] = np.mean((keypoints[h36m_joint_names_dict['neck'], :],
                                                                    keypoints[h36m_joint_names_dict['mid hip'], :]), axis=0)  # spine = mean(neck, mid hip)
            keypoints[h36m_joint_names_dict['head'], :] = np.mean((keypoints_op[joint_names_dict['left ear'], :],
                                                                   keypoints_op[joint_names_dict['right ear'], :]), axis=0)  # head = mean(left ear, right ear)

            input_keypoints.append(keypoints)

        input_keypoints = np.array(input_keypoints)
        input_keypoints = input_keypoints[:, :, :2]

        # For pretrained_h36m_cpn.bin and cpn_ft_h36m_dbb
        input_keypoints[..., :2] = normalize_screen_coordinates(input_keypoints[..., :2],
                                                                w=frame_width, h=frame_height)

        args.test_time_augmentation = True
        gen = UnchunkedGenerator(None, None, [input_keypoints],
                                 pad=pad, causal_shift=causal_shift,
                                 augment=args.test_time_augmentation,
                                 kps_left=kps_left, kps_right=kps_right,
                                 joints_left=joints_left, joints_right=joints_right)
        prediction = evaluate(gen, return_predictions=True)  # Nx17x3

        pickle.dump(prediction, open(os.path.join(output_dir, 'vp3d_output.pkl'), "wb"))