# If the ground truth is not available, take the camera extrinsic params from a random subject.
# They are almost the same, and anyway, we only need this for visualization purposes.
rot = None
for subject in dataset.cameras():
    if 'orientation' in dataset.cameras()[subject][args.viz_camera]:
        rot = dataset.cameras()[subject][args.viz_camera]['orientation']
        break

prediction = camera_to_world(prediction, R=rot, t=0)
# We don't have the trajectory, but at least we can rebase the height
prediction[:, :, 2] -= np.min(prediction[:, :, 2])

anim_output = {'Reconstruction': prediction}
input_keypoints = image_coordinates(input_keypoints[..., :2], w=cam['res_w'], h=cam['res_h'])
# print('w, h:', cam['res_w'], cam['res_h'])

from common.visualization import render_animation
print("rot:", rot)
print("cam['azimuth']:", cam['azimuth'])
render_animation(input_keypoints, keypoints_metadata, anim_output,
                 dataset.skeleton(), dataset.fps(), args.viz_bitrate, cam['azimuth'], args.viz_output,
                 limit=args.viz_limit, downsample=args.viz_downsample, size=args.viz_size,
                 input_video_path=args.viz_video, viewport=(cam['res_w'], cam['res_h']),
                 input_video_skip=args.viz_skip)
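
# For reference, a minimal sketch (our addition) of the screen-coordinate convention
# behind normalize_screen_coordinates / image_coordinates, as we read common/camera.py:
# x is scaled so [0, w] maps to [-1, 1], and y is scaled by the same factor to keep
# the aspect ratio; image_coordinates_sketch is the exact inverse.
def normalize_screen_coordinates_sketch(X, w, h):
    assert X.shape[-1] == 2
    return X / w * 2 - [1, h / w]

def image_coordinates_sketch(X, w, h):
    assert X.shape[-1] == 2
    return (X + [1, h / w]) * w / 2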
print('Computing ground-truth 2D poses...')
dataset = Human36mDataset(output_filename + '.npz')
output_2d_poses = {}
for subject in dataset.subjects():
    output_2d_poses[subject] = {}
    for action in dataset[subject].keys():
        anim = dataset[subject][action]

        positions_2d = []
        for cam in anim['cameras']:
            pos_3d = world_to_camera(anim['positions'], R=cam['orientation'], t=cam['translation'])
            pos_2d = wrap(project_to_2d, True, pos_3d, cam['intrinsic'])
            pos_2d_pixel_space = image_coordinates(pos_2d, w=cam['res_w'], h=cam['res_h'])
            positions_2d.append(pos_2d_pixel_space.astype('float32'))
        output_2d_poses[subject][action] = positions_2d

print('Saving...')
metadata = {
    'num_joints': dataset.skeleton().num_joints(),
    'keypoints_symmetry': [dataset.skeleton().joints_left(), dataset.skeleton().joints_right()]
}
np.savez_compressed(output_filename_2d, positions_2d=output_2d_poses, metadata=metadata)
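
# Usage sketch (our addition): loading the file written above back in, the way we
# believe run.py consumes it; np.savez_compressed appends '.npz', assuming
# output_filename_2d has no extension, as in the save above.
keypoints_npz = np.load(output_filename_2d + '.npz', allow_pickle=True)
loaded_metadata = keypoints_npz['metadata'].item()
loaded_positions_2d = keypoints_npz['positions_2d'].item()  # subject -> action -> per-camera arrays
print('Loaded 2D poses for subjects:', sorted(loaded_positions_2d.keys()))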
def analyze_frame(h, frame):

    boxes, keypoints = infer.inference_on_frame(h['predictor'], frame)

    # step 4: prepare data. Take the 2D keypoints, that's it.
    # The first element of keypoints is an empty array; the second is our actual frame
    # data: a 3D numpy array with first dimension 1 (one detected person), the remaining
    # two dimensions holding the 17 joints of 3 doubles each.
    kp = keypoints[1][0][:2, :].T  # extract (x, y), just like in the prepare_data_2d_custom code

    # What to do if kp is NaN or has missing data? For now we just ignore it.

    # The original pipeline does this at the end of step 4, writing a file that is then
    # loaded at the beginning of step 5. We keep it simple and take the data from step 2
    # directly into a variable:
    # output[canonical_name]['custom'] = [data[0]['keypoints'].astype('float32')]
    # output_custom_canonical_bullshit = kp.astype('float32')
    # np.savez_compressed(os.path.join(args.dataoutputdir, output_prefix_2d + args.output), positions_2d=output, metadata=metadata)

    # This is what the original script does. Confusingly, 'keypoints' is actually just
    # raw data until it is reassigned to keypoints['positions_2d']:
    # keypoints = np.load('data/data_2d_' + args.dataset + '_' + args.keypoints + '.npz', allow_pickle=True)

    # step 5: everything else, copied over from run.py.

    # extract dataset from the init dictionary
    dataset = h['dataset']
    keypoints_metadata = h['keypoints_metadata']
    keypoints_symmetry = h['keypoints_symmetry']

    kps_left = h['kps_left']
    kps_right = h['kps_right']
    joints_left = h['joints_left']
    joints_right = h['joints_right']

    # normalize to screen coordinates
    for i in range(len(kp)):
        koord = kp[i]
        kp[i] = normalize_screen_coordinates(koord, h['frame_metadata']['w'], h['frame_metadata']['h'])
    # for kps in enumerate(keypoints):
    #     kps[..., :2] = normalize_screen_coordinates(kps[..., :2], frame_metadata['w'], frame_metadata['h'])

    # Taken from args.architecture in run.py and hardcoded here, skipping the argument parsing.
    filter_widths = [int(x) for x in "3,3,3,3,3".split(',')]
    skeleton_num_joints = dataset.skeleton().num_joints()
    # skeleton_num_joints = 17
    causal = True
    dropout = 0.25
    channels = 1024
    dense = False

    model_pos_train = TemporalModelOptimized1f(kp.shape[-2], kp.shape[-1], skeleton_num_joints,
                                               filter_widths=filter_widths, causal=causal,
                                               dropout=dropout, channels=channels)
    model_pos = TemporalModel(kp.shape[-2], kp.shape[-1], skeleton_num_joints,
                              filter_widths=filter_widths, causal=causal, dropout=dropout,
                              channels=channels, dense=dense)

    receptive_field = model_pos.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2  # Padding on each side
    # if args.causal:
    #     print('INFO: Using causal convolutions')
    #     causal_shift = pad
    # else:
    #     causal_shift = 0
    causal_shift = pad  # causal is hardcoded to True above

    model_params = 0
    for parameter in model_pos.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()
        model_pos_train = model_pos_train.cuda()

    # if args.resume or args.evaluate:
    if True:
        chk_filename = "checkpoint/pretrained_h36m_detectron_coco.bin"
        print('Loading checkpoint', chk_filename)
        checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
        print('This model was trained for {} epochs'.format(checkpoint['epoch']))
        model_pos_train.load_state_dict(checkpoint['model_pos'])
        model_pos.load_state_dict(checkpoint['model_pos'])
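        # Sanity sketch (our addition, not in the original run.py): the pretrained
        # checkpoint is a plain dict; the keys used here are 'epoch' and 'model_pos',
        # plus the optional 'model_traj' that we check for below.
        for key in ('epoch', 'model_pos'):
            assert key in checkpoint, 'missing checkpoint key: {}'.format(key)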
    # False in our particular case: 'model_traj' is not in this checkpoint. We might
    # benefit from getting rid of model_traj, unless it's super fast; then we should
    # just keep it in case we ever upgrade.
    if 'model_traj' in checkpoint:
        # Load the trajectory model if it is contained in the checkpoint (e.g. for inference in the wild)
        model_traj = TemporalModel(kp.shape[-2], kp.shape[-1], 1,
                                   filter_widths=filter_widths, causal=causal, dropout=dropout,
                                   channels=channels, dense=dense)
        if torch.cuda.is_available():
            model_traj = model_traj.cuda()
        model_traj.load_state_dict(checkpoint['model_traj'])
    else:
        model_traj = None

    test_generator = UnchunkedGenerator(None, None, kp,
                                        pad=pad, causal_shift=causal_shift, augment=False,
                                        kps_left=kps_left, kps_right=kps_right,
                                        joints_left=joints_left, joints_right=joints_right)
    print('INFO: Testing on {} frames'.format(test_generator.num_frames()))

    # Evaluate
    def evaluate(eval_generator, action=None, return_predictions=False, use_trajectory_model=False):
        epoch_loss_3d_pos = 0
        epoch_loss_3d_pos_procrustes = 0
        epoch_loss_3d_pos_scale = 0
        epoch_loss_3d_vel = 0
        with torch.no_grad():
            if not use_trajectory_model:
                model_pos.eval()
            else:
                model_traj.eval()
            N = 0
            for _, batch, batch_2d in eval_generator.next_epoch():
                inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
                if torch.cuda.is_available():
                    inputs_2d = inputs_2d.cuda()

                # Positional model
                if not use_trajectory_model:
                    predicted_3d_pos = model_pos(inputs_2d)
                else:
                    predicted_3d_pos = model_traj(inputs_2d)

                # Test-time augmentation (if enabled)
                if eval_generator.augment_enabled():
                    # Undo flipping and take average with non-flipped version
                    predicted_3d_pos[1, :, :, 0] *= -1
                    if not use_trajectory_model:
                        predicted_3d_pos[1, :, joints_left + joints_right] = predicted_3d_pos[1, :, joints_right + joints_left]
                    predicted_3d_pos = torch.mean(predicted_3d_pos, dim=0, keepdim=True)

                if return_predictions:
                    return predicted_3d_pos.squeeze(0).cpu().numpy()

                inputs_3d = torch.from_numpy(batch.astype('float32'))
                if torch.cuda.is_available():
                    inputs_3d = inputs_3d.cuda()
                inputs_3d[:, :, 0] = 0
                if eval_generator.augment_enabled():
                    inputs_3d = inputs_3d[:1]

                error = mpjpe(predicted_3d_pos, inputs_3d)
                epoch_loss_3d_pos_scale += inputs_3d.shape[0] * inputs_3d.shape[1] * n_mpjpe(predicted_3d_pos, inputs_3d).item()

                epoch_loss_3d_pos += inputs_3d.shape[0] * inputs_3d.shape[1] * error.item()
                N += inputs_3d.shape[0] * inputs_3d.shape[1]

                inputs = inputs_3d.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])
                predicted_3d_pos = predicted_3d_pos.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])

                epoch_loss_3d_pos_procrustes += inputs_3d.shape[0] * inputs_3d.shape[1] * p_mpjpe(predicted_3d_pos, inputs)

                # Compute velocity error
                epoch_loss_3d_vel += inputs_3d.shape[0] * inputs_3d.shape[1] * mean_velocity_error(predicted_3d_pos, inputs)

        if action is None:
            print('----------')
        else:
            print('----' + action + '----')
        e1 = (epoch_loss_3d_pos / N) * 1000
        e2 = (epoch_loss_3d_pos_procrustes / N) * 1000
        e3 = (epoch_loss_3d_pos_scale / N) * 1000
        ev = (epoch_loss_3d_vel / N) * 1000
        print('Test time augmentation:', eval_generator.augment_enabled())
        print('Protocol #1 Error (MPJPE):', e1, 'mm')
        print('Protocol #2 Error (P-MPJPE):', e2, 'mm')
        print('Protocol #3 Error (N-MPJPE):', e3, 'mm')
        print('Velocity Error (MPJVE):', ev, 'mm')
        print('----------')

        return e1, e2, e3, ev

    image_keypoints2d = kp
    gen = UnchunkedGenerator(None, None, [[image_keypoints2d]],
                             pad=pad, causal_shift=causal_shift, augment=False,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, return_predictions=True)
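    # Shape sanity sketch (our addition): with return_predictions=True, evaluate()
    # returns a numpy array of one 17-joint (x, y, z) pose per input frame.
    assert prediction.ndim == 3 and prediction.shape[-2:] == (17, 3), prediction.shape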
    # Here is the data format of the prediction:
    # public enum VideoPose3dJointOrder
    # {
    #     HIP = 0,
    #     R_HIP = 1,
    #     R_KNEE = 2,
    #     R_FOOT = 3,
    #     L_HIP = 4,
    #     L_KNEE = 5,
    #     L_FOOT = 6,
    #     SPINE = 7,
    #     THORAX = 8,
    #     NOSE = 9,
    #     HEAD = 10,
    #     L_SHOULDER = 11,
    #     L_ELBOW = 12,
    #     L_WRIST = 13,
    #     R_SHOULDER = 14,
    #     R_ELBOW = 15,
    #     R_WRIST = 16
    # }

    # The original code bugs out here; it's unclear what it was trying to do.
    # We can fix it by just getting width/height some other way.

    # Invert camera transformation
    cam = dataset.cameras()
    width = cam['frame'][0]['res_w']
    height = cam['frame'][0]['res_h']
    image_keypoints2d = image_coordinates(image_keypoints2d[..., :2], w=width, h=height)

    viz_camera = 0

    # If the ground truth is not available, take the camera extrinsic params from a random subject.
    # They are almost the same, and anyway, we only need this for visualization purposes.
    rot = None
    for subject in dataset.cameras():
        if 'orientation' in dataset.cameras()[subject][viz_camera]:
            rot = dataset.cameras()[subject][viz_camera]['orientation']
            break

    prediction = camera_to_world(prediction, R=rot, t=0)
    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])

    # Because the algorithm was meant for a list of frames, we take the first (our only) frame.
    prediction3d = prediction[0]

    return prediction3d, image_keypoints2d

    # Do we want to visualize? The dead code below the return used to write JSON and
    # render a video for visualization.
    # if args.viz_output is not None:
    if True:
        anim_output = {'Reconstruction': prediction}

        # Format the data the same way as mediapipe, so we can load it in Unity with
        # the same script. We need a list (frames) of lists of 3D landmarks.
        unity_landmarks = prediction.tolist()

        # How should we send or display the data? Maybe draw it on the webcam feed?
        # with open(args.output_json, "w") as json_file:
        #     json.dump(unity_landmarks, json_file)

        # if args.rendervideo == "yes":
        #     from common.visualization import render_animation
        #     render_animation(input_keypoints, keypoints_metadata, anim_output,
        #                      dataset.skeleton(), dataset.fps(), args.viz_bitrate, cam['azimuth'], args.viz_output,
        #                      limit=args.viz_limit, downsample=args.viz_downsample, size=args.viz_size,
        #                      input_video_path=args.viz_video, viewport=(cam['res_w'], cam['res_h']),
        #                      input_video_skip=args.viz_skip)

        we_re_done_here = 1
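
# A convenience mirror (our addition) of the joint order documented in the comment
# inside analyze_frame above, for indexing predictions on the Python side:
from enum import IntEnum

class VideoPose3dJoint(IntEnum):
    HIP = 0
    R_HIP = 1
    R_KNEE = 2
    R_FOOT = 3
    L_HIP = 4
    L_KNEE = 5
    L_FOOT = 6
    SPINE = 7
    THORAX = 8
    NOSE = 9
    HEAD = 10
    L_SHOULDER = 11
    L_ELBOW = 12
    L_WRIST = 13
    R_SHOULDER = 14
    R_ELBOW = 15
    R_WRIST = 16

# e.g. prediction3d[VideoPose3dJoint.NOSE] gives the (x, y, z) of the nose.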
def main(args):
    print('==> Using settings {}'.format(args))
    convm = torch.zeros(3, 17, 17, dtype=torch.float)

    print('==> Loading dataset...')
    dataset_path = path.join('data', 'data_3d_' + args.dataset + '.npz')
    if args.dataset == 'h36m':
        from common.h36m_dataset import Human36mDataset
        dataset = Human36mDataset(dataset_path)
    else:
        raise KeyError('Invalid dataset')

    print('==> Preparing data...')
    dataset = read_3d_data(dataset)

    print('==> Loading 2D detections...')
    keypoints = create_2d_data(path.join('data', 'data_2d_' + args.dataset + '_' + args.keypoints + '.npz'), dataset)

    cudnn.benchmark = True
    device = torch.device("cuda")

    # Create model
    print("==> Creating model...")

    if args.architecture == 'linear':
        from models.linear_model import LinearModel, init_weights
        num_joints = dataset.skeleton().num_joints()
        model_pos = LinearModel(num_joints * 2, (num_joints - 1) * 3).to(device)
        model_pos.apply(init_weights)
    elif args.architecture == 'gcn':
        from models.sem_gcn import SemGCN
        from common.graph_utils import adj_mx_from_skeleton
        p_dropout = (None if args.dropout == 0.0 else args.dropout)
        adj = adj_mx_from_skeleton(dataset.skeleton())
        model_pos = SemGCN(convm, adj, args.hid_dim, num_layers=args.num_layers, p_dropout=p_dropout,
                           nodes_group=dataset.skeleton().joints_group() if args.non_local else None).to(device)
    else:
        raise KeyError('Invalid model architecture')

    print("==> Total parameters: {:.2f}M".format(sum(p.numel() for p in model_pos.parameters()) / 1000000.0))

    # Resume from a checkpoint
    ckpt_path = args.evaluate

    if path.isfile(ckpt_path):
        print("==> Loading checkpoint '{}'".format(ckpt_path))
        ckpt = torch.load(ckpt_path)
        start_epoch = ckpt['epoch']
        error_best = ckpt['error']
        model_pos.load_state_dict(ckpt['state_dict'])
        print("==> Loaded checkpoint (Epoch: {} | Error: {})".format(start_epoch, error_best))
    else:
        raise RuntimeError("==> No checkpoint found at '{}'".format(ckpt_path))

    print('==> Rendering...')

    poses_2d = keypoints[args.viz_subject][args.viz_action]
    out_poses_2d = poses_2d[args.viz_camera]
    out_actions = [args.viz_camera] * out_poses_2d.shape[0]

    poses_3d = dataset[args.viz_subject][args.viz_action]['positions_3d']
    assert len(poses_3d) == len(poses_2d), 'Camera count mismatch'
    out_poses_3d = poses_3d[args.viz_camera]

    ground_truth = dataset[args.viz_subject][args.viz_action]['positions_3d'][args.viz_camera].copy()

    input_keypoints = out_poses_2d.copy()
    render_loader = DataLoader(PoseGenerator([out_poses_3d], [out_poses_2d], [out_actions]),
                               batch_size=args.batch_size, shuffle=False,
                               num_workers=args.num_workers, pin_memory=True)

    prediction = evaluate(render_loader, model_pos, device, args.architecture)[0]

    # Invert camera transformation
    cam = dataset.cameras()[args.viz_subject][args.viz_camera]
    prediction = camera_to_world(prediction, R=cam['orientation'], t=0)
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    ground_truth = camera_to_world(ground_truth, R=cam['orientation'], t=0)
    ground_truth[:, :, 2] -= np.min(ground_truth[:, :, 2])

    anim_output = {'Regression': prediction, 'Ground truth': ground_truth}
    input_keypoints = image_coordinates(input_keypoints[..., :2], w=cam['res_w'], h=cam['res_h'])
    render_animation(input_keypoints, anim_output, dataset.skeleton(), dataset.fps(), args.viz_bitrate,
                     cam['azimuth'], args.viz_output, limit=args.viz_limit, downsample=args.viz_downsample,
                     size=args.viz_size, input_video_path=args.viz_video,
                     viewport=(cam['res_w'], cam['res_h']), input_video_skip=args.viz_skip)
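
# A minimal numpy sketch (our addition) of what camera_to_world does with the
# quaternion 'orientation' above: rotate each point by the unit quaternion
# q = (w, x, y, z), then translate. quat_rotate_sketch uses the standard identity
# v' = v + 2*w*(u x v) + 2*(u x (u x v)), with u the vector part of q.
def quat_rotate_sketch(q, v):
    u = q[..., 1:]
    uv = np.cross(u, v)
    uuv = np.cross(u, uv)
    return v + 2 * (q[..., :1] * uv + uuv)

def camera_to_world_sketch(X, R, t=0):
    # numpy broadcasting applies one quaternion R of shape (4,) to any batch of points X
    return quat_rotate_sketch(R, X) + t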
def videpose_infer(args):
    from common.camera import normalize_screen_coordinates, camera_to_world, image_coordinates
    from common.generators import UnchunkedGenerator
    from common.model import TemporalModel
    from common.utils import Timer, evaluate, add_path
    from videopose import get_detector_2d, ckpt_time, metadata, time0
    import gene_npz

    gene_npz.args.outputpath = str(args.viz_output / "alpha_pose_kunkun_cut")
    print(gene_npz.args)
    # detector_2d = get_detector_2d(args.detector_2d)
    detector_2d = gene_npz.generate_kpts(args.detector_2d)

    assert detector_2d, 'detector_2d should be in ({alpha, hr, open}_pose)'

    # Load or generate the 2D keypoints
    if not args.input_npz:
        video_name = args.viz_video
        keypoints = detector_2d(video_name)
    else:
        npz = np.load(args.input_npz)
        keypoints = npz['kpts']  # (N, 17, 2)

    keypoints_symmetry = metadata['keypoints_symmetry']
    kps_left, kps_right = list(keypoints_symmetry[0]), list(keypoints_symmetry[1])
    joints_left, joints_right = list([4, 5, 6, 11, 12, 13]), list([1, 2, 3, 14, 15, 16])

    # Normalize keypoints, assuming a fixed camera resolution (w=1000, h=1002)
    keypoints = normalize_screen_coordinates(keypoints[..., :2], w=1000, h=1002)

    model_pos = TemporalModel(17, 2, 17, filter_widths=[3, 3, 3, 3, 3], causal=args.causal,
                              dropout=args.dropout, channels=args.channels, dense=args.dense)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    ckpt, time1 = ckpt_time(time0)
    print('-------------- load data spends {:.2f} seconds'.format(ckpt))

    # Load the trained model
    chk_filename = os.path.join(args.checkpoint, args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)  # map loc to storage, i.e. load the weights on CPU
    model_pos.load_state_dict(checkpoint['model_pos'])

    ckpt, time2 = ckpt_time(time1)
    print('-------------- load 3D model spends {:.2f} seconds'.format(ckpt))

    # Receptive field: 243 frames for args.arc [3, 3, 3, 3, 3]
    receptive_field = model_pos.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    print('Rendering...')
    input_keypoints = keypoints.copy()
    gen = UnchunkedGenerator(None, None, [input_keypoints],
                             pad=pad, causal_shift=causal_shift, augment=args.test_time_augmentation,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)

    # Save the 3D joint positions
    np.save(args.viz_output / "test_3d_output.npy", prediction, allow_pickle=True)

    rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32)
    prediction = camera_to_world(prediction, R=rot, t=0)

    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    anim_output = {'Reconstruction': prediction}
    input_keypoints = image_coordinates(input_keypoints[..., :2], w=1000, h=1002)

    ckpt, time3 = ckpt_time(time2)
    print('-------------- generate reconstruction 3D data spends {:.2f} seconds'.format(ckpt))

    ckpt, time4 = ckpt_time(time3)
    print('total spend {:.2f} seconds'.format(ckpt))
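
# Usage sketch (our addition): reading back the 3D poses that videpose_infer() saves.
# Note the .npy is written *before* camera_to_world, so these are camera-space coords.
def load_inferred_poses(viz_output):
    poses = np.load(viz_output / "test_3d_output.npy", allow_pickle=True)
    assert poses.shape[-2:] == (17, 3)  # one (x, y, z) triple per joint, per frame
    return poses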