def init_generator(self):
    receptive_field = self.model.receptive_field()
    pad = (receptive_field - 1) // 2
    causal_shift = 0
    self.test_generator = UnchunkedGenerator(
        None, None, self.valid_poses, pad=pad, causal_shift=causal_shift, augment=False,
        kps_left=self.keypoints_left, kps_right=self.keypoints_right,
        joints_left=self.joints_left, joints_right=self.joints_right)
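# A minimal sketch (not from the original sources) of why pad is derived from the receptive
# field: with VideoPose3D's default filter widths [3, 3, 3, 3, 3] the receptive field is
# 3**5 = 243 frames, so each side of the 2D sequence is padded with (243 - 1) // 2 = 121
# frames and the model returns exactly one 3D pose per input frame. The dummy keypoints
# below are hypothetical; TemporalModel and UnchunkedGenerator follow the VideoPose3D API.
import numpy as np
import torch
from common.model import TemporalModel
from common.generators import UnchunkedGenerator

dummy_kps = np.zeros((300, 17, 2), dtype=np.float32)   # 300 frames, 17 joints, (x, y)
model = TemporalModel(17, 2, 17, filter_widths=[3, 3, 3, 3, 3])
pad = (model.receptive_field() - 1) // 2               # 121
gen = UnchunkedGenerator(None, None, [dummy_kps], pad=pad, causal_shift=0, augment=False)
with torch.no_grad():
    model.eval()
    for _, _, batch_2d in gen.next_epoch():
        out = model(torch.from_numpy(batch_2d))        # shape (1, 300, 17, 3)
        print(out.shape)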
def run_evaluation(actions, action_filter=None):
    errors_p1 = []
    errors_p2 = []
    for action_key in actions.keys():
        if action_filter is not None:
            found = False
            for a in action_filter:
                if action_key.startswith(a):
                    found = True
                    break
            if not found:
                continue
        poses_act, poses_2d_act = fetch_actions(actions[action_key])
        gen = UnchunkedGenerator(None, poses_act, poses_2d_act, pad=pad, causal_shift=causal_shift,
                                 augment=args.test_time_augmentation,
                                 kps_left=kps_left, kps_right=kps_right,
                                 joints_left=joints_left, joints_right=joints_right)
        e1, e2 = evaluate(gen, model_pos, joints_left, joints_right, action_key)
        errors_p1.append(e1)
        errors_p2.append(e2)
    print('Protocol #1 (MPJPE) action-wise average:', round(np.mean(errors_p1), 1), 'mm')
    print('Protocol #2 (P-MPJPE) action-wise average:', round(np.mean(errors_p2), 1), 'mm')
def gen_pose_frame(kpts, width, height, model_pos, pad, causal_shift=0):
    # kpts: (M, T, N, 2)
    norm_seqs = []
    for kpt in kpts:
        norm_kpt = normalize_screen_coordinates(kpt, w=width, h=height)
        norm_seqs.append(norm_kpt)

    gen = UnchunkedGenerator(None, None, norm_seqs, pad=pad, causal_shift=causal_shift, augment=True,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos)

    prediction_to_world = []
    for i in range(len(prediction)):
        sub_prediction = prediction[i][0]
        sub_prediction = camera_to_world(sub_prediction, R=rot, t=0)
        sub_prediction[:, 2] -= np.amin(sub_prediction[:, 2])
        prediction_to_world.append(sub_prediction)

    return prediction_to_world
def joints_2d_generator(joints_coords, pose3d_predictor, padding=False):
    """Build a generator over 2D joint coordinates.

    Args:
        joints_coords: 2D joint coordinates.

    Returns:
        The generator.
    """
    if not padding:
        pad = 0
    else:
        receptive_field = pose3d_predictor.receptive_field()
        pad = receptive_field // 2
    causal_shift = 0
    kps_left = [1, 3, 5, 7, 9, 11, 13, 15]
    kps_right = [2, 4, 6, 8, 10, 12, 14, 16]
    generator = UnchunkedGenerator(None, None, [joints_coords], pad=pad, causal_shift=causal_shift,
                                   augment=True, kps_left=kps_left, kps_right=kps_right)
    return generator
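# A hedged usage sketch for the generator above (not part of the original source). It assumes
# joints_coords is a (T, 17, 2) array of normalized COCO keypoints and pose3d_predictor is a
# loaded VideoPose3D TemporalModel. Because augment=True, each yielded batch contains the
# sequence plus a horizontally flipped copy, which would normally be un-flipped and averaged.
import torch

generator = joints_2d_generator(joints_coords, pose3d_predictor, padding=True)
with torch.no_grad():
    pose3d_predictor.eval()
    for _, _, batch_2d in generator.next_epoch():
        predicted_3d = pose3d_predictor(torch.from_numpy(batch_2d.astype('float32')))
        print(predicted_3d.shape)  # (2, T, 17, 3): original + flipped copy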
def run_evaluation(actions, action_filter=None):
    errors_p1 = []
    errors_p2 = []
    errors_p3 = []
    errors_vel = []
    for action_key in actions.keys():
        if action_filter is not None:
            found = False
            for a in action_filter:
                if action_key.startswith(a):
                    found = True
                    break
            if not found:
                continue
        poses_act, poses_2d_act = fetch_actions(actions[action_key])
        gen = UnchunkedGenerator(None, poses_act, poses_2d_act, pad=pad, causal_shift=causal_shift,
                                 augment=args.test_time_augmentation,
                                 kps_left=kps_left, kps_right=kps_right,
                                 joints_left=joints_left, joints_right=joints_right)
        # prediction = evaluate(gen, action_key)
        # np.save(('3dcoord' + action_key + '.npy'), prediction)
        e1, e2, e3, ev = evaluate(gen, action_key)
        errors_p1.append(e1)
        errors_p2.append(e2)
        errors_p3.append(e3)
        errors_vel.append(ev)
    print('Protocol #1 (MPJPE) action-wise average:', round(np.mean(errors_p1), 1), 'mm')
    print('Protocol #2 (P-MPJPE) action-wise average:', round(np.mean(errors_p2), 1), 'mm')
    print('Protocol #3 (N-MPJPE) action-wise average:', round(np.mean(errors_p3), 1), 'mm')
    print('Velocity (MPJVE) action-wise average:', round(np.mean(errors_vel), 2), 'mm')
def interface(model_pos, keypoints, W, H):
    # input (N, 17, 2), return (N, 17, 3)
    if not isinstance(keypoints, np.ndarray):
        keypoints = np.array(keypoints)

    from common.camera import normalize_screen_coordinates_new, camera_to_world, normalize_screen_coordinates
    # keypoints = normalize_screen_coordinates_new(keypoints[..., :2], w=W, h=H)
    keypoints = normalize_screen_coordinates(keypoints[..., :2], w=1000, h=1002)
    input_keypoints = keypoints.copy()

    # test_time_augmentation True
    from common.generators import UnchunkedGenerator
    gen = UnchunkedGenerator(None, None, [input_keypoints], pad=common.pad, causal_shift=common.causal_shift,
                             augment=True, kps_left=common.kps_left, kps_right=common.kps_right,
                             joints_left=common.joints_left, joints_right=common.joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)
    prediction = camera_to_world(prediction, R=common.rot, t=0)
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    return prediction
def predict(img_path):
    # 1. Detect keypoints and display them
    # Pre-process the input image and detect persons
    x, img = data.transforms.presets.yolo.load_test(img_path, short=256)
    # print("Shape of pre-processed image:", x.shape)
    start = time.time()

    # detect persons and bbox
    class_ids, scores, bounding_boxes = detector(x)

    # 2. Pre-process the detector output tensors as input for alpha_pose
    pose_input, upscale_bbox = detector_to_simple_pose(img, class_ids, scores, bounding_boxes)
    global detector_time
    detector_time += (time.time() - start)
    print("detector cost time: {:.3f} seconds".format(time.time() - start))
    prepare_end = time.time()

    # 3. Predict keypoints
    if pose_input is None:
        return None, None
    predicted_heatmap = pose_net(pose_input)
    pred_coords, confidence = heatmap_to_coord(predicted_heatmap, upscale_bbox)
    global predictor_2d_time
    predictor_2d_time += (time.time() - prepare_end)
    print("2d pose predictor cost time: {:.3f} seconds".format(time.time() - prepare_end))

    # 4. Display the 2D pose
    # utils.viz.plot_keypoints(img, pred_coords, confidence, class_IDs, bounding_boxes, scores,
    #                          box_thresh=0.5, keypoint_thresh=0.2)

    # 5. Normalize coordinates
    prepare_end = time.time()
    kps = normalize_screen_coordinates(pred_coords.asnumpy(), w=img.shape[1], h=img.shape[0])
    receptive_field = pose3d_predictor.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    # 6. Create the generator that feeds the 3D predictor
    generator = UnchunkedGenerator(None, None, [kps], pad=pad, causal_shift=causal_shift, augment=False)

    # 7. 3D pose estimation and display
    prediction = predict_3d_pos(generator, pose3d_predictor)
    global full_time, predictor_3d_time
    predictor_3d_time += time.time() - prepare_end
    full_time += time.time() - start
    print("3d predictor time: {:.3f} seconds".format(time.time() - prepare_end))
    rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32)
    prediction = camera_to_world(prediction, R=rot, t=0)
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    return prediction, img
def run_evaluation(actions, action_filter=None):
    errors_p1 = []
    errors_p2 = []
    errors_p3 = []
    errors_vel = []
    print('ACTIONS:::::', actions)
    for action_key in actions.keys():
        if action_filter is not None:
            found = False
            for a in action_filter:
                if action_key.startswith(a):
                    found = True
                    break
            if not found:
                continue
        # poses_act, poses_2d_act = fetch_actions(actions[action_key])
        print(sets['test']['sub'])
        print(sets['test']['act'])
        _, poses_act, poses_2d_act = fetch(sets['test']['sub'], sets['test']['act'])
        print('EVAL GEN:::::')
        print(np.shape(poses_act))
        print(np.shape(poses_2d_act))
        print(np.shape(poses_act[0]))
        print(np.shape(poses_2d_act[0]))
        gen = UnchunkedGenerator(None, poses_act, poses_2d_act, pad=pad, causal_shift=causal_shift,
                                 augment=False, kps_left=kps_left, kps_right=kps_right,
                                 joints_left=joints_left, joints_right=joints_right)
        e1, e2, e3, ev = evaluate(gen, action_key)
        errors_p1.append(e1)
        errors_p2.append(e2)
        errors_p3.append(e3)
        errors_vel.append(ev)
    print('Protocol #1 (MPJPE) action-wise average:', round(np.mean(errors_p1), 1), 'mm')
    print('Protocol #2 (P-MPJPE) action-wise average:', round(np.mean(errors_p2), 1), 'mm')
    print('Protocol #3 (N-MPJPE) action-wise average:', round(np.mean(errors_p3), 1), 'mm')
    print('Velocity (MPJVE) action-wise average:', round(np.mean(errors_vel), 2), 'mm')
def predict_images(image_dir: str = '../images'):
    import os
    filenames = os.listdir(image_dir)
    image_files = [os.path.join(image_dir, fn) for fn in filenames]
    for i, img_file in enumerate(image_files):
        figure = plt.figure(figsize=(12, 6), dpi=100)
        img = cv2.imread(img_file)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        h, w, _ = img.shape
        hw = (512, int(h / w * 512))
        img = cv2.resize(img, hw)

        # Predict the 2D pose
        start = time.time()
        kps = predict_kps(kps_predictor, img)
        # print(kps)
        print("Spending {:.2f} seconds to predict 2d pose.".format(time.time() - start))

        # Normalize; drop the confidence column and keep only the coordinates
        kps = normalize_screen_coordinates(kps[..., :2], w=img.shape[1], h=img.shape[0])
        kps = torch.from_numpy(kps).unsqueeze(0).numpy()

        # Create the generator that feeds the 3D predictor
        generator = UnchunkedGenerator(None, None, [kps], pad=pad, causal_shift=causal_shift, augment=False)

        # 3D pose estimation
        start = time.time()
        prediction = predict_3d_pos(generator, pose3d_predictor)
        rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32)
        prediction = camera_to_world(prediction, R=rot, t=0)
        # We don't have the trajectory, but at least we can rebase the height
        prediction[:, :, 2] -= np.min(prediction[:, :, 2])
        pos_3d = {'Reconstruction': prediction}

        # Render the image
        render_image(pos_3d=pos_3d, skeleton=Skeleton(), azim=np.array(70., dtype=np.float32),
                     input_video_frame=img, fig=figure)
        elapsed = time.time() - start
        print("Spending {:.2f} seconds to predict image: {}".format(elapsed, img_file))
        figure.tight_layout()
        plt.savefig("images/" + str(i + 1) + '.png', bbox_inches='tight')
        plt.close()
def predict_3d_joints(predictor, coords_2d, w, h):
    # Normalize coordinates
    kps = normalize_screen_coordinates(coords_2d, w, h)
    # print('kps.type: {}, kps.shape: {}'.format(type(kps), kps.shape))

    # 2D keypoint generator
    receptive_field = predictor.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    # Create the generator that feeds the 3D predictor
    generator = UnchunkedGenerator(None, None, [kps], pad=pad, causal_shift=causal_shift, augment=False)
    prediction = predict_3d_pos(generator, predictor)
    prediction = camera_to_world(prediction, R=rot, t=0)
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    return prediction
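# A hypothetical call of the helper above (assuming pose3d_predictor is a loaded TemporalModel
# and coords_2d is a (T, 17, 2) array of pixel coordinates from a 1920x1080 frame); the result
# is a (T, 17, 3) array in world coordinates with the height rebased to the floor.
joints_3d = predict_3d_joints(pose3d_predictor, coords_2d, w=1920, h=1080)
print(joints_3d.shape)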
def joints_2d_generator(joints_coords):
    """Build a generator over 2D joint coordinates.

    Args:
        joints_coords: 2D joint coordinates.

    Returns:
        The generator.
    """
    pad = 0
    causal_shift = 0
    kps_left = [1, 3, 5, 7, 9, 11, 13, 15]
    kps_right = [2, 4, 6, 8, 10, 12, 14, 16]
    generator = UnchunkedGenerator(None, None, [joints_coords], pad=pad, causal_shift=causal_shift,
                                   augment=True, kps_left=kps_left, kps_right=kps_right)
    return generator
def gen_pose(kpts, valid_frames, width, height, model_pos, pad, causal_shift=0):
    assert len(kpts.shape) == 4, 'The shape of kpts: {}'.format(kpts.shape)
    assert kpts.shape[0] == len(valid_frames)

    norm_seqs = []
    for index, frames in enumerate(valid_frames):
        seq_kps = kpts[index, frames]
        norm_seq_kps = normalize_screen_coordinates(seq_kps, w=width, h=height)
        norm_seqs.append(norm_seq_kps)

    gen = UnchunkedGenerator(None, None, norm_seqs, pad=pad, causal_shift=causal_shift, augment=True,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos)

    prediction_to_world = []
    for i in range(len(prediction)):
        sub_prediction = prediction[i]
        sub_prediction = camera_to_world(sub_prediction, R=rot, t=0)
        # sub_prediction[:, :, 2] -= np.expand_dims(np.amin(sub_prediction[:, :, 2], axis=1), axis=1).repeat([17], axis=1)
        # sub_prediction[:, :, 2] -= np.amin(sub_prediction[:, :, 2])
        prediction_to_world.append(sub_prediction)

    # prediction_to_world = np.asarray(prediction_to_world, dtype=np.float32)
    return prediction_to_world
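# A sketch of how gen_pose above might be called (not from the original source). kpts is
# assumed to be an (M, T, 17, 2) array for M tracked people, valid_frames a list of per-person
# frame indices, and model_pos a loaded TemporalModel; pad follows the usual
# (receptive_field - 1) // 2 rule.
pad = (model_pos.receptive_field() - 1) // 2
poses_world = gen_pose(kpts, valid_frames, width=1920, height=1080, model_pos=model_pos, pad=pad)
print(len(poses_world), poses_world[0].shape)  # one (T_i, 17, 3) array per person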
def gen_pose_frame_(kpts, width, height, model_pos, pad, causal_shift=0):
    # input (N, 17, 2), return (N, 17, 3)
    if not isinstance(kpts, np.ndarray):
        kpts = np.array(kpts)

    keypoints = normalize_screen_coordinates(kpts[..., :2], w=width, h=height)
    input_keypoints = keypoints.copy()

    # test_time_augmentation True
    from common.generators import UnchunkedGenerator
    gen = UnchunkedGenerator(None, None, [input_keypoints], pad=pad, causal_shift=causal_shift, augment=True,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos)
    prediction = camera_to_world(prediction[0], R=rot, t=0)
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    return prediction
                               dense=args.dense)
        if torch.cuda.is_available():
            model_traj = model_traj.cuda()
        model_traj.load_state_dict(checkpoint['model_traj'])
    else:
        model_traj = None

# print(poses_valid)
# print(poses_valid_2d)
# print(np.shape(poses_valid))
# print(np.shape(poses_valid_2d))
test_generator = UnchunkedGenerator(cameras_valid, poses_valid, poses_valid_2d, pad=pad,
                                    causal_shift=causal_shift, augment=False,
                                    kps_left=kps_left, kps_right=kps_right,
                                    joints_left=joints_left, joints_right=joints_right)
print('INFO: Testing on {} frames'.format(test_generator.num_frames()))

if not args.evaluate:
    print(sets)
    cameras_train, poses_train, poses_train_2d = fetch(sets['train']['sub'], sets['train']['act'],
                                                       subset=args.subset)
    lr = args.learning_rate
    optimizer = optim.Adam(model_pos_train.parameters(), lr=lr, amsgrad=True)
if args['resume'] or args['evaluate']:
    chk_filename = os.path.join(args['checkpoint'], args['resume'] if args['resume'] else args['evaluate'])
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
    print('This model was trained for {} epochs'.format(checkpoint['epoch']))
    model_pos_train.load_state_dict(checkpoint['model_pos'])
    model_pos.load_state_dict(checkpoint['model_pos'])

test_generator = UnchunkedGenerator(cameras_valid, poses_valid, poses_valid_2d, pad=121,
                                    causal_shift=causal_shift, augment=False,
                                    kps_left=kps_left, kps_right=kps_right,
                                    joints_left=joints_left, joints_right=joints_right)

print('Evaluating...')
all_actions = {}
all_actions_by_subject = {}
for subject in subjects_test:
    if subject not in all_actions_by_subject:
        all_actions_by_subject[subject] = {}
    for action in dataset[subject].keys():
        action_name = action.split(' ')[0]
        if action_name not in all_actions:
            all_actions[action_name] = []
def predict(img_path):
    # 1. Pre-process the input image and detect persons
    x, img = data.transforms.presets.yolo.load_test(img_path, short=256)
    # detector.summary(x)
    # print("x.shape:", x.shape)
    start = time.time()

    # detect persons and bbox
    class_ids, scores, bounding_boxes = detector(x)  # shape: [sample_idx, class_idx, instance]
    # print("bounding_boxes.shape", bounding_boxes.shape, "bounding_boxes[0, 0]:", bounding_boxes[0, 0])

    # 2. Pre-process the detector output tensors as input for mobile_pose
    pose_input, upscale_bbox = detector_to_mobile_pose(img, class_ids, scores, bounding_boxes)
    print("detector cost time: {:.3f} seconds".format(time.time() - start))
    global detector_time
    detector_time += (time.time() - start)
    if pose_input is None:
        return None, None

    # 4. 2D keypoint prediction
    # pose_net.summary(pose_input)
    start_time = time.time()
    predicted_heatmap = pose_net(pose_input)
    pred_coords, confidence = heatmap_to_coord(predicted_heatmap, upscale_bbox)
    # print("type(pre_coords): {}, shape(pre_coords): {}".format(type(pred_coords), pred_coords.shape))
    # print("pred_coords: {}".format(pred_coords))
    global predictor_2d_time
    predictor_2d_time += (time.time() - start_time)
    print("2d pose predictor cost time: {:.3f} seconds".format(time.time() - start_time))

    # 5. Display the 2D pose
    # ax = utils.viz.plot_keypoints(img, pred_coords, confidence, class_IDs, bounding_boxes, scores,
    #                               box_thresh=0.5, keypoint_thresh=0.2)
    # print(pred_coords)

    # 6. Normalize coordinates
    start_time = time.time()
    kps = normalize_screen_coordinates(pred_coords.asnumpy(), w=img.shape[1], h=img.shape[0])
    # print('kps.type: {}, kps.shape: {}'.format(type(kps), kps.shape))

    # 7. 2D keypoint generator
    receptive_field = pose3d_predictor.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0
    # Create the generator that feeds the 3D predictor
    generator = UnchunkedGenerator(None, None, [kps], pad=pad, causal_shift=causal_shift, augment=False)

    # 8. 3D pose estimation and display
    prediction = predict_3d_pos(generator, pose3d_predictor)
    global predictor_3d_time, full_time
    predictor_3d_time += (time.time() - start_time)
    full_time += (time.time() - start)
    print("3d pose predictor cost time: {:.3f} seconds".format(time.time() - start_time))
    # print("prediction.shape: ", prediction.shape)
    rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32)
    prediction = camera_to_world(prediction, R=rot, t=0)
    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    elapsed = time.time() - start
    print("Total elapsed time of predicting image file {}: {:.3f} seconds".format(img_path, elapsed))
    return prediction, img
causal_shift = 0
model_params = 0
for parameter in model_pos.parameters():
    model_params += parameter.numel()
print('INFO: Trainable parameter count:', model_params)

if torch.cuda.is_available():
    model_pos = model_pos.cuda()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
weight_path = 'checkpoint/VideoPose_030.weights'
model_pos.load_state_dict(torch.load(weight_path, map_location=device))

test_generator = UnchunkedGenerator(cameras_valid, poses_valid, poses_valid_2d, pad=pad,
                                    causal_shift=causal_shift, augment=False,
                                    kps_left=kps_left, kps_right=kps_right,
                                    joints_left=joints_left, joints_right=joints_right)
print('INFO: Testing on {} frames'.format(test_generator.num_frames()))


# Evaluate
def evaluate(test_generator, action=None, return_predictions=False):
    epoch_loss_3d_pos = 0
    epoch_loss_3d_pos_procrustes = 0
    epoch_loss_3d_pos_scale = 0
    epoch_loss_3d_vel = 0
    with torch.no_grad():
        model_pos.eval()
        N = 0
        for _, batch, batch_2d in test_generator.next_epoch():
            inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
def main():
    args = parse_args()
    args.input_npz = "data/VideoPose_test.npz"
    metadata = {
        'layout_name': 'coco',
        'num_joints': 17,
        'keypoints_symmetry': [[1, 3, 5, 7, 9, 11, 13, 15], [2, 4, 6, 8, 10, 12, 14, 16]]
    }
    npz = np.load(args.input_npz)
    keypoints = npz['kpts']

    keypoints_symmetry = metadata['keypoints_symmetry']
    kps_left, kps_right = list(keypoints_symmetry[0]), list(keypoints_symmetry[1])
    # Same as the original: list(dataset.skeleton().joints_left()), list(dataset.skeleton().joints_right())
    joints_left, joints_right = list([4, 5, 6, 11, 12, 13]), list([1, 2, 3, 14, 15, 16])

    # Normalize keypoints, assuming these camera parameters
    res_w = 1920
    res_h = 1080
    keypoints = normalize_screen_coordinates(keypoints[..., :2], w=res_w, h=res_h)

    # model_pos = TemporalModel(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1], dataset.skeleton().num_joints(),
    #                           filter_widths=filter_widths, causal=args.causal, dropout=args.dropout,
    #                           channels=args.channels, dense=args.dense)
    model_pos = TemporalModel(17, 2, 17, filter_widths=[3, 3, 3, 3, 3], causal=args.causal,
                              dropout=args.dropout, channels=args.channels, dense=args.dense)

    receptive_field = model_pos.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2  # Padding on each side
    if args.causal:
        print('INFO: Using causal convolutions')
        causal_shift = pad
    else:
        causal_shift = 0

    model_params = 0
    for parameter in model_pos.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    # Load the model
    chk_filename = os.path.join(args.checkpoint, args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
    print('This model was trained for {} epochs'.format(checkpoint['epoch']))
    model_pos.load_state_dict(checkpoint['model_pos'])

    # test_generator = UnchunkedGenerator(cameras_valid, poses_valid, poses_valid_2d,
    #                                     pad=pad, causal_shift=causal_shift, augment=False,
    #                                     kps_left=kps_left, kps_right=kps_right,
    #                                     joints_left=joints_left, joints_right=joints_right)
    input_keypoints = keypoints.copy()
    gen = UnchunkedGenerator(None, None, [input_keypoints], pad=pad, causal_shift=causal_shift,
                             augment=args.test_time_augmentation,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate_alphapose(gen, model_pos, return_predictions=True)
    print('INFO: Testing on {} frames'.format(gen.num_frames()))

    if args.viz_export is not None:
        print('Exporting joint positions to', args.viz_export)
        # Predictions are in camera space
        np.save(args.viz_export, prediction)

    if args.viz_output is not None:
        # From custom_dataset.py
        rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32)
        prediction = camera_to_world(prediction, R=rot, t=0)
        # We don't have the trajectory, but at least we can rebase the height
        prediction[:, :, 2] -= np.min(prediction[:, :, 2])
        anim_output = {'Reconstruction': prediction}
        input_keypoints = image_coordinates(input_keypoints[..., :2], w=res_w, h=res_h)

        # Generate metadata:
        keypoints_metadata = {}
        keypoints_metadata['layout_name'] = 'coco'
        keypoints_metadata['num_joints'] = 17
        keypoints_metadata['keypoints_symmetry'] = [[1, 3, 5, 7, 9, 11, 13, 15], [2, 4, 6, 8, 10, 12, 14, 16]]

        from common.visualization import render_animation
        # fps 25, azimuth 70
        render_animation(input_keypoints, keypoints_metadata, anim_output,
                         Skeleton(), 25, args.viz_bitrate, np.array(70., dtype=np.float32), args.viz_output,
                         limit=args.viz_limit, downsample=args.viz_downsample, size=args.viz_size,
                         input_video_path=args.viz_video, viewport=(res_w, res_h),
                         input_video_skip=args.viz_skip)
if args.resume or args.evaluate:
    chk_filename = os.path.join(args.checkpoint, args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
    model_pos_train.load_state_dict(checkpoint['model_pos'], strict=False)
    model_pos.load_state_dict(checkpoint['model_pos'], strict=False)

test_generator = UnchunkedGenerator(cameras_valid, poses_valid, poses_valid_2d, action_class_valid,
                                    pad=pad, causal_shift=causal_shift, augment=False,
                                    kps_left=kps_left, kps_right=kps_right,
                                    joints_left=joints_left, joints_right=joints_right)
print('INFO: Testing on {} frames'.format(test_generator.num_frames()))


def eval_data_prepare(receptive_field, inputs_2d, inputs_3d, inputs_class_label):
    inputs_2d_p = torch.squeeze(inputs_2d)
    inputs_3d_p = inputs_3d.permute(1, 0, 2, 3)
    out_num = inputs_2d_p.shape[0] - receptive_field + 1
    eval_input_2d = torch.empty(out_num, receptive_field, inputs_2d_p.shape[1], inputs_2d_p.shape[2])
def main():
    args = parse_args()

    # Load or generate 2D keypoints
    if not args.input_npz:
        # Create keypoints with AlphaPose
        from joints_detectors.Alphapose.gene_npz import handle_video
        video_name = args.viz_video
        keypoints = handle_video(video_name)
    else:
        npz = np.load(args.input_npz)
        keypoints = npz['kpts']  # (N, 17, 2)

    keypoints_symmetry = metadata['keypoints_symmetry']
    kps_left, kps_right = list(keypoints_symmetry[0]), list(keypoints_symmetry[1])
    joints_left, joints_right = list([4, 5, 6, 11, 12, 13]), list([1, 2, 3, 14, 15, 16])

    # Normalize keypoints, assuming these camera parameters
    keypoints = normalize_screen_coordinates(keypoints[..., :2], w=1000, h=1002)

    model_pos = TemporalModel(17, 2, 17, filter_widths=[3, 3, 3, 3, 3], causal=args.causal,
                              dropout=args.dropout, channels=args.channels, dense=args.dense)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    ckpt, time1 = ckpt_time(time0)
    print('------- load data spends {:.2f} seconds'.format(ckpt))

    # Load the trained model
    chk_filename = os.path.join(args.checkpoint, args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', os.path.join(main_path, chk_filename))
    checkpoint = torch.load(os.path.join(main_path, chk_filename),
                            map_location=lambda storage, loc: storage)  # map loc to storage
    model_pos.load_state_dict(checkpoint['model_pos'])

    ckpt, time2 = ckpt_time(time1)
    print('------- load 3D model spends {:.2f} seconds'.format(ckpt))

    # Receptive field: 243 frames for args.arc [3, 3, 3, 3, 3]
    receptive_field = model_pos.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    print('Rendering...')
    input_keypoints = keypoints.copy()
    gen = UnchunkedGenerator(None, None, [input_keypoints], pad=pad, causal_shift=causal_shift,
                             augment=args.test_time_augmentation,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)

    rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32)
    prediction = camera_to_world(prediction, R=rot, t=0)

    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    anim_output = {'Reconstruction': prediction}
    input_keypoints = image_coordinates(input_keypoints[..., :2], w=1000, h=1002)

    ckpt, time3 = ckpt_time(time2)
    print('------- generate reconstruction 3D data spends {:.2f} seconds'.format(ckpt))

    if not args.viz_output:
        args.viz_output = 'result.mp4'

    from common.visualization import render_animation
    render_animation(input_keypoints, anim_output, skeleton(), 25, args.viz_bitrate,
                     np.array(70., dtype=np.float32), args.viz_output,
                     limit=args.viz_limit, downsample=args.viz_downsample, size=args.viz_size,
                     input_video_path=args.viz_video, viewport=(1000, 1002),
                     input_video_skip=args.viz_skip)

    ckpt, time4 = ckpt_time(time3)
    print('total spend {:.2f} second'.format(ckpt))
def videpose_infer(args):
    from common.camera import normalize_screen_coordinates, camera_to_world, image_coordinates
    from common.generators import UnchunkedGenerator
    from common.model import TemporalModel
    from common.utils import Timer, evaluate, add_path
    from videopose import get_detector_2d, ckpt_time, metadata, time0
    import gene_npz

    gene_npz.args.outputpath = str(args.viz_output / "alpha_pose_kunkun_cut")
    print(gene_npz.args)
    # detector_2d = get_detector_2d(args.detector_2d)
    detector_2d = gene_npz.generate_kpts(args.detector_2d)
    assert detector_2d, 'detector_2d should be in ({alpha, hr, open}_pose)'

    # Load or generate 2D keypoints
    if not args.input_npz:
        video_name = args.viz_video
        keypoints = detector_2d(video_name)
    else:
        npz = np.load(args.input_npz)
        keypoints = npz['kpts']  # (N, 17, 2)

    keypoints_symmetry = metadata['keypoints_symmetry']
    kps_left, kps_right = list(keypoints_symmetry[0]), list(keypoints_symmetry[1])
    joints_left, joints_right = list([4, 5, 6, 11, 12, 13]), list([1, 2, 3, 14, 15, 16])

    # Normalize keypoints, assuming these camera parameters
    keypoints = normalize_screen_coordinates(keypoints[..., :2], w=1000, h=1002)

    model_pos = TemporalModel(17, 2, 17, filter_widths=[3, 3, 3, 3, 3], causal=args.causal,
                              dropout=args.dropout, channels=args.channels, dense=args.dense)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    ckpt, time1 = ckpt_time(time0)
    print('-------------- load data spends {:.2f} seconds'.format(ckpt))

    # Load the trained model
    chk_filename = os.path.join(args.checkpoint, args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)  # map loc to storage
    model_pos.load_state_dict(checkpoint['model_pos'])

    ckpt, time2 = ckpt_time(time1)
    print('-------------- load 3D model spends {:.2f} seconds'.format(ckpt))

    # Receptive field: 243 frames for args.arc [3, 3, 3, 3, 3]
    receptive_field = model_pos.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    print('Rendering...')
    input_keypoints = keypoints.copy()
    gen = UnchunkedGenerator(None, None, [input_keypoints], pad=pad, causal_shift=causal_shift,
                             augment=args.test_time_augmentation,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)

    # Save the 3D joint positions
    np.save(args.viz_output / "test_3d_output.npy", prediction, allow_pickle=True)

    rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32)
    prediction = camera_to_world(prediction, R=rot, t=0)

    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    anim_output = {'Reconstruction': prediction}
    input_keypoints = image_coordinates(input_keypoints[..., :2], w=1000, h=1002)

    ckpt, time3 = ckpt_time(time2)
    print('-------------- generate reconstruction 3D data spends {:.2f} seconds'.format(ckpt))

    ckpt, time4 = ckpt_time(time3)
    print('total spend {:.2f} second'.format(ckpt))
    model_pos = model_pos.cuda()

if args.resume or args.evaluate:
    chk_filename = os.path.join(args.checkpoint, args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
    print('This model was trained for {} epochs'.format(checkpoint['epoch']))
    model_pos.load_state_dict(checkpoint['model_pos'])

test_generator = UnchunkedGenerator(cameras_valid, poses_valid, poses_valid_2d, pad=pad,
                                    causal_shift=causal_shift, augment=False,
                                    kps_left=kps_left, kps_right=kps_right,
                                    joints_left=joints_left, joints_right=joints_right)
print('INFO: Testing on {} frames'.format(test_generator.num_frames()))


# Evaluate
def evaluate(test_generator, action=None, return_predictions=False):
    with torch.no_grad():
        model_pos.eval()
        N = 0
        for _, batch, batch_2d in test_generator.next_epoch():
            inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
            if torch.cuda.is_available():
receptive_field = model_pos.receptive_field()
pad = (receptive_field - 1) // 2
causal_shift = 0

if torch.cuda.is_available():
    model_pos = model_pos.cuda()

checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
model_pos.load_state_dict(checkpoint['model_pos'])

test_generator = UnchunkedGenerator(cameras_valid, poses_valid, poses_valid_2d, pad=pad,
                                    causal_shift=causal_shift, augment=False,
                                    kps_left=kps_left, kps_right=kps_right,
                                    joints_left=joints_left, joints_right=joints_right)


def evaluate(test_generator, action=None, return_predictions=False):
    epoch_loss_3d_pos = 0
    epoch_loss_3d_pos_procrustes = 0
    epoch_loss_3d_pos_scale = 0
    epoch_loss_3d_vel = 0
    with torch.no_grad():
        model_pos.eval()
        N = 0
        for _, batch, batch_2d in test_generator.next_epoch():
def analyze_frame(h, frame):
    boxes, keypoints = infer.inference_on_frame(h['predictor'], frame)

    # step 4: prepare data.
    # take 2d keypoints, that's it
    # first element is an empty array, second is our actual frame data: a 3d numpy array with
    # first dimension 1, second and third being the 17 joints of 3 doubles each.
    kp = keypoints[1][0][:2, :].T  # extract (x, y) just like in the prepare_data_2d_custom code
    # what to do if kp is NaN or missing data or something?
    # I guess just ignore it

    # they do this at the end of step 4, but we keep it simple and take the data from step 2
    # directly into a variable:
    # output[canonical_name]['custom'] = [data[0]['keypoints'].astype('float32')]
    # output_custom_canonical_bullshit = kp.astype('float32')

    # this is what happens at the end of step 4, producing a file that is loaded at the
    # beginning of step 5:
    # np.savez_compressed(os.path.join(args.dataoutputdir, output_prefix_2d + args.output),
    #                     positions_2d=output, metadata=metadata)

    # this is what they do in the original script.
    # confusingly, keypoints is actually just data, until it is set to keypoints['positions_2d']
    # keypoints = np.load('data/data_2d_' + args.dataset + '_' + args.keypoints + '.npz', allow_pickle=True)

    # step 5: everything else, starting to copy stuff over from run.py

    # extract dataset from the init dictionary
    dataset = h['dataset']
    keypoints_metadata = h['keypoints_metadata']
    keypoints_symmetry = h['keypoints_symmetry']
    kps_left = h['kps_left']
    kps_right = h['kps_right']
    joints_left = h['joints_left']
    joints_right = h['joints_right']

    # normalize
    for i in range(len(kp)):
        koord = kp[i]
        kp[i] = normalize_screen_coordinates(koord, h['frame_metadata']['w'], h['frame_metadata']['h'])
    # for kps in enumerate(keypoints):
    #     kps[..., :2] = normalize_screen_coordinates(kps[..., :2], frame_metadata['w'], frame_metadata['h'])

    # taken from args.architecture and run.py, just hardcoded here
    filter_widths = [int(x) for x in "3,3,3,3,3".split(',')]
    skeleton_num_joints = dataset.skeleton().num_joints()
    # skeleton_num_joints = 17

    causal = True
    dropout = 0.25
    channels = 1024
    dense = False

    model_pos_train = TemporalModelOptimized1f(kp.shape[-2], kp.shape[-1], skeleton_num_joints,
                                               filter_widths=filter_widths, causal=causal,
                                               dropout=dropout, channels=channels)
    model_pos = TemporalModel(kp.shape[-2], kp.shape[-1], skeleton_num_joints,
                              filter_widths=filter_widths, causal=causal, dropout=dropout,
                              channels=channels, dense=dense)

    receptive_field = model_pos.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2  # Padding on each side
    # if args.causal:
    #     print('INFO: Using causal convolutions')
    #     causal_shift = pad
    # else:
    #     causal_shift = 0
    causal_shift = pad

    model_params = 0
    for parameter in model_pos.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()
        model_pos_train = model_pos_train.cuda()

    # if args.resume or args.evaluate:
    if True:
        chk_filename = "checkpoint/pretrained_h36m_detectron_coco.bin"
        print('Loading checkpoint', chk_filename)
        checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
        print('This model was trained for {} epochs'.format(checkpoint['epoch']))
        model_pos_train.load_state_dict(checkpoint['model_pos'])
        model_pos.load_state_dict(checkpoint['model_pos'])

    # false in our particular case... we might benefit from getting rid of model_traj,
    # unless it's super fast, then we should just keep it in case we ever upgrade
    if 'model_traj' in checkpoint:
        # Load trajectory model if it is contained in the checkpoint (e.g. for inference in the wild)
        model_traj = TemporalModel(kp.shape[-2], kp.shape[-1], 1,
                                   filter_widths=filter_widths, causal=causal, dropout=dropout,
                                   channels=channels, dense=dense)
        if torch.cuda.is_available():
            model_traj = model_traj.cuda()
        model_traj.load_state_dict(checkpoint['model_traj'])
    else:
        model_traj = None

    test_generator = UnchunkedGenerator(None, None, kp, pad=pad, causal_shift=causal_shift,
                                        augment=False, kps_left=kps_left, kps_right=kps_right,
                                        joints_left=joints_left, joints_right=joints_right)
    print('INFO: Testing on {} frames'.format(test_generator.num_frames()))

    # Evaluate
    def evaluate(eval_generator, action=None, return_predictions=False, use_trajectory_model=False):
        epoch_loss_3d_pos = 0
        epoch_loss_3d_pos_procrustes = 0
        epoch_loss_3d_pos_scale = 0
        epoch_loss_3d_vel = 0
        with torch.no_grad():
            if not use_trajectory_model:
                model_pos.eval()
            else:
                model_traj.eval()
            N = 0
            for _, batch, batch_2d in eval_generator.next_epoch():
                inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
                if torch.cuda.is_available():
                    inputs_2d = inputs_2d.cuda()

                # Positional model
                if not use_trajectory_model:
                    predicted_3d_pos = model_pos(inputs_2d)
                else:
                    predicted_3d_pos = model_traj(inputs_2d)

                # Test-time augmentation (if enabled)
                if eval_generator.augment_enabled():
                    # Undo flipping and take average with non-flipped version
                    predicted_3d_pos[1, :, :, 0] *= -1
                    if not use_trajectory_model:
                        predicted_3d_pos[1, :, joints_left + joints_right] = \
                            predicted_3d_pos[1, :, joints_right + joints_left]
                    predicted_3d_pos = torch.mean(predicted_3d_pos, dim=0, keepdim=True)

                if return_predictions:
                    return predicted_3d_pos.squeeze(0).cpu().numpy()

                inputs_3d = torch.from_numpy(batch.astype('float32'))
                if torch.cuda.is_available():
                    inputs_3d = inputs_3d.cuda()
                inputs_3d[:, :, 0] = 0
                if eval_generator.augment_enabled():
                    inputs_3d = inputs_3d[:1]

                error = mpjpe(predicted_3d_pos, inputs_3d)
                epoch_loss_3d_pos_scale += inputs_3d.shape[0] * inputs_3d.shape[1] * n_mpjpe(predicted_3d_pos, inputs_3d).item()
                epoch_loss_3d_pos += inputs_3d.shape[0] * inputs_3d.shape[1] * error.item()
                N += inputs_3d.shape[0] * inputs_3d.shape[1]

                inputs = inputs_3d.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])
                predicted_3d_pos = predicted_3d_pos.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])
                epoch_loss_3d_pos_procrustes += inputs_3d.shape[0] * inputs_3d.shape[1] * p_mpjpe(predicted_3d_pos, inputs)

                # Compute velocity error
                epoch_loss_3d_vel += inputs_3d.shape[0] * inputs_3d.shape[1] * mean_velocity_error(predicted_3d_pos, inputs)

        if action is None:
            print('----------')
        else:
            print('----' + action + '----')
        e1 = (epoch_loss_3d_pos / N) * 1000
        e2 = (epoch_loss_3d_pos_procrustes / N) * 1000
        e3 = (epoch_loss_3d_pos_scale / N) * 1000
        ev = (epoch_loss_3d_vel / N) * 1000
        print('Test time augmentation:', eval_generator.augment_enabled())
        print('Protocol #1 Error (MPJPE):', e1, 'mm')
        print('Protocol #2 Error (P-MPJPE):', e2, 'mm')
        print('Protocol #3 Error (N-MPJPE):', e3, 'mm')
        print('Velocity Error (MPJVE):', ev, 'mm')
        print('----------')
        return e1, e2, e3, ev

    image_keypoints2d = kp
    gen = UnchunkedGenerator(None, None, [[image_keypoints2d]], pad=pad, causal_shift=causal_shift,
                             augment=False, kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, return_predictions=True)

    # here is the data format:
    # public enum VideoPose3dJointOrder
    # {
    #     HIP = 0,
    #     R_HIP = 1,
    #     R_KNEE = 2,
    #     R_FOOT = 3,
    #     L_HIP = 4,
    #     L_KNEE = 5,
    #     L_FOOT = 6,
    #     SPINE = 7,
    #     THORAX = 8,
    #     NOSE = 9,
    #     HEAD = 10,
    #     L_SHOULDER = 11,
    #     L_ELBOW = 12,
    #     L_WRIST = 13,
    #     R_SHOULDER = 14,
    #     R_ELBOW = 15,
    #     R_WRIST = 16
    # }

    # this bugs out in the original; we can fix it by getting width/height some other way.
    # Invert camera transformation
    cam = dataset.cameras()
    width = cam['frame'][0]['res_w']
    height = cam['frame'][0]['res_h']
    image_keypoints2d = image_coordinates(image_keypoints2d[..., :2], w=width, h=height)

    viz_camera = 0
    # If the ground truth is not available, take the camera extrinsic params from a random subject.
    # They are almost the same, and anyway, we only need this for visualization purposes.
    for subject in dataset.cameras():
        if 'orientation' in dataset.cameras()[subject][viz_camera]:
            rot = dataset.cameras()[subject][viz_camera]['orientation']
            break
    prediction = camera_to_world(prediction, R=rot, t=0)
    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])

    # because the algorithm was meant for a list of frames, we take the first frame (our only frame)
    prediction3d = prediction[0]
    return prediction3d, image_keypoints2d

    # do we want to visualize? this code used to write to json and create a video for visualization
    # if args.viz_output is not None:
    if True:
        anim_output = {'Reconstruction': prediction}

        # format the data in the same format as mediapipe, so we can load it in unity with the same script
        # we need a list (frames) of lists of 3d landmarks.
        unity_landmarks = prediction.tolist()

        # how to send data? or display it?
        # maybe draw it on the webcam feed?
        # with open(args.output_json, "w") as json_file:
        #     json.dump(unity_landmarks, json_file)

        # if args.rendervideo == "yes":
        #     from common.visualization import render_animation
        #     render_animation(input_keypoints, keypoints_metadata, anim_output,
        #                      dataset.skeleton(), dataset.fps(), args.viz_bitrate, cam['azimuth'], args.viz_output,
        #                      limit=args.viz_limit, downsample=args.viz_downsample, size=args.viz_size,
        #                      input_video_path=args.viz_video, viewport=(cam['res_w'], cam['res_h']),
        #                      input_video_skip=args.viz_skip)

        we_re_done_here = 1
def the_main_kaboose(args):
    print(args)

    try:
        # Create checkpoint directory if it does not exist
        os.makedirs(args.checkpoint)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise RuntimeError('Unable to create checkpoint directory:', args.checkpoint)

    print('Loading dataset...')
    dataset_path = 'data/data_3d_' + args.dataset + '.npz'
    if args.dataset == 'h36m':
        from common.h36m_dataset import Human36mDataset
        dataset = Human36mDataset(dataset_path)
    elif args.dataset.startswith('humaneva'):
        from common.humaneva_dataset import HumanEvaDataset
        dataset = HumanEvaDataset(dataset_path)
    elif args.dataset.startswith('custom'):
        from common.custom_dataset import CustomDataset
        dataset = CustomDataset('data/data_2d_' + args.dataset + '_' + args.keypoints + '.npz')
    else:
        raise KeyError('Invalid dataset')

    print('Preparing data...')
    for subject in dataset.subjects():
        for action in dataset[subject].keys():
            anim = dataset[subject][action]

            # this only works when training.
            if 'positions' in anim:
                positions_3d = []
                for cam in anim['cameras']:
                    pos_3d = world_to_camera(anim['positions'], R=cam['orientation'], t=cam['translation'])
                    pos_3d[:, 1:] -= pos_3d[:, :1]  # Remove global offset, but keep trajectory in first position
                    positions_3d.append(pos_3d)
                anim['positions_3d'] = positions_3d

    print('Loading 2D detections...')
    keypoints = np.load('data/data_2d_' + args.dataset + '_' + args.keypoints + '.npz', allow_pickle=True)
    keypoints_metadata = keypoints['metadata'].item()
    keypoints_symmetry = keypoints_metadata['keypoints_symmetry']
    kps_left, kps_right = list(keypoints_symmetry[0]), list(keypoints_symmetry[1])
    joints_left, joints_right = list(dataset.skeleton().joints_left()), list(dataset.skeleton().joints_right())
    keypoints = keypoints['positions_2d'].item()

    # THIS IS ABOUT TRAINING. ignore pls.
    for subject in dataset.subjects():
        assert subject in keypoints, 'Subject {} is missing from the 2D detections dataset'.format(subject)
        for action in dataset[subject].keys():
            assert action in keypoints[subject], 'Action {} of subject {} is missing from the 2D detections dataset'.format(action, subject)
            if 'positions_3d' not in dataset[subject][action]:
                continue
            for cam_idx in range(len(keypoints[subject][action])):
                # We check for >= instead of == because some videos in H3.6M contain extra frames
                mocap_length = dataset[subject][action]['positions_3d'][cam_idx].shape[0]
                assert keypoints[subject][action][cam_idx].shape[0] >= mocap_length
                if keypoints[subject][action][cam_idx].shape[0] > mocap_length:
                    # Shorten sequence
                    keypoints[subject][action][cam_idx] = keypoints[subject][action][cam_idx][:mocap_length]
            assert len(keypoints[subject][action]) == len(dataset[subject][action]['positions_3d'])

    # normalize camera frame?
    for subject in keypoints.keys():
        for action in keypoints[subject]:
            for cam_idx, kps in enumerate(keypoints[subject][action]):
                # Normalize camera frame
                cam = dataset.cameras()[subject][cam_idx]
                kps[..., :2] = normalize_screen_coordinates(kps[..., :2], w=cam['res_w'], h=cam['res_h'])
                keypoints[subject][action][cam_idx] = kps

    subjects_train = args.subjects_train.split(',')
    subjects_semi = [] if not args.subjects_unlabeled else args.subjects_unlabeled.split(',')
    if not args.render:
        subjects_test = args.subjects_test.split(',')
    else:
        subjects_test = [args.viz_subject]

    semi_supervised = len(subjects_semi) > 0
    if semi_supervised and not dataset.supports_semi_supervised():
        raise RuntimeError('Semi-supervised training is not implemented for this dataset')

    def fetch(subjects, action_filter=None, subset=1, parse_3d_poses=True):
        out_poses_3d = []
        out_poses_2d = []
        out_camera_params = []
        for subject in subjects:
            print("gonna check actions for subject " + subject)
        for subject in subjects:
            for action in keypoints[subject].keys():
                if action_filter is not None:
                    found = False
                    for a in action_filter:
                        if action.startswith(a):
                            found = True
                            break
                    if not found:
                        continue

                poses_2d = keypoints[subject][action]
                for i in range(len(poses_2d)):  # Iterate across cameras
                    out_poses_2d.append(poses_2d[i])

                if subject in dataset.cameras():
                    cams = dataset.cameras()[subject]
                    assert len(cams) == len(poses_2d), 'Camera count mismatch'
                    for cam in cams:
                        if 'intrinsic' in cam:
                            out_camera_params.append(cam['intrinsic'])

                if parse_3d_poses and 'positions_3d' in dataset[subject][action]:
                    poses_3d = dataset[subject][action]['positions_3d']
                    assert len(poses_3d) == len(poses_2d), 'Camera count mismatch'
                    for i in range(len(poses_3d)):  # Iterate across cameras
                        out_poses_3d.append(poses_3d[i])

        if len(out_camera_params) == 0:
            out_camera_params = None
        if len(out_poses_3d) == 0:
            out_poses_3d = None

        stride = args.downsample
        if subset < 1:
            for i in range(len(out_poses_2d)):
                n_frames = int(round(len(out_poses_2d[i]) // stride * subset) * stride)
                start = deterministic_random(0, len(out_poses_2d[i]) - n_frames + 1, str(len(out_poses_2d[i])))
                out_poses_2d[i] = out_poses_2d[i][start:start + n_frames:stride]
                if out_poses_3d is not None:
                    out_poses_3d[i] = out_poses_3d[i][start:start + n_frames:stride]
        elif stride > 1:
            # Downsample as requested
            for i in range(len(out_poses_2d)):
                out_poses_2d[i] = out_poses_2d[i][::stride]
                if out_poses_3d is not None:
                    out_poses_3d[i] = out_poses_3d[i][::stride]

        return out_camera_params, out_poses_3d, out_poses_2d

    action_filter = None if args.actions == '*' else args.actions.split(',')
    if action_filter is not None:
        print('Selected actions:', action_filter)

    # when you run inference, this returns None, None, and the keypoints array renamed as poses_valid_2d
    cameras_valid, poses_valid, poses_valid_2d = fetch(subjects_test, action_filter)

    filter_widths = [int(x) for x in args.architecture.split(',')]
    if not args.disable_optimizations and not args.dense and args.stride == 1:
        # Use optimized model for single-frame predictions
        shape_2 = poses_valid_2d[0].shape[-2]
        shape_1 = poses_valid_2d[0].shape[-1]
        numJoints = dataset.skeleton().num_joints()
        model_pos_train = TemporalModelOptimized1f(shape_2, shape_1, numJoints,
                                                   filter_widths=filter_widths, causal=args.causal,
                                                   dropout=args.dropout, channels=args.channels)
    else:
        # When incompatible settings are detected (stride > 1, dense filters, or disabled optimization),
        # fall back to the normal model
        model_pos_train = TemporalModel(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1],
                                        dataset.skeleton().num_joints(), filter_widths=filter_widths,
                                        causal=args.causal, dropout=args.dropout, channels=args.channels,
                                        dense=args.dense)

    model_pos = TemporalModel(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1],
                              dataset.skeleton().num_joints(), filter_widths=filter_widths,
                              causal=args.causal, dropout=args.dropout, channels=args.channels,
                              dense=args.dense)

    receptive_field = model_pos.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2  # Padding on each side
    if args.causal:
        print('INFO: Using causal convolutions')
        causal_shift = pad
    else:
        causal_shift = 0

    model_params = 0
    for parameter in model_pos.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()
        model_pos_train = model_pos_train.cuda()

    if args.resume or args.evaluate:
        chk_filename = os.path.join(args.checkpoint, args.resume if args.resume else args.evaluate)
        print('Loading checkpoint', chk_filename)
        checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
        print('This model was trained for {} epochs'.format(checkpoint['epoch']))
        model_pos_train.load_state_dict(checkpoint['model_pos'])
        model_pos.load_state_dict(checkpoint['model_pos'])

        if args.evaluate and 'model_traj' in checkpoint:
            # Load trajectory model if it is contained in the checkpoint (e.g. for inference in the wild)
            model_traj = TemporalModel(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1], 1,
                                       filter_widths=filter_widths, causal=args.causal,
                                       dropout=args.dropout, channels=args.channels, dense=args.dense)
            if torch.cuda.is_available():
                model_traj = model_traj.cuda()
            model_traj.load_state_dict(checkpoint['model_traj'])
        else:
            model_traj = None

    test_generator = UnchunkedGenerator(cameras_valid, poses_valid, poses_valid_2d, pad=pad,
                                        causal_shift=causal_shift, augment=False,
                                        kps_left=kps_left, kps_right=kps_right,
                                        joints_left=joints_left, joints_right=joints_right)
    print('INFO: Testing on {} frames'.format(test_generator.num_frames()))

    # Evaluate
    def evaluate(eval_generator, action=None, return_predictions=False, use_trajectory_model=False):
        epoch_loss_3d_pos = 0
        epoch_loss_3d_pos_procrustes = 0
        epoch_loss_3d_pos_scale = 0
        epoch_loss_3d_vel = 0
        with torch.no_grad():
            if not use_trajectory_model:
                model_pos.eval()
            else:
                model_traj.eval()
            N = 0
            for _, batch, batch_2d in eval_generator.next_epoch():
                inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
                if torch.cuda.is_available():
                    inputs_2d = inputs_2d.cuda()

                # Positional model
                if not use_trajectory_model:
                    predicted_3d_pos = model_pos(inputs_2d)
                else:
                    predicted_3d_pos = model_traj(inputs_2d)

                # Test-time augmentation (if enabled)
                if eval_generator.augment_enabled():
                    # Undo flipping and take average with non-flipped version
                    predicted_3d_pos[1, :, :, 0] *= -1
                    if not use_trajectory_model:
                        predicted_3d_pos[1, :, joints_left + joints_right] = \
                            predicted_3d_pos[1, :, joints_right + joints_left]
                    predicted_3d_pos = torch.mean(predicted_3d_pos, dim=0, keepdim=True)

                if return_predictions:
                    return predicted_3d_pos.squeeze(0).cpu().numpy()

                inputs_3d = torch.from_numpy(batch.astype('float32'))
                if torch.cuda.is_available():
                    inputs_3d = inputs_3d.cuda()
                inputs_3d[:, :, 0] = 0
                if eval_generator.augment_enabled():
                    inputs_3d = inputs_3d[:1]

                error = mpjpe(predicted_3d_pos, inputs_3d)
                epoch_loss_3d_pos_scale += inputs_3d.shape[0] * inputs_3d.shape[1] * n_mpjpe(predicted_3d_pos, inputs_3d).item()
                epoch_loss_3d_pos += inputs_3d.shape[0] * inputs_3d.shape[1] * error.item()
                N += inputs_3d.shape[0] * inputs_3d.shape[1]

                inputs = inputs_3d.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])
                predicted_3d_pos = predicted_3d_pos.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])
                epoch_loss_3d_pos_procrustes += inputs_3d.shape[0] * inputs_3d.shape[1] * p_mpjpe(predicted_3d_pos, inputs)

                # Compute velocity error
                epoch_loss_3d_vel += inputs_3d.shape[0] * inputs_3d.shape[1] * mean_velocity_error(predicted_3d_pos, inputs)

        if action is None:
            print('----------')
        else:
            print('----' + action + '----')
        e1 = (epoch_loss_3d_pos / N) * 1000
        e2 = (epoch_loss_3d_pos_procrustes / N) * 1000
        e3 = (epoch_loss_3d_pos_scale / N) * 1000
        ev = (epoch_loss_3d_vel / N) * 1000
        print('Test time augmentation:', eval_generator.augment_enabled())
        print('Protocol #1 Error (MPJPE):', e1, 'mm')
        print('Protocol #2 Error (P-MPJPE):', e2, 'mm')
        print('Protocol #3 Error (N-MPJPE):', e3, 'mm')
        print('Velocity Error (MPJVE):', ev, 'mm')
        print('----------')
        return e1, e2, e3, ev

    if args.render:
        print('Rendering...')
        input_keypoints = keypoints[args.viz_subject][args.viz_action][args.viz_camera].copy()
        ground_truth = None
        if args.viz_subject in dataset.subjects() and args.viz_action in dataset[args.viz_subject]:
            if 'positions_3d' in dataset[args.viz_subject][args.viz_action]:
                ground_truth = dataset[args.viz_subject][args.viz_action]['positions_3d'][args.viz_camera].copy()
        if ground_truth is None:
            print('INFO: this action is unlabeled. Ground truth will not be rendered.')

        gen = UnchunkedGenerator(None, None, [input_keypoints], pad=pad, causal_shift=causal_shift,
                                 augment=args.test_time_augmentation,
                                 kps_left=kps_left, kps_right=kps_right,
                                 joints_left=joints_left, joints_right=joints_right)
        prediction = evaluate(gen, return_predictions=True)
        if model_traj is not None and ground_truth is None:
            prediction_traj = evaluate(gen, return_predictions=True, use_trajectory_model=True)
            prediction += prediction_traj

        if args.viz_export is not None:
            print('Exporting joint positions to', args.viz_export)
            # Predictions are in camera space
            np.save(args.viz_export, prediction)

        if args.viz_output is not None:
            if ground_truth is not None:
                # Reapply trajectory
                trajectory = ground_truth[:, :1]
                ground_truth[:, 1:] += trajectory
                prediction += trajectory

            # Invert camera transformation
            cam = dataset.cameras()[args.viz_subject][args.viz_camera]
            if ground_truth is not None:
                prediction = camera_to_world(prediction, R=cam['orientation'], t=cam['translation'])
                ground_truth = camera_to_world(ground_truth, R=cam['orientation'], t=cam['translation'])
            else:
                # If the ground truth is not available, take the camera extrinsic params from a random subject.
                # They are almost the same, and anyway, we only need this for visualization purposes.
                for subject in dataset.cameras():
                    if 'orientation' in dataset.cameras()[subject][args.viz_camera]:
                        rot = dataset.cameras()[subject][args.viz_camera]['orientation']
                        break
                prediction = camera_to_world(prediction, R=rot, t=0)
                # We don't have the trajectory, but at least we can rebase the height
                prediction[:, :, 2] -= np.min(prediction[:, :, 2])

            anim_output = {'Reconstruction': prediction}
            if ground_truth is not None and not args.viz_no_ground_truth:
                anim_output['Ground truth'] = ground_truth

            input_keypoints = image_coordinates(input_keypoints[..., :2], w=cam['res_w'], h=cam['res_h'])

            print("Writing to json")
            import json
            # format the data in the same format as mediapipe, so we can load it in unity with the same script
            # we need a list (frames) of lists of 3d landmarks.
            # but prediction[] only has 17 landmarks, and we need 25 in our unity script
            unity_landmarks = prediction.tolist()
            with open(args.output_json, "w") as json_file:
                json.dump(unity_landmarks, json_file)

            if args.rendervideo == "yes":
                from common.visualization import render_animation
                render_animation(input_keypoints, keypoints_metadata, anim_output,
                                 dataset.skeleton(), dataset.fps(), args.viz_bitrate, cam['azimuth'],
                                 args.viz_output, limit=args.viz_limit, downsample=args.viz_downsample,
                                 size=args.viz_size, input_video_path=args.viz_video,
                                 viewport=(cam['res_w'], cam['res_h']), input_video_skip=args.viz_skip)
if torch.cuda.is_available():
    model_pos = model_pos.cuda()
    model_pos_train = model_pos_train.cuda()

if args.resume or args.evaluate:
    chk_filename = os.path.join(args.checkpoint, args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
    print('This model was trained for {} epochs'.format(checkpoint['epoch']))
    model_pos_train.load_state_dict(checkpoint['model_pos'])
    model_pos.load_state_dict(checkpoint['model_pos'])
    print('Checkpoint loading finished.')

test_generator = UnchunkedGenerator(cameras_valid, poses_valid, poses_valid_2d, pad=pad,
                                    causal_shift=causal_shift, augment=False,
                                    kps_left=kps_left, kps_right=kps_right,
                                    joints_left=joints_left, joints_right=joints_right)
print('INFO: Testing on {} frames'.format(test_generator.num_frames()))

if not args.evaluate:
    cameras_train, poses_train, poses_train_2d = fetch(subjects_train, action_filter, subset=args.subset)
    lr = args.learning_rate
    if semi_supervised:
        cameras_semi, _, poses_semi_2d = fetch(subjects_semi, action_filter, parse_3d_poses=False)

        if not args.disable_optimizations and not args.dense and args.stride == 1:
            # Use optimized model for single-frame predictions
            model_traj_train = TemporalModelOptimized1f(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1], 1,
                                                        filter_widths=filter_widths, causal=args.causal,
                                                        dropout=args.dropout, channels=args.channels)
        else:
print('INFO: Trainable parameter count:', model_params)

if torch.cuda.is_available():
    model_pos = model_pos.cuda()
    # model_pos_train = model_pos_train.cuda()

if args.resume or args.evaluate:
    chk_filename = os.path.join(args.checkpoint, args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
    print('This model was trained for {} epochs'.format(checkpoint['epoch']))
    # model_pos_train.load_state_dict(checkpoint['model_pos'])
    model_pos.load_state_dict(checkpoint['model_pos'])

test_generator = UnchunkedGenerator(cameras_valid, poses_valid, poses_valid_2d, pad=pad,
                                    causal_shift=causal_shift, augment=False,
                                    kps_left=kps_left, kps_right=kps_right,
                                    joints_left=joints_left, joints_right=joints_right)
print('INFO: Testing on {} frames'.format(test_generator.num_frames()))


def evaluate(test_generator, action=None, return_predictions=False):
    epoch_loss_3d_pos = 0
    epoch_loss_3d_pos_procrustes = 0
    epoch_loss_3d_pos_scale = 0
    epoch_loss_3d_vel = 0
    with torch.no_grad():
        model_pos.eval()
        N = 0
        for _, batch, batch_2d in test_generator.next_epoch():
            inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
            if torch.cuda.is_available():
model_pos.load_state_dict(checkpoint['model_pos'])

ckpt, time3 = ckpt_time(time2)
print('load 3D pose spend {:.2f} second'.format(ckpt))

# Receptive field: 243 frames for args.arc [3, 3, 3, 3, 3]
receptive_field = model_pos.receptive_field()
pad = (receptive_field - 1) // 2  # Padding on each side
causal_shift = 0

print('Rendering...')
# import ipdb; ipdb.set_trace()
input_keypoints = keypoints.copy()
gen = UnchunkedGenerator(None, None, [input_keypoints], pad=pad, causal_shift=causal_shift,
                         augment=args.test_time_augmentation,
                         kps_left=kps_left, kps_right=kps_right,
                         joints_left=joints_left, joints_right=joints_right)
prediction = evaluate(gen, return_predictions=True)

# If the ground truth is not available, take the camera extrinsic params from a random subject.
# They are almost the same, and anyway, we only need this for visualization purposes.
# for subject in dataset.cameras():
#     if 'orientation' in dataset.cameras()[subject][args.viz_camera]:
#         rot = dataset.cameras()[subject][args.viz_camera]['orientation']
#         break
rot = cam['orientation']
tran = cam['translation']
prediction = camera_to_world(prediction, R=rot, t=tran)
# We don't have the trajectory, but at least we can rebase the height
class Predictor:
    def __init__(self, dataset_path, checkpoint_path, input_video_path=None,
                 export_path=None, output_path=None, with_cuda=False):
        self.with_cuda = with_cuda
        self.dataset_path = dataset_path
        self.export_path = export_path
        self.output_path = output_path
        self.input_video_path = input_video_path
        self.dataset = CustomDataset(self.dataset_path)
        self.keypoints = None
        self.keypoints_left = None
        self.keypoints_right = None
        self.joints_left = None
        self.joints_right = None
        self.checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
        self.model = None
        self.init_keypoints()
        self.valid_poses = self.keypoints["detectron2"]["custom"]
        self.init_model()
        self.test_generator = None
        self.init_generator()
        self.prediction = None
        self.make_prediction()

    def export_prediction(self):
        if self.export_path is not None:
            np.save(self.export_path, self.prediction)

    def init_model(self):
        self.model = TemporalModel(self.valid_poses[0].shape[-2], self.valid_poses[0].shape[-1],
                                   self.dataset.skeleton().num_joints(),
                                   filter_widths=[3, 3, 3, 3, 3], causal=False,
                                   dropout=0.25, channels=1024, dense=False)
        self.model.load_state_dict(self.checkpoint['model_pos'])

    def init_keypoints(self):
        self.keypoints = np.load(self.dataset_path, allow_pickle=True)
        keypoints_metadata = self.keypoints['metadata'].item()
        keypoints_symmetry = keypoints_metadata['keypoints_symmetry']
        self.keypoints_left, self.keypoints_right = list(keypoints_symmetry[0]), list(keypoints_symmetry[1])
        self.joints_left, self.joints_right = list(self.dataset.skeleton().joints_left()), \
                                              list(self.dataset.skeleton().joints_right())
        self.keypoints = self.keypoints['positions_2d'].item()

        for subject in self.keypoints.keys():
            for action in self.keypoints[subject]:
                for cam_idx, kps in enumerate(self.keypoints[subject][action]):
                    # Normalize camera frame
                    cam = self.dataset.cameras()[subject][cam_idx]
                    kps[..., :2] = normalize_screen_coordinates(kps[..., :2], w=cam['res_w'], h=cam['res_h'])
                    self.keypoints[subject][action][cam_idx] = kps

    def init_generator(self):
        receptive_field = self.model.receptive_field()
        pad = (receptive_field - 1) // 2
        causal_shift = 0
        self.test_generator = UnchunkedGenerator(None, None, self.valid_poses,
                                                 pad=pad, causal_shift=causal_shift, augment=False,
                                                 kps_left=self.keypoints_left, kps_right=self.keypoints_right,
                                                 joints_left=self.joints_left, joints_right=self.joints_right)

    def make_prediction(self):
        if self.with_cuda:
            self.model = self.model.cuda()
        with torch.no_grad():
            self.model.eval()
            for _, batch, batch_2d in self.test_generator.next_epoch():
                inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
                if self.with_cuda:
                    inputs_2d = inputs_2d.cuda()
                predicted_3d_pos = self.model(inputs_2d)

                if self.test_generator.augment_enabled():
                    predicted_3d_pos[1, :, :, 0] *= -1
                    predicted_3d_pos[1, :, self.joints_left + self.joints_right] = \
                        predicted_3d_pos[1, :, self.joints_right + self.joints_left]
                    predicted_3d_pos = torch.mean(predicted_3d_pos, dim=0, keepdim=True)

                predicted_3d_pos = predicted_3d_pos.squeeze(0).cpu().numpy()
                rot = self.dataset.cameras()['detectron2'][0]['orientation']
                predicted_3d_pos = camera_to_world(predicted_3d_pos, R=rot, t=0)
                predicted_3d_pos[:, :, 2] -= np.min(predicted_3d_pos[:, :, 2])
                self.prediction = predicted_3d_pos

    def plot_pose(self, pose_index=0):
        pose = make_pose(self.prediction.tolist()[pose_index])
        pose.prepare_plot()
        pose.plot()
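A minimal usage sketch for the Predictor class above. The file paths are placeholders, not paths from the original project; prediction runs in the constructor, so exporting and plotting are the only calls needed afterwards.

# Hypothetical paths, for illustration only.
predictor = Predictor(dataset_path='data/data_2d_custom_myvideo.npz',
                      checkpoint_path='checkpoint/pretrained_model.bin',
                      export_path='outputs/prediction_3d.npy',
                      with_cuda=torch.cuda.is_available())
predictor.export_prediction()      # saves the (T, J, 3) world-space poses to export_path
predictor.plot_pose(pose_index=0)  # render the first reconstructed frame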
if not args.render: print("Invalid argument:", args.render) # 渲染3D姿势 input_keypoints = keypoints[args.viz_subject][args.viz_action][ args.viz_camera].copy() print('INFO: this action is unlabeled. Ground truth will not be rendered.') print('kps_left:', kps_left, 'kps_right:', kps_right) print('joints_left:', joints_left, 'joints_right:', joints_right) gen = UnchunkedGenerator(None, None, [input_keypoints], pad=pad, causal_shift=causal_shift, augment=args.test_time_augmentation, kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right) print('INFO: Testing on {} frames'.format(gen.num_frames())) prediction = evaluate(gen) print('prediction.shape: ', prediction.shape) if args.viz_export is not None: print('Exporting joint positions to', args.viz_export) # Predictions are in camera space np.save(args.viz_export, prediction) if args.viz_output is not None:
def main(args):
    # Step 1: detect 2D keypoints
    detector_2d = get_detector_2d(args.detector_2d)
    assert detector_2d, 'detector_2d should be in ({alpha, hr, open}_pose)'

    # Load or generate the 2D keypoints
    if not args.input_npz:
        video_name = args.viz_video
        keypoints = detector_2d(video_name)
    else:
        npz = np.load(args.input_npz)
        keypoints = npz['kpts']  # (N, 17, 2)

    # Step 2: lift the 2D keypoints to 3D
    keypoints_symmetry = metadata['keypoints_symmetry']
    kps_left, kps_right = list(keypoints_symmetry[0]), list(keypoints_symmetry[1])
    joints_left, joints_right = list([4, 5, 6, 11, 12, 13]), list([1, 2, 3, 14, 15, 16])

    # Normalize the keypoints, assuming a fixed camera resolution (w=1000, h=1002)
    keypoints = normalize_screen_coordinates(keypoints[..., :2], w=1000, h=1002)

    model_pos = TemporalModel(17, 2, 17, filter_widths=[3, 3, 3, 3, 3],
                              causal=args.causal, dropout=args.dropout,
                              channels=args.channels, dense=args.dense)
    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    ckpt, time1 = ckpt_time(time0)
    print('-------------- load data spends {:.2f} seconds'.format(ckpt))

    # Load the trained model
    chk_filename = os.path.join(args.checkpoint, args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)  # map storage to CPU
    model_pos.load_state_dict(checkpoint['model_pos'])

    ckpt, time2 = ckpt_time(time1)
    print('-------------- load 3D model spends {:.2f} seconds'.format(ckpt))

    # Receptive field: 243 frames for args.arc [3, 3, 3, 3, 3]
    receptive_field = model_pos.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    print('Rendering...')
    input_keypoints = keypoints.copy()
    gen = UnchunkedGenerator(None, None, [input_keypoints],
                             pad=pad, causal_shift=causal_shift, augment=args.test_time_augmentation,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)

    # Save the 3D joint positions
    np.save('outputs/test_3d_output.npy', prediction, allow_pickle=True)

    # Step 3: transform the predicted 3D points from camera coordinates to world coordinates
    # (1) First conversion method
    rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32)
    prediction = camera_to_world(prediction, R=rot, t=0)
    # We don't have the trajectory, but at least we can rebase the height:
    # subtract the minimum Z value so that all Z values are non-negative
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])

    # (2) Second conversion method
    # subject = 'S1'
    # cam_id = '55011271'
    # cam_params = load_camera_params('./camera/cameras.h5')[subject][cam_id]
    # R = cam_params['R']
    # T = 0
    # azimuth = cam_params['azimuth']
    # prediction = camera2world(pose=prediction, R=R, T=T)
    # prediction[:, :, 2] -= np.min(prediction[:, :, 2])  # rebase the height

    # Step 4: export the 3D keypoints and convert the predicted 3D points to a BVH skeleton
    # Export the 3D predictions
    write_3d_point(args.viz_output, prediction)
    # Convert the predicted 3D skeleton points to BVH
    prediction_copy = np.copy(prediction)
    write_standard_bvh(args.viz_output, prediction_copy)   # convert to a standard BVH skeleton
    write_smartbody_bvh(args.viz_output, prediction_copy)  # convert to the BVH skeleton required by SmartBody

    anim_output = {'Reconstruction': prediction}
    input_keypoints = image_coordinates(input_keypoints[..., :2], w=1000, h=1002)

    ckpt, time3 = ckpt_time(time2)
    print('-------------- generate reconstruction 3D data spends {:.2f} seconds'.format(ckpt))
    if not args.viz_output:
        args.viz_output = 'outputs/outputvideo/alpha_result.mp4'

    # Step 5: generate the output video
    # from common.visualization import render_animation
    # render_animation(input_keypoints, anim_output,
    #                  Skeleton(), 25, args.viz_bitrate, np.array(70., dtype=np.float32), args.viz_output,
    #                  limit=args.viz_limit, downsample=args.viz_downsample, size=args.viz_size,
    #                  input_video_path=args.viz_video, viewport=(1000, 1002),
    #                  input_video_skip=args.viz_skip)

    ckpt, time4 = ckpt_time(time3)
    print('total time spent: {:.2f} seconds'.format(ckpt))
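The timing calls throughout main() rely on a ckpt_time helper that is not shown in this excerpt. Below is a minimal sketch consistent with how it is called here (no argument returns a timestamp; a previous timestamp returns the elapsed seconds together with a fresh timestamp). The exact implementation in the original project may differ.

import time

def ckpt_time(prev=None):
    """Hedged sketch of the timing helper used above.
    ckpt_time() -> current timestamp (e.g. time0);
    ckpt_time(prev) -> (elapsed_seconds, new_timestamp)."""
    now = time.time()
    if prev is None:
        return now
    return now - prev, now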