Example #1
    def init_generator(self):
        receptive_field = self.model.receptive_field()
        pad = (receptive_field - 1) // 2  # padding on each side
        causal_shift = 0
        self.test_generator = UnchunkedGenerator(
            None,
            None,
            self.valid_poses,
            pad=pad,
            causal_shift=causal_shift,
            augment=False,
            kps_left=self.keypoints_left,
            kps_right=self.keypoints_right,
            joints_left=self.joints_left,
            joints_right=self.joints_right)
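Below is a minimal sketch (not part of the original class) of how a generator built this way is typically consumed; it follows the next_epoch() pattern used by the evaluate() functions further down this page, and the helper name predict_poses plus the model_pos argument are assumptions.

import torch

def predict_poses(generator, model_pos):
    outputs = []
    with torch.no_grad():
        model_pos.eval()
        # next_epoch() yields (camera, batch_3d, batch_2d); inference only needs the 2D batch
        for _, _, batch_2d in generator.next_epoch():
            inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
            outputs.append(model_pos(inputs_2d).squeeze(0).cpu().numpy())
    return outputs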
Example #2
    def run_evaluation(actions, action_filter=None):
        errors_p1 = []
        errors_p2 = []

        for action_key in actions.keys():
            if action_filter is not None:
                found = False
                for a in action_filter:
                    if action_key.startswith(a):
                        found = True
                        break
                if not found:
                    continue

            poses_act, poses_2d_act = fetch_actions(actions[action_key])
            gen = UnchunkedGenerator(None,
                                     poses_act,
                                     poses_2d_act,
                                     pad=pad,
                                     causal_shift=causal_shift,
                                     augment=args.test_time_augmentation,
                                     kps_left=kps_left,
                                     kps_right=kps_right,
                                     joints_left=joints_left,
                                     joints_right=joints_right)
            e1, e2 = evaluate(gen, model_pos, joints_left, joints_right,
                              action_key)
            errors_p1.append(e1)
            errors_p2.append(e2)

        print('Protocol #1   (MPJPE) action-wise average:',
              round(np.mean(errors_p1), 1), 'mm')
        print('Protocol #2 (P-MPJPE) action-wise average:',
              round(np.mean(errors_p2), 1), 'mm')
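For context, the pad and causal_shift values captured by run_evaluation above are normally derived from the model's receptive field, as Examples #18 and #20 below do; here is a minimal sketch (the helper name make_eval_padding is hypothetical):

def make_eval_padding(model_pos, causal=False):
    receptive_field = model_pos.receptive_field()
    pad = (receptive_field - 1) // 2  # padding on each side
    causal_shift = pad if causal else 0  # shift the whole pad into the past for causal convolutions
    return pad, causal_shift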
Example #3
def gen_pose_frame(kpts, width, height, model_pos, pad, causal_shift=0):
    # kpts: (M, T, N, 2)
    norm_seqs = []
    for kpt in kpts:
        norm_kpt = normalize_screen_coordinates(kpt, w=width, h=height)
        norm_seqs.append(norm_kpt)

    gen = UnchunkedGenerator(None,
                             None,
                             norm_seqs,
                             pad=pad,
                             causal_shift=causal_shift,
                             augment=True,
                             kps_left=kps_left,
                             kps_right=kps_right,
                             joints_left=joints_left,
                             joints_right=joints_right)
    prediction = evaluate(gen, model_pos)

    prediction_to_world = []
    for i in range(len(prediction)):
        sub_prediction = prediction[i][0]
        sub_prediction = camera_to_world(sub_prediction, R=rot, t=0)
        sub_prediction[:, 2] -= np.amin(sub_prediction[:, 2])
        prediction_to_world.append(sub_prediction)

    return prediction_to_world
Example #4
def joints_2d_generator(joints_coords, pose3d_predictor, padding=False):
    """
    Generator over 2D joint coordinates.
    Args:
        joints_coords: 2D joint coordinates
        pose3d_predictor: 3D pose model, used to derive the padding
        padding: whether to pad the sequence to the model's receptive field

    Returns: generator

    """
    if not padding:
        pad = 0
    else:
        receptive_field = pose3d_predictor.receptive_field()
        pad = receptive_field // 2

    causal_shift = 0
    kps_left = [1, 3, 5, 7, 9, 11, 13, 15]
    kps_right = [2, 4, 6, 8, 10, 12, 14, 16]
    generator = UnchunkedGenerator(None,
                                   None, [joints_coords],
                                   pad=pad,
                                   causal_shift=causal_shift,
                                   augment=True,
                                   kps_left=kps_left,
                                   kps_right=kps_right)
    return generator
Example #5
    def run_evaluation(actions, action_filter=None):
        errors_p1 = []
        errors_p2 = []
        errors_p3 = []
        errors_vel = []

        for action_key in actions.keys():
            if action_filter is not None:
                found = False
                for a in action_filter:
                    if action_key.startswith(a):
                        found = True
                        break
                if not found:
                    continue

            poses_act, poses_2d_act = fetch_actions(actions[action_key])
            gen = UnchunkedGenerator(None, poses_act, poses_2d_act,
                                     pad=pad, causal_shift=causal_shift, augment=args.test_time_augmentation,
                                     kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right)
            #prediction=evaluate(gen,action_key)
            #np.save(('3dcoord'+action_key+'.npy'),prediction)   
            #'''
            e1, e2, e3, ev = evaluate(gen, action_key)
            errors_p1.append(e1)
            errors_p2.append(e2)
            errors_p3.append(e3)
            errors_vel.append(ev)

        print('Protocol #1   (MPJPE) action-wise average:', round(np.mean(errors_p1), 1), 'mm')
        print('Protocol #2 (P-MPJPE) action-wise average:', round(np.mean(errors_p2), 1), 'mm')
        print('Protocol #3 (N-MPJPE) action-wise average:', round(np.mean(errors_p3), 1), 'mm')
        print('Velocity      (MPJVE) action-wise average:', round(np.mean(errors_vel), 2), 'mm')
Example #6
def interface(model_pos, keypoints, W, H):
    # input (N, 17, 2) return (N, 17, 3)
    if not isinstance(keypoints, np.ndarray):
        keypoints = np.array(keypoints)

    from common.camera import normalize_screen_coordinates_new, camera_to_world, normalize_screen_coordinates
    #  keypoints = normalize_screen_coordinates_new(keypoints[..., :2], w=W, h=H)
    # NOTE: the width/height are hardcoded to 1000x1002 here; the W and H arguments are not used.
    keypoints = normalize_screen_coordinates(keypoints[..., :2],
                                             w=1000,
                                             h=1002)
    input_keypoints = keypoints.copy()
    # test_time_augmentation True
    from common.generators import UnchunkedGenerator
    gen = UnchunkedGenerator(None,
                             None, [input_keypoints],
                             pad=common.pad,
                             causal_shift=common.causal_shift,
                             augment=True,
                             kps_left=common.kps_left,
                             kps_right=common.kps_right,
                             joints_left=common.joints_left,
                             joints_right=common.joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)
    prediction = camera_to_world(prediction, R=common.rot, t=0)
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    return prediction
Example #7
def predict(img_path):
    # 1. Detect keypoints and display them
    # Preprocess the input image and detect people
    x, img = data.transforms.presets.yolo.load_test(img_path, short=256)
    # print("Shape of pre-processed image:", x.shape)

    start = time.time()

    # detect persons and bbox
    class_ids, scores, bounding_boxes = detector(x)

    # 2. Preprocess the detector's output tensors as input to alpha_pose
    pose_input, upscale_bbox = detector_to_simple_pose(img, class_ids, scores, bounding_boxes)

    global detector_time
    detector_time += (time.time() - start)

    print("detector cost time: {:.3f} seconds".format(time.time() - start))
    prepare_end = time.time()

    # 3. Predict keypoints
    if pose_input is None:
        return None, None
    predicted_heatmap = pose_net(pose_input)
    pred_coords, confidence = heatmap_to_coord(predicted_heatmap, upscale_bbox)
    global predictor_2d_time
    predictor_2d_time += (time.time() - prepare_end)
    print("2d pose predictor cost time: {:.3f} seconds".format(time.time() - prepare_end))

    # 4. Display the 2D pose
    # utils.viz.plot_keypoints(img, pred_coords, confidence, class_IDs, bounding_boxes, scores, box_thresh=0.5,
    #                          keypoint_thresh=0.2)

    # 5. Normalize coordinates
    prepare_end = time.time()
    kps = normalize_screen_coordinates(pred_coords.asnumpy(), w=img.shape[1], h=img.shape[0])

    receptive_field = pose3d_predictor.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    # 6. Create a generator as input to the 3D predictor
    generator = UnchunkedGenerator(None, None, [kps], pad=pad, causal_shift=causal_shift, augment=False)

    # 7. 3D pose estimation and display
    prediction = predict_3d_pos(generator, pose3d_predictor)
    global full_time, predictor_3d_time
    predictor_3d_time += time.time() - prepare_end
    full_time += time.time() - start
    print("3d predictor time: {:.3f} seconds".format(time.time() - prepare_end))

    rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32)
    prediction = camera_to_world(prediction, R=rot, t=0)
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    return prediction, img
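A hypothetical driver loop for the predict() function above, assuming the module-level objects it relies on (detector, pose_net, heatmap_to_coord, pose3d_predictor and the timing globals) are initialized and that a frames/ directory of JPEG images exists:

import glob

for img_path in sorted(glob.glob('frames/*.jpg')):
    prediction, img = predict(img_path)
    if prediction is None:
        continue  # no person was detected in this frame
    print(img_path, prediction.shape)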
Example #8
def run_evaluation(actions, action_filter=None):
    errors_p1 = []
    errors_p2 = []
    errors_p3 = []
    errors_vel = []

    print('ACTIONS:::::', actions)

    for action_key in actions.keys():
        if action_filter is not None:
            found = False
            for a in action_filter:
                if action_key.startswith(a):
                    found = True
                    break
            if not found:
                continue

        # poses_act, poses_2d_act = fetch_actions(actions[action_key])
        print(sets['test']['sub'])
        print(sets['test']['act'])
        _, poses_act, poses_2d_act = fetch(sets['test']['sub'],
                                           sets['test']['act'])
        print('EVAL GEN:::::')
        print(np.shape(poses_act))
        print(np.shape(poses_2d_act))
        print(np.shape(poses_act[0]))
        print(np.shape(poses_2d_act[0]))
        gen = UnchunkedGenerator(None,
                                 poses_act,
                                 poses_2d_act,
                                 pad=pad,
                                 causal_shift=causal_shift,
                                 augment=False,
                                 kps_left=kps_left,
                                 kps_right=kps_right,
                                 joints_left=joints_left,
                                 joints_right=joints_right)
        e1, e2, e3, ev = evaluate(gen, action_key)
        errors_p1.append(e1)
        errors_p2.append(e2)
        errors_p3.append(e3)
        errors_vel.append(ev)

    print('Protocol #1   (MPJPE) action-wise average:',
          round(np.mean(errors_p1), 1), 'mm')
    print('Protocol #2 (P-MPJPE) action-wise average:',
          round(np.mean(errors_p2), 1), 'mm')
    print('Protocol #3 (N-MPJPE) action-wise average:',
          round(np.mean(errors_p3), 1), 'mm')
    print('Velocity      (MPJVE) action-wise average:',
          round(np.mean(errors_vel), 2), 'mm')
Example #9
def predict_images(image_dir: str = '../images'):
    import os
    filenames = os.listdir(image_dir)
    image_files = [os.path.join(image_dir, fn) for fn in filenames]
    for i, img_file in enumerate(image_files):
        figure = plt.figure(figsize=(12, 6), dpi=100)
        img = cv2.imread(img_file)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        h, w, _ = img.shape
        hw = (512, int(h / w * 512))
        img = cv2.resize(img, hw)

        # Predict the 2D pose
        start = time.time()
        kps = predict_kps(kps_predictor, img)
        # print(kps)
        print("Spending {:.2f} seconds to predict 2d pose.".format(time.time() - start))
        # Normalize; drop the confidence column and keep only the coordinate values
        kps = normalize_screen_coordinates(kps[..., :2], w=img.shape[1], h=img.shape[0])

        kps = torch.from_numpy(kps).unsqueeze(0).numpy()
        # Create a generator as input to the 3D predictor
        generator = UnchunkedGenerator(None, None, [kps], pad=pad, causal_shift=causal_shift, augment=False)

        # 3D pose estimation
        start = time.time()

        prediction = predict_3d_pos(generator, pose3d_predictor)
        rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32)
        prediction = camera_to_world(prediction, R=rot, t=0)

        # We don't have the trajectory, but at least we can rebase the height
        prediction[:, :, 2] -= np.min(prediction[:, :, 2])
        pos_3d = {'Reconstruction': prediction}

        # Render the image
        render_image(pos_3d=pos_3d, skeleton=Skeleton(),
                     azim=np.array(70., dtype=np.float32),
                     input_video_frame=img, fig=figure)

        elapsed = time.time() - start
        print("Spending {:.2f} seconds to predict image: {}".format(elapsed, img_file))

        figure.tight_layout()

        plt.savefig("images/" + str(i + 1) + '.png', bbox_inches='tight')
        plt.close()
Example #10
def predict_3d_joints(predictor, coords_2d, w, h):
    # Normalize coordinates
    kps = normalize_screen_coordinates(coords_2d, w, h)
    # print('kps.type: {}, kps.shape: {}'.format(type(kps), kps.shape))

    # 2D keypoints generator
    receptive_field = predictor.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    # Create a generator as input to the 3D predictor
    generator = UnchunkedGenerator(None, None, [kps], pad=pad, causal_shift=causal_shift, augment=False)
    prediction = predict_3d_pos(generator, predictor)
    prediction = camera_to_world(prediction, R=rot, t=0)
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    return prediction
Example #11
def joints_2d_generator(joints_coords):
    """
    Generator over 2D joint coordinates.
    Args:
        joints_coords: 2D joint coordinates

    Returns: generator

    """
    pad = 0
    causal_shift = 0
    kps_left = [1, 3, 5, 7, 9, 11, 13, 15]
    kps_right = [2, 4, 6, 8, 10, 12, 14, 16]
    generator = UnchunkedGenerator(None, None, [joints_coords], pad=pad, causal_shift=causal_shift, augment=True,
                                   kps_left=kps_left, kps_right=kps_right)
    return generator
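A hypothetical usage of the generator above, feeding it dummy COCO-format keypoints of shape (T, 17, 2); in practice the coordinates would come from a 2D detector and would already be normalized:

import numpy as np

dummy_coords = np.zeros((100, 17, 2), dtype=np.float32)
gen = joints_2d_generator(dummy_coords)
print(gen.num_frames())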
Example #12
def gen_pose(kpts,
             valid_frames,
             width,
             height,
             model_pos,
             pad,
             causal_shift=0):
    assert len(kpts.shape) == 4, 'The shape of kpts: {}'.format(kpts.shape)
    assert kpts.shape[0] == len(valid_frames)

    norm_seqs = []
    for index, frames in enumerate(valid_frames):
        seq_kps = kpts[index, frames]
        norm_seq_kps = normalize_screen_coordinates(seq_kps, w=width, h=height)
        norm_seqs.append(norm_seq_kps)

    gen = UnchunkedGenerator(None,
                             None,
                             norm_seqs,
                             pad=pad,
                             causal_shift=causal_shift,
                             augment=True,
                             kps_left=kps_left,
                             kps_right=kps_right,
                             joints_left=joints_left,
                             joints_right=joints_right)
    prediction = evaluate(gen, model_pos)

    prediction_to_world = []
    for i in range(len(prediction)):
        sub_prediction = prediction[i]

        sub_prediction = camera_to_world(sub_prediction, R=rot, t=0)

        # sub_prediction[:, :, 2] -= np.expand_dims(np.amin(sub_prediction[:, :, 2], axis=1), axis=1).repeat([17], axis=1)
        # sub_prediction[:, :, 2] -= np.amin(sub_prediction[:, :, 2])

        prediction_to_world.append(sub_prediction)

    # prediction_to_world = np.asarray(prediction_to_world, dtype=np.float32)
    return prediction_to_world
Example #13
def gen_pose_frame_(kpts, width, height, model_pos, pad, causal_shift=0):
    # input (N, 17, 2) return (N, 17, 3)
    if not isinstance(kpts, np.ndarray):
        kpts = np.array(kpts)

    keypoints = normalize_screen_coordinates(kpts[..., :2], w=width, h=height)

    input_keypoints = keypoints.copy()
    # test_time_augmentation True
    from common.generators import UnchunkedGenerator
    gen = UnchunkedGenerator(None,
                             None, [input_keypoints],
                             pad=pad,
                             causal_shift=causal_shift,
                             augment=True,
                             kps_left=kps_left,
                             kps_right=kps_right,
                             joints_left=joints_left,
                             joints_right=joints_right)
    prediction = evaluate(gen, model_pos)
    prediction = camera_to_world(prediction[0], R=rot, t=0)
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    return prediction
Example #14
                                   dense=args.dense)
        if torch.cuda.is_available():
            model_traj = model_traj.cuda()
        model_traj.load_state_dict(checkpoint['model_traj'])
    else:
        model_traj = None

# print(poses_valid)
# print(poses_valid_2d)
# print(np.shape(poses_valid))
# print(np.shape(poses_valid_2d))
test_generator = UnchunkedGenerator(cameras_valid,
                                    poses_valid,
                                    poses_valid_2d,
                                    pad=pad,
                                    causal_shift=causal_shift,
                                    augment=False,
                                    kps_left=kps_left,
                                    kps_right=kps_right,
                                    joints_left=joints_left,
                                    joints_right=joints_right)
print('INFO: Testing on {} frames'.format(test_generator.num_frames()))

if not args.evaluate:
    print(sets)
    cameras_train, poses_train, poses_train_2d = fetch(sets['train']['sub'],
                                                       sets['train']['act'],
                                                       subset=args.subset)

    lr = args.learning_rate

    optimizer = optim.Adam(model_pos_train.parameters(), lr=lr, amsgrad=True)
Example #15
if args['resume'] or args['evaluate']:
    chk_filename = os.path.join(
        args['checkpoint'],
        args['resume'] if args['resume'] else args['evaluate'])
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(chk_filename,
                            map_location=lambda storage, loc: storage)
    print('This model was trained for {} epochs'.format(checkpoint['epoch']))
    model_pos_train.load_state_dict(checkpoint['model_pos'])
    model_pos.load_state_dict(checkpoint['model_pos'])

test_generator = UnchunkedGenerator(cameras_valid,
                                    poses_valid,
                                    poses_valid_2d,
                                    pad=121,
                                    causal_shift=causal_shift,
                                    augment=False,
                                    kps_left=kps_left,
                                    kps_right=kps_right,
                                    joints_left=joints_left,
                                    joints_right=joints_right)
print('Evaluating...')
all_actions = {}
all_actions_by_subject = {}
for subject in subjects_test:
    if subject not in all_actions_by_subject:
        all_actions_by_subject[subject] = {}

    for action in dataset[subject].keys():
        action_name = action.split(' ')[0]
        if action_name not in all_actions:
            all_actions[action_name] = []
Example #16
def predict(img_path):
    # 1. Preprocess the input image and detect people
    x, img = data.transforms.presets.yolo.load_test(img_path, short=256)
    # detector.summary(x)
    # print("x.shape:", x.shape)

    start = time.time()

    # detect persons and bbox,
    class_ids, scores, bounding_boxes = detector(x)  # shape: [sample_idx, class_idx, instance]
    # print("bounding_boxes.shape", bounding_boxes.shape, "bounding_boxes[0, 0]:", bounding_boxes[0, 0])

    # 2. Preprocess the detector's output tensors as input to mobile_pose
    pose_input, upscale_bbox = detector_to_mobile_pose(img, class_ids, scores, bounding_boxes)
    print("detector cost time: {:.3f} seconds".format(time.time() - start))
    global detector_time
    detector_time += (time.time() - start)

    if pose_input is None:
        return None, None
    # 4. Predict 2D keypoints
    # pose_net.summary(pose_input)
    start_time = time.time()
    predicted_heatmap = pose_net(pose_input)
    pred_coords, confidence = heatmap_to_coord(predicted_heatmap, upscale_bbox)
    # print("type(pre_coords): {}, shape(pre_coords): {}".format(type(pred_coords), pred_coords.shape))
    # print("pred_coords: {}".format(pred_coords))
    global predictor_2d_time
    predictor_2d_time += (time.time() - start_time)
    print("2d pose predictor cost time: {:.3f} seconds".format(time.time() - start_time))

    # 5. Display the 2D pose
    # ax = utils.viz.plot_keypoints(img, pred_coords, confidence, class_IDs, bounding_boxes, scores, box_thresh=0.5,
    #                               keypoint_thresh=0.2)
    # print(pred_coords)
    # 6. Normalize coordinates
    start_time = time.time()
    kps = normalize_screen_coordinates(pred_coords.asnumpy(), w=img.shape[1], h=img.shape[0])
    # print('kps.type: {}, kps.shape: {}'.format(type(kps), kps.shape))

    # 7. 2D keypoints generator
    receptive_field = pose3d_predictor.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    # Create a generator as input to the 3D predictor
    generator = UnchunkedGenerator(None, None, [kps], pad=pad, causal_shift=causal_shift, augment=False)

    # 8. 3D pose estimation and display
    prediction = predict_3d_pos(generator, pose3d_predictor)
    global predictor_3d_time, full_time
    predictor_3d_time += (time.time() - start_time)
    full_time += (time.time() - start)
    print("3d pose predictor cost time: {:.3f} seconds".format(time.time() - start_time))
    # print("prediction.shape: ", prediction.shape)

    rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32)
    prediction = camera_to_world(prediction, R=rot, t=0)

    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    elapsed = time.time() - start
    print("Total elapsed time of predicting image file {}: {:.3f} seconds".format(img_path, elapsed))
    return prediction, img
Example #17
    causal_shift = 0

model_params = 0
for parameter in model_pos.parameters():
    model_params += parameter.numel()
print('INFO: Trainable parameter count:', model_params)

if torch.cuda.is_available():
    model_pos = model_pos.cuda()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
weight_path = 'checkpoint/VideoPose_030.weights'
model_pos.load_state_dict(torch.load(weight_path, map_location=device))

test_generator = UnchunkedGenerator(cameras_valid, poses_valid, poses_valid_2d,
                                    pad=pad, causal_shift=causal_shift, augment=False,
                                    kps_left=kps_left, kps_right=kps_right, joints_left=joints_left,
                                    joints_right=joints_right)
print('INFO: Testing on {} frames'.format(test_generator.num_frames()))


# Evaluate
def evaluate(test_generator, action=None, return_predictions=False):
    epoch_loss_3d_pos = 0
    epoch_loss_3d_pos_procrustes = 0
    epoch_loss_3d_pos_scale = 0
    epoch_loss_3d_vel = 0
    with torch.no_grad():
        model_pos.eval()
        N = 0
        for _, batch, batch_2d in test_generator.next_epoch():
            inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
Example #18
def main():
    args = parse_args()
    args.input_npz = "data/VideoPose_test.npz"
    metadata = {
        'layout_name': 'coco',
        'num_joints': 17,
        'keypoints_symmetry': [[1, 3, 5, 7, 9, 11, 13, 15],
                               [2, 4, 6, 8, 10, 12, 14, 16]]
    }

    npz = np.load(args.input_npz)
    keypoints = npz['kpts']
    keypoints_symmetry = metadata['keypoints_symmetry']
    kps_left, kps_right = list(keypoints_symmetry[0]), list(
        keypoints_symmetry[1])
    # same as the original: list(dataset.skeleton().joints_left()), list(dataset.skeleton().joints_right())
    joints_left, joints_right = list([4, 5, 6, 11, 12,
                                      13]), list([1, 2, 3, 14, 15, 16])

    # Normalize keypoints, assuming the camera parameters below
    res_w = 1920
    res_h = 1080
    keypoints = normalize_screen_coordinates(keypoints[..., :2],
                                             w=res_w,
                                             h=res_h)

    #model_pos = TemporalModel(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1], dataset.skeleton().num_joints(),
    #                        filter_widths=filter_widths, causal=args.causal, dropout=args.dropout, channels=args.channels,
    #                        dense=args.dense)
    model_pos = TemporalModel(17,
                              2,
                              17,
                              filter_widths=[3, 3, 3, 3, 3],
                              causal=args.causal,
                              dropout=args.dropout,
                              channels=args.channels,
                              dense=args.dense)

    receptive_field = model_pos.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2  # Padding on each side
    if args.causal:
        print('INFO: Using causal convolutions')
        causal_shift = pad
    else:
        causal_shift = 0

    model_params = 0
    for parameter in model_pos.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    # load model
    chk_filename = os.path.join(args.checkpoint,
                                args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(chk_filename,
                            map_location=lambda storage, loc: storage)
    print('This model was trained for {} epochs'.format(checkpoint['epoch']))
    model_pos.load_state_dict(checkpoint['model_pos'])

    #test_generator = UnchunkedGenerator(cameras_valid, poses_valid, poses_valid_2d,
    #                                pad=pad, causal_shift=causal_shift, augment=False,
    #                                kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right)

    input_keypoints = keypoints.copy()
    gen = UnchunkedGenerator(None,
                             None, [input_keypoints],
                             pad=pad,
                             causal_shift=causal_shift,
                             augment=args.test_time_augmentation,
                             kps_left=kps_left,
                             kps_right=kps_right,
                             joints_left=joints_left,
                             joints_right=joints_right)
    prediction = evaluate_alphapose(gen, model_pos, return_predictions=True)
    print('INFO: Testing on {} frames'.format(gen.num_frames()))

    if args.viz_export is not None:
        print('Exporting joint positions to', args.viz_export)
        # Predictions are in camera space
        np.save(args.viz_export, prediction)

    if args.viz_output is not None:
        #from custom_dataset.py
        rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804],
                       dtype=np.float32)
        prediction = camera_to_world(prediction, R=rot, t=0)

        # We don't have the trajectory, but at least we can rebase the height
        prediction[:, :, 2] -= np.min(prediction[:, :, 2])
        anim_output = {'Reconstruction': prediction}
        input_keypoints = image_coordinates(input_keypoints[..., :2],
                                            w=res_w,
                                            h=res_h)
        # Generate metadata:
        keypoints_metadata = {}
        keypoints_metadata['layout_name'] = 'coco'
        keypoints_metadata['num_joints'] = 17
        keypoints_metadata['keypoints_symmetry'] = [[
            1, 3, 5, 7, 9, 11, 13, 15
        ], [2, 4, 6, 8, 10, 12, 14, 16]]
        from common.visualization import render_animation
        #fps 25, azimuth 70
        render_animation(input_keypoints,
                         keypoints_metadata,
                         anim_output,
                         Skeleton(),
                         25,
                         args.viz_bitrate,
                         np.array(70., dtype=np.float32),
                         args.viz_output,
                         limit=args.viz_limit,
                         downsample=args.viz_downsample,
                         size=args.viz_size,
                         input_video_path=args.viz_video,
                         viewport=(res_w, res_h),
                         input_video_skip=args.viz_skip)
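A standard entry point for the script-style main() above (an assumption; the original snippet does not show how it is invoked):

if __name__ == '__main__':
    main()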
Example #19
if args.resume or args.evaluate:
    chk_filename = os.path.join(args.checkpoint,
                                args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(chk_filename,
                            map_location=lambda storage, loc: storage)
    model_pos_train.load_state_dict(checkpoint['model_pos'], strict=False)
    model_pos.load_state_dict(checkpoint['model_pos'], strict=False)

test_generator = UnchunkedGenerator(cameras_valid,
                                    poses_valid,
                                    poses_valid_2d,
                                    action_class_valid,
                                    pad=pad,
                                    causal_shift=causal_shift,
                                    augment=False,
                                    kps_left=kps_left,
                                    kps_right=kps_right,
                                    joints_left=joints_left,
                                    joints_right=joints_right)
print('INFO: Testing on {} frames'.format(test_generator.num_frames()))


def eval_data_prepare(receptive_field, inputs_2d, inputs_3d,
                      inputs_class_label):
    inputs_2d_p = torch.squeeze(inputs_2d)
    inputs_3d_p = inputs_3d.permute(1, 0, 2, 3)
    out_num = inputs_2d_p.shape[0] - receptive_field + 1
    eval_input_2d = torch.empty(out_num, receptive_field, inputs_2d_p.shape[1],
                                inputs_2d_p.shape[2])
Example #20
def main():
    args = parse_args()

    # 2D kpts loads or generate
    if not args.input_npz:
        # create kpts with Alphapose
        from joints_detectors.Alphapose.gene_npz import handle_video
        video_name = args.viz_video
        keypoints = handle_video(video_name)
    else:
        npz = np.load(args.input_npz)
        keypoints = npz['kpts']  #(N, 17, 2)

    keypoints_symmetry = metadata['keypoints_symmetry']
    kps_left, kps_right = list(keypoints_symmetry[0]), list(
        keypoints_symmetry[1])
    joints_left, joints_right = list([4, 5, 6, 11, 12,
                                      13]), list([1, 2, 3, 14, 15, 16])

    # Normalize keypoints, assuming these camera parameters
    keypoints = normalize_screen_coordinates(keypoints[..., :2],
                                             w=1000,
                                             h=1002)

    model_pos = TemporalModel(17,
                              2,
                              17,
                              filter_widths=[3, 3, 3, 3, 3],
                              causal=args.causal,
                              dropout=args.dropout,
                              channels=args.channels,
                              dense=args.dense)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    ckpt, time1 = ckpt_time(time0)
    print('------- load data spends {:.2f} seconds'.format(ckpt))

    # load trained model
    chk_filename = os.path.join(args.checkpoint,
                                args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', os.path.join(main_path, chk_filename))
    checkpoint = torch.load(
        os.path.join(main_path, chk_filename),
        map_location=lambda storage, loc: storage)  # map loc to storage
    model_pos.load_state_dict(checkpoint['model_pos'])

    ckpt, time2 = ckpt_time(time1)
    print('------- load 3D model spends {:.2f} seconds'.format(ckpt))

    #  Receptive field: 243 frames for args.arc [3, 3, 3, 3, 3]
    receptive_field = model_pos.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    print('Rendering...')
    input_keypoints = keypoints.copy()
    gen = UnchunkedGenerator(None,
                             None, [input_keypoints],
                             pad=pad,
                             causal_shift=causal_shift,
                             augment=args.test_time_augmentation,
                             kps_left=kps_left,
                             kps_right=kps_right,
                             joints_left=joints_left,
                             joints_right=joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)

    rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804],
                   dtype=np.float32)
    prediction = camera_to_world(prediction, R=rot, t=0)

    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    anim_output = {'Reconstruction': prediction}
    input_keypoints = image_coordinates(input_keypoints[..., :2],
                                        w=1000,
                                        h=1002)

    ckpt, time3 = ckpt_time(time2)
    print(
        '------- generate reconstruction 3D data spends {:.2f} seconds'.format(
            ckpt))

    if not args.viz_output:
        args.viz_output = 'result.mp4'

    from common.visualization import render_animation
    render_animation(input_keypoints,
                     anim_output,
                     skeleton(),
                     25,
                     args.viz_bitrate,
                     np.array(70., dtype=np.float32),
                     args.viz_output,
                     limit=args.viz_limit,
                     downsample=args.viz_downsample,
                     size=args.viz_size,
                     input_video_path=args.viz_video,
                     viewport=(1000, 1002),
                     input_video_skip=args.viz_skip)

    ckpt, time4 = ckpt_time(time3)
    print('total spend {:.2f} seconds'.format(ckpt))
Example #21
def videpose_infer(args):
    from common.camera import normalize_screen_coordinates, camera_to_world, image_coordinates
    from common.generators import UnchunkedGenerator
    from common.model import TemporalModel
    from common.utils import Timer, evaluate, add_path
    from videopose import get_detector_2d, ckpt_time, metadata, time0

    import gene_npz

    gene_npz.args.outputpath = str(args.viz_output / "alpha_pose_kunkun_cut")
    print(gene_npz.args)
    # detector_2d = get_detector_2d(args.detector_2d)
    detector_2d = gene_npz.generate_kpts(args.detector_2d)

    assert detector_2d, 'detector_2d should be in ({alpha, hr, open}_pose)'

    # 2D kpts loads or generate
    if not args.input_npz:
        video_name = args.viz_video
        keypoints = detector_2d(video_name)
    else:
        npz = np.load(args.input_npz)
        keypoints = npz['kpts']  # (N, 17, 2)

    keypoints_symmetry = metadata['keypoints_symmetry']
    kps_left, kps_right = list(
        keypoints_symmetry[0]), list(keypoints_symmetry[1])
    joints_left, joints_right = list(
        [4, 5, 6, 11, 12, 13]), list([1, 2, 3, 14, 15, 16])

    # Normalize keypoints, assuming these camera parameters
    keypoints = normalize_screen_coordinates(
        keypoints[..., :2], w=1000, h=1002)

    model_pos = TemporalModel(17, 2, 17, filter_widths=[3, 3, 3, 3, 3], causal=args.causal, dropout=args.dropout, channels=args.channels,
                              dense=args.dense)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    ckpt, time1 = ckpt_time(time0)
    print('-------------- load data spends {:.2f} seconds'.format(ckpt))

    # load trained model
    chk_filename = os.path.join(
        args.checkpoint, args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(
        chk_filename, map_location=lambda storage, loc: storage)  # map loc to storage
    model_pos.load_state_dict(checkpoint['model_pos'])

    ckpt, time2 = ckpt_time(time1)
    print('-------------- load 3D model spends {:.2f} seconds'.format(ckpt))

    #  Receptive field: 243 frames for args.arc [3, 3, 3, 3, 3]
    receptive_field = model_pos.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    print('Rendering...')
    input_keypoints = keypoints.copy()
    gen = UnchunkedGenerator(None, None, [input_keypoints],
                             pad=pad, causal_shift=causal_shift, augment=args.test_time_augmentation,
                             kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)

    # save 3D joint points
    np.save(args.viz_output / "test_3d_output.npy",
            prediction, allow_pickle=True)

    rot = np.array([0.14070565, -0.15007018, -0.7552408,
                   0.62232804], dtype=np.float32)
    prediction = camera_to_world(prediction, R=rot, t=0)

    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    anim_output = {'Reconstruction': prediction}
    input_keypoints = image_coordinates(
        input_keypoints[..., :2], w=1000, h=1002)

    ckpt, time3 = ckpt_time(time2)
    print(
        '-------------- generate reconstruction 3D data spends {:.2f} seconds'.format(ckpt))

    ckpt, time4 = ckpt_time(time3)
    print('total spend {:.2f} seconds'.format(ckpt))
Example #22
    model_pos = model_pos.cuda()

if args.resume or args.evaluate:
    chk_filename = os.path.join(args.checkpoint,
                                args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(chk_filename,
                            map_location=lambda storage, loc: storage)
    print('This model was trained for {} epochs'.format(checkpoint['epoch']))
    model_pos.load_state_dict(checkpoint['model_pos'])

test_generator = UnchunkedGenerator(cameras_valid,
                                    poses_valid,
                                    poses_valid_2d,
                                    pad=pad,
                                    causal_shift=causal_shift,
                                    augment=False,
                                    kps_left=kps_left,
                                    kps_right=kps_right,
                                    joints_left=joints_left,
                                    joints_right=joints_right)
print('INFO: Testing on {} frames'.format(test_generator.num_frames()))


# Evaluate
def evaluate(test_generator, action=None, return_predictions=False):
    with torch.no_grad():
        model_pos.eval()
        N = 0
        for _, batch, batch_2d in test_generator.next_epoch():
            inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
            if torch.cuda.is_available():
Example #23
receptive_field = model_pos.receptive_field()
pad = (receptive_field - 1) // 2
causal_shift = 0
if torch.cuda.is_available():
    model_pos = model_pos.cuda()

checkpoint = torch.load(chk_filename,
                        map_location=lambda storage, loc: storage)
model_pos.load_state_dict(checkpoint['model_pos'])

test_generator = UnchunkedGenerator(cameras_valid,
                                    poses_valid,
                                    poses_valid_2d,
                                    pad=pad,
                                    causal_shift=causal_shift,
                                    augment=False,
                                    kps_left=kps_left,
                                    kps_right=kps_right,
                                    joints_left=joints_left,
                                    joints_right=joints_right)


def evaluate(test_generator, action=None, return_predictions=False):
    epoch_loss_3d_pos = 0
    epoch_loss_3d_pos_procrustes = 0
    epoch_loss_3d_pos_scale = 0
    epoch_loss_3d_vel = 0
    with torch.no_grad():
        model_pos.eval()
        N = 0
        for _, batch, batch_2d in test_generator.next_epoch():
Example #24
def analyze_frame(h, frame):

    boxes, keypoints = infer.inference_on_frame(h['predictor'], frame)

    # step 4: prepare the data.
    # Take the 2D keypoints, that's it.
    # The first element is an empty array; the second is our actual frame data: a 3D numpy
    # array with first dimension 1, the second and third holding the 17 joints of 3 doubles each.
    kp = keypoints[1][0][:2, :].T  # extract (x, y), just like in the prepare_data_2d_custom code

    # what to do if kp is NaN or has missing data? For now we just ignore it.

    # The original code does this at the end of step 4, but we keep it simple and take the data from step 2 directly into a variable:
    #     output[canonical_name]['custom'] = [data[0]['keypoints'].astype('float32')]
    # output_custom_canonical_bullshit = kp.astype('float32')

    # At the end of step 4 the original code writes a file that is then loaded at the beginning of step 5:
    #     np.savez_compressed(os.path.join(args.dataoutputdir, output_prefix_2d + args.output), positions_2d=output, metadata=metadata)

    # This is what the original script does; confusingly, `keypoints` is just raw data
    # until it is set to keypoints['positions_2d'].
    # keypoints = np.load('data/data_2d_' + args.dataset + '_' + args.keypoints + '.npz', allow_pickle=True)

    # step 5: everything else, copied over from run.py

    # extract dataset from the init dictionary
    dataset = h['dataset']
    keypoints_metadata = h['keypoints_metadata']
    keypoints_symmetry = h['keypoints_symmetry']

    kps_left = h['kps_left']
    kps_right = h['kps_right']
    joints_left = h['joints_left']
    joints_right = h['joints_right']

    # normalize
    for i in range(len(kp)):
        koord = kp[i]
        kp[i] = normalize_screen_coordinates(koord, h['frame_metadata']['w'], h['frame_metadata']['h'])
    #for kps in enumerate(keypoints):
    #    kps[..., :2] = normalize_screen_coordinates(kps[..., :2], frame_metadata['w'], frame_metadata['h'])

    # this is taken from the args.architecture and run.py and just hardcoded, skipping a lot of nonsense
    filter_widths = [int(x) for x in "3,3,3,3,3".split(',')]
    skeleton_num_joints = dataset.skeleton().num_joints()
    #skeleton_num_joints = 17

    causal = True
    dropout = 0.25
    channels = 1024
    dense = False

    model_pos_train = TemporalModelOptimized1f(kp.shape[-2], kp.shape[-1], skeleton_num_joints,
                                               filter_widths=filter_widths, causal=causal, dropout=dropout,
                                               channels=channels)
    model_pos = TemporalModel(kp.shape[-2], kp.shape[-1], skeleton_num_joints,
                                         filter_widths=filter_widths, causal=causal, dropout=dropout,
                                         channels=channels, dense=dense)

    receptive_field = model_pos.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2  # Padding on each side
    #if args.causal:
    #    print('INFO: Using causal convolutions')
    #    causal_shift = pad
    #else:
    #    causal_shift = 0
    causal_shift = pad

    model_params = 0
    for parameter in model_pos.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()
        model_pos_train = model_pos_train.cuda()

    #if args.resume or args.evaluate:
    if True:
        chk_filename = "checkpoint/pretrained_h36m_detectron_coco.bin"
        print('Loading checkpoint', chk_filename)
        checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
        print('This model was trained for {} epochs'.format(checkpoint['epoch']))
        model_pos_train.load_state_dict(checkpoint['model_pos'])
        model_pos.load_state_dict(checkpoint['model_pos'])

        # False in our particular case. We might benefit from getting rid of model_traj,
        # unless it is very fast, in which case we should just keep it in case we ever upgrade.
        if 'model_traj' in checkpoint:
            # Load trajectory model if it contained in the checkpoint (e.g. for inference in the wild)
            model_traj = TemporalModel(kp.shape[-2], kp.shape[-1], 1,
                                filter_widths=filter_widths, causal=causal, dropout=dropout, channels=channels,
                                dense=dense)
            if torch.cuda.is_available():
                model_traj = model_traj.cuda()
            model_traj.load_state_dict(checkpoint['model_traj'])
        else:
            model_traj = None

    test_generator = UnchunkedGenerator(None, None, kp,
                                        pad=pad, causal_shift=causal_shift, augment=False,
                                        kps_left=kps_left, kps_right=kps_right,
                                        joints_left=joints_left, joints_right=joints_right)
    print('INFO: Testing on {} frames'.format(test_generator.num_frames()))

    # Evaluate
    def evaluate(eval_generator, action=None, return_predictions=False, use_trajectory_model=False):
        epoch_loss_3d_pos = 0
        epoch_loss_3d_pos_procrustes = 0
        epoch_loss_3d_pos_scale = 0
        epoch_loss_3d_vel = 0
        with torch.no_grad():
            if not use_trajectory_model:
                model_pos.eval()
            else:
                model_traj.eval()
            N = 0
            for _, batch, batch_2d in eval_generator.next_epoch():
                inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
                if torch.cuda.is_available():
                    inputs_2d = inputs_2d.cuda()

                # Positional model
                if not use_trajectory_model:
                    predicted_3d_pos = model_pos(inputs_2d)
                else:
                    predicted_3d_pos = model_traj(inputs_2d)

                # Test-time augmentation (if enabled)
                if eval_generator.augment_enabled():
                    # Undo flipping and take average with non-flipped version
                    predicted_3d_pos[1, :, :, 0] *= -1
                    if not use_trajectory_model:
                        predicted_3d_pos[1, :, joints_left + joints_right] = predicted_3d_pos[1, :, joints_right + joints_left]
                    predicted_3d_pos = torch.mean(predicted_3d_pos, dim=0, keepdim=True)

                if return_predictions:
                    return predicted_3d_pos.squeeze(0).cpu().numpy()

                inputs_3d = torch.from_numpy(batch.astype('float32'))
                if torch.cuda.is_available():
                    inputs_3d = inputs_3d.cuda()
                inputs_3d[:, :, 0] = 0
                if eval_generator.augment_enabled():
                    inputs_3d = inputs_3d[:1]

                error = mpjpe(predicted_3d_pos, inputs_3d)
                epoch_loss_3d_pos_scale += inputs_3d.shape[0]*inputs_3d.shape[1] * n_mpjpe(predicted_3d_pos, inputs_3d).item()

                epoch_loss_3d_pos += inputs_3d.shape[0]*inputs_3d.shape[1] * error.item()
                N += inputs_3d.shape[0] * inputs_3d.shape[1]

                inputs = inputs_3d.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])
                predicted_3d_pos = predicted_3d_pos.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])

                epoch_loss_3d_pos_procrustes += inputs_3d.shape[0]*inputs_3d.shape[1] * p_mpjpe(predicted_3d_pos, inputs)

                # Compute velocity error
                epoch_loss_3d_vel += inputs_3d.shape[0]*inputs_3d.shape[1] * mean_velocity_error(predicted_3d_pos, inputs)

        if action is None:
            print('----------')
        else:
            print('----'+action+'----')
        e1 = (epoch_loss_3d_pos / N)*1000
        e2 = (epoch_loss_3d_pos_procrustes / N)*1000
        e3 = (epoch_loss_3d_pos_scale / N)*1000
        ev = (epoch_loss_3d_vel / N)*1000
        print('Test time augmentation:', eval_generator.augment_enabled())
        print('Protocol #1 Error (MPJPE):', e1, 'mm')
        print('Protocol #2 Error (P-MPJPE):', e2, 'mm')
        print('Protocol #3 Error (N-MPJPE):', e3, 'mm')
        print('Velocity Error (MPJVE):', ev, 'mm')
        print('----------')

        return e1, e2, e3, ev

    image_keypoints2d = kp
    gen = UnchunkedGenerator(None, None, [[image_keypoints2d]],
                             pad=pad, causal_shift=causal_shift, augment=False,
                             kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, return_predictions=True)

    # here is the data format
    # public enum VideoPose3dJointOrder
    # {
    #     HIP = 0,
    #     R_HIP = 1,
    #     R_KNEE = 2,
    #     R_FOOT = 3,
    #     L_HIP = 4,
    #     L_KNEE = 5,
    #     L_FOOT = 6,
    #     SPINE = 7,
    #     THORAX = 8,
    #     NOSE = 9,
    #     HEAD = 10,
    #     L_SHOULDER = 11,
    #     L_ELBOW = 12,
    #     L_WRIST = 13,
    #     R_SHOULDER = 14,
    #     R_ELBOW = 15,
    #     R_WRIST = 16
    # }

    # this bugs out; it is unclear what the original code was trying to do.
    # anyway, we can fix it by just getting width/height some other way.

    # Invert camera transformation
    cam = dataset.cameras()

    width = cam['frame'][0]['res_w']
    height = cam['frame'][0]['res_h']

    image_keypoints2d = image_coordinates(image_keypoints2d[..., :2], w=width, h=height)

    viz_camera = 0

    # If the ground truth is not available, take the camera extrinsic params from a random subject.
    # They are almost the same, and anyway, we only need this for visualization purposes.
    for subject in dataset.cameras():
        if 'orientation' in dataset.cameras()[subject][viz_camera]:
            rot = dataset.cameras()[subject][viz_camera]['orientation']
            break
    prediction = camera_to_world(prediction, R=rot, t=0)
    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])

    # because algo was meant for a list of frames, we take the first frame (our only frame)
    prediction3d = prediction[0]

    return prediction3d, image_keypoints2d

    # do we want to visualize? this code used to write to json and create a video for visualization
    #if args.viz_output is not None:
    if True:

        anim_output = {'Reconstruction': prediction}

        # format the data in the same format as mediapipe, so we can load it in unity with the same script
        # we need a list (frames) of lists of 3d landmarks.
        unity_landmarks = prediction.tolist()

        # how to send data? or display it?
        # maybe draw it on the webcam feed....?!?!?!


        #with open(args.output_json, "w") as json_file:
        #    json.dump(unity_landmarks, json_file)

        #if args.rendervideo == "yes":
        #    from common.visualization import render_animation
        #    render_animation(input_keypoints, keypoints_metadata, anim_output,
        #                     dataset.skeleton(), dataset.fps(), args.viz_bitrate, cam['azimuth'], args.viz_output,
        #                     limit=args.viz_limit, downsample=args.viz_downsample, size=args.viz_size,
        #                     input_video_path=args.viz_video, viewport=(cam['res_w'], cam['res_h']),
        #                     input_video_skip=args.viz_skip)

    we_re_done_here = 1
Example #25
def the_main_kaboose(args):
    print(args)

    try:
        # Create checkpoint directory if it does not exist
        os.makedirs(args.checkpoint)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise RuntimeError('Unable to create checkpoint directory:',
                               args.checkpoint)

    print('Loading dataset...')
    dataset_path = 'data/data_3d_' + args.dataset + '.npz'
    if args.dataset == 'h36m':
        from common.h36m_dataset import Human36mDataset
        dataset = Human36mDataset(dataset_path)
    elif args.dataset.startswith('humaneva'):
        from common.humaneva_dataset import HumanEvaDataset
        dataset = HumanEvaDataset(dataset_path)
    elif args.dataset.startswith('custom'):
        from common.custom_dataset import CustomDataset
        dataset = CustomDataset('data/data_2d_' + args.dataset + '_' +
                                args.keypoints + '.npz')
    else:
        raise KeyError('Invalid dataset')

    print('Preparing data...')
    for subject in dataset.subjects():
        for action in dataset[subject].keys():
            anim = dataset[subject][action]

            # this only works when training.
            if 'positions' in anim:
                positions_3d = []
                for cam in anim['cameras']:
                    pos_3d = world_to_camera(anim['positions'],
                                             R=cam['orientation'],
                                             t=cam['translation'])
                    pos_3d[:, 1:] -= pos_3d[:, :1]  # Remove global offset, but keep trajectory in first position
                    positions_3d.append(pos_3d)
                anim['positions_3d'] = positions_3d

    print('Loading 2D detections...')
    keypoints = np.load('data/data_2d_' + args.dataset + '_' + args.keypoints +
                        '.npz',
                        allow_pickle=True)
    keypoints_metadata = keypoints['metadata'].item()
    keypoints_symmetry = keypoints_metadata['keypoints_symmetry']
    kps_left, kps_right = list(keypoints_symmetry[0]), list(
        keypoints_symmetry[1])
    joints_left, joints_right = list(dataset.skeleton().joints_left()), list(
        dataset.skeleton().joints_right())
    keypoints = keypoints['positions_2d'].item()

    # THIS IS ABOUT TRAINING. Ignore it for inference.
    for subject in dataset.subjects():
        assert subject in keypoints, 'Subject {} is missing from the 2D detections dataset'.format(
            subject)
        for action in dataset[subject].keys():
            assert action in keypoints[
                subject], 'Action {} of subject {} is missing from the 2D detections dataset'.format(
                    action, subject)
            if 'positions_3d' not in dataset[subject][action]:
                continue

            for cam_idx in range(len(keypoints[subject][action])):

                # We check for >= instead of == because some videos in H3.6M contain extra frames
                mocap_length = dataset[subject][action]['positions_3d'][
                    cam_idx].shape[0]
                assert keypoints[subject][action][cam_idx].shape[
                    0] >= mocap_length

                if keypoints[subject][action][cam_idx].shape[0] > mocap_length:
                    # Shorten sequence
                    keypoints[subject][action][cam_idx] = keypoints[subject][
                        action][cam_idx][:mocap_length]

            assert len(keypoints[subject][action]) == len(
                dataset[subject][action]['positions_3d'])

    # normalize camera frame?
    for subject in keypoints.keys():
        for action in keypoints[subject]:
            for cam_idx, kps in enumerate(keypoints[subject][action]):
                # Normalize camera frame
                cam = dataset.cameras()[subject][cam_idx]
                kps[..., :2] = normalize_screen_coordinates(kps[..., :2],
                                                            w=cam['res_w'],
                                                            h=cam['res_h'])
                keypoints[subject][action][cam_idx] = kps

    subjects_train = args.subjects_train.split(',')
    subjects_semi = [] if not args.subjects_unlabeled else args.subjects_unlabeled.split(
        ',')
    if not args.render:
        subjects_test = args.subjects_test.split(',')
    else:
        subjects_test = [args.viz_subject]

    semi_supervised = len(subjects_semi) > 0
    if semi_supervised and not dataset.supports_semi_supervised():
        raise RuntimeError(
            'Semi-supervised training is not implemented for this dataset')

    def fetch(subjects, action_filter=None, subset=1, parse_3d_poses=True):
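        """Gather, for each (subject, action, camera) sequence, the camera
        intrinsics, the 3D poses (when available) and the 2D keypoints,
        optionally filtered by action, subsampled to ``subset`` of the frames
        and downsampled by ``args.downsample``."""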
        out_poses_3d = []
        out_poses_2d = []
        out_camera_params = []
        for subject in subjects:
            print("gonna check actions for subject " + subject)

        for subject in subjects:
            for action in keypoints[subject].keys():
                if action_filter is not None:
                    found = False
                    for a in action_filter:
                        if action.startswith(a):
                            found = True
                            break
                    if not found:
                        continue

                poses_2d = keypoints[subject][action]
                for i in range(len(poses_2d)):  # Iterate across cameras
                    out_poses_2d.append(poses_2d[i])

                if subject in dataset.cameras():
                    cams = dataset.cameras()[subject]
                    assert len(cams) == len(poses_2d), 'Camera count mismatch'
                    for cam in cams:
                        if 'intrinsic' in cam:
                            out_camera_params.append(cam['intrinsic'])

                if parse_3d_poses and 'positions_3d' in dataset[subject][
                        action]:
                    poses_3d = dataset[subject][action]['positions_3d']
                    assert len(poses_3d) == len(
                        poses_2d), 'Camera count mismatch'
                    for i in range(len(poses_3d)):  # Iterate across cameras
                        out_poses_3d.append(poses_3d[i])

        if len(out_camera_params) == 0:
            out_camera_params = None
        if len(out_poses_3d) == 0:
            out_poses_3d = None

        stride = args.downsample
        if subset < 1:
            for i in range(len(out_poses_2d)):
                n_frames = int(
                    round(len(out_poses_2d[i]) // stride * subset) * stride)
                start = deterministic_random(
                    0,
                    len(out_poses_2d[i]) - n_frames + 1,
                    str(len(out_poses_2d[i])))
                out_poses_2d[i] = out_poses_2d[i][start:start +
                                                  n_frames:stride]
                if out_poses_3d is not None:
                    out_poses_3d[i] = out_poses_3d[i][start:start +
                                                      n_frames:stride]
        elif stride > 1:
            # Downsample as requested
            for i in range(len(out_poses_2d)):
                out_poses_2d[i] = out_poses_2d[i][::stride]
                if out_poses_3d is not None:
                    out_poses_3d[i] = out_poses_3d[i][::stride]

        return out_camera_params, out_poses_3d, out_poses_2d

    action_filter = None if args.actions == '*' else args.actions.split(',')
    if action_filter is not None:
        print('Selected actions:', action_filter)

    # During inference (no ground truth) this returns None for the cameras and the 3D poses,
    # and the 2D keypoints as poses_valid_2d.
    cameras_valid, poses_valid, poses_valid_2d = fetch(subjects_test,
                                                       action_filter)

    filter_widths = [int(x) for x in args.architecture.split(',')]
    if not args.disable_optimizations and not args.dense and args.stride == 1:
        # Use optimized model for single-frame predictions
        num_joints_in = poses_valid_2d[0].shape[-2]
        in_features = poses_valid_2d[0].shape[-1]
        num_joints_out = dataset.skeleton().num_joints()
        model_pos_train = TemporalModelOptimized1f(num_joints_in,
                                                   in_features,
                                                   num_joints_out,
                                                   filter_widths=filter_widths,
                                                   causal=args.causal,
                                                   dropout=args.dropout,
                                                   channels=args.channels)
    else:
        # Fall back to the standard model when the optimized one cannot be used
        # (stride > 1, dense filters, or optimizations disabled)
        model_pos_train = TemporalModel(poses_valid_2d[0].shape[-2],
                                        poses_valid_2d[0].shape[-1],
                                        dataset.skeleton().num_joints(),
                                        filter_widths=filter_widths,
                                        causal=args.causal,
                                        dropout=args.dropout,
                                        channels=args.channels,
                                        dense=args.dense)

    model_pos = TemporalModel(poses_valid_2d[0].shape[-2],
                              poses_valid_2d[0].shape[-1],
                              dataset.skeleton().num_joints(),
                              filter_widths=filter_widths,
                              causal=args.causal,
                              dropout=args.dropout,
                              channels=args.channels,
                              dense=args.dense)

    receptive_field = model_pos.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2  # Padding on each side
    if args.causal:
        print('INFO: Using causal convolutions')
        causal_shift = pad
    else:
        causal_shift = 0
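    # Editor's note: with --architecture 3,3,3,3,3 (used in the later examples) the
    # receptive field is 3**5 = 243 frames, so pad = 121 frames on each side and
    # causal_shift is 121 for causal convolutions (0 otherwise).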

    model_params = 0
    for parameter in model_pos.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()
        model_pos_train = model_pos_train.cuda()

    if args.resume or args.evaluate:
        chk_filename = os.path.join(
            args.checkpoint, args.resume if args.resume else args.evaluate)
        print('Loading checkpoint', chk_filename)
        checkpoint = torch.load(chk_filename,
                                map_location=lambda storage, loc: storage)
        print('This model was trained for {} epochs'.format(
            checkpoint['epoch']))
        model_pos_train.load_state_dict(checkpoint['model_pos'])
        model_pos.load_state_dict(checkpoint['model_pos'])

        if args.evaluate and 'model_traj' in checkpoint:
            # Load the trajectory model if it is contained in the checkpoint (e.g. for in-the-wild inference)
            model_traj = TemporalModel(poses_valid_2d[0].shape[-2],
                                       poses_valid_2d[0].shape[-1],
                                       1,
                                       filter_widths=filter_widths,
                                       causal=args.causal,
                                       dropout=args.dropout,
                                       channels=args.channels,
                                       dense=args.dense)
            if torch.cuda.is_available():
                model_traj = model_traj.cuda()
            model_traj.load_state_dict(checkpoint['model_traj'])
        else:
            model_traj = None

    test_generator = UnchunkedGenerator(cameras_valid,
                                        poses_valid,
                                        poses_valid_2d,
                                        pad=pad,
                                        causal_shift=causal_shift,
                                        augment=False,
                                        kps_left=kps_left,
                                        kps_right=kps_right,
                                        joints_left=joints_left,
                                        joints_right=joints_right)
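    # Editor's note: UnchunkedGenerator yields one full (padded) sequence per batch,
    # so the model produces one 3D pose per input frame of the original sequence.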
    print('INFO: Testing on {} frames'.format(test_generator.num_frames()))

    # Evaluate
    def evaluate(eval_generator,
                 action=None,
                 return_predictions=False,
                 use_trajectory_model=False):
        epoch_loss_3d_pos = 0
        epoch_loss_3d_pos_procrustes = 0
        epoch_loss_3d_pos_scale = 0
        epoch_loss_3d_vel = 0
        with torch.no_grad():
            if not use_trajectory_model:
                model_pos.eval()
            else:
                model_traj.eval()
            N = 0
            for _, batch, batch_2d in eval_generator.next_epoch():
                inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
                if torch.cuda.is_available():
                    inputs_2d = inputs_2d.cuda()

                # Positional model
                if not use_trajectory_model:
                    predicted_3d_pos = model_pos(inputs_2d)
                else:
                    predicted_3d_pos = model_traj(inputs_2d)

                # Test-time augmentation (if enabled)
                if eval_generator.augment_enabled():
                    # Undo flipping and take average with non-flipped version
                    predicted_3d_pos[1, :, :, 0] *= -1
                    if not use_trajectory_model:
                        predicted_3d_pos[1, :, joints_left +
                                         joints_right] = predicted_3d_pos[
                                             1, :, joints_right + joints_left]
                    predicted_3d_pos = torch.mean(predicted_3d_pos,
                                                  dim=0,
                                                  keepdim=True)
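                # Editor's note: with augmentation enabled the batch holds the
                # original sequence at index 0 and a horizontally flipped copy at
                # index 1; the flip is undone above (negate x, swap left/right
                # joints) before averaging the two predictions.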

                if return_predictions:
                    return predicted_3d_pos.squeeze(0).cpu().numpy()

                inputs_3d = torch.from_numpy(batch.astype('float32'))
                if torch.cuda.is_available():
                    inputs_3d = inputs_3d.cuda()
                inputs_3d[:, :, 0] = 0
                if eval_generator.augment_enabled():
                    inputs_3d = inputs_3d[:1]

                error = mpjpe(predicted_3d_pos, inputs_3d)
                num_frames = inputs_3d.shape[0] * inputs_3d.shape[1]
                epoch_loss_3d_pos_scale += num_frames * n_mpjpe(
                    predicted_3d_pos, inputs_3d).item()

                epoch_loss_3d_pos += num_frames * error.item()
                N += num_frames

                inputs = inputs_3d.cpu().numpy().reshape(
                    -1, inputs_3d.shape[-2], inputs_3d.shape[-1])
                predicted_3d_pos = predicted_3d_pos.cpu().numpy().reshape(
                    -1, inputs_3d.shape[-2], inputs_3d.shape[-1])

                epoch_loss_3d_pos_procrustes += num_frames * p_mpjpe(
                    predicted_3d_pos, inputs)

                # Compute velocity error
                epoch_loss_3d_vel += num_frames * mean_velocity_error(
                    predicted_3d_pos, inputs)

        if action is None:
            print('----------')
        else:
            print('----' + action + '----')
        e1 = (epoch_loss_3d_pos / N) * 1000
        e2 = (epoch_loss_3d_pos_procrustes / N) * 1000
        e3 = (epoch_loss_3d_pos_scale / N) * 1000
        ev = (epoch_loss_3d_vel / N) * 1000
        print('Test time augmentation:', eval_generator.augment_enabled())
        print('Protocol #1 Error (MPJPE):', e1, 'mm')
        print('Protocol #2 Error (P-MPJPE):', e2, 'mm')
        print('Protocol #3 Error (N-MPJPE):', e3, 'mm')
        print('Velocity Error (MPJVE):', ev, 'mm')
        print('----------')

        return e1, e2, e3, ev
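    # Editor's note: Protocol #1 (MPJPE) is the mean per-joint position error,
    # Protocol #2 (P-MPJPE) the error after rigid Procrustes alignment, Protocol #3
    # (N-MPJPE) the error after optimal scale alignment, and MPJVE the mean
    # per-joint velocity error.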

    if args.render:
        print('Rendering...')

        input_keypoints = keypoints[args.viz_subject][args.viz_action][
            args.viz_camera].copy()
        ground_truth = None
        if (args.viz_subject in dataset.subjects()
                and args.viz_action in dataset[args.viz_subject]):
            if 'positions_3d' in dataset[args.viz_subject][args.viz_action]:
                ground_truth = dataset[args.viz_subject][args.viz_action][
                    'positions_3d'][args.viz_camera].copy()
        if ground_truth is None:
            print(
                'INFO: this action is unlabeled. Ground truth will not be rendered.'
            )

        gen = UnchunkedGenerator(None,
                                 None, [input_keypoints],
                                 pad=pad,
                                 causal_shift=causal_shift,
                                 augment=args.test_time_augmentation,
                                 kps_left=kps_left,
                                 kps_right=kps_right,
                                 joints_left=joints_left,
                                 joints_right=joints_right)
        prediction = evaluate(gen, return_predictions=True)
        if model_traj is not None and ground_truth is None:
            prediction_traj = evaluate(gen,
                                       return_predictions=True,
                                       use_trajectory_model=True)
            prediction += prediction_traj

        if args.viz_export is not None:
            print('Exporting joint positions to', args.viz_export)
            # Predictions are in camera space
            np.save(args.viz_export, prediction)

        if args.viz_output is not None:
            if ground_truth is not None:
                # Reapply trajectory
                trajectory = ground_truth[:, :1]
                ground_truth[:, 1:] += trajectory
                prediction += trajectory

            # Invert camera transformation
            cam = dataset.cameras()[args.viz_subject][args.viz_camera]
            if ground_truth is not None:
                prediction = camera_to_world(prediction,
                                             R=cam['orientation'],
                                             t=cam['translation'])
                ground_truth = camera_to_world(ground_truth,
                                               R=cam['orientation'],
                                               t=cam['translation'])
            else:
                # If the ground truth is not available, take the camera extrinsic params from a random subject.
                # They are almost the same, and anyway, we only need this for visualization purposes.
                for subject in dataset.cameras():
                    if 'orientation' in dataset.cameras()[subject][
                            args.viz_camera]:
                        rot = dataset.cameras()[subject][
                            args.viz_camera]['orientation']
                        break
                prediction = camera_to_world(prediction, R=rot, t=0)
                # We don't have the trajectory, but at least we can rebase the height
                prediction[:, :, 2] -= np.min(prediction[:, :, 2])

            anim_output = {'Reconstruction': prediction}
            if ground_truth is not None and not args.viz_no_ground_truth:
                anim_output['Ground truth'] = ground_truth

            input_keypoints = image_coordinates(input_keypoints[..., :2],
                                                w=cam['res_w'],
                                                h=cam['res_h'])

            print("Writing to json")

            import json
            # Format the data like the MediaPipe output so it can be loaded in Unity
            # with the same script: a list of frames, each a list of 3D landmarks.
            # Note: the prediction only contains 17 landmarks, while the Unity script
            # expects 25.
            unity_landmarks = prediction.tolist()

            with open(args.output_json, "w") as json_file:
                json.dump(unity_landmarks, json_file)
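            # Editor's note: the JSON written above is a plain nested list with one
            # entry per frame, each holding 17 [x, y, z] joints; mapping these to the
            # 25-landmark skeleton expected by the Unity script has to happen on the
            # Unity side.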

            if args.rendervideo == "yes":

                from common.visualization import render_animation
                render_animation(input_keypoints,
                                 keypoints_metadata,
                                 anim_output,
                                 dataset.skeleton(),
                                 dataset.fps(),
                                 args.viz_bitrate,
                                 cam['azimuth'],
                                 args.viz_output,
                                 limit=args.viz_limit,
                                 downsample=args.viz_downsample,
                                 size=args.viz_size,
                                 input_video_path=args.viz_video,
                                 viewport=(cam['res_w'], cam['res_h']),
                                 input_video_skip=args.viz_skip)
Exemplo n.º 26
0
if torch.cuda.is_available():
    model_pos = model_pos.cuda()
    model_pos_train = model_pos_train.cuda()
    
if args.resume or args.evaluate:
    chk_filename = os.path.join(args.checkpoint, args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
    print('This model was trained for {} epochs'.format(checkpoint['epoch']))
    model_pos_train.load_state_dict(checkpoint['model_pos'])
    model_pos.load_state_dict(checkpoint['model_pos'])
    print('Checkpoint loading finished.')
    
test_generator = UnchunkedGenerator(cameras_valid, poses_valid, poses_valid_2d,
                                    pad=pad, causal_shift=causal_shift, augment=False,
                                    kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right)
print('INFO: Testing on {} frames'.format(test_generator.num_frames()))

if not args.evaluate:
    cameras_train, poses_train, poses_train_2d = fetch(subjects_train, action_filter, subset=args.subset)

    lr = args.learning_rate
    if semi_supervised:
        cameras_semi, _, poses_semi_2d = fetch(subjects_semi, action_filter, parse_3d_poses=False)
        
        if not args.disable_optimizations and not args.dense and args.stride == 1:
            # Use optimized model for single-frame predictions
            model_traj_train = TemporalModelOptimized1f(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1], 1,
                    filter_widths=filter_widths, causal=args.causal, dropout=args.dropout, channels=args.channels)
        else:
Exemplo n.º 27
0
print('INFO: Trainable parameter count:', model_params)

if torch.cuda.is_available():
    model_pos = model_pos.cuda()
#    model_pos_train = model_pos_train.cuda()
    
if args.resume or args.evaluate:
    chk_filename = os.path.join(args.checkpoint, args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
    print('This model was trained for {} epochs'.format(checkpoint['epoch']))
#    model_pos_train.load_state_dict(checkpoint['model_pos'])
    model_pos.load_state_dict(checkpoint['model_pos'])
    
test_generator = UnchunkedGenerator(cameras_valid, poses_valid, poses_valid_2d,
                                    pad=pad, causal_shift=causal_shift, augment=False,
                                    kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right)
print('INFO: Testing on {} frames'.format(test_generator.num_frames()))


def evaluate(test_generator, action=None, return_predictions=False):
    epoch_loss_3d_pos = 0
    epoch_loss_3d_pos_procrustes = 0
    epoch_loss_3d_pos_scale = 0
    epoch_loss_3d_vel = 0
    with torch.no_grad():
        model_pos.eval()
        N = 0
        for _, batch, batch_2d in test_generator.next_epoch():
            inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
            if torch.cuda.is_available():
Exemplo n.º 28
0
model_pos.load_state_dict(checkpoint['model_pos'])

ckpt, time3 = ckpt_time(time2)
print('Loading the 3D pose model took {:.2f} seconds'.format(ckpt))

#  Receptive field: 243 frames for args.arc [3, 3, 3, 3, 3]
receptive_field = model_pos.receptive_field()
pad = (receptive_field - 1) // 2  # Padding on each side
causal_shift = 0

print('Rendering...')
#  import ipdb;ipdb.set_trace()
input_keypoints = keypoints.copy()

gen = UnchunkedGenerator(None, None, [input_keypoints],
                         pad=pad, causal_shift=causal_shift, augment=args.test_time_augmentation,
                         kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right)
prediction = evaluate(gen, return_predictions=True)

# If the ground truth is not available, take the camera extrinsic params from a random subject.
# They are almost the same, and anyway, we only need this for visualization purposes.
# for subject in dataset.cameras():
#     if 'orientation' in dataset.cameras()[subject][args.viz_camera]:
#         rot = dataset.cameras()[subject][args.viz_camera]['orientation']
#         break

rot = cam['orientation']
tran = cam['translation']
prediction = camera_to_world(prediction, R=rot, t=tran)

# We don't have the trajectory, but at least we can rebase the height
Exemplo n.º 29
0
class Predictor:
    def __init__(self,
                 dataset_path,
                 checkpoint_path,
                 input_video_path=None,
                 export_path=None,
                 output_path=None,
                 with_cuda=False):
        self.with_cuda = with_cuda
        self.dataset_path = dataset_path
        self.export_path = export_path
        self.output_path = output_path
        self.input_video_path = input_video_path
        self.dataset = CustomDataset(self.dataset_path)
        self.keypoints = None
        self.keypoints_left = None
        self.keypoints_right = None
        self.joints_left = None
        self.joints_right = None
        self.checkpoint = torch.load(checkpoint_path,
                                     map_location=lambda storage, loc: storage)
        self.model = None
        self.init_keypoints()
        self.valid_poses = self.keypoints["detectron2"]["custom"]
        self.init_model()
        self.test_generator = None
        self.init_generator()
        self.prediction = None
        self.make_prediction()

    def export_prediction(self):
        if self.export_path is not None:
            np.save(self.export_path, self.prediction)

    def init_model(self):
        self.model = TemporalModel(self.valid_poses[0].shape[-2],
                                   self.valid_poses[0].shape[-1],
                                   self.dataset.skeleton().num_joints(),
                                   filter_widths=[3, 3, 3, 3, 3],
                                   causal=False,
                                   dropout=0.25,
                                   channels=1024,
                                   dense=False)
        self.model.load_state_dict(self.checkpoint['model_pos'])

    def init_keypoints(self):
        self.keypoints = np.load(self.dataset_path, allow_pickle=True)
        keypoints_metadata = self.keypoints['metadata'].item()
        keypoints_symmetry = keypoints_metadata['keypoints_symmetry']
        self.keypoints_left, self.keypoints_right = list(
            keypoints_symmetry[0]), list(keypoints_symmetry[1])
        self.joints_left, self.joints_right = list(
            self.dataset.skeleton().joints_left()), list(
                self.dataset.skeleton().joints_right())
        self.keypoints = self.keypoints['positions_2d'].item()

        for subject in self.keypoints.keys():
            for action in self.keypoints[subject]:
                for cam_idx, kps in enumerate(self.keypoints[subject][action]):
                    # Normalize camera frame
                    cam = self.dataset.cameras()[subject][cam_idx]
                    kps[..., :2] = normalize_screen_coordinates(kps[..., :2],
                                                                w=cam['res_w'],
                                                                h=cam['res_h'])
                    self.keypoints[subject][action][cam_idx] = kps

    def init_generator(self):
        receptive_field = self.model.receptive_field()
        pad = (receptive_field - 1) // 2
        causal_shift = 0
        self.test_generator = UnchunkedGenerator(
            None,
            None,
            self.valid_poses,
            pad=pad,
            causal_shift=causal_shift,
            augment=False,
            kps_left=self.keypoints_left,
            kps_right=self.keypoints_right,
            joints_left=self.joints_left,
            joints_right=self.joints_right)

    def make_prediction(self):
        if self.with_cuda:
            self.model = self.model.cuda()
        with torch.no_grad():
            self.model.eval()
            for _, batch, batch_2d in self.test_generator.next_epoch():
                inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
                if self.with_cuda:
                    inputs_2d = inputs_2d.cuda()
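            # Editor's note: only the batch from the last iteration of the loop above
            # is used below, i.e. this assumes the generator holds a single sequence.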

            predicted_3d_pos = self.model(inputs_2d)

            if self.test_generator.augment_enabled():
                predicted_3d_pos[1, :, :, 0] *= -1
                predicted_3d_pos[1, :, self.joints_left +
                                 self.joints_right] = predicted_3d_pos[
                                     1, :,
                                     self.joints_right + self.joints_left]
                predicted_3d_pos = torch.mean(predicted_3d_pos,
                                              dim=0,
                                              keepdim=True)

            predicted_3d_pos = predicted_3d_pos.squeeze(0).cpu().numpy()
            rot = self.dataset.cameras()['detectron2'][0]['orientation']
            predicted_3d_pos = camera_to_world(predicted_3d_pos, R=rot, t=0)
            predicted_3d_pos[:, :, 2] -= np.min(predicted_3d_pos[:, :, 2])
            self.prediction = predicted_3d_pos

    def plot_pose(self, pose_index=0):
        pose = make_pose(self.prediction.tolist()[pose_index])
        pose.prepare_plot()
        pose.plot()
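# A minimal usage sketch (editor's addition): the file paths below are placeholders,
# not taken from the original source.
if __name__ == '__main__':
    predictor = Predictor(
        dataset_path='data/data_2d_custom_video.npz',  # hypothetical 2D detections file
        checkpoint_path='checkpoint/pretrained.bin',   # hypothetical pretrained weights
        export_path='prediction_3d.npy',
        with_cuda=False)
    predictor.export_prediction()  # save the predicted (frames, joints, 3) array
    predictor.plot_pose(0)         # plot the first reconstructed frame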
Exemplo n.º 30
0
    if not args.render:
        print("Invalid argument:", args.render)

    # Render the 3D pose
    input_keypoints = keypoints[args.viz_subject][args.viz_action][
        args.viz_camera].copy()

    print('INFO: this action is unlabeled. Ground truth will not be rendered.')
    print('kps_left:', kps_left, 'kps_right:', kps_right)
    print('joints_left:', joints_left, 'joints_right:', joints_right)
    gen = UnchunkedGenerator(None,
                             None, [input_keypoints],
                             pad=pad,
                             causal_shift=causal_shift,
                             augment=args.test_time_augmentation,
                             kps_left=kps_left,
                             kps_right=kps_right,
                             joints_left=joints_left,
                             joints_right=joints_right)
    print('INFO: Testing on {} frames'.format(gen.num_frames()))

    prediction = evaluate(gen)
    print('prediction.shape: ', prediction.shape)

    if args.viz_export is not None:
        print('Exporting joint positions to', args.viz_export)
        # Predictions are in camera space
        np.save(args.viz_export, prediction)

    if args.viz_output is not None:
Exemplo n.º 31
0
def main(args):
    # Step 1: detect the 2D keypoints
    detector_2d = get_detector_2d(args.detector_2d)

    assert detector_2d, 'detector_2d should be in ({alpha, hr, open}_pose)'

    # Load precomputed 2D keypoints or generate them from the video
    if not args.input_npz:
        video_name = args.viz_video
        keypoints = detector_2d(video_name)
    else:
        npz = np.load(args.input_npz)
        keypoints = npz['kpts']  # (N, 17, 2)

    # Step 2: convert the 2D keypoints to 3D
    keypoints_symmetry = metadata['keypoints_symmetry']
    kps_left, kps_right = list(keypoints_symmetry[0]), list(
        keypoints_symmetry[1])
    joints_left, joints_right = list([4, 5, 6, 11, 12,
                                      13]), list([1, 2, 3, 14, 15, 16])

    # Normalize the keypoints, assuming fixed camera parameters (w=1000, h=1002)
    keypoints = normalize_screen_coordinates(keypoints[..., :2],
                                             w=1000,
                                             h=1002)
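    # Editor's note: assuming the standard VideoPose3D normalize_screen_coordinates,
    # this maps x from [0, 1000] to [-1, 1] and scales y by the same factor, so y
    # ends up in roughly [-1.002, 1.002] and the aspect ratio is preserved.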

    model_pos = TemporalModel(17,
                              2,
                              17,
                              filter_widths=[3, 3, 3, 3, 3],
                              causal=args.causal,
                              dropout=args.dropout,
                              channels=args.channels,
                              dense=args.dense)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    ckpt, time1 = ckpt_time(time0)
    print('-------------- loading data took {:.2f} seconds'.format(ckpt))

    # load trained model
    chk_filename = os.path.join(args.checkpoint,
                                args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(
        chk_filename,
        map_location=lambda storage, loc: storage)  # map tensor storage to CPU
    model_pos.load_state_dict(checkpoint['model_pos'])

    ckpt, time2 = ckpt_time(time1)
    print('-------------- loading the 3D model took {:.2f} seconds'.format(ckpt))

    #  Receptive field: 243 frames for args.arc [3, 3, 3, 3, 3]
    receptive_field = model_pos.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    print('Rendering...')
    input_keypoints = keypoints.copy()
    gen = UnchunkedGenerator(None,
                             None, [input_keypoints],
                             pad=pad,
                             causal_shift=causal_shift,
                             augment=args.test_time_augmentation,
                             kps_left=kps_left,
                             kps_right=kps_right,
                             joints_left=joints_left,
                             joints_right=joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)

    # Save the 3D joint positions
    np.save('outputs/test_3d_output.npy', prediction, allow_pickle=True)
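    # Editor's note: the saved array can be sanity-checked with
    # np.load('outputs/test_3d_output.npy').shape, which should be
    # (n_frames, 17, 3) in camera coordinates at this point.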

    # Step 3: convert the predicted 3D points from the camera coordinate system to the world coordinate system
    # (1) First conversion method
    rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804],
                   dtype=np.float32)
    prediction = camera_to_world(prediction, R=rot, t=0)
    # We don't have the trajectory, but at least we can rebase the height:
    # subtract the minimum Z value so all predicted Z values are non-negative
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])

    # (2) Second conversion method
    # subject = 'S1'
    # cam_id = '55011271'
    # cam_params = load_camera_params('./camera/cameras.h5')[subject][cam_id]
    # R = cam_params['R']
    # T = 0
    # azimuth = cam_params['azimuth']
    #
    # prediction = camera2world(pose=prediction, R=R, T=T)
    # prediction[:, :, 2] -= np.min(prediction[:, :, 2])  # rebase the height

    # Step 4: output the 3D keypoints and convert the predicted 3D points to a BVH skeleton
    # Write out the predicted 3D points
    write_3d_point(args.viz_output, prediction)

    # Convert the predicted 3D skeleton points to a BVH skeleton
    prediction_copy = np.copy(prediction)
    write_standard_bvh(args.viz_output, prediction_copy)  # convert to a standard BVH skeleton
    write_smartbody_bvh(args.viz_output, prediction_copy)  # convert to the BVH skeleton required by SmartBody

    anim_output = {'Reconstruction': prediction}
    input_keypoints = image_coordinates(input_keypoints[..., :2],
                                        w=1000,
                                        h=1002)

    ckpt, time3 = ckpt_time(time2)
    print('-------------- generating the reconstructed 3D data took {:.2f} seconds'.
          format(ckpt))

    if not args.viz_output:
        args.viz_output = 'outputs/outputvideo/alpha_result.mp4'

    # Step 5: generate the output video
    # from common.visualization import render_animation
    # render_animation(input_keypoints, anim_output,
    #                  Skeleton(), 25, args.viz_bitrate, np.array(70., dtype=np.float32), args.viz_output,
    #                  limit=args.viz_limit, downsample=args.viz_downsample, size=args.viz_size,
    #                  input_video_path=args.viz_video, viewport=(1000, 1002),
    #                  input_video_skip=args.viz_skip)

    ckpt, time4 = ckpt_time(time3)
    print('total time: {:.2f} seconds'.format(ckpt))