示例#1
0
        # Pick the camera orientation used to rotate the prediction for display.
        # If the ground truth is not available, the camera extrinsic params are taken from the
        # first subject (in iteration order) whose camera `args.viz_camera` has an 'orientation' entry.
        # They are almost the same, and anyway, we only need this for visualization purposes.
        # NOTE(review): `rot` stays None when no subject provides an orientation and is then
        # passed on as R=None - confirm that camera_to_world copes with that.
        rot = None
        for subject in dataset.cameras():
            if 'orientation' in dataset.cameras()[subject][args.viz_camera]:
                rot = dataset.cameras()[subject][
                    args.viz_camera]['orientation']
                break
        # Only the rotation is applied; the translation is passed as 0.
        prediction = camera_to_world(prediction, R=rot, t=0)
        # We don't have the trajectory, but at least we can rebase the height
        # (shift index 2 of the last axis so that its minimum over the whole array is 0).
        prediction[:, :, 2] -= np.min(prediction[:, :, 2])

        # Mapping of display title -> pose array, passed to render_animation below.
        anim_output = {'Reconstruction': prediction}

        # Only the first two channels of the keypoints are used.
        # presumably image_coordinates maps normalized coordinates back to pixels
        # for a res_w x res_h image - TODO confirm against common.camera.
        input_keypoints = image_coordinates(input_keypoints[..., :2],
                                            w=cam['res_w'],
                                            h=cam['res_h'])
        # print('w, h:', cam['res_w'], cam['res_h'])
        # Imported here rather than at module level, so the visualization module is
        # only loaded when this branch runs.
        from common.visualization import render_animation

        # Debug output of the rotation and azimuth handed to the renderer.
        print("rot:", rot)
        print("cam['azimuth']:", cam['azimuth'])
        render_animation(input_keypoints,
                         keypoints_metadata,
                         anim_output,
                         dataset.skeleton(),
                         dataset.fps(),
                         args.viz_bitrate,
                         cam['azimuth'],
                         args.viz_output,
                         limit=args.viz_limit,
示例#2
0
    # Project every 3D sequence of the dataset through each of its cameras and store the
    # resulting 2D poses, together with skeleton metadata, in a compressed .npz archive.
    print('Computing ground-truth 2D poses...')
    dataset = Human36mDataset(output_filename + '.npz')
    # Nested result: output_2d_poses[subject][action] -> list with one array per camera.
    output_2d_poses = {}
    for subject in dataset.subjects():
        output_2d_poses[subject] = {}
        for action in dataset[subject].keys():
            anim = dataset[subject][action]

            positions_2d = []
            for cam in anim['cameras']:
                # World space -> this camera's space, using its extrinsic parameters.
                pos_3d = world_to_camera(anim['positions'],
                                         R=cam['orientation'],
                                         t=cam['translation'])
                # presumably `wrap` adapts project_to_2d to numpy input/output and the
                # `True` flag controls how its arguments are prepared - TODO confirm.
                pos_2d = wrap(project_to_2d, True, pos_3d, cam['intrinsic'])
                # Rescale the projection to the camera's res_w x res_h image.
                pos_2d_pixel_space = image_coordinates(pos_2d,
                                                       w=cam['res_w'],
                                                       h=cam['res_h'])
                # Stored as float32.
                positions_2d.append(pos_2d_pixel_space.astype('float32'))
            output_2d_poses[subject][action] = positions_2d

    print('Saving...')
    # Joint count plus the left/right joint index lists of the dataset skeleton.
    metadata = {
        'num_joints':
        dataset.skeleton().num_joints(),
        'keypoints_symmetry':
        [dataset.skeleton().joints_left(),
         dataset.skeleton().joints_right()]
    }
    # Both values are plain Python dicts, which numpy stores as pickled object arrays;
    # reading the archive back therefore needs np.load(..., allow_pickle=True).
    np.savez_compressed(output_filename_2d,
                        positions_2d=output_2d_poses,
                        metadata=metadata)
示例#3
0
# Pose-lifting networks that analyze_frame has already built, keyed by
# (input joints, input features per joint, output joints). Keeping them here
# means the checkpoint is read from disk once instead of once per frame.
_POSE_MODEL_CACHE = {}


def _load_pose_model(num_joints_in, in_features, num_joints_out):
    """Return the pretrained causal TemporalModel, building it on first use."""
    cache_key = (num_joints_in, in_features, num_joints_out)
    cached = _POSE_MODEL_CACHE.get(cache_key)
    if cached is not None:
        return cached

    # The architecture settings are hardcoded instead of being read from
    # command-line arguments as run.py does.
    model_pos = TemporalModel(num_joints_in, in_features, num_joints_out,
                              filter_widths=[3, 3, 3, 3, 3], causal=True, dropout=0.25,
                              channels=1024, dense=False)
    print('INFO: Receptive field: {} frames'.format(model_pos.receptive_field()))
    print('INFO: Trainable parameter count:',
          sum(parameter.numel() for parameter in model_pos.parameters()))

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    chk_filename = "checkpoint/pretrained_h36m_detectron_coco.bin"
    print('Loading checkpoint', chk_filename)
    # Deserialize onto the CPU first; load_state_dict then copies the weights
    # onto whatever device the model lives on.
    checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
    print('This model was trained for {} epochs'.format(checkpoint['epoch']))
    # Only the pose weights are used. An optional 'model_traj' entry in the
    # checkpoint is ignored because no trajectory is ever predicted here.
    model_pos.load_state_dict(checkpoint['model_pos'])

    _POSE_MODEL_CACHE[cache_key] = model_pos
    return model_pos


def _predict_3d(model_pos, generator, joints_left, joints_right):
    """Run model_pos on the first batch of generator and return a numpy array."""
    with torch.no_grad():
        model_pos.eval()
        for _, _, batch_2d in generator.next_epoch():
            inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
            if torch.cuda.is_available():
                inputs_2d = inputs_2d.cuda()

            predicted_3d_pos = model_pos(inputs_2d)

            # Test-time augmentation (if enabled)
            if generator.augment_enabled():
                # Undo flipping and take average with non-flipped version
                predicted_3d_pos[1, :, :, 0] *= -1
                predicted_3d_pos[1, :, joints_left + joints_right] = \
                    predicted_3d_pos[1, :, joints_right + joints_left]
                predicted_3d_pos = torch.mean(predicted_3d_pos, dim=0, keepdim=True)

            return predicted_3d_pos.squeeze(0).cpu().numpy()

    raise RuntimeError('The generator produced no batch to run the pose model on')


def analyze_frame(h, frame):
    """Detect the 2D pose in one frame and lift it to a 3D pose.

    Args:
        h: the init dictionary. The entries read here are 'predictor',
            'dataset', 'frame_metadata' (with 'w' and 'h'), 'kps_left',
            'kps_right', 'joints_left' and 'joints_right'.
        frame: the image handed to ``infer.inference_on_frame``.

    Returns:
        A tuple ``(prediction3d, image_keypoints2d)``: the 3D pose of the
        frame after the camera rotation and height rebase, and the 2D
        keypoints as returned by ``image_coordinates``.

    Raises:
        RuntimeError: if no camera of the dataset has an 'orientation' entry,
            or if the generator yields no batch.
    """
    _boxes, keypoints = infer.inference_on_frame(h['predictor'], frame)

    # The first element of `keypoints` is an empty array, the second holds the
    # frame data. Take the first two rows of its first entry and transpose
    # them (the (x, y) extraction done in the prepare_data_2d_custom code).
    # NOTE(review): frames with NaN or missing detections are not handled.
    kp = keypoints[1][0][:2, :].T

    dataset = h['dataset']
    kps_left = h['kps_left']
    kps_right = h['kps_right']
    joints_left = h['joints_left']
    joints_right = h['joints_right']

    # Normalize all joints at once. `kp` is a view into the detector output,
    # so this overwrites it in place, as assigning row by row would.
    frame_meta = h['frame_metadata']
    kp[...] = normalize_screen_coordinates(kp, frame_meta['w'], frame_meta['h'])

    model_pos = _load_pose_model(kp.shape[-2], kp.shape[-1],
                                 dataset.skeleton().num_joints())

    pad = (model_pos.receptive_field() - 1) // 2  # Padding on each side
    # Causal convolutions are always used here, so the shift equals the
    # padding (what run.py does when args.causal is set).
    causal_shift = pad

    gen = UnchunkedGenerator(None, None, [[kp]],
                             pad=pad, causal_shift=causal_shift, augment=False,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = _predict_3d(model_pos, gen, joints_left, joints_right)

    # Joint order of the 3D result, as consumed on the Unity side:
    # public enum VideoPose3dJointOrder
    # {
    #     HIP = 0,
    #     R_HIP = 1,
    #     R_KNEE = 2,
    #     R_FOOT = 3,
    #     L_HIP = 4,
    #     L_KNEE = 5,
    #     L_FOOT = 6,
    #     SPINE = 7,
    #     THORAX = 8,
    #     NOSE = 9,
    #     HEAD = 10,
    #     L_SHOULDER = 11,
    #     L_ELBOW = 12,
    #     L_WRIST = 13,
    #     R_SHOULDER = 14,
    #     R_ELBOW = 15,
    #     R_WRIST = 16
    # }

    cameras = dataset.cameras()

    # The frame size comes from the first camera of the 'frame' subject.
    width = cameras['frame'][0]['res_w']
    height = cameras['frame'][0]['res_h']
    image_keypoints2d = image_coordinates(kp[..., :2], w=width, h=height)

    viz_camera = 0

    # If the ground truth is not available, take the camera extrinsic params
    # from the first subject that has them. They are almost the same, and
    # anyway, we only need this for visualization purposes.
    for subject in cameras:
        cam_params = cameras[subject][viz_camera]
        if 'orientation' in cam_params:
            rot = cam_params['orientation']
            break
    else:
        raise RuntimeError(
            "No camera {} with an 'orientation' entry found in the dataset".format(viz_camera))

    # Invert camera transformation
    prediction = camera_to_world(prediction, R=rot, t=0)
    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])

    # The algorithm was written for a list of frames; ours holds exactly one.
    prediction3d = prediction[0]

    return prediction3d, image_keypoints2d
示例#4
0
def main(args):
    """Render the predicted 3D poses of one sequence next to the ground truth.

    Loads the 3D dataset and 2D detections selected by ``args.dataset`` and
    ``args.keypoints``, builds the network named by ``args.architecture``
    (``'linear'`` or ``'gcn'``), restores the checkpoint at ``args.evaluate``
    and renders the sequence ``args.viz_subject`` / ``args.viz_action`` /
    ``args.viz_camera`` with ``render_animation``.

    Runs on the GPU when one is available and on the CPU otherwise.

    Raises:
        KeyError: if ``args.dataset`` or ``args.architecture`` is not supported.
        RuntimeError: if ``args.evaluate`` is not an existing file.
    """
    print('==> Using settings {}'.format(args))

    # Passed as the first argument to SemGCN below (gcn architecture only).
    convm = torch.zeros(3, 17, 17, dtype=torch.float)

    print('==> Loading dataset...')
    dataset_path = path.join('data', 'data_3d_' + args.dataset + '.npz')
    if args.dataset == 'h36m':
        from common.h36m_dataset import Human36mDataset
        dataset = Human36mDataset(dataset_path)
    else:
        raise KeyError('Invalid dataset')

    print('==> Preparing data...')
    dataset = read_3d_data(dataset)

    print('==> Loading 2D detections...')
    keypoints = create_2d_data(
        path.join('data',
                  'data_2d_' + args.dataset + '_' + args.keypoints + '.npz'),
        dataset)

    cudnn.benchmark = True
    # Fall back to the CPU instead of failing on the first .to(device) when
    # the host has no usable GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Create model
    print("==> Creating model...")

    if args.architecture == 'linear':
        from models.linear_model import LinearModel, init_weights
        num_joints = dataset.skeleton().num_joints()
        model_pos = LinearModel(num_joints * 2,
                                (num_joints - 1) * 3).to(device)
        model_pos.apply(init_weights)
    elif args.architecture == 'gcn':
        from models.sem_gcn import SemGCN
        from common.graph_utils import adj_mx_from_skeleton
        # A dropout of exactly 0.0 is passed on as None.
        p_dropout = (None if args.dropout == 0.0 else args.dropout)
        adj = adj_mx_from_skeleton(dataset.skeleton())
        nodes_group = dataset.skeleton().joints_group() if args.non_local else None
        model_pos = SemGCN(convm,
                           adj,
                           args.hid_dim,
                           num_layers=args.num_layers,
                           p_dropout=p_dropout,
                           nodes_group=nodes_group).to(device)
    else:
        raise KeyError('Invalid model architecture')

    print("==> Total parameters: {:.2f}M".format(
        sum(p.numel() for p in model_pos.parameters()) / 1000000.0))

    # Resume from a checkpoint
    ckpt_path = args.evaluate

    if path.isfile(ckpt_path):
        print("==> Loading checkpoint '{}'".format(ckpt_path))
        # map_location lets a checkpoint saved on a GPU be restored on the CPU.
        ckpt = torch.load(ckpt_path, map_location=device)
        start_epoch = ckpt['epoch']
        error_best = ckpt['error']
        model_pos.load_state_dict(ckpt['state_dict'])
        print("==> Loaded checkpoint (Epoch: {} | Error: {})".format(
            start_epoch, error_best))
    else:
        raise RuntimeError("==> No checkpoint found at '{}'".format(ckpt_path))

    print('==> Rendering...')

    poses_2d = keypoints[args.viz_subject][args.viz_action]
    out_poses_2d = poses_2d[args.viz_camera]
    # One entry per frame of the selected sequence; each entry is the camera index.
    out_actions = [args.viz_camera] * out_poses_2d.shape[0]

    poses_3d = dataset[args.viz_subject][args.viz_action]['positions_3d']
    assert len(poses_3d) == len(poses_2d), 'Camera count mismatch'
    out_poses_3d = poses_3d[args.viz_camera]

    # Work on copies so the transformations below leave the dataset untouched.
    ground_truth = out_poses_3d.copy()
    input_keypoints = out_poses_2d.copy()

    render_loader = DataLoader(PoseGenerator([out_poses_3d], [out_poses_2d],
                                             [out_actions]),
                               batch_size=args.batch_size,
                               shuffle=False,
                               num_workers=args.num_workers,
                               # Pinned memory is only useful for GPU transfers.
                               pin_memory=device.type == 'cuda')

    prediction = evaluate(render_loader, model_pos, device,
                          args.architecture)[0]

    # Invert camera transformation
    cam = dataset.cameras()[args.viz_subject][args.viz_camera]
    prediction = camera_to_world(prediction, R=cam['orientation'], t=0)
    # Rebase the height so that the lowest point of each animation is at 0.
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    ground_truth = camera_to_world(ground_truth, R=cam['orientation'], t=0)
    ground_truth[:, :, 2] -= np.min(ground_truth[:, :, 2])

    anim_output = {'Regression': prediction, 'Ground truth': ground_truth}
    input_keypoints = image_coordinates(input_keypoints[..., :2],
                                        w=cam['res_w'],
                                        h=cam['res_h'])
    render_animation(input_keypoints,
                     anim_output,
                     dataset.skeleton(),
                     dataset.fps(),
                     args.viz_bitrate,
                     cam['azimuth'],
                     args.viz_output,
                     limit=args.viz_limit,
                     downsample=args.viz_downsample,
                     size=args.viz_size,
                     input_video_path=args.viz_video,
                     viewport=(cam['res_w'], cam['res_h']),
                     input_video_skip=args.viz_skip)
示例#5
0
def videpose_infer(args):
    """Estimate 3D poses for a video (or precomputed 2D keypoints) and save them.

    The 2D keypoints are read from ``args.input_npz`` when it is set and are
    otherwise produced by the detector for ``args.viz_video``. They are lifted
    to 3D with a ``TemporalModel`` restored from the checkpoint
    ``args.resume`` (or ``args.evaluate``) inside ``args.checkpoint``. The raw
    network output is written to ``args.viz_output / "test_3d_output.npy"``.

    Args:
        args: parsed options. The attributes read here are ``viz_output`` (a
            path object supporting ``/``), ``detector_2d``, ``input_npz``,
            ``viz_video``, ``causal``, ``dropout``, ``channels``, ``dense``,
            ``checkpoint``, ``resume``, ``evaluate`` and
            ``test_time_augmentation``.

    Returns:
        A tuple ``(prediction, input_keypoints)``: the 3D poses after the
        camera rotation and height rebase, and the 2D keypoints as returned
        by ``image_coordinates``.
    """
    from common.camera import normalize_screen_coordinates, camera_to_world, image_coordinates
    from common.generators import UnchunkedGenerator
    from common.model import TemporalModel
    from common.utils import evaluate
    from videopose import ckpt_time, metadata, time0

    import gene_npz

    # Frame size assumed for (de)normalizing the keypoints; no real camera
    # parameters are available here.
    frame_w, frame_h = 1000, 1002

    gene_npz.args.outputpath = str(args.viz_output / "alpha_pose_kunkun_cut")
    print(gene_npz.args)
    detector_2d = gene_npz.generate_kpts(args.detector_2d)

    assert detector_2d, 'detector_2d should be in ({alpha, hr, open}_pose)'

    # 2D kpts loads or generate
    if not args.input_npz:
        keypoints = detector_2d(args.viz_video)
    else:
        # Close the archive as soon as the array has been read from it.
        with np.load(args.input_npz) as npz:
            keypoints = npz['kpts']  # (N, 17, 2)

    keypoints_symmetry = metadata['keypoints_symmetry']
    kps_left = list(keypoints_symmetry[0])
    kps_right = list(keypoints_symmetry[1])
    joints_left = [4, 5, 6, 11, 12, 13]
    joints_right = [1, 2, 3, 14, 15, 16]

    # Normalize the keypoints using the assumed frame size.
    keypoints = normalize_screen_coordinates(
        keypoints[..., :2], w=frame_w, h=frame_h)

    model_pos = TemporalModel(17, 2, 17, filter_widths=[3, 3, 3, 3, 3], causal=args.causal,
                              dropout=args.dropout, channels=args.channels,
                              dense=args.dense)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    ckpt, time1 = ckpt_time(time0)
    print('-------------- load data spends {:.2f} seconds'.format(ckpt))

    # load trained model
    chk_filename = os.path.join(
        args.checkpoint, args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    # Deserialize every tensor onto the CPU, whatever device it was saved from.
    checkpoint = torch.load(
        chk_filename, map_location=lambda storage, loc: storage)
    model_pos.load_state_dict(checkpoint['model_pos'])

    ckpt, time2 = ckpt_time(time1)
    print('-------------- load 3D model spends {:.2f} seconds'.format(ckpt))

    #  Receptive field: 243 frames for args.arc [3, 3, 3, 3, 3]
    receptive_field = model_pos.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    # A causal model needs its padding shifted by the same amount; leaving the
    # shift at 0 would feed it a symmetric window.
    causal_shift = pad if args.causal else 0

    print('Rendering...')
    input_keypoints = keypoints.copy()
    gen = UnchunkedGenerator(None, None, [input_keypoints],
                             pad=pad, causal_shift=causal_shift,
                             augment=args.test_time_augmentation,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)

    # save 3D joint points (the network output, before the rotation below)
    np.save(args.viz_output / "test_3d_output.npy",
            prediction, allow_pickle=True)

    # Fixed camera orientation, used because no calibrated camera is available.
    rot = np.array([0.14070565, -0.15007018, -0.7552408,
                    0.62232804], dtype=np.float32)
    prediction = camera_to_world(prediction, R=rot, t=0)

    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    input_keypoints = image_coordinates(
        input_keypoints[..., :2], w=frame_w, h=frame_h)

    ckpt, _ = ckpt_time(time2)
    print(
        '-------------- generate reconstruction 3D data spends {:.2f} seconds'.format(ckpt))

    # The total is measured from time0, not from the previous checkpoint.
    ckpt, _ = ckpt_time(time0)
    print('total spend {:.2f} second'.format(ckpt))

    return prediction, input_keypoints