Example #1
def inference_video(video_path, detector_2d):
    """
    Run the image -> 2D keypoints -> 3D keypoints pipeline on a video.
    :param video_path: path to the input video, relative to the outputs directory
    :param detector_2d: 2D joint detector to use; one of {alpha_pose, hr_pose}
    :return: None
    """
    args = parse_args()

    args.detector_2d = detector_2d
    dir_name = os.path.dirname(video_path)
    dir_name_split = dir_name[:dir_name.rfind('/')]
    new_dir_name = os.path.join(dir_name_split, 'outputvideo')

    basename = os.path.basename(video_path)
    video_name = basename[:basename.rfind('.')]

    args.viz_video = video_path
    #args.viz_output = f'{dir_name}/{args.detector_2d}_{video_name}.mp4'
    args.viz_output = f'{new_dir_name}/{args.detector_2d}_{video_name}.mp4'

    # args.viz_limit = 20
    # args.input_npz = 'outputs/alpha_pose_dance/dance.npz'

    args.evaluate = 'pretrained_h36m_detectron_coco.bin'

    with Timer(video_path):
        main(args)
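Timer is not defined in this snippet; a minimal context manager matching the usage above might look like the following sketch (the project's actual implementation may differ).

import time
from contextlib import contextmanager

@contextmanager
def Timer(label):
    # Hypothetical stand-in for the Timer used above: prints elapsed wall-clock time.
    start = time.time()
    try:
        yield
    finally:
        print('{}: {:.2f} s'.format(label, time.time() - start))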
Example #2
def inference_video(video_path):
    """
    Run the image -> 2D keypoints -> 3D keypoints pipeline on a video.
    :param video_path: path to the input video, relative to the outputs directory
    """
    args = parse_args()
    dir_name = os.path.dirname(video_path)
    basename = os.path.basename(video_path)
    video_name = basename[:basename.rfind('.')]
    args.viz_video = video_path
    args.viz_output = '{0}/o_{1}.mp4'.format(dir_name, video_name)
    args.basename = video_name
    args.evaluate = 'pretrained_h36m_detectron_coco.bin'

    with Timer(video_path):
        main(args)
Example #3
import os

from common.arguments import parse_args
from common.camera import normalize_screen_coordinates, camera_to_world, image_coordinates
from common.generators import UnchunkedGenerator
from common.loss import *
from common.mocap_dataset import MocapDataset
from common.model import TemporalModel
from common.utils import deterministic_random

args = parse_args()
print(args)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def load_dataset() -> MocapDataset:
    """
    Load the dataset.
    Returns: dataset
    """
    print('Loading custom dataset...')
    if args.dataset.startswith('custom'):
        # The custom dataset holds 2D keypoints used to predict 3D keypoints
        from common.custom_dataset import CustomDataset
        dataset_ = CustomDataset('data/data_2d_' + args.dataset + '_' +
                                 args.keypoints + '.npz')
    else:
        raise KeyError('Invalid dataset')

    return dataset_
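The loader above expects the 2D keypoint archive to follow the data/data_2d_<dataset>_<keypoints>.npz naming convention; a quick illustration with hypothetical argument values:

# Hypothetical values for illustration only; the real ones come from parse_args().
dataset_name = 'custom'
keypoints_name = 'myvideo'
print('data/data_2d_' + dataset_name + '_' + keypoints_name + '.npz')
# -> data/data_2d_custom_myvideo.npz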
Example #4
def main():
    args = parse_args()

    # Load or generate the 2D keypoints
    if not args.input_npz:
        # Create keypoints with AlphaPose
        from joints_detectors.Alphapose.gene_npz import handle_video
        video_name = args.viz_video
        keypoints = handle_video(video_name)
    else:
        npz = np.load(args.input_npz)
        keypoints = npz['kpts']  #(N, 17, 2)

    keypoints_symmetry = metadata['keypoints_symmetry']
    kps_left, kps_right = list(keypoints_symmetry[0]), list(keypoints_symmetry[1])
    joints_left, joints_right = [4, 5, 6, 11, 12, 13], [1, 2, 3, 14, 15, 16]

    # Normalize keypoints, assuming the camera resolution (w=1000, h=1002)
    keypoints = normalize_screen_coordinates(keypoints[..., :2],
                                             w=1000,
                                             h=1002)

    model_pos = TemporalModel(17,
                              2,
                              17,
                              filter_widths=[3, 3, 3, 3, 3],
                              causal=args.causal,
                              dropout=args.dropout,
                              channels=args.channels,
                              dense=args.dense)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    ckpt, time1 = ckpt_time(time0)
    print('------- load data spends {:.2f} seconds'.format(ckpt))

    # load trained model
    chk_filename = os.path.join(args.checkpoint,
                                args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', os.path.join(main_path, chk_filename))
    checkpoint = torch.load(
        os.path.join(main_path, chk_filename),
        map_location=lambda storage, loc: storage)  # map loc to storage (load on CPU)
    model_pos.load_state_dict(checkpoint['model_pos'])

    ckpt, time2 = ckpt_time(time1)
    print('------- load 3D model spends {:.2f} seconds'.format(ckpt))

    #  Receptive field: 243 frames for args.arc [3, 3, 3, 3, 3]
    receptive_field = model_pos.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    print('Rendering...')
    input_keypoints = keypoints.copy()
    gen = UnchunkedGenerator(None,
                             None, [input_keypoints],
                             pad=pad,
                             causal_shift=causal_shift,
                             augment=args.test_time_augmentation,
                             kps_left=kps_left,
                             kps_right=kps_right,
                             joints_left=joints_left,
                             joints_right=joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)

    rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804],
                   dtype=np.float32)
    prediction = camera_to_world(prediction, R=rot, t=0)

    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    anim_output = {'Reconstruction': prediction}
    input_keypoints = image_coordinates(input_keypoints[..., :2],
                                        w=1000,
                                        h=1002)

    ckpt, time3 = ckpt_time(time2)
    print(
        '------- generate reconstruction 3D data spends {:.2f} seconds'.format(
            ckpt))

    if not args.viz_output:
        args.viz_output = 'result.mp4'

    from common.visualization import render_animation
    render_animation(input_keypoints,
                     anim_output,
                     skeleton(),
                     25,
                     args.viz_bitrate,
                     np.array(70., dtype=np.float32),
                     args.viz_output,
                     limit=args.viz_limit,
                     downsample=args.viz_downsample,
                     size=args.viz_size,
                     input_video_path=args.viz_video,
                     viewport=(1000, 1002),
                     input_video_skip=args.viz_skip)

    ckpt, time4 = ckpt_time(time3)
    print('total spends {:.2f} seconds'.format(ckpt))
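As the comment in this example notes, filter widths [3, 3, 3, 3, 3] give a receptive field of 3^5 = 243 frames, so the per-side padding is (243 - 1) // 2 = 121; a quick sanity check of that arithmetic:

from functools import reduce

filter_widths = [3, 3, 3, 3, 3]
receptive_field = reduce(lambda a, b: a * b, filter_widths)  # 3 ** 5 = 243
pad = (receptive_field - 1) // 2                             # 121 frames of padding on each side
print(receptive_field, pad)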
Example #5
def inference_camera():
    args = parse_args()

    main_cam(args)
Example #6
def main():
    args = parse_args()
    args.input_npz = "data/VideoPose_test.npz"
    metadata = {
        'layout_name': 'coco',
        'num_joints': 17,
        'keypoints_symmetry': [[1, 3, 5, 7, 9, 11, 13, 15],
                               [2, 4, 6, 8, 10, 12, 14, 16]]
    }

    npz = np.load(args.input_npz)
    keypoints = npz['kpts']
    keypoints_symmetry = metadata['keypoints_symmetry']
    kps_left, kps_right = list(keypoints_symmetry[0]), list(keypoints_symmetry[1])
    # Same as the original: list(dataset.skeleton().joints_left()), list(dataset.skeleton().joints_right())
    joints_left, joints_right = [4, 5, 6, 11, 12, 13], [1, 2, 3, 14, 15, 16]

    # Normalize keypoints, assuming the camera resolution below
    res_w = 1920
    res_h = 1080
    keypoints = normalize_screen_coordinates(keypoints[..., :2],
                                             w=res_w,
                                             h=res_h)

    #model_pos = TemporalModel(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1], dataset.skeleton().num_joints(),
    #                        filter_widths=filter_widths, causal=args.causal, dropout=args.dropout, channels=args.channels,
    #                        dense=args.dense)
    model_pos = TemporalModel(17,
                              2,
                              17,
                              filter_widths=[3, 3, 3, 3, 3],
                              causal=args.causal,
                              dropout=args.dropout,
                              channels=args.channels,
                              dense=args.dense)

    receptive_field = model_pos.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2  # Padding on each side
    if args.causal:
        print('INFO: Using causal convolutions')
        causal_shift = pad
    else:
        causal_shift = 0

    model_params = 0
    for parameter in model_pos.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    # Load the trained model
    chk_filename = os.path.join(args.checkpoint,
                                args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(chk_filename,
                            map_location=lambda storage, loc: storage)
    print('This model was trained for {} epochs'.format(checkpoint['epoch']))
    model_pos.load_state_dict(checkpoint['model_pos'])

    #test_generator = UnchunkedGenerator(cameras_valid, poses_valid, poses_valid_2d,
    #                                pad=pad, causal_shift=causal_shift, augment=False,
    #                                kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right)

    input_keypoints = keypoints.copy()
    gen = UnchunkedGenerator(None,
                             None, [input_keypoints],
                             pad=pad,
                             causal_shift=causal_shift,
                             augment=args.test_time_augmentation,
                             kps_left=kps_left,
                             kps_right=kps_right,
                             joints_left=joints_left,
                             joints_right=joints_right)
    prediction = evaluate_alphapose(gen, model_pos, return_predictions=True)
    print('INFO: Testing on {} frames'.format(gen.num_frames()))

    if args.viz_export is not None:
        print('Exporting joint positions to', args.viz_export)
        # Predictions are in camera space
        np.save(args.viz_export, prediction)

    if args.viz_output is not None:
        #from custom_dataset.py
        rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804],
                       dtype=np.float32)
        prediction = camera_to_world(prediction, R=rot, t=0)

        # We don't have the trajectory, but at least we can rebase the height
        prediction[:, :, 2] -= np.min(prediction[:, :, 2])
        anim_output = {'Reconstruction': prediction}
        input_keypoints = image_coordinates(input_keypoints[..., :2],
                                            w=res_w,
                                            h=res_h)
        # Generate metadata:
        keypoints_metadata = {}
        keypoints_metadata['layout_name'] = 'coco'
        keypoints_metadata['num_joints'] = 17
        keypoints_metadata['keypoints_symmetry'] = [[1, 3, 5, 7, 9, 11, 13, 15],
                                                    [2, 4, 6, 8, 10, 12, 14, 16]]
        from common.visualization import render_animation
        #fps 25, azimuth 70
        render_animation(input_keypoints,
                         keypoints_metadata,
                         anim_output,
                         Skeleton(),
                         25,
                         args.viz_bitrate,
                         np.array(70., dtype=np.float32),
                         args.viz_output,
                         limit=args.viz_limit,
                         downsample=args.viz_downsample,
                         size=args.viz_size,
                         input_video_path=args.viz_video,
                         viewport=(res_w, res_h),
                         input_video_skip=args.viz_skip)
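For context, normalize_screen_coordinates maps pixel coordinates so that [0, w] becomes [-1, 1] while preserving the aspect ratio; a sketch equivalent to the VideoPose3D helper (assuming the standard implementation) is:

import numpy as np

def normalize_screen_coordinates_sketch(X, w, h):
    # Map [0, w] to [-1, 1], preserving the aspect ratio
    # (sketch only; the project's helper may differ slightly).
    assert X.shape[-1] == 2
    return X / w * 2 - np.array([1, h / w])

# The image centre of a 1920x1080 frame maps to (0, 0).
print(normalize_screen_coordinates_sketch(np.array([[960.0, 540.0]]), w=1920, h=1080))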
Example #7
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
import sys
import errno

from common.camera import *
from common.model import *
from common.loss import *
from common.generators import ChunkedGenerator, UnchunkedGenerator
from time import time
from common.utils import deterministic_random

args = parse_args()
print(args)

try:
    # Create checkpoint directory if it does not exist
    os.makedirs(args.checkpoint)
except OSError as e:
    if e.errno != errno.EEXIST:
        raise RuntimeError('Unable to create checkpoint directory:', args.checkpoint)


print('Loading dataset...')
dataset_path = 'data/data_3d_' + args.dataset + '.npz'
if args.dataset == 'h36m':
    from common.h36m_dataset import Human36mDataset
    dataset = Human36mDataset(dataset_path)
Example #8
            import json
            # Format the data the same way as MediaPipe, so we can load it in Unity with the same script.
            # We need a list (frames) of lists of 3D landmarks.
            # Note: prediction only has 17 landmarks, while our Unity script expects 25.
            unity_landmarks = prediction.tolist()

            with open(args.output_json, "w") as json_file:
                json.dump(unity_landmarks, json_file)

            if args.rendervideo == "yes":

                from common.visualization import render_animation
                render_animation(input_keypoints,
                                 keypoints_metadata,
                                 anim_output,
                                 dataset.skeleton(),
                                 dataset.fps(),
                                 args.viz_bitrate,
                                 cam['azimuth'],
                                 args.viz_output,
                                 limit=args.viz_limit,
                                 downsample=args.viz_downsample,
                                 size=args.viz_size,
                                 input_video_path=args.viz_video,
                                 viewport=(cam['res_w'], cam['res_h']),
                                 input_video_skip=args.viz_skip)


if __name__ == '__main__':
    the_main_kaboose(parse_args())
Example #9
                
            if self.FRAME % self.LOG_INTERVAL == 0 and len(self.PREVIOUS_REWARD) >= self.EVAL_LENGTH:
                print('Frame %7d, epoch %6d, %5d steps, %.1f steps/s, loss %4.4f, %4.2f'
                      % (self.FRAME, len(self.PREVIOUS_REWARD), self.FRAME * self.threads,
                         (self.FRAME * self.threads - last_step) / (time.time() - last_time), last_loss,
                         sum(self.PREVIOUS_REWARD[-self.EVAL_LENGTH:]) / self.EVAL_LENGTH
                        )
                     )
                last_time = time.time()
                last_step = self.FRAME * self.threads

            state = next_s

    def evaluate_terminate(self, evaluate_length = 10):
        if len(self.PREVIOUS_REWARD) > evaluate_length:
            now = sum(self.PREVIOUS_REWARD[-evaluate_length:]) / evaluate_length
            if now > self.BEST_RESULT:
                if self.BEST_RESULT != -1e100:
                    print('best result updated, %.4f -> %.4f.' % (self.BEST_RESULT, now))
                self.BEST_RESULT = now
                if self.MODEL_SAVE_PATH != '':
                    torch.save(self.model.state_dict(), self.MODEL_SAVE_PATH)
                if now > self.TARGET_REWARD:
                    print('Problem solved, stop training.')
                    self.TXSW.close()
                    return True
        return False

if __name__ == "__main__":
    main = ActorCriticMain(**vars(parse_args()))
    main.main()
Example #10
def main():
    #cap = cv2.VideoCapture(0)
    cap = cv2.VideoCapture('D://data//videos//VID_29551_cam0_crop.mkv')

    #parser = argparse.ArgumentParser()
    opWrapper = op.WrapperPython()

    params = dict()
    params["model_folder"] = "D://models//"

    opWrapper.configure(params)

    opWrapper.start()

    if not glfw.init():
        return

    window = glfw.create_window(w_width, w_height, "My OpenGL window", None,
                                None)

    if not window:
        glfw.terminate()
        return

    glfw.make_context_current(window)
    glfw.set_window_size_callback(window, window_resize)

    vertex_shader = """
    #version 330
    in vec3 position;

    uniform mat4 view;
    uniform mat4 model;
    uniform mat4 projection;

    void main()
    {
        gl_Position = projection * view * model * vec4(position, 1.0f);
    }
    """

    fragment_shader = """
    #version 330
    out vec4 outColor;
    void main()
    {
        outColor = vec4(1.0f,1.0f,1.0f,1.0f);
    }
    """
    shader = OpenGL.GL.shaders.compileProgram(
        OpenGL.GL.shaders.compileShader(vertex_shader, GL_VERTEX_SHADER),
        OpenGL.GL.shaders.compileShader(fragment_shader, GL_FRAGMENT_SHADER))

    VBO = glGenBuffers(1)
    glBindBuffer(GL_ARRAY_BUFFER, VBO)
    glBufferData(GL_ARRAY_BUFFER, 17 * 3 * 4, None, GL_DYNAMIC_DRAW)

    EBO = glGenBuffers(1)
    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, EBO)
    glBufferData(GL_ELEMENT_ARRAY_BUFFER, 32 * 8, parentsIndices,
                 GL_STATIC_DRAW)

    position = glGetAttribLocation(shader, "position")
    glVertexAttribPointer(position, 3, GL_FLOAT, GL_FALSE, 0,
                          ctypes.c_void_p(0))
    glEnableVertexAttribArray(position)

    glUseProgram(shader)

    view = pyrr.matrix44.create_from_translation(pyrr.Vector3([0.0, 0.0,
                                                               -3.0]))
    projection = pyrr.matrix44.create_perspective_projection_matrix(
        45.0, w_width / w_height, 0.1, 100.0)
    model = pyrr.matrix44.create_from_translation(pyrr.Vector3([0.0, 0.0,
                                                                0.0]))

    view_loc = glGetUniformLocation(shader, "view")
    proj_loc = glGetUniformLocation(shader, "projection")
    model_loc = glGetUniformLocation(shader, "model")

    glUniformMatrix4fv(view_loc, 1, GL_FALSE, view)
    glUniformMatrix4fv(proj_loc, 1, GL_FALSE, projection)
    glUniformMatrix4fv(model_loc, 1, GL_FALSE, model)

    glClearColor(114.0 / 255.0, 144.0 / 255.0, 154.0 / 255.0, 1.0)
    glEnable(GL_DEPTH_TEST)
    glViewport(0, 0, w_width, w_height)

    args = parse_args()
    print(args)

    try:
        # Create checkpoint directory if it does not exist
        os.makedirs(args.checkpoint)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise RuntimeError('Unable to create checkpoint directory:',
                               args.checkpoint)

    print('Loading 2D detections...')

    keypoints = np.load('data/data_2d_' + args.keypoints + '.npz')

    keypoints = keypoints['positions_2d'].item()

    subject = 'S1'

    action = 'Directions 1'

    width_of = 410
    height_of = 374

    for cam_idx, kps in enumerate(keypoints[subject][action]):

        # Normalize camera frame
        # cam = dataset.cameras()[subject][cam_idx]
        kps[..., :2] = normalize_screen_coordinates(kps[..., :2],
                                                    w=width_of,
                                                    h=height_of)
        keypoints[subject][action][cam_idx] = kps

    subjects_train = args.subjects_train.split(',')
    subjects_semi = [] if not args.subjects_unlabeled else args.subjects_unlabeled.split(
        ',')
    subjects_test = args.subjects_test.split(',')

    semi_supervised = len(subjects_semi) > 0
    if semi_supervised and not dataset.supports_semi_supervised():
        raise RuntimeError(
            'Semi-supervised training is not implemented for this dataset')

    action_filter = None if args.actions == '*' else args.actions.split(',')
    if action_filter is not None:
        print('Selected actions:', action_filter)

    cameras_valid, poses_valid, poses_valid_2d = fetch(subjects_test,
                                                       keypoints,
                                                       args.downsample,
                                                       action_filter)

    filter_widths = [int(x) for x in args.architecture.split(',')]

    # IF RENDERING TO A VIDEO
    if args.viz_output:
        model_pos = TemporalModel(poses_valid_2d[0].shape[1],
                                  poses_valid_2d[0].shape[2],
                                  17,
                                  filter_widths=filter_widths,
                                  causal=args.causal,
                                  dropout=args.dropout,
                                  channels=args.channels,
                                  dense=args.dense)
    else:
        model_pos = TemporalModelOptimized1f(poses_valid_2d[0].shape[1],
                                             poses_valid_2d[0].shape[2],
                                             17,
                                             filter_widths=filter_widths,
                                             causal=args.causal,
                                             dropout=args.dropout,
                                             channels=args.channels)

    receptive_field = model_pos.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2  # Padding on each side
    if args.causal:
        print('INFO: Using causal convolutions')
        causal_shift = pad
    else:
        causal_shift = 0

    model_params = 0
    for parameter in model_pos.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()
    #    model_pos_train = model_pos_train.cuda()

    if args.resume or args.evaluate:
        chk_filename = os.path.join(
            args.checkpoint, args.resume if args.resume else args.evaluate)
        print('Loading checkpoint', chk_filename)
        checkpoint = torch.load(chk_filename,
                                map_location=lambda storage, loc: storage)
        print('This model was trained for {} epochs'.format(
            checkpoint['epoch']))
        model_pos.load_state_dict(checkpoint['model_pos'])

    # IF RENDERING TO A VIDEO
    if args.viz_output:

        print('Rendering...')
        my_action = 'Directions 1'

        input_keypoints = keypoints[args.viz_subject][my_action][
            args.viz_camera].copy()

        gen = UnchunkedGenerator(None,
                                 None, [input_keypoints],
                                 pad=pad,
                                 causal_shift=causal_shift,
                                 augment=args.test_time_augmentation,
                                 kps_left=kps_left,
                                 kps_right=kps_right,
                                 joints_left=joints_left,
                                 joints_right=joints_right)

        prediction = evaluate(gen, model_pos, return_predictions=True)

        ground_truth = None

        # These values are taken from a camera in the H36M dataset; ideally they would come from stereo calibration of the pip cameras
        prediction = camera_to_world(
            prediction,
            R=[0.14070565, -0.15007018, -0.7552408, 0.62232804],
            t=[1.841107, 4.9552846, 0.5634454])
        # We don't have the trajectory, but at least we can rebase the height
        prediction[:, :, 2] -= np.min(prediction[:, :, 2])

        anim_output = {'Reconstruction': prediction}

        input_keypoints = image_coordinates(input_keypoints[..., :2],
                                            w=width_of,
                                            h=height_of)

        manual_fps = 25

        np.savez('out_3D_vp3d', anim_output['Reconstruction'])
        camAzimuth = 70.0
        from common.visualization import render_animation
        render_animation(input_keypoints,
                         anim_output,
                         manual_fps,
                         args.viz_bitrate,
                         camAzimuth,
                         args.viz_output,
                         limit=args.viz_limit,
                         downsample=args.viz_downsample,
                         size=args.viz_size,
                         input_video_path=args.viz_video,
                         viewport=(width_of, height_of),
                         input_video_skip=args.viz_skip)
    # IF RENDERING LIVE

    else:
        print('Rendering...')
        my_action = 'Directions 1'

        input_keypoints = keypoints[args.viz_subject][my_action][
            args.viz_camera].copy()

        gen = UnchunkedGenerator(None,
                                 None, [input_keypoints],
                                 pad=pad,
                                 causal_shift=causal_shift,
                                 augment=args.test_time_augmentation,
                                 kps_left=kps_left,
                                 kps_right=kps_right,
                                 joints_left=joints_left,
                                 joints_right=joints_right)

        prediction = evaluateLive(gen,
                                  model_pos,
                                  VBO,
                                  window,
                                  model_loc,
                                  cap,
                                  opWrapper,
                                  return_predictions=True)
        glfw.terminate()
        cap.release()
        cv2.destroyAllWindows()
Example #11
def main(input_args):
    vp3d_dir = input_args.vp3d_dir
    sys.path.append(vp3d_dir)

    from common.camera import normalize_screen_coordinates
    from common.model import TemporalModel
    from common.generators import UnchunkedGenerator
    from common.arguments import parse_args

    args = parse_args()
    print(args)

    kps_left = [4, 5, 6, 11, 12, 13]
    kps_right = [1, 2, 3, 14, 15, 16]
    joints_left = [4, 5, 6, 11, 12, 13]
    joints_right = [1, 2, 3, 14, 15, 16]

    filter_widths = [int(x) for x in args.architecture.split(',')]

    num_joints_in = 17
    in_features = 2
    num_joints_out = 17
        
    model_pos = TemporalModel(num_joints_in, in_features, num_joints_out,
                                filter_widths=filter_widths, causal=args.causal, dropout=args.dropout, channels=args.channels,
                                dense=args.dense)

    receptive_field = model_pos.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2 # Padding on each side
    if args.causal:
        print('INFO: Using causal convolutions')
        causal_shift = pad
    else:
        causal_shift = 0

    model_params = 0
    for parameter in model_pos.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()
        
    if args.resume or args.evaluate:
        chk_filename = os.path.join(vp3d_dir, args.checkpoint, args.resume if args.resume else args.evaluate)
        print('Loading checkpoint', chk_filename)
        checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
        print('This model was trained for {} epochs'.format(checkpoint['epoch']))
        model_pos.load_state_dict(checkpoint['model_pos'])

    # Evaluate
    def evaluate(test_generator, action=None, return_predictions=False):
        epoch_loss_3d_pos = 0
        epoch_loss_3d_pos_procrustes = 0
        epoch_loss_3d_pos_scale = 0
        epoch_loss_3d_vel = 0
        with torch.no_grad():
            model_pos.eval()
            N = 0
            for _, batch, batch_2d in test_generator.next_epoch():
                inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
                if torch.cuda.is_available():
                    inputs_2d = inputs_2d.cuda()
                # Positional model
                predicted_3d_pos = model_pos(inputs_2d)

                # Test-time augmentation (if enabled)
                if test_generator.augment_enabled():
                    # Undo flipping and take average with non-flipped version
                    predicted_3d_pos[1, :, :, 0] *= -1
                    predicted_3d_pos[1, :, joints_left + joints_right] = predicted_3d_pos[1, :, joints_right + joints_left]
                    predicted_3d_pos = torch.mean(predicted_3d_pos, dim=0, keepdim=True)
                    
                if return_predictions:
                    return predicted_3d_pos.squeeze(0).cpu().numpy()
                    
                inputs_3d = torch.from_numpy(batch.astype('float32'))
                if torch.cuda.is_available():
                    inputs_3d = inputs_3d.cuda()
                inputs_3d[:, :, 0] = 0    
                if test_generator.augment_enabled():
                    inputs_3d = inputs_3d[:1]

                error = mpjpe(predicted_3d_pos, inputs_3d)
                epoch_loss_3d_pos_scale += inputs_3d.shape[0]*inputs_3d.shape[1] * n_mpjpe(predicted_3d_pos, inputs_3d).item()

                epoch_loss_3d_pos += inputs_3d.shape[0]*inputs_3d.shape[1] * error.item()
                N += inputs_3d.shape[0] * inputs_3d.shape[1]
                
                inputs = inputs_3d.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])
                predicted_3d_pos = predicted_3d_pos.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])

                epoch_loss_3d_pos_procrustes += inputs_3d.shape[0]*inputs_3d.shape[1] * p_mpjpe(predicted_3d_pos, inputs)

                # Compute velocity error
                epoch_loss_3d_vel += inputs_3d.shape[0]*inputs_3d.shape[1] * mean_velocity_error(predicted_3d_pos, inputs)
                
        if action is None:
            print('----------')
        else:
            print('----'+action+'----')
        e1 = (epoch_loss_3d_pos / N)*1000
        e2 = (epoch_loss_3d_pos_procrustes / N)*1000
        e3 = (epoch_loss_3d_pos_scale / N)*1000
        ev = (epoch_loss_3d_vel / N)*1000
        print('Test time augmentation:', test_generator.augment_enabled())
        print('Protocol #1 Error (MPJPE):', e1, 'mm')
        print('Protocol #2 Error (P-MPJPE):', e2, 'mm')
        print('Protocol #3 Error (N-MPJPE):', e3, 'mm')
        print('Velocity Error (MPJVE):', ev, 'mm')
        print('----------')

        return e1, e2, e3, ev

    def get_gt_dirs(input_path, camera_id='dev3'):
        """Get all directories with ground-truth 2D human pose annotations
        """
        gt_path_list = []
        category_path_list = get_subdirs(input_path)
        for category in category_path_list:
            if os.path.basename(category) != 'Calibration':
                category_scans = get_subdirs(category)
                for category_scan in category_scans:
                    device_list = get_subdirs(category_scan)
                    for device_path in device_list:
                        if camera_id in device_path:
                            if os.path.exists(os.path.join(device_path, 'pose2d')): # 2D annotations exist
                                gt_path_list.append(device_path) # eg <root>/Lack_TV_Bench/0007_white_floor_08_04_2019_08_28_10_47/dev3
        return gt_path_list

    def get_subdirs(input_path):
        '''
        get a list of subdirectories in input_path directory
        :param input_path: parent directory (in which to get the subdirectories)
        :return:
        subdirs: list of subdirectories in input_path
        '''
        subdirs = [os.path.join(input_path, dir_i) for dir_i in os.listdir(input_path)
                   if os.path.isdir(os.path.join(input_path, dir_i))]
        subdirs.sort()
        return subdirs

    fps = 30
    frame_width = 1920.0
    frame_height = 1080.0

    h36m_joint_names = get_h36m_joint_names()
    h36m_joint_names_dict = {name: i for i, name in enumerate(h36m_joint_names)}
    joint_names = get_body25_joint_names()
    joint_names_dict = {name: i for i, name in enumerate(joint_names)}

    dataset_dir = input_args.dataset_dir
    camera_id = input_args.camera_id

    gt_dirs = get_gt_dirs(dataset_dir, camera_id)
    for i, gt_dir in enumerate(gt_dirs):
        print(f"\nProcessing {i} of {len(gt_dirs)}: {' '.join(gt_dir.split('/')[-3:-1])}")
        
        input_dir = os.path.join(gt_dir, 'predictions', 'pose2d', 'openpose')
        output_dir = os.path.join(gt_dir, 'predictions', 'pose3d', 'vp3d')
        os.makedirs(output_dir, exist_ok=True)

        json_mask = os.path.join(input_dir, 'scan_video_00000000????_keypoints.json')
        json_files = sorted(glob(json_mask))
        input_keypoints = []
        for json_file in json_files:
            with open(json_file, 'r') as f:
                pose2d = json.load(f)
            if len(pose2d["people"]) == 0:
                keypoints_op = np.zeros((19, 3))
            else:
                keypoints_op = np.array(pose2d["people"][0]["pose_keypoints_2d"]).reshape(-1, 3) # Takes first detected person every time...
            keypoints = np.zeros((17, 3))
            for i, joint_name in enumerate(h36m_joint_names):
                if joint_name == 'spine' or joint_name == 'head':
                    continue
                joint_id = joint_names_dict[joint_name]
                keypoints[i, :] = keypoints_op[joint_id, :]
            keypoints[h36m_joint_names_dict['mid hip'], :] = np.mean((keypoints[h36m_joint_names_dict['left hip'], :], keypoints[h36m_joint_names_dict['right hip'], :]), axis=0) # mid hip = mean(left hip, right hip)
            keypoints[h36m_joint_names_dict['spine'], :] = np.mean((keypoints[h36m_joint_names_dict['neck'], :], keypoints[h36m_joint_names_dict['mid hip'], :]), axis=0) # spine = mean(neck, mid hip)
            keypoints[h36m_joint_names_dict['head'], :] = np.mean((keypoints_op[joint_names_dict['left ear'], :], keypoints_op[joint_names_dict['right ear'], :]), axis=0) # head = mean(left ear, right ear)
            input_keypoints.append(keypoints)
        input_keypoints = np.array(input_keypoints)

        input_keypoints = input_keypoints[:, :, :2] # For pretrained_h36m_cpn.bin and cpn_ft_h36m_dbb

        input_keypoints[..., :2] = normalize_screen_coordinates(input_keypoints[..., :2], w=frame_width, h=frame_height)

        args.test_time_augmentation=True
        gen = UnchunkedGenerator(None, None, [input_keypoints],
                                 pad=pad, causal_shift=causal_shift, augment=args.test_time_augmentation,
                                 kps_left=kps_left, kps_right=kps_right, joints_left=joints_left, joints_right=joints_right)
        prediction = evaluate(gen, return_predictions=True) # Nx17x3

        pickle.dump(prediction, open(os.path.join(output_dir, 'vp3d_output.pkl'), "wb"))
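The saved predictions are camera-space joint positions; a usage sketch for reading one result back (the path here is illustrative only):

import pickle

with open('predictions/pose3d/vp3d/vp3d_output.pkl', 'rb') as f:
    prediction = pickle.load(f)
print(prediction.shape)  # (num_frames, 17, 3)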