def inference_video(video_path, detector_2d):
    """
    Do image -> 2d points -> 3d points to video.
    :param detector_2d: the 2D joints detector to use. Can be {alpha_pose, hr_pose}
    :param video_path: relative to outputs
    :return: None
    """
    args = parse_args()
    args.detector_2d = detector_2d

    dir_name = os.path.dirname(video_path)
    dir_name_split = dir_name[:dir_name.rfind('/')]
    new_dir_name = os.path.join(dir_name_split, 'outputvideo')
    basename = os.path.basename(video_path)
    video_name = basename[:basename.rfind('.')]

    args.viz_video = video_path
    # args.viz_output = f'{dir_name}/{args.detector_2d}_{video_name}.mp4'
    args.viz_output = f'{new_dir_name}/{args.detector_2d}_{video_name}.mp4'
    # args.viz_limit = 20
    # args.input_npz = 'outputs/alpha_pose_dance/dance.npz'
    args.evaluate = 'pretrained_h36m_detectron_coco.bin'

    with Timer(video_path):
        main(args)
def inference_video(video_path):
    """
    Do image -> 2d points -> 3d points to video.
    :param video_path: relative to outputs
    """
    args = parse_args()

    dir_name = os.path.dirname(video_path)
    basename = os.path.basename(video_path)
    video_name = basename[:basename.rfind('.')]

    args.viz_video = video_path
    args.viz_output = '{0}/o_{1}.mp4'.format(dir_name, video_name)
    args.basename = video_name
    args.evaluate = 'pretrained_h36m_detectron_coco.bin'

    with Timer(video_path):
        main(args)
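# A minimal usage sketch for the helper above (not part of the original script);
# the video path below is purely illustrative.
if __name__ == '__main__':
    inference_video('outputs/dance/dance.mp4')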
import os

import torch

from common.arguments import parse_args
from common.camera import normalize_screen_coordinates, camera_to_world, image_coordinates
from common.generators import UnchunkedGenerator
from common.loss import *
from common.mocap_dataset import MocapDataset
from common.model import TemporalModel
from common.utils import deterministic_random

args = parse_args()
print(args)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def load_dataset() -> MocapDataset:
    """
    Load the dataset.

    Returns:
        dataset
    """
    print('Loading custom dataset...')
    if args.dataset.startswith('custom'):
        # The custom dataset is a set of 2D keypoints used to predict 3D keypoints.
        from common.custom_dataset import CustomDataset
        dataset_ = CustomDataset('data/data_2d_' + args.dataset + '_' + args.keypoints + '.npz')
    else:
        raise KeyError('Invalid dataset')

    return dataset_
def main():
    args = parse_args()
    # NOTE: ckpt_time, time0, metadata, main_path, skeleton and evaluate are
    # expected to be defined elsewhere in this script (at module level).

    # 2D kpts loads or generate
    if not args.input_npz:
        # create kpts by AlphaPose
        from joints_detectors.Alphapose.gene_npz import handle_video
        video_name = args.viz_video
        keypoints = handle_video(video_name)
    else:
        npz = np.load(args.input_npz)
        keypoints = npz['kpts']  # (N, 17, 2)

    keypoints_symmetry = metadata['keypoints_symmetry']
    kps_left, kps_right = list(keypoints_symmetry[0]), list(keypoints_symmetry[1])
    joints_left, joints_right = list([4, 5, 6, 11, 12, 13]), list([1, 2, 3, 14, 15, 16])

    # Normalize keypoints, assuming these camera parameters.
    keypoints = normalize_screen_coordinates(keypoints[..., :2], w=1000, h=1002)

    model_pos = TemporalModel(17, 2, 17, filter_widths=[3, 3, 3, 3, 3], causal=args.causal,
                              dropout=args.dropout, channels=args.channels, dense=args.dense)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    ckpt, time1 = ckpt_time(time0)
    print('------- load data spends {:.2f} seconds'.format(ckpt))

    # Load the trained model.
    chk_filename = os.path.join(args.checkpoint, args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', os.path.join(main_path, chk_filename))
    checkpoint = torch.load(os.path.join(main_path, chk_filename),
                            map_location=lambda storage, loc: storage)  # map loc onto storage (load on CPU)
    model_pos.load_state_dict(checkpoint['model_pos'])

    ckpt, time2 = ckpt_time(time1)
    print('------- load 3D model spends {:.2f} seconds'.format(ckpt))

    # Receptive field: 243 frames for args.arc [3, 3, 3, 3, 3]
    receptive_field = model_pos.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    print('Rendering...')
    input_keypoints = keypoints.copy()
    gen = UnchunkedGenerator(None, None, [input_keypoints], pad=pad, causal_shift=causal_shift,
                             augment=args.test_time_augmentation,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)

    rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32)
    prediction = camera_to_world(prediction, R=rot, t=0)

    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    anim_output = {'Reconstruction': prediction}
    input_keypoints = image_coordinates(input_keypoints[..., :2], w=1000, h=1002)

    ckpt, time3 = ckpt_time(time2)
    print('------- generate reconstruction 3D data spends {:.2f} seconds'.format(ckpt))

    if not args.viz_output:
        args.viz_output = 'result.mp4'

    from common.visualization import render_animation
    render_animation(input_keypoints, anim_output, skeleton(), 25, args.viz_bitrate,
                     np.array(70., dtype=np.float32), args.viz_output,
                     limit=args.viz_limit, downsample=args.viz_downsample, size=args.viz_size,
                     input_video_path=args.viz_video, viewport=(1000, 1002),
                     input_video_skip=args.viz_skip)

    ckpt, time4 = ckpt_time(time3)
    print('total spend {:.2f} seconds'.format(ckpt))
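# Worked example of the padding arithmetic used above (a sketch, not part of the
# original pipeline): with filter_widths = [3, 3, 3, 3, 3] the receptive field
# works out to 3**5 = 243 frames, so 121 frames of padding are added per side.
def _receptive_field_sketch(filter_widths=(3, 3, 3, 3, 3)):
    frames = 1
    for w in filter_widths:
        frames *= w
    return frames, (frames - 1) // 2

assert _receptive_field_sketch() == (243, 121)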
def inference_camera():
    args = parse_args()
    main_cam(args)
def main():
    args = parse_args()
    args.input_npz = "data/VideoPose_test.npz"

    metadata = {
        'layout_name': 'coco',
        'num_joints': 17,
        'keypoints_symmetry': [[1, 3, 5, 7, 9, 11, 13, 15],
                               [2, 4, 6, 8, 10, 12, 14, 16]]
    }

    npz = np.load(args.input_npz)
    keypoints = npz['kpts']

    keypoints_symmetry = metadata['keypoints_symmetry']
    kps_left, kps_right = list(keypoints_symmetry[0]), list(keypoints_symmetry[1])
    # Same as the original: list(dataset.skeleton().joints_left()), list(dataset.skeleton().joints_right())
    joints_left, joints_right = list([4, 5, 6, 11, 12, 13]), list([1, 2, 3, 14, 15, 16])

    # Normalize keypoints, assuming these camera parameters.
    res_w = 1920
    res_h = 1080
    keypoints = normalize_screen_coordinates(keypoints[..., :2], w=res_w, h=res_h)

    # model_pos = TemporalModel(poses_valid_2d[0].shape[-2], poses_valid_2d[0].shape[-1], dataset.skeleton().num_joints(),
    #                           filter_widths=filter_widths, causal=args.causal, dropout=args.dropout,
    #                           channels=args.channels, dense=args.dense)
    model_pos = TemporalModel(17, 2, 17, filter_widths=[3, 3, 3, 3, 3], causal=args.causal,
                              dropout=args.dropout, channels=args.channels, dense=args.dense)

    receptive_field = model_pos.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2  # Padding on each side
    if args.causal:
        print('INFO: Using causal convolutions')
        causal_shift = pad
    else:
        causal_shift = 0

    model_params = 0
    for parameter in model_pos.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    # Load model
    chk_filename = os.path.join(args.checkpoint, args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
    print('This model was trained for {} epochs'.format(checkpoint['epoch']))
    model_pos.load_state_dict(checkpoint['model_pos'])

    # test_generator = UnchunkedGenerator(cameras_valid, poses_valid, poses_valid_2d,
    #                                     pad=pad, causal_shift=causal_shift, augment=False,
    #                                     kps_left=kps_left, kps_right=kps_right,
    #                                     joints_left=joints_left, joints_right=joints_right)
    input_keypoints = keypoints.copy()
    gen = UnchunkedGenerator(None, None, [input_keypoints], pad=pad, causal_shift=causal_shift,
                             augment=args.test_time_augmentation,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate_alphapose(gen, model_pos, return_predictions=True)
    print('INFO: Testing on {} frames'.format(gen.num_frames()))

    if args.viz_export is not None:
        print('Exporting joint positions to', args.viz_export)
        # Predictions are in camera space
        np.save(args.viz_export, prediction)

    if args.viz_output is not None:
        # From custom_dataset.py
        rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32)
        prediction = camera_to_world(prediction, R=rot, t=0)

        # We don't have the trajectory, but at least we can rebase the height
        prediction[:, :, 2] -= np.min(prediction[:, :, 2])
        anim_output = {'Reconstruction': prediction}
        input_keypoints = image_coordinates(input_keypoints[..., :2], w=res_w, h=res_h)

        # Generate metadata:
        keypoints_metadata = {}
        keypoints_metadata['layout_name'] = 'coco'
        keypoints_metadata['num_joints'] = 17
        keypoints_metadata['keypoints_symmetry'] = [[1, 3, 5, 7, 9, 11, 13, 15],
                                                    [2, 4, 6, 8, 10, 12, 14, 16]]

        from common.visualization import render_animation
        # fps 25, azimuth 70
        render_animation(input_keypoints, keypoints_metadata, anim_output, Skeleton(), 25,
                         args.viz_bitrate, np.array(70., dtype=np.float32), args.viz_output,
                         limit=args.viz_limit, downsample=args.viz_downsample, size=args.viz_size,
                         input_video_path=args.viz_video, viewport=(res_w, res_h),
                         input_video_skip=args.viz_skip)
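# Illustration of the screen-coordinate convention assumed above (a sketch of
# what normalize_screen_coordinates does, not a verbatim copy of the library
# function): x is mapped from [0, w] to [-1, 1] and y is scaled by the same
# factor to preserve the aspect ratio, so image_coordinates() can invert it.
import numpy as np

def normalize_screen_coordinates_sketch(X, w, h):
    assert X.shape[-1] == 2
    return X / w * 2 - [1, h / w]

pts = np.array([[0.0, 0.0], [1920.0, 1080.0]])  # corners of a 1920x1080 frame
print(normalize_screen_coordinates_sketch(pts, w=1920, h=1080))
# -> [[-1., -0.5625], [1., 0.5625]]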
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import os
import sys
import errno

from common.arguments import parse_args
from common.camera import *
from common.model import *
from common.loss import *
from common.generators import ChunkedGenerator, UnchunkedGenerator
from time import time
from common.utils import deterministic_random

args = parse_args()
print(args)

try:
    # Create checkpoint directory if it does not exist
    os.makedirs(args.checkpoint)
except OSError as e:
    if e.errno != errno.EEXIST:
        raise RuntimeError('Unable to create checkpoint directory:', args.checkpoint)

print('Loading dataset...')
dataset_path = 'data/data_3d_' + args.dataset + '.npz'
if args.dataset == 'h36m':
    from common.h36m_dataset import Human36mDataset
    dataset = Human36mDataset(dataset_path)
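# Note: the try/except guard above predates Python 3.2; for the plain
# "directory already exists" case, a more compact equivalent is:
os.makedirs(args.checkpoint, exist_ok=True)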
import json

# Format the data in the same format as MediaPipe, so we can load it in Unity with the same script.
# We need a list (frames) of lists of 3D landmarks,
# but prediction[] only has 17 landmarks, and we need 25 in our Unity script.
unity_landmarks = prediction.tolist()

with open(args.output_json, "w") as json_file:
    json.dump(unity_landmarks, json_file)

if args.rendervideo == "yes":
    from common.visualization import render_animation
    render_animation(input_keypoints, keypoints_metadata, anim_output,
                     dataset.skeleton(), dataset.fps(), args.viz_bitrate, cam['azimuth'], args.viz_output,
                     limit=args.viz_limit, downsample=args.viz_downsample, size=args.viz_size,
                     input_video_path=args.viz_video, viewport=(cam['res_w'], cam['res_h']),
                     input_video_skip=args.viz_skip)


if __name__ == '__main__':
    the_main_kaboose(parse_args())
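# A minimal reader sketch for the JSON written above; "landmarks.json" is an
# illustrative stand-in for whatever args.output_json points at. Each frame is
# a list of 17 [x, y, z] landmarks.
with open("landmarks.json") as f:
    frames = json.load(f)
print(len(frames), 'frames,', len(frames[0]), 'landmarks in the first frame')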
        if self.FRAME % self.LOG_INTERVAL == 0 and len(self.PREVIOUS_REWARD) >= self.EVAL_LENGTH:
            print('Frame %7d, epoch %6d, %5d steps, %.1f steps/s, loss %4.4f, %4.2f' % (
                self.FRAME,
                len(self.PREVIOUS_REWARD),
                self.FRAME * self.threads,
                (self.FRAME * self.threads - last_step) / (time.time() - last_time),
                last_loss,
                sum(self.PREVIOUS_REWARD[-self.EVAL_LENGTH:]) / self.EVAL_LENGTH))
            last_time = time.time()
            last_step = self.FRAME * self.threads

        state = next_s

    def evaluate_terminate(self, evaluate_length=10):
        if len(self.PREVIOUS_REWARD) > evaluate_length:
            now = sum(self.PREVIOUS_REWARD[-evaluate_length:]) / evaluate_length
            if now > self.BEST_RESULT:
                if self.BEST_RESULT != -1e100:
                    print('best result updated, %.4f -> %.4f.' % (self.BEST_RESULT, now))
                self.BEST_RESULT = now
                if self.MODEL_SAVE_PATH != '':
                    torch.save(self.model.state_dict(), self.MODEL_SAVE_PATH)
            if now > self.TARGET_REWARD:
                print('Problem solved, stop training.')
                self.TXSW.close()
                return True
        return False


if __name__ == "__main__":
    main = ActorCriticMain(**vars(parse_args()))
    main.main()
def main():
    # cap = cv2.VideoCapture(0)
    cap = cv2.VideoCapture('D://data//videos//VID_29551_cam0_crop.mkv')

    # parser = argparse.ArgumentParser()
    opWrapper = op.WrapperPython()
    params = dict()
    params["model_folder"] = "D://models//"
    opWrapper.configure(params)
    opWrapper.start()

    if not glfw.init():
        return
    window = glfw.create_window(w_width, w_height, "My OpenGL window", None, None)
    if not window:
        glfw.terminate()
        return
    glfw.make_context_current(window)
    glfw.set_window_size_callback(window, window_resize)

    vertex_shader = """
    #version 330
    in vec3 position;
    uniform mat4 view;
    uniform mat4 model;
    uniform mat4 projection;
    void main()
    {
        gl_Position = projection * view * model * vec4(position, 1.0f);
    }
    """

    fragment_shader = """
    #version 330
    out vec4 outColor;
    void main()
    {
        outColor = vec4(1.0f, 1.0f, 1.0f, 1.0f);
    }
    """

    shader = OpenGL.GL.shaders.compileProgram(
        OpenGL.GL.shaders.compileShader(vertex_shader, GL_VERTEX_SHADER),
        OpenGL.GL.shaders.compileShader(fragment_shader, GL_FRAGMENT_SHADER))

    VBO = glGenBuffers(1)
    glBindBuffer(GL_ARRAY_BUFFER, VBO)
    glBufferData(GL_ARRAY_BUFFER, 17 * 3 * 4, None, GL_DYNAMIC_DRAW)

    EBO = glGenBuffers(1)
    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, EBO)
    glBufferData(GL_ELEMENT_ARRAY_BUFFER, 32 * 8, parentsIndices, GL_STATIC_DRAW)

    position = glGetAttribLocation(shader, "position")
    glVertexAttribPointer(position, 3, GL_FLOAT, GL_FALSE, 0, ctypes.c_void_p(0))
    glEnableVertexAttribArray(position)

    glUseProgram(shader)

    view = pyrr.matrix44.create_from_translation(pyrr.Vector3([0.0, 0.0, -3.0]))
    projection = pyrr.matrix44.create_perspective_projection_matrix(45.0, w_width / w_height, 0.1, 100.0)
    model = pyrr.matrix44.create_from_translation(pyrr.Vector3([0.0, 0.0, 0.0]))

    view_loc = glGetUniformLocation(shader, "view")
    proj_loc = glGetUniformLocation(shader, "projection")
    model_loc = glGetUniformLocation(shader, "model")

    glUniformMatrix4fv(view_loc, 1, GL_FALSE, view)
    glUniformMatrix4fv(proj_loc, 1, GL_FALSE, projection)
    glUniformMatrix4fv(model_loc, 1, GL_FALSE, model)

    glClearColor(114.0 / 255.0, 144.0 / 255.0, 154.0 / 255.0, 1.0)
    glEnable(GL_DEPTH_TEST)
    glViewport(0, 0, w_width, w_height)

    args = parse_args()
    print(args)

    try:
        # Create checkpoint directory if it does not exist
        os.makedirs(args.checkpoint)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise RuntimeError('Unable to create checkpoint directory:', args.checkpoint)

    print('Loading 2D detections...')
    keypoints = np.load('data/data_2d_' + args.keypoints + '.npz')
    keypoints = keypoints['positions_2d'].item()

    subject = 'S1'
    action = 'Directions 1'
    width_of = 410
    height_of = 374

    for cam_idx, kps in enumerate(keypoints[subject][action]):
        # Normalize camera frame
        # cam = dataset.cameras()[subject][cam_idx]
        kps[..., :2] = normalize_screen_coordinates(kps[..., :2], w=width_of, h=height_of)
        keypoints[subject][action][cam_idx] = kps

    subjects_train = args.subjects_train.split(',')
    subjects_semi = [] if not args.subjects_unlabeled else args.subjects_unlabeled.split(',')
    subjects_test = args.subjects_test.split(',')

    semi_supervised = len(subjects_semi) > 0
    if semi_supervised and not dataset.supports_semi_supervised():
        raise RuntimeError('Semi-supervised training is not implemented for this dataset')

    action_filter = None if args.actions == '*' else args.actions.split(',')
    if action_filter is not None:
        print('Selected actions:', action_filter)

    cameras_valid, poses_valid, poses_valid_2d = fetch(subjects_test, keypoints, args.downsample, action_filter)

    filter_widths = [int(x) for x in args.architecture.split(',')]

    # IF RENDERING TO A VIDEO
    if args.viz_output:
        model_pos = TemporalModel(poses_valid_2d[0].shape[1], poses_valid_2d[0].shape[2], 17,
                                  filter_widths=filter_widths, causal=args.causal, dropout=args.dropout,
                                  channels=args.channels, dense=args.dense)
    else:
        model_pos = TemporalModelOptimized1f(poses_valid_2d[0].shape[1], poses_valid_2d[0].shape[2], 17,
                                             filter_widths=filter_widths, causal=args.causal,
                                             dropout=args.dropout, channels=args.channels)

    receptive_field = model_pos.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2  # Padding on each side
    if args.causal:
        print('INFO: Using causal convolutions')
        causal_shift = pad
    else:
        causal_shift = 0

    model_params = 0
    for parameter in model_pos.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()
        # model_pos_train = model_pos_train.cuda()

    if args.resume or args.evaluate:
        chk_filename = os.path.join(args.checkpoint, args.resume if args.resume else args.evaluate)
        print('Loading checkpoint', chk_filename)
        checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
        print('This model was trained for {} epochs'.format(checkpoint['epoch']))
        model_pos.load_state_dict(checkpoint['model_pos'])

    # IF RENDERING TO A VIDEO
    if args.viz_output:
        print('Rendering...')

        my_action = 'Directions 1'
        input_keypoints = keypoints[args.viz_subject][my_action][args.viz_camera].copy()

        gen = UnchunkedGenerator(None, None, [input_keypoints], pad=pad, causal_shift=causal_shift,
                                 augment=args.test_time_augmentation,
                                 kps_left=kps_left, kps_right=kps_right,
                                 joints_left=joints_left, joints_right=joints_right)
        prediction = evaluate(gen, model_pos, return_predictions=True)
        ground_truth = None

        # These values are taken from a camera in the h36m dataset; it would be good to
        # get/determine values from stereo calibration of the pip cameras.
        prediction = camera_to_world(prediction,
                                     R=[0.14070565, -0.15007018, -0.7552408, 0.62232804],
                                     t=[1.841107, 4.9552846, 0.5634454])

        # We don't have the trajectory, but at least we can rebase the height
        prediction[:, :, 2] -= np.min(prediction[:, :, 2])
        anim_output = {'Reconstruction': prediction}
        input_keypoints = image_coordinates(input_keypoints[..., :2], w=width_of, h=height_of)

        manual_fps = 25
        np.savez('out_3D_vp3d', anim_output['Reconstruction'])
        camAzimuth = 70.0

        from common.visualization import render_animation
        render_animation(input_keypoints, anim_output, manual_fps, args.viz_bitrate, camAzimuth,
                         args.viz_output, limit=args.viz_limit, downsample=args.viz_downsample,
                         size=args.viz_size, input_video_path=args.viz_video,
                         viewport=(width_of, height_of), input_video_skip=args.viz_skip)

    # IF RENDERING LIVE
    else:
        print('Rendering...')

        my_action = 'Directions 1'
        input_keypoints = keypoints[args.viz_subject][my_action][args.viz_camera].copy()

        gen = UnchunkedGenerator(None, None, [input_keypoints], pad=pad, causal_shift=causal_shift,
                                 augment=args.test_time_augmentation,
                                 kps_left=kps_left, kps_right=kps_right,
                                 joints_left=joints_left, joints_right=joints_right)
        prediction = evaluateLive(gen, model_pos, VBO, window, model_loc, cap, opWrapper,
                                  return_predictions=True)

    glfw.terminate()
    cap.release()
    cv2.destroyAllWindows()
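# camera_to_world (used above) rotates the camera-space prediction by the unit
# quaternion R and then adds the translation t. A minimal numpy sketch of that
# rotation, assuming R follows the [w, x, y, z] convention; the test point is
# hypothetical.
import numpy as np

def _quat_rotate_sketch(q, v):
    # v' = v + 2 * (w * (q_vec x v) + q_vec x (q_vec x v)) for a unit quaternion q
    qvec = q[1:]
    uv = np.cross(qvec, v)
    uuv = np.cross(qvec, uv)
    return v + 2 * (q[0] * uv + uuv)

R_demo = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804])
t_demo = np.array([1.841107, 4.9552846, 0.5634454])
print(_quat_rotate_sketch(R_demo, np.array([0.0, 0.0, 1.0])) + t_demo)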
def main(input_args):
    vp3d_dir = input_args.vp3d_dir
    sys.path.append(vp3d_dir)

    from common.camera import normalize_screen_coordinates
    from common.model import TemporalModel
    from common.generators import UnchunkedGenerator
    from common.arguments import parse_args

    args = parse_args()
    print(args)

    kps_left = [4, 5, 6, 11, 12, 13]
    kps_right = [1, 2, 3, 14, 15, 16]
    joints_left = [4, 5, 6, 11, 12, 13]
    joints_right = [1, 2, 3, 14, 15, 16]

    filter_widths = [int(x) for x in args.architecture.split(',')]
    num_joints_in = 17
    in_features = 2
    num_joints_out = 17

    model_pos = TemporalModel(num_joints_in, in_features, num_joints_out,
                              filter_widths=filter_widths, causal=args.causal,
                              dropout=args.dropout, channels=args.channels, dense=args.dense)

    receptive_field = model_pos.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2  # Padding on each side
    if args.causal:
        print('INFO: Using causal convolutions')
        causal_shift = pad
    else:
        causal_shift = 0

    model_params = 0
    for parameter in model_pos.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    if args.resume or args.evaluate:
        chk_filename = os.path.join(vp3d_dir, args.checkpoint,
                                    args.resume if args.resume else args.evaluate)
        print('Loading checkpoint', chk_filename)
        checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
        print('This model was trained for {} epochs'.format(checkpoint['epoch']))
        model_pos.load_state_dict(checkpoint['model_pos'])

    # Evaluate
    def evaluate(test_generator, action=None, return_predictions=False):
        epoch_loss_3d_pos = 0
        epoch_loss_3d_pos_procrustes = 0
        epoch_loss_3d_pos_scale = 0
        epoch_loss_3d_vel = 0
        with torch.no_grad():
            model_pos.eval()
            N = 0
            for _, batch, batch_2d in test_generator.next_epoch():
                inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
                if torch.cuda.is_available():
                    inputs_2d = inputs_2d.cuda()

                # Positional model
                predicted_3d_pos = model_pos(inputs_2d)

                # Test-time augmentation (if enabled)
                if test_generator.augment_enabled():
                    # Undo flipping and take average with non-flipped version
                    predicted_3d_pos[1, :, :, 0] *= -1
                    predicted_3d_pos[1, :, joints_left + joints_right] = \
                        predicted_3d_pos[1, :, joints_right + joints_left]
                    predicted_3d_pos = torch.mean(predicted_3d_pos, dim=0, keepdim=True)

                if return_predictions:
                    return predicted_3d_pos.squeeze(0).cpu().numpy()

                inputs_3d = torch.from_numpy(batch.astype('float32'))
                if torch.cuda.is_available():
                    inputs_3d = inputs_3d.cuda()
                inputs_3d[:, :, 0] = 0
                if test_generator.augment_enabled():
                    inputs_3d = inputs_3d[:1]

                error = mpjpe(predicted_3d_pos, inputs_3d)
                epoch_loss_3d_pos_scale += inputs_3d.shape[0] * inputs_3d.shape[1] * n_mpjpe(predicted_3d_pos, inputs_3d).item()

                epoch_loss_3d_pos += inputs_3d.shape[0] * inputs_3d.shape[1] * error.item()
                N += inputs_3d.shape[0] * inputs_3d.shape[1]

                inputs = inputs_3d.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])
                predicted_3d_pos = predicted_3d_pos.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])

                epoch_loss_3d_pos_procrustes += inputs_3d.shape[0] * inputs_3d.shape[1] * p_mpjpe(predicted_3d_pos, inputs)

                # Compute velocity error
                epoch_loss_3d_vel += inputs_3d.shape[0] * inputs_3d.shape[1] * mean_velocity_error(predicted_3d_pos, inputs)

        if action is None:
            print('----------')
        else:
            print('----' + action + '----')
        e1 = (epoch_loss_3d_pos / N) * 1000
        e2 = (epoch_loss_3d_pos_procrustes / N) * 1000
        e3 = (epoch_loss_3d_pos_scale / N) * 1000
        ev = (epoch_loss_3d_vel / N) * 1000
        print('Test time augmentation:', test_generator.augment_enabled())
        print('Protocol #1 Error (MPJPE):', e1, 'mm')
        print('Protocol #2 Error (P-MPJPE):', e2, 'mm')
        print('Protocol #3 Error (N-MPJPE):', e3, 'mm')
        print('Velocity Error (MPJVE):', ev, 'mm')
        print('----------')

        return e1, e2, e3, ev

    def get_gt_dirs(input_path, camera_id='dev3'):
        """Get all directories with ground-truth 2D human pose annotations"""
        gt_path_list = []
        category_path_list = get_subdirs(input_path)
        for category in category_path_list:
            if os.path.basename(category) != 'Calibration':
                category_scans = get_subdirs(category)
                for category_scan in category_scans:
                    device_list = get_subdirs(category_scan)
                    for device_path in device_list:
                        if camera_id in device_path:
                            if os.path.exists(os.path.join(device_path, 'pose2d')):  # 2D annotations exist
                                gt_path_list.append(device_path)  # eg <root>/Lack_TV_Bench/0007_white_floor_08_04_2019_08_28_10_47/dev3
        return gt_path_list

    def get_subdirs(input_path):
        '''
        get a list of subdirectories in input_path directory
        :param input_path: parent directory (in which to get the subdirectories)
        :return: subdirs: list of subdirectories in input_path
        '''
        subdirs = [os.path.join(input_path, dir_i) for dir_i in os.listdir(input_path)
                   if os.path.isdir(os.path.join(input_path, dir_i))]
        subdirs.sort()
        return subdirs

    fps = 30
    frame_width = 1920.0
    frame_height = 1080.0

    h36m_joint_names = get_h36m_joint_names()
    h36m_joint_names_dict = {name: i for i, name in enumerate(h36m_joint_names)}
    joint_names = get_body25_joint_names()
    joint_names_dict = {name: i for i, name in enumerate(joint_names)}

    dataset_dir = input_args.dataset_dir
    camera_id = input_args.camera_id

    gt_dirs = get_gt_dirs(dataset_dir, camera_id)
    for i, gt_dir in enumerate(gt_dirs):
        print(f"\nProcessing {i} of {len(gt_dirs)}: {' '.join(gt_dir.split('/')[-3:-1])}")

        input_dir = os.path.join(gt_dir, 'predictions', 'pose2d', 'openpose')
        output_dir = os.path.join(gt_dir, 'predictions', 'pose3d', 'vp3d')
        os.makedirs(output_dir, exist_ok=True)

        json_mask = os.path.join(input_dir, 'scan_video_00000000????_keypoints.json')
        json_files = sorted(glob(json_mask))

        input_keypoints = []
        for json_file in json_files:
            with open(json_file, 'r') as f:
                pose2d = json.load(f)
            if len(pose2d["people"]) == 0:
                keypoints_op = np.zeros((19, 3))
            else:
                keypoints_op = np.array(pose2d["people"][0]["pose_keypoints_2d"]).reshape(-1, 3)  # Takes first detected person every time...

            keypoints = np.zeros((17, 3))
            for i, joint_name in enumerate(h36m_joint_names):  # note: shadows the outer loop index i
                if joint_name == 'spine' or joint_name == 'head':
                    continue
                joint_id = joint_names_dict[joint_name]
                keypoints[i, :] = keypoints_op[joint_id, :]
            keypoints[h36m_joint_names_dict['mid hip'], :] = np.mean(
                (keypoints[h36m_joint_names_dict['left hip'], :],
                 keypoints[h36m_joint_names_dict['right hip'], :]), axis=0)  # mid hip = mean(left hip, right hip)
            keypoints[h36m_joint_names_dict['spine'], :] = np.mean(
                (keypoints[h36m_joint_names_dict['neck'], :],
                 keypoints[h36m_joint_names_dict['mid hip'], :]), axis=0)  # spine = mean(neck, mid hip)
            keypoints[h36m_joint_names_dict['head'], :] = np.mean(
                (keypoints_op[joint_names_dict['left ear'], :],
                 keypoints_op[joint_names_dict['right ear'], :]), axis=0)  # head = mean(left ear, right ear)

            input_keypoints.append(keypoints)

        input_keypoints = np.array(input_keypoints)
        input_keypoints = input_keypoints[:, :, :2]

        # For pretrained_h36m_cpn.bin and cpn_ft_h36m_dbb
        input_keypoints[..., :2] = normalize_screen_coordinates(input_keypoints[..., :2],
                                                                w=frame_width, h=frame_height)

        args.test_time_augmentation = True
        gen = UnchunkedGenerator(None, None, [input_keypoints], pad=pad, causal_shift=causal_shift,
                                 augment=args.test_time_augmentation,
                                 kps_left=kps_left, kps_right=kps_right,
                                 joints_left=joints_left, joints_right=joints_right)
        prediction = evaluate(gen, return_predictions=True)  # Nx17x3

        pickle.dump(prediction, open(os.path.join(output_dir, 'vp3d_output.pkl'), "wb"))
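# A minimal sketch of reading one result back; the path mirrors the
# os.path.join(output_dir, 'vp3d_output.pkl') file written above.
import pickle

with open('vp3d_output.pkl', 'rb') as f:
    prediction = pickle.load(f)  # ndarray of shape (N, 17, 3): frames x joints x xyz
print(prediction.shape)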