def make_prediction(self):
    if self.with_cuda:
        self.model = self.model.cuda()
    with torch.no_grad():
        self.model.eval()
        for _, batch, batch_2d in self.test_generator.next_epoch():
            inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
            if self.with_cuda:
                inputs_2d = inputs_2d.cuda()
            predicted_3d_pos = self.model(inputs_2d)

            # Test-time augmentation: undo the horizontal flip and average
            # the flipped and non-flipped predictions
            if self.test_generator.augment_enabled():
                predicted_3d_pos[1, :, :, 0] *= -1
                predicted_3d_pos[1, :, self.joints_left + self.joints_right] = \
                    predicted_3d_pos[1, :, self.joints_right + self.joints_left]
                predicted_3d_pos = torch.mean(predicted_3d_pos, dim=0, keepdim=True)

            predicted_3d_pos = predicted_3d_pos.squeeze(0).cpu().numpy()

            # Invert the camera transformation and rebase the height
            rot = self.dataset.cameras()['detectron2'][0]['orientation']
            predicted_3d_pos = camera_to_world(predicted_3d_pos, R=rot, t=0)
            predicted_3d_pos[:, :, 2] -= np.min(predicted_3d_pos[:, :, 2])
            self.prediction = predicted_3d_pos
def interface(model_pos, keypoints, W, H):
    # Input: (N, 17, 2) 2D keypoints; returns (N, 17, 3) 3D joints.
    # Note: W and H are currently unused; normalization is hardcoded to the
    # 1000x1002 in-the-wild convention below.
    if not isinstance(keypoints, np.ndarray):
        keypoints = np.array(keypoints)

    from common.camera import normalize_screen_coordinates_new, camera_to_world, normalize_screen_coordinates
    # keypoints = normalize_screen_coordinates_new(keypoints[..., :2], w=W, h=H)
    keypoints = normalize_screen_coordinates(keypoints[..., :2], w=1000, h=1002)
    input_keypoints = keypoints.copy()

    # test_time_augmentation: True
    from common.generators import UnchunkedGenerator
    gen = UnchunkedGenerator(None, None, [input_keypoints],
                             pad=common.pad, causal_shift=common.causal_shift, augment=True,
                             kps_left=common.kps_left, kps_right=common.kps_right,
                             joints_left=common.joints_left, joints_right=common.joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)

    prediction = camera_to_world(prediction, R=common.rot, t=0)
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    return prediction
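# For reference, a minimal sketch of the normalization assumed here (the
# VideoPose3D convention: x is mapped from [0, w] to [-1, 1] and y is scaled
# by the same factor to preserve the aspect ratio, which is why the fixed
# in-the-wild resolution w=1000, h=1002 appears above). The name below is a
# local illustration, not the imported function:
def _normalize_screen_coordinates_sketch(X, w, h):
    assert X.shape[-1] == 2
    # [0, w] -> [-1, 1]; y shares the x scale factor 2/w
    return X / w * 2 - np.array([1, h / w])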
def gen_pose_frame(kpts, width, height, model_pos, pad, causal_shift=0):
    # kpts: (M, T, N, 2)
    norm_seqs = []
    for kpt in kpts:
        norm_kpt = normalize_screen_coordinates(kpt, w=width, h=height)
        norm_seqs.append(norm_kpt)

    gen = UnchunkedGenerator(None, None, norm_seqs,
                             pad=pad, causal_shift=causal_shift, augment=True,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos)

    prediction_to_world = []
    for i in range(len(prediction)):
        sub_prediction = prediction[i][0]
        sub_prediction = camera_to_world(sub_prediction, R=rot, t=0)
        sub_prediction[:, 2] -= np.amin(sub_prediction[:, 2])
        prediction_to_world.append(sub_prediction)

    return prediction_to_world
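# For illustration, a self-contained sketch of the rotation that
# camera_to_world is assumed to perform (VideoPose3D stores each camera's
# 'orientation' as a unit quaternion in (w, x, y, z) order and rotates the
# camera-space points by it). Prefer the real common.camera.camera_to_world;
# this only shows the idea, with numpy assumed imported as np:
def _camera_to_world_sketch(X, R, t=0):
    # Quaternion rotation: v' = v + 2*w*(u x v) + 2*(u x (u x v)), u = vector part
    qw, qvec = R[0], np.asarray(R[1:])
    uv = np.cross(qvec, X)    # broadcasts over the joint dimension
    uuv = np.cross(qvec, uv)
    return X + 2 * (qw * uv + uuv) + t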
def predict(img_path):
    # 1. Detect keypoints and display them
    # Preprocess the input image and detect persons
    x, img = data.transforms.presets.yolo.load_test(img_path, short=256)
    # print("Shape of pre-processed image:", x.shape)

    start = time.time()
    # Detect persons and bounding boxes
    class_ids, scores, bounding_boxes = detector(x)

    # 2. Preprocess the detector's output tensors as input for alpha_pose
    pose_input, upscale_bbox = detector_to_simple_pose(img, class_ids, scores, bounding_boxes)
    global detector_time
    detector_time += (time.time() - start)
    print("detector cost time: {:.3f} seconds".format(time.time() - start))

    prepare_end = time.time()
    # 3. Predict the keypoints
    if pose_input is None:
        return None, None
    predicted_heatmap = pose_net(pose_input)
    pred_coords, confidence = heatmap_to_coord(predicted_heatmap, upscale_bbox)
    global predictor_2d_time
    predictor_2d_time += (time.time() - prepare_end)
    print("2d pose predictor cost time: {:.3f} seconds".format(time.time() - prepare_end))

    # 4. Display the 2D pose
    # utils.viz.plot_keypoints(img, pred_coords, confidence, class_ids, bounding_boxes, scores,
    #                          box_thresh=0.5, keypoint_thresh=0.2)

    # 5. Normalize the coordinates
    prepare_end = time.time()
    kps = normalize_screen_coordinates(pred_coords.asnumpy(), w=img.shape[1], h=img.shape[0])
    receptive_field = pose3d_predictor.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    # 6. Create the generator that feeds the 3D predictor
    generator = UnchunkedGenerator(None, None, [kps], pad=pad, causal_shift=causal_shift, augment=False)

    # 7. 3D pose estimation and display
    prediction = predict_3d_pos(generator, pose3d_predictor)
    global full_time, predictor_3d_time
    predictor_3d_time += time.time() - prepare_end
    full_time += time.time() - start
    print("3d predictor time: {:.3f} seconds".format(time.time() - prepare_end))

    rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32)
    prediction = camera_to_world(prediction, R=rot, t=0)
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    return prediction, img
def predict_3d_joints(predictor, coords_2d, w, h):
    # Normalize the coordinates
    kps = normalize_screen_coordinates(coords_2d, w, h)
    # print('kps.type: {}, kps.shape: {}'.format(type(kps), kps.shape))

    # 2D keypoint generator
    receptive_field = predictor.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    # Create the generator that feeds the 3D predictor
    generator = UnchunkedGenerator(None, None, [kps], pad=pad, causal_shift=causal_shift, augment=False)
    prediction = predict_3d_pos(generator, predictor)

    prediction = camera_to_world(prediction, R=rot, t=0)
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    return prediction
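# The pad arithmetic above follows from the temporal model's receptive field:
# with filter widths [3, 3, 3, 3, 3] the receptive field is 3**5 = 243 frames,
# so (243 - 1) // 2 = 121 frames of padding are added on each side. A quick
# self-contained check (pure arithmetic, no model required):
filter_widths_example = [3, 3, 3, 3, 3]
receptive_field_example = 1
for fw in filter_widths_example:
    receptive_field_example *= fw
assert receptive_field_example == 243
assert (receptive_field_example - 1) // 2 == 121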
def gen_pose(kpts, valid_frames, width, height, model_pos, pad, causal_shift=0):
    assert len(kpts.shape) == 4, 'The shape of kpts: {}'.format(kpts.shape)
    assert kpts.shape[0] == len(valid_frames)

    norm_seqs = []
    for index, frames in enumerate(valid_frames):
        seq_kps = kpts[index, frames]
        norm_seq_kps = normalize_screen_coordinates(seq_kps, w=width, h=height)
        norm_seqs.append(norm_seq_kps)

    gen = UnchunkedGenerator(None, None, norm_seqs,
                             pad=pad, causal_shift=causal_shift, augment=True,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos)

    prediction_to_world = []
    for i in range(len(prediction)):
        sub_prediction = prediction[i]
        sub_prediction = camera_to_world(sub_prediction, R=rot, t=0)
        # sub_prediction[:, :, 2] -= np.expand_dims(np.amin(sub_prediction[:, :, 2], axis=1), axis=1).repeat([17], axis=1)
        # sub_prediction[:, :, 2] -= np.amin(sub_prediction[:, :, 2])
        prediction_to_world.append(sub_prediction)

    # prediction_to_world = np.asarray(prediction_to_world, dtype=np.float32)
    return prediction_to_world
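# A minimal sketch of how a valid_frames list for gen_pose can be derived,
# following the convention used in reconstruction() below, where frames whose
# joints are all zero count as missing detections (all names are illustrative):
kpts_example = np.zeros((2, 5, 17, 2), dtype=np.float32)  # (M, T, N, 2)
kpts_example[0, 1:] = 1.0   # person 0 detected from frame 1 onwards
kpts_example[1, :3] = 1.0   # person 1 detected in frames 0..2
valid_frames_example = [
    np.where(np.sum(person.reshape(person.shape[0], -1), axis=1) != 0)[0]
    for person in kpts_example
]
assert valid_frames_example[0].tolist() == [1, 2, 3, 4]
assert valid_frames_example[1].tolist() == [0, 1, 2]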
def gen_pose_frame_(kpts, width, height, model_pos, pad, causal_shift=0):
    # Input: (N, 17, 2); returns (N, 17, 3)
    if not isinstance(kpts, np.ndarray):
        kpts = np.array(kpts)

    keypoints = normalize_screen_coordinates(kpts[..., :2], w=width, h=height)
    input_keypoints = keypoints.copy()

    # test_time_augmentation: True
    from common.generators import UnchunkedGenerator
    gen = UnchunkedGenerator(None, None, [input_keypoints],
                             pad=pad, causal_shift=causal_shift, augment=True,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos)

    prediction = camera_to_world(prediction[0], R=rot, t=0)
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    return prediction
def video_pose(filepath, ckpt_dir, ckpt_name, filter_widths, show=False, channels=1024, save_file='output.mp4'):
    # Load the 3D pose estimator
    pose3d_predictor = get_pose3d_predictor(ckpt_dir, ckpt_name, filter_widths, channels=channels)
    receive_field = 1
    for i in filter_widths:
        receive_field *= i
    # print(receive_field)
    half = receive_field // 2

    # Open the video
    cap = cv2.VideoCapture(filepath)
    # Set the resolution
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 360)
    # Frame rate and frame count
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    print("Original FPS: {}, frame count: {}".format(fps, frame_count))
    # pause = int(1000 / fps)

    if show:
        # Display window size
        cv2.namedWindow('Video', 0)
        cv2.resizeWindow('Video', 960, 540)

    # Save the result video
    print("Save the result to {}.".format(save_file))
    wh = (1280, 720)
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    output_mp4 = cv2.VideoWriter(save_file, fourcc, fps, wh)

    coords_2d_list = []
    dicts = []
    i = 0
    # Because the data generator is created with pad=0, the first
    # receive_field // 2 frames are needed as preparation
    elapsed_time = 0
    print("Preparing...")
    while i < half:
        ret_val, frame = cap.read()
        if ret_val != 1:
            print("Video is too short!")
            output_mp4.release()
            cap.release()
            cv2.destroyAllWindows()
            return
        # noinspection PyBroadException
        try:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        except:
            continue
        # Generate 2D keypoints
        current_frame = cap.get(cv2.CAP_PROP_POS_FRAMES)
        joints_dict = detect_2d_joints(frame, current_frame)
        dicts.append(joints_dict)
        img, predict_coords = joints_dict['img'], joints_dict['coords']
        normalized_coords = normalize_screen_coordinates(
            predict_coords.asnumpy()[0], w=img.shape[1], h=img.shape[0])
        coords_2d_list.append(normalized_coords)
        i += 1

    print("Starting to predict 3d pose...")
    fps_time = time.time()
    while True:
        # Grab a frame
        i += 1
        if i > receive_field and len(dicts) < 1:
            break
        ret_val, frame = cap.read()
        if ret_val == 1:
            try:
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            except:
                continue
            # Generate 2D keypoints
            current_frame = cap.get(cv2.CAP_PROP_POS_FRAMES)
            joints_dict = detect_2d_joints(frame, current_frame)
            dicts.append(joints_dict)
            img, predict_coords = joints_dict['img'], joints_dict['coords']
            normalized_coords = normalize_screen_coordinates(
                predict_coords.asnumpy()[0], w=img.shape[1], h=img.shape[0])
            coords_2d_list.append(normalized_coords)

        joints_dict = dicts[0]
        if i > half + 1:
            # Drop the stale frame at the left end of the window
            coords_2d_list = coords_2d_list[1:]
            dicts = dicts[1:]
        if len(coords_2d_list) < receive_field:
            if i < receive_field:
                # At the start of the video (fewer than receive_field frames), pad on the left
                # print("kps_list length is {}, padding {} frames to left end.".format(len(kps_list), half))
                while len(coords_2d_list) < receive_field:
                    coords_2d_list.insert(0, coords_2d_list[0])
            elif len(coords_2d_list) > 0:
                # At the end of the video (fewer than receive_field frames), pad on the right
                # print("kps_list length is {}, padding 1 frames to right end.".format(len(kps_list)))
                coords_2d_list.append(coords_2d_list[-1])
            else:
                break

        # Build the 2D keypoint generator
        kps_2d = np.stack(coords_2d_list)
        generator = joints_2d_generator(kps_2d, pose3d_predictor)
        # print(generator.num_frames())

        # 3D keypoint prediction
        predictions = predict_3d_pos(generator, pose3d_predictor)
        # print('predictions.shape: ', predictions.shape)
        rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32)
        predictions = camera_to_world(predictions, R=rot, t=0)
        # We don't have the trajectory, but at least we can rebase the height
        predictions[:, :, 2] -= np.min(predictions[:, :, 2])
        coords_3d = predictions[0]
        # print('predicted {} frame, elapsed time: {:.3f} seconds.'.format(predictions.shape[0], time.time() - fps_time))

        interval = time.time() - fps_time
        elapsed_time += interval
        fps = 1.0 / interval

        # Render the frame
        result_image = render_image(coords_3d=coords_3d, skeleton=Skeleton(), **joints_dict)
        cv2.putText(result_image, "FPS: %.3f" % fps, (10, 20),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
        result_image = cv2.cvtColor(result_image, cv2.COLOR_RGB2BGR)
        if show:
            # Live display
            cv2.imshow('Video', result_image)
            if cv2.waitKey(1) & 0xff == ord('q'):
                break
        # Resize and write
        to_write = cv2.resize(result_image, wh)
        output_mp4.write(to_write)
        fps_time = time.time()

    output_mp4.release()
    cap.release()
    cv2.destroyAllWindows()
    print("Average Fps: {:.3f}".format(frame_count / elapsed_time))
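# A minimal invocation sketch for video_pose (the path, checkpoint name and
# filter widths below are illustrative placeholders, not values from this repo):
# video_pose('input.mp4', ckpt_dir='./checkpoint', ckpt_name='pretrained.bin',
#            filter_widths=[3, 3, 3], show=False, channels=1024,
#            save_file='output.mp4')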
def reconstruction(args):
    """
    Generate 3D poses from 2D keypoints detected from video, and visualize them.

    :param args.weight: The file path of the model weight
    :param args.keypoints_file: The file path of the 2D keypoints
    :param args.viz_output: The output path of the animation
    :param args.video_path: The input video path
    :param args.kpts_format: The format of the 2D keypoints: MSCOCO, MPII, H36M
        or OpenPose. The default format is H36M.
    """
    print('Loading 2D keypoints ...')
    keypoints, scores, _, _ = load_json(args.keypoints_file)

    # Load only one person's keypoints
    if len(keypoints.shape) == 4:
        keypoints = keypoints[0]
    assert len(keypoints.shape) == 3

    # Transform the keypoint format of other datasets (MSCOCO, MPII) to H36M format
    if args.kpts_format == 'coco':
        keypoints, valid_frames = coco_h36m(keypoints)
    elif args.kpts_format == 'mpii':
        keypoints, valid_frames = mpii_h36m(keypoints)
    elif args.kpts_format == 'openpose':
        # Convert 'OpenPose' format to MSCOCO: drop joint 1, keep the COCO order
        # (index along the joint axis, not the frame axis)
        order_coco = [i for i in range(17) if i != 1]
        keypoints = keypoints[:, order_coco]
        keypoints, valid_frames = coco_h36m(keypoints)
    else:
        valid_frames = np.where(np.sum(keypoints.reshape(-1, 34), axis=1) != 0)[0]
        assert args.kpts_format == 'h36m'

    # Get the width and height of the video
    cap = cv2.VideoCapture(args.video_path)
    width = int(round(cap.get(cv2.CAP_PROP_FRAME_WIDTH)))
    height = int(round(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))

    # Normalize keypoints
    input_keypoints = normalize_screen_coordinates(keypoints[..., :2], w=width, h=height)

    if args.frames == 27:
        filter_widths = [3, 3, 3]
        channels = 128
    elif args.frames == 81:
        filter_widths = [3, 3, 3, 3]
        channels = 64
    else:
        filter_widths = [3, 3, 3, 3, 3]
        channels = 32

    model_pos = SpatioTemporalModel(adj, 17, 2, 17, filter_widths=filter_widths,
                                    channels=channels, dropout=0.05)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    # Load the trained model
    print('Loading checkpoint', args.weight)
    chk_file = os.path.join('./checkpoint', args.weight)
    checkpoint = torch.load(chk_file, map_location=lambda storage, loc: storage)
    model_pos.load_state_dict(checkpoint['model_pos'])

    receptive_field = model_pos.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    print('Reconstructing ...')
    gen = UnchunkedGenerator(None, None, [input_keypoints[valid_frames]],
                             pad=pad, causal_shift=causal_shift, augment=True,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)
    prediction = camera_to_world(prediction, R=rot, t=0)

    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    prediction_new = np.zeros((*input_keypoints.shape[:-1], 3), dtype=np.float32)
    prediction_new[valid_frames] = prediction

    print('Rendering ...')
    anim_output = {'Reconstruction': prediction_new}
    render_animation(keypoints, keypoints_metadata, anim_output, h36m_skeleton, 25, 3000,
                     np.array(70., dtype=np.float32), args.viz_output, limit=-1,
                     downsample=1, size=5, input_video_path=args.video_path,
                     viewport=(width, height), input_video_skip=0)
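# A minimal invocation sketch for reconstruction() (the field values are
# illustrative; the real script presumably builds `args` via argparse):
# from argparse import Namespace
# reconstruction(Namespace(keypoints_file='keypoints.json', kpts_format='coco',
#                          video_path='input.mp4', frames=81,
#                          weight='pretrained.bin', viz_output='output.mp4'))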
def main():
    dataset_path = "./data/data_3d_h36m.npz"

    # Load the dataset
    from common.h36m_dataset import Human36mDataset
    dataset = Human36mDataset(dataset_path)
    dataset = read_3d_data(dataset)
    cudnn.benchmark = True
    device = torch.device("cpu")

    from models.sem_gcn import SemGCN
    from common.graph_utils import adj_mx_from_skeleton
    p_dropout = None
    adj = adj_mx_from_skeleton(dataset.skeleton())
    model_pos = SemGCN(adj, 128, num_layers=4, p_dropout=p_dropout,
                       nodes_group=dataset.skeleton().joints_group()).to(device)
    ckpt_path = "./checkpoint/pretrained/ckpt_semgcn_nonlocal_sh.pth.tar"
    ckpt = torch.load(ckpt_path, map_location='cpu')
    model_pos.load_state_dict(ckpt['state_dict'], False)
    model_pos.eval()

    # ============ Added code ==============
    # A single person's 2D data output by the project's 2D-processing code
    inputs_2d = [[483.0, 450], [503, 450], [503, 539], [496, 622], [469, 450], [462, 546],
                 [469, 622], [483, 347], [483, 326], [489, 264], [448, 347], [448, 408],
                 [441, 463], [517, 347], [524, 408], [538, 463]]
    # # Detection result of the openpose test sample
    # inputs_2d = [[86.0, 137], [99, 128], [94, 127], [97, 110], [89, 105], [102, 129], [116, 116], [99, 110],
    #              [105, 93], [117, 69], [147, 63], [104, 93], [89, 69], [82, 38], [89, 139], [94, 140]]
    inputs_2d = np.array(inputs_2d)
    # inputs_2d[:, 1] = np.max(inputs_2d[:, 1]) - inputs_2d[:, 1]  # Flip to an upright pose; the raw data is upside down

    cam = dataset.cameras()['S1'][0]  # Get the camera parameters
    inputs_2d[..., :2] = normalize_screen_coordinates(inputs_2d[..., :2], w=cam['res_w'], h=cam['res_h'])  # Process the 2D coordinates

    # Plot the normalized screen coordinates as a 2D keypoint image with joint indices
    print(inputs_2d)  # Print the normalized 2D keypoint coordinates
    d_x = inputs_2d[:, 0]
    d_y = inputs_2d[:, 1]
    plt.figure()
    plt.scatter(d_x, d_y)
    for i, txt in enumerate(np.arange(inputs_2d.shape[0])):
        plt.annotate(txt, (d_x[i], d_y[i]))  # Label each joint with its index
    # plt.show()  # Show the normalized 2D keypoint image

    # Get the 3D result
    inputs_2d = torch.tensor(inputs_2d, dtype=torch.float32)  # Convert to a tensor
    outputs_3d = model_pos(inputs_2d).cpu()  # Run the model
    outputs_3d[:, :, :] -= outputs_3d[:, :1, :]  # Remove the global offset
    predictions = [outputs_3d.detach().numpy()]  # Prediction results
    prediction = np.concatenate(predictions)[0]  # Concatenate and take the first

    # Invert the camera transformation. The exact R and t settings matter little here;
    # there are several conventions depending on the chosen camera parameters (some
    # entries have no t, etc.)
    prediction = camera_to_world(prediction, R=cam['orientation'], t=0)
    prediction[:, 2] -= np.min(prediction[:, 2])  # Shift up by min(prediction[:, 2]) so all coordinates are positive
    print('prediction')
    print(prediction)  # Print the 3D coordinates used for plotting

    plt.figure()
    ax = plt.subplot(111, projection='3d')  # Create a 3D plot
    o_x = prediction[:, 0]
    o_y = prediction[:, 1]
    o_z = prediction[:, 2]
    print(o_x)
    print(o_y)
    print(o_z)
    ax.scatter(o_x, o_y, o_z)

    temp = o_x
    x = [temp[9], temp[8], temp[7], temp[10], temp[11], temp[12]]
    temp = o_y
    y = [temp[9], temp[8], temp[7], temp[10], temp[11], temp[12]]
    temp = o_z
    z = [temp[9], temp[8], temp[7], temp[10], temp[11], temp[12]]
    ax.plot(x, y, z)

    temp = o_x
    x = [temp[7], temp[0], temp[4], temp[5], temp[6]]
    temp = o_y
    y = [temp[7], temp[0], temp[4], temp[5], temp[6]]
    temp = o_z
    z = [temp[7], temp[0], temp[4], temp[5], temp[6]]
    ax.plot(x, y, z)

    temp = o_x
    x = [temp[0], temp[1], temp[2], temp[3]]
    temp = o_y
    y = [temp[0], temp[1], temp[2], temp[3]]
    temp = o_z
    z = [temp[0], temp[1], temp[2], temp[3]]
    ax.plot(x, y, z)

    temp = o_x
    x = [temp[7], temp[13], temp[14], temp[15]]
    temp = o_y
    y = [temp[7], temp[13], temp[14], temp[15]]
    temp = o_z
    z = [temp[7], temp[13], temp[14], temp[15]]
    ax.plot(x, y, z)

    # temp = o_x
    # x = [temp[0], temp[14]]
    # temp = o_y
    # y = [temp[0], temp[14]]
    # temp = o_z
    # z = [temp[0], temp[14]]
    # ax.plot(y, x, z)
    #
    # temp = o_x
    # x = [temp[0], temp[15]]
    # temp = o_y
    # y = [temp[0], temp[15]]
    # temp = o_z
    # z = [temp[0], temp[15]]
    # ax.plot(y, x, z)

    # Adjust the axis scales; the effect is that the z axis spans twice the other axes
    from matplotlib.pyplot import MultipleLocator
    major_locator = MultipleLocator(0.5)
    ax.xaxis.set_major_locator(major_locator)
    ax.yaxis.set_major_locator(major_locator)
    ax.zaxis.set_major_locator(major_locator)
    ax.get_proj = lambda: np.dot(Axes3D.get_proj(ax), np.diag([0.5, 0.5, 1, 1]))
    plt.show()
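# A self-contained sketch of the axis-scaling trick used above: matplotlib
# builds the 3D view from a projection matrix, so right-multiplying it by
# np.diag([0.5, 0.5, 1, 1]) halves x and y, making the z axis appear twice
# as long as the others (variable names are illustrative):
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

fig_demo = plt.figure()
ax_demo = fig_demo.add_subplot(111, projection='3d')
ax_demo.get_proj = lambda: np.dot(Axes3D.get_proj(ax_demo), np.diag([0.5, 0.5, 1, 1]))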
        action_list.append(action)
    print(action_list)
    print(len(action_list))

    for action in action_list:
        position_list = []
        for idx, camera_id in enumerate(id_order):
            f = args.from_source + '/' + subject + '/MyPoseFeatures/D3_Positions_mono/' + action + '.' + camera_id + '.cdf.mat'
            # if subject == 'S11' and action == 'Directions':
            #     continue  # Discard corrupted video
            hf = loadmat(f)
            positions = hf['data'][0, 0].reshape(-1, 32, 3)
            positions /= 1000  # Meters instead of millimeters
            positions_universal = camera_to_world(positions,
                                                  R=np.array(camera_info[idx]['orientation']),
                                                  t=np.array(camera_info[idx]['translation']) / 1000)
            position_list.append(positions_universal.astype('float32'))

        # Use consistent naming convention
        canonical_name = action.replace('TakingPhoto', 'Photo') \
                               .replace('WalkingDog', 'WalkDog')
        # if action == 'Directions 1':
        #     print('checking...')
        #     print(position_list[0] - position_list[1])
        #     print(position_list[1] - position_list[2])
        #     print(position_list[2] - position_list[3])

        # Average the world-space reconstructions from the four cameras
        output[subject][canonical_name] = sum(position_list) / 4
        if action == 'Directions 1':
            print(output[subject][canonical_name])
            print(output[subject][canonical_name] - position_list[0])

print('Saving...')
# np.savez_compressed(output_filename, positions_3d=output)
def draw_3Dimg_c3dpo(pos, image, display=None, kpt2D=None, shape=None):
    from mpl_toolkits.mplot3d import Axes3D  # required for the 3D projection
    from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
    fig = plt.figure(figsize=(12, 6))
    canvas = FigureCanvas(fig)

    # 2D
    fig.add_subplot(131)
    if isinstance(kpt2D, np.ndarray):
        plt.imshow(draw_2Dimg(image, kpt2D))
    else:
        plt.imshow(image)

    # c3dpo correction
    # if shape is not None:
    #     index_list = [0, 5, 7, 9, 6, 8, 10, 11, 13, 15, 12, 14, 16]
    #     pos[index_list] = shape

    # 3D
    ax = fig.add_subplot(132, projection='3d')
    radius = 1.7
    ax.view_init(elev=15., azim=70.)
    ax.set_xlim3d([-radius / 2, radius / 2])
    ax.set_zlim3d([0, radius])
    ax.set_ylim3d([-radius / 2, radius / 2])
    ax.set_aspect('equal')
    # Axis tick labels
    # ax.set_xticklabels([])
    # ax.set_yticklabels([])
    # ax.set_zticklabels([])
    # ax.dist = 7.5
    parents = common.skeleton_parents
    joints_right = common.joints_right
    for j, j_parent in enumerate(parents):
        if j_parent == -1:
            continue
        col = 'red' if j in joints_right else 'black'
        # Draw the 3D skeleton
        ax.plot([pos[j, 0], pos[j_parent, 0]],
                [pos[j, 1], pos[j_parent, 1]],
                [pos[j, 2], pos[j_parent, 2]], zdir='z', c=col)

    # c3dpo
    bx = fig.add_subplot(133, projection='3d')
    bx.view_init(elev=15., azim=15.)
    bx.set_xlim3d([-radius / 2, radius / 2])
    bx.set_zlim3d([0, radius])
    bx.set_ylim3d([-radius / 2, radius / 2])
    bx.set_aspect('equal')

    from common.camera import camera_to_world
    shape = camera_to_world(shape, R=common.c3dpo_rot, t=0)
    shape[:, 2] -= np.min(shape[:, 2])
    order_pair = ((1, 2), (2, 3), (4, 5), (5, 6), (0, 1), (0, 4), (0, 7), (7, 8),
                  (8, 9), (11, 12), (12, 13), (14, 15), (15, 16), (11, 8), (14, 8))
    # scale
    # shape *= 0.7
    for j, j_parent in order_pair:
        bx.plot([shape[j, 0], shape[j_parent, 0]],
                [shape[j, 1], shape[j_parent, 1]],
                [shape[j, 2], shape[j_parent, 2]], zdir='z', c='red')
    # bx.scatter(shape[:, 0], shape[:, 1], shape[:, 2])

    width, height = fig.get_size_inches() * fig.get_dpi()
    canvas.draw()  # draw the canvas, cache the renderer
    # np.fromstring is deprecated for binary data; np.frombuffer is a drop-in replacement
    image = np.frombuffer(canvas.tostring_rgb(), dtype='uint8').reshape(int(height), int(width), 3)
    if display:
        cv2.imshow('im', image)
        cv2.waitKey(1)
    return image
def predict(img_path):
    # 1. Preprocess the input image and detect persons
    x, img = data.transforms.presets.yolo.load_test(img_path, short=256)
    # detector.summary(x)
    # print("x.shape:", x.shape)

    start = time.time()
    # Detect persons and bounding boxes
    class_ids, scores, bounding_boxes = detector(x)  # shape: [sample_idx, class_idx, instance]
    # print("bounding_boxes.shape", bounding_boxes.shape, "bounding_boxes[0, 0]:", bounding_boxes[0, 0])

    # 2. Preprocess the detector's output tensors as input for mobile_pose
    pose_input, upscale_bbox = detector_to_mobile_pose(img, class_ids, scores, bounding_boxes)
    print("detector cost time: {:.3f} seconds".format(time.time() - start))
    global detector_time
    detector_time += (time.time() - start)
    if pose_input is None:
        return None, None

    # 4. 2D joint prediction
    # pose_net.summary(pose_input)
    start_time = time.time()
    predicted_heatmap = pose_net(pose_input)
    pred_coords, confidence = heatmap_to_coord(predicted_heatmap, upscale_bbox)
    # print("type(pre_coords): {}, shape(pre_coords): {}".format(type(pred_coords), pred_coords.shape))
    # print("pred_coords: {}".format(pred_coords))
    global predictor_2d_time
    predictor_2d_time += (time.time() - start_time)
    print("2d pose predictor cost time: {:.3f} seconds".format(time.time() - start_time))

    # 5. Display the 2D pose
    # ax = utils.viz.plot_keypoints(img, pred_coords, confidence, class_ids, bounding_boxes, scores,
    #                               box_thresh=0.5, keypoint_thresh=0.2)
    # print(pred_coords)

    # 6. Normalize the coordinates
    start_time = time.time()
    kps = normalize_screen_coordinates(pred_coords.asnumpy(), w=img.shape[1], h=img.shape[0])
    # print('kps.type: {}, kps.shape: {}'.format(type(kps), kps.shape))

    # 7. 2D keypoint generator
    receptive_field = pose3d_predictor.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0
    # Create the generator that feeds the 3D predictor
    generator = UnchunkedGenerator(None, None, [kps], pad=pad, causal_shift=causal_shift, augment=False)

    # 8. 3D pose estimation and display
    prediction = predict_3d_pos(generator, pose3d_predictor)
    global predictor_3d_time, full_time
    predictor_3d_time += (time.time() - start_time)
    full_time += (time.time() - start)
    print("3d pose predictor cost time: {:.3f} seconds".format(time.time() - start_time))
    # print("prediction.shape: ", prediction.shape)

    rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32)
    prediction = camera_to_world(prediction, R=rot, t=0)
    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])

    elapsed = time.time() - start
    print("Total elapsed time of predicting image file {}: {:.3f} seconds".format(img_path, elapsed))
    return prediction, img
def videpose_infer(args):
    from common.camera import normalize_screen_coordinates, camera_to_world, image_coordinates
    from common.generators import UnchunkedGenerator
    from common.model import TemporalModel
    from common.utils import Timer, evaluate, add_path
    from videopose import get_detector_2d, ckpt_time, metadata, time0
    import gene_npz

    gene_npz.args.outputpath = str(args.viz_output / "alpha_pose_kunkun_cut")
    print(gene_npz.args)
    # detector_2d = get_detector_2d(args.detector_2d)
    detector_2d = gene_npz.generate_kpts(args.detector_2d)
    assert detector_2d, 'detector_2d should be in ({alpha, hr, open}_pose)'

    # Load or generate the 2D keypoints
    if not args.input_npz:
        video_name = args.viz_video
        keypoints = detector_2d(video_name)
    else:
        npz = np.load(args.input_npz)
        keypoints = npz['kpts']  # (N, 17, 2)

    keypoints_symmetry = metadata['keypoints_symmetry']
    kps_left, kps_right = list(keypoints_symmetry[0]), list(keypoints_symmetry[1])
    joints_left, joints_right = list([4, 5, 6, 11, 12, 13]), list([1, 2, 3, 14, 15, 16])

    # Normalize keypoints, assuming the fixed camera parameters (w=1000, h=1002)
    keypoints = normalize_screen_coordinates(keypoints[..., :2], w=1000, h=1002)

    model_pos = TemporalModel(17, 2, 17, filter_widths=[3, 3, 3, 3, 3], causal=args.causal,
                              dropout=args.dropout, channels=args.channels, dense=args.dense)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()

    ckpt, time1 = ckpt_time(time0)
    print('-------------- load data spends {:.2f} seconds'.format(ckpt))

    # Load the trained model
    chk_filename = os.path.join(args.checkpoint, args.resume if args.resume else args.evaluate)
    print('Loading checkpoint', chk_filename)
    checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)  # map loc to storage
    model_pos.load_state_dict(checkpoint['model_pos'])

    ckpt, time2 = ckpt_time(time1)
    print('-------------- load 3D model spends {:.2f} seconds'.format(ckpt))

    # Receptive field: 243 frames for args.arc [3, 3, 3, 3, 3]
    receptive_field = model_pos.receptive_field()
    pad = (receptive_field - 1) // 2  # Padding on each side
    causal_shift = 0

    print('Rendering...')
    input_keypoints = keypoints.copy()
    gen = UnchunkedGenerator(None, None, [input_keypoints],
                             pad=pad, causal_shift=causal_shift, augment=args.test_time_augmentation,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, model_pos, return_predictions=True)

    # Save the 3D joint points
    np.save(args.viz_output / "test_3d_output.npy", prediction, allow_pickle=True)

    rot = np.array([0.14070565, -0.15007018, -0.7552408, 0.62232804], dtype=np.float32)
    prediction = camera_to_world(prediction, R=rot, t=0)

    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    anim_output = {'Reconstruction': prediction}
    input_keypoints = image_coordinates(input_keypoints[..., :2], w=1000, h=1002)

    ckpt, time3 = ckpt_time(time2)
    print('-------------- generate reconstruction 3D data spends {:.2f} seconds'.format(ckpt))

    ckpt, time4 = ckpt_time(time3)
    print('total spend {:.2f} second'.format(ckpt))
def main(args):
    print('==> Using settings {}'.format(args))
    convm = torch.zeros(3, 17, 17, dtype=torch.float)

    print('==> Loading dataset...')
    dataset_path = path.join('data', 'data_3d_' + args.dataset + '.npz')
    if args.dataset == 'h36m':
        from common.h36m_dataset import Human36mDataset
        dataset = Human36mDataset(dataset_path)
    else:
        raise KeyError('Invalid dataset')

    print('==> Preparing data...')
    dataset = read_3d_data(dataset)

    print('==> Loading 2D detections...')
    keypoints = create_2d_data(path.join('data', 'data_2d_' + args.dataset + '_' + args.keypoints + '.npz'), dataset)

    cudnn.benchmark = True
    device = torch.device("cuda")

    # Create model
    print("==> Creating model...")
    if args.architecture == 'linear':
        from models.linear_model import LinearModel, init_weights
        num_joints = dataset.skeleton().num_joints()
        model_pos = LinearModel(num_joints * 2, (num_joints - 1) * 3).to(device)
        model_pos.apply(init_weights)
    elif args.architecture == 'gcn':
        from models.sem_gcn import SemGCN
        from common.graph_utils import adj_mx_from_skeleton
        p_dropout = (None if args.dropout == 0.0 else args.dropout)
        adj = adj_mx_from_skeleton(dataset.skeleton())
        model_pos = SemGCN(convm, adj, args.hid_dim, num_layers=args.num_layers, p_dropout=p_dropout,
                           nodes_group=dataset.skeleton().joints_group() if args.non_local else None).to(device)
    else:
        raise KeyError('Invalid model architecture')

    print("==> Total parameters: {:.2f}M".format(sum(p.numel() for p in model_pos.parameters()) / 1000000.0))

    # Resume from a checkpoint
    ckpt_path = args.evaluate
    if path.isfile(ckpt_path):
        print("==> Loading checkpoint '{}'".format(ckpt_path))
        ckpt = torch.load(ckpt_path)
        start_epoch = ckpt['epoch']
        error_best = ckpt['error']
        model_pos.load_state_dict(ckpt['state_dict'])
        print("==> Loaded checkpoint (Epoch: {} | Error: {})".format(start_epoch, error_best))
    else:
        raise RuntimeError("==> No checkpoint found at '{}'".format(ckpt_path))

    print('==> Rendering...')
    poses_2d = keypoints[args.viz_subject][args.viz_action]
    out_poses_2d = poses_2d[args.viz_camera]
    out_actions = [args.viz_camera] * out_poses_2d.shape[0]
    poses_3d = dataset[args.viz_subject][args.viz_action]['positions_3d']
    assert len(poses_3d) == len(poses_2d), 'Camera count mismatch'
    out_poses_3d = poses_3d[args.viz_camera]
    ground_truth = dataset[args.viz_subject][args.viz_action]['positions_3d'][args.viz_camera].copy()

    input_keypoints = out_poses_2d.copy()
    render_loader = DataLoader(PoseGenerator([out_poses_3d], [out_poses_2d], [out_actions]),
                               batch_size=args.batch_size, shuffle=False,
                               num_workers=args.num_workers, pin_memory=True)
    prediction = evaluate(render_loader, model_pos, device, args.architecture)[0]

    # Invert camera transformation
    cam = dataset.cameras()[args.viz_subject][args.viz_camera]
    prediction = camera_to_world(prediction, R=cam['orientation'], t=0)
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    ground_truth = camera_to_world(ground_truth, R=cam['orientation'], t=0)
    ground_truth[:, :, 2] -= np.min(ground_truth[:, :, 2])

    anim_output = {'Regression': prediction, 'Ground truth': ground_truth}
    input_keypoints = image_coordinates(input_keypoints[..., :2], w=cam['res_w'], h=cam['res_h'])
    render_animation(input_keypoints, anim_output, dataset.skeleton(), dataset.fps(), args.viz_bitrate,
                     cam['azimuth'], args.viz_output, limit=args.viz_limit, downsample=args.viz_downsample,
                     size=args.viz_size, input_video_path=args.viz_video,
                     viewport=(cam['res_w'], cam['res_h']), input_video_skip=args.viz_skip)
def analyze_frame(h, frame):
    boxes, keypoints = infer.inference_on_frame(h['predictor'], frame)

    # step 4: prepare data.
    # take 2d keypoints, that's it
    # first element is empty array, second is our actual frame data, a 3d numpy array with
    # first dimension 1, second and third being the 17 joints of 3 doubles each.
    kp = keypoints[1][0][:2, :].T  # extract (x, y) just like in prepare_data_2d_custom code
    # what to do if kp is NaN or missing data or something?
    # I guess just ignore it

    # they do this at the end of step4. but we keep it simple, and take the data from step2 directly into a variable.
    # output[canonical_name]['custom'] = [data[0]['keypoints'].astype('float32')]
    # output_custom_canonical_bullshit = kp.astype('float32')

    # this is what happens at the end of step4, which is a file that is loaded at the beginning of step 5.
    # np.savez_compressed(os.path.join(args.dataoutputdir, output_prefix_2d + args.output), positions_2d=output, metadata=metadata)

    # this is the bullshit they do in the original script.
    # confusingly, keypoints is actually just data, until it is set to keypoints['positions_2d']
    # keypoints = np.load('data/data_2d_' + args.dataset + '_' + args.keypoints + '.npz', allow_pickle=True)

    # step 5: ..... all the other shit
    # starting to copy stuff over from run.py

    # extract dataset from the init dictionary
    dataset = h['dataset']
    keypoints_metadata = h['keypoints_metadata']
    keypoints_symmetry = h['keypoints_symmetry']
    kps_left = h['kps_left']
    kps_right = h['kps_right']
    joints_left = h['joints_left']
    joints_right = h['joints_right']

    # normalize
    for i in range(len(kp)):
        koord = kp[i]
        kp[i] = normalize_screen_coordinates(koord, h['frame_metadata']['w'], h['frame_metadata']['h'])
    # for kps in enumerate(keypoints):
    #     kps[..., :2] = normalize_screen_coordinates(kps[..., :2], frame_metadata['w'], frame_metadata['h'])

    # this is taken from args.architecture and run.py and just hardcoded, skipping a lot of nonsense
    filter_widths = [int(x) for x in "3,3,3,3,3".split(',')]
    skeleton_num_joints = dataset.skeleton().num_joints()
    # skeleton_num_joints = 17
    causal = True
    dropout = 0.25
    channels = 1024
    dense = False

    model_pos_train = TemporalModelOptimized1f(kp.shape[-2], kp.shape[-1], skeleton_num_joints,
                                               filter_widths=filter_widths, causal=causal,
                                               dropout=dropout, channels=channels)
    model_pos = TemporalModel(kp.shape[-2], kp.shape[-1], skeleton_num_joints,
                              filter_widths=filter_widths, causal=causal, dropout=dropout,
                              channels=channels, dense=dense)

    receptive_field = model_pos.receptive_field()
    print('INFO: Receptive field: {} frames'.format(receptive_field))
    pad = (receptive_field - 1) // 2  # Padding on each side
    # if args.causal:
    #     print('INFO: Using causal convolutions')
    #     causal_shift = pad
    # else:
    #     causal_shift = 0
    causal_shift = pad

    model_params = 0
    for parameter in model_pos.parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    if torch.cuda.is_available():
        model_pos = model_pos.cuda()
        model_pos_train = model_pos_train.cuda()

    # if args.resume or args.evaluate:
    if True:
        chk_filename = "checkpoint/pretrained_h36m_detectron_coco.bin"
        print('Loading checkpoint', chk_filename)
        checkpoint = torch.load(chk_filename, map_location=lambda storage, loc: storage)
        print('This model was trained for {} epochs'.format(checkpoint['epoch']))
        model_pos_train.load_state_dict(checkpoint['model_pos'])
        model_pos.load_state_dict(checkpoint['model_pos'])

    # false in our particular case... we might benefit from getting rid of model_traj,
    # unless it's super fast, in which case we should just keep it in case we ever upgrade
    if 'model_traj' in checkpoint:
        # Load the trajectory model if it is contained in the checkpoint (e.g. for inference in the wild)
        model_traj = TemporalModel(kp.shape[-2], kp.shape[-1], 1,
                                   filter_widths=filter_widths, causal=causal, dropout=dropout,
                                   channels=channels, dense=dense)
        if torch.cuda.is_available():
            model_traj = model_traj.cuda()
        model_traj.load_state_dict(checkpoint['model_traj'])
    else:
        model_traj = None

    test_generator = UnchunkedGenerator(None, None, kp,
                                        pad=pad, causal_shift=causal_shift, augment=False,
                                        kps_left=kps_left, kps_right=kps_right,
                                        joints_left=joints_left, joints_right=joints_right)
    print('INFO: Testing on {} frames'.format(test_generator.num_frames()))

    # Evaluate
    def evaluate(eval_generator, action=None, return_predictions=False, use_trajectory_model=False):
        epoch_loss_3d_pos = 0
        epoch_loss_3d_pos_procrustes = 0
        epoch_loss_3d_pos_scale = 0
        epoch_loss_3d_vel = 0
        with torch.no_grad():
            if not use_trajectory_model:
                model_pos.eval()
            else:
                model_traj.eval()
            N = 0
            for _, batch, batch_2d in eval_generator.next_epoch():
                inputs_2d = torch.from_numpy(batch_2d.astype('float32'))
                if torch.cuda.is_available():
                    inputs_2d = inputs_2d.cuda()

                # Positional model
                if not use_trajectory_model:
                    predicted_3d_pos = model_pos(inputs_2d)
                else:
                    predicted_3d_pos = model_traj(inputs_2d)

                # Test-time augmentation (if enabled)
                if eval_generator.augment_enabled():
                    # Undo flipping and take the average with the non-flipped version
                    predicted_3d_pos[1, :, :, 0] *= -1
                    if not use_trajectory_model:
                        predicted_3d_pos[1, :, joints_left + joints_right] = \
                            predicted_3d_pos[1, :, joints_right + joints_left]
                    predicted_3d_pos = torch.mean(predicted_3d_pos, dim=0, keepdim=True)

                if return_predictions:
                    return predicted_3d_pos.squeeze(0).cpu().numpy()

                inputs_3d = torch.from_numpy(batch.astype('float32'))
                if torch.cuda.is_available():
                    inputs_3d = inputs_3d.cuda()
                inputs_3d[:, :, 0] = 0
                if eval_generator.augment_enabled():
                    inputs_3d = inputs_3d[:1]

                error = mpjpe(predicted_3d_pos, inputs_3d)
                epoch_loss_3d_pos_scale += inputs_3d.shape[0] * inputs_3d.shape[1] * n_mpjpe(predicted_3d_pos, inputs_3d).item()
                epoch_loss_3d_pos += inputs_3d.shape[0] * inputs_3d.shape[1] * error.item()
                N += inputs_3d.shape[0] * inputs_3d.shape[1]

                inputs = inputs_3d.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])
                predicted_3d_pos = predicted_3d_pos.cpu().numpy().reshape(-1, inputs_3d.shape[-2], inputs_3d.shape[-1])
                epoch_loss_3d_pos_procrustes += inputs_3d.shape[0] * inputs_3d.shape[1] * p_mpjpe(predicted_3d_pos, inputs)

                # Compute velocity error
                epoch_loss_3d_vel += inputs_3d.shape[0] * inputs_3d.shape[1] * mean_velocity_error(predicted_3d_pos, inputs)

        if action is None:
            print('----------')
        else:
            print('----' + action + '----')
        e1 = (epoch_loss_3d_pos / N) * 1000
        e2 = (epoch_loss_3d_pos_procrustes / N) * 1000
        e3 = (epoch_loss_3d_pos_scale / N) * 1000
        ev = (epoch_loss_3d_vel / N) * 1000
        print('Test time augmentation:', eval_generator.augment_enabled())
        print('Protocol #1 Error (MPJPE):', e1, 'mm')
        print('Protocol #2 Error (P-MPJPE):', e2, 'mm')
        print('Protocol #3 Error (N-MPJPE):', e3, 'mm')
        print('Velocity Error (MPJVE):', ev, 'mm')
        print('----------')
        return e1, e2, e3, ev

    image_keypoints2d = kp
    gen = UnchunkedGenerator(None, None, [[image_keypoints2d]],
                             pad=pad, causal_shift=causal_shift, augment=False,
                             kps_left=kps_left, kps_right=kps_right,
                             joints_left=joints_left, joints_right=joints_right)
    prediction = evaluate(gen, return_predictions=True)

    # here is the data format (a Python transcription of this enum follows after this function)
    # public enum VideoPose3dJointOrder
    # {
    #     HIP = 0, R_HIP = 1, R_KNEE = 2, R_FOOT = 3,
    #     L_HIP = 4, L_KNEE = 5, L_FOOT = 6,
    #     SPINE = 7, THORAX = 8, NOSE = 9, HEAD = 10,
    #     L_SHOULDER = 11, L_ELBOW = 12, L_WRIST = 13,
    #     R_SHOULDER = 14, R_ELBOW = 15, R_WRIST = 16
    # }

    # this bugs out. dunno what the hell they were trying to do.
    # anyway we can fix it by just getting width/height some other way.

    # Invert camera transformation
    cam = dataset.cameras()
    width = cam['frame'][0]['res_w']
    height = cam['frame'][0]['res_h']
    image_keypoints2d = image_coordinates(image_keypoints2d[..., :2], w=width, h=height)

    viz_camera = 0
    # If the ground truth is not available, take the camera extrinsic params from a random subject.
    # They are almost the same, and anyway, we only need this for visualization purposes.
    for subject in dataset.cameras():
        if 'orientation' in dataset.cameras()[subject][viz_camera]:
            rot = dataset.cameras()[subject][viz_camera]['orientation']
            break
    prediction = camera_to_world(prediction, R=rot, t=0)

    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])

    # because the algo was meant for a list of frames, we take the first frame (our only frame)
    prediction3d = prediction[0]
    return prediction3d, image_keypoints2d

    # do we want to visualize? this code used to write to json and create a video for visualization
    # (note: unreachable after the return above)
    # if args.viz_output is not None:
    if True:
        anim_output = {'Reconstruction': prediction}

        # format the data in the same format as mediapipe, so we can load it in unity with the same script
        # we need a list (frames) of lists of 3d landmarks.
        unity_landmarks = prediction.tolist()

        # how to send data? or display it?
        # maybe draw it on the webcam feed....?!?!?!
        # with open(args.output_json, "w") as json_file:
        #     json.dump(unity_landmarks, json_file)

        # if args.rendervideo == "yes":
        #     from common.visualization import render_animation
        #     render_animation(input_keypoints, keypoints_metadata, anim_output,
        #                      dataset.skeleton(), dataset.fps(), args.viz_bitrate, cam['azimuth'], args.viz_output,
        #                      limit=args.viz_limit, downsample=args.viz_downsample, size=args.viz_size,
        #                      input_video_path=args.viz_video, viewport=(cam['res_w'], cam['res_h']),
        #                      input_video_skip=args.viz_skip)

    we_re_done_here = 1
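# The C# enum quoted in the comments above transcribes directly to Python; a
# hedged sketch of the same H36M/VideoPose3D joint order as an IntEnum (the
# names mirror the comment, not an API from this repo):
from enum import IntEnum

class VideoPose3dJointOrder(IntEnum):
    HIP = 0; R_HIP = 1; R_KNEE = 2; R_FOOT = 3
    L_HIP = 4; L_KNEE = 5; L_FOOT = 6
    SPINE = 7; THORAX = 8; NOSE = 9; HEAD = 10
    L_SHOULDER = 11; L_ELBOW = 12; L_WRIST = 13
    R_SHOULDER = 14; R_ELBOW = 15; R_WRIST = 16

# e.g. prediction3d[VideoPose3dJointOrder.THORAX] indexes the thorax joint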
# Predictions are in camera space
np.save(args.viz_export, prediction)

if args.viz_output is not None:
    # Invert camera transformation
    cam = dataset.cameras()[args.viz_subject][args.viz_camera]
    # If the ground truth is not available, take the camera extrinsic params from a random subject.
    # They are almost the same, and anyway, we only need this for visualization purposes.
    rot = None
    for subject in dataset.cameras():
        if 'orientation' in dataset.cameras()[subject][args.viz_camera]:
            rot = dataset.cameras()[subject][args.viz_camera]['orientation']
            break
    prediction = camera_to_world(prediction, R=rot, t=0)
    # We don't have the trajectory, but at least we can rebase the height
    prediction[:, :, 2] -= np.min(prediction[:, :, 2])
    anim_output = {'Reconstruction': prediction}
    input_keypoints = image_coordinates(input_keypoints[..., :2], w=cam['res_w'], h=cam['res_h'])
    # print('w, h:', cam['res_w'], cam['res_h'])

    from common.visualization import render_animation
    print("rot:", rot)
    print("cam['azimuth']:", cam['azimuth'])
    render_animation(input_keypoints, keypoints_metadata,
def draw_3Dimg_adjust(pos, image, display=None, kpt2D=None, shapes=None):
    from mpl_toolkits.mplot3d import Axes3D  # required for the 3D projection
    from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
    fig = plt.figure(figsize=(12, 6))
    canvas = FigureCanvas(fig)

    # 2D
    fig.add_subplot(131)
    if isinstance(kpt2D, np.ndarray):
        plt.imshow(draw_2Dimg(image, kpt2D))
    else:
        plt.imshow(image)

    # nrsfm correction
    # if shapes is not None:
    #     index_list = [0, 5, 7, 9, 6, 8, 10, 11, 13, 15, 12, 14, 16]
    #     pos[index_list] = shapes[-1]

    # 3D
    ax = fig.add_subplot(132, projection='3d')
    radius = 1.7
    ax.view_init(elev=15., azim=70.)
    ax.set_xlim3d([-radius / 2, radius / 2])
    ax.set_zlim3d([0, radius])
    ax.set_ylim3d([-radius / 2, radius / 2])
    ax.set_aspect('equal')
    # Axis tick labels
    # ax.set_xticklabels([])
    # ax.set_yticklabels([])
    # ax.set_zticklabels([])
    # ax.dist = 7.5
    parents = common.skeleton_parents
    joints_right = common.joints_right
    for j, j_parent in enumerate(parents):
        if j_parent == -1:
            continue
        col = 'red' if j in joints_right else 'black'
        # Draw the 3D skeleton
        ax.plot([pos[j, 0], pos[j_parent, 0]],
                [pos[j, 1], pos[j_parent, 1]],
                [pos[j, 2], pos[j_parent, 2]], zdir='z', c=col)

    # nrsfm
    bx = fig.add_subplot(133, projection='3d')
    bx.view_init(elev=15., azim=70.)
    bx.set_xlim3d([-radius / 2, radius / 2])
    bx.set_zlim3d([0, radius])
    bx.set_ylim3d([-radius / 2, radius / 2])
    bx.set_aspect('equal')

    # to_world
    from common.camera import camera_to_world
    shapes = np.array(shapes)
    shapes = shapes[:, :, [0, 2, 1]]
    shapes = camera_to_world(shapes, R=common.rot, t=0)
    shapes[:, :, 2] -= np.min(shapes[:, :, 2])
    one_shape = shapes[-1]
    # one_shape = one_shape[:, [0, 2, 1]]
    for j, j_parent in common.my_connections:
        bx.plot([one_shape[j, 0], one_shape[j_parent, 0]],
                [one_shape[j, 1], one_shape[j_parent, 1]],
                [one_shape[j, 2], one_shape[j_parent, 2]], zdir='z', c='black')

    width, height = fig.get_size_inches() * fig.get_dpi()
    canvas.draw()  # draw the canvas, cache the renderer
    # np.fromstring is deprecated for binary data; np.frombuffer is a drop-in replacement
    image = np.frombuffer(canvas.tostring_rgb(), dtype='uint8').reshape(int(height), int(width), 3)
    if display:
        cv2.imshow('im', image)
        cv2.waitKey(1)
    return image