def main():
    # model files check and download
    check_and_download_models(WEIGHT_PATH, MODEL_PATH, REMOTE_PATH)
    check_file_existance(FILE_PATH)

    # prepare input data
    canvas_3d = np.zeros((720, 1280, 3), dtype=np.uint8)
    plotter = Plotter3d(canvas_3d.shape[:2])
    canvas_3d_window_name = 'Canvas3D'
    cv2.namedWindow(canvas_3d_window_name)
    cv2.setMouseCallback(canvas_3d_window_name, Plotter3d.mouse_callback)

    with open(FILE_PATH, 'r') as f:
        extrinsics = json.load(f)
    R = np.array(extrinsics['R'], dtype=np.float32)
    t = np.array(extrinsics['t'], dtype=np.float32)

    if args.video is None:
        frame_provider = ImageReader([args.input])
        is_video = False
    else:
        frame_provider = VideoReader(args.video)
        is_video = True

    fx = -1  # focal length; negative means unknown, estimated from the frame width below
    delay = 1
    esc_code = 27
    p_code = 112
    space_code = 32
    mean_time = 0
    img_mean = np.array([128, 128, 128], dtype=np.float32)
    base_width_calculated = False

    # net initialize
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    net = ailia.Net(MODEL_PATH, WEIGHT_PATH, env_id=env_id)

    # inference
    for frame_id, frame in enumerate(frame_provider):
        current_time = cv2.getTickCount()
        if frame is None:
            break

        if not base_width_calculated:
            # derive the network input width from the frame aspect ratio,
            # rounded down to a multiple of the network stride
            IMAGE_WIDTH = frame.shape[1] * (IMAGE_HEIGHT / frame.shape[0])
            IMAGE_WIDTH = int(IMAGE_WIDTH / STRIDE) * STRIDE
            net.set_input_shape((1, 3, IMAGE_HEIGHT, IMAGE_WIDTH))
            base_width_calculated = True

        input_scale = IMAGE_HEIGHT / frame.shape[0]
        scaled_img = cv2.resize(frame, dsize=None, fx=input_scale, fy=input_scale)
        # better to pad, but cut out for demo
        scaled_img = scaled_img[:, 0:scaled_img.shape[1] - (scaled_img.shape[1] % STRIDE)]

        if fx < 0:  # focal length is unknown
            fx = np.float32(0.8 * frame.shape[1])

        normalized_img = (scaled_img.astype(np.float32) - img_mean) / 255.0
        normalized_img = np.expand_dims(normalized_img.transpose(2, 0, 1), axis=0)

        # execution
        if is_video:
            input_blobs = net.get_input_blob_list()
            net.set_input_blob_data(normalized_img, input_blobs[0])
            net.update()
            features, heatmaps, pafs = net.get_results()
        else:
            print('Start inference...')
            if args.benchmark:
                print('BENCHMARK mode')
                for i in range(5):
                    start = int(round(time.time() * 1000))
                    features, heatmaps, pafs = net.predict([normalized_img])
                    end = int(round(time.time() * 1000))
                    print(f'\tailia processing time {end - start} ms')
            else:
                features, heatmaps, pafs = net.predict([normalized_img])

        inference_result = (
            features[-1].squeeze(),
            heatmaps[-1].squeeze(),
            pafs[-1].squeeze(),
        )

        poses_3d, poses_2d = parse_poses(inference_result, input_scale, STRIDE, fx, is_video)
        edges = []
        if len(poses_3d):
            poses_3d = rotate_poses(poses_3d, R, t)
            poses_3d_copy = poses_3d.copy()
            x = poses_3d_copy[:, 0::4]
            y = poses_3d_copy[:, 1::4]
            z = poses_3d_copy[:, 2::4]
            # remap camera axes to the plotter's coordinate system
            poses_3d[:, 0::4], poses_3d[:, 1::4], poses_3d[:, 2::4] = -z, x, -y
            poses_3d = poses_3d.reshape(poses_3d.shape[0], 19, -1)[:, :, 0:3]
            edges = (Plotter3d.SKELETON_EDGES + 19 * np.arange(poses_3d.shape[0]).reshape(
                (-1, 1, 1))).reshape((-1, 2))
        plotter.plot(canvas_3d, poses_3d, edges)
        if is_video:
            cv2.imshow(canvas_3d_window_name, canvas_3d)
        else:
            cv2.imwrite(f'Canvas3D_{frame_id}.png', canvas_3d)

        draw_poses(frame, poses_2d)
        current_time = (cv2.getTickCount() - current_time) / cv2.getTickFrequency()
        # exponentially weighted moving average of the per-frame processing time
        if mean_time == 0:
            mean_time = current_time
        else:
            mean_time = mean_time * 0.95 + current_time * 0.05
        cv2.putText(frame, 'FPS: {}'.format(int(1 / mean_time * 10) / 10),
                    (40, 80), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 255))
        if is_video:
            cv2.imshow('ICV 3D Human Pose Estimation', frame)
        else:
            cv2.imwrite(args.savepath, frame)

        key = cv2.waitKey(delay)
        if key == esc_code:
            break
        if key == p_code:
            if delay == 1:
                delay = 0
            else:
                delay = 1
        if delay == 0 and args.rotate3d:
            # allow rotating the 3D canvas while paused
            key = 0
            while key != p_code and key != esc_code and key != space_code:
                plotter.plot(canvas_3d, poses_3d, edges)
                cv2.imshow(canvas_3d_window_name, canvas_3d)
                key = cv2.waitKey(33)
            if key == esc_code:
                break
            else:
                delay = 1

    print('Script finished successfully.')
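# All of the demo variants below load camera extrinsics from a JSON file with
# a 3x3 rotation matrix "R" and a 3x1 translation vector "t" (the column shape
# is required for the broadcasting in rotate_poses()). A minimal sketch of the
# expected file layout -- the numbers are illustrative placeholders, not real
# calibration data:
#
#   {
#       "R": [[ 0.1,  0.9,  0.0],
#             [ 0.4,  0.0, -0.9],
#             [-0.9,  0.1, -0.4]],
#       "t": [[10.0],
#             [150.0],
#             [250.0]]
#   }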
canvas_3d = np.zeros((720, 1280, 3), dtype=np.uint8)
plotter = Plotter3d(canvas_3d.shape[:2])
canvas_3d_window_name = 'Canvas 3D'
cv2.namedWindow(canvas_3d_window_name)
cv2.setMouseCallback(canvas_3d_window_name, Plotter3d.mouse_callback)

file_path = args.extrinsics_path
if file_path is None:
    file_path = os.path.join('data', 'extrinsics.json')
with open(file_path, 'r') as f:
    extrinsics = json.load(f)
R = np.array(extrinsics['R'], dtype=np.float32)
t = np.array(extrinsics['t'], dtype=np.float32)

frame_provider = ImageReader(args.images)
is_video = False
if args.video != '':
    frame_provider = VideoReader(args.video)
    is_video = True
base_height = args.height_size
fx = args.fx

delay = 1
esc_code = 27
p_code = 112
space_code = 32
mean_time = 0
for frame in frame_provider:
    current_time = cv2.getTickCount()
    if frame is None:
        break
def run_inference(args):
    from modules.inference_engine_pytorch import InferenceEnginePyTorch

    socket_server = SocketServer(args.port)
    joint_angle_calculator = JointAngleCalculator()

    stride = 8
    model_path = os.path.join('models', 'human-pose-estimation-3d.pth')
    net = InferenceEnginePyTorch(model_path, "GPU")

    canvas_3d = np.zeros((720, 1280, 3), dtype=np.uint8)
    plotter = Plotter3d(canvas_3d.shape[:2])
    canvas_3d_window_name = 'Canvas 3D'
    cv2.namedWindow(canvas_3d_window_name)
    cv2.setMouseCallback(canvas_3d_window_name, Plotter3d.mouse_callback)

    file_path = os.path.join('data', 'extrinsics.json')
    with open(file_path, 'r') as f:
        extrinsics = json.load(f)
    R = np.array(extrinsics['R'], dtype=np.float32)
    t = np.array(extrinsics['t'], dtype=np.float32)

    frame_provider = ImageReader(args.images)
    is_video = False
    if args.video != '':
        frame_provider = VideoReader(args.video)
        is_video = True
    base_height = args.height_size
    # focal length: negative means unknown, so the 0.8 * width fallback below
    # applies (the original hardcoded fx = 1 made that branch unreachable)
    fx = -1

    delay = 1
    esc_code = 27
    p_code = 112
    space_code = 32
    mean_time = 0
    for frame in frame_provider:
        current_time = cv2.getTickCount()
        if frame is None:
            break
        input_scale = base_height / frame.shape[0]
        scaled_img = cv2.resize(frame, dsize=None, fx=input_scale, fy=input_scale)
        # better to pad, but cut out for demo
        scaled_img = scaled_img[:, 0:scaled_img.shape[1] - (scaled_img.shape[1] % stride)]
        if fx < 0:  # focal length is unknown
            fx = np.float32(0.8 * frame.shape[1])

        inference_result = net.infer(scaled_img)
        poses_3d, poses_2d = parse_poses(inference_result, input_scale, stride, fx, is_video)
        edges = []
        if len(poses_3d):
            poses_3d = rotate_poses(poses_3d, R, t)
            poses_3d_copy = poses_3d.copy()
            x = poses_3d_copy[:, 0::4]
            y = poses_3d_copy[:, 1::4]
            z = poses_3d_copy[:, 2::4]
            poses_3d[:, 0::4], poses_3d[:, 1::4], poses_3d[:, 2::4] = -z, x, -y
            poses_3d = poses_3d.reshape(poses_3d.shape[0], 19, -1)[:, :, 0:3]
            edges = (Plotter3d.SKELETON_EDGES + 19 * np.arange(poses_3d.shape[0]).reshape(
                (-1, 1, 1))).reshape((-1, 2))
        plotter.plot(canvas_3d, poses_3d, edges)
        cv2.imshow(canvas_3d_window_name, canvas_3d)

        draw_poses(frame, poses_2d)
        current_time = (cv2.getTickCount() - current_time) / cv2.getTickFrequency()
        if mean_time == 0:
            mean_time = current_time
        else:
            mean_time = mean_time * 0.95 + current_time * 0.05
        cv2.putText(frame, 'FPS: {}'.format(int(1 / mean_time * 10) / 10),
                    (40, 80), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 255))
        cv2.imshow('ICV 3D Human Pose Estimation', frame)

        key = cv2.waitKey(delay)
        if key == esc_code:
            break
        if key == p_code:
            if delay == 1:
                delay = 0
            else:
                delay = 1
        if delay == 0 or not is_video:  # allow to rotate 3D canvas while on pause
            key = 0
            while key != p_code and key != esc_code and key != space_code:
                plotter.plot(canvas_3d, poses_3d, edges)
                cv2.imshow(canvas_3d_window_name, canvas_3d)
                key = cv2.waitKey(33)
            if key == esc_code:
                break
            else:
                delay = 1

        joint_angles = joint_angle_calculator.calculate_angles(poses_3d)
        if joint_angles:
            socket_server.send_data(joint_angles)
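# For reference: rotate_poses() (imported from the demo's modules) maps
# camera-space joints into the world frame defined by the extrinsics. A
# minimal sketch under the assumption, consistent with the slicing above,
# that each row of poses_3d is a flat array of 19 joints x (x, y, z,
# confidence); the name _rotate_poses_sketch is ours, not the library's:
def _rotate_poses_sketch(poses_3d, R, t):
    R_inv = np.linalg.inv(R)  # invert the camera rotation
    for pose_id in range(len(poses_3d)):
        # (19, 4) -> (4, 19): columns are joints, rows are x, y, z, confidence
        pose_3d = poses_3d[pose_id].reshape((-1, 4)).transpose()
        # undo the extrinsic transform on the x, y, z rows only
        pose_3d[0:3, :] = np.dot(R_inv, pose_3d[0:3, :] - t)
        poses_3d[pose_id] = pose_3d.transpose().reshape(-1)
    return poses_3d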
def main():
    parser = ArgumentParser(
        description='Lightweight 3D human pose estimation demo. '
                    'Press esc to exit, "p" to (un)pause video or process next image.')
    parser.add_argument(
        '-m', '--model',
        help='Required. Path to checkpoint with a trained model '
             '(or an .xml file in case of OpenVINO inference).',
        type=str, required=True)
    parser.add_argument('--video', help='Optional. Path to video file or camera id.',
                        type=str, default='')
    parser.add_argument('-o', '--output', help='Output directory for estimated results.',
                        default='./output')
    parser.add_argument(
        '-d', '--device',
        help='Optional. Specify the target device to infer on: CPU or GPU. '
             'The demo will look for a suitable plugin for the specified device '
             '(by default, it is GPU).',
        type=str, default='GPU')
    parser.add_argument(
        '--use-openvino',
        help='Optional. Run network with OpenVINO as inference engine. '
             'CPU, GPU, FPGA, HDDL or MYRIAD devices are supported.',
        action='store_true')
    parser.add_argument(
        '--use-tensorrt',
        help='Optional. Run network with TensorRT as inference engine.',
        action='store_true')
    parser.add_argument('--images', help='Optional. Path to input image(s).',
                        nargs='+', default='')
    parser.add_argument('--height-size', help='Optional. Network input layer height size.',
                        type=int, default=256)
    parser.add_argument('--extrinsics-path',
                        help='Optional. Path to file with camera extrinsics.',
                        type=str, default=None)
    parser.add_argument('--fx', type=np.float32, default=-1,
                        help='Optional. Camera focal length.')
    args = parser.parse_args()

    if args.video == '' and args.images == '':
        raise ValueError('Either --video or --images has to be provided')

    infer_ctrl = InferCtrl(args.model, args.height_size, device=args.device,
                           openvino=args.use_openvino, tensorrt=args.use_tensorrt,
                           extrinsics_path=args.extrinsics_path, fx=args.fx)

    frame_provider = ImageReader(args.images)
    is_video = False
    outname = args.images
    if args.video != '':
        frame_provider = VideoReader(args.video)
        is_video = True
        try:
            cam_index = int(args.video)
            outname = "output-cam-{}".format(cam_index)
        except ValueError:
            outname = ".".join(args.video.split(os.sep)[-1].split(".")[:-1])

    fx = args.fx
    mean_time = 0
    i = 0
    frame_size = None, None

    dir_name = os.path.join(args.output, outname)
    os.makedirs(dir_name, exist_ok=True)
    outname_3d = os.path.join(dir_name, "{}-temp-3D.mp4".format(outname))
    outname_frames = os.path.join(dir_name, "{}-temp.mp4".format(outname))
    outname_combined_frames = os.path.join(
        dir_name, "{}-combined-temp.mp4".format(outname))
    outname_3d_compressed = os.path.join(dir_name, "{}-3D.mp4".format(outname))
    outname_frames_compressed = os.path.join(dir_name, "{}.mp4".format(outname))
    outname_frames_combined_compressed = os.path.join(
        dir_name, "{}-combined.mp4".format(outname))

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = None
    out_3d = None
    out_combined = None
    fps = None
    fps_out = 15

    try:
        results_json = []
        for frame in frame_provider:
            current_time = cv2.getTickCount()
            if frame is None:
                break

            inference_result = infer_ctrl.infer(frame, is_video=is_video, fx=fx)

            # interpreting results
            frame, canvas_3d = infer_ctrl.process_frame(frame, inference_result)
            combined_frame = infer_ctrl.process_frame(frame, inference_result,
                                                      merged=True)

            # convert numpy values to plain lists so the results are JSON-serializable
            for key in inference_result.keys():
                value = inference_result[key].get("value", [])
                if "numpy" in str(type(value)):
                    inference_result[key]["value"] = getattr(
                        value, "tolist", lambda: value)()
            results_json.append(inference_result)

            current_time = (cv2.getTickCount() - current_time) / cv2.getTickFrequency()
            if mean_time == 0:
                mean_time = current_time
            else:
                mean_time = mean_time * 0.95 + current_time * 0.05
            fps = int(1 / mean_time * 10) / 10
            cv2.putText(frame, 'processing FPS: {}'.format(fps),
                        (40, 80), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 255))
            i += 1

            if out is None or out_3d is None:
                # cv2.VideoWriter expects (width, height), i.e. the shape reversed
                frame_size = frame.shape[1::-1]
                frame_size_3d = canvas_3d.shape[1::-1]
                frame_size_combined = combined_frame.shape[1::-1]
                print(frame_size_3d, frame_size, frame_size_combined)
                out_3d = cv2.VideoWriter(outname_3d, fourcc, fps_out,
                                         frame_size_3d, True)
                out = cv2.VideoWriter(outname_frames, fourcc, fps_out,
                                      frame_size, True)
                out_combined = cv2.VideoWriter(outname_combined_frames, fourcc,
                                               fps_out, frame_size_combined, True)
            if out is not None:
                out.write(frame)
            if out_3d is not None:
                out_3d.write(canvas_3d)
            if out_combined is not None:
                out_combined.write(combined_frame)
    except KeyboardInterrupt:
        print("[INFO] interrupted")

    if out is not None:
        out.release()
    if out_3d is not None:
        out_3d.release()
    if out_combined is not None:  # was never released, truncating the combined video
        out_combined.release()

    with open(os.path.join(dir_name, "results.json"), "w") as fp:
        json.dump(results_json, fp)

    try:
        os.system(
            f"ffmpeg -i {outname_frames} -loglevel error -vcodec libx264 {outname_frames_compressed}"
        )
        os.system(
            f"ffmpeg -i {outname_3d} -loglevel error -vcodec libx264 {outname_3d_compressed}"
        )
        os.system(
            f"ffmpeg -i {outname_combined_frames} -loglevel error -vcodec libx264 {outname_frames_combined_compressed}"
        )
        # os.system(f"rm -rf {outname_frames} {outname_3d}")
    except Exception:
        traceback.print_exc()
    print("[INFO] finished")
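# Example invocations, based on the argument parser above. The script and
# checkpoint names are illustrative placeholders and may differ in your setup:
#
#   python demo.py -m human-pose-estimation-3d.pth --video 0
#   python demo.py -m human-pose-estimation-3d.pth --images frame1.jpg frame2.jpg
#   python demo.py -m human-pose-estimation-3d.xml --use-openvino --video input.mp4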