def main():
    # check and download model files
    check_and_download_models(WEIGHT_PATH, MODEL_PATH, REMOTE_PATH)
    check_file_existance(FILE_PATH)

    # prepare input data
    canvas_3d = np.zeros((720, 1280, 3), dtype=np.uint8)
    plotter = Plotter3d(canvas_3d.shape[:2])
    canvas_3d_window_name = 'Canvas3D'
    cv2.namedWindow(canvas_3d_window_name)
    cv2.setMouseCallback(canvas_3d_window_name, Plotter3d.mouse_callback)

    with open(FILE_PATH, 'r') as f:
        extrinsics = json.load(f)

    R = np.array(extrinsics['R'], dtype=np.float32)
    t = np.array(extrinsics['t'], dtype=np.float32)

    if args.video is None:
        frame_provider = ImageReader([args.input])
        is_video = False
    else:
        frame_provider = VideoReader(args.video)
        is_video = True

    fx = -1
    delay = 1
    esc_code = 27
    p_code = 112
    space_code = 32
    mean_time = 0
    img_mean = np.array([128, 128, 128], dtype=np.float32)
    base_width_calculated = False

    # initialize the network
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    net = ailia.Net(MODEL_PATH, WEIGHT_PATH, env_id=env_id)

    # inference
    for frame_id, frame in enumerate(frame_provider):
        current_time = cv2.getTickCount()
        if frame is None:
            break

        if not base_width_calculated:
            IMAGE_WIDTH = frame.shape[1] * (IMAGE_HEIGHT / frame.shape[0])
            IMAGE_WIDTH = int(IMAGE_WIDTH / STRIDE) * STRIDE
            net.set_input_shape((1, 3, IMAGE_HEIGHT, IMAGE_WIDTH))
            base_width_calculated = True

        input_scale = IMAGE_HEIGHT / frame.shape[0]
        scaled_img = cv2.resize(frame,
                                dsize=None,
                                fx=input_scale,
                                fy=input_scale)
        # padding would preserve the full frame, but cropping to a multiple
        # of STRIDE keeps the demo simple
        scaled_img = scaled_img[:, 0:scaled_img.shape[1] -
                                (scaled_img.shape[1] % STRIDE)]
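        # a padding alternative (a sketch, not used in this demo): pad the
        # right edge up to the next multiple of STRIDE instead of cropping:
        #   pad = (STRIDE - scaled_img.shape[1] % STRIDE) % STRIDE
        #   scaled_img = cv2.copyMakeBorder(scaled_img, 0, 0, 0, pad,
        #                                   cv2.BORDER_CONSTANT, value=0)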

        if fx < 0:  # Focal length is unknown
            fx = np.float32(0.8 * frame.shape[1])

        normalized_img = (scaled_img.astype(np.float32) - img_mean) / 255.0
        normalized_img = np.expand_dims(normalized_img.transpose(2, 0, 1),
                                        axis=0)
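        # normalized_img is now NCHW float32, roughly in [-0.5, 0.5],
        # matching the (1, 3, IMAGE_HEIGHT, IMAGE_WIDTH) input shape set above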

        # execution
        if is_video:
            input_blobs = net.get_input_blob_list()
            net.set_input_blob_data(normalized_img, input_blobs[0])
            net.update()
            features, heatmaps, pafs = net.get_results()

        else:
            print('Start inference...')
            if args.benchmark:
                print('BENCHMARK mode')
                for i in range(5):
                    start = int(round(time.time() * 1000))
                    features, heatmaps, pafs = net.predict([normalized_img])
                    end = int(round(time.time() * 1000))
                    print(f'\tailia processing time {end - start} ms')
            else:
                features, heatmaps, pafs = net.predict([normalized_img])

        inference_result = (features[-1].squeeze(), heatmaps[-1].squeeze(),
                            pafs[-1].squeeze())

        poses_3d, poses_2d = parse_poses(inference_result, input_scale, STRIDE,
                                         fx, is_video)
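        # parse_poses returns poses_3d as flat (num_poses, 19 * 4) arrays:
        # 19 keypoints, each stored as (x, y, z, score). The block below
        # remaps the camera axes to the plotter's convention and builds one
        # skeleton edge list per detected person.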
        edges = []
        if len(poses_3d):
            poses_3d = rotate_poses(poses_3d, R, t)
            poses_3d_copy = poses_3d.copy()
            x = poses_3d_copy[:, 0::4]
            y = poses_3d_copy[:, 1::4]
            z = poses_3d_copy[:, 2::4]
            poses_3d[:, 0::4], poses_3d[:, 1::4], poses_3d[:, 2::4] = -z, x, -y

            poses_3d = poses_3d.reshape(poses_3d.shape[0], 19, -1)[:, :, 0:3]
            edges = (Plotter3d.SKELETON_EDGES +
                     19 * np.arange(poses_3d.shape[0]).reshape(
                         (-1, 1, 1))).reshape((-1, 2))
        plotter.plot(canvas_3d, poses_3d, edges)

        if is_video:
            cv2.imshow(canvas_3d_window_name, canvas_3d)
        else:
            cv2.imwrite(f'Canvas3D_{frame_id}.png', canvas_3d)

        draw_poses(frame, poses_2d)
        current_time = (cv2.getTickCount() -
                        current_time) / cv2.getTickFrequency()
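        # smooth the per-frame time with an exponential moving average
        # (alpha = 0.05) so the FPS readout stays stable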
        if mean_time == 0:
            mean_time = current_time
        else:
            mean_time = mean_time * 0.95 + current_time * 0.05
        cv2.putText(frame, 'FPS: {}'.format(int(1 / mean_time * 10) / 10),
                    (40, 80), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 255))

        if is_video:
            cv2.imshow('ICV 3D Human Pose Estimation', frame)
        else:
            cv2.imwrite(args.savepath, frame)

        key = cv2.waitKey(delay)
        if key == esc_code:
            break
        if key == p_code:
            if delay == 1:
                delay = 0
            else:
                delay = 1

        if delay == 0 and args.rotate3d:
            key = 0
            while (key != p_code and key != esc_code and key != space_code):
                plotter.plot(canvas_3d, poses_3d, edges)
                cv2.imshow(canvas_3d_window_name, canvas_3d)
                key = cv2.waitKey(33)
            if key == esc_code:
                break
            else:
                delay = 1

    print('Script finished successfully.')
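For reference, the axis remap used above can be factored into a standalone
helper; a minimal sketch (numpy only, the helper name is illustrative):

import numpy as np

def remap_axes(poses_3d):
    """Map camera-space (x, y, z) to the plotter's (-z, x, -y) convention.

    `poses_3d` is (num_poses, 19 * 4): 19 joints stored as (x, y, z, score).
    """
    out = poses_3d.copy()
    x, y, z = poses_3d[:, 0::4], poses_3d[:, 1::4], poses_3d[:, 2::4]
    out[:, 0::4], out[:, 1::4], out[:, 2::4] = -z, x, -y
    return out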
Example #3
def run_inference(args):
    from modules.inference_engine_pytorch import InferenceEnginePyTorch

    socket_server = SocketServer(args.port)
    joint_angle_calculator = JointAngleCalculator()

    stride = 8

    model_path = os.path.join('models', 'human-pose-estimation-3d.pth')
    net = InferenceEnginePyTorch(model_path, "GPU")

    canvas_3d = np.zeros((720, 1280, 3), dtype=np.uint8)
    plotter = Plotter3d(canvas_3d.shape[:2])
    canvas_3d_window_name = 'Canvas 3D'
    cv2.namedWindow(canvas_3d_window_name)
    cv2.setMouseCallback(canvas_3d_window_name, Plotter3d.mouse_callback)

    file_path = os.path.join('data', 'extrinsics.json')
    with open(file_path, 'r') as f:
        extrinsics = json.load(f)
    R = np.array(extrinsics['R'], dtype=np.float32)
    t = np.array(extrinsics['t'], dtype=np.float32)

    frame_provider = ImageReader(args.images)
    is_video = False
    if args.video != '':
        frame_provider = VideoReader(args.video)
        is_video = True
    base_height = args.height_size
    fx = -1  # focal length in pixels; estimated from the frame width below

    delay = 1
    esc_code = 27
    p_code = 112
    space_code = 32
    mean_time = 0

    for frame in frame_provider:
        current_time = cv2.getTickCount()
        if frame is None:
            break
        input_scale = base_height / frame.shape[0]
        scaled_img = cv2.resize(frame, dsize=None, fx=input_scale, fy=input_scale)
        scaled_img = scaled_img[:, 0:scaled_img.shape[1] - (scaled_img.shape[1] % stride)]  # better to pad, but cropping keeps the demo simple
        if fx < 0:  # Focal length is unknown
            fx = np.float32(0.8 * frame.shape[1])

        inference_result = net.infer(scaled_img)
        poses_3d, poses_2d = parse_poses(inference_result, input_scale, stride, fx, is_video)
        edges = []

        if len(poses_3d):
            poses_3d = rotate_poses(poses_3d, R, t)
            poses_3d_copy = poses_3d.copy()
            x = poses_3d_copy[:, 0::4]
            y = poses_3d_copy[:, 1::4]
            z = poses_3d_copy[:, 2::4]
            poses_3d[:, 0::4], poses_3d[:, 1::4], poses_3d[:, 2::4] = -z, x, -y

            poses_3d = poses_3d.reshape(poses_3d.shape[0], 19, -1)[:, :, 0:3]
            edges = (Plotter3d.SKELETON_EDGES + 19 * np.arange(poses_3d.shape[0]).reshape((-1, 1, 1))).reshape((-1, 2))

        plotter.plot(canvas_3d, poses_3d, edges)
        cv2.imshow(canvas_3d_window_name, canvas_3d)

        draw_poses(frame, poses_2d)
        current_time = (cv2.getTickCount() - current_time) / cv2.getTickFrequency()
        if mean_time == 0:
            mean_time = current_time
        else:
            mean_time = mean_time * 0.95 + current_time * 0.05
        cv2.putText(frame, 'FPS: {}'.format(int(1 / mean_time * 10) / 10),
                    (40, 80), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 255))
        cv2.imshow('ICV 3D Human Pose Estimation', frame)

        key = cv2.waitKey(delay)
        if key == esc_code:
            break
        if key == p_code:
            if delay == 1:
                delay = 0
            else:
                delay = 1
        if delay == 0 or not is_video:  # allow rotating the 3D canvas while paused
            key = 0
            while (key != p_code
                   and key != esc_code
                   and key != space_code):
                plotter.plot(canvas_3d, poses_3d, edges)
                cv2.imshow(canvas_3d_window_name, canvas_3d)
                key = cv2.waitKey(33)
            if key == esc_code:
                break
            else:
                delay = 1
        
        joint_angles = joint_angle_calculator.calculate_angles(poses_3d)
        if joint_angles:
            socket_server.send_data(joint_angles)
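A minimal driver for run_inference might look like the sketch below; the
--port flag and its default are assumptions (SocketServer is not shown here),
while the remaining flags mirror the parser in the next example.

if __name__ == '__main__':
    from argparse import ArgumentParser

    parser = ArgumentParser(
        description='3D pose demo with joint-angle streaming.')
    parser.add_argument('--video', type=str, default='',
                        help='Path to video file or camera id.')
    parser.add_argument('--images', nargs='+', default='',
                        help='Path to input image(s).')
    parser.add_argument('--height-size', type=int, default=256,
                        help='Network input layer height size.')
    parser.add_argument('--port', type=int, default=5000,
                        help='Socket server port (assumed).')
    run_inference(parser.parse_args())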
Example #4
def main():
    parser = ArgumentParser(
        description='Lightweight 3D human pose estimation demo. '
        'Press esc to exit, "p" to (un)pause video or process next image.')
    parser.add_argument(
        '-m',
        '--model',
        help='Required. Path to checkpoint with a trained model '
        '(or an .xml file in case of OpenVINO inference).',
        type=str,
        required=True)
    parser.add_argument('--video',
                        help='Optional. Path to video file or camera id.',
                        type=str,
                        default='')
    parser.add_argument('-o',
                        '--output',
                        help='output directory for estimated results',
                        default='./output')

    parser.add_argument(
        '-d',
        '--device',
        help='Optional. Specify the target device to infer on: CPU or GPU. '
        'The demo will look for a suitable plugin for device specified '
        '(by default, it is GPU).',
        type=str,
        default='GPU')
    parser.add_argument(
        '--use-openvino',
        help='Optional. Run network with OpenVINO as inference engine. '
        'CPU, GPU, FPGA, HDDL or MYRIAD devices are supported.',
        action='store_true')
    parser.add_argument(
        '--use-tensorrt',
        help='Optional. Run network with TensorRT as inference engine.',
        action='store_true')
    parser.add_argument('--images',
                        help='Optional. Path to input image(s).',
                        nargs='+',
                        default='')
    parser.add_argument('--height-size',
                        help='Optional. Network input layer height size.',
                        type=int,
                        default=256)
    parser.add_argument('--extrinsics-path',
                        help='Optional. Path to file with camera extrinsics.',
                        type=str,
                        default=None)
    parser.add_argument('--fx',
                        type=np.float32,
                        default=-1,
                        help='Optional. Camera focal length.')
    args = parser.parse_args()

    if args.video == '' and args.images == '':
        raise ValueError('Either --video or --images has to be provided')

    infer_ctrl = InferCtrl(args.model,
                           args.height_size,
                           device=args.device,
                           openvino=args.use_openvino,
                           tensorrt=args.use_tensorrt,
                           extrinsics_path=args.extrinsics_path,
                           fx=args.fx)

    frame_provider = ImageReader(args.images)
    is_video = False

    # derive an output name; for images, use the first image's basename
    outname = ''
    if args.images != '':
        outname = os.path.splitext(os.path.basename(args.images[0]))[0]

    if args.video != '':
        frame_provider = VideoReader(args.video)
        is_video = True
        try:
            cam_index = int(args.video)
            outname = "output-cam-{}".format(cam_index)
        except ValueError:
            outname = ".".join(args.video.split(os.sep)[-1].split(".")[:-1])

    fx = args.fx

    mean_time = 0
    i = 0
    frame_size = None, None

    dir_name = os.path.join(args.output, outname)
    os.makedirs(dir_name, exist_ok=True)

    outname_3d = os.path.join(dir_name, "{}-temp-3D.mp4".format(outname))
    outname_frames = os.path.join(dir_name, "{}-temp.mp4".format(outname))
    outname_combined_frames = os.path.join(
        dir_name, "{}-combined-temp.mp4".format(outname))

    outname_3d_compressed = os.path.join(dir_name, "{}-3D.mp4".format(outname))
    outname_frames_compressed = os.path.join(dir_name,
                                             "{}.mp4".format(outname))
    outname_frames_combined_compressed = os.path.join(
        dir_name, "{}-combined.mp4".format(outname))

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = None
    out_3d = None
    out_combined = None
    fps = None
    fps_out = 15

    try:
        results_json = []
        for frame in frame_provider:
            current_time = cv2.getTickCount()
            if frame is None:
                break

            inference_result = infer_ctrl.infer(frame, is_video=is_video, fx=fx)

            # interpret the results
            frame, canvas_3d = infer_ctrl.process_frame(
                frame, inference_result)
            combined_frame = infer_ctrl.process_frame(frame,
                                                      inference_result,
                                                      merged=True)

            # convert numpy arrays/scalars to plain Python types so the
            # results are JSON-serializable
            for key in inference_result.keys():
                value = inference_result[key].get("value", [])
                if isinstance(value, (np.ndarray, np.generic)):
                    inference_result[key]["value"] = value.tolist()

            results_json.append(inference_result)

            current_time = (cv2.getTickCount() -
                            current_time) / cv2.getTickFrequency()

            if mean_time == 0:
                mean_time = current_time
            else:
                mean_time = mean_time * 0.95 + current_time * 0.05

            fps = int(1 / mean_time * 10) / 10

            cv2.putText(frame, 'processing FPS: {}'.format(fps), (40, 80),
                        cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 255))
            i += 1

            if out is None or out_3d is None:
                # VideoWriter expects (width, height), so reverse the
                # (height, width) order of the numpy shapes
                frame_size = frame.shape[1::-1]
                frame_size_3d = canvas_3d.shape[1::-1]
                frame_size_combined = combined_frame.shape[1::-1]

                print(frame_size_3d, frame_size, frame_size_combined)

                out_3d = cv2.VideoWriter(outname_3d, fourcc, fps_out,
                                         frame_size_3d, True)
                out = cv2.VideoWriter(outname_frames, fourcc, fps_out,
                                      frame_size, True)
                out_combined = cv2.VideoWriter(outname_combined_frames, fourcc,
                                               fps_out, frame_size_combined,
                                               True)

            if out is not None:
                out.write(frame)

            if out_3d is not None:
                out_3d.write(canvas_3d)

            if out_combined is not None:
                out_combined.write(combined_frame)

    except KeyboardInterrupt:
        print("[INFO] interrupted")

    if out is not None:
        out.release()

    if out_3d is not None:
        out_3d.release()

    if out_combined is not None:
        out_combined.release()

    with open(os.path.join(dir_name, "results.json"), "w") as fp:
        json.dump(results_json, fp)

    try:
        os.system(
            f"ffmpeg -i {outname_frames} -loglevel error -vcodec libx264 {outname_frames_compressed}"
        )
        os.system(
            f"ffmpeg -i {outname_3d} -loglevel error -vcodec libx264 {outname_3d_compressed}"
        )
        os.system(
            f"ffmpeg -i {outname_combined_frames} -loglevel error -vcodec libx264 {outname_frames_combined_compressed}"
        )

        # os.system(f"rm -rf {outname_frames} {outname_3d}")
    except Exception:
        traceback.print_exc()

    print("[INFO] finished .... ")