Example #1
    def run(self):

        device = dai.Device(self.create_pipeline())
        device.startPipeline()

        # Define data queues
        if self.input_type == "internal":
            q_video = device.getOutputQueue(name="cam_out",
                                            maxSize=1,
                                            blocking=False)
            q_pd_out = device.getOutputQueue(name="pd_out",
                                             maxSize=1,
                                             blocking=False)
            q_lm_out = device.getOutputQueue(name="lm_out",
                                             maxSize=2,
                                             blocking=False)
            q_lm_in = device.getInputQueue(name="lm_in")
        else:
            q_pd_in = device.getInputQueue(name="pd_in")
            q_pd_out = device.getOutputQueue(name="pd_out",
                                             maxSize=4,
                                             blocking=True)
            q_lm_out = device.getOutputQueue(name="lm_out",
                                             maxSize=4,
                                             blocking=True)
            q_lm_in = device.getInputQueue(name="lm_in")

        self.fps = FPS(mean_nb_frames=20)

        seq_num = 0
        nb_pd_inferences = 0
        nb_lm_inferences = 0
        glob_pd_rtrip_time = 0
        glob_lm_rtrip_time = 0
        while True:
            self.fps.update()

            if self.input_type == "internal":
                in_video = q_video.get()
                video_frame = in_video.getCvFrame()
                self.frame_size = video_frame.shape[0]  # The image is square cropped on the device
                self.pad_w = self.pad_h = 0
            else:
                if self.input_type == "image":
                    vid_frame = self.img
                else:
                    ok, vid_frame = self.cap.read()
                    if not ok:
                        break

                h, w = vid_frame.shape[:2]
                if self.crop:
                    # Cropping the long side to get a square shape
                    self.frame_size = min(h, w)
                    dx = (w - self.frame_size) // 2
                    dy = (h - self.frame_size) // 2
                    video_frame = vid_frame[dy:dy + self.frame_size,
                                            dx:dx + self.frame_size]
                else:
                    # Padding on the small side to get a square shape
                    self.frame_size = max(h, w)
                    self.pad_h = int((self.frame_size - h) / 2)
                    self.pad_w = int((self.frame_size - w) / 2)
                    video_frame = cv2.copyMakeBorder(vid_frame, self.pad_h,
                                                     self.pad_h, self.pad_w,
                                                     self.pad_w,
                                                     cv2.BORDER_CONSTANT)

                frame_nn = dai.ImgFrame()
                frame_nn.setSequenceNum(seq_num)
                frame_nn.setWidth(self.pd_input_length)
                frame_nn.setHeight(self.pd_input_length)
                frame_nn.setData(
                    to_planar(video_frame,
                              (self.pd_input_length, self.pd_input_length)))
                pd_rtrip_time = now()
                q_pd_in.send(frame_nn)

                seq_num += 1

            annotated_frame = video_frame.copy()

            # Get pose detection
            inference = q_pd_out.get()
            if self.input_type != "internal":
                pd_rtrip_time = now() - pd_rtrip_time
                glob_pd_rtrip_time += pd_rtrip_time
            self.pd_postprocess(inference)
            self.pd_render(annotated_frame)
            nb_pd_inferences += 1

            # Landmarks
            self.nb_active_regions = 0
            if self.show_3d:
                self.vis3d.clear_geometries()
                self.vis3d.add_geometry(self.grid_floor,
                                        reset_bounding_box=False)
                self.vis3d.add_geometry(self.grid_wall,
                                        reset_bounding_box=False)
            for i, r in enumerate(self.regions):
                frame_nn = mpu.warp_rect_img(r.rect_points, video_frame,
                                             self.lm_input_length,
                                             self.lm_input_length)
                nn_data = dai.NNData()
                nn_data.setLayer(
                    "input_1",
                    to_planar(frame_nn,
                              (self.lm_input_length, self.lm_input_length)))
                if i == 0:
                    lm_rtrip_time = now()  # We measure only for the first region
                q_lm_in.send(nn_data)

                # Get landmarks
                inference = q_lm_out.get()
                if i == 0:
                    lm_rtrip_time = now() - lm_rtrip_time
                    glob_lm_rtrip_time += lm_rtrip_time
                    nb_lm_inferences += 1
                self.lm_postprocess(r, inference)
                self.lm_render(annotated_frame, r)
            if self.show_3d:
                self.vis3d.poll_events()
                self.vis3d.update_renderer()
            if self.smoothing and self.nb_active_regions == 0:
                self.filter.reset()

            if self.input_type != "internal" and not self.crop:
                annotated_frame = annotated_frame[self.pad_h:self.pad_h + h,
                                                  self.pad_w:self.pad_w + w]

            if self.show_fps:
                self.fps.display(annotated_frame,
                                 orig=(50, 50),
                                 size=1,
                                 color=(240, 180, 100))
            cv2.imshow("Blazepose", annotated_frame)

            if self.output:
                self.output.write(annotated_frame)

            key = cv2.waitKey(1)
            if key == ord('q') or key == 27:
                break
            elif key == 32:
                # Pause on space bar
                cv2.waitKey(0)
            elif key == ord('1'):
                self.show_pd_box = not self.show_pd_box
            elif key == ord('2'):
                self.show_pd_kps = not self.show_pd_kps
            elif key == ord('3'):
                self.show_rot_rect = not self.show_rot_rect
            elif key == ord('4'):
                self.show_landmarks = not self.show_landmarks
            elif key == ord('5'):
                self.show_scores = not self.show_scores
            elif key == ord('6'):
                self.show_gesture = not self.show_gesture
            elif key == ord('f'):
                self.show_fps = not self.show_fps

        # Print some stats
        print(f"# pose detection inferences : {nb_pd_inferences}")
        print(f"# landmark inferences       : {nb_lm_inferences}")
        if self.input_type != "internal" and nb_pd_inferences != 0:
            print(
                f"Pose detection round trip   : {glob_pd_rtrip_time/nb_pd_inferences*1000:.1f} ms"
            )
        if nb_lm_inferences != 0:
            print(
                f"Landmark round trip         : {glob_lm_rtrip_time/nb_lm_inferences*1000:.1f} ms"
            )

        if self.output:
            self.output.release()
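
The run() above (and the other examples below) relies on a to_planar() helper that is not part of the snippet. A minimal sketch, assuming it only resizes the BGR frame to the network input resolution and reorders it to the planar, channel-first byte layout that dai.ImgFrame.setData() / dai.NNData.setLayer() expect:

import cv2
import numpy as np

def to_planar(arr: np.ndarray, shape: tuple) -> np.ndarray:
    # Resize HxWx3 BGR to the target (width, height), then convert the
    # interleaved layout to planar 3xHxW and flatten to a 1D uint8 buffer.
    return cv2.resize(arr, shape).transpose(2, 0, 1).flatten()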
Example #2
    def process_image(self, img):
        annotated_frame = img
        if self.camera:
            in_video = self.q_video.get()
            # Convert NV12 to BGR
            yuv = in_video.getData().reshape(
                (in_video.getHeight() * 3 // 2, in_video.getWidth()))
            video_frame = cv2.cvtColor(yuv, cv2.COLOR_YUV2BGR_NV12)
        else:
            if self.image_mode is None:
                vid_frame = img
                height, width, _ = img.shape
                self.video_size = int(min(width, height))
            elif self.image_mode:
                vid_frame = self.img
            else:
                ok, vid_frame = self.cap.read()
                if not ok:
                    # print("not OK video frame")
                    return [], img

            h, w = vid_frame.shape[:2]
            dx = (w - self.video_size) // 2
            dy = (h - self.video_size) // 2
            video_frame = vid_frame[dy:dy + self.video_size,
                                    dx:dx + self.video_size]
            frame_nn = dai.ImgFrame()
            frame_nn.setSequenceNum(self.seq_num)
            frame_nn.setWidth(self.pd_input_length)
            frame_nn.setHeight(self.pd_input_length)
            frame_nn.setData(
                to_planar(video_frame,
                          (self.pd_input_length, self.pd_input_length)))

            self.q_pd_in.send(frame_nn)

            self.seq_num += 1

        annotated_frame = video_frame.copy()

        inference = self.q_pd_out.get()
        self.pd_postprocess(inference)
        self.pd_render(annotated_frame)

        # Hand landmarks
        if self.use_lm:
            for i, r in enumerate(self.regions):
                img_hand = mpu.warp_rect_img(r.rect_points, video_frame,
                                             self.lm_input_length,
                                             self.lm_input_length)
                nn_data = dai.NNData()
                nn_data.setLayer(
                    "input_1",
                    to_planar(img_hand,
                              (self.lm_input_length, self.lm_input_length)))
                self.q_lm_in.send(nn_data)

            # Retrieve hand landmarks
            for i, r in enumerate(self.regions):
                inference = self.q_lm_out.get()
                self.lm_postprocess(r, inference)
                self.lm_render(annotated_frame, r)

        return self.regions, annotated_frame
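
Unlike the run() methods, process_image() leaves display, pausing, and exit handling to the caller and returns the detected regions together with the annotated frame. A hypothetical driver loop for the external-frame path (the HandTracker name and its constructor arguments are assumptions, not taken from the snippet):

import cv2

tracker = HandTracker(camera=False)   # hypothetical: image_mode left as None so frames are passed in
cap = cv2.VideoCapture(0)
while True:
    ok, frame = cap.read()
    if not ok:
        break
    regions, annotated = tracker.process_image(frame)
    cv2.imshow("hands", annotated)
    if cv2.waitKey(1) in (ord('q'), 27):  # 'q' or Esc to quit
        break
cap.release()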
Example #3
    def run(self):

        device = dai.Device(self.create_pipeline())
        device.startPipeline()

        # Define data queues 
        if self.camera:
            q_video = device.getOutputQueue(name="cam_out", maxSize=1, blocking=False)
            q_pd_out = device.getOutputQueue(name="pd_out", maxSize=1, blocking=False)
            if self.use_lm:
                q_lm_out = device.getOutputQueue(name="lm_out", maxSize=2, blocking=False)
                q_lm_in = device.getInputQueue(name="lm_in")
        else:
            q_pd_in = device.getInputQueue(name="pd_in")
            q_pd_out = device.getOutputQueue(name="pd_out", maxSize=4, blocking=True)
            if self.use_lm:
                q_lm_out = device.getOutputQueue(name="lm_out", maxSize=4, blocking=True)
                q_lm_in = device.getInputQueue(name="lm_in")

        self.fps = FPS(mean_nb_frames=20)

        seq_num = 0
        nb_pd_inferences = 0
        nb_lm_inferences = 0
        glob_pd_rtrip_time = 0
        glob_lm_rtrip_time = 0
        while True:
            self.fps.update()
            if self.camera:
                in_video = q_video.get()
                video_frame = in_video.getCvFrame()
            else:
                if self.image_mode:
                    vid_frame = self.img
                else:
                    ok, vid_frame = self.cap.read()
                    if not ok:
                        break
                h, w = vid_frame.shape[:2]
                dx = (w - self.video_size) // 2
                dy = (h - self.video_size) // 2
                video_frame = vid_frame[dy:dy+self.video_size, dx:dx+self.video_size]
                frame_nn = dai.ImgFrame()
                frame_nn.setSequenceNum(seq_num)
                frame_nn.setWidth(self.pd_input_length)
                frame_nn.setHeight(self.pd_input_length)
                frame_nn.setData(to_planar(video_frame, (self.pd_input_length, self.pd_input_length)))
                q_pd_in.send(frame_nn)
                pd_rtrip_time = now()

                seq_num += 1

            annotated_frame = video_frame.copy()

            # Get palm detection
            inference = q_pd_out.get()
            if not self.camera: glob_pd_rtrip_time += now() - pd_rtrip_time
            self.pd_postprocess(inference)
            self.pd_render(annotated_frame)
            nb_pd_inferences += 1

            # Hand landmarks
            if self.use_lm:
                for i,r in enumerate(self.regions):
                    img_hand = mpu.warp_rect_img(r.rect_points, video_frame, self.lm_input_length, self.lm_input_length)
                    nn_data = dai.NNData()   
                    nn_data.setLayer("input_1", to_planar(img_hand, (self.lm_input_length, self.lm_input_length)))
                    q_lm_in.send(nn_data)
                    if i == 0: lm_rtrip_time = now() # We measure only for the first region
                
                # Retrieve hand landmarks
                for i,r in enumerate(self.regions):
                    inference = q_lm_out.get()
                    if i == 0: glob_lm_rtrip_time += now() - lm_rtrip_time
                    self.lm_postprocess(r, inference)
                    self.lm_render(annotated_frame, r)
                    nb_lm_inferences += 1

                
            self.fps.display(annotated_frame, orig=(50,50),color=(240,180,100))
            cv2.imshow("video", annotated_frame)

            key = cv2.waitKey(1) 
            if key == ord('q') or key == 27:
                break
            elif key == 32:
                # Pause on space bar
                cv2.waitKey(0)
            elif key == ord('1'):
                self.show_pd_box = not self.show_pd_box
            elif key == ord('2'):
                self.show_pd_kps = not self.show_pd_kps
            elif key == ord('3'):
                self.show_rot_rect = not self.show_rot_rect
            elif key == ord('4'):
                self.show_landmarks = not self.show_landmarks
            elif key == ord('5'):
                self.show_handedness = not self.show_handedness
            elif key == ord('6'):
                self.show_scores = not self.show_scores
            elif key == ord('7'):
                self.show_gesture = not self.show_gesture

        # Print some stats
        if not self.camera:
            print(f"# video files frames                 : {seq_num}")
            print(f"# palm detection inferences received : {nb_pd_inferences}")
            print(f"# hand landmark inferences received  : {nb_lm_inferences}")
            print(f"Palm detection round trip            : {glob_pd_rtrip_time/nb_pd_inferences*1000:.1f} ms")
            print(f"Hand landmark round trip             : {glob_lm_rtrip_time/nb_lm_inferences*1000:.1f} ms")
Example #4
    def run(self):
        device = dai.Device(self.create_pipeline())
        device.startPipeline()

        q_video = device.getOutputQueue(name="cam_out",
                                        maxSize=1,
                                        blocking=False)
        q_pd_in = device.getInputQueue(name="pd_in")
        q_pd_out = device.getOutputQueue(name="pd_out",
                                         maxSize=4,
                                         blocking=True)
        q_lm_out = device.getOutputQueue(name="lm_out",
                                         maxSize=4,
                                         blocking=True)
        q_lm_in = device.getInputQueue(name="lm_in")
        q_asl_out = device.getOutputQueue(name="asl_out",
                                          maxSize=4,
                                          blocking=True)
        q_asl_in = device.getInputQueue(name="asl_in")

        while True:
            in_video = q_video.get()
            video_frame = in_video.getCvFrame()

            h, w = video_frame.shape[:2]
            self.frame_size = max(h, w)
            self.pad_h = int((self.frame_size - h) / 2)
            self.pad_w = int((self.frame_size - w) / 2)

            video_frame = cv2.copyMakeBorder(video_frame, self.pad_h,
                                             self.pad_h, self.pad_w,
                                             self.pad_w, cv2.BORDER_CONSTANT)

            frame_nn = dai.ImgFrame()
            frame_nn.setWidth(self.pd_input_length)
            frame_nn.setHeight(self.pd_input_length)
            frame_nn.setData(
                to_planar(video_frame,
                          (self.pd_input_length, self.pd_input_length)))
            q_pd_in.send(frame_nn)

            annotated_frame = video_frame.copy()

            # Get palm detection
            inference = q_pd_out.get()
            self.pd_postprocess(inference)

            # Send data for hand landmarks
            for i, r in enumerate(self.regions):
                img_hand = mpu.warp_rect_img(r.rect_points, video_frame,
                                             self.lm_input_length,
                                             self.lm_input_length)
                nn_data = dai.NNData()
                nn_data.setLayer(
                    "input_1",
                    to_planar(img_hand,
                              (self.lm_input_length, self.lm_input_length)))
                q_lm_in.send(nn_data)

            # Retrieve hand landmarks
            for i, r in enumerate(self.regions):
                inference = q_lm_out.get()
                self.lm_postprocess(r, inference)
                hand_frame, handedness, hand_bbox = self.lm_render(
                    video_frame, annotated_frame, r)
                # ASL recognition
                if hand_frame is not None and self.asl_recognition:
                    hand_frame = cv2.resize(
                        hand_frame,
                        (self.asl_input_length, self.asl_input_length),
                        interpolation=cv2.INTER_NEAREST)
                    hand_frame = hand_frame.transpose(2, 0, 1)
                    nn_data = dai.NNData()
                    nn_data.setLayer("input", hand_frame)
                    q_asl_in.send(nn_data)
                    asl_result = np.array(q_asl_out.get().getFirstLayerFp16())
                    asl_idx = np.argmax(asl_result)
                    # Recognized ASL character is associated with a probability
                    asl_char = [
                        characters[asl_idx],
                        round(asl_result[asl_idx] * 100, 1)
                    ]
                    selected_char = asl_char
                    current_char_queue = None
                    if handedness > 0.5:
                        current_char_queue = self.right_char_queue
                    else:
                        current_char_queue = self.left_char_queue
                    current_char_queue.append(selected_char)
                    # Perform filtering of the recognition results using the previous 5 results
                    # If there aren't enough results, take the first result as output
                    if len(current_char_queue) < 5:
                        selected_char = current_char_queue[0]
                    else:
                        char_candidate = {}
                        for j in range(5):
                            char, prob = current_char_queue[j]
                            if char not in char_candidate:
                                char_candidate[char] = [1, prob]
                            else:
                                char_candidate[char][0] += 1
                                char_candidate[char][1] += prob
                        most_voted_char = ""
                        max_votes = 0
                        most_voted_char_prob = 0
                        for key in char_candidate:
                            if char_candidate[key][0] > max_votes:
                                max_votes = char_candidate[key][0]
                                most_voted_char = key
                                most_voted_char_prob = round(
                                    char_candidate[key][1] /
                                    char_candidate[key][0], 1)
                        selected_char = (most_voted_char, most_voted_char_prob)

                    if self.show_asl:
                        gesture_string = "Letter: " + selected_char[
                            0] + ", " + str(selected_char[1]) + "%"
                        textSize = self.ft.getTextSize(gesture_string,
                                                       fontHeight=14,
                                                       thickness=-1)[0]
                        cv2.rectangle(video_frame,
                                      (hand_bbox[0] - 5, hand_bbox[1]),
                                      (hand_bbox[0] + textSize[0] + 5,
                                       hand_bbox[1] - 18), (36, 152, 0), -1)
                        self.ft.putText(img=video_frame,
                                        text=gesture_string,
                                        org=(hand_bbox[0], hand_bbox[1] - 5),
                                        fontHeight=14,
                                        color=(255, 255, 255),
                                        thickness=-1,
                                        line_type=cv2.LINE_AA,
                                        bottomLeftOrigin=True)

            video_frame = video_frame[self.pad_h:self.pad_h + h,
                                      self.pad_w:self.pad_w + w]
            cv2.imshow("hand tracker", video_frame)
            key = cv2.waitKey(1)
            if key == ord('q') or key == 27:
                break
            elif key == 32:
                # Pause on space bar
                cv2.waitKey(0)
            elif key == ord('1'):
                self.show_hand_box = not self.show_hand_box
            elif key == ord('2'):
                self.show_landmarks = not self.show_landmarks
            elif key == ord('3'):
                self.show_asl = not self.show_asl
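
The majority-vote smoothing over the last five ASL results assumes that self.left_char_queue and self.right_char_queue are bounded FIFO buffers. A minimal sketch of how they could be created in the constructor (the maxlen of 5 mirrors the voting window used above; this initialization is an assumption, not taken from the snippet):

from collections import deque

# One buffer per hand; appending to a full deque silently drops the oldest entry,
# so each queue holds at most the five most recent (character, probability) pairs.
self.right_char_queue = deque(maxlen=5)
self.left_char_queue = deque(maxlen=5)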
Example #5
    def run(self):

        self.fps = FPS(mean_nb_frames=20)

        nb_frames = 0
        nb_pd_inferences = 0
        nb_pd_inferences_direct = 0
        nb_lm_inferences = 0
        nb_lm_inferences_after_landmarks_ROI = 0
        glob_pd_rtrip_time = 0
        glob_lm_rtrip_time = 0

        get_new_frame = True
        use_previous_landmarks = False

        global_time = time.perf_counter()
        while True:
            if get_new_frame:
                nb_frames += 1
                if self.input_type == "image":
                    vid_frame = self.img
                else:
                    ok, vid_frame = self.cap.read()
                    if not ok:
                        break
                h, w = vid_frame.shape[:2]
                if self.crop:
                    # Cropping the long side to get a square shape
                    self.frame_size = min(h, w)
                    dx = (w - self.frame_size) // 2
                    dy = (h - self.frame_size) // 2
                    video_frame = vid_frame[dy:dy + self.frame_size,
                                            dx:dx + self.frame_size]
                else:
                    # Padding on the small side to get a square shape
                    self.frame_size = max(h, w)
                    self.pad_h = int((self.frame_size - h) / 2)
                    self.pad_w = int((self.frame_size - w) / 2)
                    video_frame = cv2.copyMakeBorder(vid_frame, self.pad_h,
                                                     self.pad_h, self.pad_w,
                                                     self.pad_w,
                                                     cv2.BORDER_CONSTANT)

                annotated_frame = video_frame.copy()

            if not self.force_detection and use_previous_landmarks:
                self.regions = regions_from_landmarks
                mpu.detections_to_rect(
                    self.regions, kp_pair=[0, 1]
                )  # self.regions.pd_kps are initialized from landmarks on previous frame
                mpu.rect_transformation(self.regions, self.frame_size,
                                        self.frame_size)
            else:
                # Infer pose detection
                # Resize image to NN square input shape
                frame_nn = cv2.resize(video_frame, (self.pd_w, self.pd_h),
                                      interpolation=cv2.INTER_AREA)
                # Transpose hxwx3 -> 1x3xhxw
                frame_nn = np.transpose(frame_nn, (2, 0, 1))[None, ]

                pd_rtrip_time = now()
                inference = self.pd_exec_net.infer(
                    inputs={self.pd_input_blob: frame_nn})
                glob_pd_rtrip_time += now() - pd_rtrip_time
                self.pd_postprocess(inference)
                self.pd_render(annotated_frame)
                nb_pd_inferences += 1
                if get_new_frame: nb_pd_inferences_direct += 1

            # Landmarks
            self.nb_active_regions = 0
            if self.show_3d:
                self.vis3d.clear_geometries()
                self.vis3d.add_geometry(self.grid_floor,
                                        reset_bounding_box=False)
                self.vis3d.add_geometry(self.grid_wall,
                                        reset_bounding_box=False)
            if self.force_detection:
                for r in self.regions:
                    frame_nn = mpu.warp_rect_img(r.rect_points, video_frame,
                                                 self.lm_w, self.lm_h)
                    # Transpose hxwx3 -> 1x3xhxw
                    frame_nn = np.transpose(frame_nn, (2, 0, 1))[None, ]
                    # Get landmarks
                    lm_rtrip_time = now()
                    inference = self.lm_exec_net.infer(
                        inputs={self.lm_input_blob: frame_nn})
                    glob_lm_rtrip_time += now() - lm_rtrip_time
                    nb_lm_inferences += 1
                    self.lm_postprocess(r, inference)
                    self.lm_render(annotated_frame, r)
            elif len(self.regions) == 1:
                r = self.regions[0]
                frame_nn = mpu.warp_rect_img(r.rect_points, video_frame,
                                             self.lm_w, self.lm_h)
                # Transpose hxwx3 -> 1x3xhxw
                frame_nn = np.transpose(frame_nn, (2, 0, 1))[None, ]
                # Get landmarks
                lm_rtrip_time = now()
                inference = self.lm_exec_net.infer(
                    inputs={self.lm_input_blob: frame_nn})
                glob_lm_rtrip_time += now() - lm_rtrip_time
                nb_lm_inferences += 1
                if use_previous_landmarks:
                    nb_lm_inferences_after_landmarks_ROI += 1

                self.lm_postprocess(r, inference)
                if not self.force_detection:
                    if get_new_frame:
                        if not use_previous_landmarks:
                            # With a new frame, we have run the landmark NN on a ROI found by the detection NN...
                            if r.lm_score > self.lm_score_threshold:
                                # ...and successfully found a body and its landmarks
                                # Predict the ROI for the next frame from the last 2 landmarks normalized coordinates (x,y)
                                regions_from_landmarks = [
                                    mpu.Region(pd_kps=r.landmarks_padded[
                                        self.nb_lms - 2:self.nb_lms, :2] /
                                               self.frame_size)
                                ]
                                use_previous_landmarks = True
                        else:
                            # With a new frame, we have run the landmark NN on a ROI calculated from the landmarks of the previous frame...
                            if r.lm_score > self.lm_score_threshold:
                                # ...and successfully found a body and its landmarks
                                # Predict the ROI for the next frame from the last 2 landmarks normalized coordinates (x,y)
                                regions_from_landmarks = [
                                    mpu.Region(pd_kps=r.landmarks_padded[
                                        self.nb_lms - 2:self.nb_lms, :2] /
                                               self.frame_size)
                                ]
                                use_previous_landmarks = True
                            else:
                                # ...and could not find a body
                                # We don't know if it is because the ROI calculated from the previous frame is not reliable (the body moved)
                                # or because there is really no body in the frame. To decide, we have to run the detection NN on this frame
                                get_new_frame = False
                                use_previous_landmarks = False
                                continue
                    else:
                        # On a frame on which we already ran the landmark NN without finding a body,
                        # we have run the detection NN...
                        if r.lm_score > self.lm_score_threshold:
                            # ...and successfully found a body and its landmarks
                            use_previous_landmarks = True
                            # Predict the ROI for the next frame from the last 2 landmarks normalized coordinates (x,y)
                            regions_from_landmarks = [
                                mpu.Region(pd_kps=r.landmarks_padded[
                                    self.nb_lms - 2:self.nb_lms, :2] /
                                           self.frame_size)
                            ]
                        # else:
                        # ...and could not find a body
                        # We are sure there is no body in that frame

                        get_new_frame = True
                self.lm_render(annotated_frame, r)
            else:
                # Detection NN hasn't found any body
                get_new_frame = True

            self.fps.update()

            if self.show_3d:
                self.vis3d.poll_events()
                self.vis3d.update_renderer()
            if self.smoothing and self.nb_active_regions == 0:
                self.filter.reset()

            if not self.crop:
                annotated_frame = annotated_frame[self.pad_h:self.pad_h + h,
                                                  self.pad_w:self.pad_w + w]

            if self.show_fps:
                self.fps.display(annotated_frame,
                                 orig=(50, 50),
                                 size=1,
                                 color=(240, 180, 100))
            cv2.imshow("Blazepose", annotated_frame)

            if self.output:
                self.output.write(annotated_frame)

            key = cv2.waitKey(1)
            if key == ord('q') or key == 27:
                break
            elif key == 32:
                # Pause on space bar
                cv2.waitKey(0)
            elif key == ord('1'):
                self.show_pd_box = not self.show_pd_box
            elif key == ord('2'):
                self.show_pd_kps = not self.show_pd_kps
            elif key == ord('3'):
                self.show_rot_rect = not self.show_rot_rect
            elif key == ord('4'):
                self.show_landmarks = not self.show_landmarks
            elif key == ord('5'):
                self.show_scores = not self.show_scores
            elif key == ord('6'):
                self.show_gesture = not self.show_gesture
            elif key == ord('f'):
                self.show_fps = not self.show_fps
            elif key == ord('s'):
                self.show_segmentation = not self.show_segmentation

        # Print some stats
        print(
            f"FPS : {nb_frames/(time.perf_counter() - global_time):.1f} f/s (# frames = {nb_frames})"
        )
        print(
            f"# pose detection inferences : {nb_pd_inferences} - # direct: {nb_pd_inferences_direct} - # after landmarks ROI failures: {nb_pd_inferences-nb_pd_inferences_direct}"
        )
        print(
            f"# landmark inferences       : {nb_lm_inferences} - # after pose detection: {nb_lm_inferences - nb_lm_inferences_after_landmarks_ROI} - # after landmarks ROI prediction: {nb_lm_inferences_after_landmarks_ROI}"
        )
        if nb_pd_inferences:
            print(
                f"Pose detection round trip   : {glob_pd_rtrip_time/nb_pd_inferences*1000:.1f} ms"
            )
        if nb_lm_inferences:
            print(
                f"Landmark round trip         : {glob_lm_rtrip_time/nb_lm_inferences*1000:.1f} ms"
            )

        if self.output:
            self.output.release()
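
Every example measures and optionally draws the frame rate through an FPS helper constructed with FPS(mean_nb_frames=20); its implementation is not included. A minimal compatible sketch, assuming a rolling average over the most recent timestamps (the drawing details in display() are assumptions):

import time
from collections import deque

import cv2


class FPS:
    def __init__(self, mean_nb_frames=20):
        # Keep only the last `mean_nb_frames` timestamps for a rolling average.
        self.timestamps = deque(maxlen=mean_nb_frames)

    def update(self):
        self.timestamps.append(time.perf_counter())

    def get(self):
        if len(self.timestamps) < 2:
            return 0.0
        return (len(self.timestamps) - 1) / (self.timestamps[-1] - self.timestamps[0])

    def display(self, frame, orig=(10, 30), size=1, color=(0, 255, 0)):
        # Overlay the current FPS estimate on the frame at the given origin.
        cv2.putText(frame, f"FPS: {self.get():.1f}", orig,
                    cv2.FONT_HERSHEY_SIMPLEX, size, color, 2)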