Example #1
    def __init__(self,
                 pd_path="models/palm_detection_6_shaves.blob",
                 pd_score_thresh=0.65,
                 pd_nms_thresh=0.3,
                 lm_path="models/hand_landmark_6_shaves.blob",
                 lm_score_threshold=0.5,
                 show_landmarks=True,
                 show_hand_box=True,
                 asl_path="models/hand_asl_6_shaves.blob",
                 asl_recognition=True,
                 show_asl=True):

        self.pd_path = pd_path
        self.pd_score_thresh = pd_score_thresh
        self.pd_nms_thresh = pd_nms_thresh
        self.lm_path = lm_path
        self.lm_score_threshold = lm_score_threshold
        self.asl_path = asl_path
        self.show_landmarks = show_landmarks
        self.show_hand_box = show_hand_box
        self.asl_recognition = asl_recognition
        self.show_asl = show_asl

        # Create SSD anchors
        # https://github.com/google/mediapipe/blob/master/mediapipe/modules/palm_detection/palm_detection_cpu.pbtxt
        anchor_options = mpu.SSDAnchorOptions(
            num_layers=4,
            min_scale=0.1484375,
            max_scale=0.75,
            input_size_height=128,
            input_size_width=128,
            anchor_offset_x=0.5,
            anchor_offset_y=0.5,
            strides=[8, 16, 16, 16],
            aspect_ratios=[1.0],
            reduce_boxes_in_lowest_layer=False,
            interpolated_scale_aspect_ratio=1.0,
            fixed_anchor_size=True)

        self.anchors = mpu.generate_anchors(anchor_options)
        self.nb_anchors = self.anchors.shape[0]
        print(f"{self.nb_anchors} anchors have been created")
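        # With these options, MediaPipe's anchor recipe yields 896 anchors:
        # the stride-8 layer contributes a 16x16 grid with 2 anchors per cell
        # (512), and the three stride-16 layers share an 8x8 grid with
        # 2 anchors each, i.e. 6 per cell (384).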

        # Preview size in pixels
        self.preview_width = 576
        self.preview_height = 324

        self.frame_size = None

        # FreeType renderer used to draw text with a custom TTF font
        self.ft = cv2.freetype.createFreeType2()
        self.ft.loadFontData(fontFileName='HelveticaNeue.ttf', id=0)

        # Rolling buffers of the last recognized ASL characters, one per hand
        self.right_char_queue = collections.deque(maxlen=5)
        self.left_char_queue = collections.deque(maxlen=5)

        # State used to assemble the recognized characters into sentences
        self.previous_right_char = ""
        self.right_sentence = ""
        self.previous_right_update_time = time.time()
        self.previous_left_char = ""
        self.left_sentence = ""
        self.previous_left_update_time = time.time()
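The two deques above suggest a simple temporal filter: a character is accepted
only once it has dominated the last few ASL predictions. The snippet does not
show how the queues are consumed, so the following is a minimal, self-contained
sketch of one plausible scheme (the function name stable_char is an assumption
for illustration):

import collections

char_queue = collections.deque(maxlen=5)

def stable_char(new_char):
    """Return new_char once it fills the whole window, else None."""
    char_queue.append(new_char)
    if len(char_queue) == char_queue.maxlen and len(set(char_queue)) == 1:
        return new_char
    return None

# The character is accepted only after 5 consistent predictions in a row.
for c in ["A", "A", "A", "A", "A"]:
    accepted = stable_char(c)
print(accepted)  # -> "A"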
Example #2
    def __init__(self,
                 input_src=None,
                 pd_path=POSE_DETECTION_MODEL,
                 pd_score_thresh=0.5,
                 pd_nms_thresh=0.3,
                 lm_path=FULL_BODY_LANDMARK_MODEL,
                 lm_score_threshold=0.7,
                 full_body=True,
                 use_gesture=False,
                 smoothing=True,
                 filter_window_size=5,
                 filter_velocity_scale=10,
                 show_3d=False,
                 crop=False,
                 multi_detection=False,
                 output=None,
                 internal_fps=15):

        self.pd_path = pd_path
        self.pd_score_thresh = pd_score_thresh
        self.pd_nms_thresh = pd_nms_thresh
        self.lm_path = lm_path
        self.lm_score_threshold = lm_score_threshold
        self.full_body = full_body
        self.use_gesture = use_gesture
        self.smoothing = smoothing
        self.show_3d = show_3d
        self.crop = crop
        self.multi_detection = multi_detection
        if self.multi_detection:
            print("With multi-detection, smoothing filter is disabled.")
            self.smoothing = False
        self.internal_fps = internal_fps

        if input_src is None:
            self.input_type = "internal"  # OAK* internal color camera
            self.video_fps = internal_fps  # Used when saving the output in a video file. Should be close to the real fps
            video_height = video_width = 1080  # Depends on cam.setResolution() in create_pipeline()
        elif input_src.endswith('.jpg') or input_src.endswith('.png'):
            self.input_type = "image"
            self.img = cv2.imread(input_src)
            self.video_fps = 25
            video_height, video_width = self.img.shape[:2]
        else:
            self.input_type = "video"
            if input_src.isdigit():
                self.input_type = "webcam"
                input_src = int(input_src)
            self.cap = cv2.VideoCapture(input_src)
            self.video_fps = int(self.cap.get(cv2.CAP_PROP_FPS))
            video_width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            video_height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            print("Video FPS:", self.video_fps)

        # 33 keypoints for the full-body model, 25 for the upper-body model
        self.nb_kps = 33 if self.full_body else 25

        if self.smoothing:
            self.filter = mpu.LandmarksSmoothingFilter(filter_window_size,
                                                       filter_velocity_scale,
                                                       (self.nb_kps, 3))
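            # (The filter smooths the (nb_kps, 3) landmark array over a sliding
            # window; the exact semantics of filter_velocity_scale are defined
            # in mpu.LandmarksSmoothingFilter, which is not shown here.)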

        # Create SSD anchors
        # https://github.com/google/mediapipe/blob/master/mediapipe/modules/pose_detection/pose_detection_cpu.pbtxt
        anchor_options = mpu.SSDAnchorOptions(
            num_layers=4,
            min_scale=0.1484375,
            max_scale=0.75,
            input_size_height=128,
            input_size_width=128,
            anchor_offset_x=0.5,
            anchor_offset_y=0.5,
            strides=[8, 16, 16, 16],
            aspect_ratios=[1.0],
            reduce_boxes_in_lowest_layer=False,
            interpolated_scale_aspect_ratio=1.0,
            fixed_anchor_size=True)
        self.anchors = mpu.generate_anchors(anchor_options)
        self.nb_anchors = self.anchors.shape[0]
        print(f"{self.nb_anchors} anchors have been created")

        # Rendering flags
        self.show_pd_box = False
        self.show_pd_kps = False
        self.show_rot_rect = False
        self.show_landmarks = True
        self.show_scores = False
        self.show_gesture = self.use_gesture
        self.show_fps = True

        if self.show_3d:
            self.vis3d = o3d.visualization.Visualizer()
            self.vis3d.create_window()
            opt = self.vis3d.get_render_option()
            opt.background_color = np.asarray([0, 0, 0])
            z = min(video_height, video_width) / 3
            self.grid_floor = create_grid([0, video_height, -z],
                                          [video_width, video_height, -z],
                                          [video_width, video_height, z],
                                          [0, video_height, z],
                                          5,
                                          2,
                                          color=(1, 1, 1))
            self.grid_wall = create_grid([0, 0, z], [video_width, 0, z],
                                         [video_width, video_height, z],
                                         [0, video_height, z],
                                         5,
                                         2,
                                         color=(1, 1, 1))
            self.vis3d.add_geometry(self.grid_floor)
            self.vis3d.add_geometry(self.grid_wall)
            view_control = self.vis3d.get_view_control()
            view_control.set_up(np.array([0, -1, 0]))
            view_control.set_front(np.array([0, 0, -1]))

        if output is None:
            self.output = None
        else:
            fourcc = cv2.VideoWriter_fourcc(*"MJPG")
            self.output = cv2.VideoWriter(output, fourcc, self.video_fps,
                                          (video_width, video_height))
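The output branch above records frames to an MJPG-encoded AVI file. A
standalone sketch of the same pattern (the file name, fps, and frame size are
arbitrary example values):

import cv2
import numpy as np

fourcc = cv2.VideoWriter_fourcc(*"MJPG")
writer = cv2.VideoWriter("demo.avi", fourcc, 25, (640, 480))
for _ in range(25):
    frame = np.zeros((480, 640, 3), dtype=np.uint8)  # must match the declared size
    writer.write(frame)
writer.release()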
Example #3
    def __init__(self,
                 input_file=None,
                 pd_path="models/palm_detection.blob",
                 pd_score_thresh=0.5,
                 pd_nms_thresh=0.3,
                 use_lm=True,
                 lm_path="models/hand_landmark.blob",
                 lm_score_threshold=0.5):

        self.camera = input_file is None
        self.pd_path = pd_path
        self.pd_score_thresh = pd_score_thresh
        self.pd_nms_thresh = pd_nms_thresh
        self.use_lm = use_lm
        self.lm_path = lm_path
        self.lm_score_threshold = lm_score_threshold
        self.regions = []

        self.seq_num = None

        if not self.camera:
            if input_file == "direct":
                self.image_mode = None
            elif input_file.endswith('.jpg') or input_file.endswith('.png'):
                self.image_mode = True
                self.img = cv2.imread(input_file)
                self.video_size = np.min(self.img.shape[:2])
            else:
                self.image_mode = False
                self.cap = cv2.VideoCapture(input_file)
                width = self.cap.get(cv2.CAP_PROP_FRAME_WIDTH)  # float `width`
                height = self.cap.get(
                    cv2.CAP_PROP_FRAME_HEIGHT)  # float `height`
                self.video_size = int(min(width, height))

        # Create SSD anchors
        # https://github.com/google/mediapipe/blob/master/mediapipe/modules/palm_detection/palm_detection_cpu.pbtxt
        anchor_options = mpu.SSDAnchorOptions(
            num_layers=4,
            min_scale=0.1484375,
            max_scale=0.75,
            input_size_height=128,
            input_size_width=128,
            anchor_offset_x=0.5,
            anchor_offset_y=0.5,
            strides=[8, 16, 16, 16],
            aspect_ratios=[1.0],
            reduce_boxes_in_lowest_layer=False,
            interpolated_scale_aspect_ratio=1.0,
            fixed_anchor_size=True)
        self.anchors = mpu.generate_anchors(anchor_options)
        self.nb_anchors = self.anchors.shape[0]
        print(f"{self.nb_anchors} anchors have been created")

        # Rendering flags
        if self.use_lm:
            self.show_pd_box = False
            self.show_pd_kps = False
            self.show_rot_rect = False
            self.show_handedness = False
            self.show_landmarks = True
            self.show_scores = False
        else:
            self.show_pd_box = True
            self.show_pd_kps = False
            self.show_rot_rect = False
            self.show_scores = False
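The input handling above distinguishes "direct" feeding, still images, and
video files. A standalone sketch of that dispatch logic (the function name
probe_input is an assumption for illustration):

import cv2
import numpy as np

def probe_input(input_file):
    """Return (mode, square crop size) for a given source string."""
    if input_file == "direct":
        return "direct", None  # frames will be supplied by the caller
    if input_file.endswith('.jpg') or input_file.endswith('.png'):
        img = cv2.imread(input_file)
        return "image", int(np.min(img.shape[:2]))
    cap = cv2.VideoCapture(input_file)
    size = int(min(cap.get(cv2.CAP_PROP_FRAME_WIDTH),
                   cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
    cap.release()
    return "video", size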
Example #4
    def __init__(self,
                 input_src=None,
                 pd_xml=POSE_DETECTION_MODEL,
                 pd_device="CPU",
                 pd_score_thresh=0.5,
                 pd_nms_thresh=0.3,
                 lm_xml=FULL_BODY_LANDMARK_MODEL,
                 lm_device="CPU",
                 lm_score_threshold=0.7,
                 full_body=True,
                 use_gesture=False,
                 smoothing=True,
                 filter_window_size=5,
                 filter_velocity_scale=10,
                 show_3d=False,
                 crop=False,
                 multi_detection=False,
                 force_detection=False,
                 output=None):

        self.pd_score_thresh = pd_score_thresh
        self.pd_nms_thresh = pd_nms_thresh
        self.lm_score_threshold = lm_score_threshold
        self.full_body = full_body
        self.use_gesture = use_gesture
        self.smoothing = smoothing
        self.show_3d = show_3d
        self.crop = crop
        self.multi_detection = multi_detection
        self.force_detection = force_detection
        if self.multi_detection:
            print(
                "Warning: with multi-detection, smoothing filter is disabled and pose detection is forced on every frame."
            )
            self.smoothing = False
            self.force_detection = True

        if input_src is None:
            input_src = "0"  # Default to the first webcam
        if input_src.endswith('.jpg') or input_src.endswith('.png'):
            self.input_type = "image"
            self.img = cv2.imread(input_src)
            self.video_fps = 25
            video_height, video_width = self.img.shape[:2]
        else:
            self.input_type = "video"
            if input_src.isdigit():
                self.input_type = "webcam"
                input_src = int(input_src)
            self.cap = cv2.VideoCapture(input_src)
            self.video_fps = int(self.cap.get(cv2.CAP_PROP_FPS))
            video_width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            video_height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        print("Video FPS:", self.video_fps)

        # The full-body landmark model predicts 39 landmarks.
        # We are interested in the first 35:
        # landmarks 1 to 33 correspond to the well-documented body parts,
        # while the 34th (mid hips) and 35th (a point above the head)
        # are used to predict the ROI of the next frame.
        # The upper-body model works the same way but has 8 fewer landmarks.
        self.nb_lms = 35 if self.full_body else 27
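        # Illustration (assuming the landmarks come back as an array lms of
        # shape (nb_lms, 3)): lms[:33] are the documented body keypoints and
        # lms[33:35] are the two auxiliary ROI keypoints (full-body case).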

        if self.smoothing:
            self.filter = mpu.LandmarksSmoothingFilter(filter_window_size,
                                                       filter_velocity_scale,
                                                       (self.nb_lms - 2, 3))

        # Create SSD anchors
        # https://github.com/google/mediapipe/blob/master/mediapipe/modules/pose_detection/pose_detection_cpu.pbtxt
        anchor_options = mpu.SSDAnchorOptions(
            num_layers=4,
            min_scale=0.1484375,
            max_scale=0.75,
            input_size_height=128,
            input_size_width=128,
            anchor_offset_x=0.5,
            anchor_offset_y=0.5,
            strides=[8, 16, 16, 16],
            aspect_ratios=[1.0],
            reduce_boxes_in_lowest_layer=False,
            interpolated_scale_aspect_ratio=1.0,
            fixed_anchor_size=True)
        self.anchors = mpu.generate_anchors(anchor_options)
        self.nb_anchors = self.anchors.shape[0]
        print(f"{self.nb_anchors} anchors have been created")

        # Load Openvino models
        self.load_models(pd_xml, pd_device, lm_xml, lm_device)

        # Rendering flags
        self.show_pd_box = False
        self.show_pd_kps = False
        self.show_rot_rect = False
        self.show_landmarks = True
        self.show_scores = False
        self.show_gesture = self.use_gesture
        self.show_fps = True
        self.show_segmentation = False

        if self.show_3d:
            self.vis3d = o3d.visualization.Visualizer()
            self.vis3d.create_window()
            opt = self.vis3d.get_render_option()
            opt.background_color = np.asarray([0, 0, 0])
            z = min(video_height, video_width) / 3
            self.grid_floor = create_grid([0, video_height, -z],
                                          [video_width, video_height, -z],
                                          [video_width, video_height, z],
                                          [0, video_height, z],
                                          5,
                                          2,
                                          color=(1, 1, 1))
            self.grid_wall = create_grid([0, 0, z], [video_width, 0, z],
                                         [video_width, video_height, z],
                                         [0, video_height, z],
                                         5,
                                         2,
                                         color=(1, 1, 1))
            self.vis3d.add_geometry(self.grid_floor)
            self.vis3d.add_geometry(self.grid_wall)
            view_control = self.vis3d.get_view_control()
            view_control.set_up(np.array([0, -1, 0]))
            view_control.set_front(np.array([0, 0, -1]))

        if output is None:
            self.output = None
        else:
            fourcc = cv2.VideoWriter_fourcc(*"MJPG")
            self.output = cv2.VideoWriter(output, fourcc, self.video_fps,
                                          (video_width, video_height))
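The load_models() method called above is not part of the snippet. A minimal
sketch of what such a loader might look like with the legacy OpenVINO IECore
API (the attribute names pd_exec_net and lm_exec_net are assumptions for
illustration):

from openvino.inference_engine import IECore

def load_models(self, pd_xml, pd_device, lm_xml, lm_device):
    ie = IECore()
    # Pose detection network (weights are read from the .bin next to the .xml)
    pd_net = ie.read_network(model=pd_xml)
    self.pd_exec_net = ie.load_network(network=pd_net, device_name=pd_device)
    # Landmark network
    lm_net = ie.read_network(model=lm_xml)
    self.lm_exec_net = ie.load_network(network=lm_net, device_name=lm_device)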