class BlazeposeDepthai: def __init__(self, input_src=None, pd_path=POSE_DETECTION_MODEL, pd_score_thresh=0.5, pd_nms_thresh=0.3, lm_path=FULL_BODY_LANDMARK_MODEL, lm_score_threshold=0.7, full_body=True, use_gesture=False, smoothing=True, filter_window_size=5, filter_velocity_scale=10, show_3d=False, crop=False, multi_detection=False, output=None, internal_fps=15): self.pd_path = pd_path self.pd_score_thresh = pd_score_thresh self.pd_nms_thresh = pd_nms_thresh self.lm_path = lm_path self.lm_score_threshold = lm_score_threshold self.full_body = full_body self.use_gesture = use_gesture self.smoothing = smoothing self.show_3d = show_3d self.crop = crop self.multi_detection = multi_detection if self.multi_detection: print("With multi-detection, smoothing filter is disabled.") self.smoothing = False self.internal_fps = internal_fps if input_src == None: self.input_type = "internal" # OAK* internal color camera self.video_fps = internal_fps # Used when saving the output in a video file. Should be close to the real fps video_height = video_width = 1080 # Depends on cam.setResolution() in create_pipeline() elif input_src.endswith('.jpg') or input_src.endswith('.png'): self.input_type = "image" self.img = cv2.imread(input_src) self.video_fps = 25 video_height, video_width = self.img.shape[:2] else: self.input_type = "video" if input_src.isdigit(): input_type = "webcam" input_src = int(input_src) self.cap = cv2.VideoCapture(input_src) self.video_fps = int(self.cap.get(cv2.CAP_PROP_FPS)) video_width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH)) video_height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) print("Video FPS:", self.video_fps) self.nb_kps = 33 if self.full_body else 25 if self.smoothing: self.filter = mpu.LandmarksSmoothingFilter(filter_window_size, filter_velocity_scale, (self.nb_kps, 3)) # Create SSD anchors # https://github.com/google/mediapipe/blob/master/mediapipe/modules/pose_detection/pose_detection_cpu.pbtxt anchor_options = mpu.SSDAnchorOptions( num_layers=4, min_scale=0.1484375, max_scale=0.75, input_size_height=128, input_size_width=128, anchor_offset_x=0.5, anchor_offset_y=0.5, strides=[8, 16, 16, 16], aspect_ratios=[1.0], reduce_boxes_in_lowest_layer=False, interpolated_scale_aspect_ratio=1.0, fixed_anchor_size=True) self.anchors = mpu.generate_anchors(anchor_options) self.nb_anchors = self.anchors.shape[0] print(f"{self.nb_anchors} anchors have been created") # Rendering flags self.show_pd_box = False self.show_pd_kps = False self.show_rot_rect = False self.show_landmarks = True self.show_scores = False self.show_gesture = self.use_gesture self.show_fps = True if self.show_3d: self.vis3d = o3d.visualization.Visualizer() self.vis3d.create_window() opt = self.vis3d.get_render_option() opt.background_color = np.asarray([0, 0, 0]) z = min(video_height, video_width) / 3 self.grid_floor = create_grid([0, video_height, -z], [video_width, video_height, -z], [video_width, video_height, z], [0, video_height, z], 5, 2, color=(1, 1, 1)) self.grid_wall = create_grid([0, 0, z], [video_width, 0, z], [video_width, video_height, z], [0, video_height, z], 5, 2, color=(1, 1, 1)) self.vis3d.add_geometry(self.grid_floor) self.vis3d.add_geometry(self.grid_wall) view_control = self.vis3d.get_view_control() view_control.set_up(np.array([0, -1, 0])) view_control.set_front(np.array([0, 0, -1])) if output is None: self.output = None else: fourcc = cv2.VideoWriter_fourcc(*"MJPG") self.output = cv2.VideoWriter(output, fourcc, self.video_fps, (video_width, video_height)) def create_pipeline(self): 
print("Creating pipeline...") # Start defining a pipeline pipeline = dai.Pipeline() pipeline.setOpenVINOVersion( version=dai.OpenVINO.Version.VERSION_2021_2) self.pd_input_length = 128 if self.input_type == "internal": # ColorCamera print("Creating Color Camera...") cam = pipeline.createColorCamera() cam.setPreviewSize(self.pd_input_length, self.pd_input_length) cam.setResolution( dai.ColorCameraProperties.SensorResolution.THE_1080_P) # Crop video to square shape (palm detection takes square image as input) self.video_size = min(cam.getVideoSize()) cam.setVideoSize(self.video_size, self.video_size) # cam.setFps(self.internal_fps) cam.setInterleaved(False) cam.setBoardSocket(dai.CameraBoardSocket.RGB) cam_out = pipeline.createXLinkOut() cam_out.setStreamName("cam_out") # Link video output to host for higher resolution cam.video.link(cam_out.input) # Define pose detection model print("Creating Pose Detection Neural Network...") pd_nn = pipeline.createNeuralNetwork() pd_nn.setBlobPath(str(Path(self.pd_path).resolve().absolute())) # Increase threads for detection # pd_nn.setNumInferenceThreads(2) # Specify that network takes latest arriving frame in non-blocking manner # Pose detection input if self.input_type == "internal": pd_nn.input.setQueueSize(1) pd_nn.input.setBlocking(False) cam.preview.link(pd_nn.input) else: pd_in = pipeline.createXLinkIn() pd_in.setStreamName("pd_in") pd_in.out.link(pd_nn.input) # Pose detection output pd_out = pipeline.createXLinkOut() pd_out.setStreamName("pd_out") pd_nn.out.link(pd_out.input) # Define landmark model print("Creating Landmark Neural Network...") lm_nn = pipeline.createNeuralNetwork() lm_nn.setBlobPath(str(Path(self.lm_path).resolve().absolute())) lm_nn.setNumInferenceThreads(1) # Landmark input self.lm_input_length = 256 lm_in = pipeline.createXLinkIn() lm_in.setStreamName("lm_in") lm_in.out.link(lm_nn.input) # Landmark output lm_out = pipeline.createXLinkOut() lm_out.setStreamName("lm_out") lm_nn.out.link(lm_out.input) print("Pipeline created.") return pipeline def pd_postprocess(self, inference): scores = np.array(inference.getLayerFp16("classificators"), dtype=np.float16) # 896 bboxes = np.array(inference.getLayerFp16("regressors"), dtype=np.float16).reshape( (self.nb_anchors, 12)) # 896x12 # Decode bboxes self.regions = mpu.decode_bboxes(self.pd_score_thresh, scores, bboxes, self.anchors, best_only=not self.multi_detection) # Non maximum suppression (not needed if best_only is True) if self.multi_detection: self.regions = mpu.non_max_suppression(self.regions, self.pd_nms_thresh) mpu.detections_to_rect(self.regions, kp_pair=[0, 1] if self.full_body else [2, 3]) mpu.rect_transformation(self.regions, self.frame_size, self.frame_size) def pd_render(self, frame): for r in self.regions: if self.show_pd_box: box = (np.array(r.pd_box) * self.frame_size).astype(int) cv2.rectangle(frame, (box[0], box[1]), (box[0] + box[2], box[1] + box[3]), (0, 255, 0), 2) if self.show_pd_kps: # Key point 0 - mid hip center # Key point 1 - point that encodes size & rotation (for full body) # Key point 2 - mid shoulder center # Key point 3 - point that encodes size & rotation (for upper body) if self.full_body: # Only kp 0 and 1 used list_kps = [0, 1] else: # Only kp 2 and 3 used for upper body list_kps = [2, 3] for kp in list_kps: x = int(r.pd_kps[kp][0] * self.frame_size) y = int(r.pd_kps[kp][1] * self.frame_size) cv2.circle(frame, (x, y), 3, (0, 0, 255), -1) cv2.putText(frame, str(kp), (x, y + 12), cv2.FONT_HERSHEY_PLAIN, 1.5, (0, 255, 0), 2) if self.show_scores: 
cv2.putText( frame, f"Pose score: {r.pd_score:.2f}", (int(r.pd_box[0] * self.frame_size + 10), int((r.pd_box[1] + r.pd_box[3]) * self.frame_size + 60)), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 0), 2) def lm_postprocess(self, region, inference): region.lm_score = inference.getLayerFp16("output_poseflag")[0] if region.lm_score > self.lm_score_threshold: self.nb_active_regions += 1 lm_raw = np.array(inference.getLayerFp16("ld_3d")).reshape(-1, 5) # Each keypoint have 5 information: # - X,Y coordinates are local to the region of # interest and range from [0.0, 255.0]. # - Z coordinate is measured in "image pixels" like # the X and Y coordinates and represents the # distance relative to the plane of the subject's # hips, which is the origin of the Z axis. Negative # values are between the hips and the camera; # positive values are behind the hips. Z coordinate # scale is similar with X, Y scales but has different # nature as obtained not via human annotation, by # fitting synthetic data (GHUM model) to the 2D # annotation. # - Visibility, after user-applied sigmoid denotes the # probability that a keypoint is located within the # frame and not occluded by another bigger body # part or another object. # - Presence, after user-applied sigmoid denotes the # probability that a keypoint is located within the # frame. # Normalize x,y,z. Scaling in z = scaling in x = 1/self.lm_input_length lm_raw[:, :3] /= self.lm_input_length # Apply sigmoid on visibility and presence (if used later) # lm_raw[:,3:5] = 1 / (1 + np.exp(-lm_raw[:,3:5])) # region.landmarks contains the landmarks normalized 3D coordinates in the relative oriented body bounding box region.landmarks = lm_raw[:, :3] # Calculate the landmark coordinate in square padded image (region.landmarks_padded) src = np.array([(0, 0), (1, 0), (1, 1)], dtype=np.float32) dst = np.array( [(x, y) for x, y in region.rect_points[1:]], dtype=np.float32 ) # region.rect_points[0] is left bottom point and points going clockwise! mat = cv2.getAffineTransform(src, dst) lm_xy = np.expand_dims(region.landmarks[:self.nb_kps, :2], axis=0) lm_xy = np.squeeze(cv2.transform(lm_xy, mat)) # A segment of length 1 in the coordinates system of body bounding box takes region.rect_w_a pixels in the # original image. Then we arbitrarily divide by 4 for a more realistic appearance. 
lm_z = region.landmarks[:self.nb_kps, 2:3] * region.rect_w_a / 4 lm_xyz = np.hstack((lm_xy, lm_z)) if self.smoothing: lm_xyz = self.filter.apply(lm_xyz) region.landmarks_padded = lm_xyz.astype(np.int) # If we added padding to make the image square, we need to remove this padding from landmark coordinates # region.landmarks_abs contains absolute landmark coordinates in the original image (padding removed)) region.landmarks_abs = region.landmarks_padded.copy() if self.pad_h > 0: region.landmarks_abs[:, 1] -= self.pad_h if self.pad_w > 0: region.landmarks_abs[:, 0] -= self.pad_w if self.use_gesture: self.recognize_gesture(region) def lm_render(self, frame, region): if region.lm_score > self.lm_score_threshold: if self.show_rot_rect: cv2.polylines(frame, [np.array(region.rect_points)], True, (0, 255, 255), 2, cv2.LINE_AA) if self.show_landmarks: list_connections = LINES_FULL_BODY if self.full_body else LINES_UPPER_BODY lines = [ np.array( [region.landmarks_padded[point, :2] for point in line]) for line in list_connections ] cv2.polylines(frame, lines, False, (255, 180, 90), 2, cv2.LINE_AA) for i, x_y in enumerate(region.landmarks_padded[:, :2]): if i > 10: color = (0, 255, 0) if i % 2 == 0 else (0, 0, 255) elif i == 0: color = (0, 255, 255) elif i in [4, 5, 6, 8, 10]: color = (0, 255, 0) else: color = (0, 0, 255) cv2.circle(frame, (x_y[0], x_y[1]), 4, color, -11) if self.show_3d: points = region.landmarks_abs lines = LINE_MESH_FULL_BODY if self.full_body else LINE_MESH_UPPER_BODY colors = COLORS_FULL_BODY for i, a_b in enumerate(lines): a, b = a_b line = create_segment(points[a], points[b], radius=5, color=colors[i]) if line: self.vis3d.add_geometry(line, reset_bounding_box=False) if self.show_scores: cv2.putText(frame, f"Landmark score: {region.lm_score:.2f}", (int(region.pd_box[0] * self.frame_size + 10), int((region.pd_box[1] + region.pd_box[3]) * self.frame_size + 90)), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 0), 2) if self.use_gesture and self.show_gesture: cv2.putText(frame, region.gesture, (int(region.pd_box[0] * self.frame_size + 10), int(region.pd_box[1] * self.frame_size - 50)), cv2.FONT_HERSHEY_PLAIN, 5, (0, 1190, 255), 3) def recognize_gesture(self, r): def angle_with_y(v): # v: 2d vector (x,y) # Returns angle in degree ofv with y-axis of image plane if v[1] == 0: return 90 angle = atan2(v[0], v[1]) return np.degrees(angle) # For the demo, we want to recognize the flag semaphore alphabet # For this task, we just need to measure the angles of both arms with vertical right_arm_angle = angle_with_y(r.landmarks_abs[14, :2] - r.landmarks_abs[12, :2]) left_arm_angle = angle_with_y(r.landmarks_abs[13, :2] - r.landmarks_abs[11, :2]) right_pose = int((right_arm_angle + 202.5) / 45) left_pose = int((left_arm_angle + 202.5) / 45) r.gesture = semaphore_flag.get((right_pose, left_pose), None) def run(self): device = dai.Device(self.create_pipeline()) device.startPipeline() # Define data queues if self.input_type == "internal": q_video = device.getOutputQueue(name="cam_out", maxSize=1, blocking=False) q_pd_out = device.getOutputQueue(name="pd_out", maxSize=1, blocking=False) q_lm_out = device.getOutputQueue(name="lm_out", maxSize=2, blocking=False) q_lm_in = device.getInputQueue(name="lm_in") else: q_pd_in = device.getInputQueue(name="pd_in") q_pd_out = device.getOutputQueue(name="pd_out", maxSize=4, blocking=True) q_lm_out = device.getOutputQueue(name="lm_out", maxSize=4, blocking=True) q_lm_in = device.getInputQueue(name="lm_in") self.fps = FPS(mean_nb_frames=20) seq_num = 0 nb_pd_inferences = 0 
nb_lm_inferences = 0 glob_pd_rtrip_time = 0 glob_lm_rtrip_time = 0 while True: self.fps.update() if self.input_type == "internal": in_video = q_video.get() video_frame = in_video.getCvFrame() self.frame_size = video_frame.shape[ 0] # The image is square cropped on the device self.pad_w = self.pad_h = 0 else: if self.input_type == "image": vid_frame = self.img else: ok, vid_frame = self.cap.read() if not ok: break h, w = vid_frame.shape[:2] if self.crop: # Cropping the long side to get a square shape self.frame_size = min(h, w) dx = (w - self.frame_size) // 2 dy = (h - self.frame_size) // 2 video_frame = vid_frame[dy:dy + self.frame_size, dx:dx + self.frame_size] else: # Padding on the small side to get a square shape self.frame_size = max(h, w) self.pad_h = int((self.frame_size - h) / 2) self.pad_w = int((self.frame_size - w) / 2) video_frame = cv2.copyMakeBorder(vid_frame, self.pad_h, self.pad_h, self.pad_w, self.pad_w, cv2.BORDER_CONSTANT) frame_nn = dai.ImgFrame() frame_nn.setSequenceNum(seq_num) frame_nn.setWidth(self.pd_input_length) frame_nn.setHeight(self.pd_input_length) frame_nn.setData( to_planar(video_frame, (self.pd_input_length, self.pd_input_length))) pd_rtrip_time = now() q_pd_in.send(frame_nn) seq_num += 1 annotated_frame = video_frame.copy() # Get pose detection inference = q_pd_out.get() if self.input_type != "internal": pd_rtrip_time = now() - pd_rtrip_time glob_pd_rtrip_time += pd_rtrip_time self.pd_postprocess(inference) self.pd_render(annotated_frame) nb_pd_inferences += 1 # Landmarks self.nb_active_regions = 0 if self.show_3d: self.vis3d.clear_geometries() self.vis3d.add_geometry(self.grid_floor, reset_bounding_box=False) self.vis3d.add_geometry(self.grid_wall, reset_bounding_box=False) for i, r in enumerate(self.regions): frame_nn = mpu.warp_rect_img(r.rect_points, video_frame, self.lm_input_length, self.lm_input_length) nn_data = dai.NNData() nn_data.setLayer( "input_1", to_planar(frame_nn, (self.lm_input_length, self.lm_input_length))) if i == 0: lm_rtrip_time = now( ) # We measure only for the first region q_lm_in.send(nn_data) # Get landmarks inference = q_lm_out.get() if i == 0: lm_rtrip_time = now() - lm_rtrip_time glob_lm_rtrip_time += lm_rtrip_time nb_lm_inferences += 1 self.lm_postprocess(r, inference) self.lm_render(annotated_frame, r) if self.show_3d: self.vis3d.poll_events() self.vis3d.update_renderer() if self.smoothing and self.nb_active_regions == 0: self.filter.reset() if self.input_type != "internal" and not self.crop: annotated_frame = annotated_frame[self.pad_h:self.pad_h + h, self.pad_w:self.pad_w + w] if self.show_fps: self.fps.display(annotated_frame, orig=(50, 50), size=1, color=(240, 180, 100)) cv2.imshow("Blazepose", annotated_frame) if self.output: self.output.write(annotated_frame) key = cv2.waitKey(1) if key == ord('q') or key == 27: break elif key == 32: # Pause on space bar cv2.waitKey(0) elif key == ord('1'): self.show_pd_box = not self.show_pd_box elif key == ord('2'): self.show_pd_kps = not self.show_pd_kps elif key == ord('3'): self.show_rot_rect = not self.show_rot_rect elif key == ord('4'): self.show_landmarks = not self.show_landmarks elif key == ord('5'): self.show_scores = not self.show_scores elif key == ord('6'): self.show_gesture = not self.show_gesture elif key == ord('f'): self.show_fps = not self.show_fps # Print some stats print(f"# pose detection inferences : {nb_pd_inferences}") print(f"# landmark inferences : {nb_lm_inferences}") if self.input_type != "internal" and nb_pd_inferences != 0: print( f"Pose detection round 
trip : {glob_pd_rtrip_time/nb_pd_inferences*1000:.1f} ms" ) if nb_lm_inferences != 0: print( f"Landmark round trip : {glob_lm_rtrip_time/nb_lm_inferences*1000:.1f} ms" ) if self.output: self.output.release()
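# The helpers to_planar(), now() and the FPS class used by BlazeposeDepthai are
# defined elsewhere in the repository. Below is a minimal, assumed stand-in for
# to_planar()/now() (not necessarily the repository's exact code): DepthAI's
# ImgFrame.setData()/NNData.setLayer() expect the image flattened in planar
# (channels-first) order, so the frame is resized to the NN input size and
# transposed from HxWxC to CxHxW before flattening.
import time

import cv2
import numpy as np


def to_planar(arr: np.ndarray, shape: tuple) -> np.ndarray:
    # Resize to (width, height) = shape, then interleaved HxWx3 -> planar 3xHxW
    resized = cv2.resize(arr, shape)
    return resized.transpose(2, 0, 1).flatten()


def now() -> float:
    # Monotonic timestamp used for the round-trip timing statistics
    return time.perf_counter()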
class HandTracker: def __init__(self, input_file=None, pd_path="models/palm_detection.blob", pd_score_thresh=0.5, pd_nms_thresh=0.3, use_lm=True, lm_path="models/hand_landmark.blob", lm_score_threshold=0.5, use_gesture=False): self.camera = input_file is None self.pd_path = pd_path self.pd_score_thresh = pd_score_thresh self.pd_nms_thresh = pd_nms_thresh self.use_lm = use_lm self.lm_path = lm_path self.lm_score_threshold = lm_score_threshold self.use_gesture = use_gesture if not self.camera: if input_file.endswith('.jpg') or input_file.endswith('.png') : self.image_mode = True self.img = cv2.imread(input_file) self.video_size = np.min(self.img.shape[:2]) else: self.image_mode = False self.cap = cv2.VideoCapture(input_file) width = self.cap.get(cv2.CAP_PROP_FRAME_WIDTH) # float `width` height = self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT) # float `height` self.video_size = int(min(width, height)) # Create SSD anchors # https://github.com/google/mediapipe/blob/master/mediapipe/modules/palm_detection/palm_detection_cpu.pbtxt anchor_options = mpu.SSDAnchorOptions(num_layers=4, min_scale=0.1484375, max_scale=0.75, input_size_height=128, input_size_width=128, anchor_offset_x=0.5, anchor_offset_y=0.5, strides=[8, 16, 16, 16], aspect_ratios= [1.0], reduce_boxes_in_lowest_layer=False, interpolated_scale_aspect_ratio=1.0, fixed_anchor_size=True) self.anchors = mpu.generate_anchors(anchor_options) self.nb_anchors = self.anchors.shape[0] print(f"{self.nb_anchors} anchors have been created") # Rendering flags if self.use_lm: self.show_pd_box = False self.show_pd_kps = False self.show_rot_rect = False self.show_handedness = False self.show_landmarks = True self.show_scores = False self.show_gesture = self.use_gesture else: self.show_pd_box = True self.show_pd_kps = False self.show_rot_rect = False self.show_scores = False def create_pipeline(self): print("Creating pipeline...") # Start defining a pipeline pipeline = dai.Pipeline() self.pd_input_length = 128 if self.camera: # ColorCamera print("Creating Color Camera...") cam = pipeline.createColorCamera() cam.setPreviewSize(self.pd_input_length, self.pd_input_length) cam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P) # Crop video to square shape (palm detection takes square image as input) self.video_size = min(cam.getVideoSize()) cam.setVideoSize(self.video_size, self.video_size) cam.setFps(30) cam.setInterleaved(False) cam.setBoardSocket(dai.CameraBoardSocket.RGB) cam_out = pipeline.createXLinkOut() cam_out.setStreamName("cam_out") # Link video output to host for higher resolution cam.video.link(cam_out.input) # Define palm detection model print("Creating Palm Detection Neural Network...") pd_nn = pipeline.createNeuralNetwork() pd_nn.setBlobPath(str(Path(self.pd_path).resolve().absolute())) # Increase threads for detection # pd_nn.setNumInferenceThreads(2) # Specify that network takes latest arriving frame in non-blocking manner # Palm detection input if self.camera: pd_nn.input.setQueueSize(1) pd_nn.input.setBlocking(False) cam.preview.link(pd_nn.input) else: pd_in = pipeline.createXLinkIn() pd_in.setStreamName("pd_in") pd_in.out.link(pd_nn.input) # Palm detection output pd_out = pipeline.createXLinkOut() pd_out.setStreamName("pd_out") pd_nn.out.link(pd_out.input) # Define hand landmark model if self.use_lm: print("Creating Hand Landmark Neural Network...") lm_nn = pipeline.createNeuralNetwork() lm_nn.setBlobPath(str(Path(self.lm_path).resolve().absolute())) lm_nn.setNumInferenceThreads(1) # Hand landmark input self.lm_input_length = 
224 lm_in = pipeline.createXLinkIn() lm_in.setStreamName("lm_in") lm_in.out.link(lm_nn.input) # Hand landmark output lm_out = pipeline.createXLinkOut() lm_out.setStreamName("lm_out") lm_nn.out.link(lm_out.input) print("Pipeline created.") return pipeline def pd_postprocess(self, inference): scores = np.array(inference.getLayerFp16("classificators"), dtype=np.float16) # 896 bboxes = np.array(inference.getLayerFp16("regressors"), dtype=np.float16).reshape((self.nb_anchors,18)) # 896x18 # Decode bboxes self.regions = mpu.decode_bboxes(self.pd_score_thresh, scores, bboxes, self.anchors) # Non maximum suppression self.regions = mpu.non_max_suppression(self.regions, self.pd_nms_thresh) if self.use_lm: mpu.detections_to_rect(self.regions) mpu.rect_transformation(self.regions, self.video_size, self.video_size) def pd_render(self, frame): for r in self.regions: if self.show_pd_box: box = (np.array(r.pd_box) * self.video_size).astype(int) cv2.rectangle(frame, (box[0], box[1]), (box[0]+box[2], box[1]+box[3]), (0,255,0), 2) if self.show_pd_kps: for i,kp in enumerate(r.pd_kps): x = int(kp[0] * self.video_size) y = int(kp[1] * self.video_size) cv2.circle(frame, (x, y), 6, (0,0,255), -1) cv2.putText(frame, str(i), (x, y+12), cv2.FONT_HERSHEY_PLAIN, 1.5, (0,255,0), 2) if self.show_scores: cv2.putText(frame, f"Palm score: {r.pd_score:.2f}", (int(r.pd_box[0] * self.video_size+10), int((r.pd_box[1]+r.pd_box[3])*self.video_size+60)), cv2.FONT_HERSHEY_PLAIN, 2, (255,255,0), 2) def recognize_gesture(self, r): # Finger states # state: -1=unknown, 0=close, 1=open d_3_5 = mpu.distance(r.landmarks[3], r.landmarks[5]) d_2_3 = mpu.distance(r.landmarks[2], r.landmarks[3]) angle0 = mpu.angle(r.landmarks[0], r.landmarks[1], r.landmarks[2]) angle1 = mpu.angle(r.landmarks[1], r.landmarks[2], r.landmarks[3]) angle2 = mpu.angle(r.landmarks[2], r.landmarks[3], r.landmarks[4]) r.thumb_angle = angle0+angle1+angle2 if angle0+angle1+angle2 > 460 and d_3_5 / d_2_3 > 1.2: r.thumb_state = 1 else: r.thumb_state = 0 if r.landmarks[8][1] < r.landmarks[7][1] < r.landmarks[6][1]: r.index_state = 1 elif r.landmarks[6][1] < r.landmarks[8][1]: r.index_state = 0 else: r.index_state = -1 if r.landmarks[12][1] < r.landmarks[11][1] < r.landmarks[10][1]: r.middle_state = 1 elif r.landmarks[10][1] < r.landmarks[12][1]: r.middle_state = 0 else: r.middle_state = -1 if r.landmarks[16][1] < r.landmarks[15][1] < r.landmarks[14][1]: r.ring_state = 1 elif r.landmarks[14][1] < r.landmarks[16][1]: r.ring_state = 0 else: r.ring_state = -1 if r.landmarks[20][1] < r.landmarks[19][1] < r.landmarks[18][1]: r.little_state = 1 elif r.landmarks[18][1] < r.landmarks[20][1]: r.little_state = 0 else: r.little_state = -1 # Gesture if r.thumb_state == 1 and r.index_state == 1 and r.middle_state == 1 and r.ring_state == 1 and r.little_state == 1: r.gesture = "FIVE" elif r.thumb_state == 0 and r.index_state == 0 and r.middle_state == 0 and r.ring_state == 0 and r.little_state == 0: r.gesture = "FIST" elif r.thumb_state == 1 and r.index_state == 0 and r.middle_state == 0 and r.ring_state == 0 and r.little_state == 0: r.gesture = "OK" elif r.thumb_state == 0 and r.index_state == 1 and r.middle_state == 1 and r.ring_state == 0 and r.little_state == 0: r.gesture = "PEACE" elif r.thumb_state == 0 and r.index_state == 1 and r.middle_state == 0 and r.ring_state == 0 and r.little_state == 0: r.gesture = "ONE" elif r.thumb_state == 1 and r.index_state == 1 and r.middle_state == 0 and r.ring_state == 0 and r.little_state == 0: r.gesture = "TWO" elif r.thumb_state == 1 and 
r.index_state == 1 and r.middle_state == 1 and r.ring_state == 0 and r.little_state == 0: r.gesture = "THREE" elif r.thumb_state == 0 and r.index_state == 1 and r.middle_state == 1 and r.ring_state == 1 and r.little_state == 1: r.gesture = "FOUR" else: r.gesture = None def lm_postprocess(self, region, inference): region.lm_score = inference.getLayerFp16("Identity_1")[0] region.handedness = inference.getLayerFp16("Identity_2")[0] lm_raw = np.array(inference.getLayerFp16("Identity_dense/BiasAdd/Add")) lm = [] for i in range(int(len(lm_raw)/3)): # x,y,z -> x/w,y/h,z/w (here h=w) lm.append(lm_raw[3*i:3*(i+1)]/self.lm_input_length) region.landmarks = lm if self.use_gesture: self.recognize_gesture(region) def lm_render(self, frame, region): if region.lm_score > self.lm_score_threshold: if self.show_rot_rect: cv2.polylines(frame, [np.array(region.rect_points)], True, (0,255,255), 2, cv2.LINE_AA) if self.show_landmarks: src = np.array([(0, 0), (1, 0), (1, 1)], dtype=np.float32) dst = np.array([ (x, y) for x,y in region.rect_points[1:]], dtype=np.float32) # region.rect_points[0] is left bottom point ! mat = cv2.getAffineTransform(src, dst) lm_xy = np.expand_dims(np.array([(l[0], l[1]) for l in region.landmarks]), axis=0) lm_xy = np.squeeze(cv2.transform(lm_xy, mat)).astype(np.int) list_connections = [[0, 1, 2, 3, 4], [0, 5, 6, 7, 8], [5, 9, 10, 11, 12], [9, 13, 14 , 15, 16], [13, 17], [0, 17, 18, 19, 20]] lines = [np.array([lm_xy[point] for point in line]) for line in list_connections] cv2.polylines(frame, lines, False, (255, 0, 0), 2, cv2.LINE_AA) if self.use_gesture: # color depending on finger state (1=open, 0=close, -1=unknown) color = { 1: (0,255,0), 0: (0,0,255), -1:(0,255,255)} radius = 6 cv2.circle(frame, (lm_xy[0][0], lm_xy[0][1]), radius, color[-1], -1) for i in range(1,5): cv2.circle(frame, (lm_xy[i][0], lm_xy[i][1]), radius, color[region.thumb_state], -1) for i in range(5,9): cv2.circle(frame, (lm_xy[i][0], lm_xy[i][1]), radius, color[region.index_state], -1) for i in range(9,13): cv2.circle(frame, (lm_xy[i][0], lm_xy[i][1]), radius, color[region.middle_state], -1) for i in range(13,17): cv2.circle(frame, (lm_xy[i][0], lm_xy[i][1]), radius, color[region.ring_state], -1) for i in range(17,21): cv2.circle(frame, (lm_xy[i][0], lm_xy[i][1]), radius, color[region.little_state], -1) else: for x,y in lm_xy: cv2.circle(frame, (x, y), 6, (0,128,255), -1) if self.show_handedness: cv2.putText(frame, f"RIGHT {region.handedness:.2f}" if region.handedness > 0.5 else f"LEFT {1-region.handedness:.2f}", (int(region.pd_box[0] * self.video_size+10), int((region.pd_box[1]+region.pd_box[3])*self.video_size+20)), cv2.FONT_HERSHEY_PLAIN, 2, (0,255,0) if region.handedness > 0.5 else (0,0,255), 2) if self.show_scores: cv2.putText(frame, f"Landmark score: {region.lm_score:.2f}", (int(region.pd_box[0] * self.video_size+10), int((region.pd_box[1]+region.pd_box[3])*self.video_size+90)), cv2.FONT_HERSHEY_PLAIN, 2, (255,255,0), 2) if self.use_gesture and self.show_gesture: cv2.putText(frame, region.gesture, (int(region.pd_box[0]*self.video_size+10), int(region.pd_box[1]*self.video_size-50)), cv2.FONT_HERSHEY_PLAIN, 3, (255,255,255), 3) def run(self): device = dai.Device(self.create_pipeline()) device.startPipeline() # Define data queues if self.camera: q_video = device.getOutputQueue(name="cam_out", maxSize=1, blocking=False) q_pd_out = device.getOutputQueue(name="pd_out", maxSize=1, blocking=False) if self.use_lm: q_lm_out = device.getOutputQueue(name="lm_out", maxSize=2, blocking=False) q_lm_in = 
device.getInputQueue(name="lm_in") else: q_pd_in = device.getInputQueue(name="pd_in") q_pd_out = device.getOutputQueue(name="pd_out", maxSize=4, blocking=True) if self.use_lm: q_lm_out = device.getOutputQueue(name="lm_out", maxSize=4, blocking=True) q_lm_in = device.getInputQueue(name="lm_in") self.fps = FPS(mean_nb_frames=20) seq_num = 0 nb_pd_inferences = 0 nb_lm_inferences = 0 glob_pd_rtrip_time = 0 glob_lm_rtrip_time = 0 while True: self.fps.update() if self.camera: in_video = q_video.get() video_frame = in_video.getCvFrame() else: if self.image_mode: vid_frame = self.img else: ok, vid_frame = self.cap.read() if not ok: break h, w = vid_frame.shape[:2] dx = (w - self.video_size) // 2 dy = (h - self.video_size) // 2 video_frame = vid_frame[dy:dy+self.video_size, dx:dx+self.video_size] frame_nn = dai.ImgFrame() frame_nn.setSequenceNum(seq_num) frame_nn.setWidth(self.pd_input_length) frame_nn.setHeight(self.pd_input_length) frame_nn.setData(to_planar(video_frame, (self.pd_input_length, self.pd_input_length))) q_pd_in.send(frame_nn) pd_rtrip_time = now() seq_num += 1 annotated_frame = video_frame.copy() # Get palm detection inference = q_pd_out.get() if not self.camera: glob_pd_rtrip_time += now() - pd_rtrip_time self.pd_postprocess(inference) self.pd_render(annotated_frame) nb_pd_inferences += 1 # Hand landmarks if self.use_lm: for i,r in enumerate(self.regions): img_hand = mpu.warp_rect_img(r.rect_points, video_frame, self.lm_input_length, self.lm_input_length) nn_data = dai.NNData() nn_data.setLayer("input_1", to_planar(img_hand, (self.lm_input_length, self.lm_input_length))) q_lm_in.send(nn_data) if i == 0: lm_rtrip_time = now() # We measure only for the first region # Retrieve hand landmarks for i,r in enumerate(self.regions): inference = q_lm_out.get() if i == 0: glob_lm_rtrip_time += now() - lm_rtrip_time self.lm_postprocess(r, inference) self.lm_render(annotated_frame, r) nb_lm_inferences += 1 self.fps.display(annotated_frame, orig=(50,50),color=(240,180,100)) cv2.imshow("video", annotated_frame) key = cv2.waitKey(1) if key == ord('q') or key == 27: break elif key == 32: # Pause on space bar cv2.waitKey(0) elif key == ord('1'): self.show_pd_box = not self.show_pd_box elif key == ord('2'): self.show_pd_kps = not self.show_pd_kps elif key == ord('3'): self.show_rot_rect = not self.show_rot_rect elif key == ord('4'): self.show_landmarks = not self.show_landmarks elif key == ord('5'): self.show_handedness = not self.show_handedness elif key == ord('6'): self.show_scores = not self.show_scores elif key == ord('7'): self.show_gesture = not self.show_gesture # Print some stats if not self.camera: print(f"# video files frames : {seq_num}") print(f"# palm detection inferences received : {nb_pd_inferences}") print(f"# hand landmark inferences received : {nb_lm_inferences}") print(f"Palm detection round trip : {glob_pd_rtrip_time/nb_pd_inferences*1000:.1f} ms") print(f"Hand landmark round trip : {glob_lm_rtrip_time/nb_lm_inferences*1000:.1f} ms")
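# Minimal usage sketch for HandTracker (an assumption: the repository's demo
# script normally builds this object from command-line arguments). With
# input_file=None the OAK color camera is used; a video or image path switches
# to file mode.
if __name__ == "__main__":
    ht = HandTracker(input_file=None, use_lm=True, use_gesture=True)
    ht.run()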
fps = FPS()
while True:
    # Read a new frame
    ok, frame = video.read()
    if not ok:
        break
    fps.update()
    frame = frame.copy()
    # Run pose estimation and draw the results on the frame
    nb_persons, body_kps, face_kps = my_op.eval(frame)
    my_op.draw_body(frame)
    if args.face:
        my_op.draw_face(frame)
        my_op.draw_eyes(frame)
    fps.display(frame)
    cv2.imshow("Rendering", frame)
    if args.output:
        out.write(frame)
    # Keyboard handling: ESC to exit, space to pause, 's' to cycle the webcam resolution
    k = cv2.waitKey(1) & 0xff
    if k == 27:
        break
    elif k == 32:  # space
        cv2.waitKey(0)
    elif k == ord("s") and isinstance(args.input, int):
        w_h_idx = (w_h_idx + 1) % len(w_h_list)
        w, h = w_h_list[w_h_idx]
        video.set(cv2.CAP_PROP_FRAME_WIDTH, w)
        video.set(cv2.CAP_PROP_FRAME_HEIGHT, h)

video.release()
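# Note: the capture loop above belongs to a separate demo script and relies on
# objects created earlier in that script (outside this excerpt): `video` (a
# cv2.VideoCapture), `my_op` (a pose-estimation wrapper exposing eval(),
# draw_body(), draw_face() and draw_eyes()), `args` (parsed command-line options
# with .face, .output and .input attributes), `out` (a cv2.VideoWriter, only
# when args.output is set), and the `w_h_list`/`w_h_idx` pair that the 's' key
# uses to cycle webcam capture resolutions, e.g. (assumed values)
# w_h_list = [(640, 480), (1280, 720), (1920, 1080)] and w_h_idx = 0.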
class BlazeposeOpenvino: def __init__(self, input_src=None, pd_xml=POSE_DETECTION_MODEL, pd_device="CPU", pd_score_thresh=0.5, pd_nms_thresh=0.3, lm_xml=FULL_BODY_LANDMARK_MODEL, lm_device="CPU", lm_score_threshold=0.7, full_body=True, use_gesture=False, smoothing=True, filter_window_size=5, filter_velocity_scale=10, show_3d=False, crop=False, multi_detection=False, force_detection=False, output=None): self.pd_score_thresh = pd_score_thresh self.pd_nms_thresh = pd_nms_thresh self.lm_score_threshold = lm_score_threshold self.full_body = full_body self.use_gesture = use_gesture self.smoothing = smoothing self.show_3d = show_3d self.crop = crop self.multi_detection = multi_detection self.force_detection = force_detection if self.multi_detection: print( "Warning: with multi-detection, smoothing filter is disabled and pose detection is forced on every frame." ) self.smoothing = False self.force_detection = True if input_src.endswith('.jpg') or input_src.endswith('.png'): self.input_type = "image" self.img = cv2.imread(input_src) self.video_fps = 25 video_height, video_width = self.img.shape[:2] else: self.input_type = "video" if input_src.isdigit(): input_type = "webcam" input_src = int(input_src) self.cap = cv2.VideoCapture(input_src) self.video_fps = int(self.cap.get(cv2.CAP_PROP_FPS)) video_width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH)) video_height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) print("Video FPS:", self.video_fps) # The full body landmark model predict 39 landmarks. # We are interested in the first 35 landmarks # from 1 to 33 correspond to the well documented body parts, # 34th (mid hips) and 35th (a point above the head) are used to predict ROI of next frame # Same for upper body model but with 8 less landmarks self.nb_lms = 35 if self.full_body else 27 if self.smoothing: self.filter = mpu.LandmarksSmoothingFilter(filter_window_size, filter_velocity_scale, (self.nb_lms - 2, 3)) # Create SSD anchors # https://github.com/google/mediapipe/blob/master/mediapipe/modules/pose_detection/pose_detection_cpu.pbtxt anchor_options = mpu.SSDAnchorOptions( num_layers=4, min_scale=0.1484375, max_scale=0.75, input_size_height=128, input_size_width=128, anchor_offset_x=0.5, anchor_offset_y=0.5, strides=[8, 16, 16, 16], aspect_ratios=[1.0], reduce_boxes_in_lowest_layer=False, interpolated_scale_aspect_ratio=1.0, fixed_anchor_size=True) self.anchors = mpu.generate_anchors(anchor_options) self.nb_anchors = self.anchors.shape[0] print(f"{self.nb_anchors} anchors have been created") # Load Openvino models self.load_models(pd_xml, pd_device, lm_xml, lm_device) # Rendering flags self.show_pd_box = False self.show_pd_kps = False self.show_rot_rect = False self.show_landmarks = True self.show_scores = False self.show_gesture = self.use_gesture self.show_fps = True self.show_segmentation = False if self.show_3d: self.vis3d = o3d.visualization.Visualizer() self.vis3d.create_window() opt = self.vis3d.get_render_option() opt.background_color = np.asarray([0, 0, 0]) z = min(video_height, video_width) / 3 self.grid_floor = create_grid([0, video_height, -z], [video_width, video_height, -z], [video_width, video_height, z], [0, video_height, z], 5, 2, color=(1, 1, 1)) self.grid_wall = create_grid([0, 0, z], [video_width, 0, z], [video_width, video_height, z], [0, video_height, z], 5, 2, color=(1, 1, 1)) self.vis3d.add_geometry(self.grid_floor) self.vis3d.add_geometry(self.grid_wall) view_control = self.vis3d.get_view_control() view_control.set_up(np.array([0, -1, 0])) view_control.set_front(np.array([0, 
0, -1])) if output is None: self.output = None else: fourcc = cv2.VideoWriter_fourcc(*"MJPG") self.output = cv2.VideoWriter(output, fourcc, self.video_fps, (video_width, video_height)) def load_models(self, pd_xml, pd_device, lm_xml, lm_device): print("Loading Inference Engine") self.ie = IECore() print("Device info:") versions = self.ie.get_versions(pd_device) print("{}{}".format(" " * 8, pd_device)) print("{}MKLDNNPlugin version ......... {}.{}".format( " " * 8, versions[pd_device].major, versions[pd_device].minor)) print("{}Build ........... {}".format( " " * 8, versions[pd_device].build_number)) # Pose detection model pd_name = os.path.splitext(pd_xml)[0] pd_bin = pd_name + '.bin' print( "Pose Detection model - Reading network files:\n\t{}\n\t{}".format( pd_xml, pd_bin)) self.pd_net = self.ie.read_network(model=pd_xml, weights=pd_bin) # Input blob: input - shape: [1, 3, 128, 128] # Output blob: classificators - shape: [1, 896, 1] : scores # Output blob: regressors - shape: [1, 896, 12] : bboxes self.pd_input_blob = next(iter(self.pd_net.input_info)) print( f"Input blob: {self.pd_input_blob} - shape: {self.pd_net.input_info[self.pd_input_blob].input_data.shape}" ) _, _, self.pd_h, self.pd_w = self.pd_net.input_info[ self.pd_input_blob].input_data.shape for o in self.pd_net.outputs.keys(): print(f"Output blob: {o} - shape: {self.pd_net.outputs[o].shape}") self.pd_scores = "classificators" self.pd_bboxes = "regressors" print("Loading pose detection model into the plugin") self.pd_exec_net = self.ie.load_network(network=self.pd_net, num_requests=1, device_name=pd_device) self.pd_infer_time_cumul = 0 self.pd_infer_nb = 0 self.infer_nb = 0 self.infer_time_cumul = 0 # Landmarks model if lm_device != pd_device: print("Device info:") versions = self.ie.get_versions(pd_device) print("{}{}".format(" " * 8, pd_device)) print("{}MKLDNNPlugin version ......... {}.{}".format( " " * 8, versions[pd_device].major, versions[pd_device].minor)) print("{}Build ........... 
{}".format( " " * 8, versions[pd_device].build_number)) lm_name = os.path.splitext(lm_xml)[0] lm_bin = lm_name + '.bin' print("Landmark model - Reading network files:\n\t{}\n\t{}".format( lm_xml, lm_bin)) self.lm_net = self.ie.read_network(model=lm_xml, weights=lm_bin) # Input blob: input_1 - shape: [1, 3, 256, 256] # Output blob: ld_3d - shape: [1, 195] for full body or [1, 155] for upper body # Output blob: output_poseflag - shape: [1, 1] # Output blob: output_segmentation - shape: [1, 1, 128, 128] self.lm_input_blob = next(iter(self.lm_net.input_info)) print( f"Input blob: {self.lm_input_blob} - shape: {self.lm_net.input_info[self.lm_input_blob].input_data.shape}" ) _, _, self.lm_h, self.lm_w = self.lm_net.input_info[ self.lm_input_blob].input_data.shape for o in self.lm_net.outputs.keys(): print(f"Output blob: {o} - shape: {self.lm_net.outputs[o].shape}") self.lm_score = "output_poseflag" self.lm_segmentation = "output_segmentation" self.lm_landmarks = "ld_3d" print("Loading landmark model to the plugin") self.lm_exec_net = self.ie.load_network(network=self.lm_net, num_requests=1, device_name=lm_device) self.lm_infer_time_cumul = 0 self.lm_infer_nb = 0 def pd_postprocess(self, inference): scores = np.squeeze(inference[self.pd_scores]) # 896 bboxes = inference[self.pd_bboxes][0] # 896x12 # Decode bboxes self.regions = mpu.decode_bboxes(self.pd_score_thresh, scores, bboxes, self.anchors, best_only=not self.multi_detection) # Non maximum suppression (not needed if best_only is True) if self.multi_detection: self.regions = mpu.non_max_suppression(self.regions, self.pd_nms_thresh) mpu.detections_to_rect(self.regions, kp_pair=[0, 1] if self.full_body else [2, 3]) mpu.rect_transformation(self.regions, self.frame_size, self.frame_size) def pd_render(self, frame): for r in self.regions: if self.show_pd_box: box = (np.array(r.pd_box) * self.frame_size).astype(int) cv2.rectangle(frame, (box[0], box[1]), (box[0] + box[2], box[1] + box[3]), (0, 255, 0), 2) if self.show_pd_kps: # Key point 0 - mid hip center # Key point 1 - point that encodes size & rotation (for full body) # Key point 2 - mid shoulder center # Key point 3 - point that encodes size & rotation (for upper body) if self.full_body: # Only kp 0 and 1 used list_kps = [0, 1] else: # Only kp 2 and 3 used for upper body list_kps = [2, 3] for kp in list_kps: x = int(r.pd_kps[kp][0] * self.frame_size) y = int(r.pd_kps[kp][1] * self.frame_size) cv2.circle(frame, (x, y), 3, (0, 0, 255), -1) cv2.putText(frame, str(kp), (x, y + 12), cv2.FONT_HERSHEY_PLAIN, 1.5, (0, 255, 0), 2) if self.show_scores and r.pd_score is not None: cv2.putText(frame, f"Pose score: {r.pd_score:.2f}", (50, self.frame_size // 2), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 0), 2) def lm_postprocess(self, region, inference): region.lm_score = np.squeeze(inference[self.lm_score]) if region.lm_score > self.lm_score_threshold: self.nb_active_regions += 1 lm_raw = inference[self.lm_landmarks].reshape(-1, 5) # Each keypoint have 5 information: # - X,Y coordinates are local to the region of # interest and range from [0.0, 255.0]. # - Z coordinate is measured in "image pixels" like # the X and Y coordinates and represents the # distance relative to the plane of the subject's # hips, which is the origin of the Z axis. Negative # values are between the hips and the camera; # positive values are behind the hips. Z coordinate # scale is similar with X, Y scales but has different # nature as obtained not via human annotation, by # fitting synthetic data (GHUM model) to the 2D # annotation. 
# - Visibility, after user-applied sigmoid denotes the # probability that a keypoint is located within the # frame and not occluded by another bigger body # part or another object. # - Presence, after user-applied sigmoid denotes the # probability that a keypoint is located within the # frame. # Normalize x,y,z. Here self.lm_w = self.lm_h and scaling in z = scaling in x = 1/self.lm_w lm_raw[:, :3] /= self.lm_w # Apply sigmoid on visibility and presence (if used later) # lm_raw[:,3:5] = 1 / (1 + np.exp(-lm_raw[:,3:5])) # region.landmarks contains the landmarks normalized 3D coordinates in the relative oriented body bounding box region.landmarks = lm_raw[:, :3] # Calculate the landmark coordinate in square padded image (region.landmarks_padded) src = np.array([(0, 0), (1, 0), (1, 1)], dtype=np.float32) dst = np.array( [(x, y) for x, y in region.rect_points[1:]], dtype=np.float32 ) # region.rect_points[0] is left bottom point and points going clockwise! mat = cv2.getAffineTransform(src, dst) lm_xy = np.expand_dims(region.landmarks[:self.nb_lms, :2], axis=0) lm_xy = np.squeeze(cv2.transform(lm_xy, mat)) # A segment of length 1 in the coordinates system of body bounding box takes region.rect_w_a pixels in the # original image. Then I arbitrarily divide by 4 for a more realistic appearance. lm_z = region.landmarks[:self.nb_lms, 2:3] * region.rect_w_a / 4 lm_xyz = np.hstack((lm_xy, lm_z)) if self.smoothing: lm_xyz = self.filter.apply(lm_xyz) region.landmarks_padded = lm_xyz.astype(np.int) # If we added padding to make the image square, we need to remove this padding from landmark coordinates # region.landmarks_abs contains absolute landmark coordinates in the original image (padding removed)) region.landmarks_abs = region.landmarks_padded.copy() if self.pad_h > 0: region.landmarks_abs[:, 1] -= self.pad_h if self.pad_w > 0: region.landmarks_abs[:, 0] -= self.pad_w if self.use_gesture: self.recognize_gesture(region) if self.show_segmentation: self.seg = np.squeeze(inference[self.lm_segmentation]) self.seg = 1 / (1 + np.exp(-self.seg)) def lm_render(self, frame, region): if region.lm_score > self.lm_score_threshold: if self.show_segmentation: ret, mask = cv2.threshold(self.seg, 0.5, 1, cv2.THRESH_BINARY) mask = (mask * 255).astype(np.uint8) cv2.imshow("seg", self.seg) # cv2.imshow("mask", mask) src = np.array( [[0, 0], [128, 0], [128, 128]], dtype=np.float32) # rect_points[0] is left bottom point ! 
dst = np.array(region.rect_points[1:], dtype=np.float32) mat = cv2.getAffineTransform(src, dst) mask = cv2.warpAffine(mask, mat, (self.frame_size, self.frame_size)) # cv2.imshow("mask2", mask) # mask = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR) l = frame.shape[0] frame2 = cv2.bitwise_and(frame, frame, mask=mask) if not self.crop: frame2 = frame2[self.pad_h:l - self.pad_h, self.pad_w:l - self.pad_w] cv2.imshow("Segmentation", frame2) if self.show_rot_rect: cv2.polylines(frame, [np.array(region.rect_points)], True, (0, 255, 255), 2, cv2.LINE_AA) if self.show_landmarks: list_connections = LINES_FULL_BODY if self.full_body else LINES_UPPER_BODY lines = [ np.array( [region.landmarks_padded[point, :2] for point in line]) for line in list_connections ] cv2.polylines(frame, lines, False, (255, 180, 90), 2, cv2.LINE_AA) for i, x_y in enumerate(region.landmarks_padded[:self.nb_lms - 2, :2]): if i > 10: color = (0, 255, 0) if i % 2 == 0 else (0, 0, 255) elif i == 0: color = (0, 255, 255) elif i in [4, 5, 6, 8, 10]: color = (0, 255, 0) else: color = (0, 0, 255) cv2.circle(frame, (x_y[0], x_y[1]), 4, color, -11) if self.show_3d: points = region.landmarks_abs lines = LINE_MESH_FULL_BODY if self.full_body else LINE_MESH_UPPER_BODY colors = COLORS_FULL_BODY for i, a_b in enumerate(lines): a, b = a_b line = create_segment(points[a], points[b], radius=5, color=colors[i]) if line: self.vis3d.add_geometry(line, reset_bounding_box=False) if self.show_scores: cv2.putText(frame, f"Landmark score: {region.lm_score:.2f}", (region.landmarks_padded[24, 0] - 10, region.landmarks_padded[24, 1] + 90), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 0), 2) if self.use_gesture and self.show_gesture: cv2.putText(frame, region.gesture, (region.landmarks_padded[6, 0] - 10, region.landmarks_padded[6, 1] - 50), cv2.FONT_HERSHEY_PLAIN, 5, (0, 1190, 255), 3) def recognize_gesture(self, r): def angle_with_y(v): # v: 2d vector (x,y) # Returns angle in degree ofv with y-axis of image plane if v[1] == 0: return 90 angle = atan2(v[0], v[1]) return np.degrees(angle) # For the demo, we want to recognize the flag semaphore alphabet # For this task, we just need to measure the angles of both arms with vertical right_arm_angle = angle_with_y(r.landmarks_abs[14, :2] - r.landmarks_abs[12, :2]) left_arm_angle = angle_with_y(r.landmarks_abs[13, :2] - r.landmarks_abs[11, :2]) right_pose = int((right_arm_angle + 202.5) / 45) left_pose = int((left_arm_angle + 202.5) / 45) r.gesture = semaphore_flag.get((right_pose, left_pose), None) def run(self): self.fps = FPS(mean_nb_frames=20) nb_frames = 0 nb_pd_inferences = 0 nb_pd_inferences_direct = 0 nb_lm_inferences = 0 nb_lm_inferences_after_landmarks_ROI = 0 glob_pd_rtrip_time = 0 glob_lm_rtrip_time = 0 get_new_frame = True use_previous_landmarks = False global_time = time.perf_counter() while True: if get_new_frame: nb_frames += 1 if self.input_type == "image": vid_frame = self.img else: ok, vid_frame = self.cap.read() if not ok: break h, w = vid_frame.shape[:2] if self.crop: # Cropping the long side to get a square shape self.frame_size = min(h, w) dx = (w - self.frame_size) // 2 dy = (h - self.frame_size) // 2 video_frame = vid_frame[dy:dy + self.frame_size, dx:dx + self.frame_size] else: # Padding on the small side to get a square shape self.frame_size = max(h, w) self.pad_h = int((self.frame_size - h) / 2) self.pad_w = int((self.frame_size - w) / 2) video_frame = cv2.copyMakeBorder(vid_frame, self.pad_h, self.pad_h, self.pad_w, self.pad_w, cv2.BORDER_CONSTANT) annotated_frame = video_frame.copy() if not 
self.force_detection and use_previous_landmarks: self.regions = regions_from_landmarks mpu.detections_to_rect( self.regions, kp_pair=[0, 1] ) # self.regions.pd_kps are initialized from landmarks on previous frame mpu.rect_transformation(self.regions, self.frame_size, self.frame_size) else: # Infer pose detection # Resize image to NN square input shape frame_nn = cv2.resize(video_frame, (self.pd_w, self.pd_h), interpolation=cv2.INTER_AREA) # Transpose hxwx3 -> 1x3xhxw frame_nn = np.transpose(frame_nn, (2, 0, 1))[None, ] pd_rtrip_time = now() inference = self.pd_exec_net.infer( inputs={self.pd_input_blob: frame_nn}) glob_pd_rtrip_time += now() - pd_rtrip_time self.pd_postprocess(inference) self.pd_render(annotated_frame) nb_pd_inferences += 1 if get_new_frame: nb_pd_inferences_direct += 1 # Landmarks self.nb_active_regions = 0 if self.show_3d: self.vis3d.clear_geometries() self.vis3d.add_geometry(self.grid_floor, reset_bounding_box=False) self.vis3d.add_geometry(self.grid_wall, reset_bounding_box=False) if self.force_detection: for r in self.regions: frame_nn = mpu.warp_rect_img(r.rect_points, video_frame, self.lm_w, self.lm_h) # Transpose hxwx3 -> 1x3xhxw frame_nn = np.transpose(frame_nn, (2, 0, 1))[None, ] # Get landmarks lm_rtrip_time = now() inference = self.lm_exec_net.infer( inputs={self.lm_input_blob: frame_nn}) glob_lm_rtrip_time += now() - lm_rtrip_time nb_lm_inferences += 1 self.lm_postprocess(r, inference) self.lm_render(annotated_frame, r) elif len(self.regions) == 1: r = self.regions[0] frame_nn = mpu.warp_rect_img(r.rect_points, video_frame, self.lm_w, self.lm_h) # Transpose hxwx3 -> 1x3xhxw frame_nn = np.transpose(frame_nn, (2, 0, 1))[None, ] # Get landmarks lm_rtrip_time = now() inference = self.lm_exec_net.infer( inputs={self.lm_input_blob: frame_nn}) glob_lm_rtrip_time += now() - lm_rtrip_time nb_lm_inferences += 1 if use_previous_landmarks: nb_lm_inferences_after_landmarks_ROI += 1 self.lm_postprocess(r, inference) if not self.force_detection: if get_new_frame: if not use_previous_landmarks: # With a new frame, we have run the landmark NN on a ROI found by the detection NN... if r.lm_score > self.lm_score_threshold: # ...and succesfully found a body and its landmarks # Predict the ROI for the next frame from the last 2 landmarks normalized coordinates (x,y) regions_from_landmarks = [ mpu.Region(pd_kps=r.landmarks_padded[ self.nb_lms - 2:self.nb_lms, :2] / self.frame_size) ] use_previous_landmarks = True else: # With a new frame, we have run the landmark NN on a ROI calculated from the landmarks of the previous frame... if r.lm_score > self.lm_score_threshold: # ...and succesfully found a body and its landmarks # Predict the ROI for the next frame from the last 2 landmarks normalized coordinates (x,y) regions_from_landmarks = [ mpu.Region(pd_kps=r.landmarks_padded[ self.nb_lms - 2:self.nb_lms, :2] / self.frame_size) ] use_previous_landmarks = True else: # ...and could not find a body # We don't know if it is because the ROI calculated from the previous frame is not reliable (the body moved) # or because there is really no body in the frame. To decide, we have to run the detection NN on this frame get_new_frame = False use_previous_landmarks = False continue else: # On a frame on which we already ran the landmark NN without founding a body, # we have run the detection NN... 
if r.lm_score > self.lm_score_threshold: # ...and succesfully found a body and its landmarks use_previous_landmarks = True # Predict the ROI for the next frame from the last 2 landmarks normalized coordinates (x,y) regions_from_landmarks = [ mpu.Region(pd_kps=r.landmarks_padded[ self.nb_lms - 2:self.nb_lms, :2] / self.frame_size) ] use_previous_landmarks = True # else: # ...and could not find a body # We are sure there is no body in that frame get_new_frame = True self.lm_render(annotated_frame, r) else: # Detection NN hasn't found any body get_new_frame = True self.fps.update() if self.show_3d: self.vis3d.poll_events() self.vis3d.update_renderer() if self.smoothing and self.nb_active_regions == 0: self.filter.reset() if not self.crop: annotated_frame = annotated_frame[self.pad_h:self.pad_h + h, self.pad_w:self.pad_w + w] if self.show_fps: self.fps.display(annotated_frame, orig=(50, 50), size=1, color=(240, 180, 100)) cv2.imshow("Blazepose", annotated_frame) if self.output: self.output.write(annotated_frame) key = cv2.waitKey(1) if key == ord('q') or key == 27: break elif key == 32: # Pause on space bar cv2.waitKey(0) elif key == ord('1'): self.show_pd_box = not self.show_pd_box elif key == ord('2'): self.show_pd_kps = not self.show_pd_kps elif key == ord('3'): self.show_rot_rect = not self.show_rot_rect elif key == ord('4'): self.show_landmarks = not self.show_landmarks elif key == ord('5'): self.show_scores = not self.show_scores elif key == ord('6'): self.show_gesture = not self.show_gesture elif key == ord('f'): self.show_fps = not self.show_fps elif key == ord('s'): self.show_segmentation = not self.show_segmentation # Print some stats print( f"FPS : {nb_frames/(time.perf_counter() - global_time):.1f} f/s (# frames = {nb_frames})" ) print( f"# pose detection inferences : {nb_pd_inferences} - # direct: {nb_pd_inferences_direct} - # after landmarks ROI failures: {nb_pd_inferences-nb_pd_inferences_direct}" ) print( f"# landmark inferences : {nb_lm_inferences} - # after pose detection: {nb_lm_inferences - nb_lm_inferences_after_landmarks_ROI} - # after landmarks ROI prediction: {nb_lm_inferences_after_landmarks_ROI}" ) print( f"Pose detection round trip : {glob_pd_rtrip_time/nb_pd_inferences*1000:.1f} ms" ) if nb_lm_inferences: print( f"Landmark round trip : {glob_lm_rtrip_time/nb_lm_inferences*1000:.1f} ms" ) if self.output: self.output.release()
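# Worked sketch of the semaphore-flag quantization performed by
# recognize_gesture() in both Blazepose classes above: the angle between each
# upper-arm vector (shoulder -> elbow, landmarks 12->14 and 11->13) and the
# image vertical is shifted by 202.5 degrees and split into 45-degree octants;
# the resulting (right_pose, left_pose) pair is then looked up in the external
# semaphore_flag table. The helper mirrors angle_with_y(); the printed values
# are illustrative only.
from math import atan2, degrees


def arm_octant(v):
    # v: 2D vector (x, y) in image coordinates
    angle = 90 if v[1] == 0 else degrees(atan2(v[0], v[1]))
    return int((angle + 202.5) / 45)


print(arm_octant((0.0, 1.0)))  # arm pointing straight down  -> octant 4
print(arm_octant((1.0, 1.0)))  # arm 45 degrees down-and-out -> octant 5
print(arm_octant((1.0, 0.0)))  # arm horizontal              -> octant 6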