def run(self):
    device = dai.Device(self.create_pipeline())
    device.startPipeline()

    # Define data queues
    if self.input_type == "internal":
        q_video = device.getOutputQueue(name="cam_out", maxSize=1, blocking=False)
        q_pd_out = device.getOutputQueue(name="pd_out", maxSize=1, blocking=False)
        q_lm_out = device.getOutputQueue(name="lm_out", maxSize=2, blocking=False)
        q_lm_in = device.getInputQueue(name="lm_in")
    else:
        q_pd_in = device.getInputQueue(name="pd_in")
        q_pd_out = device.getOutputQueue(name="pd_out", maxSize=4, blocking=True)
        q_lm_out = device.getOutputQueue(name="lm_out", maxSize=4, blocking=True)
        q_lm_in = device.getInputQueue(name="lm_in")

    self.fps = FPS(mean_nb_frames=20)

    seq_num = 0
    nb_pd_inferences = 0
    nb_lm_inferences = 0
    glob_pd_rtrip_time = 0
    glob_lm_rtrip_time = 0

    while True:
        self.fps.update()

        if self.input_type == "internal":
            in_video = q_video.get()
            video_frame = in_video.getCvFrame()
            self.frame_size = video_frame.shape[0]  # The image is square cropped on the device
            self.pad_w = self.pad_h = 0
        else:
            if self.input_type == "image":
                vid_frame = self.img
            else:
                ok, vid_frame = self.cap.read()
                if not ok:
                    break

            h, w = vid_frame.shape[:2]
            if self.crop:
                # Cropping the long side to get a square shape
                self.frame_size = min(h, w)
                dx = (w - self.frame_size) // 2
                dy = (h - self.frame_size) // 2
                video_frame = vid_frame[dy:dy + self.frame_size, dx:dx + self.frame_size]
            else:
                # Padding on the small side to get a square shape
                self.frame_size = max(h, w)
                self.pad_h = int((self.frame_size - h) / 2)
                self.pad_w = int((self.frame_size - w) / 2)
                video_frame = cv2.copyMakeBorder(vid_frame, self.pad_h, self.pad_h,
                                                 self.pad_w, self.pad_w,
                                                 cv2.BORDER_CONSTANT)

            frame_nn = dai.ImgFrame()
            frame_nn.setSequenceNum(seq_num)
            frame_nn.setWidth(self.pd_input_length)
            frame_nn.setHeight(self.pd_input_length)
            frame_nn.setData(to_planar(video_frame, (self.pd_input_length, self.pd_input_length)))
            pd_rtrip_time = now()
            q_pd_in.send(frame_nn)

            seq_num += 1

        annotated_frame = video_frame.copy()

        # Get pose detection
        inference = q_pd_out.get()
        if self.input_type != "internal":
            pd_rtrip_time = now() - pd_rtrip_time
            glob_pd_rtrip_time += pd_rtrip_time
        self.pd_postprocess(inference)
        self.pd_render(annotated_frame)
        nb_pd_inferences += 1

        # Landmarks
        self.nb_active_regions = 0
        if self.show_3d:
            self.vis3d.clear_geometries()
            self.vis3d.add_geometry(self.grid_floor, reset_bounding_box=False)
            self.vis3d.add_geometry(self.grid_wall, reset_bounding_box=False)

        for i, r in enumerate(self.regions):
            frame_nn = mpu.warp_rect_img(r.rect_points, video_frame,
                                         self.lm_input_length, self.lm_input_length)
            nn_data = dai.NNData()
            nn_data.setLayer("input_1", to_planar(frame_nn, (self.lm_input_length, self.lm_input_length)))
            if i == 0:
                lm_rtrip_time = now()  # We measure only for the first region
            q_lm_in.send(nn_data)

            # Get landmarks
            inference = q_lm_out.get()
            if i == 0:
                lm_rtrip_time = now() - lm_rtrip_time
                glob_lm_rtrip_time += lm_rtrip_time
                nb_lm_inferences += 1
            self.lm_postprocess(r, inference)
            self.lm_render(annotated_frame, r)

        if self.show_3d:
            self.vis3d.poll_events()
            self.vis3d.update_renderer()

        if self.smoothing and self.nb_active_regions == 0:
            self.filter.reset()

        if self.input_type != "internal" and not self.crop:
            annotated_frame = annotated_frame[self.pad_h:self.pad_h + h, self.pad_w:self.pad_w + w]

        if self.show_fps:
            self.fps.display(annotated_frame, orig=(50, 50), size=1, color=(240, 180, 100))
        cv2.imshow("Blazepose", annotated_frame)

        if self.output:
            self.output.write(annotated_frame)

        key = cv2.waitKey(1)
        if key == ord('q') or key == 27:
            break
        elif key == 32:
            # Pause on space bar
            cv2.waitKey(0)
        elif key == ord('1'):
            self.show_pd_box = not self.show_pd_box
        elif key == ord('2'):
            self.show_pd_kps = not self.show_pd_kps
        elif key == ord('3'):
            self.show_rot_rect = not self.show_rot_rect
        elif key == ord('4'):
            self.show_landmarks = not self.show_landmarks
        elif key == ord('5'):
            self.show_scores = not self.show_scores
        elif key == ord('6'):
            self.show_gesture = not self.show_gesture
        elif key == ord('f'):
            self.show_fps = not self.show_fps

    # Print some stats
    print(f"# pose detection inferences : {nb_pd_inferences}")
    print(f"# landmark inferences       : {nb_lm_inferences}")
    if self.input_type != "internal" and nb_pd_inferences != 0:
        print(f"Pose detection round trip   : {glob_pd_rtrip_time/nb_pd_inferences*1000:.1f} ms")
    if nb_lm_inferences != 0:
        print(f"Landmark round trip         : {glob_lm_rtrip_time/nb_lm_inferences*1000:.1f} ms")

    if self.output:
        self.output.release()
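The loop above relies on two helpers that are not shown in this excerpt: to_planar(), which converts a BGR frame into the flattened planar (CHW) byte layout expected by dai.ImgFrame / dai.NNData, and now(), a timestamp function used for the round-trip measurements. A minimal sketch of both, assuming the conventional definitions used in the DepthAI demos (the originals may differ):

import time
import cv2

def to_planar(arr, shape):
    # Resize to the NN input resolution, then reorder HxWxC -> CxHxW and flatten
    return cv2.resize(arr, shape).transpose(2, 0, 1).flatten()

def now():
    # Monotonic timestamp in seconds, used for round-trip timing
    return time.perf_counter()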
def process_image(self, img):
    annotated_frame = img

    if self.camera:
        in_video = self.q_video.get()
        # Convert NV12 to BGR
        yuv = in_video.getData().reshape((in_video.getHeight() * 3 // 2, in_video.getWidth()))
        video_frame = cv2.cvtColor(yuv, cv2.COLOR_YUV2BGR_NV12)
    else:
        if self.image_mode is None:
            vid_frame = img
            height, width, _ = img.shape
            self.video_size = int(min(width, height))
        elif self.image_mode:
            vid_frame = self.img
        else:
            ok, vid_frame = self.cap.read()
            if not ok:
                # print("not OK video frame")
                return [], img

        h, w = vid_frame.shape[:2]
        dx = (w - self.video_size) // 2
        dy = (h - self.video_size) // 2
        video_frame = vid_frame[dy:dy + self.video_size, dx:dx + self.video_size]

    frame_nn = dai.ImgFrame()
    frame_nn.setSequenceNum(self.seq_num)
    frame_nn.setWidth(self.pd_input_length)
    frame_nn.setHeight(self.pd_input_length)
    frame_nn.setData(to_planar(video_frame, (self.pd_input_length, self.pd_input_length)))
    self.q_pd_in.send(frame_nn)
    self.seq_num += 1

    annotated_frame = video_frame.copy()

    # Get palm detection
    inference = self.q_pd_out.get()
    self.pd_postprocess(inference)
    self.pd_render(annotated_frame)

    # Hand landmarks
    if self.use_lm:
        for i, r in enumerate(self.regions):
            img_hand = mpu.warp_rect_img(r.rect_points, video_frame,
                                         self.lm_input_length, self.lm_input_length)
            nn_data = dai.NNData()
            nn_data.setLayer("input_1", to_planar(img_hand, (self.lm_input_length, self.lm_input_length)))
            self.q_lm_in.send(nn_data)

        # Retrieve hand landmarks
        for i, r in enumerate(self.regions):
            inference = self.q_lm_out.get()
            self.lm_postprocess(r, inference)
            self.lm_render(annotated_frame, r)

    return self.regions, annotated_frame
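process_image() is the non-interactive variant of the loop: it pushes one frame through the palm detection and landmark queues and returns (regions, annotated_frame) to the caller instead of displaying it. A hypothetical host-side driver, only to illustrate the calling convention (the HandTracker name and constructor are assumptions, not part of this excerpt):

# tracker = HandTracker(...)          # hypothetical constructor, not shown here
# cap = cv2.VideoCapture(0)
# while True:
#     ok, frame = cap.read()
#     if not ok:
#         break
#     regions, annotated = tracker.process_image(frame)
#     cv2.imshow("hands", annotated)
#     if cv2.waitKey(1) in (27, ord('q')):
#         break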
def run(self):
    device = dai.Device(self.create_pipeline())
    device.startPipeline()

    # Define data queues
    if self.camera:
        q_video = device.getOutputQueue(name="cam_out", maxSize=1, blocking=False)
        q_pd_out = device.getOutputQueue(name="pd_out", maxSize=1, blocking=False)
        if self.use_lm:
            q_lm_out = device.getOutputQueue(name="lm_out", maxSize=2, blocking=False)
            q_lm_in = device.getInputQueue(name="lm_in")
    else:
        q_pd_in = device.getInputQueue(name="pd_in")
        q_pd_out = device.getOutputQueue(name="pd_out", maxSize=4, blocking=True)
        if self.use_lm:
            q_lm_out = device.getOutputQueue(name="lm_out", maxSize=4, blocking=True)
            q_lm_in = device.getInputQueue(name="lm_in")

    self.fps = FPS(mean_nb_frames=20)

    seq_num = 0
    nb_pd_inferences = 0
    nb_lm_inferences = 0
    glob_pd_rtrip_time = 0
    glob_lm_rtrip_time = 0

    while True:
        self.fps.update()

        if self.camera:
            in_video = q_video.get()
            video_frame = in_video.getCvFrame()
        else:
            if self.image_mode:
                vid_frame = self.img
            else:
                ok, vid_frame = self.cap.read()
                if not ok:
                    break

            h, w = vid_frame.shape[:2]
            dx = (w - self.video_size) // 2
            dy = (h - self.video_size) // 2
            video_frame = vid_frame[dy:dy + self.video_size, dx:dx + self.video_size]

            frame_nn = dai.ImgFrame()
            frame_nn.setSequenceNum(seq_num)
            frame_nn.setWidth(self.pd_input_length)
            frame_nn.setHeight(self.pd_input_length)
            frame_nn.setData(to_planar(video_frame, (self.pd_input_length, self.pd_input_length)))
            q_pd_in.send(frame_nn)
            pd_rtrip_time = now()

            seq_num += 1

        annotated_frame = video_frame.copy()

        # Get palm detection
        inference = q_pd_out.get()
        if not self.camera:
            glob_pd_rtrip_time += now() - pd_rtrip_time
        self.pd_postprocess(inference)
        self.pd_render(annotated_frame)
        nb_pd_inferences += 1

        # Hand landmarks
        if self.use_lm:
            for i, r in enumerate(self.regions):
                img_hand = mpu.warp_rect_img(r.rect_points, video_frame,
                                             self.lm_input_length, self.lm_input_length)
                nn_data = dai.NNData()
                nn_data.setLayer("input_1", to_planar(img_hand, (self.lm_input_length, self.lm_input_length)))
                q_lm_in.send(nn_data)
                if i == 0:
                    lm_rtrip_time = now()  # We measure only for the first region

            # Retrieve hand landmarks
            for i, r in enumerate(self.regions):
                inference = q_lm_out.get()
                if i == 0:
                    glob_lm_rtrip_time += now() - lm_rtrip_time
                self.lm_postprocess(r, inference)
                self.lm_render(annotated_frame, r)
                nb_lm_inferences += 1

        self.fps.display(annotated_frame, orig=(50, 50), color=(240, 180, 100))
        cv2.imshow("video", annotated_frame)

        key = cv2.waitKey(1)
        if key == ord('q') or key == 27:
            break
        elif key == 32:
            # Pause on space bar
            cv2.waitKey(0)
        elif key == ord('1'):
            self.show_pd_box = not self.show_pd_box
        elif key == ord('2'):
            self.show_pd_kps = not self.show_pd_kps
        elif key == ord('3'):
            self.show_rot_rect = not self.show_rot_rect
        elif key == ord('4'):
            self.show_landmarks = not self.show_landmarks
        elif key == ord('5'):
            self.show_handedness = not self.show_handedness
        elif key == ord('6'):
            self.show_scores = not self.show_scores
        elif key == ord('7'):
            self.show_gesture = not self.show_gesture

    # Print some stats
    if not self.camera:
        print(f"# video files frames                 : {seq_num}")
        print(f"# palm detection inferences received : {nb_pd_inferences}")
        print(f"# hand landmark inferences received  : {nb_lm_inferences}")
        print(f"Palm detection round trip            : {glob_pd_rtrip_time/nb_pd_inferences*1000:.1f} ms")
        print(f"Hand landmark round trip             : {glob_lm_rtrip_time/nb_lm_inferences*1000:.1f} ms")
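Each run() loop constructs FPS(mean_nb_frames=20) and calls update() and display() on it, but the class itself is not part of this excerpt. A rolling-average sketch with the same interface, offered only as an assumption about what the helper does:

import collections
import time
import cv2

class FPS:
    def __init__(self, mean_nb_frames=10):
        # Keep the timestamps of the last N updates
        self.timestamps = collections.deque(maxlen=mean_nb_frames)

    def update(self):
        self.timestamps.append(time.perf_counter())

    def get(self):
        if len(self.timestamps) < 2:
            return 0.0
        return (len(self.timestamps) - 1) / (self.timestamps[-1] - self.timestamps[0])

    def display(self, frame, orig=(10, 30), size=1, color=(0, 255, 0)):
        # Draw the current frame rate on the image
        cv2.putText(frame, f"FPS: {self.get():.1f}", orig,
                    cv2.FONT_HERSHEY_SIMPLEX, size, color, 2)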
def run(self):
    device = dai.Device(self.create_pipeline())
    device.startPipeline()

    q_video = device.getOutputQueue(name="cam_out", maxSize=1, blocking=False)
    q_pd_in = device.getInputQueue(name="pd_in")
    q_pd_out = device.getOutputQueue(name="pd_out", maxSize=4, blocking=True)
    q_lm_out = device.getOutputQueue(name="lm_out", maxSize=4, blocking=True)
    q_lm_in = device.getInputQueue(name="lm_in")
    q_asl_out = device.getOutputQueue(name="asl_out", maxSize=4, blocking=True)
    q_asl_in = device.getInputQueue(name="asl_in")

    while True:
        in_video = q_video.get()
        video_frame = in_video.getCvFrame()

        h, w = video_frame.shape[:2]
        self.frame_size = max(h, w)
        self.pad_h = int((self.frame_size - h) / 2)
        self.pad_w = int((self.frame_size - w) / 2)
        video_frame = cv2.copyMakeBorder(video_frame, self.pad_h, self.pad_h,
                                         self.pad_w, self.pad_w, cv2.BORDER_CONSTANT)

        frame_nn = dai.ImgFrame()
        frame_nn.setWidth(self.pd_input_length)
        frame_nn.setHeight(self.pd_input_length)
        frame_nn.setData(to_planar(video_frame, (self.pd_input_length, self.pd_input_length)))
        q_pd_in.send(frame_nn)

        annotated_frame = video_frame.copy()

        # Get palm detection
        inference = q_pd_out.get()
        self.pd_postprocess(inference)

        # Send data for hand landmarks
        for i, r in enumerate(self.regions):
            img_hand = mpu.warp_rect_img(r.rect_points, video_frame,
                                         self.lm_input_length, self.lm_input_length)
            nn_data = dai.NNData()
            nn_data.setLayer("input_1", to_planar(img_hand, (self.lm_input_length, self.lm_input_length)))
            q_lm_in.send(nn_data)

        # Retrieve hand landmarks
        for i, r in enumerate(self.regions):
            inference = q_lm_out.get()
            self.lm_postprocess(r, inference)
            hand_frame, handedness, hand_bbox = self.lm_render(video_frame, annotated_frame, r)

            # ASL recognition
            if hand_frame is not None and self.asl_recognition:
                hand_frame = cv2.resize(hand_frame,
                                        (self.asl_input_length, self.asl_input_length),
                                        interpolation=cv2.INTER_NEAREST)
                hand_frame = hand_frame.transpose(2, 0, 1)
                nn_data = dai.NNData()
                nn_data.setLayer("input", hand_frame)
                q_asl_in.send(nn_data)
                asl_result = np.array(q_asl_out.get().getFirstLayerFp16())
                asl_idx = np.argmax(asl_result)
                # Recognized ASL character is associated with a probability
                asl_char = [characters[asl_idx], round(asl_result[asl_idx] * 100, 1)]
                selected_char = asl_char

                if handedness > 0.5:
                    current_char_queue = self.right_char_queue
                else:
                    current_char_queue = self.left_char_queue
                current_char_queue.append(selected_char)

                # Perform filtering of the recognition results using the previous 5 results.
                # If there aren't enough results yet, take the first result as output.
                if len(current_char_queue) < 5:
                    selected_char = current_char_queue[0]
                else:
                    char_candidate = {}
                    for i in range(5):
                        if current_char_queue[i][0] not in char_candidate:
                            char_candidate[current_char_queue[i][0]] = [1, current_char_queue[i][1]]
                        else:
                            char_candidate[current_char_queue[i][0]][0] += 1
                            char_candidate[current_char_queue[i][0]][1] += current_char_queue[i][1]
                    most_voted_char = ""
                    max_votes = 0
                    most_voted_char_prob = 0
                    for key in char_candidate:
                        if char_candidate[key][0] > max_votes:
                            max_votes = char_candidate[key][0]
                            most_voted_char = key
                            most_voted_char_prob = round(char_candidate[key][1] / char_candidate[key][0], 1)
                    selected_char = (most_voted_char, most_voted_char_prob)

                if self.show_asl:
                    gesture_string = "Letter: " + selected_char[0] + ", " + str(selected_char[1]) + "%"
                    textSize = self.ft.getTextSize(gesture_string, fontHeight=14, thickness=-1)[0]
                    cv2.rectangle(video_frame, (hand_bbox[0] - 5, hand_bbox[1]),
                                  (hand_bbox[0] + textSize[0] + 5, hand_bbox[1] - 18),
                                  (36, 152, 0), -1)
                    self.ft.putText(img=video_frame, text=gesture_string,
                                    org=(hand_bbox[0], hand_bbox[1] - 5),
                                    fontHeight=14, color=(255, 255, 255), thickness=-1,
                                    line_type=cv2.LINE_AA, bottomLeftOrigin=True)

        video_frame = video_frame[self.pad_h:self.pad_h + h, self.pad_w:self.pad_w + w]
        cv2.imshow("hand tracker", video_frame)

        key = cv2.waitKey(1)
        if key == ord('q') or key == 27:
            break
        elif key == 32:
            # Pause on space bar
            cv2.waitKey(0)
        elif key == ord('1'):
            self.show_hand_box = not self.show_hand_box
        elif key == ord('2'):
            self.show_landmarks = not self.show_landmarks
        elif key == ord('3'):
            self.show_asl = not self.show_asl
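The inline dictionary bookkeeping that smooths the ASL prediction is a majority vote over the last five (letter, probability) pairs. The same idea isolated into a standalone helper, as a sketch (function name and signature are illustrative, not from the original file):

from collections import deque

def filter_asl_char(char_queue, window=5):
    """char_queue holds (letter, probability) pairs, oldest first."""
    if len(char_queue) < window:
        # Not enough history yet: fall back to the oldest prediction, as in the loop above
        return char_queue[0]
    votes = {}
    for letter, prob in list(char_queue)[:window]:
        count, total = votes.get(letter, (0, 0.0))
        votes[letter] = (count + 1, total + prob)
    # Keep the most frequent letter and report its mean probability
    best_letter, (best_count, best_total) = max(votes.items(), key=lambda kv: kv[1][0])
    return best_letter, round(best_total / best_count, 1)

# Usage with a bounded history per hand, e.g.:
# right_char_queue = deque(maxlen=5)
# right_char_queue.append(("A", 97.2))
# letter, prob = filter_asl_char(right_char_queue)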
def run(self):
    self.fps = FPS(mean_nb_frames=20)

    nb_frames = 0
    nb_pd_inferences = 0
    nb_pd_inferences_direct = 0
    nb_lm_inferences = 0
    nb_lm_inferences_after_landmarks_ROI = 0
    glob_pd_rtrip_time = 0
    glob_lm_rtrip_time = 0

    get_new_frame = True
    use_previous_landmarks = False

    global_time = time.perf_counter()
    while True:
        if get_new_frame:
            nb_frames += 1
            if self.input_type == "image":
                vid_frame = self.img
            else:
                ok, vid_frame = self.cap.read()
                if not ok:
                    break

            h, w = vid_frame.shape[:2]
            if self.crop:
                # Cropping the long side to get a square shape
                self.frame_size = min(h, w)
                dx = (w - self.frame_size) // 2
                dy = (h - self.frame_size) // 2
                video_frame = vid_frame[dy:dy + self.frame_size, dx:dx + self.frame_size]
            else:
                # Padding on the small side to get a square shape
                self.frame_size = max(h, w)
                self.pad_h = int((self.frame_size - h) / 2)
                self.pad_w = int((self.frame_size - w) / 2)
                video_frame = cv2.copyMakeBorder(vid_frame, self.pad_h, self.pad_h,
                                                 self.pad_w, self.pad_w, cv2.BORDER_CONSTANT)

        annotated_frame = video_frame.copy()

        if not self.force_detection and use_previous_landmarks:
            self.regions = regions_from_landmarks
            # self.regions.pd_kps are initialized from landmarks on the previous frame
            mpu.detections_to_rect(self.regions, kp_pair=[0, 1])
            mpu.rect_transformation(self.regions, self.frame_size, self.frame_size)
        else:
            # Infer pose detection
            # Resize image to NN square input shape
            frame_nn = cv2.resize(video_frame, (self.pd_w, self.pd_h), interpolation=cv2.INTER_AREA)
            # Transpose hxwx3 -> 1x3xhxw
            frame_nn = np.transpose(frame_nn, (2, 0, 1))[None, ]
            pd_rtrip_time = now()
            inference = self.pd_exec_net.infer(inputs={self.pd_input_blob: frame_nn})
            glob_pd_rtrip_time += now() - pd_rtrip_time
            self.pd_postprocess(inference)
            self.pd_render(annotated_frame)
            nb_pd_inferences += 1
            if get_new_frame:
                nb_pd_inferences_direct += 1

        # Landmarks
        self.nb_active_regions = 0
        if self.show_3d:
            self.vis3d.clear_geometries()
            self.vis3d.add_geometry(self.grid_floor, reset_bounding_box=False)
            self.vis3d.add_geometry(self.grid_wall, reset_bounding_box=False)

        if self.force_detection:
            for r in self.regions:
                frame_nn = mpu.warp_rect_img(r.rect_points, video_frame, self.lm_w, self.lm_h)
                # Transpose hxwx3 -> 1x3xhxw
                frame_nn = np.transpose(frame_nn, (2, 0, 1))[None, ]
                # Get landmarks
                lm_rtrip_time = now()
                inference = self.lm_exec_net.infer(inputs={self.lm_input_blob: frame_nn})
                glob_lm_rtrip_time += now() - lm_rtrip_time
                nb_lm_inferences += 1
                self.lm_postprocess(r, inference)
                self.lm_render(annotated_frame, r)
        elif len(self.regions) == 1:
            r = self.regions[0]
            frame_nn = mpu.warp_rect_img(r.rect_points, video_frame, self.lm_w, self.lm_h)
            # Transpose hxwx3 -> 1x3xhxw
            frame_nn = np.transpose(frame_nn, (2, 0, 1))[None, ]
            # Get landmarks
            lm_rtrip_time = now()
            inference = self.lm_exec_net.infer(inputs={self.lm_input_blob: frame_nn})
            glob_lm_rtrip_time += now() - lm_rtrip_time
            nb_lm_inferences += 1
            if use_previous_landmarks:
                nb_lm_inferences_after_landmarks_ROI += 1

            self.lm_postprocess(r, inference)
            if not self.force_detection:
                if get_new_frame:
                    if not use_previous_landmarks:
                        # With a new frame, we have run the landmark NN on a ROI found by the detection NN...
                        if r.lm_score > self.lm_score_threshold:
                            # ...and successfully found a body and its landmarks
                            # Predict the ROI for the next frame from the last 2 landmarks normalized coordinates (x,y)
                            regions_from_landmarks = [mpu.Region(
                                pd_kps=r.landmarks_padded[self.nb_lms - 2:self.nb_lms, :2] / self.frame_size)]
                            use_previous_landmarks = True
                    else:
                        # With a new frame, we have run the landmark NN on a ROI calculated from the landmarks of the previous frame...
                        if r.lm_score > self.lm_score_threshold:
                            # ...and successfully found a body and its landmarks
                            # Predict the ROI for the next frame from the last 2 landmarks normalized coordinates (x,y)
                            regions_from_landmarks = [mpu.Region(
                                pd_kps=r.landmarks_padded[self.nb_lms - 2:self.nb_lms, :2] / self.frame_size)]
                            use_previous_landmarks = True
                        else:
                            # ...and could not find a body
                            # We don't know if it is because the ROI calculated from the previous frame is not reliable (the body moved)
                            # or because there is really no body in the frame. To decide, we have to run the detection NN on this frame
                            get_new_frame = False
                            use_previous_landmarks = False
                            continue
                else:
                    # On a frame on which we already ran the landmark NN without finding a body,
                    # we have run the detection NN...
                    if r.lm_score > self.lm_score_threshold:
                        # ...and successfully found a body and its landmarks
                        # Predict the ROI for the next frame from the last 2 landmarks normalized coordinates (x,y)
                        regions_from_landmarks = [mpu.Region(
                            pd_kps=r.landmarks_padded[self.nb_lms - 2:self.nb_lms, :2] / self.frame_size)]
                        use_previous_landmarks = True
                    # else:
                    #     ...and could not find a body
                    #     We are sure there is no body in that frame
                    get_new_frame = True

            self.lm_render(annotated_frame, r)
        else:
            # Detection NN hasn't found any body
            get_new_frame = True

        self.fps.update()

        if self.show_3d:
            self.vis3d.poll_events()
            self.vis3d.update_renderer()

        if self.smoothing and self.nb_active_regions == 0:
            self.filter.reset()

        if not self.crop:
            annotated_frame = annotated_frame[self.pad_h:self.pad_h + h, self.pad_w:self.pad_w + w]

        if self.show_fps:
            self.fps.display(annotated_frame, orig=(50, 50), size=1, color=(240, 180, 100))
        cv2.imshow("Blazepose", annotated_frame)

        if self.output:
            self.output.write(annotated_frame)

        key = cv2.waitKey(1)
        if key == ord('q') or key == 27:
            break
        elif key == 32:
            # Pause on space bar
            cv2.waitKey(0)
        elif key == ord('1'):
            self.show_pd_box = not self.show_pd_box
        elif key == ord('2'):
            self.show_pd_kps = not self.show_pd_kps
        elif key == ord('3'):
            self.show_rot_rect = not self.show_rot_rect
        elif key == ord('4'):
            self.show_landmarks = not self.show_landmarks
        elif key == ord('5'):
            self.show_scores = not self.show_scores
        elif key == ord('6'):
            self.show_gesture = not self.show_gesture
        elif key == ord('f'):
            self.show_fps = not self.show_fps
        elif key == ord('s'):
            self.show_segmentation = not self.show_segmentation

    # Print some stats
    print(f"FPS : {nb_frames/(time.perf_counter() - global_time):.1f} f/s (# frames = {nb_frames})")
    print(f"# pose detection inferences : {nb_pd_inferences} - # direct: {nb_pd_inferences_direct} - # after landmarks ROI failures: {nb_pd_inferences-nb_pd_inferences_direct}")
    print(f"# landmark inferences       : {nb_lm_inferences} - # after pose detection: {nb_lm_inferences - nb_lm_inferences_after_landmarks_ROI} - # after landmarks ROI prediction: {nb_lm_inferences_after_landmarks_ROI}")
    print(f"Pose detection round trip   : {glob_pd_rtrip_time/nb_pd_inferences*1000:.1f} ms")
    if nb_lm_inferences:
        print(f"Landmark round trip         : {glob_lm_rtrip_time/nb_lm_inferences*1000:.1f} ms")

    if self.output:
        self.output.release()
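Both the DepthAI and the OpenVINO loops square the input by padding the short side before inference, then crop the annotated frame back to the original aspect ratio before display. The same round trip isolated into two small helpers, as a sketch (the originals keep pad_h/pad_w as attributes instead of returning them):

import cv2

def pad_to_square(frame):
    # Pad the short side symmetrically so the frame becomes square;
    # like the original code, an odd difference loses one pixel of padding.
    h, w = frame.shape[:2]
    size = max(h, w)
    pad_h = (size - h) // 2
    pad_w = (size - w) // 2
    padded = cv2.copyMakeBorder(frame, pad_h, pad_h, pad_w, pad_w, cv2.BORDER_CONSTANT)
    return padded, pad_h, pad_w

def crop_back(padded, pad_h, pad_w, h, w):
    # Undo the padding on the annotated frame before display
    return padded[pad_h:pad_h + h, pad_w:pad_w + w]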