def infer_on_stream(args): """ Initialize the inference network, stream video to network, and output stats and video. :param args: Command line arguments parsed by `build_argparser()` :return: None """ try: logging.basicConfig(level=logging.DEBUG, format="%(asctime)s [%(levelname)s] %(message)s", handlers=[ logging.FileHandler("debug.log"), logging.StreamHandler() ]) # Initialise the class mc = MouseController("low", "fast") fdnet = FaceDetectionModel(args.fdmodel) lmnet = FacialLandMarksDetectionModel(args.lmmodel) hpnet = HeadPoseEstimationModel(args.hpmodel) genet = GazeEstimationModel(args.gemodel) start_time = time.time() fdnet.load_model() logging.info( f"Face Detection Model: {1000 * (time.time() - start_time):.1f}ms") start_time = time.time() lmnet.load_model() logging.info( f"Facial Landmarks Detection Model: {1000 * (time.time() - start_time):.1f}ms" ) start_time = time.time() hpnet.load_model() logging.info( f"Headpose Estimation Model: {1000 * (time.time() - start_time):.1f}ms" ) start_time = time.time() genet.load_model() logging.info( f"Gaze Estimation Model: {1000 * (time.time() - start_time):.1f}ms" ) # Get and open video capture feeder = InputFeeder('video', args.input) feeder.load_data() frame_count = 0 fd_infertime = 0 lm_infertime = 0 hp_infertime = 0 ge_infertime = 0 while True: # Read the next frame try: frame = next(feeder.next_batch()) except StopIteration: break key_pressed = cv2.waitKey(60) frame_count += 1 # face detection p_frame = fdnet.preprocess_input(frame) start_time = time.time() fd_output = fdnet.predict(p_frame) fd_infertime += time.time() - start_time out_frame, bboxes = fdnet.preprocess_output( fd_output, frame, args.print) for bbox in bboxes: face = frame[bbox[1]:bbox[3], bbox[0]:bbox[2]] p_frame = lmnet.preprocess_input(face) start_time = time.time() lm_output = lmnet.predict(p_frame) lm_infertime += time.time() - start_time out_frame, left_eye_point, right_eye_point = lmnet.preprocess_output( lm_output, bbox, out_frame, args.print) # get head pose estimation p_frame = hpnet.preprocess_input(face) start_time = time.time() hp_output = hpnet.predict(p_frame) hp_infertime += time.time() - start_time out_frame, headpose_angles = hpnet.preprocess_output( hp_output, out_frame, face, bbox, args.print) # get gaze estimation out_frame, left_eye, right_eye = genet.preprocess_input( out_frame, face, left_eye_point, right_eye_point, args.print) start_time = time.time() ge_output = genet.predict(left_eye, right_eye, headpose_angles) ge_infertime += time.time() - start_time out_frame, gaze_vector = genet.preprocess_output( ge_output, out_frame, bbox, left_eye_point, right_eye_point, args.print) if not args.no_video: cv2.imshow('image', out_frame) if not args.no_move: mc.move(gaze_vector[0], gaze_vector[1]) break if key_pressed == 27: break if frame_count > 0: logging.info( f"Face Detection:{1000* fd_infertime/frame_count:.1f}ms") logging.info( f"Facial Landmarks Detection:{1000* lm_infertime/frame_count:.1f}ms" ) logging.info( f"Headpose Estimation:{1000* hp_infertime/frame_count:.1f}ms") logging.info( f"Gaze Estimation:{1000* ge_infertime/frame_count:.1f}ms") feeder.close() cv2.destroyAllWindows() except Exception as ex: logging.exception(f"Error during inference:{str(ex)}")
def infer_on_stream(args): """ Initialize the inference network, stream video to network, and output stats and video. :param args: Command line arguments parsed by `build_argparser()` :return: None """ try: logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", handlers=[ logging.FileHandler("gaze-app.log"), logging.StreamHandler() ]) # Initialise the class mc = MouseController("low", "fast") #mc.move(100,100) fdnet = FaceDetectionModel(args.fdmodel) lmnet = FacialLandMarksDetectionModel(args.lmmodel) hpnet = HeadPoseEstimationModel(args.hpmodel) genet = GazeEstimationModel(args.gemodel) ### Load the model through ### logging.info("============== Models Load time ===============") start_time = time.time() fdnet.load_model() logging.info("Face Detection Model: {:.1f}ms".format( 1000 * (time.time() - start_time))) start_time = time.time() lmnet.load_model() logging.info("Facial Landmarks Detection Model: {:.1f}ms".format( 1000 * (time.time() - start_time))) start_time = time.time() hpnet.load_model() logging.info("Headpose Estimation Model: {:.1f}ms".format( 1000 * (time.time() - start_time))) start_time = time.time() genet.load_model() logging.info("Gaze Estimation Model: {:.1f}ms".format( 1000 * (time.time() - start_time))) logging.info("============== End =====================") # Get and open video capture feeder = InputFeeder('video', args.input) feeder.load_data() # FPS = feeder.get_fps() # Grab the shape of the input # width = feeder.get_width() # height = feeder.get_height() # init scene variables frame_count = 0 ### Loop until stream is over ### fd_infertime = 0 lm_infertime = 0 hp_infertime = 0 ge_infertime = 0 while True: # Read the next frame try: frame = next(feeder.next_batch()) except StopIteration: break key_pressed = cv2.waitKey(60) frame_count += 1 #print(int((frame_count) % int(FPS))) # face detection p_frame = fdnet.preprocess_input(frame) start_time = time.time() fnoutput = fdnet.predict(p_frame) fd_infertime += time.time() - start_time out_frame, fboxes = fdnet.preprocess_output( fnoutput, frame, args.print) #for each face for fbox in fboxes: # fbox = (xmin,ymin,xmax,ymax) # get face landmarks # crop face from frame face = frame[fbox[1]:fbox[3], fbox[0]:fbox[2]] p_frame = lmnet.preprocess_input(face) start_time = time.time() lmoutput = lmnet.predict(p_frame) lm_infertime += time.time() - start_time out_frame, left_eye_point, right_eye_point = lmnet.preprocess_output( lmoutput, fbox, out_frame, args.print) # get head pose estimation p_frame = hpnet.preprocess_input(face) start_time = time.time() hpoutput = hpnet.predict(p_frame) hp_infertime += time.time() - start_time out_frame, headpose_angels = hpnet.preprocess_output( hpoutput, out_frame, face, fbox, args.print) # get gaze estimation out_frame, left_eye, right_eye = genet.preprocess_input( out_frame, face, left_eye_point, right_eye_point, args.print) start_time = time.time() geoutput = genet.predict(left_eye, right_eye, headpose_angels) ge_infertime += time.time() - start_time out_frame, gazevector = genet.preprocess_output( geoutput, out_frame, fbox, left_eye_point, right_eye_point, args.print) if (not args.no_video): cv2.imshow('im', out_frame) if (not args.no_move): mc.move(gazevector[0], gazevector[1]) #consider only first detected face in the frame break # Break if escape key pressed if key_pressed == 27: break #logging inference times if (frame_count > 0): logging.info( "============== Models Inference time ===============") logging.info("Face Detection:{:.1f}ms".format(1000 * 
fd_infertime / frame_count)) logging.info("Facial Landmarks Detection:{:.1f}ms".format( 1000 * lm_infertime / frame_count)) logging.info("Headpose Estimation:{:.1f}ms".format( 1000 * hp_infertime / frame_count)) logging.info("Gaze Estimation:{:.1f}ms".format( 1000 * ge_infertime / frame_count)) logging.info("============== End ===============================") # Release the capture and destroy any OpenCV windows feeder.close() cv2.destroyAllWindows() except Exception as ex: logging.exception("Error in inference:" + str(ex))
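# All three variants drive the main loop through `InputFeeder` and pull frames
# with `next(feeder.next_batch())`. A minimal sketch of the interface they rely
# on, assuming an OpenCV-backed feeder similar to the common project starter
# code; the real class in this repo may differ (the third variant also calls a
# `feeder.fps()` method not sketched here):
class InputFeeder:
    def __init__(self, input_type, input_file=None):
        # input_type is 'video' or 'cam' in the loops above
        self.input_type = input_type
        self.input_file = input_file
        self.cap = None

    def load_data(self):
        if self.input_type == 'video':
            self.cap = cv2.VideoCapture(self.input_file)
        elif self.input_type == 'cam':
            self.cap = cv2.VideoCapture(0)

    def next_batch(self):
        # Generator over frames. Because it reads from the shared
        # cv2.VideoCapture, calling next(feeder.next_batch()) repeatedly, as
        # the loops above do, still advances through the video: each fresh
        # generator picks up at the capture's current position.
        while True:
            flag, frame = self.cap.read()
            if not flag:
                # Returning exhausts the generator, so the caller's next()
                # raises the StopIteration that the loops catch.
                return
            yield frame

    def close(self):
        if self.cap is not None:
            self.cap.release()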
def infer_on_stream(args):
    face_detection_model_file = args.faceDetectionModel
    facial_landmarks_detection_model_file = args.facialLandmarksModel
    head_pose_estimation_model_file = args.headPoseModel
    gaze_estimation_model_file = args.gazeModel
    video_file = args.input
    device_name = args.device
    cpu_extension = args.cpu_extension
    prob_threshold = args.prob_threshold
    preview_flag = args.preview_flag
    output_path = args.output_path

    if not os.path.exists(output_path):
        os.mkdir(output_path)

    mouse_control = MouseController("low", "fast")

    try:
        logging.info("*********** Model Load Time ***************")
        start_model_load_time = time.time()

        start_time = time.time()
        face_detection_model = FaceDetectionModel(face_detection_model_file,
                                                  device_name, cpu_extension)
        logging.info("Face Detection Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        facial_landmarks_detection_model = FacialLandmarksDetectionModel(
            facial_landmarks_detection_model_file, device_name, cpu_extension)
        logging.info("Facial Landmarks Detection Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        head_pose_estimation_model = HeadPoseEstimationModel(
            head_pose_estimation_model_file, device_name, cpu_extension)
        logging.info("Head Pose Estimation Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        gaze_estimation_model = GazeEstimationModel(gaze_estimation_model_file,
                                                    device_name, cpu_extension)
        logging.info("Gaze Estimation Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        total_model_load_time = time.time() - start_model_load_time
        logging.info("*********** Model Load Completed ***********")
    except Exception as e:
        logging.error("ERROR in model loading: " + str(e))
        sys.exit(1)

    feeder = InputFeeder('video', video_file)
    feeder.load_data()

    out_video = cv2.VideoWriter(os.path.join(output_path, 'output_video.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'),
                                int(feeder.fps() / 10), (1920, 1080), True)

    # Start the inference clock before the loop
    start_inference_time = time.time()
    frame_count = 0
    face_detect_infer_time = 0
    facial_landmarks_infer_time = 0
    head_pose_infer_time = 0
    gaze_infer_time = 0

    while True:
        try:
            frame = next(feeder.next_batch())
        except StopIteration:
            break

        key_pressed = cv2.waitKey(60)
        frame_count += 1

        ## Face Detection Model
        image = face_detection_model.preprocess_input(frame)
        start_time = time.time()
        outputs = face_detection_model.predict(image)
        face_detect_infer_time += (time.time() - start_time)
        out_frame, faces = face_detection_model.preprocess_output(
            outputs, frame, preview_flag, prob_threshold)

        for face in faces:
            crop_image = frame[face[1]:face[3], face[0]:face[2]]

            ## Facial Landmarks Detection Model
            image = facial_landmarks_detection_model.preprocess_input(crop_image)
            start_time = time.time()
            outputs = facial_landmarks_detection_model.predict(image)
            facial_landmarks_infer_time += (time.time() - start_time)
            out_frame, left_eye_point, right_eye_point = facial_landmarks_detection_model.preprocess_output(
                outputs, out_frame, face, preview_flag)

            ## Head Pose Estimation Model
            image = head_pose_estimation_model.preprocess_input(crop_image)
            start_time = time.time()
            outputs = head_pose_estimation_model.predict(image)
            head_pose_infer_time += (time.time() - start_time)
            out_frame, headpose_angles_list = head_pose_estimation_model.preprocess_output(
                outputs, out_frame, preview_flag)

            ## Gaze Estimation Model
            out_frame, left_eye, right_eye = gaze_estimation_model.preprocess_input(
                out_frame, crop_image, left_eye_point, right_eye_point)
            start_time = time.time()
            outputs = gaze_estimation_model.predict(left_eye, right_eye,
                                                    headpose_angles_list)
            gaze_infer_time += (time.time() - start_time)
            out_frame, gazevector = gaze_estimation_model.preprocess_output(
                outputs, out_frame, face, left_eye_point, right_eye_point,
                preview_flag)

            cv2.imshow("Computer Pointer Control", out_frame)
            out_video.write(out_frame)
            mouse_control.move(gazevector[0], gazevector[1])

        if key_pressed == 27:
            break

    if frame_count > 0:
        logging.info("*********** Model Inference Time ****************")
        logging.info("Face Detection Model: {:.1f} ms.".format(
            1000 * face_detect_infer_time / frame_count))
        logging.info("Facial Landmarks Detection Model: {:.1f} ms.".format(
            1000 * facial_landmarks_infer_time / frame_count))
        logging.info("Head Pose Detection Model: {:.1f} ms.".format(
            1000 * head_pose_infer_time / frame_count))
        logging.info("Gaze Detection Model: {:.1f} ms.".format(
            1000 * gaze_infer_time / frame_count))
        logging.info("*********** Model Inference Completed ***********")

    total_inference_time = round(time.time() - start_inference_time, 1)
    fps = frame_count / total_inference_time

    with open(os.path.join(output_path, 'stats.txt'), 'w') as f:
        f.write(str(total_inference_time) + '\n')
        f.write(str(fps) + '\n')
        f.write(str(total_model_load_time) + '\n')

    logging.info("*********** Total Summary ****************")
    logging.info(f"Total Model Load Time: {total_model_load_time}")
    logging.info(f"Total Inference Time: {total_inference_time}")
    logging.info(f"FPS: {fps}")
    logging.info("*********** Total Summary Completed ***********")

    # Release the writer and capture, and destroy any OpenCV windows
    out_video.release()
    feeder.close()
    cv2.destroyAllWindows()
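# Every variant constructs `MouseController("low", "fast")` and calls
# `mc.move(x, y)` with the first two components of the gaze vector. A minimal
# sketch of that controller, assuming the pyautogui-based class from the
# common project starter code; the preset values below are illustrative
# assumptions, not necessarily this repo's exact numbers:
import pyautogui


class MouseController:
    def __init__(self, precision, speed):
        # Precision scales the gaze vector into a pixel offset; speed is the
        # duration (in seconds) of each relative move.
        precision_dict = {'high': 100, 'low': 1000, 'medium': 500}
        speed_dict = {'fast': 1, 'slow': 10, 'medium': 5}
        self.precision = precision_dict[precision]
        self.speed = speed_dict[speed]

    def move(self, x, y):
        # The gaze vector's y axis points up, while screen coordinates grow
        # downward from the top-left corner, so y is inverted.
        pyautogui.moveRel(x * self.precision, -1 * y * self.precision,
                          duration=self.speed)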