import logging
import os
import time

import cv2
import numpy as np

# FaceDetection, FacialLandmarks, FacialLandmarksDetection, HeadPoseEstimation,
# GazeEstimation, InputFeeder, and MouseController are project-local classes;
# their import paths are not shown in this section.


def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """
    try:
        logging.basicConfig(level=logging.INFO,
                            format="%(asctime)s [%(levelname)s] %(message)s",
                            handlers=[
                                logging.FileHandler("gaze-app.log"),
                                logging.StreamHandler()
                            ])
        # Initialise the mouse controller and the four models
        mc = MouseController("low", "fast")
        fdnet = FaceDetection(args.fdmodel)
        lmnet = FacialLandmarks(args.lmmodel)
        hpnet = HeadPoseEstimation(args.hpmodel)
        genet = GazeEstimation(args.gemodel)

        ### Load the models and log load times ###
        logging.info("============== Models Load time ===============")
        start_time = time.time()
        fdnet.load_model()
        logging.info("Face Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        fdnet.check_model()
        logging.info("Face Detection estimation layers loaded correctly")

        start_time = time.time()
        lmnet.load_model()
        logging.info("Facial Landmarks Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        lmnet.check_model()
        logging.info("Facial Landmarks estimation layers loaded correctly")

        start_time = time.time()
        hpnet.load_model()
        logging.info("Head Pose Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        hpnet.check_model()
        logging.info("Head pose estimation layers loaded correctly")

        start_time = time.time()
        genet.load_model()
        logging.info("Gaze Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        genet.check_model()
        logging.info("Gaze estimation layers loaded correctly")
        logging.info("============== End =====================")

        # Get and open video capture
        feeder = InputFeeder('video', args.input)
        feeder.load_data()

        # Init scene variables
        frame_count = 0
        fd_infertime = 0
        lm_infertime = 0
        hp_infertime = 0
        ge_infertime = 0

        ### Loop until stream is over ###
        while True:
            # Read the next frame
            try:
                frame = next(feeder.next_batch())
            except StopIteration:
                break

            key_pressed = cv2.waitKey(60)
            frame_count += 1

            # Face detection
            fd_process_time = time.time()
            p_frame = fdnet.preprocess_input(frame)
            start_time = time.time()
            fdoutput = fdnet.predict(p_frame)
            fd_infertime += time.time() - start_time
            out_frame, fboxes = fdnet.preprocess_output(fdoutput, frame, args.print)
            logging.info("Face Detection Model processing time: {:.1f}ms".format(
                1000 * (time.time() - fd_process_time)))

            # For each detected face: fbox = (xmin, ymin, xmax, ymax)
            for fbox in fboxes:
                # Crop the face from the frame and get facial landmarks
                face = frame[fbox[1]:fbox[3], fbox[0]:fbox[2]]
                lm_process_time = time.time()
                p_frame = lmnet.preprocess_input(face)
                start_time = time.time()
                lmoutput = lmnet.predict(p_frame)
                lm_infertime += time.time() - start_time
                out_frame, left_eye_point, right_eye_point = lmnet.preprocess_output(
                    lmoutput, fbox, out_frame, args.print)
                logging.info("Landmarks model processing time: {:.1f}ms".format(
                    1000 * (time.time() - lm_process_time)))

                # Get head pose estimation
                hp_process_time = time.time()
                p_frame = hpnet.preprocess_input(face)
                start_time = time.time()
                hpoutput = hpnet.predict(p_frame)
                hp_infertime += time.time() - start_time
                out_frame, headpose_angles = hpnet.preprocess_output(
                    hpoutput, out_frame, face, fbox, args.print)
                logging.info("Head pose estimation model processing time: {:.1f}ms".format(
                    1000 * (time.time() - hp_process_time)))

                # Get gaze estimation
                gaze_process_time = time.time()
                out_frame, left_eye, right_eye = genet.preprocess_input(
                    out_frame, face, left_eye_point, right_eye_point, args.print)
                start_time = time.time()
                geoutput = genet.predict(left_eye, right_eye, headpose_angles)
                ge_infertime += time.time() - start_time
                out_frame, gazevector = genet.preprocess_output(
                    geoutput, out_frame, fbox, left_eye_point, right_eye_point,
                    args.print)
                logging.info("Gaze estimation model processing time: {:.1f}ms".format(
                    1000 * (time.time() - gaze_process_time)))

                if not args.no_video:
                    cv2.imshow('im', out_frame)

                if not args.no_move:
                    mc.move(gazevector[0], gazevector[1])

                # Consider only the first detected face in the frame
                break

            # Break if escape key pressed
            if key_pressed == 27:
                break

        # Log average per-frame inference times
        if frame_count > 0:
            logging.info("============== Models Inference time ===============")
            logging.info("Face Detection: {:.1f}ms".format(
                1000 * fd_infertime / frame_count))
            logging.info("Facial Landmarks Detection: {:.1f}ms".format(
                1000 * lm_infertime / frame_count))
            logging.info("Head Pose Estimation: {:.1f}ms".format(
                1000 * hp_infertime / frame_count))
            logging.info("Gaze Estimation: {:.1f}ms".format(
                1000 * ge_infertime / frame_count))
            logging.info("============== End ===============================")

        # Release the capture and destroy any OpenCV windows
        feeder.close()
        cv2.destroyAllWindows()
    except Exception:
        # logging.exception() records the full traceback automatically
        logging.exception("Error in inference")
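# The docstring above references `build_argparser()`, which is not part of this
# section. The sketch below is a minimal reconstruction: the flag names are
# inferred from the attributes `infer_on_stream()` actually reads (args.fdmodel,
# args.lmmodel, args.hpmodel, args.gemodel, args.input, args.print,
# args.no_video, args.no_move); defaults and help strings are assumptions, not
# the original definitions.
import argparse


def build_argparser():
    """Parse command line arguments (sketch inferred from usage above)."""
    parser = argparse.ArgumentParser(description="Gaze-driven mouse pointer demo")
    parser.add_argument("--fdmodel", required=True,
                        help="Path to the face detection model")
    parser.add_argument("--lmmodel", required=True,
                        help="Path to the facial landmarks detection model")
    parser.add_argument("--hpmodel", required=True,
                        help="Path to the head pose estimation model")
    parser.add_argument("--gemodel", required=True,
                        help="Path to the gaze estimation model")
    parser.add_argument("--input", required=True,
                        help="Path to the input video")
    parser.add_argument("--print", action="store_true",
                        help="Draw intermediate model outputs on the output frame")
    parser.add_argument("--no_video", action="store_true",
                        help="Suppress the preview window")
    parser.add_argument("--no_move", action="store_true",
                        help="Do not move the mouse pointer")
    return parser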
def main():
    args = build_argparser().parse_args()
    device_name = args.device
    prob_threshold = args.prob_threshold
    logger_object = logging.getLogger()

    # Initialize variables with the input arguments
    model_path_dict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'FacialLandmarkModel': args.facialLandmarksModel,
        'HeadPoseEstimationModel': args.headPoseEstimationModel,
        'GazeEstimationModel': args.gazeEstimationModel
    }

    # Instantiate the models
    face_model = FaceDetection(model_path_dict['FaceDetectionModel'],
                               device_name, threshold=prob_threshold)
    landmark_model = FacialLandmarksDetection(model_path_dict['FacialLandmarkModel'],
                                              device_name, threshold=prob_threshold)
    head_pose_model = HeadPoseEstimation(model_path_dict['HeadPoseEstimationModel'],
                                         device_name, threshold=prob_threshold)
    gaze_model = GazeEstimation(model_path_dict['GazeEstimationModel'],
                                device_name, threshold=prob_threshold)
    mouse_controller = MouseController('medium', 'fast')

    # Load the models and log load times. Messages are logged at ERROR level
    # so they are visible without configuring the root logger.
    start_time = time.time()
    face_model.load_model()
    logger_object.error("Face detection model loaded: time: {:.3f} ms".format(
        (time.time() - start_time) * 1000))

    first_mark = time.time()
    landmark_model.load_model()
    logger_object.error("Facial landmarks detection model loaded: time: {:.3f} ms".format(
        (time.time() - first_mark) * 1000))

    second_mark = time.time()
    head_pose_model.load_model()
    logger_object.error("Head pose estimation model loaded: time: {:.3f} ms".format(
        (time.time() - second_mark) * 1000))

    third_mark = time.time()
    gaze_model.load_model()
    logger_object.error("Gaze estimation model loaded: time: {:.3f} ms".format(
        (time.time() - third_mark) * 1000))

    load_total_time = time.time() - start_time
    logger_object.error("Total loading time: {:.3f} ms".format(load_total_time * 1000))
    logger_object.error("All models are loaded successfully..")

    # Check each network for unsupported layers
    face_model.check_model()
    landmark_model.check_model()
    head_pose_model.check_model()
    gaze_model.check_model()

    preview_flags = args.previewFlags
    input_filename = args.input
    output_path = args.output_path
    prob_threshold = args.prob_threshold

    # Open the input stream: webcam or video file
    if input_filename.lower() == 'cam':
        input_feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            logger_object.error("Unable to find specified video file")
            exit(1)
        input_feeder = InputFeeder(input_type='video', input_file=input_filename)

    for model_path in list(model_path_dict.values()):
        if not os.path.isfile(model_path):
            logger_object.error("Unable to find specified model file " + str(model_path))
            exit(1)

    input_feeder.load_data()
    width = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(input_feeder.cap.get(cv2.CAP_PROP_FPS))
    out_video = cv2.VideoWriter(os.path.join('output_video.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'),
                                fps, (width, height), True)

    frame_counter = 0
    start_inf_time = time.time()
    for ret, frame in input_feeder.next_batch():
        if not ret:
            break
        frame_counter += 1
        key = cv2.waitKey(60)

        try:
            cropped_image, face_cords = face_model.predict(frame, prob_threshold)
            # predict() returns an int sentinel when no face is detected
            if isinstance(cropped_image, int):
                print("Unable to detect the face")
                if key == 27:
                    break
                continue
            left_eye, right_eye, eye_cords = landmark_model.predict(cropped_image)
            pose_output = head_pose_model.predict(cropped_image)
            x, y, z = gaze_model.predict(left_eye, right_eye, pose_output,
                                         cropped_image, eye_cords)
        except Exception as e:
            print(str(e) + " for frame " + str(frame_counter))
            continue

        image = cv2.resize(frame, (width, height))
        if not len(preview_flags) == 0:
            preview_frame = frame.copy()
            if 'fd' in preview_flags:
                if len(preview_flags) != 1:
                    preview_frame = cropped_image
                cv2.rectangle(frame, (face_cords[0], face_cords[1]),
                              (face_cords[2], face_cords[3]), (0, 0, 255), 3)
            if 'hp' in preview_flags:
                cv2.putText(
                    frame,
                    "Pose Angles: yaw= {:.2f} , pitch= {:.2f} , roll= {:.2f}".format(
                        pose_output[0], pose_output[1], pose_output[2]),
                    (20, 40), cv2.FONT_HERSHEY_DUPLEX, 1, (255, 0, 0), 3)
            if 'ge' in preview_flags:
                cv2.putText(
                    frame,
                    "Gaze vector: x= {:.2f} , y= {:.2f} , z= {:.2f}".format(x, y, z),
                    (15, 100), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 3)
            image = np.hstack((cv2.resize(frame, (500, 500)),
                               cv2.resize(preview_frame, (500, 500))))

        cv2.imshow('preview', image)
        # The writer was opened with the source frame size, so resize the
        # (possibly side-by-side) preview image back to (width, height);
        # VideoWriter silently drops frames of a different size.
        out_video.write(cv2.resize(image, (width, height)))

        # Move the pointer only every fifth frame to keep the demo responsive
        if frame_counter % 5 == 0:
            mouse_controller.move(x, y)
        if key == 27:
            break

    inference_time = round(time.time() - start_inf_time, 1)
    fps = frame_counter / inference_time
    logger_object.error("counter {} frames".format(frame_counter))
    logger_object.error("total inference time {} seconds".format(inference_time))
    logger_object.error("fps {} frames/second".format(fps))

    # Persist the timing stats next to this script
    with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'stats.txt'), 'w') as f:
        f.write('inference time : ' + str(inference_time) + '\n')
        f.write('fps: ' + str(fps) + '\n')
        f.write('Models Loading: ' + str(load_total_time) + '\n')

    logger_object.error('Video stream ended')
    cv2.destroyAllWindows()
    input_feeder.close()
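# Standard module entry point. Note that this main() reads differently named
# arguments (args.faceDetectionModel, args.previewFlags, args.output_path, ...),
# so it pairs with its own build_argparser(), not the sketch shown earlier.
if __name__ == '__main__':
    main()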
# (Fragment: continues from model-load timing code not shown in this section.)
head_pose_time = (time.time() - head_start_time) * 1000

facial_landmark_start = time.time()
facial_landmark_model.load_model()
facial_landmark_time = (time.time() - facial_landmark_start) * 1000

gaze_model_start = time.time()
gaze_estimation_model.load_model()
gaze_model_time = (time.time() - gaze_model_start) * 1000

total_loading_time = (time.time() - start_time) * 1000

# Check each network for unsupported layers
face_model.check_model()
head_pose_model.check_model()
facial_landmark_model.check_model()
gaze_estimation_model.check_model()

# Open the input stream: webcam or video file
if input_file.lower() == 'cam':
    input_feeder = InputFeeder(input_type='cam')
else:
    if not os.path.isfile(input_file):
        logger.error("Unable to find video file for input")
        exit(1)
    input_feeder = InputFeeder(input_type='video', input_file=input_file)

input_feeder.load_data()
width = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(input_feeder.cap.get(cv2.CAP_PROP_FPS))
writer = None
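# The trailing `writer = None` suggests this variant opens its VideoWriter
# lazily, on the first decoded frame. A minimal self-contained sketch of that
# pattern follows; the function name, output path, and codec are placeholder
# assumptions, not taken from the original code.
def write_frames_lazily(frame_iter, fps, out_path='output.mp4'):
    """Open a cv2.VideoWriter on the first frame, sized to that frame."""
    writer = None
    for frame in frame_iter:
        if writer is None:
            height, width = frame.shape[:2]
            writer = cv2.VideoWriter(out_path,
                                     cv2.VideoWriter_fourcc(*'mp4v'),
                                     fps, (width, height), True)
        writer.write(frame)
    if writer is not None:
        writer.release()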