def main(): """ Load inference networks, stream video to network, and output stats and video. :return: None """ # Logger init logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s") # Get command line args args = get_arg() #Load Preferencies with open(args.config_file, "r") as yamlfile: cfg = yaml.load(yamlfile, Loader=yaml.FullLoader) models = cfg['models'] input_source = args.input video_path = cfg['video_path'] face_model = FaceDetection(models['face_detection']) head_pose_model = HeadPoseEstimation(models['head_pose_estimation']) facial_landmarks_model = FacialLandmarksDetection(models['facial_landmarks_detection']) gaze_estimation_model = GazeEstimation(models['gaze_estimation']) # Initialise the MouseController mouse_contr = MouseController("low","fast") # Load the models and log timing start_time = time.time() face_model.load_model(args.device) logging.info("Load Face Detection model: {:.1f}ms".format(1000 * (time.time() - start_time)) ) start_time = time.time() facial_landmarks_model.load_model(args.device) logging.info("Load Facial Landmarks Detection model: {:.1f}ms".format(1000 * (time.time() - start_time)) ) start_time = time.time() head_pose_model.load_model(args.device) logging.info("Load Head Pose Estimation model: {:.1f}ms".format(1000 * (time.time() - start_time)) ) start_time = time.time() gaze_estimation_model.load_model(args.device) logging.info("Load Gaze Estimation model: {:.1f}ms".format(1000 * (time.time() - start_time)) ) # Get and open video or camera capture #input_feed = InputFeeder('video', args.input) #input_feed.load_data() input_feed = InputFeeder(input_type=input_source, input_file=video_path) input_feed.load_data() if not input_feed.cap.isOpened(): log.critical('Error opening input, check --video_path parameter') sys.exit(1) # FPS = input_feed.get_fps() # Grab the shape of the input # width = input_feed.get_width() # height = input_feed.get_height() # init scene variables frame_count = 0 ### Loop until stream is over ### facedetect_infer_time = 0 landmark_infer_time = 0 headpose_infer_time = 0 gaze_infer_time = 0 while True: # Read the next frame try: frame = next(input_feed.next_batch()) except StopIteration: break if frame is None: break key_pressed = cv2.waitKey(60) frame_count += 1 input_height, input_width, _ = frame.shape logging.info("frame {count} size {w}, {h}".format(count= frame_count, w = input_width, h =input_height)) # face detection p_frame = face_model.preprocess_input(frame) start_time = time.time() fnoutput = face_model.predict(p_frame) facedetect_infer_time += time.time() - start_time out_frame,fboxes = face_model.preprocess_output(fnoutput,frame,args.overlay, args.prob_threshold) #for each face for fbox in fboxes: face = frame[fbox[1]:fbox[3],fbox[0]:fbox[2]] p_frame = facial_landmarks_model.preprocess_input(face) start_time = time.time() lmoutput = facial_landmarks_model.predict(p_frame) landmark_infer_time += time.time() - start_time out_frame,left_eye_point,right_eye_point = facial_landmarks_model.preprocess_output(lmoutput, fbox, out_frame,args.overlay, args.prob_threshold) # get head pose estimation p_frame = head_pose_model.preprocess_input(face) start_time = time.time() hpoutput = head_pose_model.predict(p_frame) headpose_infer_time += time.time() - start_time out_frame, headpose_angels = head_pose_model.preprocess_output(hpoutput,out_frame, face,fbox,args.overlay, args.prob_threshold) # get gaze estimation out_frame, left_eye, right_eye = 
gaze_estimation_model.preprocess_input(out_frame,face,left_eye_point,right_eye_point,args.overlay) start_time = time.time() geoutput = gaze_estimation_model.predict(left_eye, right_eye, headpose_angels) gaze_infer_time += time.time() - start_time out_frame, gazevector = gaze_estimation_model.preprocess_output(geoutput,out_frame,fbox, left_eye_point,right_eye_point,args.overlay, args.prob_threshold) cv2.imshow('im', out_frame) if(args.mouse_move): logging.info("mouse move vector : x ={}, y={}".format(gazevector[0], gazevector[1])) mouse_contr.move(gazevector[0], gazevector[1]) #use only first detected face in the frame break # Break if escape key pressed if key_pressed == 27: break #logging inference times if(frame_count>0): logging.info("***** Models Inference time *****") logging.info("Face Detection:{:.1f}ms".format(1000* facedetect_infer_time/frame_count)) logging.info("Facial Landmarks Detection:{:.1f}ms".format(1000* landmark_infer_time/frame_count)) logging.info("Headpose Estimation:{:.1f}ms".format(1000* headpose_infer_time/frame_count)) logging.info("Gaze Estimation:{:.1f}ms".format(1000* gaze_infer_time/frame_count)) # Release the capture and destroy any OpenCV windows input_feed.close() cv2.destroyAllWindows()
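# The InputFeeder class used throughout this file is external and not shown.
# Below is a minimal sketch consistent with the call sites here; note that some
# mains iterate plain frames (`for frame in feeder.next_batch()`) while others
# unpack (ret, frame) pairs, so the generator contract differs between
# implementations. This version yields (ret, frame) pairs and is an assumption,
# not the authors' code.
import cv2


class InputFeederSketch:
    def __init__(self, input_type, input_file=None):
        self.input_type = input_type
        self.input_file = input_file
        self.cap = None

    def load_data(self):
        # Webcam is device 0; anything else is treated as a video file path.
        source = 0 if self.input_type == 'cam' else self.input_file
        self.cap = cv2.VideoCapture(source)

    def next_batch(self):
        # Yield (ret, frame) pairs until the stream is exhausted.
        while self.cap.isOpened():
            ret, frame = self.cap.read()
            yield ret, frame
            if not ret:
                return

    def close(self):
        self.cap.release()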
def main():
    inputPath = args.input
    inputFeeder = None

    # Verify the model files exist
    face_model_link = args.facedetection
    facial_landmark_link = args.faciallandmark
    gaze_estimation_link = args.gazeestimation
    head_pose_link = args.headpose
    if not check_model_exists(face_model_link) or not check_model_exists(
            facial_landmark_link) or not check_model_exists(
                gaze_estimation_link) or not check_model_exists(head_pose_link):
        exit(1)

    device_name = args.device
    threshold = args.prob_threshold
    cpu_extension = args.cpu_extension
    previewHeadPose = args.previewHeadPose
    previewFace = args.previewFaceDetection
    previewFaceLandmark = args.previewFaceLandmark
    previewGazeEstimation = args.previewGazeEstimation
    fliph = str(args.flip_horizontal).lower() == "true"

    # Initialize the models
    face_model = FaceDetectionModel(face_model_link, device_name, cpu_extension, threshold)
    facial_landmark_model = FacialLandmarksDetectionModel(facial_landmark_link, device_name, cpu_extension)
    gaze_estimation_model = GazeEstimationModel(gaze_estimation_link, device_name, cpu_extension)
    head_pose_model = HeadPoseEstimationModel(head_pose_link, device_name, cpu_extension)

    if inputPath.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(inputPath):
            logger.error("Unable to find specified video file")
            exit(1)
        inputFeeder = InputFeeder("video", inputPath)

    # Load the models and record load times
    fm_time = time.time()
    face_model.load_model()
    fm_time = time.time() - fm_time

    flm_time = time.time()
    facial_landmark_model.load_model()
    flm_time = time.time() - flm_time

    gem_time = time.time()
    gaze_estimation_model.load_model()
    gem_time = time.time() - gem_time

    hpm_time = time.time()
    head_pose_model.load_model()
    hpm_time = time.time() - hpm_time

    benchmarks['loadtime'] = {
        'face_detection': fm_time,
        'face_landmark': flm_time,
        'gaze_estimation': gem_time,
        'head_pose_estimation': hpm_time,
    }

    mouse_controller = MouseController('high', 'slow')
    inputFeeder.load_data()

    frame_count = 0
    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        # Poll 1ms for key input; exit on key code 17
        if cv2.waitKey(1) == 17:
            break

        FPS_COUNT = time.time()
        frame_count += 1
        increase_brightness(frame)
        if fliph:
            frame = cv2.flip(frame, 1)

        # Face detection
        face_detection_predict_time = time.time()
        croppedFace, face_coords = face_model.predict(frame.copy())
        face_detection_predict_time = time.time() - face_detection_predict_time
        if croppedFace is None or isinstance(croppedFace, int):
            logger.error("Unable to detect the face.")
            continue

        # Head pose prediction
        head_pose_predict_time = time.time()
        head_output = head_pose_model.predict(croppedFace.copy())
        head_pose_predict_time = time.time() - head_pose_predict_time

        # Facial landmark prediction
        facial_landmark_predict_time = time.time()
        left_eye, right_eye, eye_coords = facial_landmark_model.predict(croppedFace.copy())
        facial_landmark_predict_time = time.time() - facial_landmark_predict_time

        # Gaze estimation prediction
        gaze_estimation_predict_time = time.time()
        gaze_vector, raw_vector = gaze_estimation_model.predict(left_eye, right_eye, head_output)
        gaze_estimation_predict_time = time.time() - gaze_estimation_predict_time

        FPS_COUNT = time.time() - FPS_COUNT
        FPS_COUNT = 1 // FPS_COUNT
        logger.debug("FPS %s" % FPS_COUNT)

        benchmarks['predict_time'] = {
            'face_detection': face_detection_predict_time,
            'face_landmark': facial_landmark_predict_time,
            'gaze_estimation': gaze_estimation_predict_time,
            'head_pose_estimation': head_pose_predict_time,
        }
        logger.debug(benchmarks)

        if previewFace or previewFaceLandmark or previewGazeEstimation or previewHeadPose:
            preview_frame = frame.copy()
            if previewFace:
                cv2.rectangle(preview_frame, (face_coords[0], face_coords[1]),
                              (face_coords[2], face_coords[3]), (255, 0, 0), 1)
                preview_frame = croppedFace
            if previewFaceLandmark:
                cv2.rectangle(croppedFace,
                              (eye_coords[0][0] - 10, eye_coords[0][1] - 10),
                              (eye_coords[0][2] + 10, eye_coords[0][3] + 10),
                              (0, 255, 0), 1)
                cv2.rectangle(croppedFace,
                              (eye_coords[1][0] - 10, eye_coords[1][1] - 10),
                              (eye_coords[1][2] + 10, eye_coords[1][3] + 10),
                              (0, 255, 0), 1)
            if previewGazeEstimation and gaze_vector:
                x, y, z = gaze_vector
                x = int(x)
                y = int(y)
                # Left eye center
                left_eye_center_x = (eye_coords[0][0] + eye_coords[0][2]) // 2
                left_eye_center_y = (eye_coords[0][1] + eye_coords[0][3]) // 2
                left_eye_center_dx = left_eye_center_x + (x * 2)
                left_eye_center_dy = left_eye_center_y + (-y * 2)
                left_eye_ref = left_eye.copy()
                # Right eye center
                right_eye_center_x = (eye_coords[1][0] + eye_coords[1][2]) // 2
                right_eye_center_y = (eye_coords[1][1] + eye_coords[1][3]) // 2
                right_eye_center_dx = right_eye_center_x + (x * 2)
                right_eye_center_dy = right_eye_center_y + (-y * 2)
                right_eye_ref = right_eye.copy()
                # Head pose yaw in radians and arrow length
                head_pose_y = head_output[1] * math.pi / 180
                line_size = configuration.GAZE_ARROW_LENGTH
                # Gaze-axis arrows
                cv2.arrowedLine(croppedFace, (left_eye_center_x, left_eye_center_y),
                                (left_eye_center_dx, left_eye_center_dy), (0, 255, 255), 1)
                cv2.arrowedLine(croppedFace, (right_eye_center_x, right_eye_center_y),
                                (right_eye_center_dx, right_eye_center_dy), (0, 255, 255), 1)
            if previewHeadPose:
                cv2.rectangle(preview_frame, (5, 5), (85, 65), configuration.UI_COLOR, 1)
                cv2.putText(preview_frame, "YAW: {:.2f}".format(head_output[0]), (10, 20),
                            configuration.DEFAULT_FONT, configuration.FONT_SIZE,
                            configuration.UI_COLOR, 1)
                cv2.putText(preview_frame, "PITCH: {:.2f}".format(head_output[1]), (10, 40),
                            configuration.DEFAULT_FONT, configuration.FONT_SIZE,
                            configuration.UI_COLOR, 1)
                cv2.putText(preview_frame, "ROLL: {:.2f}".format(head_output[2]), (10, 60),
                            configuration.DEFAULT_FONT, configuration.FONT_SIZE,
                            configuration.UI_COLOR, 1)
            cv2.imshow("Gaze Detection [Visualization]",
                       cv2.resize(preview_frame,
                                  (configuration.PREVIEW_WIDTH, configuration.PREVIEW_HEIGHT)))

        if frame_count % configuration.MOVE_MOUSE_AFTER_FRAMES_COUNT == 0:
            logger.debug("moving mouse = x: {}, y: {}".format(gaze_vector[0], gaze_vector[1]))
            mouse_controller.move(gaze_vector[0], gaze_vector[1])

    logger.info("Video Stream Finished...")
    cv2.destroyAllWindows()
    inputFeeder.close()
def main():
    # Get command line args
    args = build_argparser().parse_args()
    logger = log.getLogger()

    type_input = args.input
    if type_input.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        inputFeeder = InputFeeder("video", args.input)
    inputFeeder.load_data()

    mc = MouseController("medium", "fast")

    fdm = FaceDetectionModel(model_name=args.face_dectection_model,
                             device=args.device,
                             extensions=args.cpu_extension,
                             threshold=args.prob_threshold)
    fldm = FacialLandmarksModel(model_name=args.face_landmarks_model,
                                device=args.device,
                                extensions=args.cpu_extension)
    gem = GazeEstimationModel(model_name=args.gaze_estimation_model,
                              device=args.device,
                              extensions=args.cpu_extension)
    hpem = HeadPoseEstimationModel(model_name=args.head_pose_model,
                                   device=args.device,
                                   extensions=args.cpu_extension)

    # Load the models; each checkpoint minus the previous one gives that model's load time
    data_capture = {}
    start_time = time.time()
    fdm.load_model()
    fdm_load_time = time.time()
    fldm.load_model()
    fldm_load_time = time.time()
    hpem.load_model()
    hpem_load_time = time.time()
    gem.load_model()
    gem_load_time = time.time()

    data_capture['FaceDetectionModel_loadtime'] = round((fdm_load_time - start_time) * 1000, 3)
    data_capture['FacialLandmarksModel_loadtime'] = round((fldm_load_time - fdm_load_time) * 1000, 3)
    data_capture['HeadPoseEstimationModel_loadtime'] = round((hpem_load_time - fldm_load_time) * 1000, 3)
    data_capture['GazeEstimationModel_loadtime'] = round((gem_load_time - hpem_load_time) * 1000, 3)

    for flag, frame in inputFeeder.next_batch():
        if not flag:
            break
        pressedKey = cv2.waitKey(60)

        start_infer_time = time.time()  # time at the start of inference
        face_coords, face_img = fdm.predict(frame)
        fdm_infertime = time.time()
        if face_coords == 0:  # skip the frame if no face is detected
            continue

        hpem_out = hpem.predict(face_img)
        hpem_infertime = time.time()

        left_eye, right_eye, eye_coord = fldm.predict(face_img)
        fldm_infertime = time.time()
        if left_eye.all() == 0 or right_eye.all() == 0:  # skip if the eyes are not detected
            continue

        mouse_coord, gaze_vector = gem.predict(left_eye, right_eye, hpem_out)
        gem_infertime = time.time()

        if args.preview:
            output_boxes(frame, (face_coords[0], face_coords[1]),
                         (face_coords[2], face_coords[3]))
            bound_boxes(frame, eye_coord, 45, 25, face_coords[0], face_coords[1])
            text = "Yaw: {:.2f}, Pitch: {:+.2f}, Roll: {:.2f}".format(
                hpem_out[0], hpem_out[1], hpem_out[2])
            output_text(frame, text, (100, 100))
            h = frame.shape[0]
            w = frame.shape[1]
            center_of_face = (h / 2, w / 2, 0)
            draw_axes(frame, center_of_face, hpem_out[0], hpem_out[1], hpem_out[2],
                      scale=50, focal_length=950)
            cv2.imshow('video', cv2.resize(frame, (500, 500)))

        mc.move(mouse_coord[0], mouse_coord[1])
        if pressedKey == 27:
            break

    # Note: these entries are computed from the last processed frame's checkpoints only
    data_capture['FaceDetectionModel_Inferencetime'] = round((fdm_infertime - start_infer_time) * 1000, 3)
    data_capture['HeadPoseEstimationModel_Inferencetime'] = round((hpem_infertime - fdm_infertime) * 1000, 3)
    data_capture['FacialLandmarksModel_Inferencetime'] = round((fldm_infertime - hpem_infertime) * 1000, 3)
    data_capture['GazeEstimationModel_Inferencetime'] = round((gem_infertime - fldm_infertime) * 1000, 3)

    total_time = round((time.time() - start_infer_time) * 1000, 3)
    data_capture['Total_time'] = total_time

    df = pd.DataFrame.from_dict(data_capture, orient='index', columns=['time(msecs)'])
    df.to_csv("results.csv")

    logger.info("Video has ended...")
    cv2.destroyAllWindows()
    inputFeeder.close()
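# The chained time.time() checkpoints above are easy to get wrong: each stage's
# duration is the difference between successive checkpoints, and the subtraction
# order matters. A small helper like this sketch, using the monotonic
# perf_counter clock, makes the intent explicit. It is a suggestion, not part of
# the original code.
import time


class Stopwatch:
    def __init__(self):
        self.last = time.perf_counter()

    def lap_ms(self):
        # Return milliseconds since the previous lap and reset the reference.
        now = time.perf_counter()
        elapsed = (now - self.last) * 1000.0
        self.last = now
        return round(elapsed, 3)

# Usage sketch, mirroring the load-time bookkeeping above:
# watch = Stopwatch()
# fdm.load_model();  data_capture['FaceDetectionModel_loadtime'] = watch.lap_ms()
# fldm.load_model(); data_capture['FacialLandmarksModel_loadtime'] = watch.lap_ms()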
def infer_on_stream(args):
    prob_threshold = args.prob_threshold
    face_detector_path = args.face_detection_model
    facial_landmark_path = args.facial_landmark_detection_model
    head_pose_path = args.head_pose_estimation_model
    gaze_est_path = args.gaze_estimation_model
    input_display = args.display_type
    device = args.device
    extension = args.cpu_extension
    input_file = args.input
    speed = args.mouse_speed
    precision = args.mouse_precision

    face_detector = Model_Face_Detect(model_name=face_detector_path,
                                      device=device, extensions=extension)
    log.info("face_detector object initialised")
    face_landmark_detector = Model_Facial_Land(model_name=facial_landmark_path,
                                               device=device, extensions=extension)
    log.info("face_landmark_detector object initialised")
    head_pose_estimation = Model_HeadPos(model_name=head_pose_path,
                                         device=device, extensions=extension)
    log.info("head_pose_estimation object initialised")
    gaze_estimation = Model_Gaze_Est(model_name=gaze_est_path,
                                     device=device, extensions=extension)
    log.info("gaze_estimation object initialised")

    # Load all four models and time the total
    start_time = time.time()
    face_detector.load_model()
    log.info("Face Detector Model Loaded...")
    face_landmark_detector.load_model()
    log.info("Facial Landmark Model Loaded...")
    head_pose_estimation.load_model()
    log.info("Head Pose Estimation Model Loaded...")
    gaze_estimation.load_model()
    log.info("Gaze Estimation Model Loaded...")
    total_models_load_time = time.time() - start_time

    try:
        input_feeder = InputFeeder(input_display, input_file)
        input_feeder.load_data()
    except Exception:
        log.error("Something went wrong while opening the camera/video input")
        exit(1)

    mouse = MouseController(precision, speed)

    frames = 0
    start_inf_time = time.time()
    for ret, frame in input_feeder.next_batch():
        if not ret:
            break
        frames += 1
        key = cv2.waitKey(60)
        start_inf_disp = time.time()

        # Run inference with the face detection model
        face_coords, face_image = face_detector.predict(frame, prob_threshold)
        if face_coords:
            # Run inference with the facial landmarks detection model
            eye_coords, left_eye, right_eye, image_proccess = face_landmark_detector.predict(face_image)
            # Run inference with the head pose estimation model
            head_pose_angles = head_pose_estimation.predict(face_image)
            # Run inference with the gaze estimation model
            mouse_coord, gaze_coord = gaze_estimation.predict(left_eye, right_eye, head_pose_angles)

            # Draw gaze arrows from both eyes
            left_eye = (eye_coords[0][0] + 15, eye_coords[0][1] + 15)
            right_eye = (eye_coords[1][0] + 15, eye_coords[1][1] + 15)
            gaze_x = int(gaze_coord[0] * 250)
            gaze_y = int(-gaze_coord[1] * 250)
            cv2.arrowedLine(image_proccess, left_eye,
                            (left_eye[0] + gaze_x, left_eye[1] + gaze_y), (80, 15, 120), 3)
            cv2.arrowedLine(image_proccess, right_eye,
                            (right_eye[0] + gaze_x, right_eye[1] + gaze_y), (80, 15, 120), 3)

            inference_time = time.time() - start_inf_disp
            inf_time_display = "Inference Time Per Frame: {:.3f}ms".format(inference_time * 1000)
            cv2.putText(image_proccess, inf_time_display, (10, 10),
                        cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 250, 0), 1)

            infer_img = "process_img"
            cv2.namedWindow(infer_img)
            cv2.moveWindow(infer_img, 10, 200)  # move the window to (10, 200)
            cv2.imshow(infer_img, cv2.resize(image_proccess, (600, 600)))

            mouse.move(mouse_coord[0], mouse_coord[1])

    total_inference_time = time.time() - start_inf_time
    fps = frames / total_inference_time

    stats_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              'stats_' + str(device) + '.txt')
    with open(stats_path, 'w') as f:
        f.write("Inference Time: {:.3f}\n".format(total_inference_time))
        f.write("FPS: {:.3f}\n".format(fps))
        f.write("Model Loading Time: {:.3f}\n".format(total_models_load_time))

    input_feeder.close()
    cv2.destroyAllWindows()
def main(): """ """ # Grab command line args args = build_argparser().parse_args() input_src = args.input device = args.device extension = args.cpu_extension prob_threshold = args.prob_threshold face_detection_model = args.facedetectionmodel head_pose_model = args.headposemodel landmarks_model = args.facelandmarksnmodel gaze_estimation_model = args.gazeestimationmodel # Create log object set for console output and set log level log_obj = log.getLogger() log_obj.setLevel(LOGLEVEL) console_handler = log.StreamHandler() console_handler.setLevel(LOGLEVEL) log_obj.addHandler(console_handler) # Create detection objects face_detection_obj = FaceDetectionModel(face_detection_model, device, extension) head_pose_obj = HeadPoseModel(head_pose_model, device, extension) landmarks_obj = LandmarksModel(landmarks_model, device, extension) gaze_estimation_obj = GazeEstimationModel(gaze_estimation_model, device, extension) # Create mouse controller object mouse_controller = MouseController('medium', 'fast') # Place mouse at the center of the screen mouse_controller.init_position() log_obj.info("[Info]: Place mouse at the center of the screen") # Place holder for total inferencing time total_inference_time = 0 # Load models and get the model loading times start_time = time.time() face_detection_obj.load_model() end_time = time.time() face_detection_loading_time = end_time - start_time start_time = time.time() head_pose_obj.load_model() end_time = time.time() head_pose_loading_time = end_time - start_time start_time = time.time() landmarks_obj.load_model() end_time = time.time() landmarks_detection_loading_time = end_time - start_time start_time = time.time() gaze_estimation_obj.load_model() end_time = time.time() gaze_estimation_loading_time = end_time - start_time # Configure input video source if input_src.lower() == 'cam': input_channel = InputFeeder(input_type='cam') elif not os.path.exists(input_src): log.error("Video file not found! 
Exiting....") exit(1) else: input_channel = InputFeeder(input_type='video', input_file=input_src) log_obj.info("[Info]: Opening video file ...") input_channel.load_data() video_width = int(input_channel.cap.get(cv2.CAP_PROP_FRAME_WIDTH)) video_height = int(input_channel.cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) fps = int(input_channel.cap.get(cv2.CAP_PROP_FPS)) frame_counter = 0 total_face_inf_time = 0 total_head_inf_time = 0 total_lanmarks_inf_time = 0 total_gaze_inf_time = 0 frame_processing_time = 0 # Process each frame try: for frame in input_channel.next_batch(): frame_processing_start_time = time.time() frame_counter = frame_counter + 1 key = cv2.waitKey(60) # Use face detection to find cropped face and provide face coordinates cropped_face, face_coords, face_inference_time = face_detection_obj.predict( frame, prob_threshold) total_face_inf_time = total_face_inf_time + face_inference_time # Now use cropped face for head pose detection head_pose_estimate, head_inference_time = head_pose_obj.predict( cropped_face, prob_threshold) total_head_inf_time = total_head_inf_time + head_inference_time # Now use cropped face for landmarks detection cropped_left_eye, cropped_right_eye, eyes_coords, converted_landmarks, landmarks_inference_time = landmarks_obj.predict( cropped_face, prob_threshold) total_lanmarks_inf_time = total_lanmarks_inf_time + landmarks_inference_time # Finally gaze estimation gaze_vector, gaze_estimate_time = gaze_estimation_obj.predict( cropped_left_eye, cropped_right_eye, head_pose_estimate) total_gaze_inf_time = total_gaze_inf_time + gaze_estimate_time # Move the mouse #mouse_controller.move(gaze_vector[0], gaze_vector[1]) # Show size-reduced frame for visual comparison # Check potential visualize flags: 'F', 'H', 'L', 'G' # If flag exist, process image to show inference results if args.visualize is not None: visualize_flag = str(args.visualize) # Draw bounding box around detected face if 'F' in visualize_flag: cv2.rectangle(frame, (face_coords[0][0], face_coords[0][1]), (face_coords[0][2], face_coords[0][3]), (0, 255, 0), 2) # Show head pose parameters if 'H' in visualize_flag: cv2.putText( frame, "Head pose: yaw: {:.3f}, pitch: {:.3f}, roll: {:.3f}". format(head_pose_estimate[0], head_pose_estimate[1], head_pose_estimate[2]), (10, 20), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 5) # Draw dots on detected facial landmarks if 'L' in visualize_flag: cv2.circle(frame, (converted_landmarks[0] + face_coords[0][0], converted_landmarks[1] + face_coords[0][1]), 10, (0, 255, 0), 5) cv2.circle(frame, (converted_landmarks[2] + face_coords[0][0], converted_landmarks[3] + face_coords[0][1]), 10, (0, 255, 0), 5) cv2.circle(frame, (converted_landmarks[4] + face_coords[0][0], converted_landmarks[5] + face_coords[0][1]), 10, (0, 255, 0), 5) cv2.circle(frame, (converted_landmarks[6] + face_coords[0][0], converted_landmarks[7] + face_coords[0][1]), 10, (0, 255, 0), 5) cv2.circle(frame, (converted_landmarks[8] + face_coords[0][0], converted_landmarks[9] + face_coords[0][1]), 10, (0, 255, 0), 5) # Display gaze parameters if 'G' in visualize_flag: cv2.putText( frame, "Gaze estimate: x: {:.3f}, y: {:.3f}, z: {:.3f}". 
format(gaze_vector[0], gaze_vector[1], gaze_vector[2]), (10, 100), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 5) resized_frame = cv2.resize(frame, (640, 360)) cv2.imshow('frame', resized_frame) if frame_counter % 4 == 0: mouse_controller.move(gaze_vector[0], gaze_vector[1]) frame_processing_time = frame_processing_time + ( time.time() - frame_processing_start_time) * 1000 if key == 27: break except Exception as e: #traceback.print_exc() if 'shape' in str(e): log_obj.info("Video feed finished") else: log_obj.error("[ERROR]: " + str(e)) pass # All done, cleaning up cv2.destroyAllWindows() input_channel.close() # Print out statistics log_obj.info("[Info]: Video source FPS: " + str(fps)) log_obj.info("[Info]: Total frame count: " + str(frame_counter)) log_obj.info("") log_obj.info("[Info]: Face detection model loading time: {:.3f} ms".format( face_detection_loading_time * 1000)) log_obj.info("[Info]: Head pose model loading time: {:.3f} ms".format( head_pose_loading_time * 1000)) log_obj.info( "[Info]: Facial landmarks detection model loading time: {:.3f} ms". format(landmarks_detection_loading_time * 1000)) log_obj.info( "[Info]: Gaze estimation model loading time: {:.3f} ms".format( gaze_estimation_loading_time * 1000)) log_obj.info("") log_obj.info( "[Info]: Average per frame total processing time : {:.3f} ms".format( frame_processing_time / frame_counter)) log_obj.info("[Info]: Average face inferencing time: {:.3f} ms".format( total_face_inf_time / frame_counter)) log_obj.info( "[Info]: Average head pose inferencing time: {:.3f} ms".format( total_head_inf_time / frame_counter)) log_obj.info( "[Info]: Average facial landmarks inferencing time: {:.3f} ms".format( total_lanmarks_inf_time / frame_counter)) log_obj.info("[Info]: Average gaze estimate time: {:.3f} ms".format( total_gaze_inf_time / frame_counter))
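# draw_axes() above is an external helper. Below is a hedged sketch of the usual
# head-pose axis overlay (in the style of the OpenVINO gaze demo): the columns of
# the combined rotation matrix are the head's local x/y/z axes, projected with a
# simple pinhole model. The angle order (yaw, pitch, roll, in degrees) and the
# camera model are assumptions, not the authors' exact implementation.
import numpy as np
import cv2


def draw_axes_sketch(frame, center_of_face, yaw, pitch, roll, scale, focal_length):
    yaw, pitch, roll = (np.radians(a) for a in (yaw, pitch, roll))
    # Per-axis rotation matrices; combined head orientation R = Rz @ Ry @ Rx.
    rx = np.array([[1, 0, 0],
                   [0, np.cos(pitch), -np.sin(pitch)],
                   [0, np.sin(pitch), np.cos(pitch)]])
    ry = np.array([[np.cos(yaw), 0, -np.sin(yaw)],
                   [0, 1, 0],
                   [np.sin(yaw), 0, np.cos(yaw)]])
    rz = np.array([[np.cos(roll), -np.sin(roll), 0],
                   [np.sin(roll), np.cos(roll), 0],
                   [0, 0, 1]])
    r = rz @ ry @ rx
    cx, cy = int(center_of_face[0]), int(center_of_face[1])
    # Columns of R are the head's x (red), y (green), z (blue) axes; project
    # each scaled endpoint and draw a line from the face centre.
    for axis, color in zip(r.T, [(0, 0, 255), (0, 255, 0), (255, 0, 0)]):
        x, y, z = axis * scale
        # Offset depth by the focal length; guard against a near-zero divisor.
        z = z + focal_length if abs(z + focal_length) > 1e-6 else 1e-6
        p2 = (int(cx + focal_length * x / z), int(cy + focal_length * y / z))
        cv2.line(frame, (cx, cy), p2, color, 2)
    return frame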
def main():
    # Command line args
    args = build_argparser().parse_args()
    previewFlags = args.previewFlags

    logger = log.getLogger()
    inputFilePath = args.input
    inputFeeder = None
    if inputFilePath.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(inputFilePath):
            logger.error("Unable to locate specified video file")
            exit(1)
        inputFeeder = InputFeeder("video", inputFilePath)

    modelPathDict = {
        'FaceDetect': args.facedetection,
        'FacialDetect': args.facialdetection,
        'GazeEstimate': args.gazeestimation,
        'HeadPoseEstimation': args.headpose
    }
    for fileNameKey in modelPathDict.keys():
        if not os.path.isfile(modelPathDict[fileNameKey]):
            logger.error("Unable to find specified " + fileNameKey + " xml file")
            exit(1)

    fdm = FaceDetect(modelPathDict['FaceDetect'], args.device, args.cpu_extension)
    fldm = FacialDetect(modelPathDict['FacialDetect'], args.device, args.cpu_extension)
    gem = GazeEstimate(modelPathDict['GazeEstimate'], args.device, args.cpu_extension)
    hpem = HeadPoseEstimation(modelPathDict['HeadPoseEstimation'], args.device,
                              args.cpu_extension)

    mc = MouseController('medium', 'fast')

    inputFeeder.load_data()
    fdm.load_model()
    fldm.load_model()
    gem.load_model()
    hpem.load_model()

    frame_count = 0
    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))
        key = cv2.waitKey(60)

        croppedFace, face_coords = fdm.predict(frame.copy(), args.prob_threshold)
        if type(croppedFace) == int:
            logger.error("Face not detected.")
            if key == 27:
                break
            continue

        hp_out = hpem.predict(croppedFace.copy())
        left_eye, right_eye, eye_coords = fldm.predict(croppedFace.copy())
        new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye, hp_out)

        if not len(previewFlags) == 0:
            preview_frame = frame.copy()
            if 'fd' in previewFlags:
                preview_frame = croppedFace
            if 'fl' in previewFlags:
                cv2.rectangle(croppedFace,
                              (eye_coords[0][0] - 10, eye_coords[0][1] - 10),
                              (eye_coords[0][2] + 10, eye_coords[0][3] + 10),
                              (0, 255, 0), 3)
                cv2.rectangle(croppedFace,
                              (eye_coords[1][0] - 10, eye_coords[1][1] - 10),
                              (eye_coords[1][2] + 10, eye_coords[1][3] + 10),
                              (0, 255, 0), 3)
            if 'hp' in previewFlags:
                cv2.putText(preview_frame,
                            "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".format(
                                hp_out[0], hp_out[1], hp_out[2]),
                            (10, 20), cv2.FONT_HERSHEY_COMPLEX, 0.5, (239, 174, 0), 2)
            if 'ge' in previewFlags:
                # Draw a cross on each eye crop indicating the gaze direction,
                # then paste the crops back into the face image
                x, y, w = int(gaze_vector[0] * 12), int(gaze_vector[1] * 12), 160
                le = cv2.line(left_eye.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 0, 255), 2)
                cv2.line(le, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)
                re = cv2.line(right_eye.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 0, 255), 2)
                cv2.line(re, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)
                croppedFace[eye_coords[0][1]:eye_coords[0][3],
                            eye_coords[0][0]:eye_coords[0][2]] = le
                croppedFace[eye_coords[1][1]:eye_coords[1][3],
                            eye_coords[1][0]:eye_coords[1][2]] = re
            cv2.imshow("visualization", cv2.resize(preview_frame, (500, 500)))

        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])
        if key == 27:
            break

    logger.info("Video stream ended")
    cv2.destroyAllWindows()
    inputFeeder.close()
def main():
    # Get arguments
    args = build_argparser().parse_args()
    cursor = MouseController('medium', 'fast')

    # Set up the logger and validate the inputs
    logs = logging.getLogger()
    if args.input.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(args.input):
            logs.error("Unable to find specified video file")
        inputFeeder = InputFeeder("video", args.input)
    if not os.path.isfile(args.face_detection_model):
        logs.error("Error: face detection model's xml file not found")
    if not os.path.isfile(args.facial_landmark_model):
        logs.error("Error: facial landmark model's xml file not found")
    if not os.path.isfile(args.gaze_estimation_model):
        logs.error("Error: gaze estimation model's xml file not found")
    if not os.path.isfile(args.head_pose_model):
        logs.error("Error: head pose model's xml file not found")

    # Instantiate and check the models
    Landmark = Model_LandmarkDetection(args.facial_landmark_model, args.device,
                                       args.cpu_extension)
    Landmark.check_model()
    fdm = Model_FaceDetection(args.face_detection_model, args.device,
                              args.cpu_extension)
    fdm.check_model()
    hpem = Model_PoseEstimation(args.head_pose_model, args.device,
                                args.cpu_extension)
    hpem.check_model()
    gem = Model_GazeEstimation(args.gaze_estimation_model, args.device,
                               args.cpu_extension)
    gem.check_model()

    # Load the input feed and all models
    inputFeeder.load_data()
    Landmark.load_model()
    fdm.load_model()
    hpem.load_model()
    gem.load_model()

    # Process the stream
    f_count = 0
    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        f_count = f_count + 1
        if f_count % 5 == 0:
            cv2.imshow('Video', cv2.resize(frame, (500, 500)))

        prob_thr = args.prob_threshold
        key = cv2.waitKey(60)

        frame_a = frame.copy()
        cropped, co_ords = fdm.predict(frame_a, prob_thr)
        if type(cropped) == int:
            logs.error("Face not detected")
            if key == 27:
                break
            continue

        crop_a = cropped.copy()
        pose = hpem.predict(crop_a)
        left_eye, right_eye, box = Landmark.predict(cropped.copy())
        cursor_co, gaze_vector = gem.predict(left_eye, right_eye, pose)

        # Drawing parameters
        x, y = box[0][0] - 10, box[0][1] - 10
        x1, y1 = box[0][2] + 10, box[0][3] + 10
        x2, y2 = box[1][0] - 10, box[1][1] - 10
        x3, y3 = box[1][2] + 10, box[1][3] + 10
        color1 = (255, 255, 255)
        color2 = (237, 48, 202)
        text_pos, text_pos2 = (10, 50), (10, 100)
        fontScale = 0.6
        font = 1
        fontColor = (255, 255, 255)
        lineType = 1

        if len(args.preview_flags) != 0:
            preview_frame = frame.copy()
            if 'ldm' in args.preview_flags:
                cv2.rectangle(cropped, (x, y), (x1, y1), color1, 2)
                cv2.rectangle(cropped, (x2, y2), (x3, y3), color1, 2)
            if 'fdm' in args.preview_flags:
                cv2.rectangle(preview_frame, (co_ords[0], co_ords[1]),
                              (co_ords[2], co_ords[3]), (255, 0, 0), 3)
                preview_frame = cropped
            if 'gem' in args.preview_flags:
                # Draw gaze crosses on both eye crops and paste them back
                x = int(gaze_vector[0] * 12)
                y = int(gaze_vector[1] * 12)
                w = 160
                le = left_eye.copy()
                re = right_eye.copy()
                thick = 2
                start_a, end_a = (x - w, y - w), (x + w, y + w)
                start_b, end_b = (x - w, y + w), (x + w, y - w)
                cv2.line(left_eye, start_b, end_b, color2, thick)
                cv2.line(right_eye, start_b, end_b, color2, thick)
                left = cv2.line(le, start_a, end_a, color2, thick)
                right = cv2.line(re, start_a, end_a, color2, thick)
                a1, b1, c1, d1 = box[0][0], box[0][1], box[0][2], box[0][3]
                a2, b2, c2, d2 = box[1][0], box[1][1], box[1][2], box[1][3]
                cropped[b1:d1, a1:c1], cropped[b2:d2, a2:c2] = left, right
            if 'hpm' in args.preview_flags:
                cv2.putText(preview_frame,
                            "Angles: Roll= {:.1f} , Pitch= {:.1f} , Yaw= {:.1f}".format(
                                pose[2], pose[1], pose[0]),
                            text_pos, font, fontScale, fontColor, lineType)
            cv2.imshow("", cv2.resize(preview_frame, (500, 500)))

        if f_count % 5 == 0:
            cursor.move(cursor_co[0], cursor_co[1])
        if key == 27:
            break

    # End the program
    logs.info("Video ended.")
    inputFeeder.close()
    cv2.destroyAllWindows()
def main():
    args = build_argparser().parse_args()
    log.debug(args)

    # Load the face detection model
    faceDetection = ModelFaceDetection(args.face_detection_model, args.prob_threshold,
                                       args.device, args.cpu_extension)
    start_model_load_time = time.time()
    faceDetection.load_model()
    facedetection_model_load_time = time.time() - start_model_load_time
    log.debug('Facedetection model load time. {}'.format(facedetection_model_load_time))

    # Load the head pose estimation model
    headPoseEstimation = ModelHeadPoseEstimation(args.headpose_estimation_model,
                                                 args.prob_threshold, args.device,
                                                 args.cpu_extension)
    start_model_load_time = time.time()
    headPoseEstimation.load_model()
    headposeestimation_model_load_time = time.time() - start_model_load_time
    log.debug('Head pose estimation model load time. {}'.format(
        headposeestimation_model_load_time))

    # Load the facial landmark model
    facialLandmarkDetection = ModelFacialLandmarkDetection(args.landmarks_regression_model,
                                                           args.prob_threshold, args.device,
                                                           args.cpu_extension)
    start_model_load_time = time.time()
    facialLandmarkDetection.load_model()
    facialLandmarkDetection_model_load_time = time.time() - start_model_load_time
    log.debug('Facial landmarks detection model load time. {}'.format(
        facialLandmarkDetection_model_load_time))

    # Load the gaze estimation model
    gazeEstimation = ModelGazeEstimation(args.gaze_estimation_model, args.prob_threshold,
                                         args.device, args.cpu_extension)
    start_model_load_time = time.time()
    gazeEstimation.load_model()
    gazeEstimation_model_load_time = time.time() - start_model_load_time
    log.debug('Gaze estimation model load time. {}'.format(gazeEstimation_model_load_time))

    # Feeder
    feeder = InputFeeder(args.input)
    feeder.load_data()

    counter = 0
    window_name = 'frame'
    facedetection_inference_time_sum = 0
    headpose_inference_time_sum = 0
    faciallandmark_inference_time_sum = 0
    gazeestimation_inference_time_sum = 0
    mouse = MouseController('high', 'fast')

    # Process frames
    for frame in feeder.next_batch():
        if frame is None:
            break
        key_pressed = cv2.waitKey(1)
        if key_pressed == 27:
            break

        # Face detection
        start_inference_time = time.time()
        face_image, face_coords = faceDetection.predict(frame)
        facedetection_inference_time = time.time() - start_inference_time
        facedetection_inference_time_sum += facedetection_inference_time

        # Head pose estimation
        start_inference_time = time.time()
        yaw, pitch, roll = headPoseEstimation.predict(face_image)
        headpose_inference_time = time.time() - start_inference_time
        headpose_inference_time_sum += headpose_inference_time
        # log.debug('Head pose yaw, pitch, roll {}, {}, {}'.format(yaw, pitch, roll))

        # Facial landmarks detection
        start_inference_time = time.time()
        left_eye_image, right_eye_image, eye_coords = facialLandmarkDetection.predict(face_image)
        faciallandmark_inference_time = time.time() - start_inference_time
        faciallandmark_inference_time_sum += faciallandmark_inference_time

        # Gaze estimation
        start_inference_time = time.time()
        gaze_vector = gazeEstimation.predict(left_eye_image, right_eye_image,
                                             [yaw, pitch, roll])
        gazeestimation_inference_time = time.time() - start_inference_time
        gazeestimation_inference_time_sum += gazeestimation_inference_time
        # log.debug('Gaze Vector {}, {}'.format(gaze_vector[0], gaze_vector[1]))

        # Move the mouse every second frame
        if counter % 2 == 0:
            mouse.move(gaze_vector[0], gaze_vector[1])

        # Display the frame
        if args.show:
            font = cv2.FONT_HERSHEY_SIMPLEX
            if 0 < len(face_coords):
                # Face rectangle
                fxmin = face_coords[0][0]
                fymin = face_coords[0][1]
                fxmax = face_coords[0][2]
                fymax = face_coords[0][3]
                cv2.rectangle(frame, (fxmin, fymin), (fxmax, fymax), (200, 0, 0), 2)
                # Eye rectangles (eye coords are relative to the face crop)
                cv2.rectangle(frame,
                              (fxmin + eye_coords[0][0], fymin + eye_coords[0][1]),
                              (fxmin + eye_coords[0][2], fymin + eye_coords[0][3]),
                              (0, 200, 0), 2)
                cv2.rectangle(frame,
                              (fxmin + eye_coords[1][0], fymin + eye_coords[1][1]),
                              (fxmin + eye_coords[1][2], fymin + eye_coords[1][3]),
                              (0, 200, 0), 2)
                # Head pose axes, projected into the image plane
                length = 100
                yaw = math.radians(yaw)
                pitch = math.radians(-pitch)
                roll = math.radians(roll)
                x1 = int(length * (math.cos(yaw) * math.cos(roll)))
                y1 = int(length * (math.cos(pitch) * math.sin(roll) +
                                   math.cos(roll) * math.sin(pitch) * math.sin(yaw)))
                x2 = int(length * (-math.cos(yaw) * math.sin(roll)))
                y2 = int(length * (math.cos(pitch) * math.cos(roll) +
                                   math.sin(pitch) * math.sin(yaw) * math.sin(roll)))
                x3 = int(length * (math.sin(yaw)))
                y3 = int(length * (-math.cos(yaw) * math.sin(pitch)))
                cv2.line(frame, (fxmin, fymin), (fxmin + x1, fymin + y1), (0, 255, 0), 2)
                cv2.line(frame, (fxmin, fymin), (fxmin + x2, fymin + y2), (255, 0, 0), 2)
                cv2.line(frame, (fxmin, fymin), (fxmin + x3, fymin + y3), (0, 0, 255), 2)
                # Gaze vector
                x = int(length * gaze_vector[0])
                y = -int(length * gaze_vector[1])
                cv2.line(frame, (fxmax, fymax), (fxmax + x, fymax + y), (0, 255, 255), 5)
            else:
                cv2.putText(frame, 'Face not detected', (10, 10), font, 1,
                            (255, 255, 255), 1)
            cv2.imshow(window_name,
                       cv2.resize(frame, (int(frame.shape[1] / 3),
                                          int(frame.shape[0] / 3))))

        counter += 1

    log.debug("Face detection inference time average {}".format(
        facedetection_inference_time_sum / counter))
    log.debug("Headpose inference time average {}".format(
        headpose_inference_time_sum / counter))
    log.debug("Faciallandmark inference time average {}".format(
        faciallandmark_inference_time_sum / counter))
    log.debug("Gazeestimation inference time average {}".format(
        gazeestimation_inference_time_sum / counter))

    if args.show:
        cv2.destroyWindow(window_name)
def infer_on_stream(args, logger):
    visualizers = args.visualize
    video_file = args.input
    input_feeder = None
    if video_file.lower() == "cam":
        input_feeder = InputFeeder("cam")
    else:
        try:
            input_feeder = InputFeeder("video", video_file)
        except FileNotFoundError:
            logger.error("Unable to find specified video file")
            exit(1)
    input_feeder.load_data()

    mouse = MouseController('medium', 'fast')

    # Load the models
    fdm, fldm, gem, hpem, face_detect_loading_time, facial_detect_loading_time, \
        head_pose_estimation_loading_time, gaze_estimation_loading_time, \
        total_loading_time, status = load_models(args, logger)
    if status != 0:  # exit if any model failed to load
        exit(1)

    frame_count = 0
    # Start time of the inferencing
    start_inf_time = time.time()

    # Iterate until the stream ends or the break key is pressed
    for flag, frame in input_feeder.next_batch():
        if not flag:
            break
        frame_count += 1
        if frame_count % 3 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))
        key = cv2.waitKey(60)

        cropped_image, face_coords = fdm.predict(frame.copy(), args.prob_threshold)
        if type(cropped_image) == int:
            logger.error("Unable to detect the face.")
            if key == 27:
                break
            continue

        hp_out = hpem.predict(cropped_image.copy())
        left_eye, right_eye, eye_coords = fldm.predict(cropped_image.copy())
        mouse_coord, vector = gem.predict(left_eye, right_eye, hp_out)

        if not len(visualizers) == 0:
            preview_frame = frame.copy()
            switches = {"fd": 0, "fld": 1, "hp": 2, "ge": 3}
            for i in visualizers:
                val = switches.get(i)
                if val == 0:
                    logger.info("Visualising: Face")
                    face_detect_visualize(preview_frame, face_coords)
                if val == 1:
                    logger.info("Visualising: Facial Landmarks")
                    facial_landmarks_visualize(preview_frame, cropped_image,
                                               eye_coords, face_coords)
                if val == 2:
                    logger.info("Visualising: Head Pose")
                    headpose_visualize(preview_frame, hp_out, face_coords)
                if val == 3:
                    logger.info("Visualising: Gaze")
                    x = vector[0]
                    y = vector[1]
                    gaze_visualize(preview_frame, cropped_image, x, y, left_eye,
                                   right_eye, eye_coords, face_coords)
            cv2.imshow("visualization", cv2.resize(preview_frame, (500, 500)))

        if frame_count % 3 == 0:
            mouse.move(mouse_coord[0], mouse_coord[1])

        if key == 27:
            # If benchmarking is enabled, report timings before exiting
            if args.benchmark == "true":
                logger.info("Face Detection Model Loading Time: {}s".format(
                    face_detect_loading_time))
                logger.info("Facial Landmarks Detection Model Loading Time: {}s".format(
                    facial_detect_loading_time))
                logger.info("Head Pose Estimation Model Loading Time: {}s".format(
                    head_pose_estimation_loading_time))
                logger.info("Gaze Estimation Model Loading Time: {}s".format(
                    gaze_estimation_loading_time))
                logger.info("Total Loading Time: {}s".format(total_loading_time))

                inference_time = round(time.time() - start_inf_time, 1)
                fps = frame_count / inference_time
                logger.info("total inference time {} seconds".format(inference_time))
                logger.info("fps {} frame/second".format(fps))

                with open(os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                       'benchmark.txt'), 'w') as f:
                    f.write("Total Inference Time: " + str(inference_time) + '\n')
                    f.write("Total FPS: " + str(fps) + '\n')
                    f.write("Total Model Loading Time: " + str(total_loading_time) + '\n')
            break

    logger.info("VideoStream ended...")
    cv2.destroyAllWindows()
    input_feeder.close()
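# load_models() above is an external helper. From the unpacking at its call site
# it must return the four model objects, four per-model load times, the total
# load time, and a status code. The sketch below is a hypothetical
# reconstruction of that contract; the model class names and args attribute
# names are assumptions, not the authors' code.
import time


def load_models_sketch(args, logger):
    status = 0
    total_start = time.time()

    def timed_load(cls, path):
        nonlocal status
        try:
            model = cls(path, args.device, args.cpu_extension)
            start = time.time()
            model.load_model()
            return model, time.time() - start
        except Exception as e:
            logger.error("Failed to load %s: %s" % (cls.__name__, e))
            status = 1  # the caller exits on a non-zero status
            return None, 0.0

    fdm, fd_time = timed_load(FaceDetectionModel, args.facedetection)
    fldm, fld_time = timed_load(FacialLandmarksModel, args.faciallandmark)
    hpem, hp_time = timed_load(HeadPoseEstimationModel, args.headpose)
    gem, ge_time = timed_load(GazeEstimationModel, args.gazeestimation)
    total = time.time() - total_start
    # Return order matches the unpacking at the call site above
    return fdm, fldm, gem, hpem, fd_time, fld_time, hp_time, ge_time, total, status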
def run_app(args):
    face_detection_model = Model_Face_Detection(args.model_path_fd, args.device,
                                                args.cpu_extension,
                                                threshold=args.threshold)
    face_detection_model.load_model()
    head_pose_model = Model_Head_Pose_Estimation(args.model_path_hp, args.device,
                                                 args.cpu_extension)
    head_pose_model.load_model()
    face_landmark_model = Model_Facial_Landmarks(args.model_path_fl, args.device,
                                                 args.cpu_extension)
    face_landmark_model.load_model()
    gaze_model = Model_Gaze_Estimation(args.model_path_ge, args.device,
                                       args.cpu_extension)
    gaze_model.load_model()

    input_feeder = InputFeeder(args.input_type, args.input_file)
    input_feeder.load_data()
    mouse_controller = MouseController("medium", "fast")

    frame_count = 0
    custom = args.toggle
    for frame in input_feeder.next_batch():
        if frame is None:
            break
        key_pressed = cv2.waitKey(60)
        frame_count += 1

        face_out, cords = face_detection_model.predict(frame.copy())
        # When no face was detected
        if cords == 0:
            inf_info = "No Face Detected in the Frame"
            write_text_img(frame, inf_info, 400)
            continue

        eyes_cords, left_eye, right_eye = face_landmark_model.predict(face_out.copy())
        head_pose_out = head_pose_model.predict(face_out.copy())
        gaze_out = gaze_model.predict(left_eye, right_eye, head_pose_out)
        # Failure in processing both eyes
        if gaze_out is None:
            continue

        x, y = gaze_out
        if frame_count % 5 == 0:
            mouse_controller.move(x, y)

        inf_info = "Head Pose (y: {:.2f}, p: {:.2f}, r: {:.2f})".format(
            head_pose_out[0], head_pose_out[1], head_pose_out[2])

        # Process visualization
        if 'frame' in custom:
            visualization(frame, cords, face_out, eyes_cords)
        if 'stats' in custom:
            write_text_img(face_out, inf_info, 400)
            inf_info = "Gaze Angle: x: {:.2f}, y: {:.2f}".format(x, y)
            log.info("Statistic " + inf_info)
            write_text_img(face_out, inf_info, 400, 15)
        if 'gaze' in custom:
            display_head_pose(frame, head_pose_out, cords)

        out_f = np.hstack((cv2.resize(frame, (400, 400)),
                           cv2.resize(face_out, (400, 400))))
        cv2.imshow('Visualization', out_f)

        if key_pressed == 27:
            break

    input_feeder.close()
    cv2.destroyAllWindows()
def main():
    args = build_argparser().parse_args()
    logger = logging.getLogger('main')
    is_benchmarking = False

    # Initialize variables with the input arguments for easy access
    model_path_dict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'LandmarkRegressionModel': args.landmarkRegressionModel,
        'HeadPoseEstimationModel': args.headPoseEstimationModel,
        'GazeEstimationModel': args.gazeEstimationModel
    }
    preview_flags = args.previewFlags
    input_filename = args.input
    device_name = args.device
    prob_threshold = args.prob_threshold
    output_path = args.output_path

    if input_filename.lower() == 'cam':
        feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            logger.error("Unable to find specified video file")
            exit(1)
        feeder = InputFeeder(input_type='video', input_file=input_filename)

    for model_path in list(model_path_dict.values()):
        if not os.path.isfile(model_path):
            logger.error("Unable to find specified model file " + str(model_path))
            exit(1)

    # Instantiate the models
    face_detection_model = FaceDetectionModel(model_path_dict['FaceDetectionModel'],
                                              device_name, threshold=prob_threshold)
    landmark_detection_model = LandmarkDetectionModel(model_path_dict['LandmarkRegressionModel'],
                                                      device_name, threshold=prob_threshold)
    head_pose_estimation_model = HeadPoseEstimationModel(model_path_dict['HeadPoseEstimationModel'],
                                                         device_name, threshold=prob_threshold)
    gaze_estimation_model = GazeEstimationModel(model_path_dict['GazeEstimationModel'],
                                                device_name, threshold=prob_threshold)

    if not is_benchmarking:
        mouse_controller = MouseController('medium', 'fast')

    # Load the models
    start_model_load_time = time.time()
    face_detection_model.load_model()
    landmark_detection_model.load_model()
    head_pose_estimation_model.load_model()
    gaze_estimation_model.load_model()
    total_model_load_time = time.time() - start_model_load_time

    feeder.load_data()

    out_video = cv2.VideoWriter(os.path.join('output_video.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'),
                                int(feeder.get_fps() / 10), (1920, 1080), True)

    frame_count = 0
    start_inference_time = time.time()
    for ret, frame in feeder.next_batch():
        if not ret:
            break
        frame_count += 1
        key = cv2.waitKey(60)

        try:
            face_cords, cropped_image = face_detection_model.predict(frame)
            if type(cropped_image) == int:
                logger.warning("Unable to detect the face")
                if key == 27:
                    break
                continue
            left_eye_image, right_eye_image, eye_cords = landmark_detection_model.predict(cropped_image)
            pose_output = head_pose_estimation_model.predict(cropped_image)
            mouse_cord, gaze_vector = gaze_estimation_model.predict(left_eye_image,
                                                                    right_eye_image,
                                                                    pose_output)
        except Exception as e:
            logger.warning("Could not predict using model " + str(e) +
                           " for frame " + str(frame_count))
            continue

        image = cv2.resize(frame, (500, 500))
        if not len(preview_flags) == 0:
            preview_frame = draw_preview(frame, preview_flags, cropped_image,
                                         left_eye_image, right_eye_image,
                                         face_cords, eye_cords, pose_output,
                                         gaze_vector)
            image = np.hstack((cv2.resize(frame, (500, 500)),
                               cv2.resize(preview_frame, (500, 500))))
        cv2.imshow('preview', image)
        out_video.write(frame)

        if frame_count % 5 == 0 and not is_benchmarking:
            mouse_controller.move(mouse_cord[0], mouse_cord[1])

        if key == 27:
            break

    total_time = time.time() - start_inference_time
    total_inference_time = round(total_time, 1)
    fps = frame_count / total_inference_time

    try:
        os.mkdir(output_path)
    except OSError as error:
        logger.error(error)

    with open(os.path.join(output_path, 'stats.txt'), 'w') as f:
        f.write(str(total_inference_time) + '\n')
        f.write(str(fps) + '\n')
        f.write(str(total_model_load_time) + '\n')

    logger.info('Model load time: ' + str(total_model_load_time))
    logger.info('Inference time: ' + str(total_inference_time))
    logger.info('FPS: ' + str(fps))
    logger.info('Video stream ended')
    cv2.destroyAllWindows()
    feeder.close()
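# Nearly every main() in this file constructs MouseController(precision, speed)
# and calls .move(x, y) with the gaze vector's x/y components. The class itself
# is not shown; below is a minimal sketch of the common pyautogui-based pattern.
# The precision/speed mappings are assumptions, not the authors' exact values.
import pyautogui


class MouseControllerSketch:
    """Hypothetical stand-in for the MouseController used above."""
    PRECISION = {'high': 100, 'medium': 500, 'low': 1000}
    SPEED = {'fast': 1, 'medium': 5, 'slow': 10}

    def __init__(self, precision, speed):
        self.precision = self.PRECISION[precision]
        self.speed = self.SPEED[speed]

    def move(self, x, y):
        # Gaze y points up while screen y points down, hence the sign flip.
        pyautogui.moveRel(x * self.precision, -1 * y * self.precision,
                          duration=self.speed)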
def main(args):
    logger = logging.getLogger()
    feeder = None
    if args.input_type == constants.VIDEO or args.input_type == constants.IMAGE:
        extension = str(args.input).split('.')[1]
        # if not extension.lower() in constants.ALLOWED_EXTENSIONS:
        #     logger.error('Please provide supported extension.' + str(constants.ALLOWED_EXTENSIONS))
        #     exit(1)
        # if not os.path.isfile(args.input):
        #     logger.error("Unable to find specified video/image file")
        #     exit(1)
        feeder = InputFeeder(args.input_type, args.input)
    elif args.input_type == constants.IP_CAMERA:
        if not str(args.input).startswith('http://'):
            logger.error('Please provide ip of server with http://')
            exit(1)
        feeder = InputFeeder(args.input_type, args.input)
    elif args.input_type == constants.WEBCAM:
        feeder = InputFeeder(args.input_type)

    mc = MouseController("medium", "fast")
    feeder.load_data()

    face_model = Face_Model(args.face, args.device, args.cpu_extension)
    face_model.check_model()
    landmark_model = Landmark_Model(args.landmarks, args.device, args.cpu_extension)
    landmark_model.check_model()
    gaze_model = Gaze_Estimation_Model(args.gazeestimation, args.device, args.cpu_extension)
    gaze_model.check_model()
    head_model = Head_Pose_Model(args.headpose, args.device, args.cpu_extension)
    head_model.check_model()

    face_model.load_model()
    logger.info("Face Detection Model Loaded...")
    landmark_model.load_model()
    logger.info("Landmark Detection Model Loaded...")
    gaze_model.load_model()
    logger.info("Gaze Estimation Model Loaded...")
    head_model.load_model()
    logger.info("Head Pose Detection Model Loaded...")
    print('Loaded')

    try:
        frame_count = 0
        for ret, frame in feeder.next_batch():
            if not ret:
                break
            if frame is None:
                continue
            frame_count += 1

            crop_face, box = face_model.predict(frame.copy())
            if crop_face is None:
                logger.error("Unable to detect the face.")
                continue
            imshow('frame', crop_face, width=400)

            (lefteye_x, lefteye_y), (righteye_x, righteye_y), eye_coords, \
                left_eye, right_eye = landmark_model.predict(crop_face.copy(),
                                                             eye_surrounding_area=15)
            # imshow("left_eye", left_eye, width=100)
            # imshow("right_eye", right_eye, width=100)
            '''TODO: dlib crops the eyes with more precision'''

            head_position = head_model.predict(crop_face.copy())

            gaze, (mousex, mousey) = gaze_model.predict(left_eye.copy(),
                                                        right_eye.copy(),
                                                        head_position)

            if len(args.debug) > 0:
                debuFrame = frame.copy()
                thickness = 2
                radius = 2
                color = (0, 0, 255)
                [[le_xmin, le_ymin, le_xmax, le_ymax],
                 [re_xmin, re_ymin, re_xmax, re_ymax]] = eye_coords

                if 'face' in args.debug:
                    cv2.rectangle(debuFrame, (box[0], box[1]), (box[2], box[3]),
                                  (255, 255, 255), 2)
                    cv2.rectangle(crop_face, (re_xmin, re_ymin), (re_xmax, re_ymax),
                                  (100, 255, 100), 2)
                    cv2.rectangle(crop_face, (le_xmin, le_ymin), (le_xmax, le_ymax),
                                  (100, 255, 100), 2)
                    # Landmarks
                    cv2.circle(crop_face, (lefteye_x, lefteye_y), radius, color, thickness)
                    cv2.circle(crop_face, (righteye_x, righteye_y), radius, color, thickness)
                    debuFrame[box[1]:box[3], box[0]:box[2]] = crop_face

                if 'headpose' in args.debug:
                    yaw = head_position[0]
                    pitch = head_position[1]
                    roll = head_position[2]
                    sinY = math.sin(yaw * math.pi / 180.0)
                    sinP = math.sin(pitch * math.pi / 180.0)
                    sinR = math.sin(roll * math.pi / 180.0)
                    cosY = math.cos(yaw * math.pi / 180.0)
                    cosP = math.cos(pitch * math.pi / 180.0)
                    cosR = math.cos(roll * math.pi / 180.0)
                    cH, cW = crop_face.shape[:2]
                    arrowLength = 0.4 * cH * cW
                    xCenter = int(cW / 2)
                    yCenter = int(cH / 2)
                    # center to right
                    # cv2.line(crop_face, (xCenter, yCenter),
                    #          (int(xCenter + arrowLength * (cosR * cosY + sinY * sinP * sinR)),
                    #           int(yCenter + arrowLength * cosP * sinR)), (186, 204, 2), 1)
                    # center to top
                    # cv2.line(crop_face, (xCenter, yCenter),
                    #          (int(xCenter + arrowLength * (cosR * sinY * sinP + cosY * sinR)),
                    #           int(yCenter - arrowLength * cosP * cosR)), (186, 204, 2), 1)
                    # center to forward
                    # cv2.line(crop_face, (xCenter, yCenter),
                    #          (int(xCenter + arrowLength * sinY * cosP),
                    #           int(yCenter + arrowLength * sinP)), (186, 204, 2), 1)
                    cv2.putText(crop_face,
                                'head pose: (y={:.2f}, p={:.2f}, r={:.2f})'.format(yaw, pitch, roll),
                                (0, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.35, (255, 255, 255), 1)

                if 'gaze' in args.debug:
                    cH, cW = crop_face.shape[:2]
                    arrowLength = 0.6 * cH
                    gazeArrowX = gaze[0] * arrowLength
                    gazeArrowY = -gaze[1] * arrowLength
                    cv2.arrowedLine(crop_face, (lefteye_x, lefteye_y),
                                    (int(lefteye_x + gazeArrowX), int(lefteye_y + gazeArrowY)),
                                    (184, 113, 57), 2)
                    cv2.arrowedLine(crop_face, (righteye_x, righteye_y),
                                    (int(righteye_x + gazeArrowX), int(righteye_y + gazeArrowY)),
                                    (184, 113, 57), 2)
                    cv2.putText(crop_face,
                                'gaze angles: h={:.2f}, v={:.2f}'.format(gaze[0], gaze[1]),
                                (0, 10), cv2.FONT_HERSHEY_SIMPLEX, 0.35, (255, 255, 255), 1)
                    debuFrame[box[1]:box[3], box[0]:box[2]] = crop_face

                # imshow("face", crop_face, width=400)
                # cv2.moveWindow("face", 0, 0)
                # imshow("debug", debuFrame, width=400)
                # cv2.moveWindow("debug", cW * 2, cH)

                # try:
                #     if frame_count % 5 == 0:
                #         mc.move(mousex, mousey)
                # except Exception as err:
                #     logger.error("Moving cursor outside the PC not supported yet !!")

                imshow('frame', debuFrame, width=1210)

            if cv2.waitKey(20) & 0xFF == ord('q'):
                break
    except Exception as err:
        logger.error(err)

    cv2.destroyAllWindows()
    feeder.close()
def main():
    try:
        args = build_argparser().parse_args()
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s [%(levelname)s] %(message)s",
            handlers=[
                logging.FileHandler("computer-pointer-controller.log"),
                logging.StreamHandler()
            ])
        print_output_frame = args.print_output_frame
        logger = logging.getLogger()
        input_file_path = args.input
        feeder = None
        if input_file_path.lower() == "cam":
            feeder = InputFeeder("cam")
        else:
            if not os.path.isfile(input_file_path):
                logger.error("Unable to find specified video file")
                exit(1)
            feeder = InputFeeder("video", input_file_path)

        mc = MouseController('low', 'fast')
        feeder.load_data()

        modelPathDict = {
            'FaceDetectionModel': args.face,
            'FacialLandmarksDetectionModel': args.landmark,
            'GazeEstimationModel': args.gazeestimation,
            'HeadPoseEstimationModel': args.headpose
        }
        for fileNameKey in modelPathDict.keys():
            if not os.path.isfile(modelPathDict[fileNameKey] + '.xml'):
                logger.error("Unable to find specified " + fileNameKey + " xml file")
                exit(1)

        logging.info("============== Models Load time ===============")
        face_detection = FaceDetection(args.face, args.device, args.prob_threshold,
                                       args.cpu_extension)
        start_time = time.time()
        face_detection.load_model()
        logging.info("Face Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))

        landmarks_detection = FacialLandmarksDetection(args.landmark, args.device,
                                                       args.cpu_extension)
        start_time = time.time()
        landmarks_detection.load_model()
        logging.info("Facial Landmarks Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))

        gaze_estimation = GazeEstimation(args.gazeestimation, args.device,
                                         args.cpu_extension)
        start_time = time.time()
        gaze_estimation.load_model()
        logging.info("Gaze Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))

        headpose_estimation = HeadPoseEstimation(args.headpose, args.device,
                                                 args.cpu_extension)
        start_time = time.time()
        headpose_estimation.load_model()
        logging.info("Headpose Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        logging.info("============== End =====================")

        frame_count = 0
        fd_infertime = 0
        lm_infertime = 0
        hp_infertime = 0
        ge_infertime = 0
        for ret, frame in feeder.next_batch():
            if not ret:
                break
            frame_count += 1
            key = cv2.waitKey(60)

            start_time = time.time()
            cropped_face, face_coords = face_detection.predict(frame.copy())
            fd_infertime += time.time() - start_time
            if len(cropped_face) == 0:
                logger.error("Unable to detect the face.")
                continue

            start_time = time.time()
            headpose_out = headpose_estimation.predict(cropped_face.copy())
            hp_infertime += time.time() - start_time

            start_time = time.time()
            left_eye, right_eye, eye_coords = landmarks_detection.predict(cropped_face.copy())
            lm_infertime += time.time() - start_time

            start_time = time.time()
            new_mouse_coord, gaze_vector = gaze_estimation.predict(left_eye, right_eye,
                                                                   headpose_out)
            ge_infertime += time.time() - start_time

            if print_output_frame:
                preview_frame = frame.copy()
                if 'fd' in print_output_frame:
                    preview_frame = cropped_face
                    cv2.rectangle(frame, (face_coords[0], face_coords[1]),
                                  (face_coords[2], face_coords[3]), (255, 0, 0), 3)
                if 'fl' in print_output_frame:
                    cv2.rectangle(cropped_face,
                                  (eye_coords[0][0], eye_coords[0][1]),
                                  (eye_coords[0][2], eye_coords[0][3]), (0, 255, 0), 2)
                    cv2.rectangle(cropped_face,
                                  (eye_coords[1][0], eye_coords[1][1]),
                                  (eye_coords[1][2], eye_coords[1][3]), (0, 255, 0), 2)
                if 'hp' in print_output_frame:
                    cv2.putText(cropped_face,
                                "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".format(
                                    headpose_out[0], headpose_out[1], headpose_out[2]),
                                (0, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.25, (0, 0, 0), 1)
                    face = frame[face_coords[1]:face_coords[3],
                                 face_coords[0]:face_coords[2]]
                    xmin, ymin, _, _ = face_coords
                    face_center = (xmin + face.shape[1] / 2,
                                   ymin + face.shape[0] / 2, 0)
                    headpose_estimation.draw_axes(frame, face_center, headpose_out[0],
                                                  headpose_out[1], headpose_out[2])
                if 'ge' in print_output_frame:
                    cropped_h, cropped_w = cropped_face.shape[:2]
                    arrow_length = 0.3 * cropped_h
                    gaze_arrow_x = gaze_vector[0] * arrow_length
                    gaze_arrow_y = -gaze_vector[1] * arrow_length
                    cv2.arrowedLine(cropped_face,
                                    (eye_coords[0][0], eye_coords[0][1]),
                                    (int(eye_coords[0][2] + gaze_arrow_x),
                                     int(eye_coords[0][3] + gaze_arrow_y)),
                                    (0, 255, 0), 2)
                    cv2.arrowedLine(cropped_face,
                                    (eye_coords[1][0], eye_coords[1][1]),
                                    (int(eye_coords[1][2] + gaze_arrow_x),
                                     int(eye_coords[1][3] + gaze_arrow_y)),
                                    (0, 255, 0), 2)
                    # frame[face_coords[1]:face_coords[3], face_coords[0]:face_coords[2]] = cropped_face

                if len(preview_frame) != 0:
                    img_hor = np.hstack((cv2.resize(preview_frame, (800, 800)),
                                         cv2.resize(frame, (800, 800))))
                else:
                    img_hor = cv2.resize(frame, (800, 800))
                cv2.imshow("Monitor", img_hor)

            if frame_count % 5 == 0:
                mc.move(new_mouse_coord[0], new_mouse_coord[1])
            if key == 27:
                break

        # Log average inference times
        if frame_count > 0:
            logging.info("============== Models Inference time ===============")
            logging.info("Face Detection:{:.1f}ms".format(1000 * fd_infertime / frame_count))
            logging.info("Facial Landmarks Detection:{:.1f}ms".format(
                1000 * lm_infertime / frame_count))
            logging.info("Headpose Estimation:{:.1f}ms".format(
                1000 * hp_infertime / frame_count))
            logging.info("Gaze Estimation:{:.1f}ms".format(
                1000 * ge_infertime / frame_count))
            logging.info("============== End ===============================")

        logger.info("Video stream ended...")
        cv2.destroyAllWindows()
        feeder.close()
    except Exception as ex:
        logging.exception("Error in inference: {} {}".format(type(ex), ex.args))
def main(args): # Multiple Modes of Control ########################### ## 0 = No Control ## 1 = Gaze Angle Control ## 2 = Head Pose Control ## 3 = Sound Control ########################### controlMode = 0 modes = ['No Control', 'Gaze Control', 'Head Pose', 'Sound Control'] #################### # Control Commands # #################### # Left Click = Yawn # Right Click = Looking up # Increment Control Modes = Right Wink # Left Eye Wink and Smile are left unassigned # You can dictate text in Sound mode (Control mode = 3) ##################################################################### # Initializing the Speech Recognition Thread ##################################################################### # You can add more controls as you deem fit. numbers = ['zero', 'one', 'two', 'three', 'four', \ 'five', 'six', 'seven', 'eight', 'nine'] controls = ['left', 'right', 'up', 'down'] control_syn = {} for control in controls: control_syn.setdefault(control, []) # Need to account for similar sounding words as speech recog is on the edge! control_syn['left'].extend(['let', 'left', 'light', 'live', 'laugh']) control_syn['right'].extend( ['right', 'write', 'great', 'fight', 'might', 'ride']) control_syn['up'].extend(['up', 'hop', 'hope', 'out']) control_syn['down'].extend(['down', 'doubt', 'though']) device_list = load_device() stream_reader = audio_helper.StreamReader(device_list[1][0], received_frames) if not stream_reader.initialize(): print("Failed to initialize Stream Reader") speech.close() speech = None return speech = SpeechManager() print('speech config = ' + str(SPEECH_CONFIG)) if not speech.initialize(SPEECH_CONFIG, infer_device='CPU', batch_size=8): print("Failed to initialize ASR recognizer") speech.close() speech = None return stt = Queue() prevUtterance = '' reading_thread = Thread(target=stream_reader.read_stream, \ args=(speech, stt), daemon=True) reading_thread.start() ##################################################################### # Fixing 60x60 box as yaw and pitch boundaries to # correspond to head turning left and right (yaw) # and also moving up and down (pitch) headYawPitchBounds = [-30, 30] lastGaze = [0, 0] lastPose = [0, 0] # Set the stickiness value stickinessHead = 5 stickinessGaze = 10 eventText = "No Event" # init the logger logger = logging.getLogger() feeder = None feeder = InputFeeder(args.input_type, args.input) feeder.load_data() mc = MouseController("medium", "fast") # Loading all the gesture control models viz. 
face, head and gaze face_model = FaceDetector(args.face, args.device, args.cpu_extension) # face_model.check_model() face_model.load_model() logger.info("Face Detection Model Loaded...") head_model = HeadPoseDetect(args.headpose, args.device, args.cpu_extension) # head_model.check_model() head_model.load_model() logger.info("Head Pose Detection Model Loaded...") landmarks_model = LandmarksDetect(args.landmarks, args.device, args.cpu_extension) # landmarks_model.check_model() landmarks_model.load_model() logger.info("Landmarks Detection Model Loaded...") gaze_model = GazeDetect(args.gazeestimation, args.device, args.cpu_extension) # gaze_model.check_model() gaze_model.load_model() logger.info("Gaze Detection Model Loaded...") visualizeHeadPose = bool(distutils.util.strtobool(args.visualizeHeadPose)) visualizeGaze = bool(distutils.util.strtobool(args.visualizeGaze)) visualizeFace = bool(distutils.util.strtobool(args.visualizeFace)) pixelCount_leye = [] isEyeOpen_leye = [] pixelCount_reye = [] isEyeOpen_reye = [] isCalibrated = False isSmiling = False isMouthOpen = False moveEnabled = False islookingUp = False lastPoses = collections.deque(maxlen=20) lastGazes = collections.deque(maxlen=20) try: frame_count = 0 for ret, frame in feeder.next_batch(): ################################################################ # if any sound is deciphered from the spunned off thread then # check the last 3 words of the utterance for matching control word if (stt.qsize() > 0 and controlMode == 3): utterance = stt.get() print("From Parent: " + utterance) # need to process again only if change in utterance if (prevUtterance != utterance): control, lastWord = detectSoundEvent( utterance, controls, control_syn) if control is not None: direction = controls.index(control) mc.moveRelative(direction) else: if lastWord in numbers: lastWord = str(numbers.index(lastWord)) mc.write(lastWord) prevUtterance = utterance ################################################################ k = cv2.waitKey(1) & 0xFF # press 'q' to exit if k == ord('q'): break if not ret: break frame_count += 1 crop_face = None # inferenceBegin = time.time() crop_face, box = face_model.predict(frame.copy()) if crop_face is None: logger.error("Unable to detect the face.") continue # Draw the face box xmin, ymin, xmax, ymax = box if visualizeFace: cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (255, 0, 255), 3) orientation = head_model.predict(crop_face) box_left, box_right, \ left_eye, right_eye, \ p0, p1, p12, p13, p14, \ p2, p3, p15, p16, p17, \ p8, p9, p10, p11 = landmarks_model.predict(crop_face) # if any of the eye is not detected eye gesture and # gaze estimation are not executed if (left_eye.size * right_eye.size == 0): logger.error("Unable to detect eyes.") continue pad = 10 # Compute Right Eye: Close Snap right_eye_ball = frame[ymin + p1[1] - pad:ymin + p0[1] + pad, xmin + p1[0] - pad:xmin + p0[0] + pad] # Compute Left Eye: Close Snap left_eye_ball = frame[ymin + p3[1] - pad:ymin + p2[1] + pad, xmin + p2[0] - pad:xmin + p3[0] + pad] # pixelCount_leye_bk = pixelCount_leye #can delete this line pixelCount_reye, Rtrigger, probR = findClosurebyStats( 'Right', right_eye_ball, pixelCount_reye, frame_count) pixelCount_leye, Ltrigger, probL = findClosurebyStats( 'Left', left_eye_ball, pixelCount_leye, frame_count) print("probL: " + str(probL)) if probL < -30 and islookingUp is False: print('Click Right') controlMode = hikeControlMode(controlMode) ## to change # mc.clickRight() islookingUp = True eventText = 'Increment Control Mode' elif probL > 
0: islookingUp = False if (eventText == 'Increment Control Mode'): eventText = 'No Event' # If both eyes are detected as pressed (as one eye # can shrink when the other eye is winked) then check # which eye has higher probability of closure. # Note: To close both eyes is not a gesture. if Ltrigger and Rtrigger: # print("probR = " + str(probR) + "probL = " + str(probL)) if probR > probL: Ltrigger = False else: Rtrigger = False # If you want to enable left and right wink actions, # then call corresponding functions here. if Ltrigger: print('left eye pressed') # controlMode = dipControlMode(controlMode) # writeList(pixelCount_leye_bk) # Dumping list for debugging purpose # mc.scroll(20) # you can pass the head pose up/down as param # mc.drag() if Rtrigger: print('right eye pressed') # controlMode = hikeControlMode(controlMode) # mc.clickRight() gaze, (x, y) = gaze_model.predict(left_eye, right_eye, orientation) # inferenceEnd = time.time() # inferenceTime = inferenceEnd - inferenceBegin # print("Inference Time of 4 models = " + str(inferenceTime)) yaw = orientation[0] pitch = orientation[1] roll = orientation[2] sinY = math.sin(yaw * math.pi / 180.0) sinP = math.sin(pitch * math.pi / 180.0) sinR = math.sin(roll * math.pi / 180.0) cosY = math.cos(yaw * math.pi / 180.0) cosP = math.cos(pitch * math.pi / 180.0) cosR = math.cos(roll * math.pi / 180.0) cH, cW = crop_face.shape[:2] arrowLength = 0.5 * max(cH, cW) # Drawing Eye Boxes (p0_x, p0_y) = box_left[:2] (p12_x, p12_y) = box_left[2:4] cv2.rectangle(frame, (p0_x + xmin, p0_y + ymin), (p12_x + xmin, p12_y + ymin - 5), (255, 0, 0), 3) (p2_x, p2_y) = box_right[:2] (p17_x, p17_y) = box_right[2:4] cv2.rectangle(frame, (p2_x + xmin, p2_y + ymin), (p17_x + xmin, p17_y + ymin - 5), (255, 0, 0), 3) # to draw the eye points as circles cv2.circle(frame, tuple(map(operator.add, p0, (xmin, ymin))), 1, (255, 0, 0), 2) cv2.circle(frame, tuple(map(operator.add, p1, (xmin, ymin))), 1, (255, 0, 0), 2) cv2.circle(frame, tuple(map(operator.add, p12, (xmin, ymin))), 1, (255, 0, 0), 2) cv2.circle(frame, tuple(map(operator.add, p13, (xmin, ymin))), 1, (255, 0, 0), 2) cv2.circle(frame, tuple(map(operator.add, p14, (xmin, ymin))), 1, (255, 0, 0), 2) # to draw the eye points as circles cv2.circle(frame, tuple(map(operator.add, p2, (xmin, ymin))), 1, (255, 0, 0), 2) cv2.circle(frame, tuple(map(operator.add, p3, (xmin, ymin))), 1, (255, 0, 0), 2) cv2.circle(frame, tuple(map(operator.add, p15, (xmin, ymin))), 1, (255, 0, 0), 2) cv2.circle(frame, tuple(map(operator.add, p16, (xmin, ymin))), 1, (255, 0, 0), 2) cv2.circle(frame, tuple(map(operator.add, p17, (xmin, ymin))), 1, (255, 0, 0), 2) # to draw mouth points cv2.circle(frame, tuple(map(operator.add, p8, (xmin, ymin))), 1, (255, 0, 0), 2) cv2.circle(frame, tuple(map(operator.add, p9, (xmin, ymin))), 1, (255, 0, 0), 2) cv2.circle(frame, tuple(map(operator.add, p10, (xmin, ymin))), 1, (255, 0, 0), 2) cv2.circle(frame, tuple(map(operator.add, p11, (xmin, ymin))), 1, (255, 0, 0), 2) # Finding Eye Center xCenter_left = int((p0_x + p12_x) / 2) + xmin yCenter_left = int((p0_y + p12_y) / 2) + ymin leftEye_Center = (xCenter_left, yCenter_left) # Finding Eye Center xCenter_right = int((p2_x + p17_x) / 2) + xmin yCenter_right = int((p2_y + p17_y) / 2) + ymin rightEye_Center = (xCenter_right, yCenter_right) ############# DRAWING DIRECTION ARROWS BASED ON HEAD POSITION ############ ## Euler angles to cartesian coordinates# # https://stackoverflow.com/questions/1568568/how-to-convert-euler-angles-to-directional-vector # Total 
rotation matrix is: (See correct matrix in blog) # | cos(yaw)cos(pitch) -cos(yaw)sin(pitch)sin(roll)-sin(yaw)cos(roll) -cos(yaw)sin(pitch)cos(roll)+sin(yaw)sin(roll)| # | sin(yaw)cos(pitch) -sin(yaw)sin(pitch)sin(roll)+cos(yaw)cos(roll) -sin(yaw)sin(pitch)cos(roll)-cos(yaw)sin(roll)| # | sin(pitch) cos(pitch)sin(roll) cos(pitch)sin(roll)| if visualizeHeadPose or controlMode == 2 or isCalibrated is False: # yaw and pitch are important for mouse control poseArrowX = orientation[0] #* arrowLength poseArrowY = orientation[1] #* arrowLength # Taking 2nd and 3rd row for 2D Projection ##############################LEFT EYE ################################### # cv2.arrowedLine(frame, leftEye_Center, # (int((xCenter_left + arrowLength * (cosR * cosY + sinY * sinP * sinR))), # int((yCenter_left + arrowLength * cosP * sinR))), (255, 0, 0), 4) # # center to top # cv2.arrowedLine(frame, leftEye_Center, # (int(((xCenter_left + arrowLength * (sinY * sinP * cosR - cosY * sinR)))), # int((yCenter_left + arrowLength * cosP * cosR))), (0, 0, 255), 4) # center to forward # cv2.arrowedLine(frame, leftEye_Center, \ # (int(((xCenter_left + arrowLength * sinY * cosP))), \ # int((yCenter_left - arrowLength * sinP))), (0, 255, 0), 4) ##############################RIGHT EYE ################################### # cv2.arrowedLine(frame, rightEye_Center, # (int((xCenter_right + arrowLength * (cosR * cosY + sinY * sinP * sinR))), # int((yCenter_right + arrowLength * cosP * sinR))), (255, 0, 0), 4) # # center to top # cv2.arrowedLine(frame, rightEye_Center, # (int(((xCenter_right + arrowLength * (sinY * sinP * cosR - cosY * sinR)))), # int((yCenter_right + arrowLength * cosP * cosR))), (0, 0, 255), 4) # center to forward # cv2.arrowedLine(frame, rightEye_Center, # (int(((xCenter_right + arrowLength * sinY * cosP))), # int((yCenter_right - arrowLength * sinP))), (0, 255, 0), 4) # gaze is required for calibration if visualizeGaze or controlMode == 1 or isCalibrated is False: gazeArrowX = gaze[0] * arrowLength gazeArrowY = -gaze[1] * arrowLength cv2.arrowedLine(frame, leftEye_Center, (int(leftEye_Center[0] + gazeArrowX), int(leftEye_Center[1] + gazeArrowY)), (0, 255, 0), 4) cv2.arrowedLine(frame, rightEye_Center, (int(rightEye_Center[0] + gazeArrowX), int(rightEye_Center[1] + gazeArrowY)), (0, 255, 0), 4) ############################### # Compute Mouth Aspect Ratio # ############################### mouthWidth = p9[0] - p8[0] mouthHeight = p11[1] - p10[1] if (mouthWidth != 0): mAspRatio = mouthHeight / mouthWidth else: mAspRatio = 0 # print('MAR RATIO = ' + str(mAspRatio)) # To validate face is properly facing the camera. # To avoid erroneous control mode switches coz of face turns. if (isFaceInBounds(yaw, pitch) and mAspRatio > 0): # These threshold constants need to either modified or made dynamic. # # when mouth is open if mAspRatio > 0.4 and isMouthOpen is False: # mouthHeights.clear() # isSoundControl = False print('clicking left') mc.clickLeft() isMouthOpen = True eventText = 'Click Left' elif mAspRatio < 0.35: isMouthOpen = False if (eventText == 'Click Left'): eventText = 'No Event' # when mouth is wide, i.e. smiling if mAspRatio < 0.26 and isSmiling == False: print('You are smiling...') eventText = 'Smiling' isSmiling = True elif mAspRatio > 0.3: # Reset the click flag once smile is over. isSmiling = False if (eventText == 'Smiling'): eventText = 'No Event' # controlMode = 3 # To debug a specific control mode. 
try: if frame_count % 5 == 0: if (mc.calibrated is False): isCalibrated = mc.captureCorners( gazeArrowX, gazeArrowY) else: # Face should be forward facing in order to take commands. # if (isFaceInBounds(headYawPitchBounds, yaw, pitch)): if controlMode == 1: moveEnabled, lastGazes = \ isMoveEnabled(lastGaze, stickinessGaze, gazeArrowX, gazeArrowY, lastGazes) if moveEnabled: print('moving mouse with gaze') mc.moveWithGaze(gazeArrowX, gazeArrowY) lastGaze = [gazeArrowX, gazeArrowY ] #saving pos for stickiness elif controlMode == 2: moveEnabled, lastPoses = \ isMoveEnabled(lastPose, stickinessHead, poseArrowX, poseArrowY, lastPoses) if moveEnabled: print('moving mouse with head. Yaw: ' + str(poseArrowX) + " Pitch: " + str(poseArrowY) + " Roll: " + str(orientation[2])) mc.moveWithHead(poseArrowX, poseArrowY, headYawPitchBounds) lastPose = [poseArrowX, poseArrowY ] #saving pos for stickiness except Exception as err: print(traceback.format_exc()) PrintException() logger.error("Exception occurred while moving cursor!") # Display calibration status on video if isCalibrated: frame = cv2.putText(frame, 'Calibration is done.', (20, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA) frame = cv2.putText(frame, 'Control Mode: ' + modes[controlMode], (20, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA) frame = cv2.putText(frame, 'Event: ' + eventText, (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1, cv2.LINE_AA) frame = cv2.putText(frame, 'MAR: ' + str(round(mAspRatio, 2)), (20, 90), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1, cv2.LINE_AA) frame = cv2.putText(frame, 'Mouse Loc: ' + str(mc.getLocation()), (20, 110), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1, cv2.LINE_AA) imshow('frame', frame, width=800) # frameEnd = time.time() # frameTime = frameEnd - frameBegin # print("FPS = " + str(1/frameTime)) except Exception as err: print(traceback.format_exc()) PrintException() logger.error(err) cv2.destroyAllWindows() feeder.close()
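
# ---------------------------------------------------------------------------
# The gesture-control loop above relies on an isMoveEnabled helper plus a
# "stickiness" threshold so the cursor stays still while the gaze or pose
# jitters around its last committed position. The helper itself is not shown
# in the excerpt; below is a minimal sketch consistent with its call
# signature (last position, pixel threshold, new position, deque of recent
# positions). The distance test and names are assumptions.
# ---------------------------------------------------------------------------
import math


def is_move_enabled(last_pos, stickiness, new_x, new_y, recent_positions):
    """Return (move_enabled, recent_positions).

    The move is enabled only when the new point has drifted farther than
    `stickiness` pixels from the last committed position, so small tremors
    are ignored.
    """
    recent_positions.append((new_x, new_y))
    distance = math.hypot(new_x - last_pos[0], new_y - last_pos[1])
    return distance > stickiness, recent_positions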
def main(): args = build_argparser().parse_args() single_image_mode = (args.input_type == 'image') #Create and validate input feed input_feed = InputFeeder(input_type=args.input_type,input_file=args.input_path) input_feed.load_data() if not input_feed.is_open(): log.critical('Error opening input, check --input_path parameter (use --help for more info)') sys.exit(1) #Load models face_model = FaceDetection(args.face_detection_model) face_model.load_model(args.device) head_pose_model = HeadPoseEstimation(args.head_pose_model) head_pose_model.load_model(args.device) facial_landmarks_model = FacialLandmarksDetection(args.facial_landmarks_model) facial_landmarks_model.load_model(args.device) gaze_estimation_model = GazeEstimation(args.gaze_estimation_model) gaze_estimation_model.load_model(args.device) #initialize frame count for filtering count = 0 gaze_vector_accum = np.array([0,0,0],dtype='float64') gaze_vector_filtered = np.array([0,0,0],dtype='float64') #get screen calibration if not args.calibrate: run_calibration = False cal_x_limits, cal_y_limits = utils.get_calibration() else: run_calibration = True update_display = True #squares to draw on screen for calibration top_left_square = {'pt1':(0,0), 'pt2':(BOX_SIDE_LENGTH,BOX_SIDE_LENGTH)} top_right_square = {'pt1':(SCREEN_WIDTH - BOX_SIDE_LENGTH,0), 'pt2':(SCREEN_WIDTH, BOX_SIDE_LENGTH)} bottom_left_square = {'pt1':(0,SCREEN_HEIGHT - BOX_SIDE_LENGTH), 'pt2':(BOX_SIDE_LENGTH,SCREEN_HEIGHT)} bottom_right_square = {'pt1':(SCREEN_WIDTH - BOX_SIDE_LENGTH,SCREEN_HEIGHT - BOX_SIDE_LENGTH), 'pt2':(SCREEN_WIDTH,SCREEN_HEIGHT)} cal_squares = [top_left_square,top_right_square,bottom_left_square, bottom_right_square] #names of the calibration points for storing on calibration file cal_names = ['top_left', 'top_right', 'bottom_left', 'bottom_right'] #model output values for each calibration point will be stored here cal_points = {} square_iter = iter(cal_squares) name_iter = iter(cal_names) #image to display on screen for calibration base_img = get_base_img("LOOK AT THE SQUARES FOR 2 SECONDS","AND THEN PRESS n", COLORS[0]) if not single_image_mode: while True: #filter results count += 1 if(count>FILTER_QUANTITY): gaze_vector_filtered=gaze_vector_accum/FILTER_QUANTITY gaze_vector_accum=np.array([0,0,0],dtype='float64') count=0 #process frames frame = next(input_feed.next_batch()) start_time=time.time() face_boxes = run_inference(frame, face_model) cropped_faces = utils.crop_image(frame,face_boxes) if cropped_faces==0: #no face detected, nothing to process continue elif cropped_faces is None: #finished reading input feed break elif len(cropped_faces)==1: #found a single face in the frame, proceed head_pose = run_inference(cropped_faces[0], head_pose_model) eye_boxes = run_inference(cropped_faces[0], facial_landmarks_model) cropped_eyes = utils.crop_image(cropped_faces[0], eye_boxes) gaze_vector = run_inference_gaze(cropped_eyes[0], cropped_eyes[1], head_pose, gaze_estimation_model) inference_time=time.time()-start_time gaze_vector_accum += gaze_vector if run_calibration: if update_display: img = np.copy(base_img) square = next(square_iter, None) if not square is None: cv2.rectangle(img,square['pt1'], square['pt2'],COLORS[0],-1) update_display=False else: #Done with calibration cal_x_limits, cal_y_limits = utils.get_calibration(cal_points) utils.save_calibration(cal_points) run_calibration=False utils.imshow_fullscreen('window',img) if cv2.waitKey(1) & 0xFF == ord('n'): update_display = True point = np.array([ gaze_vector_filtered[0], 
gaze_vector_filtered[1] ]) point_name = next(name_iter) cal_points[point_name] = point else: if not args.display_all: img = get_base_img("GAZE CONTROL ENABLED", "MOVE MOUSE TO ANY CORNER OR PRESS q TO EXIT", COLORS[1]) utils.imshow_fullscreen('window',img) else: utils.display_inference_results(frame, face_boxes, head_pose, gaze_vector, inference_time) if cv2.waitKey(1) & 0xFF == ord('q'): print("User terminated program, goodbye") break screen_x, screen_y = get_screen_position(gaze_vector_filtered[0], gaze_vector_filtered[1], cal_x_limits, cal_y_limits) try: pyautogui.moveTo(screen_x,screen_y,MOUSE_MOVE_TIME) except pyautogui.FailSafeException: print("User terminated program, goodbye") break else: #Handle multiple people here if needed log.critical("ERROR: Multiple people detected, only single person supported") sys.exit(1) else: #Implement single image mode here if needed log.critical("ERROR: Single image mode not implemented") sys.exit(1) input_feed.close() cv2.destroyAllWindows()
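
# ---------------------------------------------------------------------------
# The calibration flow above records the filtered gaze vector while the user
# looks at each screen corner, then get_screen_position maps a live gaze
# reading into pixel coordinates. The mapping is not shown in the excerpt; a
# linear interpolation between the calibrated x/y limits is the simplest
# form consistent with the code above. This sketch assumes each limit is a
# (min, max) pair and reuses the SCREEN_WIDTH/SCREEN_HEIGHT constants that
# implementation already defines; the axis flip is also an assumption.
# ---------------------------------------------------------------------------

def get_screen_position(gaze_x, gaze_y, cal_x_limits, cal_y_limits):
    """Linearly map a gaze reading into screen pixels using calibration."""
    x_min, x_max = cal_x_limits
    y_min, y_max = cal_y_limits
    # Normalize into [0, 1] and clamp so the cursor stays on screen.
    nx = min(max((gaze_x - x_min) / (x_max - x_min), 0.0), 1.0)
    ny = min(max((gaze_y - y_min) / (y_max - y_min), 0.0), 1.0)
    # Gaze y typically grows upward while screen y grows downward.
    return int(nx * SCREEN_WIDTH), int((1.0 - ny) * SCREEN_HEIGHT)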
class Inferencer:
    def __init__(self, device='CPU', mouse_con=False, face_dec=None,
                 fac_land=None, head_pose=None, gaze=None, show_video=False,
                 save_video=False):
        '''
        all models should be put in here
        '''
        if face_dec and fac_land and head_pose and gaze:
            self.face_dec = FaceDetectionModel(face_dec, device=device)
            self.fac_land = FacialLandmarksDetection(fac_land, device=device)
            self.head_pose = Head_Pose_Estimation(head_pose, device=device)
            self.gaze = Gaze_Estimation(gaze, device=device)
            self.face_dec.load_model()
            self.fac_land.load_model()
            self.head_pose.load_model()
            self.gaze.load_model()
        else:
            raise ValueError('Missing Arguments')
        # Always set the attribute so the truthiness check in run() cannot
        # raise AttributeError when mouse control is disabled.
        self.mouse_con = MouseController("low", "fast") if mouse_con else None
        self.show_video, self.save_video = show_video, save_video

    def __call__(self, input_type=None, input_file=None):
        self.run(input_type=input_type, input_file=input_file)

    def run(self, input_type=None, input_file=None):
        if input_type and input_file:
            self.input_ = InputFeeder(input_type, input_file)
            self.input_.load_data()
            if self.save_video:
                out = cv2.VideoWriter(
                    'output.mp4', 0x00000021, 30,
                    (int(self.input_.cap.get(3)),
                     int(self.input_.cap.get(4))))
            try:
                fc_dec_inf_time = 0
                landmark_inf_time = 0
                pose_inf_time = 0
                gaze_inf_time = 0
                frame_counter = 0
                while True:
                    # Read the next frame
                    try:
                        frame = next(self.input_.next_batch())
                        frame_counter += 1
                    except StopIteration:
                        break
                    key_pressed = cv2.waitKey(60)
                    # face detection
                    start = time.time()
                    out_frame, boxes = self.face_dec.predict(
                        frame, display_output=True)
                    fc_dec_inf_time += (time.time() - start)
                    # for each box
                    for box in boxes:
                        face = out_frame[box[1]:box[3], box[0]:box[2]]
                        start = time.time()
                        out_frame, left_eye_point, right_eye_point = self.fac_land.predict(
                            out_frame, face, box, display_output=True)
                        landmark_inf_time += (time.time() - start)
                        start = time.time()
                        out_frame, headpose_angels = self.head_pose.predict(
                            out_frame, face, box, display_output=True)
                        pose_inf_time += (time.time() - start)
                        start = time.time()
                        out_frame, gazevector = self.gaze.predict(
                            out_frame, face, box, left_eye_point,
                            right_eye_point, headpose_angels,
                            display_output=True)
                        gaze_inf_time += (time.time() - start)
                        if self.show_video:
                            cv2.imshow('im', out_frame)
                        if self.save_video:
                            out.write(out_frame)
                        if self.mouse_con:
                            self.mouse_con.move(gazevector[0], gazevector[1])
                            time.sleep(1)
                        # consider only first detected face in the frame
                        break
                    # Break if escape key pressed
                    if key_pressed == 27:
                        break
                if self.save_video:
                    out.release()
                self.input_.close()
                cv2.destroyAllWindows()
                print('average inference time for face detection model is :- {:.2f}ms'
                      .format((fc_dec_inf_time / frame_counter) * 1000))
                print('average inference time for facial landmark model is :- {:.2f}ms'
                      .format((landmark_inf_time / frame_counter) * 1000))
                print('average inference time for head pose estimation model is :- {:.2f}ms'
                      .format((pose_inf_time / frame_counter) * 1000))
                print('average inference time for gaze estimation model is :- {:.2f}ms'
                      .format((gaze_inf_time / frame_counter) * 1000))
            except Exception as ex:
                logging.exception("Error in inference: " + str(ex))
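
# ---------------------------------------------------------------------------
# Example usage of the Inferencer class above. The model paths are
# illustrative placeholders for the downloaded OpenVINO IR files, not paths
# shipped with the project.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    inferencer = Inferencer(
        device='CPU',
        mouse_con=True,
        face_dec='models/face-detection-adas-0001',
        fac_land='models/landmarks-regression-retail-0009',
        head_pose='models/head-pose-estimation-adas-0001',
        gaze='models/gaze-estimation-adas-0002',
        show_video=True)
    inferencer(input_type='video', input_file='bin/demo.mp4')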
def main():
    # Grab command line args
    args = read_argument().parse_args()
    log.basicConfig(level=log.INFO)
    logger_obj = log.getLogger()
    if args.input == 'CAM':
        input_feeder = InputFeeder('cam')
    elif args.input.endswith('jpg') or args.input.endswith('bmp'):
        input_feeder = InputFeeder('image', args.input)
    elif args.input.endswith('mp4'):
        input_feeder = InputFeeder('video', args.input)
    else:
        logger_obj.error(
            "Unsupported input, valid inputs are image(jpg and bmp), video file(mp4) or webcam/video stream."
        )
        exit(1)

    # Initialize inference models
    face_detection_model = face_detection(args.face_detection_model,
                                          args.device, args.prob_threshold,
                                          args.cpu_extension)
    facial_landmarks_detection_model = facial_landmarks_detection(
        args.facial_landmarks_detection, args.device, args.cpu_extension)
    head_pose_estimation_model = head_pose_estimation(
        args.head_pose_estimation, args.device, args.cpu_extension)
    gaze_estimation_model = gaze_estimation(args.gaze_estimation, args.device,
                                            args.cpu_extension)
    mouse_controller = MouseController('medium', 'fast')

    # Load inference models. The timer is reset before each load so every
    # model's load time is reported individually rather than cumulatively.
    total_start_time = time.time()
    start_time = time.time()
    face_detection_model.load_model()
    logger_obj.info("Face detection load time: {:.2f} ms".format(
        (time.time() - start_time) * 1000))
    start_time = time.time()
    facial_landmarks_detection_model.load_model()
    logger_obj.info("Facial landmark detection load time: {:.2f} ms".format(
        (time.time() - start_time) * 1000))
    start_time = time.time()
    head_pose_estimation_model.load_model()
    logger_obj.info("Head pose estimation load time: {:.2f} ms".format(
        (time.time() - start_time) * 1000))
    start_time = time.time()
    gaze_estimation_model.load_model()
    logger_obj.info("Gaze estimation load time: {:.2f} ms".format(
        (time.time() - start_time) * 1000))

    # Load input feeder.
    input_feeder.load_data()
    total_model_load_time = time.time() - total_start_time
    counter = 0
    inference_start_time = time.time()
    # run inference
    for flag, frame in input_feeder.next_batch():
        if not flag:
            break
        pressed_key = cv2.waitKey(60)
        counter = counter + 1
        face_detection_output, coords = face_detection_model.predict(frame)
        head_pose_estimation_output = head_pose_estimation_model.predict(
            face_detection_output)
        left_eye_image, right_eye_image, eye_coord = facial_landmarks_detection_model.predict(
            face_detection_output)
        mouse_controller_coordinate, gaze_estimation_vector = gaze_estimation_model.predict(
            left_eye_image, right_eye_image, head_pose_estimation_output)
        preview_flag = args.previewFlags
        if len(preview_flag) != 0:
            preview_window = frame.copy()
            if 'face_detect' in preview_flag:
                cv2.rectangle(preview_window, (coords[0], coords[1]),
                              (coords[2], coords[3]), (0, 0, 255), 3)
            if 'face_landmark_detect' in preview_flag:
                if 'face_detect' in preview_flag:
                    preview_window = face_detection_output
                cv2.rectangle(preview_window,
                              (eye_coord[0][0], eye_coord[0][1]),
                              (eye_coord[0][2], eye_coord[0][3]),
                              (255, 0, 255))
                cv2.rectangle(preview_window,
                              (eye_coord[1][0], eye_coord[1][1]),
                              (eye_coord[1][2], eye_coord[1][3]),
                              (255, 0, 255))
            if 'head_pose' in preview_flag:
                cv2.putText(
                    preview_window,
                    "yaw:{:.1f} | pitch:{:.1f} | roll:{:.1f}".format(
                        head_pose_estimation_output[0],
                        head_pose_estimation_output[1],
                        head_pose_estimation_output[2]), (20, 20),
                    cv2.FONT_HERSHEY_COMPLEX, 0.35, (0, 0, 0), 1)
            if 'gaze_est' in preview_flag:
                yaw = head_pose_estimation_output[0]
                pitch = head_pose_estimation_output[1]
                roll = head_pose_estimation_output[2]
                focal_length = 950
                scale = 50
                center_of_face = (face_detection_output.shape[1] / 2,
                                  face_detection_output.shape[0] / 2, 0)
                if 'face_detect' in preview_flag or 'face_landmark_detect' in preview_flag:
                    draw_axes(preview_window, center_of_face, yaw, pitch,
                              roll, scale, focal_length)
                else:
                    draw_axes(frame, center_of_face, yaw, pitch, roll,
                              scale, focal_length)
        if len(preview_flag) != 0:
            # image = np.hstack((cv2.resize(frame, (500, 500)), cv2.resize(preview_window, (500, 500))))
            image = cv2.resize(preview_window, (500, 500))
        else:
            image = cv2.resize(frame, (500, 500))
        cv2.imshow('Visualization', image)
        mouse_controller.move(mouse_controller_coordinate[0],
                              mouse_controller_coordinate[1])
        if pressed_key == 27:
            logger_obj.info("exit key is pressed..")
            break

    inference_time = round(time.time() - inference_start_time, 2)
    fps = int(counter) / inference_time
    logger_obj.info("Frames processed: {}".format(counter))
    logger_obj.info("Total model load time: {:.2f} s".format(
        total_model_load_time))
    logger_obj.info("Total inference time: {:.2f} s".format(inference_time))
    logger_obj.info("fps {}".format(fps))
    input_feeder.close()
    cv2.destroyAllWindows()
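
# ---------------------------------------------------------------------------
# draw_axes is called above but not defined in the excerpt. A sketch in the
# spirit of Intel's head-pose demos follows: build the rotation matrix from
# yaw/pitch/roll, push three unit axes through it, and project them back
# onto the image with the same focal_length and scale the caller passes.
# Sign conventions and axis colors are assumptions.
# ---------------------------------------------------------------------------
import cv2
import numpy as np


def draw_axes(frame, center_of_face, yaw, pitch, roll, scale, focal_length):
    yaw *= np.pi / 180.0
    pitch *= np.pi / 180.0
    roll *= np.pi / 180.0
    cx, cy = int(center_of_face[0]), int(center_of_face[1])
    Rx = np.array([[1, 0, 0],
                   [0, np.cos(pitch), -np.sin(pitch)],
                   [0, np.sin(pitch), np.cos(pitch)]])
    Ry = np.array([[np.cos(yaw), 0, -np.sin(yaw)],
                   [0, 1, 0],
                   [np.sin(yaw), 0, np.cos(yaw)]])
    Rz = np.array([[np.cos(roll), -np.sin(roll), 0],
                   [np.sin(roll), np.cos(roll), 0],
                   [0, 0, 1]])
    R = Rz @ Ry @ Rx
    # Axis endpoints in head coordinates, scaled for visibility.
    axes = scale * np.array([[1, 0, 0], [0, -1, 0], [0, 0, -1]], dtype=float).T
    offset = np.array([[0.0], [0.0], [focal_length]])  # push in front of camera
    colors = ((0, 0, 255), (0, 255, 0), (255, 0, 0))
    for axis, color in zip((R @ axes + offset).T, colors):
        # Perspective projection back onto the image plane.
        x = int(axis[0] * focal_length / axis[2]) + cx
        y = int(axis[1] * focal_length / axis[2]) + cy
        cv2.line(frame, (cx, cy), (x, y), color, 2)
    return frame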
def run_controller(args):
    # print(args.save)
    feeder = None
    if args.input == "cam":
        feeder = InputFeeder("cam")
    elif args.input.endswith('.jpg') or args.input.endswith('.bmp'):
        if not os.path.isfile(args.input):
            log.error("Unable to find specified image file")
            exit(1)
        feeder = InputFeeder("image", args.input, args.save)
    else:
        if not os.path.isfile(args.input):
            log.error("Unable to find specified video file")
            exit(1)
        feeder = InputFeeder("video", args.input, args.save)
    feeder.load_data()

    mc = MouseController('medium', 'fast')
    model_face = Face_Detector()
    model_face.load_model(args.model_fd, args.device, args.extension)
    model_pose = Pose_Estimator()
    model_pose.load_model(args.model_pe, args.device, args.extension)
    model_landmark = Facial_Landmarks()
    model_landmark.load_model(args.model_fl, args.device, args.extension)
    model_gaze = Gaze_Estimator()
    model_gaze.load_model(args.model_ge, args.device, args.extension)

    frame_count = 0
    for b, frame in feeder.next_batch():
        frame_count += 1
        preview = np.copy(frame)
        crop_face, face_count, points = model_face.predict(
            preview, args.thres_fd)
        key_pressed = cv2.waitKey(30)
        if face_count == 0:
            if b or key_pressed == 27:
                break
            log.error('no face is detected')
            feeder.save_file(preview)
            continue
        angles = model_pose.predict(preview, crop_face)
        left_eye, right_eye, eye_points = model_landmark.predict(
            preview, crop_face, points)
        mx, my = model_gaze.predict(preview, left_eye, right_eye, angles,
                                    eye_points)
        feeder.save_file(preview)
        if key_pressed == 27:
            break
        if frame_count % 5 == 0:
            if args.draw_lines:
                cv2.imshow('video', cv2.resize(preview, (500, 500)))
            else:
                cv2.imshow('video', cv2.resize(frame, (500, 500)))
            mc.move(mx, my)
    feeder.close()
    cv2.destroyAllWindows()
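
# ---------------------------------------------------------------------------
# model_face.predict above filters detections against args.thres_fd. The
# face-detection IRs used throughout these examples are SSD-style networks
# whose raw output has shape [1, 1, N, 7], each row being
# [image_id, label, conf, xmin, ymin, xmax, ymax] with coordinates
# normalized to [0, 1]. A minimal sketch of that parsing step (the helper
# name and return shape are assumptions, not this project's API):
# ---------------------------------------------------------------------------

def parse_face_boxes(output, frame_w, frame_h, threshold):
    """Convert raw SSD output to pixel-space boxes above the threshold."""
    boxes = []
    for det in output[0][0]:            # iterate over the N detections
        conf = float(det[2])
        if conf >= threshold:
            xmin = int(det[3] * frame_w)
            ymin = int(det[4] * frame_h)
            xmax = int(det[5] * frame_w)
            ymax = int(det[6] * frame_h)
            boxes.append((xmin, ymin, xmax, ymax))
    return boxes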
def main():
    args = build_argparser().parse_args()
    previewFlags = args.previewFlags
    logger = logging.getLogger()
    inputFile = args.input
    inputFeeder = None
    # The camera branch must be exclusive: without the else, a "cam" input
    # would also be checked against the filesystem and the program would
    # exit even though the camera is available.
    if inputFile.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(inputFile):
            logger.error("Unable to find input file")
            exit(1)
        inputFeeder = InputFeeder("video", inputFile)

    mfd = Model_Face_Detection(args.facedetectionmodel, args.device,
                               args.cpu_extension)
    mfld = Model_Facial_Landmarks_Detection(args.faciallandmarkmodel,
                                            args.device, args.cpu_extension)
    mge = Model_Gaze_Estimation(args.gazeestimationmodel, args.device,
                                args.cpu_extension)
    mhpe = Model_Head_Pose_Estimation(args.headposemodel, args.device,
                                      args.cpu_extension)
    mc = MouseController('medium', 'fast')

    inputFeeder.load_data()
    mfd.load_model()
    mfld.load_model()
    mge.load_model()
    mhpe.load_model()

    frame_count = 0
    for ret, frame in inputFeeder.next_batch():
        # Stop once the feeder runs out of frames instead of spinning forever.
        if frame is None:
            break
        frame_count += 1
        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))
        key = cv2.waitKey(60)
        croppedFace, face_coords = mfd.predict(frame.copy(),
                                               args.prob_threshold)
        if type(croppedFace) == int:
            logger.error("No face detected.")
            if key == 27:
                break
            continue
        hp_out = mhpe.predict(croppedFace.copy())
        left_eye, right_eye, eye_coords = mfld.predict(croppedFace.copy())
        new_mouse_coord, gaze_vector = mge.predict(left_eye, right_eye,
                                                   hp_out)
        if len(previewFlags) != 0:
            preview_frame = frame.copy()
            if 'fd' in previewFlags:
                preview_frame = croppedFace
            if 'fld' in previewFlags:
                cv2.rectangle(croppedFace,
                              (eye_coords[0][0] - 10, eye_coords[0][1] - 10),
                              (eye_coords[0][2] + 10, eye_coords[0][3] + 10),
                              (0, 255, 0), 3)
                cv2.rectangle(croppedFace,
                              (eye_coords[1][0] - 10, eye_coords[1][1] - 10),
                              (eye_coords[1][2] + 10, eye_coords[1][3] + 10),
                              (0, 255, 0), 3)
            if 'hp' in previewFlags:
                cv2.putText(
                    preview_frame,
                    "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".format(
                        hp_out[0], hp_out[1], hp_out[2]), (10, 20),
                    cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1)
            if 'ge' in previewFlags:
                x, y, w = int(gaze_vector[0] * 12), int(gaze_vector[1] * 12), 160
                le = cv2.line(left_eye.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 0, 255), 2)
                cv2.line(le, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)
                re = cv2.line(right_eye.copy(), (x - w, y - w),
                              (x + w, y + w), (255, 0, 255), 2)
                cv2.line(re, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)
                croppedFace[eye_coords[0][1]:eye_coords[0][3],
                            eye_coords[0][0]:eye_coords[0][2]] = le
                croppedFace[eye_coords[1][1]:eye_coords[1][3],
                            eye_coords[1][0]:eye_coords[1][2]] = re
            cv2.imshow("visualization", cv2.resize(preview_frame, (500, 500)))
        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])
        if key == 27:
            break
    logger.error("video ended...")
    cv2.destroyAllWindows()
    inputFeeder.close()
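
# ---------------------------------------------------------------------------
# mfld.predict above returns eye crops plus eye boxes that the loop pads by
# 10 px for drawing. The landmark model used across this project family
# (landmarks-regression-retail-0009) outputs a blob of five (x, y) pairs
# normalized to the face crop. A sketch of turning that blob into the eye
# boxes the loop consumes; the helper name and exact return layout are
# assumptions, not the class's internals.
# ---------------------------------------------------------------------------

def eyes_from_landmarks(output, face_w, face_h, pad=10):
    points = output.reshape(-1)          # flatten to [x0, y0, x1, y1, ...]
    lx, ly = int(points[0] * face_w), int(points[1] * face_h)  # left eye
    rx, ry = int(points[2] * face_w), int(points[3] * face_h)  # right eye
    left_box = (lx - pad, ly - pad, lx + pad, ly + pad)
    right_box = (rx - pad, ry - pad, rx + pad, ry + pad)
    return left_box, right_box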
def main(): args = get_args() log.basicConfig(filename='example.log', level=log.DEBUG) inputFile = args.input #inputFile = "./bin/demo.mp4" mouse = MouseController("high", "fast") frame_count = 0 focal_length = 950.0 scale = 50 #print(f"Visual flag: {args.visual_flag}") if inputFile.lower() == "cam": feed = InputFeeder('cam') log.info("Video source: " + str(inputFile)) else: if not os.path.isfile(inputFile): log.error("Unable to find file: " + inputFile) exit(1) feed = InputFeeder("video", inputFile) log.info("Video source: " + str(inputFile)) log.info("InputFeeder initialized") log.info("Device: " + str(args.device)) log.info("Face detection model: " + str(args.facedetectionmodel)) log.info("Facial landmarks model: " + str(args.faciallandmarksmodel)) log.info("Head pose estimation model: " + str(args.headposemodel)) log.info("Gaze estimation model: " + str(args.gazeestimationmodel)) if args.stats == 1: print("Running statistics...") inference_times = [] fdm_inference_times = [] hpm_inference_times = [] flm_inference_times = [] gem_inference_times = [] start_time = time.time() # Create instances of the different models fdm = FaceDetector(args.facedetectionmodel, args.device, args.cpu_extension) if args.stats == 1: start_time = time.time() fdm.load_model() fdm_load_time = time.time() - start_time else: fdm.load_model() fdm.check_model() hpm = HeadPoseEstimator(args.headposemodel, args.device, args.cpu_extension) if args.stats == 1: start_time = time.time() hpm.load_model() hpm_load_time = time.time() - start_time else: hpm.load_model() hpm.check_model() flm = FacialLandmarksDetector(args.faciallandmarksmodel, args.device, args.cpu_extension) if args.stats == 1: start_time = time.time() flm.load_model() flm_load_time = time.time() - start_time else: flm.load_model() flm.check_model() gem = GazeEstimator(args.gazeestimationmodel, args.device, args.cpu_extension) if args.stats == 1: start_time = time.time() gem.load_model() gem_load_time = time.time() - start_time else: gem.load_model() gem.check_model() if args.stats == 1: duration_loading = time.time() - start_time print( f"Duration for loading and checking the models: {duration_loading}" ) log.info( f"Duration for loading and checking the models: {duration_loading}" ) cv2.namedWindow('preview', cv2.WINDOW_NORMAL) cv2.resizeWindow('preview', 600, 600) feed.load_data() for ret, frame in feed.next_batch(): if not ret: break if frame is not None: frame_count += 1 key = cv2.waitKey(60) if args.stats == 1: start_time = time.time() # Run face detection face_crop, face_coords = fdm.predict(frame.copy()) print("Face crop shape: " + str(face_crop.shape)) frame_h, frame_w = frame.shape[:2] (xmin, ymin, xmax, ymax) = face_coords face_frame = frame[ymin:ymax, xmin:xmax] #center_of_face = (xmin + face_frame.shape[1] / 2, ymin + face_frame.shape[0] / 2, 0) # 0 for colour channel #print("Center of face " + str(center_of_face)) try: # Check if face was detected if type(face_coords) == int: print("Unable to detect face") if key == 27: break continue # Facial landmark detection left_eye_crop, right_eye_crop, landmarks, crop_coords = flm.predict( face_crop.copy()) #print("Landmarks" +str(landmarks)) left_eye = (landmarks[0], landmarks[1]) right_eye = (landmarks[2], landmarks[3]) # Landmark position based on complete frame landmarks_viz = landmarks landmarks_viz[0] = landmarks_viz[0] + xmin landmarks_viz[1] = landmarks_viz[1] + ymin landmarks_viz[2] = landmarks_viz[2] + xmin landmarks_viz[3] = landmarks_viz[3] + ymin crop_coords_viz = (crop_coords[0] + xmin, 
crop_coords[1] + ymin, crop_coords[2] + xmin, crop_coords[3] + ymin, crop_coords[4] + xmin, crop_coords[5] + ymin, crop_coords[6] + xmin, crop_coords[7] + ymin) left_eye_viz = (landmarks_viz[0], landmarks_viz[1]) right_eye_viz = (landmarks_viz[2], landmarks_viz[3]) third_eye_viz_x = (landmarks_viz[2] - landmarks_viz[0]) / 2 + landmarks_viz[0] third_eye_viz_y = (landmarks_viz[3] - landmarks_viz[1]) / 2 + landmarks_viz[1] third_eye_viz = (third_eye_viz_x, third_eye_viz_y) #print(landmarks_viz[0], landmarks_viz[2], third_eye_viz_x) # Head pose estimation head_pose = hpm.predict(face_crop.copy()) print("Head pose: " + str(head_pose)) (yaw, pitch, roll) = head_pose frame = display_head_pose(frame, pitch, roll, yaw) # Send inputs to GazeEstimator gaze_vector = gem.predict(head_pose, left_eye_crop, right_eye_crop) if args.stats == 1: inference_time = time.time() - start_time inference_times.append(inference_time) print(gaze_vector) frame = display_gaze(frame, gaze_vector) # Control the mouse if frame_count % 5 == 0: mouse_x, mouse_y = get_mouse_vector(gaze_vector, roll) print("Mouse vector:" + str(mouse_x) + " - " + str(mouse_y)) mouse.move(mouse_x, mouse_y) currentMouseX, currentMouseY = pyautogui.position() print("Mouse coordinates: " + str(currentMouseX) + ", " + str(currentMouseY)) if args.visual_flag == 1: frame = draw_bounding_box(frame, face_coords) left_eye_frame = crop_coords_viz[0:4] right_eye_frame = crop_coords_viz[4:] frame = draw_bounding_box(frame, left_eye_frame) frame = draw_bounding_box(frame, right_eye_frame) frame = visualize_landmark(frame, left_eye_viz) frame = visualize_landmark(frame, right_eye_viz, color=(0, 0, 255)) frame = visualize_gaze(frame, gaze_vector, landmarks_viz) # visualize the axes of the HeadPoseEstimator results #frame = hpm.draw_axes(frame.copy(), center_of_face, yaw, pitch, roll, scale, focal_length) frame = hpm.draw_axes(frame.copy(), third_eye_viz, yaw, pitch, roll, scale, focal_length) #hdm.draw_axes(frame.copy(), center_of_face, yaw, pitch, roll, scale, focal_length) cv2.imshow('preview', frame) cv2.imshow('left eye', left_eye_crop) cv2.imshow('right eye', right_eye_crop) except Exception as e: print("Unable to predict using model" + str(e) + " for frame " + str(frame_count)) log.error("Unable to predict using model" + str(e) + " for frame " + str(frame_count)) continue if args.stats == 1: avg_inference_time = sum(inference_times) / len(inference_times) print("Average inference time: " + str(avg_inference_time)) log.info("Average inference time: " + str(avg_inference_time)) log.info("Load time for face detection model: " + str(fdm_load_time)) log.info("Load time for facial landmarks model: " + str(flm_load_time)) log.info("Load time for head pose detection model: " + str(hpm_load_time)) log.info("Load time for gaze estimation model: " + str(gem_load_time)) cv2.destroyAllWindows() feed.close()
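
# ---------------------------------------------------------------------------
# get_mouse_vector above converts the gaze vector into mouse deltas using
# the head's roll angle; its body is not shown in the excerpt. The usual
# approach is to rotate the (x, y) gaze components by the roll so that
# tilting the head does not skew the cursor direction. A sketch under that
# assumption:
# ---------------------------------------------------------------------------
import math


def get_mouse_vector(gaze_vector, roll):
    cs = math.cos(roll * math.pi / 180.0)
    sn = math.sin(roll * math.pi / 180.0)
    # Rotate the gaze components back by the roll angle.
    mouse_x = gaze_vector[0] * cs + gaze_vector[1] * sn
    mouse_y = -gaze_vector[0] * sn + gaze_vector[1] * cs
    return mouse_x, mouse_y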
def infer_on_video(args): args.ct = float(args.ct) input_file = args.i # Check if 'cam' or video file was chosen? if input_file.lower() == 'cam': i_feeder = InputFeeder(input_type='cam') else: if not os.path.isfile(input_file): log.error( "Wasn't able to find video file, please correct directory!") exit(1) i_feeder = InputFeeder(input_type='video', input_file=input_file) # Load image/frame of chosen medium i_feeder.load_data() # Initialize the Inference Engine for each model fd_plugin = Fd_Network() lr_plugin = Lr_Network() hp_plugin = Hp_Network() ge_plugin = Ge_Network() # Load the network models into the IE and get the net input shape start_load_time = time.time() fd_plugin.load_model(args.fdm, args.d) lr_plugin.load_model(args.lrm, args.d) hp_plugin.load_model(args.hpm, args.d) ge_plugin.load_model(args.gem, args.d) total_load_time = time.time() - start_load_time log.info("Time it took to load all models: " + str(total_load_time)) mouse_controller = MouseController('medium', 'fast') # Get net input shape of models fd_net_input_shape = fd_plugin.get_input_shape() lr_net_input_shape = lr_plugin.get_input_shape() hp_net_input_shape = hp_plugin.get_input_shape() # Currently not used as it didn't return the needed shape correctly for gaze estimation # ge_net_input_shape = ge_plugin.get_input_shape() # frame_count for FPS calc and start_inf_time, to calc total inference time frame_count = 0 start_inf_time = time.time() # Process frames until the video ends, or process is exited for ret, frame in i_feeder.next_batch(): if not ret: break frame_count += 1 key_pressed = cv2.waitKey(60) height, width = frame.shape[:2] ##### FACE-DETECTION #START# # Pre-process the frame fd_frame = preprocess_image(frame, fd_net_input_shape[3], fd_net_input_shape[2], "face-detection") # Perform inference on the frame fd_plugin.async_inference(fd_frame) # Get the output of inference if fd_plugin.wait() == 0: result = fd_plugin.extract_output() # Get frame with bounding box for face, a cropped version and it's coords cropped_face, coords_face = detect_face(frame, result, args, width, height) ##### FACE-DETECTION #END# ##### LANDMARK REGRESSION MODEL #START# lr_frame = preprocess_image(cropped_face, lr_net_input_shape[3], lr_net_input_shape[2], "landmark-regression") lr_plugin.async_inference(lr_frame) if lr_plugin.wait() == 0: lr_result = lr_plugin.extract_output() l_eye_img, r_eye_img, eye_coords = preprocess_lr_output( lr_result, cropped_face) ###### LANDMARK REGRESSION MODEL #END# ##### HEAD POSE MODEL #START# hp_frame = preprocess_image(cropped_face, hp_net_input_shape[3], hp_net_input_shape[2], "head-pose") hp_plugin.async_inference(hp_frame) if hp_plugin.wait() == 0: hp_result = hp_plugin.extract_output() hp_output = preprocess_hp_output(hp_result) ##### HEAD POSE MODEL #END# ##### GAZE AND MOUSE #START# # Hard-coded value because net-input-shape didn't return correctly for the gaze-estimation model p_l_eye_img = preprocess_image(l_eye_img, 60, 60, "gaze-estimation") p_r_eye_img = preprocess_image(r_eye_img, 60, 60, "gaze-estimation") # Perform inference on eye images and head pose output ge_plugin.async_inference(p_l_eye_img, p_r_eye_img, hp_output) if ge_plugin.wait() == 0: ge_result = ge_plugin.extract_output() mouse_coord, gaze_vector = preprocess_ge_output( ge_result, hp_output) ##### GAZE AND MOUSE #END# # Draw on frame if at least one flag was entered via command line if len(args.flags) != 0: draw_results(frame, cropped_face, coords_face, l_eye_img, r_eye_img, eye_coords, hp_output, gaze_vector, 
args.flags, height) cv2.imshow("cropped", cropped_face) # cv2.imshow("Left Eye", l_eye_img) # cv2.imshow("Right Eye", r_eye_img) cv2.imshow("frame", frame) if frame_count % 5 == 0: mouse_controller.move(mouse_coord[0], mouse_coord[1]) # Break if escape key pressed if key_pressed == 27: break total_inf_time = time.time() - start_inf_time fps = (frame_count / total_inf_time) log.info("Total-Inference-Time:" + str(total_inf_time)) log.info("FPS: " + str(fps)) # Release the capture and destroy any OpenCV windows i_feeder.close() cv2.destroyAllWindows()
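
# ---------------------------------------------------------------------------
# The pipeline above drives each network through async_inference()/wait()/
# extract_output() wrappers whose bodies are not shown. With the pre-2022
# OpenVINO Python API (IECore / ExecutableNetwork) that pattern typically
# looks like the sketch below; the wrapper internals here are an assumption,
# not the author's code.
# ---------------------------------------------------------------------------

class AsyncModelMixin:
    """Sketch of async wrappers in the style of the pre-2022 IECore API."""

    def async_inference(self, image):
        # A single in-flight request (request_id 0) is enough for this
        # frame-by-frame pipeline.
        self.exec_network.start_async(request_id=0,
                                      inputs={self.input_blob: image})

    def wait(self):
        # Block until the request completes; returns 0 on success.
        return self.exec_network.requests[0].wait(-1)

    def extract_output(self):
        return self.exec_network.requests[0].outputs[self.output_blob]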
def main(args):
    # set log level
    levels = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR
    }
    log_level = levels.get(args.log_level, logging.ERROR)
    logging.basicConfig(level=log_level)

    mouse_control = MouseController('high', 'fast')
    logging.info("Model loading, please wait ...")
    face_det = FaceDetection(args.face_detection, args.device)
    facial_det = FaceLandmark(args.face_landmark, args.device)
    head_pose_est = HeadPoseEstimation(args.head_pose, args.device)
    gaze_est = GazeEstimation(args.gaze_estimation, args.device)
    logging.info("Models loaded successfully")

    inp = InputFeeder(input_type='video', input_file=args.input)
    inp.load_data()
    face_det.load_model()
    facial_det.load_model()
    head_pose_est.load_model()
    gaze_est.load_model()

    video_writer = cv2.VideoWriter(args.output_dir + '/demo_output11.mp4',
                                   cv2.VideoWriter_fourcc(*'MPEG'), 15,
                                   (1920, 1080), True)
    cv2.namedWindow('gaze')
    for frame in inp.next_batch():
        # The feeder yields None once the stream is exhausted; stop cleanly
        # instead of probing frame.shape inside a try/except.
        if frame is None:
            break
        crop_face, crop_coords = face_det.predict(frame,
                                                  visualize=args.visualize)
        left_eye, right_eye, left_eye_crop, right_eye_crop = facial_det.predict(
            crop_face, visualize=args.visualize)
        head_pose = head_pose_est.predict(crop_face,
                                          visualize=args.visualize)
        (new_x, new_y), gaze_vector = gaze_est.predict(left_eye_crop,
                                                       right_eye_crop,
                                                       head_pose)
        left_eye_gaze = (int(left_eye[0] + gaze_vector[0] * 100),
                         int(left_eye[1] - gaze_vector[1] * 100))
        right_eye_gaze = (int(right_eye[0] + gaze_vector[0] * 100),
                          int(right_eye[1] - gaze_vector[1] * 100))
        cv2.arrowedLine(crop_face, left_eye, left_eye_gaze, (0, 0, 255), 2)
        cv2.arrowedLine(crop_face, right_eye, right_eye_gaze, (0, 0, 255), 2)
        video_writer.write(frame)
        mouse_control.move(new_x, new_y)
        if args.show_result:
            cv2.imshow('gaze', frame)
            cv2.waitKey(1)
    inp.close()
    video_writer.release()
    cv2.destroyAllWindows()
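
# ---------------------------------------------------------------------------
# Note on the writer above: pairing cv2.VideoWriter_fourcc(*'MPEG') with a
# .mp4 container is fragile on many OpenCV builds and can silently produce
# an empty file. 'mp4v' is the codec tag commonly used for .mp4 output, and
# the declared frame size must match the frames actually written.
# ---------------------------------------------------------------------------
# video_writer = cv2.VideoWriter(args.output_dir + '/demo_output11.mp4',
#                                cv2.VideoWriter_fourcc(*'mp4v'), 15,
#                                (1920, 1080), True)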
def main(): ## calling argparser args = build_argparser().parse_args() # create a log file logging.basicConfig(filename='Project_log.log', level=logging.INFO) logger = logging.getLogger() ## get args input variable input_path = args.input ## get args visualization flags visual_flags = args.flag_visualization ## put all keys for visualization in dict Dict_visual_keys = { 'args_face': 'fd', 'args_land': 'fl', 'args_head': 'hp', 'args_gaze': 'ge', 'args_crop': 'crop', 'args_win': 'win' } ## check if using CAMERA or video file or image if input_path == "CAM" or input_path=="cam": print("\n## You are using CAMERA right now..." + input_path + " detected!") logger.info("\n## You are using CAMERA right now..." + input_path + " detected!") feeder_in = InputFeeder(input_path.lower()) else: ## check if input file exists in given path if not os.path.isfile(input_path): print("\nInput file not exists in Path: " + input_path + ". Please check again !!!") logger.error("## Input file not exists in Path: " + input_path + ". Please check again !!!") exit(1) else: print('\nInput path exists: '+ input_path + '\n') logger.info('\nInput path exists: '+ input_path) feeder_in = InputFeeder("video", input_path) ## handler for mouse moving by precision and speed mouse_handler = MouseController('medium', 'fast') ## initialize 4 models model_fd, model_fld, model_hpe, model_ge = models_handler(logger, args) feeder_in.load_data() print("\n## Loaded Input Feeder ") logger.info("## Loaded Input Feeder ") ## load face detection model model_fd_start_time = time.time() model_fd.load_model() model_fd_load_time = (time.time() - model_fd_start_time)*1000 logger.info('FaceDetection load time: ' + str(round(model_fd_load_time, 3)) + ' ms') ## load facial landmarks detection model model_fld_start_time = time.time() model_fld.load_model() model_fld_load_time = (time.time() - model_fld_start_time)*1000 logger.info('FacialLandmarkDetection load time: ' + str(round(model_fld_load_time, 3)) + ' ms') ## load head pose estimation model model_hpe_start_time = time.time() model_hpe.load_model() model_hpe_load_time = (time.time() - model_hpe_start_time)*1000 logger.info('HeadPoseEstimation load time: ' + str(round(model_hpe_load_time, 3)) + ' ms') ## load gaze estimation model model_ge_start_time = time.time() model_ge.load_model() model_ge_load_time, total_load_time = (time.time() - model_ge_start_time)*1000, (time.time() - model_fd_start_time)*1000 logger.info('GazeEstimation load time: ' + str(round(model_ge_load_time, 3)) + ' ms') ## Model load time in total logger.info('Total Load time: ' + str(round(total_load_time, 3)) + ' ms') print('\n## All model successfully loaded!') logger.info('## All model successfully loaded!') frame_count = 0 print("## Start inference on frame!") logger.info("## Start inference on frame!") ## empty list for each model to accumulate infer time and later get average infer time fd_infer_time = [] fld_infer_time = [] hpe_infer_time = [] ge_infer_time = [] start_infer_time = time.time() ## loop through each frame and start inference on each model for flag_return, frame in feeder_in.next_batch(): # print(flag_return) if not flag_return: print('\nflag_return: ' + str(flag_return) + '. Video has reach to the end...') logger.error('flag_return: ' + str(flag_return) + '. Video has reach to the end...') break event_key = cv2.waitKey(60) ## frame count add by 1 frame_count += 1 if args.show_info: print('\nNo. 
frame: {}'.format(frame_count)) if event_key ==27: print("\nUser keyboard exit!....") break ## Face detection ## t0 = time.time() cropped_face, face_coords = model_fd.predict(frame.copy(), args.prob_threshold, args.perf_counts) # print(cropped_face.shape) ## face_coords ## top left, bottom right fd_infer_time.append((time.time() - t0)*1000) # print(fd_infer_time) if args.show_info: print("Average inference time of FaceDetection model: {} ms".format(np.average(np.asarray(fd_infer_time)))) ## if no face detected if len(face_coords)==0: print("## No Face detected...") logger.error("## No face detected. Please check once again!") continue ## Landmarks detection ## t1 = time.time() l_eye_box, r_eye_box, eyes_coords = model_fld.predict(cropped_face.copy(), args.perf_counts) # print(l_eye_box.shape, r_eye_box.shape) # left eye and right eye image ## [left eye box, right eye box] ## [[leye_xmin,leye_ymin,leye_xmax,leye_ymax], [reye_xmin,reye_ymin,reye_xmax,reye_ymax]] # print(eyes_coords) fld_infer_time.append((time.time()- t1)*1000) # print(fld_infer_time) if args.show_info: print("Average inference time of FacialLandmarkDetection model: {} ms".format(np.average(np.asarray(fld_infer_time)))) ## Head pose detection ## t2 = time.time() hpe_output = model_hpe.predict(cropped_face.copy(), args.perf_counts) # [6.927431583404541, -4.0265960693359375, -1.8397517204284668] # print(hpe_output) # yaw, pitch, roll hpe_infer_time.append((time.time() - t2)*1000) if args.show_info: print("Average inference time of HeadPoseEstimation model: {} ms".format(np.average(np.asarray(hpe_infer_time)))) ## Gaze estimation ## t3 = time.time() mouse_position, gaze_vector = model_ge.predict(l_eye_box, r_eye_box, hpe_output, args.perf_counts) ## mouse position (x, y), gaze_vector [-0.13984774, -0.38296703, -0.9055522 ] ge_infer_time.append((time.time() - t3)*1000) if args.show_info: print("Average inference time of GazeEstimation model: {} ms".format(np.average(np.asarray(ge_infer_time)))) # print('@@@@@@@@@@@@@', len(visual_flags)) ## Visualize the result if visual_flags activated if len(visual_flags) > 0 and len(visual_flags) <= 6 and Dict_visual_keys['args_win'] in visual_flags: frame_copy = frame.copy() if Dict_visual_keys['args_face'] in visual_flags: # Face cv2.rectangle(frame_copy, (face_coords[0], face_coords[1]), (face_coords[2], face_coords[3]), (255,0,0), 2) if Dict_visual_keys['args_land'] in visual_flags: # Facial Landmark left right eyes cv2.rectangle(frame_copy, (face_coords[0] + eyes_coords[0][0], face_coords[1] + eyes_coords[0][1]), (face_coords[0]+eyes_coords[0][2], face_coords[1]+eyes_coords[0][3]),(255,255,255), 2) cv2.rectangle(frame_copy, (face_coords[0] + eyes_coords[1][0], face_coords[1] + eyes_coords[1][1]), (face_coords[0]+eyes_coords[1][2], face_coords[1]+eyes_coords[1][3]),(255,255,255), 2) if Dict_visual_keys['args_crop'] in visual_flags: ## cropped face with landmarks left and right eyes ## land_frame = cropped_face.copy() cv2.rectangle(land_frame, (eyes_coords[0][0], eyes_coords[0][1]), (eyes_coords[0][2],eyes_coords[0][3]),(0,255,0), 2) cv2.rectangle(land_frame, (eyes_coords[1][0], eyes_coords[1][1]), (eyes_coords[1][2],eyes_coords[1][3]),(0,255,0), 2) cv2.imshow('FacialLandmark', cv2.resize(land_frame, (300, 400))) if Dict_visual_keys['args_head'] in visual_flags: # Head Pose values cv2.putText(frame_copy, "Angles of Head Pose:", (10, 25), cv2.FONT_HERSHEY_DUPLEX, 1, (0, 255, 255), 2) cv2.putText(frame_copy, "Yaw: {:.2f}".format(hpe_output[0]), (10, 55), cv2.FONT_HERSHEY_DUPLEX, 1, (0, 255, 
255), 2) cv2.putText(frame_copy, "Pitch: {:.2f}".format(hpe_output[1]), (10, 85), cv2.FONT_HERSHEY_DUPLEX, 1, (0, 255, 255), 2) cv2.putText(frame_copy, "Roll: {:.2f}".format(hpe_output[2]), (10, 115), cv2.FONT_HERSHEY_DUPLEX, 1, (0, 255, 255), 2) if Dict_visual_keys['args_gaze'] in visual_flags: # Gaze arrow left right eyes x, y = gaze_vector[0:2] len_add = 400 ## eye left center point (x, y) eye_left_center = (int(face_coords[0] + (eyes_coords[0][0]+eyes_coords[0][2])/2), int(face_coords[1] + (eyes_coords[0][1]+eyes_coords[1][3])/2)) ## eye right center point (x, y) eye_right_center = (int(face_coords[0] + (eyes_coords[1][0]+eyes_coords[1][2])/2), int(face_coords[1] + (eyes_coords[1][1]+eyes_coords[1][3])/2)) ## draw arrow line for both gaze of eyes cv2.arrowedLine(frame_copy, eye_left_center, (int(eye_left_center[0]+x*len_add), int(eye_left_center[1]-y*len_add)), (0,0,255), 3) cv2.arrowedLine(frame_copy, eye_right_center, (int(eye_right_center[0]+x*len_add), int(eye_right_center[1]-y*len_add)), (0,0,255), 3) ## if with '-show win' without model keys will only display normal video stream cv2.imshow('Visualization', cv2.resize(frame_copy, (800,700))) else: print("\n## No Visualization, Only information displaying... \n## If needs visualization please add '-show' with specific keys...") if frame_count % 4 == 0: ## start move mouse each 4 frames mouse_handler.move(mouse_position[0], mouse_position[1]) total_infer_time = time.time() - start_infer_time fps = frame_count / round(total_infer_time, 3) # print(args.show_info) if args.show_info: print('Total inference time: ' + str(round(total_infer_time*1000, 3)) + ' ms') print("Total frame: " + str(frame_count)) print('FPS: ' + str(fps)) ## loggging into project log file # logger.info('Total inference time: ' + str(round(total_infer_time, 3)) + ' s') logger.info("Average inference time of FaceDetection model: {} ms".format(np.average(np.asarray(fd_infer_time)))) logger.info("Average inference time of FacialLandmarkDetection model: {} ms".format(np.average(np.asarray(fld_infer_time)))) logger.info("Average inference time of HeadPoseEstimation model: {} ms".format(np.average(np.asarray(hpe_infer_time)))) logger.info("Average inference time of GazeEstimation model: {} ms".format(np.average(np.asarray(ge_infer_time)))) logger.info('Total inference time: ' + str(round(total_infer_time*1000, 3)) + ' ms') logger.info("Total frame: " + str(frame_count)) logger.info('FPS: ' + str(fps)) logger.error("### Camera Stream or Video Stream has reach to the end...###") cv2.destroyAllWindows() feeder_in.close()
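
# ---------------------------------------------------------------------------
# Every implementation above instantiates MouseController with a precision
# and a speed string (e.g. 'medium', 'fast') and calls move(x, y) with the
# gaze-derived deltas. The class comes from the project starter code and
# wraps pyautogui; below is a minimal sketch consistent with that usage.
# The exact dictionary values are assumptions.
# ---------------------------------------------------------------------------
import pyautogui


class MouseController:
    def __init__(self, precision, speed):
        precision_dict = {'high': 100, 'low': 1000, 'medium': 500}
        speed_dict = {'fast': 1, 'slow': 10, 'medium': 5}
        self.precision = precision_dict[precision]
        self.speed = speed_dict[speed]

    def move(self, x, y):
        # Gaze x grows rightward and gaze y grows upward, while screen y
        # grows downward, so the vertical delta is negated.
        pyautogui.moveRel(x * self.precision, -1 * y * self.precision,
                          duration=self.speed)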