def run_inference(args):
    """Run the four-model gaze pipeline over a video file.

    Every frame yielded by the input feeder is shown in an "Output" window
    and then passed through face detection, facial landmark detection, head
    pose estimation and gaze estimation. The head pose and gaze results are
    printed. Pressing ESC (key code 27) stops processing.

    Args:
        args: Parsed arguments providing ``input`` plus the model
            attributes ``face_detection_model``,
            ``facial_landmarks_detection_model``,
            ``head_pose_estimation_model`` and ``gaze_estimation_model``.
    """
    feed = InputFeeder(input_type='video', input_file=args.input)
    feed.load_data()

    try:
        # Build and load each model once. Doing this inside the frame loop
        # would reload all four networks for every single frame.
        face_detection = FaceDetection(model_name=args.face_detection_model)
        face_detection.load_model()

        landmarks_detection = FacialLandmarksDetection(
            args.facial_landmarks_detection_model)
        landmarks_detection.load_model()

        head_pose_estimation = HeadPoseEstimation(
            args.head_pose_estimation_model)
        head_pose_estimation.load_model()

        gaze_estimation = GazeEstimation(args.gaze_estimation_model)
        gaze_estimation.load_model()

        for batch in feed.next_batch():
            cv2.imshow("Output", cv2.resize(batch, (500, 500)))
            key = cv2.waitKey(60)
            if key == 27:
                break

            # getting face
            face = face_detection.predict(batch)

            # getting eyes
            left_eye, right_eye = landmarks_detection.predict(face)

            # getting head pose angles
            head_pose = head_pose_estimation.predict(face)
            print("head pose angles: ", head_pose)

            # get mouse points
            mouse_coords = gaze_estimation.predict(left_eye, right_eye,
                                                   head_pose)
            print("gaze output: ", mouse_coords)
    finally:
        # Release the input source even if a model fails mid-stream.
        feed.close()
class MoveMouse:
    '''
    Main Class for the Mouse Controller app.

    This is the class where all the models are stitched together to control
    the mouse pointer.
    '''

    def __init__(self, args):
        '''
        Builds the four models, the mouse controller and the input feeder,
        loads the models, and records how long each stage took.

        Args:
        args = All arguments parsed by the arguments parser function

        Return:
        None
        '''
        init_start_time = time.time()
        self.output_path = args.output_path
        self.show_output = args.show_output
        self.total_processing_time = 0
        self.count_batch = 0
        self.inference_speed = []
        self.avg_inference_speed = 0

        # Any value other than 'CPU' for all_devices overrides the
        # per-model device choices.
        if args.all_devices != 'CPU':
            args.face_device = args.all_devices
            args.face_landmark_device = args.all_devices
            args.head_pose_device = args.all_devices
            args.gaze_device = args.all_devices

        model_init_start = time.time()
        self.face_model = FaceDetection(args.face_model, args.face_device,
                                        args.face_device_ext,
                                        args.face_prob_threshold)
        self.landmarks_model = FacialLandmarksDetection(
            args.face_landmark_model, args.face_landmark_device,
            args.face_landmark_device_ext,
            args.face_landmark_prob_threshold)
        self.head_pose_model = HeadPoseEstimation(
            args.head_pose_model, args.head_pose_device,
            args.head_pose_device_ext, args.head_pose_prob_threshold)
        self.gaze_model = GazeEstimation(args.gaze_model, args.gaze_device,
                                         args.gaze_device_ext,
                                         args.gaze_prob_threshold)
        self.model_init_time = time.time() - model_init_start
        log.info('[ Main ] All required models initiallized')

        self.mouse_control = MouseController(args.precision, args.speed)
        log.info('[ Main ] Mouse controller successfully initialized')

        self.input_feeder = InputFeeder(args.batch_size, args.input_type,
                                        args.input_file)
        log.info('[ Main ] Initialized input feeder')

        model_load_start = time.time()
        self.face_model.load_model()
        self.landmarks_model.load_model()
        self.head_pose_model.load_model()
        self.gaze_model.load_model()
        self.model_load_time = time.time() - model_load_start
        self.app_init_time = time.time() - init_start_time
        log.info('[ Main ] All moadels loaded to Inference Engine\n')

    def draw_face_box(self, frame, face_coords):
        '''
        Draws the first face's bounding box on the input frame

        Args:
        frame = Input frame from video or camera feed. It could also be an
                input image
        face_coords = Sequence of boxes; only face_coords[0] is drawn and
                      its items 0..3 are used as the two rectangle corners

        Return:
        frame = Frame with the face bounding box drawn on it
        '''
        box = face_coords[0]
        start_point = (box[0], box[1])
        end_point = (box[2], box[3])
        thickness = 5
        color = (255, 86, 0)
        return cv2.rectangle(frame, start_point, end_point, color, thickness)

    def draw_eyes_boxes(self, frame, left_eye_coords, right_eye_coords):
        '''
        Draws the bounding boxes of both eyes on the input frame

        Args:
        frame = Input frame from video or camera feed. It could also be an
                input image
        left_eye_coords = Box of the left eye; items 0..3 are used as the
                          two rectangle corners
        right_eye_coords = Box of the right eye, same layout

        Return:
        frame = Frame with bounding box of left and right eyes drawn on it
        '''
        thickness = 5
        color = (0, 210, 0)
        for eye in (left_eye_coords, right_eye_coords):
            frame = cv2.rectangle(frame, (eye[0], eye[1]), (eye[2], eye[3]),
                                  color, thickness)
        return frame

    def draw_outputs(self, frame):
        '''
        Draws the face and eye bounding boxes plus three lines of running
        statistics (batch id, average inference speed, total inference
        time) onto the frame.

        Args:
        frame = Input frame from video or camera feed. It could also be an
                input image

        Return:
        frame = Frame with all inference outputs drawn on it
        '''
        frame = self.draw_face_box(frame, self.face_coords)
        frame = self.draw_eyes_boxes(frame, self.left_eye_coords,
                                     self.right_eye_coords)

        overlay_lines = (
            f'Batch id = {self.count_batch}',
            f'Avg. inference speed = {self.avg_inference_speed:.3f}fps',
            f'Total infer. time = {self.total_processing_time:.3f}s',
        )
        # One text row every 15 pixels, starting at y = 15.
        for row, text in enumerate(overlay_lines, start=1):
            cv2.putText(frame, text, (15, 15 * row),
                        cv2.FONT_HERSHEY_COMPLEX, 0.45, (255, 86, 0), 1)
        return frame

    def _write_stats(self):
        '''
        Rewrites outputs.txt in the output directory with the timing
        statistics and the outputs of the most recent batch.
        '''
        stats_file = os.path.join(self.output_path, 'outputs.txt')
        with open(stats_file, 'w+') as f:
            f.write('INFERENCE STATS\n')
            f.write(
                f'Total model initialization time : {self.model_init_time:.2f}s\n'
            )
            f.write(f'Total model load time: {self.model_load_time:.2f}s\n')
            f.write(f'App initialization time: {self.app_init_time:.2f}s\n')
            f.write(
                f'Total processing time: {self.total_processing_time:.2f}s\n')
            f.write(
                f'Average inference speed: {self.avg_inference_speed:.2f}FPS\n'
            )
            f.write(f'Batch count: {self.count_batch}\n\n')
            f.write('LAST OUTPUTS\n')
            f.write(f'Face coordinates: {self.face_coords}\n')
            f.write(f'Left eye coordinates: {self.left_eye_coords}\n')
            f.write(f'Right eye coordinates: {self.right_eye_coords}\n')
            f.write(f'Head pose angles: {self.head_pose_angles}\n')
            f.write(
                f'Relative pointer coordinates/ Gaze vector: [{self.x:.2f}, {self.y:.2f}]'
            )

    def run_inference(self, frame=None):
        '''
        Performs inference on the input video or image by passing it through
        all four models to get the desired coordinates for moving the mouse
        pointer.

        Args:
        frame = Unused. Frames are read from the input feeder; the
                parameter is kept so callers that pass one keep working.

        Return:
        None
        '''
        self.input_feeder.load_data()
        for batch in self.input_feeder.next_batch():
            if not self.input_feeder.frame_flag:
                # The feeder reports that there is nothing left to process.
                self.input_feeder.close()
                cv2.destroyAllWindows()
                log.info(
                    f'[ Main ] All input Batches processed in {self.total_processing_time:.2f}s'
                )
                log.info('[ Main ] Shutting down app...')
                log.info('[ Main ] Mouse controller app has been shut down.')
                break

            log.info('[ Main ] Started processing a new batch')
            start_inference = time.time()
            self.face_coords, self.face_crop = self.face_model.predict(batch)

            if len(self.face_coords) == 0:
                # Nothing to feed the downstream models. The previous code
                # wrote to a file handle that was not open at this point,
                # which raised instead of skipping the frame.
                log.info(
                    '[ Main ] No face detected.. Waiting for you to stare at the camera'
                )
                continue

            self.head_pose_angles = self.head_pose_model.predict(
                self.face_crop)
            (self.left_eye_coords, self.left_eye_image,
             self.right_eye_coords,
             self.right_eye_image) = self.landmarks_model.predict(
                 self.face_crop)
            self.x, self.y = self.gaze_model.predict(self.left_eye_image,
                                                     self.right_eye_image,
                                                     self.head_pose_angles)
            log.info(
                f'[ Main ] Relative pointer coordinates: [{self.x:.2f}, {self.y:.2f}]'
            )

            batch_process_time = time.time() - start_inference
            self.total_processing_time += batch_process_time
            self.count_batch += 1
            log.info(
                f'[ Main ] Finished processing batch. Time taken = {batch_process_time}s\n'
            )

            self.mouse_control.move(self.x, self.y)

            if self.show_output:
                self.draw_outputs(batch)
                cv2.imshow('Computer Pointer Controller Output', batch)

            self.inference_speed.append(self.count_batch /
                                        self.total_processing_time)
            self.avg_inference_speed = sum(self.inference_speed) / len(
                self.inference_speed)

            # Rewritten after every batch so the file always reflects the
            # latest state, even if the app is interrupted.
            self._write_stats()
        return
def main():
    """Run the gaze-controlled mouse pipeline and write timing statistics.

    Reads frames from the webcam (``--input cam``) or a video file, runs face
    detection, head pose estimation, facial landmark detection and gaze
    estimation on each frame, optionally shows intermediate results selected
    by ``args.flags_checker`` and moves the mouse pointer on every fifth
    frame. Pressing ESC stops processing. On exit, ``stats.txt`` is written
    into ``args.out_path``.

    Raises:
        SystemExit: With status 1 when the video or a model file is missing.
    """
    args = get_args().parse_args()
    path_filender = args.input
    four_flags = args.flags_checker
    loger = logging.getLogger()
    out_path = args.out_path

    if path_filender.lower() == "cam":
        feeder_in = InputFeeder("cam")
    else:
        if not os.path.isfile(path_filender):
            loger.error("The video was not found")
            raise SystemExit(1)
        feeder_in = InputFeeder("video", path_filender)

    model_locations = {
        'FaceDetection': args.face_detection_model,
        'HeadPoseEstimation': args.head_pose_estimation_model,
        'FacialLandmarksDetection': args.facial_landmarks_detection_model,
        'GazeEstimation': args.gaze_estimation_model
    }
    for key_name, location in model_locations.items():
        if not os.path.isfile(location):
            loger.error("The system cannot find the " + key_name +
                        " xml file")
            raise SystemExit(1)

    dt = FaceDetection(model_locations['FaceDetection'], args.device,
                       args.cpu_extension)
    pe = HeadPoseEstimation(model_locations['HeadPoseEstimation'],
                            args.device, args.cpu_extension)
    ld = FacialLandmarksDetection(model_locations['FacialLandmarksDetection'],
                                  args.device, args.cpu_extension)
    ge = GazeEstimation(model_locations['GazeEstimation'], args.device,
                        args.cpu_extension)
    cursor = MouseController('medium', 'fast')

    feeder_in.load_data()

    model_load_time_start = time.time()
    dt.load_model()
    pe.load_model()
    ld.load_model()
    ge.load_model()
    total_load_time = time.time() - model_load_time_start

    frame_counter = 0
    inference_time_start = time.time()
    try:
        for ret, frame in feeder_in.next_batch():
            if not ret:
                break
            frame_counter += 1
            cv2.imshow('video', cv2.resize(frame, (600, 600)))
            key = cv2.waitKey(60)

            face_detected, coords_face = dt.predict(frame, args.p_th)
            # The detector signals "no face" by returning an int in place
            # of the cropped face.
            if isinstance(face_detected, int):
                loger.error("The system cannot detect any face.")
                if key == 27:
                    break
                continue

            head_pose_output = pe.predict(face_detected)
            eye_left_detect, eye_right_detect, eye_coordinates_detect = \
                ld.predict(face_detected)
            coordi_update_pointer, coordi_gaze = ge.predict(
                eye_left_detect, eye_right_detect, head_pose_output)

            if four_flags:
                result_app = frame
                if 'fad' in four_flags:
                    result_app = face_detected
                if 'hpe' in four_flags:
                    cv2.putText(
                        result_app,
                        "HP Angles: YAW:{:.3f} * PITCH:{:.3f} * ROLL:{:.3f}".
                        format(head_pose_output[0], head_pose_output[1],
                               head_pose_output[2]), (5, 40),
                        cv2.FONT_HERSHEY_COMPLEX, 0.25, (153, 76, 0), 0)
                if 'fld' in four_flags:
                    # Boxes are padded by 4 pixels on every side.
                    for eye in eye_coordinates_detect[:2]:
                        cv2.rectangle(face_detected,
                                      (eye[0] - 4, eye[1] - 4),
                                      (eye[2] + 4, eye[3] + 4),
                                      (255, 255, 0), 4)
                if 'gae' in four_flags:
                    x = int(coordi_gaze[0] * 2)
                    y = int(coordi_gaze[1] * 2)
                    w = 150
                    # Draw a cross on each eye crop, then paste the crops
                    # back into the face image.
                    right_E = cv2.line(eye_right_detect, (x - w, y - w),
                                       (x + w, y + w), (51, 255, 153), 1)
                    cv2.line(right_E, (x - w, y + w), (x + w, y - w),
                             (51, 255, 253), 1)
                    left_E = cv2.line(eye_left_detect, (x - w, y - w),
                                      (x + w, y + w), (51, 255, 153), 1)
                    cv2.line(left_E, (x - w, y + w), (x + w, y - w),
                             (51, 255, 253), 1)
                    right_box = eye_coordinates_detect[1]
                    left_box = eye_coordinates_detect[0]
                    face_detected[right_box[1]:right_box[3],
                                  right_box[0]:right_box[2]] = right_E
                    face_detected[left_box[1]:left_box[3],
                                  left_box[0]:left_box[2]] = left_E
                cv2.imshow("Result of the App",
                           cv2.resize(result_app, (600, 600)))

            # Only move the pointer on every fifth frame.
            if frame_counter % 5 == 0:
                cursor.move(coordi_update_pointer[0],
                            coordi_update_pointer[1])
            if key == 27:
                break

        total_time = time.time() - inference_time_start
        total_time_for_inference = round(total_time, 1)
        # A very short run rounds to 0.0; avoid dividing by zero.
        if total_time_for_inference > 0:
            fps = frame_counter / total_time_for_inference
        else:
            fps = 0.0

        # os.path.join keeps the file inside out_path whether or not the
        # argument ends with a path separator.
        with open(os.path.join(out_path, 'stats.txt'), 'w') as f:
            f.write('Inference time: ' + str(total_time_for_inference) + '\n')
            f.write('FPS: ' + str(fps) + '\n')
            f.write('Model load time: ' + str(total_load_time) + '\n')

        loger.error("The video stream is over...")
    finally:
        cv2.destroyAllWindows()
        feeder_in.close()
def main():
    """Run the gaze-controlled mouse pipeline, recording a preview video.

    Validates the model and input paths, loads the four models (logging each
    load time), then processes frames from the webcam or a video file. Each
    frame is passed through face detection, landmark detection, head pose
    estimation and gaze estimation; the mouse pointer is moved; an optional
    preview selected by ``args.previewFlags`` is shown and written to
    ``output_video.mp4``. Pressing ESC stops processing. On exit,
    ``stats.txt`` is written next to this source file.

    Raises:
        SystemExit: With status 1 when the input video or a model file is
            missing.
    """
    args = build_argparser().parse_args()
    device_name = args.device
    prob_threshold = args.prob_threshold
    preview_flags = args.previewFlags
    input_filename = args.input
    logger_object = log.getLogger()

    # Initialize variables with the input arguments
    model_path_dict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'FacialLandmarkModel': args.facialLandmarksModel,
        'HeadPoseEstimationModel': args.headPoseEstimationModel,
        'GazeEstimationModel': args.gazeEstimationModel
    }

    # Validate every path before any model is loaded, so a wrong path is
    # reported as such rather than surfacing as a load failure.
    for model_path in model_path_dict.values():
        if not os.path.isfile(model_path):
            logger_object.error("Unable to find specified model file" +
                                str(model_path))
            raise SystemExit(1)

    if input_filename.lower() == 'cam':
        input_feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            logger_object.error("Unable to find specified video file")
            raise SystemExit(1)
        input_feeder = InputFeeder(input_type='video',
                                   input_file=input_filename)

    # Instantiate model
    face_model = FaceDetection(model_path_dict['FaceDetectionModel'],
                               device_name, threshold=prob_threshold)
    landmark_model = FacialLandmarksDetection(
        model_path_dict['FacialLandmarkModel'], device_name,
        threshold=prob_threshold)
    head_pose_model = HeadPoseEstimation(
        model_path_dict['HeadPoseEstimationModel'], device_name,
        threshold=prob_threshold)
    gaze_model = GazeEstimation(model_path_dict['GazeEstimationModel'],
                                device_name, threshold=prob_threshold)
    mouse_controller = MouseController('medium', 'fast')

    # Load Models and get time
    start_time = time.time()
    face_model.load_model()
    logger_object.error("Face detection model loaded: time: {:.3f} ms".format(
        (time.time() - start_time) * 1000))

    first_mark = time.time()
    landmark_model.load_model()
    logger_object.error(
        "Facial landmarks detection model loaded: time: {:.3f} ms".format(
            (time.time() - first_mark) * 1000))

    second_mark = time.time()
    head_pose_model.load_model()
    logger_object.error(
        "Head pose estimation model loaded: time: {:.3f} ms".format(
            (time.time() - second_mark) * 1000))

    third_mark = time.time()
    gaze_model.load_model()
    logger_object.error("Gaze estimation model loaded: time: {:.3f} ms".format(
        (time.time() - third_mark) * 1000))

    load_total_time = time.time() - start_time
    logger_object.error("Total loading time: time: {:.3f} ms".format(
        load_total_time * 1000))
    logger_object.error("All models are loaded successfully..")

    # Check extention of these unsupported layers
    face_model.check_model()
    landmark_model.check_model()
    head_pose_model.check_model()
    gaze_model.check_model()

    input_feeder.load_data()

    width = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    source_fps = int(input_feeder.cap.get(cv2.CAP_PROP_FPS))
    out_video = cv2.VideoWriter('output_video.mp4',
                                cv2.VideoWriter_fourcc(*'avc1'), source_fps,
                                (width, height), True)

    frame_counter = 0
    start_inf_time = time.time()
    try:
        for ret, frame in input_feeder.next_batch():
            if not ret:
                break
            frame_counter += 1
            key = cv2.waitKey(60)

            try:
                cropped_image, face_cords = face_model.predict(
                    frame, prob_threshold)
                # The detector signals "no face" by returning an int in
                # place of the cropped face.
                if isinstance(cropped_image, int):
                    print("Unable to detect the face")
                    if key == 27:
                        break
                    continue

                left_eye, right_eye, eye_cords = landmark_model.predict(
                    cropped_image)
                pose_output = head_pose_model.predict(cropped_image)
                x, y, z = gaze_model.predict(left_eye, right_eye, pose_output,
                                             cropped_image, eye_cords)
                # NOTE(review): the pointer is moved here on every frame and
                # again below on every fifth frame — confirm which of the
                # two calls is intended.
                mouse_controller.move(x, y)
            except Exception as e:
                # Best effort: a failure on one frame must not stop the
                # stream.
                print(str(e) + " for frame " + str(frame_counter))
                continue

            image = cv2.resize(frame, (width, height))
            if preview_flags:
                preview_frame = frame.copy()
                if 'fd' in preview_flags:
                    if len(preview_flags) != 1:
                        preview_frame = cropped_image
                    cv2.rectangle(frame, (face_cords[0], face_cords[1]),
                                  (face_cords[2], face_cords[3]), (0, 0, 255),
                                  3)
                if 'hp' in preview_flags:
                    cv2.putText(
                        frame,
                        "Pose Angles: yaw= {:.2f} , pitch= {:.2f} , roll= {:.2f}"
                        .format(pose_output[0], pose_output[1],
                                pose_output[2]), (20, 40),
                        cv2.FONT_HERSHEY_DUPLEX, 1, (255, 0, 0), 3)
                if 'ge' in preview_flags:
                    cv2.putText(
                        frame,
                        "Gaze vector: x= {:.2f} , y= {:.2f} , z= {:.2f}".format(
                            x, y, z), (15, 100), cv2.FONT_HERSHEY_COMPLEX, 1,
                        (0, 255, 0), 3)
                # Annotated frame on the left, preview on the right.
                image = np.hstack((cv2.resize(frame, (500, 500)),
                                   cv2.resize(preview_frame, (500, 500))))

            cv2.imshow('preview', image)
            out_video.write(image)

            if frame_counter % 5 == 0:
                mouse_controller.move(x, y)
            if key == 27:
                break

        inference_time = round(time.time() - start_inf_time, 1)
        # A very short run rounds to 0.0; avoid dividing by zero.
        if inference_time > 0:
            fps = frame_counter / inference_time
        else:
            fps = 0.0

        logger_object.error("counter {} frames".format(frame_counter))
        logger_object.error(
            "total inference time {} seconds".format(inference_time))
        logger_object.error("fps {} frame/second".format(fps))

        stats_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                  'stats.txt')
        with open(stats_path, 'w') as f:
            f.write('inference time : ' + str(inference_time) + '\n')
            f.write('fps: ' + str(fps) + '\n')
            f.write('Models Loading: ' + str(load_total_time) + '\n')

        logger_object.error('Video stream ended')
    finally:
        # Without release() the writer may never flush/finalize the file.
        out_video.release()
        cv2.destroyAllWindows()
        input_feeder.close()
def main():
    """Run the gaze-controlled mouse pipeline on a webcam or video file.

    Checks that the four model files (argument value with ``'xml'``
    appended) and the input exist, loads the models, then for every batch
    from the feeder runs face detection, landmark detection, head pose
    estimation and gaze estimation and moves the mouse pointer. When the
    input is ``'cam'`` the first gaze component is negated before moving.
    Intermediate results are visualised when ``args.view_intermediate`` is
    set.

    Raises:
        FileNotFoundError: When a model file or the input video is missing.
        SystemExit: With status 1 when loading the models or the input
            fails.
    """
    args = argparser().parse_args()
    input_feed = args.input
    log = logging.getLogger()

    model_paths = {
        'facedet': args.face_detection_model + 'xml',
        'faceldmdet': args.landmark_detection_model + 'xml',
        'headpose': args.pose_estimation_model + 'xml',
        'gaze': args.gaze_estimation_model + 'xml'
    }
    for model_path in model_paths.values():
        if not os.path.isfile(model_path):
            print(model_path)
            print('Recheck file path and try again')
            log.error("Not a file")
            raise FileNotFoundError(model_path)

    if input_feed == 'cam':
        feed = InputFeeder(input_type='cam')
    elif not os.path.isfile(input_feed):
        print('Recheck file path and try again')
        log.error("Unable to find specified video file")
        raise FileNotFoundError(input_feed)
    else:
        feed = InputFeeder(input_type='video', input_file=input_feed)

    facedet = FaceDetection(args.face_detection_model, args.device,
                            args.extensions, args.async_mode)
    faceldmdet = FacialLandmarksDetection(args.landmark_detection_model,
                                          args.device, args.extensions,
                                          args.async_mode)
    headpose = HeadPose(args.pose_estimation_model, args.device,
                        args.extensions, args.async_mode)
    gaze = GazeEstimation(args.gaze_estimation_model, args.device,
                          args.extensions, args.async_mode)

    try:
        log.info('Loading models...')
        facedet.load_model()
        faceldmdet.load_model()
        headpose.load_model()
        gaze.load_model()
        feed.load_data()
        log.info('Models loaded successfully!')
    except Exception:
        # Log the traceback so the actual cause is visible; a bare except
        # would also have swallowed KeyboardInterrupt.
        log.exception('One or more of the models failed to load..')
        raise SystemExit(1)

    log.info('Initializing mouse controller')
    mouse = MouseController(precision='medium', speed='fast')

    try:
        for batch in feed.next_batch():
            face = facedet.predict(batch)
            eyes, eye_coords = faceldmdet.predict(face)
            pose = headpose.predict(face)
            point = gaze.predict(pose, eyes)
            log.info('All inference complete')

            if args.input == 'cam':
                point[0] = -point[0]
            mouse.move(point[0], point[1])

            if args.view_intermediate == True:
                visualize(pose, face, eye_coords, point)
    finally:
        # Release the capture even when inference raises mid-stream.
        feed.close()
def main():
    """Run the gaze-controlled mouse pipeline and log per-model timings.

    Uses ``args.input`` as a video file when given, otherwise the webcam.
    Each frame is passed through face detection, facial landmark detection,
    head pose estimation and gaze estimation, with the time spent in every
    ``predict`` call accumulated. The pointer is moved on every
    ``args.mouse_update_interval``-th frame and the optional visualisations
    selected by the ``show_*`` arguments are rendered. Pressing ESC stops
    processing. Average inference times are logged at the end.

    Raises:
        SystemExit: With status 1 when the input file does not exist.
    """
    arg_parser = ArgParser()
    args = arg_parser.get_args()
    input_file = args.input

    # If input file defined then use it else use the webcam
    if input_file:
        if not os.path.isfile(input_file):
            log.error("Input file cannot be found")
            # Exit with a non-zero status so callers can detect the failure.
            raise SystemExit(1)
        input_feeder = InputFeeder("video", input_file)
    else:
        input_feeder = InputFeeder("cam")

    face_detection_model = FaceDetection(args.face_detection_model,
                                         args.device, args.extensions)
    face_detection_model.load_model()

    facial_landmarks_model = FacialLandmarksDetection(
        args.facial_landmark_detection_model, args.device, args.extensions)
    facial_landmarks_model.load_model()

    gaze_model = GazeEstimation(args.gaze_estimation_model, args.device,
                                args.extensions)
    gaze_model.load_model()

    head_pose_model = HeadPoseEstimation(args.head_pose_estimation_model,
                                         args.device, args.extensions)
    head_pose_model.load_model()

    mouse_controller = MouseController('medium', 'fast')
    input_feeder.load_data()

    frame_count = 0
    total_face_detection_inference_time = 0
    total_facial_landmark_inference_time = 0
    total_head_pose_inference_time = 0
    total_gaze_estimation_inference_time = 0
    total_inference_time = 0

    try:
        for ret, frame in input_feeder.next_batch():
            if not ret:
                log.error("ret variable not found")
                break

            frame_count += 1
            update_mouse = frame_count % args.mouse_update_interval == 0
            if update_mouse:
                cv2.imshow('Input', frame)
            key_pressed = cv2.waitKey(60)

            # Run inference on the face detection model
            start_time = time.time()
            cropped_face, face_coordinates = face_detection_model.predict(
                frame.copy(), args.probability_threshold)
            elapsed = time.time() - start_time
            total_face_detection_inference_time += elapsed
            total_inference_time += elapsed

            # If no face detected get the next frame, but still honour ESC
            # so the app can be stopped while nobody is in view.
            if len(face_coordinates) == 0:
                if key_pressed == 27:
                    log.warning("Keyboard interrupt triggered")
                    break
                continue

            # Run inference on the facial landmark detection model
            start_time = time.time()
            results = facial_landmarks_model.predict(cropped_face.copy())
            elapsed = time.time() - start_time
            (left_eye_coordinates, right_eye_coordinates, left_eye_image,
             right_eye_image, left_eye_crop_coordinates,
             right_eye_crop_coordinates) = results[:6]
            total_facial_landmark_inference_time += elapsed
            total_inference_time += elapsed

            # Run inference on the head pose estimation model
            start_time = time.time()
            head_pose = head_pose_model.predict(cropped_face.copy())
            elapsed = time.time() - start_time
            total_head_pose_inference_time += elapsed
            total_inference_time += elapsed

            # Run inference on the gaze estimation model
            start_time = time.time()
            (new_mouse_x_coordinate, new_mouse_y_coordinate,
             gaze_vector) = gaze_model.predict(left_eye_image,
                                               right_eye_image, head_pose)
            elapsed = time.time() - start_time
            total_gaze_estimation_inference_time += elapsed
            total_inference_time += elapsed

            if update_mouse:
                log.info(
                    "Mouse controller new coordinates: x = {}, y = {}".format(
                        new_mouse_x_coordinate, new_mouse_y_coordinate))
                mouse_controller.move(new_mouse_x_coordinate,
                                      new_mouse_y_coordinate)

            # Optional visualization configuration:
            if args.show_detected_face:
                showDetectedFace(frame, face_coordinates)
            if args.show_head_pose:
                showHeadPose(frame, head_pose)
            if args.show_facial_landmarks:
                showFacialLandmarks(cropped_face, left_eye_crop_coordinates,
                                    right_eye_crop_coordinates)
            if args.show_gaze_estimation:
                showGazeEstimation(frame, right_eye_coordinates,
                                   left_eye_coordinates, gaze_vector,
                                   cropped_face, face_coordinates)

            # Break if escape key pressed
            if key_pressed == 27:
                log.warning("Keyboard interrupt triggered")
                break
    finally:
        # Release the capture and destroy any OpenCV windows
        cv2.destroyAllWindows()
        input_feeder.close()

    if frame_count == 0:
        # Nothing was read, so there is no average to report.
        log.warning("No frames were processed")
        return

    log.info("Average face detection inference time: {} seconds".format(
        total_face_detection_inference_time / frame_count))
    log.info(
        "Average facial landmark detection inference time: {} seconds".format(
            total_facial_landmark_inference_time / frame_count))
    log.info("Average head pose estimation inference time: {} seconds".format(
        total_head_pose_inference_time / frame_count))
    log.info("Average gaze estimation inference time: {} seconds".format(
        total_gaze_estimation_inference_time / frame_count))
    log.info("Average total inference time: {} seconds".format(
        total_inference_time / frame_count))
def test_run(args):
    """Run the gaze pipeline in one of several demo/benchmark modes.

    Loads the four models (timing each load), then for every frame runs face
    detection, head pose estimation, facial landmark detection and gaze
    estimation, accumulating the time spent in each ``predict`` call.
    ``args.flag`` selects the mode:

    * ``"0"`` - print a report of load and average inference times.
    * ``"1"`` - show each frame with the face box drawn.
    * ``"2"`` - show the landmark model's image with a box drawn from
      ``eye_coords``.
    * ``"3"`` - move the mouse every 10th frame and log the coordinates.
    * ``"4"`` - move the mouse every 10th frame.

    Pressing ESC stops processing.

    Args:
        args: Parsed arguments providing ``input_type``, ``input``,
            ``precision``, ``speed``, ``device``, ``cpu_extension``,
            ``flag`` and the model paths ``face``, ``headpose``,
            ``landmarks`` and ``gazeestimation``.

    Raises:
        SystemExit: With status 1 for an unsupported ``input_type``.
    """
    logging.getLogger().setLevel(logging.INFO)
    activate_frame_count = 10
    logging.warning("Running default value activate frame count = 10")

    if args.input_type in ('video', 'image', 'webcam'):
        feeder = InputFeeder(args.input_type, args.input)
        if args.input_type != 'webcam' and args.input == '../bin/demo.mp4':
            logging.warning("Running default setting and input")
    else:
        logging.error("Input not found")
        raise SystemExit(1)

    mouse_controller = MouseController(args.precision, args.speed)
    feeder.load_data()

    start_time = time.time()
    face_model = FaceDetection(args.face, args.device, args.cpu_extension)
    face_model.load_model()
    face_model_load_time = time.time() - start_time
    logging.info("Face Detection Model Loaded...")

    start_time = time.time()
    head_pose_estimation = HeadPoseEstimation(args.headpose, args.device,
                                              args.cpu_extension)
    head_pose_estimation.load_model()
    head_pose_estimation_load_time = time.time() - start_time
    logging.info("Head Pose Detection Model Loaded...")

    start_time = time.time()
    facial_landmarks_detection = FacialLandmarksDetection(
        args.landmarks, args.device, args.cpu_extension)
    facial_landmarks_detection.load_model()
    facial_landmarks_detection_load_time = time.time() - start_time
    logging.info("Facial Landmark Detection Model Loaded...")

    start_time = time.time()
    gaze_model = GazeEstimation(args.gazeestimation, args.device,
                                args.cpu_extension)
    gaze_model.load_model()
    gaze_model_load_time = time.time() - start_time
    logging.info("Gaze Estimation Model Loaded...")

    frame_count = 0
    total_face_model_inference_time = 0
    total_head_pose_estimation_inference_time = 0
    total_facial_landmarks_detection_inference_time = 0
    total_gaze_model_inference_time = 0
    moves_mouse = args.flag in ("3", "4")

    try:
        for frame in feeder.next_batch():
            if frame is None:
                break
            frame_count += 1
            key = cv2.waitKey(60)

            start_time = time.time()
            first_face_box, first_face = face_model.predict(frame.copy())
            total_face_model_inference_time += time.time() - start_time

            start_time = time.time()
            head_pose_output = head_pose_estimation.predict(
                first_face_box.copy())
            total_head_pose_estimation_inference_time += (time.time() -
                                                          start_time)

            start_time = time.time()
            left_eye, right_eye, eye_coords = \
                facial_landmarks_detection.predict(first_face_box.copy())
            total_facial_landmarks_detection_inference_time += (time.time() -
                                                                start_time)

            start_time = time.time()
            move_to_coors_mouse = gaze_model.predict(left_eye, right_eye,
                                                     head_pose_output)
            total_gaze_model_inference_time += time.time() - start_time

            if frame_count % activate_frame_count == 0 and moves_mouse:
                mouse_controller.move(move_to_coors_mouse[0],
                                      move_to_coors_mouse[1])
                cv2.imshow('video', frame)
                key = cv2.waitKey(60)

            if key == 27:
                break

            if args.flag == "1":
                cv2.rectangle(frame, (first_face[0], first_face[1]),
                              (first_face[2], first_face[3]), (255, 0, 0))
                cv2.imshow('video', frame)
                key = cv2.waitKey(60)
            elif args.flag == "2":
                cv2.rectangle(facial_landmarks_detection.image,
                              (eye_coords[0], eye_coords[1]),
                              (eye_coords[2], eye_coords[3]), (255, 0, 0))
                cv2.imshow('video', facial_landmarks_detection.image)
                key = cv2.waitKey(60)
            elif args.flag == "3":
                if frame_count == 1:
                    logging.info("Printing mouse coors: ")
                logging.info(move_to_coors_mouse)

            # An ESC pressed during the display wait above would otherwise
            # be overwritten by the next iteration's waitKey.
            if key == 27:
                break
    finally:
        cv2.destroyAllWindows()
        feeder.close()

    # Print Report
    if args.flag == "0":
        # Avoid dividing by zero when the feeder produced no frames.
        frames = max(frame_count, 1)
        print('------------- BEGIN REPORT -------------')
        avg_inference_face_model = total_face_model_inference_time / frames
        avg_inference_headpose = (total_head_pose_estimation_inference_time /
                                  frames)
        avg_inference_facial_landmark = (
            total_facial_landmarks_detection_inference_time / frames)
        avg_inference_gaze_model = total_gaze_model_inference_time / frames

        # The first value on each "Model" line is the model path, not a
        # time, so the labels say "Model" rather than "Load Time".
        print("Face Detection Model: ", args.face)
        print("Loading time: ", face_model_load_time)
        print("Inference time: ", avg_inference_face_model)

        print("Head Pose Detection Model: ", args.headpose)
        print("Loading time: ", head_pose_estimation_load_time)
        print("Inference time:", avg_inference_headpose)

        print("Facial Landmark Detection Model: ", args.landmarks)
        print("Loading time: ", facial_landmarks_detection_load_time)
        print("Inference time:", avg_inference_facial_landmark)

        print("Gaze Estimation Model: ", args.gazeestimation)
        print("Loading time: ", gaze_model_load_time)
        print("Inference time:", avg_inference_gaze_model)
        print('------------- END REPORT -------------')
def main():
    """
    Load inference networks, stream video to network, and output stats and video.

    Model paths and the video path are read from the YAML file named by
    ``args.config_file``. For the first face found in each frame the facial
    landmark, head pose and gaze models are run, the annotated frame is
    shown, and the mouse is moved when ``args.mouse_move`` is set. Pressing
    ESC stops processing. Average per-model inference times are logged at
    the end.

    :return: None
    """
    # Logger init
    logging.basicConfig(level=logging.INFO,
                        format="[%(levelname)s] %(message)s")

    # Get command line args
    args = get_arg()

    # Load Preferencies
    with open(args.config_file, "r") as yamlfile:
        cfg = yaml.load(yamlfile, Loader=yaml.FullLoader)
    models = cfg['models']
    input_source = args.input
    video_path = cfg['video_path']

    face_model = FaceDetection(models['face_detection'])
    head_pose_model = HeadPoseEstimation(models['head_pose_estimation'])
    facial_landmarks_model = FacialLandmarksDetection(
        models['facial_landmarks_detection'])
    gaze_estimation_model = GazeEstimation(models['gaze_estimation'])

    # Initialise the MouseController
    mouse_contr = MouseController("low", "fast")

    # Load the models and log timing
    start_time = time.time()
    face_model.load_model(args.device)
    logging.info("Load Face Detection model: {:.1f}ms".format(
        1000 * (time.time() - start_time)))

    start_time = time.time()
    facial_landmarks_model.load_model(args.device)
    logging.info("Load Facial Landmarks Detection model: {:.1f}ms".format(
        1000 * (time.time() - start_time)))

    start_time = time.time()
    head_pose_model.load_model(args.device)
    logging.info("Load Head Pose Estimation model: {:.1f}ms".format(
        1000 * (time.time() - start_time)))

    start_time = time.time()
    gaze_estimation_model.load_model(args.device)
    logging.info("Load Gaze Estimation model: {:.1f}ms".format(
        1000 * (time.time() - start_time)))

    # Get and open video or camera capture
    input_feed = InputFeeder(input_type=input_source, input_file=video_path)
    input_feed.load_data()

    if not input_feed.cap.isOpened():
        # This function logs through the logging module everywhere else;
        # the previous code used a different, unbound name here.
        logging.critical('Error opening input, check --video_path parameter')
        sys.exit(1)

    # init scene variables
    frame_count = 0
    facedetect_infer_time = 0
    landmark_infer_time = 0
    headpose_infer_time = 0
    gaze_infer_time = 0

    try:
        ### Loop until stream is over ###
        # Iterate one generator for the whole stream instead of creating a
        # fresh one for every frame.
        for frame in input_feed.next_batch():
            if frame is None:
                break

            key_pressed = cv2.waitKey(60)
            frame_count += 1
            input_height, input_width, _ = frame.shape
            logging.info("frame {count} size {w}, {h}".format(
                count=frame_count, w=input_width, h=input_height))

            # face detection
            p_frame = face_model.preprocess_input(frame)
            start_time = time.time()
            fnoutput = face_model.predict(p_frame)
            facedetect_infer_time += time.time() - start_time
            out_frame, fboxes = face_model.preprocess_output(
                fnoutput, frame, args.overlay, args.prob_threshold)

            # for each face
            for fbox in fboxes:
                face = frame[fbox[1]:fbox[3], fbox[0]:fbox[2]]

                p_frame = facial_landmarks_model.preprocess_input(face)
                start_time = time.time()
                lmoutput = facial_landmarks_model.predict(p_frame)
                landmark_infer_time += time.time() - start_time
                out_frame, left_eye_point, right_eye_point = \
                    facial_landmarks_model.preprocess_output(
                        lmoutput, fbox, out_frame, args.overlay,
                        args.prob_threshold)

                # get head pose estimation
                p_frame = head_pose_model.preprocess_input(face)
                start_time = time.time()
                hpoutput = head_pose_model.predict(p_frame)
                headpose_infer_time += time.time() - start_time
                out_frame, headpose_angels = \
                    head_pose_model.preprocess_output(
                        hpoutput, out_frame, face, fbox, args.overlay,
                        args.prob_threshold)

                # get gaze estimation
                out_frame, left_eye, right_eye = \
                    gaze_estimation_model.preprocess_input(
                        out_frame, face, left_eye_point, right_eye_point,
                        args.overlay)
                start_time = time.time()
                geoutput = gaze_estimation_model.predict(
                    left_eye, right_eye, headpose_angels)
                gaze_infer_time += time.time() - start_time
                out_frame, gazevector = \
                    gaze_estimation_model.preprocess_output(
                        geoutput, out_frame, fbox, left_eye_point,
                        right_eye_point, args.overlay, args.prob_threshold)

                cv2.imshow('im', out_frame)

                if args.mouse_move:
                    logging.info("mouse move vector : x ={}, y={}".format(
                        gazevector[0], gazevector[1]))
                    mouse_contr.move(gazevector[0], gazevector[1])

                # use only first detected face in the frame
                break

            # Break if escape key pressed
            if key_pressed == 27:
                break

        # logging inference times
        if frame_count > 0:
            logging.info("***** Models Inference time *****")
            logging.info("Face Detection:{:.1f}ms".format(
                1000 * facedetect_infer_time / frame_count))
            logging.info("Facial Landmarks Detection:{:.1f}ms".format(
                1000 * landmark_infer_time / frame_count))
            logging.info("Headpose Estimation:{:.1f}ms".format(
                1000 * headpose_infer_time / frame_count))
            logging.info("Gaze Estimation:{:.1f}ms".format(
                1000 * gaze_infer_time / frame_count))
    finally:
        # Release the capture and destroy any OpenCV windows
        input_feed.close()
        cv2.destroyAllWindows()
def main():
    """Run the gaze-driven mouse pointer loop, with optional calibration.

    Parses the command line, opens the input feed, loads the face, head-pose,
    facial-landmark and gaze models on ``args.device`` and then processes
    frames until the feed ends, the user presses ``q`` or the pyautogui
    fail-safe triggers.

    When ``args.calibrate`` is set, four corner squares are shown one after
    another; pressing ``n`` stores the current filtered gaze vector for the
    square on screen. Once all squares are done the calibration is saved and
    normal pointer control starts.

    The process exits with status 1 when the input cannot be opened, when
    single image mode is requested (not implemented) or when more than one
    face is found in a frame. The feed and all OpenCV windows are released
    on every exit path once the feed has been opened successfully.
    """
    args = build_argparser().parse_args()
    single_image_mode = (args.input_type == 'image')

    # Create and validate input feed
    input_feed = InputFeeder(input_type=args.input_type,
                             input_file=args.input_path)
    input_feed.load_data()
    if not input_feed.is_open():
        log.critical('Error opening input, check --input_path parameter (use --help for more info)')
        sys.exit(1)

    try:
        # Load models
        face_model = FaceDetection(args.face_detection_model)
        face_model.load_model(args.device)
        head_pose_model = HeadPoseEstimation(args.head_pose_model)
        head_pose_model.load_model(args.device)
        facial_landmarks_model = FacialLandmarksDetection(args.facial_landmarks_model)
        facial_landmarks_model.load_model(args.device)
        gaze_estimation_model = GazeEstimation(args.gaze_estimation_model)
        gaze_estimation_model.load_model(args.device)

        # Number of gaze vectors currently summed in gaze_vector_accum
        sample_count = 0
        gaze_vector_accum = np.array([0, 0, 0], dtype='float64')
        gaze_vector_filtered = np.array([0, 0, 0], dtype='float64')

        # Get screen calibration
        if not args.calibrate:
            run_calibration = False
            cal_x_limits, cal_y_limits = utils.get_calibration()
        else:
            run_calibration = True
            update_display = True
            # Squares to draw on screen for calibration
            top_left_square = {'pt1': (0, 0),
                               'pt2': (BOX_SIDE_LENGTH, BOX_SIDE_LENGTH)}
            top_right_square = {'pt1': (SCREEN_WIDTH - BOX_SIDE_LENGTH, 0),
                                'pt2': (SCREEN_WIDTH, BOX_SIDE_LENGTH)}
            bottom_left_square = {'pt1': (0, SCREEN_HEIGHT - BOX_SIDE_LENGTH),
                                  'pt2': (BOX_SIDE_LENGTH, SCREEN_HEIGHT)}
            bottom_right_square = {'pt1': (SCREEN_WIDTH - BOX_SIDE_LENGTH,
                                           SCREEN_HEIGHT - BOX_SIDE_LENGTH),
                                   'pt2': (SCREEN_WIDTH, SCREEN_HEIGHT)}
            cal_squares = [top_left_square, top_right_square,
                           bottom_left_square, bottom_right_square]
            # Names of the calibration points for storing on calibration file
            cal_names = ['top_left', 'top_right', 'bottom_left', 'bottom_right']
            # Model output values for each calibration point are stored here
            cal_points = {}
            square_iter = iter(cal_squares)
            name_iter = iter(cal_names)
            # Image to display on screen for calibration
            base_img = get_base_img("LOOK AT THE SQUARES FOR 2 SECONDS",
                                    "AND THEN PRESS n", COLORS[0])

        if single_image_mode:
            # Implement single image mode here if needed
            log.critical("ERROR: Single image mode not implemented")
            sys.exit(1)

        while True:
            # Process frames; a finished generator ends the loop cleanly
            # instead of escaping as an uncaught StopIteration.
            try:
                frame = next(input_feed.next_batch())
            except StopIteration:
                break
            start_time = time.time()
            face_boxes = run_inference(frame, face_model)
            cropped_faces = utils.crop_image(frame, face_boxes)

            if cropped_faces == 0:
                # No face detected, nothing to process
                continue
            if cropped_faces is None:
                # Finished reading input feed
                break
            if len(cropped_faces) != 1:
                # Handle multiple people here if needed
                log.critical("ERROR: Multiple people detected, only single person supported")
                sys.exit(1)

            # Found a single face in the frame, proceed
            head_pose = run_inference(cropped_faces[0], head_pose_model)
            eye_boxes = run_inference(cropped_faces[0], facial_landmarks_model)
            cropped_eyes = utils.crop_image(cropped_faces[0], eye_boxes)
            gaze_vector = run_inference_gaze(cropped_eyes[0], cropped_eyes[1],
                                             head_pose, gaze_estimation_model)
            inference_time = time.time() - start_time

            # Filter results: average exactly FILTER_QUANTITY accumulated
            # samples. Counting samples (not loop iterations) keeps the
            # divisor equal to the number of vectors actually summed, even
            # when some frames contain no face.
            gaze_vector_accum += gaze_vector
            sample_count += 1
            if sample_count >= FILTER_QUANTITY:
                gaze_vector_filtered = gaze_vector_accum / sample_count
                gaze_vector_accum = np.array([0, 0, 0], dtype='float64')
                sample_count = 0

            if run_calibration:
                if update_display:
                    img = np.copy(base_img)
                    square = next(square_iter, None)
                    if square is not None:
                        cv2.rectangle(img, square['pt1'], square['pt2'],
                                      COLORS[0], -1)
                        update_display = False
                    else:
                        # Done with calibration
                        cal_x_limits, cal_y_limits = utils.get_calibration(cal_points)
                        utils.save_calibration(cal_points)
                        run_calibration = False
                # NOTE(review): nesting reconstructed from collapsed source;
                # the redraw and key poll are assumed to run every frame
                # while calibrating - confirm against the original layout.
                utils.imshow_fullscreen('window', img)
                if cv2.waitKey(1) & 0xFF == ord('n'):
                    update_display = True
                    point = np.array([gaze_vector_filtered[0],
                                      gaze_vector_filtered[1]])
                    # Default avoids StopIteration if 'n' is pressed after
                    # the last calibration name has been consumed.
                    point_name = next(name_iter, None)
                    if point_name is not None:
                        cal_points[point_name] = point
            else:
                if not args.display_all:
                    img = get_base_img("GAZE CONTROL ENABLED",
                                       "MOVE MOUSE TO ANY CORNER OR PRESS q TO EXIT",
                                       COLORS[1])
                    utils.imshow_fullscreen('window', img)
                else:
                    utils.display_inference_results(frame, face_boxes,
                                                    head_pose, gaze_vector,
                                                    inference_time)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    print("User terminated program, goodbye")
                    break
                screen_x, screen_y = get_screen_position(gaze_vector_filtered[0],
                                                         gaze_vector_filtered[1],
                                                         cal_x_limits,
                                                         cal_y_limits)
                try:
                    pyautogui.moveTo(screen_x, screen_y, MOUSE_MOVE_TIME)
                except pyautogui.FailSafeException:
                    print("User terminated program, goodbye")
                    break
    finally:
        # Runs on normal termination, sys.exit() and unexpected errors alike
        input_feed.close()
        cv2.destroyAllWindows()
def _load_model_timed(model, label):
    """Call ``model.load_model()``, log the elapsed time under ``label`` and return ``model``."""
    load_start = time.time()
    model.load_model()
    logging.info("%s: %.1fms", label, 1000 * (time.time() - load_start))
    return model


def main():
    """Run the pointer-controller pipeline on a video file or the camera.

    Parses the command line, configures logging to both
    ``computer-pointer-controller.log`` and the console, opens the input
    feed (``args.input`` equal to "cam" in any letter case selects the
    camera, anything else must be an existing video file), verifies that
    every model ``.xml`` file exists, loads the four models and then, for
    each frame, runs face detection, head-pose estimation, landmark
    detection and gaze estimation. The mouse is moved on every fifth frame
    and the loop stops when the feed ends or Escape is pressed.

    Exits with status 1 when the video file or a model file is missing.
    Any other exception is logged with its traceback and not re-raised.
    OpenCV windows are always destroyed, and the feed is closed whenever it
    was loaded.
    """
    feeder = None
    feeder_loaded = False
    try:
        args = build_argparser().parse_args()
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s [%(levelname)s] %(message)s",
            handlers=[
                logging.FileHandler("computer-pointer-controller.log"),
                logging.StreamHandler()
            ])
        print_output_frame = args.print_output_frame
        logger = logging.getLogger()
        input_file_path = args.input

        # The lower-cased value must be compared with a lower-case literal;
        # comparing against "CAM" could never match.
        if input_file_path.lower() == "cam":
            feeder = InputFeeder("cam")
        else:
            if not os.path.isfile(input_file_path):
                logger.error("Unable to find specified video file")
                raise SystemExit(1)
            feeder = InputFeeder("video", input_file_path)

        mc = MouseController('low', 'fast')
        feeder.load_data()
        # Only close the feed in ``finally`` once load_data() has succeeded
        feeder_loaded = True

        model_paths = {
            'FaceDetectionModel': args.face,
            'FacialLandmarksDetectionModel': args.landmark,
            'GazeEstimationModel': args.gazeestimation,
            'HeadPoseEstimationModel': args.headpose
        }
        for model_key, model_path in model_paths.items():
            if not os.path.isfile(model_path + '.xml'):
                logger.error("Unable to find specified " + model_key +
                             " xml file")
                raise SystemExit(1)

        logging.info("============== Models Load time ===============")
        face_detection = _load_model_timed(
            FaceDetection(args.face, args.device, args.prob_threshold,
                          args.cpu_extension),
            "Face Detection Model")
        landmarks_detection = _load_model_timed(
            FacialLandmarksDetection(args.landmark, args.device,
                                     args.cpu_extension),
            "Facial Landmarks Detection Model")
        gaze_estimation = _load_model_timed(
            GazeEstimation(args.gazeestimation, args.device,
                           args.cpu_extension),
            "Gaze Estimation Model")
        headpose_estimation = _load_model_timed(
            HeadPoseEstimation(args.headpose, args.device,
                               args.cpu_extension),
            "Headpose Estimation Model")
        logging.info("============== End =====================")

        frame_count = 0
        # Accumulated inference time per model, in seconds
        fd_infertime = 0
        lm_infertime = 0
        hp_infertime = 0
        ge_infertime = 0

        for ret, frame in feeder.next_batch():
            if not ret:
                break
            frame_count += 1
            key = cv2.waitKey(60)

            start_time = time.time()
            cropped_face, face_coords = face_detection.predict(frame.copy())
            fd_infertime += time.time() - start_time

            if len(cropped_face) == 0:
                logger.error("Unable to detect the face.")
                continue

            start_time = time.time()
            headpose_out = headpose_estimation.predict(cropped_face.copy())
            hp_infertime += time.time() - start_time

            start_time = time.time()
            left_eye, right_eye, eye_coords = landmarks_detection.predict(
                cropped_face.copy())
            lm_infertime += time.time() - start_time

            start_time = time.time()
            new_mouse_coord, gaze_vector = gaze_estimation.predict(
                left_eye, right_eye, headpose_out)
            ge_infertime += time.time() - start_time

            # Stays None when no preview flags were requested so the display
            # code below never touches an unbound name.
            preview_frame = None
            if print_output_frame:
                preview_frame = frame.copy()
                if 'fd' in print_output_frame:
                    preview_frame = cropped_face
                    cv2.rectangle(frame, (face_coords[0], face_coords[1]),
                                  (face_coords[2], face_coords[3]),
                                  (255, 0, 0), 3)
                if 'fl' in print_output_frame:
                    for eye in (eye_coords[0], eye_coords[1]):
                        cv2.rectangle(cropped_face, (eye[0], eye[1]),
                                      (eye[2], eye[3]), (0, 255, 0), 2)
                if 'hp' in print_output_frame:
                    cv2.putText(
                        cropped_face,
                        "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".
                        format(headpose_out[0], headpose_out[1],
                               headpose_out[2]), (0, 20),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.25, (0, 0, 0), 1)
                    face = frame[face_coords[1]:face_coords[3],
                                 face_coords[0]:face_coords[2]]
                    xmin, ymin, _, _ = face_coords
                    face_center = (xmin + face.shape[1] / 2,
                                   ymin + face.shape[0] / 2, 0)
                    headpose_estimation.draw_axes(frame, face_center,
                                                  headpose_out[0],
                                                  headpose_out[1],
                                                  headpose_out[2])
                if 'ge' in print_output_frame:
                    arrow_length = 0.3 * cropped_face.shape[0]
                    gaze_arrow_x = gaze_vector[0] * arrow_length
                    gaze_arrow_y = -gaze_vector[1] * arrow_length
                    for eye in (eye_coords[0], eye_coords[1]):
                        cv2.arrowedLine(cropped_face, (eye[0], eye[1]),
                                        (int(eye[2] + gaze_arrow_x),
                                         int(eye[3] + gaze_arrow_y)),
                                        (0, 255, 0), 2)

            # NOTE(review): nesting reconstructed from collapsed source; the
            # monitor window is assumed to be refreshed on every frame.
            if preview_frame is not None and len(preview_frame) != 0:
                img_hor = np.hstack((cv2.resize(preview_frame, (800, 800)),
                                     cv2.resize(frame, (800, 800))))
            else:
                img_hor = cv2.resize(frame, (800, 800))
            cv2.imshow("Monitor", img_hor)

            if frame_count % 5 == 0:
                mc.move(new_mouse_coord[0], new_mouse_coord[1])

            if key == 27:
                break

        # Logging inference times (averaged over all frames read)
        if frame_count > 0:
            logging.info(
                "============== Models Inference time ===============")
            for label, total_time in (
                    ("Face Detection", fd_infertime),
                    ("Facial Landmarks Detection", lm_infertime),
                    ("Headpose Estimation", hp_infertime),
                    ("Gaze Estimation", ge_infertime)):
                logging.info("%s:%.1fms", label,
                             1000 * total_time / frame_count)
            logging.info("============== End ===============================")

        logger.info("Video stream ended...")
    except Exception:
        # One call is enough: logging.exception already appends the
        # exception type, message and full traceback.
        logging.exception("Error in inference")
    finally:
        cv2.destroyAllWindows()
        if feeder_loaded:
            feeder.close()