def run_inference(args):
    feed = InputFeeder(input_type='video', input_file=args.input)
    feed.load_data()

    # Load all four models once, before the frame loop, so each frame only
    # pays the inference cost (the original re-created and reloaded every
    # model on every frame).
    face_detection = FaceDetection(model_name=args.face_detection_model)
    face_detection.load_model()
    facial_landmarks_detection = FacialLandmarksDetection(
        args.facial_landmarks_detection_model)
    facial_landmarks_detection.load_model()
    head_pose_estimation = HeadPoseEstimation(args.head_pose_estimation_model)
    head_pose_estimation.load_model()
    gaze_estimation = GazeEstimation(args.gaze_estimation_model)
    gaze_estimation.load_model()

    for batch in feed.next_batch():
        cv2.imshow("Output", cv2.resize(batch, (500, 500)))
        key = cv2.waitKey(60)
        if key == 27:
            break

        # Get the face crop
        face = face_detection.predict(batch)

        # Get the eye crops
        left_eye, right_eye = facial_landmarks_detection.predict(face)

        # Get the head pose angles
        head_pose = head_pose_estimation.predict(face)
        print("head pose angles: ", head_pose)

        # Get the mouse coordinates from the gaze vector
        mouse_coords = gaze_estimation.predict(left_eye, right_eye, head_pose)
        print("gaze output: ", mouse_coords)

    feed.close()
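# The snippets in this file all rely on an InputFeeder helper that is not
# shown. A minimal sketch is below, assuming an OpenCV-backed reader whose
# next_batch() yields one frame at a time; the real class in any given
# project may differ (some variants yield (ret, frame) pairs, and one takes
# a batch size as its first argument).
import cv2


class InputFeeder:
    def __init__(self, input_type, input_file=None):
        self.input_type = input_type
        self.input_file = input_file

    def load_data(self):
        # A webcam feed is device 0; otherwise open the given file.
        source = 0 if self.input_type == 'cam' else self.input_file
        self.cap = cv2.VideoCapture(source)
        return self.cap

    def next_batch(self):
        # Yield frames until the stream runs out.
        while self.cap.isOpened():
            ret, frame = self.cap.read()
            if not ret:
                break
            yield frame

    def close(self):
        self.cap.release()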
def __init__(self, args):
    '''
    Instantiates the application pipeline: all four models, the mouse
    controller and the input feeder.

    Args:
        args: all arguments parsed by the argument parser function
    Returns:
        None
    '''
    init_start_time = time.time()
    self.output_path = args.output_path
    self.show_output = args.show_output
    self.total_processing_time = 0
    self.count_batch = 0
    self.inference_speed = []
    self.avg_inference_speed = 0

    # A non-CPU device passed via --all_devices overrides the per-model devices
    if args.all_devices != 'CPU':
        args.face_device = args.all_devices
        args.face_landmark_device = args.all_devices
        args.head_pose_device = args.all_devices
        args.gaze_device = args.all_devices

    model_init_start = time.time()
    self.face_model = FaceDetection(args.face_model, args.face_device,
                                    args.face_device_ext,
                                    args.face_prob_threshold)
    self.landmarks_model = FacialLandmarksDetection(
        args.face_landmark_model, args.face_landmark_device,
        args.face_landmark_device_ext, args.face_landmark_prob_threshold)
    self.head_pose_model = HeadPoseEstimation(
        args.head_pose_model, args.head_pose_device,
        args.head_pose_device_ext, args.head_pose_prob_threshold)
    self.gaze_model = GazeEstimation(args.gaze_model, args.gaze_device,
                                     args.gaze_device_ext,
                                     args.gaze_prob_threshold)
    self.model_init_time = time.time() - model_init_start
    log.info('[ Main ] All required models initialized')

    self.mouse_control = MouseController(args.precision, args.speed)
    log.info('[ Main ] Mouse controller successfully initialized')

    self.input_feeder = InputFeeder(args.batch_size, args.input_type,
                                    args.input_file)
    log.info('[ Main ] Initialized input feeder')

    model_load_start = time.time()
    self.face_model.load_model()
    self.landmarks_model.load_model()
    self.head_pose_model.load_model()
    self.gaze_model.load_model()
    self.model_load_time = time.time() - model_load_start
    self.app_init_time = time.time() - init_start_time
    log.info('[ Main ] All models loaded to the Inference Engine\n')
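# Each model class used above (FaceDetection, FacialLandmarksDetection,
# HeadPoseEstimation, GazeEstimation) follows the same load/predict pattern.
# A minimal sketch of that shared skeleton is below, assuming the pre-2022
# openvino.inference_engine (IECore) API; the per-model pre/post-processing
# is project-specific and omitted here.
import cv2
from openvino.inference_engine import IECore


class OpenVINOModel:
    def __init__(self, model_xml, device='CPU'):
        self.model_xml = model_xml
        self.model_bin = model_xml.replace('.xml', '.bin')
        self.device = device

    def load_model(self):
        ie = IECore()
        self.net = ie.read_network(model=self.model_xml, weights=self.model_bin)
        self.exec_net = ie.load_network(network=self.net,
                                        device_name=self.device)
        self.input_name = next(iter(self.net.input_info))
        self.output_name = next(iter(self.net.outputs))
        # Input layout for these models is NCHW.
        _, _, self.h, self.w = self.net.input_info[
            self.input_name].input_data.shape

    def predict(self, image):
        # Resize to the network's input size and reorder HWC -> NCHW.
        blob = cv2.resize(image, (self.w, self.h)).transpose((2, 0, 1))
        blob = blob.reshape(1, *blob.shape)
        result = self.exec_net.infer(inputs={self.input_name: blob})
        return result[self.output_name]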
def models_handler(logger, args):
    # Collect the model paths from args into a dict
    model_paths = {
        'Face': args.face_detection_path,
        'Landmarks': args.facial_landmarks_path,
        'Headpose': args.head_pose_path,
        'Gaze': args.gaze_estimation_path
    }

    # Check that each model file exists at the given path
    for model_key in model_paths:
        if not os.path.isfile(model_paths[model_key]):
            print("\n## " + model_key + " model path does not exist: " +
                  model_paths[model_key] + '. Please try again.')
            logger.error("## " + model_key + " model path does not exist: " +
                         model_paths[model_key] + '. Please try again.')
            exit(1)
        else:
            print('## ' + model_key + " model path is correct: " +
                  model_paths[model_key] + '\n')
            logger.info('## ' + model_key + " model path is correct: " +
                        model_paths[model_key])

    # Initialize the face detection model
    model_fd = FaceDetection(model_paths['Face'], args.device,
                             args.cpu_extension)
    # Initialize the facial landmarks detection model
    model_fld = FacialLandmarkDetection(model_paths['Landmarks'], args.device,
                                        args.cpu_extension)
    # Initialize the head pose estimation model
    model_hpe = HeadPoseEstimation(model_paths['Headpose'], args.device,
                                   args.cpu_extension)
    # Initialize the gaze estimation model
    model_ge = GazeEstimation(model_paths['Gaze'], args.device,
                              args.cpu_extension)

    return model_fd, model_fld, model_hpe, model_ge
def main(args):
    # Set the log level
    levels = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR
    }
    log_level = levels.get(args.log_level, logging.ERROR)
    logging.basicConfig(level=log_level)

    mouse_control = MouseController('high', 'fast')

    logging.info("Loading models, please wait...")
    face_det = FaceDetection(args.face_detection, args.device)
    facial_det = FaceLandmark(args.face_landmark, args.device)
    head_pose_est = HeadPoseEstimation(args.head_pose, args.device)
    gaze_est = GazeEstimation(args.gaze_estimation, args.device)

    inp = InputFeeder(input_type='video', input_file=args.input)
    inp.load_data()

    face_det.load_model()
    facial_det.load_model()
    head_pose_est.load_model()
    gaze_est.load_model()
    logging.info("Models loaded successfully")

    video_writer = cv2.VideoWriter(args.output_dir + '/demo_output11.mp4',
                                   cv2.VideoWriter_fourcc(*'MPEG'), 15,
                                   (1920, 1080), True)
    cv2.namedWindow('gaze')

    for frame in inp.next_batch():
        if frame is None:
            break

        crop_face, crop_coords = face_det.predict(frame,
                                                  visualize=args.visualize)
        left_eye, right_eye, left_eye_crop, right_eye_crop = facial_det.predict(
            crop_face, visualize=args.visualize)
        head_pose = head_pose_est.predict(crop_face, visualize=args.visualize)
        (new_x, new_y), gaze_vector = gaze_est.predict(left_eye_crop,
                                                       right_eye_crop,
                                                       head_pose)

        # Draw the gaze vector out of both eyes
        left_eye_gaze = (int(left_eye[0] + gaze_vector[0] * 100),
                         int(left_eye[1] - gaze_vector[1] * 100))
        right_eye_gaze = (int(right_eye[0] + gaze_vector[0] * 100),
                          int(right_eye[1] - gaze_vector[1] * 100))
        cv2.arrowedLine(crop_face, left_eye, left_eye_gaze, (0, 0, 255), 2)
        cv2.arrowedLine(crop_face, right_eye, right_eye_gaze, (0, 0, 255), 2)

        video_writer.write(frame)
        mouse_control.move(new_x, new_y)

        if args.show_result:
            cv2.imshow('gaze', frame)
            cv2.waitKey(1)

    inp.close()
    video_writer.release()
    cv2.destroyAllWindows()
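# The MouseController used throughout these snippets is likewise not shown.
# A minimal sketch is below, assuming the common pyautogui-based version:
# 'precision' scales the gaze output into pixels and 'speed' sets the
# duration of each relative move. The scaling constants are assumptions.
import pyautogui


class MouseController:
    def __init__(self, precision, speed):
        precision_dict = {'high': 100, 'low': 1000, 'medium': 500}
        speed_dict = {'fast': 1, 'slow': 10, 'medium': 5}
        self.precision = precision_dict[precision]
        self.speed = speed_dict[speed]

    def move(self, x, y):
        # Gaze x/y are roughly in [-1, 1]; scale and move relative to the
        # current cursor position (screen y grows downward, hence -y).
        pyautogui.moveRel(x * self.precision, -1 * y * self.precision,
                          duration=self.speed)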
def main():
    args = get_args().parse_args()
    input_path = args.input
    preview_flags = args.flags_checker
    logger = logging.getLogger()
    out_path = args.out_path

    if input_path.lower() == "cam":
        feeder_in = InputFeeder("cam")
    else:
        if not os.path.isfile(input_path):
            logger.error("The video was not found")
            exit(1)
        feeder_in = InputFeeder("video", input_path)

    model_locations = {
        'FaceDetection': args.face_detection_model,
        'HeadPoseEstimation': args.head_pose_estimation_model,
        'FacialLandmarksDetection': args.facial_landmarks_detection_model,
        'GazeEstimation': args.gaze_estimation_model
    }
    for key_name in model_locations:
        if not os.path.isfile(model_locations[key_name]):
            logger.error("The system cannot find the " + key_name + " xml file")
            exit(1)

    dt = FaceDetection(model_locations['FaceDetection'], args.device,
                       args.cpu_extension)
    pe = HeadPoseEstimation(model_locations['HeadPoseEstimation'], args.device,
                            args.cpu_extension)
    ld = FacialLandmarksDetection(model_locations['FacialLandmarksDetection'],
                                  args.device, args.cpu_extension)
    ge = GazeEstimation(model_locations['GazeEstimation'], args.device,
                        args.cpu_extension)
    cursor = MouseController('medium', 'fast')

    feeder_in.load_data()
    model_load_time_start = time.time()
    dt.load_model()
    pe.load_model()
    ld.load_model()
    ge.load_model()
    total_load_time = time.time() - model_load_time_start

    frame_counter = 0
    inference_time_start = time.time()
    for ret, frame in feeder_in.next_batch():
        if not ret:
            break
        frame_counter += 1
        cv2.imshow('video', cv2.resize(frame, (600, 600)))
        key = cv2.waitKey(60)

        face_detected, coords_face = dt.predict(frame, args.p_th)
        if type(face_detected) == int:
            logger.error("The system cannot detect any face.")
            if key == 27:
                break
            continue

        head_pose_output = pe.predict(face_detected)
        eye_left_detect, eye_right_detect, eye_coordinates_detect = ld.predict(
            face_detected)
        coordi_update_pointer, coordi_gaze = ge.predict(
            eye_left_detect, eye_right_detect, head_pose_output)

        if len(preview_flags) != 0:
            result_app = frame
            if 'fad' in preview_flags:
                result_app = face_detected
            if 'hpe' in preview_flags:
                cv2.putText(
                    result_app,
                    "HP Angles: YAW:{:.3f} * PITCH:{:.3f} * ROLL:{:.3f}".format(
                        head_pose_output[0], head_pose_output[1],
                        head_pose_output[2]), (5, 40),
                    cv2.FONT_HERSHEY_COMPLEX, 0.25, (153, 76, 0), 0)
            if 'fld' in preview_flags:
                cv2.rectangle(face_detected,
                              (eye_coordinates_detect[0][0] - 4,
                               eye_coordinates_detect[0][1] - 4),
                              (eye_coordinates_detect[0][2] + 4,
                               eye_coordinates_detect[0][3] + 4),
                              (255, 255, 0), 4)
                cv2.rectangle(face_detected,
                              (eye_coordinates_detect[1][0] - 4,
                               eye_coordinates_detect[1][1] - 4),
                              (eye_coordinates_detect[1][2] + 4,
                               eye_coordinates_detect[1][3] + 4),
                              (255, 255, 0), 4)
            if 'gae' in preview_flags:
                x = int(coordi_gaze[0] * 2)
                y = int(coordi_gaze[1] * 2)
                w = 150
                right_e = cv2.line(eye_right_detect, (x - w, y - w),
                                   (x + w, y + w), (51, 255, 153), 1)
                cv2.line(right_e, (x - w, y + w), (x + w, y - w),
                         (51, 255, 253), 1)
                left_e = cv2.line(eye_left_detect, (x - w, y - w),
                                  (x + w, y + w), (51, 255, 153), 1)
                cv2.line(left_e, (x - w, y + w), (x + w, y - w),
                         (51, 255, 253), 1)
                face_detected[
                    eye_coordinates_detect[1][1]:eye_coordinates_detect[1][3],
                    eye_coordinates_detect[1][0]:eye_coordinates_detect[1][2]] = right_e
                face_detected[
                    eye_coordinates_detect[0][1]:eye_coordinates_detect[0][3],
                    eye_coordinates_detect[0][0]:eye_coordinates_detect[0][2]] = left_e
            cv2.imshow("Result of the App", cv2.resize(result_app, (600, 600)))

        # Move the cursor every fifth frame to keep playback responsive
        if frame_counter % 5 == 0:
            cursor.move(coordi_update_pointer[0], coordi_update_pointer[1])
        if key == 27:
            break

    total_time = time.time() - inference_time_start
    total_time_for_inference = round(total_time, 1)
    fps = frame_counter / total_time_for_inference
    with open(out_path + 'stats.txt', 'w') as f:
        f.write('Inference time: ' + str(total_time_for_inference) + '\n')
        f.write('FPS: ' + str(fps) + '\n')
        f.write('Model load time: ' + str(total_load_time) + '\n')

    logger.info("The video stream is over...")
    cv2.destroyAllWindows()
    feeder_in.close()
def main(): """ Load the network and parse the output. :return: None """ global INFO global DELAY global POSE_CHECKED #controller = MouseController() log.basicConfig(format="[ %(levelname)s ] %(message)s", level=log.INFO, stream=sys.stdout) args = args_parser().parse_args() logger = log.getLogger() if args.input == 'cam': input_stream = 0 else: input_stream = args.input assert os.path.isfile(args.input), "Specified input file doesn't exist" cap = cv2.VideoCapture(input_stream) initial_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) initial_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) video_len = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) fps = int(cap.get(cv2.CAP_PROP_FPS)) out = cv2.VideoWriter(os.path.join(args.output_dir, "shopper.mp4"), cv2.VideoWriter_fourcc(*"MP4V"), fps, (initial_w, initial_h), True) frame_count = 0 job_id = 1 #os.environ['PBS_JOBID'] progress_file_path = os.path.join(args.output_dir, 'i_progress_' + str(job_id) + '.txt') infer_time_start = time.time() if input_stream: cap.open(args.input) # Adjust DELAY to match the number of FPS of the video file DELAY = 1000 / cap.get(cv2.CAP_PROP_FPS) if not cap.isOpened(): logger.error("ERROR! Unable to open video source") return # Initialise the class if args.cpu_extension: facedet = FaceDetection(args.facemodel, args.confidence, extensions=args.cpu_extension) posest = HeadPoseEstimation(args.posemodel, args.confidence, extensions=args.cpu_extension) landest = FaceLandmarksDetection(args.landmarksmodel, args.confidence, extensions=args.cpu_extension) gazeest = GazeEstimation(args.gazemodel, args.confidence, extensions=args.cpu_extension) else: facedet = FaceDetection(args.facemodel, args.confidence) posest = HeadPoseEstimation(args.posemodel, args.confidence) landest = FaceLandmarksDetection(args.landmarksmodel, args.confidence) gazeest = GazeEstimation(args.gazemodel, args.confidence) # infer_network_pose = Network() # Load the network to IE plugin to get shape of input layer facedet.load_model() posest.load_model() landest.load_model() gazeest.load_model() print("loaded models") ret, frame = cap.read() while ret: looking = 0 POSE_CHECKED = False ret, frame = cap.read() frame_count += 1 if not ret: print("checkpoint *BREAKING") break if frame is None: log.error("checkpoint ERROR! 
blank FRAME grabbed") break initial_w = int(cap.get(3)) initial_h = int(cap.get(4)) # Start asynchronous inference for specified request inf_start_fd = time.time() # Results of the output layer of the network coords, frame = facedet.predict(frame) det_time_fd = time.time() - inf_start_fd if len(coords) > 0: [xmin, ymin, xmax, ymax] = coords[0] # use only the first detected face head_pose = frame[ymin:ymax, xmin:xmax] inf_start_hp = time.time() is_looking, pose_angles = posest.predict(head_pose) if is_looking: det_time_hp = time.time() - inf_start_hp POSE_CHECKED = True #print(is_looking) inf_start_lm = time.time() coords, f = landest.predict(head_pose) frame[ymin:ymax, xmin:xmax] = f det_time_lm = time.time() - inf_start_lm [[xlmin, ylmin, xlmax, ylmax], [xrmin, yrmin, xrmax, yrmax]] = coords left_eye_image = frame[ylmin:ylmax, xlmin:xlmax] right_eye_image = frame[yrmin:yrmax, xrmin:xrmax] output = gazeest.predict(left_eye_image, right_eye_image, pose_angles) # Draw performance stats inf_time_message = "Face Inference time: {:.3f} ms.".format( det_time_fd * 1000) if POSE_CHECKED: cv2.putText( frame, "Head pose Inference time: {:.3f} ms.".format( det_time_hp * 1000), (0, 35), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) cv2.putText(frame, inf_time_message, (0, 15), cv2.FONT_HERSHEY_COMPLEX, 0.5, (255, 255, 255), 1) out.write(frame) print("frame", frame_count) if frame_count % 10 == 0: print(time.time() - infer_time_start) progressUpdate(progress_file_path, int(time.time() - infer_time_start), frame_count, video_len) if args.output_dir: total_time = time.time() - infer_time_start with open(os.path.join(args.output_dir, 'stats.txt'), 'w') as f: f.write(str(round(total_time, 1)) + '\n') f.write(str(frame_count) + '\n') facedet.clean() posest.clean() landest.clean() gazeest.clean() out.release() cap.release() cv2.destroyAllWindows()
def inference(args):
    time_sheet = {
        'face_infr': [],
        'landmark_infr': [],
        'head_infr': [],
        'gaze_infr': [],
        'infr_per_frame': []
    }
    logging.basicConfig(filename='result.log', level=logging.INFO)
    logging.info(
        "================================================================================="
    )
    # Precision is parsed from Windows-style model paths (e.g. ...\FP16\...);
    # the face detection model ships only in FP32-INT1.
    logging.info(
        "Precision(face,landmark,head,gaze): FP32-INT1,FP{0},FP{1},FP{2}".format(
            args.landmark_model.split("FP")[1].split("\\")[0],
            args.head_model.split("FP")[1].split("\\")[0],
            args.gaze_model.split("FP")[1].split("\\")[0]))

    model_load_start = time.time()
    face_detection = FaceDetection(args.face_model)
    face_detection.load_model()
    landmark_regression = LandmarkRegression(args.landmark_model)
    landmark_regression.load_model()
    head_pose = HeadPose(args.head_model)
    head_pose.load_model()
    gaze_estimation = GazeEstimation(args.gaze_model)
    gaze_estimation.load_model()
    logging.info("4 models load time: {0:.4f}sec".format(time.time() -
                                                         model_load_start))

    mouse_controller = MouseController('high', 'fast')

    cv2.namedWindow('preview', cv2.WND_PROP_FULLSCREEN)
    cv2.setWindowProperty('preview', cv2.WND_PROP_FULLSCREEN,
                          cv2.WINDOW_FULLSCREEN)

    input_feeder = InputFeeder(args.input_type, args.input_file)
    input_feeder.load_data()

    total_infr_start = time.time()
    for image in input_feeder.next_batch():
        if image is None:
            break

        face_infr_start = time.time()
        face_image = face_detection.predict(image)
        time_sheet['face_infr'].append(time.time() - face_infr_start)

        landmark_infr_start = time.time()
        left_eye_image, right_eye_image = landmark_regression.predict(
            np.copy(face_image))
        time_sheet['landmark_infr'].append(time.time() - landmark_infr_start)

        head_infr_start = time.time()
        head_pose_angles = head_pose.predict(np.copy(face_image))
        time_sheet['head_infr'].append(time.time() - head_infr_start)

        gaze_infr_start = time.time()
        x, y, z = gaze_estimation.predict(left_eye_image, right_eye_image,
                                          head_pose_angles)
        time_sheet['gaze_infr'].append(time.time() - gaze_infr_start)

        time_sheet['infr_per_frame'].append(time.time() - face_infr_start)
        cv2.imshow('preview', image)
        mouse_controller.move(x, y)
        key = cv2.waitKey(20)
        if key == 27:  # exit on ESC
            break

    logging.info("Face model avg inference per frame: {0:.4f}sec".format(
        np.mean(time_sheet['face_infr'])))
    logging.info("Landmark model avg inference per frame: {0:.4f}sec".format(
        np.mean(time_sheet['landmark_infr'])))
    logging.info("Head model avg inference per frame: {0:.4f}sec".format(
        np.mean(time_sheet['head_infr'])))
    logging.info("Gaze model avg inference per frame: {0:.4f}sec".format(
        np.mean(time_sheet['gaze_infr'])))
    logging.info("4 model avg inference per frame: {0:.4f}sec".format(
        np.mean(time_sheet['infr_per_frame'])))
    logging.info("Total inference time: {0:.4f}sec".format(time.time() -
                                                           total_infr_start))
    logging.info(
        "====================================END==========================================\n"
    )
    input_feeder.close()
    cv2.destroyAllWindows()
def main(args):
    face_model_path = args.faceModel
    headpose_path = args.headpose
    device = args.device
    facial_landmark_path = args.facialLandmark
    gaze_estimation_path = args.gazeEstimation
    input_arg = args.input
    threshold = float(args.threshold)

    logging.basicConfig(filename='error_log.log', filemode='w')
    error_log = logging.getLogger()

    if input_arg == 'cam':
        input_stream = 0
        cap = cv2.VideoCapture(input_stream)
    else:
        if os.path.isfile(input_arg):
            input_stream = input_arg
            cap = cv2.VideoCapture(input_stream)
        else:
            msg = ('Could not determine the file location or could not load '
                   'the desired format; please use .mp4 or cam')
            print(msg)
            error_log.error(msg)
            exit(1)

    model_load_time = time.time()
    # Load the face detection model
    face_detection = Face_Model(face_model_path, threshold, device=device)
    face_net = face_detection.load_model()
    # Load the head pose estimation model
    head_pose = HeadPose(headpose_path, threshold, device=device)
    head_net = head_pose.load_model()
    # Load the facial landmarks model
    facial_landmarks = FacialLandmark(facial_landmark_path, threshold,
                                      device=device)
    landmark_net = facial_landmarks.load_model()
    # Load the gaze estimation model
    gaze_estimation = GazeEstimation(gaze_estimation_path, threshold,
                                     device=device)
    gaze_net = gaze_estimation.load_model()
    total_loading_time = time.time() - model_load_time

    mouse_controller = MouseController('medium', 'slow')

    width = int(cap.get(3))
    height = int(cap.get(4))
    # Note: the original passed int(cv2.CAP_PROP_FPS) (the constant 5) as the
    # writer's FPS; cap.get() is what returns the stream's actual FPS.
    out_video = cv2.VideoWriter('out_video.mp4',
                                cv2.VideoWriter_fourcc(*'avc1'),
                                int(cap.get(cv2.CAP_PROP_FPS)),
                                (width, height), True)

    frame_count = 0
    inference_time = time.time()
    print(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    while cap.isOpened():
        flag, frame = cap.read()
        if not flag:
            break
        key_pressed = cv2.waitKey(60)

        out_frame, face_coords = face_detection.predict(frame, face_net,
                                                        width, height)
        if out_frame is not None:
            if not (out_frame.shape[1] == 0 or out_frame.shape[0] == 0):
                yaw, pitch, roll = head_pose.predict(out_frame, head_net)
                head_pose_angles = [yaw, pitch, roll]
                left_eye_image, right_eye_image, eye_cords = facial_landmarks.predict(
                    out_frame, landmark_net)
                mouse_pointer, gaze_vector = gaze_estimation.predict(
                    gaze_net, left_eye_image, right_eye_image,
                    head_pose_angles)
                mouse_controller.move(-mouse_pointer[0], mouse_pointer[1])

                if frame_count % 5 == 0:
                    cv2.putText(
                        frame,
                        "Head Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}"
                        .format(head_pose_angles[0], head_pose_angles[1],
                                head_pose_angles[2]), (50, 50),
                        cv2.FONT_HERSHEY_TRIPLEX, 1.0, (255, 255, 0), 1)
                    cv2.rectangle(frame,
                                  (eye_cords[0][0] + face_coords[0] - 10,
                                   eye_cords[0][1] + face_coords[1] - 10),
                                  (eye_cords[0][2] + face_coords[0] + 10,
                                   eye_cords[0][3] + face_coords[1] + 10),
                                  (255, 255, 0), 2)
                    cv2.rectangle(frame,
                                  (eye_cords[1][0] + face_coords[0] - 10,
                                   eye_cords[1][1] + face_coords[1] - 10),
                                  (eye_cords[1][2] + face_coords[0] + 10,
                                   eye_cords[1][3] + face_coords[1] + 10),
                                  (255, 255, 0), 2)
                    cv2.rectangle(frame, (face_coords[0], face_coords[1]),
                                  (face_coords[2], face_coords[3]),
                                  (255, 255, 0), 2)
                out_video.write(frame)

        # Skip ahead ten frames per iteration to speed up processing
        frame_count += 10
        cap.set(1, frame_count)  # 1 == cv2.CAP_PROP_POS_FRAMES
        if frame_count % 10 == 0:
            print(frame_count)
        if frame_count == cap.get(cv2.CAP_PROP_FRAME_COUNT):
            break
        if key_pressed == 27:
            break

    cap.release()
    cv2.destroyAllWindows()
    total_inference_time = time.time() - inference_time
    total_fps = frame_count / total_inference_time
    with open('result.txt', 'w') as f:
        f.write(str(total_loading_time) + '\n')
        f.write(str(total_inference_time) + '\n')
        f.write(str(total_fps) + '\n')
def main(): """ Load the network and parse the output. :return: None """ global POSE_CHECKED log.basicConfig(format="[ %(levelname)s ] %(message)s", level=log.INFO, stream=sys.stdout) args = args_parser().parse_args() logger = log.getLogger() if args.input == 'cam': input_stream = 0 else: input_stream = args.input assert os.path.isfile(args.input), "Specified input file doesn't exist" cap = cv2.VideoCapture(input_stream) initial_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) initial_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) video_len = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) fps = int(cap.get(cv2.CAP_PROP_FPS)) out = cv2.VideoWriter(os.path.join(args.output_dir, "output.mp4"), cv2.VideoWriter_fourcc(*"MP4V"), fps, (initial_w, initial_h), True) if args.write_intermediate == 'yes': out_fm = cv2.VideoWriter( os.path.join(args.output_dir, "output_fm.mp4"), cv2.VideoWriter_fourcc(*"MP4V"), fps, (initial_w, initial_h), True) out_lm = cv2.VideoWriter( os.path.join(args.output_dir, "output_lm.mp4"), cv2.VideoWriter_fourcc(*"MP4V"), fps, (initial_w, initial_h), True) out_pm = cv2.VideoWriter( os.path.join(args.output_dir, "output_pm.mp4"), cv2.VideoWriter_fourcc(*"MP4V"), fps, (initial_w, initial_h), True) out_gm = cv2.VideoWriter( os.path.join(args.output_dir, "output_gm.mp4"), cv2.VideoWriter_fourcc(*"MP4V"), fps, (initial_w, initial_h), True) frame_count = 0 job_id = 1 infer_time_start = time.time() if input_stream: cap.open(args.input) # Adjust DELAY to match the number of FPS of the video file if not cap.isOpened(): logger.error("ERROR! Unable to open video source") return if args.mode == 'sync': async_mode = False else: async_mode = True # Initialise the class if args.cpu_extension: face_det = FaceDetection(args.facemodel, args.confidence, extensions=args.cpu_extension, async_mode=async_mode) pose_det = HeadPoseEstimation(args.posemodel, args.confidence, extensions=args.cpu_extension, async_mode=async_mode) land_det = FaceLandmarksDetection(args.landmarksmodel, args.confidence, extensions=args.cpu_extension, async_mode=async_mode) gaze_est = GazeEstimation(args.gazemodel, args.confidence, extensions=args.cpu_extension, async_mode=async_mode) else: face_det = FaceDetection(args.facemodel, args.confidence, async_mode=async_mode) pose_det = HeadPoseEstimation(args.posemodel, args.confidence, async_mode=async_mode) land_det = FaceLandmarksDetection(args.landmarksmodel, args.confidence, async_mode=async_mode) gaze_est = GazeEstimation(args.gazemodel, args.confidence, async_mode=async_mode) # infer_network_pose = Network() # Load the network to IE plugin to get shape of input layer face_det.load_model() pose_det.load_model() land_det.load_model() gaze_est.load_model() model_load_time = time.time() - infer_time_start print("All models are loaded successfully") try: pass except Exception as e: print("Could not run Inference: ", e) while cap.isOpened(): ret, frame = cap.read() if not ret: print("checkpoint *BREAKING") break frame_count += 1 looking = 0 POSE_CHECKED = False if frame is None: log.error("checkpoint ERROR! 
blank FRAME grabbed") break initial_w = int(cap.get(3)) initial_h = int(cap.get(4)) # Start asynchronous inference for specified request inf_start_fd = time.time() # Results of the output layer of the network coords, frame = face_det.predict(frame) if args.write_intermediate == 'yes': out_fm.write(frame) det_time_fd = time.time() - inf_start_fd if len(coords) > 0: [xmin, ymin, xmax, ymax] = coords[0] # use only the first detected face head_pose = frame[ymin:ymax, xmin:xmax] inf_start_hp = time.time() is_looking, pose_angles = pose_det.predict(head_pose) if args.write_intermediate == 'yes': p = "Pose Angles {}, is Looking? {}".format( pose_angles, is_looking) cv2.putText(frame, p, (50, 15), cv2.FONT_HERSHEY_COMPLEX, 0.5, (255, 0, 0), 1) out_pm.write(frame) if is_looking: det_time_hp = time.time() - inf_start_hp POSE_CHECKED = True inf_start_lm = time.time() coords, f = land_det.predict(head_pose) frame[ymin:ymax, xmin:xmax] = f if args.write_intermediate == "yes": out_lm.write(frame) det_time_lm = time.time() - inf_start_lm [[xlmin, ylmin, xlmax, ylmax], [xrmin, yrmin, xrmax, yrmax]] = coords left_eye_image = f[ylmin:ylmax, xlmin:xlmax] right_eye_image = f[yrmin:yrmax, xrmin:xrmax] output, gaze_vector = gaze_est.predict(left_eye_image, right_eye_image, pose_angles) if args.write_intermediate == 'yes': p = "Gaze Vector {}".format(gaze_vector) cv2.putText(frame, p, (50, 15), cv2.FONT_HERSHEY_COMPLEX, 0.5, (255, 0, 0), 1) fl = draw_gaze(left_eye_image, gaze_vector) fr = draw_gaze(right_eye_image, gaze_vector) f[ylmin:ylmax, xlmin:xlmax] = fl f[yrmin:yrmax, xrmin:xrmax] = fr # cv2.arrowedLine(f, (xlmin, ylmin), (xrmin, yrmin), (0,0,255), 5) out_gm.write(frame) # Draw performance stats inf_time_message = "Face Inference time: {:.3f} ms.".format( det_time_fd * 1000) # if POSE_CHECKED: cv2.putText( frame, "Head pose Inference time: {:.3f} ms.".format( det_time_hp * 1000), (0, 35), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1) cv2.putText(frame, inf_time_message, (0, 15), cv2.FONT_HERSHEY_COMPLEX, 0.5, (255, 0, 0), 1) out.write(frame) if frame_count % 10 == 0: print("Inference time = ", int(time.time() - infer_time_start)) print('Frame count {} and vidoe len {}'.format( frame_count, video_len)) if args.output_dir: total_time = time.time() - infer_time_start with open(os.path.join(args.output_dir, 'stats.txt'), 'w') as f: f.write(str(round(total_time, 1)) + '\n') f.write(str(frame_count) + '\n') if args.output_dir: with open(os.path.join(args.output_dir, 'stats.txt'), 'a') as f: f.write(str(round(model_load_time)) + '\n') # Clean all models face_det.clean() pose_det.clean() land_det.clean() gaze_est.clean() # release cv2 cap cap.release() cv2.destroyAllWindows() # release all out writer out.release() if args.write_intermediate == 'yes': out_fm.release() out_pm.release() out_lm.release() out_gm.release()
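# draw_gaze() is called above but not defined in this snippet. A plausible
# minimal sketch is below, assuming it overlays an arrow on the eye crop in
# the direction of the (x, y, z) gaze vector; the real helper may differ.
import cv2


def draw_gaze(eye_image, gaze_vector, scale=40):
    # Start at the centre of the eye crop and follow the gaze vector.
    h, w = eye_image.shape[:2]
    cx, cy = w // 2, h // 2
    # Image y grows downward, so the vertical component is negated.
    tip = (int(cx + gaze_vector[0] * scale), int(cy - gaze_vector[1] * scale))
    return cv2.arrowedLine(eye_image.copy(), (cx, cy), tip, (0, 0, 255), 2)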
def main():
    arg_parser = ArgParser()
    args = arg_parser.get_args()
    input_file = args.input

    # If an input file is given, use it; otherwise fall back to the webcam
    if input_file:
        if not os.path.isfile(input_file):
            log.error("Input file cannot be found")
            exit()
        input_feeder = InputFeeder("video", input_file)
    else:
        input_feeder = InputFeeder("cam")

    face_detection_model = FaceDetection(args.face_detection_model,
                                         args.device, args.extensions)
    face_detection_model.load_model()
    facial_landmarks_model = FacialLandmarksDetection(
        args.facial_landmark_detection_model, args.device, args.extensions)
    facial_landmarks_model.load_model()
    gaze_model = GazeEstimation(args.gaze_estimation_model, args.device,
                                args.extensions)
    gaze_model.load_model()
    head_pose_model = HeadPoseEstimation(args.head_pose_estimation_model,
                                         args.device, args.extensions)
    head_pose_model.load_model()

    mouse_controller = MouseController('medium', 'fast')
    input_feeder.load_data()

    frame_count = 0
    total_face_detection_inference_time = 0
    total_facial_landmark_inference_time = 0
    total_head_pose_inference_time = 0
    total_gaze_estimation_inference_time = 0
    total_inference_time = 0
    for ret, frame in input_feeder.next_batch():
        if not ret:
            log.info("Input stream ended")
            break
        frame_count += 1
        if frame_count % args.mouse_update_interval == 0:
            cv2.imshow('Input', frame)
        key_pressed = cv2.waitKey(60)

        # Run inference on the face detection model
        start_time = time.time()
        cropped_face, face_coordinates = face_detection_model.predict(
            frame.copy(), args.probability_threshold)
        finish_time = time.time()
        total_face_detection_inference_time += finish_time - start_time
        total_inference_time += finish_time - start_time

        # If no face was detected, get the next frame
        if len(face_coordinates) == 0:
            continue

        # Run inference on the facial landmark detection model
        start_time = time.time()
        results = facial_landmarks_model.predict(cropped_face.copy())
        finish_time = time.time()
        left_eye_coordinates = results[0]
        right_eye_coordinates = results[1]
        left_eye_image = results[2]
        right_eye_image = results[3]
        left_eye_crop_coordinates = results[4]
        right_eye_crop_coordinates = results[5]
        total_facial_landmark_inference_time += finish_time - start_time
        total_inference_time += finish_time - start_time

        # Run inference on the head pose estimation model
        start_time = time.time()
        head_pose = head_pose_model.predict(cropped_face.copy())
        finish_time = time.time()
        total_head_pose_inference_time += finish_time - start_time
        total_inference_time += finish_time - start_time

        # Run inference on the gaze estimation model
        start_time = time.time()
        new_mouse_x_coordinate, new_mouse_y_coordinate, gaze_vector = gaze_model.predict(
            left_eye_image, right_eye_image, head_pose)
        finish_time = time.time()
        total_gaze_estimation_inference_time += finish_time - start_time
        total_inference_time += finish_time - start_time

        if frame_count % args.mouse_update_interval == 0:
            log.info("Mouse controller new coordinates: x = {}, y = {}".format(
                new_mouse_x_coordinate, new_mouse_y_coordinate))
            mouse_controller.move(new_mouse_x_coordinate,
                                  new_mouse_y_coordinate)

        # Optional visualization configuration:
        if args.show_detected_face:
            showDetectedFace(frame, face_coordinates)
        if args.show_head_pose:
            showHeadPose(frame, head_pose)
        if args.show_facial_landmarks:
            showFacialLandmarks(cropped_face, left_eye_crop_coordinates,
                                right_eye_crop_coordinates)
        if args.show_gaze_estimation:
            showGazeEstimation(frame, right_eye_coordinates,
                               left_eye_coordinates, gaze_vector, cropped_face,
                               face_coordinates)

        # Break if the escape key was pressed
        if key_pressed == 27:
            log.warning("Escape key pressed; stopping")
            break

    # Release the capture and destroy any OpenCV windows
    cv2.destroyAllWindows()
    input_feeder.close()

    log.info("Average face detection inference time: {} seconds".format(
        total_face_detection_inference_time / frame_count))
    log.info(
        "Average facial landmark detection inference time: {} seconds".format(
            total_facial_landmark_inference_time / frame_count))
    log.info("Average head pose estimation inference time: {} seconds".format(
        total_head_pose_inference_time / frame_count))
    log.info("Average gaze estimation inference time: {} seconds".format(
        total_gaze_estimation_inference_time / frame_count))
    log.info("Average total inference time: {} seconds".format(
        total_inference_time / frame_count))
def infer_on_stream(args):
    """
    Initialize the inference networks, stream video to the networks,
    and output stats and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """
    output_intermediate_model = args.output_intermediate_model

    # Handle the input stream
    feed = InputFeeder(input_type=args.input_type, input_file=args.input_file)
    cap = feed.load_data()
    width = int(cap.get(3))
    height = int(cap.get(4))
    fps = int(cap.get(5))

    # Initialise the model classes
    try:
        infer_network_face_detection = BasePointer()
        infer_network_head_pose_estimation = BasePointer()
        infer_network_landmarks_regression_retail = BasePointer()
        infer_network_gaze_estimation = GazeEstimation()
    except:
        logging.error("Error in initializing models")
        exit(1)

    # Load the models and record each load time
    try:
        start = time.time()
        infer_network_face_detection.load_model(args.model1, args.device)
        load_time_face_detection = time.time() - start

        start = time.time()
        infer_network_head_pose_estimation.load_model(args.model2, args.device)
        load_time_head_pose_estimation = time.time() - start

        start = time.time()
        infer_network_landmarks_regression_retail.load_model(args.model3,
                                                             args.device)
        load_time_landmarks_regression_retail = time.time() - start

        start = time.time()
        infer_network_gaze_estimation.load_model(args.model4, args.device)
        load_time_gaze_estimation = time.time() - start
    except:
        logging.error("Error in loading the models")
        exit(1)

    logging.debug(
        "Loading times for face detection: {}, landmark detection: {}, "
        "head pose estimation: {}, gaze estimation: {}".format(
            load_time_face_detection,
            load_time_landmarks_regression_retail,
            load_time_head_pose_estimation,
            load_time_gaze_estimation))

    if output_intermediate_model == 'true':
        out = cv2.VideoWriter('out.mp4', CODEC, fps, (width, height))

    total_infer_time_face_detection = 0
    total_infer_time_landmarks_regression_retail = 0
    total_infer_time_head_pose_estimation = 0
    total_infer_time_gaze_estimation = 0

    # The mouse controller only needs to be created once, outside the loop
    mouse_controller_pc = MouseController("high", "fast")

    # Loop until the stream is over
    for batch in feed.next_batch():
        flag, frame = batch
        if not flag:
            break
        key_pressed = cv2.waitKey(60)

        # Face detection inference
        start = time.time()
        outputs_face_detection = infer_network_face_detection.predict(frame)
        infer_time_face_detection = time.time() - start
        coords, frame = infer_network_face_detection.preprocess_output_face_detection(
            outputs_face_detection, width, height, args.prob_threshold, frame)
        if output_intermediate_model == 'true':
            out.write(frame)
        frame_crop_face = crop_face(coords, frame, output_intermediate_model)

        # Head pose estimation inference
        start = time.time()
        outputs_head_pose_estimation = infer_network_head_pose_estimation.predict(
            frame_crop_face)
        infer_time_head_pose_estimation = time.time() - start
        yaw, pitch, roll = infer_network_head_pose_estimation.preprocess_output_head_pose_estimation(
            outputs_head_pose_estimation, frame_crop_face)
        head_pose_angles = [yaw, pitch, roll]
        if output_intermediate_model == 'true':
            cv2.putText(frame, "Yaw: " + str(int(yaw)), (100, 100),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1)
            cv2.putText(frame, "Pitch: " + str(int(pitch)), (100, 140),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1)
            cv2.putText(frame, "Roll: " + str(int(roll)), (100, 180),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1)

        # Landmarks regression inference
        height_crop_face = coords[0][3] - coords[0][1]
        width_crop_face = coords[0][2] - coords[0][0]
        start = time.time()
        outputs_landmarks_regression_retail = infer_network_landmarks_regression_retail.predict(
            frame_crop_face)
        infer_time_landmarks_regression_retail = time.time() - start
        coord_landmarks = infer_network_landmarks_regression_retail.preprocess_output_landmarks_regression_retail(
            outputs_landmarks_regression_retail, width_crop_face,
            height_crop_face, args.prob_threshold, frame)

        center_left_eye = (coords[0][0] + coord_landmarks[0],
                           coords[0][1] + coord_landmarks[1])
        center_right_eye = (coords[0][0] + coord_landmarks[2],
                            coords[0][1] + coord_landmarks[3])
        xmin_left_eye = center_left_eye[0] - 30
        ymin_left_eye = center_left_eye[1] - 30
        xmax_left_eye = center_left_eye[0] + 30
        ymax_left_eye = center_left_eye[1] + 30
        xmin_right_eye = center_right_eye[0] - 30
        ymin_right_eye = center_right_eye[1] - 30
        xmax_right_eye = center_right_eye[0] + 30
        ymax_right_eye = center_right_eye[1] + 30

        # Crop both eyes before drawing on the frame; the original passed the
        # return value of cv2.rectangle (the annotated full frame) to the
        # gaze model instead of the eye crops.
        left_eye_image = frame[ymin_left_eye:ymax_left_eye,
                               xmin_left_eye:xmax_left_eye]
        right_eye_image = frame[ymin_right_eye:ymax_right_eye,
                                xmin_right_eye:xmax_right_eye]

        frame_landmarks = cv2.circle(frame, center_left_eye, 2, (0, 255, 0),
                                     thickness=3)
        frame_landmarks = cv2.circle(frame, center_right_eye, 2, (0, 255, 0),
                                     thickness=3)
        cv2.rectangle(frame, (xmin_left_eye, ymin_left_eye),
                      (xmax_left_eye, ymax_left_eye), (0, 255, 0), 3)
        cv2.rectangle(frame, (xmin_right_eye, ymin_right_eye),
                      (xmax_right_eye, ymax_right_eye), (0, 255, 0), 3)
        if output_intermediate_model == 'true':
            out.write(frame_landmarks)

        # Gaze estimation inference
        start = time.time()
        outputs_gaze_estimation = infer_network_gaze_estimation.predict(
            left_eye_image, right_eye_image, head_pose_angles)
        infer_time_gaze_estimation = time.time() - start

        total_infer_time_face_detection += infer_time_face_detection
        total_infer_time_landmarks_regression_retail += infer_time_landmarks_regression_retail
        total_infer_time_head_pose_estimation += infer_time_head_pose_estimation
        total_infer_time_gaze_estimation += infer_time_gaze_estimation

        # Draw the gaze vector out of both eyes
        arrow = 100
        g_x = int(outputs_gaze_estimation[0] * arrow)
        g_y = int(-outputs_gaze_estimation[1] * arrow)
        frame = cv2.arrowedLine(frame, center_left_eye,
                                (center_left_eye[0] + g_x,
                                 center_left_eye[1] + g_y), (0, 0, 255), 3)
        frame = cv2.arrowedLine(frame, center_right_eye,
                                (center_right_eye[0] + g_x,
                                 center_right_eye[1] + g_y), (0, 0, 255), 3)
        if output_intermediate_model == 'true':
            out.write(frame)

        mouse_controller_pc.move(outputs_gaze_estimation[0],
                                 outputs_gaze_estimation[1])
        if key_pressed == 27:
            break

    feed.close()
    logging.debug(
        "Total inference times for face detection: {}, landmark detection: {}, "
        "head pose estimation: {}, gaze estimation: {}".format(
            total_infer_time_face_detection,
            total_infer_time_landmarks_regression_retail,
            total_infer_time_head_pose_estimation,
            total_infer_time_gaze_estimation))
    if output_intermediate_model == 'true':
        out.release()
    cv2.destroyAllWindows()
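# crop_face() is used above but not defined in this snippet. A minimal sketch
# is below, assuming it slices the first detected face box out of the frame;
# the real helper may also write the crop when intermediate output is enabled.
def crop_face(coords, frame, output_intermediate_model='false'):
    # coords[0] is [xmin, ymin, xmax, ymax] of the first detected face.
    xmin, ymin, xmax, ymax = coords[0]
    return frame[ymin:ymax, xmin:xmax]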
def main(args):
    fd_model = args.face
    flmd_model = args.landmarks
    hp_model = args.head
    ge_model = args.gaze
    device = args.device
    display_flag = args.display

    # Initialize and load the models
    logger.info("######## Model loading Time #######")
    fd = FaceDetection(fd_model, device)
    start = time.time()
    fd.load_model()
    logger.info("Face Detection Model: {:.1f}ms".format(
        1000 * (time.time() - start)))

    flmd = FacialLandMarksDetection(flmd_model, device)
    start = time.time()
    flmd.load_model()
    logger.info("Facial Landmarks Detection Model: {:.1f}ms".format(
        1000 * (time.time() - start)))

    hpe = HeadPoseEstimation(hp_model, device)
    start = time.time()
    hpe.load_model()
    logger.info("HeadPose Estimation Model: {:.1f}ms".format(
        1000 * (time.time() - start)))

    ge = GazeEstimation(ge_model, device)
    start = time.time()
    ge.load_model()
    logger.info("Gaze Estimation Model: {:.1f}ms".format(
        1000 * (time.time() - start)))

    # Mouse controller
    mc = MouseController("low", "fast")

    feed = InputFeeder(input_type=args.input_type, input_file=args.input_file)
    feed.load_data()

    frame_count = 0
    fd_inference_time = 0
    lm_inference_time = 0
    hp_inference_time = 0
    ge_inference_time = 0
    move_mouse = False

    for batch in feed.next_batch():
        frame_count += 1

        # Preprocessed output from face detection
        face_boxes, image, fd_time = fd.predict(batch, display_flag)
        fd_inference_time += fd_time
        for face in face_boxes:
            cropped_face = batch[face[1]:face[3], face[0]:face[2]]

            # Preprocessed result from the landmarks model
            image, left_eye, right_eye, lm_time = flmd.predict(
                image, cropped_face, face, display_flag)
            lm_inference_time += lm_time

            # Preprocessed result from pose estimation
            image, headpose_angles, hp_time = hpe.predict(
                image, cropped_face, face, display_flag)
            hp_inference_time += hp_time

            # Preprocessed result from the gaze estimation model
            image, gaze_vector, ge_time = ge.predict(image, cropped_face, face,
                                                     left_eye, right_eye,
                                                     headpose_angles,
                                                     display_flag)
            ge_inference_time += ge_time

            cv2.imshow('img', image)
            if not move_mouse:
                mc.move(gaze_vector[0], gaze_vector[1])
            # Only process the first detected face per frame
            break
        if cv2.waitKey(1) & 0xFF == ord("k"):
            break

    if frame_count > 0:
        logger.info("###### Models Inference time ######")
        logger.info(
            f"Face Detection inference time = {(fd_inference_time * 1000) / frame_count} ms"
        )
        logger.info(
            f"Facial Landmarks Detection inference time = {(lm_inference_time * 1000) / frame_count} ms"
        )
        logger.info(
            f"Headpose Estimation inference time = {(hp_inference_time * 1000) / frame_count} ms"
        )
        logger.info(
            f"Gaze estimation inference time = {(ge_inference_time * 1000) / frame_count} ms"
        )
    feed.close()
def main(args):
    device = args.device
    video_file = args.video
    input_type = args.input_type
    toggle = args.toggle == 'true'
    stats = args.stats == 'true'
    model = args.model

    # Start model loading
    start_model_load_time = time.time()
    print('[INFO] Started model loading...')
    face_model = FaceDetection(
        parse_models_file(label='face_detection', path=model), device)
    face_model.load_model()
    # Load the landmarks model
    landmark_model = LandMarksDetection(
        parse_models_file(label='facial_landmarks_detection', path=model),
        device)
    landmark_model.load_model()
    pose_estimation_model = HeadPoseEstimation(
        parse_models_file(label='head_pose_estimation', path=model), device)
    pose_estimation_model.load_model()
    gaze_estimation_model = GazeEstimation(
        parse_models_file(label='gaze_estimation', path=model), device)
    gaze_estimation_model.load_model()
    total_model_load_time = time.time() - start_model_load_time
    print('[TOTAL] Loaded in {:.3f} s'.format(total_model_load_time))
    # End model loading

    mouse = MouseController('high', 'fast')

    try:
        feed = InputFeeder(input_type=input_type, input_file=video_file)
        feed.load_data()
        initial_w = int(feed.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        initial_h = int(feed.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        counter = 0
        if not toggle:
            cv2.namedWindow(MAIN_WINDOW_NAME, cv2.WINDOW_NORMAL)
        for frame, ret in feed.next_batch():
            if not ret:
                break
            try:
                counter += 1
                # Face detection inference
                coords = face_model.predict(frame, (initial_w, initial_h))
                for i in range(len(coords)):
                    xmin, ymin, xmax, ymax = coords[i]
                    cropped_image = frame[ymin:ymax, xmin:xmax]
                    # Landmarks inference
                    cropped_left, cropped_right = landmark_model.predict(
                        cropped_image)
                    # The gaze model expects 60x60 eye crops; skip anything smaller
                    if cropped_left.shape[0] < 60 or cropped_left.shape[1] < 60:
                        break
                    if cropped_right.shape[0] < 60 or cropped_right.shape[1] < 60:
                        break
                    # Pose estimation inference
                    poses = pose_estimation_model.predict(cropped_image)
                    # Gaze estimation inference
                    gz = gaze_estimation_model.predict(poses, cropped_left,
                                                       cropped_right)
                    # Mouse controller
                    mouse.move(gz[0][0], gz[0][1])
                # If the statistics argument is set, print performance counts
                if stats:
                    performance_counts(face_model.performance_counter(0))
                    performance_counts(
                        pose_estimation_model.performance_counter(0))
                    performance_counts(landmark_model.performance_counter(0))
                    performance_counts(
                        gaze_estimation_model.performance_counter(0))
                if not toggle:
                    # Show the camera or video output
                    cv2.imshow(MAIN_WINDOW_NAME, frame)
                else:
                    # Statistics only; no camera or video window
                    performance_counts(face_model.performance_counter(0))
                    performance_counts(
                        pose_estimation_model.performance_counter(0))
                    performance_counts(landmark_model.performance_counter(0))
                    performance_counts(
                        gaze_estimation_model.performance_counter(0))
                cv2.waitKey(1)
            except Exception as e:
                print('Could not run inference', e)
        feed.close()
    except Exception as e:
        print("Could not run inference: ", e)
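# performance_counts() is referenced above but not shown. A minimal sketch is
# below, assuming performance_counter(0) returns the per-layer dict produced
# by OpenVINO's request.get_perf_counts(); the field names follow that API.
def performance_counts(perf_count):
    # Print one row per network layer with its type and measured time.
    print("{:<70} {:<15} {:<15} {:<15} {:<10}".format(
        'name', 'layer_type', 'exec_type', 'status', 'real_time, us'))
    for layer, stats in perf_count.items():
        print("{:<70} {:<15} {:<15} {:<15} {:<10}".format(
            layer, stats['layer_type'], stats['exec_type'], stats['status'],
            stats['real_time']))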
if __name__ == '__main__':
    logger = logging.getLogger()
    args = arg_parse()
    input_file = args.input
    face_model_path = args.face_detection
    head_pose_path = args.head_pose
    facial_landmark_path = args.facial_landmark
    gaze_model_path = args.gaze_model

    face_model = FaceDetection(model_name=face_model_path)
    head_pose_model = HeadPoseEstimation(model_name=head_pose_path)
    facial_landmark_model = FacialLandmarkDetection(
        model_name=facial_landmark_path)
    gaze_estimation_model = GazeEstimation(model_name=gaze_model_path)
    mouse_controller = MouseController('medium', 'fast')

    # Time each model load in milliseconds
    start_time = time.time()
    face_model.load_model()
    face_loading_time = (time.time() - start_time) * 1000

    head_start_time = time.time()
    head_pose_model.load_model()
    head_pose_time = (time.time() - head_start_time) * 1000

    facial_landmark_start = time.time()
    facial_landmark_model.load_model()
    facial_landmark_time = (time.time() - facial_landmark_start) * 1000
def test_run(args):
    logging.getLogger().setLevel(logging.INFO)
    activate_frame_count = 10
    logging.warning("Running default value activate frame count = 10")

    if args.input_type == 'video' or args.input_type == 'image':
        feeder = InputFeeder(args.input_type, args.input)
        if args.input == '../bin/demo.mp4':
            logging.warning("Running default setting and input")
    elif args.input_type == 'webcam':
        feeder = InputFeeder(args.input_type, args.input)
    else:
        logging.error("Input not found")
        exit(1)

    mouse_controller = MouseController(args.precision, args.speed)
    feeder.load_data()

    start_time = time.time()
    face_model = FaceDetection(args.face, args.device, args.cpu_extension)
    face_model.load_model()
    face_model_load_time = time.time() - start_time
    logging.info("Face Detection Model Loaded...")

    start_time = time.time()
    head_pose_estimation = HeadPoseEstimation(args.headpose, args.device,
                                              args.cpu_extension)
    head_pose_estimation.load_model()
    head_pose_estimation_load_time = time.time() - start_time
    logging.info("Head Pose Detection Model Loaded...")

    start_time = time.time()
    facial_landmarks_detection = FacialLandmarksDetection(
        args.landmarks, args.device, args.cpu_extension)
    facial_landmarks_detection.load_model()
    facial_landmarks_detection_load_time = time.time() - start_time
    logging.info("Facial Landmark Detection Model Loaded...")

    start_time = time.time()
    gaze_model = GazeEstimation(args.gazeestimation, args.device,
                                args.cpu_extension)
    gaze_model.load_model()
    gaze_model_load_time = time.time() - start_time
    logging.info("Gaze Estimation Model Loaded...")

    frame_count = 0
    total_face_model_inference_time = 0
    total_head_pose_estimation_inference_time = 0
    total_facial_landmarks_detection_inference_time = 0
    total_gaze_model_inference_time = 0

    for frame in feeder.next_batch():
        if frame is None:
            break
        frame_count += 1
        key = cv2.waitKey(60)

        start_time = time.time()
        first_face_box, first_face = face_model.predict(frame.copy())
        total_face_model_inference_time += time.time() - start_time

        start_time = time.time()
        head_pose_output = head_pose_estimation.predict(first_face_box.copy())
        total_head_pose_estimation_inference_time += time.time() - start_time

        start_time = time.time()
        left_eye, right_eye, eye_coords = facial_landmarks_detection.predict(
            first_face_box.copy())
        total_facial_landmarks_detection_inference_time += time.time() - start_time

        start_time = time.time()
        move_to_coords_mouse = gaze_model.predict(left_eye, right_eye,
                                                  head_pose_output)
        total_gaze_model_inference_time += time.time() - start_time

        if frame_count % activate_frame_count == 0 and (args.flag == "3"
                                                        or args.flag == "4"):
            mouse_controller.move(move_to_coords_mouse[0],
                                  move_to_coords_mouse[1])
            cv2.imshow('video', frame)
            key = cv2.waitKey(60)
            if key == 27:
                break
        if args.flag == "1":
            cv2.rectangle(frame, (first_face[0], first_face[1]),
                          (first_face[2], first_face[3]), (255, 0, 0))
            cv2.imshow('video', frame)
            key = cv2.waitKey(60)
        elif args.flag == "2":
            cv2.rectangle(facial_landmarks_detection.image,
                          (eye_coords[0], eye_coords[1]),
                          (eye_coords[2], eye_coords[3]), (255, 0, 0))
            cv2.imshow('video', facial_landmarks_detection.image)
            key = cv2.waitKey(60)
        elif args.flag == "3":
            if frame_count == 1:
                logging.info("Printing mouse coords:")
                logging.info(move_to_coords_mouse)

    # Print report
    if args.flag == "0":
        print('------------- BEGIN REPORT -------------')
        avg_inference_face_model = total_face_model_inference_time / frame_count
        avg_inference_headpose = total_head_pose_estimation_inference_time / frame_count
        avg_inference_facial_landmark = total_facial_landmarks_detection_inference_time / frame_count
        avg_inference_gaze_model = total_gaze_model_inference_time / frame_count
        print("Face Detection Model: ", args.face)
        print("Loading time: ", face_model_load_time)
        print("Inference time: ", avg_inference_face_model)
        print("Head Pose Detection Model: ", args.headpose)
        print("Loading time: ", head_pose_estimation_load_time)
        print("Inference time:", avg_inference_headpose)
        print("Facial Landmark Detection Model: ", args.landmarks)
        print("Loading time: ", facial_landmarks_detection_load_time)
        print("Inference time:", avg_inference_facial_landmark)
        print("Gaze Estimation Model: ", args.gazeestimation)
        print("Loading time: ", gaze_model_load_time)
        print("Inference time:", avg_inference_gaze_model)
        print('------------- END REPORT -------------')
def pipeline(args):
    feed = InputFeeder(args.i)
    feed.load_data()

    FaceDetectionPipe = FaceDetection(args.m_fd, args.pt, args.d, args.cpu_ext)
    load_time = time.time()
    FaceDetectionPipe.load_model()
    load_time_fd = time.time() - load_time

    FacialLandmarksPipe = FacialLandmarks(args.m_ld, args.d, args.cpu_ext)
    load_time = time.time()
    FacialLandmarksPipe.load_model()
    load_time_ld = time.time() - load_time

    HeadPoseEstimationPipe = HeadPoseEstimation(args.m_hpe, args.d,
                                                args.cpu_ext)
    load_time = time.time()
    HeadPoseEstimationPipe.load_model()
    load_time_hpe = time.time() - load_time

    GazeEstimationPipe = GazeEstimation(args.m_ge, args.d, args.cpu_ext)
    load_time = time.time()
    GazeEstimationPipe.load_model()
    load_time_ge = time.time() - load_time

    log.info('Load time for face detection model: ' + str(load_time_fd))
    log.info('Load time for landmark detection model: ' + str(load_time_ld))
    log.info('Load time for head pose estimation model: ' + str(load_time_hpe))
    log.info('Load time for gaze estimation model: ' + str(load_time_ge))

    # The mouse controller only needs to be created once, outside the loop
    pointer = MouseController('medium', 'fast')
    inf_time_fd = inf_time_ld = inf_time_hpe = inf_time_ge = frame_count = 0

    for frame in feed.next_batch():
        if frame is None:
            break
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
        frame_count += 1

        inf_time = time.time()
        fd_img_output, fd_coords = FaceDetectionPipe.predict(frame)
        inf_time_fd = time.time() - inf_time

        if fd_coords == []:
            log.info('No face detected')
        else:
            inf_time = time.time()
            eye_l_image, eye_r_image, ld_coords = FacialLandmarksPipe.predict(
                fd_img_output)
            inf_time_ld = time.time() - inf_time

            inf_time = time.time()
            hpe_output = HeadPoseEstimationPipe.predict(fd_img_output)
            inf_time_hpe = time.time() - inf_time
            yaw, pitch, roll = hpe_output

            inf_time = time.time()
            ge_output = GazeEstimationPipe.predict(eye_l_image, eye_r_image,
                                                   [yaw, pitch, roll])
            inf_time_ge = time.time() - inf_time

            # Move the pointer every fifth frame
            if frame_count % 5 == 0:
                pointer.move(ge_output[0], ge_output[1])

            fps_fd = 1 / inf_time_fd
            fps_ld = 1 / inf_time_ld
            fps_hpe = 1 / inf_time_hpe
            fps_ge = 1 / inf_time_ge

            if args.v:
                v = Visualizer(frame, fd_img_output, fd_coords, ld_coords,
                               hpe_output)
                v.visualize()

    # Note: these log the timings of the last processed frame, not true averages
    log.info('Average inference time for face detection model: ' +
             str(inf_time_fd))
    log.info('Average inference time for landmark detection model: ' +
             str(inf_time_ld))
    log.info('Average inference time for head pose estimation model: ' +
             str(inf_time_hpe))
    log.info('Average inference time for gaze estimation model: ' +
             str(inf_time_ge))
    log.info('FPS for face detection model: ' + str(fps_fd))
    log.info('FPS for landmark detection model: ' + str(fps_ld))
    log.info('FPS for head pose estimation model: ' + str(fps_hpe))
    log.info('FPS for gaze estimation model: ' + str(fps_ge))
    log.info('Frames Count: ' + str(frame_count))

    mm = ModelMetrics()
    log.info('Writing stats to file...')
    # model_precision is assumed to be defined at module level
    mm.save_to_file('stats_fd.txt', 'FD/' + model_precision, inf_time_fd,
                    fps_fd, load_time_fd)
    mm.save_to_file('stats_ld.txt', model_precision, inf_time_ld, fps_ld,
                    load_time_ld)
    mm.save_to_file('stats_hpe.txt', model_precision, inf_time_hpe, fps_hpe,
                    load_time_hpe)
    mm.save_to_file('stats_ge.txt', model_precision, inf_time_ge, fps_ge,
                    load_time_ge)
    feed.close()
def main():
    # Grab command line args
    logger = logging.getLogger()
    args = build_argparser().parse_args()
    preview_flags = args.previewflags
    input_file = args.input

    if input_file.lower() == 'cam':
        input_feed = InputFeeder('cam')
    elif input_file.endswith('.jpg') or input_file.endswith('.bmp'):
        input_feed = InputFeeder("image", input_file)
    else:
        if not os.path.isfile(input_file):
            print(input_file)
            logger.error("Specified input file doesn't exist")
            exit(1)
        input_feed = InputFeeder("video", input_file)

    model_paths = {
        'GazeEstimation': args.gazeestimationnmodel,
        'FacialLandmarkDetection': args.faciallandmarkmodel,
        'HeadPoseEstimation': args.headposemodel,
        'FaceDetection': args.facedetectionmodel
    }
    for name in model_paths:
        if not os.path.isfile(model_paths[name]):
            logger.error("Unable to find specified " + name + " xml file")
            exit(1)

    flm = FacialLandmarkDetection(model_paths['FacialLandmarkDetection'],
                                  args.device, args.cpu_extension)
    gze = GazeEstimation(model_paths['GazeEstimation'], args.device,
                         args.cpu_extension)
    hpe = HeadPoseEstimation(model_paths['HeadPoseEstimation'], args.device,
                             args.cpu_extension)
    fd = FaceDetection(model_paths['FaceDetection'], args.device,
                       args.cpu_extension)
    flm.load_model()
    fd.load_model()
    gze.load_model()
    hpe.load_model()

    mc = MouseController('medium', 'fast')
    input_feed.load_data()

    frame_count = 0
    for ret, frame in input_feed.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 3 == 0:
            cv2.imshow('video', cv2.resize(frame, (300, 300)))
            cv2.waitKey(1)

        face_coords, cropped_image = fd.predict(frame.copy(),
                                                args.prob_threshold)
        if type(cropped_image) == int:
            logger.error('Unable to detect face')
            continue

        head_out = hpe.predict(cropped_image)
        left_eye, right_eye, eye = flm.predict(cropped_image)
        mouse_coords, gaze_vector = gze.predict(left_eye, right_eye, head_out)

        if len(preview_flags) != 0:
            preview_frame = frame.copy()
            if 'fd' in preview_flags:
                preview_frame = cropped_image
            if 'fld' in preview_flags:
                cv2.rectangle(cropped_image, (eye[0][0] - 15, eye[0][1] - 15),
                              (eye[0][2] + 15, eye[0][3] + 15), (0, 0, 255))
                cv2.rectangle(cropped_image, (eye[1][0] - 15, eye[1][1] - 15),
                              (eye[1][2] + 15, eye[1][3] + 15), (0, 0, 255))
            if 'hp' in preview_flags:
                cv2.putText(
                    preview_frame,
                    "Pose Angles: roll:{:.2f} | pitch:{:.2f} | yaw:{:.2f}".format(
                        head_out[2], head_out[1], head_out[0]), (10, 20),
                    cv2.FONT_HERSHEY_COMPLEX, 0.25, (255, 0, 0), 1)
            if 'ge' in preview_flags:
                x, y, w = int(gaze_vector[1] * 12), int(gaze_vector[0] * 12), 130
                left = cv2.line(left_eye.copy(), (x - w, y - w),
                                (x + w, y + w), (255, 0, 0), 2)
                cv2.line(left, (x - w, y + w), (x + w, y - w), (255, 0, 0), 2)
                right = cv2.line(right_eye.copy(), (x - w, y - w),
                                 (x + w, y + w), (255, 0, 0), 2)
                cv2.line(right, (x - w, y + w), (x + w, y - w), (255, 0, 0), 2)
                cropped_image[eye[0][1]:eye[0][3], eye[0][0]:eye[0][2]] = left
                cropped_image[eye[1][1]:eye[1][3], eye[1][0]:eye[1][2]] = right
            cv2.imshow("visualisation_frame",
                       cv2.resize(preview_frame, (300, 300)))

        if frame_count % 3 == 0:
            mc.move(mouse_coords[0], mouse_coords[1])

    logger.info("Video ended...")
    cv2.destroyAllWindows()
    input_feed.close()
def infer_on_stream(args):
    # Resolve the model paths for the selected precision
    models = None
    if args.precision in ("FP32", "FP16", "INT8"):
        models = select_precision(args.precision)

    # Get the input
    input_feeder = InputFeeder(args.input_type, args.input_file)
    input_feeder.load_data()

    # Load the face detection model
    face = FaceDetection(model_name=models[0], device=args.device,
                         extensions=args.cpu_extension)
    face.load_model()
    # Load the head pose model
    head = HeadPoseEstimation(model_name=models[1], device=args.device,
                              extensions=args.cpu_extension)
    head.load_model()
    # Load the facial landmark model
    landmark = FacialLandmarkDetection(model_name=models[2],
                                       device=args.device,
                                       extensions=args.cpu_extension)
    landmark.load_model()
    # Load the gaze estimation model
    gaze = GazeEstimation(model_name=models[3], device=args.device,
                          extensions=args.cpu_extension)
    gaze.load_model()

    # Initialize the mouse controller
    mouse = MouseController('high', 'fast')

    for frame in input_feeder.next_batch():
        # Break when the feed runs out of frames
        if frame is None:
            break

        # Estimate the face region
        output_frame, cropped_face, box_coord = face.predict(frame)

        # Estimate the head pose
        head_pose = np.array(head.predict(cropped_face))

        # Estimate the eye landmark coordinates
        lr_eyes = landmark.predict(cropped_face)
        eyes = []
        # Calculate each eye's image region
        for coord in lr_eyes:
            x = int(coord[0] + box_coord[0])
            y = int(coord[1] + box_coord[1])
            cv2.circle(output_frame, (x, y), 5, (255, 0, 0), -1)
            eye_box, cropped_eye = eyes_crop(output_frame, x, y, 40)
            cv2.rectangle(output_frame, eye_box[0], eye_box[1], (255, 0, 0), 1)
            eyes.append(cropped_eye)

        # Estimate the gaze direction
        gaze_coords = gaze.predict(eyes[0], eyes[1], head_pose)

        # Move the mouse cursor
        mouse.move(gaze_coords[0], gaze_coords[1])

        if "True" in args.visualize:
            cv2.imshow('Capture', output_frame)
            if cv2.waitKey(30) & 0xFF == ord('q'):
                break

    input_feeder.close()
    if "True" in args.visualize:
        cv2.destroyAllWindows()
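# eyes_crop() is used above but not defined in this snippet. A minimal sketch
# is below, assuming it returns the corner points of a square box of the given
# half-size around an eye centre, plus the corresponding crop of the frame;
# this matches how eye_box and cropped_eye are consumed above.
def eyes_crop(frame, x, y, half_size):
    top_left = (x - half_size, y - half_size)
    bottom_right = (x + half_size, y + half_size)
    crop = frame[top_left[1]:bottom_right[1], top_left[0]:bottom_right[0]]
    return (top_left, bottom_right), crop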
def main():
    '''
    Main function for the eye-gaze-based mouse controller program,
    built on multiple OpenVINO models.
    '''
    try:
        log.basicConfig(level=log.ERROR)
        args = build_argparser().parse_args()

        # Determine the input type: image, video or cam ('0')
        image_file_extensions = ['jpg', 'jpeg', 'png', 'bmp', 'tiff', 'gif', 'webp']
        if args.input != '0':
            input_file_extension = args.input.split('.')[-1]
            input_type = 'image' if input_file_extension in image_file_extensions else 'video'
            input_file = args.input
        else:
            input_type = 'cam'
            input_file = None

        # Set the show-frame and annotate flags; annotation only makes
        # sense when the frame is actually shown
        args.show_frame = bool(args.show_frame)
        args.annot_frame = bool(args.annot_frame) if args.show_frame else False

        feed = InputFeeder(input_type, input_file)
        feed.load_data()

        # The performance-stats level controls what is printed to the console
        # and written to the stats file (if one was provided)
        perf_stat_lvl = args.perf_stat_lvl
        if perf_stat_lvl > 0 and args.perf_stat_file is not None:
            perf_stat_file = open(args.perf_stat_file, 'w')
            perf_stat_file.writelines([
                "##############################OpenVino Model Performance Stats##############################"
            ])
        else:
            perf_stat_file = None

        # Initialise performance counters for all models
        total_model_load_time = 0
        all_model_infer_time = all_model_infer_time_total = 0
        all_model_infer_time_min, all_model_infer_time_max = float('inf'), 0
        face_detect_infer_time = face_detect_infer_time_total = 0
        face_detect_infer_time_min, face_detect_infer_time_max = float('inf'), 0
        face_landmarks_infer_time = face_landmarks_infer_time_total = 0
        face_landmarks_infer_time_min, face_landmarks_infer_time_max = float('inf'), 0
        head_estimation_infer_time = head_estimation_infer_time_total = 0
        head_estimation_infer_time_min, head_estimation_infer_time_max = float('inf'), 0
        gaze_estimation_infer_time = gaze_estimation_infer_time_total = 0
        gaze_estimation_infer_time_min, gaze_estimation_infer_time_max = float('inf'), 0

        # Instantiate the face detection class and load its model
        face_detect = FaceDetection(args.face_detect_model, args.device, args.cpu_extension)
        start_time = timeit.default_timer()
        face_detect.load_model()
        model_load_time = timeit.default_timer() - start_time
        total_model_load_time += model_load_time
        log_perf_stat("Face Detection Model Loading Time: {0:.1f}ms".format(model_load_time * 1000),
                      perf_stat_lvl, perf_stat_file)

        # Instantiate the facial landmarks detection class and load its model
        face_lm_detect = FaceLandmarksDetection(args.face_landmarks_model, args.device, args.cpu_extension)
        start_time = timeit.default_timer()
        face_lm_detect.load_model()
        model_load_time = timeit.default_timer() - start_time
        total_model_load_time += model_load_time
        log_perf_stat("Face Landmarks Detection Model Loading Time: {0:.1f}ms".format(model_load_time * 1000),
                      perf_stat_lvl, perf_stat_file)

        # Instantiate the head pose estimation class and load its model
        head_pose_estimate = HeadPoseEstimation(args.head_pose_model, args.device, args.cpu_extension)
        start_time = timeit.default_timer()
        head_pose_estimate.load_model()
        model_load_time = timeit.default_timer() - start_time
        total_model_load_time += model_load_time
        log_perf_stat("Head Pose Estimation Model Loading Time: {0:.1f}ms".format(model_load_time * 1000),
                      perf_stat_lvl, perf_stat_file)

        # Instantiate the gaze estimation class and load its model
        gaze_estimate = GazeEstimation(args.gaze_estimation_model, args.device, args.cpu_extension)
        start_time = timeit.default_timer()
        gaze_estimate.load_model()
        model_load_time = timeit.default_timer() - start_time
        total_model_load_time += model_load_time
        log_perf_stat("Gaze Estimation Model Loading Time: {0:.1f}ms".format(model_load_time * 1000),
                      perf_stat_lvl, perf_stat_file)

        # Instantiate the mouse controller and reset the pointer to the screen centre
        mouse_control = MouseController(args.mouse_prec, args.mouse_speed)
        mouse_control.move_mouse_to_center()

        # If the show-frame flag is set, open the output window
        if args.show_frame:
            cv2.namedWindow('Output Image', cv2.WINDOW_NORMAL)
            cv2.resizeWindow('Output Image', 600, 450)
            cv2.moveWindow('Output Image', 600, 300)

        frame_no = 0
        frame_no_with_face = 0
        try:
            for image in feed.next_batch():  # read frames one by one
                if image is None:
                    break
                if input_type == 'cam':
                    image = cv2.flip(image, 1)  # mirror camera input
                frame_no += 1
                image = cv2.resize(image, (1920, 1080))

                # Face detection inference pipeline (pre-process input, predict, process output)
                face_detect_infer_time, face_detected, bb_coord, annotated_image = \
                    run_infer_pipeline_face_detection(face_detect, image, args.prob_threshold, args.annot_frame)
                face_detect_infer_time_min, face_detect_infer_time_max, face_detect_infer_time_total = \
                    calculate_historical_infer_stats(face_detect_infer_time, face_detect_infer_time_min,
                                                     face_detect_infer_time_max, face_detect_infer_time_total)
                if perf_stat_lvl > 1:  # log per-frame stats at level 2 and above
                    log_perf_stat("Face Detect Model, Frame No. {} Infer time: {:.2f}ms".format(
                        frame_no, face_detect_infer_time * 1000), perf_stat_lvl, perf_stat_file)

                if face_detected:  # run the rest of the pipeline only when a face was found
                    frame_no_with_face += 1

                    # Facial landmarks detection inference pipeline
                    face_landmarks_infer_time, left_eye_image, right_eye_image, annotated_image = \
                        run_infer_pipeline_face_landmark_detection(face_lm_detect, image, bb_coord,
                                                                   annotated_image, args.annot_frame)
                    face_landmarks_infer_time_min, face_landmarks_infer_time_max, face_landmarks_infer_time_total = \
                        calculate_historical_infer_stats(face_landmarks_infer_time, face_landmarks_infer_time_min,
                                                         face_landmarks_infer_time_max, face_landmarks_infer_time_total)
                    if perf_stat_lvl > 1:
                        log_perf_stat("Face Landmarks Detection Model, Frame No. {} Infer time: {:.2f}ms".format(
                            frame_no_with_face, face_landmarks_infer_time * 1000), perf_stat_lvl, perf_stat_file)

                    # Head pose estimation inference pipeline
                    head_estimation_infer_time, head_angles, annotated_image = \
                        run_infer_pipeline_head_estimation(head_pose_estimate, image, bb_coord,
                                                           annotated_image, args.annot_frame)
                    head_estimation_infer_time_min, head_estimation_infer_time_max, head_estimation_infer_time_total = \
                        calculate_historical_infer_stats(head_estimation_infer_time, head_estimation_infer_time_min,
                                                         head_estimation_infer_time_max, head_estimation_infer_time_total)
                    if perf_stat_lvl > 1:
                        log_perf_stat("Head Angles Estimation Model, Frame No. {} Infer time: {:.2f}ms".format(
                            frame_no_with_face, head_estimation_infer_time * 1000), perf_stat_lvl, perf_stat_file)

                    # Gaze estimation inference pipeline
                    gaze_estimation_infer_time, annotated_image, gaze_output = \
                        run_infer_pipeline_gaze_estimation(gaze_estimate, image, left_eye_image, right_eye_image,
                                                           head_angles, annotated_image, args.annot_frame)
                    gaze_estimation_infer_time_min, gaze_estimation_infer_time_max, gaze_estimation_infer_time_total = \
                        calculate_historical_infer_stats(gaze_estimation_infer_time, gaze_estimation_infer_time_min,
                                                         gaze_estimation_infer_time_max, gaze_estimation_infer_time_total)
                    if perf_stat_lvl > 1:
                        log_perf_stat("Gaze Estimation Model, Frame No. {} Infer time: {:.2f}ms".format(
                            frame_no_with_face, gaze_estimation_infer_time * 1000), perf_stat_lvl, perf_stat_file)

                    # Combined inference time of all four models for this frame
                    all_model_infer_time = (face_detect_infer_time + face_landmarks_infer_time +
                                            head_estimation_infer_time + gaze_estimation_infer_time)
                    all_model_infer_time_min, all_model_infer_time_max, all_model_infer_time_total = \
                        calculate_historical_infer_stats(all_model_infer_time, all_model_infer_time_min,
                                                         all_model_infer_time_max, all_model_infer_time_total)
                    if perf_stat_lvl > 0:
                        log_perf_stat("All 4 Models Infer Time for Frame No. {}: {:.2f}ms".format(
                            frame_no_with_face, all_model_infer_time * 1000), perf_stat_lvl, perf_stat_file)

                    # Move the mouse by the relative x, y from the gaze estimation output
                    move_x = gaze_output[0][0]
                    move_y = gaze_output[0][1]
                    mouse_control.move(move_x, move_y)
                    annotation_text = "Total All Models Inference time: {:.2f}ms".format(
                        all_model_infer_time * 1000)
                else:
                    print("No Face Detected")
                    annotation_text = "No Face Detected"

                if args.annot_frame:
                    annotated_image = cv2.putText(annotated_image, annotation_text, (20, 90),
                                                  cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 4, cv2.LINE_AA)
                if args.show_frame:
                    cv2.imshow('Output Image', annotated_image)
                    if input_type == 'image':
                        cv2.waitKey(0)
                        break
                    elif cv2.waitKey(30) > 0:
                        break

            # Finally, log a summary of the performance stats for each model
            # and the combined four-model inference time across all frames
            if perf_stat_lvl > 0:
                log_perf_stat("#######Performance Summary Stats#######", perf_stat_lvl, perf_stat_file)
                if frame_no != 0:
                    face_detect_infer_time_avg = face_detect_infer_time_total / frame_no
                    log_perf_stat("Face Detect Model Inference Time Summary: Min {:.2f}ms, Avg {:.2f}ms, Max {:.2f}ms".format(
                        face_detect_infer_time_min * 1000, face_detect_infer_time_avg * 1000,
                        face_detect_infer_time_max * 1000), perf_stat_lvl, perf_stat_file)
                if frame_no_with_face != 0:
                    face_landmarks_infer_time_avg = face_landmarks_infer_time_total / frame_no_with_face
                    log_perf_stat("Face Landmarks Model Inference Time Summary: Min {:.2f}ms, Avg {:.2f}ms, Max {:.2f}ms".format(
                        face_landmarks_infer_time_min * 1000, face_landmarks_infer_time_avg * 1000,
                        face_landmarks_infer_time_max * 1000), perf_stat_lvl, perf_stat_file)
                    head_estimation_infer_time_avg = head_estimation_infer_time_total / frame_no_with_face
                    log_perf_stat("Head Estimation Model Inference Time Summary: Min {:.2f}ms, Avg {:.2f}ms, Max {:.2f}ms".format(
                        head_estimation_infer_time_min * 1000, head_estimation_infer_time_avg * 1000,
                        head_estimation_infer_time_max * 1000), perf_stat_lvl, perf_stat_file)
                    gaze_estimation_infer_time_avg = gaze_estimation_infer_time_total / frame_no_with_face
                    log_perf_stat("Gaze Estimation Model Inference Time Summary: Min {:.2f}ms, Avg {:.2f}ms, Max {:.2f}ms".format(
                        gaze_estimation_infer_time_min * 1000, gaze_estimation_infer_time_avg * 1000,
                        gaze_estimation_infer_time_max * 1000), perf_stat_lvl, perf_stat_file)
                    all_model_infer_time_avg = all_model_infer_time_total / frame_no_with_face
                    log_perf_stat("All 4 Models Total Inference Time Summary: Min {:.2f}ms, Avg {:.2f}ms, Max {:.2f}ms".format(
                        all_model_infer_time_min * 1000, all_model_infer_time_avg * 1000,
                        all_model_infer_time_max * 1000), perf_stat_lvl, perf_stat_file)

            feed.close()
            if perf_stat_file is not None:
                perf_stat_file.close()
            cv2.destroyAllWindows()
        except KeyboardInterrupt:
            if perf_stat_file is not None and not perf_stat_file.closed:
                perf_stat_file.close()
            if feed.is_cap_open:
                feed.close()
            cv2.destroyAllWindows()
            print("Keyboard Interrupt, Exiting!!!")
            sys.exit()
    except Exception as e:
        log.error("Unexpected Error Happened, see below for more details")
        log.error("Exception Error Type: {}".format(str(e)))
        log.error("###Below is traceback for Debug###")
        log.error(traceback.format_exc())
        log.error("Program will Exit!!!")
        sys.exit(1)
def main():
    # Build the arguments
    args = build_parser().parse_args()
    previewFlag = args.previewFlags
    log = logging.getLogger()

    input_path = args.input
    if input_path.lower() == 'cam':
        inputFeed = InputFeeder('cam')
    else:
        if not os.path.isfile(input_path):
            log.error("Unable to find the input file specified.")
            exit(1)
        inputFeed = InputFeeder('video', input_path)

    # Model paths
    model_path = {
        'FaceDetectionModel': args.facedetectionmodel,
        'FacialLandmarksDetectionModel': args.faciallandmarkmodel,
        'GazeEstimationModel': args.gazeestimationmodel,
        'HeadPoseEstimationModel': args.headposemodel
    }
    for fnameKey in model_path:
        if not os.path.isfile(model_path[fnameKey]):
            log.error('Unable to find the specified ' + fnameKey + ' binary file (.xml)')
            exit(1)

    # Model instances
    fd = FaceDetection(model_path['FaceDetectionModel'], args.device, args.cpu_extension)
    flm = FacialLandmarkDetection(model_path['FacialLandmarksDetectionModel'], args.device, args.cpu_extension)
    gm = GazeEstimation(model_path['GazeEstimationModel'], args.device, args.cpu_extension)
    hpe = Head_Pose_estimation(model_path['HeadPoseEstimationModel'], args.device, args.cpu_extension)
    m_control = MouseController('medium', 'fast')

    # Load data and models
    inputFeed.load_data()
    fd.load_model()
    flm.load_model()
    hpe.load_model()
    gm.load_model()

    frame_count = 0
    for ret, frame in inputFeed.next_batch():
        if not ret:
            break
        frame_count += 1
        # Poll the keyboard on every frame so `key` is always defined
        key = cv2.waitKey(60)
        if frame_count % 10 == 0:
            cv2.imshow('Original Video', cv2.resize(frame, (500, 500)))

        coords, img = fd.predict(frame, args.prob_threshold)
        if type(img) == int:
            log.error("No face detected")
            if key == 27:
                break
            continue

        hpout = hpe.predict(img)
        left_eye, right_eye, eye_coord = flm.predict(img)
        mouse_coord, gaze_vec = gm.predict(left_eye, right_eye, hpout)

        if len(previewFlag) != 0:
            # 'fd' preview is the cropped face, which preview_img already is
            preview_img = img
            if 'fld' in previewFlag:
                start_l = (eye_coord[0][0] - 10, eye_coord[0][1] - 10)
                end_l = (eye_coord[0][2] + 10, eye_coord[0][3] + 10)
                start_r = (eye_coord[1][0] - 10, eye_coord[1][1] - 10)
                end_r = (eye_coord[1][2] + 10, eye_coord[1][3] + 10)
                cv2.rectangle(img, start_l, end_l, (0, 255, 0), 2)
                cv2.rectangle(img, start_r, end_r, (0, 255, 0), 2)
            if 'hp' in previewFlag:
                cv2.putText(preview_img,
                            "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".format(
                                hpout[0], hpout[1], hpout[2]),
                            (10, 20), cv2.FONT_HERSHEY_COMPLEX, 0.25, (255, 255, 255), 1)
            if 'ge' in previewFlag:
                x, y, w = int(gaze_vec[0] * 12), int(gaze_vec[1] * 12), 160
                lefteye = cv2.line(left_eye, (x - w, y - w), (x + w, y + w), (100, 0, 255), 1)
                cv2.line(lefteye, (x - w, y + w), (x + w, y - w), (100, 0, 255), 1)
                righteye = cv2.line(right_eye, (x - w, y - w), (x + w, y + w), (100, 0, 255), 1)
                cv2.line(righteye, (x - w, y + w), (x + w, y - w), (100, 0, 255), 1)
                img[eye_coord[0][1]:eye_coord[0][3], eye_coord[0][0]:eye_coord[0][2]] = lefteye
                img[eye_coord[1][1]:eye_coord[1][3], eye_coord[1][0]:eye_coord[1][2]] = righteye
            cv2.imshow("Detections", cv2.resize(preview_img, (500, 500)))

        if frame_count % 10 == 0:
            m_control.move(mouse_coord[0], mouse_coord[1])
        if key == 27:
            break

    log.error("Videostream Completed")
    cv2.destroyAllWindows()
    inputFeed.close()
def main():
    # Get command line arguments
    args = parser.parse_args()
    device = args.device
    cpu_extensions = args.extensions
    threshold = args.threshold
    gaze_estimation_precision = args.gaze_estimation_precision
    head_pose_precision = args.head_pose_precision
    face_detection_precision = args.face_detection_precision
    landmarks_precision = args.landmarks_precision

    input_feeder = InputFeeder(args)
    control_mouse = MouseController(args)

    gaze_model = 'models/intel/gaze-estimation-adas-0002/{}/gaze-estimation-adas-0002'.format(
        gaze_estimation_precision)
    face_detector_model = 'models/intel/face-detection-adas-binary-0001/{}/face-detection-adas-binary-0001'.format(
        face_detection_precision)
    facial_landmark_model = 'models/intel/landmarks-regression-retail-0009/{}/landmarks-regression-retail-0009'.format(
        landmarks_precision)
    head_pose_model = 'models/intel/head-pose-estimation-adas-0001/{}/head-pose-estimation-adas-0001'.format(
        head_pose_precision)

    # Initialize the models
    face_detector = FaceDetector(face_detector_model, args)
    facial_landmarks = FacialLandmarksDetector(model_name=facial_landmark_model,
                                               device=device, extensions=cpu_extensions)
    head_pose_estimation = HeadPoseEstimation(model_name=head_pose_model,
                                              device=device, extensions=cpu_extensions)
    gaze_estimation = GazeEstimation(model_name=gaze_model,
                                     device=device, extensions=cpu_extensions)

    # Load the models, timing each load
    start_time = time.time()
    face_detector.load_model()
    face_detector_loadtime = time.time() - start_time

    start_time = time.time()
    facial_landmarks.load_model()
    facial_landmark_loadtime = time.time() - start_time

    start_time = time.time()
    head_pose_estimation.load_model()
    head_pose_estimation_loadtime = time.time() - start_time

    start_time = time.time()
    gaze_estimation.load_model()
    gaze_estimation_loadtime = time.time() - start_time
    logging.info('FINISHED LOADING MODELS')

    try:
        width, height = input_feeder.load_data()
    except TypeError:
        logging.error('Invalid file type.')
        return

    output_handler = OutputHandler(args)
    output_handler.initalize_video_writer(width, height)

    frame_count = 0
    start_time = 0
    capture = input_feeder.cap
    inputs = args.input
    if input_feeder.input_type == 'cam':
        inputs = 0
    else:
        capture.open(inputs)

    while capture.isOpened():
        flag, frame = capture.read()
        if not flag:
            break
        if start_time == 0:
            start_time = time.time()
        if inputs == 0 and time.time() - start_time >= 1:
            # Webcam input: run inference roughly once per second
            gaze_estimate = run_inference(frame, face_detector, facial_landmarks,
                                          head_pose_estimation, gaze_estimation, output_handler)
            if gaze_estimate is None:
                break
            if gaze_estimate[0][0]:
                x, y = gaze_estimate[0][:2]
                control_mouse.move(x, y)
                start_time = 0
            frame_count += 1
        elif inputs != 0:
            # File input: run inference on every frame, move the mouse at most twice per second
            gaze_estimate = run_inference(frame, face_detector, facial_landmarks,
                                          head_pose_estimation, gaze_estimation, output_handler)
            if gaze_estimate is None:
                break
            if gaze_estimate[0][0] and time.time() - start_time >= 0.5:
                x, y = gaze_estimate[0][:2]
                control_mouse.move(x, y)
                start_time = 0
            frame_count += 1

    input_feeder.close()
    logging.info('TOTAL FRAMES PROCESSED: {}'.format(frame_count))
    logging.info('Time to load face detector model is {:.5f}'.format(face_detector_loadtime))
    logging.info('Time to load head pose estimation model is {:.5f}'.format(head_pose_estimation_loadtime))
    logging.info('Time to load facial landmarks model is {:.5f}'.format(facial_landmark_loadtime))
    logging.info('Time to load gaze estimation model is {:.5f}'.format(gaze_estimation_loadtime))
def main():
    args = argparser().parse_args()
    device = args.device
    input_feed = args.input
    log = logging.getLogger()

    model_paths = {
        'facedet': args.face_detection_model + '.xml',
        'faceldmdet': args.landmark_detection_model + '.xml',
        'headpose': args.pose_estimation_model + '.xml',
        'gaze': args.gaze_estimation_model + '.xml'
    }
    for mp in model_paths:
        if not os.path.isfile(model_paths[mp]):
            print(model_paths[mp])
            print('Recheck file path and try again')
            log.error("Not a file")
            raise FileNotFoundError

    if input_feed == 'cam':
        feed = InputFeeder(input_type='cam')
    elif not os.path.isfile(input_feed):
        print('Recheck file path and try again')
        log.error("Unable to find specified video file")
        raise FileNotFoundError
    else:
        feed = InputFeeder(input_type='video', input_file=input_feed)

    facedet = FaceDetection(args.face_detection_model, args.device, args.extensions, args.async_mode)
    faceldmdet = FacialLandmarksDetection(args.landmark_detection_model, args.device,
                                          args.extensions, args.async_mode)
    headpose = HeadPose(args.pose_estimation_model, args.device, args.extensions, args.async_mode)
    gaze = GazeEstimation(args.gaze_estimation_model, args.device, args.extensions, args.async_mode)

    try:
        log.info('Loading models...')
        facedet.load_model()
        faceldmdet.load_model()
        headpose.load_model()
        gaze.load_model()
        feed.load_data()
        log.info('Models loaded successfully!')
    except Exception:
        log.error('One or more of the models failed to load..')
        exit(1)

    log.info('Initializing mouse controller')
    mouse = MouseController(precision='medium', speed='fast')

    for batch in feed.next_batch():
        face = facedet.predict(batch)
        eyes, eye_coords = faceldmdet.predict(face)
        pose = headpose.predict(face)
        point = gaze.predict(pose, eyes)
        log.info('All inference complete')

        if args.input == 'cam':
            # Mirror the x-axis for webcam input
            point[0] = -point[0]
        mouse.move(point[0], point[1])
        if args.view_intermediate:
            visualize(pose, face, eye_coords, point)
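# main() above calls a visualize() helper that is not shown. A minimal,
# hypothetical sketch follows, assuming eye_coords is a list of
# (x_min, y_min, x_max, y_max) boxes on the cropped face and that pose
# and point are sequences of angles and gaze components.
import cv2

def visualize(pose, face, eye_coords, point):
    # Overlay the intermediate outputs on a copy of the cropped face:
    # eye boxes, head-pose angles and the gaze vector.
    preview = face.copy()
    for (x_min, y_min, x_max, y_max) in eye_coords:
        cv2.rectangle(preview, (x_min, y_min), (x_max, y_max), (0, 255, 0), 1)
    cv2.putText(preview, 'yaw:{:.2f} pitch:{:.2f} roll:{:.2f}'.format(*pose),
                (10, 20), cv2.FONT_HERSHEY_COMPLEX, 0.4, (255, 0, 0), 1)
    cv2.putText(preview, 'gaze x:{:.2f} y:{:.2f}'.format(point[0], point[1]),
                (10, 40), cv2.FONT_HERSHEY_COMPLEX, 0.4, (255, 0, 0), 1)
    cv2.imshow('Intermediate outputs', cv2.resize(preview, (400, 400)))
    cv2.waitKey(1)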
def infer_on_stream(args): """ Initialize the inference network, stream video to network, and output stats and video. :param args: Command line arguments parsed by `build_argparser()` :return: None """ try: logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", handlers=[ logging.FileHandler("gaze-app.log"), logging.StreamHandler() ]) # Initialise the class mc = MouseController("low", "fast") #mc.move(100,100) fdnet = FaceDetection(args.fdmodel) lmnet = FacialLandmarks(args.lmmodel) hpnet = HeadPoseEstimation(args.hpmodel) genet = GazeEstimation(args.gemodel) ### Load the model through ### logging.info("============== Models Load time ===============") start_time = time.time() fdnet.load_model() logging.info("Face Detection Model: {:.1f}ms".format( 1000 * (time.time() - start_time))) fdnet.check_model() logging.info("Face Detection estimation layers loaded correctly") start_time = time.time() lmnet.load_model() logging.info("Facial Landmarks Detection Model: {:.1f}ms".format( 1000 * (time.time() - start_time))) lmnet.check_model() logging.info("Facial Landmarks estimation layers loaded correctly") start_time = time.time() hpnet.load_model() logging.info("Headpose Estimation Model: {:.1f}ms".format( 1000 * (time.time() - start_time))) hpnet.check_model() logging.info("Head pose estimation layers loaded correctly") start_time = time.time() genet.load_model() logging.info("Gaze Estimation Model: {:.1f}ms".format( 1000 * (time.time() - start_time))) genet.check_model() logging.info("Gaze estimation layers loaded correctly") logging.info("============== End =====================") # Get and open video capture feeder = InputFeeder('video', args.input) feeder.load_data() # FPS = feeder.get_fps() # Grab the shape of the input # width = feeder.get_width() # height = feeder.get_height() # init scene variables frame_count = 0 ### Loop until stream is over ### fd_infertime = 0 lm_infertime = 0 hp_infertime = 0 ge_infertime = 0 while True: # Read the next frame try: frame = next(feeder.next_batch()) except StopIteration: break key_pressed = cv2.waitKey(60) frame_count += 1 #print(int((frame_count) % int(FPS))) # face detection fd_process_time = time.time() p_frame = fdnet.preprocess_input(frame) start_time = time.time() fnoutput = fdnet.predict(p_frame) fd_infertime += time.time() - start_time out_frame, fboxes = fdnet.preprocess_output( fnoutput, frame, args.print) logging.info( "Face Detection Model processing time : {:.1f}ms".format( 1000 * (time.time() - fd_process_time))) #for each face for fbox in fboxes: # fbox = (xmin,ymin,xmax,ymax) # get face landmarks # crop face from frame face = frame[fbox[1]:fbox[3], fbox[0]:fbox[2]] lm_process_time = time.time() p_frame = lmnet.preprocess_input(face) start_time = time.time() lmoutput = lmnet.predict(p_frame) lm_infertime += time.time() - start_time out_frame, left_eye_point, right_eye_point = lmnet.preprocess_output( lmoutput, fbox, out_frame, args.print) logging.info( "Landmarks model processing time : {:.1f}ms".format( 1000 * (time.time() - lm_process_time))) # get head pose estimation hp_process_time = time.time() p_frame = hpnet.preprocess_input(face) start_time = time.time() hpoutput = hpnet.predict(p_frame) hp_infertime += time.time() - start_time out_frame, headpose_angels = hpnet.preprocess_output( hpoutput, out_frame, face, fbox, args.print) logging.info( "Headpose estimation model processing time : {:.1f}ms". 
format(1000 * (time.time() - hp_process_time))) # get gaze estimation gaze_process_time = time.time() out_frame, left_eye, right_eye = genet.preprocess_input( out_frame, face, left_eye_point, right_eye_point, args.print) start_time = time.time() geoutput = genet.predict(left_eye, right_eye, headpose_angels) ge_infertime += time.time() - start_time out_frame, gazevector = genet.preprocess_output( geoutput, out_frame, fbox, left_eye_point, right_eye_point, args.print) logging.info( "Gaze estimation model processing time : {:.1f}ms".format( 1000 * (time.time() - gaze_process_time))) if (not args.no_video): cv2.imshow('im', out_frame) if (not args.no_move): mc.move(gazevector[0], gazevector[1]) #consider only first detected face in the frame break # Break if escape key pressed if key_pressed == 27: break #logging inference times if (frame_count > 0): logging.info( "============== Models Inference time ===============") logging.info("Face Detection:{:.1f}ms".format(1000 * fd_infertime / frame_count)) logging.info("Facial Landmarks Detection:{:.1f}ms".format( 1000 * lm_infertime / frame_count)) logging.info("Headpose Estimation:{:.1f}ms".format( 1000 * hp_infertime / frame_count)) logging.info("Gaze Estimation:{:.1f}ms".format( 1000 * ge_infertime / frame_count)) logging.info("============== End ===============================") # Release the capture and destroy any OpenCV windows feeder.close() cv2.destroyAllWindows() except Exception as ex: logging.exception("Error in inference:" + str(ex))
def main():
    # Command line args
    args = build_argparser().parse_args()
    input_file_path = args.input
    log_object = log.getLogger()
    preview_flags = args.visualization_flag

    # Initialise the model classes
    fd_object = FaceDetection(model_name=args.face_detection_model, device=args.device,
                              threshold=args.prob_threshold, extensions=args.cpu_extension)
    fl_object = FacialLandmarkDetection(model_name=args.facial_landmarks_model,
                                        device=args.device, extensions=args.cpu_extension)
    hp_object = HeadPoseEstimation(model_name=args.head_pose_model,
                                   device=args.device, extensions=args.cpu_extension)
    ge_object = GazeEstimation(model_name=args.gaze_estimation_model,
                               device=args.device, extensions=args.cpu_extension)
    mouse_controller_object = MouseController('low', 'fast')

    ### Load the models ###
    log_object.error("=================== Models Load Time ====================")
    start_time = time.time()
    fd_object.load_model()
    log_object.error("Face detection model loaded in {:.3f} ms".format(
        (time.time() - start_time) * 1000))

    fl_start = time.time()
    fl_object.load_model()
    log_object.error("Facial landmarks detection model loaded in {:.3f} ms".format(
        (time.time() - fl_start) * 1000))

    hp_start = time.time()
    hp_object.load_model()
    log_object.error("Head pose estimation model loaded in {:.3f} ms".format(
        (time.time() - hp_start) * 1000))

    ge_start = time.time()
    ge_object.load_model()
    log_object.error("Gaze estimation model loaded in {:.3f} ms".format(
        (time.time() - ge_start) * 1000))

    total_time = time.time() - start_time
    log_object.error("=================== Models loaded successfully ===================")
    log_object.error("Total loading time is {:.3f} ms".format(total_time * 1000))

    counter = 0
    infer_start = time.time()
    log_object.error("=================== Start inferencing on input video ====================")

    if input_file_path == "CAM":
        input_feeder = InputFeeder("cam")
    else:
        if not os.path.isfile(input_file_path):
            log_object.error("Unable to find specified input file")
            exit(1)
        input_feeder = InputFeeder("video", input_file_path)
    log_object.error("Input feeder is loaded")
    input_feeder.load_data()

    for frame in input_feeder.next_batch():
        pressed_key = cv2.waitKey(60)
        counter += 1

        face_coordinates, face_image = fd_object.predict(frame.copy())
        if face_coordinates == 0:
            continue

        hp_output = hp_object.predict(face_image)
        left_eye_image, right_eye_image, eye_coord = fl_object.predict(face_image)
        mouse_coordinate, gaze_vector = ge_object.predict(left_eye_image, right_eye_image, hp_output)

        if len(preview_flags) != 0:
            preview_window = frame.copy()
            if 'fd' in preview_flags:
                if len(preview_flags) != 1:
                    preview_window = face_image
                else:
                    cv2.rectangle(preview_window,
                                  (face_coordinates[0], face_coordinates[1]),
                                  (face_coordinates[2], face_coordinates[3]),
                                  (0, 150, 0), 3)
            if 'fl' in preview_flags:
                if 'fd' not in preview_flags:
                    preview_window = face_image.copy()
                cv2.rectangle(preview_window, (eye_coord[0][0], eye_coord[0][1]),
                              (eye_coord[0][2], eye_coord[0][3]), (150, 0, 150))
                cv2.rectangle(preview_window, (eye_coord[1][0], eye_coord[1][1]),
                              (eye_coord[1][2], eye_coord[1][3]), (150, 0, 150))
            if 'hp' in preview_flags:
                cv2.putText(preview_window,
                            "yaw:{:.1f} | pitch:{:.1f} | roll:{:.1f}".format(
                                hp_output[0], hp_output[1], hp_output[2]),
                            (20, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.35, (0, 0, 0), 1)
            if 'ge' in preview_flags:
                yaw, pitch, roll = hp_output[0], hp_output[1], hp_output[2]
                focal_length = 950.0
                scale = 50
                center_of_face = (face_image.shape[1] / 2, face_image.shape[0] / 2, 0)
                if 'fd' in preview_flags or 'fl' in preview_flags:
                    draw_axes(preview_window, center_of_face, yaw, pitch, roll, scale, focal_length)
                else:
                    draw_axes(frame, center_of_face, yaw, pitch, roll, scale, focal_length)

        if len(preview_flags) != 0:
            img_hor = np.hstack((cv2.resize(frame, (500, 500)),
                                 cv2.resize(preview_window, (500, 500))))
        else:
            img_hor = cv2.resize(frame, (500, 500))
        cv2.imshow('Visualization', img_hor)

        mouse_controller_object.move(mouse_coordinate[0], mouse_coordinate[1])
        if pressed_key == 27:
            log_object.error("Exit key was pressed..")
            break

    infer_time = round(time.time() - infer_start, 1)
    fps = int(counter) / infer_time
    log_object.error("counter {} frames".format(counter))
    log_object.error("total inference time {} seconds".format(infer_time))
    log_object.error("fps {} frame/second".format(fps))
    log_object.error("Video session has ended")

    with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'stats.txt'), 'w') as f:
        f.write(str(infer_time) + '\n')
        f.write(str(fps) + '\n')
        f.write(str(total_time) + '\n')

    input_feeder.close()
    cv2.destroyAllWindows()
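# Several of the scripts here call a draw_axes() helper that is not shown
# (the settings comment points to https://knowledge.udacity.com/questions/171017).
# This is a hypothetical reconstruction, not the original: it rotates the
# three head axes by the pose angles (in degrees) and projects them onto
# the image around the face centre with a simple perspective scaling.
import numpy as np
import cv2

def draw_axes(frame, center_of_face, yaw, pitch, roll, scale, focal_length):
    yaw, pitch, roll = np.radians([yaw, pitch, roll])
    # Per-axis rotation matrices combined into one head rotation
    Rx = np.array([[1, 0, 0],
                   [0, np.cos(pitch), -np.sin(pitch)],
                   [0, np.sin(pitch), np.cos(pitch)]])
    Ry = np.array([[np.cos(yaw), 0, -np.sin(yaw)],
                   [0, 1, 0],
                   [np.sin(yaw), 0, np.cos(yaw)]])
    Rz = np.array([[np.cos(roll), -np.sin(roll), 0],
                   [np.sin(roll), np.cos(roll), 0],
                   [0, 0, 1]])
    R = Rz @ Ry @ Rx
    cx, cy = int(center_of_face[0]), int(center_of_face[1])
    # Columns are the rotated x (right), y (up) and z (toward camera) axes
    axes = scale * R @ np.array([[1, 0, 0], [0, -1, 0], [0, 0, -1]], dtype=float)
    for axis, color in zip(axes.T, [(0, 0, 255), (0, 255, 0), (255, 0, 0)]):
        x2 = cx + int(axis[0] * focal_length / (axis[2] + focal_length))
        y2 = cy + int(axis[1] * focal_length / (axis[2] + focal_length))
        cv2.line(frame, (cx, cy), (x2, y2), color, 2)
    return frame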
def main():
    args = build_argparser().parse_args()
    device_name = args.device
    prob_threshold = args.prob_threshold
    logger_object = log.getLogger()

    # Initialise variables with the input arguments
    model_path_dict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'FacialLandmarkModel': args.facialLandmarksModel,
        'HeadPoseEstimationModel': args.headPoseEstimationModel,
        'GazeEstimationModel': args.gazeEstimationModel
    }

    # Instantiate the models
    face_model = FaceDetection(model_path_dict['FaceDetectionModel'], device_name,
                               threshold=prob_threshold)
    landmark_model = FacialLandmarksDetection(model_path_dict['FacialLandmarkModel'],
                                              device_name, threshold=prob_threshold)
    head_pose_model = HeadPoseEstimation(model_path_dict['HeadPoseEstimationModel'],
                                         device_name, threshold=prob_threshold)
    gaze_model = GazeEstimation(model_path_dict['GazeEstimationModel'], device_name,
                                threshold=prob_threshold)
    mouse_controller = MouseController('medium', 'fast')

    # Load the models, timing each load
    start_time = time.time()
    face_model.load_model()
    logger_object.error("Face detection model loaded: time: {:.3f} ms".format(
        (time.time() - start_time) * 1000))
    first_mark = time.time()
    landmark_model.load_model()
    logger_object.error("Facial landmarks detection model loaded: time: {:.3f} ms".format(
        (time.time() - first_mark) * 1000))
    second_mark = time.time()
    head_pose_model.load_model()
    logger_object.error("Head pose estimation model loaded: time: {:.3f} ms".format(
        (time.time() - second_mark) * 1000))
    third_mark = time.time()
    gaze_model.load_model()
    logger_object.error("Gaze estimation model loaded: time: {:.3f} ms".format(
        (time.time() - third_mark) * 1000))
    load_total_time = time.time() - start_time
    logger_object.error("Total loading time: time: {:.3f} ms".format(load_total_time * 1000))
    logger_object.error("All models are loaded successfully..")

    # Check the models for unsupported layers
    face_model.check_model()
    landmark_model.check_model()
    head_pose_model.check_model()
    gaze_model.check_model()

    preview_flags = args.previewFlags
    input_filename = args.input
    output_path = args.output_path
    prob_threshold = args.prob_threshold

    if input_filename.lower() == 'cam':
        input_feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            logger_object.error("Unable to find specified video file")
            exit(1)
        input_feeder = InputFeeder(input_type='video', input_file=input_filename)

    for model_path in list(model_path_dict.values()):
        if not os.path.isfile(model_path):
            logger_object.error("Unable to find specified model file " + str(model_path))
            exit(1)

    input_feeder.load_data()
    width = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(input_feeder.cap.get(cv2.CAP_PROP_FPS))
    out_video = cv2.VideoWriter(os.path.join(output_path, 'output_video.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'), fps, (width, height), True)

    frame_counter = 0
    start_inf_time = time.time()
    for ret, frame in input_feeder.next_batch():
        if not ret:
            break
        frame_counter += 1
        key = cv2.waitKey(60)
        try:
            cropped_image, face_cords = face_model.predict(frame, prob_threshold)
            if type(cropped_image) == int:
                print("Unable to detect the face")
                if key == 27:
                    break
                continue
            left_eye, right_eye, eye_cords = landmark_model.predict(cropped_image)
            pose_output = head_pose_model.predict(cropped_image)
            x, y, z = gaze_model.predict(left_eye, right_eye, pose_output, cropped_image, eye_cords)
            mouse_controller.move(x, y)
        except Exception as e:
            print(str(e) + " for frame " + str(frame_counter))
            continue

        image = cv2.resize(frame, (width, height))
        if len(preview_flags) != 0:
            preview_frame = frame.copy()
            if 'fd' in preview_flags:
                if len(preview_flags) != 1:
                    preview_frame = cropped_image
                cv2.rectangle(frame, (face_cords[0], face_cords[1]),
                              (face_cords[2], face_cords[3]), (0, 0, 255), 3)
            if 'hp' in preview_flags:
                cv2.putText(frame,
                            "Pose Angles: yaw= {:.2f} , pitch= {:.2f} , roll= {:.2f}".format(
                                pose_output[0], pose_output[1], pose_output[2]),
                            (20, 40), cv2.FONT_HERSHEY_DUPLEX, 1, (255, 0, 0), 3)
            if 'ge' in preview_flags:
                cv2.putText(frame,
                            "Gaze vector: x= {:.2f} , y= {:.2f} , z= {:.2f}".format(x, y, z),
                            (15, 100), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 3)
            image = np.hstack((cv2.resize(frame, (500, 500)),
                               cv2.resize(preview_frame, (500, 500))))

        cv2.imshow('preview', image)
        # Resize back to the writer's frame size so the video is actually written
        out_video.write(cv2.resize(image, (width, height)))

        if frame_counter % 5 == 0:
            mouse_controller.move(x, y)
        if key == 27:
            break

    inference_time = round(time.time() - start_inf_time, 1)
    fps = int(frame_counter) / inference_time
    logger_object.error("counter {} frames".format(frame_counter))
    logger_object.error("total inference time {} seconds".format(inference_time))
    logger_object.error("fps {} frame/second".format(fps))

    with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'stats.txt'), 'w') as f:
        f.write('inference time : ' + str(inference_time) + '\n')
        f.write('fps: ' + str(fps) + '\n')
        f.write('Models Loading: ' + str(load_total_time) + '\n')

    logger_object.error('Video stream ended')
    cv2.destroyAllWindows()
    input_feeder.close()
def infer(args, logging_enabled):
    """
    Run inference on the input video, display/save the output video.
    """
    face_detection = FaceDetection(args.face_detection)
    facial_landmark_detection = FacialLandmarkDetection(args.facial_landmark_detection)
    gaze_estimation = GazeEstimation(args.gaze_estimation)
    head_pose_estimation = HeadPoseEstimation(args.head_pose_estimation)

    # Load all four models, recording a timestamp between each load
    load_start = now()
    face_detection.load_model()
    fl_start = now()
    facial_landmark_detection.load_model()
    ge_start = now()
    gaze_estimation.load_model()
    hp_start = now()
    head_pose_estimation.load_model()
    log_model_load_times(logging_enabled, load_start, fl_start, ge_start, hp_start)

    feeder = InputFeeder("video", args.input)
    feeder.load_data()
    frame_count, fd_time, fl_time, ge_time, hp_time = [0] * 5

    while True:
        key = cv2.waitKey(20)
        try:
            frame = next(feeder.next_batch())
        except StopIteration:
            break
        frame_count += 1

        # Face detection
        fd_frame = face_detection.preprocess_input(frame)
        inf_start = now()
        fd_output = face_detection.predict(fd_frame)
        fd_time += now() - inf_start
        out_frame, faces = face_detection.preprocess_output(
            fd_output, frame, args.overlay_inference, args.probability_threshold)
        detected_face = frame[faces[0][1]:faces[0][3], faces[0][0]:faces[0][2]]

        # Facial landmarks
        fl_frame = facial_landmark_detection.preprocess_input(detected_face)
        fl_start = now()
        fl_output = facial_landmark_detection.predict(fl_frame)
        fl_time += now() - fl_start
        out_frame, l_coord, r_coord = facial_landmark_detection.preprocess_output(
            fl_output, faces[0], out_frame, args.overlay_inference)

        # Head pose estimation
        hp_frame = head_pose_estimation.preprocess_input(detected_face)
        hp_start = now()
        hp_output = head_pose_estimation.predict(hp_frame)
        hp_time += now() - hp_start
        out_frame, head_pose = head_pose_estimation.preprocess_output(
            hp_output, out_frame, detected_face, faces[0], args.overlay_inference)

        # Gaze estimation
        out_frame, l_eye, r_eye = gaze_estimation.preprocess_input(
            out_frame, detected_face, l_coord, r_coord, args.overlay_inference)
        ge_start = now()
        ge_output = gaze_estimation.predict(head_pose, l_eye, r_eye)
        ge_time += now() - ge_start
        out_frame, g_vec = gaze_estimation.preprocess_output(
            ge_output, out_frame, faces[0], l_coord, r_coord, args.overlay_inference)

        if args.video_window:
            cv2.imshow(
                "Computer-Human Interface Peripheral Signal Manipulation via AI Retina Tracking (CHIPSMART)",
                out_frame,
            )
        if args.mouse_control and frame_count % 6 == 0:
            mouse_control.move(g_vec[0], g_vec[1])

        # Quit if the user presses Esc or Q
        if key in (27, 81):
            user_quit(logging_enabled)
            break

    log_inference_times(logging_enabled, frame_count, fd_time, fl_time, ge_time, hp_time)
    feeder.close()
    cv2.destroyAllWindows()
    quit()
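# infer() assumes a now() timing alias and two logging helpers that are
# not shown. These are hypothetical reconstructions inferred from the call
# sites, not the project's originals: each start marker doubles as the
# previous model's end marker.
import time
import logging

now = time.time

def log_model_load_times(logging_enabled, load_start, fl_start, ge_start, hp_start):
    if logging_enabled:
        logging.info("Face detection load time: {:.1f}ms".format(1000 * (fl_start - load_start)))
        logging.info("Facial landmark load time: {:.1f}ms".format(1000 * (ge_start - fl_start)))
        logging.info("Gaze estimation load time: {:.1f}ms".format(1000 * (hp_start - ge_start)))
        logging.info("Head pose load time: {:.1f}ms".format(1000 * (now() - hp_start)))

def log_inference_times(logging_enabled, frame_count, fd_time, fl_time, ge_time, hp_time):
    if logging_enabled and frame_count > 0:
        for name, total in (("Face detection", fd_time), ("Facial landmarks", fl_time),
                            ("Gaze estimation", ge_time), ("Head pose", hp_time)):
            logging.info("{} mean inference time: {:.1f}ms".format(name, 1000 * total / frame_count))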
class MoveMouse:
    '''
    Main class for the mouse controller app. This is the class where all
    the models are stitched together to control the mouse pointer.
    '''

    def __init__(self, args):
        '''
        Instances variables for the MoveMouse class, then initializes,
        loads and times all four models.

        Args:
        args = All arguments parsed by the argument parser function

        Return:
        None
        '''
        init_start_time = time.time()
        self.output_path = args.output_path
        self.show_output = args.show_output
        self.total_processing_time = 0
        self.count_batch = 0
        self.inference_speed = []
        self.avg_inference_speed = 0

        if args.all_devices != 'CPU':
            args.face_device = args.all_devices
            args.face_landmark_device = args.all_devices
            args.head_pose_device = args.all_devices
            args.gaze_device = args.all_devices

        model_init_start = time.time()
        self.face_model = FaceDetection(args.face_model, args.face_device,
                                        args.face_device_ext, args.face_prob_threshold)
        self.landmarks_model = FacialLandmarksDetection(args.face_landmark_model,
                                                        args.face_landmark_device,
                                                        args.face_landmark_device_ext,
                                                        args.face_landmark_prob_threshold)
        self.head_pose_model = HeadPoseEstimation(args.head_pose_model,
                                                  args.head_pose_device,
                                                  args.head_pose_device_ext,
                                                  args.head_pose_prob_threshold)
        self.gaze_model = GazeEstimation(args.gaze_model, args.gaze_device,
                                         args.gaze_device_ext, args.gaze_prob_threshold)
        self.model_init_time = time.time() - model_init_start
        log.info('[ Main ] All required models initialized')

        self.mouse_control = MouseController(args.precision, args.speed)
        log.info('[ Main ] Mouse controller successfully initialized')

        self.input_feeder = InputFeeder(args.batch_size, args.input_type, args.input_file)
        log.info('[ Main ] Initialized input feeder')

        model_load_start = time.time()
        self.face_model.load_model()
        self.landmarks_model.load_model()
        self.head_pose_model.load_model()
        self.gaze_model.load_model()
        self.model_load_time = time.time() - model_load_start
        self.app_init_time = time.time() - init_start_time
        log.info('[ Main ] All models loaded to Inference Engine\n')

    def draw_face_box(self, frame, face_coords):
        '''
        Draws the face's bounding box on the input frame.

        Args:
        frame = Input frame from a video or camera feed. It could also be an input image

        Return:
        frame = Frame with the face's bounding box drawn on it
        '''
        start_point = (face_coords[0][0], face_coords[0][1])
        end_point = (face_coords[0][2], face_coords[0][3])
        thickness = 5
        color = (255, 86, 0)
        frame = cv2.rectangle(frame, start_point, end_point, color, thickness)
        return frame

    def draw_eyes_boxes(self, frame, left_eye_coords, right_eye_coords):
        '''
        Draws the eyes' bounding boxes on the input frame.

        Args:
        frame = Input frame from a video or camera feed. It could also be an input image

        Return:
        frame = Frame with the bounding boxes of the left and right eyes drawn on it
        '''
        left_eye_start_point = (left_eye_coords[0], left_eye_coords[1])
        left_eye_end_point = (left_eye_coords[2], left_eye_coords[3])
        right_eye_start_point = (right_eye_coords[0], right_eye_coords[1])
        right_eye_end_point = (right_eye_coords[2], right_eye_coords[3])
        thickness = 5
        color = (0, 210, 0)
        frame = cv2.rectangle(frame, left_eye_start_point, left_eye_end_point, color, thickness)
        frame = cv2.rectangle(frame, right_eye_start_point, right_eye_end_point, color, thickness)
        return frame

    def draw_outputs(self, frame):
        '''
        Draws the inference outputs of the four models (bounding boxes of
        the face and both eyes, plus processing stats) onto the frame.

        Args:
        frame = Input frame from a video or camera feed. It could also be an input image

        Return:
        frame = Frame with all inference outputs drawn on it
        '''
        frame = self.draw_face_box(frame, self.face_coords)
        frame = self.draw_eyes_boxes(frame, self.left_eye_coords, self.right_eye_coords)
        frame_id = f'Batch id = {self.count_batch}'
        avg_inference_speed = f'Avg. inference speed = {self.avg_inference_speed:.3f}fps'
        total_processing_time = f'Total infer. time = {self.total_processing_time:.3f}s'
        cv2.putText(frame, frame_id, (15, 15),
                    cv2.FONT_HERSHEY_COMPLEX, 0.45, (255, 86, 0), 1)
        cv2.putText(frame, avg_inference_speed, (15, 30),
                    cv2.FONT_HERSHEY_COMPLEX, 0.45, (255, 86, 0), 1)
        cv2.putText(frame, total_processing_time, (15, 45),
                    cv2.FONT_HERSHEY_COMPLEX, 0.45, (255, 86, 0), 1)
        return frame

    def run_inference(self, frame):
        '''
        Performs inference on the input video or image by passing it
        through all four models to get the desired coordinates for
        moving the mouse pointer.

        Args:
        frame = Input image, frame from a video or camera feed

        Return:
        None
        '''
        self.input_feeder.load_data()
        for frame in self.input_feeder.next_batch():
            if self.input_feeder.frame_flag:
                log.info('[ Main ] Started processing a new batch')
                start_inference = time.time()
                self.face_coords, self.face_crop = self.face_model.predict(frame)

                if self.face_coords == []:
                    log.info('[ Main ] No face detected.. Waiting for you to stare at the camera')
                    continue  # nothing to process for this batch

                self.head_pose_angles = self.head_pose_model.predict(self.face_crop)
                self.left_eye_coords, self.left_eye_image, self.right_eye_coords, self.right_eye_image = \
                    self.landmarks_model.predict(self.face_crop)
                self.x, self.y = self.gaze_model.predict(self.left_eye_image,
                                                         self.right_eye_image,
                                                         self.head_pose_angles)
                log.info(f'[ Main ] Relative pointer coordinates: [{self.x:.2f}, {self.y:.2f}]')

                batch_process_time = time.time() - start_inference
                self.total_processing_time += batch_process_time
                self.count_batch += 1
                log.info(f'[ Main ] Finished processing batch. Time taken = {batch_process_time}s\n')

                self.mouse_control.move(self.x, self.y)
                if self.show_output:
                    self.draw_outputs(frame)
                    cv2.imshow('Computer Pointer Controller Output', frame)

                self.inference_speed.append(self.count_batch / self.total_processing_time)
                self.avg_inference_speed = sum(self.inference_speed) / len(self.inference_speed)

                with open(os.path.join(self.output_path, 'outputs.txt'), 'w+') as f:
                    f.write('INFERENCE STATS\n')
                    f.write(f'Total model initialization time : {self.model_init_time:.2f}s\n')
                    f.write(f'Total model load time: {self.model_load_time:.2f}s\n')
                    f.write(f'App initialization time: {self.app_init_time:.2f}s\n')
                    f.write(f'Total processing time: {self.total_processing_time:.2f}s\n')
                    f.write(f'Average inference speed: {self.avg_inference_speed:.2f}FPS\n')
                    f.write(f'Batch count: {self.count_batch}\n\n')
                    f.write('LAST OUTPUTS\n')
                    f.write(f'Face coordinates: {self.face_coords}\n')
                    f.write(f'Left eye coordinates: {self.left_eye_coords}\n')
                    f.write(f'Right eye coordinates: {self.right_eye_coords}\n')
                    f.write(f'Head pose angles: {self.head_pose_angles}\n')
                    f.write(f'Relative pointer coordinates/ Gaze vector: [{self.x:.2f}, {self.y:.2f}]')
            else:
                self.input_feeder.close()
                cv2.destroyAllWindows()
                log.info(f'[ Main ] All input batches processed in {self.total_processing_time:.2f}s')
                log.info('[ Main ] Shutting down app...')
                log.info('[ Main ] Mouse controller app has been shut down.')
                break
        return
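# Hypothetical entry point for the MoveMouse class above; the parser name
# and the way frames are supplied are assumptions. Note that run_inference()
# pulls its frames from the input feeder, so its `frame` argument is unused.
if __name__ == '__main__':
    args = build_argparser().parse_args()
    app = MoveMouse(args)
    app.run_inference(frame=None)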
def main():
    args = build_argparser().parse_args()

    # Initialise variables with the input arguments for easy access
    fdm = args.face_detection_model
    ldm = args.facial_landmarks_detection_model
    hpem = args.head_pose_estimation_model
    gem = args.gaze_estimation_model
    output_flags = args.output_flags
    input_filename = args.input
    device_name = args.device
    prob_threshold = args.prob_threshold
    cpu_extension = args.cpu_extension

    if input_filename.lower() == 'cam':
        feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            log.error("Unable to find specified video file")
            exit(1)
        feeder = InputFeeder(input_type='video', input_file=input_filename)

    # Initialise the models
    face_detection_model = FaceDetect(fdm, device_name, cpu_extension, prob_threshold)
    landmark_detection_model = FacialLandmarks(ldm, device_name, cpu_extension, prob_threshold)
    head_pose_estimation_model = HeadPose(hpem, device_name, cpu_extension, prob_threshold)
    gaze_estimation_model = GazeEstimation(gem, device_name, cpu_extension, prob_threshold)
    mouse_controller = MouseController('medium', 'fast')

    # Load the models, timing each load separately
    start_model_load_time = time.time()
    face_detection_model.load_model()
    log.info("Face detection model loaded...")
    fdm_load_time = time.time() - start_model_load_time

    start1 = time.time()
    landmark_detection_model.load_model()
    log.info("Landmark detection model loaded...")
    ldm_load_time = time.time() - start1

    start2 = time.time()
    head_pose_estimation_model.load_model()
    log.info("Head pose estimation model loaded...")
    hpem_load_time = time.time() - start2

    start3 = time.time()
    gaze_estimation_model.load_model()
    log.info("Gaze estimation model loaded...")
    gem_load_time = time.time() - start3

    total_time = time.time() - start_model_load_time
    feeder.load_data()

    # Validate the output flags
    for flag in output_flags:
        if flag not in ['fdm', 'lrm', 'hp', 'gze']:
            log.error("Flag '" + flag + "' is not a valid preview flag.")
            sys.exit(1)

    frame_count = 0
    start_inference_time = time.time()
    for ret, frame in feeder.next_batch():
        if not ret:
            break
        frame_count += 1
        key = cv2.waitKey(60)
        try:
            image, fc = face_detection_model.predict(frame, args.prob_threshold)
            if type(image) == int:
                log.warning("Unable to detect the face")
                if key == 27:
                    break
                continue

            if 'fdm' in output_flags:
                cv2.putText(frame, "face detected", (10, 140),
                            cv2.FONT_HERSHEY_COMPLEX, 2, (0, 0, 255), 4)

            # Landmark detection on the cropped face from the face detection model
            left_eye_image, right_eye_image, eye_coords = landmark_detection_model.predict(image)
            eye_buffer = 10
            if 'lrm' in output_flags:
                view_eye_rectangle(eye_coords, eye_buffer, image)

            # Head pose estimation
            pose_output = head_pose_estimation_model.predict(image)
            yaw, pitch, roll = pose_output[0], pose_output[1], pose_output[2]
            if 'hp' in output_flags:
                cv2.putText(frame,
                            "Pose Angles: yaw:{:.2f}, pitch:{:.2f}, roll:{:.2f}".format(yaw, pitch, roll),
                            (10, 40), cv2.FONT_HERSHEY_COMPLEX, 2, (0, 0, 0), 4)

            # Gaze estimation
            mouse_coord, gaze_vector = gaze_estimation_model.predict(
                left_eye_image, right_eye_image, pose_output)
            if 'gze' in output_flags:
                cv2.putText(frame,
                            "Gaze Cords: x= {:.2f} , y= {:.2f} , z= {:.2f}".format(
                                gaze_vector[0], gaze_vector[1], gaze_vector[2]),
                            (10, 90), cv2.FONT_HERSHEY_COMPLEX, 2, (0, 0, 255), 4)
        except Exception as e:
            log.warning("Could not predict using model " + str(e) + " for frame " + str(frame_count))
            continue

        cv2.imshow("Visualization", cv2.resize(frame, (500, 500)))

        # Move the mouse controller every fifth frame
        if frame_count % 5 == 0:
            mouse_controller.move(mouse_coord[0], -1 * mouse_coord[1])
        if key == 27:
            break

    total_inference_time = time.time() - start_inference_time
    log.error("VideoStream ended...")
    print("total model load time is {:.2f} ms".format(1000 * total_time))
    print("fps is {}".format(int(feeder.get_fps())))
    print("average inference time is {:.2f} ms per frame".format(
        1000 * total_inference_time / frame_count))
    print("face detection model load time is {:.2f} ms".format(1000 * fdm_load_time))
    print("landmark detection model load time is {:.2f} ms".format(1000 * ldm_load_time))
    print("head pose estimation model load time is {:.2f} ms".format(1000 * hpem_load_time))
    print("gaze estimation model load time is {:.2f} ms".format(1000 * gem_load_time))
    cv2.destroyAllWindows()
    feeder.close()
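# main() above calls a view_eye_rectangle() helper that is not shown.
# A minimal, hypothetical sketch, assuming eye_coords is a list of
# (x_min, y_min, x_max, y_max) boxes on the cropped face image.
import cv2

def view_eye_rectangle(eye_coords, eye_buffer, image):
    # Draw a padded rectangle around each detected eye
    for (x_min, y_min, x_max, y_max) in eye_coords:
        cv2.rectangle(image,
                      (x_min - eye_buffer, y_min - eye_buffer),
                      (x_max + eye_buffer, y_max + eye_buffer),
                      (0, 255, 0), 2)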
def main():
    # Grab command line args
    args = build_argparser().parse_args()
    flags = args.models_outputs_flags
    logger = logging.getLogger()

    input_file_path = args.input
    if input_file_path.lower() == "cam":
        input_feeder = InputFeeder("cam")
    else:
        if not os.path.isfile(input_file_path):
            logger.error("Unable to find specified video file")
            exit(1)
        input_feeder = InputFeeder("video", input_file_path)

    model_path_dict = {
        'FaceDetection': args.face_detection_model,
        'FacialLandmarks': args.facial_landmarks_model,
        'GazeEstimation': args.gaze_estimation_model,
        'HeadPoseEstimation': args.head_pose_estimation_model
    }
    for file_name_key in model_path_dict:
        if not os.path.isfile(model_path_dict[file_name_key]):
            logger.error("Unable to find specified " + file_name_key + " xml file")
            exit(1)

    fdm = FaceDetection(model_path_dict['FaceDetection'], args.device, args.cpu_extension)
    flm = FacialLandmarks(model_path_dict['FacialLandmarks'], args.device, args.cpu_extension)
    gem = GazeEstimation(model_path_dict['GazeEstimation'], args.device, args.cpu_extension)
    hpem = HeadPoseEstimation(model_path_dict['HeadPoseEstimation'], args.device, args.cpu_extension)
    mc = MouseController('medium', 'fast')

    input_feeder.load_data()
    fdm.load_model()
    flm.load_model()
    hpem.load_model()
    gem.load_model()

    frame_count = 0
    for ret, frame in input_feeder.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))
        # Poll the keyboard on every frame so `key` is always defined
        key = cv2.waitKey(60)

        cropped_face, face_coords = fdm.predict(frame, args.prob_threshold)
        if type(cropped_face) == int:
            logger.error("Unable to detect any face.")
            if key == 27:
                break
            continue

        hp_output = hpem.predict(cropped_face)
        left_eye_img, right_eye_img, eye_coords = flm.predict(cropped_face)
        new_mouse_coord, gaze_vector = gem.predict(left_eye_img, right_eye_img, hp_output)

        if len(flags) != 0:
            preview_frame = frame
            if 'fd' in flags:
                preview_frame = cropped_face
            if 'fld' in flags:
                cv2.rectangle(cropped_face,
                              (eye_coords[0][0] - 10, eye_coords[0][1] - 10),
                              (eye_coords[0][2] + 10, eye_coords[0][3] + 10), (0, 255, 0), 3)
                cv2.rectangle(cropped_face,
                              (eye_coords[1][0] - 10, eye_coords[1][1] - 10),
                              (eye_coords[1][2] + 10, eye_coords[1][3] + 10), (0, 255, 0), 3)
            if 'hp' in flags:
                cv2.putText(preview_frame,
                            "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".format(
                                hp_output[0], hp_output[1], hp_output[2]),
                            (10, 20), cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1)
            if 'ge' in flags:
                x, y, w = int(gaze_vector[0] * 12), int(gaze_vector[1] * 12), 160
                left_eye = cv2.line(left_eye_img, (x - w, y - w), (x + w, y + w), (255, 0, 255), 2)
                cv2.line(left_eye, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)
                right_eye = cv2.line(right_eye_img, (x - w, y - w), (x + w, y + w), (255, 0, 255), 2)
                cv2.line(right_eye, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)
                cropped_face[eye_coords[0][1]:eye_coords[0][3],
                             eye_coords[0][0]:eye_coords[0][2]] = left_eye
                cropped_face[eye_coords[1][1]:eye_coords[1][3],
                             eye_coords[1][0]:eye_coords[1][2]] = right_eye
            cv2.imshow("Visualization", cv2.resize(preview_frame, (500, 500)))

        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])
        if key == 27:
            break

    logger.error("VideoStream ended...")
    cv2.destroyAllWindows()
    input_feeder.close()
def main():
    args = argparser().parse_args()
    log.basicConfig(filename='log.log', level=log.INFO)
    device = args.device
    threshold = args.prob_threshold
    extension = args.cpu_extension
    preview_flags = args.preview_flag
    input_file_path = args.input

    # Initialise the models
    log.info("------------------------Program Started-------------------------------------")
    face = FaceDetect(args.face_detection_model, args.device, args.cpu_extension, args.prob_threshold)
    landmark = FacialLandmarksDetect(args.landmark_detection_model, args.device, args.cpu_extension)
    head_pose = HeadPoseDetect(args.head_pose_estimation_model, args.device, args.cpu_extension)
    gaze_estimation = GazeEstimation(args.gaze_estimation_model, args.device, args.cpu_extension)

    # Load the models
    log.info("Loading Models")
    start_time = time.time()
    face.load_model()
    log.info("Face detection model loaded: time: {:.3f} ms".format(
        (time.time() - start_time) * 1000))
    landmark_start = time.time()
    landmark.load_model()
    log.info("Facial landmarks detection model loaded: time: {:.3f} ms".format(
        (time.time() - landmark_start) * 1000))
    head_start = time.time()
    head_pose.load_model()
    log.info("Head pose estimation model loaded: time: {:.3f} ms".format(
        (time.time() - head_start) * 1000))
    gaze_start = time.time()
    gaze_estimation.load_model()
    log.info("Gaze estimation model loaded: time: {:.3f} ms".format(
        (time.time() - gaze_start) * 1000))
    load_total_time = time.time() - start_time
    log.info("Time to load all models: time: {:.3f} ms".format(load_total_time * 1000))
    log.info("All models are loaded successfully..")

    if input_file_path.lower() == 'cam':
        feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_file_path):
            log.error("Unable to find specified video file")
            exit(1)
        feeder = InputFeeder(input_type='video', input_file=input_file_path)

    log.info("Initialize Mouse")
    mouse = MouseController(precision='low', speed='fast')
    feeder.load_data()

    log.info("Starting Inference on Video")
    start_time = time.time()
    counter = 0
    for ret, frame in feeder.next_batch():
        if not ret:
            break
        counter += 1

        face_coords, face_image = face.predict(frame.copy())
        left_eye, right_eye, eye_coords = landmark.predict(face_image)
        hp_angles = head_pose.predict(face_image)
        gaze_coords = gaze_estimation.predict(left_eye, right_eye, hp_angles)

        # Settings from https://knowledge.udacity.com/questions/171017
        focal_length = 950.0
        scale = 50
        center_of_face = (face_image.shape[1] / 2, face_image.shape[0] / 2)
        yaw, pitch, roll = hp_angles[0], hp_angles[1], hp_angles[2]

        if len(preview_flags) != 0:
            # Draw each requested overlay independently rather than
            # enumerating every possible combination of flags.
            if 'ff' in preview_flags and len(preview_flags) == 1:
                preview_window = frame
            else:
                preview_window = face_image.copy()
            if 'ff' in preview_flags:
                # Face bounding box (drawn on the full frame)
                cv2.rectangle(frame, (face_coords[0], face_coords[1]),
                              (face_coords[2], face_coords[3]), (0, 250, 0), 3)
            if 'fl' in preview_flags:
                # Eye bounding boxes
                cv2.rectangle(preview_window, (eye_coords[0][0], eye_coords[0][1]),
                              (eye_coords[0][2], eye_coords[0][3]), (150, 0, 150))
                cv2.rectangle(preview_window, (eye_coords[1][0], eye_coords[1][1]),
                              (eye_coords[1][2], eye_coords[1][3]), (150, 0, 150))
            if 'fh' in preview_flags:
                # Head pose angles
                cv2.putText(preview_window,
                            "Pose Angles: Yaw:{:.2f} | Pitch:{:.2f} | Roll:{:.2f}".format(
                                hp_angles[0], hp_angles[1], hp_angles[2]),
                            (10, 20), cv2.FONT_HERSHEY_COMPLEX, 0.3, (255, 0, 0), 1)
            if 'fg' in preview_flags:
                # Gaze / head pose axes
                draw_axes(preview_window, center_of_face, yaw, pitch, roll, scale, focal_length)
            preview_image = np.hstack((cv2.resize(frame, (1500, 1500)),
                                       cv2.resize(preview_window, (1500, 1500))))
        else:
            preview_image = cv2.resize(frame, (1500, 1500))

        cv2.imshow('Visualization', preview_image)
        mouse.move(gaze_coords[0], gaze_coords[1])

        key = cv2.waitKey(20)
        if key == 27:  # exit on ESC
            break

    inference_time = round(time.time() - start_time, 1)
    fps = int(counter) / inference_time
    log.info("Counter {} frames".format(counter))
    log.info("Total Inference Time {} seconds".format(inference_time))
    log.info("fps {} frame/second".format(fps))
    log.info("Video has completed")
    log.info("---------------------------------Program has ended ----------------------------------------")
    feeder.close()
    cv2.destroyAllWindows()