# Imports assumed by the snippets in this file; the project-local module
# names (input_feeder, mouse_controller and the four model wrappers) are
# assumptions and may be spelled differently in the actual repository.
import logging
import os
import sys
import time

import cv2
import pyautogui

from input_feeder import InputFeeder
from mouse_controller import MouseController
from face_detection import FaceDetectionModel
from facial_landmarks_detection import FacialLandmarksDetectionModel
from head_pose_estimation import HeadPoseEstimationModel
from gaze_estimation import GazeEstimationModel

# Module-level constants used by main(); the values below are placeholders.
FRAME_WIDTH = 500
FRAME_HEIGHT = 500
GAZE_ARROW_LENGTH = 100
GAZE_ARROW_WIDTH = 5


def main():
    args = build_argparser().parse_args()
    preview_flags = args.preview_flags
    logger = logging.getLogger()

    input_path = args.input
    if input_path.lower() == 'cam':
        input_feed = InputFeeder('cam')
    else:
        if not os.path.isfile(input_path):
            logger.error('Unable to find specified video file')
            sys.exit(1)
        file_extension = input_path.split('.')[-1].lower()
        if file_extension in ['jpg', 'jpeg', 'bmp']:
            input_feed = InputFeeder('image', input_path)
        elif file_extension in ['avi', 'mp4']:
            input_feed = InputFeeder('video', input_path)
        else:
            logger.error(
                "Unsupported file extension. Allowed: ['jpg', 'jpeg', 'bmp', 'avi', 'mp4']")
            sys.exit(1)

    if sys.platform in ('linux', 'linux2'):
        # CODEC = 0x00000021
        CODEC = cv2.VideoWriter_fourcc(*'mp4v')
    elif sys.platform == 'darwin':
        CODEC = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
    else:
        print('Unsupported OS.')
        sys.exit(1)

    file_flag = False
    if args.output_file.lower() == 'y':
        file_flag = True
        out = cv2.VideoWriter('output.mp4', CODEC, 30,
                              (FRAME_WIDTH, FRAME_HEIGHT))

    model_path_dict = {
        'face_detect': args.face_detection_model,
        'face_landmark_regress': args.facial_landmark_model,
        'head_pose': args.head_pose_model,
        'gaze_estimate': args.gaze_estimation_model
    }
    for pathname in model_path_dict:
        if not os.path.isfile(model_path_dict[pathname]):
            logger.error('Unable to find specified ' + pathname + ' xml file')
            sys.exit(1)

    # Initialise the models
    fdm = FaceDetectionModel(model_path_dict['face_detect'], args.device,
                             args.cpu_extension)
    fldm = FacialLandmarksDetectionModel(model_path_dict['face_landmark_regress'],
                                         args.device, args.cpu_extension)
    hpem = HeadPoseEstimationModel(model_path_dict['head_pose'], args.device,
                                   args.cpu_extension)
    gem = GazeEstimationModel(model_path_dict['gaze_estimate'], args.device,
                              args.cpu_extension)

    # Initialise the mouse controller
    mouse_controller = MouseController('medium', 'fast')

    input_feed.load_data()

    # Check the models
    fdm.check_model()
    fldm.check_model()
    hpem.check_model()
    gem.check_model()

    # Load the models / create the executable networks
    fdm.load_model()
    fldm.load_model()
    hpem.load_model()
    gem.load_model()

    frame_count = 0
    for ret, frame in input_feed.next_batch():
        if not ret:
            break
        frame_count += 1
        key = cv2.waitKey(60)

        """
        Sequence of model execution:
        1. Predict with each model.
        2. Preprocess the outputs of each model.
        3. Send the processed output to the next model.

        Model sequence:
        - Face Detection Model
        - Head Pose Estimation Model
        - Facial Landmarks Detection Model
          (first head pose, then facial landmarks)
        - Gaze Estimation Model
        """
        cropped_face, face_coords = fdm.preprocess_output(
            frame.copy(), fdm.predict(frame.copy()), args.prob_threshold)
        if isinstance(cropped_face, int):
            logger.error('Unable to detect the face.')
            if key == 27:
                break
            continue

        hp_out = hpem.preprocess_output(hpem.predict(cropped_face.copy()))
        left_eye, right_eye, eye_coords = fldm.preprocess_output(
            cropped_face.copy(), fldm.predict(cropped_face.copy()))
        new_mouse_coord, gaze_vector = gem.preprocess_output(
            gem.predict(left_eye, right_eye, hp_out), hp_out)

        if preview_flags or file_flag:
            preview_frame = frame.copy()
            if 'fd' in preview_flags:
                preview_frame = cv2.rectangle(preview_frame,
                                              (face_coords[0], face_coords[1]),
                                              (face_coords[2], face_coords[3]),
                                              (0, 0, 255), 3)
            cropped_face = preview_frame[face_coords[1]:face_coords[3],
                                         face_coords[0]:face_coords[2]]
            if 'fld' in preview_flags:
                cropped_face = cv2.rectangle(
                    cropped_face,
                    (eye_coords[0][0] - 10, eye_coords[0][1] - 10),
                    (eye_coords[0][2] + 10, eye_coords[0][3] + 10),
                    (0, 255, 0), 3)
                cropped_face = cv2.rectangle(
                    cropped_face,
                    (eye_coords[1][0] - 10, eye_coords[1][1] - 10),
                    (eye_coords[1][2] + 10, eye_coords[1][3] + 10),
                    (0, 255, 0), 3)
                preview_frame[face_coords[1]:face_coords[3],
                              face_coords[0]:face_coords[2]] = cropped_face
            if 'hp' in preview_flags:
                cv2.putText(
                    preview_frame,
                    'Pose Angles: yaw: {:.2f} | pitch: {:.2f} | roll: {:.2f}'.format(
                        hp_out[0], hp_out[1], hp_out[2]),
                    (20, 40), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 2)
            if 'ge' in preview_flags:
                x = int(gaze_vector[0] * GAZE_ARROW_LENGTH)
                y = -int(gaze_vector[1] * GAZE_ARROW_LENGTH)
                le_mid_x = int((eye_coords[0][0] + eye_coords[0][2]) / 2)
                le_mid_y = int((eye_coords[0][1] + eye_coords[0][3]) / 2)
                re_mid_x = int((eye_coords[1][0] + eye_coords[1][2]) / 2)
                re_mid_y = int((eye_coords[1][1] + eye_coords[1][3]) / 2)
                cv2.arrowedLine(cropped_face, (le_mid_x, le_mid_y),
                                (le_mid_x + x, le_mid_y + y),
                                (255, 0, 0), GAZE_ARROW_WIDTH)
                cv2.arrowedLine(cropped_face, (re_mid_x, re_mid_y),
                                (re_mid_x + x, re_mid_y + y),
                                (255, 0, 0), GAZE_ARROW_WIDTH)
                preview_frame[face_coords[1]:face_coords[3],
                              face_coords[0]:face_coords[2]] = cropped_face
            if preview_flags and frame_count % 2 == 0:
                if args.zoomed:
                    cv2.imshow('Cropped Face',
                               cv2.resize(cropped_face, (FRAME_WIDTH, FRAME_HEIGHT)))
                else:
                    cv2.imshow('Preview',
                               cv2.resize(preview_frame, (FRAME_WIDTH, FRAME_HEIGHT)))
            if file_flag:
                out.write(cv2.resize(preview_frame, (FRAME_WIDTH, FRAME_HEIGHT)))

        # Move the mouse pointer; ignore pyautogui's fail-safe trigger
        # (pointer thrown into a screen corner).
        try:
            mouse_controller.move(new_mouse_coord[0], new_mouse_coord[1])
        except pyautogui.FailSafeException:
            pass

        if frame_count % 2 == 0 and not preview_flags:
            cv2.imshow('Video', cv2.resize(frame, (FRAME_WIDTH, FRAME_HEIGHT)))
        if key == 27:
            break

    logger.info('VideoStream ended.')
    if file_flag:
        out.release()
    input_feed.close()
    cv2.destroyAllWindows()
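# The entry point above assumes a `build_argparser()` helper that exposes the
# attributes read by main(). A minimal sketch follows; flag spellings,
# defaults and help strings are assumptions, but the destination names match
# the code above (args.input, args.preview_flags, args.output_file, ...).
from argparse import ArgumentParser


def build_argparser():
    parser = ArgumentParser()
    parser.add_argument('-i', '--input', required=True,
                        help="Path to an image or video file, or 'cam' for webcam")
    parser.add_argument('-fd', '--face_detection_model', required=True,
                        help='Path to the face detection model .xml file')
    parser.add_argument('-fl', '--facial_landmark_model', required=True,
                        help='Path to the facial landmarks model .xml file')
    parser.add_argument('-hp', '--head_pose_model', required=True,
                        help='Path to the head pose estimation model .xml file')
    parser.add_argument('-ge', '--gaze_estimation_model', required=True,
                        help='Path to the gaze estimation model .xml file')
    parser.add_argument('-flags', '--preview_flags', nargs='+', default=[],
                        help='Visualisations to draw: any of fd, fld, hp, ge')
    parser.add_argument('-o', '--output_file', default='n',
                        help="'y' to also write the preview to output.mp4")
    parser.add_argument('-z', '--zoomed', action='store_true',
                        help='Show the cropped face instead of the full preview')
    parser.add_argument('-d', '--device', default='CPU',
                        help='Target device: CPU, GPU, FPGA or MYRIAD')
    parser.add_argument('-l', '--cpu_extension', default=None,
                        help='Path to a CPU extension library, if required')
    parser.add_argument('-pt', '--prob_threshold', type=float, default=0.6,
                        help='Probability threshold for face detection')
    return parser


# Standard entry point if this variant is run as a script.
if __name__ == '__main__':
    main()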
def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """
    try:
        logging.basicConfig(level=logging.DEBUG,
                            format="%(asctime)s [%(levelname)s] %(message)s",
                            handlers=[
                                logging.FileHandler("debug.log"),
                                logging.StreamHandler()
                            ])

        # Initialise the class
        mc = MouseController("low", "fast")
        fdnet = FaceDetectionModel(args.fdmodel)
        lmnet = FacialLandMarksDetectionModel(args.lmmodel)
        hpnet = HeadPoseEstimationModel(args.hpmodel)
        genet = GazeEstimationModel(args.gemodel)

        start_time = time.time()
        fdnet.load_model()
        logging.info(f"Face Detection Model: {1000 * (time.time() - start_time):.1f}ms")

        start_time = time.time()
        lmnet.load_model()
        logging.info(f"Facial Landmarks Detection Model: {1000 * (time.time() - start_time):.1f}ms")

        start_time = time.time()
        hpnet.load_model()
        logging.info(f"Headpose Estimation Model: {1000 * (time.time() - start_time):.1f}ms")

        start_time = time.time()
        genet.load_model()
        logging.info(f"Gaze Estimation Model: {1000 * (time.time() - start_time):.1f}ms")

        # Get and open video capture
        feeder = InputFeeder('video', args.input)
        feeder.load_data()

        frame_count = 0
        fd_infertime = 0
        lm_infertime = 0
        hp_infertime = 0
        ge_infertime = 0
        while True:
            # Read the next frame
            try:
                frame = next(feeder.next_batch())
            except StopIteration:
                break

            key_pressed = cv2.waitKey(60)
            frame_count += 1

            # face detection
            p_frame = fdnet.preprocess_input(frame)
            start_time = time.time()
            fd_output = fdnet.predict(p_frame)
            fd_infertime += time.time() - start_time
            out_frame, bboxes = fdnet.preprocess_output(fd_output, frame, args.print)

            for bbox in bboxes:
                face = frame[bbox[1]:bbox[3], bbox[0]:bbox[2]]
                p_frame = lmnet.preprocess_input(face)
                start_time = time.time()
                lm_output = lmnet.predict(p_frame)
                lm_infertime += time.time() - start_time
                out_frame, left_eye_point, right_eye_point = lmnet.preprocess_output(
                    lm_output, bbox, out_frame, args.print)

                # get head pose estimation
                p_frame = hpnet.preprocess_input(face)
                start_time = time.time()
                hp_output = hpnet.predict(p_frame)
                hp_infertime += time.time() - start_time
                out_frame, headpose_angles = hpnet.preprocess_output(
                    hp_output, out_frame, face, bbox, args.print)

                # get gaze estimation
                out_frame, left_eye, right_eye = genet.preprocess_input(
                    out_frame, face, left_eye_point, right_eye_point, args.print)
                start_time = time.time()
                ge_output = genet.predict(left_eye, right_eye, headpose_angles)
                ge_infertime += time.time() - start_time
                out_frame, gaze_vector = genet.preprocess_output(
                    ge_output, out_frame, bbox, left_eye_point, right_eye_point,
                    args.print)

                if not args.no_video:
                    cv2.imshow('image', out_frame)
                if not args.no_move:
                    mc.move(gaze_vector[0], gaze_vector[1])
                break

            if key_pressed == 27:
                break

        if frame_count > 0:
            logging.info(f"Face Detection:{1000 * fd_infertime / frame_count:.1f}ms")
            logging.info(f"Facial Landmarks Detection:{1000 * lm_infertime / frame_count:.1f}ms")
            logging.info(f"Headpose Estimation:{1000 * hp_infertime / frame_count:.1f}ms")
            logging.info(f"Gaze Estimation:{1000 * ge_infertime / frame_count:.1f}ms")

        feeder.close()
        cv2.destroyAllWindows()
    except Exception as ex:
        logging.exception(f"Error during inference:{str(ex)}")
def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """
    try:
        logging.basicConfig(level=logging.INFO,
                            format="%(asctime)s [%(levelname)s] %(message)s",
                            handlers=[
                                logging.FileHandler("gaze-app.log"),
                                logging.StreamHandler()
                            ])

        # Initialise the classes
        mc = MouseController("low", "fast")
        # mc.move(100, 100)
        fdnet = FaceDetectionModel(args.fdmodel)
        lmnet = FacialLandMarksDetectionModel(args.lmmodel)
        hpnet = HeadPoseEstimationModel(args.hpmodel)
        genet = GazeEstimationModel(args.gemodel)

        ### Load the models ###
        logging.info("============== Models Load time ===============")
        start_time = time.time()
        fdnet.load_model()
        logging.info("Face Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        lmnet.load_model()
        logging.info("Facial Landmarks Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        hpnet.load_model()
        logging.info("Headpose Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        genet.load_model()
        logging.info("Gaze Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        logging.info("============== End =====================")

        # Get and open video capture
        feeder = InputFeeder('video', args.input)
        feeder.load_data()
        # FPS = feeder.get_fps()

        # Grab the shape of the input
        # width = feeder.get_width()
        # height = feeder.get_height()

        # Init scene variables
        frame_count = 0

        ### Loop until the stream is over ###
        fd_infertime = 0
        lm_infertime = 0
        hp_infertime = 0
        ge_infertime = 0
        while True:
            # Read the next frame
            try:
                frame = next(feeder.next_batch())
            except StopIteration:
                break

            key_pressed = cv2.waitKey(60)
            frame_count += 1

            # Face detection
            p_frame = fdnet.preprocess_input(frame)
            start_time = time.time()
            fnoutput = fdnet.predict(p_frame)
            fd_infertime += time.time() - start_time
            out_frame, fboxes = fdnet.preprocess_output(fnoutput, frame, args.print)

            # For each detected face: fbox = (xmin, ymin, xmax, ymax)
            for fbox in fboxes:
                # Get facial landmarks: crop the face from the frame
                face = frame[fbox[1]:fbox[3], fbox[0]:fbox[2]]
                p_frame = lmnet.preprocess_input(face)
                start_time = time.time()
                lmoutput = lmnet.predict(p_frame)
                lm_infertime += time.time() - start_time
                out_frame, left_eye_point, right_eye_point = lmnet.preprocess_output(
                    lmoutput, fbox, out_frame, args.print)

                # Get head pose estimation
                p_frame = hpnet.preprocess_input(face)
                start_time = time.time()
                hpoutput = hpnet.predict(p_frame)
                hp_infertime += time.time() - start_time
                out_frame, headpose_angles = hpnet.preprocess_output(
                    hpoutput, out_frame, face, fbox, args.print)

                # Get gaze estimation
                out_frame, left_eye, right_eye = genet.preprocess_input(
                    out_frame, face, left_eye_point, right_eye_point, args.print)
                start_time = time.time()
                geoutput = genet.predict(left_eye, right_eye, headpose_angles)
                ge_infertime += time.time() - start_time
                out_frame, gazevector = genet.preprocess_output(
                    geoutput, out_frame, fbox, left_eye_point, right_eye_point,
                    args.print)

                if not args.no_video:
                    cv2.imshow('im', out_frame)
                if not args.no_move:
                    mc.move(gazevector[0], gazevector[1])

                # Consider only the first detected face in the frame
                break

            # Break if the escape key is pressed
            if key_pressed == 27:
                break

        # Log average inference times
        if frame_count > 0:
            logging.info("============== Models Inference time ===============")
            logging.info("Face Detection: {:.1f}ms".format(
                1000 * fd_infertime / frame_count))
            logging.info("Facial Landmarks Detection: {:.1f}ms".format(
                1000 * lm_infertime / frame_count))
            logging.info("Headpose Estimation: {:.1f}ms".format(
                1000 * hp_infertime / frame_count))
            logging.info("Gaze Estimation: {:.1f}ms".format(
                1000 * ge_infertime / frame_count))
            logging.info("============== End ===============================")

        # Release the capture and destroy any OpenCV windows
        feeder.close()
        cv2.destroyAllWindows()
    except Exception as ex:
        logging.exception("Error during inference: " + str(ex))
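# Both `infer_on_stream()` variants above read args.fdmodel, args.lmmodel,
# args.hpmodel, args.gemodel, args.input, args.print, args.no_video and
# args.no_move. A hedged sketch of a matching parser follows; the function
# name and the flag spellings are assumptions, not the original CLI.
def build_gaze_app_argparser():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument('--fdmodel', required=True,
                        help='Path to the face detection model .xml file')
    parser.add_argument('--lmmodel', required=True,
                        help='Path to the facial landmarks model .xml file')
    parser.add_argument('--hpmodel', required=True,
                        help='Path to the head pose estimation model .xml file')
    parser.add_argument('--gemodel', required=True,
                        help='Path to the gaze estimation model .xml file')
    parser.add_argument('--input', required=True,
                        help='Path to the input video')
    parser.add_argument('--print', action='store_true',
                        help='Draw intermediate model outputs on the frame')
    parser.add_argument('--no_video', action='store_true',
                        help='Suppress the preview window')
    parser.add_argument('--no_move', action='store_true',
                        help='Do not move the mouse pointer')
    return parser

# Example: infer_on_stream(build_gaze_app_argparser().parse_args())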
def infer_on_stream(args):
    face_detection_model_file = args.faceDetectionModel
    facial_landmarks_detection_model_file = args.facialLandmarksModel
    head_pose_estimation_model_file = args.headPoseModel
    gaze_estimation_model_file = args.gazeModel
    video_file = args.input
    device_name = args.device
    cpu_extension = args.cpu_extension
    prob_threshold = args.prob_threshold
    preview_flag = args.preview_flag
    output_path = args.output_path

    if not os.path.exists(output_path):
        os.mkdir(output_path)

    mouse_control = MouseController("low", "fast")

    try:
        logging.info("*********** Model Load Time ***************")
        start_model_load_time = time.time()

        start_time = time.time()
        face_detection_model = FaceDetectionModel(face_detection_model_file,
                                                  device_name, cpu_extension)
        logging.info("Face Detection Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        facial_landmarks_detection_model = FacialLandmarksDetectionModel(
            facial_landmarks_detection_model_file, device_name, cpu_extension)
        logging.info("Facial Landmarks Detection Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        head_pose_estimation_model = HeadPoseEstimationModel(
            head_pose_estimation_model_file, device_name, cpu_extension)
        logging.info("Head Pose Estimation Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        start_time = time.time()
        gaze_estimation_model = GazeEstimationModel(gaze_estimation_model_file,
                                                    device_name, cpu_extension)
        logging.info("Gaze Estimation Model: {:.1f} ms.".format(
            1000 * (time.time() - start_time)))

        total_model_load_time = time.time() - start_model_load_time
        logging.info("*********** Model Load Completed ***********")
    except Exception as e:
        logging.error("ERROR in model loading: " + str(e))
        sys.exit(1)

    feeder = InputFeeder('video', video_file)
    feeder.load_data()
    out_video = cv2.VideoWriter(os.path.join(output_path, 'output_video.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'),
                                int(feeder.fps() / 10), (1920, 1080), True)

    start_inference_time = time.time()
    frame_count = 0
    face_detect_infer_time = 0
    facial_landmarks_infer_time = 0
    head_pose_infer_time = 0
    gaze_infer_time = 0
    while True:
        try:
            frame = next(feeder.next_batch())
        except StopIteration:
            break

        key_pressed = cv2.waitKey(60)
        frame_count += 1

        ## Face Detection Model
        image = face_detection_model.preprocess_input(frame)
        start_time = time.time()
        outputs = face_detection_model.predict(image)
        face_detect_infer_time += time.time() - start_time
        out_frame, faces = face_detection_model.preprocess_output(
            outputs, frame, preview_flag, prob_threshold)

        for face in faces:
            crop_image = frame[face[1]:face[3], face[0]:face[2]]

            ## Facial Landmarks Detection Model
            image = facial_landmarks_detection_model.preprocess_input(crop_image)
            start_time = time.time()
            outputs = facial_landmarks_detection_model.predict(image)
            facial_landmarks_infer_time += time.time() - start_time
            out_frame, left_eye_point, right_eye_point = \
                facial_landmarks_detection_model.preprocess_output(
                    outputs, out_frame, face, preview_flag)

            ## Head Pose Estimation Model
            image = head_pose_estimation_model.preprocess_input(crop_image)
            start_time = time.time()
            outputs = head_pose_estimation_model.predict(image)
            head_pose_infer_time += time.time() - start_time
            out_frame, headpose_angles_list = \
                head_pose_estimation_model.preprocess_output(
                    outputs, out_frame, preview_flag)

            ## Gaze Estimation Model
            out_frame, left_eye, right_eye = gaze_estimation_model.preprocess_input(
                out_frame, crop_image, left_eye_point, right_eye_point)
            start_time = time.time()
            outputs = gaze_estimation_model.predict(left_eye, right_eye,
                                                    headpose_angles_list)
            gaze_infer_time += time.time() - start_time
            out_frame, gazevector = gaze_estimation_model.preprocess_output(
                outputs, out_frame, face, left_eye_point, right_eye_point,
                preview_flag)

            cv2.imshow("Computer Pointer Control", out_frame)
            out_video.write(out_frame)
            mouse_control.move(gazevector[0], gazevector[1])

        if key_pressed == 27:
            break

    if frame_count > 0:
        logging.info("*********** Model Inference Time ****************")
        logging.info("Face Detection Model: {:.1f} ms.".format(
            1000 * face_detect_infer_time / frame_count))
        logging.info("Facial Landmarks Detection Model: {:.1f} ms.".format(
            1000 * facial_landmarks_infer_time / frame_count))
        logging.info("Head Pose Estimation Model: {:.1f} ms.".format(
            1000 * head_pose_infer_time / frame_count))
        logging.info("Gaze Estimation Model: {:.1f} ms.".format(
            1000 * gaze_infer_time / frame_count))
        logging.info("*********** Model Inference Completed ***********")

    total_infer_time = time.time() - start_inference_time
    total_inference_time = round(total_infer_time, 1)
    fps = frame_count / total_inference_time

    with open(os.path.join(output_path, 'stats.txt'), 'w') as f:
        f.write(str(total_inference_time) + '\n')
        f.write(str(fps) + '\n')
        f.write(str(total_model_load_time) + '\n')

    logging.info("*********** Total Summary ****************")
    logging.info(f"Total Model Load Time: {total_model_load_time}")
    logging.info(f"Total Inference Time: {total_inference_time}")
    logging.info(f"FPS: {fps}")
    logging.info("*********** Total Summary ***********")
    logging.info("*********** ************************* ***********")

    feeder.close()
    cv2.destroyAllWindows()
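# The stats.txt written above holds three lines: total inference time
# (seconds), FPS, and total model load time (seconds). A small helper for
# reading it back when comparing runs (e.g. across devices or precisions);
# `read_stats` is a hypothetical name, not part of the original project.
def read_stats(path):
    with open(path) as f:
        total_inference_time, fps, model_load_time = (float(line) for line in f)
    return {'total_inference_time_s': total_inference_time,
            'fps': fps,
            'model_load_time_s': model_load_time}

# Example: read_stats(os.path.join(output_path, 'stats.txt'))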