Example No. 1
    def preprocess_output_head_pose_estimation(self, outputs, frame):
        """
        Before feeding the output of this model to the next model,
        you might have to preprocess the output. This function is where you can do that.
        """
        head_pose_estimation = HeadPoseEstimation()

        yaw, pitch, roll = head_pose_estimation.preprocess_output(
            outputs, frame)
        return (yaw, pitch, roll)
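For reference, a minimal sketch of what such a preprocess_output could look like for the OpenVINO head-pose model, assuming the usual three single-value output blobs (angle_y_fc, angle_p_fc, angle_r_fc, in degrees). This is an assumption about the model in use, not the author's implementation:

    def preprocess_output(self, outputs, frame):
        # Assumed blob names for head-pose-estimation-adas-0001; adjust if the
        # deployed model names its outputs differently.
        yaw = float(outputs["angle_y_fc"][0][0])
        pitch = float(outputs["angle_p_fc"][0][0])
        roll = float(outputs["angle_r_fc"][0][0])
        return yaw, pitch, roll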
Example No. 2
def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.
    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """
    try:
        logging.basicConfig(level=logging.INFO,
                            format="%(asctime)s [%(levelname)s] %(message)s",
                            handlers=[
                                logging.FileHandler("gaze-app.log"),
                                logging.StreamHandler()
                            ])

        # Initialise the mouse controller and the model wrappers
        mc = MouseController("low", "fast")
        #mc.move(100,100)
        fdnet = FaceDetection(args.fdmodel)
        lmnet = FacialLandmarks(args.lmmodel)
        hpnet = HeadPoseEstimation(args.hpmodel)
        genet = GazeEstimation(args.gemodel)

        ### Load the models ###
        logging.info("============== Models Load time ===============")
        start_time = time.time()
        fdnet.load_model()
        logging.info("Face Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        fdnet.check_model()
        logging.info("Face Detection estimation layers loaded correctly")

        start_time = time.time()
        lmnet.load_model()
        logging.info("Facial Landmarks Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        lmnet.check_model()
        logging.info("Facial Landmarks estimation layers loaded correctly")

        start_time = time.time()
        hpnet.load_model()
        logging.info("Headpose Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        hpnet.check_model()
        logging.info("Head pose estimation layers loaded correctly")

        start_time = time.time()
        genet.load_model()
        logging.info("Gaze Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))

        genet.check_model()
        logging.info("Gaze estimation layers loaded correctly")
        logging.info("==============  End =====================")
        # Get and open video capture
        feeder = InputFeeder('video', args.input)
        feeder.load_data()
        # FPS = feeder.get_fps()

        # Grab the shape of the input
        # width = feeder.get_width()
        # height = feeder.get_height()

        # init scene variables
        frame_count = 0

        ### Loop until stream is over ###
        fd_infertime = 0
        lm_infertime = 0
        hp_infertime = 0
        ge_infertime = 0
        while True:
            # Read the next frame
            try:
                frame = next(feeder.next_batch())
            except StopIteration:
                break

            key_pressed = cv2.waitKey(60)
            frame_count += 1
            #print(int((frame_count) % int(FPS)))

            # face detection
            fd_process_time = time.time()
            p_frame = fdnet.preprocess_input(frame)
            start_time = time.time()
            fnoutput = fdnet.predict(p_frame)
            fd_infertime += time.time() - start_time
            out_frame, fboxes = fdnet.preprocess_output(
                fnoutput, frame, args.print)
            logging.info(
                "Face Detection Model processing time : {:.1f}ms".format(
                    1000 * (time.time() - fd_process_time)))

            #for each face
            for fbox in fboxes:

                # fbox = (xmin,ymin,xmax,ymax)
                # get face landmarks
                # crop face from frame
                face = frame[fbox[1]:fbox[3], fbox[0]:fbox[2]]
                lm_process_time = time.time()
                p_frame = lmnet.preprocess_input(face)
                start_time = time.time()
                lmoutput = lmnet.predict(p_frame)
                lm_infertime += time.time() - start_time
                out_frame, left_eye_point, right_eye_point = lmnet.preprocess_output(
                    lmoutput, fbox, out_frame, args.print)
                logging.info(
                    "Landmarks model processing time : {:.1f}ms".format(
                        1000 * (time.time() - lm_process_time)))

                # get head pose estimation
                hp_process_time = time.time()
                p_frame = hpnet.preprocess_input(face)
                start_time = time.time()
                hpoutput = hpnet.predict(p_frame)
                hp_infertime += time.time() - start_time
                out_frame, headpose_angles = hpnet.preprocess_output(
                    hpoutput, out_frame, face, fbox, args.print)
                logging.info(
                    "Headpose estimation model processing time : {:.1f}ms".
                    format(1000 * (time.time() - hp_process_time)))

                # get gaze estimation
                gaze_process_time = time.time()
                out_frame, left_eye, right_eye = genet.preprocess_input(
                    out_frame, face, left_eye_point, right_eye_point,
                    args.print)
                start_time = time.time()
                geoutput = genet.predict(left_eye, right_eye, headpose_angles)
                ge_infertime += time.time() - start_time
                out_frame, gazevector = genet.preprocess_output(
                    geoutput, out_frame, fbox, left_eye_point, right_eye_point,
                    args.print)
                logging.info(
                    "Gaze estimation model processing time : {:.1f}ms".format(
                        1000 * (time.time() - gaze_process_time)))

                if (not args.no_video):
                    cv2.imshow('im', out_frame)

                if (not args.no_move):
                    mc.move(gazevector[0], gazevector[1])

                #consider only first detected face in the frame
                break

            # Break if escape key pressed
            if key_pressed == 27:
                break

        #logging inference times
        if (frame_count > 0):
            logging.info(
                "============== Models Inference time ===============")
            logging.info("Face Detection:{:.1f}ms".format(1000 * fd_infertime /
                                                          frame_count))
            logging.info("Facial Landmarks Detection:{:.1f}ms".format(
                1000 * lm_infertime / frame_count))
            logging.info("Headpose Estimation:{:.1f}ms".format(
                1000 * hp_infertime / frame_count))
            logging.info("Gaze Estimation:{:.1f}ms".format(
                1000 * ge_infertime / frame_count))
            logging.info("============== End ===============================")

        # Release the capture and destroy any OpenCV windows
        feeder.close()
        cv2.destroyAllWindows()
    except Exception as ex:
        logging.exception("Error in inference: %s", ex)
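The MouseController used in this example is not shown. Below is a minimal sketch of one possible implementation built on pyautogui; the precision/speed mappings are illustrative assumptions, not the project's actual values:

import pyautogui

class MouseController:
    def __init__(self, precision, speed):
        # Assumed mapping of the string settings to a pixel step and a
        # movement duration; the real implementation may differ.
        precision_map = {'high': 100, 'medium': 500, 'low': 1000}
        speed_map = {'fast': 1, 'medium': 5, 'slow': 10}
        self.precision = precision_map[precision]
        self.speed = speed_map[speed]

    def move(self, x, y):
        # Move relative to the current pointer position; y is negated because
        # screen coordinates grow downwards while the gaze vector's y points up.
        pyautogui.moveRel(x * self.precision, -1 * y * self.precision,
                          duration=self.speed)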
Example No. 3
def infer(args, logging_enabled):
    """
        run inference on input video, display/save output video
    """
    face_detection = FaceDetection(args.face_detection)
    facial_landmark_detection = FacialLandmarkDetection(
        args.facial_landmark_detection)
    gaze_estimation = GazeEstimation(args.gaze_estimation)
    head_pose_estimation = HeadPoseEstimation(args.head_pose_estimation)
    load_start = now()
    face_detection.load_model()
    fl_start = now()
    facial_landmark_detection.load_model()
    ge_start = now()
    gaze_estimation.load_model()
    hp_start = now()
    head_pose_estimation.load_model()
    log_model_load_times(logging_enabled, load_start, fl_start, ge_start,
                         hp_start)
    feeder = InputFeeder("video", args.input)
    feeder.load_data()
    frame_count, fd_time, fl_time, ge_time, hp_time = [0] * 5
    while True:
        key = cv2.waitKey(20)
        try:
            frame = next(feeder.next_batch())
        except StopIteration:
            break
        frame_count += 1
        fd_frame = face_detection.preprocess_input(frame)
        inf_start = now()
        fd_output = face_detection.predict(fd_frame)
        fd_time += now() - inf_start
        out_frame, faces = face_detection.preprocess_output(
            fd_output, frame, args.overlay_inference,
            args.probability_threshold)
        if len(faces) == 0:
            continue
        detected_face = frame[faces[0][1]:faces[0][3], faces[0][0]:faces[0][2]]
        fl_frame = facial_landmark_detection.preprocess_input(detected_face)
        fl_start = now()
        fl_output = facial_landmark_detection.predict(fl_frame)
        fl_time += now() - fl_start
        out_frame, l_coord, r_coord = facial_landmark_detection.preprocess_output(
            fl_output, faces[0], out_frame, args.overlay_inference)
        hp_frame = head_pose_estimation.preprocess_input(detected_face)
        hp_start = now()
        hp_output = head_pose_estimation.predict(hp_frame)
        hp_time += now() - hp_start
        out_frame, head_pose = head_pose_estimation.preprocess_output(
            hp_output, out_frame, detected_face, faces[0],
            args.overlay_inference)
        out_frame, l_eye, r_eye = gaze_estimation.preprocess_input(
            out_frame, detected_face, l_coord, r_coord, args.overlay_inference)
        ge_start = now()
        ge_output = gaze_estimation.predict(head_pose, l_eye, r_eye)
        ge_time += now() - ge_start
        out_frame, g_vec = gaze_estimation.preprocess_output(
            ge_output, out_frame, faces[0], l_coord, r_coord,
            args.overlay_inference)
        if args.video_window:
            cv2.imshow(
                "Computer-Human Interface Peripheral Signal Manipulation via AI Retina Tracking (CHIPSMART)",
                out_frame,
            )
        if args.mouse_control and frame_count % 6 == 0:
            mouse_control.move(g_vec[0], g_vec[1])
        # Quit if user presses Esc or Q
        if key in (27, 81):
            user_quit(logging_enabled)
            break
    log_inference_times(logging_enabled, frame_count, fd_time, fl_time,
                        ge_time, hp_time)
    feeder.close()
    cv2.destroyAllWindows()
    quit()
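This example relies on helpers (now, log_model_load_times, log_inference_times) that are defined elsewhere. A sketch of what they could look like, given how the timestamps are captured above, follows; the actual helpers may differ:

import time
import logging

def now():
    return time.time()

def log_model_load_times(logging_enabled, load_start, fl_start, ge_start, hp_start):
    # Each model's load time is the gap between consecutive timestamps.
    if logging_enabled:
        logging.info("Face detection load time: %.1fms", 1000 * (fl_start - load_start))
        logging.info("Facial landmark load time: %.1fms", 1000 * (ge_start - fl_start))
        logging.info("Gaze estimation load time: %.1fms", 1000 * (hp_start - ge_start))
        logging.info("Head pose load time: %.1fms", 1000 * (now() - hp_start))

def log_inference_times(logging_enabled, frame_count, fd_time, fl_time, ge_time, hp_time):
    # Report the average per-frame inference time of each model.
    if logging_enabled and frame_count > 0:
        logging.info("Face detection: %.1fms", 1000 * fd_time / frame_count)
        logging.info("Facial landmarks: %.1fms", 1000 * fl_time / frame_count)
        logging.info("Gaze estimation: %.1fms", 1000 * ge_time / frame_count)
        logging.info("Head pose: %.1fms", 1000 * hp_time / frame_count)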
Example No. 4
	land_inf, output_lm = net2.predict(fd_image, lm_shape)
	left_eye, right_eye, ml_vis = LandmarksDetection.preprocess_output(output_lm[lm_name], fd_image)
	land_time.append(land_inf)
	if args['lmv']:
		cv2.imshow('FD Vis', ml_vis)
		cv2.waitKey(1)
	if len(left_eye) == 0:
		print('Left eye not detected')
		continue
	elif len(right_eye) == 0:
		print('Right eye not detected')
		continue
	
	#HeadPoseEstimation
	head_inf, output_hp = net3.predict(fd_image, hp_shape)
	p, r, y, hp_vis = HeadPoseEstimation.preprocess_output(output_hp, frame.copy())
	head_time.append(head_inf)
	if args['hpv']:
		cv2.imshow('FD Vis', cv2.resize(hp_vis, (700, 500)))
		cv2.waitKey(1)
	head_pose_angles = np.array([[y, p, r]])

	#GazeEstimation
	out, gaze_inf = net4.ge_predict(head_pose_angles, left_eye, right_eye, hp_shape)
	gaze_time.append(gaze_inf)
	mouse.move(out[0][0], out[0][1])

	
avg_face_time = sum(face_time) / len(face_time)
avg_land_time = sum(land_time) / len(land_time)
avg_head_time = sum(head_time) / len(head_time)
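Note that if the stream produced no frames these lists stay empty and the division raises ZeroDivisionError; a small defensive variant (a sketch, not the original code) is:

def safe_average(times):
    # Average per-frame timings, returning 0.0 when the list is empty.
    return sum(times) / len(times) if times else 0.0

avg_face_time = safe_average(face_time)
avg_land_time = safe_average(land_time)
avg_head_time = safe_average(head_time)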
Example No. 5
def main():
    """
    Load inference networks, stream video to network,
    and output stats and video.
    :return: None
    """

    # Logger init
    logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")

    # Get command line args
    args = get_arg()

    # Load preferences
    with open(args.config_file, "r") as yamlfile:
        cfg = yaml.load(yamlfile, Loader=yaml.FullLoader)
    models = cfg['models']
    input_source = args.input
    video_path = cfg['video_path']
    face_model = FaceDetection(models['face_detection'])
    head_pose_model = HeadPoseEstimation(models['head_pose_estimation'])
    facial_landmarks_model = FacialLandmarksDetection(models['facial_landmarks_detection'])
    gaze_estimation_model = GazeEstimation(models['gaze_estimation'])

    # Initialise the MouseController
    mouse_contr = MouseController("low","fast")

    # Load the models and log timing
    start_time = time.time()
    face_model.load_model(args.device)
    logging.info("Load Face Detection model: {:.1f}ms".format(1000 * (time.time() - start_time)) )

    start_time = time.time()
    facial_landmarks_model.load_model(args.device)
    logging.info("Load Facial Landmarks Detection model: {:.1f}ms".format(1000 * (time.time() - start_time)) )

    start_time = time.time()
    head_pose_model.load_model(args.device)
    logging.info("Load Head Pose Estimation model: {:.1f}ms".format(1000 * (time.time() - start_time)) )

    start_time = time.time()
    gaze_estimation_model.load_model(args.device) 
    logging.info("Load Gaze Estimation model: {:.1f}ms".format(1000 * (time.time() - start_time)) )

    # Get and open video or camera capture
    #input_feed = InputFeeder('video', args.input)
    #input_feed.load_data()

    input_feed = InputFeeder(input_type=input_source, input_file=video_path)
    input_feed.load_data()

    if not input_feed.cap.isOpened():
        logging.critical('Error opening input, check --video_path parameter')
        sys.exit(1)
    # FPS = input_feed.get_fps()

    # Grab the shape of the input 
    # width = input_feed.get_width()
    # height = input_feed.get_height()

    # init scene variables
    frame_count = 0

    ### Loop until stream is over ###
    facedetect_infer_time = 0
    landmark_infer_time = 0
    headpose_infer_time = 0
    gaze_infer_time = 0
    while True:
        # Read the next frame
        try:
            frame = next(input_feed.next_batch())
        except StopIteration:
            break

        if frame is None:
            break


        key_pressed = cv2.waitKey(60)
        frame_count += 1
        input_height, input_width, _ = frame.shape
        logging.info("frame {count} size {w}, {h}".format(count= frame_count, w = input_width, h =input_height)) 
        
        # face detection
        p_frame = face_model.preprocess_input(frame)
        start_time = time.time()
        fnoutput = face_model.predict(p_frame)
        facedetect_infer_time += time.time() - start_time
        out_frame,fboxes = face_model.preprocess_output(fnoutput,frame,args.overlay, args.prob_threshold)
        
        #for each face
        for fbox in fboxes:

            face = frame[fbox[1]:fbox[3],fbox[0]:fbox[2]]
            p_frame = facial_landmarks_model.preprocess_input(face)
            
            start_time = time.time()
            lmoutput = facial_landmarks_model.predict(p_frame)
            landmark_infer_time += time.time() - start_time
            out_frame,left_eye_point,right_eye_point = facial_landmarks_model.preprocess_output(lmoutput, fbox, out_frame,args.overlay, args.prob_threshold)

            # get head pose estimation
            p_frame  = head_pose_model.preprocess_input(face)
            start_time = time.time()
            hpoutput = head_pose_model.predict(p_frame)
            headpose_infer_time += time.time() - start_time
            out_frame, headpose_angles = head_pose_model.preprocess_output(hpoutput, out_frame, face, fbox, args.overlay, args.prob_threshold)

            # get gaze  estimation
            out_frame, left_eye, right_eye  = gaze_estimation_model.preprocess_input(out_frame,face,left_eye_point,right_eye_point,args.overlay)
            start_time = time.time()
            geoutput = gaze_estimation_model.predict(left_eye, right_eye, headpose_angles)
            gaze_infer_time += time.time() - start_time
            out_frame, gazevector = gaze_estimation_model.preprocess_output(geoutput,out_frame,fbox, left_eye_point,right_eye_point,args.overlay, args.prob_threshold)

            cv2.imshow('im', out_frame)
            
            if(args.mouse_move):
                logging.info("mouse move vector : x ={}, y={}".format(gazevector[0], gazevector[1])) 
                mouse_contr.move(gazevector[0], gazevector[1])
            
            #use only first detected face in the frame
            break
        
        # Break if escape key pressed
        if key_pressed == 27:
            break

    #logging inference times
    if(frame_count>0):
        logging.info("***** Models Inference time *****") 
        logging.info("Face Detection:{:.1f}ms".format(1000* facedetect_infer_time/frame_count))
        logging.info("Facial Landmarks Detection:{:.1f}ms".format(1000* landmark_infer_time/frame_count))
        logging.info("Headpose Estimation:{:.1f}ms".format(1000* headpose_infer_time/frame_count))
        logging.info("Gaze Estimation:{:.1f}ms".format(1000* gaze_infer_time/frame_count))


    # Release the capture and destroy any OpenCV windows
    input_feed.close()
    cv2.destroyAllWindows()
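This example reads its model paths and input video from a YAML file. The structure it expects after yaml.load is roughly the following; the file names are placeholders, not the author's actual paths:

# Illustrative shape of the parsed config; keys match what main() reads.
cfg = {
    "video_path": "bin/demo.mp4",  # placeholder input video
    "models": {
        "face_detection": "models/face-detection-adas-binary-0001.xml",
        "head_pose_estimation": "models/head-pose-estimation-adas-0001.xml",
        "facial_landmarks_detection": "models/landmarks-regression-retail-0009.xml",
        "gaze_estimation": "models/gaze-estimation-adas-0002.xml",
    },
}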