def run_inference(args):

    # Instantiate and load all models once, before the frame loop;
    # reloading them on every frame would be prohibitively slow.
    faceDetection = FaceDetection(model_name=args.face_detection_model)
    faceDetection.load_model()
    facialLandmarksDetection = FacialLandmarksDetection(
        args.facial_landmarks_detection_model)
    facialLandmarksDetection.load_model()
    headPoseEstimation = HeadPoseEstimation(args.head_pose_estimation_model)
    headPoseEstimation.load_model()
    gazeEstimation = GazeEstimation(args.gaze_estimation_model)
    gazeEstimation.load_model()

    feed = InputFeeder(input_type='video', input_file=args.input)
    feed.load_data()
    for batch in feed.next_batch():
        cv2.imshow("Output", cv2.resize(batch, (500, 500)))
        key = cv2.waitKey(60)

        if key == 27:  # Esc
            break

        # getting face
        face = faceDetection.predict(batch)

        # getting eyes
        left_eye, right_eye = facialLandmarksDetection.predict(face)

        # getting head pose angles
        head_pose = headPoseEstimation.predict(face)
        print("head pose angles: ", head_pose)

        # get mouse points
        mouse_coords = gazeEstimation.predict(left_eye, right_eye, head_pose)
        print("gaze output: ", mouse_coords)
    feed.close()
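
# For context: a minimal argument-parser sketch that would supply the fields
# run_inference() reads (args.input, args.face_detection_model, and so on).
# The flag names are assumptions inferred from the attribute names; they are
# not part of the original example.
import argparse

def build_argparser():
    parser = argparse.ArgumentParser(description="Gaze-based mouse control")
    parser.add_argument('--input', required=True,
                        help="Path to the input video file")
    parser.add_argument('--face_detection_model', required=True,
                        help="Path to the face detection model .xml")
    parser.add_argument('--facial_landmarks_detection_model', required=True,
                        help="Path to the facial landmarks model .xml")
    parser.add_argument('--head_pose_estimation_model', required=True,
                        help="Path to the head pose estimation model .xml")
    parser.add_argument('--gaze_estimation_model', required=True,
                        help="Path to the gaze estimation model .xml")
    return parser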
    def __init__(self, args):
        '''
        Initializes instance variables and loads all models the application
        needs (face detection, facial landmarks, head pose and gaze
        estimation).

        Args:
        args = All arguments parsed by the argument parser function

        Return:
        None
        '''

        init_start_time = time.time()
        self.output_path = args.output_path
        self.show_output = args.show_output
        self.total_processing_time = 0
        self.count_batch = 0
        self.inference_speed = []
        self.avg_inference_speed = 0

        if args.all_devices != 'CPU':
            args.face_device = args.all_devices
            args.face_landmark_device = args.all_devices
            args.head_pose_device = args.all_devices
            args.gaze_device = args.all_devices

        model_init_start = time.time()
        self.face_model = FaceDetection(args.face_model, args.face_device,
                                        args.face_device_ext,
                                        args.face_prob_threshold)
        self.landmarks_model = FacialLandmarksDetection(
            args.face_landmark_model, args.face_landmark_device,
            args.face_landmark_device_ext, args.face_landmark_prob_threshold)
        self.head_pose_model = HeadPoseEstimation(
            args.head_pose_model, args.head_pose_device,
            args.head_pose_device_ext, args.head_pose_prob_threshold)
        self.gaze_model = GazeEstimation(args.gaze_model, args.gaze_device,
                                         args.gaze_device_ext,
                                         args.gaze_prob_threshold)
        self.model_init_time = time.time() - model_init_start
        log.info('[ Main ] All required models initialized')

        self.mouse_control = MouseController(args.precision, args.speed)
        log.info('[ Main ] Mouse controller successfully initialized')

        self.input_feeder = InputFeeder(args.batch_size, args.input_type,
                                        args.input_file)
        log.info('[ Main ] Initialized input feeder')

        model_load_start = time.time()
        self.face_model.load_model()
        self.landmarks_model.load_model()
        self.head_pose_model.load_model()
        self.gaze_model.load_model()

        self.model_load_time = time.time() - model_load_start
        self.app_init_time = time.time() - init_start_time
        log.info('[ Main ] All models loaded to Inference Engine\n')

def models_handler(logger, args):
    ## collect all model paths from args into a dict
    model_paths = {
        'Face': args.face_detection_path,
        'Landmarks': args.facial_landmarks_path,
        'Headpose': args.head_pose_path,
        'Gaze': args.gaze_estimation_path
    }

    ## check that each model exists at the given path
    for model_key, model_path in model_paths.items():
        if not os.path.isfile(model_path):
            print("\n## " + model_key + " model path does not exist: " + model_path + '. Please try again.')
            logger.error("## " + model_key + " model path does not exist: " + model_path + '. Please try again.')
            exit(1)
        else:
            print('## ' + model_key + " model path is correct: " + model_path + '\n')
            logger.info('## ' + model_key + " model path is correct: " + model_path)

    ## initialize face detection model
    model_fd = FaceDetection(model_paths['Face'], args.device, args.cpu_extension)
    ## initialize facial landmarks detection model
    model_fld = FacialLandmarkDetection(model_paths['Landmarks'], args.device, args.cpu_extension)
    ## initialize head pose estimation model
    model_hpe = HeadPoseEstimation(model_paths['Headpose'], args.device, args.cpu_extension)
    ## initialize gaze estimation model
    model_ge = GazeEstimation(model_paths['Gaze'], args.device, args.cpu_extension)

    return model_fd, model_fld, model_hpe, model_ge
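
# A hedged usage sketch for models_handler: the argparse attribute names
# (face_detection_path, device, cpu_extension, ...) mirror what the function
# reads above; the parser itself is hypothetical and only for illustration.
if __name__ == '__main__':
    import logging
    logger = logging.getLogger(__name__)
    args = build_argparser().parse_args()  # hypothetical parser
    model_fd, model_fld, model_hpe, model_ge = models_handler(logger, args)
    for model in (model_fd, model_fld, model_hpe, model_ge):
        model.load_model()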
def main(args):
    # set log level
    levels = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR
    }

    log_level = levels.get(args.log_level, logging.ERROR)

    logging.basicConfig(level=log_level)

    mouse_control = MouseController('high', 'fast')

    logging.info("Model Loading Please Wait ..")
    face_det = FaceDetection(args.face_detection, args.device)
    facial_det = FaceLandmark(args.face_landmark, args.device)
    head_pose_est = HeadPoseEstimation(args.head_pose, args.device)
    gaze_est = GazeEstimation(args.gaze_estimation, args.device)
    logging.info("Model loading successfully")

    inp = InputFeeder(input_type='video', input_file=args.input)
    inp.load_data()

    face_det.load_model()
    facial_det.load_model()
    head_pose_est.load_model()
    gaze_est.load_model()

    video_writer = cv2.VideoWriter(os.path.join(args.output_dir,
                                                'demo_output11.mp4'),
                                   cv2.VideoWriter_fourcc(*'MPEG'), 15,
                                   (1920, 1080), True)

    cv2.namedWindow('gaze')
    for frame in inp.next_batch():
        if frame is None:
            break
        crop_face, crop_coords = face_det.predict(frame,
                                                  visualize=args.visualize)

        left_eye, right_eye, left_eye_crop, right_eye_crop = facial_det.predict(
            crop_face, visualize=args.visualize)
        head_pose = head_pose_est.predict(crop_face, visualize=args.visualize)

        (new_x, new_y), gaze_vector = gaze_est.predict(left_eye_crop,
                                                       right_eye_crop,
                                                       head_pose)

        left_eye_gaze = int(left_eye[0] +
                            gaze_vector[0] * 100), int(left_eye[1] -
                                                       gaze_vector[1] * 100)
        right_eye_gaze = int(right_eye[0] +
                             gaze_vector[0] * 100), int(right_eye[1] -
                                                        gaze_vector[1] * 100)

        cv2.arrowedLine(crop_face, left_eye, left_eye_gaze, (0, 0, 255), 2)
        cv2.arrowedLine(crop_face, right_eye, right_eye_gaze, (0, 0, 255), 2)

        video_writer.write(frame)
        mouse_control.move(new_x, new_y)

        if args.show_result:
            cv2.imshow('gaze', frame)
            cv2.waitKey(1)

    inp.close()
    video_writer.release()
    cv2.destroyAllWindows()
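
# The arrow endpoints above scale the gaze vector's (x, y) components by 100
# pixels and negate y, since image coordinates grow downwards while the gaze
# vector's y grows upwards. A small helper making that mapping explicit (a
# sketch; the scale factor is a free parameter, not from the original):
def gaze_to_point(origin, gaze_vector, scale=100):
    """Project a gaze vector from an eye centre to an image point."""
    return (int(origin[0] + gaze_vector[0] * scale),
            int(origin[1] - gaze_vector[1] * scale))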
def main():

    args = get_args().parse_args()
    input_path = args.input
    four_flags = args.flags_checker
    logger = logging.getLogger()
    feeder_in = None
    out_path = args.out_path

    if input_path.lower() == "cam":
        feeder_in = InputFeeder("cam")
    else:
        if not os.path.isfile(input_path):
            logger.error("The video was not found")
            exit(1)
        feeder_in = InputFeeder("video", input_path)

    model_locations = {
        'FaceDetection': args.face_detection_model,
        'HeadPoseEstimation': args.head_pose_estimation_model,
        'FacialLandmarksDetection': args.facial_landmarks_detection_model,
        'GazeEstimation': args.gaze_estimation_model
    }

    for key_name in model_locations.keys():
        if not os.path.isfile(model_locations[key_name]):
            loger.error("The system cannot find the " + key_name + " xml file")
            exit(1)

    dt = FaceDetection(model_locations['FaceDetection'], args.device,
                       args.cpu_extension)
    pe = HeadPoseEstimation(model_locations['HeadPoseEstimation'], args.device,
                            args.cpu_extension)
    ld = FacialLandmarksDetection(model_locations['FacialLandmarksDetection'],
                                  args.device, args.cpu_extension)
    ge = GazeEstimation(model_locations['GazeEstimation'], args.device,
                        args.cpu_extension)

    cursor = MouseController('medium', 'fast')

    feeder_in.load_data()
    model_load_time_start = time.time()
    dt.load_model()
    pe.load_model()
    ld.load_model()
    ge.load_model()
    total_load_time = time.time() - model_load_time_start

    frame_counter = 0
    inference_time_start = time.time()
    for ret, frame in feeder_in.next_batch():
        if not ret:
            break
        frame_counter = frame_counter + 1
        cv2.imshow('video', cv2.resize(frame, (600, 600)))

        key = cv2.waitKey(60)

        face_detected, coords_face = dt.predict(frame, args.p_th)
        if isinstance(face_detected, int):
            logger.error("The system cannot detect any face.")
            if key == 27:
                break
            continue

        head_pose_output = pe.predict(face_detected)
        eye_left_detect, eye_right_detect, eye_coordinates_detect = ld.predict(
            face_detected)
        coordi_update_pointer, coordi_gaze = ge.predict(
            eye_left_detect, eye_right_detect, head_pose_output)

        if len(four_flags) != 0:
            result_app = frame
            if 'fad' in four_flags:
                result_app = face_detected
            if 'hpe' in four_flags:
                cv2.putText(
                    result_app,
                    "HP Angles: YAW:{:.3f} * PITCH:{:.3f} * ROLL:{:.3f}".
                    format(head_pose_output[0], head_pose_output[1],
                           head_pose_output[2]), (5, 40),
                    cv2.FONT_HERSHEY_COMPLEX, 0.25, (153, 76, 0), 1)
            if 'fld' in four_flags:
                cv2.rectangle(face_detected,
                              (eye_coordinates_detect[0][0] - 4,
                               eye_coordinates_detect[0][1] - 4),
                              (eye_coordinates_detect[0][2] + 4,
                               eye_coordinates_detect[0][3] + 4),
                              (255, 255, 0), 4)
                cv2.rectangle(face_detected,
                              (eye_coordinates_detect[1][0] - 4,
                               eye_coordinates_detect[1][1] - 4),
                              (eye_coordinates_detect[1][2] + 4,
                               eye_coordinates_detect[1][3] + 4),
                              (255, 255, 0), 4)
            if 'gae' in four_flags:
                x = int(coordi_gaze[0] * 2)
                y = int(coordi_gaze[1] * 2)
                w = 150
                right_E = cv2.line(eye_right_detect, (x - w, y - w),
                                   (x + w, y + w), (51, 255, 153), 1)
                cv2.line(right_E, (x - w, y + w), (x + w, y - w),
                         (51, 255, 253), 1)
                left_E = cv2.line(eye_left_detect, (x - w, y - w),
                                  (x + w, y + w), (51, 255, 153), 1)
                cv2.line(left_E, (x - w, y + w), (x + w, y - w),
                         (51, 255, 253), 1)
                face_detected[
                    eye_coordinates_detect[1][1]:eye_coordinates_detect[1][3],
                    eye_coordinates_detect[1][0]:eye_coordinates_detect[1]
                    [2]] = right_E
                face_detected[
                    eye_coordinates_detect[0][1]:eye_coordinates_detect[0][3],
                    eye_coordinates_detect[0][0]:eye_coordinates_detect[0]
                    [2]] = left_E

            cv2.imshow("Result of the App", cv2.resize(result_app, (600, 600)))

        if frame_counter % 5 == 0:
            cursor.move(coordi_update_pointer[0], coordi_update_pointer[1])
        if key == 27:
            break

    total_time = time.time() - inference_time_start
    total_time_for_inference = round(total_time, 1)
    fps = frame_counter / total_time_for_inference

    with open(os.path.join(out_path, 'stats.txt'), 'w') as f:
        f.write('Inference time: ' + str(total_time_for_inference) + '\n')
        f.write('FPS: ' + str(fps) + '\n')
        f.write('Model load time: ' + str(total_load_time) + '\n')

    loger.error("The video stream is over...")
    cv2.destroyAllWindows()
    feeder_in.close()
def main():
    """
    Load the network and parse the output.
    :return: None
    """
    global INFO
    global DELAY
    global POSE_CHECKED
    #controller = MouseController()

    log.basicConfig(format="[ %(levelname)s ] %(message)s",
                    level=log.INFO,
                    stream=sys.stdout)
    args = args_parser().parse_args()
    logger = log.getLogger()

    if args.input == 'cam':
        input_stream = 0
    else:
        input_stream = args.input
        assert os.path.isfile(args.input), "Specified input file doesn't exist"

    cap = cv2.VideoCapture(input_stream)
    initial_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    initial_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    video_len = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    out = cv2.VideoWriter(os.path.join(args.output_dir, "shopper.mp4"),
                          cv2.VideoWriter_fourcc(*"MP4V"), fps,
                          (initial_w, initial_h), True)
    frame_count = 0

    job_id = 1  #os.environ['PBS_JOBID']
    progress_file_path = os.path.join(args.output_dir,
                                      'i_progress_' + str(job_id) + '.txt')

    infer_time_start = time.time()

    if input_stream:
        cap.open(args.input)
        # Adjust DELAY to match the number of FPS of the video file
        DELAY = 1000 / cap.get(cv2.CAP_PROP_FPS)

    if not cap.isOpened():
        logger.error("ERROR! Unable to open video source")
        return

    # Initialise the models; pass the CPU extension only when one was given
    kwargs = {}
    if args.cpu_extension:
        kwargs['extensions'] = args.cpu_extension
    facedet = FaceDetection(args.facemodel, args.confidence, **kwargs)
    posest = HeadPoseEstimation(args.posemodel, args.confidence, **kwargs)
    landest = FaceLandmarksDetection(args.landmarksmodel, args.confidence,
                                     **kwargs)
    gazeest = GazeEstimation(args.gazemodel, args.confidence, **kwargs)

    # infer_network_pose = Network()
    # Load the network to IE plugin to get shape of input layer
    facedet.load_model()
    posest.load_model()
    landest.load_model()
    gazeest.load_model()
    print("loaded models")

    while cap.isOpened():
        # Read one frame per iteration; the original read once before the
        # loop and again at the top, which silently dropped the first frame.
        ret, frame = cap.read()
        if not ret:
            print("checkpoint *BREAKING")
            break
        looking = 0
        POSE_CHECKED = False
        frame_count += 1

        if frame is None:
            log.error("checkpoint ERROR! blank FRAME grabbed")
            break

        initial_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        initial_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Start asynchronous inference for specified request
        inf_start_fd = time.time()
        # Results of the output layer of the network
        coords, frame = facedet.predict(frame)
        det_time_fd = time.time() - inf_start_fd
        if len(coords) > 0:
            [xmin, ymin, xmax,
             ymax] = coords[0]  # use only the first detected face
            head_pose = frame[ymin:ymax, xmin:xmax]
            inf_start_hp = time.time()
            is_looking, pose_angles = posest.predict(head_pose)
            if is_looking:
                det_time_hp = time.time() - inf_start_hp
                POSE_CHECKED = True
                #print(is_looking)
                inf_start_lm = time.time()
                coords, f = landest.predict(head_pose)
                frame[ymin:ymax, xmin:xmax] = f
                det_time_lm = time.time() - inf_start_lm

                [[xlmin, ylmin, xlmax, ylmax], [xrmin, yrmin, xrmax,
                                                yrmax]] = coords
                left_eye_image = frame[ylmin:ylmax, xlmin:xlmax]
                right_eye_image = frame[yrmin:yrmax, xrmin:xrmax]
                output = gazeest.predict(left_eye_image, right_eye_image,
                                         pose_angles)
        # Draw performance stats
        inf_time_message = "Face Inference time: {:.3f} ms.".format(
            det_time_fd * 1000)
        if POSE_CHECKED:
            cv2.putText(
                frame, "Head pose Inference time: {:.3f} ms.".format(
                    det_time_hp * 1000), (0, 35), cv2.FONT_HERSHEY_SIMPLEX,
                0.5, (255, 255, 255), 1)
            cv2.putText(frame, inf_time_message, (0, 15),
                        cv2.FONT_HERSHEY_COMPLEX, 0.5, (255, 255, 255), 1)
        out.write(frame)
        print("frame", frame_count)
        if frame_count % 10 == 0:
            print(time.time() - infer_time_start)
            progressUpdate(progress_file_path,
                           int(time.time() - infer_time_start), frame_count,
                           video_len)
    # Write the run statistics once, after the loop finishes
    if args.output_dir:
        total_time = time.time() - infer_time_start
        with open(os.path.join(args.output_dir, 'stats.txt'), 'w') as f:
            f.write(str(round(total_time, 1)) + '\n')
            f.write(str(frame_count) + '\n')
    facedet.clean()
    posest.clean()
    landest.clean()
    gazeest.clean()
    out.release()
    cap.release()
    cv2.destroyAllWindows()
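
# progressUpdate is called above but not shown in this example; a minimal
# sketch of what such a helper might write (the three-field format is an
# assumption, not the original implementation):
def progressUpdate(file_path, elapsed, frames_done, total_frames):
    with open(file_path, 'w') as f:
        f.write('{} {} {}\n'.format(elapsed, frames_done, total_frames))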
def inference(args):

    time_sheet = {
        'face_infr': [],
        'landmark_infr': [],
        'head_infr': [],
        'gaze_infr': [],
        'infr_per_frame': []
    }

    logging.basicConfig(filename='result.log', level=logging.INFO)
    logging.info(
        "================================================================================="
    )
    logging.info("Precision(face,landmark,head,gaze): FP32-INT1,FP{0},FP{1},FP{2}".format(\
            args.landmark_model.split("FP")[1].split("\\")[0],
            args.head_model.split("FP")[1].split("\\")[0],
            args.gaze_model.split("FP")[1].split("\\")[0]))

    model_load_start = time.time()

    face_detection = FaceDetection(args.face_model)
    face_detection.load_model()
    landmark_regression = LandmarkRegression(args.landmark_model)
    landmark_regression.load_model()
    head_pose = HeadPose(args.head_model)
    head_pose.load_model()
    gaze_estimation = GazeEstimation(args.gaze_model)
    gaze_estimation.load_model()

    logging.info("4 models load time: {0:.4f}sec".format(time.time() -
                                                         model_load_start))

    mouse_controller = MouseController('high', 'fast')

    cv2.namedWindow('preview', cv2.WND_PROP_FULLSCREEN)
    cv2.setWindowProperty('preview', cv2.WND_PROP_FULLSCREEN,
                          cv2.WINDOW_FULLSCREEN)

    input_feeder = InputFeeder(args.input_type, args.input_file)
    input_feeder.load_data()

    total_infr_start = time.time()

    for image in input_feeder.next_batch():
        if image is None:
            break
        face_infr_start = time.time()
        face_image = face_detection.predict(image)
        time_sheet['face_infr'].append(time.time() - face_infr_start)

        landmark_infr_start = time.time()
        left_eye_image, right_eye_image = landmark_regression.predict(
            np.copy(face_image))
        time_sheet['landmark_infr'].append(time.time() - landmark_infr_start)

        head_infr_start = time.time()
        head_pose_angles = head_pose.predict(np.copy(face_image))
        time_sheet['head_infr'].append(time.time() - head_infr_start)

        gaze_infr_start = time.time()
        x, y, z = gaze_estimation.predict(left_eye_image, right_eye_image,
                                          head_pose_angles)
        time_sheet['gaze_infr'].append(time.time() - gaze_infr_start)
        time_sheet['infr_per_frame'].append(time.time() - face_infr_start)
        cv2.imshow('preview', image)
        mouse_controller.move(x, y)
        key = cv2.waitKey(20)
        if key == 27:  # exit on ESC
            break

    logging.info("Face model avg inference per frame: {0:.4f}sec".format(
        np.mean(time_sheet['face_infr'])))
    logging.info("Landmark model avg inference per frame: {0:.4f}sec".format(
        np.mean(time_sheet['landmark_infr'])))
    logging.info("Head model avg inference per frame: {0:.4f}sec".format(
        np.mean(time_sheet['head_infr'])))
    logging.info("Gaze model avg inference per frame: {0:.4f}sec".format(
        np.mean(time_sheet['gaze_infr'])))
    logging.info("4 Model avg inference per frame: {0:.4f}sec".format(
        np.mean(time_sheet['infr_per_frame'])))
    logging.info("Total inference time: {0:.4f}sec".format(time.time() -
                                                           total_infr_start))
    logging.info(
        "====================================END==========================================\n"
    )

    input_feeder.close()
    cv2.destroyAllWindows()
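
# The repeated time.time() bookkeeping in inference() can be condensed with a
# small context manager; a sketch, not part of the original example:
import time
from contextlib import contextmanager

@contextmanager
def stage_timer(sink):
    """Append the wall-clock seconds spent inside the with-block to sink."""
    start = time.time()
    yield
    sink.append(time.time() - start)

# Usage inside the frame loop would look like:
#     with stage_timer(time_sheet['face_infr']):
#         face_image = face_detection.predict(image)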
def main(args):
    model = args.faceModel
    headpose = args.headpose
    device = args.device
    facialLandmark = args.facialLandmark
    gazeEstimation = args.gazeEstimation
    input_arg = args.input
    threshold = float(args.threshold)
    logging.basicConfig(filename='error_log.log', filemode='w')
    error_log = logging.getLogger()

    if input_arg == 'cam':
        input_stream = 0
        cap = cv2.VideoCapture(input_stream)
    else:
        if os.path.isfile(input_arg):
            input_stream = input_arg
            cap = cv2.VideoCapture(input_stream)
        else:
            print(
                'Could not find the input file or load the desired format; please use .mp4 or cam'
            )
            error_log.error(
                'Could not find the input file or load the desired format; please use .mp4 or cam'
            )
            exit(1)

    model_load_time = time.time()

    # Load Face Detection Model
    face_detection = Face_Model(model, threshold, device=device)
    face_net = face_detection.load_model()

    # Load Head Pose Detection Model
    head_pose = HeadPose(headpose, threshold, device=device)
    head_net = head_pose.load_model()

    # Load Facial Landmarks Model
    facial_landmarks = FacialLandmark(facialLandmark, threshold, device=device)
    landmark_net = facial_landmarks.load_model()

    # Load Gaze Estimation Model
    gaze_estimation = GazeEstimation(gazeEstimation, threshold, device=device)
    gaze_net = gaze_estimation.load_model()

    total_loading_time = time.time() - model_load_time

    mouse_controller = MouseController('medium', 'slow')

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Note: the FPS argument must come from cap.get(); the original passed
    # int(cv2.CAP_PROP_FPS), which is the property index, not the frame rate.
    out_video = cv2.VideoWriter('out_video.mp4',
                                cv2.VideoWriter_fourcc(*'avc1'),
                                int(cap.get(cv2.CAP_PROP_FPS)),
                                (width, height), True)
    frame_count = 0
    inference_time = time.time()
    print(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    while cap.isOpened():
        flag, frame = cap.read()
        if not flag:
            break

        key_pressed = cv2.waitKey(60)

        out_frame, face_coords = face_detection.predict(
            frame, face_net, width, height)
        if out_frame is not None:
            if not (out_frame.shape[1] == 0 or out_frame.shape[0] == 0):
                yaw, pitch, roll = head_pose.predict(out_frame, head_net)
                head_pose_angles = [yaw, pitch, roll]
                left_eye_image, right_eye_image, eye_cords = facial_landmarks.predict(
                    out_frame, landmark_net)
                mouse_pointer, gaze_vector = gaze_estimation.predict(
                    gaze_net, left_eye_image, right_eye_image,
                    head_pose_angles)

                mouse_controller.move(-mouse_pointer[0], mouse_pointer[1])
                if frame_count % 5 == 0:
                    cv2.putText(
                        frame,
                        "Head Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}"
                        .format(head_pose_angles[0], head_pose_angles[1],
                                head_pose_angles[2]), (50, 50),
                        cv2.FONT_HERSHEY_TRIPLEX, 1.0, (255, 255, 0), 1)
                    #cv2.arrowedLine(frame, (x, y), (x +5,y + 5), (255, 255, 0), 2)
                    cv2.rectangle(frame,
                                  (eye_cords[0][0] + face_coords[0] - 10,
                                   eye_cords[0][1] + face_coords[1] - 10),
                                  (eye_cords[0][2] + face_coords[0] + 10,
                                   eye_cords[0][3] + face_coords[1] + 10),
                                  (255, 255, 0), 2)
                    cv2.rectangle(frame,
                                  (eye_cords[1][0] + face_coords[0] - 10,
                                   eye_cords[1][1] + face_coords[1] - 10),
                                  (eye_cords[1][2] + face_coords[0] + 10,
                                   eye_cords[1][3] + face_coords[1] + 10),
                                  (255, 255, 0), 2)
                    cv2.rectangle(frame, (face_coords[0], face_coords[1]),
                                  (face_coords[2], face_coords[3]),
                                  (255, 255, 0), 2)
                    #cv2.imshow('prev', frame)

        out_video.write(frame)

        # Jump ahead 10 frames, i.e. process every 10th frame
        frame_count += 10
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_count)

        if frame_count % 10 == 0:
            print(frame_count)

        if frame_count == cap.get(cv2.CAP_PROP_FRAME_COUNT):
            break

        if key_pressed == 27:
            break

    cap.release()
    cv2.destroyAllWindows()

    total_inference_time = time.time() - inference_time
    total_fps = frame_count / total_inference_time

    with open('result.txt', 'w') as f:
        f.write(str(total_loading_time) + '\n')
        f.write(str(total_inference_time) + '\n')
        f.write(str(total_fps) + '\n')
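
# Note on the stats above: the loop advances the capture by 10 source frames
# per iteration, so frame_count counts source frames rather than frames that
# were actually inferred. If processed-frames-per-second is wanted instead, a
# sketch of the adjustment (an assumption about intent, not the original):
#     processed_frames = frame_count // 10
#     processing_fps = processed_frames / total_inference_time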
Example #9
def main():
    """
    Load the network and parse the output.
    :return: None
    """
    global POSE_CHECKED

    log.basicConfig(format="[ %(levelname)s ] %(message)s",
                    level=log.INFO,
                    stream=sys.stdout)
    args = args_parser().parse_args()
    logger = log.getLogger()

    if args.input == 'cam':
        input_stream = 0
    else:
        input_stream = args.input
        assert os.path.isfile(args.input), "Specified input file doesn't exist"

    cap = cv2.VideoCapture(input_stream)
    initial_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    initial_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    video_len = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))

    out = cv2.VideoWriter(os.path.join(args.output_dir, "output.mp4"),
                          cv2.VideoWriter_fourcc(*"MP4V"), fps,
                          (initial_w, initial_h), True)

    if args.write_intermediate == 'yes':
        out_fm = cv2.VideoWriter(
            os.path.join(args.output_dir, "output_fm.mp4"),
            cv2.VideoWriter_fourcc(*"MP4V"), fps, (initial_w, initial_h), True)
        out_lm = cv2.VideoWriter(
            os.path.join(args.output_dir, "output_lm.mp4"),
            cv2.VideoWriter_fourcc(*"MP4V"), fps, (initial_w, initial_h), True)
        out_pm = cv2.VideoWriter(
            os.path.join(args.output_dir, "output_pm.mp4"),
            cv2.VideoWriter_fourcc(*"MP4V"), fps, (initial_w, initial_h), True)
        out_gm = cv2.VideoWriter(
            os.path.join(args.output_dir, "output_gm.mp4"),
            cv2.VideoWriter_fourcc(*"MP4V"), fps, (initial_w, initial_h), True)

    frame_count = 0

    job_id = 1

    infer_time_start = time.time()

    if input_stream:
        cap.open(args.input)
        # Adjust DELAY to match the number of FPS of the video file

    if not cap.isOpened():
        logger.error("ERROR! Unable to open video source")
        return

    # Run asynchronously unless sync mode was requested
    async_mode = args.mode != 'sync'

    # Initialise the models; pass the CPU extension only when one was given
    kwargs = {'async_mode': async_mode}
    if args.cpu_extension:
        kwargs['extensions'] = args.cpu_extension
    face_det = FaceDetection(args.facemodel, args.confidence, **kwargs)
    pose_det = HeadPoseEstimation(args.posemodel, args.confidence, **kwargs)
    land_det = FaceLandmarksDetection(args.landmarksmodel, args.confidence,
                                      **kwargs)
    gaze_est = GazeEstimation(args.gazemodel, args.confidence, **kwargs)

    # infer_network_pose = Network()
    # Load the network to IE plugin to get shape of input layer
    face_det.load_model()
    pose_det.load_model()
    land_det.load_model()
    gaze_est.load_model()

    model_load_time = time.time() - infer_time_start

    print("All models are loaded successfully")

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("checkpoint *BREAKING")
            break

        frame_count += 1
        looking = 0
        POSE_CHECKED = False

        if frame is None:
            log.error("checkpoint ERROR! blank FRAME grabbed")
            break

        initial_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        initial_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Start asynchronous inference for specified request
        inf_start_fd = time.time()

        # Results of the output layer of the network
        coords, frame = face_det.predict(frame)

        if args.write_intermediate == 'yes':
            out_fm.write(frame)

        det_time_fd = time.time() - inf_start_fd

        if len(coords) > 0:
            [xmin, ymin, xmax,
             ymax] = coords[0]  # use only the first detected face
            head_pose = frame[ymin:ymax, xmin:xmax]
            inf_start_hp = time.time()
            is_looking, pose_angles = pose_det.predict(head_pose)
            if args.write_intermediate == 'yes':
                p = "Pose Angles {}, is Looking? {}".format(
                    pose_angles, is_looking)
                cv2.putText(frame, p, (50, 15), cv2.FONT_HERSHEY_COMPLEX, 0.5,
                            (255, 0, 0), 1)
                out_pm.write(frame)

            if is_looking:
                det_time_hp = time.time() - inf_start_hp
                POSE_CHECKED = True
                inf_start_lm = time.time()
                coords, f = land_det.predict(head_pose)

                frame[ymin:ymax, xmin:xmax] = f

                if args.write_intermediate == "yes":
                    out_lm.write(frame)

                det_time_lm = time.time() - inf_start_lm
                [[xlmin, ylmin, xlmax, ylmax], [xrmin, yrmin, xrmax,
                                                yrmax]] = coords
                left_eye_image = f[ylmin:ylmax, xlmin:xlmax]
                right_eye_image = f[yrmin:yrmax, xrmin:xrmax]

                output, gaze_vector = gaze_est.predict(left_eye_image,
                                                       right_eye_image,
                                                       pose_angles)

                if args.write_intermediate == 'yes':
                    p = "Gaze Vector {}".format(gaze_vector)
                    cv2.putText(frame, p, (50, 15), cv2.FONT_HERSHEY_COMPLEX,
                                0.5, (255, 0, 0), 1)
                    fl = draw_gaze(left_eye_image, gaze_vector)
                    fr = draw_gaze(right_eye_image, gaze_vector)
                    f[ylmin:ylmax, xlmin:xlmax] = fl
                    f[yrmin:yrmax, xrmin:xrmax] = fr
                    # cv2.arrowedLine(f, (xlmin, ylmin), (xrmin, yrmin), (0,0,255), 5)
                    out_gm.write(frame)

        # Draw performance stats
        inf_time_message = "Face Inference time: {:.3f} ms.".format(
            det_time_fd * 1000)
        #
        if POSE_CHECKED:
            cv2.putText(
                frame, "Head pose Inference time: {:.3f} ms.".format(
                    det_time_hp * 1000), (0, 35), cv2.FONT_HERSHEY_SIMPLEX,
                0.5, (0, 255, 0), 1)
            cv2.putText(frame, inf_time_message, (0, 15),
                        cv2.FONT_HERSHEY_COMPLEX, 0.5, (255, 0, 0), 1)
        out.write(frame)
        if frame_count % 10 == 0:
            print("Inference time = ", int(time.time() - infer_time_start))
            print('Frame count {} and video len {}'.format(
                frame_count, video_len))
    # Write the run statistics once, after the loop finishes
    if args.output_dir:
        total_time = time.time() - infer_time_start
        with open(os.path.join(args.output_dir, 'stats.txt'), 'w') as f:
            f.write(str(round(total_time, 1)) + '\n')
            f.write(str(frame_count) + '\n')
            f.write(str(round(model_load_time)) + '\n')

    # Clean all models
    face_det.clean()
    pose_det.clean()
    land_det.clean()
    gaze_est.clean()
    # release cv2 cap
    cap.release()
    cv2.destroyAllWindows()
    # release all out writer
    out.release()
    if args.write_intermediate == 'yes':
        out_fm.release()
        out_pm.release()
        out_lm.release()
        out_gm.release()
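
# draw_gaze is called in the intermediate-output branch above but not defined
# in this example; a plausible sketch of such a helper (assumed signature and
# behaviour, for illustration only):
def draw_gaze(eye_image, gaze_vector, scale=50):
    h, w = eye_image.shape[:2]
    centre = (w // 2, h // 2)
    tip = (int(centre[0] + gaze_vector[0] * scale),
           int(centre[1] - gaze_vector[1] * scale))
    cv2.arrowedLine(eye_image, centre, tip, (0, 0, 255), 2)
    return eye_image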
def main():
    arg_parser = ArgParser()
    args = arg_parser.get_args()

    input_file = args.input

    # If input file defined then use it else use the webcam
    if input_file:
        if not os.path.isfile(input_file):
            log.error("Input file cannot be found")
            exit()
        input_feeder = InputFeeder("video", input_file)
    else:
        input_feeder = InputFeeder("cam")

    face_detection_model = FaceDetection(args.face_detection_model,
                                         args.device, args.extensions)
    face_detection_model.load_model()

    facial_landmarks_model = FacialLandmarksDetection(
        args.facial_landmark_detection_model, args.device, args.extensions)
    facial_landmarks_model.load_model()

    gaze_model = GazeEstimation(args.gaze_estimation_model, args.device,
                                args.extensions)
    gaze_model.load_model()

    head_pose_model = HeadPoseEstimation(args.head_pose_estimation_model,
                                         args.device, args.extensions)
    head_pose_model.load_model()

    mouse_controller = MouseController('medium', 'fast')

    input_feeder.load_data()

    frame_count = 0
    total_face_detection_inference_time = 0
    total_facial_landmark_inference_time = 0
    total_head_pose_inference_time = 0
    total_gaze_estimation_inference_time = 0
    total_inference_time = 0
    for ret, frame in input_feeder.next_batch():

        if not ret:
            log.info("End of the input stream reached")
            break

        frame_count += 1

        if frame_count % args.mouse_update_interval == 0:
            cv2.imshow('Input', frame)

        key_pressed = cv2.waitKey(60)

        # Run inference on the face detection model
        start_time = time.time()
        cropped_face, face_coordinates = face_detection_model.predict(
            frame.copy(), args.probability_threshold)
        finish_time = time.time()
        total_face_detection_inference_time += finish_time - start_time
        total_inference_time += finish_time - start_time

        # If no face detected get the next frame
        if len(face_coordinates) == 0:
            continue

        # Run inference on the facial landmark detection model
        start_time = time.time()
        results = facial_landmarks_model.predict(cropped_face.copy())
        finish_time = time.time()
        left_eye_coordinates = results[0]
        right_eye_coordinates = results[1]
        left_eye_image = results[2]
        right_eye_image = results[3]
        left_eye_crop_coordinates = results[4]
        right_eye_crop_coordinates = results[5]
        total_facial_landmark_inference_time += finish_time - start_time
        total_inference_time += finish_time - start_time

        # Run inference on the head pose estimation model
        start_time = time.time()
        head_pose = head_pose_model.predict(cropped_face.copy())
        finish_time = time.time()
        total_head_pose_inference_time += finish_time - start_time
        total_inference_time += finish_time - start_time

        # Run inference on the gaze estimation model
        start_time = time.time()
        new_mouse_x_coordinate, new_mouse_y_coordinate, gaze_vector = gaze_model.predict(
            left_eye_image, right_eye_image, head_pose)
        finish_time = time.time()
        total_gaze_estimation_inference_time += finish_time - start_time
        total_inference_time += finish_time - start_time

        if frame_count % args.mouse_update_interval == 0:
            log.info("Mouse controller new coordinates: x = {}, y = {}".format(
                new_mouse_x_coordinate, new_mouse_y_coordinate))
            mouse_controller.move(new_mouse_x_coordinate,
                                  new_mouse_y_coordinate)

            # Optional visualization configuration:
            if args.show_detected_face:
                showDetectedFace(frame, face_coordinates)

            if args.show_head_pose:
                showHeadPose(frame, head_pose)

            if args.show_facial_landmarks:
                showFacialLandmarks(cropped_face, left_eye_crop_coordinates,
                                    right_eye_crop_coordinates)

            if args.show_gaze_estimation:
                showGazeEstimation(frame, right_eye_coordinates,
                                   left_eye_coordinates, gaze_vector,
                                   cropped_face, face_coordinates)

        # Break if escape key pressed
        if key_pressed == 27:
            log.warning("Keyboard interrupt triggered")
            break

    # Release the capture and destroy any OpenCV windows
    cv2.destroyAllWindows()
    input_feeder.close()
    log.info("Average face detection inference time: {} seconds".format(
        total_face_detection_inference_time / frame_count))
    log.info(
        "Average facial landmark detection inference time: {} seconds".format(
            total_facial_landmark_inference_time / frame_count))
    log.info("Average head pose estimation inference time: {} seconds".format(
        total_head_pose_inference_time / frame_count))
    log.info("Average gaze estimation inference time: {} seconds".format(
        total_gaze_estimation_inference_time / frame_count))
    log.info("Average total inference time: {} seconds".format(
        total_inference_time / frame_count))
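
# showDetectedFace and the other show* helpers are referenced above but not
# shown; a minimal sketch of one of them (assumed behaviour: draw the face
# box on the frame; the window name is illustrative):
def showDetectedFace(frame, face_coordinates):
    xmin, ymin, xmax, ymax = face_coordinates
    cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
    cv2.imshow('Detected face', frame)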
def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """

    #if args.input == 'cam':
    #    args.input = 0
    output_intermediate_model = args.output_intermediate_model

    ### TODO: Handle the input stream ###
    feed = InputFeeder(input_type=args.input_type, input_file=args.input_file)
    cap = feed.load_data()
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))

    # Initialise the class
    try:
        infer_network_face_detection = BasePointer()
        infer_network_head_pose_estimation = BasePointer()
        infer_network_landmarks_regression_retail = BasePointer()
        infer_network_gaze_estimation = GazeEstimation()
    except Exception:
        logging.error("Error in initializing models")
        exit(1)
    ### TODO: Load the model through `infer_network_face_detection` ###
    try:
        start_loading_time_face_detection = time.time()
        infer_network_face_detection.load_model(args.model1, args.device)
        load_model_face_detection_time_taken = time.time(
        ) - start_loading_time_face_detection

        start_loading_time_head_pose_estimation = time.time()
        infer_network_head_pose_estimation.load_model(args.model2, args.device)
        load_model_head_pose_estimation_time_taken = time.time(
        ) - start_loading_time_head_pose_estimation

        start_loading_time_landmarks_regression_retail = time.time()
        infer_network_landmarks_regression_retail.load_model(
            args.model3, args.device)
        load_model_landmarks_regression_retail_time_taken = time.time(
        ) - start_loading_time_landmarks_regression_retail

        start_loading_time_gaze_estimation = time.time()
        infer_network_gaze_estimation.load_model(args.model4, args.device)
        load_model_gaze_estimation_time_taken = time.time(
        ) - start_loading_time_gaze_estimation
    except Exception:
        logging.error("Error in loading the models")
        exit(1)

    logging.debug(
        "Loading times for facial detection : {} , landmark detection : {} , head pose detection : {} , gaze estimation : {} "
        .format(load_model_face_detection_time_taken,
                load_model_landmarks_regression_retail_time_taken,
                load_model_head_pose_estimation_time_taken,
                load_model_gaze_estimation_time_taken))

    if output_intermediate_model == 'true':
        out = cv2.VideoWriter('out.mp4', CODEC, fps, (width, height))

    total_time_taken_to_infer_inf_face_detection = 0
    total_time_taken_to_infer_landmarks_regression_retail = 0
    total_time_taken_to_infer_inf_head_pose_estimation = 0
    total_time_taken_to_infer_gaze_estimation = 0

    # Create the mouse controller once, outside the frame loop
    mouse_controler_pc = MouseController("high", "fast")

    ### TODO: Loop until stream is over ###
    for batch in feed.next_batch():
        ### TODO: Read from the video capture ###

        flag, frame = batch
        if not flag:
            break
        key_pressed = cv2.waitKey(60)

        ### TODO: Start inference for face detection ###
        start_inf_face_detection = time.time()
        outputs_face_detection = infer_network_face_detection.predict(frame)
        time_taken_to_infer_inf_face_detection = time.time(
        ) - start_inf_face_detection
        coords, frame = infer_network_face_detection.preprocess_output_face_detection(
            outputs_face_detection, width, height, args.prob_threshold, frame)
        if output_intermediate_model == 'true':
            out.write(frame)

        frame_crop_face = crop_face(coords, frame, output_intermediate_model)

        start_inf_head_pose_estimation = time.time()
        outputs_head_pose_estimation = infer_network_head_pose_estimation.predict(
            frame_crop_face)
        time_taken_to_infer_inf_head_pose_estimation = time.time(
        ) - start_inf_head_pose_estimation

        yaw, pitch, roll = infer_network_head_pose_estimation.preprocess_output_head_pose_estimation(
            outputs_head_pose_estimation, frame_crop_face)
        head_pose_angles = [yaw, pitch, roll]

        if output_intermediate_model == 'true':
            cv2.putText(frame, ("Yaw: " + str(int(yaw))), (100, 100),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1)
            cv2.putText(frame, ("Pitch: " + str(int(pitсh))), (100, 140),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1)
            cv2.putText(frame, ("Roll: " + str(int(roll))), (100, 180),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1)

        height_crop_face = coords[0][3] - coords[0][1]
        width_crop_face = coords[0][2] - coords[0][0]

        start_inf_landmarks_regression_retail = time.time()
        outputs_landmarks_regression_retail = infer_network_landmarks_regression_retail.predict(
            frame_crop_face)
        time_taken_to_infer_landmarks_regression_retail = time.time(
        ) - start_inf_landmarks_regression_retail

        coord_landmarks_regression_retail = infer_network_landmarks_regression_retail.preprocess_output_landmarks_regression_retail(
            outputs_landmarks_regression_retail, width_crop_face,
            height_crop_face, args.prob_threshold, frame)
        center_left_eye = ((coords[0][0] +
                            coord_landmarks_regression_retail[0]),
                           coords[0][1] + coord_landmarks_regression_retail[1])
        center_right_eye = ((coords[0][0] +
                             coord_landmarks_regression_retail[2]),
                            coords[0][1] +
                            coord_landmarks_regression_retail[3])

        xmin_left_eye = center_left_eye[0] - 30
        ymin_left_eye = center_left_eye[1] - 30
        xmax_left_eye = center_left_eye[0] + 30
        ymax_left_eye = center_left_eye[1] + 30
        xmin_right_eye = center_right_eye[0] - 30
        ymin_right_eye = center_right_eye[1] - 30
        xmax_right_eye = center_right_eye[0] + 30
        ymax_right_eye = center_right_eye[1] + 30

        frame_landmarks_regression_retail = cv2.circle(frame,
                                                       center_left_eye,
                                                       2, (0, 255, 0),
                                                       thickness=3)
        frame_landmarks_regression_retail = cv2.circle(frame,
                                                       center_right_eye,
                                                       2, (0, 255, 0),
                                                       thickness=3)
        # Crop the eye regions before drawing: cv2.rectangle draws in place
        # and returns the full frame, so the original passed whole frames to
        # the gaze model instead of eye crops.
        box_left_eye = frame[ymin_left_eye:ymax_left_eye,
                             xmin_left_eye:xmax_left_eye].copy()
        box_right_eye = frame[ymin_right_eye:ymax_right_eye,
                              xmin_right_eye:xmax_right_eye].copy()
        cv2.rectangle(frame, (xmin_left_eye, ymin_left_eye),
                      (xmax_left_eye, ymax_left_eye), (0, 255, 0), 3)
        cv2.rectangle(frame, (xmin_right_eye, ymin_right_eye),
                      (xmax_right_eye, ymax_right_eye), (0, 255, 0), 3)
        if output_intermediate_model == 'true':
            out.write(frame_landmarks_regression_retail)

        ### TODO: Start inference for gaze estimation ###
        start_inf_gaze_estimation = time.time()
        outputs_gaze_estimation = infer_network_gaze_estimation.predict(
            box_left_eye, box_right_eye, head_pose_angles)
        time_taken_to_infer_gaze_estimation = time.time(
        ) - start_inf_gaze_estimation

        total_time_taken_to_infer_inf_face_detection = time_taken_to_infer_inf_face_detection + total_time_taken_to_infer_inf_face_detection
        total_time_taken_to_infer_landmarks_regression_retail = time_taken_to_infer_landmarks_regression_retail + total_time_taken_to_infer_landmarks_regression_retail
        total_time_taken_to_infer_inf_head_pose_estimation = time_taken_to_infer_inf_head_pose_estimation + total_time_taken_to_infer_inf_head_pose_estimation
        total_time_taken_to_infer_gaze_estimation = time_taken_to_infer_gaze_estimation + total_time_taken_to_infer_gaze_estimation

        arrow = 100
        g_x = int(outputs_gaze_estimation[0] * arrow)
        g_y = int(-(outputs_gaze_estimation[1]) * arrow)

        frame = cv2.arrowedLine(frame, (center_left_eye),
                                ((center_left_eye[0] + g_x),
                                 (center_left_eye[1] + g_y)), (0, 0, 255), 3)
        frame = cv2.arrowedLine(frame, (center_right_eye),
                                ((center_right_eye[0] + g_x),
                                 (center_right_eye[1] + g_y)), (0, 0, 255), 3)

        if output_intermediate_model == 'true':
            out.write(frame)

        mouse_controler_pc = MouseController("high", "fast")
        mouse_controler_pc.move(outputs_gaze_estimation[0],
                                outputs_gaze_estimation[1])

        if key_pressed == 27:
            break
    feed.close()

    logging.debug(
        "total inference times for facial detection : {} , landmark detection : {} , head pose detection : {} , gaze estimation : {} "
        .format(total_time_taken_to_infer_inf_face_detection,
                total_time_taken_to_infer_landmarks_regression_retail,
                total_time_taken_to_infer_inf_head_pose_estimation,
                total_time_taken_to_infer_gaze_estimation))
    if output_intermediate_model == 'true':
        out.release()
    #cap.release()
    cv2.destroyAllWindows()
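
# crop_face is used above but not defined in this example; a sketch that is
# consistent with how it is called (first detected face only, which is an
# assumption):
def crop_face(coords, frame, output_intermediate_model='false'):
    xmin, ymin, xmax, ymax = coords[0]
    return frame[ymin:ymax, xmin:xmax]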
Example #12
def main(args):
    #model=args.model
    fd_model = args.face
    flmd_model = args.landmarks
    hp_model = args.head
    ge_model = args.gaze
    device = args.device
    display_flag = args.display

    # Init and load models
    fd = FaceDetection(fd_model, device)
    logger.info("######## Model loading Time #######")
    start = time.time()
    fd.load_model()
    logger.info("Face Detection Model: {:.1f}ms".format(1000 *
                                                        (time.time() - start)))

    flmd = FacialLandMarksDetection(flmd_model, device)
    start = time.time()
    flmd.load_model()
    logger.info("Facial Landmarks Detection Model: {:.1f}ms".format(
        1000 * (time.time() - start)))

    hpe = HeadPoseEstimation(hp_model, device)
    start = time.time()
    hpe.load_model()
    logger.info("HeadPose Estimation Model: {:.1f}ms".format(
        1000 * (time.time() - start)))

    ge = GazeEstimation(ge_model, device)
    start = time.time()
    ge.load_model()
    logger.info("Gaze Estimation Model: {:.1f}ms".format(
        1000 * (time.time() - start)))

    # Mouse controller
    mc = MouseController("low", "fast")

    feed = InputFeeder(input_type=args.input_type, input_file=args.input_file)
    feed.load_data()

    frame_count = 0
    fd_inference_time = 0
    lm_inference_time = 0
    hp_inference_time = 0
    ge_inference_time = 0
    move_mouse = False

    for batch in feed.next_batch():
        frame_count += 1
        # Preprocessed output from face detection
        face_boxes, image, fd_time = fd.predict(batch, display_flag)
        fd_inference_time += fd_time

        for face in face_boxes:
            cropped_face = batch[face[1]:face[3], face[0]:face[2]]
            #print(f"Face boxe = {face}")
            # Get preprocessed result from landmarks
            image, left_eye, right_eye, lm_time = flmd.predict(
                image, cropped_face, face, display_flag)
            lm_inference_time += lm_time

            # Get preprocessed result from pose estimation
            image, headpose_angels, hp_time = hpe.predict(
                image, cropped_face, face, display_flag)
            hp_inference_time += hp_time

            # Get preprocessed result from Gaze estimation model
            image, gazevector, ge_time = ge.predict(image, cropped_face, face,
                                                    left_eye, right_eye,
                                                    headpose_angels,
                                                    display_flag)
            #cv2.imshow('Face', cropped_face)
            ge_inference_time += ge_time
            #print(f"Gaze vect {gazevector[0],gazevector[1]}")
            cv2.imshow('img', image)
            if not move_mouse:
                mc.move(gazevector[0], gazevector[1])
            break

        if cv2.waitKey(1) & 0xFF == ord("k"):
            break
    if frame_count > 0:
        logger.info("###### Models Inference time ######")
        logger.info(
            f"Face Detection inference time = {(fd_inference_time*1000)/frame_count} ms"
        )
        logger.info(
            f"Facial Landmarks Detection inference time = {(lm_inference_time*1000)/frame_count} ms"
        )
        logger.info(
            f"Headpose Estimation inference time = {(hp_inference_time*1000)/frame_count} ms"
        )
        logger.info(
            f"Gaze estimation inference time = {(ge_inference_time*1000)/frame_count} ms"
        )
    feed.close()
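
# MouseController appears in every example above; implementations in these
# projects typically wrap pyautogui along the following lines. The
# precision/speed mappings below are an assumption for illustration, not a
# specific project's code:
import pyautogui

class MouseController:
    def __init__(self, precision, speed):
        precision_dict = {'high': 100, 'low': 1000, 'medium': 500}
        speed_dict = {'fast': 1, 'slow': 10, 'medium': 5}
        self.precision = precision_dict[precision]
        self.speed = speed_dict[speed]

    def move(self, x, y):
        # Gaze x follows screen x; gaze y is inverted relative to screen y
        pyautogui.moveRel(x * self.precision, -1 * y * self.precision,
                          duration=self.speed)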
Example #13
def main(args):
    device = args.device
    video_file = args.video
    input_type = args.input_type
    toggle = args.toggle
    stats = args.stats
    model = args.model

    # Interpret the 'true'/'false' string flags as booleans
    stats = stats == 'true'
    toggle = toggle == 'true'

    # Start Model Loading
    start_model_load_time = time.time()
    print('[INFO] Started Model Loading...')

    face_model = FaceDetection(parse_models_file(
        label='face_detection', path=model),
        device)
    face_model.load_model()

    # Load Landmark model
    landmark_model = LandMarksDetection(
        parse_models_file(label='facial_landmarks_detection', path=model),
        device)
    landmark_model.load_model()
    pose_estimation_model = HeadPoseEstimation(
        parse_models_file(label='head_pose_estimation', path=model),
        device)
    pose_estimation_model.load_model()

    gaze_estimation_model = GazeEstimation(
        parse_models_file(label='gaze_estimation', path=model), device)
    gaze_estimation_model.load_model()

    total_model_load_time = time.time() - start_model_load_time
    print('[TOTAL] Loaded in {:.3f} s'.format(total_model_load_time))

    # End Model Loading
    mouse = MouseController('high', 'fast')
    if not toggle:
        cv2.namedWindow(MAIN_WINDOW_NAME, cv2.WINDOW_AUTOSIZE)
    try:
        feed = InputFeeder(input_type=input_type, input_file=video_file)
        feed.load_data()
        initial_w = int(feed.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        initial_h = int(feed.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        counter = 0
        for frame, ret in feed.next_batch():
            if not ret:
                break
            try:
                counter += 1
                # Start Inferences
                coord = face_model.predict(frame, (initial_w, initial_h))

                for i in range(len(coord)):
                    xmin, ymin, xmax, ymax = coord[i]
                    cropped_image = frame[ymin:ymax, xmin:xmax]
                    # Landmark Inference
                    cropped_left, cropped_right = landmark_model.predict(cropped_image)
                    # Skip this face if either eye crop is too small for the model
                    if (cropped_left.shape[0] < 60 or cropped_left.shape[1] < 60 or
                            cropped_right.shape[0] < 60 or cropped_right.shape[1] < 60):
                        break
                    # Pose Estimation Inference
                    poses = pose_estimation_model.predict(cropped_image)
                    # Gaze Estimation Inference
                    gz = gaze_estimation_model.predict(poses, cropped_left, cropped_right)
                    # Mouse Controller
                    mouse.move(gz[0][0], gz[0][1])
                    # If user pass statistics argument to true
                    if stats:
                        # Print performance
                        performance_counts(
                            face_model.performance_counter(0)
                        )
                        performance_counts(
                            pose_estimation_model.performance_counter(0)
                        )
                        performance_counts(
                            landmark_model.performance_counter(0)
                        )
                        performance_counts(
                            gaze_estimation_model.performance_counter(0)
                        )

                if not toggle:
                    # Output Camera or Video
                    #cv2.resizeWindow(MAIN_WINDOW_NAME, 480, 320)
                    cv2.imshow(MAIN_WINDOW_NAME, frame)

                else:
                    # Print Statistics only no camera or video
                    performance_counts(
                        face_model.performance_counter(0)
                    )
                    performance_counts(
                        pose_estimation_model.performance_counter(0)
                    )
                    performance_counts(
                        landmark_model.performance_counter(0)
                    )
                    performance_counts(
                        gaze_estimation_model.performance_counter(0)
                    )

                cv2.waitKey(1)
            except Exception as e:
                print('Could not run Inference', e)

        feed.close()
    except Exception as e:
        print("Could not run Inference: ", e)
Example #14
if __name__ == '__main__':

    logger = logging.getLogger()
    args = arg_parse()
    input_file = args.input

    face_model_path = args.face_detection
    head_pose_path = args.head_pose
    facial_landmark_path = args.facial_landmark
    gaze_model_path = args.gaze_model

    face_model = FaceDetection(model_name=face_model_path)
    head_pose_model = HeadPoseEstimation(model_name=head_pose_path)
    facial_landmark_model = FacialLandmarkDetection(
        model_name=facial_landmark_path)
    gaze_estimation_model = GazeEstimation(model_name=gaze_model_path)

    mouse_controller = MouseController('medium', 'fast')

    start_time = time.time()
    face_model.load_model()
    face_loading_time = (time.time() - start_time) * 1000

    head_start_time = time.time()
    head_pose_model.load_model()
    head_pose_time = (time.time() - head_start_time) * 1000

    facial_landmark_start = time.time()
    facial_landmark_model.load_model()
    facial_landmark_time = (time.time() - facial_landmark_start) * 1000
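The snippet above repeats the same start-timer / load / stop-timer pattern for every model; a small helper (hypothetical, not part of the original code) would remove the duplication:

import time

def load_timed(model):
    # Load a model and return the elapsed time in milliseconds
    start = time.time()
    model.load_model()
    return (time.time() - start) * 1000

# usage: face_loading_time = load_timed(face_model)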
Example #15
def test_run(args):
    logging.getLogger().setLevel(logging.INFO)
    feeder = None
    activate_frame_count = 10
    logging.warning("Using default activate_frame_count = 10")
    if args.input_type == 'video' or args.input_type == 'image':
        feeder = InputFeeder(args.input_type, args.input)
        if args.input == '../bin/demo.mp4':
            logging.warning("Running default setting and input")
    elif args.input_type == 'webcam':
        feeder = InputFeeder(args.input_type, args.input)
    else:
        logging.error("Input not found")
        exit(1)

    mouse_controller = MouseController(args.precision, args.speed)

    feeder.load_data()
    start_time = 0

    face_model_load_time = 0
    start_time = time.time()
    face_model = FaceDetection(args.face, args.device, args.cpu_extension)
    face_model.load_model()
    face_model_load_time = time.time() - start_time
    logging.info("Face Detection Model Loaded...")

    head_pose_estimation_load_time = 0
    start_time = time.time()
    head_pose_estimation = HeadPoseEstimation(args.headpose, args.device,
                                              args.cpu_extension)
    head_pose_estimation.load_model()
    head_pose_estimation_load_time = time.time() - start_time
    logging.info("Head Pose Detection Model Loaded...")

    facial_landmarks_detection_load_time = 0
    start_time = time.time()
    facial_landmarks_detection = FacialLandmarksDetection(
        args.landmarks, args.device, args.cpu_extension)
    facial_landmarks_detection.load_model()
    facial_landmarks_detection_load_time = time.time() - start_time
    logging.info("Facial Landmark Detection Model Loaded...")

    gaze_model_load_time = 0
    start_time = time.time()
    gaze_model = GazeEstimation(args.gazeestimation, args.device,
                                args.cpu_extension)
    gaze_model.load_model()
    gaze_model_load_time = time.time() - start_time
    logging.info("Gaze Estimation Model Loaded...")

    frame_count = 0

    total_face_model_inference_time = 0
    total_head_pose_estimation_inference_time = 0
    total_facial_landmarks_detection_inference_time = 0
    total_gaze_model_inference_time = 0
    start_time = 0
    for frame in feeder.next_batch():
        if frame is None:
            break
        frame_count += 1
        key = cv2.waitKey(60)

        start_time = time.time()
        first_face_box, first_face = face_model.predict(frame.copy())
        total_face_model_inference_time = total_face_model_inference_time + (
            time.time() - start_time)

        start_time = time.time()
        head_pose_output = head_pose_estimation.predict(first_face_box.copy())
        total_head_pose_estimation_inference_time = total_head_pose_estimation_inference_time + (
            time.time() - start_time)

        start_time = time.time()
        left_eye, right_eye, eye_coords = facial_landmarks_detection.predict(
            first_face_box.copy())
        total_facial_landmarks_detection_inference_time = total_facial_landmarks_detection_inference_time + (
            time.time() - start_time)

        start_time = time.time()
        move_to_coors_mouse = gaze_model.predict(left_eye, right_eye,
                                                 head_pose_output)
        total_gaze_model_inference_time = total_gaze_model_inference_time + (
            time.time() - start_time)

        if frame_count % activate_frame_count == 0 and (args.flag == "3"
                                                        or args.flag == "4"):
            mouse_controller.move(move_to_coors_mouse[0],
                                  move_to_coors_mouse[1])
            cv2.imshow('video', frame)
            key = cv2.waitKey(60)
        if key == 27:
            break

        if args.flag == "1":
            cv2.rectangle(frame, (first_face[0], first_face[1]),
                          (first_face[2], first_face[3]), (255, 0, 0))
            cv2.imshow('video', frame)
            key = cv2.waitKey(60)
        elif args.flag == "2":
            cv2.rectangle(facial_landmarks_detection.image,
                          (eye_coords[0], eye_coords[1]),
                          (eye_coords[2], eye_coords[3]), (255, 0, 0))
            cv2.imshow('video', facial_landmarks_detection.image)
            key = cv2.waitKey(60)
        elif args.flag == "3":
            if frame_count == 1:
                logging.info("Printing mouse coors: ")
            logging.info(move_to_coors_mouse)

    #Print Report
    if args.flag == "0":
        print('------------- BEGIN REPORT -------------')
        avg_inference_face_model = total_face_model_inference_time / frame_count
        avg_inference_headpose = total_head_pose_estimation_inference_time / frame_count
        avg_inference_facial_landmark = total_facial_landmarks_detection_inference_time / frame_count
        avg_inference_gaze_model = total_gaze_model_inference_time / frame_count

        print("Face Detection Model Load Time: ", args.face)
        print("Loading time: ", face_model_load_time)
        print("Inference time: ", avg_inference_face_model)

        print("Head Pose Detection Model: ", args.headpose)
        print("Loading time: ", head_pose_estimation_load_time)
        print("Inference time:", avg_inference_headpose)

        print("Facial Landmark Detection Model Load Time: ", args.landmarks)
        print("Loading time: ", facial_landmarks_detection_load_time)
        print("Inference time:", avg_inference_facial_landmark)

        print("Gaze Estimation Model Load Time: ", args.gazeestimation)
        print("Loading time: ", gaze_model_load_time)
        print("Inference time:", avg_inference_gaze_model)

        print('------------- END REPORT -------------')
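The report block above repeats the same three print statements per model; a small formatting helper (hypothetical, not in the original snippet) would make the pattern explicit:

def print_model_report(name, path, load_time, avg_infer_time):
    # Print one model's section of the report in a consistent format
    print("{} ({})".format(name, path))
    print("Loading time: ", load_time)
    print("Inference time: ", avg_infer_time)

# usage: print_model_report("Face Detection Model", args.face,
#                           face_model_load_time, avg_inference_face_model)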
Example #16
def pipeline(args):
    feed = InputFeeder(args.i)
    feed.load_data()

    FaceDetectionPipe = FaceDetection(args.m_fd, args.pt, args.d, args.cpu_ext)
    load_time = time.time()
    FaceDetectionPipe.load_model()
    load_time_fd = time.time() - load_time

    FacialLandmarksPipe = FacialLandmarks(args.m_ld, args.d, args.cpu_ext)
    load_time = time.time()
    FacialLandmarksPipe.load_model()
    load_time_ld = time.time() - load_time

    HeadPoseEstimationPipe = HeadPoseEstimation(args.m_hpe, args.d,
                                                args.cpu_ext)
    load_time = time.time()
    HeadPoseEstimationPipe.load_model()
    load_time_hpe = time.time() - load_time

    GazeEstimationPipe = GazeEstimation(args.m_ge, args.d, args.cpu_ext)
    load_time = time.time()
    GazeEstimationPipe.load_model()
    load_time_ge = time.time() - load_time

    log.info('Load time for face detection model: ' + str(load_time_fd))
    log.info('Load time for landmark detection model: ' + str(load_time_ld))
    log.info('Load time for head pose estimation model: ' + str(load_time_hpe))
    log.info('Load time for gaze estimation model: ' + str(load_time_ge))

    inf_time_fd = inf_time_ld = inf_time_hpe = inf_time_ge = frame_count = 0
    for frame in feed.next_batch():
        if frame is None:
            break

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

        frame_count += 1
        inf_time = time.time()
        fd_img_output, fd_coords = FaceDetectionPipe.predict(frame)
        inf_time_fd = time.time() - inf_time

        if not fd_coords:
            log.info('No face detected')
        else:
            inf_time = time.time()
            eye_l_image, eye_r_image, ld_coords = FacialLandmarksPipe.predict(
                fd_img_output)
            inf_time_ld = time.time() - inf_time

            inf_time = time.time()
            hpe_output = HeadPoseEstimationPipe.predict(fd_img_output)
            inf_time_hpe = time.time() - inf_time

            yaw, pitch, roll = hpe_output
            inf_time = time.time()
            ge_output = GazeEstimationPipe.predict(eye_l_image, eye_r_image,
                                                   [yaw, pitch, roll])
            inf_time_ge = time.time() - inf_time

            if frame_count % 5 == 0:
                pointer = MouseController('medium', 'fast')
                pointer.move(ge_output[0], ge_output[1])

            fps_fd = 1 / inf_time_fd
            fps_ld = 1 / inf_time_ld
            fps_hpe = 1 / inf_time_hpe
            fps_ge = 1 / inf_time_ge

            if (args.v):
                v = Visualizer(frame, fd_img_output, fd_coords, ld_coords,
                               hpe_output)
                v.visualize()

            log.info('Inference time for face detection model: ' +
                     str(inf_time_fd))
            log.info('Inference time for landmark detection model: ' +
                     str(inf_time_ld))
            log.info('Inference time for head pose estimation model: ' +
                     str(inf_time_hpe))
            log.info('Inference time for gaze estimation model: ' +
                     str(inf_time_ge))

            log.info('FPS for face detection model: ' + str(fps_fd))
            log.info('FPS for landmark detection model: ' + str(fps_ld))
            log.info('FPS for head pose estimation model: ' + str(fps_hpe))
            log.info('FPS for gaze estimation model: ' + str(fps_ge))

            log.info('Frames Count:' + str(frame_count))

            mm = ModelMetrics()
            log.info('Writing stats to file...')
            mm.save_to_file('stats_fd.txt', 'FD/' + model_precision,
                            inf_time_fd, fps_fd, load_time_fd)
            mm.save_to_file('stats_ld.txt', model_precision, inf_time_ld,
                            fps_ld, load_time_ld)
            mm.save_to_file('stats_hpe.txt', model_precision, inf_time_hpe,
                            fps_hpe, load_time_hpe)
            mm.save_to_file('stats_ge.txt', model_precision, inf_time_ge,
                            fps_ge, load_time_ge)
    feed.close()
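ModelMetrics and the global model_precision are referenced in the loop above but never defined in this snippet; model_precision is presumably derived from the model path or arguments. A minimal sketch of a compatible ModelMetrics, matching only the call signature used above (an assumption, not the author's implementation):

class ModelMetrics:
    def save_to_file(self, filename, label, inf_time, fps, load_time):
        # Append one row of stats; opening in append mode keeps prior rows
        with open(filename, 'a') as f:
            f.write('{}: inference={:.4f}s fps={:.1f} load={:.4f}s\n'.format(
                label, inf_time, fps, load_time))

Note that the loop constructs a fresh ModelMetrics and rewrites the stats files on every frame; hoisting the instance above the loop and writing once after it would be cheaper.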
def main():
    # Grab command line args
    logger = logging.getLogger()
    args = build_argparser().parse_args()
    flags = args.previewflags
    inputfile = args.input
    inputfeed = None
    if inputfile.lower() == 'cam':
        inputfeed = InputFeeder('cam')
    elif inputfile.endswith('.jpg') or inputfile.endswith('.bmp'):
        inputfeed = InputFeeder("image", inputfile)
    else:
        if not os.path.isfile(inputfile):
            logger.error("Specified input file %s doesn't exist", inputfile)
            exit(1)
        inputfeed = InputFeeder("video", inputfile)

    model_paths = {
        'GazeEstimation': args.gazeestimationnmodel,
        'FacialLandmarkDetection': args.faciallandmarkmodel,
        'HeadPoseEstimation': args.headposemodel,
        'FaceDetection': args.facedetectionmodel
    }

    for name, path in model_paths.items():
        if not os.path.isfile(path):
            logger.error("Unable to find specified " + name + " xml file")
            exit(1)

    flm = FacialLandmarkDetection(model_paths['FacialLandmarkDetection'],
                                  args.device, args.cpu_extension)
    gze = GazeEstimation(model_paths['GazeEstimation'], args.device,
                         args.cpu_extension)
    hpe = HeadPoseEstimation(model_paths['HeadPoseEstimation'], args.device,
                             args.cpu_extension)
    fd = FaceDetection(model_paths['FaceDetection'], args.device,
                       args.cpu_extension)

    flm.load_model()
    fd.load_model()
    gze.load_model()
    hpe.load_model()
    mc = MouseController('medium', 'fast')
    inputfeed.load_data()

    frame_count = 0
    for ret, frame in inputfeed.next_batch():
        if not ret:
            break
        frame_count += 1

        if frame_count % 3 == 0:
            cv2.imshow('video', cv2.resize(frame, (300, 300)))
            cv2.waitKey(1)

        facecoords, cropped_image = fd.predict(frame.copy(), args.prob_threshold)
        if type(cropped_image) == int:
            logger.error('Unable to detect face')
            continue

        head_out = hpe.predict(cropped_image)
        left_eye, right_eye, eye = flm.predict(cropped_image)
        mouse_coords, gaze_vector = gze.predict(left_eye, right_eye, head_out)

        if len(flags) != 0:
            preview_frame = frame.copy()
            if 'fd' in flags:
                preview_frame = cropped_image
            if 'fld' in flags:
                cv2.rectangle(cropped_image, (eye[0][0] - 15, eye[0][1] - 15),
                              (eye[0][2] + 15, eye[0][3] + 15), (0, 0, 255))
                cv2.rectangle(cropped_image, (eye[1][0] - 15, eye[1][1] - 15),
                              (eye[1][2] + 15, eye[1][3] + 15), (0, 0, 255))
            if 'hp' in flags:
                cv2.putText(
                    preview_frame,
                    "Pose Angles: roll:{:.2f} | pitch:{:.2f} | yaw:{:.2f}".format(
                        head_out[2], head_out[1], head_out[0]), (10, 20),
                    cv2.FONT_HERSHEY_COMPLEX, 0.25, (255, 0, 0), 1)
            if 'ge' in flags:
                x, y, w = int(gaze_vector[1] * 12), int(gaze_vector[0] * 12), 130
                left = cv2.line(left_eye.copy(), (x - w, y - w), (x + w, y + w),
                                (255, 0, 0), 2)
                cv2.line(left, (x - w, y + w), (x + w, y - w), (255, 0, 0), 2)
                right = cv2.line(right_eye.copy(), (x - w, y - w), (x + w, y + w),
                                 (255, 0, 0), 2)
                cv2.line(right, (x - w, y + w), (x + w, y - w), (255, 0, 0), 2)
                cropped_image[eye[0][1]:eye[0][3], eye[0][0]:eye[0][2]] = left
                cropped_image[eye[1][1]:eye[1][3], eye[1][0]:eye[1][2]] = right
            cv2.imshow("visualisation_frame", cv2.resize(preview_frame, (300, 300)))

        if frame_count % 3 == 0:
            mc.move(mouse_coords[0], mouse_coords[1])
    logger.error("Video stream ended")
    cv2.destroyAllWindows()
    inputfeed.close()
Example #18
def infer_on_stream(args):
    models = None
    # Select the model files that match the requested precision (FP32, FP16 or INT8)
    if any(p in args.precision for p in ("FP32", "FP16", "INT8")):
        models = select_precision(args.precision)

    # Get Input
    input_feeder = InputFeeder(args.input_type, args.input_file)
    input_feeder.load_data()

    # Load face detection model
    face = FaceDetection(model_name=models[0],
                         device=args.device,
                         extensions=args.cpu_extension)
    face.load_model()

    # Load head pose model
    head = HeadPoseEstimation(model_name=models[1],
                              device=args.device,
                              extensions=args.cpu_extension)
    head.load_model()

    # Load facial landmark model
    landmark = FacialLandmarkDetection(model_name=models[2],
                                       device=args.device,
                                       extensions=args.cpu_extension)
    landmark.load_model()

    # Load gaze estimation model
    gaze = GazeEstimation(model_name=models[3],
                          device=args.device,
                          extensions=args.cpu_extension)
    gaze.load_model()

    # Initalize mouse controller
    mouse = MouseController('high', 'fast')

    for frame in input_feeder.next_batch():
        # Stop when the feeder runs out of frames
        if frame is None:
            break

        # Estimate face region
        output_frame, cropped_face, box_coord = face.predict(frame)

        # Estimate head pose position
        head_pose = head.predict(cropped_face)
        head_pose = np.array(head_pose)

        # Estimate eyes landmark coordinates
        lr_eyes = landmark.predict(cropped_face)

        eyes = []

        # Calculate eye image region
        for coord in lr_eyes:
            x = int(coord[0] + box_coord[0])
            y = int(coord[1] + box_coord[1])
            cv2.circle(output_frame, (x, y), 5, (255, 0, 0), -1)

            eye_box, cropped_eye = eyes_crop(output_frame, x, y, 40)
            cv2.rectangle(output_frame, eye_box[0], eye_box[1], (255, 0, 0), 1)
            eyes.append(cropped_eye)

        # Estimate gaze direction
        gaze_coords = gaze.predict(eyes[0], eyes[1], head_pose)

        # Move the mouse cursor
        mouse.move(gaze_coords[0], gaze_coords[1])

        if "True" in args.visualize:
            cv2.imshow('Capture', output_frame)

            if cv2.waitKey(30) & 0xFF == ord('q'):
                break

    input_feeder.close()
    if "True" in args.visualize:
        cv2.destroyAllWindows()
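The eyes_crop helper used above is not defined in this snippet. A plausible sketch, assuming it crops a square region of half-width size around the eye landmark (x, y), clamped to the frame bounds:

def eyes_crop(frame, x, y, size):
    # Returns the rectangle corners (for drawing) and the cropped eye image
    h, w = frame.shape[:2]
    xmin, xmax = max(0, x - size), min(w, x + size)
    ymin, ymax = max(0, y - size), min(h, y + size)
    eye_box = ((xmin, ymin), (xmax, ymax))
    cropped_eye = frame[ymin:ymax, xmin:xmax]
    return eye_box, cropped_eye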
Example #19
def main():
    '''
    Main function for the eye-gaze based mouse controller program using multiple OpenVINO models.
    '''
    try:
        log.basicConfig(level=log.ERROR)
        #Argument parser
        args = build_argparser().parse_args()
        
        # Checking input type: image, video or cam(0)
        image_file_extension_list = ['jpg', 'jpeg', 'png', 'bmp', 'tiff', 'gif', 'webp']
        if args.input != '0':
            input_file_extension = args.input.split('.')[-1].lower()
            if input_file_extension in image_file_extension_list:
                input_type = 'image'
            else:
                input_type = 'video'
            input_file = args.input
        else:
            input_type = 'cam'
            input_file = None
        
        # Set show-frame and annotate flags based on arguments
        args.show_frame = bool(args.show_frame)
        args.annot_frame = bool(args.annot_frame) if args.show_frame else False
        
        feed = InputFeeder(input_type, input_file)
        feed.load_data()
        #fps = feed.get_fps()

        # Set performance stats level; based on it, stats are printed to the
        # console and written to a stat file (if one is provided)
        perf_stat_lvl = args.perf_stat_lvl
        if perf_stat_lvl > 0 and (args.perf_stat_file is not None):
            perf_stat_file = open(args.perf_stat_file, 'w')
            perf_stat_file.writelines(["##############################OpenVino Model Performance Stats##############################"])
        else:
            perf_stat_file = None
          
        
        # Initialization of performance counters for all models
        total_model_load_time = 0

        all_model_infer_time = 0
        all_model_infer_time_min = float('inf')
        all_model_infer_time_max = 0
        all_model_infer_time_avg = 0
        all_model_infer_time_total = 0

        face_detect_infer_time = 0
        face_detect_infer_time_min = float('inf')
        face_detect_infer_time_max = 0
        face_detect_infer_time_avg = 0
        face_detect_infer_time_total = 0

        face_landmarks_infer_time = 0
        face_landmarks_infer_time_min = float('inf')
        face_landmarks_infer_time_max = 0
        face_landmarks_infer_time_avg = 0
        face_landmarks_infer_time_total = 0

        head_estimation_infer_time = 0
        head_estimation_infer_time_min = float('inf')
        head_estimation_infer_time_max = 0
        head_estimation_infer_time_avg = 0
        head_estimation_infer_time_total = 0

        gaze_estimation_infer_time = 0
        gaze_estimation_infer_time_min = float('inf')
        gaze_estimation_infer_time_max = 0
        gaze_estimation_infer_time_avg = 0
        gaze_estimation_infer_time_total = 0
        
        #Instantiate Face Detection Class & Load corresponding model
        face_detect=FaceDetection(args.face_detect_model,args.device,args.cpu_extension)
        start_time=timeit.default_timer()
        face_detect.load_model()  
        end_time = timeit.default_timer()
        model_load_time = end_time-start_time # Record Model Load time
        total_model_load_time = total_model_load_time + model_load_time
        log_perf_stat("Face Detection Model Loading Time: {0:.1f}ms".format(model_load_time*1000),perf_stat_lvl,perf_stat_file)
        
        #Instantiate Face Landmarks Detection Class & Load corresponding model
        face_lm_detect = FaceLandmarksDetection(args.face_landmarks_model,args.device,args.cpu_extension)
        start_time=timeit.default_timer()
        face_lm_detect.load_model()  
        end_time = timeit.default_timer()
        model_load_time = end_time-start_time
        total_model_load_time = total_model_load_time + model_load_time
        log_perf_stat("Face Landmarks Detection Model Loading Time: {0:.1f}ms".format(model_load_time*1000),perf_stat_lvl,perf_stat_file)
        
        #Instantiate Head Pose Estimate Class & Load corresponding model
        head_pose_estimate = HeadPoseEstimation(args.head_pose_model,args.device,args.cpu_extension)
        start_time=timeit.default_timer()
        head_pose_estimate.load_model()  
        end_time = timeit.default_timer()
        model_load_time = end_time-start_time
        total_model_load_time = total_model_load_time + model_load_time
        log_perf_stat("Head Estimation Model Loading Time: {0:.1f}ms".format(model_load_time*1000),perf_stat_lvl,perf_stat_file)
        
        #Instantiate Gaze Estimate Class & Load corresponding model
        gaze_estimate = GazeEstimation(args.gaze_estimation_model,args.device,args.cpu_extension)
        start_time=timeit.default_timer()
        gaze_estimate.load_model()  
        end_time = timeit.default_timer()
        model_load_time = end_time-start_time
        total_model_load_time = total_model_load_time + model_load_time
        log_perf_stat("Gaze Estimation Model Loading Time: {0:.1f}ms".format(model_load_time*1000),perf_stat_lvl,perf_stat_file)
        
        #Instantiate Mouse Controller Class and reset mouse pointer to center of screen
        mouse_control = MouseController(args.mouse_prec,args.mouse_speed)
        mouse_control.move_mouse_to_center()
        
        #If Show frame flag is set open frame window
        if(args.show_frame):
            cv2.namedWindow('Output Image',cv2.WINDOW_NORMAL)
            cv2.resizeWindow('Output Image', 600,450)
            cv2.moveWindow('Output Image', 600,300)
        
        frame_no=0 
        frame_no_with_face =0
        try:
            for image in feed.next_batch(): # Read frame one by one
                if (image is None):
                    break
                if(input_type =='cam'):
                    image =cv2.flip(image,1) # In case of cam input, flip image
                frame_no+=1
                image =cv2.resize(image,(1920,1080))
                
                # Run face detection inference pipeline (pre-process input, predict & pre-process output)
                face_detect_infer_time,face_detected ,bb_coord,annotated_image = run_infer_pipeline_face_detection(face_detect,image,args.prob_threshold,args.annot_frame)
                
                # Calculate face detection statistical(min,max ,avg) inference time across frames               
                face_detect_infer_time_min,face_detect_infer_time_max,face_detect_infer_time_total =calculate_historical_infer_stats(face_detect_infer_time,face_detect_infer_time_min,face_detect_infer_time_max,face_detect_infer_time_total )
                
                if(perf_stat_lvl>1):#Log frame by frame stats if perf stat level is more than 1
                    log_perf_stat("Face Detect Model, Frame No. {} Infer time : {:.2f}ms".format(frame_no,face_detect_infer_time*1000),perf_stat_lvl,perf_stat_file)
                
                if(face_detected):#if face detected run next models inference in pipeline
                    frame_no_with_face+=1
                    
                    # Run face landmark detection inference pipeline (pre-process input, predict & pre-process output)
                    face_landmarks_infer_time,left_eye_image,right_eye_image,annotated_image=run_infer_pipeline_face_landmark_detection(face_lm_detect,image,bb_coord,annotated_image,args.annot_frame)
                    
                    # Calculate face landmark detection statistical(min,max ,avg) inference time across frames 
                    face_landmarks_infer_time_min,face_landmarks_infer_time_max,face_landmarks_infer_time_total =calculate_historical_infer_stats(face_landmarks_infer_time,face_landmarks_infer_time_min,face_landmarks_infer_time_max,face_landmarks_infer_time_total )
                    if(perf_stat_lvl>1):#Log frame by frame stats if perf stat level is more than 1
                        log_perf_stat("Face LandMarks Detection Model, Frame No. {} Infer time : {:.6f}ms".format(frame_no_with_face,face_landmarks_infer_time*1000),perf_stat_lvl,perf_stat_file)
                        
                    
                    # Run head pose estimation inference pipeline (pre-process input, predict & pre-process output)
                    head_estimation_infer_time,head_angles,annotated_image=run_infer_pipeline_head_estimation(head_pose_estimate,image,bb_coord,annotated_image,args.annot_frame)
                    
                    # Calculate Head Estimate statistical(min,max ,avg) inference time across frames 
                    head_estimation_infer_time_min,head_estimation_infer_time_max,head_estimation_infer_time_total =calculate_historical_infer_stats(head_estimation_infer_time,head_estimation_infer_time_min,head_estimation_infer_time_max,head_estimation_infer_time_total )
                    if(perf_stat_lvl>1):#Log frame by frame stats if perf stat level is more than 1
                        log_perf_stat("Head Angles Estimation Model, Frame No. {} Infer time : {:.2f}ms".format(frame_no_with_face,head_estimation_infer_time*1000),perf_stat_lvl,perf_stat_file)
                    
                    # Run gaze estimation inference pipeline (pre-process input, predict & pre-process output)
                    gaze_estimation_infer_time,annotated_image,gaze_output =run_infer_pipeline_gaze_estimation(gaze_estimate,image,left_eye_image,right_eye_image,head_angles,annotated_image,args.annot_frame)
                    
                    # Calculate Gaze Estimate statistical(min,max ,avg) inference time across frames
                    gaze_estimation_infer_time_min,gaze_estimation_infer_time_max,gaze_estimation_infer_time_total =calculate_historical_infer_stats(gaze_estimation_infer_time,gaze_estimation_infer_time_min,gaze_estimation_infer_time_max,gaze_estimation_infer_time_total )
                    if(perf_stat_lvl>1):
                        log_perf_stat("Gaze Estimation Model, Frame No. {} Infer time : {:.2f}ms".format(frame_no_with_face,gaze_estimation_infer_time*1000),perf_stat_lvl,perf_stat_file)
                    
                    # Calculate All 4 models total statistical(min,max ,avg) inference time across frames
                    all_model_infer_time = face_detect_infer_time + face_landmarks_infer_time +head_estimation_infer_time +  gaze_estimation_infer_time
                    
                    all_model_infer_time_min,all_model_infer_time_max,all_model_infer_time_total =calculate_historical_infer_stats(all_model_infer_time,all_model_infer_time_min,all_model_infer_time_max,all_model_infer_time_total )
                    if(perf_stat_lvl>0):
                        log_perf_stat("All 4 Models Infer Time for Frame No. {} : {:.2f}ms".format(frame_no_with_face,all_model_infer_time*1000),perf_stat_lvl,perf_stat_file)
                    
                   
                    #Set mouse x,y relative movement position base on gaze estimation model output
                    move_x = gaze_output[0][0]
                    move_y = gaze_output[0][1]
                    mouse_control.move(move_x,move_y)
                    
                    annotation_text = "Total All Models Inference time : {:.2f}ms".format(all_model_infer_time * 1000)
                else:
                    print("No Face Detected")
                    annotation_text = "No Face Detected"

                # If annotation and show-frame flags are set
                if args.annot_frame:
                    annotated_image = cv2.putText(annotated_image, annotation_text, (20, 90),
                                                  cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 4, cv2.LINE_AA)
                if args.show_frame:
                    cv2.imshow('Output Image', annotated_image)
                           
                
                if input_type == 'image':
                    cv2.waitKey(0)
                    break
                else:
                    if cv2.waitKey(30) > 0:
                        break

            # Finally, log a summary of performance stats for each model and the total across all frames
            if(perf_stat_lvl>0):
                
                log_perf_stat("#######Performance Summary Stats#######",perf_stat_lvl,perf_stat_file)
                face_detect_infer_time_avg =face_detect_infer_time_total/frame_no
                log_perf_stat("Face Detect Model Inference Time Summary : Min {:.2f}ms, Avg {:.2f}ms, Max {:.2f}ms".format(face_detect_infer_time_min*1000,face_detect_infer_time_avg*1000,face_detect_infer_time_max*1000),perf_stat_lvl,perf_stat_file)
                
                if(frame_no_with_face!=0):
                    face_landmarks_infer_time_avg =face_landmarks_infer_time_total/frame_no_with_face
                    log_perf_stat("Face Landmarks Model Inference Time Summary: Min {:.5f}ms, Avg {:.2f}ms, Max {:.2f}ms".format(face_landmarks_infer_time_min*1000,face_landmarks_infer_time_avg*1000,face_landmarks_infer_time_max*1000),perf_stat_lvl,perf_stat_file)
                    
                    head_estimation_infer_time_avg =head_estimation_infer_time_total/frame_no_with_face
                    log_perf_stat("Head Estimation Model Inference Time Summary: Min {:.2f}ms, Avg {:.2f}ms, Max {:.2f}ms".format(head_estimation_infer_time_min*1000,head_estimation_infer_time_avg*1000,head_estimation_infer_time_max*1000),perf_stat_lvl,perf_stat_file)
                    
                    gaze_estimation_infer_time_avg =gaze_estimation_infer_time_total/frame_no_with_face
                    log_perf_stat("Gaze Estimation Model Inference Time Summary: Min {:.2f}ms, Avg {:.2f}ms, Max {:.2f}ms".format(gaze_estimation_infer_time_min*1000,gaze_estimation_infer_time_avg*1000,gaze_estimation_infer_time_max*1000),perf_stat_lvl,perf_stat_file)
                    
                    all_model_infer_time_avg =all_model_infer_time_total/frame_no_with_face
                    log_perf_stat("All 4 Models Total Inference Time Summary: Min {:.2f}ms, Avg {:.2f}ms, Max {:.2f}ms".format(all_model_infer_time_min*1000,all_model_infer_time_avg*1000,all_model_infer_time_max*1000),perf_stat_lvl,perf_stat_file)
               
                
            feed.close()
            if perf_stat_file is not None:
                perf_stat_file.close()
            cv2.destroyAllWindows()
        except KeyboardInterrupt:
            if perf_stat_file is not None and not perf_stat_file.closed:
                perf_stat_file.close()
            if feed.is_cap_open:
                feed.close()
            cv2.destroyAllWindows()
            print("Keyboard Interrupt, Exiting!!!")
            sys.exit()
    except Exception as e:
        log.error("Unexpected error happened, see below for more details")
        log.error("Exception type: {}".format(str(e)))
        log.error("### Traceback for debugging ###")
        log.error(traceback.format_exc())
        log.error("Program will exit!!!")
        sys.exit(0)
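Example #19 leans on two helpers that are not shown, calculate_historical_infer_stats and log_perf_stat. Minimal sketches consistent with how they are called above (assumptions, not the author's code):

def calculate_historical_infer_stats(infer_time, infer_min, infer_max, infer_total):
    # Fold one new inference time into the running min / max / total
    return (min(infer_min, infer_time),
            max(infer_max, infer_time),
            infer_total + infer_time)

def log_perf_stat(message, perf_stat_lvl, perf_stat_file):
    # Print to console and, when a stat file is open, append the line there too
    if perf_stat_lvl > 0:
        print(message)
        if perf_stat_file is not None:
            perf_stat_file.writelines(["\n" + message])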
Example #20
def main():
    #Building the arguments
    args = build_parser().parse_args()
    previewFlag = args.previewFlags

    log = logging.getLogger()
    input_path = args.input
    inputFeed = None

    if input_path.lower() == 'cam':
        inputFeed = InputFeeder('cam')
    else:
        if not os.path.isfile(input_path):
            log.error("Unable to find the input file specified.")
            exit(1)
        inputFeed = InputFeeder('video', input_path)

    #Creating Model paths
    model_path = {
        'FaceDetectionModel': args.facedetectionmodel,
        'FacialLandmarksDetectionModel': args.faciallandmarkmodel,
        'GazeEstimationModel': args.gazeestimationmodel,
        'HeadPoseEstimationModel': args.headposemodel
    }

    for fnameKey in model_path.keys():
        if not os.path.isfile(model_path[fnameKey]):
            log.error('Unable to find the specified ' + fnameKey +
                      ' binary file (.xml)')
            exit(1)

    #Creating Model Instances
    fd = FaceDetection(model_path['FaceDetectionModel'], args.device,
                       args.cpu_extension)
    flm = FacialLandmarkDetection(model_path['FacialLandmarksDetectionModel'],
                                  args.device, args.cpu_extension)
    gm = GazeEstimation(model_path['GazeEstimationModel'], args.device,
                        args.cpu_extension)
    hpe = Head_Pose_estimation(model_path['HeadPoseEstimationModel'],
                               args.device, args.cpu_extension)

    m_control = MouseController('medium', 'fast')

    #Loading data
    inputFeed.load_data()
    fd.load_model()
    flm.load_model()
    hpe.load_model()
    gm.load_model()

    frame_count = 0
    for ret, frame in inputFeed.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 10 == 0:
            cv2.imshow('Original Video', cv2.resize(frame, (500, 500)))

        key = cv2.waitKey(60)
        coords, img = fd.predict(frame, args.prob_threshold)
        if type(img) == int:
            log.error("No face detected")
            if key == 27:
                break
            continue

        hpout = hpe.predict(img)
        left_eye, right_eye, eye_coord = flm.predict(img)
        mouse_coord, gaze_vec = gm.predict(left_eye, right_eye, hpout)

        if len(previewFlag) != 0:
            preview_img = img
            if 'fld' in previewFlag:
                start_l = (eye_coord[0][0] - 10, eye_coord[0][1] - 10)
                end_l = (eye_coord[0][2] + 10, eye_coord[0][3] + 10)
                start_r = (eye_coord[1][0] - 10, eye_coord[1][1] - 10)
                end_r = (eye_coord[1][2] + 10, eye_coord[1][3] + 10)
                cv2.rectangle(img, start_l, end_l, (0, 255, 0), 2)
                cv2.rectangle(img, start_r, end_r, (0, 255, 0), 2)
            if 'hp' in previewFlag:
                cv2.putText(
                    preview_img,
                    "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".
                    format(hpout[0], hpout[1], hpout[2]), (10, 20),
                    cv2.FONT_HERSHEY_COMPLEX, 0.25, (255, 255, 255), 1)
            if 'ge' in previewFlag:
                x, y, w = int(gaze_vec[0] * 12), int(gaze_vec[1] * 12), 160
                lefteye = cv2.line(left_eye, (x - w, y - w), (x + w, y + w),
                                   (100, 0, 255), 1)
                cv2.line(lefteye, (x - w, y + w), (x + w, y - w),
                         (100, 0, 255), 1)
                righteye = cv2.line(right_eye, (x - w, y - w), (x + w, y + w),
                                    (100, 0, 255), 1)
                cv2.line(righteye, (x - w, y + w), (x + w, y - w),
                         (100, 0, 255), 1)
                img[eye_coord[0][1]:eye_coord[0][3],
                    eye_coord[0][0]:eye_coord[0][2]] = lefteye
                img[eye_coord[1][1]:eye_coord[1][3],
                    eye_coord[1][0]:eye_coord[1][2]] = righteye

            cv2.imshow("Detections", cv2.resize(preview_img, (500, 500)))
        if frame_count % 10 == 0:
            m_control.move(mouse_coord[0], mouse_coord[1])
        if key == 27:
            break
    log.error("Videostream Completed")
    cv2.destroyAllWindows()
    inputFeed.close()
def main():
    # Get command line arguments
    args = parser.parse_args()
    device = args.device
    cpu_extensions = args.extensions
    threshold = args.threshold
    gaze_estimation_precision = args.gaze_estimation_precision
    head_pose_precision = args.head_pose_precision
    face_detection_precision = args.face_detection_precision
    landmarks_precision = args.landmarks_precision
    input_feeder = InputFeeder(args)
    control_mouse = MouseController(args)
    gaze_model = 'models/intel/gaze-estimation-adas-0002/{}/gaze-estimation-adas-0002'.format(
        gaze_estimation_precision)
    face_detector_model = 'models/intel/face-detection-adas-binary-0001/{}/face-detection-adas-binary-0001'.format(
        face_detection_precision)
    facial_landmark_model = 'models/intel/landmarks-regression-retail-0009/{}/landmarks-regression-retail-0009'.format(
        landmarks_precision)
    head_pose_model = 'models/intel/head-pose-estimation-adas-0001/{}/head-pose-estimation-adas-0001'.format(
        head_pose_precision)

    # Initialize the models
    face_detector = FaceDetector(face_detector_model, args)
    facial_landmarks = FacialLandmarksDetector(
        model_name=facial_landmark_model,
        device=device,
        extensions=cpu_extensions)
    head_pose_estimation = HeadPoseEstimation(model_name=head_pose_model,
                                              device=device,
                                              extensions=cpu_extensions)
    gaze_estimation = GazeEstimation(model_name=gaze_model,
                                     device=device,
                                     extensions=cpu_extensions)

    # Load the models
    start_time = time.time()
    face_detector.load_model()
    face_detector_loadtime = time.time() - start_time
    start_time = time.time()
    facial_landmarks.load_model()
    facial_landmark_loadtime = time.time() - start_time
    start_time = time.time()
    head_pose_estimation.load_model()
    head_pose_estimation_loadtime = time.time() - start_time
    start_time = time.time()
    gaze_estimation.load_model()
    gaze_estimation_loadtime = time.time() - start_time
    logging.info('FINISH LOADING MODELS')

    try:
        width, height = input_feeder.load_data()
    except TypeError:
        logging.error('Invalid file type.')
        return

    output_handler = OutputHandler(args)
    output_handler.initalize_video_writer(width, height)
    frame_count = 0
    start_time = 0
    capture = input_feeder.cap
    inputs = args.input
    if input_feeder.input_type == 'cam':
        inputs = 0
    else:
        capture.open(inputs)
    while capture.isOpened():
        flag, frame = capture.read()
        if not flag:
            break

        if start_time == 0:
            start_time = time.time()

        if inputs == 0 and time.time() - start_time >= 1:
            gaze_estimate = run_inference(frame, face_detector,
                                          facial_landmarks,
                                          head_pose_estimation,
                                          gaze_estimation, output_handler)
            if gaze_estimate is None:
                break

            if gaze_estimate[0][0]:
                x, y = gaze_estimate[0][:2]
                control_mouse.move(x, y)
            start_time = 0
            frame_count += 1
        elif not inputs == 0:
            gaze_estimate = run_inference(frame, face_detector,
                                          facial_landmarks,
                                          head_pose_estimation,
                                          gaze_estimation, output_handler)
            if gaze_estimate is None:
                break

            if gaze_estimate[0][0] and time.time() - start_time >= 0.5:
                x, y = gaze_estimate[0][:2]
                control_mouse.move(x, y)
                start_time = 0
            frame_count += 1

    input_feeder.close()
    logging.info('TOTAL FRAMES PROCESSED: {}'.format(frame_count))
    logging.info('Time to load face detector model is {:.5f}'.format(
        face_detector_loadtime))
    logging.info('Time to load head pose estimation model is {:.5f}'.format(
        head_pose_estimation_loadtime))
    logging.info('Time to load facial landmarks model is {:.5f}'.format(
        facial_landmark_loadtime))
    logging.info('Time to load gaze estimation model is {:.5f}'.format(
        gaze_estimation_loadtime))
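The run_inference helper called in the loop above is not part of this snippet. A rough sketch of the chaining it presumably performs, with predict signatures assumed from the other examples in this document and an assumed OutputHandler.write method:

def run_inference(frame, face_detector, facial_landmarks, head_pose_estimation,
                  gaze_estimation, output_handler):
    # Returns the gaze estimate, or None when the stream has ended
    if frame is None:
        return None
    face = face_detector.predict(frame)
    left_eye, right_eye = facial_landmarks.predict(face)
    head_pose = head_pose_estimation.predict(face)
    gaze_estimate = gaze_estimation.predict(left_eye, right_eye, head_pose)
    output_handler.write(frame)  # assumed API; writes the annotated frame
    return gaze_estimate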
Example #22
def main():

    args = argparser().parse_args()
    device = args.device
    input_feed = args.input

    log = logging.getLogger()

    model_paths = {
        'facedet': args.face_detection_model + '.xml',
        'faceldmdet': args.landmark_detection_model + '.xml',
        'headpose': args.pose_estimation_model + '.xml',
        'gaze': args.gaze_estimation_model + '.xml'
    }

    for mp in model_paths.keys():
        if not os.path.isfile(model_paths[mp]):
            print(model_paths[mp])
            print('Recheck file path and try again')
            log.error("Not a file")
            raise FileNotFoundError

    if input_feed == 'cam':

        feed = InputFeeder(input_type='cam')

    elif not os.path.isfile(input_feed):

        print('Recheck file path and try again')
        log.error("Unable to find specified video file")
        raise FileNotFoundError

    else:
        feed = InputFeeder(input_type='video', input_file=input_feed)

    facedet = FaceDetection(args.face_detection_model, args.device,
                            args.extensions, args.async_mode)
    faceldmdet = FacialLandmarksDetection(args.landmark_detection_model,
                                          args.device, args.extensions,
                                          args.async_mode)
    headpose = HeadPose(args.pose_estimation_model, args.device,
                        args.extensions, args.async_mode)
    gaze = GazeEstimation(args.gaze_estimation_model, args.device,
                          args.extensions, args.async_mode)

    try:
        log.info('Loading models...')
        facedet.load_model()
        faceldmdet.load_model()
        headpose.load_model()
        gaze.load_model()
        feed.load_data()
        log.info('Models loaded successfully!')
    except Exception:
        log.error('One or more of the models failed to load..')
        exit(1)

    log.info('Initializing mouse controller')
    mouse = MouseController(precision='medium', speed='fast')

    for batch in feed.next_batch():
        face = facedet.predict(batch)
        eyes, eye_coords = faceldmdet.predict(face)
        pose = headpose.predict(face)

        point = gaze.predict(pose, eyes)
        #print('Gaze values = ', point[0], point[1])

        log.info('All inference complete')

        #print('view_inter = ', args.view_intermediate)
        if args.input == 'cam':
            point[0] = -point[0]

        mouse.move(point[0], point[1])
        if args.view_intermediate:
            visualize(pose, face, eye_coords, point)
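The visualize helper used above is not shown. A hypothetical sketch, assuming pose is a (yaw, pitch, roll) triple and eye_coords is a list of (xmin, ymin, xmax, ymax) boxes on the face crop:

import cv2

def visualize(pose, face, eye_coords, point):
    # Draw eye boxes plus pose and gaze readouts on the face crop for debugging
    for (xmin, ymin, xmax, ymax) in eye_coords:
        cv2.rectangle(face, (xmin, ymin), (xmax, ymax), (0, 255, 0), 1)
    cv2.putText(face, "pose: {:.1f} {:.1f} {:.1f}".format(*pose), (10, 20),
                cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 0, 0), 1)
    cv2.putText(face, "gaze: {:.2f} {:.2f}".format(point[0], point[1]), (10, 40),
                cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 0, 0), 1)
    cv2.imshow('Intermediate', face)
    cv2.waitKey(1)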
Example #23
def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.
    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """
    try:
        logging.basicConfig(level=logging.INFO,
                            format="%(asctime)s [%(levelname)s] %(message)s",
                            handlers=[
                                logging.FileHandler("gaze-app.log"),
                                logging.StreamHandler()
                            ])

        # Initialise the class
        mc = MouseController("low", "fast")
        #mc.move(100,100)
        fdnet = FaceDetection(args.fdmodel)
        lmnet = FacialLandmarks(args.lmmodel)
        hpnet = HeadPoseEstimation(args.hpmodel)
        genet = GazeEstimation(args.gemodel)

        ### Load the model through ###
        logging.info("============== Models Load time ===============")
        start_time = time.time()
        fdnet.load_model()
        logging.info("Face Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        fdnet.check_model()
        logging.info("Face Detection estimation layers loaded correctly")

        start_time = time.time()
        lmnet.load_model()
        logging.info("Facial Landmarks Detection Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        lmnet.check_model()
        logging.info("Facial Landmarks estimation layers loaded correctly")

        start_time = time.time()
        hpnet.load_model()
        logging.info("Headpose Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))
        hpnet.check_model()
        logging.info("Head pose estimation layers loaded correctly")

        start_time = time.time()
        genet.load_model()
        logging.info("Gaze Estimation Model: {:.1f}ms".format(
            1000 * (time.time() - start_time)))

        genet.check_model()
        logging.info("Gaze estimation layers loaded correctly")
        logging.info("==============  End =====================")
        # Get and open video capture
        feeder = InputFeeder('video', args.input)
        feeder.load_data()
        # FPS = feeder.get_fps()

        # Grab the shape of the input
        # width = feeder.get_width()
        # height = feeder.get_height()

        # init scene variables
        frame_count = 0

        ### Loop until stream is over ###
        fd_infertime = 0
        lm_infertime = 0
        hp_infertime = 0
        ge_infertime = 0
        while True:
            # Read the next frame
            try:
                frame = next(feeder.next_batch())
            except StopIteration:
                break

            key_pressed = cv2.waitKey(60)
            frame_count += 1
            #print(int((frame_count) % int(FPS)))

            # face detection
            fd_process_time = time.time()
            p_frame = fdnet.preprocess_input(frame)
            start_time = time.time()
            fnoutput = fdnet.predict(p_frame)
            fd_infertime += time.time() - start_time
            out_frame, fboxes = fdnet.preprocess_output(
                fnoutput, frame, args.print)
            logging.info(
                "Face Detection Model processing time : {:.1f}ms".format(
                    1000 * (time.time() - fd_process_time)))

            #for each face
            for fbox in fboxes:

                # fbox = (xmin,ymin,xmax,ymax)
                # get face landmarks
                # crop face from frame
                face = frame[fbox[1]:fbox[3], fbox[0]:fbox[2]]
                lm_process_time = time.time()
                p_frame = lmnet.preprocess_input(face)
                start_time = time.time()
                lmoutput = lmnet.predict(p_frame)
                lm_infertime += time.time() - start_time
                out_frame, left_eye_point, right_eye_point = lmnet.preprocess_output(
                    lmoutput, fbox, out_frame, args.print)
                logging.info(
                    "Landmarks model processing time : {:.1f}ms".format(
                        1000 * (time.time() - lm_process_time)))

                # get head pose estimation
                hp_process_time = time.time()
                p_frame = hpnet.preprocess_input(face)
                start_time = time.time()
                hpoutput = hpnet.predict(p_frame)
                hp_infertime += time.time() - start_time
                out_frame, headpose_angels = hpnet.preprocess_output(
                    hpoutput, out_frame, face, fbox, args.print)
                logging.info(
                    "Headpose estimation model processing time : {:.1f}ms".
                    format(1000 * (time.time() - hp_process_time)))

                # get gaze  estimation
                gaze_process_time = time.time()
                out_frame, left_eye, right_eye = genet.preprocess_input(
                    out_frame, face, left_eye_point, right_eye_point,
                    args.print)
                start_time = time.time()
                geoutput = genet.predict(left_eye, right_eye, headpose_angels)
                ge_infertime += time.time() - start_time
                out_frame, gazevector = genet.preprocess_output(
                    geoutput, out_frame, fbox, left_eye_point, right_eye_point,
                    args.print)
                logging.info(
                    "Gaze estimation model processing time : {:.1f}ms".format(
                        1000 * (time.time() - gaze_process_time)))

                if (not args.no_video):
                    cv2.imshow('im', out_frame)

                if (not args.no_move):
                    mc.move(gazevector[0], gazevector[1])

                #consider only first detected face in the frame
                break

            # Break if escape key pressed
            if key_pressed == 27:
                break

        #logging inference times
        if (frame_count > 0):
            logging.info(
                "============== Models Inference time ===============")
            logging.info("Face Detection:{:.1f}ms".format(1000 * fd_infertime /
                                                          frame_count))
            logging.info("Facial Landmarks Detection:{:.1f}ms".format(
                1000 * lm_infertime / frame_count))
            logging.info("Headpose Estimation:{:.1f}ms".format(
                1000 * hp_infertime / frame_count))
            logging.info("Gaze Estimation:{:.1f}ms".format(
                1000 * ge_infertime / frame_count))
            logging.info("============== End ===============================")

        # Release the capture and destroy any OpenCV windows
        feeder.close()
        cv2.destroyAllWindows()
    except Exception as ex:
        logging.exception("Error in inference:" + str(ex))
Example #24
def main():
    # command line args
    args = build_argparser().parse_args()
    input_file_path = args.input
    log_object = log.getLogger()
    oneneneflags = args.visualization_flag

    # Initialise the classes
    fd_object = FaceDetection(model_name=args.face_detection_model,
                              device=args.device,
                              threshold=args.prob_threshold,
                              extensions=args.cpu_extension)
    fl_object = FacialLandmarkDetection(model_name=args.facial_landmarks_model,
                                        device=args.device,
                                        extensions=args.cpu_extension)
    hp_object = HeadPoseEstimation(model_name=args.head_pose_model,
                                   device=args.device,
                                   extensions=args.cpu_extension)
    ge_object = GazeEstimation(model_name=args.gaze_estimation_model,
                               device=args.device,
                               extensions=args.cpu_extension)

    mouse_controller_object = MouseController('low', 'fast')

    ### Loading the models ###
    log_object.error(
        "=================== Models Load Time ====================")
    start_time = time.time()
    fd_object.load_model()
    log_object.error("Face detection model loaded in {:.3f} ms".format(
        (time.time() - start_time) * 1000))

    fl_start = time.time()
    fl_object.load_model()
    log_object.error(
        "Facial landmarks detection model loaded in {:.3f} ms".format(
            (time.time() - fl_start) * 1000))

    hp_start = time.time()
    hp_object.load_model()
    log_object.error("Head pose estimation model loaded in {:.3f} ms".format(
        (time.time() - hp_start) * 1000))

    ge_start = time.time()
    ge_object.load_model()
    log_object.error("Gaze estimation model loaded in {:.3f} ms".format(
        (time.time() - ge_start) * 1000))

    total_time = time.time() - start_time
    log_object.error(
        "=================== Models loaded successfully ===================")
    log_object.error("Total loading time is {:.3f} ms".format(total_time *
                                                              1000))

    counter = 0
    infer_start = time.time()
    log_object.error(
        "=================== Start inferencing on input video ===================="
    )

    if input_file_path == "CAM":
        input_feeder = InputFeeder("cam")
    else:
        if not os.path.isfile(input_file_path):
            exit(1)
        input_feeder = InputFeeder("video", input_file_path)

        log_object.error("Input feeders are loaded")
        input_feeder.load_data()

    for frame in input_feeder.next_batch():
        # if not flag:
        #     break
        pressed_key = cv2.waitKey(60)
        counter += 1

        face_coordinates, face_image = fd_object.predict(frame.copy())
        if face_coordinates == 0:
            continue

        hp_output = hp_object.predict(face_image)

        left_eye_image, right_eye_image, eye_coord = fl_object.predict(
            face_image)

        mouse_coordinate, gaze_vector = ge_object.predict(
            left_eye_image, right_eye_image, hp_output)

        if len(oneneneflags) != 0:
            preview_window = frame.copy()
            if 'fd' in oneneneflags:
                if len(oneneneflags) != 1:
                    preview_window = face_image
                else:
                    cv2.rectangle(preview_window,
                                  (face_coordinates[0], face_coordinates[1]),
                                  (face_coordinates[2], face_coordinates[3]),
                                  (0, 150, 0), 3)
            if 'fl' in oneneneflags:
                if not 'fd' in oneneneflags:
                    preview_window = face_image.copy()
                cv2.rectangle(preview_window,
                              (eye_coord[0][0], eye_coord[0][1]),
                              (eye_coord[0][2], eye_coord[0][3]),
                              (150, 0, 150))
                cv2.rectangle(preview_window,
                              (eye_coord[1][0], eye_coord[1][1]),
                              (eye_coord[1][2], eye_coord[1][3]),
                              (150, 0, 150))
            if 'hp' in oneneneflags:
                cv2.putText(
                    preview_window,
                    "yaw:{:.1f} | pitch:{:.1f} | roll:{:.1f}".format(
                        hp_output[0], hp_output[1], hp_output[2]), (20, 20),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.35, (0, 0, 0), 1)
            if 'ge' in oneneneflags:

                yaw = hp_output[0]
                pitch = hp_output[1]
                roll = hp_output[2]
                focal_length = 950.0
                scale = 50
                center_of_face = (face_image.shape[1] / 2,
                                  face_image.shape[0] / 2, 0)
                if 'fd' in oneneneflags or 'fl' in oneneneflags:
                    draw_axes(preview_window, center_of_face, yaw, pitch, roll,
                              scale, focal_length)
                else:
                    draw_axes(frame, center_of_face, yaw, pitch, roll, scale,
                              focal_length)

        if len(oneneneflags) != 0:
            img_hor = np.hstack((cv2.resize(frame, (500, 500)),
                                 cv2.resize(preview_window, (500, 500))))
        else:
            img_hor = cv2.resize(frame, (500, 500))

        cv2.imshow('Visualization', img_hor)
        mouse_controller_object.move(mouse_coordinate[0], mouse_coordinate[1])

        if pressed_key == 27:
            log_object.error("exit key is pressed..")
            break

    infer_time = round(time.time() - infer_start, 1)
    fps = int(counter) / infer_time
    log_object.error("counter {} seconds".format(counter))
    log_object.error("total inference time {} seconds".format(infer_time))
    log_object.error("fps {} frame/second".format(fps))
    log_object.error("Video session has ended")

    with open(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'stats.txt'), 'w') as f:
        f.write(str(infer_time) + '\n')
        f.write(str(fps) + '\n')
        f.write(str(total_time) + '\n')

    input_feeder.close()
    cv2.destroyAllWindows()
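# The example above calls draw_axes() to render the head-pose axes, but the
# helper is defined outside this excerpt. Below is a minimal sketch, assuming
# a simple pinhole projection; the rotation order and axis colours are
# illustrative assumptions, not the original implementation.
import numpy as np
import cv2

def draw_axes(frame, center_of_face, yaw, pitch, roll, scale, focal_length):
    # Convert degrees to radians and build per-axis rotation matrices.
    yaw, pitch, roll = (np.radians(a) for a in (yaw, pitch, roll))
    rx = np.array([[1, 0, 0],
                   [0, np.cos(pitch), -np.sin(pitch)],
                   [0, np.sin(pitch), np.cos(pitch)]])
    ry = np.array([[np.cos(yaw), 0, -np.sin(yaw)],
                   [0, 1, 0],
                   [np.sin(yaw), 0, np.cos(yaw)]])
    rz = np.array([[np.cos(roll), -np.sin(roll), 0],
                   [np.sin(roll), np.cos(roll), 0],
                   [0, 0, 1]])
    r = rz @ ry @ rx

    cx, cy = int(center_of_face[0]), int(center_of_face[1])
    origin = np.array([0, 0, focal_length], dtype=float)
    # x axis red, y axis green, z axis blue (BGR colour tuples).
    axes = scale * np.array([[1, 0, 0], [0, -1, 0], [0, 0, -1]], dtype=float)
    for axis, color in zip(axes, [(0, 0, 255), (0, 255, 0), (255, 0, 0)]):
        p = r @ axis + origin
        x = int(p[0] * focal_length / p[2] + cx)
        y = int(p[1] * focal_length / p[2] + cy)
        cv2.line(frame, (cx, cy), (x, y), color, 2)
    return frame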
# Example #25
def main():
    args = build_argparser().parse_args()
    device_name = args.device
    prob_threshold = args.prob_threshold
    logger_object = log.getLogger()

    # Initialize variables with the input arguments
    model_path_dict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'FacialLandmarkModel': args.facialLandmarksModel,
        'HeadPoseEstimationModel': args.headPoseEstimationModel,
        'GazeEstimationModel': args.gazeEstimationModel
    }

    # Instantiate model
    face_model = FaceDetection(model_path_dict['FaceDetectionModel'], device_name, threshold=prob_threshold)
    landmark_model = FacialLandmarksDetection(model_path_dict['FacialLandmarkModel'], device_name,
                                              threshold=prob_threshold)
    head_pose_model = HeadPoseEstimation(model_path_dict['HeadPoseEstimationModel'], device_name,
                                         threshold=prob_threshold)
    gaze_model = GazeEstimation(model_path_dict['GazeEstimationModel'], device_name, threshold=prob_threshold)
    mouse_controller = MouseController('medium', 'fast')

    # Load Models and get time
    start_time = time.time()
    face_model.load_model()
    logger_object.error("Face detection model loaded: time: {:.3f} ms".format((time.time() - start_time) * 1000))

    first_mark = time.time()
    landmark_model.load_model()
    logger_object.error(
        "Facial landmarks detection model loaded: time: {:.3f} ms".format((time.time() - first_mark) * 1000))

    second_mark = time.time()
    head_pose_model.load_model()
    logger_object.error("Head pose estimation model loaded: time: {:.3f} ms".format((time.time() - second_mark) * 1000))

    third_mark = time.time()
    gaze_model.load_model()
    logger_object.error("Gaze estimation model loaded: time: {:.3f} ms".format((time.time() - third_mark) * 1000))
    load_total_time = time.time() - start_time
    logger_object.error("Total loading time: time: {:.3f} ms".format(load_total_time * 1000))
    logger_object.error("All models are loaded successfully..")

    # Check the models for unsupported layers
    face_model.check_model()
    landmark_model.check_model()
    head_pose_model.check_model()
    gaze_model.check_model()

    preview_flags = args.previewFlags
    input_filename = args.input
    output_path = args.output_path
    prob_threshold = args.prob_threshold

    if input_filename.lower() == 'cam':
        input_feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            logger_object.error("Unable to find specified video file")
            exit(1)
        input_feeder = InputFeeder(input_type='video', input_file=input_filename)

    for model_path in list(model_path_dict.values()):
        if not os.path.isfile(model_path):
            logger_object.error("Unable to find specified model file" + str(model_path))
            exit(1)

    input_feeder.load_data()
    width = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(input_feeder.cap.get(cv2.CAP_PROP_FPS))
    out_video = cv2.VideoWriter('output_video.mp4', cv2.VideoWriter_fourcc(*'avc1'), fps,
                                (width, height), True)

    frame_counter = 0
    start_inf_time = time.time()
    for ret, frame in input_feeder.next_batch():
        if not ret:
            break
        frame_counter += 1
        key = cv2.waitKey(60)

        try:
            cropped_image, face_cords = face_model.predict(frame, prob_threshold)

            if isinstance(cropped_image, int):
                print("Unable to detect the face")
                if key == 27:
                    break
                continue

            left_eye, right_eye, eye_cords = landmark_model.predict(cropped_image)
            pose_output = head_pose_model.predict(cropped_image)
            x, y, z = gaze_model.predict(left_eye, right_eye, pose_output, cropped_image, eye_cords)

        except Exception as e:
            print(str(e) + " for frame " + str(frame_counter))
            continue

        image = cv2.resize(frame, (width, height))
        if preview_flags:
            preview_frame = frame.copy()

            if 'fd' in preview_flags:
                if len(preview_flags) != 1:
                    preview_frame = cropped_image
                    cv2.rectangle(frame, (face_cords[0], face_cords[1]), (face_cords[2], face_cords[3]), (0, 0, 255), 3)

            if 'hp' in preview_flags:
                cv2.putText(
                    frame,
                    "Pose Angles: yaw= {:.2f} , pitch= {:.2f} , roll= {:.2f}".format(
                        pose_output[0], pose_output[1], pose_output[2]),
                    (20, 40),
                    cv2.FONT_HERSHEY_DUPLEX,
                    1, (255, 0, 0), 3)

            if 'ge' in preview_flags:
                cv2.putText(
                    frame,
                    "Gaze vector: x= {:.2f} , y= {:.2f} , z= {:.2f}".format(
                        x, y, z),
                    (15, 100),
                    cv2.FONT_HERSHEY_COMPLEX,
                    1, (0, 255, 0), 3)

            image = np.hstack((cv2.resize(frame, (500, 500)), cv2.resize(preview_frame, (500, 500))))

        cv2.imshow('preview', image)
        out_video.write(cv2.resize(image, (width, height)))

        if frame_counter % 5 == 0:
            mouse_controller.move(x, y)

        if key == 27:
            break

    inference_time = round(time.time() - start_inf_time, 1)
    fps = int(frame_counter) / inference_time
    logger_object.error("counter {} seconds".format(frame_counter))
    logger_object.error("total inference time {} seconds".format(inference_time))
    logger_object.error("fps {} frame/second".format(fps))
    with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'stats.txt'), 'w') as f:
        f.write('inference time : ' + str(inference_time) + '\n')
        f.write('fps: ' + str(fps) + '\n')
        f.write('Models Loading: ' + str(load_total_time) + '\n')
    logger_object.error('Video stream ended')
    cv2.destroyAllWindows()
    input_feeder.close()
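# Several examples iterate `for ret, frame in input_feeder.next_batch()` while
# others use `for frame in ...`; InputFeeder itself is not shown in this
# collection. A minimal sketch of the (ret, frame) variant follows, built on
# cv2.VideoCapture. It is an assumption about the interface, not the
# course-provided implementation.
import cv2

class InputFeeder:
    def __init__(self, input_type, input_file=None):
        # 'cam' maps to device 0; 'video' reads from a file path.
        self.input_type = input_type
        self.input_file = 0 if input_type == 'cam' else input_file

    def load_data(self):
        self.cap = cv2.VideoCapture(self.input_file)

    def next_batch(self):
        # Yield (ret, frame) pairs until the stream is exhausted.
        while self.cap.isOpened():
            ret, frame = self.cap.read()
            yield ret, frame
            if not ret:
                break

    def close(self):
        self.cap.release()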
def infer(args, logging_enabled):
    """
        run inference on input video, display/save output video
    """
    face_detection = FaceDetection(args.face_detection)
    facial_landmark_detection = FacialLandmarkDetection(
        args.facial_landmark_detection)
    gaze_estimation = GazeEstimation(args.gaze_estimation)
    head_pose_estimation = HeadPoseEstimation(args.head_pose_estimation)
    load_start = now()
    face_detection.load_model()
    fl_start = now()
    facial_landmark_detection.load_model()
    ge_start = now()
    gaze_estimation.load_model()
    hp_start = now()
    head_pose_estimation.load_model()
    log_model_load_times(logging_enabled, load_start, fl_start, ge_start,
                         hp_start)
    feeder = InputFeeder("video", args.input)
    feeder.load_data()
    frame_count, fd_time, fl_time, ge_time, hp_time = [0] * 5
    while True:
        key = cv2.waitKey(20)
        try:
            frame = next(feeder.next_batch())
        except StopIteration:
            break
        frame_count += 1
        fd_frame = face_detection.preprocess_input(frame)
        inf_start = now()
        fd_output = face_detection.predict(fd_frame)
        fd_time += now() - inf_start
        out_frame, faces = face_detection.preprocess_output(
            fd_output, frame, args.overlay_inference,
            args.probability_threshold)
        detected_face = frame[faces[0][1]:faces[0][3], faces[0][0]:faces[0][2]]
        fl_frame = facial_landmark_detection.preprocess_input(detected_face)
        fl_start = now()
        fl_output = facial_landmark_detection.predict(fl_frame)
        fl_time += now() - fl_start
        out_frame, l_coord, r_coord = facial_landmark_detection.preprocess_output(
            fl_output, faces[0], out_frame, args.overlay_inference)
        hp_frame = head_pose_estimation.preprocess_input(detected_face)
        hp_start = now()
        hp_output = head_pose_estimation.predict(hp_frame)
        hp_time += now() - hp_start
        out_frame, head_pose = head_pose_estimation.preprocess_output(
            hp_output, out_frame, detected_face, faces[0],
            args.overlay_inference)
        out_frame, l_eye, r_eye = gaze_estimation.preprocess_input(
            out_frame, detected_face, l_coord, r_coord, args.overlay_inference)
        ge_start = now()
        ge_output = gaze_estimation.predict(head_pose, l_eye, r_eye)
        ge_time += now() - ge_start
        out_frame, g_vec = gaze_estimation.preprocess_output(
            ge_output, out_frame, faces[0], l_coord, r_coord,
            args.overlay_inference)
        if args.video_window:
            cv2.imshow(
                "Computer-Human Interface Peripheral Signal Manipulation via AI Retina Tracking (CHIPSMART)",
                out_frame,
            )
        if args.mouse_control and frame_count % 6 == 0:
            mouse_control.move(g_vec[0], g_vec[1])
        # Quit if user presses Esc or Q
        if key in (27, 81):
            user_quit(logging_enabled)
            break
    log_inference_times(logging_enabled, frame_count, fd_time, fl_time,
                        ge_time, hp_time)
    feeder.close()
    cv2.destroyAllWindows()
    quit()
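# infer() above relies on a now() timestamp helper, a log_model_load_times()
# logger, and a module-level mouse_control object, none of which appear in
# this excerpt. Hedged sketches of the first two follow; the log wording is
# invented for illustration.
import time
import logging as log

def now():
    # Wall-clock seconds; assumed equivalent to time.time() in infer().
    return time.time()

def log_model_load_times(logging_enabled, load_start, fl_start, ge_start,
                         hp_start):
    # Each start mark also ends the previous model's load, so durations are
    # differences between consecutive timestamps (the head-pose end time is
    # not captured in the excerpt, so it is omitted here).
    if logging_enabled:
        log.info("face detection load: %.1f ms", 1000 * (fl_start - load_start))
        log.info("facial landmarks load: %.1f ms", 1000 * (ge_start - fl_start))
        log.info("gaze estimation load: %.1f ms", 1000 * (hp_start - ge_start))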
class MoveMouse:
    '''
    Main Class for the Mouse Controller app. 
    This is the class where all the models are stitched together to control the mouse pointer
    '''
    def __init__(self, args):
        '''
        This method initialises instance variables for the MoveMouse app.

        Args:
        args = All arguments parsed by the arguments parser function

        Return:
        None
        '''

        init_start_time = time.time()
        self.output_path = args.output_path
        self.show_output = args.show_output
        self.total_processing_time = 0
        self.count_batch = 0
        self.inference_speed = []
        self.avg_inference_speed = 0

        if args.all_devices != 'CPU':
            args.face_device = args.all_devices
            args.face_landmark_device = args.all_devices
            args.head_pose_device = args.all_devices
            args.gaze_device = args.all_devices

        model_init_start = time.time()
        self.face_model = FaceDetection(args.face_model, args.face_device,
                                        args.face_device_ext,
                                        args.face_prob_threshold)
        self.landmarks_model = FacialLandmarksDetection(
            args.face_landmark_model, args.face_landmark_device,
            args.face_landmark_device_ext, args.face_landmark_prob_threshold)
        self.head_pose_model = HeadPoseEstimation(
            args.head_pose_model, args.head_pose_device,
            args.head_pose_device_ext, args.head_pose_prob_threshold)
        self.gaze_model = GazeEstimation(args.gaze_model, args.gaze_device,
                                         args.gaze_device_ext,
                                         args.gaze_prob_threshold)
        self.model_init_time = time.time() - model_init_start
        log.info('[ Main ] All required models initialized')

        self.mouse_control = MouseController(args.precision, args.speed)
        log.info('[ Main ] Mouse controller successfully initialized')

        self.input_feeder = InputFeeder(args.batch_size, args.input_type,
                                        args.input_file)
        log.info('[ Main ] Initialized input feeder')

        model_load_start = time.time()
        self.face_model.load_model()
        self.landmarks_model.load_model()
        self.head_pose_model.load_model()
        self.gaze_model.load_model()

        self.model_load_time = time.time() - model_load_start
        self.app_init_time = time.time() - init_start_time
        log.info('[ Main ] All models loaded to Inference Engine\n')

    def draw_face_box(self, frame, face_coords):
        '''
        Draws face's bounding box on the input frame
        Args:
        frame = Input frame from video or camera feed. It could also be an input image

        Return:
        frame = Frame with bounding box of faces drawn on it
        '''

        start_point = (face_coords[0][0], face_coords[0][1])
        end_point = (face_coords[0][2], face_coords[0][3])
        thickness = 5
        color = (255, 86, 0)

        frame = cv2.rectangle(frame, start_point, end_point, color, thickness)

        return frame

    def draw_eyes_boxes(self, frame, left_eye_coords, right_eye_coords):
        '''
        Draws bounding boxes around both eyes on the input frame
        Args:
        frame = Input frame from video or camera feed. It could also be an input image

        Return:
        frame = Frame with bounding box of left and right eyes drawn on it
        '''

        left_eye_start_point = (left_eye_coords[0], left_eye_coords[1])
        left_eye_end_point = (left_eye_coords[2], left_eye_coords[3])
        right_eye_start_point = (right_eye_coords[0], right_eye_coords[1])
        right_eye_end_point = (right_eye_coords[2], right_eye_coords[3])
        thickness = 5
        color = (0, 210, 0)

        frame = cv2.rectangle(frame, left_eye_start_point, left_eye_end_point,
                              color, thickness)
        frame = cv2.rectangle(frame, right_eye_start_point,
                              right_eye_end_point, color, thickness)

        return frame

    def draw_outputs(self, frame):
        '''
        Draws the inference outputs (bounding boxes of the face and both eyes and 
        the 3D head pose directions) of the four models onto the frames.

        Args:
        frame = Input frame from video or camera feed. It could also be an input image

        Return:
        frame = Frame with all inference outputs drawn on it
        '''

        frame = self.draw_face_box(frame, self.face_coords)
        frame = self.draw_eyes_boxes(frame, self.left_eye_coords,
                                     self.right_eye_coords)

        frame_id = f'Batch id = {self.count_batch}'
        avg_inference_speed = f'Avg. inference speed = {self.avg_inference_speed:.3f}fps'
        total_processing_time = f'Total infer. time = {self.total_processing_time:.3f}s'

        cv2.putText(frame, frame_id, (15, 15), cv2.FONT_HERSHEY_COMPLEX, 0.45,
                    (255, 86, 0), 1)
        cv2.putText(frame, avg_inference_speed, (15, 30),
                    cv2.FONT_HERSHEY_COMPLEX, 0.45, (255, 86, 0), 1)
        cv2.putText(frame, total_processing_time, (15, 45),
                    cv2.FONT_HERSHEY_COMPLEX, 0.45, (255, 86, 0), 1)

        return frame

    def run_inference(self, frame):
        '''
        Performs inference on the input video or image by passing it through all four
        models to get the desired coordinates for moving the mouse pointer.

        Args:
        frame = Unused placeholder; frames are read from the instance's InputFeeder

        Return:
        None
        '''

        self.input_feeder.load_data()

        for frame in self.input_feeder.next_batch():

            if self.input_feeder.frame_flag:
                log.info('[ Main ] Started processing a new batch')
                start_inference = time.time()
                self.face_coords, self.face_crop = self.face_model.predict(
                    frame)

                if not self.face_coords:
                    log.info(
                        '[ Main ] No face detected.. Waiting for you to stare at the camera'
                    )

                else:
                    self.head_pose_angles = self.head_pose_model.predict(
                        self.face_crop)
                    self.left_eye_coords, self.left_eye_image, self.right_eye_coords, self.right_eye_image = self.landmarks_model.predict(
                        self.face_crop)
                    self.x, self.y = self.gaze_model.predict(
                        self.left_eye_image, self.right_eye_image,
                        self.head_pose_angles)
                    log.info(
                        f'[ Main ] Relative pointer coordinates: [{self.x:.2f}, {self.y:.2f}]'
                    )

                    batch_process_time = time.time() - start_inference
                    self.total_processing_time += batch_process_time
                    self.count_batch += 1
                    log.info(
                        f'[ Main ] Finished processing batch. Time taken = {batch_process_time}s\n'
                    )

                    self.mouse_control.move(self.x, self.y)

                    if self.show_output:
                        self.draw_outputs(frame)

                    cv2.imshow('Computer Pointer Controller Output', frame)
                    self.inference_speed.append(self.count_batch /
                                                self.total_processing_time)
                    self.avg_inference_speed = sum(self.inference_speed) / len(
                        self.inference_speed)

                with open(os.path.join(self.output_path, 'outputs.txt'),
                          'w+') as f:
                    f.write('INFERENCE STATS\n')
                    f.write(
                        f'Total model initialization time : {self.model_init_time:.2f}s\n'
                    )
                    f.write(
                        f'Total model load time: {self.model_load_time:.2f}s\n'
                    )
                    f.write(
                        f'App initialization time: {self.app_init_time:.2f}s\n'
                    )
                    f.write(
                        f'Total processing time: {self.total_processing_time:.2f}s\n'
                    )
                    f.write(
                        f'Average inference speed: {self.avg_inference_speed:.2f}FPS\n'
                    )
                    f.write(f'Batch count: {self.count_batch}\n\n')

                    f.write('LAST OUTPUTS\n')
                    f.write(f'Face coordinates: {self.face_coords}\n')
                    f.write(f'Left eye coordinates: {self.left_eye_coords}\n')
                    f.write(
                        f'Right eye coordinates: {self.right_eye_coords}\n')
                    f.write(f'Head pose angles: {self.head_pose_angles}\n')
                    f.write(
                        f'Relative pointer coordinates/ Gaze vector: [{self.x:.2f}, {self.y:.2f}]'
                    )

            else:
                self.input_feeder.close()
                cv2.destroyAllWindows()

                log.info(
                    f'[ Main ] All input Batches processed in {self.total_processing_time:.2f}s'
                )
                log.info('[ Main ] Shutting down app...')
                log.info('[ Main ] Mouse controller app has been shut down.')
                break

        return
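# Hypothetical usage of the MoveMouse class above (argument names assumed from
# its __init__); the frame parameter of run_inference() is unused because
# frames come from the instance's own InputFeeder:
#
#     args = build_argparser().parse_args()
#     app = MoveMouse(args)
#     app.run_inference(frame=None)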
def main():
    args = build_argparser().parse_args()

    # initialize variables with the input arguments for easy access
    fdm = args.face_detection_model
    ldm = args.facial_landmarks_detection_model
    hpem = args.head_pose_estimation_model
    gem = args.gaze_estimation_model
    output_flags = args.output_flags
    input_filename = args.input
    device_name = args.device
    prob_threshold = args.prob_threshold
    cpu_extension = args.cpu_extension

    if input_filename.lower() == 'cam':
        feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            log.error("Unable to find specified video file")
            exit(1)
        feeder = InputFeeder(input_type='video', input_file=input_filename)

    # initialize model
    face_detection_model = FaceDetect(fdm, device_name, cpu_extension,
                                      prob_threshold)
    landmark_detection_model = FacialLandmarks(ldm, device_name, cpu_extension,
                                               prob_threshold)
    head_pose_estimation_model = HeadPose(hpem, device_name, cpu_extension,
                                          prob_threshold)
    gaze_estimation_model = GazeEstimation(gem, device_name, cpu_extension,
                                           prob_threshold)

    mouse_controller = MouseController('medium', 'fast')

    # load Models
    start_model_load_time = time.time()
    face_detection_model.load_model()  # load face detection model
    log.info("Face detection model loaded...")
    fd_load_time = time.time() - start_model_load_time
    start1 = time.time()
    landmark_detection_model.load_model()  # load landmark detection model
    log.info("Landmark detection model loaded...")
    ld_load_time = time.time() - start1
    start2 = time.time()
    head_pose_estimation_model.load_model()  # load head pose estimation model
    log.info("Head pose estimation model loaded...")
    hp_load_time = time.time() - start2
    start3 = time.time()
    gaze_estimation_model.load_model()  # load gaze estimation model
    log.info("Gaze estimation model loaded...")
    ge_load_time = time.time() - start3
    total_time = time.time() - start_model_load_time

    feeder.load_data()

    #check for output flags
    if output_flags:
        for flag in output_flags:
            if flag not in ['fdm', 'lrm', 'hp', 'gze']:
                log.error("Flag '" + flag + "' is not a valid preview flag.")
                sys.exit(1)

    frame_count = 0
    start_inference_time = time.time()
    for ret, frame in feeder.next_batch():

        if not ret:
            break

        frame_count += 1
        key = cv2.waitKey(60)
        try:

            image, fc = face_detection_model.predict(frame,
                                                     args.prob_threshold)
            if isinstance(image, int):
                log.warning("Unable to detect the face")
                if key == 27:
                    break
                continue
            if 'fdm' in output_flags:
                cv2.putText(frame, "face detected", (10, 140),
                            cv2.FONT_HERSHEY_COMPLEX, 2, (0, 0, 255), 4)
            # predicting using landmark detection model
            left_eye_image, right_eye_image, eye_coords = landmark_detection_model.predict(
                image)  #using the output of face detection model
            eye_buffer = 10

            if 'lrm' in output_flags:
                view_eye_rectangle(eye_coords, eye_buffer, image)


            #predicting using head_pose_estimation model
            pose_output = head_pose_estimation_model.predict(image)
            yaw = pose_output[0]
            pitch = pose_output[1]
            roll = pose_output[2]

            if "hp" in output_flags:
                cv2.putText(
                    frame,
                    "Pose Angles: yaw:{:.2f},  pitch:{:.2f},  roll:{:.2f}".
                    format(yaw, pitch, roll), (10, 40),
                    cv2.FONT_HERSHEY_COMPLEX, 2, (0, 0, 0), 4)

            mouse_coord, gaze_vector = gaze_estimation_model.predict(
                left_eye_image, right_eye_image, pose_output)
            if "gze" in output_flags:
                cv2.putText(
                    frame,
                    "Gaze Cords: x= {:.2f} , y= {:.2f} , z= {:.2f}".format(
                        gaze_vector[0], gaze_vector[1], gaze_vector[2]),
                    (10, 90), cv2.FONT_HERSHEY_COMPLEX, 2, (0, 0, 255), 4)

        except Exception as e:
            log.warning("Could not predict using model " + str(e) +
                        " for frame " + str(frame_count))
            continue

        total_inference_time = time.time() - start_inference_time

        cv2.imshow("Visualization", cv2.resize(frame, (500, 500)))
        if frame_count % 5 == 0:
            mouse_controller.move(mouse_coord[0], -1 * mouse_coord[1])
        if key == 27:
            break
    log.error("VideoStream ended...")
    print("total_model_load time is {:} ms".format(1000 * total_time /
                                                   frame_count))
    print("fps is {:}".format(int(feeder.get_fps())))
    print("total inference time is{:} ms".format(1000 * total_inference_time /
                                                 frame_count))
    print("fdmt loading time is{:} ms".format(1000 * FDMT / frame_count))
    print("ldmt loading time is{:} ms".format(1000 * LDMT / frame_count))
    print("hpem loading tiem{:} ms".format(1000 * hpem / frame_count))
    print("gzem loading time{:} ms".format(1000 * hpem / frame_count))
    cv2.destroyAllWindows()
    feeder.close()
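# view_eye_rectangle() is called above but not defined in this collection. A
# minimal sketch matching the call site follows; padding each eye box by
# eye_buffer pixels is an assumption inferred from the argument name.
import cv2

def view_eye_rectangle(eye_coords, eye_buffer, image):
    # Draw a padded rectangle around each detected eye on the face crop.
    for (x_min, y_min, x_max, y_max) in eye_coords:
        cv2.rectangle(image,
                      (x_min - eye_buffer, y_min - eye_buffer),
                      (x_max + eye_buffer, y_max + eye_buffer),
                      (0, 255, 0), 2)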
def main():

    # Grab command line args
    args = build_argparser().parse_args()
    flags = args.models_outputs_flags

    logger = logging.getLogger()
    input_file_path = args.input
    input_feeder = None
    if input_file_path.lower() == "cam":
        input_feeder = InputFeeder("cam")
    else:
        if not os.path.isfile(input_file_path):
            logger.error("Unable to find specified video file")
            exit(1)
        input_feeder = InputFeeder("video", input_file_path)

    model_path_dict = {
        'FaceDetection': args.face_detection_model,
        'FacialLandmarks': args.facial_landmarks_model,
        'GazeEstimation': args.gaze_estimation_model,
        'HeadPoseEstimation': args.head_pose_estimation_model
    }

    for file_name_key in model_path_dict.keys():
        if not os.path.isfile(model_path_dict[file_name_key]):
            logger.error("Unable to find specified " + file_name_key +
                         " xml file")
            exit(1)

    fdm = FaceDetection(model_path_dict['FaceDetection'], args.device,
                        args.cpu_extension)
    flm = FacialLandmarks(model_path_dict['FacialLandmarks'], args.device,
                          args.cpu_extension)
    gem = GazeEstimation(model_path_dict['GazeEstimation'], args.device,
                         args.cpu_extension)
    hpem = HeadPoseEstimation(model_path_dict['HeadPoseEstimation'],
                              args.device, args.cpu_extension)

    mc = MouseController('medium', 'fast')

    input_feeder.load_data()
    fdm.load_model()
    flm.load_model()
    hpem.load_model()
    gem.load_model()

    frame_count = 0
    for ret, frame in input_feeder.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))

        key = cv2.waitKey(60)
        cropped_face, face_coords = fdm.predict(frame, args.prob_threshold)
        if isinstance(cropped_face, int):
            logger.error("Unable to detect any face.")
            if key == 27:
                break
            continue

        hp_output = hpem.predict(cropped_face)

        left_eye_img, right_eye_img, eye_coords = flm.predict(cropped_face)

        new_mouse_coord, gaze_vector = gem.predict(left_eye_img, right_eye_img,
                                                   hp_output)

        if flags:
            preview_frame = frame
            if 'fd' in flags:
                preview_frame = cropped_face
            if 'fld' in flags:
                cv2.rectangle(cropped_face,
                              (eye_coords[0][0] - 10, eye_coords[0][1] - 10),
                              (eye_coords[0][2] + 10, eye_coords[0][3] + 10),
                              (0, 255, 0), 3)
                cv2.rectangle(cropped_face,
                              (eye_coords[1][0] - 10, eye_coords[1][1] - 10),
                              (eye_coords[1][2] + 10, eye_coords[1][3] + 10),
                              (0, 255, 0), 3)

            if 'hp' in flags:
                cv2.putText(
                    preview_frame,
                    "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".
                    format(hp_output[0], hp_output[1], hp_output[2]), (10, 20),
                    cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1)
            if 'ge' in flags:
                x, y, w = int(gaze_vector[0] * 12), int(gaze_vector[1] * 12), 160
                left_eye = cv2.line(left_eye_img, (x - w, y - w),
                                    (x + w, y + w), (255, 0, 255), 2)
                cv2.line(left_eye, (x - w, y + w), (x + w, y - w),
                         (255, 0, 255), 2)
                right_eye = cv2.line(right_eye_img, (x - w, y - w),
                                     (x + w, y + w), (255, 0, 255), 2)
                cv2.line(right_eye, (x - w, y + w), (x + w, y - w),
                         (255, 0, 255), 2)
                cropped_face[eye_coords[0][1]:eye_coords[0][3],
                             eye_coords[0][0]:eye_coords[0][2]] = left_eye
                cropped_face[eye_coords[1][1]:eye_coords[1][3],
                             eye_coords[1][0]:eye_coords[1][2]] = right_eye

            cv2.imshow("Visualization", cv2.resize(preview_frame, (500, 500)))

        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])
        if key == 27:
            break
    logger.error("VideoStream ended...")
    cv2.destroyAllWindows()
    input_feeder.close()
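# Every example constructs MouseController with precision/speed presets such
# as ('medium', 'fast'). A minimal sketch built on pyautogui follows; the
# preset values are illustrative assumptions, not the starter file verbatim.
import pyautogui

class MouseController:
    def __init__(self, precision, speed):
        # Map named presets onto a pixel scale and a move duration in seconds.
        self.precision = {'high': 100, 'low': 1000, 'medium': 500}[precision]
        self.speed = {'fast': 1, 'slow': 10, 'medium': 5}[speed]

    def move(self, x, y):
        # Gaze x/y are relative values; screen y grows downward, hence the flip.
        pyautogui.moveRel(x * self.precision, -1 * y * self.precision,
                          duration=self.speed)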
def main():
    args = argparser().parse_args()

    log.basicConfig(filename='log.log', level=log.INFO)

    device = args.device
    threshold = args.prob_threshold

    extension = args.cpu_extension
    preview_flags = args.preview_flag

    input_file_path = args.input
    # Initialize Models
    log.info(
        "------------------------Program Started-------------------------------------"
    )
    face = FaceDetect(args.face_detection_model, args.device,
                      args.cpu_extension, args.prob_threshold)
    landmark = FacialLandmarksDetect(args.landmark_detection_model,
                                     args.device, args.cpu_extension)
    head_pose = HeadPoseDetect(args.head_pose_estimation_model, args.device,
                               args.cpu_extension)
    gaze_estimation = GazeEstimation(args.gaze_estimation_model, args.device,
                                     args.cpu_extension)

    # Load models
    log.info("Loading Models")
    start_time = time.time()
    face.load_model()
    log.info("Face detection model loaded: time: {:.3f} ms".format(
        (time.time() - start_time) * 1000))

    landmark_start = time.time()
    landmark.load_model()
    log.info("Facial landmarks detection model loaded: time: {:.3f} ms".format(
        (time.time() - landmark_start) * 1000))

    head_start = time.time()
    head_pose.load_model()
    log.info("Head pose estimation model loaded: time: {:.3f} ms".format(
        (time.time() - head_start) * 1000))

    gaze_start = time.time()
    gaze_estimation.load_model()
    log.info("Gaze estimation model loaded: time: {:.3f} ms".format(
        (time.time() - gaze_start) * 1000))

    load_total_time = time.time() - start_time
    log.info("Time to load all models: time: {:.3f} ms".format(
        load_total_time * 1000))
    log.info("All models are loaded successfully..")

    if input_file_path.lower() == 'cam':
        feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_file_path):
            log.error("Unable to find specified video file")
            exit(1)
        feeder = InputFeeder(input_type='video', input_file=input_file_path)

    log.info("Initialize Mouse")
    mouse = MouseController(precision='low', speed='fast')

    feeder.load_data()
    log.info("Starting Inference on Video")
    start_time = time.time()
    counter = 0
    for ret, frame in feeder.next_batch():

        if not ret:
            break

        key = cv2.waitKey(60)
        counter = counter + 1

        face_coords, face_image = face.predict(frame.copy())
        left_eye, right_eye, eye_coords = landmark.predict(face_image)
        hp_angles = head_pose.predict(face_image)
        gaze_coords = gaze_estimation.predict(left_eye, right_eye, hp_angles)
        # Settings from https://knowledge.udacity.com/questions/171017
        focal_length = 950.0
        scale = 50
        center_of_face = (face_image.shape[1] / 2, face_image.shape[0] / 2)

        yaw = hp_angles[0]
        pitch = hp_angles[1]
        roll = hp_angles[2]

        if preview_flags:

            if 'ff' in preview_flags and len(preview_flags) == 1:
                preview_window = frame
            else:
                preview_window = face_image.copy()

            # Draw each requested overlay independently instead of
            # enumerating every flag combination (the original if/elif
            # chain spelled out each pairing with identical drawing code).
            if 'ff' in preview_flags:
                cv2.rectangle(frame, (face_coords[0], face_coords[1]),
                              (face_coords[2], face_coords[3]), (0, 250, 0), 3)
            if 'fl' in preview_flags:
                cv2.rectangle(preview_window,
                              (eye_coords[0][0], eye_coords[0][1]),
                              (eye_coords[0][2], eye_coords[0][3]),
                              (150, 0, 150))
                cv2.rectangle(preview_window,
                              (eye_coords[1][0], eye_coords[1][1]),
                              (eye_coords[1][2], eye_coords[1][3]),
                              (150, 0, 150))
            if 'fh' in preview_flags:
                cv2.putText(
                    preview_window,
                    "Pose Angles: Yaw:{:.2f} | Pitch:{:.2f} | Roll:{:.2f}".format(
                        hp_angles[0], hp_angles[1], hp_angles[2]), (10, 20),
                    cv2.FONT_HERSHEY_COMPLEX, .3, (255, 0, 0), 1)
            if 'fg' in preview_flags:
                draw_axes(preview_window, center_of_face, yaw, pitch, roll,
                          scale, focal_length)

        if preview_flags:
            preview_image = np.hstack((cv2.resize(frame, (1500, 1500)),
                                       cv2.resize(preview_window,
                                                  (1500, 1500))))
        else:
            preview_image = cv2.resize(frame, (1500, 1500))

        cv2.imshow('Visualization', preview_image)

        mouse.move(gaze_coords[0], gaze_coords[1])

        key = cv2.waitKey(20)
        if key == 27:  # exit on ESC
            break
    inference_time = round(time.time() - start_time, 1)
    fps = int(counter) / inference_time
    log.info("Counter {} seconds".format(counter))
    log.info("Total Inference Time {} seconds".format(inference_time))
    log.info("fps {} frame/second".format(fps))
    log.info("Video has completed")
    log.info(
        "---------------------------------Program has ended ----------------------------------------"
    )

    feeder.close()
    cv2.destroyAllWindows()
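# The last example reads its options through argparser(); a hypothetical
# parser matching the attributes accessed above is sketched here. The flag
# strings and defaults are assumptions; only the attribute names come from
# the code.
from argparse import ArgumentParser

def argparser():
    parser = ArgumentParser()
    parser.add_argument('--face_detection_model', required=True)
    parser.add_argument('--landmark_detection_model', required=True)
    parser.add_argument('--head_pose_estimation_model', required=True)
    parser.add_argument('--gaze_estimation_model', required=True)
    parser.add_argument('--input', required=True)
    parser.add_argument('--device', default='CPU')
    parser.add_argument('--prob_threshold', type=float, default=0.6)
    parser.add_argument('--cpu_extension', default=None)
    parser.add_argument('--preview_flag', nargs='*', default=[],
                        help="any of: ff fl fh fg")
    return parser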