Example #1
def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """

    #if args.input == 'cam':
    #    args.input = 0
    output_intermediate_model = args.output_intermediate_model

    ### TODO: Handle the input stream ###
    feed = InputFeeder(input_type=args.input_type, input_file=args.input_file)
    cap = feed.load_data()
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))

    # Initialise the class
    try:
        infer_network_face_detection = BasePointer()
        infer_network_head_pose_estimation = BasePointer()
        infer_network_landmarks_regression_retail = BasePointer()
        infer_network_gaze_estimation = GazeEstimation()
    except Exception:
        logging.error("Error in initializing models")
        exit(1)
    ### TODO: Load the model through `infer_network_face_detection` ###
    try:
        start_loading_time_face_detection = time.time()
        infer_network_face_detection.load_model(args.model1, args.device)
        load_model_face_detection_time_taken = time.time(
        ) - start_loading_time_face_detection

        start_loading_time_head_pose_estimation = time.time()
        infer_network_head_pose_estimation.load_model(args.model2, args.device)
        load_model_head_pose_estimation_time_taken = time.time(
        ) - start_loading_time_head_pose_estimation

        start_loading_time_landmarks_regression_retail = time.time()
        infer_network_landmarks_regression_retail.load_model(
            args.model3, args.device)
        load_model_landmarks_regression_retail_time_taken = time.time(
        ) - start_loading_time_landmarks_regression_retail

        start_loading_time_gaze_estimation = time.time()
        infer_network_gaze_estimation.load_model(args.model4, args.device)
        load_model_gaze_estimation_time_taken = time.time(
        ) - start_loading_time_gaze_estimation
    except Exception:
        logging.error("Error in loading the models")
        exit(1)

    logging.debug(
        "Loading times for facial detection : {} , landmark detection : {} , head pose detection : {} , gaze estimation : {} "
        .format(load_model_face_detection_time_taken,
                load_model_landmarks_regression_retail_time_taken,
                load_model_head_pose_estimation_time_taken,
                load_model_gaze_estimation_time_taken))

    if output_intermediate_model == 'true':
        out = cv2.VideoWriter('out.mp4', CODEC, fps, (width, height))

    total_time_taken_to_infer_inf_face_detection = 0
    total_time_taken_to_infer_landmarks_regression_retail = 0
    total_time_taken_to_infer_inf_head_pose_estimation = 0
    total_time_taken_to_infer_gaze_estimation = 0

    # Create the mouse controller once, outside the frame loop.
    mouse_controller_pc = MouseController("high", "fast")

    ### TODO: Loop until stream is over ###
    for batch in feed.next_batch():
        ### TODO: Read from the video capture ###

        flag, frame = batch
        if not flag:
            break
        key_pressed = cv2.waitKey(60)

        ### TODO: Start inference for face detection ###
        start_inf_face_detection = time.time()
        outputs_face_detection = infer_network_face_detection.predict(frame)
        time_taken_to_infer_inf_face_detection = time.time(
        ) - start_inf_face_detection
        coords, frame = infer_network_face_detection.preprocess_output_face_detection(
            outputs_face_detection, width, height, args.prob_threshold, frame)
        if output_intermediate_model == 'true':
            out.write(frame)

        frame_crop_face = crop_face(coords, frame, output_intermediate_model)

        start_inf_head_pose_estimation = time.time()
        outputs_head_pose_estimation = infer_network_head_pose_estimation.predict(
            frame_crop_face)
        time_taken_to_infer_inf_head_pose_estimation = time.time(
        ) - start_inf_head_pose_estimation

        yaw, pitch, roll = infer_network_head_pose_estimation.preprocess_output_head_pose_estimation(
            outputs_head_pose_estimation, frame_crop_face)
        head_pose_angles = [yaw, pitch, roll]

        if output_intermediate_model == 'true':
            cv2.putText(frame, ("Yaw: " + str(int(yaw))), (100, 100),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1)
            cv2.putText(frame, ("Pitch: " + str(int(pitсh))), (100, 140),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1)
            cv2.putText(frame, ("Roll: " + str(int(roll))), (100, 180),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1)

        height_crop_face = coords[0][3] - coords[0][1]
        width_crop_face = coords[0][2] - coords[0][0]

        start_inf_landmarks_regression_retail = time.time()
        outputs_landmarks_regression_retail = infer_network_landmarks_regression_retail.predict(
            frame_crop_face)
        time_taken_to_infer_landmarks_regression_retail = time.time(
        ) - start_inf_landmarks_regression_retail

        coord_landmarks_regression_retail = infer_network_landmarks_regression_retail.preprocess_output_landmarks_regression_retail(
            outputs_landmarks_regression_retail, width_crop_face,
            height_crop_face, args.prob_threshold, frame)
        center_left_eye = ((coords[0][0] +
                            coord_landmarks_regression_retail[0]),
                           coords[0][1] + coord_landmarks_regression_retail[1])
        center_right_eye = ((coords[0][0] +
                             coord_landmarks_regression_retail[2]),
                            coords[0][1] +
                            coord_landmarks_regression_retail[3])

        xmin_left_eye = center_left_eye[0] - 30
        ymin_left_eye = center_left_eye[1] - 30
        xmax_left_eye = center_left_eye[0] + 30
        ymax_left_eye = center_left_eye[1] + 30
        xmin_right_eye = center_right_eye[0] - 30
        ymin_right_eye = center_right_eye[1] - 30
        xmax_right_eye = center_right_eye[0] + 30
        ymax_right_eye = center_right_eye[1] + 30
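        # NOTE: the eye boxes assume a fixed 30 px half-size around each
        # landmark; scaling this with the detected face size would be more
        # robust across input resolutions.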

        frame_landmarks_regression_retail = cv2.circle(frame,
                                                       center_left_eye,
                                                       2, (0, 255, 0),
                                                       thickness=3)
        frame_landmarks_regression_retail = cv2.circle(frame,
                                                       center_right_eye,
                                                       2, (0, 255, 0),
                                                       thickness=3)
        box_left_eye = cv2.rectangle(frame, (xmin_left_eye, ymin_left_eye),
                                     (xmax_left_eye, ymax_left_eye),
                                     (0, 255, 0), 3)
        box_right_eye = cv2.rectangle(frame, (xmin_right_eye, ymin_right_eye),
                                      (xmax_right_eye, ymax_right_eye),
                                      (0, 255, 0), 3)
        if output_intermediate_model == 'true':
            out.write(frame_landmarks_regression_retail)

        ### TODO: Start inference for gaze estimation ###
        start_inf_gaze_estimation = time.time()
        outputs_gaze_estimation = infer_network_gaze_estimation.predict(
            box_left_eye, box_right_eye, head_pose_angles)
        time_taken_to_infer_gaze_estimation = time.time(
        ) - start_inf_gaze_estimation

        total_time_taken_to_infer_inf_face_detection += time_taken_to_infer_inf_face_detection
        total_time_taken_to_infer_landmarks_regression_retail += time_taken_to_infer_landmarks_regression_retail
        total_time_taken_to_infer_inf_head_pose_estimation += time_taken_to_infer_inf_head_pose_estimation
        total_time_taken_to_infer_gaze_estimation += time_taken_to_infer_gaze_estimation
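
        # Scale the gaze vector to pixel offsets below; the y component is
        # negated because image coordinates grow downward while the gaze
        # y-axis points up.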

        arrow = 100
        g_x = int(outputs_gaze_estimation[0] * arrow)
        g_y = int(-(outputs_gaze_estimation[1]) * arrow)

        frame = cv2.arrowedLine(frame, (center_left_eye),
                                ((center_left_eye[0] + g_x),
                                 (center_left_eye[1] + g_y)), (0, 0, 255), 3)
        frame = cv2.arrowedLine(frame, (center_right_eye),
                                ((center_right_eye[0] + g_x),
                                 (center_right_eye[1] + g_y)), (0, 0, 255), 3)

        if output_intermediate_model == 'true':
            out.write(frame)

        mouse_controller_pc.move(outputs_gaze_estimation[0],
                                 outputs_gaze_estimation[1])

        if key_pressed == 27:
            break
    feed.close()

    logging.debug(
        "total inference times for facial detection : {} , landmark detection : {} , head pose detection : {} , gaze estimation : {} "
        .format(total_time_taken_to_infer_inf_face_detection,
                total_time_taken_to_infer_landmarks_regression_retail,
                total_time_taken_to_infer_inf_head_pose_estimation,
                total_time_taken_to_infer_gaze_estimation))
    if output_intermediate_model == 'true':
        out.release()
    #cap.release()
    cv2.destroyAllWindows()
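
# Example #1 calls a `crop_face` helper that is not shown in this listing.
# A minimal sketch, assuming `coords` holds [xmin, ymin, xmax, ymax] boxes in
# frame pixels (the signature is inferred from the call above):
def crop_face(coords, frame, output_intermediate_model):
    xmin, ymin, xmax, ymax = coords[0]
    frame_crop_face = frame[ymin:ymax, xmin:xmax]
    if output_intermediate_model == 'true':
        cv2.imwrite('face_crop.png', frame_crop_face)
    return frame_crop_face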
Example #2
    def run(self, args):
        '''
        Runs inference on specified input

        Args:
            args (Namespace): application arguments
        '''
        listener = keyboard.Listener(on_release=self.on_release)
        listener.start()
        if args.debug != '':
            self.debug = True
        if not args.silent and args.debug == '':
            print('Press \'esc\' to exit')
        mouseController = MouseController('high', 'fast')
        mouseController.center()
        inputFeeder = InputFeeder(args.input)
        logging.info('Loading models')
        start_loading = time.time()
        # ----- Models Load ------------------------------------------------------------
        faceDetection = ModelFaceDetection()
        facialLandmarksDetection = ModelFacialLandmarksDetection(
            precision=args.precision)
        headPoseEstimation = ModelHeadPoseEstimation(device=args.device,
                                                     precision=args.precision)
        gazeEstimation = ModelGazeEstimation(precision=args.precision)
        # ------------------------------------------------------------------------------
        stop_loading = time.time()
        loading_time = stop_loading - start_loading
        pool = Pool(processes=1)  # Must be called after Models Load
        logging.info('Starting inference')
        frame = None
        image = None
        inference_time = 0
        counter = 0
        while self.execute:
            try:
                frame = next(inputFeeder.next_batch())
            except StopIteration:
                logging.error('Failed to obtain input stream.')
                break
            if frame is None:
                break
            start_inference = time.time()
            # ----- Inference --------------------------------------------------------------
            faceDetection.inputs(frame)  # GFlops 0.611
            faceDetection.wait()
            outputs = faceDetection.outputs()
            if len(outputs) == 0:
                logging.warning('No face detected')
                continue
            if len(outputs) > 1:
                logging.warning('More than one face detected')
            if outputs[0].shape[0] == 0 or outputs[0].shape[1] == 0 or \
                outputs[0].shape[2] < 3:
                logging.warning('Image too small')
                continue

            headPoseEstimation.inputs(outputs[0])  # GFlops 0.105
            facialLandmarksDetection.inputs(outputs[0])  # GFlops 0.021

            facialLandmarksDetection.wait()
            outputs = facialLandmarksDetection.outputs()
            if outputs[0].shape[0] < 60 or outputs[0].shape[1] < 60 or \
                outputs[0].shape[2] < 3 or outputs[1].shape[0] < 60 or \
                outputs[1].shape[1] < 60 or outputs[1].shape[2] < 3:
                logging.warning('Image too small')
                continue
            headPoseEstimation.wait()
            outputs.append(headPoseEstimation.outputs())

            gazeEstimation.inputs(outputs)  # GFlops 0.139
            gazeEstimation.wait()
            outputs = gazeEstimation.outputs()
            # ------------------------------------------------------------------------------
            stop_inference = time.time()
            result = pool.apply_async(mouseController.move,
                                      [outputs[0], outputs[1]])
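            # Moving the cursor from a worker process keeps the blocking
            # mouse-move call from stalling the inference loop.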
            inference_time = inference_time + stop_inference - start_inference
            counter = counter + 1
            if '1' in args.debug:
                image = faceDetection.debug[0]
                if '2' in args.debug:
                    self.axises(image, headPoseEstimation.debug)
                if '3' in args.debug:
                    self.points(image, facialLandmarksDetection.debug)
                if '4' in args.debug:
                    self.lines(image, gazeEstimation.debug)
                cv2.imshow('Debug Mode (Press \'esc\' to exit)', image)
                cv2.waitKey(50)
            if args.output is not None:
                image = faceDetection.debug[0]
                self.axises(image, headPoseEstimation.debug)
            self.points(image, facialLandmarksDetection.debug)
                self.lines(image, gazeEstimation.debug)
        inputFeeder.close()
        if args.output is not None:
            cv2.imwrite(args.output, image)
        if not args.silent:
            print('Total loading time of the models: ' + str(loading_time) +
                  ' s')
            if counter > 0:
                print('Average inference time: ' +
                      str(inference_time / counter) + ' s')
                print('Frames per second: ' + str(counter / inference_time))
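
# `run` wires pynput's keyboard.Listener to an `on_release` callback that is
# not shown here. A minimal sketch of such a method on the same class,
# assuming Esc should stop the loop:
from pynput import keyboard

def on_release(self, key):
    if key == keyboard.Key.esc:
        self.execute = False  # ends the while-loop in run()
        return False          # returning False also stops the listener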
Example #3
def infer_on_stream(args):

    # Set Probability threshold for detections
    prob_threshold = args.prob_threshold
    intermediatePreview = args.preview_flags

    face_detector_path = args.face_detector_model
    facial_landmark_path = args.facial_landmark_model
    head_pose_path = args.head_pose_estimation_model
    gaze_est_path = args.gaze_estimation_model

    device = args.device
    extension = args.cpu_extension
    input_type = args.type.lower()
    input_file = args.input

    speed = args.mouse_speed
    precision = args.mouse_prec

    # model classess intializing
    face_detector = FaceDetectionModel(model_name=face_detector_path,
                                       device=device,
                                       extensions=extension)
    face_landmark_detector = FacialLandmardDetectionModel(
        model_name=facial_landmark_path, device=device, extensions=extension)
    head_pose_estimation = HeadPoseEstimationModel(model_name=head_pose_path,
                                                   device=device,
                                                   extensions=extension)
    gaze_estimation = GazeEstimationModel(model_name=gaze_est_path,
                                          device=device,
                                          extensions=extension)

    log.info("Model loading...")
    # model loading
    model_loading = time.time()

    # inference pipeline
    face_detector.load_model()
    face_landmark_detector.load_model()
    head_pose_estimation.load_model()
    gaze_estimation.load_model()

    log.info("Models are loaded")
    log.info("Modal Loading Time: {:.3f}ms".format(
        (time.time() - model_loading) * 1000))

    # visual pipeline
    try:
        input_feeder = InputFeeder(input_type, input_file)
        input_feeder.load_data()
    except Exception:
        log.error("Something went wrong while loading the camera/video input")
        exit(1)

    mouse = MouseController(precision, speed)
    frames = 0

    for ret, frame in input_feeder.next_batch():
        if not ret:
            break
        frames += 1

        key = cv2.waitKey(60)

        inf_start = time.time()

        face_coords, face_cropped_image = face_detector.predict(
            frame, prob_threshold)
        preview_image = face_cropped_image

        if face_coords:
            if 'fl' in intermediatePreview:
                eye_coords, left_eye, right_eye, preview_image = face_landmark_detector.predict(
                    face_cropped_image, True)
            else:
                eye_coords, left_eye, right_eye, preview_image = face_landmark_detector.predict(
                    face_cropped_image)

            if 'hp' in intermediatePreview:
                head_pose_angles, preview_image = head_pose_estimation.predict(
                    face_cropped_image, preview_image)
            else:
                head_pose_angles = head_pose_estimation.predict(
                    face_cropped_image)

            if 'ge' in intermediatePreview:
                mouse_coord, gaze_coord, preview_image = gaze_estimation.predict(
                    left_eye, right_eye, head_pose_angles, preview_image)
            else:
                mouse_coord, gaze_coord = gaze_estimation.predict(
                    left_eye, right_eye, head_pose_angles)

            left_eye = (eye_coords[0][0] + 20, eye_coords[0][1] + 20)
            right_eye = (eye_coords[1][0] + 20, eye_coords[1][1] + 20)

            gaze_x = int(gaze_coord[0] * 250)
            gaze_y = int(-gaze_coord[1] * 250)

            if 'ge' in intermediatePreview:
                cv2.arrowedLine(preview_image, left_eye,
                                (left_eye[0] + gaze_x, left_eye[1] + gaze_y),
                                (0, 255, 0), 3)
                cv2.arrowedLine(preview_image, right_eye,
                                (right_eye[0] + gaze_x, right_eye[1] + gaze_y),
                                (0, 255, 0), 3)

            inference_time = time.time() - inf_start

            inf_time_message = "Inf Time Per Frame: {:.3f}ms"\
                               .format(inference_time * 1000)

            cv2.putText(preview_image, inf_time_message, (10, 10),
                        cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1)

        cv2.imshow('frame', cv2.resize(preview_image, (400, 400)))

        if frames % 5 == 0 and face_coords:
            mouse.move(mouse_coord[0], mouse_coord[1])

        if key == 27:
            break

    input_feeder.close()
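
# For reference, a sketch of the argument parser Example #3 expects; the flag
# names are inferred from the attribute accesses above and are assumptions:
from argparse import ArgumentParser

def build_argparser():
    parser = ArgumentParser()
    parser.add_argument('--face_detector_model', required=True)
    parser.add_argument('--facial_landmark_model', required=True)
    parser.add_argument('--head_pose_estimation_model', required=True)
    parser.add_argument('--gaze_estimation_model', required=True)
    parser.add_argument('--device', default='CPU')
    parser.add_argument('--cpu_extension', default=None)
    parser.add_argument('--type', default='video')
    parser.add_argument('--input', default=None)
    parser.add_argument('--mouse_speed', default='fast')
    parser.add_argument('--mouse_prec', default='high')
    parser.add_argument('--prob_threshold', type=float, default=0.6)
    parser.add_argument('--preview_flags', nargs='*', default=[])
    return parser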
Example #4
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA)
                cv2.imshow('HP', preview_window3)

            if 'ge' in args.displayFlags:
                preview_window4 = cropped_face.copy()

                x, y, w = int(gaze_vec[0] * 12), int(gaze_vec[1] * 12), 160
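                # x and y are the gaze direction scaled to pixel offsets; w is
                # the half-extent of the cross drawn over each eye crop.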

                left_c = cv2.line(le.copy(), (x - w, y - w), (x + w, y + w),
                                  (255, 0, 255), 2)
                cv2.line(le, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)

                right_c = cv2.line(re.copy(), (x - w, y - w), (x + w, y + w),
                                   (255, 0, 255), 2)
                cv2.line(re, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)

                preview_window4[eye_coords[0][1]:eye_coords[0][3],
                                eye_coords[0][0]:eye_coords[0][2]] = left_c
                preview_window4[eye_coords[1][1]:eye_coords[1][3],
                                eye_coords[1][0]:eye_coords[1][2]] = right_c
                cv2.imshow('GE', preview_window4)

fps = frame_count / inference_time
logger.debug("Video ended.")
print("Loading time: " + str(model_loading_time) + " s")
print("Average inference time: " + str(inference_time / frame_count) + " s")
print("FPS : ", format(fps / 5))

cv2.destroyAllWindows()
inputFeeder.close()
Example #5
def main():
    # command line args
    args = build_argparser().parse_args()
    input_file_path = args.input
    log_object = log.getLogger()
    oneneneflags = args.visualization_flag

    # Initialise the classes
    fd_object = FaceDetection(model_name=args.face_detection_model,
                              device=args.device,
                              threshold=args.prob_threshold,
                              extensions=args.cpu_extension)
    fl_object = FacialLandmarkDetection(model_name=args.facial_landmarks_model,
                                        device=args.device,
                                        extensions=args.cpu_extension)
    hp_object = HeadPoseEstimation(model_name=args.head_pose_model,
                                   device=args.device,
                                   extensions=args.cpu_extension)
    ge_object = GazeEstimation(model_name=args.gaze_estimation_model,
                               device=args.device,
                               extensions=args.cpu_extension)

    mouse_controller_object = MouseController('low', 'fast')

    ### Loading the models ###
    log_object.error(
        "=================== Models Load Time ====================")
    start_time = time.time()
    fd_object.load_model()
    log_object.error("Face detection model loaded in {:.3f} ms".format(
        (time.time() - start_time) * 1000))

    fl_start = time.time()
    fl_object.load_model()
    log_object.error(
        "Facial landmarks detection model loaded in {:.3f} ms".format(
            (time.time() - fl_start) * 1000))

    hp_start = time.time()
    hp_object.load_model()
    log_object.error("Head pose estimation model loaded in {:.3f} ms".format(
        (time.time() - hp_start) * 1000))

    ge_start = time.time()
    ge_object.load_model()
    log_object.error("Gaze estimation model loaded in {:.3f} ms".format(
        (time.time() - ge_start) * 1000))

    total_time = time.time() - start_time
    log_object.error(
        "=================== Models loaded successfully ===================")
    log_object.error("Total loading time is {:.3f} ms".format(total_time *
                                                              1000))

    counter = 0
    infer_start = time.time()
    log_object.error(
        "=================== Start inferencing on input video ===================="
    )

    if input_file_path == "CAM":
        input_feeder = InputFeeder("cam")
    else:
        if not os.path.isfile(input_file_path):
            exit(1)
        input_feeder = InputFeeder("video", input_file_path)

        log_object.error("Input feeders are loaded")
        input_feeder.load_data()

    for frame in input_feeder.next_batch():
        # if not flag:
        #     break
        pressed_key = cv2.waitKey(60)
        counter += 1

        face_coordinates, face_image = fd_object.predict(frame.copy())
        if face_coordinates == 0:
            continue

        hp_output = hp_object.predict(face_image)

        left_eye_image, right_eye_image, eye_coord = fl_object.predict(
            face_image)

        mouse_coordinate, gaze_vector = ge_object.predict(
            left_eye_image, right_eye_image, hp_output)

        if len(oneneneflags) != 0:
            preview_window = frame.copy()
            if 'fd' in oneneneflags:
                if len(oneneneflags) != 1:
                    preview_window = face_image
                else:
                    cv2.rectangle(preview_window,
                                  (face_coordinates[0], face_coordinates[1]),
                                  (face_coordinates[2], face_coordinates[3]),
                                  (0, 150, 0), 3)
            if 'fl' in oneneneflags:
                if not 'fd' in oneneneflags:
                    preview_window = face_image.copy()
                cv2.rectangle(preview_window,
                              (eye_coord[0][0], eye_coord[0][1]),
                              (eye_coord[0][2], eye_coord[0][3]),
                              (150, 0, 150))
                cv2.rectangle(preview_window,
                              (eye_coord[1][0], eye_coord[1][1]),
                              (eye_coord[1][2], eye_coord[1][3]),
                              (150, 0, 150))
            if 'hp' in oneneneflags:
                cv2.putText(
                    preview_window,
                    "yaw:{:.1f} | pitch:{:.1f} | roll:{:.1f}".format(
                        hp_output[0], hp_output[1], hp_output[2]), (20, 20),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.35, (0, 0, 0), 1)
            if 'ge' in oneneneflags:

                yaw = hp_output[0]
                pitch = hp_output[1]
                roll = hp_output[2]
                focal_length = 950.0
                scale = 50
                center_of_face = (face_image.shape[1] / 2,
                                  face_image.shape[0] / 2, 0)
                if 'fd' in oneneneflags or 'fl' in oneneneflags:
                    draw_axes(preview_window, center_of_face, yaw, pitch, roll,
                              scale, focal_length)
                else:
                    draw_axes(frame, center_of_face, yaw, pitch, roll, scale,
                              focal_length)

        if len(oneneneflags) != 0:
            img_hor = np.hstack((cv2.resize(frame, (500, 500)),
                                 cv2.resize(preview_window, (500, 500))))
        else:
            img_hor = cv2.resize(frame, (500, 500))

        cv2.imshow('Visualization', img_hor)
        mouse_controller_object.move(mouse_coordinate[0], mouse_coordinate[1])

        if pressed_key == 27:
            log_object.error("exit key is pressed..")
            break

    infer_time = round(time.time() - infer_start, 1)
    fps = counter / infer_time
    log_object.error("processed {} frames".format(counter))
    log_object.error("total inference time {} seconds".format(infer_time))
    log_object.error("fps {} frame/second".format(fps))
    log_object.error("Video session has ended")

    with open(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'stats.txt'), 'w') as f:
        f.write(str(infer_time) + '\n')
        f.write(str(fps) + '\n')
        f.write(str(total_time) + '\n')

    input_feeder.close()
    cv2.destroyAllWindows()
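
# `draw_axes` is called in Example #5 but not shown. A minimal sketch that
# draws the head-pose axes from yaw/pitch/roll in degrees, modelled on the
# usual OpenVINO demo helper (an assumption, not the original code; it
# expects `math`, `numpy as np`, and `cv2` to be imported):
def draw_axes(frame, center_of_face, yaw, pitch, roll, scale, focal_length):
    yaw, pitch, roll = [a * math.pi / 180.0 for a in (yaw, pitch, roll)]
    cx, cy = int(center_of_face[0]), int(center_of_face[1])
    Rx = np.array([[1, 0, 0],
                   [0, math.cos(pitch), -math.sin(pitch)],
                   [0, math.sin(pitch), math.cos(pitch)]])
    Ry = np.array([[math.cos(yaw), 0, -math.sin(yaw)],
                   [0, 1, 0],
                   [math.sin(yaw), 0, math.cos(yaw)]])
    Rz = np.array([[math.cos(roll), -math.sin(roll), 0],
                   [math.sin(roll), math.cos(roll), 0],
                   [0, 0, 1]])
    R = Rz @ Ry @ Rx
    # rotate and scale the three unit axes (x: red, y: green, z: blue)
    axes = scale * R @ np.array([[1.0, 0.0, 0.0],
                                 [0.0, -1.0, 0.0],
                                 [0.0, 0.0, -1.0]]).T
    for axis, color in zip(axes.T, [(0, 0, 255), (0, 255, 0), (255, 0, 0)]):
        # simple perspective projection around the face center
        x = int(axis[0] * focal_length / (axis[2] + focal_length) + cx)
        y = int(axis[1] * focal_length / (axis[2] + focal_length) + cy)
        cv2.line(frame, (cx, cy), (x, y), color, 2)
    return frame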
Example #6
def main():

    # Grab command line args
    args = build_argparser().parse_args()

    inputFilePath = args.input
    inputFeeder = None

    if args.input == "CAM":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(args.input):
            log.info("Unable to find specified video file")
            sys.exit(1)
        inputFeeder = InputFeeder("video", args.input)

    modelPathDict = {
        'FaceDetectionModel': args.face_detection_model,
        'FacialLandmarksDetectionModel': args.facial_landmark_model,
        'GazeEstimationModel': args.gaze_estimation_model,
        'HeadPoseEstimationModel': args.head_pose_model
    }

    for fileNameKey in modelPathDict.keys():
        if not os.path.isfile(modelPathDict[fileNameKey]):
            log.info("Unable to find specified " + fileNameKey + " xml file")
            sys.exit(1)

    fdm = FaceDetectionModel(modelPathDict['FaceDetectionModel'], args.device,
                             args.cpu_extension)
    fldm = FacialLandmarksDetectionModel(
        modelPathDict['FacialLandmarksDetectionModel'], args.device,
        args.cpu_extension)
    gem = GazeEstimationModel(modelPathDict['GazeEstimationModel'],
                              args.device, args.cpu_extension)
    hpem = HeadPoseEstimationModel(modelPathDict['HeadPoseEstimationModel'],
                                   args.device, args.cpu_extension)

    mc = MouseController('medium', 'fast')

    inputFeeder.load_data()
    fdm.load_model()
    fldm.load_model()
    hpem.load_model()
    gem.load_model()

    frame_count = 0
    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))

        key = cv2.waitKey(60)
        croppedFace, face_coords = fdm.predict(frame.copy(),
                                               args.prob_threshold)
        if type(croppedFace) == int:
            log.info("Unable to detect the face.")
            if key == 27:
                break
            continue

        hp_out = hpem.predict(croppedFace.copy())

        left_eye, right_eye, eye_coords = fldm.predict(croppedFace.copy())

        new_mouse_coord, gaze_vector = gem.predict(left_eye, right_eye, hp_out)
        '''
        if (not len(previewFlags)==0):
            preview_frame = frame.copy()
            if 'fd' in previewFlags:
                #cv2.rectangle(preview_frame, (face_coords[0], face_coords[1]), (face_coords[2], face_coords[3]), (255,0,0), 3)
                preview_frame = croppedFace
            if 'fld' in previewFlags:
                cv2.rectangle(croppedFace, (eye_coords[0][0]-10, eye_coords[0][1]-10), (eye_coords[0][2]+10, eye_coords[0][3]+10), (0,255,0), 3)
                cv2.rectangle(croppedFace, (eye_coords[1][0]-10, eye_coords[1][1]-10), (eye_coords[1][2]+10, eye_coords[1][3]+10), (0,255,0), 3)
                #preview_frame[face_coords[1]:face_coords[3], face_coords[0]:face_coords[2]] = croppedFace
                
            if 'hp' in previewFlags:
                cv2.putText(preview_frame, "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".format(hp_out[0],hp_out[1],hp_out[2]), (10, 20), cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1)
            if 'ge' in previewFlags:
                x, y, w = int(gaze_vector[0]*12), int(gaze_vector[1]*12), 160
                le =cv2.line(left_eye.copy(), (x-w, y-w), (x+w, y+w), (255,0,255), 2)
                cv2.line(le, (x-w, y+w), (x+w, y-w), (255,0,255), 2)
                re = cv2.line(right_eye.copy(), (x-w, y-w), (x+w, y+w), (255,0,255), 2)
                cv2.line(re, (x-w, y+w), (x+w, y-w), (255,0,255), 2)
                croppedFace[eye_coords[0][1]:eye_coords[0][3],eye_coords[0][0]:eye_coords[0][2]] = le
                croppedFace[eye_coords[1][1]:eye_coords[1][3],eye_coords[1][0]:eye_coords[1][2]] = re
                #preview_frame[face_coords[1]:face_coords[3], face_coords[0]:face_coords[2]] = croppedFace
                
            cv2.imshow("visualization",cv2.resize(preview_frame,(500,500)))
        '''
        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])
        if key == 27:
            break
    log.info("VideoStream ended...")
    cv2.destroyAllWindows()
    inputFeeder.close()
Example #7
def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.
    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """
    try:
        logging.basicConfig(level=logging.DEBUG,
                            format="%(asctime)s [%(levelname)s] %(message)s",
                            handlers=[
                                logging.FileHandler("debug.log"),
                                logging.StreamHandler()
                            ])

        # Initialise the class
        mc = MouseController("low", "fast")
        fdnet = FaceDetectionModel(args.fdmodel)
        lmnet = FacialLandMarksDetectionModel(args.lmmodel)
        hpnet = HeadPoseEstimationModel(args.hpmodel)
        genet = GazeEstimationModel(args.gemodel)

        start_time = time.time()
        fdnet.load_model()
        logging.info(
            f"Face Detection Model: {1000 * (time.time() - start_time):.1f}ms")

        start_time = time.time()
        lmnet.load_model()
        logging.info(
            f"Facial Landmarks Detection Model: {1000 * (time.time() - start_time):.1f}ms"
        )

        start_time = time.time()
        hpnet.load_model()
        logging.info(
            f"Headpose Estimation Model: {1000 * (time.time() - start_time):.1f}ms"
        )

        start_time = time.time()
        genet.load_model()
        logging.info(
            f"Gaze Estimation Model: {1000 * (time.time() - start_time):.1f}ms"
        )

        # Get and open video capture
        feeder = InputFeeder('video', args.input)
        feeder.load_data()

        frame_count = 0

        fd_infertime = 0
        lm_infertime = 0
        hp_infertime = 0
        ge_infertime = 0

        while True:
            # Read the next frame
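            # NOTE: next(feeder.next_batch()) builds a fresh generator each
            # pass; it still advances because InputFeeder's underlying capture
            # keeps its read position between calls.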
            try:
                frame = next(feeder.next_batch())
            except StopIteration:
                break

            key_pressed = cv2.waitKey(60)
            frame_count += 1

            # face detection
            p_frame = fdnet.preprocess_input(frame)
            start_time = time.time()
            fd_output = fdnet.predict(p_frame)
            fd_infertime += time.time() - start_time
            out_frame, bboxes = fdnet.preprocess_output(
                fd_output, frame, args.print)

            for bbox in bboxes:

                face = frame[bbox[1]:bbox[3], bbox[0]:bbox[2]]
                p_frame = lmnet.preprocess_input(face)

                start_time = time.time()
                lm_output = lmnet.predict(p_frame)
                lm_infertime += time.time() - start_time
                out_frame, left_eye_point, right_eye_point = lmnet.preprocess_output(
                    lm_output, bbox, out_frame, args.print)

                # get head pose estimation
                p_frame = hpnet.preprocess_input(face)
                start_time = time.time()
                hp_output = hpnet.predict(p_frame)
                hp_infertime += time.time() - start_time
                out_frame, headpose_angles = hpnet.preprocess_output(
                    hp_output, out_frame, face, bbox, args.print)

                # get gaze  estimation
                out_frame, left_eye, right_eye = genet.preprocess_input(
                    out_frame, face, left_eye_point, right_eye_point,
                    args.print)
                start_time = time.time()
                ge_output = genet.predict(left_eye, right_eye, headpose_angles)
                ge_infertime += time.time() - start_time
                out_frame, gaze_vector = genet.preprocess_output(
                    ge_output, out_frame, bbox, left_eye_point,
                    right_eye_point, args.print)

                if not args.no_video:
                    cv2.imshow('image', out_frame)

                if not args.no_move:
                    mc.move(gaze_vector[0], gaze_vector[1])

                break

            if key_pressed == 27:
                break

        if frame_count > 0:
            logging.info(
                f"Face Detection:{1000* fd_infertime/frame_count:.1f}ms")
            logging.info(
                f"Facial Landmarks Detection:{1000* lm_infertime/frame_count:.1f}ms"
            )
            logging.info(
                f"Headpose Estimation:{1000* hp_infertime/frame_count:.1f}ms")
            logging.info(
                f"Gaze Estimation:{1000* ge_infertime/frame_count:.1f}ms")

        feeder.close()
        cv2.destroyAllWindows()
    except Exception as ex:
        logging.exception(f"Error during inference:{str(ex)}")
Example #8
def main():
    args = build_argparser().parse_args()
    logger = logging.getLogger('main')
    logging.basicConfig(filename='example.log', level=logging.ERROR)
    init_model(args)
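    # `init_model` (not shown) is assumed to create the global model objects
    # (face_model, landmark_model, head_pose_model, gaze_model) and the
    # `mouse_controller` used further down.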

    # Initialize variables with the input arguments for easy access
    model_path_dict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'LandmarkRegressionModel': args.landmarkRegressionModel,
        'HeadPoseEstimationModel': args.headPoseEstimationModel,
        'GazeEstimationModel': args.gazeEstimationModel
    }

    preview_flags = args.previewFlags
    input_filename = args.input
    output_path = args.output_path
    prob_threshold = args.prob_threshold

    if input_filename.lower() == 'cam':
        feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            logger.error("Unable to find specified video file")
            exit(1)
        feeder = InputFeeder(input_type='video', input_file=input_filename)

    for model_path in list(model_path_dict.values()):
        if not os.path.isfile(model_path):
            logger.error("Unable to find specified model file" +
                         str(model_path))
            exit(1)

    feeder.load_data()
    w = int(feeder.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    h = int(feeder.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(feeder.cap.get(cv2.CAP_PROP_FPS))
    out_video = cv2.VideoWriter('output_video.mp4',
                                cv2.VideoWriter_fourcc(*'avc1'), fps, (w, h),
                                True)

    frame_count = 0
    for ret, frame in feeder.next_batch():
        if not ret:
            break
        frame_count += 1
        key = cv2.waitKey(60)

        try:
            cropped_image, face_cords = face_model.predict(
                frame, prob_threshold)

            if type(cropped_image) == int:
                print("Unable to detect the face")
                if key == 27:
                    break
                continue

            left_eye, right_eye, eye_cords = landmark_model.predict(
                cropped_image)
            pose_output = head_pose_model.predict(cropped_image)
            mouse_cord, gaze_vector = gaze_model.predict(
                left_eye, right_eye, pose_output)
        except Exception as e:
            print(str(e) + " for frame " + str(frame_count))
            continue

        image = cv2.resize(frame, (w, h))
        if not len(preview_flags) == 0:
            preview_frame = frame.copy()
            const = 10
            if 'ff' in preview_flags:
                if len(preview_flags) != 1:
                    preview_frame = cropped_image
                    cv2.rectangle(frame, (face_cords[0], face_cords[1]),
                                  (face_cords[2], face_cords[3]), (255, 0, 0),
                                  3)

            if 'fl' in preview_flags:
                cv2.rectangle(
                    cropped_image,
                    (eye_cords[0][0] - const, eye_cords[0][1] - const),
                    (eye_cords[0][2] + const, eye_cords[0][3] + const),
                    (0, 255, 0), 2)
                cv2.rectangle(
                    cropped_image,
                    (eye_cords[1][0] - const, eye_cords[1][1] - const),
                    (eye_cords[1][2] + const, eye_cords[1][3] + const),
                    (0, 255, 0), 2)

            if 'fh' in preview_flags:
                cv2.putText(
                    frame,
                    "Pose Angles: yaw= {:.2f} , pitch= {:.2f} , roll= {:.2f}".
                    format(pose_output[0], pose_output[1], pose_output[2]),
                    (20, 40), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 0, 255), 2)

            if 'fg' in preview_flags:
                x, y, w = int(gaze_vector[0] * 12), int(gaze_vector[1] *
                                                        12), 160
                le = cv2.line(left_eye.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 0, 255), 2)
                cv2.arrowedLine(le, (x - w, y + w), (x + w, y - w),
                                (255, 0, 255), 2)
                re = cv2.line(right_eye.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 0, 255), 2)
                cv2.arrowedLine(re, (x - w, y + w), (x + w, y - w),
                                (255, 0, 255), 2)
                preview_frame[eye_cords[0][1]:eye_cords[0][3],
                              eye_cords[0][0]:eye_cords[0][2]] = le
                preview_frame[eye_cords[1][1]:eye_cords[1][3],
                              eye_cords[1][0]:eye_cords[1][2]] = re
            image = np.hstack((cv2.resize(frame, (500, 500)),
                               cv2.resize(preview_frame, (500, 500))))

        cv2.imshow('preview', image)
        out_video.write(frame)

        if frame_count % 5 == 0:
            mouse_controller.move(mouse_cord[0], mouse_cord[1])

        if key == 27:
            break

    logger.info('Video stream ended')
    cv2.destroyAllWindows()
    feeder.close()
def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.
    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """
    # --- INPUT ---
    # Initialize the input_type
    input_type = None

    # Check if the input is a webcam
    if args.input == 'CAM':
        input_type = 'cam'

    # Check if the input is an image
    elif args.input.endswith(('.jpg', '.bmp', '.png')):
        input_type = 'image'

    # Check if the input is a video
    elif args.input.endswith(('.mp4', '.avi')):
        input_type = 'video'

    else:
        sys.exit(
            f"[ ERROR ] The format of the input file '{args.input}' is not supported."
        )

    # Initialize the InputFeeder
    input_feeder = InputFeeder(input_type, args.input)
    input_feeder.load_data()

    # --- MODELS ---
    # Load the Face Detection Model
    face_detection_model = FaceDetectionModel(
        model_xml_path=args.model_face_detection,
        device=args.device,
        extensions_path=args.cpu_extension,
    )

    face_detection_model.load_model()

    # Load the Head Pose Estimation Model
    head_pose_estimation_model = HeadPoseEstimationModel(
        model_xml_path=args.model_head_pose,
        device=args.device,
        extensions_path=args.cpu_extension,
    )

    head_pose_estimation_model.load_model()

    # Load the Facial Landmarks Detection Model
    facial_landmarks_detection_model = FacialLandmarksDetectionModel(
        model_xml_path=args.model_face_landmark,
        device=args.device,
        extensions_path=args.cpu_extension,
    )

    facial_landmarks_detection_model.load_model()

    # Load the Gaze Estimation Model
    gaze_estimation_model = GazeEstimationModel(
        model_xml_path=args.model_gaze_estimation,
        device=args.device,
        extensions_path=args.cpu_extension,
    )

    gaze_estimation_model.load_model()

    # --- POINTER CONTROLLER ---
    pointer_controller = MouseController(
        precision='medium',
        speed='medium',
    )

    # --- WINDOW ---
    # Set the window to fullscreen
    # cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
    # cv2.setWindowProperty(WINDOW_NAME, cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)

    # Initialize list to track the inference time
    list_inference_time = []

    #Loop until stream is over
    for frame in input_feeder.next_batch():
        # If there is no frame break the loop
        if frame is None:
            break

        # start the timer
        start_time = time.time()

        # Initialize the frame to be displayed
        display_frame = frame

        # --- DETECT HEAD ---
        # Detect the head on the frame
        list_heads = face_detection_model.predict(frame)

        # Draw the outputs of the head detection algorithm
        if args.display_outputs:
            display_frame = face_detection_model.display_output(
                frame, list_heads)

        # --- HEAD POSE ESTIMATION ---
        # Extract the roi of the head with the highest confidence score;
        # skip the frame if no head was detected
        if not list_heads:
            continue
        head = list_heads[0]
        head_x_max = head.x + head.w
        head_y_max = head.y + head.h

        head_roi = frame[head.y:head_y_max, head.x:head_x_max, :]

        # Estimate the pose of the best head
        head_angles = head_pose_estimation_model.predict(head_roi)

        # Draw the pose of the best head
        if args.display_outputs:
            display_head_pose = head_pose_estimation_model.display_output(
                head_roi, head_angles)
            display_frame[head.y:head_y_max,
                          head.x:head_x_max, :] = display_head_pose

        # --- FACIAL LANDMARKS DETECTION ---
        # Detect the facial landmarks on the head with the highest confidence score
        face_landmarks = facial_landmarks_detection_model.predict(head_roi)

        # Draw the facial landmarks of the best head
        if args.display_outputs:
            # Set display_name to True to display the name of the landmarks
            display_facial_landmarks = facial_landmarks_detection_model.display_output(
                display_head_pose, face_landmarks, display_name=True)
            display_frame[head.y:head_y_max,
                          head.x:head_x_max, :] = display_facial_landmarks

        # --- GAZE ESTIMATION ---
        # Calculate the eye ROI size
        eye_roi_size = int(head_roi.shape[1] / 3)

        # Extract the roi of the left eye
        left_eye_roi, left_eye_bbox = extract_landmark_roi(
            name='left_eye',
            landmarks=face_landmarks,
            roi_size=eye_roi_size,
            image=frame,
            origin_x=head.x,
            origin_y=head.y,
        )

        # Extract the roi of the right eye
        right_eye_roi, right_eye_bbox = extract_landmark_roi(
            name='right_eye',
            landmarks=face_landmarks,
            roi_size=eye_roi_size,
            image=frame,
            origin_x=head.x,
            origin_y=head.y,
        )

        # Predict the gaze
        gaze_vector = gaze_estimation_model.predict(
            left_eye_image=left_eye_roi,
            right_eye_image=right_eye_roi,
            head_angles=head_angles,
        )

        # normalize the gaze vector based on the left eye
        left_eye_x_center = left_eye_bbox.x + int(left_eye_bbox.w / 2)
        left_eye_y_center = left_eye_bbox.y + int(left_eye_bbox.h / 2)
        start_vector = np.array([left_eye_x_center, left_eye_y_center, 0])

        end_vector = np.array([
            left_eye_x_center + gaze_vector.x,
            left_eye_y_center - gaze_vector.y, 0 + gaze_vector.z
        ])

        vector = end_vector - start_vector
        norm_gaze_vector = vector / np.sqrt(np.dot(vector, vector))
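        # np.dot(vector, vector) is |v|**2, so dividing by its square root
        # yields a unit vector (np.linalg.norm(vector) would be equivalent).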

        # Draw the gaze output and the eyes ROI
        if args.display_outputs:
            # draw the bbox around each eyes
            display_frame = face_detection_model.display_output(
                display_frame,
                [left_eye_bbox, right_eye_bbox],
                color=(255, 255, 255),
                display_conf=False,
            )

            # draw the gaze from both eyes
            display_frame = gaze_estimation_model.display_output(
                display_frame,
                norm_gaze_vector,
                [left_eye_bbox, right_eye_bbox],
            )

        # Update position of the Computer Pointer
        if not args.disable_pointer_controller:
            pointer_controller.move(gaze_vector.x, gaze_vector.y)

        # Calculate the inference time
        stop_time = time.time()
        list_inference_time.append(stop_time - start_time)

        # Calculate and print the FPS
        fps = round(1 / (stop_time - start_time), 2)
        cv2.rectangle(display_frame, (10, 2), (120, 20), (255, 255, 255), -1)
        cv2.putText(display_frame, f"{fps} FPS", (15, 15),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))

        # Display the frame
        cv2.imshow(WINDOW_NAME, display_frame)

        # Wait for 'ESC' or 'q' to exit the program
        keyboard = cv2.waitKey(30)
        if keyboard == ord('q') or keyboard == 27:
            break

    # Release the input feeder
    input_feeder.close()

    # Destroy any OpenCV windows
    cv2.destroyAllWindows()

    # Display the average inference time and fps
    average_fps = round(1 / (mean(list_inference_time)), 2)
    print(
        f"[ INFO ] Average inference time was {mean(list_inference_time)}s ({average_fps} FPS)."
    )

    print(f"[ INFO ] Successfully exited the program.")
def main(args):
    logger = logging.getLogger()

    feeder = None
    if args.input_type == constants.VIDEO or args.input_type == constants.IMAGE:
        extension = str(args.input).split('.')[-1]
        # if not extension.lower() in constants.ALLOWED_EXTENSIONS:
        #     logger.error('Please provide supported extension.' + str(constants.ALLOWED_EXTENSIONS))
        #     exit(1)

        # if not os.path.isfile(args.input):
        #     logger.error("Unable to find specified video/image file")
        #     exit(1)

        feeder = InputFeeder(args.input_type, args.input)
    elif args.input_type == constants.IP_CAMERA:
        if not str(args.input).startswith('http://'):
            logger.error('Please provide ip of server with http://')
            exit(1)

        feeder = InputFeeder(args.input_type, args.input)
    elif args.input_type == constants.WEBCAM:
        feeder = InputFeeder(args.input_type)

    mc = MouseController("medium", "fast")

    feeder.load_data()

    face_model = Face_Model(args.face, args.device, args.cpu_extension)
    face_model.check_model()

    landmark_model = Landmark_Model(args.landmarks, args.device,
                                    args.cpu_extension)
    landmark_model.check_model()

    # gaze_model = Gaze_Estimation_Model(args.gazeestimation, args.device, args.cpu_extension)
    # gaze_model.check_model()

    head_model = Head_Pose_Model(args.headpose, args.device,
                                 args.cpu_extension)
    head_model.check_model()

    face_model.load_model()
    logger.info("Face Detection Model Loaded...")
    landmark_model.load_model()
    logger.info("Landmark Detection Model Loaded...")
    # gaze_model.load_model()
    # logger.info("Gaze Estimation Model Loaded...")
    head_model.load_model()
    logger.info("Head Pose Detection Model Loaded...")
    print('Loaded')

    try:
        frame_count = 0
        for ret, frame in feeder.next_batch():
            if not ret:
                break

            if frame is None:
                continue

            frame_count += 1
            crop_face = None
            if True:

                crop_face, box = face_model.predict(frame.copy())

                if crop_face is None:
                    logger.error("Unable to detect the face.")
                    continue
                imshow('frame', crop_face, width=400)
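                # `imshow` here is assumed to be a project helper wrapping
                # cv2.imshow that resizes the image to the given width.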

                (lefteye_x, lefteye_y), (
                    righteye_x, righteye_y
                ), eye_coords, left_eye, right_eye = landmark_model.predict(
                    crop_face.copy(), eye_surrounding_area=15)

                # imshow("left_eye", left_eye, width=100)
                # imshow("right_eye", right_eye, width=100)
                '''TODO: dlib would crop the eyes more precisely.'''

                head_position = head_model.predict(crop_face.copy())

                # NOTE: the block below intentionally skips gaze estimation
                # while `gaze_model` is commented out above; remove it once
                # the model is re-enabled.
                if True:
                    if cv2.waitKey(20) & 0xFF == ord('q'):
                        break
                    continue

                gaze, (mousex,
                       mousey) = gaze_model.predict(left_eye.copy(),
                                                    right_eye.copy(),
                                                    head_position)

                if (len(args.debug) > 0):
                    debuFrame = frame.copy()
                    if crop_face is None:
                        continue

                    thickness = 2
                    radius = 2
                    color = (0, 0, 255)
                    [[le_xmin, le_ymin, le_xmax, le_ymax],
                     [re_xmin, re_ymin, re_xmax, re_ymax]] = eye_coords

                    if 'face' in args.debug:
                        cv2.rectangle(debuFrame, (box[0], box[1]),
                                      (box[2], box[3]), (255, 255, 255), 2)

                        cv2.rectangle(crop_face, (re_xmin, re_ymin),
                                      (re_xmax, re_ymax), (100, 255, 100), 2)
                        cv2.rectangle(crop_face, (le_xmin, le_ymin),
                                      (le_xmax, le_ymax), (100, 255, 100), 2)
                    '''
                    LandMark
                    '''

                    cv2.circle(crop_face, (lefteye_x, lefteye_y), radius,
                               color, thickness)
                    cv2.circle(crop_face, (righteye_x, righteye_y), radius,
                               color, thickness)

                    debuFrame[box[1]:box[3], box[0]:box[2]] = crop_face

                    if 'headpose' in args.debug:
                        yaw = head_position[0]
                        pitch = head_position[1]
                        roll = head_position[2]

                        sinY = math.sin(yaw * math.pi / 180.0)
                        sinP = math.sin(pitch * math.pi / 180.0)
                        sinR = math.sin(roll * math.pi / 180.0)

                        cosY = math.cos(yaw * math.pi / 180.0)
                        cosP = math.cos(pitch * math.pi / 180.0)
                        cosR = math.cos(roll * math.pi / 180.0)

                        cH, cW = crop_face.shape[:2]
                        arrowLength = 0.4 * cH * cW

                        xCenter = int(cW / 2)
                        yCenter = int(cH / 2)

                        # center to right
                        # cv2.line(crop_face, (xCenter, yCenter),
                        #          (int((xCenter + arrowLength * (cosR * cosY + sinY * sinP * sinR))),
                        #           int((yCenter + arrowLength * cosP * sinR))), (186, 204, 2), 1)
                        #
                        #             # center to top
                        #             cv2.line(crop_face, (xCenter, yCenter),
                        #                      (int(((xCenter + arrowLength * (cosR * sinY * sinP + cosY * sinR)))),
                        #                       int((yCenter - arrowLength * cosP * cosR))), (186, 204, 2), 1)
                        #
                        #             # center to forward
                        #             cv2.line(crop_face, (xCenter, yCenter),
                        #                      (int(((xCenter + arrowLength * sinY * cosP))),
                        #                       int((yCenter + arrowLength * sinP))), (186, 204, 2), 1)
                        #
                        cv2.putText(
                            crop_face,
                            'head pose: (y={:.2f}, p={:.2f}, r={:.2f})'.format(
                                yaw, pitch, roll), (0, 20),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.35, (255, 255, 255), 1)

                    if 'gaze' in args.debug:
                        cH, cW = crop_face.shape[:2]
                        arrowLength = 0.6 * cH

                        gazeArrowX = gaze[0] * arrowLength
                        gazeArrowY = -gaze[1] * arrowLength

                        debuFrame[box[1]:box[3], box[0]:box[2]] = crop_face

                        cv2.arrowedLine(crop_face, (lefteye_x, lefteye_y),
                                        (int(lefteye_x + gazeArrowX),
                                         int(lefteye_y + gazeArrowY)),
                                        (184, 113, 57), 2)
                        cv2.arrowedLine(crop_face, (righteye_x, righteye_y),
                                        (int(righteye_x + gazeArrowX),
                                         int(righteye_y + gazeArrowY)),
                                        (184, 113, 57), 2)

                        cv2.putText(crop_face,
                                    'gaze angles: h={}, v={}'.format("!", "2"),
                                    (0, 10), cv2.FONT_HERSHEY_SIMPLEX, 0.35,
                                    (255, 255, 255), 1)

                        debuFrame[box[1]:box[3], box[0]:box[2]] = crop_face

            #
            #             imshow("face", crop_face, width=400)
            #             cv2.moveWindow("face", 0, 0)
            #             imshow("debug", debuFrame, width=400)
            #             cv2.moveWindow("debug", cW * 2, cH)

            # try:
            #     if frame_count % 5 == 0:
            #         mc.move(mousex, mousey)
            # except Exception as err:
            #     logger.error("Moving cursor outside the PC not supported yet !!")

            # key = cv2.waitKey(60)
                    imshow('frame', debuFrame, width=1210)

            if cv2.waitKey(20) & 0xFF == ord('q'):
                break
    except Exception as err:
        logger.error(err)

    cv2.destroyAllWindows()
    feeder.close()
def main_benchmark(args):
    feed = InputFeeder(input_type=args.it, input_file=args.i)

    face_model = FaceDetectionModel(args.fm, args.d, args.c, float(args.p))
    start_time = time.time()
    face_model.load_model()
    face_load_model_time = time.time() - start_time

    landmarks_model = LandmarksDetectionModel(args.lm, args.d, args.c)
    start_time = time.time()
    landmarks_model.load_model()
    landmarks_model_time = time.time() - start_time

    headpose_model = HeadPoseDetectionModel(args.hpm, args.d, args.c)
    start_time = time.time()
    headpose_model.load_model()
    headpose_model_time = time.time() - start_time

    gaze_model = GazeEstimationModel(args.gem, args.d, args.c)
    start_time = time.time()
    gaze_model.load_model()
    gaze_model_time = time.time() - start_time

    feed.load_data()
    for batch in feed.next_batch():
        try:
            start_time = time.time()
            cropped_face, coords, face_time_prediction = face_model.predict(
                batch)
            cv2.rectangle(batch, (coords[0], coords[1]),
                          (coords[2], coords[3]), (255, 0, 0), 2)
            io_face_model_time = time.time() - start_time

            start_time = time.time()
            left_eye, right_eye, eyes_coords, landmarks_time_prediction = landmarks_model.predict(
                cropped_face)
            io_landmarks_model_time = time.time() - start_time

            start_time = time.time()
            head_pose_angles, headpose_time_prediction = headpose_model.predict(
                cropped_face)
            io_head_pose_model_time = time.time() - start_time

            start_time = time.time()
            x, y, z, gaze_time_prediction = gaze_model.predict(
                left_eye, right_eye, head_pose_angles, cropped_face,
                eyes_coords)
            io_gaze_model_time = time.time() - start_time

            print("Graphing loading time...")
            graph_loading_time(face_load_model_time, landmarks_model_time,
                               headpose_model_time, gaze_model_time, args.bm)
            print("Graphing io processing time...")
            graph_io_processing_time(io_face_model_time,
                                     io_landmarks_model_time,
                                     io_head_pose_model_time,
                                     io_gaze_model_time, args.bm)
            print("Graphing inference time...")
            graph_model_inference_time(face_time_prediction,
                                       landmarks_time_prediction,
                                       headpose_time_prediction,
                                       gaze_time_prediction, args.bm)
            print("Done")

            break

        except Exception:
            print("Frame without prediction. Error: ", sys.exc_info()[0])
            log.error(sys.exc_info()[0])
    feed.close()
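main_benchmark repeats the same time.time() bookkeeping for every stage. A small context manager (a hypothetical helper, not part of the original code) would keep that pattern in one place:

import time
from contextlib import contextmanager


@contextmanager
def timed(label, sink):
    # Record elapsed wall-clock seconds under `label` in the `sink` dict.
    start = time.time()
    try:
        yield
    finally:
        sink[label] = time.time() - start

# Usage sketch:
#   timings = {}
#   with timed('face_load', timings):
#       face_model.load_model()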
def main():
    # command line arguments
    args = build_argparser().parse_args()
    input_filename = args.input
    log_object = log.getLogger()
    visual_flags = args.visualization_flag
    print("Visual flags:",visual_flags)

    device_models=args.device
    print("deviceModels:",device_models)
    device_list = device_models.split(",")
    print("deviceList:",device_list)
    print("deviceFirst:",device_list[1])
    str_cam ="cam"
    output_path = args.output_path

    if input_filename.lower() == str_cam:
        input_feeder = InputFeeder(str_cam)
    else:
        if not os.path.isfile(input_filename):
            log_object.error("Error: Can not find the video o image file.")
            exit(1)
        input_feeder = InputFeeder("video", input_filename)


    
    obj_face_detection = Face_detection_model(model_name=args.face_detection_model, device=device_list[0],
                                              threshold=args.prob_threshold, extensions=args.cpu_extension)

    obj_facial_landmarks = Facial_landmarks_detection_model(model_name=args.facial_landmarks_model, device=device_list[1],
                                                            extensions=args.cpu_extension)

    obj_gaze_estimation = Gaze_estimation_model(model_name=args.gaze_estimation_model, device=device_list[2],
                                                extensions=args.cpu_extension)

    obj_head_pose_estimation = Head_pose_estimation_model(model_name=args.head_pose_model, device=device_list[3],
                                                          extensions=args.cpu_extension)

    mouse_controller_object = MouseController('medium', 'fast')

    start_time = time.time()
    obj_face_detection.load_model()
    model_load_time_face = time.time() - start_time
    
    start_landmark_time = time.time()
    obj_facial_landmarks.load_model()
    model_load_time_landmarks = time.time() - start_landmark_time

    start_headpose_time = time.time()
    obj_head_pose_estimation.load_model()
    model_load_time_headpose = time.time() - start_headpose_time

    start_gaze_time = time.time()
    obj_gaze_estimation.load_model()
    model_load_time_gaze = time.time() - start_gaze_time

    models_load_time = time.time() - start_time
    
    log_object.info("Info:Models loading time(face, landmark, gaze, head_pose): {:.3f} ms".format(models_load_time * 1000))
    
    input_feeder.load_data()
    
    counter = 0
    start_inference_time = time.time()
    
    log_object.info("Info:Start inferencing ")
    for ret,frame in input_feeder.next_batch():
        #print(flag)
        #print(frame)
        if not ret:
            break
        pressed_key = cv2.waitKey(60)
        counter = counter + 1
        print("counter:",counter)


        
        first_coords, image_change = obj_face_detection.predict(frame)
        inference_face_time = round(time.time() - start_inference_time, 1)

        print("Inference face time:", inference_face_time)

        # Skip the frame before running the remaining models if no face was found.
        if first_coords == 0:
            continue

        left_eye_img, right_eye_img, eye_coord = obj_facial_landmarks.predict(image_change)
        inference_landmark_time = round(time.time() - start_inference_time, 1)

        print("Inference landmark time:", inference_landmark_time)
            
        output_head_pose_estimation = obj_head_pose_estimation.predict(image_change)
        inference_head_time = round(time.time() - start_inference_time, 1)

        print("Inference inference_head_time:",inference_head_time)
        
        mouse_coordinate, gaze_vector = obj_gaze_estimation.predict(left_eye_img, right_eye_img,
                                                                             output_head_pose_estimation)

        inference_gaze_time = round(time.time() - start_inference_time, 1)

        print("Inference inference_gaze_time:",inference_gaze_time)

        frame_image = frame.copy()
        if len(visual_flags) != 0:
            preview = process_visual_flags(frame_image, visual_flags, frame, image_change,
                                           first_coords, eye_coord, output_head_pose_estimation)
        else:
            preview = frame_image

        fps_face = counter / inference_gaze_time
        color = (0, 255, 0)

        cv2.putText(frame_image, "Inference: = {:.2f}".format(inference_gaze_time), (20, 180),
                    cv2.FONT_HERSHEY_COMPLEX, 1, color, 2)
        cv2.putText(frame_image, "FPS: = {:.2f}".format(fps_face), (20, 220),
                    cv2.FONT_HERSHEY_COMPLEX, 1, color, 2)

        image_new = cv2.resize(preview, (700, 700))
        cv2.imshow('Visualization', image_new)
        mouse_controller_object.move(mouse_coordinate[0], mouse_coordinate[1])

        if pressed_key == 27:
            log_object.error("exit key is pressed..")
            break

    #Time calculations for every model.
    #inference_facefinal_time =    inference_face_time
    #inference_landmarkfinal_time    = inference_landmark_time - inference_face_time
    #inference_headfinal_time    = inference_head_time - inference_landmark_time
    #inference_gazefinal_time    = inference_gaze_time - inference_head_time

    inference_total_time = round(time.time() - start_inference_time, 1)
    print("Inference inference_total_time:",inference_total_time)
    
    #fps_face = int(counter) / inference_face_time
    #fps_landmark = int(counter) / inference_landmark_time
    #fps_head  = int(counter) / inference_head_time
    #fps_gaze  = int(counter) / inference_gaze_time
    fps_total = int(counter) / inference_total_time
    print("fps_total:",fps_total)
    with open(output_path+'statstotal.txt', 'w') as f:    
        f.write(str(inference_total_time) + '\n')
        f.write(str(fps_total) + '\n')
        f.write(str(models_load_time) + '\n')


    with open(output_path+'statsmodels.txt', 'w') as f:
       # f.write(str(inference_facefinal_time)+ ','+str(inference_landmarkfinal_time)+','+str(inference_headfinal_time)+','+str(inference_gazefinal_time)+ '\n')
       # f.write(str(fps_face)+ ','+str(fps_landmark)+','+str(fps_head)+','+str(fps_gaze)+ '\n')
       # f.write(str(model_load_time_face)+ ','+str(model_load_time_landmarks)+','+str(model_load_time_headpose)+','+str(model_load_time_gaze)+ '\n')
        f.write(str(model_load_time_face) + '\n')
        f.write(str(model_load_time_landmarks) + '\n')
        f.write(str(model_load_time_headpose) + '\n')
        f.write(str(model_load_time_gaze) + '\n')
        
    log_object.info("Info:Finishing Video")    
    input_feeder.close()
    cv2.destroyAllWindows()
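The two stats files above are written field by field; a compact equivalent (hypothetical helper, with a guard against a zero-length run) could look like this:

import os


def write_stats(output_path, total_time, frame_count, load_times):
    # `load_times` maps model name -> load time in seconds (assumed ordering).
    fps = frame_count / total_time if total_time > 0 else 0.0
    with open(os.path.join(output_path, 'statstotal.txt'), 'w') as f:
        f.write('{}\n{}\n{}\n'.format(total_time, fps, sum(load_times.values())))
    with open(os.path.join(output_path, 'statsmodels.txt'), 'w') as f:
        for load_time in load_times.values():
            f.write('{}\n'.format(load_time))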
Example #13
0
def main():
    args = build_argparser().parse_args()

    frameNum = 0
    inferenceTime = 0
    counter = 0

    # Initialize the Inference Engine
    fd = FaceDetection()
    ld = Facial_Landmarks_Detection()
    ge = gazeEstimation()
    hp = headPose()
    modelStart = time.time()
    # Load Models
    fd.loadModel(args.faceDetectionModel, args.device)
    ld.loadModel(args.faceLandmarkModel, args.device)
    ge.loadModel(args.gazeEstimationModel, args.device)
    hp.loadModel(args.headPoseModel, args.device)
    print("Model Load timing:", (time.time() - modelStart) * 1000, "ms")

    # Get the input feeder
    if args.input == "cam":
        feed = InputFeeder("cam")
    else:
        assert os.path.isfile(args.input), "Specified input file doesn't exist"
        feed = InputFeeder("video", args.input)
    feed.load_data()
    frameCount = 0
    # Mouse Controller precision and speed
    mc = MouseController('medium', 'fast')

    for frame in feed.next_batch():
        frameCount += 1

        if frame is not None:
            key = cv2.waitKey(60)

            inferenceStart = time.time()

            # make predictions
            detected_face, faceCoords = fd.predict(frame.copy(),
                                                   args.prob_threshold)
            hpOutput = hp.predict(detected_face.copy())
            leftEye, rightEye, eyeCoords = ld.predict(detected_face.copy())
            new_mouse_coord, gazeVector = ge.predict(leftEye, rightEye,
                                                     hpOutput)

            inferenceTime = time.time() - inferenceStart
            counter = counter + 1

            # Visualization
            preview = args.visualization
            if preview:
                preview_frame = frame.copy()
                faceFrame = detected_face.copy()

                drawFaceBoundingBox(preview_frame, faceCoords)
                displayHp(preview_frame, hpOutput, faceCoords)
                draw_landmarks(faceFrame, eyeCoords)
                draw_gaze(faceFrame, gazeVector, leftEye.copy(),
                          rightEye.copy(), eyeCoords)
            if preview:
                img = np.hstack((cv2.resize(preview_frame, (500, 500)),
                                 cv2.resize(faceFrame, (500, 500))))
            else:
                img = cv2.resize(frame, (500, 500))
            cv2.imshow('Visualization', img)

            # set speed
            if frameCount % 5 == 0:
                mc.move(new_mouse_coord[0], new_mouse_coord[1])

            print("Frame Number:", frameNum)
            print("Inference Time:", inferenceTime * 1000)

            frameNum += 1

            if key == 27:
                break
    feed.close()
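This example, like several others here, forwards the cursor only every fifth frame, since pyautogui's move call blocks briefly and would otherwise stall inference. The same throttle as a reusable wrapper (a sketch, assuming any controller exposing move(x, y)):

class ThrottledMouse:
    # Forward every `every`-th move() call to the wrapped controller.
    def __init__(self, controller, every=5):
        self.controller = controller
        self.every = every
        self._count = 0

    def move(self, x, y):
        self._count += 1
        if self._count % self.every == 0:
            self.controller.move(x, y)

# Usage sketch:
#   mc = ThrottledMouse(MouseController('medium', 'fast'), every=5)
#   mc.move(new_mouse_coord[0], new_mouse_coord[1])  # safe to call every frame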
Example #14
0
def main():
    args = build_argparser().parse_args()
    visual = args.visual_flag
    log = logging.getLogger()
    input_source = args.input_source
    try:
        video_path = args.input_path
    except Exception as e:
        video_path = None
    feed = None
    if input_source.lower() == 'cam':
        feed = InputFeeder('cam')
    elif input_source.lower() == 'video' and os.path.isfile(video_path):
        feed = InputFeeder('video', video_path)
    else:
        log.error('Wrong input feed. (check the video path).')
        exit(1)

    fd = Model_Face(args.face_detection_model, args.device, args.extension)
    hp = Model_HeadPose(args.head_pose_model, args.device, args.extension)
    fl = Model_Faciallandmark(args.facial_landmarks_model, args.device,
                              args.extension)
    ga = Model_Gaze(args.gaze_model, args.device, args.extension)
    ### You can specify the precision and speed values directly, OR use the presets:
    ##  'high' (100), 'low' (1000), 'medium', 'low-med'  - precision
    ##  'fast' (1), 'slow' (10), 'medium', 'slow-med'    - speed
    #     mouse = MouseController('low-med', 'slow-med')
    mouse = MouseController(500, 4)

    feed.load_data()
    # load models
    fd.load_model()
    hp.load_model()
    fl.load_model()
    ga.load_model()
    count = 0
    for ret, frame in feed.next_batch():
        if not ret:
            break
        count += 1
        if count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))
        key = cv2.waitKey(60)
        frame_cp = frame.copy()
        face, face_position = fd.predict(frame_cp, args.threshold)
        if type(face) == int:
            log.error("Prediction Error: Can't find a face.")
            if key == 27:
                break
            continue
        face_cp = face.copy()
        hp_output = hp.predict(face_cp)
        left_eye, right_eye, facial = fl.predict(face_cp)
        #         print('left',left_eye,'\n','right',right_eye,'\n')
        mouse_coord, gaze_vector = ga.predict(left_eye, right_eye, hp_output)

        if (not len(visual) == 0):
            visual_frame = frame.copy()
            ### Visual FLAGS
            # face detection
            if 'fd' in visual:
                visual_frame = face
            # Head pose
            if 'hp' in visual:
                cv2.putText(
                    visual_frame,
                    "Yaw: {:.2f} Pitch: {:.2f} Roll: {:.2f}".format(
                        hp_output[0], hp_output[1], hp_output[2]), (10, 20),
                    cv2.FONT_HERSHEY_COMPLEX, 0.3, (0, 255, 50), 1)
            # Facial landmarks
            if 'fl' in visual:
                cv2.rectangle(face, (facial[0][0] - 10, facial[0][1] - 10),
                              (facial[0][2] + 10, facial[0][3] + 10),
                              (255, 0, 0), 3)
                cv2.rectangle(face, (facial[1][0] - 10, facial[1][1] - 10),
                              (facial[1][2] + 10, facial[1][3] + 10),
                              (255, 0, 0), 3)
            # Gaze estimation
            if 'ga' in visual:
                x, y, w = int(gaze_vector[0] * 12), int(gaze_vector[1] *
                                                        12), 160
                le = cv2.line(left_eye.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 255, 0), 2)
                cv2.line(le, (x - w, y + w), (x + w, y - w), (255, 50, 150), 2)
                re = cv2.line(right_eye.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 255, 0), 2)
                cv2.line(re, (x - w, y + w), (x + w, y - w), (255, 50, 150), 2)
                face[facial[0][1]:facial[0][3], facial[0][0]:facial[0][2]] = le
                face[facial[1][1]:facial[1][3], facial[1][0]:facial[1][2]] = re
            cv2.namedWindow('Visualization', cv2.WINDOW_AUTOSIZE)
            cv2.moveWindow('Visualization', 900, 900)
            cv2.imshow('Visualization', cv2.resize(visual_frame, (500, 500)))


#             if count%10==0:
#                 cv2.imwrite(str(count)+'_visual.jpg',visual_frame)
        if count % 5 == 0:
            mouse.move(mouse_coord[0], mouse_coord[1])
        if key == 27:
            break

    log.info('Ended!')
    cv2.destroyAllWindows()
    feed.close()
class Pipeline:
    def __init__(self, args):
        self.log_level = "INFO" if os.environ.get(
            "LOGLEVEL") == "INFO" or args.verbose_stage else "WARNING"
        log.basicConfig(level=self.log_level)

        input_type = 'cam' if args.cam else 'video'
        self.feed = InputFeeder(input_type, args.video)
        if not self.feed.load_data():
            raise Exception('Provide a valid image or video file')

        fps, w, h = self.feed.get_props()
        self.out_video = cv2.VideoWriter(args.out,
                                         cv2.VideoWriter_fourcc(*'MJPG'), fps,
                                         (w, h), True)

        args.head_pose_model = os.path.join(
            args.head_pose_model, args.precision,
            os.path.basename(args.head_pose_model))
        args.landmarks_model = os.path.join(
            args.landmarks_model, args.precision,
            os.path.basename(args.landmarks_model))
        args.gaze_model = os.path.join(args.gaze_model, args.precision,
                                       os.path.basename(args.gaze_model))

        self.fd = FaceDetect(args.face_model, args.device, args.extension,
                             args.threshold)
        self.fd.load_model()
        self.fd.set_out_size(w, h)

        self.hp = HeadPoseEstimate(args.head_pose_model, args.device,
                                   args.extension, args.threshold)
        self.hp.load_model()

        self.fl = FacialLandMarkDetect(args.landmarks_model, args.device,
                                       args.extension, args.threshold)
        self.fl.load_model()

        self.gz = GazeEstimate(args.gaze_model, args.device, args.extension,
                               args.threshold)
        self.gz.load_model()

        self.mc = MouseController()
        self.verbose_stage = args.verbose_stage

    def get_bounding_rect(self, x, y):
        width, height = 40, 20
        x1, y1 = x - int(width / 2), y - int(height / 2)
        x2, y2 = x + int(width / 2), y + int(height / 2)
        return x1, y1, x2, y2

    def verbose_stage_draw(self, frame, face_coord, eye_coord,
                           head_pose_angles, mouse_coord):
        f_x1, f_y1, f_x2, f_y2 = face_coord
        self.fd.draw_rect(frame, (f_x1, f_y1), (f_x2, f_y2))

        e_x1, e_y1, e_x2, e_y2 = eye_coord
        left_x, left_y, right_x, right_y = self.get_bounding_rect(e_x1, e_y1)
        self.fl.draw_rect(frame, (f_x1 + left_x, f_y1 + left_y),
                          (f_x1 + right_x, f_y1 + right_y))
        left_x, left_y, right_x, right_y = self.get_bounding_rect(e_x2, e_y2)
        self.fl.draw_rect(frame, (f_x1 + left_x, f_y1 + left_y),
                          (f_x1 + right_x, f_y1 + right_y))

        text = "Yaw: {:+.0f}, Pitch: {:+.0f}, Roll: {:+.0f}".format(
            *head_pose_angles)
        self.hp.draw_text(frame, text, (100, 100))

        self.gz.draw_circle(frame, mouse_coord, 10)

    def run(self):
        abs_mouse_x = abs_mouse_y = 0
        for frame in self.feed.next_batch():
            f_x1, f_y1, f_x2, f_y2 = self.fd.predict(frame)

            face_frame = frame[f_y1:f_y2, f_x1:f_x2]

            if not face_frame.size:  # skip if face not detected
                continue

            head_pose_angles = self.hp.predict(face_frame)

            self.fl.set_out_size(f_x2 - f_x1, f_y2 - f_y1)
            e_x1, e_y1, e_x2, e_y2 = self.fl.predict(face_frame)

            left_x, left_y, right_x, right_y = self.get_bounding_rect(
                e_x1, e_y1)
            left_eye_frame = face_frame[left_y:right_y, left_x:right_x]
            left_x, left_y, right_x, right_y = self.get_bounding_rect(
                e_x2, e_y2)
            right_eye_frame = face_frame[left_y:right_y, left_x:right_x]

            if not left_eye_frame.size or not right_eye_frame.size:  # skip if eyes not detected
                continue

            g_x, g_y, _ = self.gz.predict(left_eye_frame, right_eye_frame,
                                          [[*head_pose_angles]])

            self.mc.move(g_x, g_y)

            if self.verbose_stage:
                _, w, h = self.feed.get_props()
                if abs_mouse_x == 0 and abs_mouse_y == 0:
                    abs_mouse_x = int(f_x1 + (e_x1 + e_x2) / 2)
                    abs_mouse_y = int(f_y1 + (e_y1 + e_y2) / 2)
                else:
                    abs_mouse_x += int(g_x * w / 250)
                    abs_mouse_y -= int(g_y * h / 250)

                self.verbose_stage_draw(frame, (f_x1, f_y1, f_x2, f_y2),
                                        (e_x1, e_y1, e_x2, e_y2),
                                        head_pose_angles,
                                        (abs_mouse_x, abs_mouse_y))

            self.out_video.write(frame)

    def close(self):
        self.feed.close()
        self.out_video.release()
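The Pipeline class above is shown without a driver; a hypothetical one, assuming a build_argparser like the other examples that supplies the attributes __init__ reads, would be:

def main():
    # Hypothetical driver; build_argparser is assumed to provide args.cam,
    # args.video, args.out, the four model paths, device, extension,
    # threshold, precision and verbose_stage.
    args = build_argparser().parse_args()
    pipeline = Pipeline(args)
    try:
        pipeline.run()
    finally:
        pipeline.close()


if __name__ == '__main__':
    main()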
def main(args):
    print("Main script running...")
    log_name = 'stats_' + args.device + '_' + args.hpe + args.fld + args.ge

    if not os.path.exists('output'):
        os.makedirs('output')
    print(f"Logging to: output/{log_name}")
    log = open('output/' + log_name, 'w+')

    print("Initializing models...")

    fd = FaceDetector(
        model_name=
        'models/intel/face-detection-adas-binary-0001/FP32-INT1/face-detection-adas-binary-0001',
        device=args.device,
        extensions=None)

    fd.load_model()

    if args.v: print(f"Face Detection Load Time: {fd.load_time}")

    hpe = HeadPoseEstimator(
        model_name=
        f'models/intel/head-pose-estimation-adas-0001/{args.hpe}/head-pose-estimation-adas-0001',
        device=args.device,
        extensions=None)
    hpe.load_model()

    if args.v: print(f"Head Pose Estimation Load Time: {hpe.load_time}")

    fld = FacialLandmarkDetector(
        model_name=
        f'models/intel/landmarks-regression-retail-0009/{args.fld}/landmarks-regression-retail-0009',
        device=args.device,
        extensions=None)
    fld.load_model()

    if args.v: print(f"Facial Landmarks Detection Load Time: {fld.load_time}")

    ge = GazeEstimator(
        model_name=
        f'models/intel/gaze-estimation-adas-0002/{args.ge}/gaze-estimation-adas-0002',
        device=args.device,
        extensions=None)
    ge.load_model()

    if args.v: print(f"Gaze Estimation Load Time: {ge.load_time}")

    image = False

    print("Initializing source feed...")
    feed = InputFeeder(input_type=args.input_type, input_file=args.input_file)
    if args.input_type == 'image':
        image = True

    feed.load_data()

    for batch in feed.next_batch():
        if args.v:
            print()
        cv2.imshow('Batch', batch)
        if image:
            cv2.imwrite('output/Batch.png', batch)

        coords, bounding_face = fd.predict(batch)
        if not coords:
            print("No face")
            continue
        if image: cv2.imwrite('output/Face.png', bounding_face)
        box = coords[0]
        face = bounding_face[box[1]:box[3], box[0]:box[2]]

        if args.v:
            print(f"Face Time: {fd.infer_time}")
        log.write("FD_infer: " + str(fd.infer_time) + "\n")
        if image:
            cv2.imshow('Cropped Face', face)

        # Landmark Detection
        coords, landmark_detection, landmark_points = fld.predict(face)
        if image: cv2.imwrite('output/Landmarks.png', landmark_detection)
        if image: cv2.imshow('Landmark Detection', landmark_detection)
        if args.v: print(f"Landmark Time: {fld.infer_time}")
        log.write("FLD_infer: " + str(fld.infer_time) + "\n")
        right_box, left_box = coords[0:2]
        if args.v: print(f"Eye Coords: {coords}")

        if left_box is None or right_box is None:
            print("No eyes")
            continue

        left_eye = face[left_box[1]:left_box[3], left_box[0]:left_box[2]]
        cv2.putText(face, 'L', (left_box[0], left_box[3]),
                    cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 2)

        right_eye = face[right_box[1]:right_box[3], right_box[0]:right_box[2]]
        cv2.putText(face, 'R', (right_box[0], right_box[3]),
                    cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 2)

        if args.v:
            print(f"Eye Shape: {left_eye.shape} :: {right_eye.shape}")

        #Head Pose Estimation
        head_yaw, head_pitch, head_roll = hpe.predict(face)
        if args.v: print(f"Head Pose Time: {hpe.infer_time}")
        log.write("HPE_infer: " + str(hpe.infer_time) + "\n")
        head_angles = [head_yaw[0][0], head_pitch[0][0], head_roll[0][0]]

        #Gaze Estimation
        # expects pose as  (yaw, pitch, and roll)
        gaze = ge.predict(left_eye, right_eye, head_angles)

        if args.v:
            print(f"Gaze Time: {ge.infer_time}")
        log.write("GE_infer: " + str(ge.infer_time) + "\n")
        gaze_point = (int(gaze[0][0] * 50), int(gaze[0][1] * 50))

        arrows = cv2.arrowedLine(face, landmark_points[0],
                                 (landmark_points[0][0] + gaze_point[0],
                                  landmark_points[0][1] - gaze_point[1]),
                                 (0, 0, 255), 2)
        arrows = cv2.arrowedLine(face, landmark_points[1],
                                 (landmark_points[1][0] + gaze_point[0],
                                  landmark_points[1][1] - gaze_point[1]),
                                 (0, 0, 255), 2)
        if image:
            cv2.imwrite('output/Gaze.png', arrows)

        if not image:
            mouse = MouseController(precision='medium', speed='medium')
            mouse.move(gaze[0][0], gaze[0][1])

        if image:
            cv2.imshow('Arrows', arrows)

        if image:
            log.write("FD_LoadTime: " + str(fd.load_time) + "\n")
            log.write("FD_PreprocessTime: " + str(fd.preprocess_input_time) +
                      "\n")
            log.write("FD_PostrocessTime: " + str(fd.preprocess_output_time) +
                      "\n")

            log.write("FLD_LoadTime: " + str(fld.load_time) + "\n")
            log.write("FLD_PreprocessTime: " + str(fld.preprocess_input_time) +
                      "\n")
            log.write("FLD_PostprocessTime: " +
                      str(fld.preprocess_output_time) + "\n")

            log.write("HPE_LoadTime: " + str(hpe.load_time) + "\n")
            log.write("HPE_PreprocessTime: " + str(hpe.preprocess_input_time) +
                      "\n")

            log.write("GE_LoadTime: " + str(ge.load_time) + "\n")
            log.write("GE_PreprocessTime: " + str(ge.preprocess_input_time) +
                      "\n")

            cv2.waitKey(0)
        else:
            if cv2.waitKey(15) & 0xFF == ord('q'):
                break

    feed.close()
    log.close()
    cv2.destroyAllWindows()
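The model paths above hard-code the models/intel layout with the precision folder in the middle. A small helper (hypothetical, not part of the original) that builds such a path from a model name and precision:

import os


def intel_model_path(name, precision='FP32', root='models/intel'):
    # e.g. models/intel/gaze-estimation-adas-0002/FP16/gaze-estimation-adas-0002
    return os.path.join(root, name, precision, name)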
Example #17
0
def infer_on_stream(args):
    models = None
    # Check selected precision model
    if any(p in args.precision for p in ("FP32", "FP16", "INT8")):
        models = select_precision(args.precision)

    # Get Input
    input_feeder = InputFeeder(args.input_type, args.input_file)
    input_feeder.load_data()

    # Load face detection model
    face = FaceDetection(model_name=models[0],
                         device=args.device,
                         extensions=args.cpu_extension)
    face.load_model()

    # Load head pose model
    head = HeadPoseEstimation(model_name=models[1],
                              device=args.device,
                              extensions=args.cpu_extension)
    head.load_model()

    # Load facial landmark model
    landmark = FacialLandmarkDetection(model_name=models[2],
                                       device=args.device,
                                       extensions=args.cpu_extension)
    landmark.load_model()

    # Load gaze estimation model
    gaze = GazeEstimation(model_name=models[3],
                          device=args.device,
                          extensions=args.cpu_extension)
    gaze.load_model()

    # Initalize mouse controller
    mouse = MouseController('high', 'fast')

    for frame in input_feeder.next_batch():
        # Stop when the feeder runs out of frames
        if frame is None:
            break

        # Estimate face region
        output_frame, cropped_face, box_coord = face.predict(frame)

        # Estimate head pose position
        head_pose = head.predict(cropped_face)
        head_pose = np.array(head_pose)

        # Estimate eyes landmark coordinates
        lr_eyes = landmark.predict(cropped_face)

        eyes = []

        # Calculate eye image region
        for coord in lr_eyes:
            x = int(coord[0] + box_coord[0])
            y = int(coord[1] + box_coord[1])
            cv2.circle(output_frame, (x, y), 5, (255, 0, 0), -1)

            eye_box, cropped_eye = eyes_crop(output_frame, x, y, 40)
            cv2.rectangle(output_frame, eye_box[0], eye_box[1], (255, 0, 0), 1)
            eyes.append(cropped_eye)

        # Estimate gaze direction
        gaze_coords = gaze.predict(eyes[0], eyes[1], head_pose)

        # Move the mouse cursor
        mouse.move(gaze_coords[0], gaze_coords[1])

        if "True" in args.visualize:
            cv2.imshow('Capture', output_frame)

            if cv2.waitKey(30) & 0xFF == ord('q'):
                break

    input_feeder.close()
    if "True" in args.visualize:
        cv2.destroyAllWindows()
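eyes_crop is called in the loop above but not defined in the snippet; a plausible implementation (an assumption: a square window of the given half-size, clamped to the frame bounds) is:

def eyes_crop(frame, x, y, half_size):
    # Clamp a square window of side 2*half_size around (x, y) to the frame.
    h, w = frame.shape[:2]
    x1, y1 = max(x - half_size, 0), max(y - half_size, 0)
    x2, y2 = min(x + half_size, w), min(y + half_size, h)
    return ((x1, y1), (x2, y2)), frame[y1:y2, x1:x2]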
Example #18
0
def main(args):
    # get all arguments
    model_face=args.model_face
    model_landmark=args.model_landmark
    model_pose=args.model_pose
    model_gaze=args.model_gaze
    device=args.device
    extensions=args.extensions
    video_file=args.video
    output_path=args.output_path
    face_confidence=args.threshold_face_detection
    precision=args.mouse_precision
    speed=args.mouse_speed
    show_frame=args.show_frame
    show_log=args.debug

    # set up logging
    if show_log:
        logging.basicConfig(format='%(asctime)s %(levelname)s:%(name)s:%(message)s', level=logging.DEBUG)
    else:
        logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.ERROR)

    logger = logging.getLogger('Main')

    # initialize models
    print('Initializing models')
    start = time.time()
    face_detector= ModelFaceDetection(model_name=model_face, device=device, extensions=extensions, threshold=face_confidence)
    face_detector.load_model()
    print('...Successfully loaded face detection model in {:.2f} ms'.format((time.time() - start) * 1000))
    start = time.time()
    landmark_detector= ModelLandmarksDetection(model_name=model_landmark)
    landmark_detector.load_model()
    print('...Successfully loaded landmarks detection model in {:.2f} ms'.format((time.time() - start) * 1000))
    start = time.time()
    pose_estimator=ModelHeadPoseEstimation(model_name=model_pose)
    pose_estimator.load_model()
    print('...Successfully loaded head pose estimation model in {:.2f} ms'.format((time.time() - start) * 1000))
    start = time.time()
    gaze_estimator=ModelGazeEstimation(model_name=model_gaze)
    gaze_estimator.load_model()
    print('...Successfully loaded gaze estimation model in {:.2f} ms'.format((time.time() - start) * 1000))

    # get input
    print('Getting input data')
    input_type = 'video'
    if video_file=='cam':
        input_type = 'cam'
        logger.info('Using camera')
    elif not support_video_format(video_file):
        print('Unsupported input format! Please use a video file or cam as input')
        exit(1)
    logger.info('Using video input from %s', video_file)

    feed=InputFeeder(input_type=input_type, input_file=video_file)
    feed.load_data()
    initial_w = int(feed.getCap().get(cv2.CAP_PROP_FRAME_WIDTH))
    initial_h = int(feed.getCap().get(cv2.CAP_PROP_FRAME_HEIGHT))
    video_len = int(feed.getCap().get(cv2.CAP_PROP_FRAME_COUNT))
    fps = int(feed.getCap().get(cv2.CAP_PROP_FPS))
    if output_path:
        out_video = cv2.VideoWriter(os.path.join(output_path, 'output_video.mp4'), cv2.VideoWriter_fourcc(*'avc1'), fps, (initial_w, initial_h), True)
    print('...Video size hxw= {}x{}'.format(initial_h, initial_w))

    # mouse controller
    print('Initializing mouse controller')
    if precision in ['high', 'low', 'medium'] and speed in ['fast', 'slow', 'medium']:
        center = (initial_w/2, initial_h/2)
        mouse_controller=MouseController(precision, speed, center)
    else:
        print('Please setup mouse precision and speed correctly!')
        exit(1)

    count = 0
    print('Looping through all the frame and doing inference')
    for batch in feed.next_batch():
        count = count + 1
        logger.info('Frame#{}'.format(count))
        logger.info('Detecting face')
        face, coord, image = face_detector.predict(batch)
        if face is None:
            print('...There might be no face or more than 1 face detected. Skip this frame')
            continue
        logger.info('Successfully detecting 1 face')
        logger.info('Estimating head pose')
        pose, image = pose_estimator.predict(face.copy(), image)
        logger.info('Detecting facial landmarks')
        eyes, eyes_center, image = landmark_detector.predict(face.copy(), coord, image)
        logger.info('Estimating gaze')
        gaze, image = gaze_estimator.predict(eyes[0], eyes[1], pose, eyes_center, image)
        logger.info('Gaze vector (x,y,z)= ({},{},{})'.format(gaze[0][0], gaze[0][1], gaze[0][2]))
        if output_path:
            logger.info('Writing output frame to file')
            out_video.write(image)
        if show_frame and (count % 5==0):
            # show intermediate result every 5 frames
            cv2.imshow('frame', image)
            # Press Q on keyboard to stop
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
        # TODO: comment out the following block to deactivate mouse movement
        # if you want to focus on the intermediate results instead.
        if count%10==0:
            # pyautogui.moveRel blocking 0.1s -> blocking inference -> move only every 10 frames
            mouse_controller.move(gaze[0][0], gaze[0][1])

    if output_path:
        print('Finished inference and successfully stored output to ', os.path.join(output_path, 'output_video.mp4'))
    else:
        print('Finished inference')

    running_time_report(face_detector.get_time(), landmark_detector.get_time(), pose_estimator.get_time(), gaze_estimator.get_time())

    print('Releasing resources')
    if output_path:
        out_video.release()
    feed.close()
    cv2.destroyAllWindows()
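support_video_format is referenced above without a definition; a plausible extension-based check (an assumption) might be:

import os


def support_video_format(path, extensions=('.mp4', '.avi', '.mkv', '.mov')):
    # Accept the input only if its extension looks like a video container.
    return os.path.splitext(path)[1].lower() in extensions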
def infer_on_stream(args):

    network_fd = Face_Detection(args.face_detection_model, args.device)
    network_hp = Head_Pose_Estimation(args.head_pose_model, args.device)
    network_fl = Facial_Landmarks_Detection(args.facial_landmarks_model,
                                            args.device)
    network_ge = Gaze_Estimation(args.gaze_estimation_model, args.device)

    mouse_cont = MouseController(args.mouse_precision, args.mouse_speed)

    starting_loading = time.time()

    network_fd.load_model()
    network_hp.load_model()
    network_fl.load_model()
    network_ge.load_model()

    duration_loading = time.time() - starting_loading

    input_type = handle_input(args.input)

    feed = InputFeeder(input_type=input_type, input_file=args.input)

    feed.load_data()

    starting_inference = time.time()

    for flag, frame in feed.next_batch():
        if not flag:
            break
        key_pressed = cv2.waitKey(60)

        out_frame, face, face_coords = network_fd.predict(
            frame, args.prob_threshold, args.display)

        if len(face_coords) == 0:
            log.error("There is no face in the stream!")
            continue

        out_frame, head_angle = network_hp.predict(out_frame, face,
                                                   face_coords, args.display)
        out_frame, eye_left, eye_right, eye_center = network_fl.predict(
            out_frame, face, face_coords, args.display)
        out_frame, gaze = network_ge.predict(out_frame, eye_left, eye_right,
                                             eye_center, head_angle,
                                             args.display)

        mouse_cont.move(gaze[0], gaze[1])

        if key_pressed == 27:
            break

        cv2.imshow('Visualization', cv2.resize(out_frame, (600, 400)))

    duration_inference = time.time() - starting_inference

    print("Total loading time is: {}\nTotal inference time is: {} ".format(
        duration_loading, duration_inference))

    feed.close()
    cv2.destroyAllWindows()
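handle_input is referenced above without a definition; a plausible mapping of the --input argument onto InputFeeder's input_type (an assumption, mirroring the 'cam'/'video'/'image' types used elsewhere in these examples):

def handle_input(path):
    # 'cam' selects the webcam; image extensions select single-image mode;
    # anything else is treated as a video file.
    if path.lower() == 'cam':
        return 'cam'
    if path.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp')):
        return 'image'
    return 'video'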
Example #20
0
def main():
    args = build_argparser().parse_args()
    logger = logging.getLogger('main')

    is_benchmarking = False
    # initialize variables with the input arguments for easy access
    model_path_dict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'LandmarkRegressionModel': args.landmarkRegressionModel,
        'HeadPoseEstimationModel': args.headPoseEstimationModel,
        'GazeEstimationModel': args.gazeEstimationModel
    }
    preview_flags = args.previewFlags
    input_filename = args.input
    device_name = args.device
    prob_threshold = args.prob_threshold
    output_path = args.output_path

    if input_filename.lower() == 'cam':
        feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            logger.error("Unable to find specified video file")
            exit(1)
        feeder = InputFeeder(input_type='video', input_file=input_filename)

    for model_path in list(model_path_dict.values()):
        if not os.path.isfile(model_path):
            logger.error("Unable to find specified model file" +
                         str(model_path))
            exit(1)

    # instantiate model
    face_detection_model = FaceDetectionModel(
        model_path_dict['FaceDetectionModel'],
        device_name,
        threshold=prob_threshold)
    landmark_detection_model = LandmarkDetectionModel(
        model_path_dict['LandmarkRegressionModel'],
        device_name,
        threshold=prob_threshold)
    head_pose_estimation_model = HeadPoseEstimationModel(
        model_path_dict['HeadPoseEstimationModel'],
        device_name,
        threshold=prob_threshold)
    gaze_estimation_model = GazeEstimationModel(
        model_path_dict['GazeEstimationModel'],
        device_name,
        threshold=prob_threshold)

    # load Models
    start_model_load_time = time.time()
    face_detection_model.load_model()
    landmark_detection_model.load_model()
    head_pose_estimation_model.load_model()
    gaze_estimation_model.load_model()
    total_model_load_time = time.time() - start_model_load_time

    feeder.load_data()

    out_video = cv2.VideoWriter(os.path.join('output_video.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'),
                                int(feeder.get_fps() / 10), (1920, 1080), True)

    frame_count = 0
    gaze_vectors = []
    start_inference_time = time.time()
    for ret, frame in feeder.next_batch():

        if not ret:
            break

        frame_count += 1

        key = cv2.waitKey(60)

        try:
            face_cords, cropped_image = face_detection_model.predict(frame)

            if type(cropped_image) == int:
                logger.warning("Unable to detect the face")
                if key == 27:
                    break
                continue

            left_eye_image, right_eye_image, eye_cords = landmark_detection_model.predict(
                cropped_image)
            pose_output = head_pose_estimation_model.predict(cropped_image)
            mouse_cord, gaze_vector = gaze_estimation_model.predict(
                left_eye_image, right_eye_image, pose_output)
            gaze_vectors.append(gaze_vector)

        except Exception as e:
            logger.warning("Could predict using model" + str(e) +
                           " for frame " + str(frame_count))
            continue

        image = cv2.resize(frame, (500, 500))

        if not len(preview_flags) == 0:
            preview_frame = draw_preview(frame, preview_flags, cropped_image,
                                         left_eye_image, right_eye_image,
                                         face_cords, eye_cords, pose_output,
                                         gaze_vector)
            image = np.hstack((cv2.resize(frame, (500, 500)),
                               cv2.resize(preview_frame, (500, 500))))

        cv2.imshow('preview', image)
        out_video.write(frame)

        if key == 27:
            break

    total_time = time.time() - start_inference_time
    total_inference_time = round(total_time, 1)
    fps = frame_count / total_inference_time

    gaze_df = pd.DataFrame(gaze_vectors,
                           columns=['vector_x', 'vector_y', 'vector_z'])
    gaze_df.to_csv("gaze_vectors_excercise_video.csv", index=False)
    logger.info('Model load time: ' + str(total_model_load_time))
    logger.info('Inference time: ' + str(total_inference_time))
    logger.info('FPS: ' + str(fps))

    logger.info('Video stream ended')
    cv2.destroyAllWindows()
    feeder.close()
    """
Example #21
0
def main():
    #Building the arguments
    args = build_parser().parse_args()
    previewFlag = args.previewFlags

    log = logging.getLogger()
    input_path = args.input
    inputFeed = None

    if input_path.lower() == 'cam':
        inputFeed = InputFeeder('cam')
    else:
        if not os.path.isfile(input_path):
            log.error("Unable to find the input file specified.")
            exit(1)
        inputFeed = InputFeeder('video', input_path)

    #Creating Model paths
    model_path = {
        'FaceDetectionModel': args.facedetectionmodel,
        'FacialLandmarksDetectionModel': args.faciallandmarkmodel,
        'GazeEstimationModel': args.gazeestimationmodel,
        'HeadPoseEstimationModel': args.headposemodel
    }

    for fnameKey in model_path.keys():
        if not os.path.isfile(model_path[fnameKey]):
            log.error('Unable to find the specified ' + fnameKey +
                      ' binary file (.xml)')
            exit(1)

    #Creating Model Instances
    fd = FaceDetection(model_path['FaceDetectionModel'], args.device,
                       args.cpu_extension)
    flm = FacialLandmarkDetection(model_path['FacialLandmarksDetectionModel'],
                                  args.device, args.cpu_extension)
    gm = GazeEstimation(model_path['GazeEstimationModel'], args.device,
                        args.cpu_extension)
    hpe = Head_Pose_estimation(model_path['HeadPoseEstimationModel'],
                               args.device, args.cpu_extension)

    m_control = MouseController('medium', 'fast')

    #Loading data
    inputFeed.load_data()
    fd.load_model()
    flm.load_model()
    hpe.load_model()
    gm.load_model()

    frame_count = 0
    for ret, frame in inputFeed.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 10 == 0:
            cv2.imshow('Original Video', cv2.resize(frame, (500, 500)))

        key = cv2.waitKey(60)
        coords, img = fd.predict(frame, args.prob_threshold)
        if type(img) == int:
            log.error("No face detected")
            if key == 27:
                break
            continue

        hpout = hpe.predict(img)
        left_eye, right_eye, eye_coord = flm.predict(img)
        mouse_coord, gaze_vec = gm.predict(left_eye, right_eye, hpout)

        if (not len(previewFlag) == 0):
            preview_img = img
            if 'fd' in previewFlag:
                preview_img = img
            if 'fld' in previewFlag:
                start_l = (eye_coord[0][0] - 10, eye_coord[0][1] - 10)
                end_l = (eye_coord[0][2] + 10, eye_coord[0][3] + 10)
                start_r = (eye_coord[1][0] - 10, eye_coord[1][1] - 10)
                end_r = (eye_coord[1][2] + 10, eye_coord[1][3] + 10)
                cv2.rectangle(img, start_l, end_l, (0, 255, 0), 2)
                cv2.rectangle(img, start_r, end_r, (0, 255, 0), 2)
            if 'hp' in previewFlag:
                cv2.putText(
                    preview_img,
                    "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".
                    format(hpout[0], hpout[1], hpout[2]), (10, 20),
                    cv2.FONT_HERSHEY_COMPLEX, 0.25, (255, 255, 255), 1)
            if 'ge' in previewFlag:
                x, y, w = int(gaze_vec[0] * 12), int(gaze_vec[1] * 12), 160
                lefteye = cv2.line(left_eye, (x - w, y - w), (x + w, y + w),
                                   (100, 0, 255), 1)
                cv2.line(lefteye, (x - w, y + w), (x + w, y - w),
                         (100, 0, 255), 1)
                righteye = cv2.line(right_eye, (x - w, y - w), (x + w, y + w),
                                    (100, 0, 255), 1)
                cv2.line(righteye, (x - w, y + w), (x + w, y - w),
                         (100, 0, 255), 1)
                img[eye_coord[0][1]:eye_coord[0][3],
                    eye_coord[0][0]:eye_coord[0][2]] = lefteye
                img[eye_coord[1][1]:eye_coord[1][3],
                    eye_coord[1][0]:eye_coord[1][2]] = righteye

            cv2.imshow("Detections", cv2.resize(preview_img, (500, 500)))
        if frame_count % 10 == 0:
            m_control.move(mouse_coord[0], mouse_coord[1])
        if key == 27:
            break
    log.error("Videostream Completed")
    cv2.destroyAllWindows()
    inputFeed.close()
Example #22
0
def main():

    args = build_argparser().parse_args()
    Flags_ = args.Flags

    logger = logging.getLogger()
    inputFilePath = args.input_model
    inputFeeder = None
    if inputFilePath.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(inputFilePath):
            logger.error("Unable to find specified video file")
            exit(1)
        inputFeeder = InputFeeder("video", inputFilePath)
    FDM, FLDM, GEM, HPEM = model_assigner(args, logger)

    mc = MouseController('medium', 'fast')

    inputFeeder.load_data()
    FDM.load_model()
    FLDM.load_model()
    HPEM.load_model()
    GEM.load_model()

    frame_count = 0
    logger.info(inputFeeder)
    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        frame_count += 1
        if frame_count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))

        key = cv2.waitKey(60)
        croppedFace, face_coords = FDM.predict(frame.copy(),
                                               args.prob_threshold)
        if type(croppedFace) == int:
            logger.error("Unable to detect the face.")
            if key == 27:
                break
            continue

        hp_out = HPEM.predict(croppedFace.copy())

        left_eye, right_eye, eye_coords = FLDM.predict(croppedFace.copy())

        new_mouse_coord, gaze_vector = GEM.predict(left_eye, right_eye, hp_out)

        if (not len(Flags_) == 0):
            preview_frame = frame.copy()
            if 'fd' in Flags_:
                preview_frame = croppedFace
            if 'fld' in Flags_:
                cv2.rectangle(croppedFace,
                              (eye_coords[0][0] - 10, eye_coords[0][1] - 10),
                              (eye_coords[0][2] + 10, eye_coords[0][3] + 10),
                              (0, 255, 0), 3)
                cv2.rectangle(croppedFace,
                              (eye_coords[1][0] - 10, eye_coords[1][1] - 10),
                              (eye_coords[1][2] + 10, eye_coords[1][3] + 10),
                              (0, 255, 0), 3)

            if 'hp' in Flags_:
                cv2.putText(
                    preview_frame,
                    "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".
                    format(hp_out[0], hp_out[1], hp_out[2]), (10, 20),
                    cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1)
            if 'ge' in Flags_:
                cv2.putText(
                    frame,
                    "Gaze Cords: x= {:.2f} , y= {:.2f} , z= {:.2f}".format(
                        gaze_vector[0], gaze_vector[1], gaze_vector[2]),
                    (20, 80), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 0), 2)
                x, y, w = int(gaze_vector[0] * 12), int(gaze_vector[1] *
                                                        12), 160
                le = cv2.line(left_eye.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 0, 255), 2)
                cv2.line(le, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)
                re = cv2.line(right_eye.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 0, 255), 2)
                cv2.line(re, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)
                croppedFace[eye_coords[0][1]:eye_coords[0][3],
                            eye_coords[0][0]:eye_coords[0][2]] = le
                croppedFace[eye_coords[1][1]:eye_coords[1][3],
                            eye_coords[1][0]:eye_coords[1][2]] = re

            cv2.imshow("visualization", cv2.resize(preview_frame, (500, 500)))

        if frame_count % 5 == 0:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])
        if key == 27:
            break
    logger.error("VideoStream ended...")
    cv2.destroyAllWindows()
    inputFeeder.close()
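The gaze crosshair drawn onto each eye crop recurs in several of these examples; factored into a helper, the pattern is (a sketch, keeping the original scale of 12 and half-width of 160):

import cv2


def draw_gaze_cross(eye_image, gaze_vector, scale=12, w=160,
                    color=(255, 0, 255)):
    # Draw an X centered on the scaled gaze offset inside a copy of the crop.
    x, y = int(gaze_vector[0] * scale), int(gaze_vector[1] * scale)
    out = eye_image.copy()
    cv2.line(out, (x - w, y - w), (x + w, y + w), color, 2)
    cv2.line(out, (x - w, y + w), (x + w, y - w), color, 2)
    return out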
Example #23
0
def main():

    args = build_argparser().parse_args()
    Flags = args.Flags

    logger = logging.getLogger()
    inputFilePath = args.input
    inputFeeder = None

    if inputFilePath.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(inputFilePath):
            logger.error("Unable to find video file")
            exit(1)
        inputFeeder = InputFeeder("video", inputFilePath)

    Dir = {
        'facedetection': args.facedetectionmodel,
        'facelandmarksdetection': args.faciallandmarkmodel,
        'Gaze': args.gazeestimationmodel,
        'head_pose': args.headposemodel
    }

    for fileKey in Dir.keys():
        if not os.path.isfile(Dir[fileKey]):
            logger.error("Unable to find  " + fileKey + " xml file")
            exit(1)

    Fd = facedetection(Dir['facedetection'], args.device, args.cpu_extension)
    Fl = facelandmarksdetection(Dir['facelandmarksdetection'], args.device,
                                args.cpu_extension)
    Ge = Gaze(Dir['Gaze'], args.device, args.cpu_extension)
    Hp = head_pose(Dir['head_pose'], args.device, args.cpu_extension)
    Mc = MouseController('medium', 'fast')

    #loading
    start_model_load_time = time.time()

    inputFeeder.load_data()
    Fd.load_model()
    Fl.load_model()
    Hp.load_model()
    Ge.load_model()

    total_model_load_time = time.time() - start_model_load_time

    count = 0
    start_inference_time = time.time()

    for ret, frame in inputFeeder.next_batch():
        if not ret:
            break
        count += 1

        if count % 5 == 0:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))

        key = cv2.waitKey(60)
        croppedFace, face_coords = Fd.predict(frame.copy(),
                                              args.prob_threshold)

        if type(croppedFace) == int:
            logger.error("unsupported layers, could not detect face")
            if key == 27:
                break
            continue

        hp_out = Hp.predict(croppedFace.copy())

        l_coords, r_coords, coords = Fl.predict(croppedFace.copy())

        new_coord, Gaze_vec = Ge.predict(l_coords, r_coords, hp_out)

        total_time = time.time() - start_inference_time
        total_inference_time = round(total_time, 1)

        fps = count / total_inference_time

        if (not len(Flags) == 0):
            new_frame = frame.copy()
            if 'fd' in Flags:
                new_frame = croppedFace

            if 'fl' in Flags:
                cv2.rectangle(croppedFace,
                              (coords[0][0] - 10, coords[0][1] - 10),
                              (coords[0][2] + 10, coords[0][3] + 10),
                              (0, 255, 0), 3)
                cv2.rectangle(croppedFace,
                              (coords[1][0] - 10, coords[1][1] - 10),
                              (coords[1][2] + 10, coords[1][3] + 10),
                              (0, 255, 0), 3)

            if 'hp' in Flags:
                cv2.putText(
                    new_frame,
                    "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".
                    format(hp_out[0], hp_out[1], hp_out[2]), (10, 20),
                    cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1)

            if 'ge' in Flags:
                x, y, w = int(Gaze_vec[0] * 12), int(Gaze_vec[1] * 12), 160
                le = cv2.line(l_coords.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 0, 255), 2)
                cv2.line(le, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)
                re = cv2.line(r_coords.copy(), (x - w, y - w), (x + w, y + w),
                              (255, 0, 255), 2)
                cv2.line(re, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)
                croppedFace[coords[0][1]:coords[0][3],
                            coords[0][0]:coords[0][2]] = le
                croppedFace[coords[1][1]:coords[1][3],
                            coords[1][0]:coords[1][2]] = re

            cv2.imshow("visualization", cv2.resize(new_frame, (500, 500)))

        if count % 5 == 0:
            Mc.move(new_coord[0], new_coord[1])
        if key == 27:
            break

    logger.error("Video Done...")
    print("Total inference time:", total_inference_time)
    print("FPS:", fps)
    print("Total model load time:", total_model_load_time)

    cv2.destroyAllWindows()
    inputFeeder.close()
def main():
    arg_parser = ArgParser()
    args = arg_parser.get_args()

    input_file = args.input

    # If input file defined then use it else use the webcam
    if input_file:
        if not os.path.isfile(input_file):
            log.error("Input file cannot be found")
            exit()
        input_feeder = InputFeeder("video", input_file)
    else:
        input_feeder = InputFeeder("cam")

    face_detection_model = FaceDetection(args.face_detection_model,
                                         args.device, args.extensions)
    face_detection_model.load_model()

    facial_landmarks_model = FacialLandmarksDetection(
        args.facial_landmark_detection_model, args.device, args.extensions)
    facial_landmarks_model.load_model()

    gaze_model = GazeEstimation(args.gaze_estimation_model, args.device,
                                args.extensions)
    gaze_model.load_model()

    head_pose_model = HeadPoseEstimation(args.head_pose_estimation_model,
                                         args.device, args.extensions)
    head_pose_model.load_model()

    mouse_controller = MouseController('medium', 'fast')

    input_feeder.load_data()

    frame_count = 0
    total_face_detection_inference_time = 0
    total_facial_landmark_inference_time = 0
    total_head_pose_inference_time = 0
    total_gaze_estimation_inference_time = 0
    total_inference_time = 0
    for ret, frame in input_feeder.next_batch():

        if not ret:
            log.error("ret variable not found")
            break

        frame_count += 1

        if frame_count % args.mouse_update_interval == 0:
            cv2.imshow('Input', frame)

        key_pressed = cv2.waitKey(60)

        # Run inference on the face detection model
        start_time = time.time()
        cropped_face, face_coordinates = face_detection_model.predict(
            frame.copy(), args.probability_threshold)
        finish_time = time.time()
        total_face_detection_inference_time += finish_time - start_time
        total_inference_time += finish_time - start_time

        # If no face detected get the next frame
        if len(face_coordinates) == 0:
            continue

        # Run inference on the facial landmark detection model
        start_time = time.time()
        results = facial_landmarks_model.predict(cropped_face.copy())
        finish_time = time.time()
        left_eye_coordinates = results[0]
        right_eye_coordinates = results[1]
        left_eye_image = results[2]
        right_eye_image = results[3]
        left_eye_crop_coordinates = results[4]
        right_eye_crop_coordinates = results[5]
        total_facial_landmark_inference_time += finish_time - start_time
        total_inference_time += finish_time - start_time

        # Run inference on the head pose estimation model
        start_time = time.time()
        head_pose = head_pose_model.predict(cropped_face.copy())
        finish_time = time.time()
        total_head_pose_inference_time += finish_time - start_time
        total_inference_time += finish_time - start_time

        # Run inference on the gaze estimation model
        start_time = time.time()
        new_mouse_x_coordinate, new_mouse_y_coordinate, gaze_vector = gaze_model.predict(
            left_eye_image, right_eye_image, head_pose)
        finish_time = time.time()
        total_gaze_estimation_inference_time += finish_time - start_time
        total_inference_time += finish_time - start_time

        if frame_count % args.mouse_update_interval == 0:
            log.info("Mouse controller new coordinates: x = {}, y = {}".format(
                new_mouse_x_coordinate, new_mouse_y_coordinate))
            mouse_controller.move(new_mouse_x_coordinate,
                                  new_mouse_y_coordinate)

            # Optional visualization configuration:
            if args.show_detected_face:
                showDetectedFace(frame, face_coordinates)

            if args.show_head_pose:
                showHeadPose(frame, head_pose)

            if args.show_facial_landmarks:
                showFacialLandmarks(cropped_face, left_eye_crop_coordinates,
                                    right_eye_crop_coordinates)

            if args.show_gaze_estimation:
                showGazeEstimation(frame, right_eye_coordinates,
                                   left_eye_coordinates, gaze_vector,
                                   cropped_face, face_coordinates)

        # Break if escape key pressed
        if key_pressed == 27:
            log.warning("Keyboard interrupt triggered")
            break

    # Release the capture and destroy any OpenCV windows
    cv2.destroyAllWindows()
    input_feeder.close()
    log.info("Average face detection inference time: {} seconds".format(
        total_face_detection_inference_time / frame_count))
    log.info(
        "Average facial landmark detection inference time: {} seconds".format(
            total_facial_landmark_inference_time / frame_count))
    log.info("Average head pose estimation inference time: {} seconds".format(
        total_head_pose_inference_time / frame_count))
    log.info("Average gaze estimation inference time: {} seconds".format(
        total_gaze_estimation_inference_time / frame_count))
    log.info("Average total inference time: {} seconds".format(
        total_inference_time / frame_count))
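Every example in this collection drives the pointer through a MouseController('medium', 'fast') helper. For reference, a minimal sketch of what that class typically looks like in these projects, assuming the pyautogui-based helper from the Udacity starter code (the exact precision/speed values are assumptions):

import pyautogui

class MouseController:
    def __init__(self, precision, speed):
        # map the string settings onto a pixel scale and a move duration
        precision_dict = {'high': 100, 'low': 1000, 'medium': 500}
        speed_dict = {'fast': 1, 'slow': 10, 'medium': 5}
        self.precision = precision_dict[precision]
        self.speed = speed_dict[speed]

    def move(self, x, y):
        # x and y come from the gaze vector; y is negated because screen
        # coordinates grow downwards
        pyautogui.moveRel(x * self.precision, -1 * y * self.precision,
                          duration=self.speed)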

def main():
    # Grab command line args
    args = build_args().parse_args()
    # Config Logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger()
    #os.system('clear')
    print("\n")
    logger.info("starting app ...")
    print("\n==========<COMPUTER POINTER CONTROLLER>==========")
    print("============>(c) Ibrahim Ishaka 2020<============\n")

    # initialize model object for each class
    FDModel = FaceDetectionModel(model=args.face_detection_model,
                                 device=args.device,
                                 extensions=args.extension,
                                 threshold=args.prob_threshold)
    FLDModel = FacialLandmarksDetectionModel(model=args.facial_landmark_model,
                                             device=args.device,
                                             extensions=args.extension)
    HPEModel = HeadPoseEstimationModel(model=args.head_pose_model,
                                       device=args.device,
                                       extensions=args.extension)
    GEModel = GazeEstimationModel(model=args.gaze_estimation_model,
                                  device=args.device,
                                  extensions=args.extension)

    models = {'fd': FDModel, 'fl': FLDModel, 'hp': HPEModel, 'ge': GEModel}

    models_loading_time = 0
    for k in models:
        # load model
        logger.info("Loading {} Model".format(models[k].model_name))
        model_loading_start = time.time()
        models[k].load_model()
        model_loading_finish = (time.time() - model_loading_start)
        models_loading_time = models_loading_time + model_loading_finish
        logger.info("time taken to load Model: {:.3f}secs".format(
            model_loading_finish))

        # check if model output visualization is specified in sh arg
        if k in args.show_output or args.show_output == 'all':
            models[k].show = True
        logger.info("show {} outputs: {} \n".format(models[k].model_name,
                                                    models[k].show))

    logger.info("time taken to load All Models: {:.3f}secs\n".format(
        models_loading_time))

    # setting for mouse controller
    _precision = "medium"
    _speed = "fast"
    mouse_controller = MouseController(precision=_precision, speed=_speed)

    # verify and handle input stream
    input_source = args.input
    input_feeder = None
    input_type = ""
    if input_source.lower() != "cam":
        # check if input file exist
        if os.path.exists(input_source) and os.path.isfile(input_source):
            image_formats = [".png", ".jpg", ".bmp", ".jpeg"]
            is_image = [
                True for x in image_formats if input_source.endswith(x)
            ]
            if is_image:
                input_type = "image"
            else:
                input_type = "video"
            input_feeder = InputFeeder(input_type=input_type,
                                       input_file=input_source)
        else:
            logger.error("Input file is not a file, or does't exist")
            sys.exit(1)
    elif input_source.lower() == "cam":
        input_type = "cam"
        input_feeder = InputFeeder(input_type=input_type)

    input_feeder.load_data()
    frame_count = 0
    total_inference_time_all = 0
    window_closed = False

    for flag, frame in input_feeder.next_batch():
        if flag is False:
            # no frame to read
            break
        frame_count = frame_count + 1
        key_pressed = cv2.waitKey(60)

        if input_source == 'cam':
            # preprocess frame as webcam is backwards/inverted
            frame = cv2.flip(frame, 1)

        face_detection_result = FDModel.predict(frame)
        # The prediction result should return None, if no face detected
        if face_detection_result is None:
            if not window_closed:
                cv2.imshow(input_type, cv2.resize(frame, (500, 500)))
            logger.info("NO FACE DETECTED... skipping")
            continue
        cropped_face = face_detection_result[0]
        face_coords = face_detection_result[1]
        hp_result = HPEModel.predict(cropped_face)
        left_eye, right_eye = FLDModel.predict(cropped_face)
        new_mouse_coords, gaze_vector = GEModel.predict(
            left_eye, right_eye, hp_result)

        total_inference_time = 0
        for key in models:
            total_inference_time = total_inference_time + models[
                key].inference_time
        total_inference_time_all = total_inference_time_all + total_inference_time

        #uncomment the following line to see the inference time for each frame
        #logger.info("Inference Time : {:.3f}".format(total_inference_time))

        try:
            x, y = new_mouse_coords
        except:
            logger.error(
                "unable to get mouse coordinates for current frame\nReading Next Frame..."
            )
            continue

        if GEModel.show:
            GEModel.show_gaze(left_eye, right_eye, gaze_vector)
        if HPEModel.show:
            frame = HPEModel.show_hp(frame, hp_result)

        if new_mouse_coords is None:
            # Error during LR_eyes processing
            continue
        '''
        wait before moving the mouse again;
        this is recommended to avoid the pyautogui fail-safe exception,
        but you can change this setting
        '''
        if input_type == "image":
            cv2.imshow(input_type, cv2.resize(frame, (500, 500)))
            mouse_controller.move(x, y)
            break

        if frame_count % 5 == 0:
            try:
                logger.info("changing mouse position... moving")
                mouse_controller.move(x, y)
            except pyautogui.FailSafeException:
                logger.error("safe exception From pyautogui")
                continue

        if not window_closed:
            cv2.imshow(input_type, cv2.resize(frame, (500, 500)))

        # Break if escape key pressed
        if key_pressed == 27:
            break

        # close the OpenCV window if q key pressed
        if key_pressed == ord('q'):
            window_closed = True
            cv2.destroyWindow(input_type)
            logger.info(input_type +
                        " window closed... to exit app, press CTRL+Z")

    if frame_count != 0:
        # Release the capture and destroy any OpenCV window
        input_feeder.close()
        cv2.destroyAllWindows()

        logger.info("Stream ended !")

        fps = round(frame_count / total_inference_time_all, 2)
        print("\n==========SUMMARY===========")
        print("models loading time  : ", round(models_loading_time, 2))
        print("frames per seconds   : ", fps)
        print("total inference time : ", round(total_inference_time_all, 2))
        print("============================")

    else:
        logger.error("Unable to handle Unsupported file ")
        sys.exit(1)
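All of these examples read frames through InputFeeder(input_type=..., input_file=...). A minimal sketch of that helper, assuming the OpenCV-based version from the Udacity starter code; note that some examples iterate it as `for flag, frame in ...` and others as `for batch in ...`, so the exact return shape of next_batch varies between projects:

import cv2

class InputFeeder:
    def __init__(self, input_type, input_file=None):
        self.input_type = input_type
        self.input_file = input_file

    def load_data(self):
        # 'cam' opens the default webcam; anything else is treated as a path
        source = 0 if self.input_type == 'cam' else self.input_file
        self.cap = cv2.VideoCapture(source)
        return self.cap

    def next_batch(self):
        # yield (flag, frame) pairs until the stream is exhausted
        while True:
            flag, frame = self.cap.read()
            yield flag, frame
            if not flag:
                break

    def close(self):
        self.cap.release()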
Example #26
def infer_on_stream(args):
    
    start_model_load_time = time.time()
    
    #initiate and load models
    face_det_net = Face_Detection_Model(args.face_model)
    face_det_net.load_model()
    head_pose_net = Head_Pose_Model(args.head_model)
    head_pose_net.load_model()
    facial_landmarks_net = Facial_Landmarks_Model(args.landmarks_model)
    facial_landmarks_net.load_model()
    gaze_est_net = Gaze_Estimation_Model(args.gaze_model)
    gaze_est_net.load_model()
    total_model_load_time = time.time() - start_model_load_time
    
    #initiate stream
    counter = 0
    start_inference_time = time.time()
    
    if args.input.lower()=="cam":
        frame_feeder = InputFeeder(input_type='cam')
        frame_feeder.load_data()
    else:
        frame_feeder = InputFeeder(input_type='video', input_file=args.input)
        frame_feeder.load_data()
    fps = frame_feeder.get_fps()
    log.info('Video started')
    
    #initiate mouse controller
    mouse_controller = MouseController('medium','fast')
    
    ## write output video in Windows
    out_video = cv2.VideoWriter('../output.mp4', cv2.VideoWriter_fourcc(*'avc1'),
                                fps, (frame_feeder.get_size()), True)
    
    ## write output video in Linux
    #out_video = cv2.VideoWriter('output.mp4',cv2.VideoWriter_fourcc(*'avc1'),
    #fps,(frame_feeder.get_size()))
    
    for flag, frame in frame_feeder.next_batch():
        if flag:
            key = cv2.waitKey(60)
            counter += 1
            coords, image, face = face_det_net.predict(frame)
            pose = head_pose_net.predict(face)
            land, left_eye_image, right_eye_image, eye_coords = facial_landmarks_net.predict(face)

            # skip frames where the eye crop is incomplete; otherwise
            # mouse_coords and gaze below could be unbound
            if left_eye_image.shape != (40, 40, 3):
                continue
            mouse_coords, gaze = gaze_est_net.predict(left_eye_image, right_eye_image, pose)

            mouse_controller.move(mouse_coords[0], mouse_coords[1])
            
            if args.visual.lower() == "yes":
                frame = draw_outputs(coords, eye_coords, pose, gaze,
                                     mouse_coords[0], mouse_coords[1],
                                     image)
                out_video.write(frame)
            cv2.imshow('video', frame)
            if key == 27:
                break 
        else:
            log.info('Video ended')
            total_time = time.time() - start_inference_time
            total_inference_time = round(total_time, 1)
            f_ps = counter / total_inference_time
            log.info("Models load time {:.2f}.".format(total_model_load_time))
            log.info("Total inference time {:.2f}.".format(total_inference_time))
            log.info("Inference frames per second {:.2f}.".format(f_ps))
            cv2.destroyAllWindows()
            frame_feeder.close()
            break
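The `left_eye_image.shape == (40, 40, 3)` check above guards the eye crops before they reach the gaze model. For context, gaze-estimation-adas-0002 takes three inputs: a 1x3x60x60 blob per eye plus the three head-pose angles, so predict() has to resize and reorder each crop. A hedged sketch of that preprocessing (the helper name is an assumption):

import cv2

def preprocess_eye(eye_image, size=(60, 60)):
    # resize the BGR crop, reorder HWC -> CHW, and add a batch dimension
    blob = cv2.resize(eye_image, size)
    blob = blob.transpose((2, 0, 1))
    return blob.reshape(1, *blob.shape)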
Example #27
def main():
    args = build_argparser().parse_args()
    logger = logging.getLogger('main')

    is_benchmarking = False
    total_score = 0

    # initialize variables with the input arguments for easy access
    model_path_dict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'LandmarkRegressionModel': args.landmarkRegressionModel,
        'HeadPoseEstimationModel': args.headPoseEstimationModel,
        'GazeEstimationModel': args.gazeEstimationModel
    }
    preview_flags = args.previewFlags
    input_filename = args.input
    device_name = args.device
    prob_threshold = args.prob_threshold
    output_path = args.output_path

    # add path for exercise video data
    exercise_video_path = '../bin/demo.mp4'
    exercise_gaze_path = '../bin/demo.csv'

    exercise_gaze_df = pd.read_csv(exercise_gaze_path)

    if input_filename.lower() == 'cam':
        feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            logger.error("Unable to find specified video file")
            exit(1)
        feeder = InputFeeder(input_type='video', input_file=input_filename)

    exercise_feeder = InputFeeder(input_type='video',
                                  input_file=exercise_video_path)

    for model_path in list(model_path_dict.values()):
        if not os.path.isfile(model_path):
            logger.error("Unable to find specified model file" +
                         str(model_path))
            exit(1)

    # instantiate model
    face_detection_model = FaceDetectionModel(
        model_path_dict['FaceDetectionModel'],
        device_name,
        threshold=prob_threshold)
    landmark_detection_model = LandmarkDetectionModel(
        model_path_dict['LandmarkRegressionModel'],
        device_name,
        threshold=prob_threshold)
    head_pose_estimation_model = HeadPoseEstimationModel(
        model_path_dict['HeadPoseEstimationModel'],
        device_name,
        threshold=prob_threshold)
    gaze_estimation_model = GazeEstimationModel(
        model_path_dict['GazeEstimationModel'],
        device_name,
        threshold=prob_threshold)

    # load Models
    start_model_load_time = time.time()
    face_detection_model.load_model()
    landmark_detection_model.load_model()
    head_pose_estimation_model.load_model()
    gaze_estimation_model.load_model()
    total_model_load_time = time.time() - start_model_load_time

    feeder.load_data()
    exercise_feeder.load_data()

    out_video = cv2.VideoWriter(os.path.join('output_video.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'),
                                int(feeder.get_fps() / 10), (1000, 500), True)

    frame_count = 0
    gaze_vectors = []
    start_inference_time = time.time()
    for ret, frame in feeder.next_batch():

        if not ret:
            break

        # flip the image to make it similar to the exercise video image
        frame = np.flip(frame, 1)
        ex_ret, ex_frame = next(exercise_feeder.next_batch())

        # This will stop the cam when exercise video is over
        if len(exercise_gaze_df) <= len(gaze_vectors):
            break

        frame_count += 1

        key = cv2.waitKey(60)

        try:
            face_cords, cropped_image = face_detection_model.predict(frame)

            if isinstance(cropped_image, int):
                logger.warning("Unable to detect the face")
                if key == 27:
                    break
                continue

            left_eye_image, right_eye_image, eye_cords = landmark_detection_model.predict(
                cropped_image)
            pose_output = head_pose_estimation_model.predict(cropped_image)
            mouse_cord, gaze_vector = gaze_estimation_model.predict(
                left_eye_image, right_eye_image, pose_output)
            gaze_vectors.append(gaze_vector)

        except Exception as e:
            logger.warning("Could predict using model" + str(e) +
                           " for frame " + str(frame_count))
            continue

        if not len(preview_flags) == 0:
            preview_frame = draw_preview(frame, 'ff', cropped_image,
                                         left_eye_image, right_eye_image,
                                         face_cords, eye_cords, pose_output,
                                         gaze_vector)
            cropped_image = np.hstack((cv2.resize(ex_frame, (500, 500)),
                                       cv2.resize(preview_frame, (500, 500))))

        instructor_gaze_vector = exercise_gaze_df.iloc[frame_count - 1].values
        score = cosine(instructor_gaze_vector, gaze_vector)
        if score > 0.1:
            total_score += 1

        # show score on output video
        cv2.putText(
            ex_frame,
            "Instructor Gaze Vector: {} ".format(instructor_gaze_vector),
            (40, 60), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 0), 2)
        cv2.putText(ex_frame, "User Gaze Vector: {}".format(gaze_vector),
                    (40, 100), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 0), 2)
        cv2.putText(ex_frame, "Gaze Match Score : {}".format(total_score),
                    (40, 145), cv2.FONT_HERSHEY_COMPLEX, 1.5, (0, 0, 0), 2)
        ex_frame = cv2.rectangle(ex_frame, (20, 20), (1200, 160), (0, 0, 0), 2)

        image = np.hstack(
            (cv2.resize(ex_frame,
                        (500, 500)), cv2.resize(cropped_image, (500, 500))))

        cv2.imshow('preview', image)
        out_video.write(image)

        # Break if escape key pressed
        if key == 27:
            break

    total_time = time.time() - start_inference_time
    total_inference_time = round(total_time, 1)
    fps = frame_count / total_inference_time

    if input_filename == "cam":
        filename = "cam.csv"
    else:
        filename = input_filename.split("/")[-1].split(".")[0] + ".csv"

    gaze_df = pd.DataFrame(gaze_vectors,
                           columns=['vector_x', 'vector_y', 'vector_z'])
    gaze_df.to_csv(filename, index=False)
    logger.info('Model load time: ' + str(total_model_load_time))
    logger.info('Inference time: ' + str(total_inference_time))
    logger.info('FPS: ' + str(fps))
    logger.info('Video stream ended')
    cv2.destroyAllWindows()
    feeder.close()
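The gaze-match score above relies on cosine(), presumably scipy.spatial.distance.cosine. Worth noting when reading the `score > 0.1` branch: that function returns the cosine distance (1 minus cosine similarity), so identical gaze directions score 0.0 and the counter increments on frames where user and instructor diverge. A small usage example with made-up vectors:

from scipy.spatial.distance import cosine

instructor = [0.20, -0.10, 0.97]
user = [0.25, -0.05, 0.96]
score = cosine(instructor, user)  # close to 0.0 -> directions agree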
    """
Example #28
def main():
    args = arg_parser().parse_args()
    input_file = args.input
    visual = args.visualization

    if input_file == "cam":
        input_feeder = InputFeeder("cam")
    
    elif input_file == "image":
        input_feeder = InputFeeder("image", input_file)

    elif not input_file:
        log.error("Input file not found")
        exit(1)
    
    else:
        input_feeder = InputFeeder("video", input_file)

    face_d = Face_Detector(
        "../models/intel/face-detection-adas-binary-0001/FP32-INT1/face-detection-adas-binary-0001.xml",
        "../models/intel/face-detection-adas-binary-0001/FP32-INT1/face-detection-adas-binary-0001.bin",
        args.device, args.extension)
    face_l = Face_Landmark_Detector(
        "../models/intel/landmarks-regression-retail-0009/FP32/landmarks-regression-retail-0009.xml",
        "../models/intel/landmarks-regression-retail-0009/FP32/landmarks-regression-retail-0009.bin",
        args.device, args.extension)
    gaze = Gaze_Estimator(
        "../models/intel/gaze-estimation-adas-0002/FP32/gaze-estimation-adas-0002.xml",
        "../models/intel/gaze-estimation-adas-0002/FP32/gaze-estimation-adas-0002.bin",
        args.device, args.extension)
    head = Head_Pose_Estimator(
        "../models/intel/head-pose-estimation-adas-0001/FP32/head-pose-estimation-adas-0001.xml",
        "../models/intel/head-pose-estimation-adas-0001/FP32/head-pose-estimation-adas-0001.bin",
        args.device, args.extension)

    mouse_control = MouseController('medium', 'fast')

    input_feeder.load_data()

    face_d.load_model()
    face_l.load_model()
    gaze.load_model()
    head.load_model()

    count = 0
    f_count = 0
    inf_time = 0

    for flag, frame in input_feeder.next_batch():
        if not flag:
            break

        if frame is not None:
            f_count += 1
            if f_count % 5 == 0:
                cv2.imshow('video', cv2.resize(frame, (500, 500)))
            
            key = cv2.waitKey(60)

            crop_face, face_coords = face_d.predict(frame, 0.5)
            if isinstance(crop_face, int):
                log.info("No face in frame")
                if key == 27:
                    break
                continue
            
            head_pose = head.predict(crop_face)
            le_eye, ri_eye, eye_coords = face_l.predict(crop_face)   
            new_mouse_coord, gaze_vector = gaze.predict(le_eye, ri_eye, head_pose)
            count = count + 1
            if len(visual) != 0:
                preview_window = frame.copy()
                
                if 'face' in visual:
                    if len(visual) != 1:
                        preview_window = crop_face
                    else:
                        cv2.rectangle(preview_window, (face_coords[0], face_coords[1]), (face_coords[2], face_coords[3]), (0, 150, 0), 3)

                if 'facel' in visual:
                    if 'face' not in visual:
                        preview_window = crop_face.copy()

                    cv2.rectangle(preview_window, (eye_coords[0][0] - 10, eye_coords[0][1] - 10), (eye_coords[0][2] + 10, eye_coords[0][3] + 10), (0,255,0), 3)
                    cv2.rectangle(preview_window, (eye_coords[1][0] - 10, eye_coords[1][1] - 10), (eye_coords[1][2] + 10, eye_coords[1][3] + 10), (0,255,0), 3)
                    
                if 'head' in visual:
                    cv2.putText(
                        preview_window, 
                        "Pose Angles: pitch:{:.2f} , roll:{:.2f} , yaw:{:.2f}".format(head_pose[0], head_pose[1], head_pose[2]), (50, 50), 
                        cv2.FONT_HERSHEY_COMPLEX, 
                        1, 
                        (0, 255, 0), 
                        1, 
                        cv2.LINE_AA
                    )

                if 'gaze' in visual:
                    if 'face' not in visual:
                        preview_window = crop_face.copy()

                    x, y, w = int(gaze_vector[0] * 12), int(gaze_vector[1] * 12), 160
                    
                    le = cv2.line(le_eye, (x - w, y - w), (x + w, y + w), (255, 0, 255), 2)
                    cv2.line(le, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)
                    
                    re = cv2.line(ri_eye, (x - w, y - w), (x + w, y + w), (255, 0, 255), 2)
                    cv2.line(re, (x - w, y + w), (x + w, y - w), (255, 0, 255), 2)
                    
                    preview_window[eye_coords[0][1]:eye_coords[0][3], eye_coords[0][0]:eye_coords[0][2]] = le
                    preview_window[eye_coords[1][1]:eye_coords[1][3], eye_coords[1][0]:eye_coords[1][2]] = re
            
            if len(visual) != 0:
                img_h = np.hstack((cv2.resize(frame, (500, 500)), cv2.resize(preview_window, (500, 500))))
            else:
                img_h = cv2.resize(frame, (500, 500))

            cv2.imshow('Visuals', img_h)

            if f_count % 5 == 0:
                mouse_control.move(new_mouse_coord[0], new_mouse_coord[1])
            
            if key == 27:
                break
    
    log.info("End of session.")
    cv2.destroyAllWindows()
    input_feeder.close()
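Example #28 checks `isinstance(crop_face, int)` to detect the no-face case, which implies Face_Detector.predict returns an int sentinel when nothing clears the threshold. A hedged sketch of that post-processing, assuming the standard [1, 1, N, 7] output layout of face-detection-adas-binary-0001, where each row is [image_id, label, conf, x_min, y_min, x_max, y_max] in normalized coordinates (the function name is an assumption):

import numpy as np

def extract_face(frame, detections, threshold=0.5):
    h, w = frame.shape[:2]
    for det in detections[0][0]:
        if det[2] >= threshold:
            # scale the normalized box to pixel coordinates
            x_min, y_min, x_max, y_max = (
                det[3:7] * np.array([w, h, w, h])).astype(int)
            return frame[y_min:y_max, x_min:x_max], [x_min, y_min, x_max, y_max]
    return 0, None  # sentinel checked by isinstance(crop_face, int)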
Example #29
class Computer_Pointer_Controller:
    def __init__(self, args):

        # load the objects corresponding to the models
        self.face_detection = Face_Detection(args.face_detection_model,
                                             args.device, args.extensions,
                                             args.perf_counts)
        self.gaze_estimation = Gaze_Estimation(args.gaze_estimation_model,
                                               args.device, args.extensions,
                                               args.perf_counts)
        self.head_pose_estimation = Head_Pose_Estimation(
            args.head_pose_estimation_model, args.device, args.extensions,
            args.perf_counts)
        self.facial_landmarks_detection = Facial_Landmarks_Detection(
            args.facial_landmarks_detection_model, args.device,
            args.extensions, args.perf_counts)

        start_models_load_time = time.time()
        self.face_detection.load_model()
        self.gaze_estimation.load_model()
        self.head_pose_estimation.load_model()
        self.facial_landmarks_detection.load_model()

        logger = logging.getLogger()
        # keep references for use in run(), where the local names from
        # __init__ are not in scope
        self.logger = logger
        self.args = args
        input_T = args.input_type
        input_F = args.input_file

        if input_T.lower() == 'cam':
            # open the video feed
            self.feed = InputFeeder(args.input_type, args.input_file)
            self.feed.load_data()
        else:
            if not os.path.isfile(input_F):
                logger.error('Unable to find specified video file')
                exit(1)
            file_extension = input_F.split(".")[-1]
            if (file_extension in ['jpg', 'jpeg', 'bmp']):
                self.feed = InputFeeder(args.input_type, args.input_file)
                self.feed.load_data()
            elif (file_extension in ['avi', 'mp4']):
                self.feed = InputFeeder(args.input_type, args.input_file)
                self.feed.load_data()
            else:
                logger.error(
                    "Unsupported file Extension. Allowed ['jpg', 'jpeg', 'bmp', 'avi', 'mp4']"
                )
                exit(1)

        print("Models total loading time :",
              time.time() - start_models_load_time)

        # init mouse controller
        self.mouse_controller = MouseController('low', 'fast')

    def run(self):
        inferences_times = []
        face_detections_times = []
        for batch in self.feed.next_batch():
            if batch is None:
                break

            # as we want the webcam to act as a mirror, flip the frame
            batch = cv2.flip(batch, 1)

            inference_time = time.time()
            face = self.face_detection.predict(batch)
            if face is None:
                self.logger.error('Unable to detect the face.')
                continue
            else:
                face_detections_times.append(time.time() - inference_time)

                left_eye_image, right_eye_image = self.facial_landmarks_detection.predict(
                    face)
                if left_eye_image is None or right_eye_image is None:
                    continue
                head_pose_angles = self.head_pose_estimation.predict(face)
                if head_pose_angles is None:
                    continue
                vector = self.gaze_estimation.predict(left_eye_image,
                                                      right_eye_image,
                                                      head_pose_angles)
                inferences_times.append(time.time() - inference_time)
                if self.args.show_face == "True":
                    cv2.imshow("Detected face", face)
                    cv2.waitKey(1)
                self.mouse_controller.move(vector[0], vector[1])

        self.feed.close()
        cv2.destroyAllWindows()
        print("Average face detection inference time:",
              sum(face_detections_times) / len(face_detections_times))
        print("Average total inferences time:",
              sum(inferences_times) / len(inferences_times))
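A hypothetical usage example for the class above; the argument-parser name is an assumption, since the original example does not show its entry point:

if __name__ == '__main__':
    # build_argparser() is assumed to return a parser whose options
    # (face_detection_model, device, extensions, ...) __init__ reads
    args = build_argparser().parse_args()
    app = Computer_Pointer_Controller(args)
    app.run()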

def main(args):
    start_model_load_time = time.time()

    # load model
    class_face_detection = ModelFaceDetection(args.model_face_detection, args.device, args.threshold)
    class_face_detection.load_model()

    class_head_pose_estimation = ModelHeadPoseEstimation(args.model_head_pose_estimation, args.device)
    class_head_pose_estimation.load_model()

    class_facial_landmarks_detection = ModelFacialLandmarksDetection(args.model_facial_landmarks_detection, args.device)
    class_facial_landmarks_detection.load_model()

    class_gaze_estimation = ModelGazeEstimation(args.model_gaze_estimation, args.device)
    class_gaze_estimation.load_model()

    total_model_load_time = time.time() - start_model_load_time

    # input image
    feed = InputFeeder(input_type='video', input_file=args.input_path)
    feed.load_data()

    # output
    initial_w, initial_h, initial_fps = feed.get_info()

    counter = 0
    start_inference_time = time.time()

    # debug
    #print("initial_w:{}, initial_h:{}, initial_fps:{}".format(initial_w, initial_h, initial_fps))

    #out_video = cv2.VideoWriter('output_video.mp4', cv2.VideoWriter_fourcc(*'mp4v'), initial_fps, (initial_w, initial_h), True)
    out_video = cv2.VideoWriter('output_video.mp4', cv2.VideoWriter_fourcc(*'mp4v'), 10, (initial_w, initial_h), True)

    class_face_detection.initial_size(initial_w, initial_h)

    #mc = MouseController(precision='low', speed='slow')
    mc = MouseController(precision='high', speed='fast')

    for flag, batch in feed.next_batch():
        if not flag:
            break

        counter += 1

        # debug
        #print("batch.shape:{}".format(batch.shape))
        # if batch is not None:

        # face_detection
        cropped_face = class_face_detection.predict(batch)

        # head_pose_estimation
        head_pose_angles = class_head_pose_estimation.predict(cropped_face)

        # debug
        #print("angle_y_fc:{}, angle_p_fc:{}, angle_r_fc:{}".format(head_pose_angles[0], head_pose_angles[1], head_pose_angles[2]))

        # facial_landmarks_detection
        left_eye_image, right_eye_image, left_eye_center, right_eye_center= class_facial_landmarks_detection.predict(cropped_face)

        # gaze_estimation
        x, y, gaze_vector = class_gaze_estimation.predict(left_eye_image, right_eye_image, head_pose_angles)

        cv2.line(cropped_face, left_eye_center, (int(left_eye_center[0] + gaze_vector[0] * 100), int(left_eye_center[1] - gaze_vector[1] * 100)), (255,255,255), 2)
        cv2.line(cropped_face, right_eye_center, (int(right_eye_center[0] + gaze_vector[0] * 100), int(right_eye_center[1] - gaze_vector[1] * 100)), (255,255,255), 2)

        # output
        cv2.imshow('output', batch)
        cv2.waitKey(30)
        cv2.imwrite('output.jpg', batch)

        out_video.write(batch)

        # MouseController
        mc.move(x, y)

    total_time = time.time() - start_inference_time
    total_inference_time = round(total_time, 1)
    fps = counter / total_inference_time

    print("total_model_load_time:{}, total_inference_time:{}, fps:{}".format(total_model_load_time, total_inference_time, fps))

    feed.close()
    cv2.destroyAllWindows()
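Several predict() methods above (e.g. the `x, y, gaze_vector` returned by class_gaze_estimation) convert the raw gaze vector into mouse offsets. A common approach in these projects, shown here as a hedged sketch rather than the exact code of any example, is to rotate the vector by the head's roll angle so that tilting the head does not skew the pointer:

import math

def gaze_to_mouse(gaze_vector, roll_degrees):
    # rotate the (x, y) components of the gaze vector by the roll angle
    cs = math.cos(roll_degrees * math.pi / 180.0)
    sn = math.sin(roll_degrees * math.pi / 180.0)
    x = gaze_vector[0] * cs + gaze_vector[1] * sn
    y = -gaze_vector[0] * sn + gaze_vector[1] * cs
    return x, y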