def run_inference(args):
    """Run the face -> eyes/head-pose -> gaze pipeline over a video file.

    Each frame is previewed in an "Output" window; the head pose angles and
    the gaze output are printed per frame. Pressing ESC stops processing.

    :param args: parsed command line arguments providing ``input`` and the
        ``face_detection_model``, ``facial_landmarks_detection_model``,
        ``head_pose_estimation_model`` and ``gaze_estimation_model`` values.
    """
    feed = InputFeeder(input_type='video', input_file=args.input)
    feed.load_data()

    try:
        # Building and loading the models does not depend on the frame, so it
        # is done once here instead of being repeated for every frame.
        faceDetection = FaceDetection(model_name=args.face_detection_model)
        faceDetection.load_model()

        facialLandmarksDetection = FacialLandmarksDetection(
            args.facial_landmarks_detection_model)
        facialLandmarksDetection.load_model()

        headPoseEstimation = HeadPoseEstimation(
            args.head_pose_estimation_model)
        headPoseEstimation.load_model()

        gazeEstimation = GazeEstimation(args.gaze_estimation_model)
        gazeEstimation.load_model()

        for batch in feed.next_batch():
            # NOTE(review): presumably the feeder yields None once the stream
            # is exhausted (other loops in this file guard the same way).
            if batch is None:
                break

            cv2.imshow("Output", cv2.resize(batch, (500, 500)))
            key = cv2.waitKey(60)
            if key == 27:  # ESC
                break

            # getting face
            face = faceDetection.predict(batch)

            # getting eyes
            left_eye, right_eye = facialLandmarksDetection.predict(face)

            # getting head pose angles
            head_pose = headPoseEstimation.predict(face)
            print("head pose angles: ", head_pose)

            # get mouse points
            mouse_coords = gazeEstimation.predict(left_eye, right_eye, head_pose)
            print("gaze output: ", mouse_coords)
    finally:
        # Release the capture even when a model or OpenCV call raises.
        feed.close()
def main():
    """Load the models, run inference over the input and print timing stats.

    The input is the camera when ``--input`` is "cam" (case-insensitive),
    otherwise a video file that must exist. Model loading time, total and
    average inference time, and FPS are printed after the run.
    """
    args = build_argparser().parse_args()
    inputFile = args.input

    if inputFile.lower() == "cam":
        inputFeeder = InputFeeder("cam")
    else:
        if not os.path.isfile(inputFile):
            print("Unable to find input file")
            exit(1)
        inputFeeder = InputFeeder("video", inputFile)

    start_model_loading = time.time()
    detect, landmark, gaze, pose = init_models(args)
    inputFeeder.load_data()
    LoadModel(detect, landmark, gaze, pose)
    model_loading_time = time.time() - start_model_loading

    try:
        frame_count, inference_time = inference_frame(
            detect, pose, landmark, gaze, inputFeeder, args)

        # Guard both ratios: an empty stream gives frame_count == 0 and a
        # very short run can give inference_time == 0.
        fps = frame_count / inference_time if inference_time else 0.0
        average_time = inference_time / frame_count if frame_count else 0.0

        print("video is complete!")
        print(f'Model took {model_loading_time} s to load')
        print(f'Inference time of the model is: {inference_time} s')
        print(f'Average inference time of the model is : {average_time} s')
        # NOTE(review): the /5 scaling is kept from the original; confirm it
        # matches how inference_frame counts frames.
        print(f'FPS is {fps/5} frame/second')
    finally:
        # Always tear down windows and the capture, even on failure.
        cv2.destroyAllWindows()
        inputFeeder.close()
def main():
    """Drive the mouse from frames of the configured input feed.

    Reads the run parameters from ``get_args()``, loads the models, then for
    every frame shows the annotated image and moves the mouse to the position
    returned by ``main_loop``. Stops on a ``None`` frame or when 'q' is
    pressed.
    """
    # Load parameters
    params = get_args()
    precision = params['mouse_prec']
    speed = params['mouse_speed']
    mouse = MouseController(precision, speed)
    models = load_models(params)

    # Load input feed; a camera feed has no backing file.
    source_kind = params['input_type']
    source_path = None if source_kind == 'cam' else params['input_file_path']
    feed = InputFeeder(input_type=source_kind, input_file=source_path)
    feed.load_data()

    for batch in feed.next_batch():
        if batch is None:
            break

        image, pos = main_loop(batch, models)
        cv2.imshow('frame', image)

        pressed = cv2.waitKey(1) & 0xFF
        if pressed == ord('q'):
            break

        mouse.move(pos[0], pos[1])

    feed.close()
def process_video(file_input, file_output, display_intermediate_output):
    """Process a camera or video stream and write the annotated frames out.

    Every frame is passed through ``process_single_frame``; the returned
    frame is appended to the output video and, when a module-level
    ``mouse_controller`` is set, the result drives the mouse.

    :param file_input: path of the input video, or ``None`` to use the camera.
    :param file_output: path of the video file to write.
    :param display_intermediate_output: forwarded unchanged to
        ``process_single_frame``.
    """
    if file_input is None:
        feed = InputFeeder(input_type='cam')
    else:
        feed = InputFeeder(input_type='video', input_file=file_input)

    feed.load_data()

    w = int(feed.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    h = int(feed.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(feed.cap.get(cv2.CAP_PROP_FPS))

    out = cv2.VideoWriter(file_output, cv2.VideoWriter_fourcc(*'avc1'), fps, (w, h), True)

    frame_counter = 0
    try:
        for batch in feed.next_batch():
            # NOTE(review): presumably the feeder yields None at end of
            # stream, as the sibling process_video loop assumes - confirm.
            if batch is None:
                break

            frame_counter += 1
            result, frame = process_single_frame(batch, display_intermediate_output)
            out.write(frame)

            logging.debug('Frame #%s result: %s', frame_counter, result)

            if isinstance(result, str) and result == 'No face detected':
                # logging uses %-style placeholders; the former '{}' made the
                # logging module report a formatting error instead.
                logging.warning('Frame %s: No face detected', frame_counter)
                # There are no coordinates in this result, so do not index
                # the message string and feed its characters to the mouse.
                continue

            if mouse_controller is not None:
                mouse_controller.move(result[0], result[1])
    finally:
        # Finalise the output file and release the capture even on failure.
        out.release()
        feed.close()
def process_image(file_path, file_output, display_intermediate_output):
    """Process one image file and save the annotated result.

    :param file_path: path of the image handed to the input feeder.
    :param file_output: destination path passed to ``cv2.imwrite``.
    :param display_intermediate_output: forwarded unchanged to
        ``process_single_frame``.
    """
    image_feed = InputFeeder(input_type='image', input_file=file_path)
    image_feed.load_data()

    for batch in image_feed.next_batch():
        # Only the annotated image is needed; the result value is ignored.
        _, annotated = process_single_frame(batch, display_intermediate_output)
        cv2.imwrite(file_output, annotated)

    cv2.waitKey(0)
    cv2.destroyAllWindows()
    image_feed.close()
def main(args):
    """Control the mouse pointer from gaze estimated on a video file.

    :param args: parsed arguments providing ``face_detection``,
        ``face_landmark``, ``head_pose``, ``gaze_estimation``, ``device``,
        ``input`` and ``prob_threshold``.
    """
    mouse_controller = MouseController('medium', 'fast')

    print("Model Loading..")
    face_detection = Model_FaceDetection(args.face_detection, args.device)
    face_landmark = Model_FacialLandmarksDetection(args.face_landmark, args.device)
    head_pose = Model_HeadPoseEstimation(args.head_pose, args.device)
    gaze_estimation = Model_GazeEstimation(args.gaze_estimation, args.device)
    print("Model loaded successfully")

    input_feeder = InputFeeder(input_type='video', input_file=args.input)
    input_feeder.load_data()

    face_detection.load_model()
    head_pose.load_model()
    face_landmark.load_model()
    gaze_estimation.load_model()

    try:
        for frame in input_feeder.next_batch():
            # A frame without a ``shape`` (e.g. None) marks the end of the
            # usable stream.
            if not hasattr(frame, 'shape'):
                break

            key = cv2.waitKey(60)

            face, _ = face_detection.predict(frame.copy(), args.prob_threshold)
            # The detector signals "no face" by returning an int in place of
            # the cropped image.
            if isinstance(face, int):
                print("Unable to detect the face.")
                if key == 27:
                    break
                continue

            headPose = head_pose.predict(face.copy())
            left_eye, right_eye, _ = face_landmark.predict(face.copy())
            mouse_coord, _ = gaze_estimation.predict(left_eye, right_eye, headPose)

            cv2.imshow('video', frame)
            mouse_controller.move(mouse_coord[0], mouse_coord[1])

            # ESC was previously honoured only on frames without a face, so
            # the app could not be stopped while a face was being tracked.
            if key == 27:
                break
    finally:
        input_feeder.close()
        cv2.destroyAllWindows()
def process_video(input_video, video_output, visualize):
    """Process a camera or video stream, saving annotated frames to a file.

    The output is written with the XVID codec at a quarter of the source
    frame rate. Each processed frame's result is printed and logged and, when
    the module-level ``mouse_controller`` is set, used to move the mouse.
    Processing stops on a ``None`` frame or when ESC is pressed.

    :param input_video: path of the input video, or ``None`` for the camera.
    :param video_output: path of the video file to write.
    :param visualize: forwarded unchanged to ``process_frame``.
    """
    if input_video is None:
        feed = InputFeeder(input_type='cam')
    else:
        feed = InputFeeder(input_type='video', input_file=input_video)
    feed.load_data()

    capture = feed.cap
    frame_width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
    source_fps = int(capture.get(cv2.CAP_PROP_FPS))
    output_fps = int(source_fps / 4)

    codec = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(video_output, codec, output_fps,
                          (frame_width, frame_height), True)

    esc_code = 27
    for frame_counter, frame in enumerate(feed.next_batch(), start=1):
        if frame is None:
            break

        key = cv2.waitKey(10)
        result, output_frame = process_frame(frame, visualize)
        out.write(output_frame)

        print("Frame: {} result: {}".format(frame_counter, result))
        logger.info("Frame: {} result: {}".format(frame_counter, result))

        if key == esc_code:
            break

        if mouse_controller is None:
            continue

        # Any failure while moving the mouse is reported but does not stop
        # the stream.
        try:
            mouse_controller.move(result[0], result[1])
        except Exception as e:
            print("Mouse controller exception:\n", e)
            logger.info("Mouse controller exception:{}".format(e))

    cv2.destroyAllWindows()
    out.release()
    feed.close()

    print("Saved the video")
    logger.info("Saved the video")
def main(model_dir, device, precision, input_type, input_file, inspect):
    """Move the mouse according to the gaze detected on each input image.

    :param model_dir: forwarded to ``GazeDetect`` as ``model_dir``.
    :param device: forwarded to ``GazeDetect`` as ``device``.
    :param precision: forwarded to ``GazeDetect`` as ``precision``.
    :param input_type: forwarded to ``InputFeeder`` as ``input_type``.
    :param input_file: forwarded to ``InputFeeder`` as ``input_file``.
    :param inspect: accepted for interface compatibility; not used here.
    """
    mouse_controller = MouseController("medium", "fast")

    input_feeder = InputFeeder(input_type=input_type, input_file=input_file)
    input_feeder.load_data()

    gaze_detect = GazeDetect(model_dir=model_dir, device=device, precision=precision)
    gaze_detect.load_model()

    try:
        for image in input_feeder.next_batch():
            # Time only the prediction; ``t.elapsed`` is read after the
            # context has exited.
            with Timer() as t:
                outputs = gaze_detect.predict(image)

            if outputs is None:
                continue

            angle_y_fc, angle_p_fc, angle_r_fc = outputs.reshape(3)
            mouse_controller.move(-angle_y_fc, angle_p_fc)
            print(
                f"Mouse move x: {-angle_y_fc}, y: {angle_p_fc}, execution time: {t.elapsed}"
            )
    finally:
        # The feed was previously never closed.
        # NOTE(review): assumes this InputFeeder exposes close() like the
        # other feeders used in this file - confirm.
        input_feeder.close()
def start_pipeline(cla, codec):
    """
    Build the input feed and models, run the feed through the pipeline and
    write the timing statistics to ``stats.txt`` in the output directory.

    :param cla: Command line arguments configuring the pipeline.
    :param codec: Codec passed through to ``handle_input_feed``.
    :return: None
    """
    preview_flags = cla.preview_flags
    logger = logging.getLogger()

    input_file_path = cla.input
    use_camera = input_file_path.lower() == "cam"
    if not use_camera and not os.path.isfile(input_file_path):
        logger.error("Cannot locate video file provided. Exiting..")
        sys.exit(1)

    if use_camera:
        in_feeder = InputFeeder("cam")
    else:
        in_feeder = InputFeeder("video", input_file_path)

    load_started = time.time()
    fdm, fldm, hpem, gem = prep_models(cla)
    total_model_load_time = time.time() - load_started

    # Benchmark runs do not move the mouse.
    mc = None if cla.is_benchmark else MouseController('medium', 'fast')

    in_feeder.load_data()

    fps, total_inference_time, total_time = handle_input_feed(
        logger, preview_flags, fdm, fldm, hpem, gem, mc, in_feeder,
        cla.frame_out_rate, codec, cla.output_path)

    stats = (
        ("Total inference time, ", total_inference_time),
        ("FPS, ", fps),
        ("Total model load time, ", total_model_load_time),
        ("Total time, ", total_time),
    )
    with open(os.path.join(cla.output_path, 'stats.txt'), 'w') as f:
        for label, value in stats:
            f.write(label + str(value) + '\n')

    logger.error("Video stream ended...")
    cv2.destroyAllWindows()
    in_feeder.close()
def main():
    """
    Load the four models, reporting each load time, then move the mouse
    according to the gaze estimated on every frame of the input feed.

    :return: None
    """
    # Grab command line args
    args = build_argparser().parse_args()

    def _load_timed(build, message):
        # Time construction plus load_model() together and report it.
        began = time.time()
        detector = build()
        detector.load_model()
        print(message, time.time()-began)
        return detector

    face_detector = _load_timed(
        lambda: FaceDetect(model_name=args.face, device=args.device, output=args.output),
        "Time taken to load face detection model (in seconds):")
    eyes_detector = _load_timed(
        lambda: EyesDetect(model_name=args.eyes, device=args.device, output=args.output),
        "Time taken to load landmark detection model (in seconds):")
    angle_detector = _load_timed(
        lambda: AngleDetect(model_name=args.angle, device=args.device),
        "Time taken to load head pose estimation model (in seconds):")
    gaze_detector = _load_timed(
        lambda: GazeDetect(model_name=args.gaze, device=args.device),
        "Time taken to load gaze estimation model (in seconds):")

    mouse_controller = MouseController('medium', 'medium')

    feed = InputFeeder(input_type=args.video, input_file=args.input)
    feed.load_data()

    for batch in feed.next_batch():
        # A None batch marks the end of the stream.
        if batch is None:
            break

        face = face_detector.predict(batch)
        left_eye, right_eye = eyes_detector.predict(face)
        angles = angle_detector.predict(face)
        move_x, move_y = gaze_detector.predict(left_eye, right_eye, angles)
        mouse_controller.move(move_x, move_y)

    feed.close()
def setup(args):
    """Initialise the module-level state used by the rest of the pipeline.

    Populates the globals declared below: resolved input/output paths, device
    settings, the mouse controller, the input feeder (image, video or
    camera), the output video writer and the loaded models.

    :param args: parsed arguments providing the four ``*_model`` values plus
        ``input``, ``output``, ``device``, ``cpu_extension``,
        ``prob_threshold`` and ``flags``.
    """
    global input_path, output_path, device, cpu_extension, prob_threshold, flags, mouse_controller, feeder, video_writer, model_dict, model_loading_total_time

    model_args = [
        args.face_detection_model,
        args.facial_landmarks_detection_model,
        args.head_pose_estimation_model,
        args.gaze_estimation_model,
    ]
    model_class = [
        Model_FaceDetection,
        Model_FacialLandMarkDetection,
        Model_HeadPoseEstimation,
        Model_GazeEstimation,
    ]

    # "CAM" selects the camera, which has no input path.
    input_path = input_path_generator(args.input) if args.input != "CAM" else None
    output_path = output_path_generator(args.output)
    device = args.device
    cpu_extension = args.cpu_extension
    prob_threshold = args.prob_threshold
    flags = args.flags

    # makedirs(exist_ok=True) also creates missing parent directories and
    # avoids the check-then-create race of exists() followed by mkdir().
    os.makedirs(output_path, exist_ok=True)

    mouse_controller = MouseController("low", "fast")

    if not input_path:
        feeder = InputFeeder("cam")
    elif input_path.endswith(".jpg"):
        feeder = InputFeeder("image", input_path)
    else:
        feeder = InputFeeder("video", input_path)
    feeder.load_data()

    fps = feeder.fps()
    # The third value (stream length) is not needed here.
    initial_w, initial_h, _ = feeder.frame_initials_and_length()

    video_writer = cv2.VideoWriter(
        os.path.join(output_path, "output_video.mp4"),
        cv2.VideoWriter_fourcc(*"avc1"),
        fps / 10,
        (initial_w, initial_h),
        True,
    )

    model_dict, model_loading_total_time = generate_model_dict(model_args, model_class)
    return
def main(args):
    """Run inference on a camera, image or video input.

    :param args: parsed arguments providing ``model`` and ``input``;
        ``input`` is ``0`` for the camera, otherwise a file path.
    :return: ``0`` after processing an image input; ``None`` for camera and
        video inputs.
    """
    inference = Inference(args.model)
    inference.load_model()

    source = args.input
    # Must be initialised for every branch: it used to be assigned only for
    # image inputs, so camera and video inputs failed with an
    # UnboundLocalError on the ``if is_image`` check below.
    is_image = False

    if source == 0:
        input_feeder = InputFeeder('cam', source)
    elif source.endswith(('.jpg', '.jpeg', '.bmp')):
        input_feeder = InputFeeder('image', source)
        is_image = True
    else:
        input_feeder = InputFeeder('video', source)

    input_feeder.load_data()

    if is_image:
        # For an image input the feeder's ``cap`` is passed directly to the
        # model.
        outputs = inference.predict(input_feeder.cap)
        inference.preprocess_output(outputs)
        return 0

    try:
        for ret, frame in input_feeder.next_batch():
            if not ret:
                break

            key = cv2.waitKey(60)
            if key == 27:  # ESC
                break

            outputs = inference.predict(frame)
            inference.preprocess_output(outputs)
    finally:
        input_feeder.close()
def main(args):
    """Track gaze on the input feed and move the mouse accordingly.

    Each frame gets the detected face outlined, is shown in an "img" window,
    and the first two components of the gaze prediction are sent to the mouse
    controller. Pressing 'q' stops the loop.

    :param args: parsed arguments providing ``it``, ``i``, ``fm``, ``lm``,
        ``hpm``, ``gem``, ``d``, ``c`` and ``p``.
    """
    feed = InputFeeder(input_type=args.it, input_file=args.i)

    face_model = FaceDetectionModel(args.fm, args.d, args.c, float(args.p))
    face_model.load_model()

    landmarks_model = LandmarksDetectionModel(args.lm, args.d, args.c)
    landmarks_model.load_model()

    headpose_model = HeadPoseDetectionModel(args.hpm, args.d, args.c)
    headpose_model.load_model()

    gaze_model = GazeEstimationModel(args.gem, args.d, args.c)
    gaze_model.load_model()

    mouse = MouseController("medium", "fast")
    feed.load_data()

    box_colour = (255, 0, 0)
    for batch in feed.next_batch():
        cropped_face, coords, _ = face_model.predict(batch)

        corner_a = (coords[0], coords[1])
        corner_b = (coords[2], coords[3])
        cv2.rectangle(batch, corner_a, corner_b, box_colour, 2)

        left_eye, right_eye, eyes_coords, _ = landmarks_model.predict(cropped_face)
        head_pose_angles, _ = headpose_model.predict(cropped_face)
        gaze_x, gaze_y, gaze_z, _ = gaze_model.predict(
            left_eye, right_eye, head_pose_angles, cropped_face, eyes_coords)

        mouse.move(gaze_x, gaze_y)

        cv2.imshow("img", batch)
        pressed = cv2.waitKey(25) & 0xFF
        if pressed == ord('q'):
            break

    feed.close()
def main():
    """Run the gaze-controlled mouse pipeline with optional visualisation.

    The input is the camera or an existing video file. Every fifth frame is
    previewed and used to move the mouse. The visual flags ('fd', 'hp', 'fl',
    'ga') select what is drawn in the "Visualization" window. ESC stops the
    loop.
    """
    args = build_argparser().parse_args()
    visual = args.visual_flag
    log = logging.getLogger()
    input_source = args.input_source
    # The path argument may be absent; fall back to None instead of failing.
    video_path = getattr(args, 'input_path', None)

    source_kind = input_source.lower()
    if source_kind == 'cam':
        feed = InputFeeder('cam')
    elif source_kind == 'video' and video_path and os.path.isfile(video_path):
        # ``video_path`` is checked for truthiness first because
        # os.path.isfile(None) raises TypeError rather than returning False.
        feed = InputFeeder('video', video_path)
    else:
        log.error('Wrong input feed. (check the video path).')
        exit(1)

    fd = Model_Face(args.face_detection_model, args.device, args.extension)
    hp = Model_HeadPose(args.head_pose_model, args.device, args.extension)
    fl = Model_Faciallandmark(args.facial_landmarks_model, args.device, args.extension)
    ga = Model_Gaze(args.gaze_model, args.device, args.extension)

    ### You can specify the value of precision and speed directly.
    ## OR
    ## 'high'(100),'low'(1000),'medium','low-med' - precision
    ## 'fast'(1), 'slow'(10), 'medium', 'slow-med' - speed
    # mouse = MouseController('low-med', 'slow-med')
    mouse = MouseController(500, 4)

    feed.load_data()

    # load models
    fd.load_model()
    hp.load_model()
    fl.load_model()
    ga.load_model()

    count = 0
    try:
        for ret, frame in feed.next_batch():
            if not ret:
                break
            count += 1

            if count % 5 == 0:
                cv2.imshow('video', cv2.resize(frame, (500, 500)))
            key = cv2.waitKey(60)

            frame_cp = frame.copy()
            face, face_position = fd.predict(frame_cp, args.threshold)

            # The detector returns an int in place of the face crop when no
            # face is found.
            if isinstance(face, int):
                log.error('Prediction Error: Cant find face.')
                if key == 27:
                    break
                continue

            face_cp = face.copy()
            hp_output = hp.predict(face_cp)
            left_eye, right_eye, facial = fl.predict(face_cp)
            mouse_coord, gaze_vector = ga.predict(left_eye, right_eye, hp_output)

            if visual:
                visual_frame = frame.copy()

                ### Visual FLAGS
                # face detection: show the cropped face instead of the frame
                if 'fd' in visual:
                    visual_frame = face

                # Head pose
                if 'hp' in visual:
                    cv2.putText(
                        visual_frame,
                        "Yaw: {:.2f} Pitch: {:.2f} Roll: {:.2f}".format(
                            hp_output[0], hp_output[1], hp_output[2]),
                        (10, 20), cv2.FONT_HERSHEY_COMPLEX,
                        0.3, (0, 255, 50), 1)

                # Facial landmarks: padded boxes around both eyes, drawn on
                # the face crop.
                if 'fl' in visual:
                    cv2.rectangle(face,
                                  (facial[0][0] - 10, facial[0][1] - 10),
                                  (facial[0][2] + 10, facial[0][3] + 10),
                                  (255, 0, 0), 3)
                    cv2.rectangle(face,
                                  (facial[1][0] - 10, facial[1][1] - 10),
                                  (facial[1][2] + 10, facial[1][3] + 10),
                                  (255, 0, 0), 3)

                # Gaze estimation: draw a cross on copies of both eye crops
                # and paste them back into the face crop.
                if 'ga' in visual:
                    x, y, w = int(gaze_vector[0] * 12), int(gaze_vector[1] * 12), 160
                    le = cv2.line(left_eye.copy(), (x - w, y - w), (x + w, y + w),
                                  (255, 255, 0), 2)
                    cv2.line(le, (x - w, y + w), (x + w, y - w), (255, 50, 150), 2)
                    re = cv2.line(right_eye.copy(), (x - w, y - w), (x + w, y + w),
                                  (255, 255, 0), 2)
                    cv2.line(re, (x - w, y + w), (x + w, y - w), (255, 50, 150), 2)
                    face[facial[0][1]:facial[0][3], facial[0][0]:facial[0][2]] = le
                    face[facial[1][1]:facial[1][3], facial[1][0]:facial[1][2]] = re

                cv2.namedWindow('Visualization', cv2.WINDOW_AUTOSIZE)
                cv2.moveWindow('Visualization', 900, 900)
                cv2.imshow('Visualization', cv2.resize(visual_frame, (500, 500)))

                # Optionally keep every tenth visualisation as a JPEG.
                if args.visual_save.lower() == 'y':
                    if count % 10 == 0:
                        cv2.imwrite(str(count) + '_visual.jpg', visual_frame)

            if count % 5 == 0:
                mouse.move(mouse_coord[0], mouse_coord[1])

            if key == 27:
                break

        log.error('INFO: Ended!')
    finally:
        cv2.destroyAllWindows()
        feed.close()
def main():
    """Run the gaze pipeline, preview and record it, and write timing stats.

    Loads the four models (logging each load time), processes the camera or
    video input frame by frame, moves the mouse from the gaze vector, writes
    the shown image to ``output_video.mp4`` and stores inference time, fps
    and model loading time in ``stats.txt`` next to this file.
    """
    args = build_argparser().parse_args()
    device_name = args.device
    prob_threshold = args.prob_threshold

    logger_object = log.getLogger()

    # Initialize variables with the input arguments
    model_path_dict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'FacialLandmarkModel': args.facialLandmarksModel,
        'HeadPoseEstimationModel': args.headPoseEstimationModel,
        'GazeEstimationModel': args.gazeEstimationModel
    }

    # Validate the model paths before anything is loaded; this check used to
    # run only after all four models had already been loaded.
    for model_path in model_path_dict.values():
        if not os.path.isfile(model_path):
            logger_object.error("Unable to find specified model file" + str(model_path))
            exit(1)

    # Instantiate model
    face_model = FaceDetection(model_path_dict['FaceDetectionModel'], device_name,
                               threshold=prob_threshold)
    landmark_model = FacialLandmarksDetection(model_path_dict['FacialLandmarkModel'],
                                              device_name, threshold=prob_threshold)
    head_pose_model = HeadPoseEstimation(model_path_dict['HeadPoseEstimationModel'],
                                         device_name, threshold=prob_threshold)
    gaze_model = GazeEstimation(model_path_dict['GazeEstimationModel'], device_name,
                                threshold=prob_threshold)
    mouse_controller = MouseController('medium', 'fast')

    # Load Models and get time
    start_time = time.time()
    face_model.load_model()
    logger_object.error("Face detection model loaded: time: {:.3f} ms".format((time.time() - start_time) * 1000))

    first_mark = time.time()
    landmark_model.load_model()
    logger_object.error(
        "Facial landmarks detection model loaded: time: {:.3f} ms".format((time.time() - first_mark) * 1000))

    second_mark = time.time()
    head_pose_model.load_model()
    logger_object.error("Head pose estimation model loaded: time: {:.3f} ms".format((time.time() - second_mark) * 1000))

    third_mark = time.time()
    gaze_model.load_model()
    logger_object.error("Gaze estimation model loaded: time: {:.3f} ms".format((time.time() - third_mark) * 1000))

    load_total_time = time.time() - start_time
    logger_object.error("Total loading time: time: {:.3f} ms".format(load_total_time * 1000))
    logger_object.error("All models are loaded successfully..")

    # Check extention of these unsupported layers
    face_model.check_model()
    landmark_model.check_model()
    head_pose_model.check_model()
    gaze_model.check_model()

    preview_flags = args.previewFlags
    input_filename = args.input

    if input_filename.lower() == 'cam':
        input_feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            logger_object.error("Unable to find specified video file")
            exit(1)
        input_feeder = InputFeeder(input_type='video', input_file=input_filename)

    input_feeder.load_data()

    width = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(input_feeder.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(input_feeder.cap.get(cv2.CAP_PROP_FPS))
    out_video = cv2.VideoWriter(os.path.join('output_video.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'), fps,
                                (width, height), True)

    frame_counter = 0
    start_inf_time = time.time()

    try:
        for ret, frame in input_feeder.next_batch():
            if not ret:
                break
            frame_counter += 1
            key = cv2.waitKey(60)

            try:
                cropped_image, face_cords = face_model.predict(frame, prob_threshold)

                # An int in place of the crop means no face was detected.
                if isinstance(cropped_image, int):
                    print("Unable to detect the face")
                    if key == 27:
                        break
                    continue

                left_eye, right_eye, eye_cords = landmark_model.predict(cropped_image)
                pose_output = head_pose_model.predict(cropped_image)
                x, y, z = gaze_model.predict(left_eye, right_eye, pose_output,
                                             cropped_image, eye_cords)
                mouse_controller.move(x, y)
            except Exception as e:
                # Best effort: a frame that fails anywhere in the pipeline is
                # reported and skipped.
                print(str(e) + " for frame " + str(frame_counter))
                continue

            image = cv2.resize(frame, (width, height))

            if len(preview_flags) != 0:
                preview_frame = frame.copy()

                if 'fd' in preview_flags:
                    if len(preview_flags) != 1:
                        preview_frame = cropped_image
                    cv2.rectangle(frame, (face_cords[0], face_cords[1]),
                                  (face_cords[2], face_cords[3]), (0, 0, 255), 3)

                if 'hp' in preview_flags:
                    cv2.putText(
                        frame,
                        "Pose Angles: yaw= {:.2f} , pitch= {:.2f} , roll= {:.2f}".format(
                            pose_output[0], pose_output[1], pose_output[2]),
                        (20, 40),
                        cv2.FONT_HERSHEY_DUPLEX, 1, (255, 0, 0), 3)

                if 'ge' in preview_flags:
                    cv2.putText(
                        frame,
                        "Gaze vector: x= {:.2f} , y= {:.2f} , z= {:.2f}".format(
                            x, y, z),
                        (15, 100), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 3)

                # NOTE(review): this side-by-side image is 1000x500, which may
                # not match the (width, height) the writer was opened with -
                # confirm the writer accepts it.
                image = np.hstack((cv2.resize(frame, (500, 500)),
                                   cv2.resize(preview_frame, (500, 500))))

            cv2.imshow('preview', image)
            out_video.write(image)

            # NOTE(review): the mouse is already moved for every frame above,
            # so this is a second move on every fifth frame - confirm intent.
            if frame_counter % 5 == 0:
                mouse_controller.move(x, y)

            if key == 27:
                break

        inference_time = round(time.time() - start_inf_time, 1)
        # A run shorter than the rounding step gives 0.0; avoid dividing by it.
        fps = int(frame_counter) / inference_time if inference_time else 0.0

        logger_object.error("counter {} seconds".format(frame_counter))
        logger_object.error("total inference time {} seconds".format(inference_time))
        logger_object.error("fps {} frame/second".format(fps))

        with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'stats.txt'), 'w') as f:
            f.write('inference time : ' + str(inference_time) + '\n')
            f.write('fps: ' + str(fps) + '\n')
            f.write('Models Loading: '+ str(load_total_time) + '\n')

        logger_object.error('Video stream ended')
    finally:
        # The writer was never released before, which can leave the output
        # file unfinalised.
        out_video.release()
        cv2.destroyAllWindows()
        input_feeder.close()
def main():
    """Run the gaze-controlled mouse pipeline on the camera or a video file.

    Validates the input and the four model files, loads the models, then for
    each frame detects the face, head pose, eyes and gaze. The output flags
    ('fd', 'fld', 'hp', 'ge') select what is drawn in the "Visualization"
    window. The preview and the mouse are updated every fifth frame; ESC
    stops the loop.
    """
    # Grab command line args
    args = build_argparser().parse_args()
    flags = args.models_outputs_flags

    logger = logging.getLogger()
    input_file_path = args.input
    input_feeder = None
    if input_file_path.lower() == "cam":
        input_feeder = InputFeeder("cam")
    else:
        if not os.path.isfile(input_file_path):
            logger.error("Unable to find specified video file")
            exit(1)
        input_feeder = InputFeeder("video", input_file_path)

    model_path_dict = {
        'FaceDetection': args.face_detection_model,
        'FacialLandmarks': args.facial_landmarks_model,
        'GazeEstimation': args.gaze_estimation_model,
        'HeadPoseEstimation': args.head_pose_estimation_model
    }

    for model_name, model_path in model_path_dict.items():
        if not os.path.isfile(model_path):
            logger.error("Unable to find specified " + model_name + " xml file")
            exit(1)

    fdm = FaceDetection(model_path_dict['FaceDetection'], args.device, args.cpu_extension)
    flm = FacialLandmarks(model_path_dict['FacialLandmarks'], args.device, args.cpu_extension)
    gem = GazeEstimation(model_path_dict['GazeEstimation'], args.device, args.cpu_extension)
    hpem = HeadPoseEstimation(model_path_dict['HeadPoseEstimation'], args.device, args.cpu_extension)

    mc = MouseController('medium', 'fast')

    input_feeder.load_data()
    fdm.load_model()
    flm.load_model()
    hpem.load_model()
    gem.load_model()

    eye_box_colour = (0, 255, 0)
    gaze_colour = (255, 0, 255)

    for frame_count, (ret, frame) in enumerate(input_feeder.next_batch(), start=1):
        if not ret:
            break

        every_fifth = frame_count % 5 == 0
        if every_fifth:
            cv2.imshow('video', cv2.resize(frame, (500, 500)))

        key = cv2.waitKey(60)
        cropped_face, face_coords = fdm.predict(frame, args.prob_threshold)

        # An int in place of the face crop means nothing was detected.
        if type(cropped_face) is int:
            logger.error("Unable to detect any face.")
            if key == 27:
                break
            continue

        hp_output = hpem.predict(cropped_face)
        left_eye_img, right_eye_img, eye_coords = flm.predict(cropped_face)
        new_mouse_coord, gaze_vector = gem.predict(left_eye_img, right_eye_img, hp_output)

        if len(flags) != 0:
            preview_frame = frame

            if 'fd' in flags:
                preview_frame = cropped_face

            if 'fld' in flags:
                # Padded boxes around both eyes, drawn on the face crop.
                for box_index in (0, 1):
                    cv2.rectangle(
                        cropped_face,
                        (eye_coords[box_index][0] - 10, eye_coords[box_index][1] - 10),
                        (eye_coords[box_index][2] + 10, eye_coords[box_index][3] + 10),
                        eye_box_colour, 3)

            if 'hp' in flags:
                pose_text = "Pose Angles: yaw:{:.2f} | pitch:{:.2f} | roll:{:.2f}".format(
                    hp_output[0], hp_output[1], hp_output[2])
                cv2.putText(preview_frame, pose_text, (10, 20),
                            cv2.FONT_HERSHEY_COMPLEX, 0.25, (0, 255, 0), 1)

            if 'ge' in flags:
                x, y, w = int(gaze_vector[0] * 12), int(gaze_vector[1] * 12), 160

                # Draw a cross on each eye crop (in place), left eye first,
                # then paste both back into the face crop.
                left_eye = cv2.line(left_eye_img, (x - w, y - w), (x + w, y + w),
                                    gaze_colour, 2)
                cv2.line(left_eye, (x - w, y + w), (x + w, y - w), gaze_colour, 2)
                right_eye = cv2.line(right_eye_img, (x - w, y - w), (x + w, y + w),
                                     gaze_colour, 2)
                cv2.line(right_eye, (x - w, y + w), (x + w, y - w), gaze_colour, 2)

                cropped_face[eye_coords[0][1]:eye_coords[0][3],
                             eye_coords[0][0]:eye_coords[0][2]] = left_eye
                cropped_face[eye_coords[1][1]:eye_coords[1][3],
                             eye_coords[1][0]:eye_coords[1][2]] = right_eye

            cv2.imshow("Visualization", cv2.resize(preview_frame, (500, 500)))

        if every_fifth:
            mc.move(new_mouse_coord[0], new_mouse_coord[1])

        if key == 27:
            break

    logger.error("VideoStream ended...")
    cv2.destroyAllWindows()
    input_feeder.close()
def infer_on_stream(args, model):
    '''
    Run object detection on a camera, image or video input and display it.

    Detected objects are outlined and labelled in a full-screen "Frame"
    window; when ``args.save_output`` is set the shown frames are also
    recorded to a timestamped ``.avi`` file. Keys: ``q`` quits, ``u`` toggles
    the object drawings, ``f`` toggles the FPS overlay.

    :param args: argparser arguments (``input``, ``width``, ``height``,
        ``fps``, ``save_output``)
    :param model: loaded model
    :raises NotImplementedError: if ``args.input`` is not a supported source
    '''
    # get the loaded model instance
    objectDetection = model

    # Handle the input stream
    # The source is matched case-insensitively: the error message below
    # advertises "CAM", and file extensions such as ".JPG" are common.
    source = args.input.lower()
    if source == 'cam':
        feed = InputFeeder(input_type='cam', flip=1)
        feed.set_camera_properties(args.width, args.height, args.fps)
    elif source == 'picam':
        feed = InputFeeder(input_type='picam')
        feed.set_camera_properties(args.width, args.height, args.fps)
    elif source.endswith(('.jpg', '.bmp', '.png')):
        feed = InputFeeder(input_type='image', input_file=args.input)
    elif source.endswith('.mp4'):
        feed = InputFeeder(input_type='video', input_file=args.input)
    else:
        print(
            "ERROR: Invalid input, it must be CAM, image (.jpg, .bmp or .png) or video (.mp4)!"
        )
        raise NotImplementedError

    feed.load_data()

    # run-time switches
    ui_marking = True
    fps_marking = False

    label_background_color = (125, 175, 75)
    label_text_color = (255, 255, 255)  # white text

    cv2.namedWindow("Frame", cv2.WINDOW_NORMAL)
    cv2.setWindowProperty("Frame", cv2.WND_PROP_FULLSCREEN,
                          cv2.WINDOW_FULLSCREEN)

    # Start recording if output saving is enabled
    out = None
    if args.save_output:
        now = datetime.datetime.now()
        out = cv2.VideoWriter(now.strftime("out-%Y%m%d-%H%M%S.avi"),
                              cv2.VideoWriter_fourcc(*'MJPG'), 15,
                              (args.width, args.height))

    try:
        for batch in feed.next_batch():
            if batch is None:
                continue

            # start measuring overall execution time
            start_processing_time = time.time()

            # 1) First detect objects on the image
            start_object_infer_time = time.time()
            objects = objectDetection.predict(batch)
            total_object_infer_time = time.time() - start_object_infer_time

            # Draw bounding boxes and labels when there are objects and the
            # UI marking switch is on.
            if len(objects) > 0 and ui_marking:
                for obj in objects:
                    # draw the bounding box
                    cv2.rectangle(batch,
                                  (obj['xmin'], obj['ymin']),
                                  (obj['xmax'], obj['ymax']),
                                  obj['color'], 2)

                    # prepare the label
                    label_text = f"{obj['class']}: {obj['confidence']*100:.3}%"
                    label_size = cv2.getTextSize(
                        label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.8, 1)[0]

                    label_left = obj['xmin']
                    # Keep the label inside the image when the box touches
                    # the top edge.
                    label_top = max(obj['ymin'] - label_size[1], 1)
                    label_right = label_left + label_size[0]
                    label_bottom = label_top + label_size[1] - 3

                    cv2.rectangle(batch,
                                  (label_left - 1, label_top - 6),
                                  (label_right + 1, label_bottom + 1),
                                  label_background_color, -1)
                    cv2.putText(batch, label_text,
                                (label_left, label_bottom),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.8,
                                label_text_color, 1)

            # Measure overall FPS
            total_processing_time = time.time() - start_processing_time
            if total_processing_time == 0:
                total_processing_time = 0.001  # handle zero division
            total_fps = 1 / total_processing_time

            # if FPS marking is turned on print some details on the image
            if fps_marking:
                label_text = f"FPS: {total_fps:.3}"
                cv2.putText(batch, label_text, (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)
                label_text = f"Object detection inference time: {total_object_infer_time*1000:.4}ms"
                cv2.putText(batch, label_text, (10, 50),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)

            # Show the output image and save the output video
            cv2.imshow('Frame', batch)
            if out is not None:
                out.write(batch)

            # Press q on keyboard to exit
            # Press u on keyboard to toggle ui drawings
            # Press f on keyboard to toggle fps drawings
            pressed = cv2.waitKey(20) & 0xFF
            if pressed == ord('q'):
                break
            elif pressed == ord('u'):
                ui_marking = not ui_marking
            elif pressed == ord('f'):
                fps_marking = not fps_marking
    finally:
        # Finish the video saving even if processing fails part-way.
        # NOTE(review): feed.close() was commented out in the original and is
        # left disabled here - confirm whether the feed should be closed.
        if out is not None:
            out.release()
def infer(self, args):
    """Run the full gaze pipeline on a camera or video input.

    Loads the four models, then for every frame detects the face, head pose,
    eye landmarks and gaze vector, moves the mouse from the gaze vector and
    shows the (optionally annotated) frame. ESC stops the loop. The loading
    time of each model and the inference time of its last run are printed at
    the end.

    :param args: parsed arguments providing the four ``*_model`` paths,
        ``device``, ``input`` ("cam" or a video path) and ``display_flag``.
    """
    # Create instances from the models' classes
    FDM_net = ModelFaceDetection()
    HPE_net = ModelHeadPoseEstimation()
    FLD_net = ModelFacialLandmarksDetection()
    GEM_net = ModelGazeEstimation()
    mouse_controller = MouseController('high', 'fast')

    # Load the models
    start1 = time.time()
    FDM_net.load_model(args.face_detection_model, args.device)
    FDM_load_t = time.time() - start1

    start2 = time.time()
    HPE_net.load_model(args.head_pose_estimation_model, args.device)
    HPE_load_t = time.time() - start2

    start3 = time.time()
    FLD_net.load_model(args.facial_landmarks_detection_model, args.device)
    FLD_load_t = time.time() - start3

    start4 = time.time()
    GEM_net.load_model(args.gaze_estimation_model, args.device)
    GEM_load_t = time.time() - start4
    print('All models are loaded!')

    # Check the inputs
    # To make the mouse moving we need video stream either from camera or video path
    if args.input.lower() == 'cam':
        input_feeder = InputFeeder(input_type='cam', input_file=args.input)
    else:
        if not os.path.isfile(args.input):
            log.error("Please insert valid video path to run the app.")
            exit()
        input_feeder = InputFeeder(input_type='video', input_file=args.input)

    # Load the video capture
    input_feeder.load_data()

    # Defaults so the summary below still prints when no frame (or no face)
    # was ever processed.
    FDM_infer_t = HPE_infer_t = FLD_infer_t = GEM_infer_t = 0.0

    try:
        # Read from the video capture
        for flag, frame in input_feeder.next_batch():
            if not flag:
                break
            key_pressed = cv2.waitKey(60)

            # Run inference on the models
            start5 = time.time()
            face_coords = FDM_net.predict(frame)
            FDM_infer_t = time.time() - start5

            # Everything depends on the face detection output. This check
            # must come before the crop, which indexes face_coords.
            if len(face_coords) == 0:
                log.error("There is no faces detected.")
                if key_pressed == 27:
                    break
                continue

            # crop the face from the frame
            cropped_face = frame[face_coords[1]:face_coords[3],
                                 face_coords[0]:face_coords[2]]

            start6 = time.time()
            HP_angles = HPE_net.predict(cropped_face, face_coords)
            HPE_infer_t = time.time() - start6

            # The output frame is needed by the landmarks model and by
            # imshow regardless of display_flag, so always create it.
            O_frame = frame.copy()

            if args.display_flag:
                #### display the face
                cv2.rectangle(O_frame,
                              (face_coords[0], face_coords[1]),
                              (face_coords[2], face_coords[3]),
                              (255, 255, 0), 2)

                #### display the pose angles
                # Link for pose estimation output code resource: https://sudonull.com/post/6484-Intel-OpenVINO-on-Raspberry-Pi-2018-harvest
                cos_r = cos(HP_angles[2] * pi / 180)
                sin_r = sin(HP_angles[2] * pi / 180)
                cos_y = cos(HP_angles[0] * pi / 180)
                sin_y = sin(HP_angles[0] * pi / 180)
                cos_p = cos(HP_angles[1] * pi / 180)
                sin_p = sin(HP_angles[1] * pi / 180)

                # Axes are drawn from the centre of the face box.
                x = int((face_coords[0] + face_coords[2]) / 2)
                y = int((face_coords[1] + face_coords[3]) / 2)

                cv2.line(O_frame, (x, y),
                         (x + int(65 * (cos_r * cos_y + sin_y * sin_p * sin_r)),
                          y + int(65 * cos_p * sin_r)),
                         (255, 0, 0), thickness=2)
                cv2.line(O_frame, (x, y),
                         (x + int(65 * (cos_r * sin_y * sin_p + cos_y * sin_r)),
                          y - int(65 * cos_p * cos_r)),
                         (0, 255, 0), thickness=2)
                cv2.line(O_frame, (x, y),
                         (x + int(65 * sin_y * cos_p),
                          y + int(65 * sin_p)),
                         (0, 0, 255), thickness=2)

            start7 = time.time()
            l_e, r_e, l_e_image, r_e_image, e_center = FLD_net.predict(
                O_frame, cropped_face, face_coords)
            FLD_infer_t = time.time() - start7

            ### display landmarks for both eyes
            if args.display_flag:
                cv2.circle(O_frame,
                           (face_coords[0] + l_e[0], face_coords[1] + l_e[1]),
                           29, (0, 255, 255), 2)
                cv2.circle(O_frame,
                           (face_coords[0] + r_e[0], face_coords[1] + r_e[1]),
                           29, (0, 255, 255), 2)

            start8 = time.time()
            g_vec = GEM_net.predict(l_e_image, r_e_image, HP_angles)
            GEM_infer_t = time.time() - start8

            ### display gaze model output
            if args.display_flag:
                cv2.arrowedLine(O_frame,
                                (int(e_center[0][0]), int(e_center[0][1])),
                                (int(e_center[0][0]) + int(g_vec[0] * 90),
                                 int(e_center[0][1]) + int(-g_vec[1] * 90)),
                                (203, 192, 255), 2)
                cv2.arrowedLine(O_frame,
                                (int(e_center[1][0]), int(e_center[1][1])),
                                (int(e_center[1][0]) + int(g_vec[0] * 90),
                                 int(e_center[1][1]) + int(-g_vec[1] * 90)),
                                (203, 192, 255), 2)

            # change the pointer position according to the estimated gaze direction
            mouse_controller.move(g_vec[0], g_vec[1])

            if key_pressed == 27:
                break

            # Display the resulting frame
            cv2.imshow('Mouse Controller App Results',
                       cv2.resize(O_frame, (750, 550)))

        print("Loading time: \n1-Face detection: " + str(FDM_load_t) +
              "\n2- Head pose estimation: " + str(HPE_load_t) +
              "\n3-Facial landmarks model: " + str(FLD_load_t) +
              "\n4-Gaze estimation model: " + str(GEM_load_t))
        print("Output inference time: \n1-Face detection: " + str(FDM_infer_t) +
              "\n2- Head pose estimation: " + str(HPE_infer_t) +
              "\n3-Facial landmarks model: " + str(FLD_infer_t) +
              "\n4-Gaze estimation model: " + str(GEM_infer_t))
    finally:
        # close the input feeder and destroy all opened windows; the window
        # teardown was previously referenced without being called.
        input_feeder.close()
        cv2.destroyAllWindows()
def _timing_report_lines(times, counter_frames):
    """Format the eight accumulated stage timings as per-frame averages (ms).

    Args:
        times: Sequence of eight accumulated durations in seconds, ordered
            preprocess/inference for face detection, facial landmarks,
            head pose and gaze estimation.
        counter_frames: Number of frames the totals were accumulated over.

    Returns:
        A list of eight ready-to-print strings.
    """
    labels = (
        "Preprocess Face Detection",
        "Inference Face Detection",
        "Preprocess Facial Landmarks",
        "Inference Facial Landmarks",
        "Preprocess Head Pose",
        "Inference Head Pose",
        "Preprocess Gaze Estimation",
        "Inference Gaze Estimation",
    )
    return [
        label + ": " + str(total / counter_frames * 1000) + " ms"
        for label, total in zip(labels, times)
    ]


def main(args):
    """Drive the mouse pointer from gaze predictions over the chosen media.

    Reads ``get_perf_counts``, ``precision``, ``speed``, ``media_type``,
    ``media_file``, ``show_video``, ``batch_size``, ``device`` and
    ``iterations`` from ``args``. For every frame the combined model is run,
    the pointer is moved, and (optionally) timing statistics are drawn on the
    frame and printed. Press ``q`` to stop and ``i`` to toggle the preview.

    Args:
        args: Parsed command-line namespace.
    """
    # Any value other than "true" disables the statistics; previously an
    # unexpected value left ``perf_counts`` unbound and crashed later.
    perf_counts = args.get_perf_counts.lower() == "true"
    precision = args.precision.lower()
    speed = args.speed.lower()
    media_type = args.media_type.lower()
    media_path = args.media_file
    toggle_ui = args.show_video
    print(toggle_ui)
    batch_size = args.batch_size
    device = args.device
    iterations = 1 if media_type == "cam" else int(args.iterations)

    # Initialize the mouse object
    mouse = MouseController(precision, speed)

    # Initialize the input feeder
    feed = InputFeeder(media_type, batch_size, media_path)

    # Initialize and load the inference models
    model = Model(face_detection, facial_landmarks, gaze_estimation,
                  head_pose_estimation, device)
    model.load_models()

    # Same test the original applied on every frame (only an actual True
    # enables the preview), evaluated once so it can be toggled afterwards.
    show_frames = toggle_ui == True  # noqa: E712

    for _ in range(iterations):
        feed.load_data()

        # Running totals of preprocessing / inference time per model.
        times = np.zeros((8, ))
        counter_frames = 0

        if media_type != "image":
            width = feed.cap.get(3)
            height = feed.cap.get(4)
        else:
            height, width, _ = feed.cap.shape

        try:
            for frame in feed.next_batch(media_type):
                counter_frames += 1

                # Generates the prediction
                x, y, gaze_vector, times = model.predict(
                    frame, width, height, times)

                # Generates the movement on the cursor
                mouse.move(x, y)

                if perf_counts:
                    report = _timing_report_lines(times, counter_frames)
                    for row, line in enumerate(report, start=1):
                        cv2.putText(frame, line, (0, 50 * row),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1,
                                    (209, 80, 0), 3)
                    for line in report:
                        print(line)

                if show_frames:
                    cv2.imshow("Frame", frame)

                # Poll the keyboard once per frame: two separate waitKey
                # calls would each consume a different key event.
                key = cv2.waitKey(1) & 0xFF
                if key == ord('q'):
                    break
                if key == ord('i'):
                    # The original assigned to a differently-cased name
                    # (toggle_UI), raising NameError that the bare except
                    # reported as the end of the video.
                    show_frames = not show_frames
        except Exception:
            # Best effort: the feeder/model are expected to fail once the
            # stream runs out of frames.
            print("Video has ended or couldn't continue")

        # Skip the summary when no frame was processed (nothing to average).
        if perf_counts and counter_frames:
            print("Final average: ")
            for line in _timing_report_lines(times, counter_frames):
                print(line)

        # NOTE(review): the original nesting of this call is not
        # recoverable; closing once per iteration pairs with load_data().
        feed.close()

    cv2.destroyAllWindows()
def main():
    """Run the four-model gaze pipeline and steer the mouse pointer.

    Validates the input and model paths, loads the models, then processes the
    feed frame by frame. Optional previews are selected through
    ``args.flags_checker`` ('fad', 'hpe', 'fld', 'gae'). When the feed ends
    (or ESC is pressed) inference time, FPS and model load time are written
    to ``<out_path>stats.txt``.
    """
    args = get_args().parse_args()
    path_filender = args.input
    four_flags = args.flags_checker
    loger = logging.getLogger()
    out_path = args.out_path

    if path_filender.lower() == "cam":
        feeder_in = InputFeeder("cam")
    else:
        if not os.path.isfile(path_filender):
            loger.error("The video was not found")
            exit(1)
        feeder_in = InputFeeder("video", path_filender)

    model_locations = {
        'FaceDetection': args.face_detection_model,
        'HeadPoseEstimation': args.head_pose_estimation_model,
        'FacialLandmarksDetection': args.facial_landmarks_detection_model,
        'GazeEstimation': args.gaze_estimation_model
    }
    for key_name, location in model_locations.items():
        if not os.path.isfile(location):
            loger.error("The system cannot find the " + key_name +
                        " xml file")
            exit(1)

    dt = FaceDetection(model_locations['FaceDetection'], args.device,
                       args.cpu_extension)
    pe = HeadPoseEstimation(model_locations['HeadPoseEstimation'],
                            args.device, args.cpu_extension)
    ld = FacialLandmarksDetection(model_locations['FacialLandmarksDetection'],
                                  args.device, args.cpu_extension)
    ge = GazeEstimation(model_locations['GazeEstimation'], args.device,
                        args.cpu_extension)
    cursor = MouseController('medium', 'fast')

    feeder_in.load_data()

    model_load_time_start = time.time()
    dt.load_model()
    pe.load_model()
    ld.load_model()
    ge.load_model()
    total_load_time = time.time() - model_load_time_start

    frame_counter = 0
    inference_time_start = time.time()
    for ret, frame in feeder_in.next_batch():
        if not ret:
            break
        frame_counter = frame_counter + 1

        # The original guarded this with "frame_counter % 1 == 0", which is
        # always true, so every frame is shown.
        cv2.imshow('video', cv2.resize(frame, (600, 600)))
        key = cv2.waitKey(60)

        face_detected, coords_face = dt.predict(frame, args.p_th)
        # The detector signals "no face" by returning an int instead of a crop.
        if isinstance(face_detected, int):
            loger.error("The system cannot detect any face.")
            if key == 27:
                break
            continue

        head_pose_output = pe.predict(face_detected)
        eye_left_detect, eye_right_detect, eye_coordinates_detect = ld.predict(
            face_detected)
        coordi_update_pointer, coordi_gaze = ge.predict(
            eye_left_detect, eye_right_detect, head_pose_output)

        if len(four_flags) != 0:
            result_app = frame
            if 'fad' in four_flags:
                result_app = face_detected
            if 'hpe' in four_flags:
                cv2.putText(
                    result_app,
                    "HP Angles: YAW:{:.3f} * PITCH:{:.3f} * ROLL:{:.3f}".
                    format(head_pose_output[0], head_pose_output[1],
                           head_pose_output[2]), (5, 40),
                    cv2.FONT_HERSHEY_COMPLEX, 0.25, (153, 76, 0), 0)
            if 'fld' in four_flags:
                # One box per eye, padded by 4 px on every side.
                for eye_box in (eye_coordinates_detect[0],
                                eye_coordinates_detect[1]):
                    cv2.rectangle(face_detected,
                                  (eye_box[0] - 4, eye_box[1] - 4),
                                  (eye_box[2] + 4, eye_box[3] + 4),
                                  (255, 255, 0), 4)
            if 'gae' in four_flags:
                x = int(coordi_gaze[0] * 2)
                y = int(coordi_gaze[1] * 2)
                w = 150
                # Draw an X through each eye crop, then paste the crops back.
                right_E = cv2.line(eye_right_detect, (x - w, y - w),
                                   (x + w, y + w), (51, 255, 153), 1)
                cv2.line(right_E, (x - w, y + w), (x + w, y - w),
                         (51, 255, 253), 1)
                left_E = cv2.line(eye_left_detect, (x - w, y - w),
                                  (x + w, y + w), (51, 255, 153), 1)
                cv2.line(left_E, (x - w, y + w), (x + w, y - w),
                         (51, 255, 253), 1)
                right_box = eye_coordinates_detect[1]
                left_box = eye_coordinates_detect[0]
                face_detected[right_box[1]:right_box[3],
                              right_box[0]:right_box[2]] = right_E
                face_detected[left_box[1]:left_box[3],
                              left_box[0]:left_box[2]] = left_E
            cv2.imshow("Result of the App", cv2.resize(result_app, (600, 600)))

        # Only move the pointer every fifth frame.
        if frame_counter % 5 == 0:
            cursor.move(coordi_update_pointer[0], coordi_update_pointer[1])
        if key == 27:
            break

    total_time = time.time() - inference_time_start
    total_time_for_inference = round(total_time, 1)
    # Rounding can yield 0.0 for very short runs; avoid ZeroDivisionError.
    if total_time_for_inference:
        fps = frame_counter / total_time_for_inference
    else:
        fps = 0.0

    with open(out_path + 'stats.txt', 'w') as f:
        f.write('Inference time: ' + str(total_time_for_inference) + '\n')
        f.write('FPS: ' + str(fps) + '\n')
        f.write('Model load time: ' + str(total_load_time) + '\n')

    loger.error("The video stream is over...")
    cv2.destroyAllWindows()
    feeder_in.close()
class MoveMouse:
    '''
    Main Class for the Mouse Controller app.

    This is the class where all the models are stitched together to control the mouse pointer
    '''

    def __init__(self, args):
        '''
        Builds and loads the four models, the mouse controller and the
        input feeder, recording how long initialization and loading take.

        Args:
        args = All arguments parsed by the arguments parser function

        Return:
        None
        '''
        init_start_time = time.time()
        self.output_path = args.output_path
        self.show_output = args.show_output
        self.total_processing_time = 0
        self.count_batch = 0
        self.inference_speed = []
        self.avg_inference_speed = 0

        # A non-CPU "all devices" setting overrides every per-model device.
        if args.all_devices != 'CPU':
            args.face_device = args.all_devices
            args.face_landmark_device = args.all_devices
            args.head_pose_device = args.all_devices
            args.gaze_device = args.all_devices

        model_init_start = time.time()
        self.face_model = FaceDetection(args.face_model, args.face_device,
                                        args.face_device_ext,
                                        args.face_prob_threshold)
        self.landmarks_model = FacialLandmarksDetection(
            args.face_landmark_model, args.face_landmark_device,
            args.face_landmark_device_ext, args.face_landmark_prob_threshold)
        self.head_pose_model = HeadPoseEstimation(
            args.head_pose_model, args.head_pose_device,
            args.head_pose_device_ext, args.head_pose_prob_threshold)
        self.gaze_model = GazeEstimation(args.gaze_model, args.gaze_device,
                                         args.gaze_device_ext,
                                         args.gaze_prob_threshold)
        self.model_init_time = time.time() - model_init_start
        log.info('[ Main ] All required models initiallized')

        self.mouse_control = MouseController(args.precision, args.speed)
        log.info('[ Main ] Mouse controller successfully initialized')

        self.input_feeder = InputFeeder(args.batch_size, args.input_type,
                                        args.input_file)
        log.info('[ Main ] Initialized input feeder')

        model_load_start = time.time()
        self.face_model.load_model()
        self.landmarks_model.load_model()
        self.head_pose_model.load_model()
        self.gaze_model.load_model()
        self.model_load_time = time.time() - model_load_start
        self.app_init_time = time.time() - init_start_time
        log.info('[ Main ] All moadels loaded to Inference Engine\n')

    def draw_face_box(self, frame, face_coords):
        '''
        Draws the bounding box of the first detected face on the input frame

        Args:
        frame = Input frame from video or camera feed. It could also be an input image
        face_coords = Detected boxes; only face_coords[0] (x0, y0, x1, y1) is drawn

        Return:
        frame = Frame with bounding box of the face drawn on it
        '''
        start_point = (face_coords[0][0], face_coords[0][1])
        end_point = (face_coords[0][2], face_coords[0][3])
        thickness = 5
        color = (255, 86, 0)
        frame = cv2.rectangle(frame, start_point, end_point, color, thickness)
        return frame

    def draw_eyes_boxes(self, frame, left_eye_coords, right_eye_coords):
        '''
        Draws the bounding boxes of both eyes on the input frame

        Args:
        frame = Input frame from video or camera feed. It could also be an input image
        left_eye_coords = (x0, y0, x1, y1) box of the left eye
        right_eye_coords = (x0, y0, x1, y1) box of the right eye

        Return:
        frame = Frame with bounding box of left and right eyes drawn on it
        '''
        thickness = 5
        color = (0, 210, 0)
        for box in (left_eye_coords, right_eye_coords):
            frame = cv2.rectangle(frame, (box[0], box[1]), (box[2], box[3]),
                                  color, thickness)
        return frame

    def draw_outputs(self, frame):
        '''
        Draws the face box, both eye boxes and three lines of running
        statistics (batch id, average speed, total time) onto the frame.

        Args:
        frame = Input frame from video or camera feed. It could also be an input image

        Return:
        frame = Frame with all inference outputs drawn on it
        '''
        frame = self.draw_face_box(frame, self.face_coords)
        frame = self.draw_eyes_boxes(frame, self.left_eye_coords,
                                     self.right_eye_coords)
        overlay_lines = (
            f'Batch id = {self.count_batch}',
            f'Avg. inference speed = {self.avg_inference_speed:.3f}fps',
            f'Total infer. time = {self.total_processing_time:.3f}s',
        )
        # Text rows are stacked 15 px apart starting at y = 15.
        for row, text in enumerate(overlay_lines, start=1):
            cv2.putText(frame, text, (15, 15 * row),
                        cv2.FONT_HERSHEY_COMPLEX, 0.45, (255, 86, 0), 1)
        return frame

    def _write_stats(self):
        '''
        Overwrites outputs.txt in the output directory with the current
        timing statistics and the most recent model outputs.
        '''
        with open(os.path.join(self.output_path, 'outputs.txt'), 'w+') as f:
            f.write('INFERENCE STATS\n')
            f.write(
                f'Total model initialization time : {self.model_init_time:.2f}s\n'
            )
            f.write(f'Total model load time: {self.model_load_time:.2f}s\n')
            f.write(f'App initialization time: {self.app_init_time:.2f}s\n')
            f.write(
                f'Total processing time: {self.total_processing_time:.2f}s\n')
            f.write(
                f'Average inference speed: {self.avg_inference_speed:.2f}FPS\n'
            )
            f.write(f'Batch count: {self.count_batch}\n\n')
            f.write('LAST OUTPUTS\n')
            f.write(f'Face coordinates: {self.face_coords}\n')
            f.write(f'Left eye coordinates: {self.left_eye_coords}\n')
            f.write(f'Right eye coordinates: {self.right_eye_coords}\n')
            f.write(f'Head pose angles: {self.head_pose_angles}\n')
            f.write(
                f'Relative pointer coordinates/ Gaze vector: [{self.x:.2f}, {self.y:.2f}]'
            )

    def run_inference(self, frame):
        '''
        Performs inference on the input video or image by passing it through
        all four models to get the desired coordinates for moving the mouse
        pointer.

        Args:
        frame = Unused; frames are pulled from the input feeder instead.
                Kept so existing callers keep working.

        Return:
        None
        '''
        self.input_feeder.load_data()
        for frame in self.input_feeder.next_batch():
            # A falsy frame_flag means the feed has no more frames.
            if not self.input_feeder.frame_flag:
                self.input_feeder.close()
                cv2.destroyAllWindows()
                log.info(
                    f'[ Main ] All input Batches processed in {self.total_processing_time:.2f}s'
                )
                log.info('[ Main ] Shutting down app...')
                log.info('[ Main ] Mouse controller app has been shut down.')
                break

            log.info('[ Main ] Started processing a new batch')
            start_inference = time.time()
            self.face_coords, self.face_crop = self.face_model.predict(frame)

            if self.face_coords == []:
                log.info(
                    '[ Main ] No face detected.. Waiting for you to stare at the camera'
                )
                # The original called f.write() here, but no file is open at
                # this point (NameError, or a write to a closed file after
                # the first batch). Report through the logger instead.
                log.error('[ Error ] No face was detected')
                continue

            self.head_pose_angles = self.head_pose_model.predict(
                self.face_crop)
            (self.left_eye_coords, self.left_eye_image,
             self.right_eye_coords,
             self.right_eye_image) = self.landmarks_model.predict(
                 self.face_crop)
            self.x, self.y = self.gaze_model.predict(self.left_eye_image,
                                                     self.right_eye_image,
                                                     self.head_pose_angles)
            log.info(
                f'[ Main ] Relative pointer coordinates: [{self.x:.2f}, {self.y:.2f}]'
            )

            batch_process_time = time.time() - start_inference
            self.total_processing_time += batch_process_time
            self.count_batch += 1
            log.info(
                f'[ Main ] Finished processing batch. Time taken = {batch_process_time}s\n'
            )

            self.mouse_control.move(self.x, self.y)

            if self.show_output:
                self.draw_outputs(frame)
                cv2.imshow('Computer Pointer Controller Output', frame)
                # HighGUI only repaints inside waitKey; without it the
                # window stays blank.
                cv2.waitKey(1)

            self.inference_speed.append(self.count_batch /
                                        self.total_processing_time)
            self.avg_inference_speed = sum(self.inference_speed) / len(
                self.inference_speed)
            self._write_stats()
        return
def main():
    """Process a video or camera stream and move the mouse along the gaze.

    Loads the four models, then for each frame: detects the face, estimates
    head pose, eye positions and gaze, converts them to a mouse vector and
    moves the pointer. ``args.visualise`` selects an optional preview
    ("FACE", "EYES" or "GAZE"). ESC stops processing.
    """
    args = build_argparser().parse_args()
    logging.basicConfig(filename='../outputs/logging.log',
                        level=logging.DEBUG)

    # get the model objects
    faceDetObj = Model_FaceDetection(model_name=args.facedetecionmodel,
                                     device=args.device,
                                     extensions=args.cpu_extension,
                                     threshold=args.prob_threshold)
    headPoseObj = Model_HeadPoseEstimation(
        model_name=args.headposeestimationmodel,
        device=args.device,
        extensions=args.cpu_extension)
    facialLandmarkObj = Model_FacialLandmarkDetection(
        model_name=args.faciallandmarksdetectionmodel,
        device=args.device,
        extensions=args.cpu_extension)
    gazeEstimationObj = Model_GazeEstimation(
        model_name=args.gazeestimationnmodel,
        device=args.device,
        extensions=args.cpu_extension)

    # load the models
    faceDetObj.load_model()
    headPoseObj.load_model()
    facialLandmarkObj.load_model()
    gazeEstimationObj.load_model()

    # check if we have video or cam stream
    stream = "cam" if args.input.upper() == "CAM" else "video"

    # get the InputFeeder and MouseController objects
    feedObj = InputFeeder(input_type=stream, input_file=args.input)
    MouseControllerObj = MouseController(precision='high', speed='fast')

    view_mode = args.visualise.upper()

    # start processing the video or cam stream frames
    frame_count = 0
    feedObj.load_data()
    try:
        for flag, frame in feedObj.next_batch():
            if not flag:
                break
            key_pressed = cv2.waitKey(60)
            # The key was read but never acted on; honour ESC as the other
            # entry points in this file do.
            if key_pressed == 27:
                break
            frame_count += 1

            coords, cropped_faces = faceDetObj.predict(frame)
            # faces are sorted starting with the highest probability; an int
            # in the first slot means no face was detected
            best_face = cropped_faces[0]
            if isinstance(best_face, int):
                logging.info(
                    "FaceDetection did not detect any face - skipping the frame {}"
                    .format(frame_count))
                continue

            head_angles = headPoseObj.predict(best_face)
            eyes, eyes_coords = facialLandmarkObj.predict(best_face)
            gaze_vec = gazeEstimationObj.predict(eyes[0], eyes[1],
                                                 head_angles)

            # get the mouse pointer coordinates
            mouse_vec = calculate_mouse_vector(head_angles, gaze_vec)

            # visualise the outputs
            if view_mode == "FACE":
                cv2.imshow("detected face", best_face)
            elif view_mode == "EYES":
                pix = 15
                eyes_image = best_face.copy()
                # index 0 is the left eye, index 1 the right eye
                for eye in (eyes_coords[0], eyes_coords[1]):
                    eye_x = eye[0]
                    eye_y = eye[1]
                    eyes_image = cv2.rectangle(eyes_image,
                                               (eye_x - pix, eye_y - pix),
                                               (eye_x + pix, eye_y + pix),
                                               (0, 55, 255), 1)
                cv2.imshow("detected eyes", eyes_image)
            elif view_mode == "GAZE":
                # NOTE(review): the returned image was never displayed in
                # the original; presumably visualise_vector shows it itself.
                visualise_vector(eyes_coords, best_face, mouse_vec)

            # move mouse pointer
            MouseControllerObj.move(mouse_vec[0], mouse_vec[1])
    finally:
        # The feeder was never released before; close it on every exit path.
        feedObj.close()
        cv2.destroyAllWindows()
def _crop_square(frame, center, half_size=20):
    """Return the square patch around ``center``, clipped to the frame origin.

    Negative slice bounds would wrap around to the far side of the array, so
    both bounds are clamped at zero.

    Args:
        frame: Image array indexed as ``frame[row, column]``.
        center: ``(x, y)`` pixel position of the patch centre.
        half_size: Half of the patch edge length in pixels.
    """
    center_x, center_y = center[0], center[1]
    top = max(center_y - half_size, 0)
    bottom = max(center_y + half_size, 0)
    left = max(center_x - half_size, 0)
    right = max(center_x + half_size, 0)
    return frame[top:bottom, left:right]


def main(args):
    """Run face, head-pose, landmark and gaze inference and move the mouse.

    Reads ``device``, ``mouse_precision``, ``mouse_speed``, the four model
    paths (``modelF``, ``modelG``, ``modelH``, ``modelL``), ``input_type``
    and ``input_file`` from ``args``. Image input is written to
    ``output_image.jpg``; video frames are passed to ``feed.write``.

    Args:
        args: Parsed command-line namespace.
    """
    device = args.device
    precision, speed = args.mouse_precision, args.mouse_speed
    mouse = MouseController(precision=precision, speed=speed)

    # get paths to the models
    modelF, modelG, modelH, modelL = (args.modelF, args.modelG, args.modelH,
                                      args.modelL)
    face = Model_Face(modelF, device)
    gaze = Model_Gaze(modelG, device)
    headpose = Model_HeadPose(modelH, device)
    landmarks = Model_Landmarks(modelL, device)
    face.load_model()
    gaze.load_model()
    headpose.load_model()
    landmarks.load_model()

    input_type, input_file = args.input_type, args.input_file
    feed = InputFeeder(input_type=input_type, input_file=input_file)
    vframe_shape = feed.load_data()

    logging.info("Please wait. Processing inference...")

    try:
        # Run inference on four models and get outputs
        for batch in feed.next_batch():
            # A missing frame marks the end of the stream; stop instead of
            # failing on batch.copy().
            if batch is None:
                break
            frame_copy = batch.copy()

            # face: bounding box coordinates
            frame4infer_f = face.preprocess_input(batch)
            face_output = face.predict(frame4infer_f)
            f_preprocessed_output = face.preprocess_output(
                face_output, vframe_shape)
            xmin, ymin, xmax, ymax = f_preprocessed_output

            # headpose: yaw, pitch and roll head pose angles
            # NOTE(review): this runs on the full frame, not the face ROI -
            # confirm that is intended.
            frame4infer_h = headpose.preprocess_input(batch)
            headpose_output = headpose.predict(frame4infer_h)
            h_preprocessed_output = headpose.preprocess_output(
                headpose_output)

            # landmarks: run on the face region of interest
            roi = batch[ymin:ymax, xmin:xmax]
            frame4infer_l = landmarks.preprocess_input(roi)
            landmarks_output = landmarks.predict(frame4infer_l)
            l_preprocessed_output = landmarks.preprocess_output(
                landmarks_output, f_preprocessed_output)
            (right_eye, left_eye, nose, right_lip_corner,
             left_lip_corner) = l_preprocessed_output

            # gaze: 40x40 eye patches, clipped at the frame border
            r_eye_crop = _crop_square(batch, right_eye)
            l_eye_crop = _crop_square(batch, left_eye)
            re_blob4infer_g = gaze.preprocess_input(r_eye_crop, 're')
            le_blob4infer_g = gaze.preprocess_input(l_eye_crop, 'le')
            hp_blob4infer_g = gaze.preprocess_input(
                np.array(h_preprocessed_output), 'hp')
            gaze_output = gaze.predict(re_blob4infer_g, le_blob4infer_g,
                                       hp_blob4infer_g)
            g_preprocessed_output = gaze.preprocess_output(
                gaze_output, l_preprocessed_output, vframe_shape)

            # Get mouse pointer position and move the pointer
            x, y = g_preprocessed_output
            mouse.move(x, y)

            if input_type == "image":
                cv2.imwrite("output_image.jpg", frame_copy)
                logging.info(" ! Got output image!")
            if input_type == 'video':
                feed.write(frame_copy)
    finally:
        # Release the feed even when a model call raises.
        feed.close()
    logging.info("End of the processing.")
def _precision_from_model_path(model_path):
    """Extract the precision digits that follow "FP" in a model path.

    Works with both backslash and forward-slash separators, e.g.
    ``models\\FP16\\net.xml`` and ``models/FP16/net.xml`` both give ``"16"``.
    Returns ``"?"`` when the path contains no "FP" component.
    """
    _, marker, tail = model_path.partition("FP")
    if not marker:
        return "?"
    return tail.replace("\\", "/").split("/")[0]


def inference(args):
    """Run the gaze pipeline on the input feed and log timing statistics.

    Loads the four models (paths taken from ``args.face_model``,
    ``args.landmark_model``, ``args.head_model`` and ``args.gaze_model``),
    shows each frame in a fullscreen preview, moves the mouse according to
    the gaze estimate, and logs per-model average inference times to
    ``result.log``. ESC stops processing.

    Args:
        args: Parsed command-line namespace; also supplies ``input_type``
            and ``input_file``.
    """
    time_sheet = {
        'face_infr': [],
        'landmark_infr': [],
        'head_infr': [],
        'gaze_infr': [],
        'infr_per_frame': []
    }

    logging.basicConfig(filename='result.log', level=logging.INFO)
    logging.info(
        "================================================================================="
    )
    # The original parsing only handled backslash separators and raised
    # IndexError for paths without an "FP" component.
    logging.info(
        "Precision(face,landmark,head,gaze): FP32-INT1,FP{0},FP{1},FP{2}".
        format(_precision_from_model_path(args.landmark_model),
               _precision_from_model_path(args.head_model),
               _precision_from_model_path(args.gaze_model)))

    model_load_start = time.time()
    face_detection = FaceDetection(args.face_model)
    face_detection.load_model()
    landmark_regression = LandmarkRegression(args.landmark_model)
    landmark_regression.load_model()
    head_pose = HeadPose(args.head_model)
    head_pose.load_model()
    gaze_estimation = GazeEstimation(args.gaze_model)
    gaze_estimation.load_model()
    logging.info("4 models load time: {0:.4f}sec".format(time.time() -
                                                         model_load_start))

    mouse_controller = MouseController('high', 'fast')

    cv2.namedWindow('preview', cv2.WND_PROP_FULLSCREEN)
    cv2.setWindowProperty('preview', cv2.WND_PROP_FULLSCREEN,
                          cv2.WINDOW_FULLSCREEN)

    input_feeder = InputFeeder(args.input_type, args.input_file)
    input_feeder.load_data()

    try:
        total_infr_start = time.time()
        for image in input_feeder.next_batch():
            if image is None:
                break

            face_infr_start = time.time()
            face_image = face_detection.predict(image)
            time_sheet['face_infr'].append(time.time() - face_infr_start)

            landmark_infr_start = time.time()
            left_eye_image, right_eye_image = landmark_regression.predict(
                np.copy(face_image))
            time_sheet['landmark_infr'].append(time.time() -
                                               landmark_infr_start)

            head_infr_start = time.time()
            head_pose_angles = head_pose.predict(np.copy(face_image))
            time_sheet['head_infr'].append(time.time() - head_infr_start)

            gaze_infr_start = time.time()
            x, y, z = gaze_estimation.predict(left_eye_image,
                                              right_eye_image,
                                              head_pose_angles)
            time_sheet['gaze_infr'].append(time.time() - gaze_infr_start)

            # Whole-frame time spans all four models.
            time_sheet['infr_per_frame'].append(time.time() -
                                                face_infr_start)

            cv2.imshow('preview', image)
            mouse_controller.move(x, y)

            key = cv2.waitKey(20)
            if key == 27:  # exit on ESC
                break

        logging.info("Face model avg inference per frame: {0:.4f}sec".format(
            np.mean(time_sheet['face_infr'])))
        logging.info(
            "Landmark model avg inference per frame: {0:.4f}sec".format(
                np.mean(time_sheet['landmark_infr'])))
        logging.info("Head model avg inference per frame: {0:.4f}sec".format(
            np.mean(time_sheet['head_infr'])))
        logging.info("Gaze model avg inference per frame: {0:.4f}sec".format(
            np.mean(time_sheet['gaze_infr'])))
        logging.info("4 Model avg inference per frame: {0:.4f}sec".format(
            np.mean(time_sheet['infr_per_frame'])))
        logging.info("Total inference time: {0:.4f}sec".format(
            time.time() - total_infr_start))
        logging.info(
            "====================================END==========================================\n"
        )
    finally:
        # Release the feed and the fullscreen window on every exit path.
        input_feeder.close()
        cv2.destroyAllWindows()
def main():
    """Run the gaze pipeline, move the mouse and write a benchmark file.

    Each model's ``predict`` returns its result followed by four timings
    (model load, input, inference, output). The timings are summed per model
    and their per-frame means are written to
    ``<args.output>/stats_<args.precision>.txt``. With ``args.display`` set,
    annotated frames are shown and written to ``output_video.mp4``.
    """
    args = build_argparser().parse_args()
    logging.basicConfig(filename=args.output + '/app.log', filemode='w')

    print("Begin: Try not to move mouse with your hands")
    mc = MouseController("low", "fast")

    if args.input == "cam":
        frames = InputFeeder("cam")
    else:
        frames = InputFeeder("video", args.input)
    cap = frames.load_data()

    out_video = None
    if args.display:
        initial_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        initial_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = int(cap.get(cv2.CAP_PROP_FPS))
        out_video = cv2.VideoWriter(
            os.path.join(args.output, 'output_video.mp4'),
            cv2.VideoWriter_fourcc('m', 'p', '4', 'v'), fps,
            (initial_w, initial_h))

    face_model = FaceDetectionModel(args.face_model, args.output, args.device)
    pose_model = HeadPoseEstimationModel(args.pose_model, args.output,
                                         args.device)
    landmarks_model = FacialLandmarksDetectionModel(args.landmarks_model,
                                                    args.output, args.device)
    gaze_model = GazeEstimationModel(args.gaze_model, args.output,
                                     args.device)

    # Per model: summed [model load, input, inference, output] times.
    totals = {
        "face": [0, 0, 0, 0],
        "pose": [0, 0, 0, 0],
        "landmarks": [0, 0, 0, 0],
        "gaze": [0, 0, 0, 0],
    }
    staged_frames = 0  # frames where face, pose and landmarks all ran
    gaze_frames = 0  # frames that also produced a gaze vector

    def accumulate(model_key, sample):
        sums = totals[model_key]
        for slot, value in enumerate(sample):
            sums[slot] += value

    def mean(total, count):
        # NaN instead of ZeroDivisionError when nothing was measured.
        return total / count if count else float("nan")

    logging.info("Frames starting")
    try:
        for frame in frames.next_batch():
            if frame is None:
                # The original concatenated the None frame into the message
                # (TypeError); a missing frame means the stream is over.
                logging.error("Frame: " + str(frame) + "failed")
                break

            output_image = frame.copy()
            cropped_faces, *face_times = face_model.predict(frame)

            try:
                # First largest crop wins, as with the original strict "<".
                largest_face = max(cropped_faces, key=lambda crop: crop.size)
                pose, *pose_times = pose_model.predict(largest_face)
                landmarks, *landmark_times = landmarks_model.predict(
                    largest_face)
                gaze_vector, *gaze_times = gaze_model.predict(
                    largest_face, landmarks, pose)
            except Exception as e:
                logging.error("Model inference failed: " + str(e))
                continue

            if args.display:
                output_image, xmin, ymin = face_model.draw_crop_outputs(
                    output_image, args.display)
                output_image = gaze_model.display_eye_boxes(
                    output_image, landmarks, xmin, ymin, args.display)
                out_video.write(output_image)
                cv2.imshow("output_image", output_image)
                cv2.waitKey(15)

            # NOTE(review): assumed to run for every frame, not only when
            # displaying - the original nesting is not recoverable.
            face_model.coords = []

            accumulate("face", face_times)
            accumulate("pose", pose_times)
            accumulate("landmarks", landmark_times)
            staged_frames += 1

            if gaze_vector is None:
                continue

            accumulate("gaze", gaze_times)
            gaze_frames += 1

            gaze_vector_norm = gaze_vector / np.linalg.norm(gaze_vector)
            try:
                mc.move(gaze_vector_norm[0], gaze_vector_norm[1])
            except Exception as e:
                logging.error("Gaze failed: " + str(e))
                continue
    finally:
        # The writer was never released before, which can leave the mp4
        # unfinalized.
        if out_video is not None:
            out_video.release()
        frames.close()

    file_name = "stats_" + args.precision + ".txt"
    save_path = os.path.join(os.getcwd(), args.output)
    labels = ("Total model Load Time:", "Total Input Time:",
              "Total Inference Time:", "Total Output Time:")
    # Each section is averaged over exactly the frames that fed its sums.
    # The original divided gaze sums by (avg - avg_out) and the other
    # sections by the gaze-only frame count.
    sections = (
        ("Face Detection Model stats", totals["face"], staged_frames),
        ("Head Pose Estimation Model stats", totals["pose"], staged_frames),
        ("Facial Landmarks Detection Model stats", totals["landmarks"],
         staged_frames),
        ("Gaze Estimation Model stats", totals["gaze"], gaze_frames),
    )
    with open(os.path.join(save_path, file_name), "w") as f:
        f.write("Benchmark Start:" + "\n\n")
        for title, sums, count in sections:
            f.write(title + "\n")
            rows = [
                label + str(mean(total, count))
                for label, total in zip(labels, sums)
            ]
            f.write("\n".join(rows) + "\n\n")
        f.write("Benchmark end" + "\n")

    print("Thank you, Goodbye")
def benchmark(args):
    """Time model loading and one inference pass, then plot both.

    Each of the four models is constructed and loaded while being timed.
    The first batch from the video feed is then pushed through the pipeline
    with every ``predict`` call timed, the load and inference times are
    handed to ``plot_loading_time`` / ``plot_inf_time`` together with
    ``args.b``, and the loop stops after that single batch.

    Args:
        args: Parsed command-line namespace (``t``, ``l``, ``fm``, ``lm``,
            ``hm``, ``gm``, ``d``, ``p``, ``e``, ``b``).
    """

    def load_timed(build):
        # Construct and load a model, returning it with the elapsed seconds.
        began = time.time()
        model = build()
        model.load_model()
        return model, time.time() - began

    def predict_timed(model, *inputs):
        # Run one prediction, returning its result with the elapsed seconds.
        began = time.time()
        outcome = model.predict(*inputs)
        return outcome, time.time() - began

    print("runing benchmark")
    input_type = args.t
    video_path = args.l

    face_detect, face_lt = load_timed(
        lambda: face_detection(args.fm, args.d, args.p, args.e))
    landmarks_model, landmark_lt = load_timed(
        lambda: LandmarksDetection(args.lm, args.d, args.e))
    head_pose, head_pose_lt = load_timed(
        lambda: Head_Pose(args.hm, args.d, args.e))
    gaze_estimation, gaze_lt = load_timed(
        lambda: Gaze_Estimation(args.gm, args.d, args.e))

    feed = InputFeeder(input_type='video', input_file=video_path)
    feed.load_data()

    model_names = ['Face_detect', 'landmark_detect', 'Head_pose_est',
                   'Gaze est']

    for batch in feed.next_batch():
        cropped_face, face_inf_time = predict_timed(face_detect, batch)
        (left_eye_crop, right_eye_crop), landmark_inf_time = predict_timed(
            landmarks_model, cropped_face)
        head_angles, head_pose_inf_time = predict_timed(
            head_pose, cropped_face)
        (x, y), gaze_inf_time = predict_timed(gaze_estimation, left_eye_crop,
                                              right_eye_crop, head_angles)

        # plotting load_time
        plot_loading_time(model_names,
                          [face_lt, landmark_lt, head_pose_lt, gaze_lt],
                          args.b)

        # plotting inference_time
        plot_inf_time(model_names, [
            face_inf_time, landmark_inf_time, head_pose_inf_time,
            gaze_inf_time
        ], args.b)

        logging.info("Benchmarking done!")
        # Only the first batch is benchmarked.
        break

    feed.close()
def main():
    """Run the gaze pipeline on a camera or video feed and move the mouse.

    For each non-empty frame the face, head pose, landmarks and gaze are
    predicted, an optional side-by-side visualization is shown
    (``args.visualization``), and the pointer is moved every fifth frame.
    ESC stops processing. Any failure while handling a frame prints the
    original user-facing message, logs the traceback and exits.
    """
    args = build_argparser().parse_args()
    frame_num = 0

    # Initialize the Inference Engine
    fd = FaceDetection()
    fld = Facial_Landmarks_Detection()
    ge = Gaze_Estimation()
    hp = Head_Pose_Estimation()

    # Load Models
    fd.load_model(args.face_detection_model, args.device, args.cpu_extension)
    fld.load_model(args.facial_landmark_model, args.device,
                   args.cpu_extension)
    ge.load_model(args.gaze_estimation_model, args.device, args.cpu_extension)
    hp.load_model(args.head_pose_model, args.device, args.cpu_extension)

    # Mouse Controller precision and speed
    mc = MouseController('medium', 'fast')

    # feed input from an image, webcam, or video to model
    if args.input == "cam":
        feed = InputFeeder("cam")
    else:
        assert os.path.isfile(args.input), "Specified input file doesn't exist"
        feed = InputFeeder("video", args.input)
    feed.load_data()

    frame_count = 0
    try:
        for frame in feed.next_batch():
            frame_count += 1
            if frame is None:
                continue
            try:
                key = cv2.waitKey(60)

                # Start timing after the 60 ms key wait; the original
                # logged the wait itself as "INFERENCE TIME".
                inf_start = time.time()
                detected_face, face_coords = fd.predict(
                    frame.copy(), args.prob_threshold)
                hp_output = hp.predict(detected_face.copy())
                left_eye, right_eye, eye_coords = fld.predict(
                    detected_face.copy())
                new_mouse_coord, gaze_vector = ge.predict(
                    left_eye, right_eye, hp_output)
                det_time = time.time() - inf_start

                # Visualization
                if args.visualization:
                    preview_frame = frame.copy()
                    face_frame = detected_face.copy()
                    draw_face_bbox(preview_frame, face_coords)
                    display_hp(preview_frame, hp_output, face_coords)
                    draw_landmarks(face_frame, eye_coords)
                    draw_gaze(face_frame, gaze_vector, left_eye.copy(),
                              right_eye.copy(), eye_coords)
                    img = np.hstack((cv2.resize(preview_frame, (500, 500)),
                                     cv2.resize(face_frame, (500, 500))))
                else:
                    img = cv2.resize(frame, (500, 500))
                cv2.imshow('Visualization', img)

                # set speed: only move the pointer every fifth frame
                if frame_count % 5 == 0:
                    mc.move(new_mouse_coord[0], new_mouse_coord[1])

                # INFO
                log.info("NUMBER OF FRAMES: {} ".format(frame_num))
                log.info("INFERENCE TIME: {}ms".format(det_time * 1000))
                frame_num += 1

                if key == 27:
                    break
            except Exception:
                # A bare except also swallowed KeyboardInterrupt and hid the
                # real cause; keep the message but record the traceback.
                log.exception("Frame processing failed")
                print(
                    'Not supported image or video file format. Please send in a supported video format.'
                )
                exit()
    finally:
        # Runs on normal completion, ESC, and exit() alike.
        feed.close()
def main(args):
    """Run the gaze pipeline on a stream and move the mouse with the result.

    Args:
        args: parsed command-line namespace.  The attributes read here are
            ``t`` (input type, ``'cam'`` selects the webcam), ``l`` (input
            file path), ``f`` (overlay flag: ``'FD'``, ``'FL'``, ``'HP'`` or
            ``'GE'``), and ``d`` / ``p`` / ``e`` which are forwarded to the
            model constructors.

    Raises:
        SystemExit: if the input file is missing or the feed cannot be
            loaded.
    """
    input_type = args.t
    input_files = args.l
    flags = args.f

    face_detect = Face_Detection(face_model_path, args.d, args.p, args.e)
    face_detect.load_model()

    landmarks_model = LandmarksDetection(landmarks_model_path, args.d, args.e)
    landmarks_model.load_model()

    head_pose = Head_Pose(hpose_model_path, args.d, args.e)
    head_pose.load_model()

    gaze_estimation = Gaze_Estimation(gaze_model_path, args.d, args.e)
    gaze_estimation.load_model()

    # Both branches must bind the same name; the loop below uses `feed`.
    if input_type == 'cam':
        feed = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_files):
            logging.error("Could not find the input file")
            raise SystemExit(1)
        feed = InputFeeder(input_type='video', input_file=input_files)

    try:
        feed.load_data()
    except Exception:
        logging.error("Could not load data from input file", exc_info=True)
        # Nothing useful can be done with a feed that failed to load.
        raise SystemExit(1)

    # Built once; the controller does not depend on the frame.
    mouse = MouseController(precision='low', speed='fast')

    try:
        for batch in feed.next_batch():
            if batch is None:
                break

            predicted = False
            try:
                cropped_face, coords = face_detect.predict(batch)
                # The detector signals "no face" by returning an int.
                if isinstance(cropped_face, int):
                    logging.info("Face not detected")
                else:
                    (cropped_left_eye, cropped_right_eye,
                     left_eye_cord, right_eye_cord) = landmarks_model.predict(
                        cropped_face)
                    head_angles = head_pose.predict(cropped_face)
                    x, y = gaze_estimation.predict(
                        cropped_left_eye, cropped_right_eye, head_angles)
                    predicted = True
            except Exception:
                logging.error("An error occured while running predictions",
                              exc_info=True)

            # Draw and move only with results from THIS frame.
            if predicted:
                if flags == 'FD':
                    cv2.rectangle(batch, (coords[0], coords[1]),
                                  (coords[2], coords[3]), (255, 0, 0), 3)
                elif flags == 'FL':
                    cv2.rectangle(cropped_face,
                                  (left_eye_cord[0], left_eye_cord[1]),
                                  (left_eye_cord[2], left_eye_cord[3]),
                                  (255, 0, 0), 3)
                    cv2.rectangle(cropped_face,
                                  (right_eye_cord[0], right_eye_cord[1]),
                                  (right_eye_cord[2], right_eye_cord[3]),
                                  (255, 0, 0), 3)
                elif flags == 'HP':
                    cv2.putText(
                        batch,
                        "Head angles: yaw={:.2f} , pitch={:.2f}, roll={:.2f}".format(
                            head_angles[0], head_angles[1], head_angles[2]),
                        (20, 40), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 0, 255), 2)
                elif flags == 'GE':
                    # Centre of each eye box, then a segment along the gaze
                    # direction (y is negated before being added to the
                    # image row coordinate).
                    left_eye_mid_x = (left_eye_cord[2] - left_eye_cord[0]) / 2 + left_eye_cord[0]
                    left_eye_mid_y = (left_eye_cord[3] - left_eye_cord[1]) / 2 + left_eye_cord[1]
                    right_eye_mid_x = (right_eye_cord[2] - right_eye_cord[0]) / 2 + right_eye_cord[0]
                    right_eye_mid_y = (right_eye_cord[3] - right_eye_cord[1]) / 2 + right_eye_cord[1]

                    left_eye_new_x = int(left_eye_mid_x + x * 160)
                    left_eye_new_y = int(left_eye_mid_y + y * 160 * -1)
                    right_eye_new_x = int(right_eye_mid_x + x * 160)
                    right_eye_new_y = int(right_eye_mid_y + y * 160 * -1)

                    cv2.line(cropped_face,
                             (int(left_eye_mid_x), int(left_eye_mid_y)),
                             (left_eye_new_x, left_eye_new_y),
                             (255, 0, 255), 5)
                    cv2.line(cropped_face,
                             (int(right_eye_mid_x), int(right_eye_mid_y)),
                             (right_eye_new_x, right_eye_new_y),
                             (255, 0, 255), 5)

                mouse.move(x, y)

            # Always show the frame and poll the keyboard so ESC works even
            # on frames without a face or with a failed prediction.
            batch = imutils.resize(batch, width=500)
            cv2.imshow('frame', batch)
            key = cv2.waitKey(1) & 0xFF
            if key == 27:  # ESC
                break
    finally:
        feed.close()
def main():
    """Run the gaze pipeline, record the frames, and write timing stats.

    Parses the command line, validates the input and model paths, loads the
    four models, then processes the frames from ``InputFeeder``.  Each
    processed frame is shown in a preview window and written to
    ``output_video.mp4``; ``mouse_controller.move`` is called on every fifth
    frame.  Pressing ESC stops the loop.  Afterwards ``stats.txt`` is written
    inside ``args.output_path`` with three lines: total inference time, FPS,
    and model load time.

    Raises:
        SystemExit: if the input video or any model file is missing.
    """
    args = build_argparser().parse_args()
    logger = logging.getLogger('main')

    model_path_dict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'FacialLandmarksModel': args.facialLandmarksModel,
        'HeadPoseEstimationModel': args.headPoseEstimationModel,
        'GazeEstimationModel': args.gazeEstimationModel
    }

    bbox_flag = args.bbox_flag
    input_filename = args.input
    device_name = args.device
    prob_threshold = args.prob_threshold
    output_path = args.output_path

    if input_filename.lower() == 'cam':
        feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            logger.error("Unable to find specified video file")
            raise SystemExit(1)
        feeder = InputFeeder(input_type='video', input_file=input_filename)

    for model_path in model_path_dict.values():
        if not os.path.isfile(model_path):
            logger.error("Unable to find specified model file: %s", model_path)
            raise SystemExit(1)

    face_detection_model = Face_detection(
        model_path_dict['FaceDetectionModel'], device_name,
        threshold=prob_threshold)
    facial_landmarks_detection_model = Landmark_Detection(
        model_path_dict['FacialLandmarksModel'], device_name,
        threshold=prob_threshold)
    head_pose_estimation_model = Head_pose(
        model_path_dict['HeadPoseEstimationModel'], device_name,
        threshold=prob_threshold)
    gaze_estimation_model = Gaze_estimation(
        model_path_dict['GazeEstimationModel'], device_name,
        threshold=prob_threshold)

    # Flip to True to skip mouse control while measuring throughput.
    is_benchmarking = False
    mouse_controller = None
    if not is_benchmarking:
        mouse_controller = MouseController('medium', 'fast')

    start_model_load_time = time.time()
    face_detection_model.load_model()
    facial_landmarks_detection_model.load_model()
    head_pose_estimation_model.load_model()
    gaze_estimation_model.load_model()
    total_model_load_time = time.time() - start_model_load_time

    feeder.load_data()

    # NOTE(review): the writer is opened at a fixed 1920x1080 while the raw
    # frame is written below - confirm the input really has that size.
    out_video = cv2.VideoWriter(
        'output_video.mp4', cv2.VideoWriter_fourcc(*'avc1'),
        max(1, int(feeder.get_fps() / 10)), (1920, 1080), True)

    frame_count = 0
    start_inference_time = time.time()
    try:
        for ret, frame in feeder.next_batch():
            if not ret:
                break
            frame_count += 1
            key = cv2.waitKey(60)

            try:
                face_coords, image_copy = face_detection_model.predict(frame)
                # The detector signals "no face" by returning an int.
                if isinstance(image_copy, int):
                    logger.warning("Unable to detect the face")
                    if key == 27:
                        break
                    continue

                left_eye, right_eye, eye_coords = \
                    facial_landmarks_detection_model.predict(image_copy)
                hp_output = head_pose_estimation_model.predict(image_copy)
                mouse_coords, gaze_coords = gaze_estimation_model.predict(
                    left_eye, right_eye, hp_output)
            except Exception as e:
                logger.warning("Could not predict using model" + str(e) +
                               " for frame " + str(frame_count))
                # Still honour ESC on frames that fail.
                if key == 27:
                    break
                continue

            image = cv2.resize(frame, (500, 500))
            if bbox_flag:
                bbox_frame = draw_bbox(frame, bbox_flag, image_copy, left_eye,
                                       right_eye, face_coords, eye_coords,
                                       hp_output, gaze_coords)
                image = np.hstack((cv2.resize(frame, (500, 500)),
                                   cv2.resize(bbox_frame, (500, 500))))

            cv2.imshow('preview', image)
            out_video.write(frame)

            if frame_count % 5 == 0 and not is_benchmarking:
                mouse_controller.move(mouse_coords[0], mouse_coords[1])

            if key == 27:  # ESC
                break

        total_time = time.time() - start_inference_time
    finally:
        # Release the writer so the container is finalized, even on error.
        out_video.release()
        cv2.destroyAllWindows()
        feeder.close()

    total_inference_time = round(total_time, 1)
    # A very short run rounds to 0.0; avoid dividing by zero.
    fps = frame_count / total_inference_time if total_inference_time else 0.0

    try:
        os.makedirs(output_path, exist_ok=True)
    except OSError as error:
        logger.error(error)

    # os.path.join works whether or not output_path ends with a separator.
    with open(os.path.join(output_path, 'stats.txt'), 'w') as f:
        f.write(str(total_inference_time) + '\n')
        f.write(str(fps) + '\n')
        f.write(str(total_model_load_time) + '\n')

    logger.info('Model load time: ' + str(total_model_load_time))
    logger.info('Inference time: ' + str(total_inference_time))
    logger.info('FPS: ' + str(fps))
    logger.info('Video stream ended')
def infer_on_stream(args):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.

    Loads the four models (logging each load time), then for every frame
    runs face detection and, for the first detected face only, landmarks,
    head pose and gaze estimation.  Depending on ``args.no_video`` and
    ``args.no_move`` the annotated frame is shown and ``mc.move`` is called.
    Pressing ESC stops the loop.  Average per-model inference times are
    logged at the end.  Any exception is logged and swallowed; the feeder
    and the OpenCV windows are released in every case.

    :param args: Command line arguments parsed by `build_argparser()`
    :return: None
    """
    feeder = None
    try:
        logging.basicConfig(level=logging.INFO,
                            format="%(asctime)s [%(levelname)s] %(message)s",
                            handlers=[
                                logging.FileHandler("gaze-app.log"),
                                logging.StreamHandler()
                            ])

        # Initialise the classes
        mc = MouseController("low", "fast")

        fdnet = FaceDetection(args.fdmodel)
        lmnet = FacialLandmarks(args.lmmodel)
        hpnet = HeadPoseEstimation(args.hpmodel)
        genet = GazeEstimation(args.gemodel)

        ### Load the models, timing each one ###
        logging.info("============== Models Load time ===============")
        load_plan = (
            (fdnet, "Face Detection Model: {:.1f}ms",
             "Face Detection estimation layers loaded correctly"),
            (lmnet, "Facial Landmarks Detection Model: {:.1f}ms",
             "Facial Landmarks estimation layers loaded correctly"),
            (hpnet, "Headpose Estimation Model: {:.1f}ms",
             "Head pose estimation layers loaded correctly"),
            (genet, "Gaze Estimation Model: {:.1f}ms",
             "Gaze estimation layers loaded correctly"),
        )
        for net, timing_msg, checked_msg in load_plan:
            start_time = time.time()
            net.load_model()
            logging.info(timing_msg.format(1000 * (time.time() - start_time)))
            net.check_model()
            logging.info(checked_msg)
        logging.info("============== End =====================")

        # Get and open video capture
        feeder = InputFeeder('video', args.input)
        feeder.load_data()

        frame_count = 0
        fd_infertime = 0
        lm_infertime = 0
        hp_infertime = 0
        ge_infertime = 0

        ### Loop until stream is over ###
        # Iterate one generator instead of creating a new one per frame.
        for frame in feeder.next_batch():
            # A None frame means there is nothing left to read.
            if frame is None:
                break

            key_pressed = cv2.waitKey(60)
            frame_count += 1

            # face detection
            fd_process_time = time.time()
            p_frame = fdnet.preprocess_input(frame)
            start_time = time.time()
            fnoutput = fdnet.predict(p_frame)
            fd_infertime += time.time() - start_time
            out_frame, fboxes = fdnet.preprocess_output(
                fnoutput, frame, args.print)
            logging.info(
                "Face Detection Model processing time : {:.1f}ms".format(
                    1000 * (time.time() - fd_process_time)))

            # consider only the first detected face in the frame
            fbox = next(iter(fboxes), None)
            if fbox is not None:
                # fbox = (xmin,ymin,xmax,ymax); crop face from frame
                face = frame[fbox[1]:fbox[3], fbox[0]:fbox[2]]

                # get face landmarks
                lm_process_time = time.time()
                p_frame = lmnet.preprocess_input(face)
                start_time = time.time()
                lmoutput = lmnet.predict(p_frame)
                lm_infertime += time.time() - start_time
                out_frame, left_eye_point, right_eye_point = \
                    lmnet.preprocess_output(lmoutput, fbox, out_frame,
                                            args.print)
                logging.info(
                    "Landmarks model processing time : {:.1f}ms".format(
                        1000 * (time.time() - lm_process_time)))

                # get head pose estimation
                hp_process_time = time.time()
                p_frame = hpnet.preprocess_input(face)
                start_time = time.time()
                hpoutput = hpnet.predict(p_frame)
                hp_infertime += time.time() - start_time
                out_frame, headpose_angels = hpnet.preprocess_output(
                    hpoutput, out_frame, face, fbox, args.print)
                logging.info(
                    "Headpose estimation model processing time : {:.1f}ms".
                    format(1000 * (time.time() - hp_process_time)))

                # get gaze estimation
                gaze_process_time = time.time()
                out_frame, left_eye, right_eye = genet.preprocess_input(
                    out_frame, face, left_eye_point, right_eye_point,
                    args.print)
                start_time = time.time()
                geoutput = genet.predict(left_eye, right_eye, headpose_angels)
                ge_infertime += time.time() - start_time
                out_frame, gazevector = genet.preprocess_output(
                    geoutput, out_frame, fbox, left_eye_point,
                    right_eye_point, args.print)
                logging.info(
                    "Gaze estimation model processing time : {:.1f}ms".format(
                        1000 * (time.time() - gaze_process_time)))

                if not args.no_video:
                    cv2.imshow('im', out_frame)

                if not args.no_move:
                    mc.move(gazevector[0], gazevector[1])

            # Break if escape key pressed
            if key_pressed == 27:
                break

        # logging inference times (averaged over all frames read)
        if frame_count > 0:
            logging.info(
                "============== Models Inference time ===============")
            logging.info("Face Detection:{:.1f}ms".format(
                1000 * fd_infertime / frame_count))
            logging.info("Facial Landmarks Detection:{:.1f}ms".format(
                1000 * lm_infertime / frame_count))
            logging.info("Headpose Estimation:{:.1f}ms".format(
                1000 * hp_infertime / frame_count))
            logging.info("Gaze Estimation:{:.1f}ms".format(
                1000 * ge_infertime / frame_count))
            logging.info("============== End ===============================")
    except Exception as ex:
        logging.exception("Error in inference:" + str(ex))
    finally:
        # Release the capture and destroy any OpenCV windows, even on error
        if feeder is not None:
            feeder.close()
        cv2.destroyAllWindows()