# Shared stdlib / third-party imports assumed by the entry points below.
# Each variant also assumes its own project modules (model wrappers,
# InputFeeder, MouseController, MetricsBuilder, build_argparser and the
# drawing helpers), which are not shown in this section.
import logging
import os
import pathlib
import sys
import time

import cv2
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine

log = logging  # several of the variants below log through the shorter 'log' alias


def main():
    args = build_argparser().parse_args()
    logger = logging.getLogger('main')

    # Initialize variables with the input arguments for easy access.
    model_path_dict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'FacialLandmarksModel': args.facialLandmarksModel,
        'HeadPoseEstimationModel': args.headPoseEstimationModel,
        'GazeEstimationModel': args.gazeEstimationModel
    }
    bbox_flag = args.bbox_flag
    input_filename = args.input
    device_name = args.device
    prob_threshold = args.prob_threshold
    output_path = args.output_path

    if input_filename.lower() == 'cam':
        feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            logger.error("Unable to find specified video file")
            exit(1)
        feeder = InputFeeder(input_type='video', input_file=input_filename)

    for model_path in list(model_path_dict.values()):
        if not os.path.isfile(model_path):
            logger.error("Unable to find specified model file " + str(model_path))
            exit(1)

    # Instantiate the four models of the gaze-estimation pipeline.
    face_detection_model = Face_detection(
        model_path_dict['FaceDetectionModel'], device_name,
        threshold=prob_threshold)
    facial_landmarks_detection_model = Landmark_Detection(
        model_path_dict['FacialLandmarksModel'], device_name,
        threshold=prob_threshold)
    head_pose_estimation_model = Head_pose(
        model_path_dict['HeadPoseEstimationModel'], device_name,
        threshold=prob_threshold)
    gaze_estimation_model = Gaze_estimation(
        model_path_dict['GazeEstimationModel'], device_name,
        threshold=prob_threshold)

    is_benchmarking = False
    if not is_benchmarking:
        mouse_controller = MouseController('medium', 'fast')

    # Load the models and record the total load time.
    start_model_load_time = time.time()
    face_detection_model.load_model()
    facial_landmarks_detection_model.load_model()
    head_pose_estimation_model.load_model()
    gaze_estimation_model.load_model()
    total_model_load_time = time.time() - start_model_load_time

    feeder.load_data()

    out_video = cv2.VideoWriter(os.path.join('output_video.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'),
                                int(feeder.get_fps() / 10), (1920, 1080), True)

    frame_count = 0
    start_inference_time = time.time()
    for ret, frame in feeder.next_batch():
        if not ret:
            break
        frame_count += 1
        key = cv2.waitKey(60)

        try:
            face_coords, image_copy = face_detection_model.predict(frame)
            if isinstance(image_copy, int):
                logger.warning("Unable to detect the face")
                if key == 27:
                    break
                continue
            left_eye, right_eye, eye_coords = facial_landmarks_detection_model.predict(
                image_copy)
            hp_output = head_pose_estimation_model.predict(image_copy)
            mouse_coords, gaze_coords = gaze_estimation_model.predict(
                left_eye, right_eye, hp_output)
        except Exception as e:
            logger.warning("Could not predict using model " + str(e) +
                           " for frame " + str(frame_count))
            continue

        image = cv2.resize(frame, (500, 500))
        if not len(bbox_flag) == 0:
            bbox_frame = draw_bbox(frame, bbox_flag, image_copy, left_eye,
                                   right_eye, face_coords, eye_coords,
                                   hp_output, gaze_coords)
            image = np.hstack((cv2.resize(frame, (500, 500)),
                               cv2.resize(bbox_frame, (500, 500))))
        cv2.imshow('preview', image)
        out_video.write(frame)

        # Move the pointer every fifth frame unless benchmarking.
        if frame_count % 5 == 0 and not is_benchmarking:
            mouse_controller.move(mouse_coords[0], mouse_coords[1])

        if key == 27:
            break

    total_time = time.time() - start_inference_time
    total_inference_time = round(total_time, 1)
    fps = frame_count / total_inference_time

    try:
        os.mkdir(output_path)
    except OSError as error:
        logger.error(error)

    with open(output_path + 'stats.txt', 'w') as f:
        f.write(str(total_inference_time) + '\n')
        f.write(str(fps) + '\n')
        f.write(str(total_model_load_time) + '\n')

    logger.info('Model load time: ' + str(total_model_load_time))
    logger.info('Inference time: ' + str(total_inference_time))
    logger.info('FPS: ' + str(fps))
    logger.info('Video stream ended')
    cv2.destroyAllWindows()
    feeder.close()
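# The entry points in this section all call build_argparser(), which is not
# included here. Below is a minimal sketch of what such a parser might look
# like for the first main(); the flag names, defaults and help strings are
# assumptions inferred from the attribute names used above, not the project's
# actual parser.
from argparse import ArgumentParser


def build_argparser():
    # Hypothetical parser matching the attributes accessed by the first main().
    parser = ArgumentParser()
    parser.add_argument('-fd', '--faceDetectionModel', required=True,
                        help='Path to the face detection model XML file')
    parser.add_argument('-fl', '--facialLandmarksModel', required=True,
                        help='Path to the facial landmarks model XML file')
    parser.add_argument('-hp', '--headPoseEstimationModel', required=True,
                        help='Path to the head pose estimation model XML file')
    parser.add_argument('-ge', '--gazeEstimationModel', required=True,
                        help='Path to the gaze estimation model XML file')
    parser.add_argument('-i', '--input', required=True,
                        help="Path to an input video file, or 'cam' for webcam")
    parser.add_argument('-d', '--device', default='CPU',
                        help='Target device: CPU, GPU, FPGA or MYRIAD')
    parser.add_argument('-pt', '--prob_threshold', type=float, default=0.6,
                        help='Probability threshold for face detection')
    parser.add_argument('-flags', '--bbox_flag', nargs='+', default=[],
                        help='Visualization flags for intermediate outputs')
    parser.add_argument('-o', '--output_path', default='results/',
                        help='Directory where stats.txt is written')
    return parser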
def run_app(options):
    metrics_builder = MetricsBuilder(options.precision)

    # Load the four models, timing each load with the elapsed_timer() helper.
    with elapsed_timer() as et:
        fdmodel = FaceDetectionModel(options.fdmodel, options.device,
                                     options.prob_threshold, options.is_visual,
                                     options.extension)
        fdmodel.load_model()
        fdmodel_loadtime = et()
    metrics_builder.face_detection.load_time = fdmodel_loadtime
    logging.info(f'Face detection loading time taken: {fdmodel_loadtime}')

    with elapsed_timer() as et:
        ldmodel = LandmarkDetectionModel(options.ldmodel, options.device,
                                         options.prob_threshold,
                                         options.is_visual, options.extension)
        ldmodel.load_model()
        ldmodel_loadtime = et()
    metrics_builder.landmarks_detection.load_time = ldmodel_loadtime
    logging.info(f'Landmark detection loading time taken: {ldmodel_loadtime}')

    with elapsed_timer() as et:
        hpemodel = HeadPoseEstimationModel(options.hpemodel, options.device,
                                           options.prob_threshold,
                                           options.is_visual, options.extension)
        hpemodel.load_model()
        hpemodel_loadtime = et()
    metrics_builder.head_pose_estimation.load_time = hpemodel_loadtime
    logging.info(f'Head pose estimation loading time taken: {hpemodel_loadtime}')

    with elapsed_timer() as et:
        gemodel = GazeEstimationModel(options.gemodel, options.device,
                                      options.prob_threshold, options.is_visual,
                                      options.extension)
        gemodel.load_model()
        gemodel_loadtime = et()
    metrics_builder.gaze_estimation.load_time = gemodel_loadtime
    logging.info(f'Gaze estimation loading time taken: {gemodel_loadtime}')

    try:
        # Get and open the video capture.
        if options.is_cam:
            feeder = InputFeeder('cam')
        else:
            feeder = InputFeeder('video', options.input)
        feeder.load_data()

        initial_w, initial_h = feeder.get_size()
        fps = feeder.get_fps()
        fdmodel.set_inputsize(initial_w, initial_h)
        ldmodel.set_inputsize(initial_w, initial_h)
        hpemodel.set_inputsize(initial_w, initial_h)
        gemodel.set_inputsize(initial_w, initial_h)

        frame_count = 0
        mouse_controller = MouseController("low", "fast")

        window_name = 'computer pointer controller'
        cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)
        cv2.resizeWindow(window_name, initial_w, initial_h)

        out_path = str(pathlib.Path('./results/output_video.mp4'))
        print(out_path)
        out_video = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc(*'avc1'),
                                    fps, (initial_w, initial_h), True)

        for frame in feeder.next_batch():
            if frame is None:
                break

            # Exit the video loop on the escape key.
            key_pressed = cv2.waitKey(60)
            if key_pressed == 27:
                break
            frame_count += 1

            # Detect the face.
            p_frame = fdmodel.preprocess_input(frame)
            with elapsed_timer() as et:
                fdmodel_output = fdmodel.predict(p_frame)
                metrics_builder.face_detection.add_infer_time(et())
            out_frame, fboxes = fdmodel.preprocess_output(fdmodel_output, frame)

            # Take the first face - (xmin, ymin, xmax, ymax).
            fbox = fboxes[0]

            # Landmarks estimation on the cropped face.
            # face = np.asarray(Image.fromarray(frame).crop(fbox))
            xmin, ymin, xmax, ymax = fbox
            face = frame[ymin:ymax, xmin:xmax]
            p_frame = ldmodel.preprocess_input(face)
            with elapsed_timer() as et:
                lmoutput = ldmodel.predict(p_frame)
                metrics_builder.landmarks_detection.add_infer_time(et())
            out_frame, left_eye_point, right_eye_point = ldmodel.preprocess_output(
                lmoutput, fbox, out_frame)

            # Head pose estimation.
            p_frame = hpemodel.preprocess_input(face)
            with elapsed_timer() as et:
                hpoutput = hpemodel.predict(p_frame)
                metrics_builder.head_pose_estimation.add_infer_time(et())
            out_frame, headpose_angels = hpemodel.preprocess_output(
                hpoutput, out_frame, face, fbox)

            # Gaze estimation from the eye crops and head pose angles.
            out_frame, left_eye, right_eye = gemodel.preprocess_input(
                out_frame, face, left_eye_point, right_eye_point)
            with elapsed_timer() as et:
                geoutput = gemodel.predict(headpose_angels, left_eye, right_eye)
                metrics_builder.gaze_estimation.add_infer_time(et())
            out_frame, gazevector = gemodel.preprocess_output(
                geoutput, out_frame, fbox, left_eye_point, right_eye_point)

            # Show the annotated frame.
            if options.is_show_frame:
                cv2.imshow(window_name, out_frame)

            # Move the mouse pointer along the gaze vector.
            if options.is_move_pointer:
                x, y, _ = gazevector
                mouse_controller.move(x, y)

            out_video.write(out_frame)

        # Performance metrics.
        metrics_builder.save_metrics(frame_count)

        feeder.close()
        cv2.destroyAllWindows()
    except Exception:
        logging.error("Fatal error in main loop", exc_info=True)
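# run_app() times each stage with an elapsed_timer() context manager that
# yields a callable returning the seconds elapsed so far. The helper itself is
# not shown in this section; the sketch below is one common way to implement
# the behaviour assumed by the calls above.
from contextlib import contextmanager
from timeit import default_timer


@contextmanager
def elapsed_timer():
    # Yield a zero-argument callable; each call returns the seconds elapsed
    # since the context was entered.
    start = default_timer()
    yield lambda: default_timer() - start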
def main():
    args = build_argparser().parse_args()

    # Initialize variables with the input arguments for easy access.
    fdm = args.face_detection_model
    ldm = args.facial_landmarks_detection_model
    hpem = args.head_pose_estimation_model
    gem = args.gaze_estimation_model
    output_flags = args.output_flags
    input_filename = args.input
    device_name = args.device
    prob_threshold = args.prob_threshold
    cpu_extension = args.cpu_extension

    if input_filename.lower() == 'cam':
        feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            log.error("Unable to find specified video file")
            exit(1)
        feeder = InputFeeder(input_type='video', input_file=input_filename)

    # Initialize the models.
    face_detection_model = FaceDetect(fdm, device_name, cpu_extension,
                                      prob_threshold)
    landmark_detection_model = FacialLandmarks(ldm, device_name, cpu_extension,
                                               prob_threshold)
    head_pose_estimation_model = HeadPose(hpem, device_name, cpu_extension,
                                          prob_threshold)
    gaze_estimation_model = GazeEstimation(gem, device_name, cpu_extension,
                                           prob_threshold)

    mouse_controller = MouseController('medium', 'fast')

    # Load the models, timing each one individually.
    start_model_load_time = time.time()
    face_detection_model.load_model()  # load face detection model
    log.info("Face Detection Model Loaded...")
    FDMT = time.time() - start_model_load_time

    start1 = time.time()
    landmark_detection_model.load_model()  # load landmark detection model
    log.info("Landmark Estimation Model Loaded...")
    LDMT = time.time() - start1

    start2 = time.time()
    head_pose_estimation_model.load_model()  # load head pose estimation model
    log.info("Head Pose Estimation Model Loaded...")
    # Note: the argument variables hpem and gem are reused below as load-time
    # accumulators, as in the original code.
    hpem = time.time() - start2

    start3 = time.time()
    gaze_estimation_model.load_model()  # load gaze estimation model
    log.info("Gaze Estimation Model Loaded...")
    gem = time.time() - start3

    total_time = time.time() - start_model_load_time

    feeder.load_data()

    # Check for output flags.
    if len(output_flags) != 0:
        for flag in output_flags:
            if flag not in ['fdm', 'lrm', 'hp', 'gze']:
                log.error("Flag '" + flag + "' is not a valid preview flag.")
                sys.exit(1)

    frame_count = 0
    start_inference_time = time.time()
    for ret, frame in feeder.next_batch():
        if not ret:
            break
        frame_count += 1
        key = cv2.waitKey(60)

        try:
            image, fc = face_detection_model.predict(frame, args.prob_threshold)
            if isinstance(image, int):
                log.warning("Unable to detect the face")
                if key == 27:
                    break
                continue

            if 'fdm' in output_flags:
                # cv2.rectangle(frame, (fc[0], fc[1]), (fc[2], fc[3]), 3)
                cv2.putText(frame, "face detected", (10, 140),
                            cv2.FONT_HERSHEY_COMPLEX, 2, (0, 0, 255), 4)

            # Predict landmarks on the cropped face returned by face detection.
            left_eye_image, right_eye_image, eye_coords = landmark_detection_model.predict(
                image)
            eye_buffer = 10
            if 'lrm' in output_flags:
                view_eye_rectangle(eye_coords, eye_buffer, image)

            # Predict head pose angles on the cropped face.
            pose_output = head_pose_estimation_model.predict(image)
            yaw = pose_output[0]
            pitch = pose_output[1]
            roll = pose_output[2]
            if 'hp' in output_flags:
                cv2.putText(
                    frame,
                    "Pose Angles: yaw:{:.2f}, pitch:{:.2f}, roll:{:.2f}".format(
                        yaw, pitch, roll), (10, 40),
                    cv2.FONT_HERSHEY_COMPLEX, 2, (0, 0, 0), 4)

            # Predict the gaze vector from the eye crops and head pose.
            mouse_coord, gaze_vector = gaze_estimation_model.predict(
                left_eye_image, right_eye_image, pose_output)
            if 'gze' in output_flags:
                cv2.putText(
                    frame,
                    "Gaze Coords: x= {:.2f} , y= {:.2f} , z= {:.2f}".format(
                        gaze_vector[0], gaze_vector[1], gaze_vector[2]),
                    (10, 90), cv2.FONT_HERSHEY_COMPLEX, 2, (0, 0, 255), 4)
        except Exception as e:
            log.warning("Could not predict using model " + str(e) +
                        " for frame " + str(frame_count))
            continue

        total_inference_time = time.time() - start_inference_time
        cv2.imshow("Visualization", cv2.resize(frame, (500, 500)))

        # Move the mouse pointer every fifth frame.
        if frame_count % 5 == 0:
            mouse_controller.move(mouse_coord[0], -1 * mouse_coord[1])
        if key == 27:
            break

    log.info("VideoStream ended...")
    print("total model load time is {:} ms".format(1000 * total_time / frame_count))
    print("fps is {:}".format(int(feeder.get_fps())))
    print("total inference time is {:} ms".format(1000 * total_inference_time / frame_count))
    print("fdmt loading time is {:} ms".format(1000 * FDMT / frame_count))
    print("ldmt loading time is {:} ms".format(1000 * LDMT / frame_count))
    print("hpem loading time is {:} ms".format(1000 * hpem / frame_count))
    print("gzem loading time is {:} ms".format(1000 * gem / frame_count))
    cv2.destroyAllWindows()
    feeder.close()
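# The 'lrm' preview branch above calls view_eye_rectangle(), which is not
# included in this section. Below is a minimal sketch of what such a helper
# might do; the layout of eye_coords ([[xmin, ymin, xmax, ymax], ...] per eye
# in face-crop coordinates) is an assumption, not the project's actual format.
def view_eye_rectangle(eye_coords, eye_buffer, image):
    # Draw a padded rectangle around each detected eye on the face crop.
    for (xmin, ymin, xmax, ymax) in eye_coords:
        cv2.rectangle(image,
                      (xmin - eye_buffer, ymin - eye_buffer),
                      (xmax + eye_buffer, ymax + eye_buffer),
                      (0, 255, 0), 2)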
def main():
    args = build_argparser().parse_args()
    logger = logging.getLogger('main')
    is_benchmarking = False
    total_score = 0

    # Initialize variables with the input arguments for easy access.
    model_path_dict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'LandmarkRegressionModel': args.landmarkRegressionModel,
        'HeadPoseEstimationModel': args.headPoseEstimationModel,
        'GazeEstimationModel': args.gazeEstimationModel
    }
    preview_flags = args.previewFlags
    input_filename = args.input
    device_name = args.device
    prob_threshold = args.prob_threshold
    output_path = args.output_path

    # Add paths for the exercise video data.
    exercise_video_path = '../bin/demo.mp4'
    exercise_gaze_path = '../bin/demo.csv'
    exercise_gaze_df = pd.read_csv(exercise_gaze_path)

    if input_filename.lower() == 'cam':
        feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            logger.error("Unable to find specified video file")
            exit(1)
        feeder = InputFeeder(input_type='video', input_file=input_filename)
    exercise_feeder = InputFeeder(input_type='video',
                                  input_file=exercise_video_path)

    for model_path in list(model_path_dict.values()):
        if not os.path.isfile(model_path):
            logger.error("Unable to find specified model file " + str(model_path))
            exit(1)

    # Instantiate the models.
    face_detection_model = FaceDetectionModel(
        model_path_dict['FaceDetectionModel'], device_name,
        threshold=prob_threshold)
    landmark_detection_model = LandmarkDetectionModel(
        model_path_dict['LandmarkRegressionModel'], device_name,
        threshold=prob_threshold)
    head_pose_estimation_model = HeadPoseEstimationModel(
        model_path_dict['HeadPoseEstimationModel'], device_name,
        threshold=prob_threshold)
    gaze_estimation_model = GazeEstimationModel(
        model_path_dict['GazeEstimationModel'], device_name,
        threshold=prob_threshold)

    # Load the models.
    start_model_load_time = time.time()
    face_detection_model.load_model()
    landmark_detection_model.load_model()
    head_pose_estimation_model.load_model()
    gaze_estimation_model.load_model()
    total_model_load_time = time.time() - start_model_load_time

    feeder.load_data()
    exercise_feeder.load_data()

    out_video = cv2.VideoWriter(os.path.join('output_video.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'),
                                int(feeder.get_fps() / 10), (1000, 500), True)

    frame_count = 0
    gaze_vectors = []
    start_inference_time = time.time()
    for ret, frame in feeder.next_batch():
        # Flip the image to make it similar to the exercise video image.
        frame = np.flip(frame, 1)
        ex_ret, ex_frame = next(exercise_feeder.next_batch())
        if not ret:
            break
        # This will stop the cam when the exercise video is over.
        if len(exercise_gaze_df) <= len(gaze_vectors):
            break
        frame_count += 1
        key = cv2.waitKey(60)

        try:
            face_cords, cropped_image = face_detection_model.predict(frame)
            if isinstance(cropped_image, int):
                logger.warning("Unable to detect the face")
                if key == 27:
                    break
                continue
            left_eye_image, right_eye_image, eye_cords = landmark_detection_model.predict(
                cropped_image)
            pose_output = head_pose_estimation_model.predict(cropped_image)
            mouse_cord, gaze_vector = gaze_estimation_model.predict(
                left_eye_image, right_eye_image, pose_output)
            gaze_vectors.append(gaze_vector)
        except Exception as e:
            logger.warning("Could not predict using model " + str(e) +
                           " for frame " + str(frame_count))
            continue

        if not len(preview_flags) == 0:
            preview_frame = draw_preview(frame, 'ff', cropped_image,
                                         left_eye_image, right_eye_image,
                                         face_cords, eye_cords, pose_output,
                                         gaze_vector)
            cropped_image = np.hstack((cv2.resize(ex_frame, (500, 500)),
                                       cv2.resize(preview_frame, (500, 500))))

        # Compare the user's gaze vector against the instructor's; note that
        # scipy's cosine() returns the cosine distance (1 - similarity).
        instructor_gaze_vector = exercise_gaze_df.iloc[frame_count - 1].values
        score = cosine(instructor_gaze_vector, gaze_vector)
        if score > 0.1:
            total_score += 1

        # Show the score on the output video.
        cv2.putText(
            ex_frame,
            "Instructor Gaze Vector: {} ".format(instructor_gaze_vector),
            (40, 60), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 0), 2)
        cv2.putText(ex_frame, "User Gaze Vector: {}".format(gaze_vector),
                    (40, 100), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 0), 2)
        cv2.putText(ex_frame, "Gaze Match Score : {}".format(total_score),
                    (40, 145), cv2.FONT_HERSHEY_COMPLEX, 1.5, (0, 0, 0), 2)
        ex_frame = cv2.rectangle(ex_frame, (20, 20), (1200, 160), (0, 0, 0), 2)

        image = np.hstack((cv2.resize(ex_frame, (500, 500)),
                           cv2.resize(cropped_image, (500, 500))))
        cv2.imshow('preview', image)
        out_video.write(image)

        if key == 27:
            break

    total_time = time.time() - start_inference_time
    total_inference_time = round(total_time, 1)
    fps = frame_count / total_inference_time

    if input_filename == "cam":
        filename = "cam.csv"
    else:
        filename = input_filename.split("/")[-1].split(".")[0] + ".csv"
    gaze_df = pd.DataFrame(gaze_vectors,
                           columns=['vector_x', 'vector_y', 'vector_z'])
    gaze_df.to_csv(filename, index=False)

    logger.info('Model load time: ' + str(total_model_load_time))
    logger.info('Inference time: ' + str(total_inference_time))
    logger.info('FPS: ' + str(fps))
    logger.info('Video stream ended')
    cv2.destroyAllWindows()
    feeder.close()
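# Note on the scoring above: scipy.spatial.distance.cosine() returns the
# cosine *distance* (1 - cosine similarity), so larger values mean the two
# gaze vectors point in more different directions. A small self-contained
# check of that relationship (illustration only, not part of the pipeline):
from scipy.spatial.distance import cosine
import numpy as np


def cosine_similarity(u, v):
    # Equivalent to 1 minus scipy's cosine distance.
    u, v = np.asarray(u, dtype=float), np.asarray(v, dtype=float)
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))


assert abs((1 - cosine([1, 0, 0], [1, 1, 0])) -
           cosine_similarity([1, 0, 0], [1, 1, 0])) < 1e-9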
def infer_on_stream(args):
    # Instantiate and load the models, timing the total load.
    start_model_load_time = time.time()
    face_det_net = Face_Detection_Model(args.face_model)
    face_det_net.load_model()
    head_pose_net = Head_Pose_Model(args.head_model)
    head_pose_net.load_model()
    facial_landmarks_net = Facial_Landmarks_Model(args.landmarks_model)
    facial_landmarks_net.load_model()
    gaze_est_net = Gaze_Estimation_Model(args.gaze_model)
    gaze_est_net.load_model()
    total_model_load_time = time.time() - start_model_load_time

    # Initiate the stream.
    counter = 0
    start_inference_time = time.time()
    if args.input.lower() == "cam":
        frame_feeder = InputFeeder(input_type='cam')
    else:
        frame_feeder = InputFeeder(input_type='video', input_file=args.input)
    frame_feeder.load_data()
    fps = frame_feeder.get_fps()
    log.info('Video started')

    # Initiate the mouse controller.
    mouse_controller = MouseController('medium', 'fast')

    # Write the output video on Windows.
    out_video = cv2.VideoWriter('../output.mp4', cv2.VideoWriter_fourcc(*'avc1'),
                                fps, frame_feeder.get_size(), True)
    # Write the output video on Linux:
    # out_video = cv2.VideoWriter('output.mp4', cv2.VideoWriter_fourcc(*'avc1'),
    #                             fps, frame_feeder.get_size())

    for flag, frame in frame_feeder.next_batch():
        if flag:
            key = cv2.waitKey(60)
            counter += 1
            coords, image, face = face_det_net.predict(frame)
            pose = head_pose_net.predict(face)
            land, left_eye_image, right_eye_image, eye_coords = facial_landmarks_net.predict(face)
            if left_eye_image.shape == (40, 40, 3):
                mouse_coords, gaze = gaze_est_net.predict(left_eye_image,
                                                          right_eye_image, pose)
                mouse_controller.move(mouse_coords[0], mouse_coords[1])
                if args.visual.lower() == "yes":
                    frame = draw_outputs(coords, eye_coords, pose, gaze,
                                         mouse_coords[0], mouse_coords[1], image)
                    cv2.imshow('video', frame)
                    out_video.write(frame)
                else:
                    cv2.imshow('video', frame)
            if key == 27:
                break
        else:
            # End of stream: report timing metrics and clean up.
            log.info('Video ended')
            total_time = time.time() - start_inference_time
            total_inference_time = round(total_time, 1)
            f_ps = counter / total_inference_time
            log.info("Models load time {:.2f}.".format(total_model_load_time))
            log.info("Total inference time {:.2f}.".format(total_inference_time))
            log.info("Inference frames per second {:.2f}.".format(f_ps))
            cv2.destroyAllWindows()
            frame_feeder.close()
            break
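# Every variant in this section reads frames through an InputFeeder helper
# that wraps cv2.VideoCapture. The class itself is not part of this section;
# the sketch below shows one minimal way to provide the interface used here
# (load_data, next_batch yielding (ret, frame), get_fps, get_size, close).
# Details such as the webcam index are assumptions.
import cv2


class InputFeeder:
    def __init__(self, input_type, input_file=None):
        self.input_type = input_type
        self.input_file = input_file
        self.cap = None

    def load_data(self):
        # Webcam index 0 is assumed for the 'cam' input type.
        source = 0 if self.input_type == 'cam' else self.input_file
        self.cap = cv2.VideoCapture(source)

    def next_batch(self):
        # Yield (ret, frame) pairs until the stream ends.
        while self.cap.isOpened():
            ret, frame = self.cap.read()
            yield ret, frame
            if not ret:
                break

    def get_fps(self):
        return int(self.cap.get(cv2.CAP_PROP_FPS))

    def get_size(self):
        return (int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))

    def close(self):
        self.cap.release()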
def main():
    args = build_argparser().parse_args()
    logger = logging.getLogger('main')
    is_benchmarking = False

    # Initialize variables with the input arguments for easy access.
    model_path_dict = {
        'FaceDetectionModel': args.faceDetectionModel,
        'LandmarkRegressionModel': args.landmarkRegressionModel,
        'HeadPoseEstimationModel': args.headPoseEstimationModel,
        'GazeEstimationModel': args.gazeEstimationModel
    }
    preview_flags = args.previewFlags
    input_filename = args.input
    device_name = args.device
    prob_threshold = args.prob_threshold
    output_path = args.output_path

    if input_filename.lower() == 'cam':
        feeder = InputFeeder(input_type='cam')
    else:
        if not os.path.isfile(input_filename):
            logger.error("Unable to find specified video file")
            exit(1)
        feeder = InputFeeder(input_type='video', input_file=input_filename)

    for model_path in list(model_path_dict.values()):
        if not os.path.isfile(model_path):
            logger.error("Unable to find specified model file " + str(model_path))
            exit(1)

    # Instantiate the models.
    face_detection_model = FaceDetectionModel(
        model_path_dict['FaceDetectionModel'], device_name,
        threshold=prob_threshold)
    landmark_detection_model = LandmarkDetectionModel(
        model_path_dict['LandmarkRegressionModel'], device_name,
        threshold=prob_threshold)
    head_pose_estimation_model = HeadPoseEstimationModel(
        model_path_dict['HeadPoseEstimationModel'], device_name,
        threshold=prob_threshold)
    gaze_estimation_model = GazeEstimationModel(
        model_path_dict['GazeEstimationModel'], device_name,
        threshold=prob_threshold)

    # Load the models.
    start_model_load_time = time.time()
    face_detection_model.load_model()
    landmark_detection_model.load_model()
    head_pose_estimation_model.load_model()
    gaze_estimation_model.load_model()
    total_model_load_time = time.time() - start_model_load_time

    feeder.load_data()

    out_video = cv2.VideoWriter(os.path.join('output_video.mp4'),
                                cv2.VideoWriter_fourcc(*'avc1'),
                                int(feeder.get_fps() / 10), (1920, 1080), True)

    frame_count = 0
    gaze_vectors = []
    start_inference_time = time.time()
    for ret, frame in feeder.next_batch():
        if not ret:
            break
        frame_count += 1
        key = cv2.waitKey(60)

        try:
            face_cords, cropped_image = face_detection_model.predict(frame)
            if isinstance(cropped_image, int):
                logger.warning("Unable to detect the face")
                if key == 27:
                    break
                continue
            left_eye_image, right_eye_image, eye_cords = landmark_detection_model.predict(
                cropped_image)
            pose_output = head_pose_estimation_model.predict(cropped_image)
            mouse_cord, gaze_vector = gaze_estimation_model.predict(
                left_eye_image, right_eye_image, pose_output)
            gaze_vectors.append(gaze_vector)
        except Exception as e:
            logger.warning("Could not predict using model " + str(e) +
                           " for frame " + str(frame_count))
            continue

        image = cv2.resize(frame, (500, 500))
        if not len(preview_flags) == 0:
            preview_frame = draw_preview(frame, preview_flags, cropped_image,
                                         left_eye_image, right_eye_image,
                                         face_cords, eye_cords, pose_output,
                                         gaze_vector)
            image = np.hstack((cv2.resize(frame, (500, 500)),
                               cv2.resize(preview_frame, (500, 500))))
        cv2.imshow('preview', image)
        out_video.write(frame)

        if key == 27:
            break

    total_time = time.time() - start_inference_time
    total_inference_time = round(total_time, 1)
    fps = frame_count / total_inference_time

    gaze_df = pd.DataFrame(gaze_vectors,
                           columns=['vector_x', 'vector_y', 'vector_z'])
    gaze_df.to_csv("gaze_vectors_exercise_video.csv", index=False)

    logger.info('Model load time: ' + str(total_model_load_time))
    logger.info('Inference time: ' + str(total_inference_time))
    logger.info('FPS: ' + str(fps))
    logger.info('Video stream ended')
    cv2.destroyAllWindows()
    feeder.close()
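# All of the entry points above move the pointer through a MouseController
# constructed with a precision and a speed setting. That class is not included
# in this section; the sketch below shows one plausible pyautogui-based
# implementation of the interface used here. The precision/speed mappings and
# the __main__ wiring are assumptions, not the project's actual code.
import pyautogui


class MouseController:
    def __init__(self, precision, speed):
        # Map the named settings to a pixel step and a move duration (seconds).
        precision_dict = {'high': 100, 'low': 1000, 'medium': 500}
        speed_dict = {'fast': 1, 'slow': 10, 'medium': 5}
        self.precision = precision_dict[precision]
        self.speed = speed_dict[speed]

    def move(self, x, y):
        # Move relative to the current pointer position; the gaze vector's y
        # axis points up, so it is negated for screen coordinates.
        pyautogui.moveRel(x * self.precision, -1 * y * self.precision,
                          duration=self.speed)


if __name__ == '__main__':
    # Typical entry point for one of the main() variants above (assumed wiring).
    logging.basicConfig(level=logging.INFO)
    main()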