def __init__( self, face_model="retinaface", landmark_model="mobilenet", au_model="rf", emotion_model="resmasknet", n_jobs=1, ): """Detector class to detect FEX from images or videos. Detector is a class used to detect faces, facial landmarks, emotions, and action units from images and videos. Args: n_jobs (int, default=1): Number of processes to use for extraction. Attributes: info (dict): n_jobs (int): Number of jobs to be used in parallel. face_model (str, default=retinaface): Name of face detection model landmark_model (str, default=mobilenet): Nam eof landmark model au_model (str, default=rf): Name of Action Unit detection model emotion_model (str, default=resmasknet): Path to emotion detection model. face_detection_columns (list): Column names for face detection ouput (x, y, w, h) face_landmark_columns (list): Column names for face landmark output (x0, y0, x1, y1, ...) emotion_model_columns (list): Column names for emotion model output mapper (dict): Class names for emotion model output by index. input_shape (dict) face_detector: face detector object face_landmark: face_landmark object emotion_model: emotion_model object Examples: >> detector = Detector(n_jobs=1) >> detector.detect_image("input.jpg") >> detector.detect_video("input.mp4") """ self.info = {} self.info["n_jobs"] = n_jobs if torch.cuda.is_available(): self.map_location = lambda storage, loc: storage.cuda() else: self.map_location = "cpu" """ LOAD UP THE MODELS """ print("Loading Face Detection model: ", face_model) # Check if model files have been downloaded. Otherwise download model. # get model url. with open(os.path.join(get_resource_path(), "model_list.json"), "r") as f: model_urls = json.load(f) if face_model: for url in model_urls["face_detectors"][ face_model.lower()]["urls"]: download_url(url, get_resource_path()) if landmark_model: for url in model_urls["landmark_detectors"][ landmark_model.lower()]["urls"]: download_url(url, get_resource_path()) if au_model: for url in model_urls["au_detectors"][au_model.lower()]["urls"]: download_url(url, get_resource_path()) if ".zip" in url: import zipfile with zipfile.ZipFile( os.path.join(get_resource_path(), "JAANetparams.zip"), 'r') as zip_ref: zip_ref.extractall(os.path.join(get_resource_path())) if au_model.lower() in ['logistic', 'svm', 'rf']: download_url( model_urls["au_detectors"]['hog-pca']['urls'][0], get_resource_path()) download_url( model_urls["au_detectors"]['au_scalar']['urls'][0], get_resource_path()) if emotion_model: for url in model_urls["emotion_detectors"][ emotion_model.lower()]["urls"]: download_url(url, get_resource_path()) if emotion_model.lower() in ['svm', 'rf']: download_url( model_urls["emotion_detectors"]['emo_pca']['urls'][0], get_resource_path()) download_url( model_urls["emotion_detectors"]['emo_scalar']['urls'] [0], get_resource_path()) if face_model: if face_model.lower() == "faceboxes": self.face_detector = FaceBoxes() elif face_model.lower() == "retinaface": self.face_detector = Retinaface_test.Retinaface() elif face_model.lower() == "mtcnn": self.face_detector = MTCNN() self.info["face_model"] = face_model facebox_columns = FEAT_FACEBOX_COLUMNS self.info["face_detection_columns"] = facebox_columns predictions = np.empty((1, len(facebox_columns))) predictions[:] = np.nan empty_facebox = pd.DataFrame(predictions, columns=facebox_columns) self._empty_facebox = empty_facebox print("Loading Face Landmark model: ", landmark_model) if landmark_model: if landmark_model.lower() == "mobilenet": self.landmark_detector = MobileNet_GDConv(136) self.landmark_detector = torch.nn.DataParallel( self.landmark_detector) checkpoint = torch.load( os.path.join( get_resource_path(), "mobilenet_224_model_best_gdconv_external.pth.tar", ), map_location=self.map_location, ) self.landmark_detector.load_state_dict( checkpoint["state_dict"]) elif landmark_model.lower() == "pfld": self.landmark_detector = PFLDInference() checkpoint = torch.load( os.path.join(get_resource_path(), "pfld_model_best.pth.tar"), map_location=self.map_location, ) self.landmark_detector.load_state_dict( checkpoint["state_dict"]) elif landmark_model.lower() == "mobilefacenet": self.landmark_detector = MobileFaceNet([112, 112], 136) checkpoint = torch.load( os.path.join(get_resource_path(), "mobilefacenet_model_best.pth.tar"), map_location=self.map_location, ) self.landmark_detector.load_state_dict( checkpoint["state_dict"]) self.info["landmark_model"] = landmark_model self.info["mapper"] = openface_2d_landmark_columns landmark_columns = openface_2d_landmark_columns self.info["face_landmark_columns"] = landmark_columns predictions = np.empty((1, len(openface_2d_landmark_columns))) predictions[:] = np.nan empty_landmarks = pd.DataFrame(predictions, columns=landmark_columns) self._empty_landmark = empty_landmarks print("Loading au model: ", au_model) self.info["au_model"] = au_model if au_model: if au_model.lower() == "jaanet": self.au_model = JAANet() elif au_model.lower() == "drml": self.au_model = DRMLNet() elif au_model.lower() == "logistic": self.au_model = LogisticClassifier() elif au_model.lower() == "svm": self.au_model = SVMClassifier() elif au_model.lower() == 'rf': self.au_model = RandomForestClassifier() if (au_model is None) or (au_model.lower() in ['jaanet', 'drml']): auoccur_columns = jaanet_AU_presence else: auoccur_columns = RF_AU_presence self.info["au_presence_columns"] = auoccur_columns predictions = np.empty((1, len(auoccur_columns))) predictions[:] = np.nan empty_au_occurs = pd.DataFrame(predictions, columns=auoccur_columns) self._empty_auoccurence = empty_au_occurs print("Loading emotion model: ", emotion_model) self.info["emotion_model"] = emotion_model if emotion_model: if emotion_model.lower() == "fer": self.emotion_model = ferNetModule() elif emotion_model.lower() == "resmasknet": self.emotion_model = ResMaskNet() elif emotion_model.lower() == 'svm': self.emotion_model = EmoSVMClassifier() elif emotion_model.lower() == 'rf': self.emotion_model = EmoRandomForestClassifier() self.info["emotion_model_columns"] = FEAT_EMOTION_COLUMNS predictions = np.empty((1, len(FEAT_EMOTION_COLUMNS))) predictions[:] = np.nan empty_emotion = pd.DataFrame(predictions, columns=FEAT_EMOTION_COLUMNS) self._empty_emotion = empty_emotion predictions = np.empty((1, len(auoccur_columns))) predictions[:] = np.nan empty_au_occurs = pd.DataFrame(predictions, columns=auoccur_columns) self._empty_auoccurence = empty_au_occurs self.info["output_columns"] = (FEAT_TIME_COLUMNS + facebox_columns + landmark_columns + auoccur_columns + FEAT_EMOTION_COLUMNS + ["input"])
class Detector(object): def __init__( self, face_model="retinaface", landmark_model="mobilenet", au_model="rf", emotion_model="resmasknet", n_jobs=1, ): """Detector class to detect FEX from images or videos. Detector is a class used to detect faces, facial landmarks, emotions, and action units from images and videos. Args: n_jobs (int, default=1): Number of processes to use for extraction. Attributes: info (dict): n_jobs (int): Number of jobs to be used in parallel. face_model (str, default=retinaface): Name of face detection model landmark_model (str, default=mobilenet): Nam eof landmark model au_model (str, default=rf): Name of Action Unit detection model emotion_model (str, default=resmasknet): Path to emotion detection model. face_detection_columns (list): Column names for face detection ouput (x, y, w, h) face_landmark_columns (list): Column names for face landmark output (x0, y0, x1, y1, ...) emotion_model_columns (list): Column names for emotion model output mapper (dict): Class names for emotion model output by index. input_shape (dict) face_detector: face detector object face_landmark: face_landmark object emotion_model: emotion_model object Examples: >> detector = Detector(n_jobs=1) >> detector.detect_image("input.jpg") >> detector.detect_video("input.mp4") """ self.info = {} self.info["n_jobs"] = n_jobs if torch.cuda.is_available(): self.map_location = lambda storage, loc: storage.cuda() else: self.map_location = "cpu" """ LOAD UP THE MODELS """ print("Loading Face Detection model: ", face_model) # Check if model files have been downloaded. Otherwise download model. # get model url. with open(os.path.join(get_resource_path(), "model_list.json"), "r") as f: model_urls = json.load(f) if face_model: for url in model_urls["face_detectors"][ face_model.lower()]["urls"]: download_url(url, get_resource_path()) if landmark_model: for url in model_urls["landmark_detectors"][ landmark_model.lower()]["urls"]: download_url(url, get_resource_path()) if au_model: for url in model_urls["au_detectors"][au_model.lower()]["urls"]: download_url(url, get_resource_path()) if ".zip" in url: import zipfile with zipfile.ZipFile( os.path.join(get_resource_path(), "JAANetparams.zip"), 'r') as zip_ref: zip_ref.extractall(os.path.join(get_resource_path())) if au_model.lower() in ['logistic', 'svm', 'rf']: download_url( model_urls["au_detectors"]['hog-pca']['urls'][0], get_resource_path()) download_url( model_urls["au_detectors"]['au_scalar']['urls'][0], get_resource_path()) if emotion_model: for url in model_urls["emotion_detectors"][ emotion_model.lower()]["urls"]: download_url(url, get_resource_path()) if emotion_model.lower() in ['svm', 'rf']: download_url( model_urls["emotion_detectors"]['emo_pca']['urls'][0], get_resource_path()) download_url( model_urls["emotion_detectors"]['emo_scalar']['urls'] [0], get_resource_path()) if face_model: if face_model.lower() == "faceboxes": self.face_detector = FaceBoxes() elif face_model.lower() == "retinaface": self.face_detector = Retinaface_test.Retinaface() elif face_model.lower() == "mtcnn": self.face_detector = MTCNN() self.info["face_model"] = face_model facebox_columns = FEAT_FACEBOX_COLUMNS self.info["face_detection_columns"] = facebox_columns predictions = np.empty((1, len(facebox_columns))) predictions[:] = np.nan empty_facebox = pd.DataFrame(predictions, columns=facebox_columns) self._empty_facebox = empty_facebox print("Loading Face Landmark model: ", landmark_model) if landmark_model: if landmark_model.lower() == "mobilenet": self.landmark_detector = MobileNet_GDConv(136) self.landmark_detector = torch.nn.DataParallel( self.landmark_detector) checkpoint = torch.load( os.path.join( get_resource_path(), "mobilenet_224_model_best_gdconv_external.pth.tar", ), map_location=self.map_location, ) self.landmark_detector.load_state_dict( checkpoint["state_dict"]) elif landmark_model.lower() == "pfld": self.landmark_detector = PFLDInference() checkpoint = torch.load( os.path.join(get_resource_path(), "pfld_model_best.pth.tar"), map_location=self.map_location, ) self.landmark_detector.load_state_dict( checkpoint["state_dict"]) elif landmark_model.lower() == "mobilefacenet": self.landmark_detector = MobileFaceNet([112, 112], 136) checkpoint = torch.load( os.path.join(get_resource_path(), "mobilefacenet_model_best.pth.tar"), map_location=self.map_location, ) self.landmark_detector.load_state_dict( checkpoint["state_dict"]) self.info["landmark_model"] = landmark_model self.info["mapper"] = openface_2d_landmark_columns landmark_columns = openface_2d_landmark_columns self.info["face_landmark_columns"] = landmark_columns predictions = np.empty((1, len(openface_2d_landmark_columns))) predictions[:] = np.nan empty_landmarks = pd.DataFrame(predictions, columns=landmark_columns) self._empty_landmark = empty_landmarks print("Loading au model: ", au_model) self.info["au_model"] = au_model if au_model: if au_model.lower() == "jaanet": self.au_model = JAANet() elif au_model.lower() == "drml": self.au_model = DRMLNet() elif au_model.lower() == "logistic": self.au_model = LogisticClassifier() elif au_model.lower() == "svm": self.au_model = SVMClassifier() elif au_model.lower() == 'rf': self.au_model = RandomForestClassifier() if (au_model is None) or (au_model.lower() in ['jaanet', 'drml']): auoccur_columns = jaanet_AU_presence else: auoccur_columns = RF_AU_presence self.info["au_presence_columns"] = auoccur_columns predictions = np.empty((1, len(auoccur_columns))) predictions[:] = np.nan empty_au_occurs = pd.DataFrame(predictions, columns=auoccur_columns) self._empty_auoccurence = empty_au_occurs print("Loading emotion model: ", emotion_model) self.info["emotion_model"] = emotion_model if emotion_model: if emotion_model.lower() == "fer": self.emotion_model = ferNetModule() elif emotion_model.lower() == "resmasknet": self.emotion_model = ResMaskNet() elif emotion_model.lower() == 'svm': self.emotion_model = EmoSVMClassifier() elif emotion_model.lower() == 'rf': self.emotion_model = EmoRandomForestClassifier() self.info["emotion_model_columns"] = FEAT_EMOTION_COLUMNS predictions = np.empty((1, len(FEAT_EMOTION_COLUMNS))) predictions[:] = np.nan empty_emotion = pd.DataFrame(predictions, columns=FEAT_EMOTION_COLUMNS) self._empty_emotion = empty_emotion predictions = np.empty((1, len(auoccur_columns))) predictions[:] = np.nan empty_au_occurs = pd.DataFrame(predictions, columns=auoccur_columns) self._empty_auoccurence = empty_au_occurs self.info["output_columns"] = (FEAT_TIME_COLUMNS + facebox_columns + landmark_columns + auoccur_columns + FEAT_EMOTION_COLUMNS + ["input"]) def __getitem__(self, i): return self.info[i] def detect_faces(self, frame): """Detect faces from image or video frame Args: frame (array): image array Returns: array: face detection results (x, y, x2, y2) Examples: >>> import cv2 >>> frame = cv2.imread(imgfile) >>> from feat import Detector >>> detector = Detector() >>> detector.detect_faces(frame) """ height, width, _ = frame.shape faces = self.face_detector(frame) if len(faces) == 0: print("Warning: NO FACE is detected") return faces def detect_landmarks(self, frame, detected_faces): """Detect landmarks from image or video frame Args: frame (array): image array detected_faces (array): Returns: list: x and y landmark coordinates (1,68,2) Examples: >>> import cv2 >>> frame = cv2.imread(imgfile) >>> from feat import Detector >>> detector = Detector() >>> detected_faces = detector.detect_faces(frame) >>> detector.detect_landmarks(frame, detected_faces) """ mean = np.asarray([0.485, 0.456, 0.406]) std = np.asarray([0.229, 0.224, 0.225]) self.landmark_detector.eval() if self.info["landmark_model"]: if self.info["landmark_model"].lower() == "mobilenet": out_size = 224 else: out_size = 112 height, width, _ = frame.shape landmark_list = [] for k, face in enumerate(detected_faces): x1 = face[0] y1 = face[1] x2 = face[2] y2 = face[3] w = x2 - x1 + 1 h = y2 - y1 + 1 size = int(min([w, h]) * 1.2) cx = x1 + w // 2 cy = y1 + h // 2 x1 = cx - size // 2 x2 = x1 + size y1 = cy - size // 2 y2 = y1 + size dx = max(0, -x1) dy = max(0, -y1) x1 = max(0, x1) y1 = max(0, y1) edx = max(0, x2 - width) edy = max(0, y2 - height) x2 = min(width, x2) y2 = min(height, y2) new_bbox = list(map(int, [x1, x2, y1, y2])) new_bbox = BBox(new_bbox) cropped = frame[new_bbox.top:new_bbox.bottom, new_bbox.left:new_bbox.right] if dx > 0 or dy > 0 or edx > 0 or edy > 0: cropped = cv2.copyMakeBorder( cropped, int(dy), int(edy), int(dx), int(edx), cv2.BORDER_CONSTANT, 0, ) cropped_face = cv2.resize(cropped, (out_size, out_size)) if cropped_face.shape[0] <= 0 or cropped_face.shape[1] <= 0: continue test_face = cropped_face.copy() test_face = test_face / 255.0 if self.info["landmark_model"]: if self.info["landmark_model"].lower() == "mobilenet": test_face = (test_face - mean) / std test_face = test_face.transpose((2, 0, 1)) test_face = test_face.reshape((1, ) + test_face.shape) input = torch.from_numpy(test_face).float() input = torch.autograd.Variable(input) if self.info["landmark_model"]: if self.info["landmark_model"].lower() == "mobilefacenet": landmark = self.landmark_detector( input)[0].cpu().data.numpy() else: landmark = self.landmark_detector(input).cpu().data.numpy() landmark = landmark.reshape(-1, 2) landmark = new_bbox.reprojectLandmark(landmark) landmark_list.append(landmark) return landmark_list def extract_face(self, frame, detected_faces, landmarks, size_output=112): """Extract a face in a frame with a convex hull of landmarks. This function extracts the faces of the frame with convex hulls and masks out the rest. Args: frame (array): The original image] detected_faces (list): face bounding box landmarks (list): the landmark information] size_output (int, optional): [description]. Defaults to 112. Returns: resized_face_np: resized face as a numpy array new_landmarks: landmarks of aligned face """ detected_faces = np.array(detected_faces) landmarks = np.array(landmarks) # if (np.any(detected_faces) < 0): # orig_size = np.array(frame).shape # if np.where(detected_faces<0)[0][0]==1: # # extend y # new_size = (orig_size[0], int(orig_size[1] + 2*abs(detected_faces[detected_faces<0][0]))) # else: # # extend x # new_size = (int(orig_size[0] + 2*abs(detected_faces[detected_faces<0][0])), orig_size[1]) # frame = resize_with_padding(Image.fromarray(frame), new_size) # frame = np.asarray(frame) # detected_faces = np.array(detector.detect_faces(np.array(frame))[0]) detected_faces = detected_faces.astype(int) aligned_img, new_landmarks = align_face_68pts(frame, landmarks.flatten(), 2.5, img_size=size_output) hull = ConvexHull(new_landmarks) mask = grid_points_in_poly( shape=np.array(aligned_img).shape, # for some reason verts need to be flipped verts=list( zip(new_landmarks[hull.vertices][:, 1], new_landmarks[hull.vertices][:, 0]))) mask[0:np.min([new_landmarks[0][1], new_landmarks[16][1]]), new_landmarks[0][0]:new_landmarks[16][0]] = True aligned_img[~mask] = 0 resized_face_np = aligned_img resized_face_np = cv2.cvtColor(resized_face_np, cv2.COLOR_BGR2RGB) return resized_face_np, new_landmarks def extract_hog(self, frame, orientation=8, pixels_per_cell=(8, 8), cells_per_block=(2, 2), visualize=False): """Extract HOG features from a frame. Args: frame (array]): Frame of image] orientation (int, optional): Orientation for HOG. Defaults to 8. pixels_per_cell (tuple, optional): Pixels per cell for HOG. Defaults to (8,8). cells_per_block (tuple, optional): Cells per block for HOG. Defaults to (2,2). visualize (bool, optional): Whether to provide the HOG image. Defaults to False. Returns: hog_output: array of HOG features, and the HOG image if visualize is True. """ hog_output = hog(frame, orientations=orientation, pixels_per_cell=pixels_per_cell, cells_per_block=cells_per_block, visualize=visualize, multichannel=True) if visualize: return (hog_output[0], hog_output[1]) else: return hog_output def detect_aus(self, frame, landmarks): """Detect Action Units from image or video frame Args: frame (array): image loaded in array format (n, m, 3) landmarks (array): 68 landmarks used to localize face. Returns: array: Action Unit predictions Examples: >>> import cv2 >>> frame = cv2.imread(imgfile) >>> from feat import Detector >>> detector = Detector() >>> detector.detect_aus(frame) """ # Assume that the Raw landmark is given in the format (n_land,2) #landmarks = np.transpose(landmarks) #if landmarks.shape[-1] == 68: # landmarks = convert68to49(landmarks) return self.au_model.detect_au(frame, landmarks) def detect_emotions(self, frame, facebox, landmarks): """Detect emotions from image or video frame Args: frame ([type]): [description] facebox ([type]): [description] landmarks ([type]): [description] Returns: array: Action Unit predictions Examples: >>> import cv2 >>> frame = cv2.imread(imgfile) >>> from feat import Detector >>> detector = Detector() >>> detected_faces = detector.detect_faces(frame) >>> detected_landmarks = detector.detect_landmarks(frame, detected_faces) >>> detector.detect_emotions(frame, detected_faces, detected_landmarks) """ if self.info["emotion_model"].lower() == 'fer': landmarks = np.transpose(landmarks) if landmarks.shape[-1] == 68: landmarks = convert68to49(landmarks) landmarks = landmarks.T return self.emotion_model.detect_emo(frame, landmarks) elif self.info["emotion_model"].lower() == 'resmasknet': return self.emotion_model.detect_emo(frame, facebox) elif self.info["emotion_model"].lower() in ['svm', 'rf']: return self.emotion_model.detect_emo(frame, landmarks) else: raise ValueError( 'Cannot recognize input emo model! Please try to re-type emotion model' ) def process_frame(self, frame, counter=0): """Helper function to run face detection, landmark detection, and emotion detection on a frame. Args: frame (np.array): Numpy array of image, ideally loaded through Pillow.Image counter (int, str, default=0): Index used for the prediction results dataframe. Returns: df (dataframe): Prediction results dataframe. Example: >>> from pil import Image >>> frame = Image.open("input.jpg") >>> detector = Detector() >>> detector.process_frame(np.array(frame)) """ try: # detect faces detected_faces = self.detect_faces(frame=frame) out = None for i, faces in enumerate(detected_faces): facebox_df = pd.DataFrame( [[ faces[0], faces[1], faces[2] - faces[0], faces[3] - faces[1], faces[4], ]], columns=self["face_detection_columns"], index=[counter + i], ) # detect landmarks landmarks = self.detect_landmarks(frame=frame, detected_faces=[faces[0:4]]) landmarks_df = pd.DataFrame( [landmarks[0].flatten(order="F")], columns=self["face_landmark_columns"], index=[counter + i], ) # detect AUs if self["au_model"].lower() in ['logistic', 'svm', 'rf']: convex_hull, new_lands = self.extract_face( frame=frame, detected_faces=[faces[0:4]], landmarks=landmarks, size_output=112) hogs = self.extract_hog(frame=convex_hull, visualize=False) au_occur = self.detect_aus(frame=hogs, landmarks=new_lands) else: au_occur = self.detect_aus(frame=frame, landmarks=landmarks) au_occur_df = pd.DataFrame(au_occur, columns=self["au_presence_columns"], index=[counter + i]) # detect emotions if self["emotion_model"].lower() in ['svm', 'rf']: emo_pred = self.detect_emotions(frame=hogs, facebox=None, landmarks=new_lands) else: emo_pred = self.detect_emotions(frame=frame, facebox=[faces], landmarks=landmarks[0]) emo_pred_df = pd.DataFrame(emo_pred, columns=FEAT_EMOTION_COLUMNS, index=[counter + i]) tmp_df = pd.concat( [facebox_df, landmarks_df, au_occur_df, emo_pred_df], axis=1) if out is None: out = tmp_df else: out = pd.concat([out, tmp_df], axis=0) out[FEAT_TIME_COLUMNS] = counter return out except: print("exception occurred") emotion_df = self._empty_emotion.reindex(index=[counter]) facebox_df = self._empty_facebox.reindex(index=[counter]) landmarks_df = self._empty_landmark.reindex(index=[counter]) au_occur_df = self._empty_auoccurence.reindex(index=[counter]) out = pd.concat( [facebox_df, landmarks_df, au_occur_df, emotion_df], axis=1) out[FEAT_TIME_COLUMNS] = counter return out def detect_video(self, inputFname, outputFname=None, skip_frames=1, verbose=False): """Detects FEX from a video file. Args: inputFname (str): Path to video file outputFname (str, optional): Path to output file. Defaults to None. skip_frames (int, optional): Number of every other frames to skip for speed or if not all frames need to be processed. Defaults to 1. Returns: dataframe: Prediction results dataframe if outputFname is None. Returns True if outputFname is specified. """ self.info["inputFname"] = inputFname self.info["outputFname"] = outputFname init_df = pd.DataFrame(columns=self["output_columns"]) if outputFname: init_df.to_csv(outputFname, index=False, header=True) cap = cv2.VideoCapture(inputFname) length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) frames_to_process = int(np.ceil(length / skip_frames)) # Determine whether to use multiprocessing. n_jobs = self["n_jobs"] if n_jobs == -1: thread_num = cv2.getNumberOfCPUs() # get available cpus else: thread_num = n_jobs if verbose: print(f"Using {thread_num} cpus") pool = ThreadPool(processes=thread_num) pending_task = deque() counter = 0 processed_frames = 0 frame_got = True detected_faces = [] if verbose: print("Processing video.") # single core while True: frame_got, frame = cap.read() if counter % skip_frames == 0: df = self.process_frame(frame, counter=counter) df["input"] = inputFname if outputFname: df[init_df.columns].to_csv(outputFname, index=False, header=False, mode="a") else: init_df = pd.concat([init_df, df[init_df.columns]], axis=0) counter = counter + 1 if not frame_got: break cap.release() if outputFname: return True else: return Fex( init_df, filename=inputFname, au_columns=self["au_presence_columns"], emotion_columns=FEAT_EMOTION_COLUMNS, facebox_columns=FEAT_FACEBOX_COLUMNS, landmark_columns=openface_2d_landmark_columns, time_columns=FEAT_TIME_COLUMNS, detector="Feat", ) def detect_image(self, inputFname, outputFname=None, verbose=False): """Detects FEX from an image file. Args: inputFname (str, or list of str): Path to image file or a list of paths to image files. outputFname (str, optional): Path to output file. Defaults to None. Rseturns: Fex: Prediction results dataframe if outputFname is None. Returns True if outputFname is specified. """ assert ( type(inputFname) == str or type(inputFname) == list ), "inputFname must be a string path to image or list of image paths" if type(inputFname) == str: inputFname = [inputFname] for inputF in inputFname: if not os.path.exists(inputF): raise FileNotFoundError(f"File {inputF} not found.") self.info["inputFname"] = inputFname init_df = pd.DataFrame(columns=self["output_columns"]) if outputFname: init_df.to_csv(outputFname, index=False, header=True) for inputF in inputFname: if verbose: print(f"processing {inputF}") frame = cv2.imread(inputF) df = self.process_frame(frame) df["input"] = inputF if outputFname: df[init_df.columns].to_csv(outputFname, index=False, header=False, mode="a") else: init_df = pd.concat([init_df, df[init_df.columns]], axis=0) if outputFname: return True else: return Fex( init_df, filename=inputFname, au_columns=self['au_presence_columns'], emotion_columns=FEAT_EMOTION_COLUMNS, facebox_columns=FEAT_FACEBOX_COLUMNS, landmark_columns=openface_2d_landmark_columns, time_columns=FACET_TIME_COLUMNS, detector="Feat", )
class Detector(object): def __init__( self, face_model="retinaface", landmark_model="MobileNet", au_model="jaanet", emotion_model="fer", n_jobs=1, ): """Detector class to detect FEX from images or videos. Detector is a class used to detect faces, facial landmarks, emotions, and action units from images and videos. Args: n_jobs (int, default=1): Number of processes to use for extraction. Attributes: info (dict): n_jobs (int): Number of jobs to be used in parallel. face_detection_model (str, default=haarcascade_frontalface_alt.xml): Path to face detection model. face_detection_columns (list): Column names for face detection ouput (x, y, w, h) face_landmark_model (str, default=lbfmodel.yaml): Path to landmark model. face_landmark_columns (list): Column names for face landmark output (x0, y0, x1, y1, ...) emotion_model (str, default=fer_aug_model.h5): Path to emotion detection model. emotion_model_columns (list): Column names for emotion model output mapper (dict): Class names for emotion model output by index. input_shape (dict) face_detector: face detector object face_landmark: face_landmark object emotion_model: emotion_model object Examples: >> detector = Detector(n_jobs=1) >> detector.detect_image("input.jpg") >> detector.detect_video("input.mp4") """ self.info = {} self.info["n_jobs"] = n_jobs if torch.cuda.is_available(): self.map_location = lambda storage, loc: storage.cuda() else: self.map_location = "cpu" """ LOAD UP THE MODELS """ print("Loading Face Detection model: ", face_model) if face_model: if face_model.lower() == "faceboxes": self.face_detector = FaceBoxes() elif face_model.lower() == "retinaface": self.face_detector = Retinaface_test.Retinaface() elif face_model.lower() == "mtcnn": self.face_detector = MTCNN() self.info["face_model"] = face_model # self.info["mapper"] = FEAT_FACEBOX_COLUMNS facebox_columns = FEAT_FACEBOX_COLUMNS self.info["face_detection_columns"] = facebox_columns predictions = np.empty((1, len(facebox_columns))) predictions[:] = np.nan empty_facebox = pd.DataFrame(predictions, columns=facebox_columns) self._empty_facebox = empty_facebox print("Loading Face Landmark model: ", landmark_model) # self.info['Landmark_Model'] = landmark_model if landmark_model: if landmark_model.lower() == "mobilenet": self.landmark_detector = MobileNet_GDConv(136) self.landmark_detector = torch.nn.DataParallel( self.landmark_detector) # or download model from https://drive.google.com/file/d/1Le5UdpMkKOTRr1sTp4lwkw8263sbgdSe/view?usp=sharing checkpoint = torch.load( os.path.join( get_resource_path(), "mobilenet_224_model_best_gdconv_external.pth.tar", ), map_location=self.map_location, ) # print("Use MobileNet as backbone") self.landmark_detector.load_state_dict( checkpoint["state_dict"]) elif landmark_model.lower() == "pfld": self.landmark_detector = PFLDInference() # or download from https://drive.google.com/file/d/1gjgtm6qaBQJ_EY7lQfQj3EuMJCVg9lVu/view?usp=sharing checkpoint = torch.load( os.path.join(get_resource_path(), "pfld_model_best.pth.tar"), map_location=self.map_location, ) # print("Use PFLD as backbone") self.landmark_detector.load_state_dict( checkpoint["state_dict"]) # or download from https://drive.google.com/file/d/1T8J73UTcB25BEJ_ObAJczCkyGKW5VaeY/view?usp=sharing elif landmark_model.lower() == "mobilefacenet": self.landmark_detector = MobileFaceNet([112, 112], 136) checkpoint = torch.load( os.path.join(get_resource_path(), "mobilefacenet_model_best.pth.tar"), map_location=self.map_location, ) # print("Use MobileFaceNet as backbone") self.landmark_detector.load_state_dict( checkpoint["state_dict"]) self.info["landmark_model"] = landmark_model self.info["mapper"] = openface_2d_landmark_columns landmark_columns = openface_2d_landmark_columns self.info["face_landmark_columns"] = landmark_columns predictions = np.empty((1, len(openface_2d_landmark_columns))) predictions[:] = np.nan empty_landmarks = pd.DataFrame(predictions, columns=landmark_columns) self._empty_landmark = empty_landmarks print("Loading au occurence model: ", au_model) self.info["au_model"] = au_model if au_model: if au_model.lower() == "jaanet": self.au_model = JAANet() elif au_model.lower() == "drml": self.au_model = DRMLNet() # self.info["mapper"] = jaanet_AU_presence auoccur_columns = jaanet_AU_presence self.info["au_presence_columns"] = auoccur_columns predictions = np.empty((1, len(auoccur_columns))) predictions[:] = np.nan empty_au_occurs = pd.DataFrame(predictions, columns=auoccur_columns) self._empty_auoccurence = empty_au_occurs print("Loading emotion model: ", emotion_model) self.info["emotion_model"] = emotion_model if emotion_model: if emotion_model.lower() == "fer": self.emotion_model = ferNetModule() elif emotion_model.lower() == "resmasknet": self.emotion_model = ResMaskNet() self.info["emotion_model_columns"] = FEAT_EMOTION_COLUMNS predictions = np.empty((1, len(FEAT_EMOTION_COLUMNS))) predictions[:] = np.nan empty_emotion = pd.DataFrame(predictions, columns=FEAT_EMOTION_COLUMNS) self._empty_emotion = empty_emotion # self.info['auoccur_model'] = au_model # self.info["mapper"] = jaanet_AU_presence auoccur_columns = jaanet_AU_presence self.info["au_presence_columns"] = auoccur_columns predictions = np.empty((1, len(auoccur_columns))) predictions[:] = np.nan empty_au_occurs = pd.DataFrame(predictions, columns=auoccur_columns) self._empty_auoccurence = empty_au_occurs self.info["output_columns"] = (FEAT_TIME_COLUMNS + facebox_columns + landmark_columns + auoccur_columns + FEAT_EMOTION_COLUMNS + ["input"]) def __getitem__(self, i): return self.info[i] def detect_faces(self, frame): # suppose frame=cv2.imread(imgname) height, width, _ = frame.shape faces = self.face_detector(frame) if len(faces) == 0: print("Warning: NO FACE is detected") return faces def detect_landmarks(self, frame, detected_faces): mean = np.asarray([0.485, 0.456, 0.406]) std = np.asarray([0.229, 0.224, 0.225]) self.landmark_detector.eval() if self.info["landmark_model"]: if self.info["landmark_model"].lower() == "mobilenet": out_size = 224 else: out_size = 112 height, width, _ = frame.shape landmark_list = [] for k, face in enumerate(detected_faces): x1 = face[0] y1 = face[1] x2 = face[2] y2 = face[3] w = x2 - x1 + 1 h = y2 - y1 + 1 size = int(min([w, h]) * 1.2) cx = x1 + w // 2 cy = y1 + h // 2 x1 = cx - size // 2 x2 = x1 + size y1 = cy - size // 2 y2 = y1 + size dx = max(0, -x1) dy = max(0, -y1) x1 = max(0, x1) y1 = max(0, y1) edx = max(0, x2 - width) edy = max(0, y2 - height) x2 = min(width, x2) y2 = min(height, y2) new_bbox = list(map(int, [x1, x2, y1, y2])) new_bbox = BBox(new_bbox) cropped = frame[new_bbox.top:new_bbox.bottom, new_bbox.left:new_bbox.right] if dx > 0 or dy > 0 or edx > 0 or edy > 0: cropped = cv2.copyMakeBorder( cropped, int(dy), int(edy), int(dx), int(edx), cv2.BORDER_CONSTANT, 0, ) cropped_face = cv2.resize(cropped, (out_size, out_size)) if cropped_face.shape[0] <= 0 or cropped_face.shape[1] <= 0: continue test_face = cropped_face.copy() test_face = test_face / 255.0 if self.info["landmark_model"]: if self.info["landmark_model"].lower() == "mobilenet": test_face = (test_face - mean) / std test_face = test_face.transpose((2, 0, 1)) test_face = test_face.reshape((1, ) + test_face.shape) input = torch.from_numpy(test_face).float() input = torch.autograd.Variable(input) if self.info["landmark_model"]: if self.info["landmark_model"].lower() == "mobilefacenet": landmark = self.landmark_detector( input)[0].cpu().data.numpy() else: landmark = self.landmark_detector(input).cpu().data.numpy() landmark = landmark.reshape(-1, 2) landmark = new_bbox.reprojectLandmark(landmark) landmark_list.append(landmark) return landmark_list def detect_aus(self, frame, landmarks): # Assume that the Raw landmark is given in the format (n_land,2) landmarks = np.transpose(landmarks) if landmarks.shape[-1] == 68: landmarks = convert68to49(landmarks) return self.au_model.detect_au(frame, landmarks) def detect_emotions(self, frame, facebox, landmarks): """Detect emotions. Args: frame ([type]): [description] facebox ([type]): [description] landmarks ([type]): [description] Returns: [type]: [description] """ if self.info["emotion_model"].lower() == 'fer': landmarks = np.transpose(landmarks) if landmarks.shape[-1] == 68: landmarks = convert68to49(landmarks) landmarks = landmarks.T return self.emotion_model.detect_emo(frame, landmarks) elif self.info["emotion_model"].lower() == 'resmasknet': return self.emotion_model.detect_emo(frame, facebox) def process_frame(self, frame, counter=0): """Helper function to run face detection, landmark detection, and emotion detection on a frame. Args: frame (np.array): Numpy array of image, ideally loaded through Pillow.Image counter (int, str, default=0): Index used for the prediction results dataframe. Returns: df (dataframe): Prediction results dataframe. Example: >> from pil import Image >> frame = Image.open("input.jpg") >> detector = Detector() >> detector.process_frame(np.array(frame)) """ try: # detect faces detected_faces = self.detect_faces(frame=frame) out = None for i, faces in enumerate(detected_faces): facebox_df = pd.DataFrame( [[ faces[0], faces[1], faces[2] - faces[0], faces[3] - faces[1], faces[4], ]], columns=self["face_detection_columns"], index=[counter + i], ) # detect landmarks landmarks = self.detect_landmarks(frame=frame, detected_faces=[faces[0:4]]) landmarks_df = pd.DataFrame( [landmarks[0].flatten(order="F")], columns=self["face_landmark_columns"], index=[counter + i], ) # detect AUs au_occur = self.detect_aus(frame=frame, landmarks=landmarks) au_occur_df = pd.DataFrame(au_occur, columns=self["au_presence_columns"], index=[counter + i]) # detect emotions emo_pred = self.detect_emotions(frame=frame, facebox=[faces], landmarks=landmarks[0]) emo_pred_df = pd.DataFrame(emo_pred, columns=FEAT_EMOTION_COLUMNS, index=[counter + i]) tmp_df = pd.concat( [facebox_df, landmarks_df, au_occur_df, emo_pred_df], axis=1) if out is None: out = tmp_df else: out = pd.concat([out, tmp_df], axis=0) out[FEAT_TIME_COLUMNS] = counter return out except: print("exception occurred") emotion_df = self._empty_emotion.reindex(index=[counter]) facebox_df = self._empty_facebox.reindex(index=[counter]) landmarks_df = self._empty_landmark.reindex(index=[counter]) au_occur_df = self._empty_auoccurence.reindex(index=[counter]) out = pd.concat( [facebox_df, landmarks_df, au_occur_df, emotion_df], axis=1) out[FEAT_TIME_COLUMNS] = counter return out def detect_video(self, inputFname, outputFname=None, skip_frames=1, verbose=False): """Detects FEX from a video file. Args: inputFname (str): Path to video file outputFname (str, optional): Path to output file. Defaults to None. skip_frames (int, optional): Number of every other frames to skip for speed or if not all frames need to be processed. Defaults to 1. Returns: dataframe: Prediction results dataframe if outputFname is None. Returns True if outputFname is specified. """ self.info["inputFname"] = inputFname self.info["outputFname"] = outputFname init_df = pd.DataFrame(columns=self["output_columns"]) if outputFname: init_df.to_csv(outputFname, index=False, header=True) cap = cv2.VideoCapture(inputFname) length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) frames_to_process = int(np.ceil(length / skip_frames)) # Determine whether to use multiprocessing. n_jobs = self["n_jobs"] if n_jobs == -1: thread_num = cv2.getNumberOfCPUs() # get available cpus else: thread_num = n_jobs if verbose: print(f"Using {thread_num} cpus") pool = ThreadPool(processes=thread_num) pending_task = deque() counter = 0 processed_frames = 0 frame_got = True detected_faces = [] if verbose: print("Processing video.") # single core while True: frame_got, frame = cap.read() if counter % skip_frames == 0: df = self.process_frame(frame, counter=counter) df["input"] = inputFname if outputFname: df[init_df.columns].to_csv(outputFname, index=False, header=False, mode="a") else: init_df = pd.concat([init_df, df[init_df.columns]], axis=0) counter = counter + 1 if not frame_got: break cap.release() if outputFname: return True else: return init_df def detect_image(self, inputFname, outputFname=None, verbose=False): """Detects FEX from a video file. Args: inputFname (str, or list of str): Path to image file or a list of paths to image files. outputFname (str, optional): Path to output file. Defaults to None. Rseturns: Fex: Prediction results dataframe if outputFname is None. Returns True if outputFname is specified. """ assert ( type(inputFname) == str or type(inputFname) == list ), "inputFname must be a string path to image or list of image paths" if type(inputFname) == str: inputFname = [inputFname] for inputF in inputFname: if not os.path.exists(inputF): raise FileNotFoundError(f"File {inputF} not found.") self.info["inputFname"] = inputFname init_df = pd.DataFrame(columns=self["output_columns"]) if outputFname: init_df.to_csv(outputFname, index=False, header=True) for inputF in inputFname: if verbose: print(f"processing {inputF}") frame = cv2.imread(inputF) df = self.process_frame(frame) df["input"] = inputF if outputFname: df[init_df.columns].to_csv(outputFname, index=False, header=False, mode="a") else: init_df = pd.concat([init_df, df[init_df.columns]], axis=0) if outputFname: return True else: return Fex( init_df, filename=inputFname, au_columns=jaanet_AU_presence, emotion_columns=FEAT_EMOTION_COLUMNS, facebox_columns=FEAT_FACEBOX_COLUMNS, landmark_columns=openface_2d_landmark_columns, time_columns=FACET_TIME_COLUMNS, detector="Feat", )
def __init__( self, face_model="retinaface", landmark_model="MobileNet", au_model="jaanet", emotion_model="fer", n_jobs=1, ): """Detector class to detect FEX from images or videos. Detector is a class used to detect faces, facial landmarks, emotions, and action units from images and videos. Args: n_jobs (int, default=1): Number of processes to use for extraction. Attributes: info (dict): n_jobs (int): Number of jobs to be used in parallel. face_detection_model (str, default=haarcascade_frontalface_alt.xml): Path to face detection model. face_detection_columns (list): Column names for face detection ouput (x, y, w, h) face_landmark_model (str, default=lbfmodel.yaml): Path to landmark model. face_landmark_columns (list): Column names for face landmark output (x0, y0, x1, y1, ...) emotion_model (str, default=fer_aug_model.h5): Path to emotion detection model. emotion_model_columns (list): Column names for emotion model output mapper (dict): Class names for emotion model output by index. input_shape (dict) face_detector: face detector object face_landmark: face_landmark object emotion_model: emotion_model object Examples: >> detector = Detector(n_jobs=1) >> detector.detect_image("input.jpg") >> detector.detect_video("input.mp4") """ self.info = {} self.info["n_jobs"] = n_jobs if torch.cuda.is_available(): self.map_location = lambda storage, loc: storage.cuda() else: self.map_location = "cpu" """ LOAD UP THE MODELS """ print("Loading Face Detection model: ", face_model) if face_model: if face_model.lower() == "faceboxes": self.face_detector = FaceBoxes() elif face_model.lower() == "retinaface": self.face_detector = Retinaface_test.Retinaface() elif face_model.lower() == "mtcnn": self.face_detector = MTCNN() self.info["face_model"] = face_model # self.info["mapper"] = FEAT_FACEBOX_COLUMNS facebox_columns = FEAT_FACEBOX_COLUMNS self.info["face_detection_columns"] = facebox_columns predictions = np.empty((1, len(facebox_columns))) predictions[:] = np.nan empty_facebox = pd.DataFrame(predictions, columns=facebox_columns) self._empty_facebox = empty_facebox print("Loading Face Landmark model: ", landmark_model) # self.info['Landmark_Model'] = landmark_model if landmark_model: if landmark_model.lower() == "mobilenet": self.landmark_detector = MobileNet_GDConv(136) self.landmark_detector = torch.nn.DataParallel( self.landmark_detector) # or download model from https://drive.google.com/file/d/1Le5UdpMkKOTRr1sTp4lwkw8263sbgdSe/view?usp=sharing checkpoint = torch.load( os.path.join( get_resource_path(), "mobilenet_224_model_best_gdconv_external.pth.tar", ), map_location=self.map_location, ) # print("Use MobileNet as backbone") self.landmark_detector.load_state_dict( checkpoint["state_dict"]) elif landmark_model.lower() == "pfld": self.landmark_detector = PFLDInference() # or download from https://drive.google.com/file/d/1gjgtm6qaBQJ_EY7lQfQj3EuMJCVg9lVu/view?usp=sharing checkpoint = torch.load( os.path.join(get_resource_path(), "pfld_model_best.pth.tar"), map_location=self.map_location, ) # print("Use PFLD as backbone") self.landmark_detector.load_state_dict( checkpoint["state_dict"]) # or download from https://drive.google.com/file/d/1T8J73UTcB25BEJ_ObAJczCkyGKW5VaeY/view?usp=sharing elif landmark_model.lower() == "mobilefacenet": self.landmark_detector = MobileFaceNet([112, 112], 136) checkpoint = torch.load( os.path.join(get_resource_path(), "mobilefacenet_model_best.pth.tar"), map_location=self.map_location, ) # print("Use MobileFaceNet as backbone") self.landmark_detector.load_state_dict( checkpoint["state_dict"]) self.info["landmark_model"] = landmark_model self.info["mapper"] = openface_2d_landmark_columns landmark_columns = openface_2d_landmark_columns self.info["face_landmark_columns"] = landmark_columns predictions = np.empty((1, len(openface_2d_landmark_columns))) predictions[:] = np.nan empty_landmarks = pd.DataFrame(predictions, columns=landmark_columns) self._empty_landmark = empty_landmarks print("Loading au occurence model: ", au_model) self.info["au_model"] = au_model if au_model: if au_model.lower() == "jaanet": self.au_model = JAANet() elif au_model.lower() == "drml": self.au_model = DRMLNet() # self.info["mapper"] = jaanet_AU_presence auoccur_columns = jaanet_AU_presence self.info["au_presence_columns"] = auoccur_columns predictions = np.empty((1, len(auoccur_columns))) predictions[:] = np.nan empty_au_occurs = pd.DataFrame(predictions, columns=auoccur_columns) self._empty_auoccurence = empty_au_occurs print("Loading emotion model: ", emotion_model) self.info["emotion_model"] = emotion_model if emotion_model: if emotion_model.lower() == "fer": self.emotion_model = ferNetModule() elif emotion_model.lower() == "resmasknet": self.emotion_model = ResMaskNet() self.info["emotion_model_columns"] = FEAT_EMOTION_COLUMNS predictions = np.empty((1, len(FEAT_EMOTION_COLUMNS))) predictions[:] = np.nan empty_emotion = pd.DataFrame(predictions, columns=FEAT_EMOTION_COLUMNS) self._empty_emotion = empty_emotion # self.info['auoccur_model'] = au_model # self.info["mapper"] = jaanet_AU_presence auoccur_columns = jaanet_AU_presence self.info["au_presence_columns"] = auoccur_columns predictions = np.empty((1, len(auoccur_columns))) predictions[:] = np.nan empty_au_occurs = pd.DataFrame(predictions, columns=auoccur_columns) self._empty_auoccurence = empty_au_occurs self.info["output_columns"] = (FEAT_TIME_COLUMNS + facebox_columns + landmark_columns + auoccur_columns + FEAT_EMOTION_COLUMNS + ["input"])
class Detector(object): def __init__( self, face_model="retinaface", landmark_model="mobilenet", au_model="rf", emotion_model="resmasknet", facepose_model="pnp", n_jobs=1, ): """Detector class to detect FEX from images or videos. Detector is a class used to detect faces, facial landmarks, emotions, and action units from images and videos. Args: n_jobs (int, default=1): Number of processes to use for extraction. Attributes: info (dict): n_jobs (int): Number of jobs to be used in parallel. face_model (str, default=retinaface): Name of face detection model landmark_model (str, default=mobilenet): Nam eof landmark model au_model (str, default=rf): Name of Action Unit detection model emotion_model (str, default=resmasknet): Path to emotion detection model. facepose_model (str, default=pnp): Name of headpose detection model. face_detection_columns (list): Column names for face detection ouput (x, y, w, h) face_landmark_columns (list): Column names for face landmark output (x0, y0, x1, y1, ...) emotion_model_columns (list): Column names for emotion model output emotion_model_columns (list): Column names for emotion model output mapper (dict): Class names for emotion model output by index. input_shape (dict) face_detector: face detector object face_landmark: face_landmark object emotion_model: emotion_model object Examples: >> detector = Detector(n_jobs=1) >> detector.detect_image(["input.jpg"]) >> detector.detect_video("input.mp4") """ self.info = {} self.info["n_jobs"] = n_jobs if torch.cuda.is_available(): self.map_location = lambda storage, loc: storage.cuda() else: self.map_location = "cpu" # Handle img2pose mismatch error if facepose_model and "img2pose" in facepose_model.lower( ) and facepose_model.lower() != face_model.lower(): print( facepose_model, " is both a face detector and pose estimator, and cannot be used with a different " "face detector. Setting face detector to use ", facepose_model, '.', sep='') face_model = facepose_model """ LOAD UP THE MODELS """ print("Loading Face Detection model: ", face_model) # Check if model files have been downloaded. Otherwise download model. # get model url. with open(os.path.join(get_resource_path(), "model_list.json"), "r") as f: model_urls = json.load(f) if face_model: for url in model_urls["face_detectors"][ face_model.lower()]["urls"]: download_url(url, get_resource_path()) if landmark_model: for url in model_urls["landmark_detectors"][ landmark_model.lower()]["urls"]: download_url(url, get_resource_path()) if au_model: for url in model_urls["au_detectors"][au_model.lower()]["urls"]: download_url(url, get_resource_path()) if ".zip" in url: import zipfile with zipfile.ZipFile( os.path.join(get_resource_path(), "JAANetparams.zip"), 'r') as zip_ref: zip_ref.extractall(os.path.join(get_resource_path())) if au_model.lower() in ['logistic', 'svm', 'rf']: download_url( model_urls["au_detectors"]['hog-pca']['urls'][0], get_resource_path()) download_url( model_urls["au_detectors"]['au_scalar']['urls'][0], get_resource_path()) if emotion_model: for url in model_urls["emotion_detectors"][ emotion_model.lower()]["urls"]: download_url(url, get_resource_path()) if emotion_model.lower() in ['svm', 'rf']: download_url( model_urls["emotion_detectors"]['emo_pca']['urls'][0], get_resource_path()) download_url( model_urls["emotion_detectors"]['emo_scalar']['urls'] [0], get_resource_path()) if face_model: if face_model.lower() == "faceboxes": self.face_detector = FaceBoxes() elif face_model.lower() == "retinaface": self.face_detector = Retinaface_test.Retinaface() elif face_model.lower() == "mtcnn": self.face_detector = MTCNN() elif "img2pose" in face_model.lower(): # Check if user selected unconstrained or constrained version constrained = False # use by default if face_model.lower() == "img2pose-c": constrained = True # Used as both face detector and facepose estimator self.face_detector = Img2Pose( cpu_mode=self.map_location == "cpu", constrained=constrained) self.facepose_detector = self.face_detector self.info["face_model"] = face_model facebox_columns = FEAT_FACEBOX_COLUMNS self.info["face_detection_columns"] = facebox_columns predictions = np.empty((1, len(facebox_columns))) predictions[:] = np.nan empty_facebox = pd.DataFrame(predictions, columns=facebox_columns) self._empty_facebox = empty_facebox print("Loading Face Landmark model: ", landmark_model) if landmark_model: if landmark_model.lower() == "mobilenet": self.landmark_detector = MobileNet_GDConv(136) self.landmark_detector = torch.nn.DataParallel( self.landmark_detector) checkpoint = torch.load( os.path.join( get_resource_path(), "mobilenet_224_model_best_gdconv_external.pth.tar", ), map_location=self.map_location, ) self.landmark_detector.load_state_dict( checkpoint["state_dict"]) elif landmark_model.lower() == "pfld": self.landmark_detector = PFLDInference() checkpoint = torch.load( os.path.join(get_resource_path(), "pfld_model_best.pth.tar"), map_location=self.map_location, ) self.landmark_detector.load_state_dict( checkpoint["state_dict"]) elif landmark_model.lower() == "mobilefacenet": self.landmark_detector = MobileFaceNet([112, 112], 136) checkpoint = torch.load( os.path.join(get_resource_path(), "mobilefacenet_model_best.pth.tar"), map_location=self.map_location, ) self.landmark_detector.load_state_dict( checkpoint["state_dict"]) self.info["landmark_model"] = landmark_model self.info["mapper"] = openface_2d_landmark_columns landmark_columns = openface_2d_landmark_columns self.info["face_landmark_columns"] = landmark_columns predictions = np.empty((1, len(openface_2d_landmark_columns))) predictions[:] = np.nan empty_landmarks = pd.DataFrame(predictions, columns=landmark_columns) self._empty_landmark = empty_landmarks print("Loading au model: ", au_model) self.info["au_model"] = au_model if au_model: if au_model.lower() == "jaanet": self.au_model = JAANet() elif au_model.lower() == "drml": self.au_model = DRMLNet() elif au_model.lower() == "logistic": self.au_model = LogisticClassifier() elif au_model.lower() == "svm": self.au_model = SVMClassifier() elif au_model.lower() == 'rf': self.au_model = RandomForestClassifier() if (au_model is None) or (au_model.lower() in ['jaanet', 'drml']): auoccur_columns = jaanet_AU_presence else: auoccur_columns = RF_AU_presence self.info["au_presence_columns"] = auoccur_columns predictions = np.empty((1, len(auoccur_columns))) predictions[:] = np.nan empty_au_occurs = pd.DataFrame(predictions, columns=auoccur_columns) self._empty_auoccurence = empty_au_occurs print("Loading emotion model: ", emotion_model) self.info["emotion_model"] = emotion_model if emotion_model: if emotion_model.lower() == "fer": self.emotion_model = ferNetModule() elif emotion_model.lower() == "resmasknet": self.emotion_model = ResMaskNet() elif emotion_model.lower() == 'svm': self.emotion_model = EmoSVMClassifier() elif emotion_model.lower() == 'rf': self.emotion_model = EmoRandomForestClassifier() self.info["emotion_model_columns"] = FEAT_EMOTION_COLUMNS predictions = np.empty((1, len(FEAT_EMOTION_COLUMNS))) predictions[:] = np.nan empty_emotion = pd.DataFrame(predictions, columns=FEAT_EMOTION_COLUMNS) self._empty_emotion = empty_emotion predictions = np.empty((1, len(auoccur_columns))) predictions[:] = np.nan empty_au_occurs = pd.DataFrame(predictions, columns=auoccur_columns) self._empty_auoccurence = empty_au_occurs print("Loading facepose model: ", facepose_model) self.info["facepose_model"] = facepose_model if facepose_model: if facepose_model.lower() == "pnp": self.facepose_detector = PerspectiveNPoint() # Note that img2pose case is handled under face_model loading self.info["facepose_model_columns"] = FACET_FACEPOSE_COLUMNS predictions = np.empty((1, len(FACET_FACEPOSE_COLUMNS))) predictions[:] = np.nan empty_facepose = pd.DataFrame(predictions, columns=FACET_FACEPOSE_COLUMNS) self._empty_facepose = empty_facepose self.info["output_columns"] = (FEAT_TIME_COLUMNS + facebox_columns + landmark_columns + auoccur_columns + FACET_FACEPOSE_COLUMNS + FEAT_EMOTION_COLUMNS + ["input"]) def __getitem__(self, i): return self.info[i] def detect_faces(self, frame): """Detect faces from image or video frame Args: frame (array): image array Returns: list: face detection results (x, y, x2, y2) Examples: >>> import cv2 >>> frame = cv2.imread(imgfile) >>> from feat import Detector >>> detector = Detector() >>> detector.detect_faces(frame) """ # check if frame is 4d if frame.ndim == 3: frame = np.expand_dims(frame, 0) assert frame.ndim == 4, "Frame needs to be 4 dimensions (list of images)" # height, width, _ = frame.shape if "img2pose" in self.info["face_model"]: faces, poses = self.face_detector(frame) else: faces = self.face_detector(frame) if len(faces) == 0: print("Warning: NO FACE is detected") return faces def detect_landmarks(self, frame, detected_faces): """Detect landmarks from image or video frame Args: frame (array): image array detected_faces (array): Returns: list: x and y landmark coordinates (1,68,2) Examples: >>> import cv2 >>> frame = cv2.imread(imgfile) >>> from feat import Detector >>> detector = Detector() >>> detected_faces = detector.detect_faces(frame) >>> detector.detect_landmarks(frame, detected_faces) """ # check if frame is 4d if frame.ndim == 3: frame = np.expand_dims(frame, 0) assert frame.ndim == 4, "Frame needs to be 4 dimensions (list of images)" mean = np.asarray([0.485, 0.456, 0.406]) std = np.asarray([0.229, 0.224, 0.225]) self.landmark_detector.eval() if self.info["landmark_model"]: if self.info["landmark_model"].lower() == "mobilenet": out_size = 224 else: out_size = 112 _, height, width, _ = frame.shape landmark_list = [] concate_arr, len_frames_faces, bbox_list = self._face_preprocesing( frame=frame, detected_faces=detected_faces, mean=mean, std=std, out_size=out_size, height=height, width=width) # Run through the deep leanring model input = torch.from_numpy(concate_arr).float() input = torch.autograd.Variable(input) if self.info["landmark_model"]: if self.info["landmark_model"].lower() == "mobilefacenet": landmark = self.landmark_detector(input)[0].cpu().data.numpy() else: landmark = self.landmark_detector(input).cpu().data.numpy() landmark_results = [] landmark = landmark.reshape(landmark.shape[0], -1, 2) for ik in range(landmark.shape[0]): landmark2 = bbox_list[ik].reprojectLandmark(landmark[ik, :, :]) landmark_results.append(landmark2) list_concat = [] new_lens = np.insert(np.cumsum(len_frames_faces), 0, 0) for ij in range(len(len_frames_faces)): list_concat.append(landmark_results[new_lens[ij]:new_lens[ij + 1]]) return list_concat def _batch_hog(self, frames, detected_faces, landmarks): """ NEW Helper function used in batch processing hog features frames is a batch of frames """ len_index = [len(aa) for aa in landmarks] lenth_cumu = np.cumsum(len_index) lenth_cumu2 = np.insert(lenth_cumu, 0, 0) new_lands_list = [] flat_faces = [item for sublist in detected_faces for item in sublist] flat_land = [item for sublist in landmarks for item in sublist] hogs_arr = None for i in range(len(flat_land)): frame_assignment = np.where(i < lenth_cumu)[0][0] convex_hull, new_lands = self.extract_face( frame=frames[frame_assignment], detected_faces=[flat_faces[i][0:4]], landmarks=flat_land[i], size_output=112) hogs = self.extract_hog(frame=convex_hull, visualize=False).reshape(1, -1) if hogs_arr is None: hogs_arr = hogs else: hogs_arr = np.concatenate([hogs_arr, hogs], 0) new_lands_list.append(new_lands) new_lands = [] for i in range(len(lenth_cumu)): new_lands.append(new_lands_list[lenth_cumu2[i]:(lenth_cumu2[i + 1])]) return (hogs_arr, new_lands) def _face_preprocesing(self, frame, detected_faces, mean, std, out_size, height, width): """ NEW Helper function used in batch detecting landmarks Let's assume that frame is of shape B x H x W x 3 """ lenth_index = [len(ama) for ama in detected_faces] lenth_cumu = np.cumsum(lenth_index) flat_faces = [item for sublist in detected_faces for item in sublist] # Flatten the faces concatenated_face = None bbox_list = [] for k, face in enumerate(flat_faces): frame_assignment = np.where( k <= lenth_cumu)[0][0] # which frame is it? x1 = face[0] y1 = face[1] x2 = face[2] y2 = face[3] w = x2 - x1 + 1 h = y2 - y1 + 1 size = int(min([w, h]) * 1.2) cx = x1 + w // 2 cy = y1 + h // 2 x1 = cx - size // 2 x2 = x1 + size y1 = cy - size // 2 y2 = y1 + size dx = max(0, -x1) dy = max(0, -y1) x1 = max(0, x1) y1 = max(0, y1) edx = max(0, x2 - width) edy = max(0, y2 - height) x2 = min(width, x2) y2 = min(height, y2) new_bbox = list(map(int, [x1, x2, y1, y2])) new_bbox = BBox(new_bbox) cropped = frame[frame_assignment, new_bbox.top:new_bbox.bottom, new_bbox.left:new_bbox.right] bbox_list.append(new_bbox) if dx > 0 or dy > 0 or edx > 0 or edy > 0: cropped = cv2.copyMakeBorder( cropped, int(dy), int(edy), int(dx), int(edx), cv2.BORDER_CONSTANT, 0, ) cropped_face = cv2.resize(cropped, (out_size, out_size)) if cropped_face.shape[0] <= 0 or cropped_face.shape[1] <= 0: continue test_face = cropped_face.copy() test_face = test_face / 255.0 if self.info["landmark_model"]: if self.info["landmark_model"].lower() == "mobilenet": test_face = (test_face - mean) / std test_face = test_face.transpose((2, 0, 1)) test_face = test_face.reshape((1, ) + test_face.shape) if concatenated_face is None: concatenated_face = test_face else: concatenated_face = np.concatenate( [concatenated_face, test_face], 0) return (concatenated_face, lenth_index, bbox_list) def extract_face(self, frame, detected_faces, landmarks, size_output=112): """Extract a face in a frame with a convex hull of landmarks. This function extracts the faces of the frame with convex hulls and masks out the rest. Args: frame (array): The original image] detected_faces (list): face bounding box landmarks (list): the landmark information] size_output (int, optional): [description]. Defaults to 112. Returns: resized_face_np: resized face as a numpy array new_landmarks: landmarks of aligned face """ detected_faces = np.array(detected_faces) landmarks = np.array(landmarks) detected_faces = detected_faces.astype(int) aligned_img, new_landmarks = align_face_68pts(frame, landmarks.flatten(), 2.5, img_size=size_output) hull = ConvexHull(new_landmarks) mask = grid_points_in_poly( shape=np.array(aligned_img).shape, # for some reason verts need to be flipped verts=list( zip(new_landmarks[hull.vertices][:, 1], new_landmarks[hull.vertices][:, 0]))) mask[0:np.min([new_landmarks[0][1], new_landmarks[16][1]]), new_landmarks[0][0]:new_landmarks[16][0]] = True aligned_img[~mask] = 0 resized_face_np = aligned_img resized_face_np = cv2.cvtColor(resized_face_np, cv2.COLOR_BGR2RGB) return resized_face_np, new_landmarks #, hull, mask, np.array(aligned_img).shape, list(zip(new_landmarks[hull.vertices][:, 1], new_landmarks[hull.vertices][:, 0])), origin_mask def extract_hog(self, frame, orientation=8, pixels_per_cell=(8, 8), cells_per_block=(2, 2), visualize=False): """Extract HOG features from a SINGLE frame. Args: frame (array]): Frame of image] orientation (int, optional): Orientation for HOG. Defaults to 8. pixels_per_cell (tuple, optional): Pixels per cell for HOG. Defaults to (8,8). cells_per_block (tuple, optional): Cells per block for HOG. Defaults to (2,2). visualize (bool, optional): Whether to provide the HOG image. Defaults to False. Returns: hog_output: array of HOG features, and the HOG image if visualize is True. """ hog_output = hog(frame, orientations=orientation, pixels_per_cell=pixels_per_cell, cells_per_block=cells_per_block, visualize=visualize, multichannel=True) if visualize: return (hog_output[0], hog_output[1]) else: return hog_output def detect_aus(self, frame, landmarks): """Detect Action Units from image or video frame Args: frame (array): image loaded in array format (n, m, 3) landmarks (array): 68 landmarks used to localize face. Returns: array: Action Unit predictions Examples: >>> import cv2 >>> frame = cv2.imread(imgfile) >>> from feat import Detector >>> detector = Detector() >>> detector.detect_aus(frame) """ # Assume that the Raw landmark is given in the format (n_land,2) #landmarks = np.transpose(landmarks) #if landmarks.shape[-1] == 68: # landmarks = convert68to49(landmarks) return self.au_model.detect_au(frame, landmarks) def _concatenate_batch(self, indexed_length, au_results): """ NEW helper function to convert batch AUs to desired list of list only useful for our emotion and au prediction results Args: indexed_length: (list) the list index for number of faces in each frame. if you have 2 faces in each frame and you batch process 4 frames, it will be [2,2,2,2] au_results: (np.array), immediate result from running our au/emotion models Returns: list_concat: (list of list). The list which contains the number of faces. for example if you process 2 frames and each frame contains 4 faces, it will return: [[xxx,xxx,xxx,xxx],[xxx,xxx,xxx,xxx]] """ list_concat = [] new_lens = np.insert(np.cumsum(indexed_length), 0, 0) for ij in range(len(indexed_length)): list_concat.append(au_results[new_lens[ij]:new_lens[ij + 1], :]) return list_concat def detect_emotions(self, frame, facebox, landmarks): """Detect emotions from image or video frame Args: frame ([type]): [description] facebox ([type]): [description] landmarks ([type]): [description] Returns: array: Action Unit predictions Examples: >>> import cv2 >>> frame = cv2.imread(imgfile) >>> from feat import Detector >>> detector = Detector() >>> detected_faces = detector.detect_faces(frame) >>> detected_landmarks = detector.detect_landmarks(frame, detected_faces) >>> detector.detect_emotions(frame, detected_faces, detected_landmarks) """ if self.info["emotion_model"].lower() == 'fer': #landmarks = np.transpose(landmarks) #if landmarks.shape[-1] == 68: # landmarks = convert68to49(landmarks) # landmarks = landmarks.T return self.emotion_model.detect_emo(frame, landmarks) elif self.info["emotion_model"].lower() == 'resmasknet': return self.emotion_model.detect_emo(frame, facebox) elif self.info["emotion_model"].lower() in ['svm', 'rf']: return self.emotion_model.detect_emo(frame, landmarks) else: raise ValueError( 'Cannot recognize input emo model! Please try to re-type emotion model' ) def detect_facepose(self, frame, detected_faces=None, landmarks=None): """ Detect facepose from image or video frame. - When used with img2pose, returns *all* detected poses, and facebox and landmarks are ignored. Use `detect_face` method in order to obtain bounding boxes corresponding to the detected poses returned by this method. - When used with pnp model, 'facebox' param is ignored, and the passed 2D landmarks are used to compute the head pose for the single face associated with the passed landmarks. frame (np.ndarray): list of cv2 images detected_faces (list): (num_images, num_faces, 4) faceboxes representing faces in the list of images landmarks (np.ndarray): (num_images, num_faces, 68, 2) landmarks for the faces contained in list of images Returns: np.ndarray: (num_images, num_faces, [pitch, roll, yaw]) - Euler angles (in degrees) for each face within in each image Examples: # With img2pose >>> import cv2 >>> frame = [cv2.imread(imgfile)] >>> from feat import Detector >>> detector = Detector(face_model='imgpose', facepose_model='img2pose') >>> detector.detect_facepose([frame]) # one shot computation # With PnP >>> import cv2 >>> frame = [cv2.imread(imgfile)] >>> from feat import Detector >>> detector = Detector(face_model='retinaface', landmark_model='mobilefacenet', facepose_model='pnp') >>> faces = detector.detect_faces(frame) >>> landmarks = detector.detect_landmarks(detected_faces=faces) >>> detector.detect_facepose(frame=frame, landmarks=landmarks) # detect pose for all faces """ # check if frame is 4d if frame.ndim == 3: frame = np.expand_dims(frame, 0) assert frame.ndim == 4, "Frame needs to be 4 dimensions (list of images)" # height, width, _ = frame.shape if "img2pose" in self.info["face_model"]: faces, poses = self.facepose_detector(frame) else: poses = self.facepose_detector(frame, landmarks) return poses def process_frame(self, frames, counter=0, singleframe4error=False, skip_frame_rate=1): """function to run face detection, landmark detection, and emotion detection on a frame. TODO: probably need to add exceptions. The exception handling is not great yet NEW Args: frames (np.array): batch of frames, of shape BxHxWxC (read from cv2) counter (int, str, default=0): Index used for the prediction results dataframe. tracks the batches singleframe4error (bool, default = False): When exception occurs inside a batch, instead of nullify the whole batch, process each img in batch individually Returns: feat.data.Fex (dataframe): Prediction results dataframe. int: counter - the updated number of counter. Used to track the batch size and image number """ # check if frame is 4d if frames.ndim == 3: frames = np.expand_dims(frames, 0) assert frames.ndim == 4, "Frame needs to be 4 dimensions (list of images)" out = None # TODO Changed here try: detected_faces = self.detect_faces(frame=frames) landmarks = self.detect_landmarks(frame=frames, detected_faces=detected_faces) poses = self.detect_facepose(frame=frames, detected_faces=detected_faces, landmarks=landmarks) index_len = [len(ii) for ii in landmarks] if self["au_model"].lower() in ['logistic', 'svm', 'rf']: # landmarks_2 = round_vals(landmarks,3) landmarks_2 = landmarks hog_arr, new_lands = self._batch_hog( frames=frames, detected_faces=detected_faces, landmarks=landmarks_2) au_occur = self.detect_aus(frame=hog_arr, landmarks=new_lands) else: au_occur = self.detect_aus(frame=frames, landmarks=landmarks) if self["emotion_model"].lower() in ['svm', 'rf']: hog_arr, new_lands = self._batch_hog( frames=frames, detected_faces=detected_faces, landmarks=landmarks) emo_pred = self.detect_emotions(frame=hog_arr, facebox=None, landmarks=new_lands) else: emo_pred = self.detect_emotions(frame=frames, facebox=detected_faces, landmarks=landmarks) my_aus = self._concatenate_batch(indexed_length=index_len, au_results=au_occur) my_emo = self._concatenate_batch(indexed_length=index_len, au_results=emo_pred) for i, sessions in enumerate(detected_faces): for j, faces in enumerate(sessions): facebox_df = pd.DataFrame( [[ faces[0], faces[1], faces[2] - faces[0], faces[3] - faces[1], faces[4], ]], columns=self["face_detection_columns"], index=[counter + j], ) facepose_df = pd.DataFrame( [poses[i][j].flatten(order="F")], columns=self["facepose_model_columns"], index=[counter + j]) landmarks_df = pd.DataFrame( [landmarks[i][j].flatten(order="F")], columns=self["face_landmark_columns"], index=[counter + j], ) au_occur_df = pd.DataFrame( my_aus[i][j, :].reshape( 1, len(self["au_presence_columns"])), columns=self["au_presence_columns"], index=[counter + j]) emo_pred_df = pd.DataFrame(my_emo[i][j, :].reshape( 1, len(FEAT_EMOTION_COLUMNS)), columns=FEAT_EMOTION_COLUMNS, index=[counter + j]) tmp_df = pd.concat([ facebox_df, landmarks_df, au_occur_df, facepose_df, emo_pred_df ], axis=1) tmp_df[FEAT_TIME_COLUMNS] = counter if out is None: out = tmp_df else: out = pd.concat([out, tmp_df], axis=0) # out[FEAT_TIME_COLUMNS] = counter counter += skip_frame_rate return out, counter except: traceback.print_exc() print("exception occurred in the batch") if singleframe4error: print("Trying to process one image at a time in the batch") raise FaceDetectionError else: print( "Since singleframe4error=FALSE, giving up this entire batch result" ) newdf = None for cter in range(frames.shape[0]): emotion_df = self._empty_emotion.reindex( index=[counter + cter]) facebox_df = self._empty_facebox.reindex( index=[counter + cter]) facepose_df = self._empty_facepose.reindex( index=[counter + cter]) landmarks_df = self._empty_landmark.reindex( index=[counter + cter]) au_occur_df = self._empty_auoccurence.reindex( index=[counter + cter]) out = pd.concat([ facebox_df, landmarks_df, au_occur_df, facepose_df, emotion_df ], axis=1) out[FEAT_TIME_COLUMNS] = counter + cter if newdf is None: newdf = out else: newdf = pd.concat([newdf, out], axis=0) return (newdf, counter + frames.shape[0]) def detect_video(self, inputFname, batch_size=5, outputFname=None, skip_frames=1, verbose=False, singleframe4error=False): """Detects FEX from a video file. Args: inputFname (str): Path to video file outputFname (str, optional): Path to output file. Defaults to None. bacth_size (int, optional): how many batches of images you want to run at one shot. Larger gives faster speed but is more memory-consuming skip_frames (int, optional): Number of every other frames to skip for speed or if not all frames need to be processed. Defaults to 1. singleframe4error (bool, default = False): When set True, when exception occurs inside a batch, instead of nullify the whole batch, process each img in batch individually Returns: dataframe: Prediction results dataframe if outputFname is None. Returns True if outputFname is specified. """ self.info["inputFname"] = inputFname self.info["outputFname"] = outputFname init_df = pd.DataFrame(columns=self["output_columns"]) if outputFname: init_df.to_csv(outputFname, index=False, header=True) cap = cv2.VideoCapture(inputFname) length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) frames_to_process = int(np.ceil(length / skip_frames)) counter = 0 frame_got = True if verbose: print("Processing video.") # single core concat_frame = None while True: frame_got, frame = cap.read() if frame_got: if counter % skip_frames == 0: # if the if concat_frame is None: concat_frame = np.expand_dims(frame, 0) tmp_counter = counter else: concat_frame = np.concatenate( [concat_frame, np.expand_dims(frame, 0)], 0) if (concat_frame is not None) and (counter != 0) and ( concat_frame.shape[0] % batch_size == 0): # I think it's probably this error if singleframe4error: try: df, _ = self.process_frame( concat_frame, counter=tmp_counter, singleframe4error=singleframe4error, skip_frame_rate=skip_frames) except FaceDetectionError: df = None for id_fr in range(concat_frame.shape[0]): tmp_df, _ = self.process_frame( concat_frame[id_fr:(id_fr + 1)], counter=tmp_counter, singleframe4error=False, skip_frame_rate=skip_frames) tmp_counter += 1 if df is None: df = tmp_df else: df = pd.concat((df, tmp_df), 0) else: df, _ = self.process_frame(concat_frame, counter=tmp_counter, skip_frame_rate=skip_frames) df["input"] = inputFname if outputFname: df[init_df.columns].to_csv(outputFname, index=False, header=False, mode="a") else: init_df = pd.concat([init_df, df[init_df.columns]], axis=0) concat_frame = None tmp_counter = None counter = counter + 1 else: # process remaining frames if concat_frame is not None: if singleframe4error: try: df, _ = self.process_frame( concat_frame, counter=tmp_counter, skip_frame_rate=skip_frames) except FaceDetectionError: df = None for id_fr in range(concat_frame.shape[0]): tmp_df, _ = self.process_frame( concat_frame[id_fr:(id_fr + 1)], counter=tmp_counter, singleframe4error=False, skip_frame_rate=skip_frames) tmp_counter += 1 if df is None: df = tmp_df else: df = pd.concat((df, tmp_df), 0) else: df, _ = self.process_frame(concat_frame, counter=tmp_counter, skip_frame_rate=skip_frames) df["input"] = inputFname if outputFname: df[init_df.columns].to_csv(outputFname, index=False, header=False, mode="a") else: init_df = pd.concat([init_df, df[init_df.columns]], axis=0) break cap.release() if outputFname: return True else: return Fex( init_df, filename=inputFname, au_columns=self["au_presence_columns"], emotion_columns=FEAT_EMOTION_COLUMNS, facebox_columns=FEAT_FACEBOX_COLUMNS, landmark_columns=openface_2d_landmark_columns, facepose_columns=FACET_FACEPOSE_COLUMNS, time_columns=FEAT_TIME_COLUMNS, detector="Feat", ) def detect_image(self, inputFname, batch_size=5, outputFname=None, verbose=False, singleframe4error=False): """Detects FEX from an image file. Args: inputFname (list of str): Path to a list of paths to image files. bacth_size (int, optional): how many batches of images you want to run at one shot. Larger gives faster speed but is more memory-consuming outputFname (str, optional): Path to output file. Defaults to None. singleframe4error (bool, default = False): When set True, when exception occurs inside a batch, instead of nullify the whole batch, process each img in batch individually Rseturns: Fex: Prediction results dataframe if outputFname is None. Returns True if outputFname is specified. """ assert ( type(inputFname) == str or type(inputFname) == list ), "inputFname must be a string path to image or list of image paths" if type(inputFname) == str: inputFname = [inputFname] for inputF in inputFname: if not os.path.exists(inputF): raise FileNotFoundError(f"File {inputF} not found.") self.info["inputFname"] = inputFname init_df = pd.DataFrame(columns=self["output_columns"]) if outputFname: init_df.to_csv(outputFname, index=False, header=True) counter = 0 concat_frame = None input_names = [] while counter < len(inputFname): #if counter % skip_frames == 0: frame = np.expand_dims(cv2.imread(inputFname[counter]), 0) if concat_frame is None: concat_frame = frame tmp_counter = counter else: concat_frame = np.concatenate([concat_frame, frame], 0) input_names.append(inputFname[counter]) counter = counter + 1 if (counter % batch_size == 0) and (concat_frame is not None): if singleframe4error: try: df, _ = self.process_frame( concat_frame, counter=tmp_counter, singleframe4error=singleframe4error) except FaceDetectionError: df = None for id_fr in range(concat_frame.shape[0]): tmp_df, _ = self.process_frame( concat_frame[id_fr:(id_fr + 1)], counter=tmp_counter, singleframe4error=False) tmp_counter += 1 if df is None: df = tmp_df else: df = pd.concat((df, tmp_df), 0) else: df, _ = self.process_frame(concat_frame, counter=tmp_counter) df["input"] = input_names if outputFname: df[init_df.columns].to_csv(outputFname, index=False, header=False, mode="a") else: init_df = pd.concat([init_df, df[init_df.columns]], axis=0) concat_frame = None tmp_counter = None input_names = [] if len(inputFname) % batch_size != 0: # process remaining frames if concat_frame is not None: if singleframe4error: try: df, _ = self.process_frame(concat_frame, counter=tmp_counter) except FaceDetectionError: df = None for id_fr in range(concat_frame.shape[0]): tmp_df, _ = self.process_frame( concat_frame[id_fr:(id_fr + 1)], counter=tmp_counter, singleframe4error=False) tmp_counter += 1 if df is None: df = tmp_df else: df = pd.concat((df, tmp_df), 0) else: df, _ = self.process_frame(concat_frame, counter=tmp_counter) df["input"] = input_names if outputFname: df[init_df.columns].to_csv(outputFname, index=False, header=False, mode="a") else: init_df = pd.concat([init_df, df[init_df.columns]], axis=0) if outputFname: return True else: return Fex( init_df, filename=inputFname, au_columns=self['au_presence_columns'], emotion_columns=FEAT_EMOTION_COLUMNS, facebox_columns=FEAT_FACEBOX_COLUMNS, landmark_columns=openface_2d_landmark_columns, facepose_columns=FACET_FACEPOSE_COLUMNS, time_columns=FACET_TIME_COLUMNS, detector="Feat", )