import glob
import os

import numpy as np

from realtimenet import camera
from realtimenet import engine


def extract_features(path_in, net, num_layers_finetune, use_gpu, minimum_frames=45):
    # Create inference engine
    inference_engine = engine.InferenceEngine(net, use_gpu=use_gpu)

    # Extract features for both splits
    for dataset in ["train", "valid"]:
        videos_dir = os.path.join(path_in, f"videos_{dataset}")
        features_dir = os.path.join(
            path_in, f"features_{dataset}_num_layers_to_finetune={num_layers_finetune}")
        video_files = glob.glob(os.path.join(videos_dir, "*", "*.mp4"))

        print(f"\nFound {len(video_files)} videos to process in the {dataset} set")

        for video_index, video_path in enumerate(video_files):
            print(f"\rExtract features from video {video_index + 1} / {len(video_files)}",
                  end="")
            path_out = video_path.replace(videos_dir, features_dir).replace(".mp4", ".npy")

            if os.path.isfile(path_out):
                print("\n\tSkipped - feature was already precomputed.")
            else:
                # Read all frames
                video_source = camera.VideoSource(camera_id=None,
                                                  size=inference_engine.expected_frame_size,
                                                  filename=video_path)
                video_fps = video_source.get_fps()
                frames = []
                while True:
                    images = video_source.get_image()
                    if images is None:
                        break
                    image, image_rescaled = images
                    frames.append(image_rescaled)

                # Resample frames to match the inference engine's frame rate
                frames = uniform_frame_sample(np.array(frames),
                                              inference_engine.fps / video_fps)

                # Pad videos that are too short by duplicating the first frame
                if frames.shape[0] < minimum_frames:
                    print(f"\nVideo too short: {video_path} - first frame will be duplicated")
                    num_missing_frames = minimum_frames - frames.shape[0]
                    frames = np.pad(frames, ((num_missing_frames, 0), (0, 0), (0, 0), (0, 0)),
                                    mode='edge')

                # Run inference and save the features to disk
                clip = frames[None].astype(np.float32)
                predictions = inference_engine.infer(clip)
                features = np.array(predictions)
                os.makedirs(os.path.dirname(path_out), exist_ok=True)
                np.save(path_out, features)

        print('\n')
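# Note: `extract_features` relies on `uniform_frame_sample`, a helper defined
# elsewhere in the repo that resamples the decoded frames so their effective
# rate matches the inference engine's fps. A minimal sketch of such a helper,
# as an illustrative assumption rather than the repo's exact implementation:
def uniform_frame_sample(frames, sample_rate):
    """Uniformly subsample `frames` when `sample_rate` < 1 (sketch)."""
    if sample_rate >= 1.0:
        return frames
    num_sampled = max(1, int(len(frames) * sample_rate))
    indices = np.round(np.linspace(0, len(frames) - 1, num_sampled)).astype(int)
    return frames[indices]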
# Load the pretrained feature extractor weights
# (the feature extractor itself is instantiated earlier in the script)
checkpoint = engine.load_weights('resources/backbone/strided_inflated_efficientnet.ckpt')
feature_extractor.load_state_dict(checkpoint)
feature_extractor.eval()

# Load a logistic regression classifier
gesture_classifier = LogisticRegression(num_in=feature_extractor.feature_dim,
                                        num_out=30)
checkpoint = engine.load_weights('resources/gesture_detection/efficientnet_logistic_regression.ckpt')
gesture_classifier.load_state_dict(checkpoint)
gesture_classifier.eval()

# Concatenate feature extractor and gesture classifier
net = Pipe(feature_extractor, gesture_classifier)

# Create inference engine, video streaming and display instances
inference_engine = engine.InferenceEngine(net, use_gpu=use_gpu)

video_source = camera.VideoSource(camera_id=camera_id,
                                  size=inference_engine.expected_frame_size,
                                  filename=path_in)
video_stream = camera.VideoStream(video_source, inference_engine.fps)

postprocessor = [
    PostprocessClassificationOutput(INT2LAB, smoothing=4)
]

display_ops = [
    realtimenet.display.DisplayTopKClassificationOutputs(top_k=1, threshold=0.5),
]
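# Note: `Pipe` above chains the feature extractor and the classifier into a
# single network. A minimal sketch of what such a wrapper could look like
# (the name `SimplePipe` and this implementation are illustrative assumptions;
# the repo's `Pipe` presumably also forwards backbone attributes such as `fps`
# and `expected_frame_size`, which `InferenceEngine` exposes):
import torch.nn as nn

class SimplePipe(nn.Module):
    def __init__(self, feature_extractor, classifier):
        super().__init__()
        self.feature_extractor = feature_extractor
        self.classifier = classifier

    def forward(self, video_clip):
        # Features from the backbone feed straight into the classifier head
        return self.classifier(self.feature_extractor(video_clip))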
def _setup_inference_engine(self):
    self.inference_engine = engine.InferenceEngine(self.net, use_gpu=True)
    video_source = camera.VideoSource(
        camera_id=0,
        size=self.inference_engine.expected_frame_size)
    self.frame_grabber = camera.VideoStream(video_source, self.inference_engine.fps)
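# Note: `use_gpu=True` is hardcoded above, which may fail on machines without
# a CUDA device. Assuming the backbone is a PyTorch model (as the
# `load_state_dict` calls elsewhere suggest), a defensive variant could be:
import torch

def _setup_inference_engine(self):
    self.inference_engine = engine.InferenceEngine(self.net,
                                                   use_gpu=torch.cuda.is_available())
    video_source = camera.VideoSource(
        camera_id=0,
        size=self.inference_engine.expected_frame_size)
    self.frame_grabber = camera.VideoStream(video_source, self.inference_engine.fps)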