class SoundRecognitionApp:
    """Realtime sound recognition app.

    Captures audio chunks, converts them into a mel-spectrogram image and
    runs a multilabel classification model on each chunk.
    """

    def __init__(self, cfg) -> None:
        """Build the signal transformer, the audio source and the model.

        Args:
            cfg (dict): configuration with 'transforms', 'audio' and 'model'
                sections.
        """
        self.transformer = Signal2ImageTransformer(**cfg['transforms'])
        self.audio = Audio(cfg['audio'])
        self.load_model(cfg['model'])

    def run(self):
        """Main realtime loop: read audio chunks until an error or Ctrl-C."""
        print("============= REALTIME START ==============")
        self.audio.start()
        self.flag = True
        try:
            while self.flag:
                status, data = self.audio.get()
                if status == Audio.ERROR:
                    print('[error]')
                    break
                elif status == Audio.WAIT:
                    # No data available yet; poll again.
                    continue
                mel_spec = self.preprocess(data)
                result = self.inference(mel_spec)
        except KeyboardInterrupt:
            pass
        except Exception as e:
            print(e)
        finally:
            # Always release the audio device, whatever ended the loop.
            self.audio.stop()
            print("============= REALTIME FINISH ==============")

    def preprocess(self, signal):
        """Transform a raw signal into a batched model input.

        Args:
            signal: raw audio samples accepted by the transformer.

        Returns:
            numpy.ndarray: transformed image with a leading batch axis.
        """
        return np.expand_dims(self.transformer.transform(signal), axis=0)

    def inference(self, X):
        """Run the model on one batch and return multilabel probabilities.

        Args:
            X (numpy.ndarray): batched model input.

        Returns:
            numpy.ndarray: 'multilabel_proba' output of the model.
        """
        image = torch.from_numpy(X.astype(np.float32)).clone()
        # BUG FIX: Tensor.to() is NOT in-place — the result must be
        # reassigned, otherwise the model always received the original
        # (un-moved, possibly wrong-device) tensor.
        image = image.to(self.device).float()
        prob = self.model(image)['multilabel_proba'].detach().cpu().numpy()
        return prob

    def load_model(self, cfg):
        """Instantiate the configured model, load its weights and move it
        to the configured device.

        Exits the process with status 1 on any failure.

        Args:
            cfg (dict): model section with 'device', 'name', 'params', 'path'.
        """
        try:
            self.device = torch.device(cfg["device"])
            self.model = getattr(ml.my_model, cfg['name'])(**cfg['params'])
            self.model.load_state_dict(torch.load(cfg['path']))
            self.model.to(self.device)
            # Inference-only app: disable dropout / use batchnorm running
            # statistics.
            self.model.eval()
        except AttributeError as e:
            print(f"Model {cfg['name']} is None. {e}")
            exit(1)
        except FileNotFoundError as e:
            print(f"{e}")
            exit(1)
        except Exception as e:
            print(f"{e}")
            exit(1)
class MusicChangePointDetector(object):
    """Detects change points (modulations) in a music stream.

    A ChangeFinder score is computed on the similarity between the latest
    spectrum and a buffer of recent spectra; a score above the configured
    threshold is reported as a change point.
    """

    def __init__(self, setting_path: str, audio_path: str):
        """Constructor.

        Args:
            setting_path (str): path to the YAML settings file.
            audio_path (str): path to the music file.
        """
        with open(setting_path, 'r') as f:
            # BUG FIX: yaml.load() without an explicit Loader is deprecated
            # (and a TypeError in PyYAML >= 6) and unsafe on untrusted input;
            # safe_load is the correct call for a plain config file.
            cfg = yaml.safe_load(f)
        self.cf = cf.ChangeFinder(**cfg['change_finder'])
        self.audio = Audio(cfg['audio'], audio_file_path=audio_path)
        # Ring buffer of the most recent audio samples.
        self.buffer = np.zeros(cfg['model']['buffer_audio_length'],
                               dtype=np.float32)
        self.buf_num = int(cfg['model']['frame_buf_num'])
        self.spec_buf = []
        self.thr = float(cfg['model']['thr'])

    def run(self):
        """Start the main loop."""
        self.audio.start()
        try:
            while True:
                status, data = self.audio.get()
                if status == Audio.ERROR:
                    break
                elif status == Audio.WAIT:
                    # No data available yet; poll again.
                    continue
                # Slide the ring buffer left and append the newest chunk.
                self.buffer = np.roll(self.buffer, -data.shape[0], axis=0)
                self.buffer[-data.shape[0]:] = data
                if self.detect():
                    print('detect')
        except KeyboardInterrupt:
            print('Interrupt')
        finally:
            # Release the audio device even when the loop ends via break.
            self.audio.stop()

    def detect(self):
        """Detection.

        Returns:
            bool: whether a change (modulation) occurred.
        """
        is_detect = False
        # Time-averaged dB spectrum of the current buffer, first 512 bins,
        # L2-normalized so the dot products below act as cosine similarity.
        D = np.average(librosa.amplitude_to_db(np.abs(librosa.stft(
            self.buffer)), ref=np.max), axis=1)[:512]
        D /= np.linalg.norm(D, ord=2)
        self.spec_buf.append(D)
        if len(self.spec_buf) > self.buf_num:
            # Mean similarity between the newest spectrum and the rest of
            # the window.
            similarity = np.average(
                np.dot(self.spec_buf[-1],
                       np.array(self.spec_buf[-(self.buf_num - 1):-1]).T))
            score = self.cf.update(similarity)
            # BUG FIX: pop() removed the element just appended, freezing the
            # reference window at the first buf_num spectra forever; pop(0)
            # drops the oldest spectrum so the comparison window slides.
            self.spec_buf.pop(0)
            if score > self.thr:
                is_detect = True
        return is_detect