def test_SoundScpReader(tmp_path: Path):
    """Check that SoundScpReader returns the raw int16 audio listed in an scp file.

    Two random 16-sample signals are written as wav files, registered in a
    two-line scp file, and then read back through the mapping-style API
    (``__getitem__``, ``len``, ``in``, ``keys``, iteration, ``get_path``).
    """
    sample_rate = 16
    audio_path1 = tmp_path / "a1.wav"
    audio_path2 = tmp_path / "a2.wav"
    # Draw both signals before writing anything, in the same order as they are
    # registered in the scp file.
    audio1 = np.random.randint(-100, 100, 16, dtype=np.int16)
    audio2 = np.random.randint(-100, 100, 16, dtype=np.int16)
    for wav_path, signal in ((audio_path1, audio1), (audio_path2, audio2)):
        soundfile.write(wav_path, signal, sample_rate)

    scp_path = tmp_path / "dummy.scp"
    with scp_path.open("w") as fout:
        fout.write(f"abc {audio_path1}\n")
        fout.write(f"def {audio_path2}\n")

    desired = {"abc": (16, audio1), "def": (16, audio2)}
    target = SoundScpReader(scp_path, normalize=False, dtype=np.int16)

    # Every entry must come back with the same rate and identical samples.
    for key, (expected_rate, expected_audio) in desired.items():
        actual_rate, actual_audio = target[key]
        assert actual_rate == expected_rate
        np.testing.assert_array_equal(actual_audio, expected_audio)

    # Mapping protocol: size, membership and key order.
    assert len(target) == len(desired)
    for key in ("abc", "def"):
        assert key in target
    assert tuple(target.keys()) == tuple(desired)
    assert tuple(target) == tuple(desired)

    # The reader must also expose the original file paths.
    assert target.get_path("abc") == str(audio_path1)
    assert target.get_path("def") == str(audio_path2)
def test_SoundScpReader_normalize(tmp_path: Path):
    """Check that SoundScpReader with ``normalize=True`` returns the float audio.

    The reference signals are int16-valued samples divided by 2**15, written
    to wav files, and expected back unchanged from the reader.
    """
    sample_rate = 16
    int16_scale = np.iinfo(np.int16).max + 1
    audio_path1 = tmp_path / "a1.wav"
    audio_path2 = tmp_path / "a2.wav"

    # Random integer samples first (audio1, then audio2), then rescale both.
    raw1 = np.random.randint(-100, 100, 16, dtype=np.int16)
    raw2 = np.random.randint(-100, 100, 16, dtype=np.int16)
    audio1 = raw1.astype(np.float64) / int16_scale
    audio2 = raw2.astype(np.float64) / int16_scale

    for wav_path, signal in ((audio_path1, audio1), (audio_path2, audio2)):
        soundfile.write(wav_path, signal, sample_rate)

    scp_path = tmp_path / "dummy.scp"
    with scp_path.open("w") as fout:
        fout.write(f"abc {audio_path1}\n")
        fout.write(f"def {audio_path2}\n")

    desired = {"abc": (16, audio1), "def": (16, audio2)}
    target = SoundScpReader(scp_path, normalize=True)

    for key, (expected_rate, expected_audio) in desired.items():
        actual_rate, actual_audio = target[key]
        assert actual_rate == expected_rate
        np.testing.assert_array_equal(actual_audio, expected_audio)
def sound_loader(path, float_dtype=None):
    """Create an array-returning loader for a sound scp file.

    The scp file maps utterance ids to audio files, one entry per line::

        utterance_id_A /some/where/a.wav
        utterance_id_B /some/where/a.flac

    Args:
        path: Path of the scp file handed to ``SoundScpReader``.
        float_dtype: Forwarded unchanged to ``AdapterForSoundScpReader``.

    Returns:
        An ``AdapterForSoundScpReader`` wrapping the underlying reader.
    """
    # NOTE(kamo): SoundScpReader doesn't support pipe-fashion
    # like Kaldi e.g. "cat a.wav |".
    # NOTE(kamo): The audio signal is normalized to [-1,1] range.
    scp_reader = SoundScpReader(path, normalize=True, always_2d=False)

    # SoundScpReader.__getitem__() returns Tuple[int, ndarray],
    # but ndarray is desired, so Adapter class is inserted here
    adapter = AdapterForSoundScpReader(scp_reader, float_dtype)
    return adapter
def test_SoundScpWriter_normalize(tmp_path: Path):
    """Round-trip float audio through SoundScpWriter and a normalizing reader.

    Float signals (int16-valued samples divided by 2**15) are written with an
    int16 writer and must be read back unchanged with ``normalize=True``.
    A 3-D array must be rejected by the writer with ``RuntimeError``.
    """
    int16_scale = np.iinfo(np.int16).max + 1
    raw1 = np.random.randint(-100, 100, 16, dtype=np.int16)
    raw2 = np.random.randint(-100, 100, 16, dtype=np.int16)
    audio1 = raw1.astype(np.float64) / int16_scale
    audio2 = raw2.astype(np.float64) / int16_scale

    scp_path = tmp_path / "wav.scp"
    with SoundScpWriter(tmp_path, scp_path, dtype=np.int16) as writer:
        writer["abc"] = 16, audio1
        writer["def"] = 16, audio2
        # Unsupported dimension
        with pytest.raises(RuntimeError):
            y = np.random.randint(-100, 100, [16, 1, 1], dtype=np.int16)
            writer["ghi"] = 16, y

    target = SoundScpReader(scp_path, normalize=True, dtype=np.float64)
    desired = {"abc": (16, audio1), "def": (16, audio2)}

    for key, (expected_rate, expected_audio) in desired.items():
        actual_rate, actual_audio = target[key]
        assert actual_rate == expected_rate
        np.testing.assert_array_equal(actual_audio, expected_audio)
def test_SoundScpWriter(tmp_path: Path):
    """Round-trip int16 audio through SoundScpWriter and SoundScpReader.

    Two signals are written, a 3-D array is expected to be rejected with
    ``RuntimeError``, and the written entries must be readable unchanged.
    The writer must also report ``<output_dir>/<key>.wav`` as each path.
    """
    audio1 = np.random.randint(-100, 100, 16, dtype=np.int16)
    audio2 = np.random.randint(-100, 100, 16, dtype=np.int16)

    scp_path = tmp_path / "wav.scp"
    with SoundScpWriter(tmp_path, scp_path, dtype=np.int16) as writer:
        writer["abc"] = 16, audio1
        writer["def"] = 16, audio2
        # Unsupported dimension
        with pytest.raises(RuntimeError):
            y = np.random.randint(-100, 100, [16, 1, 1], dtype=np.int16)
            writer["ghi"] = 16, y

    target = SoundScpReader(scp_path, normalize=False, dtype=np.int16)
    desired = {"abc": (16, audio1), "def": (16, audio2)}

    for key, (expected_rate, expected_audio) in desired.items():
        actual_rate, actual_audio = target[key]
        assert actual_rate == expected_rate
        np.testing.assert_array_equal(actual_audio, expected_audio)

    # Paths stay queryable on the writer after the context has been closed.
    assert writer.get_path("abc") == str(tmp_path / "abc.wav")
    assert writer.get_path("def") == str(tmp_path / "def.wav")
def main(argv):
    """Extract utterance-level and speaker-level x-vectors for a data directory.

    With the ``speechbrain`` toolkit, ``spk2utt`` and ``wav.scp`` are read from
    ``args.in_folder``; every utterance is peak-normalized, passed through
    SpeechBrain's ``AudioNormalizer`` and embedded with a pretrained
    ``EncoderClassifier``. Per-utterance embeddings go to
    ``xvector.{ark,scp}`` and the per-speaker mean embedding to
    ``spk_xvector.{ark,scp}``, both inside ``args.out_folder``.

    Args:
        argv: Command-line arguments, parsed with the parser from
            ``get_parser()``.

    Raises:
        NotImplementedError: If ``args.toolkit`` is ``"espnet"``.
        ValueError: If ``args.toolkit`` is any other unsupported value.
    """
    parser = get_parser()
    args = parser.parse_args(argv)

    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=log_format)
    else:
        logging.basicConfig(level=logging.WARN, format=log_format)
        logging.warning("Skip DEBUG/INFO messages")

    # Fall back to CPU unless a CUDA device was requested and is available.
    if torch.cuda.is_available() and ("cuda" in args.device):
        device = args.device
    else:
        device = "cpu"

    if args.toolkit == "speechbrain":
        from speechbrain.dataio.preprocess import AudioNormalizer
        from speechbrain.pretrained import EncoderClassifier

        # Prepare spk2utt for mean x-vector
        spk2utt = dict()
        with open(os.path.join(args.in_folder, "spk2utt"), "r") as reader:
            for line in reader:
                details = line.split()
                if not details:
                    # Tolerate blank lines instead of failing on details[0].
                    continue
                spk2utt[details[0]] = details[1:]

        # TODO(nelson): The model inference can be moved into a function.
        classifier = EncoderClassifier.from_hparams(
            source=args.pretrained_model, run_opts={"device": device}
        )
        audio_norm = AudioNormalizer()

        wav_scp = SoundScpReader(os.path.join(args.in_folder, "wav.scp"))
        os.makedirs(args.out_folder, exist_ok=True)
        utt_wspecifier = "ark,scp:{0}/xvector.ark,{0}/xvector.scp".format(
            args.out_folder
        )
        spk_wspecifier = "ark,scp:{0}/spk_xvector.ark,{0}/spk_xvector.scp".format(
            args.out_folder
        )

        # Context managers guarantee that both archives are flushed and closed
        # even when reading or embedding an utterance raises.
        with kaldiio.WriteHelper(utt_wspecifier) as writer_utt, \
                kaldiio.WriteHelper(spk_wspecifier) as writer_spk:
            for speaker in tqdm(spk2utt):
                xvectors = list()
                for utt in spk2utt[speaker]:
                    in_sr, wav = wav_scp[utt]
                    # Amp Normalization -1 ~ 1
                    # Convert to float first so that abs() cannot overflow on
                    # integer samples, and skip the division for all-zero
                    # audio, which would otherwise produce NaNs.
                    wav = wav.astype(np.float32)
                    amax = np.amax(np.absolute(wav))
                    if amax > 0:
                        wav = wav / amax
                    # Freq Norm
                    wav = audio_norm(torch.from_numpy(wav), in_sr).to(device)
                    # X-vector Embedding
                    embeds = classifier.encode_batch(wav).detach().cpu().numpy()[0]
                    writer_utt[utt] = np.squeeze(embeds)
                    xvectors.append(embeds)

                # Speaker Normalization
                embeds = np.mean(np.stack(xvectors, 0), 0)
                writer_spk[speaker] = embeds

    elif args.toolkit == "espnet":
        raise NotImplementedError(
            "Follow details at: https://github.com/espnet/espnet/issues/3040"
        )
    else:
        raise ValueError(
            f"Unknown type of toolkit: {args.toolkit}. "
            "Only supported: speechbrain, espnet, kaldi"
        )
def scoring(
    output_dir: str,
    dtype: str,
    log_level: Union[int, str],
    key_file: str,
    ref_scp: List[str],
    inf_scp: List[str],
    ref_channel: int,
):
    """Score separated/enhanced audio against references and write the results.

    For every key in ``key_file`` the reference and inferred signals of all
    speakers are loaded, the best speaker permutation is found with
    ``bss_eval_sources``, and STOI, SI_SNR, SDR, SAR and SIR are written per
    speaker through ``DatadirWriter`` into ``output_dir``. The scp entry of
    the inferred stream assigned to each speaker is written as
    ``wav_spk<N>``.

    Args:
        output_dir: Directory handed to ``DatadirWriter``.
        dtype: dtype passed to every ``SoundScpReader``.
        log_level: Level passed to ``logging.basicConfig``.
        key_file: Text file whose first whitespace-separated token per line
            is an utterance key. Blank lines are ignored.
        ref_scp: Reference scp files, one per speaker.
        inf_scp: Inferred scp files, one per speaker (same count as
            ``ref_scp``).
        ref_channel: Channel index selected from multi-channel audio.

    Raises:
        ValueError: If the reference is single-channel while the inferred
            audio is multi-channel.
    """
    assert check_argument_types()

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    assert len(ref_scp) == len(inf_scp), ref_scp
    num_spk = len(ref_scp)

    # Read the keys inside a context manager so the file handle is closed.
    with open(key_file, encoding="utf-8") as key_fd:
        keys = [
            line.rstrip().split(maxsplit=1)[0] for line in key_fd if line.strip()
        ]

    ref_readers = [SoundScpReader(f, dtype=dtype, normalize=True) for f in ref_scp]
    inf_readers = [SoundScpReader(f, dtype=dtype, normalize=True) for f in inf_scp]

    # get sample rate
    sample_rate, _ = ref_readers[0][keys[0]]

    # check keys
    for inf_reader, ref_reader in zip(inf_readers, ref_readers):
        assert inf_reader.keys() == ref_reader.keys()

    with DatadirWriter(output_dir) as writer:
        for key in keys:
            ref_audios = [ref_reader[key][1] for ref_reader in ref_readers]
            inf_audios = [inf_reader[key][1] for inf_reader in inf_readers]
            ref = np.array(ref_audios)
            inf = np.array(inf_audios)
            if ref.ndim > inf.ndim:
                # multi-channel reference and single-channel output
                ref = ref[..., ref_channel]
                assert ref.shape == inf.shape, (ref.shape, inf.shape)
            elif ref.ndim < inf.ndim:
                # single-channel reference and multi-channel output
                # Adjacent literals keep source indentation out of the message.
                raise ValueError(
                    "Reference must be multi-channel when the "
                    "network output is multi-channel."
                )
            elif ref.ndim == inf.ndim == 3:
                # multi-channel reference and output
                ref = ref[..., ref_channel]
                inf = inf[..., ref_channel]

            sdr, sir, sar, perm = bss_eval_sources(ref, inf, compute_permutation=True)

            for i in range(num_spk):
                # Index of the inferred stream assigned to reference speaker i.
                p = int(perm[i])
                stoi_score = stoi(ref[i], inf[p], fs_sig=sample_rate)
                si_snr_score = -float(
                    si_snr_loss(
                        torch.from_numpy(ref[i][None, ...]),
                        torch.from_numpy(inf[p][None, ...]),
                    )
                )
                writer[f"STOI_spk{i + 1}"][key] = str(stoi_score)
                writer[f"SI_SNR_spk{i + 1}"][key] = str(si_snr_score)
                writer[f"SDR_spk{i + 1}"][key] = str(sdr[i])
                writer[f"SAR_spk{i + 1}"][key] = str(sar[i])
                writer[f"SIR_spk{i + 1}"][key] = str(sir[i])
                # save permutation assigned script file
                writer[f"wav_spk{i + 1}"][key] = inf_readers[p].data[key]
def scoring(
    output_dir: str,
    dtype: str,
    log_level: Union[int, str],
    key_file: str,
    ref_scp: List[str],
    inf_scp: List[str],
    ref_channel: int,
    metrics: List[str],
    frame_size: int = 512,
    frame_hop: int = 256,
):
    """Score separated/enhanced audio with a selectable set of metrics.

    For every key in ``key_file`` the reference and inferred signals of all
    speakers are loaded and each requested metric is written per speaker
    through ``DatadirWriter`` into ``output_dir``. The speaker permutation
    comes from ``bss_eval_sources`` whenever a BSS-eval metric is requested
    or more than one speaker is scored. The scp entry of the inferred stream
    assigned to each speaker is written as ``wav_spk<N>``.

    Args:
        output_dir: Directory handed to ``DatadirWriter``.
        dtype: dtype passed to every ``SoundScpReader``.
        log_level: Level passed to ``logging.basicConfig``.
        key_file: Text file whose first whitespace-separated token per line
            is an utterance key. Blank lines are ignored.
        ref_scp: Reference scp files, one per speaker.
        inf_scp: Inferred scp files, one per speaker (same count as
            ``ref_scp``).
        ref_channel: Channel index selected from multi-channel audio.
        metrics: Metrics to compute; each must be one of ``STOI``, ``ESTOI``,
            ``SNR``, ``SI_SNR``, ``SDR``, ``SAR``, ``SIR``, ``framewise-SNR``.
        frame_size: ``n_fft`` of the STFT used for ``framewise-SNR``.
        frame_hop: ``hop_length`` of the STFT used for ``framewise-SNR``.

    Raises:
        ValueError: If the reference is single-channel while the inferred
            audio is multi-channel, or if an unsupported metric is reached.
    """
    assert check_argument_types()
    for metric in metrics:
        assert metric in (
            "STOI",
            "ESTOI",
            "SNR",
            "SI_SNR",
            "SDR",
            "SAR",
            "SIR",
            "framewise-SNR",
        ), metric

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    assert len(ref_scp) == len(inf_scp), ref_scp
    num_spk = len(ref_scp)

    # Read the keys inside a context manager so the file handle is closed.
    with open(key_file, encoding="utf-8") as key_fd:
        keys = [
            line.rstrip().split(maxsplit=1)[0] for line in key_fd if line.strip()
        ]

    ref_readers = [SoundScpReader(f, dtype=dtype, normalize=True) for f in ref_scp]
    inf_readers = [SoundScpReader(f, dtype=dtype, normalize=True) for f in inf_scp]

    # get sample rate
    fs, _ = ref_readers[0][keys[0]]

    # check keys
    for inf_reader, ref_reader in zip(inf_readers, ref_readers):
        assert inf_reader.keys() == ref_reader.keys()

    stft = STFTEncoder(n_fft=frame_size, hop_length=frame_hop)

    do_bss_eval = "SDR" in metrics or "SAR" in metrics or "SIR" in metrics
    # Spectrograms are only consumed by the framewise-SNR metric.
    do_framewise = "framewise-SNR" in metrics
    with DatadirWriter(output_dir) as writer:
        for key in keys:
            ref_audios = [ref_reader[key][1] for ref_reader in ref_readers]
            inf_audios = [inf_reader[key][1] for inf_reader in inf_readers]
            ref = np.array(ref_audios)
            inf = np.array(inf_audios)
            if ref.ndim > inf.ndim:
                # multi-channel reference and single-channel output
                ref = ref[..., ref_channel]
                assert ref.shape == inf.shape, (ref.shape, inf.shape)
            elif ref.ndim < inf.ndim:
                # single-channel reference and multi-channel output
                raise ValueError(
                    "Reference must be multi-channel when the "
                    "network output is multi-channel."
                )
            elif ref.ndim == inf.ndim == 3:
                # multi-channel reference and output
                ref = ref[..., ref_channel]
                inf = inf[..., ref_channel]

            if do_bss_eval or num_spk > 1:
                sdr, sir, sar, perm = bss_eval_sources(
                    ref, inf, compute_permutation=True
                )
            else:
                # Single speaker and no BSS-eval metric: identity permutation.
                perm = [0]

            if do_framewise:
                ilens = torch.LongTensor([ref.shape[1]])
                # (num_spk, T, F)
                ref_spec, _ = stft(torch.from_numpy(ref), ilens)
                inf_spec, _ = stft(torch.from_numpy(inf), ilens)

            for i in range(num_spk):
                # Index of the inferred stream assigned to reference speaker i.
                p = int(perm[i])
                for metric in metrics:
                    name = f"{metric}_spk{i + 1}"
                    if metric == "STOI":
                        writer[name][key] = str(
                            stoi(ref[i], inf[p], fs_sig=fs, extended=False)
                        )
                    elif metric == "ESTOI":
                        writer[name][key] = str(
                            stoi(ref[i], inf[p], fs_sig=fs, extended=True)
                        )
                    elif metric == "SNR":
                        snr_score = -float(
                            ESPnetEnhancementModel.snr_loss(
                                torch.from_numpy(ref[i][None, ...]),
                                torch.from_numpy(inf[p][None, ...]),
                            )
                        )
                        writer[name][key] = str(snr_score)
                    elif metric == "SI_SNR":
                        si_snr_score = -float(
                            ESPnetEnhancementModel.si_snr_loss(
                                torch.from_numpy(ref[i][None, ...]),
                                torch.from_numpy(inf[p][None, ...]),
                            )
                        )
                        writer[name][key] = str(si_snr_score)
                    elif metric == "SDR":
                        writer[name][key] = str(sdr[i])
                    elif metric == "SAR":
                        writer[name][key] = str(sar[i])
                    elif metric == "SIR":
                        writer[name][key] = str(sir[i])
                    elif metric == "framewise-SNR":
                        # Use the permuted inferred stream, like every other
                        # metric, so speaker i is compared with its match.
                        framewise_snr = -ESPnetEnhancementModel.snr_loss(
                            ref_spec[i].abs(), inf_spec[p].abs()
                        )
                        writer[name][key] = " ".join(map(str, framewise_snr.tolist()))
                    else:
                        raise ValueError("Unsupported metric: %s" % metric)
                # save permutation assigned script file
                writer[f"wav_spk{i + 1}"][key] = inf_readers[p].data[key]
def scoring(
    output_dir: str,
    dtype: str,
    log_level: Union[int, str],
    key_file: str,
    ref_scp: List[str],
    inf_scp: List[str],
    ref_channel: int,
    flexible_numspk: bool,
):
    """Score separated audio, optionally with a varying number of speakers.

    For every key in ``key_file`` the reference and inferred signals are
    loaded, the best speaker permutation is found with ``bss_eval_sources``,
    and STOI, ESTOI (both scaled by 100), SI_SNR, SDR, SAR and SIR are
    written per speaker through ``DatadirWriter`` into ``output_dir``.

    With ``flexible_numspk`` the readers may hold different key sets; only
    the readers containing the key are used, and whichever side has fewer
    streams is padded with constant near-zero signals so that both sides
    have the same number of streams. In that mode ``wav_spk<N>`` is not
    written.

    Args:
        output_dir: Directory handed to ``DatadirWriter``.
        dtype: dtype passed to every ``SoundScpReader``.
        log_level: Level passed to ``logging.basicConfig``.
        key_file: Text file whose first whitespace-separated token per line
            is an utterance key. Blank lines are ignored.
        ref_scp: Reference scp files, one per speaker.
        inf_scp: Inferred scp files, one per speaker (same count as
            ``ref_scp``).
        ref_channel: Channel index selected from multi-channel audio.
        flexible_numspk: Allow the number of speakers to differ per key and
            between reference and inference.
    """
    assert check_argument_types()

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    assert len(ref_scp) == len(inf_scp), ref_scp
    num_spk = len(ref_scp)

    # Read the keys inside a context manager so the file handle is closed.
    with open(key_file, encoding="utf-8") as key_fd:
        keys = [
            line.rstrip().split(maxsplit=1)[0] for line in key_fd if line.strip()
        ]

    ref_readers = [SoundScpReader(f, dtype=dtype, normalize=True) for f in ref_scp]
    inf_readers = [SoundScpReader(f, dtype=dtype, normalize=True) for f in inf_scp]

    # get sample rate
    sample_rate, _ = ref_readers[0][keys[0]]

    # check keys
    if not flexible_numspk:
        for inf_reader, ref_reader in zip(inf_readers, ref_readers):
            assert inf_reader.keys() == ref_reader.keys()

    with DatadirWriter(output_dir) as writer:
        for key in keys:
            if not flexible_numspk:
                ref_audios = [ref_reader[key][1] for ref_reader in ref_readers]
                inf_audios = [inf_reader[key][1] for inf_reader in inf_readers]
            else:
                # Only readers that actually contain this key contribute.
                ref_audios = [
                    ref_reader[key][1]
                    for ref_reader in ref_readers
                    if key in ref_reader.keys()
                ]
                inf_audios = [
                    inf_reader[key][1]
                    for inf_reader in inf_readers
                    if key in inf_reader.keys()
                ]
            ref = np.array(ref_audios)
            inf = np.array(inf_audios)
            if ref.ndim > inf.ndim:
                # multi-channel reference and single-channel output
                ref = ref[..., ref_channel]
            elif ref.ndim < inf.ndim:
                # single-channel reference and multi-channel output
                inf = inf[..., ref_channel]
            elif ref.ndim == inf.ndim == 3:
                # multi-channel reference and output
                ref = ref[..., ref_channel]
                inf = inf[..., ref_channel]
            if not flexible_numspk:
                assert ref.shape == inf.shape, (ref.shape, inf.shape)
            else:
                # epsilon value to avoid divergence
                # caused by zero-value, e.g., log(0)
                eps = 0.000001
                # if num_spk of ref > num_spk of inf
                if ref.shape[0] > inf.shape[0]:
                    p = np.full((ref.shape[0] - inf.shape[0], inf.shape[1]), eps)
                    inf = np.concatenate([inf, p])
                    num_spk = ref.shape[0]
                # if num_spk of ref < num_spk of inf
                elif ref.shape[0] < inf.shape[0]:
                    p = np.full((inf.shape[0] - ref.shape[0], ref.shape[1]), eps)
                    ref = np.concatenate([ref, p])
                    num_spk = inf.shape[0]
                else:
                    num_spk = ref.shape[0]

            sdr, sir, sar, perm = bss_eval_sources(ref, inf, compute_permutation=True)

            for i in range(num_spk):
                # Index of the inferred stream assigned to reference speaker i.
                assigned = int(perm[i])
                stoi_score = stoi(ref[i], inf[assigned], fs_sig=sample_rate)
                estoi_score = stoi(
                    ref[i], inf[assigned], fs_sig=sample_rate, extended=True
                )
                si_snr_score = -float(
                    si_snr_loss(
                        torch.from_numpy(ref[i][None, ...]),
                        torch.from_numpy(inf[assigned][None, ...]),
                    )
                )
                writer[f"STOI_spk{i + 1}"][key] = str(stoi_score * 100)  # in percentage
                writer[f"ESTOI_spk{i + 1}"][key] = str(estoi_score * 100)
                writer[f"SI_SNR_spk{i + 1}"][key] = str(si_snr_score)
                writer[f"SDR_spk{i + 1}"][key] = str(sdr[i])
                writer[f"SAR_spk{i + 1}"][key] = str(sar[i])
                writer[f"SIR_spk{i + 1}"][key] = str(sir[i])
                # save permutation assigned script file
                if not flexible_numspk:
                    writer[f"wav_spk{i + 1}"][key] = inf_readers[assigned].data[key]