Exemplo n.º 1
0
def test_SoundScpReader(tmp_path: Path):
    audio_path1 = tmp_path / "a1.wav"
    audio1 = np.random.randint(-100, 100, 16, dtype=np.int16)
    audio_path2 = tmp_path / "a2.wav"
    audio2 = np.random.randint(-100, 100, 16, dtype=np.int16)

    soundfile.write(audio_path1, audio1, 16)
    soundfile.write(audio_path2, audio2, 16)

    p = tmp_path / "dummy.scp"
    with p.open("w") as f:
        f.write(f"abc {audio_path1}\n")
        f.write(f"def {audio_path2}\n")

    desired = {"abc": (16, audio1), "def": (16, audio2)}
    target = SoundScpReader(p, normalize=False, dtype=np.int16)

    for k in desired:
        rate1, t = target[k]
        rate2, d = desired[k]
        assert rate1 == rate2
        np.testing.assert_array_equal(t, d)

    assert len(target) == len(desired)
    assert "abc" in target
    assert "def" in target
    assert tuple(target.keys()) == tuple(desired)
    assert tuple(target) == tuple(desired)
    assert target.get_path("abc") == str(audio_path1)
    assert target.get_path("def") == str(audio_path2)
Exemplo n.º 2
0
def test_SoundScpReader_normalize(tmp_path: Path):
    audio_path1 = tmp_path / "a1.wav"
    audio1 = np.random.randint(-100, 100, 16, dtype=np.int16)
    audio_path2 = tmp_path / "a2.wav"
    audio2 = np.random.randint(-100, 100, 16, dtype=np.int16)

    audio1 = audio1.astype(np.float64) / (np.iinfo(np.int16).max + 1)
    audio2 = audio2.astype(np.float64) / (np.iinfo(np.int16).max + 1)

    soundfile.write(audio_path1, audio1, 16)
    soundfile.write(audio_path2, audio2, 16)

    p = tmp_path / "dummy.scp"
    with p.open("w") as f:
        f.write(f"abc {audio_path1}\n")
        f.write(f"def {audio_path2}\n")

    desired = {"abc": (16, audio1), "def": (16, audio2)}
    target = SoundScpReader(p, normalize=True)

    for k in desired:
        rate1, t = target[k]
        rate2, d = desired[k]
        assert rate1 == rate2
        np.testing.assert_array_equal(t, d)
Exemplo n.º 3
0
def sound_loader(path, float_dtype=None):
    # The file is as follows:
    #   utterance_id_A /some/where/a.wav
    #   utterance_id_B /some/where/a.flac

    # NOTE(kamo): SoundScpReader doesn't support pipe-fashion
    # like Kaldi e.g. "cat a.wav |".
    # NOTE(kamo): The audio signal is normalized to [-1,1] range.
    loader = SoundScpReader(path, normalize=True, always_2d=False)

    # SoundScpReader.__getitem__() returns Tuple[int, ndarray],
    # but ndarray is desired, so Adapter class is inserted here
    return AdapterForSoundScpReader(loader, float_dtype)
Exemplo n.º 4
0
def test_SoundScpWriter_normalize(tmp_path: Path):
    audio1 = np.random.randint(-100, 100, 16, dtype=np.int16)
    audio2 = np.random.randint(-100, 100, 16, dtype=np.int16)
    audio1 = audio1.astype(np.float64) / (np.iinfo(np.int16).max + 1)
    audio2 = audio2.astype(np.float64) / (np.iinfo(np.int16).max + 1)

    with SoundScpWriter(tmp_path, tmp_path / "wav.scp",
                        dtype=np.int16) as writer:
        writer["abc"] = 16, audio1
        writer["def"] = 16, audio2
        # Unsupported dimension
        with pytest.raises(RuntimeError):
            y = np.random.randint(-100, 100, [16, 1, 1], dtype=np.int16)
            writer["ghi"] = 16, y
    target = SoundScpReader(tmp_path / "wav.scp",
                            normalize=True,
                            dtype=np.float64)
    desired = {"abc": (16, audio1), "def": (16, audio2)}

    for k in desired:
        rate1, t = target[k]
        rate2, d = desired[k]
        assert rate1 == rate2
        np.testing.assert_array_equal(t, d)
Exemplo n.º 5
0
def test_SoundScpWriter(tmp_path: Path):
    audio1 = np.random.randint(-100, 100, 16, dtype=np.int16)
    audio2 = np.random.randint(-100, 100, 16, dtype=np.int16)
    with SoundScpWriter(tmp_path, tmp_path / "wav.scp",
                        dtype=np.int16) as writer:
        writer["abc"] = 16, audio1
        writer["def"] = 16, audio2
        # Unsupported dimension
        with pytest.raises(RuntimeError):
            y = np.random.randint(-100, 100, [16, 1, 1], dtype=np.int16)
            writer["ghi"] = 16, y
    target = SoundScpReader(tmp_path / "wav.scp",
                            normalize=False,
                            dtype=np.int16)
    desired = {"abc": (16, audio1), "def": (16, audio2)}

    for k in desired:
        rate1, t = target[k]
        rate2, d = desired[k]
        assert rate1 == rate2
        np.testing.assert_array_equal(t, d)

    assert writer.get_path("abc") == str(tmp_path / "abc.wav")
    assert writer.get_path("def") == str(tmp_path / "def.wav")
Exemplo n.º 6
0
def main(argv):
    """Load the model, generate kernel and bandpass plots."""
    parser = get_parser()
    args = parser.parse_args(argv)

    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    if torch.cuda.is_available() and ("cuda" in args.device):
        device = args.device
    else:
        device = "cpu"

    if args.toolkit == "speechbrain":
        from speechbrain.dataio.preprocess import AudioNormalizer
        from speechbrain.pretrained import EncoderClassifier

        # Prepare spk2utt for mean x-vector
        spk2utt = dict()
        with open(os.path.join(args.in_folder, "spk2utt"), "r") as reader:
            for line in reader:
                details = line.split()
                spk2utt[details[0]] = details[1:]

        # TODO(nelson): The model inference can be moved into functon.
        classifier = EncoderClassifier.from_hparams(
            source=args.pretrained_model, run_opts={"device": device})
        audio_norm = AudioNormalizer()

        wav_scp = SoundScpReader(os.path.join(args.in_folder, "wav.scp"))
        os.makedirs(args.out_folder, exist_ok=True)
        writer_utt = kaldiio.WriteHelper(
            "ark,scp:{0}/xvector.ark,{0}/xvector.scp".format(args.out_folder))
        writer_spk = kaldiio.WriteHelper(
            "ark,scp:{0}/spk_xvector.ark,{0}/spk_xvector.scp".format(
                args.out_folder))

        for speaker in tqdm(spk2utt):
            xvectors = list()
            for utt in spk2utt[speaker]:
                in_sr, wav = wav_scp[utt]
                # Amp Normalization -1 ~ 1
                amax = np.amax(np.absolute(wav))
                wav = wav.astype(np.float32) / amax
                # Freq Norm
                wav = audio_norm(torch.from_numpy(wav), in_sr).to(device)
                # X-vector Embedding
                embeds = classifier.encode_batch(wav).detach().cpu().numpy()[0]
                writer_utt[utt] = np.squeeze(embeds)
                xvectors.append(embeds)

            # Speaker Normalization
            embeds = np.mean(np.stack(xvectors, 0), 0)
            writer_spk[speaker] = embeds
        writer_utt.close()
        writer_spk.close()

    elif args.toolkit == "espnet":
        raise NotImplementedError(
            "Follow details at: https://github.com/espnet/espnet/issues/3040")
    else:
        raise ValueError(
            f"Unkown type of toolkit. Only supported: speechbrain, espnet, kaldi"
        )
Exemplo n.º 7
0
def scoring(
    output_dir: str,
    dtype: str,
    log_level: Union[int, str],
    key_file: str,
    ref_scp: List[str],
    inf_scp: List[str],
    ref_channel: int,
):
    assert check_argument_types()

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    assert len(ref_scp) == len(inf_scp), ref_scp
    num_spk = len(ref_scp)

    keys = [
        line.rstrip().split(maxsplit=1)[0]
        for line in open(key_file, encoding="utf-8")
    ]

    ref_readers = [
        SoundScpReader(f, dtype=dtype, normalize=True) for f in ref_scp
    ]
    inf_readers = [
        SoundScpReader(f, dtype=dtype, normalize=True) for f in inf_scp
    ]

    # get sample rate
    sample_rate, _ = ref_readers[0][keys[0]]

    # check keys
    for inf_reader, ref_reader in zip(inf_readers, ref_readers):
        assert inf_reader.keys() == ref_reader.keys()

    with DatadirWriter(output_dir) as writer:
        for key in keys:
            ref_audios = [ref_reader[key][1] for ref_reader in ref_readers]
            inf_audios = [inf_reader[key][1] for inf_reader in inf_readers]
            ref = np.array(ref_audios)
            inf = np.array(inf_audios)
            if ref.ndim > inf.ndim:
                # multi-channel reference and single-channel output
                ref = ref[..., ref_channel]
                assert ref.shape == inf.shape, (ref.shape, inf.shape)
            elif ref.ndim < inf.ndim:
                # single-channel reference and multi-channel output
                raise ValueError("Reference must be multi-channel when the \
                    network output is multi-channel.")
            elif ref.ndim == inf.ndim == 3:
                # multi-channel reference and output
                ref = ref[..., ref_channel]
                inf = inf[..., ref_channel]

            sdr, sir, sar, perm = bss_eval_sources(ref,
                                                   inf,
                                                   compute_permutation=True)

            for i in range(num_spk):
                stoi_score = stoi(ref[i],
                                  inf[int(perm[i])],
                                  fs_sig=sample_rate)
                si_snr_score = -float(
                    si_snr_loss(
                        torch.from_numpy(ref[i][None, ...]),
                        torch.from_numpy(inf[int(perm[i])][None, ...]),
                    ))
                writer[f"STOI_spk{i + 1}"][key] = str(stoi_score)
                writer[f"SI_SNR_spk{i + 1}"][key] = str(si_snr_score)
                writer[f"SDR_spk{i + 1}"][key] = str(sdr[i])
                writer[f"SAR_spk{i + 1}"][key] = str(sar[i])
                writer[f"SIR_spk{i + 1}"][key] = str(sir[i])
                # save permutation assigned script file
                writer[f"wav_spk{i + 1}"][key] = inf_readers[perm[i]].data[key]
Exemplo n.º 8
0
def scoring(
    output_dir: str,
    dtype: str,
    log_level: Union[int, str],
    key_file: str,
    ref_scp: List[str],
    inf_scp: List[str],
    ref_channel: int,
    metrics: List[str],
    frame_size: int = 512,
    frame_hop: int = 256,
):
    assert check_argument_types()
    for metric in metrics:
        assert metric in (
            "STOI",
            "ESTOI",
            "SNR",
            "SI_SNR",
            "SDR",
            "SAR",
            "SIR",
            "framewise-SNR",
        ), metric

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    assert len(ref_scp) == len(inf_scp), ref_scp
    num_spk = len(ref_scp)

    keys = [
        line.rstrip().split(maxsplit=1)[0]
        for line in open(key_file, encoding="utf-8")
    ]

    ref_readers = [
        SoundScpReader(f, dtype=dtype, normalize=True) for f in ref_scp
    ]
    inf_readers = [
        SoundScpReader(f, dtype=dtype, normalize=True) for f in inf_scp
    ]

    # get sample rate
    fs, _ = ref_readers[0][keys[0]]

    # check keys
    for inf_reader, ref_reader in zip(inf_readers, ref_readers):
        assert inf_reader.keys() == ref_reader.keys()

    stft = STFTEncoder(n_fft=frame_size, hop_length=frame_hop)

    do_bss_eval = "SDR" in metrics or "SAR" in metrics or "SIR" in metrics
    with DatadirWriter(output_dir) as writer:
        for key in keys:
            ref_audios = [ref_reader[key][1] for ref_reader in ref_readers]
            inf_audios = [inf_reader[key][1] for inf_reader in inf_readers]
            ref = np.array(ref_audios)
            inf = np.array(inf_audios)
            if ref.ndim > inf.ndim:
                # multi-channel reference and single-channel output
                ref = ref[..., ref_channel]
                assert ref.shape == inf.shape, (ref.shape, inf.shape)
            elif ref.ndim < inf.ndim:
                # single-channel reference and multi-channel output
                raise ValueError("Reference must be multi-channel when the "
                                 "network output is multi-channel.")
            elif ref.ndim == inf.ndim == 3:
                # multi-channel reference and output
                ref = ref[..., ref_channel]
                inf = inf[..., ref_channel]

            if do_bss_eval or num_spk > 1:
                sdr, sir, sar, perm = bss_eval_sources(
                    ref, inf, compute_permutation=True)
            else:
                perm = [0]

            ilens = torch.LongTensor([ref.shape[1]])
            # (num_spk, T, F)
            ref_spec, flens = stft(torch.from_numpy(ref), ilens)
            inf_spec, _ = stft(torch.from_numpy(inf), ilens)

            for i in range(num_spk):
                p = int(perm[i])
                for metric in metrics:
                    name = f"{metric}_spk{i + 1}"
                    if metric == "STOI":
                        writer[name][key] = str(
                            stoi(ref[i], inf[p], fs_sig=fs, extended=False))
                    elif metric == "ESTOI":
                        writer[name][key] = str(
                            stoi(ref[i], inf[p], fs_sig=fs, extended=True))
                    elif metric == "SNR":
                        si_snr_score = -float(
                            ESPnetEnhancementModel.snr_loss(
                                torch.from_numpy(ref[i][None, ...]),
                                torch.from_numpy(inf[p][None, ...]),
                            ))
                        writer[name][key] = str(si_snr_score)
                    elif metric == "SI_SNR":
                        si_snr_score = -float(
                            ESPnetEnhancementModel.si_snr_loss(
                                torch.from_numpy(ref[i][None, ...]),
                                torch.from_numpy(inf[p][None, ...]),
                            ))
                        writer[name][key] = str(si_snr_score)
                    elif metric == "SDR":
                        writer[name][key] = str(sdr[i])
                    elif metric == "SAR":
                        writer[name][key] = str(sar[i])
                    elif metric == "SIR":
                        writer[name][key] = str(sir[i])
                    elif metric == "framewise-SNR":
                        framewise_snr = -ESPnetEnhancementModel.snr_loss(
                            ref_spec[i].abs(), inf_spec[i].abs())
                        writer[name][key] = " ".join(
                            map(str, framewise_snr.tolist()))
                    else:
                        raise ValueError("Unsupported metric: %s" % metric)
                    # save permutation assigned script file
                    writer[f"wav_spk{i + 1}"][key] = inf_readers[
                        perm[i]].data[key]
Exemplo n.º 9
0
def scoring(
    output_dir: str,
    dtype: str,
    log_level: Union[int, str],
    key_file: str,
    ref_scp: List[str],
    inf_scp: List[str],
    ref_channel: int,
    flexible_numspk: bool,
):
    assert check_argument_types()

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    assert len(ref_scp) == len(inf_scp), ref_scp
    num_spk = len(ref_scp)

    keys = [
        line.rstrip().split(maxsplit=1)[0] for line in open(key_file, encoding="utf-8")
    ]

    ref_readers = [SoundScpReader(f, dtype=dtype, normalize=True) for f in ref_scp]
    inf_readers = [SoundScpReader(f, dtype=dtype, normalize=True) for f in inf_scp]

    # get sample rate
    sample_rate, _ = ref_readers[0][keys[0]]

    # check keys
    if not flexible_numspk:
        for inf_reader, ref_reader in zip(inf_readers, ref_readers):
            assert inf_reader.keys() == ref_reader.keys()

    with DatadirWriter(output_dir) as writer:
        for key in keys:
            if not flexible_numspk:
                ref_audios = [ref_reader[key][1] for ref_reader in ref_readers]
                inf_audios = [inf_reader[key][1] for inf_reader in inf_readers]
            else:
                ref_audios = [
                    ref_reader[key][1]
                    for ref_reader in ref_readers
                    if key in ref_reader.keys()
                ]
                inf_audios = [
                    inf_reader[key][1]
                    for inf_reader in inf_readers
                    if key in inf_reader.keys()
                ]
            ref = np.array(ref_audios)
            inf = np.array(inf_audios)
            if ref.ndim > inf.ndim:
                # multi-channel reference and single-channel output
                ref = ref[..., ref_channel]
            elif ref.ndim < inf.ndim:
                # single-channel reference and multi-channel output
                inf = inf[..., ref_channel]
            elif ref.ndim == inf.ndim == 3:
                # multi-channel reference and output
                ref = ref[..., ref_channel]
                inf = inf[..., ref_channel]
            if not flexible_numspk:
                assert ref.shape == inf.shape, (ref.shape, inf.shape)
            else:
                # epsilon value to avoid divergence
                # caused by zero-value, e.g., log(0)
                eps = 0.000001
                # if num_spk of ref > num_spk of inf
                if ref.shape[0] > inf.shape[0]:
                    p = np.full((ref.shape[0] - inf.shape[0], inf.shape[1]), eps)
                    inf = np.concatenate([inf, p])
                    num_spk = ref.shape[0]
                # if num_spk of ref < num_spk of inf
                elif ref.shape[0] < inf.shape[0]:
                    p = np.full((inf.shape[0] - ref.shape[0], ref.shape[1]), eps)
                    ref = np.concatenate([ref, p])
                    num_spk = inf.shape[0]
                else:
                    num_spk = ref.shape[0]

            sdr, sir, sar, perm = bss_eval_sources(ref, inf, compute_permutation=True)

            for i in range(num_spk):
                stoi_score = stoi(ref[i], inf[int(perm[i])], fs_sig=sample_rate)
                estoi_score = stoi(
                    ref[i], inf[int(perm[i])], fs_sig=sample_rate, extended=True
                )
                si_snr_score = -float(
                    si_snr_loss(
                        torch.from_numpy(ref[i][None, ...]),
                        torch.from_numpy(inf[int(perm[i])][None, ...]),
                    )
                )
                writer[f"STOI_spk{i + 1}"][key] = str(stoi_score * 100)  # in percentage
                writer[f"ESTOI_spk{i + 1}"][key] = str(estoi_score * 100)
                writer[f"SI_SNR_spk{i + 1}"][key] = str(si_snr_score)
                writer[f"SDR_spk{i + 1}"][key] = str(sdr[i])
                writer[f"SAR_spk{i + 1}"][key] = str(sar[i])
                writer[f"SIR_spk{i + 1}"][key] = str(sir[i])
                # save permutation assigned script file
                if not flexible_numspk:
                    writer[f"wav_spk{i + 1}"][key] = inf_readers[perm[i]].data[key]