def test_normalize():

    import random

    import numpy as np
    import torch

    from speechbrain.processing.signal_processing import (
        compute_amplitude,
        rescale,
    )

    for scale in ["dB", "linear"]:
        for amp_type in ["peak", "avg"]:
            for test_vec in [
                    torch.zeros(100),
                    torch.rand((10, 100)),
                    torch.rand((10, 100, 5)),
            ]:

                lengths = (
                    test_vec.size(1) if test_vec.dim() > 1 else test_vec.size(0)
                )
                amp = compute_amplitude(test_vec, lengths, amp_type, scale)
                scaled_back = rescale(
                    random.random() * test_vec,
                    lengths,
                    amp,
                    amp_type,
                    scale,
                )
                np.testing.assert_array_almost_equal(
                    scaled_back.numpy(), test_vec.numpy()
                )
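
# For reference, a minimal standalone sketch of the round trip the test above
# exercises; the positional argument order (waveforms, lengths, target level,
# amp_type, scale) mirrors the calls in the test, and the signal is illustrative.
import torch
from speechbrain.processing.signal_processing import compute_amplitude, rescale

sig = torch.rand(1, 16000)  # one second of audio at 16 kHz
amp = compute_amplitude(sig, sig.size(1), "peak", "dB")
restored = rescale(0.5 * sig, sig.size(1), amp, "peak", "dB")
# restored matches sig's peak level in dB, undoing the 0.5 attenuation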
Example #2
    def audio_pipeline(
        mix_wav,
    ):  # mix_wav is a dummy argument: one epoch covers as many items as it would without dynamic mixing
        """
        This audio pipeline defines the compute graph for dynamic mixing
        """

        speakers = np.random.choice(
            spk_list, hparams["num_spks"], replace=False, p=spk_weights
        )

        if "wham" in Path(hparams["data_folder"]).stem:
            noise_file = np.random.choice(noise_files, 1, replace=False)

            noise, fs_read = torchaudio.load(noise_file[0])
            noise = noise.squeeze()
            # gain = np.clip(random.normalvariate(1, 10), -4, 15)
            # noise = rescale(noise, torch.tensor(len(noise)), gain, scale="dB").squeeze()

        # pick one utterance file at random for each selected speaker
        sources = []
        first_lvl = None

        spk_files = [
            np.random.choice(spk_hashtable[spk], 1, False)[0]
            for spk in speakers
        ]

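        # cap every source at the shortest selected file, but never exceed
        # the configured training segment length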
        minlen = min(
            *[torchaudio.info(x).num_frames for x in spk_files],
            hparams["training_signal_len"],
        )

        for i, spk_file in enumerate(spk_files):

            # select random offset
            length = torchaudio.info(spk_file).num_frames
            start = 0
            stop = length
            if length > minlen:  # take a random window
                start = np.random.randint(0, length - minlen)
                stop = start + minlen

            tmp, fs_read = torchaudio.load(
                spk_file, frame_offset=start, num_frames=stop - start,
            )

            # peak = float(Path(spk_file).stem.split("_peak_")[-1])
            tmp = tmp[0]  # * peak  # remove channel dim and normalize

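            # gains are drawn in dB: the first source gets an absolute level,
            # each further source is offset relative to the first one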
            if i == 0:
                gain = np.clip(random.normalvariate(-27.43, 2.57), -45, 0)
                tmp = rescale(tmp, torch.tensor(len(tmp)), gain, scale="dB")
                # assert not torch.all(torch.isnan(tmp))
                first_lvl = gain
            else:
                gain = np.clip(
                    first_lvl + random.normalvariate(-2.51, 2.66), -45, 0
                )
                tmp = rescale(tmp, torch.tensor(len(tmp)), gain, scale="dB")
                # assert not torch.all(torch.isnan(tmp))
            sources.append(tmp)

        # mix the sources together; augmentations could also be applied here:
        # they run on the CPU, and their parameters would then differ for
        # each mixture rather than for the whole batch (which makes no
        # difference for batch size 1)

        # padding left
        # sources, _ = batch_pad_right(sources)

        sources = torch.stack(sources)
        mixture = torch.sum(sources, 0)
        if "wham" in Path(hparams["data_folder"]).stem:
            len_noise = len(noise)
            len_mix = len(mixture)
            min_len = min(len_noise, len_mix)
            mixture = mixture[:min_len] + noise[:min_len]

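        # renormalize so that the loudest of mixture and sources peaks at
        # 0.9, keeping headroom against clipping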
        max_amp = max(
            torch.abs(mixture).max().item(),
            *[x.item() for x in torch.abs(sources).max(dim=-1)[0]],
        )
        mix_scaling = 1 / max_amp * 0.9
        sources = mix_scaling * sources
        mixture = mix_scaling * mixture

        yield mixture
        for i in range(hparams["num_spks"]):
            yield sources[i]

        # If the number of speakers is 2, yield None for the 3rd speaker
        if hparams["num_spks"] == 2:
            yield None

        if "wham" in Path(hparams["data_folder"]).stem:
            mean_source_lvl = sources.abs().mean()
            mean_noise_lvl = noise.abs().mean()
            noise = (mean_source_lvl / mean_noise_lvl) * noise
            yield noise
        else:
            yield None
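
# In SpeechBrain recipes, a generator pipeline like the one above is usually
# registered with the dataio machinery so that each yield maps onto one
# provided item. A minimal sketch of that wiring follows; the item names
# ("mix_sig", "s1_sig", ...) and the train_data dataset are assumptions for
# illustration, not taken from the snippet itself.
import speechbrain as sb

@sb.utils.data_pipeline.takes("mix_wav")
@sb.utils.data_pipeline.provides(
    "mix_sig", "s1_sig", "s2_sig", "s3_sig", "noise_sig"
)
def audio_pipeline(mix_wav):
    ...  # body as above: yield the mixture, each source, and the noise

sb.dataio.dataset.add_dynamic_item([train_data], audio_pipeline)
# s3_sig stays None for 2-speaker training, so it is not requested here
sb.dataio.dataset.set_output_keys(
    [train_data], ["id", "mix_sig", "s1_sig", "s2_sig", "noise_sig"]
)
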
import json
import os

import numpy as np
import torch
import torchaudio

from speechbrain.processing.signal_processing import rescale, reverberate


def create_mixture(session_n, output_dir, params, metadata):
    os.makedirs(os.path.join(output_dir, session_n), exist_ok=True)

    session_meta = {}
    speakers = [
        x for x in metadata.keys() if x not in ["noises", "background"]
    ]

    tot_length = int(
        np.ceil(metadata["background"]["stop"] * params["samplerate"]))
    mixture = torch.zeros(tot_length)  # total mixture file
    assert len(mixture) > 0, "Mixture has length 0, please raise max_length."
    # step 1: add each speaker's reverberated utterances to the mixture
    for spk in speakers:
        session_meta[spk] = []
        # we create mixture for each speaker and we optionally save it.
        if params["save_dry_sources"]:
            dry = torch.zeros(tot_length)
        if params["save_wet_sources"]:
            wet = torch.zeros(tot_length)

        for utt in metadata[spk]:
            c_audio, fs = torchaudio.load(
                os.path.join(params["librispeech_root"], utt["file"]))
            assert fs == params["samplerate"]
            if len(c_audio.shape) > 1:  # multichannel
                c_audio = c_audio[utt["channel"], :]
                c_audio = c_audio - torch.mean(c_audio)
            c_audio = rescale(
                c_audio,
                c_audio.size(0),
                utt["lvl"],
                scale="dB",
                amp_type="peak",
            )
            # we save it in dry
            dry_start = int(utt["start"] * params["samplerate"])
            dry_stop = dry_start + c_audio.shape[-1]
            if params["save_dry_sources"]:
                dry[dry_start:dry_stop] += c_audio
            # we add now reverb and put it in wet
            c_rir, fs = torchaudio.load(
                os.path.join(params["rirs_noises_root"], utt["rir"]))
            assert fs == params["samplerate"]
            c_rir = c_rir[utt["rir_channel"], :]

            c_audio = reverberate(c_audio, c_rir, "peak")
            # time of flight is not accounted for here, since reverberate already shifts by it
            wet_start = dry_start
            wet_stop = dry_stop  # + early_rev_samples
            if params["save_wet_sources"]:
                wet[wet_start:wet_start + len(c_audio)] += c_audio

            session_meta[spk].append({
                "start": np.round(wet_start / params["samplerate"], 3),
                "stop": np.round(wet_stop / params["samplerate"], 3),
                "lvl": utt["lvl"],
                "words": utt["words"],
                "file": utt["file"],
                "channel": utt["channel"],
                "rir": utt["rir"],
                "rir_channels": utt["rir_channel"],
            })
            # we add to mixture
            mixture[wet_start:wet_start + len(c_audio)] += c_audio

        # clipping is allowed here, since it also occurs in real recordings

        # save per speaker clean sources
        if params["save_dry_sources"]:
            torchaudio.save(
                os.path.join(
                    output_dir,
                    session_n,
                    "session_{}_spk_{}_dry.wav".format(session_n, spk),
                ),
                torch.clamp(dry, min=-1, max=1),
                params["samplerate"],
            )

        if params["save_wet_sources"]:
            torchaudio.save(
                os.path.join(
                    output_dir,
                    session_n,
                    "session_{}_spk_{}_wet.wav".format(session_n, spk),
                ),
                torch.clamp(wet, min=-1, max=1),
                params["samplerate"],
            )

    json_path = os.path.join(
        output_dir, session_n, "{}.json".format(session_n)
    )
    with open(json_path, "w") as f:
        json.dump(session_meta, f, indent=4)

    # add impulsive noises
    for noise_event in metadata["noises"]:

        c_audio, fs = torchaudio.load(
            os.path.join(params["rirs_noises_root"], noise_event["file"]))
        assert fs == params["samplerate"]
        if len(c_audio.shape) > 1:  # multichannel
            c_audio = c_audio[noise_event["channel"], :]
            c_audio = c_audio - torch.mean(c_audio)
        c_audio = rescale(
            c_audio,
            c_audio.size(0),
            noise_event["lvl"],
            scale="dB",
            amp_type="peak",
        )

        # we save it in dry
        dry_start = int(noise_event["start"] * params["samplerate"])
        # dry_stop = dry_start + c_audio.shape[-1]
        # we add now reverb and put it in wet
        c_rir, fs = torchaudio.load(
            os.path.join(params["rirs_noises_root"], noise_event["rir"]))
        assert fs == params["samplerate"]
        c_rir = c_rir[noise_event["rir_channel"], :]

        c_audio = reverberate(c_audio, c_rir, "peak")

        # time of flight is not accounted for here, since reverberate already shifts by it
        wet_start = dry_start
        mixture[wet_start:wet_start + len(c_audio)] += c_audio

    # add background
    if metadata["background"]["file"]:
        c_audio, fs = torchaudio.load(
            os.path.join(params["backgrounds_root"],
                         metadata["background"]["file"]),
            frame_offset=metadata["background"]["orig_start"],
            num_frames=mixture.shape[-1],
        )
        assert fs == params["samplerate"]
        if len(c_audio.shape) > 1:  # multichannel
            c_audio = c_audio[metadata["background"]["channel"], :]
            c_audio = c_audio - torch.mean(c_audio)
        c_audio = rescale(
            c_audio,
            c_audio.size(0),
            metadata["background"]["lvl"],
            scale="dB",
            amp_type="avg",
        )
        mixture += c_audio

    else:
        # add gaussian noise
        mixture += rescale(
            torch.normal(0, 1, mixture.shape),
            mixture.size(0),
            metadata["background"]["lvl"],
            scale="dB",
            amp_type="peak",
        )

    # save total mixture
    mixture = torch.clamp(mixture, min=-1, max=1)
    torchaudio.save(
        os.path.join(output_dir, session_n,
                     "{}_mixture.wav".format(session_n)),
        mixture.unsqueeze(0),
        params["samplerate"],
    )
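
# create_mixture consumes a params dict and per-session metadata. Below is a
# hypothetical minimal example showing only the keys the function reads; all
# paths and values are placeholders, not real dataset entries.
params = {
    "samplerate": 16000,
    "librispeech_root": "/data/LibriSpeech",  # placeholder
    "rirs_noises_root": "/data/RIRS_NOISES",  # placeholder
    "backgrounds_root": "/data/backgrounds",  # placeholder
    "save_dry_sources": False,
    "save_wet_sources": False,
}
metadata = {
    "spk_0": [
        {
            "file": "train-clean-100/84/121123/84-121123-0000.flac",  # placeholder
            "channel": 0,
            "start": 2.5,  # seconds from session start
            "lvl": -25.0,  # target peak level in dB
            "words": "placeholder transcription",
            "rir": "simulated_rirs/smallroom/Room001/Room001-00001.wav",  # placeholder
            "rir_channel": 0,
        }
    ],
    "noises": [],  # impulsive noise events use the same fields, minus "words"
    "background": {"file": None, "stop": 60.0, "lvl": -45.0},  # None -> gaussian noise
}
create_mixture("session_0", "output", params, metadata)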
Example #4
    def audio_pipeline(
        mix_wav,
    ):  # mix_wav is a dummy argument: one epoch covers as many items as it would without dynamic mixing

        speakers = np.random.choice(spk_list,
                                    hparams["num_spks"],
                                    replace=False,
                                    p=spk_weights)
        # pick one utterance file at random for each selected speaker
        sources = []
        first_lvl = None

        spk_files = [
            np.random.choice(spk_hashtable[spk], 1, False)[0]
            for spk in speakers
        ]

        minlen = min(
            *[torchaudio.info(x).num_frames for x in spk_files],
            hparams["training_signal_len"],
        )

        for i, spk_file in enumerate(spk_files):

            # select random offset
            length = torchaudio.info(spk_file).num_frames
            start = 0
            stop = length
            if length > minlen:  # take a random window
                start = np.random.randint(0, length - minlen)
                stop = start + minlen

            tmp, fs_read = torchaudio.load(
                spk_file,
                frame_offset=start,
                num_frames=stop - start,
            )

            # peak = float(Path(spk_file).stem.split("_peak_")[-1])
            tmp = tmp[0]  # * peak  # remove channel dim and normalize

            if i == 0:
                gain = np.clip(random.normalvariate(-27.43, 2.57), -45, 0)
                tmp = rescale(tmp, torch.tensor(len(tmp)), gain, scale="dB")
                # assert not torch.all(torch.isnan(tmp))
                first_lvl = gain
            else:
                gain = np.clip(first_lvl + random.normalvariate(-2.51, 2.66),
                               -45, 0)
                tmp = rescale(tmp, torch.tensor(len(tmp)), gain, scale="dB")
                # assert not torch.all(torch.isnan(tmp))
            sources.append(tmp)

        # mix the sources together; augmentations could also be applied here:
        # they run on the CPU, and their parameters would then differ for
        # each mixture rather than for the whole batch (which makes no
        # difference for batch size 1)

        # padding left
        # sources, _ = batch_pad_right(sources)

        sources = torch.stack(sources)
        mixture = torch.sum(sources, 0)
        max_amp = max(
            torch.abs(mixture).max().item(),
            *[x.item() for x in torch.abs(sources).max(dim=-1)[0]],
        )
        mix_scaling = 1 / max_amp * 0.9
        sources = sources * mix_scaling
        mixture = mix_scaling * mixture

        yield mixture
        for i in range(hparams["num_spks"]):
            yield sources[i]
Example #5
    def audio_pipeline(
        mix_wav,
    ):  # mix_wav is a dummy argument: one epoch covers as many items as it would without dynamic mixing
        """
        This audio pipeline defines the compute graph for dynamic mixing
        """

        speakers = np.random.choice(spk_list,
                                    num_spks,
                                    replace=False,
                                    p=spk_weights)

        if "wham" in Path(data_root_folder).stem:
            noise_file = np.random.choice(noise_files, 1, replace=False)

            noise, fs_read = torchaudio.load(noise_file[0])
            noise = noise.squeeze()

        # pick one utterance file at random for each selected speaker
        sources = []
        first_lvl = None

        spk_files = [
            np.random.choice(spk_hashtable[spk], 1, False)[0]
            for spk in speakers
        ]

        minlen = min(
            *[torchaudio.info(x).num_frames for x in spk_files],
            max_training_signal_len,
        )

        for i, spk_file in enumerate(spk_files):

            # select random offset
            length = torchaudio.info(spk_file).num_frames
            start = 0
            stop = length
            if length > minlen:  # take a random window
                start = np.random.randint(0, length - minlen)
                stop = start + minlen

            tmp, fs_read = torchaudio.load(
                spk_file,
                frame_offset=start,
                num_frames=stop - start,
            )

            tmp = tmp[0]  # * peak  # remove channel dim and normalize

            if i == 0:
                gain = np.clip(random.normalvariate(-27.43, 2.57), -45, 0)
                tmp = rescale(tmp, torch.tensor(len(tmp)), gain, scale="dB")
                first_lvl = gain
            else:
                gain = np.clip(first_lvl + random.normalvariate(-2.51, 2.66),
                               -45, 0)
                tmp = rescale(tmp, torch.tensor(len(tmp)), gain, scale="dB")
            sources.append(tmp)

        # we mix the sources together
        sources = torch.stack(sources)
        mixture = torch.sum(sources, 0)
        if "wham" in Path(data_root_folder).stem:
            len_noise = len(noise)
            len_mix = len(mixture)
            min_len = min(len_noise, len_mix)
            mixture = mixture[:min_len] + noise[:min_len]

        max_amp = max(
            torch.abs(mixture).max().item(),
            *[x.item() for x in torch.abs(sources).max(dim=-1)[0]],
        )
        mix_scaling = 1 / max_amp * 0.9
        sources = mix_scaling * sources
        mixture = mix_scaling * mixture

        yield mixture
        for i in range(num_spks):
            yield sources[i]

        # If the number of speakers is 2, yield None for the 3rd speaker
        if num_spks == 2:
            yield None

        if "wham" in Path(data_root_folder).stem:
            mean_source_lvl = sources.abs().mean()
            mean_noise_lvl = noise.abs().mean()
            noise = (mean_source_lvl / mean_noise_lvl) * noise
            yield noise
        else:
            yield None