예제 #1
0
def run(args):
    """
    Apply WPE dereverberation to each multi-channel utterance in the
    wave script and dump the result of every channel as its own wave.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,  # disabled to stay comparable with kaldi
        "transpose": True  # layout: T x F
    }
    wpe_kwargs = {
        "taps": args.taps,
        "delay": args.delay,
        "iters": args.iters,
        "psd_context": args.context
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)

    # make sure the destination directory exists
    os.makedirs(args.dst_dir, exist_ok=True)

    for key, reverbed in spectrogram_reader:
        # reorder N x T x F => F x N x T (wpe runs per frequency)
        obs = np.transpose(reverbed, [2, 0, 1])
        # dereverberated result, still F x N x T
        enhan = wpe(obs, **wpe_kwargs)
        # back to N x T x F
        enhan = np.transpose(enhan, [1, 2, 0])
        # dump each channel separately
        for chid, chan in enumerate(enhan):
            samps = istft(chan, **stft_kwargs)
            dst = os.path.join(args.dst_dir,
                               "{}.CH{:d}.wav".format(key, chid + 1))
            write_wav(dst, samps, fs=args.samp_freq)
    logger.info("Processed {:d} utterances".format(len(spectrogram_reader)))
예제 #2
0
def run(args):
    """
    Blind source separation via AuxIVA: separate every mixture in the
    wave script and dump one wave file per estimated source.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,
        "transpose": True  # F x T instead of T x F
    }

    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)

    for key, spectrogram in spectrogram_reader:
        logger.info("Processing utterance {}...".format(key))
        separated = auxiva(spectrogram, args.epochs)
        # one output wave per separated source
        for idx, src in enumerate(separated):
            samps = istft(src,
                          **stft_kwargs,
                          norm=spectrogram_reader.samp_norm(key))
            dst = os.path.join(args.dst_dir,
                               "{}.SRC{:d}.wav".format(key, idx + 1))
            write_wav(dst, samps, fs=args.fs)
    logger.info("Processed {:d} utterances".format(len(spectrogram_reader)))
예제 #3
0
 def rir(self, fname, fs=16000, rir_nsamps=4096, v=340, gpu=False):
     """
     Generate rir for current settings

     Backends are tried in order: pygpurir (when gpu=True), the
     external rir-simulate binary, then pyrirgen; raises RuntimeError
     when no CPU backend is available.

     Args:
         fname: path of the wave file the simulated RIR is written to
         fs: sample rate (Hz) of the simulated RIR
         rir_nsamps: number of samples of the generated RIR
         v: sound velocity in m/s (only used by the CPU backends)
         gpu: simulate with pygpurir instead of the CPU backends
     """
     if gpu:
         # self.beta: rt60
         beta = pygpurir.beta_SabineEstimation(self.size, self.beta)
         # NOTE: do not clear here
         # diff = pygpurir.att2t_SabineEstimator(15, self.beta)
         # RIR duration in seconds
         tmax = rir_nsamps / fs
         # image-source count for this duration/room (presumably)
         nb_img = pygpurir.t2n(tmax, self.size)
         # S x R x T
         rir = pygpurir.simulateRIR(self.size,
                                    beta,
                                    np.array(self.spos)[None, ...],
                                    np.array(self.rpos),
                                    nb_img,
                                    tmax,
                                    fs,
                                    mic_pattern="omni")
         # single source was passed, so dump the R x T block
         write_wav(fname, rir[0], fs=fs)
     elif cpp_rir_available:
         # format float
         ffloat = lambda f: "{:.3f}".format(f)
         # location for each microphone
         loc_for_each_channel = [
             ",".join(map(ffloat, p)) for p in self.rpos
         ]
         # self.beta may be a per-wall list or a scalar (presumably rt60)
         beta = ",".join(map(ffloat, self.beta)) if isinstance(
             self.beta, list) else round(self.beta, 3)
         # receivers are separated by ";", coordinates within one by ","
         run_command(
             "rir-simulate --sound-velocity={v} --samp-frequency={sample_rate} "
             "--hp-filter=true --number-samples={rir_samples} --beta={beta} "
             "--room-topo={room_size} --receiver-location=\"{receiver_location}\" "
             "--source-location={source_location} {dump_dest}".format(
                 v=v,
                 sample_rate=fs,
                 rir_samples=rir_nsamps,
                 room_size=",".join(map(ffloat, self.size)),
                 beta=beta,
                 receiver_location=";".join(loc_for_each_channel),
                 source_location=",".join(map(ffloat, self.spos)),
                 dump_dest=fname))
     elif pyrirgen_available:
         rir = pyrirgen.generateRir(self.size,
                                    self.spos,
                                    self.rpos,
                                    soundVelocity=v,
                                    fs=fs,
                                    nDim=3,
                                    nSamples=rir_nsamps,
                                    nOrder=-1,  # presumably unlimited reflection order - confirm
                                    reverbTime=self.beta,
                                    micType="omnidirectional",
                                    isHighPassFilter=True)
         # pyrirgen may return a list (one array per receiver)
         if isinstance(rir, list):
             rir = np.stack(rir)
         write_wav(fname, rir, fs=fs)
     else:
         raise RuntimeError("Both rir-simulate and pyrirgen unavailable")
예제 #4
0
파일: wav_simulate.py 프로젝트: funcwj/setk
def run(args):
    """
    Run the configured simulation, report the real-time factor and dump
    the mixture (plus optional reference signals) to disk.
    """
    start = time.time()
    # run simulation
    mix, spk_ref, noise = run_simu(args)
    # show RTF (processing time / audio duration)
    utt_dur = mix.shape[-1] / float(args.sr)
    time_cost = float(time.time() - start)
    print(
        f"Time cost: {time_cost:.4f}s, Utterance duration: {utt_dur:.2f}s, "
        f"RTF = {time_cost / utt_dur:.4f}",
        flush=True)
    # dump mixture
    write_wav(args.mix, mix, sr=args.sr)
    # dump reference
    if args.dump_ref_dir:
        basename = os.path.basename(args.mix)
        ref_dir = pathlib.Path(args.dump_ref_dir)
        ref_dir.mkdir(parents=True, exist_ok=True)
        # noise reference, if the simulation produced one
        if noise is not None:
            write_wav(ref_dir / "noise" / basename, noise, sr=args.sr)
        # single speaker: spk_ref is a list, so dump its only element
        # (previously the list itself was passed to write_wav)
        if len(spk_ref) == 1:
            write_wav(ref_dir / "clean" / basename, spk_ref[0], sr=args.sr)
        else:
            for i, s in enumerate(spk_ref):
                write_wav(ref_dir / f"spk{i + 1}" / basename, s, sr=args.sr)
예제 #5
0
def run(args):
    """
    Reconstruct waveforms from spectrogram/fbank features using the
    Griffin-Lim algorithm and dump them as wave files.
    """
    griffin_lim_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,
        "transpose": True,
        "epochs": args.epochs
    }

    feature_reader = ScriptReader(args.feat_scp)

    if args.fbank:
        mel_kwargs = {
            "n_mels": args.num_bins,
            "fmin": args.min_freq,
            "fmax": args.max_freq,
            "htk": True
        }
        # mel filterbank weights: N x F
        mel_weights = audio_lib.filters.mel(args.samp_freq,
                                            nfft(args.frame_length),
                                            **mel_kwargs)
        # pseudo-inverse: F x N
        mel_inv_weights = np.linalg.pinv(mel_weights)

    for key, spec in feature_reader:
        # undo log compression if needed
        if args.apply_log:
            spec = np.exp(spec)
        # project fbank features (T x N) back to a linear spectrum
        if args.fbank:
            spec = np.maximum(spec @ mel_inv_weights.T, EPSILON)
        # power spectrum => magnitude spectrum
        if args.apply_pow:
            spec = np.sqrt(spec)
        # expect nfft // 2 + 1 frequency bins at this point
        if spec.shape[1] != nfft(args.frame_length) // 2 + 1:
            raise RuntimeError("Seems missing --fbank options?")
        # run griffin lim iterations
        samps = griffin_lim(spec, **griffin_lim_kwargs)
        write_wav(os.path.join(args.dump_dir, "{}.wav".format(key)),
                  samps,
                  fs=args.samp_freq,
                  normalize=args.normalize)
    logger.info("Process {:d} utterance done".format(len(feature_reader)))
예제 #6
0
 def rir(self, fname, fs=16000, rir_nsamps=4096, v=340):
     """
     Generate rir for current settings

     Uses the external rir-simulate binary when it is on PATH, falling
     back to pyrirgen; raises RuntimeError when neither is available.

     Args:
         fname: path of the wave file the simulated RIR is written to
         fs: sample rate (Hz) of the simulated RIR
         rir_nsamps: number of samples of the generated RIR
         v: sound velocity in m/s
     """
     if shutil.which("rir-simulate"):
         # format float
         ffloat = lambda f: "{:.3f}".format(f)
         # location for each microphone
         loc_for_each_channel = [
             ",".join(map(ffloat, p)) for p in self.rpos
         ]
         # self.beta may be a per-wall list or a scalar (presumably rt60)
         beta = ",".join(map(ffloat, self.beta)) if isinstance(
             self.beta, list) else round(self.beta, 3)
         # receivers are separated by ";", coordinates within one by ","
         run_command(
             "rir-simulate --sound-velocity={v} --samp-frequency={sample_rate} "
             "--hp-filter=true --number-samples={rir_samples} --beta={beta} "
             "--room-topo={room_size} --receiver-location=\"{receiver_location}\" "
             "--source-location={source_location} {dump_dest}".format(
                 v=v,
                 sample_rate=fs,
                 rir_samples=rir_nsamps,
                 room_size=",".join(map(ffloat, self.size)),
                 beta=beta,
                 receiver_location=";".join(loc_for_each_channel),
                 source_location=",".join(map(ffloat, self.spos)),
                 dump_dest=fname))
     elif pyrirgen_available:
         rir = pyrirgen.generateRir(self.size,
                                    self.spos,
                                    self.rpos,
                                    soundVelocity=v,
                                    fs=fs,
                                    nDim=3,
                                    nSamples=rir_nsamps,
                                    nOrder=-1,  # presumably unlimited reflection order - confirm
                                    reverbTime=self.beta,
                                    micType="omnidirectional",
                                    isHighPassFilter=True)
         # pyrirgen may return a list (one array per receiver)
         if isinstance(rir, list):
             rir = np.stack(rir)
         write_wav(fname, rir, fs=fs)
     else:
         raise RuntimeError("Both rir-simulate and pyrirgen unavailable")
예제 #7
0
def run(args):
    """
    Compute per-speaker T-F masks from the reference signals and dump
    each masked mixture as a separated wave file.
    """
    # keep complex STFT results
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center
    }
    logger.info("Using mask: {}".format(args.mask.upper()))
    mixture_reader = SpectrogramReader(
        args.mix_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    ref_scp_list = args.ref_scp.split(",")
    logger.info("Number of speakers: {:d}".format(len(ref_scp_list)))
    targets_reader = [
        SpectrogramReader(scp, **stft_kwargs) for scp in ref_scp_list
    ]
    num_utts = 0
    for key, mixture in tqdm(mixture_reader):
        nsamps = mixture_reader.nsamps(key) if args.keep_length else None
        # every reference reader must contain this utterance
        if any(key not in reader for reader in targets_reader):
            logger.info("Skip utterance {}, missing targets".format(key))
            continue
        num_utts += 1
        targets_list = [reader[key] for reader in targets_reader]
        spk_masks = compute_mask(mixture, targets_list, args.mask)
        for index, mask in enumerate(spk_masks):
            samps = istft(mixture * mask, **stft_kwargs, nsamps=nsamps)
            write_wav(os.path.join(args.dump_dir,
                                   "spk{:d}/{}.wav".format(index + 1, key)),
                      samps,
                      fs=args.fs)
    logger.info("Processed {} utterance!".format(num_utts))
예제 #8
0
def run(args):
    """
    Separate each mixture with AuxIVA and write one wave per source.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
        "transpose": True  # F x T instead of T x F
    }

    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    for key, spectrogram in spectrogram_reader:
        logger.info(f"Processing utterance {key}...")
        separated = auxiva(spectrogram, args.epochs)
        norm = spectrogram_reader.maxabs(key)
        # dump one wave per separated source
        for idx, src in enumerate(separated):
            samps = inverse_stft(src, **stft_kwargs, norm=norm)
            write_wav(Path(args.dst_dir) / f"{key}.src{idx + 1}.wav",
                      samps,
                      fs=args.fs)
    logger.info(f"Processed {len(spectrogram_reader)} utterances")
예제 #9
0
def run(args):
    """
    Estimate per-speaker T-F masks from the reference signals and dump
    each masked mixture as a separated wave file.
    """
    # keep complex STFT results
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center
    }
    logger.info(f"Using mask: {args.mask.upper()}")
    mixture_reader = SpectrogramReader(
        args.mix_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    ref_scp_list = args.ref_scp.split(",")
    logger.info(f"Number of speakers: {len(ref_scp_list)}")
    targets_reader = [
        SpectrogramReader(scp, **stft_kwargs) for scp in ref_scp_list
    ]
    num_utts = 0
    for key, mixture in tqdm(mixture_reader):
        nsamps = mixture_reader.nsamps(key) if args.keep_length else None
        # every reference reader must contain this utterance
        if any(key not in reader for reader in targets_reader):
            logger.info(f"Skip utterance {key}, missing targets")
            continue
        num_utts += 1
        targets_list = [reader[key] for reader in targets_reader]
        spk_masks = compute_mask(mixture, targets_list, args.mask)
        for index, mask in enumerate(spk_masks):
            samps = inverse_stft(mixture * mask, **stft_kwargs, nsamps=nsamps)
            write_wav(os.path.join(args.dump_dir, f"spk{index + 1}/{key}.wav"),
                      samps,
                      sr=args.sr)
    logger.info(f"Processed {num_utts} utterance")
예제 #10
0
def run(args):
    """
    Simulate a (possibly multi-channel) mixture from source speakers,
    optional point-source noises and optional isotropic noise, peak
    normalize it and dump the mixture (plus reference signals) to disk.

    Fixes over the previous revision:
        * spk_power is computed once right after the speakers are
          mixed, so isotropic-noise scaling no longer raises NameError
          when no point-source noise is configured
        * the isotropic chunk is only accumulated into the noise
          reference when one exists (it is None without point noise)
    """
    def arg_audio(src_args, beg=None):
        # comma separated wave paths => list of signals (or None)
        return [read_wav(s, fs=args.sr, beg=beg)
                for s in src_args.split(",")] if src_args else None

    def arg_float(src_args):
        # comma separated floats => list of floats (or None)
        return [float(s) for s in src_args.split(",")] if src_args else None

    src_spk = arg_audio(args.src_spk)
    src_rir = arg_audio(args.src_rir)
    if src_rir:
        if len(src_rir) != len(src_spk):
            raise RuntimeError(
                f"Number of --src-rir={args.src_rir} do not match with " +
                f"--src-spk={args.src_spk} option")
    sdr = arg_float(args.src_sdr)
    if len(src_spk) > 1 and not sdr:
        raise RuntimeError("--src-sdr need to be assigned for " +
                           f"--src-spk={args.src_spk}")
    if sdr:
        if len(src_spk) - 1 != len(sdr):
            raise RuntimeError("Number of --src-snr - 1 do not match with " +
                               "--src-snr option")
        # first speaker is the 0dB reference
        sdr = [0] + sdr

    src_begin = arg_float(args.src_begin)
    if src_begin:
        src_begin = [int(v) for v in src_begin]
    else:
        src_begin = [0 for _ in src_spk]

    # number samples of the mixture
    mix_nsamps = max([b + s.size for b, s in zip(src_begin, src_spk)])

    point_noise = arg_audio(args.point_noise)
    point_noise_rir = arg_audio(args.point_noise_rir)
    if point_noise:
        if point_noise_rir:
            if len(point_noise) != len(point_noise_rir):
                raise RuntimeError(
                    f"Number of --point-noise-rir={args.point_noise_rir} do not match with "
                    + f"--point-noise={args.point_noise} option")
        point_snr = arg_float(args.point_noise_snr)
        if not point_snr:
            raise RuntimeError("--point-noise-snr need to be assigned for " +
                               f"--point-noise={args.point_noise}")
        if len(point_noise) != len(point_snr):
            raise RuntimeError(
                f"Number of --point-noise-snr={args.point_noise_snr} do not match with "
                + f"--point-noise={args.point_noise} option")

        point_begin = arg_float(args.point_noise_begin)
        if point_begin:
            point_begin = [int(v) for v in point_begin]
        else:
            point_begin = [0 for _ in point_noise]

    isotropic_noise = arg_audio(args.isotropic_noise,
                                beg=args.isotropic_noise_begin)
    if isotropic_noise:
        # only a single isotropic noise recording is supported
        isotropic_noise = isotropic_noise[0]
        isotropic_snr = arg_float(args.isotropic_noise_snr)
        if not isotropic_snr:
            raise RuntimeError(
                "--isotropic-snr need to be assigned for " +
                f"--isotropic-noise={args.isotropic_noise} option")
        isotropic_snr = isotropic_snr[0]
    else:
        isotropic_snr = None

    # add speakers
    spk = add_speaker(mix_nsamps,
                      src_spk,
                      src_begin,
                      sdr,
                      src_rir=src_rir,
                      channel=args.dump_channel,
                      sr=args.sr)
    spk_utt = sum(spk)
    mix = spk_utt.copy()
    # reference power (first channel of the speech mixture), used when
    # scaling both noise kinds; computed here so it also exists when
    # only isotropic noise is configured (previously a NameError)
    spk_power = np.mean(spk_utt[0]**2)

    if point_noise:
        noise = add_point_noise(mix_nsamps,
                                spk_power,
                                point_noise,
                                point_begin,
                                point_snr,
                                noise_rir=point_noise_rir,
                                channel=args.dump_channel,
                                sr=args.sr)
        if spk_utt.shape[0] != noise.shape[0]:
            raise RuntimeError("Channel mismatch between source speaker " +
                               "configuration and pointsource noise's, " +
                               f"{spk_utt.shape[0]} vs {noise.shape[0]}")
        mix = spk_utt + noise
    else:
        noise = None

    ch = args.dump_channel
    if isotropic_noise is not None:
        N, _ = spk_utt.shape
        if N == 1:
            if isotropic_noise.ndim == 1:
                isotropic_noise = isotropic_noise[None, ...]
            else:
                if ch >= 0:
                    isotropic_noise = isotropic_noise[ch:ch + 1]
                else:
                    raise RuntimeError(
                        "Single channel mixture vs multi-channel "
                        "isotropic noise")
        else:
            if isotropic_noise.shape[0] != N:
                raise RuntimeError(
                    "Channel number mismatch between mixture and isotropic noise, "
                    + f"{N} vs {isotropic_noise.shape[0]}")

        dur = min(mix_nsamps, isotropic_noise.shape[-1])
        # NOTE(review): only channel 0 of the isotropic noise is mixed
        # in, even for multi-channel mixtures - confirm this is intended
        isotropic_chunk = isotropic_noise[0, :dur]
        power = np.mean(isotropic_chunk**2)
        coeff = coeff_snr(power, spk_power, isotropic_snr)
        mix[..., :dur] += coeff * isotropic_chunk
        # noise reference exists only when point noise was configured
        # (previously this line raised TypeError on None)
        if noise is not None:
            noise[..., :dur] += coeff * isotropic_chunk

    # peak normalization of all dumped signals
    factor = args.norm_factor / (np.max(np.abs(mix)) + EPSILON)

    write_wav(args.mix, factor * mix, fs=args.sr)

    if args.dump_ref_dir:
        basename = os.path.basename(args.mix)
        ref_dir = pathlib.Path(args.dump_ref_dir)
        ref_dir.mkdir(parents=True, exist_ok=True)
        # has noise
        if noise is not None:
            write_wav(ref_dir / "noise" / basename, factor * noise, fs=args.sr)
        # one speaker
        if len(spk) == 1:
            write_wav(ref_dir / "speaker" / basename,
                      factor * spk[0],
                      fs=args.sr)
        else:
            for i, s in enumerate(spk):
                write_wav(ref_dir / f"spk{i + 1}" / basename,
                          factor * s,
                          fs=args.sr)