def run(args):
    """
    Dereverberate multi-channel utterances with WPE and dump one wav
    per channel into args.dst_dir.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": True  # T x F
    }
    wpe_kwargs = {
        "taps": args.taps,
        "delay": args.delay,
        "iters": args.iters,
        "psd_context": args.context
    }
    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    # exist_ok avoids the check-then-create race of the original
    # os.path.exists(...) + os.makedirs(...) pair
    os.makedirs(args.dst_dir, exist_ok=True)
    for key, reverbed in spectrogram_reader:
        # N x T x F => F x N x T
        reverbed = np.transpose(reverbed, [2, 0, 1])
        # F x N x T
        dereverb = wpe(reverbed, **wpe_kwargs)
        # F x N x T => N x T x F
        dereverb = np.transpose(dereverb, [1, 2, 0])
        # write for each channel
        for chid in range(dereverb.shape[0]):
            samps = istft(dereverb[chid], **stft_kwargs)
            write_wav(os.path.join(args.dst_dir,
                                   "{}.CH{:d}.wav".format(key, chid + 1)),
                      samps,
                      fs=args.samp_freq)
    logger.info("Processed {:d} utterances".format(len(spectrogram_reader)))
def run(args):
    """
    Apply AuxIVA separation on each utterance from the wav script and
    dump every separated source as {key}.SRC{n}.wav under args.dst_dir.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,
        "transpose": True  # F x T instead of T x F
    }
    reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    for key, stft_mat in reader:
        logger.info("Processing utterance {}...".format(key))
        sep_sources = auxiva(stft_mat, args.epochs)
        # one wav per separated source
        for src_idx, src_spec in enumerate(sep_sources):
            samps = istft(src_spec,
                          **stft_kwargs,
                          norm=reader.samp_norm(key))
            dump_path = os.path.join(
                args.dst_dir, "{}.SRC{:d}.wav".format(key, src_idx + 1))
            write_wav(dump_path, samps, fs=args.fs)
    logger.info("Processed {:d} utterances".format(len(reader)))
def rir(self, fname, fs=16000, rir_nsamps=4096, v=340, gpu=False):
    """
    Generate a room impulse response (RIR) for the current room/source/
    receiver settings and dump it to a wav file.

    Args:
        fname: output wav path for the simulated RIR
        fs: sampling rate in Hz
        rir_nsamps: number of samples in the generated RIR
        v: sound velocity in m/s (only used by the rir-simulate backend)
        gpu: if True, use the pygpurir backend; otherwise fall back to
             the rir-simulate binary, then pyrirgen
    Raises:
        RuntimeError: if neither rir-simulate nor pyrirgen is available
                      (and gpu is False)
    """
    if gpu:
        # self.beta: rt60
        beta = pygpurir.beta_SabineEstimation(self.size, self.beta)
        # NOTE: do not clear here
        # diff = pygpurir.att2t_SabineEstimator(15, self.beta)
        # RIR duration in seconds derived from the requested sample count
        tmax = rir_nsamps / fs
        nb_img = pygpurir.t2n(tmax, self.size)
        # S x R x T
        rir = pygpurir.simulateRIR(self.size,
                                   beta,
                                   np.array(self.spos)[None, ...],
                                   np.array(self.rpos),
                                   nb_img,
                                   tmax,
                                   fs,
                                   mic_pattern="omni")
        # single source => drop the leading source axis before dumping
        write_wav(fname, rir[0], fs=fs)
    elif cpp_rir_available:
        # format float
        ffloat = lambda f: "{:.3f}".format(f)
        # location for each microphone
        loc_for_each_channel = [
            ",".join(map(ffloat, p)) for p in self.rpos
        ]
        # self.beta is either a per-wall reflection list or a scalar rt60;
        # rir-simulate accepts both forms via --beta
        beta = ",".join(map(ffloat, self.beta)) if isinstance(
            self.beta, list) else round(self.beta, 3)
        run_command(
            "rir-simulate --sound-velocity={v} --samp-frequency={sample_rate} "
            "--hp-filter=true --number-samples={rir_samples} --beta={beta} "
            "--room-topo={room_size} --receiver-location=\"{receiver_location}\" "
            "--source-location={source_location} {dump_dest}".format(
                v=v,
                sample_rate=fs,
                rir_samples=rir_nsamps,
                room_size=",".join(map(ffloat, self.size)),
                beta=beta,
                receiver_location=";".join(loc_for_each_channel),
                source_location=",".join(map(ffloat, self.spos)),
                dump_dest=fname))
    elif pyrirgen_available:
        rir = pyrirgen.generateRir(self.size,
                                   self.spos,
                                   self.rpos,
                                   soundVelocity=v,
                                   fs=fs,
                                   nDim=3,
                                   nSamples=rir_nsamps,
                                   nOrder=-1,
                                   reverbTime=self.beta,
                                   micType="omnidirectional",
                                   isHighPassFilter=True)
        # pyrirgen returns a list for multi-receiver setups
        if isinstance(rir, list):
            rir = np.stack(rir)
        write_wav(fname, rir, fs=fs)
    else:
        raise RuntimeError("Both rir-simulate and pyrirgen unavailable")
def run(args):
    """
    Run the mixture simulation, report the real-time factor (RTF) and
    dump the mixture (plus optional noise/speaker references).
    """
    start = time.time()
    # run simulation
    mix, spk_ref, noise = run_simu(args)
    # Show RTF
    utt_dur = mix.shape[-1] / float(args.sr)
    time_cost = float(time.time() - start)
    print(
        f"Time cost: {time_cost:.4f}s, Utterance duration: {utt_dur:.2f}s, "
        f"RTF = {time_cost / utt_dur:.4f}",
        flush=True)
    # dump mixture
    write_wav(args.mix, mix, sr=args.sr)
    # dump reference
    if args.dump_ref_dir:
        basename = os.path.basename(args.mix)
        ref_dir = pathlib.Path(args.dump_ref_dir)
        ref_dir.mkdir(parents=True, exist_ok=True)
        # NOTE(review): the noise/clean/spk{i} sub-directories are assumed
        # to be created by write_wav — confirm, else mkdir them here
        # has noise
        if noise is not None:
            write_wav(ref_dir / "noise" / basename, noise, sr=args.sr)
        # one speaker
        if len(spk_ref) == 1:
            # fix: dump the single array, not the enclosing list
            # (matches the multi-speaker branch, which writes each array)
            write_wav(ref_dir / "clean" / basename, spk_ref[0], sr=args.sr)
        else:
            for i, s in enumerate(spk_ref):
                write_wav(ref_dir / f"spk{i + 1}" / basename, s, sr=args.sr)
def run(args):
    """
    Reconstruct waveforms from spectrogram features (optionally log,
    power and/or mel-fbank domain) via the Griffin-Lim algorithm.
    """
    griffin_lim_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,
        "transpose": True,
        "epochs": args.epochs
    }
    feature_reader = ScriptReader(args.feat_scp)
    if args.fbank:
        mel_kwargs = {
            "n_mels": args.num_bins,
            "fmin": args.min_freq,
            "fmax": args.max_freq,
            "htk": True
        }
        # N x F
        mel_weights = audio_lib.filters.mel(args.samp_freq,
                                            nfft(args.frame_length),
                                            **mel_kwargs)
        # F x N (pseudo-inverse used to map fbank back to linear spectrum)
        mel_inv_weights = np.linalg.pinv(mel_weights)
    # create the output directory up-front (the original assumed it existed)
    os.makedirs(args.dump_dir, exist_ok=True)
    for key, spec in feature_reader:
        # if log, transform to linear
        if args.apply_log:
            spec = np.exp(spec)
        # convert fbank to spectrum
        # feat: T x N
        if args.fbank:
            # clamp to EPSILON: pinv projection can yield negative values
            spec = np.maximum(spec @ np.transpose(mel_inv_weights), EPSILON)
        # if power spectrum, transform to magnitude spectrum
        if args.apply_pow:
            spec = np.sqrt(spec)
        if spec.shape[1] - 1 != nfft(args.frame_length) // 2:
            raise RuntimeError("Seems missing --fbank options?")
        # griffin lim
        samps = griffin_lim(spec, **griffin_lim_kwargs)
        write_wav(os.path.join(args.dump_dir, "{}.wav".format(key)),
                  samps,
                  fs=args.samp_freq,
                  normalize=args.normalize)
    logger.info("Process {:d} utterance done".format(len(feature_reader)))
def rir(self, fname, fs=16000, rir_nsamps=4096, v=340):
    """
    Simulate a room impulse response for the current room geometry and
    write it to fname, preferring the rir-simulate binary and falling
    back to pyrirgen.
    """
    if shutil.which("rir-simulate"):
        # render a float with 3 decimals for the command line
        def fmt3(value):
            return "{:.3f}".format(value)

        # one "x,y,z" string per receiver channel
        mic_locs = [",".join(map(fmt3, pos)) for pos in self.rpos]
        if isinstance(self.beta, list):
            beta = ",".join(map(fmt3, self.beta))
        else:
            beta = round(self.beta, 3)
        run_command(
            "rir-simulate --sound-velocity={v} --samp-frequency={sample_rate} "
            "--hp-filter=true --number-samples={rir_samples} --beta={beta} "
            "--room-topo={room_size} --receiver-location=\"{receiver_location}\" "
            "--source-location={source_location} {dump_dest}".format(
                v=v,
                sample_rate=fs,
                rir_samples=rir_nsamps,
                room_size=",".join(map(fmt3, self.size)),
                beta=beta,
                receiver_location=";".join(mic_locs),
                source_location=",".join(map(fmt3, self.spos)),
                dump_dest=fname))
    elif pyrirgen_available:
        impulse = pyrirgen.generateRir(self.size,
                                       self.spos,
                                       self.rpos,
                                       soundVelocity=v,
                                       fs=fs,
                                       nDim=3,
                                       nSamples=rir_nsamps,
                                       nOrder=-1,
                                       reverbTime=self.beta,
                                       micType="omnidirectional",
                                       isHighPassFilter=True)
        # multi-receiver setups come back as a list of 1-D arrays
        if isinstance(impulse, list):
            impulse = np.stack(impulse)
        write_wav(fname, impulse, fs=fs)
    else:
        raise RuntimeError("Both rir-simulate and pyrirgen unavailable")
def run(args):
    """
    Compute per-speaker T-F masks from mixture/reference spectrograms
    and dump the masked (re-synthesized) signals under
    args.dump_dir/spk{i}/.
    """
    # return complex result
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center
    }
    logger.info("Using mask: {}".format(args.mask.upper()))
    mixture_reader = SpectrogramReader(
        args.mix_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    ref_scp_list = args.ref_scp.split(",")
    logger.info("Number of speakers: {:d}".format(len(ref_scp_list)))
    targets_reader = [
        SpectrogramReader(scp, **stft_kwargs) for scp in ref_scp_list
    ]
    num_utts = 0
    for key, mixture in tqdm(mixture_reader):
        # keep the original sample count when requested
        nsamps = mixture_reader.nsamps(key) if args.keep_length else None
        skip = False
        for reader in targets_reader:
            if key not in reader:
                logger.info("Skip utterance {}, missing targets".format(key))
                skip = True
                break
        if skip:
            continue
        num_utts += 1
        targets_list = [reader[key] for reader in targets_reader]
        spk_masks = compute_mask(mixture, targets_list, args.mask)
        for index, mask in enumerate(spk_masks):
            samps = istft(mixture * mask, **stft_kwargs, nsamps=nsamps)
            # create the spk{i} sub-directory before dumping (the original
            # assumed it already existed)
            spk_dir = os.path.join(args.dump_dir,
                                   "spk{:d}".format(index + 1))
            os.makedirs(spk_dir, exist_ok=True)
            write_wav(os.path.join(spk_dir, "{}.wav".format(key)),
                      samps,
                      fs=args.fs)
    logger.info("Processed {} utterance!".format(num_utts))
def run(args):
    """
    Run AuxIVA blind source separation over a wav script and dump each
    separated source as {key}.src{n}.wav under args.dst_dir.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
        "transpose": True  # F x T instead of T x F
    }
    reader = SpectrogramReader(args.wav_scp,
                               round_power_of_two=args.round_power_of_two,
                               **stft_kwargs)
    dst_dir = Path(args.dst_dir)
    for key, stft_mat in reader:
        logger.info(f"Processing utterance {key}...")
        sources = auxiva(stft_mat, args.epochs)
        # per-utterance normalization factor
        scale = reader.maxabs(key)
        for src_idx, src_spec in enumerate(sources):
            samps = inverse_stft(src_spec, **stft_kwargs, norm=scale)
            write_wav(dst_dir / f"{key}.src{src_idx + 1}.wav",
                      samps,
                      fs=args.fs)
    logger.info(f"Processed {len(reader)} utterances")
def run(args):
    """
    Compute per-speaker T-F masks from mixture/reference spectrograms
    and dump the masked (re-synthesized) signals under
    args.dump_dir/spk{i}/.
    """
    # return complex result
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center
    }
    logger.info(f"Using mask: {args.mask.upper()}")
    mixture_reader = SpectrogramReader(
        args.mix_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    ref_scp_list = args.ref_scp.split(",")
    logger.info(f"Number of speakers: {len(ref_scp_list)}")
    targets_reader = [
        SpectrogramReader(scp, **stft_kwargs) for scp in ref_scp_list
    ]
    num_utts = 0
    for key, mixture in tqdm(mixture_reader):
        # keep the original sample count when requested
        nsamps = mixture_reader.nsamps(key) if args.keep_length else None
        skip = False
        for reader in targets_reader:
            if key not in reader:
                logger.info(f"Skip utterance {key}, missing targets")
                skip = True
                break
        if skip:
            continue
        num_utts += 1
        targets_list = [reader[key] for reader in targets_reader]
        spk_masks = compute_mask(mixture, targets_list, args.mask)
        for index, mask in enumerate(spk_masks):
            samps = inverse_stft(mixture * mask, **stft_kwargs, nsamps=nsamps)
            # create the spk{i} sub-directory before dumping (the original
            # assumed it already existed)
            spk_dir = os.path.join(args.dump_dir, f"spk{index + 1}")
            os.makedirs(spk_dir, exist_ok=True)
            write_wav(os.path.join(spk_dir, f"{key}.wav"),
                      samps,
                      sr=args.sr)
    logger.info(f"Processed {num_utts} utterance")
def run(args):
    """
    Simulate a (possibly multi-channel) mixture: reverberate and sum the
    source speakers, optionally add point-source and isotropic noise at
    given SNRs, normalize and dump the mixture plus references.

    Fixes vs the original:
      * spk_power is computed unconditionally, so isotropic noise without
        point-source noise no longer raises NameError
      * the isotropic chunk is no longer added into a None noise buffer
      * the sdr-length error message names the right options
        (was --src-snr)
    """

    def arg_audio(src_args, beg=None):
        # load a comma separated list of wav paths, or None when unset
        return [read_wav(s, fs=args.sr, beg=beg)
                for s in src_args.split(",")] if src_args else None

    def arg_float(src_args):
        # parse a comma separated list of floats, or None when unset
        return [float(s) for s in src_args.split(",")] if src_args else None

    src_spk = arg_audio(args.src_spk)
    src_rir = arg_audio(args.src_rir)
    if src_rir:
        if len(src_rir) != len(src_spk):
            raise RuntimeError(
                f"Number of --src-rir={args.src_rir} do not match with " +
                f"--src-spk={args.src_spk} option")
    sdr = arg_float(args.src_sdr)
    if len(src_spk) > 1 and not sdr:
        raise RuntimeError("--src-sdr need to be assigned for " +
                           f"--src-spk={args.src_spk}")
    if sdr:
        # one SDR per non-reference speaker
        if len(src_spk) - 1 != len(sdr):
            raise RuntimeError("Number of --src-sdr do not match with " +
                               "--src-spk option")
        # first speaker is the 0dB reference
        sdr = [0] + sdr
    src_begin = arg_float(args.src_begin)
    if src_begin:
        src_begin = [int(v) for v in src_begin]
    else:
        src_begin = [0 for _ in src_spk]
    # number samples of the mixture
    mix_nsamps = max([b + s.size for b, s in zip(src_begin, src_spk)])
    point_noise = arg_audio(args.point_noise)
    point_noise_rir = arg_audio(args.point_noise_rir)
    if point_noise:
        if point_noise_rir:
            if len(point_noise) != len(point_noise_rir):
                raise RuntimeError(
                    f"Number of --point-noise-rir={args.point_noise_rir} do not match with "
                    + f"--point-noise={args.point_noise} option")
        point_snr = arg_float(args.point_noise_snr)
        if not point_snr:
            raise RuntimeError("--point-noise-snr need to be assigned for " +
                               f"--point-noise={args.point_noise}")
        if len(point_noise) != len(point_snr):
            raise RuntimeError(
                f"Number of --point-noise-snr={args.point_noise_snr} do not match with "
                + f"--point-noise={args.point_noise} option")
        point_begin = arg_float(args.point_noise_begin)
        if point_begin:
            point_begin = [int(v) for v in point_begin]
        else:
            point_begin = [0 for _ in point_noise]
    isotropic_noise = arg_audio(args.isotropic_noise,
                                beg=args.isotropic_noise_begin)
    if isotropic_noise:
        isotropic_noise = isotropic_noise[0]
        isotropic_snr = arg_float(args.isotropic_noise_snr)
        if not isotropic_snr:
            raise RuntimeError(
                "--isotropic-snr need to be assigned for " +
                f"--isotropic-noise={args.isotropic_noise} option")
        isotropic_snr = isotropic_snr[0]
    else:
        isotropic_snr = None
    # add speakers
    spk = add_speaker(mix_nsamps,
                      src_spk,
                      src_begin,
                      sdr,
                      src_rir=src_rir,
                      channel=args.dump_channel,
                      sr=args.sr)
    spk_utt = sum(spk)
    mix = spk_utt.copy()
    # reference power (first channel) used to scale both the point-source
    # and the isotropic noise; computed unconditionally so that
    # isotropic-only configurations work too (was only set under point_noise)
    spk_power = np.mean(spk_utt[0]**2)
    if point_noise:
        noise = add_point_noise(mix_nsamps,
                                spk_power,
                                point_noise,
                                point_begin,
                                point_snr,
                                noise_rir=point_noise_rir,
                                channel=args.dump_channel,
                                sr=args.sr)
        if spk_utt.shape[0] != noise.shape[0]:
            raise RuntimeError("Channel mismatch between source speaker " +
                               "configuration and pointsource noise's, " +
                               f"{spk_utt.shape[0]} vs {noise.shape[0]}")
        mix = spk_utt + noise
    else:
        noise = None
    ch = args.dump_channel
    if isotropic_noise is not None:
        N, _ = spk_utt.shape
        if N == 1:
            if isotropic_noise.ndim == 1:
                isotropic_noise = isotropic_noise[None, ...]
            else:
                if ch >= 0:
                    isotropic_noise = isotropic_noise[ch:ch + 1]
                else:
                    raise RuntimeError(
                        "Single channel mixture vs multi-channel "
                        "isotropic noise")
        else:
            if isotropic_noise.shape[0] != N:
                raise RuntimeError(
                    "Channel number mismatch between mixture and isotropic noise, "
                    + f"{N} vs {isotropic_noise.shape[0]}")
        dur = min(mix_nsamps, isotropic_noise.shape[-1])
        # NOTE(review): only channel 0 of the isotropic noise is used and
        # broadcast to every mixture channel — confirm multi-channel intent
        isotropic_chunk = isotropic_noise[0, :dur]
        power = np.mean(isotropic_chunk**2)
        coeff = coeff_snr(power, spk_power, isotropic_snr)
        mix[..., :dur] += coeff * isotropic_chunk
        if noise is not None:
            noise[..., :dur] += coeff * isotropic_chunk
        else:
            # no point-source noise: the isotropic part alone is the
            # noise reference (original crashed on None here)
            noise = np.zeros_like(mix)
            noise[..., :dur] = coeff * isotropic_chunk
    # normalize the peak of the mixture to args.norm_factor
    factor = args.norm_factor / (np.max(np.abs(mix)) + EPSILON)
    write_wav(args.mix, factor * mix, fs=args.sr)
    if args.dump_ref_dir:
        basename = os.path.basename(args.mix)
        ref_dir = pathlib.Path(args.dump_ref_dir)
        ref_dir.mkdir(parents=True, exist_ok=True)
        # has noise
        if noise is not None:
            write_wav(ref_dir / "noise" / basename,
                      factor * noise,
                      fs=args.sr)
        # one speaker
        if len(spk) == 1:
            write_wav(ref_dir / "speaker" / basename,
                      factor * spk[0],
                      fs=args.sr)
        else:
            for i, s in enumerate(spk):
                write_wav(ref_dir / f"spk{i + 1}" / basename,
                          factor * s,
                          fs=args.sr)