def run(args):
    """Apply pre-computed fixed beamforming weights to every utterance.

    The weights are loaded with ``np.load(args.weights)``. A 2-D array is
    wrapped in a single ``FixedBeamformer``; any other array is split along
    its first axis into one ``FixedBeamformer`` per entry, in which case
    ``args.beam`` must point at a script file giving the beam index to use
    for each utterance key.

    Args:
        args: Parsed command-line options. Reads ``frame_len``, ``frame_hop``,
            ``window``, ``center``, ``round_power_of_two``, ``wav_scp``,
            ``weights``, ``beam`` and ``dst_dir``.

    Raises:
        RuntimeError: If several beams were loaded but ``args.beam`` is unset.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    # F x N or B x F x N
    weights = np.load(args.weights)
    if weights.ndim == 2:
        beamformer = FixedBeamformer(weights)
        beam_index = None
    else:
        beamformer = [FixedBeamformer(w) for w in weights]
        if not args.beam:
            raise RuntimeError(
                "--beam must be assigned, as there are multiple beams")
        # Convert every script entry to an integer beam index. The previous
        # ``lambda x: int`` returned the ``int`` type itself, which cannot be
        # used to subscript the beamformer list.
        beam_index = ScpReader(args.beam, value_processor=int)
    with WaveWriter(args.dst_dir) as writer:
        for key, stft_mat in spectrogram_reader:
            logger.info(f"Processing utterance {key}...")
            # ``beamformer`` is truthy in both configurations, so the choice
            # must be made on whether a per-utterance beam table exists.
            if beam_index is not None:
                beam = beam_index[key]
                stft_enh = beamformer[beam].run(stft_mat)
            else:
                stft_enh = beamformer.run(stft_mat)
            # Rescale the output with the value reported by the reader for
            # this utterance.
            norm = spectrogram_reader.maxabs(key)
            samps = inverse_stft(stft_enh, **stft_kwargs, norm=norm)
            writer.write(key, samps)
    logger.info(f"Processed {len(spectrogram_reader):d} utterances")
def run(args):
    """Run a ``LinearSDBeamformer`` or ``CircularSDBeamformer`` over a wav script.

    The beamformer is chosen from ``args.geometry``. The direction of arrival
    comes either from a per-utterance table (``args.utt2doa``) or from the
    single value ``args.doa``. With a table, utterances that are missing from
    it or whose DoA fails ``check_doa`` are skipped.

    Args:
        args: Parsed command-line options. Reads the STFT settings,
            ``geometry``, ``linear_topo``, ``circular_radius``,
            ``circular_around``, ``circular_center``, ``utt2doa``, ``doa``,
            ``wav_scp``, ``round_power_of_two``, ``dst_dir``, ``sr`` and
            ``speed``.
    """
    stft_kwargs = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        window=args.window,
        center=args.center,
        transpose=False,
    )

    # Build the beamformer that matches the array geometry.
    if args.geometry == "linear":
        topo = str2tuple(args.linear_topo)
        beamformer = LinearSDBeamformer(topo)
        logger.info(f"Initialize LinearSDBeamformer for array: {topo}")
    else:
        beamformer = CircularSDBeamformer(
            args.circular_radius,
            args.circular_around,
            center=args.circular_center,
        )
        logger.info(
            "Initialize CircularSDBeamformer for " +
            f"radius = {args.circular_radius}, center = {args.circular_center}"
        )

    # Either a per-utterance DoA table or one DoA shared by all utterances.
    doa_table = None
    doa = None
    if not args.utt2doa:
        doa = args.doa
        # An invalid shared DoA is only reported here; processing goes on.
        if not check_doa(args.geometry, doa):
            logger.info(f"Invalid doa {doa:.2f} for {args.geometry} array")
        logger.info(f"Use --doa={doa:.2f} for all utterances")
    else:
        doa_table = ScpReader(args.utt2doa, value_processor=float)
        logger.info(f"Use --utt2doa={args.utt2doa} for each utterance")

    reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs,
    )

    num_processed = 0
    with WaveWriter(args.dst_dir, sr=args.sr) as writer:
        for key, stft_src in reader:
            if doa_table:
                if key not in doa_table:
                    continue
                doa = doa_table[key]
                if not check_doa(args.geometry, doa):
                    logger.info(f"Invalid DoA {doa:.2f} for utterance {key}")
                    continue
            stft_enh = beamformer.run(doa, stft_src, c=args.speed, sr=args.sr)
            num_processed += 1
            scale = reader.maxabs(key)
            writer.write(key, inverse_stft(stft_enh, norm=scale, **stft_kwargs))
    logger.info(f"Processed {num_processed} utterances over {len(reader)}")
def run(args):
    """Run a ``LinearDSBeamformer`` over every utterance of a wav script.

    The direction of arrival comes either from a per-utterance table
    (``args.utt2doa``) or from the single value ``args.doa``. Negative angles
    have 180 added to them before the range check against [0, 180].

    Args:
        args: Parsed command-line options. Reads the STFT settings,
            ``utt2doa``, ``doa``, ``wav_scp``, ``round_power_of_two``,
            ``linear_topo``, ``dst_dir``, ``fs`` and ``speed``.

    Raises:
        RuntimeError: If the shared ``args.doa`` is still outside [0, 180]
            after folding.
    """

    def fold(angle):
        # Negative angles are shifted by 180 before validation.
        return 180 + angle if angle < 0 else angle

    def out_of_range(angle):
        return angle < 0 or angle > 180

    stft_kwargs = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        window=args.window,
        center=args.center,
        transpose=False,
    )

    doa_table = None
    doa = None
    if args.utt2doa:
        doa_table = ScpReader(args.utt2doa, value_processor=float)
        logger.info(f"Use utt2doa {args.utt2doa} for each utterance")
    else:
        doa = fold(args.doa)
        if out_of_range(doa):
            raise RuntimeError(f"Invalid doa {doa:.2f} for --doa")
        logger.info(f"Use DoA {doa:.2f} for all utterances")

    reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs,
    )
    num_processed = 0

    topo = str2tuple(args.linear_topo)
    beamformer = LinearDSBeamformer(topo)
    logger.info(f"Initialize channel LinearDSBeamformer for array: {topo}")

    with WaveWriter(args.dst_dir, fs=args.fs) as writer:
        for key, stft_src in reader:
            if doa_table:
                if key not in doa_table:
                    continue
                doa = fold(doa_table[key])
                if out_of_range(doa):
                    logger.info(f"Invalid doa {doa:.2f} for utterance {key}")
                    continue
            stft_enh = beamformer.run(doa, stft_src, c=args.speed, sr=args.fs)
            num_processed += 1
            scale = reader.maxabs(key)
            writer.write(key, inverse_stft(stft_enh, norm=scale, **stft_kwargs))
    logger.info(f"Processed {num_processed} utterances over {len(reader)}")
def run(args):
    """Enhance every utterance with ``facted_wpd`` and write the result.

    Each spectrogram must be a 3-D array. Utterances for which
    ``facted_wpd`` raises ``np.linalg.LinAlgError`` are logged and skipped.
    When ``args.dump_mask`` is set, ``tf_mask[..., 0]`` is also saved with
    ``np.save`` under ``args.dst_dir``.

    Args:
        args: Parsed command-line options. Reads the STFT settings,
            ``wav_scp``, ``round_power_of_two``, ``dst_dir``, ``sr``,
            ``wpd_iters``, ``cgmm_iters``, ``update_alpha``, ``context``,
            ``taps``, ``delay`` and ``dump_mask``.

    Raises:
        RuntimeError: If a spectrogram is not 3-dimensional.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,  # False to be comparable with Kaldi
        "transpose": True  # T x F
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)

    num_done = 0
    with WaveWriter(args.dst_dir, sr=args.sr) as writer:
        for key, obs in spectrogram_reader:
            logger.info(f"Processing utt {key}...")
            if obs.ndim != 3:
                raise RuntimeError(f"Expected 3D array, but got {obs.ndim}")
            try:
                # N x T x F => T x F
                tf_mask, wpd_enh = facted_wpd(obs,
                                              wpd_iters=args.wpd_iters,
                                              cgmm_iters=args.cgmm_iters,
                                              update_alpha=args.update_alpha,
                                              context=args.context,
                                              taps=args.taps,
                                              delay=args.delay)
            except np.linalg.LinAlgError:
                # ``Logger.warn`` is a deprecated alias; ``warning`` is the
                # supported spelling.
                logger.warning(f"{key}: Failed cause LinAlgError in wpd")
                continue
            norm = spectrogram_reader.maxabs(key)
            # dump multi-channel
            samps = inverse_stft(wpd_enh, norm=norm, **stft_kwargs)
            writer.write(key, samps)
            if args.dump_mask:
                np.save(f"{args.dst_dir}/{key}", tf_mask[..., 0])
            # Report progress periodically because processing is slow.
            num_done += 1
            if not num_done % 100:
                logger.info(f"Processed {num_done:d} utterances...")
    logger.info(
        f"Processed {num_done:d} utterances over {len(spectrogram_reader):d}")
def run(args):
    """Separate each utterance with ``auxiva`` and write one wav per source.

    For every utterance the result of ``auxiva`` is indexed along its first
    axis. Each entry is converted back to samples and written to
    ``<dst_dir>/<key>.src<k>.wav``, with ``k`` starting at 1.

    Args:
        args: Parsed command-line options. Reads the STFT settings,
            ``wav_scp``, ``round_power_of_two``, ``epochs``, ``dst_dir``
            and ``fs``.
    """
    stft_kwargs = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        window=args.window,
        center=args.center,
        transpose=True,  # F x T instead of T x F
    )
    reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs,
    )
    for key, spectrogram in reader:
        logger.info(f"Processing utterance {key}...")
        separated = auxiva(spectrogram, args.epochs)
        scale = reader.maxabs(key)
        num_sources = separated.shape[0]
        # Output files are numbered from 1, the array is indexed from 0.
        for src_id in range(1, num_sources + 1):
            samps = inverse_stft(separated[src_id - 1],
                                 norm=scale,
                                 **stft_kwargs)
            out_path = Path(args.dst_dir) / f"{key}.src{src_id}.wav"
            write_wav(out_path, samps, fs=args.fs)
    num_utts = len(reader)
    logger.info(f"Processed {num_utts} utterances")
def run(args):
    """Run a mask-driven adaptive beamformer over every utterance.

    Target masks are read from ``args.tgt_mask`` and, optionally,
    interfering masks from ``args.itf_mask``. Utterances without a target
    mask are skipped. With ``args.chunk_size <= 0`` an offline beamformer is
    used; otherwise the online variant is run through
    ``do_online_beamform``. Utterances that raise
    ``np.linalg.LinAlgError`` are logged and skipped.

    Args:
        args: Parsed command-line options. Reads the STFT settings,
            ``wav_scp``, ``round_power_of_two``, ``fmt``, ``tgt_mask``,
            ``itf_mask``, ``beamformer``, ``pmwf_ref``, ``rank1_appro``,
            ``channels``, ``alpha``, ``chunk_size``, ``vad_proportion``,
            ``ban``, ``mask``, ``dst_dir`` and ``sr``.

    Raises:
        RuntimeError: If ``0 < args.chunk_size < 32``.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,  # False to be comparable with Kaldi
        "transpose": False  # F x T
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    tgt_mask_reader = MaskReader[args.fmt](args.tgt_mask)
    # The interfering masks must come from --itf-mask. Reading them from
    # args.tgt_mask would silently reuse the target masks as interference.
    itf_mask_reader = MaskReader[args.fmt](
        args.itf_mask) if args.itf_mask else None
    if itf_mask_reader is not None:
        logger.info(f"Using interfering masks from {args.itf_mask}")
    online = False
    num_bins = nextpow2(args.frame_len) // 2 + 1
    supported_beamformer = {
        "mvdr":
        MvdrBeamformer(num_bins),
        "mpdr":
        MpdrBeamformer(num_bins),
        "mpdr-whiten":
        MpdrBeamformer(num_bins, whiten=True),
        "gevd":
        GevdBeamformer(num_bins),
        "pmwf-0":
        PmwfBeamformer(num_bins,
                       beta=0,
                       ref_channel=args.pmwf_ref,
                       rank1_appro=args.rank1_appro),
        "pmwf-1":
        PmwfBeamformer(num_bins,
                       beta=1,
                       ref_channel=args.pmwf_ref,
                       rank1_appro=args.rank1_appro)
    }
    supported_online_beamformer = {
        "mvdr": OnlineMvdrBeamformer(num_bins, args.channels, args.alpha),
        "gevd": OnlineGevdBeamformer(num_bins, args.channels, args.alpha),
    }
    if args.chunk_size <= 0:
        logger.info(f"Using offline {args.beamformer} beamformer")
        beamformer = supported_beamformer[args.beamformer]
    else:
        if args.chunk_size < 32:
            raise RuntimeError(f"Seems chunk size({args.chunk_size:.2f}) " +
                               "too small for online beamformer")
        beamformer = supported_online_beamformer[args.beamformer]
        online = True
        logger.info(f"Using online {args.beamformer} beamformer, " +
                    f"chunk size = {args.chunk_size:d}")

    num_done = 0
    with WaveWriter(args.dst_dir, sr=args.sr) as writer:
        for key, stft_mat in spectrogram_reader:
            if key in tgt_mask_reader:
                power = spectrogram_reader.power(key)
                norm = spectrogram_reader.maxabs(key)
                logger.info(
                    f"Processing utterance {key}, " +
                    f"signal power {10 * np.log10(power + 1e-5):.2f}...")
                # prefer T x F
                speech_mask = tgt_mask_reader[key]
                # constraint [0, 1]
                if itf_mask_reader is None:
                    speech_mask = np.minimum(speech_mask, 1)
                    interf_mask = None
                else:
                    interf_mask = itf_mask_reader[key]
                # make sure speech_mask at shape T x F
                _, F, _ = stft_mat.shape
                # if in F x T
                if speech_mask.shape[0] == F and speech_mask.shape[1] != F:
                    speech_mask = np.transpose(speech_mask)
                    if interf_mask is not None:
                        interf_mask = np.transpose(interf_mask)
                if 0.5 < args.vad_proportion < 1:
                    vad_mask, N = compute_vad_masks(stft_mat[0],
                                                    args.vad_proportion)
                    logger.info(f"Filtering {N} TF-masks...")
                    # Positions flagged by vad_mask get a small constant
                    # instead of the original mask value.
                    speech_mask = np.where(vad_mask, 1.0e-4, speech_mask)
                    if interf_mask is not None:
                        interf_mask = np.where(vad_mask, 1.0e-4, interf_mask)
                # stft_enh, stft_mat: (N) x F x T
                try:
                    if not online:
                        stft_enh = beamformer.run(speech_mask,
                                                  stft_mat,
                                                  mask_n=interf_mask,
                                                  ban=args.ban)
                    else:
                        stft_enh = do_online_beamform(beamformer, speech_mask,
                                                      interf_mask, stft_mat,
                                                      args)
                except np.linalg.LinAlgError:
                    logger.error(f"Raise linalg error: {key}")
                    continue
                # masking beamformer output if necessary
                if args.mask:
                    stft_enh = stft_enh * np.transpose(speech_mask)
                samps = inverse_stft(stft_enh, norm=norm, **stft_kwargs)
                writer.write(key, samps)
                num_done += 1
    logger.info(f"Processed {num_done:d} utterances " +
                f"out of {len(spectrogram_reader):d}")
def _chunk_len_hint(num_frames, num_doa):
    """Describe which --chunk-len values split num_frames into num_doa chunks."""
    if num_doa < 1:
        return " got an empty doa list"
    # ceil(num_frames / chunk_len) == num_doa holds exactly when
    # num_frames / num_doa <= chunk_len < num_frames / (num_doa - 1).
    mn = math.ceil(num_frames / num_doa)
    if num_doa == 1:
        # A single chunk has no upper bound on its length.
        return f" expected --chunk-len of at least {mn}"
    # The upper bound is exclusive, hence ceil(...) - 1 rather than floor(...).
    mx = math.ceil(num_frames / (num_doa - 1)) - 1
    if mx < mn:
        return f" no --chunk-len yields {num_doa} chunks"
    return f" expected --chunk-len from {mn} to {mx}"


def run(args):
    """Run a DS or SD beamformer, offline or chunk-wise, over a wav script.

    The beamformer is selected by ``args.beamformer`` ("ds" or "sd") and
    ``args.geometry`` ("linear" or "circular"). Online mode is enabled when
    ``args.chunk_len > 0``; the DoA looked up for an utterance must then have
    one entry per chunk. Utterances with a missing or invalid DoA, or with a
    chunk count that does not match, are logged and skipped.

    Args:
        args: Parsed command-line options. Reads the STFT settings,
            ``beamformer``, ``geometry``, ``linear_topo``,
            ``circular_radius``, ``circular_around``, ``circular_center``,
            ``chunk_len``, ``wav_scp``, ``round_power_of_two``, ``dst_dir``,
            ``sr``, ``speed`` and ``normalize``. The whole namespace is also
            passed to ``parse_doa`` and ``do_online_beamform``.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    supported_beamformer = {
        "ds": {
            "linear":
            LinearDSBeamformer(linear_topo=args.linear_topo),
            "circular":
            CircularDSBeamformer(radius=args.circular_radius,
                                 num_arounded=args.circular_around,
                                 center=args.circular_center)
        },
        "sd": {
            "linear":
            LinearSDBeamformer(linear_topo=args.linear_topo),
            "circular":
            CircularSDBeamformer(radius=args.circular_radius,
                                 num_arounded=args.circular_around,
                                 center=args.circular_center)
        }
    }
    beamformer = supported_beamformer[args.beamformer][args.geometry]
    online = args.chunk_len > 0
    utt2doa = parse_doa(args, online)

    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)

    done = 0
    with WaveWriter(args.dst_dir, sr=args.sr) as writer:
        for key, stft_src in spectrogram_reader:
            doa = utt2doa(key)
            if doa is None:
                logger.info(f"Missing doa for utterance {key}")
                continue
            if not check_doa(args.geometry, doa, online):
                # In online mode ``doa`` is a sequence (its length is taken
                # below), and a sequence does not accept the ``.2f`` format.
                doa_repr = f"{doa}" if online else f"{doa:.2f}"
                logger.info(f"Invalid doa {doa_repr} for utterance {key}")
                continue
            if online:
                num_frames = stft_src.shape[-1]
                num_chunks = math.ceil(num_frames / args.chunk_len)
                if len(doa) != num_chunks:
                    # The helper avoids dividing by zero when the DoA list
                    # has zero or one entries.
                    logger.info(
                        f"Invalid chunk length {args.chunk_len} for utterance {key},"
                        + _chunk_len_hint(num_frames, len(doa)))
                    continue
                stft_enh = do_online_beamform(beamformer, doa, stft_src, args)
            else:
                stft_enh = beamformer.run(doa,
                                          stft_src,
                                          c=args.speed,
                                          sr=args.sr)
            norm = spectrogram_reader.maxabs(key) if args.normalize else None
            samps = inverse_stft(stft_enh, **stft_kwargs, norm=norm)
            writer.write(key, samps)
            done += 1
    logger.info(f"Processed {done} utterances over {len(spectrogram_reader)}")
def run(args):
    """Apply time-frequency masks to spectrograms and write the waveforms.

    For every utterance that has a mask in ``args.mask_scp``, the
    spectrogram (channel 0 if it is 3-D) is multiplied by the mask and
    converted back to samples. When ``args.phase_ref`` is given, the
    magnitude of the spectrogram is combined with the phase of the
    reference spectrogram instead.

    Args:
        args: Parsed command-line options. Reads the STFT settings,
            ``wav_scp``, ``round_power_of_two``, ``phase_ref``, ``fmt``,
            ``mask_scp``, ``dst_dir``, ``sf``, ``keep_length`` and
            ``mixed_norm``.

    Raises:
        ValueError: If the mask and the spectrogram still differ in shape
            after the orientation fix.
    """
    # shape: T x F, complex
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        **stft_kwargs,
        round_power_of_two=args.round_power_of_two)
    phase_reader = None
    if args.phase_ref:
        phase_reader = SpectrogramReader(
            args.phase_ref,
            **stft_kwargs,
            round_power_of_two=args.round_power_of_two)
        logger.info(f"Using phase reference from {args.phase_ref}")
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    mask_reader = MaskReader[args.fmt](args.mask_scp)

    num_done = 0
    # NOTE(review): the sample rate is read from ``args.sf`` here -- confirm
    # this matches the option name declared by the argument parser.
    with WaveWriter(args.dst_dir, fs=args.sf) as writer:
        for key, specs in spectrogram_reader:
            # if multi-channel, choose ch0
            if specs.ndim == 3:
                specs = specs[0]
            # specs: T x F
            if key in mask_reader:
                num_done += 1
                mask = mask_reader[key]
                # Make sure the mask is T x F. Only transpose when the mask
                # is unambiguously F x T: a square mask (T == F) that is
                # already correctly oriented must not be flipped.
                _, F = specs.shape
                if mask.shape[0] == F and mask.shape[1] != F:
                    mask = np.transpose(mask)
                logger.info(f"Processing utterance {key}...")
                if mask.shape != specs.shape:
                    raise ValueError(
                        "Dimention mismatch between mask and spectrogram" +
                        f"({mask.shape[0]} x {mask.shape[1]} vs " +
                        f"{specs.shape[0]} x {specs.shape[1]}), need " +
                        "check configures")
                nsamps = spectrogram_reader.nsamps(
                    key) if args.keep_length else None
                norm = spectrogram_reader.maxabs(
                    key) if args.mixed_norm else None
                # use phase from ref
                if phase_reader is not None:
                    angle = np.angle(phase_reader[key])
                    phase = np.exp(angle * 1j)
                    samps = inverse_stft(np.abs(specs) * mask * phase,
                                         **stft_kwargs,
                                         norm=norm,
                                         nsamps=nsamps)
                else:
                    samps = inverse_stft(specs * mask,
                                         **stft_kwargs,
                                         norm=norm,
                                         nsamps=nsamps)
                writer.write(key, samps)
    logger.info(
        f"Processed {num_done:d} utterances over {len(spectrogram_reader):d}")