def run(args):
    """Run the delay-and-sum beamformer over every utterance in ``args.wav_scp``.

    Each multi-channel spectrogram yielded by the reader is passed to
    ``DSBeamformer.run`` with the requested direction of arrival, converted
    back to a waveform with ``istft`` and written under ``args.dst_dir``.

    Args:
        args: Namespace providing ``frame_len``, ``frame_hop``, ``window``,
            ``center``, ``round_power_of_two``, ``wav_scp``, ``dst_dir``,
            ``fs``, ``speed``, ``linear_topo`` (comma-separated floats) and
            ``doa``.

    Raises:
        RuntimeError: If the DoA, after folding negative values, falls
            outside ``[0, 180]``.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
        "transpose": False,
    }
    topo = list(map(float, args.linear_topo.split(",")))
    # Negative angles are folded by adding 180. A DoA of exactly 0 is already
    # inside the accepted range and must be kept as is: the previous `> 0`
    # comparison turned it into 180.
    doa = args.doa if args.doa >= 0 else 180 + args.doa
    if doa < 0 or doa > 180:
        raise RuntimeError("Illegal value for DoA: {:.2f}".format(args.doa))

    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    beamformer = DSBeamformer(topo)
    logger.info("Initialize {:d} channel DSBeamformer".format(len(topo)))

    with WaveWriter(args.dst_dir, fs=args.fs) as writer:
        for key, stft_src in spectrogram_reader:
            stft_enh = beamformer.run(
                doa, stft_src, c=args.speed, sample_rate=args.fs)
            # The reader's per-utterance power is handed to istft together
            # with the STFT settings used for analysis.
            power = spectrogram_reader.power(key)
            samps = istft(stft_enh, **stft_kwargs, power=power)
            writer.write(key, samps)
    logger.info("Processed {:d} utterances".format(len(spectrogram_reader)))
def run(args):
    """Run mask-based adaptive beamforming over every utterance in ``args.wav_scp``.

    For each utterance that has a target mask, the selected beamformer
    (offline when ``args.chunk_size <= 0``, online otherwise) is applied to
    the multi-channel spectrogram, the result is optionally re-masked,
    converted back to a waveform with ``inverse_stft`` and written under
    ``args.dst_dir``. Utterances without a target mask are skipped.

    Args:
        args: Namespace providing the STFT options (``frame_len``,
            ``frame_hop``, ``window``, ``center``, ``round_power_of_two``),
            the I/O locations (``wav_scp``, ``tgt_mask``, ``itf_mask``,
            ``fmt``, ``dst_dir``, ``sr``) and the beamformer options
            (``beamformer``, ``chunk_size``, ``channels``, ``alpha``,
            ``pmwf_ref``, ``rank1_appro``, ``ban``, ``mask``,
            ``vad_proportion``).

    Raises:
        RuntimeError: If ``0 < args.chunk_size < 32``.
        KeyError: If ``args.fmt`` or ``args.beamformer`` is not a key of the
            corresponding lookup table.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,  # False keeps the framing comparable with Kaldi
        "transpose": False,  # spectrograms are F x T
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)

    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    tgt_mask_reader = MaskReader[args.fmt](args.tgt_mask)
    # Read the interfering masks from args.itf_mask. The reader used to be
    # built from args.tgt_mask, so the "interfering" masks were just the
    # target masks again.
    itf_mask_reader = (MaskReader[args.fmt](args.itf_mask)
                       if args.itf_mask else None)
    if itf_mask_reader is not None:
        logger.info(f"Using interfering masks from {args.itf_mask}")

    online = False
    num_bins = nextpow2(args.frame_len) // 2 + 1
    supported_beamformer = {
        "mvdr": MvdrBeamformer(num_bins),
        "mpdr": MpdrBeamformer(num_bins),
        "mpdr-whiten": MpdrBeamformer(num_bins, whiten=True),
        "gevd": GevdBeamformer(num_bins),
        "pmwf-0": PmwfBeamformer(num_bins,
                                 beta=0,
                                 ref_channel=args.pmwf_ref,
                                 rank1_appro=args.rank1_appro),
        "pmwf-1": PmwfBeamformer(num_bins,
                                 beta=1,
                                 ref_channel=args.pmwf_ref,
                                 rank1_appro=args.rank1_appro),
    }
    supported_online_beamformer = {
        "mvdr": OnlineMvdrBeamformer(num_bins, args.channels, args.alpha),
        "gevd": OnlineGevdBeamformer(num_bins, args.channels, args.alpha),
    }
    if args.chunk_size <= 0:
        logger.info(f"Using offline {args.beamformer} beamformer")
        beamformer = supported_beamformer[args.beamformer]
    else:
        if args.chunk_size < 32:
            raise RuntimeError(f"Seems chunk size({args.chunk_size:.2f}) " +
                               "too small for online beamformer")
        beamformer = supported_online_beamformer[args.beamformer]
        online = True
        logger.info(f"Using online {args.beamformer} beamformer, " +
                    f"chunk size = {args.chunk_size:d}")

    num_done = 0
    with WaveWriter(args.dst_dir, sr=args.sr) as writer:
        for key, stft_mat in spectrogram_reader:
            if key not in tgt_mask_reader:
                continue
            power = spectrogram_reader.power(key)
            norm = spectrogram_reader.maxabs(key)
            logger.info(
                f"Processing utterance {key}, " +
                f"signal power {10 * np.log10(power + 1e-5):.2f}...")
            # Masks are preferred in T x F layout.
            speech_mask = tgt_mask_reader[key]
            if itf_mask_reader is None:
                # Only the upper bound is enforced, and only when there is
                # no separate interfering mask.
                speech_mask = np.minimum(speech_mask, 1)
                interf_mask = None
            else:
                interf_mask = itf_mask_reader[key]
            # stft_mat is (N) x F x T; flip masks that are unambiguously
            # stored as F x T. A square mask is left untouched.
            _, F, _ = stft_mat.shape
            if speech_mask.shape[0] == F and speech_mask.shape[1] != F:
                speech_mask = np.transpose(speech_mask)
                if interf_mask is not None:
                    interf_mask = np.transpose(interf_mask)
            if 0.5 < args.vad_proportion < 1:
                vad_mask, N = compute_vad_masks(stft_mat[0],
                                                args.vad_proportion)
                logger.info(f"Filtering {N} TF-masks...")
                # Entries selected by vad_mask are replaced with a small
                # constant in both masks.
                speech_mask = np.where(vad_mask, 1.0e-4, speech_mask)
                if interf_mask is not None:
                    interf_mask = np.where(vad_mask, 1.0e-4, interf_mask)
            # stft_enh, stft_mat: (N) x F x T
            try:
                if not online:
                    stft_enh = beamformer.run(speech_mask,
                                              stft_mat,
                                              mask_n=interf_mask,
                                              ban=args.ban)
                else:
                    stft_enh = do_online_beamform(beamformer, speech_mask,
                                                  interf_mask, stft_mat, args)
            except np.linalg.LinAlgError:
                # A failing utterance is logged and skipped, not fatal.
                logger.error(f"Raise linalg error: {key}")
                continue
            # Optionally mask the beamformer output (mask is T x F, the
            # output F x T, hence the transpose).
            if args.mask:
                stft_enh = stft_enh * np.transpose(speech_mask)
            samps = inverse_stft(stft_enh, norm=norm, **stft_kwargs)
            writer.write(key, samps)
            num_done += 1
    logger.info(f"Processed {num_done:d} utterances " +
                f"out of {len(spectrogram_reader):d}")
def run(args):
    """Run mask-based beamforming over every utterance in ``args.wav_scp``.

    For each utterance that has a mask in ``args.mask_scp``, the selected
    beamformer (offline when ``args.chunk_size <= 0``, online otherwise) is
    applied to the multi-channel spectrogram, the result is optionally
    re-masked, converted back to a waveform with ``istft`` and written under
    ``args.dst_dir``. Utterances without a mask are skipped.

    Args:
        args: Namespace providing the STFT options (``frame_len``,
            ``frame_hop``, ``window``, ``center``, ``round_power_of_two``),
            the I/O locations (``wav_scp``, ``mask_scp``, ``fmt``,
            ``dst_dir``, ``samp_freq``) and the beamformer options
            (``beamformer``, ``chunk_size``, ``channels``, ``alpha``,
            ``ban``, ``mask``).

    Raises:
        RuntimeError: If ``0 < args.chunk_size < 32``.
        KeyError: If ``args.fmt`` or ``args.beamformer`` is not a key of the
            corresponding lookup table.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,  # False keeps the framing comparable with Kaldi
        "transpose": False,  # spectrograms are F x T
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        round_power_of_two=args.round_power_of_two,
        **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    mask_reader = MaskReader[args.fmt](args.mask_scp)

    online = False
    num_bins = nfft(args.frame_len) // 2 + 1
    supported_beamformer = {
        "mvdr": MvdrBeamformer(num_bins),
        "gevd": GevdBeamformer(num_bins),
        "pmwf": PmwfBeamformer(num_bins),
    }
    supported_online_beamformer = {
        "mvdr": OnlineMvdrBeamformer(num_bins, args.channels, args.alpha),
        "gevd": OnlineGevdBeamformer(num_bins, args.channels, args.alpha),
    }
    if args.chunk_size <= 0:
        logger.info("Using offline {} beamformer".format(args.beamformer))
        beamformer = supported_beamformer[args.beamformer]
    else:
        if args.chunk_size < 32:
            raise RuntimeError(
                "Seems chunk size({:.2f}) too small for online beamformer".
                format(args.chunk_size))
        beamformer = supported_online_beamformer[args.beamformer]
        online = True
        logger.info("Using online {} beamformer, chunk size = {:d}".format(
            args.beamformer, args.chunk_size))

    num_done = 0
    with WaveWriter(args.dst_dir, fs=args.samp_freq) as writer:
        for key, stft_mat in spectrogram_reader:
            if key not in mask_reader:
                continue
            num_done += 1
            power = spectrogram_reader.power(key)
            logger.info(
                "Processing utterance {}, signal power {:.2f}...".format(
                    key, 10 * np.log10(power + 1e-5)))
            # Masks are preferred in T x F layout; only the upper bound of
            # the [0, 1] range is enforced here.
            speech_mask = np.minimum(mask_reader[key], 1)
            # stft_mat is (N) x F x T. Flip the mask only when it is
            # unambiguously F x T: a square mask (T == F) is taken to be in
            # the preferred T x F layout already, and the previous
            # `shape[0] == F` test alone would have transposed it.
            _, F, _ = stft_mat.shape
            if speech_mask.shape[0] == F and speech_mask.shape[-1] != F:
                speech_mask = np.transpose(speech_mask)
            # stft_enh, stft_mat: (N) x F x T
            if not online:
                stft_enh = beamformer.run(speech_mask,
                                          stft_mat,
                                          normalize=args.ban)
            else:
                stft_enh = do_online_beamform(beamformer, speech_mask,
                                              stft_mat, args)
            # Optionally mask the beamformer output (mask is T x F, the
            # output F x T, hence the transpose).
            if args.mask:
                stft_enh = stft_enh * np.transpose(speech_mask)
            samps = istft(stft_enh, power=power, **stft_kwargs)
            writer.write(key, samps)
    logger.info("Processed {:d} utterances out of {:d}".format(
        num_done, len(spectrogram_reader)))