def run(args):
    """Copy matrices from a script into an archive, applying optional operations.

    ``args.op`` is a comma separated list of operations. ``trans``, ``log``
    and ``minus`` are applied to every matrix in the order given; ``stack``
    makes the function stack all matrices vertically and write them as a
    single entry keyed by ``filekey(args.dst_ark)``.

    Args:
        args: parsed command line options; reads ``src``, ``src_scp``,
            ``key``, ``op``, ``dst_ark`` and ``scp``.

    Raises:
        RuntimeError: if a non-empty operation is not in ``supported_op``.
    """
    if args.src == "npy":
        src_reader = NumpyReader(args.src_scp)
    else:
        src_reader = MatReader(args.src_scp, args.key)

    requested = args.op.split(",")
    for name in requested:
        if name and name not in supported_op:
            raise RuntimeError(f"Unknown operation: {name}")
    stack = "stack" in requested

    # Per-matrix transforms; any other (supported) operation is a no-op here.
    transforms = {
        "trans": lambda m: np.transpose(m),
        "log": lambda m: np.log(np.maximum(m, EPSILON)),
        "minus": lambda m: 1 - m,
    }
    pipeline = [transforms[name] for name in requested if name in transforms]

    num_mat = 0
    pending = []
    with ArchiveWriter(args.dst_ark, args.scp) as writer:
        for key, mat in src_reader:
            for transform in pipeline:
                mat = transform(mat)
            if stack:
                pending.append(mat)
            else:
                writer.write(key, mat)
            num_mat += 1
        if stack:
            merged = np.vstack(pending)
            writer.write(filekey(args.dst_ark), merged)
            logger.info("Merge {0} matrix into archive {1}, shape as "
                        "{2[0]}x{2[1]}".format(num_mat, args.dst_ark,
                                               merged.shape))
    if not stack:
        logger.info("Copy {0} matrix into archive {1}".format(
            num_mat, args.dst_ark))
def run(args):
    """Compute a mask per utterance from speech and noise spectrograms.

    For every utterance in ``args.speech_scp`` that also has an entry in
    ``args.noise_scp``, ``compute_mask`` is called and the result is written
    to ``args.mask_ark``. Utterances without a noise entry are skipped with a
    warning instead of being dropped silently.

    Args:
        args: parsed command line options; reads ``frame_length``,
            ``frame_shift``, ``window``, ``center``, ``speech_scp``,
            ``noise_scp``, ``cutoff``, ``mask``, ``mask_ark`` and ``scp``.
    """
    # shape: T x F, complex
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
    }
    speech_reader = SpectrogramReader(args.speech_scp, **stft_kwargs)
    bnoise_reader = SpectrogramReader(args.noise_scp, **stft_kwargs)

    num_utts = 0
    cutoff = args.cutoff
    with ArchiveWriter(args.mask_ark, args.scp) as writer:
        for key, speech in speech_reader:
            if key not in bnoise_reader:
                logger.warning(
                    "Missing bg-noise for utterance {}".format(key))
                continue
            num_utts += 1
            noise = bnoise_reader[key]
            mask = compute_mask(speech, noise, args.mask)
            # Upper clipping only applies when a positive cutoff is given.
            if cutoff > 0:
                num_items = np.sum(mask > cutoff)
                mask = np.minimum(mask, cutoff)
                if num_items:
                    logger.info("Clip {:d} items for utterance {}".format(
                        num_items, key))
            # Negative values are always clipped to zero.
            mask = np.maximum(mask, 0)
            writer.write(key, mask)
    logger.info("Processed {} utterances".format(num_utts))
def run(args):
    """Copy matrices from a script into an archive, optionally merging them.

    Each matrix may be transposed, log-compressed (floored at ``EPSILON``)
    and replaced by ``1 - mat``, in that order, depending on the flags in
    ``args``. With ``args.merge`` set, all matrices are stacked vertically
    and written as one entry keyed by ``filekey(args.dst_ark)``.

    Args:
        args: parsed command line options; reads ``matlab``, ``src_scp``,
            ``key``, ``transpose``, ``apply_log``, ``minus_by_one``,
            ``merge``, ``dst_ark`` and ``scp``.
    """
    if args.matlab:
        src_reader = MatReader(args.src_scp, args.key)
    else:
        src_reader = NumpyReader(args.src_scp)

    num_mat = 0
    pending = []
    with ArchiveWriter(args.dst_ark, args.scp) as writer:
        for key, mat in src_reader:
            if args.transpose:
                mat = np.transpose(mat)
            if args.apply_log:
                mat = np.log(np.maximum(mat, EPSILON))
            if args.minus_by_one:
                mat = 1 - mat
            if args.merge:
                pending.append(mat)
            else:
                writer.write(key, mat)
            num_mat += 1
        if args.merge:
            merged = np.vstack(pending)
            writer.write(filekey(args.dst_ark), merged)
            logger.info(
                "Merge {0} matrix into archive {1}, shape as {2[0]}x{2[1]}".
                format(num_mat, args.dst_ark, merged.shape))
    if not args.merge:
        logger.info("Copy {0} matrix into archive {1}".format(
            num_mat, args.dst_ark))
def run(args):
    """Compute directional features from pre-computed steer vectors.

    The steer vector index is taken either per utterance from the
    ``args.utt2idx`` script or, when that is not given, from the comma
    separated list ``args.doa_idx`` (features for several indices are
    concatenated along the last axis).

    Args:
        args: parsed command line options; reads ``frame_len``,
            ``frame_hop``, ``round_power_of_two``, ``window``, ``center``,
            ``wav_scp``, ``utt2idx``, ``doa_idx``, ``df_pair``,
            ``steer_vector``, ``dup_ark`` and ``scp``.

    Raises:
        RuntimeError: if no channel pair could be parsed from
            ``args.df_pair``.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    stft_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)

    if args.utt2idx:
        utt2idx = ScpReader(args.utt2idx, value_processor=int)
        logger.info(f"Using --utt2idx={args.utt2idx}")
    else:
        utt2idx = None
        logger.info(f"Using --doa-idx={args.doa_idx}")

    df_pair = [tuple(map(int, p.split(","))) for p in args.df_pair.split(";")]
    if not df_pair:
        # The original message read args.pair, which is not the option
        # parsed above and would fail while building the error itself.
        raise RuntimeError(f"Bad configurations with --pair {args.df_pair}")
    logger.info(f"Compute directional feature with {df_pair}")

    # A x M x F
    steer_vector = np.load(args.steer_vector)

    # The fixed index list does not depend on the utterance: parse it once.
    doa_idx = None
    if utt2idx is None:
        doa_idx = [int(v) for v in args.doa_idx.split(",")]

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, stft in stft_reader:
            # sv: M x F
            if utt2idx is None:
                dfs = [
                    directional_feats(stft, steer_vector[i], df_pair=df_pair)
                    for i in doa_idx
                ]
                if len(dfs) == 1:
                    df = dfs[0]
                else:
                    # N x T x F
                    dfs = np.stack(dfs)
                    df = dfs.transpose(1, 0, 2).reshape(dfs.shape[1], -1)
            elif key in utt2idx:
                # stft: M x F x T
                df = directional_feats(stft,
                                       steer_vector[utt2idx[key]],
                                       df_pair=df_pair)
            else:
                logger.warning(f"Missing utt2idx for utterance {key}")
                continue
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info(f"Processed {num_done:d} utterance...")
    logger.info(f"Processed {num_done:d} utterances over {len(stft_reader):d}")
def run(args):
    """Compute averaged ``gcc_phat_diag`` responses over channel pairs.

    For each utterance, ``gcc_phat_diag`` is evaluated for every channel
    pair listed in ``args.diag_pair`` and the results are averaged before
    being written to ``args.srp_ark``.

    Args:
        args: parsed command line options; reads ``diag_pair``,
            ``frame_len``, ``frame_hop``, ``round_power_of_two``, ``window``,
            ``center``, ``wav_scp``, ``srp_ark``, ``scp``, ``n``, ``d``,
            ``sr`` and ``num_doa``.

    Raises:
        RuntimeError: if no channel pair could be parsed, or if a computed
            matrix contains NaN values.
    """
    srp_pair = [
        tuple(map(int, p.split(","))) for p in args.diag_pair.split(";")
    ]
    if not srp_pair:
        # The original message read args.pair instead of the option that is
        # actually parsed above.
        raise RuntimeError("Bad configurations with --pair {}".format(
            args.diag_pair))
    logger.info("Compute gcc with {}".format(srp_pair))

    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": True  # T x F
    }
    num_done = 0
    num_ffts = nfft(
        args.frame_len) if args.round_power_of_two else args.frame_len
    reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    with ArchiveWriter(args.srp_ark, args.scp) as writer:
        for key, stft_mat in reader:
            num_done += 1
            srp = []
            # N x T x F
            for (i, j) in srp_pair:
                srp.append(
                    gcc_phat_diag(stft_mat[i],
                                  stft_mat[j],
                                  min(i, j) * np.pi * 2 / args.n,
                                  args.d,
                                  num_bins=num_ffts // 2 + 1,
                                  sr=args.sr,
                                  num_doa=args.num_doa))
            srp = sum(srp) / len(srp_pair)
            nan = np.sum(np.isnan(srp))
            if nan:
                # The original format string had a stray "}" that made
                # str.format fail before the intended error could be raised.
                raise RuntimeError("Matrix {} has nan ({:d} items)".format(
                    key, nan))
            writer.write(key, srp)
            if not num_done % 1000:
                logger.info("Processed {:d} utterances...".format(num_done))
    logger.info("Processed {:d} utterances done".format(len(reader)))
def run(args):
    """Compute a mask per utterance from speech and denominator spectrograms.

    For every utterance present in both ``args.speech_scp`` and
    ``args.denorm_scp``, ``compute_mask`` is called (on the first channel
    when the spectrogram is 3-dimensional). Values above ``args.cutoff`` and
    below zero are clipped, with statistics logged, before the mask is
    written to ``args.mask_ark``.

    Args:
        args: parsed command line options; reads ``frame_length``,
            ``frame_shift``, ``window``, ``center``, ``speech_scp``,
            ``denorm_scp``, ``cutoff``, ``mask``, ``mask_ark`` and ``scp``.
    """
    # shape: T x F, complex
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
    }
    speech_reader = SpectrogramReader(args.speech_scp, **stft_kwargs)
    denorm_reader = SpectrogramReader(args.denorm_scp, **stft_kwargs)

    num_utts = 0
    cutoff = args.cutoff
    with ArchiveWriter(args.mask_ark, args.scp) as writer:
        for key, speech in speech_reader:
            if key not in denorm_reader:
                # Logger.warn is a deprecated alias; use warning().
                logger.warning(
                    "Missing bg-noise for utterance {}".format(key))
                continue
            num_utts += 1
            denorm = denorm_reader[key]
            # Use the first channel for 3-dimensional (multi-channel) input.
            mask = compute_mask(speech[0] if speech.ndim == 3 else speech,
                                denorm[0] if denorm.ndim == 3 else denorm,
                                args.mask)
            # iam, psm, psa
            if cutoff > 0:
                num_items = np.sum(mask > cutoff)
                mask = np.minimum(mask, cutoff)
                if num_items:
                    percent = float(num_items) / mask.size
                    logger.info(
                        "Clip {:d}({:.2f}) items over {:.2f} for utterance {}"
                        .format(num_items, percent, cutoff, key))
            num_items = np.sum(mask < 0)
            # psm, psa
            if num_items:
                percent = float(num_items) / mask.size
                average = np.sum(mask[mask < 0]) / num_items
                logger.info(
                    "Clip {:d}({:.2f}, {:.2f}) items below zero for utterance {}"
                    .format(num_items, percent, average, key))
                mask = np.maximum(mask, 0)
            writer.write(key, mask)
    logger.info("Processed {} utterances".format(num_utts))
def run(args):
    """Write spectrogram features of each utterance to an archive.

    Features come from a ``SpectrogramReader`` configured with
    ``apply_abs=True`` and ``transpose=True``. For 3-dimensional features
    only the first entry along the leading axis is written.

    Args:
        args: parsed command line options; reads ``frame_length``,
            ``frame_shift``, ``window``, ``center``, ``apply_log``,
            ``apply_pow``, ``normalize``, ``wav_scp``, ``dup_ark`` and
            ``scp``.
    """
    reader = SpectrogramReader(
        args.wav_scp,
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        apply_log=args.apply_log,
        apply_pow=args.apply_pow,
        normalize=args.normalize,
        apply_abs=True,
        transpose=True,  # T x F
    )
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, feats in reader:
            # default using ch1 in multi-channel case
            if feats.ndim == 3:
                feats = feats[0]
            writer.write(key, feats)
    logger.info("Process {:d} utterances".format(len(reader)))
def run(args):
    """Compute directional features using mask-estimated steer vectors.

    For every utterance that has a TF-mask, the mask is used to estimate a
    speech covariance matrix and a steer vector through ``MvdrBeamformer``;
    ``directional_feats`` is then evaluated for the channel pairs in
    ``args.df_pair`` and written to ``args.dup_ark``.

    Args:
        args: parsed command line options; reads ``frame_len``,
            ``frame_hop``, ``round_power_of_two``, ``window``, ``center``,
            ``wav_scp``, ``fmt``, ``mask_scp``, ``df_pair``, ``dup_ark`` and
            ``scp``.

    Raises:
        RuntimeError: if no channel pair could be parsed from
            ``args.df_pair``.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    feat_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    mask_reader = MaskReader[args.fmt](args.mask_scp)

    # NOTE(review): num_bins is derived via nextpow2() regardless of
    # args.round_power_of_two - confirm it matches the reader's FFT size.
    num_bins = nextpow2(args.frame_len) // 2 + 1
    beamformer = MvdrBeamformer(num_bins)

    df_pair = [tuple(map(int, p.split(","))) for p in args.df_pair.split(";")]
    if not df_pair:
        # The original message read args.pair, which is not the option
        # parsed above and would fail while building the error itself.
        raise RuntimeError(f"Bad configurations with --pair {args.df_pair}")
    logger.info(f"Compute directional feature with {df_pair}")

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, spect in feat_reader:
            if key not in mask_reader:
                logger.warning(f"Missing TF-mask for utterance {key}")
                continue
            speech_masks = mask_reader[key]
            # make sure speech_masks in T x F
            _, F, _ = spect.shape
            if speech_masks.shape[0] == F:
                speech_masks = np.transpose(speech_masks)
            speech_masks = np.minimum(speech_masks, 1)
            # spectrogram: N x F x T
            speech_covar = beamformer.compute_covar_mat(speech_masks, spect)
            sv = beamformer.compute_steer_vector(speech_covar)
            df = directional_feats(spect, sv.T, df_pair=df_pair)
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info(f"Processed {num_done:d} utterance...")
    logger.info(f"Processed {num_done:d} utterances over {len(feat_reader):d}")
def run(args):
    """Compute spatial features for each utterance and write them out.

    Every spectrogram read from ``args.wav_scp`` is passed to
    ``compute_spatial_feats`` and the result is written to ``args.dup_ark``.

    Args:
        args: parsed command line options; reads ``frame_length``,
            ``frame_shift``, ``window``, ``center``, ``wav_scp``,
            ``dup_ark``, ``scp`` and ``type`` (also forwarded as a whole to
            ``compute_spatial_feats``).
    """
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        transpose=True,  # T x F
    )
    num_utts = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for num_utts, (key, spectrogram) in enumerate(spectrogram_reader,
                                                      start=1):
            # spectrogram: shape NxTxF; feats: T x F
            writer.write(key, compute_spatial_feats(args, spectrogram))
    logger.info("Processed {} for {:d} utterances".format(
        args.type.upper(), num_utts))
def run(args):
    """Write spectrogram features of each utterance to an archive.

    Features come from a ``SpectrogramReader`` configured with
    ``apply_abs=True`` and ``transpose=True`` and are written unchanged to
    ``args.dup_ark``.

    Args:
        args: parsed command line options; reads ``frame_length``,
            ``frame_shift``, ``window``, ``center``, ``apply_log``,
            ``apply_pow``, ``normalize``, ``wav_scp``, ``dup_ark`` and
            ``scp``.
    """
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        apply_log=args.apply_log,
        apply_pow=args.apply_pow,
        normalize=args.normalize,
        apply_abs=True,
        transpose=True,  # T x F
    )
    num_utts = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for num_utts, (key, feats) in enumerate(spectrogram_reader, start=1):
            writer.write(key, feats)
    logger.info("Process {:d} utterances".format(num_utts))
def run(args):
    """Compute spatial features for each utterance, logging progress.

    Every spectrogram read from ``args.wav_scp`` is passed to
    ``compute_spatial_feats`` and the result is written to ``args.dup_ark``.
    A progress line is logged after every 1000 utterances.

    Args:
        args: parsed command line options; reads ``frame_len``,
            ``frame_hop``, ``round_power_of_two``, ``window``, ``center``,
            ``wav_scp``, ``dup_ark``, ``scp`` and ``type`` (also forwarded
            as a whole to ``compute_spatial_feats``).
    """
    spectrogram_reader = SpectrogramReader(
        args.wav_scp,
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        round_power_of_two=args.round_power_of_two,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        transpose=True,  # T x F
    )
    num_utts = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for num_utts, (key, spectrogram) in enumerate(spectrogram_reader,
                                                      start=1):
            # spectrogram: shape NxTxF; feats: T x F
            writer.write(key, compute_spatial_feats(args, spectrogram))
            if num_utts % 1000 == 0:
                logger.info(f"Processed {num_utts} utterance...")
    logger.info(f"Processed {args.type.upper()} for {num_utts} utterances")
def run(args):
    """Compute directional features using mask-estimated steer vectors.

    For every utterance that has a TF-mask, the mask is used to estimate a
    speech covariance matrix and a steer vector through ``MvdrBeamformer``;
    the resulting ``directional_feats`` are written to ``args.dup_ark``.

    Args:
        args: parsed command line options; reads ``frame_len``,
            ``frame_hop``, ``round_power_of_two``, ``window``, ``center``,
            ``wav_scp``, ``fmt``, ``mask_scp``, ``dup_ark`` and ``scp``.
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    feat_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    mask_reader = MaskReader[args.fmt](args.mask_scp)

    # NOTE(review): num_bins is derived via nfft() regardless of
    # args.round_power_of_two - confirm it matches the reader's FFT size.
    num_bins = nfft(args.frame_len) // 2 + 1
    beamformer = MvdrBeamformer(num_bins)

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, spect in feat_reader:
            if key not in mask_reader:
                # Logger.warn is a deprecated alias; use warning().
                logger.warning(
                    "Missing TF-mask for utterance {}".format(key))
                continue
            speech_masks = mask_reader[key]
            # make sure speech_masks in T x F
            _, F, _ = spect.shape
            if speech_masks.shape[0] == F:
                speech_masks = np.transpose(speech_masks)
            speech_masks = np.minimum(speech_masks, 1)
            # spectrogram: N x F x T
            speech_covar = beamformer.compute_covar_mat(speech_masks, spect)
            sv = beamformer.compute_steer_vector(speech_covar)
            df = directional_feats(spect, sv.T)
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info("Processed {:d} utterance...".format(num_done))
    logger.info("Processed {:d} utterances over {:d}".format(
        num_done, len(feat_reader)))
def run(args):
    """Compute directional features using mask-estimated steer vectors.

    For every utterance that has a TF-mask, the mask (transposed first when
    ``args.trans`` is set) is used to estimate a speech covariance matrix
    and a steer vector through ``MvdrBeamformer``; the resulting
    ``directional_feats`` are written to ``args.dup_ark``.

    Args:
        args: parsed command line options; reads ``frame_length``,
            ``frame_shift``, ``window``, ``center``, ``wav_scp``, ``numpy``,
            ``mask_scp``, ``trans``, ``dup_ark`` and ``scp``.
    """
    stft_kwargs = {
        "frame_length": args.frame_length,
        "frame_shift": args.frame_shift,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    feat_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    if args.numpy:
        mask_reader = NumpyReader(args.mask_scp)
    else:
        mask_reader = ScriptReader(args.mask_scp)

    num_bins = nfft(args.frame_length) // 2 + 1
    beamformer = MvdrBeamformer(num_bins)

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, spect in feat_reader:
            if key not in mask_reader:
                # Logger.warn is a deprecated alias; use warning().
                logger.warning(
                    "Missing TF-mask for utterance {}".format(key))
                continue
            speech_masks = mask_reader[key]
            if args.trans:
                speech_masks = np.transpose(speech_masks)
            speech_masks = np.minimum(speech_masks, 1)
            # spectrogram: N x F x T
            speech_covar = beamformer.compute_covar_mat(speech_masks, spect)
            sv = beamformer.compute_steer_vector(speech_covar)
            df = directional_feats(spect, sv.T)
            writer.write(key, df)
            num_done += 1
            if not num_done % 1000:
                logger.info("Processed {:d} utterance...".format(num_done))
    logger.info("Processed {:d} utterances over {:d}".format(
        num_done, len(feat_reader)))
def run(args):
    """Compute mel filter-bank features for each utterance.

    The spectrogram of each utterance (first entry along the leading axis
    for 3-dimensional input) is multiplied by the mel weights from
    ``audio_lib.filters.mel``, transposed, optionally log-compressed
    (floored at ``EPSILON``) and written to ``args.dup_ark``.

    Args:
        args: parsed command line options; reads ``num_bins``, ``min_freq``,
            ``max_freq``, ``frame_length``, ``frame_shift``, ``window``,
            ``center``, ``apply_pow``, ``normalize``, ``samp_freq``,
            ``wav_scp``, ``apply_log``, ``dup_ark`` and ``scp``.

    Raises:
        RuntimeError: if ``args.max_freq`` exceeds half of
            ``args.samp_freq``.
    """
    mel_kwargs = dict(n_mels=args.num_bins,
                      fmin=args.min_freq,
                      fmax=args.max_freq,
                      htk=True)
    stft_kwargs = dict(
        frame_length=args.frame_length,
        frame_shift=args.frame_shift,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        apply_log=False,
        apply_pow=args.apply_pow,
        normalize=args.normalize,
        apply_abs=True,
        transpose=False,  # F x T
    )
    if args.max_freq > args.samp_freq // 2:
        raise RuntimeError("Max frequency for mel exceeds sample frequency")

    spectrogram_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    # N x F
    mel_weights = audio_lib.filters.mel(args.samp_freq,
                                        nfft(args.frame_length),
                                        **mel_kwargs)
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, spectrum in spectrogram_reader:
            if spectrum.ndim == 3:
                spectrum = spectrum[0]
            # N x F * F x T = N * T => T x N
            fbank = np.transpose(np.dot(mel_weights, spectrum))
            if args.apply_log:
                fbank = np.log(np.maximum(fbank, EPSILON))
            writer.write(key, fbank)
    logger.info("Process {:d} utterances".format(len(spectrogram_reader)))