def extract_spec(filename, samp_freq, frame_length_ms=25, frame_shift_ms=10,
                 round_to_power_of_two=True, snip_edges=True):
    """Extract the spectrogram of one wav file using kaldi.

    Side effects visible in this function:
      * ``wav.scp`` is (re)written in the current working directory with a
        single entry ``test1 <filename>``.
      * ``spec.ark`` is opened as the output table (``ark,t:spec.ark``) and
        receives the computed feature matrix.
      * ``po.parse_args()`` is called, so the ``min-duration`` value comes from
        the option parser (presumably the process command line -- confirm).

    args:
        filename: wav file path
        samp_freq: sample frequency used for feature extraction; the wav's own
            sample frequency must be >= this value and an integer multiple
            of it
        frame_length_ms: copied to ``frame_opts.frame_length_ms``
        frame_shift_ms: copied to ``frame_opts.frame_shift_ms``
        round_to_power_of_two: copied to ``frame_opts.round_to_power_of_two``
        snip_edges: copied to ``frame_opts.snip_edges``
    return:
        spectrogram: numpy array, (frame, fre)
    raises:
        ValueError: if the reader yielded no utterance that passed the
            ``min-duration`` filter, so there is nothing to return.
        AssertionError: if the wav sample frequency is lower than, or not a
            multiple of, ``samp_freq``.
    """
    # get rspec and wspec
    with open('wav.scp', 'w') as scp_file:
        scp_file.write('test1 ' + filename + '\n')
    rspec = 'scp,p:' + 'wav.scp'
    wspec = 'ark,t:' + 'spec.ark'

    # set po
    usage = """Extract MFCC features.Usage: example.py [opts...] <rspec> <wspec>"""
    po = ParseOptions(usage)
    po.register_float("min-duration", 0.0, "minimum segment duration")
    opts = po.parse_args()

    # set options
    spec_opts = SpectrogramOptions()
    spec_opts.frame_opts.samp_freq = samp_freq
    spec_opts.frame_opts.frame_length_ms = frame_length_ms
    spec_opts.frame_opts.frame_shift_ms = frame_shift_ms
    spec_opts.frame_opts.round_to_power_of_two = round_to_power_of_two
    spec_opts.frame_opts.snip_edges = snip_edges
    # NOTE(review): registration happens after parse_args(), as in the
    # original; kept in that order to avoid changing behaviour.
    spec_opts.register(po)
    spec = Spectrogram(spec_opts)
    sf = spec_opts.frame_opts.samp_freq

    # Stays None when every entry is skipped, so the failure can be reported
    # explicitly instead of surfacing as an unbound local at the return.
    spectrogram = None
    with SequentialWaveReader(rspec) as reader, MatrixWriter(wspec) as writer:
        for key, wav in reader:
            if wav.duration < opts.min_duration:
                continue
            assert (wav.samp_freq >= sf)
            assert (wav.samp_freq % sf == 0)
            samples = wav.data()
            # Downsample by keeping every (wav.samp_freq / sf)-th column.
            samples = samples[:, ::int(wav.samp_freq / sf)]
            # Average over axis 0 (presumably the channel axis -- confirm).
            averaged = SubVector(mean(samples, axis=0))
            features = spec.compute_features(averaged, sf, 1.0)
            spectrogram = np.array(features)
            writer[key] = features

    if spectrogram is None:
        raise ValueError(
            'no spectrogram computed for {!r}: the wav could not be read or '
            'was shorter than min-duration'.format(filename))
    return spectrogram
return num_success != 0 if __name__ == "__main__": usage = """Compute VAD. Usage: compute-vad [options...] <wav-rspecifier> <feats-wspecifier> """ po = ParseOptions(usage) po.register_float( "min-duration", 0.0, "Minimum duration of segments to process in seconds (default: 0.0).", ) po.register_int( "channel", -1, "Channel to extract (-1 -> mono (default), 0 -> left, 1 -> right)", ) po.register_int("frame-window", 25, "Length of frame window in ms (default: 25)") po.register_int("frame-shift", 10, "Length of frame shift in ms (default: 10)") po.register_int("nfft", 512, "Number of DFT points (default: 256)") po.register_int( "arma-order", 5,
file=sys.stderr) return num_success != 0 if __name__ == "__main__": usage = """Compute VAD. Usage: compute-vad [options...] <wav-rspecifier> <feats-wspecifier> """ po = ParseOptions(usage) po.register_float( "min-duration", 0.0, "Minimum duration of segments " "to process (in seconds).") po.register_int( "channel", -1, "Channel to extract (-1 -> expect mono, " "0 -> left, 1 -> right)") po.register_int("frame-window", 25, "Length of frame window in ms " "default is 25ms") po.register_int("frame-shift", 10, "Length of frame shift in ms " "default is 10ms") po.register_int("nfft", 256, "Number of DFT points " "default is 256") po.register_int( "arma-order", 5, "Length of ARMA window that will be applied " "to the spectrogram") po.register_int("ltsv-ctx-window", 50, "Context window for LTSV computation " "default is 50")
# Log records are formatted with the level, module, version, function,
# file name and line number ahead of the message.
logging.basicConfig(format="%(levelname)s (%(module)s[{}]:%(funcName)s():"
                    "%(filename)s:%(lineno)s) %(message)s"
                    .format(__version__), level=logging.INFO)

usage = """Decode features using GMM-based model. Usage: gmm-decode-faster.py [options] model-in fst-in features-rspecifier words-wspecifier [alignments-wspecifier [lattice-wspecifier]] Note: lattices, if output, will just be linear sequences; use gmm-latgen-faster if you want "real" lattices. """

po = ParseOptions(usage)

# Decoder options are registered on the same parser as the script options.
decoder_opts = FasterDecoderOptions()
decoder_opts.register(po, True)

po.register_float("acoustic-scale", 0.1,
                  "Scaling factor for acoustic likelihoods")
po.register_bool("allow-partial", True,
                 "Produce output even when final state was not reached")
po.register_str("word-symbol-table", "",
                "Symbol table for words [for debug output]")
opts = po.parse_args()

# Four positional arguments are required, up to two more are optional.
if po.num_args() < 4 or po.num_args() > 6:
    po.print_usage()
    # Exit non-zero so a calling shell script can detect the usage error.
    sys.exit(1)

model_rxfilename = po.get_arg(1)
fst_rxfilename = po.get_arg(2)
feature_rspecifier = po.get_arg(3)
words_wspecifier = po.get_arg(4)
alignment_wspecifier = po.get_opt_arg(5)
if __name__ == '__main__':
    usage = """Create MFCC feature files. Usage: compute-mfcc-feats [options...] <wav-rspecifier> <feats-wspecifier> """

    po = ParseOptions(usage)

    # MFCC options are registered on the same parser as the script options.
    mfcc_opts = MfccOptions()
    mfcc_opts.register(po)

    # The help texts below are built by implicit string concatenation; each
    # fragment that continues a sentence ends with a space so that words do
    # not run together at the seam ("feature file", "rspecifier (if ...").
    po.register_bool(
        "subtract-mean", False,
        "Subtract mean of each feature "
        "file [CMS]; not recommended to do it this way.")
    po.register_float(
        "vtln-warp", 1.0,
        "Vtln warp factor (only applicable "
        "if vtln-map not specified)")
    po.register_str(
        "vtln-map", "",
        "Map from utterance or speaker-id to "
        "vtln warp factor (rspecifier)")
    po.register_str(
        "utt2spk", "",
        "Utterance to speaker-id map rspecifier "
        "(if doing VTLN and you have warps per speaker)")
    po.register_int(
        "channel", -1,
        "Channel to extract (-1 -> expect mono, "
        "0 -> left, 1 -> right)")
    po.register_float(
        "min-duration", 0.0,
        "Minimum duration of segments "
        "to process (in seconds).")

    opts = po.parse_args()
if __name__ == '__main__':
    # Configure log messages to look like Kaldi messages
    from kaldi import __version__

    # Level 20 is logging.INFO; it is displayed as 'LOG' in the output.
    logging.addLevelName(20, 'LOG')
    logging.basicConfig(
        format='%(levelname)s (%(module)s[{}]:%(funcName)s():'
               '%(filename)s:%(lineno)s) %(message)s'.format(__version__),
        level=logging.INFO)

    usage = """Extract segments from a large audio file in WAV format. Usage: extract-segments [options] <wav-rspecifier> <segments-file> <wav-wspecifier> """

    po = ParseOptions(usage)
    po.register_float(
        "min-segment-length", 0.1,
        "Minimum segment length "
        "in seconds (reject shorter segments)")
    po.register_float(
        "max_overshoot", 0.5,
        "End segments overshooting audio "
        "by less than this (in seconds) are truncated, "
        "else rejected.")

    opts = po.parse_args()

    # Exactly three positional arguments are required.
    if po.num_args() != 3:
        po.print_usage()
        # Exit non-zero so a calling shell script can detect the usage error.
        sys.exit(1)

    wav_rspecifier = po.get_arg(1)
    segments_rxfilename = po.get_arg(2)
    wav_wspecifier = po.get_arg(3)
Usage: copy-matrix [options] <matrix-in-rspecifier> <matrix-out-wspecifier> or copy-matrix [options] <matrix-in-rxfilename> <matrix-out-wxfilename> e.g. copy-matrix --binary=false 1.mat - copy-matrix ark:2.trans ark,t:- """ po = ParseOptions(usage) po.register_bool( "binary", True, "Write in binary mode (only relevant if output is a wxfilename)") po.register_float( "scale", 1.0, "This option can be used to scale the matrices being copied.") po.register_bool( "apply-log", False, "This option can be used to apply log on the matrices. Must be avoided if matrix has negative quantities." ) po.register_bool("apply-exp", False, "This option can be used to apply exp on the matrices") po.register_float( "apply-power", 1.0, "This option can be used to apply a power on the matrices") po.register_bool( "apply-softmax-per-row", False, "This option can be used to apply softmax per row of the matrices") opts = po.parse_args()
sw02005-A sw02005 A sw02005-B sw02005 B interpreted as <utterance-id> <call-id> <side> and for each <call-id> that has two sides, does the 'only-the-louder' computation, else does per-utterance stats in the normal way. Note: loudness is judged by the first feature component, either energy or c0 only applicable to MFCCs or PLPs (this code could be modified to handle filterbanks). Usage: compute-cmvn-stats-two-channel [options] <reco2file-and-channel> <feats-rspecifier> <stats-wspecifier> e.g.: compute-cmvn-stats-two-channel data/train_unseg/reco2file_and_channel scp:data/train_unseg/feats.scp ark,t:- """ po = ParseOptions(usage) po.register_float( "quieter_channel_weight", 0.01, "For the quieter channel," " apply this weight to the stats, so that we still get " "stats if one channel always dominates.") opts = po.parse_args() if po.num_args() != 3: po.print_usage() sys.exit(1) reco2file_and_channel_rxfilename = po.get_arg(1) feats_rspecifier = po.get_arg(2) stats_wspecifier = po.get_arg(3) compute_cmvn_stats_two_channel(reco2file_and_channel_rxfilename, feats_rspecifier, stats_wspecifier, opts)
# Script-level options; `po` and `mfcc_opts` are created before this point.
po.register_int("sampling-rate", 16000,
                "Sampling rate of waveforms and labels.")
po.register_int(
    "signal-window-length", 200,
    "Window length in ms (what will be presented to the network).")
po.register_int("label-window-length", 25,
                "Window length of alignments / labels in ms.")
po.register_int("label-window-shift", 10,
                "Window shift of alignments / labels in ms.")
# The first fragment ends with a space so the concatenated help text reads
# "feature file" rather than "featurefile".
po.register_bool(
    "subtract-mean", False,
    "Subtract mean of each feature "
    "file [CMS]; not recommended to do it this way.")
po.register_int(
    "channel", -1,
    "Channel to extract (-1 -> expect mono, "
    "0 -> left, 1 -> right)")
po.register_float(
    "min-duration", 0.0,
    "Minimum duration of segments "
    "to process (in seconds).")

opts = po.parse_args()

# Exactly two positional arguments are required.
if po.num_args() != 2:
    po.print_usage()
    # Exit non-zero so a calling shell script can detect the usage error.
    sys.exit(1)

wav_rspecifier = po.get_arg(1)
feats_wspecifier = po.get_arg(2)

compute_mfcc_feats(wav_rspecifier, feats_wspecifier, opts, mfcc_opts)