def test_preprocessing(tmpdir):
    """Check that Transformation matches applying each step by hand.

    A CMVN statistics file is written into ``tmpdir`` first, then the
    pipeline (fbank -> cmvn -> delta) is run both through ``Transformation``
    and manually, and the two results are compared.
    """
    cmvn_ark = str(tmpdir.join("cmvn.ark"))
    fbank_conf = {
        "type": "fbank",
        "n_mels": 80,
        "fs": 16000,
        "n_fft": 1024,
        "n_shift": 512,
    }
    cmvn_conf = {"type": "cmvn", "stats": cmvn_ark, "norm_vars": True}
    delta_conf = {"type": "delta", "window": 2, "order": 2}
    kwargs = {
        "process": [fbank_conf, cmvn_conf, delta_conf],
        "mode": "sequential",
    }

    # Write the CMVN statistics file: row 0 holds the sums, row 1 the sums
    # of squares, and the last column of row 0 holds the sample count.
    samples = np.random.randn(100, 80)
    stats = np.empty((2, 81), dtype=np.float32)
    stats[0, :80] = samples.sum(axis=0)
    stats[1, :80] = (samples**2).sum(axis=0)
    stats[0, -1] = 100.0
    stats[1, -1] = 0.0
    kaldiio.save_mat(cmvn_ark, stats)

    batch_size = 1
    xs = [np.random.randn(1000).astype(np.float32) for _ in range(batch_size)]
    processed_xs = Transformation(kwargs)(xs)

    def _options(step):
        # Keyword arguments of one pipeline step, without its "type" tag.
        opts = dict(kwargs["process"][step])
        del opts["type"]
        return opts

    for i, wav in enumerate(xs):
        expected = logmelspectrogram(wav, **_options(0))
        expected = CMVN(**_options(1))(expected)
        expected = add_deltas(expected, **_options(2))
        np.testing.assert_allclose(processed_xs[i], expected)
def main():
    """Compute CMVN statistics from the command line and write them out.

    The output argument selects the mode: if it contains a ``:`` it is
    treated as a write specifier and statistics are written per utterance
    (or per speaker when ``--spk2utt`` is given); otherwise a single global
    statistics matrix is written to that file name.

    For each key the statistics are a float64 array whose row 0 holds the
    feature sums followed by the frame count, and whose row 1 holds the
    sums of squares followed by 0.

    Raises:
        AssertionError: If the reader yields no utterances.
        RuntimeError: In global mode, if the output file type is neither
            ``npy`` nor ``mat``.
    """
    # NOTE: adjacent string literals are concatenated verbatim, so every
    # fragment must carry its own separating punctuation/space.
    parser = argparse.ArgumentParser(
        description='Compute cepstral mean and '
                    'variance normalization statistics. '
                    'If wspecifier provided: per-utterance by default, '
                    'or per-speaker if '
                    'spk2utt option provided; if wxfilename: global',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--spk2utt', type=str,
                        help='A text file of speaker to utterance-list map. '
                             '(Don\'t give rspecifier format, such as '
                             '"ark:utt2spk")')
    parser.add_argument('--verbose', '-V', default=0, type=int,
                        help='Verbose option')
    parser.add_argument('--in-filetype', type=str, default='mat',
                        choices=['mat', 'hdf5', 'sound.hdf5', 'sound'],
                        help='Specify the file format for the rspecifier. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--out-filetype', type=str, default='mat',
                        choices=['mat', 'hdf5', 'npy'],
                        help='Specify the file format for the wspecifier. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--preprocess-conf', type=str, default=None,
                        help='The configuration file for the pre-processing')
    parser.add_argument('rspecifier', type=str,
                        help='Read specifier for feats. e.g. ark:some.ark')
    parser.add_argument('wspecifier_or_wxfilename', type=str,
                        help='Write specifier. e.g. ark:some.ark')
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    # A ":" in the output argument marks it as a write specifier.
    is_wspecifier = ':' in args.wspecifier_or_wxfilename

    if is_wspecifier:
        if args.spk2utt is not None:
            logging.info('Performing as speaker CMVN mode')
            # Invert the "spk utt1 utt2 ..." lines into an utt -> spk map.
            utt2spk_dict = {}
            with open(args.spk2utt) as f:
                for line in f:
                    spk, utts = line.rstrip().split(None, 1)
                    for utt in utts.split():
                        utt2spk_dict[utt] = spk

            def utt2spk(x):
                return utt2spk_dict[x]
        else:
            logging.info('Performing as utterance CMVN mode')

            def utt2spk(x):
                return x

        if args.out_filetype == 'npy':
            logging.warning('--out-filetype npy is allowed only for '
                            'Global CMVN mode, changing to hdf5')
            args.out_filetype = 'hdf5'

    else:
        logging.info('Performing as global CMVN mode')
        if args.spk2utt is not None:
            logging.warning('spk2utt is not used for global CMVN mode')

        # Every utterance is accumulated under the single key None.
        def utt2spk(x):
            return None

        if args.out_filetype == 'hdf5':
            logging.warning('--out-filetype hdf5 is not allowed for '
                            'Global CMVN mode, changing to npy')
            args.out_filetype = 'npy'

    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info('Apply preprocessing: {}'.format(preprocessing))
    else:
        preprocessing = None

    # Calculate stats for each speaker
    counts = {}
    sum_feats = {}
    square_sum_feats = {}

    idx = 0
    for idx, (utt, matrix) in enumerate(
            FileReaderWrapper(args.rspecifier, args.in_filetype), 1):
        if is_scipy_wav_style(matrix):
            # If data is sound file, then got as Tuple[int, ndarray]
            rate, matrix = matrix
        if preprocessing is not None:
            matrix = preprocessing(matrix, uttid_list=utt)

        spk = utt2spk(utt)

        # Init at the first seen of the spk
        if spk not in counts:
            counts[spk] = 0
            feat_shape = matrix.shape[1:]
            # Accumulate in double precision
            sum_feats[spk] = np.zeros(feat_shape, dtype=np.float64)
            square_sum_feats[spk] = np.zeros(feat_shape, dtype=np.float64)

        counts[spk] += matrix.shape[0]
        sum_feats[spk] += matrix.sum(axis=0)
        square_sum_feats[spk] += (matrix**2).sum(axis=0)
    logging.info('Processed {} utterances'.format(idx))
    assert idx > 0, idx

    cmvn_stats = {}
    for spk in counts:
        feat_shape = sum_feats[spk].shape
        # One extra slot on the first feature axis stores the frame count.
        cmvn_shape = (2, feat_shape[0] + 1) + feat_shape[1:]
        _cmvn_stats = np.empty(cmvn_shape, dtype=np.float64)
        _cmvn_stats[0, :-1] = sum_feats[spk]
        _cmvn_stats[1, :-1] = square_sum_feats[spk]

        _cmvn_stats[0, -1] = counts[spk]
        _cmvn_stats[1, -1] = 0.

        # You can get the mean and std as following,
        # >>> N = _cmvn_stats[0, -1]
        # >>> mean = _cmvn_stats[0, :-1] / N
        # >>> std = np.sqrt(_cmvn_stats[1, :-1] / N - mean ** 2)

        cmvn_stats[spk] = _cmvn_stats

    # Per utterance or speaker CMVN
    if is_wspecifier:
        with FileWriterWrapper(args.wspecifier_or_wxfilename,
                               filetype=args.out_filetype) as writer:
            for spk, mat in cmvn_stats.items():
                writer[spk] = mat

    # Global CMVN
    else:
        matrix = cmvn_stats[None]
        if args.out_filetype == 'npy':
            np.save(args.wspecifier_or_wxfilename, matrix)
        elif args.out_filetype == 'mat':
            # Kaldi supports only matrix or vector
            kaldiio.save_mat(args.wspecifier_or_wxfilename, matrix)
        else:
            raise RuntimeError('Not supporting: --out-filetype {}'.format(
                args.out_filetype))
def test_write_read_mat(tmpdir, endian, dtype):
    """Round-trip a random matrix through save_mat/load_mat unchanged."""
    out_dir = tmpdir.mkdir('test')
    expected = np.random.rand(1000, 120).astype(dtype)

    # Write with the requested byte order, then read back the same way.
    kaldiio.save_mat(out_dir.join('a.mat').strpath, expected, endian=endian)
    loaded = kaldiio.load_mat(out_dir.join('a.mat').strpath, endian=endian)

    np.testing.assert_array_equal(loaded, expected)
import kaldiio
import os
import shutil
import pandas as pd

if __name__ == "__main__":
    # Script: load a Kaldi matrix, keep a slice of `--tgt_dim` rows, and
    # write the result to a new Kaldi matrix file.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("lda_mat", help="original lda matrix (kaldi .mat)")
    parser.add_argument("out_lda_mat", help="new lda matrix to write on")
    parser.add_argument("--tgt_dim", type=int, default=5,
                        help="number of dims kept (from end of the matrix)")
    # Parse once, strictly: unknown arguments make argparse exit with an
    # error, so a second tolerant parse could never see any leftovers.
    args = parser.parse_args()

    # Load the original matrix.
    orig_mat = kaldiio.load_mat(args.lda_mat)

    # The slice below needs at least tgt_dim + 1 rows, so the original
    # matrix must have strictly more rows than the target dimension.
    if orig_mat.shape[0] <= args.tgt_dim:
        raise ValueError(
            "Original matrix has less or same amount of dimensions ({}) "
            "than the target dim chosen for the new matrix ({})"
            .format(orig_mat.shape[0], args.tgt_dim))

    # Keep tgt_dim consecutive rows ending just before the last row.
    # NOTE(review): the final row is excluded by the `:-1` bound although
    # the help text says "from end of the matrix" - confirm this is intended.
    new_mat = orig_mat[-args.tgt_dim - 1:-1]
    kaldiio.save_mat(args.out_lda_mat, new_mat)
def main():
    """Accumulate CMVN statistics over all input utterances and save them.

    When the output argument contains ``:`` it is handled as a write
    specifier and one statistics matrix is written per utterance, or per
    speaker if ``--spk2utt`` is given. Otherwise a single global matrix is
    saved to the given file name as ``npy`` or Kaldi ``mat``.

    Raises:
        AssertionError: If no utterance was read.
        RuntimeError: In global mode, if the output file type is neither
            ``npy`` nor ``mat``.
    """
    args = get_parser().parse_args()

    # INFO messages are shown only when verbosity was requested.
    log_level = logging.INFO if args.verbose > 0 else logging.WARN
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )
    logging.info(get_commandline_args())

    is_wspecifier = ":" in args.wspecifier_or_wxfilename

    if not is_wspecifier:
        logging.info("Performing as global CMVN mode")
        if args.spk2utt is not None:
            logging.warning("spk2utt is not used for global CMVN mode")

        # All utterances share the single key None.
        def utt2spk(x):
            return None

        if args.out_filetype == "hdf5":
            logging.warning(
                "--out-filetype hdf5 is not allowed for Global CMVN mode, "
                "changing to npy"
            )
            args.out_filetype = "npy"
    else:
        if args.spk2utt is None:
            logging.info("Performing as utterance CMVN mode")

            def utt2spk(x):
                return x

        else:
            logging.info("Performing as speaker CMVN mode")
            # Each line reads "spk utt1 utt2 ..."; build the reverse map.
            utt2spk_dict = {}
            with open(args.spk2utt) as f:
                for line in f:
                    spk, utts = line.rstrip().split(None, 1)
                    utt2spk_dict.update(dict.fromkeys(utts.split(), spk))

            def utt2spk(x):
                return utt2spk_dict[x]

        if args.out_filetype == "npy":
            logging.warning(
                "--out-filetype npy is allowed only for Global CMVN mode, "
                "changing to hdf5"
            )
            args.out_filetype = "hdf5"

    preprocessing = None
    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info("Apply preprocessing: {}".format(preprocessing))

    # Running totals, keyed by speaker (or utterance / None, per mode).
    frame_counts = {}
    feat_sums = {}
    feat_sq_sums = {}

    n_utts = 0
    reader = file_reader_helper(args.rspecifier, args.in_filetype)
    for n_utts, (utt, matrix) in enumerate(reader, 1):
        if is_scipy_wav_style(matrix):
            # Sound input arrives as a (rate, ndarray) pair; drop the rate.
            _, matrix = matrix
        if preprocessing is not None:
            matrix = preprocessing(matrix, uttid_list=utt)

        spk = utt2spk(utt)

        if spk not in frame_counts:
            # First occurrence of this key: start float64 accumulators.
            per_frame_shape = matrix.shape[1:]
            frame_counts[spk] = 0
            feat_sums[spk] = np.zeros(per_frame_shape, dtype=np.float64)
            feat_sq_sums[spk] = np.zeros(per_frame_shape, dtype=np.float64)

        frame_counts[spk] += matrix.shape[0]
        feat_sums[spk] += matrix.sum(axis=0)
        feat_sq_sums[spk] += (matrix**2).sum(axis=0)
    logging.info("Processed {} utterances".format(n_utts))
    assert n_utts > 0, n_utts

    cmvn_stats = {}
    for spk, n_frames in frame_counts.items():
        total = feat_sums[spk]
        # One extra slot on the first feature axis carries the frame count.
        stats_shape = (2, total.shape[0] + 1) + total.shape[1:]
        stats = np.empty(stats_shape, dtype=np.float64)
        stats[0, :-1] = total
        stats[1, :-1] = feat_sq_sums[spk]
        stats[0, -1] = n_frames
        stats[1, -1] = 0.0

        # Mean and std can be recovered as:
        # >>> N = stats[0, -1]
        # >>> mean = stats[0, :-1] / N
        # >>> std = np.sqrt(stats[1, :-1] / N - mean ** 2)
        cmvn_stats[spk] = stats

    if is_wspecifier:
        # Per-utterance or per-speaker statistics.
        with file_writer_helper(
            args.wspecifier_or_wxfilename, filetype=args.out_filetype
        ) as writer:
            for spk, mat in cmvn_stats.items():
                writer[spk] = mat
        return

    # Global statistics.
    global_stats = cmvn_stats[None]
    if args.out_filetype == "npy":
        np.save(args.wspecifier_or_wxfilename, global_stats)
    elif args.out_filetype == "mat":
        # Kaldi supports only matrix or vector
        kaldiio.save_mat(args.wspecifier_or_wxfilename, global_stats)
    else:
        raise RuntimeError(
            "Not supporting: --out-filetype {}".format(args.out_filetype)
        )
def compute_cmvn_stats():
    """Compute CMVN statistics from Kaldi features and write them out.

    e.g. compute_cmvn_stats.py scp:data/train/feats.scp data/train/cmvn.ark
    # compute global cmvn

    If the output argument contains ``:`` it is used as a write specifier
    and statistics are written per utterance (or per speaker when
    ``spk2utt`` is given); otherwise one global matrix is saved with
    ``kaldiio.save_mat``.

    Each statistics matrix is float64: row 0 holds the feature sums followed
    by the frame count, row 1 the sums of squares followed by 0.

    Raises:
        AssertionError: If the reader yields no utterances.
    """
    args = get_parser().parse_args()

    is_wspecifier = ':' in args.wspecifier_or_wxfilename

    if is_wspecifier:
        if args.spk2utt is not None:
            # Invert the "spk utt1 utt2 ..." lines into an utt -> spk map.
            utt2spk_dict = {}
            with open(args.spk2utt) as f:
                for line in f:
                    spk, utts = line.rstrip().split(None, 1)
                    for utt in utts.split():
                        utt2spk_dict[utt] = spk

            def utt2spk(x):
                return utt2spk_dict[x]
        else:
            logging.info('Performing as utterance CMVN mode')

            def utt2spk(x):
                return x
    else:
        logging.info('Performing as global CMVN mode')
        if args.spk2utt is not None:
            logging.warning('spk2utt is not used for global CMVN mode')

        # Every utterance is accumulated under the single key None.
        def utt2spk(x):
            return None

    # Calculate stats for each speaker
    counts = {}
    sum_feats = {}
    square_sum_feats = {}

    idx = 0
    for idx, (utt, matrix) in enumerate(KaldiReader(args.rspecifier), 1):
        spk = utt2spk(utt)
        if spk not in counts:
            counts[spk] = 0
            feat_shape = matrix.shape[1:]
            # np.float64 rather than the removed `np.float` alias
            # (AttributeError on NumPy >= 1.24); the dtype is the same.
            sum_feats[spk] = np.zeros(feat_shape, dtype=np.float64)
            square_sum_feats[spk] = np.zeros(feat_shape, dtype=np.float64)

        counts[spk] += matrix.shape[0]
        sum_feats[spk] += matrix.sum(axis=0)
        square_sum_feats[spk] += (matrix**2).sum(axis=0)

    assert idx > 0, idx

    cmvn_stats = {}
    for spk in counts:
        feat_shape = sum_feats[spk].shape
        # One extra slot on the first feature axis stores the frame count.
        cmvn_shape = (2, feat_shape[0] + 1) + feat_shape[1:]
        _cmvn_stats = np.empty(cmvn_shape, dtype=np.float64)
        _cmvn_stats[0, :-1] = sum_feats[spk]
        _cmvn_stats[1, :-1] = square_sum_feats[spk]

        _cmvn_stats[0, -1] = counts[spk]
        _cmvn_stats[1, -1] = 0.

        cmvn_stats[spk] = _cmvn_stats

    if is_wspecifier:
        with KaldiWriter(args.wspecifier_or_wxfilename) as writer:
            for spk, mat in cmvn_stats.items():
                writer[spk] = mat
    else:
        matrix = cmvn_stats[None]
        kaldiio.save_mat(args.wspecifier_or_wxfilename, matrix)
def write_cmvn_stats(path, cmvn_type, stats_dict):
    """Write CMVN statistics to ``path`` with kaldiio.

    Every value of ``stats_dict`` is converted with its ``to_numpy()``
    method. For ``cmvn_type == "global"`` only the entry stored under the
    key ``None`` is saved, as a single matrix; for any other type the whole
    mapping is saved as an ark.
    """
    arrays = {}
    for spk, stats in stats_dict.items():
        arrays[spk] = stats.to_numpy()

    if cmvn_type != "global":
        kaldiio.save_ark(path, arrays)
        return

    kaldiio.save_mat(path, arrays[None])