def main():
    """Compute a spectrogram for each input utterance and write it out.

    Arguments come from ``get_parser()``.  Every array read from
    ``args.rspecifier`` is cast to float32, optionally divided by
    ``2 ** (args.normalize - 1)``, passed to ``spectrogram`` and stored in
    the writer under its utterance id.
    """
    args = get_parser().parse_args()

    # INFO level only when verbosity was requested, WARN otherwise
    log_level = logging.INFO if args.verbose > 0 else logging.WARN
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    logging.info(get_commandline_args())

    # Scaling is skipped when no bit depth is given or when it equals 1
    needs_scaling = args.normalize is not None and args.normalize != 1

    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader:
        with FileWriterWrapper(
                args.wspecifier,
                filetype=args.filetype,
                write_num_frames=args.write_num_frames,
                compress=args.compress,
                compression_method=args.compression_method) as writer:
            # The first element of each value is not needed here
            for utt_id, (_, samples) in reader:
                samples = samples.astype(numpy.float32)
                if needs_scaling:
                    samples = samples / (1 << (args.normalize - 1))
                writer[utt_id] = spectrogram(
                    x=samples,
                    n_fft=args.n_fft,
                    n_shift=args.n_shift,
                    win_length=args.win_length,
                    window=args.window)
def main():
    """Apply CMVN to each input item and write the normalized result.

    The stats argument is treated as a read specifier when it contains a
    colon (stats are then looked up by utterance id); otherwise it is
    loaded as a single stats file stored under the key ``None``.
    """
    args = get_parser().parse_args()

    # logging info
    log_level = logging.INFO if args.verbose > 0 else logging.WARN
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    logging.info(get_commandline_args())

    stats_source = args.stats_rspecifier_or_rxfilename
    keyed_stats = ':' in stats_source
    if keyed_stats:
        # 'npy' is read as 'hdf5' when the stats come from a read specifier
        stats_filetype = ('hdf5' if args.stats_filetype == 'npy'
                          else args.stats_filetype)
        stats_dict = dict(FileReaderWrapper(stats_source, stats_filetype))
    else:
        if args.stats_filetype == 'mat':
            single_stats = kaldiio.load_mat(stats_source)
        else:
            single_stats = numpy.load(stats_source)
        stats_dict = {None: single_stats}

    cmvn = CMVN(stats=stats_dict,
                norm_means=args.norm_means,
                norm_vars=args.norm_vars,
                utt2spk=args.utt2spk,
                spk2utt=args.spk2utt,
                reverse=args.reverse)

    writer_options = dict(filetype=args.out_filetype,
                          write_num_frames=args.write_num_frames,
                          compress=args.compress,
                          compression_method=args.compression_method)
    with FileWriterWrapper(args.wspecifier, **writer_options) as writer:
        for utt, feats in FileReaderWrapper(args.rspecifier,
                                            args.in_filetype):
            if is_scipy_wav_style(feats):
                # If data is sound file, then got as Tuple[int, ndarray]
                _, feats = feats
            lookup_key = utt if keyed_stats else None
            writer[utt] = cmvn(feats, lookup_key)
def main():
    """Copy items from a read specifier to a write specifier.

    Each item may optionally be run through the ``Transformation`` built
    from ``args.preprocess_conf``.  When the output file type is
    ``sound`` or ``sound.hdf5`` the item is written as a scipy-style
    ``(rate, array)`` tuple.

    :raises RuntimeError: if a sound output type is requested but the
        input item carries no sampling rate
    """
    parser = get_parser()
    args = parser.parse_args()

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info('Apply preprocessing: {}'.format(preprocessing))
    else:
        preprocessing = None

    with FileWriterWrapper(
            args.wspecifier,
            filetype=args.out_filetype,
            write_num_frames=args.write_num_frames,
            compress=args.compress,
            compression_method=args.compression_method) as writer:
        for utt, mat in FileReaderWrapper(args.rspecifier, args.in_filetype):
            # Reset for every utterance so that a rate left over from an
            # earlier item is never written for this one
            rate = None
            if is_scipy_wav_style(mat):
                # If data is sound file, then got as Tuple[int, ndarray]
                rate, mat = mat

            if preprocessing is not None:
                mat = preprocessing(mat, uttid_list=utt)

            # shape = (Time, Channel)
            if args.out_filetype in ['sound.hdf5', 'sound']:
                if rate is None:
                    raise RuntimeError(
                        'Cannot write {} as --out-filetype {}: the input '
                        'has no sampling rate'.format(utt, args.out_filetype))
                # Write Tuple[int, numpy.ndarray] (scipy style)
                writer[utt] = (rate, mat)
            else:
                writer[utt] = mat
def main():
    """Compute CMVN statistics and write them out.

    When the output argument contains a colon it is treated as a write
    specifier and one stats array is written per utterance, or per speaker
    if ``--spk2utt`` is given.  Otherwise a single global stats array is
    saved to the given file name.

    Each stats array has two rows: row 0 holds the feature sums followed
    by the frame count, row 1 holds the sums of squares followed by 0.

    :raises RuntimeError: if the input contains no utterance, or if the
        output file type is not supported in global mode
    """
    parser = argparse.ArgumentParser(
        description='Compute cepstral mean and '
                    'variance normalization statistics. '
                    'If wspecifier provided: per-utterance by default, '
                    'or per-speaker if '
                    'spk2utt option provided; if wxfilename: global',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--spk2utt', type=str,
                        help='A text file of speaker to utterance-list map. '
                             '(Don\'t give rspecifier format, such as '
                             '"ark:spk2utt")')
    parser.add_argument('--verbose', '-V', default=0, type=int,
                        help='Verbose option')
    parser.add_argument('--in-filetype', type=str, default='mat',
                        choices=['mat', 'hdf5', 'sound.hdf5', 'sound'],
                        help='Specify the file format for the rspecifier. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--out-filetype', type=str, default='mat',
                        choices=['mat', 'hdf5', 'npy'],
                        help='Specify the file format for the wspecifier. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--preprocess-conf', type=str, default=None,
                        help='The configuration file for the pre-processing')
    parser.add_argument('rspecifier', type=str,
                        help='Read specifier for feats. e.g. ark:some.ark')
    parser.add_argument('wspecifier_or_wxfilename', type=str,
                        help='Write specifier. e.g. ark:some.ark')
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    is_wspecifier = ':' in args.wspecifier_or_wxfilename

    if is_wspecifier:
        if args.spk2utt is not None:
            logging.info('Performing as speaker CMVN mode')
            utt2spk_dict = {}
            with open(args.spk2utt) as f:
                for line in f:
                    fields = line.split()
                    if not fields:
                        # Tolerate blank lines instead of failing to unpack
                        continue
                    spk = fields[0]
                    for utt in fields[1:]:
                        utt2spk_dict[utt] = spk

            def utt2spk(x):
                return utt2spk_dict[x]
        else:
            logging.info('Performing as utterance CMVN mode')

            def utt2spk(x):
                return x

        if args.out_filetype == 'npy':
            logging.warning('--out-filetype npy is allowed only for '
                            'Global CMVN mode, changing to hdf5')
            args.out_filetype = 'hdf5'

    else:
        logging.info('Performing as global CMVN mode')
        if args.spk2utt is not None:
            logging.warning('spk2utt is not used for global CMVN mode')

        def utt2spk(x):
            return None

        if args.out_filetype == 'hdf5':
            logging.warning('--out-filetype hdf5 is not allowed for '
                            'Global CMVN mode, changing to npy')
            args.out_filetype = 'npy'

    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info('Apply preprocessing: {}'.format(preprocessing))
    else:
        preprocessing = None

    # Calculate stats for each speaker
    counts = {}
    sum_feats = {}
    square_sum_feats = {}

    idx = 0
    for idx, (utt, matrix) in enumerate(
            FileReaderWrapper(args.rspecifier, args.in_filetype), 1):
        if is_scipy_wav_style(matrix):
            # If data is sound file, then got as Tuple[int, ndarray]
            rate, matrix = matrix
        if preprocessing is not None:
            matrix = preprocessing(matrix, uttid_list=utt)

        spk = utt2spk(utt)

        # Init at the first seen of the spk
        if spk not in counts:
            counts[spk] = 0
            feat_shape = matrix.shape[1:]
            # Accumulate in double precision
            sum_feats[spk] = np.zeros(feat_shape, dtype=np.float64)
            square_sum_feats[spk] = np.zeros(feat_shape, dtype=np.float64)

        counts[spk] += matrix.shape[0]
        sum_feats[spk] += matrix.sum(axis=0)
        square_sum_feats[spk] += (matrix ** 2).sum(axis=0)
    logging.info('Processed {} utterances'.format(idx))
    if idx == 0:
        # Explicit error: an assert would be stripped under "python -O"
        raise RuntimeError(
            'No utterance was read from {}'.format(args.rspecifier))

    cmvn_stats = {}
    for spk in counts:
        feat_shape = sum_feats[spk].shape
        # One extra column holds the frame count
        cmvn_shape = (2, feat_shape[0] + 1) + feat_shape[1:]
        _cmvn_stats = np.empty(cmvn_shape, dtype=np.float64)
        _cmvn_stats[0, :-1] = sum_feats[spk]
        _cmvn_stats[1, :-1] = square_sum_feats[spk]

        _cmvn_stats[0, -1] = counts[spk]
        _cmvn_stats[1, -1] = 0.

        # You can get the mean and std as following,
        # >>> N = _cmvn_stats[0, -1]
        # >>> mean = _cmvn_stats[0, :-1] / N
        # >>> std = np.sqrt(_cmvn_stats[1, :-1] / N - mean ** 2)

        cmvn_stats[spk] = _cmvn_stats

    # Per utterance or speaker CMVN
    if is_wspecifier:
        with FileWriterWrapper(args.wspecifier_or_wxfilename,
                               filetype=args.out_filetype) as writer:
            for spk, mat in cmvn_stats.items():
                writer[spk] = mat

    # Global CMVN
    else:
        matrix = cmvn_stats[None]
        if args.out_filetype == 'npy':
            np.save(args.wspecifier_or_wxfilename, matrix)
        elif args.out_filetype == 'mat':
            # Kaldi supports only matrix or vector
            kaldiio.save_mat(args.wspecifier_or_wxfilename, matrix)
        else:
            raise RuntimeError('Not supporting: --out-filetype {}'.format(
                args.out_filetype))
def enhance(args):
    """Dumping enhanced speech and mask

    Loads the trained model, runs ``model.enhance`` batch by batch over the
    utterances listed in ``args.recog_json`` (longest first), optionally
    saves up to ``args.num_images`` spectrogram plots into
    ``args.image_dir`` and optionally writes the enhanced signals through
    ``args.enh_wspecifier``.  Processing stops as soon as all requested
    images are saved and there is no writer.

    :param Namespace args: The program arguments
    """
    set_deterministic_pytorch(args)
    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    # load trained model parameters
    logging.info('reading model parameters from ' + args.model)
    model_class = dynamic_import(train_args.model_module)
    model = model_class(idim, odim, train_args)
    assert isinstance(model, ASRInterface)
    torch_load(args.model, model)
    model.recog_args = args

    # gpu
    if args.ngpu == 1:
        gpu_id = list(range(args.ngpu))
        logging.info('gpu id: ' + str(gpu_id))
        model.cuda()

    # read json data
    with open(args.recog_json, 'rb') as f:
        js = json.load(f)['utts']

    load_inputs_and_targets = LoadInputsAndTargets(
        mode='asr', load_output=False, sort_in_input_length=False,
        preprocess_conf=None  # Apply pre_process in outer func
    )
    if args.batchsize == 0:
        args.batchsize = 1

    # Creates writers for outputs from the network
    if args.enh_wspecifier is not None:
        enh_writer = FileWriterWrapper(args.enh_wspecifier,
                                       filetype=args.enh_filetype)
    else:
        enh_writer = None

    # Creates a Transformation instance
    preprocess_conf = (
        train_args.preprocess_conf if args.preprocess_conf is None
        else args.preprocess_conf)
    if preprocess_conf is not None:
        logging.info('Use preprocessing: {}'.format(preprocess_conf))
        transform = Transformation(preprocess_conf)
    else:
        transform = None

    # Creates a IStft instance
    istft = None
    frame_shift = args.istft_n_shift  # Used for plot the spectrogram
    if args.apply_istft:
        if preprocess_conf is not None:
            # Read the conffile and find stft setting
            with open(preprocess_conf) as f:
                # Json format: e.g.
                #    {"process": [{"type": "stft",
                #                  "win_length": 400,
                #                  "n_fft": 512, "n_shift": 160,
                #                  "window": "hann"},
                #                 {"type": "foo", ...}, ...]}
                conf = json.load(f)
                assert 'process' in conf, conf
                # Find stft setting
                for p in conf['process']:
                    if p['type'] == 'stft':
                        istft = IStft(win_length=p['win_length'],
                                      n_shift=p['n_shift'],
                                      window=p.get('window', 'hann'))
                        logging.info('stft is found in {}. '
                                     'Setting istft config from it\n{}'.format(
                                         preprocess_conf, istft))
                        frame_shift = p['n_shift']
                        break
        if istft is None:
            # Set from command line arguments
            istft = IStft(win_length=args.istft_win_length,
                          n_shift=args.istft_n_shift,
                          window=args.istft_window)
            logging.info(
                'Setting istft config from the command line args\n{}'.format(
                    istft))

    # sort data
    keys = list(js.keys())
    feat_lens = [js[key]['input'][0]['shape'][0] for key in keys]
    sorted_index = sorted(range(len(feat_lens)), key=lambda i: -feat_lens[i])
    keys = [keys[i] for i in sorted_index]

    def grouper(n, iterable, fillvalue=None):
        kargs = [iter(iterable)] * n
        return zip_longest(*kargs, fillvalue=fillvalue)

    num_images = 0
    # image_dir may be None (it is checked for None before plotting below)
    if args.image_dir is not None and not os.path.exists(args.image_dir):
        os.makedirs(args.image_dir)

    for names in grouper(args.batchsize, keys, None):
        # The last group is padded with None by zip_longest; drop the
        # padding so that it is never used as a key of js
        names = [name for name in names if name is not None]
        batch = [(name, js[name]) for name in names]

        # May be in time region: (Batch, [Time, Channel])
        org_feats = load_inputs_and_targets(batch)[0]
        if transform is not None:
            # May be in time-freq region: : (Batch, [Time, Channel, Freq])
            feats = transform(org_feats, train=False)
        else:
            feats = org_feats

        with torch.no_grad():
            enhanced, mask, ilens = model.enhance(feats)

        for idx, name in enumerate(names):
            # Assuming mask, feats : [Batch, Time, Channel. Freq]
            #          enhanced    : [Batch, Time, Freq]
            enh = enhanced[idx][:ilens[idx]]
            mas = mask[idx][:ilens[idx]]
            feat = feats[idx]

            # Plot spectrogram
            if args.image_dir is not None and num_images < args.num_images:
                import matplotlib.pyplot as plt
                num_images += 1
                ref_ch = 0

                plt.figure(figsize=(20, 10))
                plt.subplot(4, 1, 1)
                plt.title('Mask [ref={}ch]'.format(ref_ch))
                plot_spectrogram(plt, mas[:, ref_ch].T, fs=args.fs,
                                 mode='linear', frame_shift=frame_shift,
                                 bottom=False, labelbottom=False)

                plt.subplot(4, 1, 2)
                plt.title('Noisy speech [ref={}ch]'.format(ref_ch))
                plot_spectrogram(plt, feat[:, ref_ch].T, fs=args.fs,
                                 mode='db', frame_shift=frame_shift,
                                 bottom=False, labelbottom=False)

                plt.subplot(4, 1, 3)
                plt.title('Masked speech [ref={}ch]'.format(ref_ch))
                plot_spectrogram(
                    plt, (feat[:, ref_ch] * mas[:, ref_ch]).T,
                    frame_shift=frame_shift,
                    fs=args.fs, mode='db', bottom=False, labelbottom=False)

                plt.subplot(4, 1, 4)
                plt.title('Enhanced speech')
                plot_spectrogram(plt, enh.T, fs=args.fs,
                                 mode='db', frame_shift=frame_shift)

                plt.savefig(os.path.join(args.image_dir, name + '.png'))
                plt.clf()
                # A new figure is created per utterance; close it so that
                # figures do not pile up in memory
                plt.close()

            # Write enhanced wave files
            if enh_writer is not None:
                if istft is not None:
                    enh = istft(enh)

                if args.keep_length:
                    org_len = len(org_feats[idx])
                    if org_len < len(enh):
                        # Truncate the frames added by stft padding
                        enh = enh[:org_len]
                    elif org_len > len(enh):
                        # Zero-pad the tail up to the original length
                        padwidth = [(0, org_len - len(enh))] \
                            + [(0, 0)] * (enh.ndim - 1)
                        enh = np.pad(enh, padwidth, mode='constant')

                if args.enh_filetype in ('sound', 'sound.hdf5'):
                    enh_writer[name] = (args.fs, enh)
                else:
                    # Hint: To dump stft_signal, mask or etc,
                    # enh_filetype='hdf5' might be convenient.
                    enh_writer[name] = enh

            if num_images >= args.num_images and enh_writer is None:
                # Nothing left to plot or write: stop instead of running
                # the model on the remaining batches
                logging.info('Breaking the process.')
                return
def main():
    """Copy items from a read specifier to a write specifier.

    Each item may optionally be run through the ``Transformation`` built
    from ``--preprocess-conf`` before it is written.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    cli.add_argument('--verbose', '-V', default=0, type=int,
                     help='Verbose option')
    cli.add_argument('--in-filetype', type=str, default='mat',
                     choices=['mat', 'hdf5', 'sound.hdf5', 'sound'],
                     help='Specify the file format for the rspecifier. '
                          '"mat" is the matrix format in kaldi')
    cli.add_argument('--out-filetype', type=str, default='mat',
                     choices=['mat', 'hdf5'],
                     help='Specify the file format for the wspecifier. '
                          '"mat" is the matrix format in kaldi')
    cli.add_argument('--write-num-frames', type=str,
                     help='Specify wspecifer for utt2num_frames')
    cli.add_argument('--compress', type=strtobool, default=False,
                     help='Save in compressed format')
    cli.add_argument('--compression-method', type=int, default=2,
                     help='Specify the method(if mat) or gzip-level(if hdf5)')
    cli.add_argument('--preprocess-conf', type=str, default=None,
                     help='The configuration file for the pre-processing')
    cli.add_argument('rspecifier', type=str,
                     help='Read specifier for feats. e.g. ark:some.ark')
    cli.add_argument('wspecifier', type=str,
                     help='Write specifier. e.g. ark:some.ark')
    args = cli.parse_args()

    # logging info
    log_level = logging.INFO if args.verbose > 0 else logging.WARN
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    logging.info(get_commandline_args())

    preprocessing = None
    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info('Apply preprocessing: {}'.format(preprocessing))

    writer_options = dict(filetype=args.out_filetype,
                          write_num_frames=args.write_num_frames,
                          compress=args.compress,
                          compression_method=args.compression_method)
    with FileWriterWrapper(args.wspecifier, **writer_options) as writer:
        for utt, item in FileReaderWrapper(args.rspecifier,
                                           args.in_filetype):
            if is_scipy_wav_style(item):
                # If data is sound file, then got as Tuple[int, ndarray]
                _, item = item
            if preprocessing is not None:
                item = preprocessing(item, uttid_list=utt)
            writer[utt] = item
def main():
    """Read waveforms and write them to a write specifier.

    Arrays are cast to float32 for the ``mat`` file type and, when
    ``--normalize`` is given with a value other than 1, divided by
    ``2 ** (normalize - 1)``.  For both sound file types (``sound`` and
    ``sound.hdf5``) the item is written as a scipy-style
    ``(rate, array)`` tuple.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--write-num-frames', type=str,
                        help='Specify wspecifer for utt2num_frames')
    parser.add_argument('--filetype', type=str, default='mat',
                        choices=['mat', 'hdf5', 'sound.hdf5', 'sound'],
                        help='Specify the file format for output. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--compress', type=strtobool, default=False,
                        help='Save in compressed format')
    parser.add_argument(
        '--compression-method', type=int, default=2,
        help='Specify the method(if mat) or gzip-level(if hdf5)')
    parser.add_argument('--verbose', '-V', default=0, type=int,
                        help='Verbose option')
    parser.add_argument('--normalize', choices=[1, 16, 24, 32], type=int,
                        default=None,
                        help='Give the bit depth of the PCM, '
                             'then normalizes data to scale in [-1,1]')
    parser.add_argument('rspecifier', type=str, nargs='+',
                        help='WAV scp file')
    parser.add_argument(
        '--segments', type=str,
        help='segments-file format: each line is either '
             '<segment-id> <recording-id> <start-time> <end-time> '
             'e.g. call-861225-A-0050-0065 call-861225-A 5.0 6.5')
    parser.add_argument('wspecifier', type=str, help='Write specifier')
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    with FileWriterWrapper(
            args.wspecifier,
            filetype=args.filetype,
            write_num_frames=args.write_num_frames,
            compress=args.compress,
            compression_method=args.compression_method) as writer:
        for utt_id, (rate, array) in wav_generator(args.rspecifier,
                                                   args.segments):
            if args.filetype == 'mat':
                # Kaldi-matrix doesn't support integer
                array = array.astype(numpy.float32)

            if args.normalize is not None and args.normalize != 1:
                array = array.astype(numpy.float32)
                array = array / (1 << (args.normalize - 1))

            # shape = (Time, Channel)
            if args.filetype in ['sound.hdf5', 'sound']:
                # Write Tuple[int, numpy.ndarray] (scipy style); both
                # sound file types take the sampling rate with the data
                writer[utt_id] = (rate, array)
            else:
                writer[utt_id] = array
def main():
    """Read waveforms, optionally preprocess them, and write them out.

    Arrays are cast to float32 for the ``mat`` file type, reshaped from
    (Time) to (Time, Channel) when one-dimensional, and divided by
    ``2 ** (normalize - 1)`` when ``--normalize`` is given with a value
    other than 1.  If ``--preprocess-conf`` is given, the output of the
    preprocessing is cast back to the input dtype and, with
    ``--keep-length``, truncated or zero-padded along the first axis to
    the input length.  For the sound file types the item is written as a
    scipy-style ``(rate, array)`` tuple.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--write-num-frames', type=str,
                        help='Specify wspecifer for utt2num_frames')
    parser.add_argument('--filetype', type=str, default='mat',
                        choices=['mat', 'hdf5', 'sound.hdf5', 'sound'],
                        help='Specify the file format for output. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--format', type=str, default=None,
                        help='The file format for output pcm. '
                             'This option is only valid '
                             'when "--filetype" is "sound.hdf5" or "sound"')
    parser.add_argument('--compress', type=strtobool, default=False,
                        help='Save in compressed format')
    parser.add_argument(
        '--compression-method', type=int, default=2,
        help='Specify the method(if mat) or gzip-level(if hdf5)')
    parser.add_argument('--verbose', '-V', default=0, type=int,
                        help='Verbose option')
    parser.add_argument('--normalize', choices=[1, 16, 24, 32], type=int,
                        default=None,
                        help='Give the bit depth of the PCM, '
                             'then normalizes data to scale in [-1,1]')
    parser.add_argument('--preprocess-conf', type=str, default=None,
                        help='The configuration file for the pre-processing')
    parser.add_argument('--keep-length', type=strtobool, default=True,
                        help='Truncating or zero padding if the output length '
                             'is changed from the input by preprocessing')
    parser.add_argument('rspecifier', type=str, help='WAV scp file')
    parser.add_argument(
        '--segments', type=str,
        help='segments-file format: each line is either '
             '<segment-id> <recording-id> <start-time> <end-time> '
             'e.g. call-861225-A-0050-0065 call-861225-A 5.0 6.5')
    parser.add_argument('wspecifier', type=str, help='Write specifier')
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info('Apply preprocessing: {}'.format(preprocessing))
    else:
        preprocessing = None

    with FileWriterWrapper(
            args.wspecifier,
            filetype=args.filetype,
            write_num_frames=args.write_num_frames,
            compress=args.compress,
            compression_method=args.compression_method,
            pcm_format=args.format) as writer, \
            kaldiio.ReadHelper(args.rspecifier,
                               segments=args.segments) as reader:
        for utt_id, (rate, array) in reader:
            if args.filetype == 'mat':
                # Kaldi-matrix doesn't support integer
                array = array.astype(numpy.float32)

            if array.ndim == 1:
                # (Time) -> (Time, Channel)
                array = array[:, None]

            if args.normalize is not None and args.normalize != 1:
                array = array.astype(numpy.float32)
                array = array / (1 << (args.normalize - 1))

            if preprocessing is not None:
                orgtype = array.dtype
                out = preprocessing(array, uttid_list=utt_id)
                out = out.astype(orgtype)

                if args.keep_length:
                    if len(out) > len(array):
                        # The length can be changed by stft, for example;
                        # drop the extra frames
                        out = out[:len(array)]
                    elif len(out) < len(array):
                        # Zero-pad the tail up to the input length
                        out = numpy.pad(
                            out,
                            [(0, len(array) - len(out))]
                            + [(0, 0) for _ in range(out.ndim - 1)],
                            mode='constant')

                array = out

            # shape = (Time, Channel)
            if args.filetype in ['sound.hdf5', 'sound']:
                # Write Tuple[int, numpy.ndarray] (scipy style)
                writer[utt_id] = (rate, array)
            else:
                writer[utt_id] = array
def main():
    """Compute a log-mel spectrogram for each utterance and write it out.

    Every array read from ``rspecifier`` is cast to float32, optionally
    divided by ``2 ** (normalize - 1)``, passed to ``logmelspectrogram``
    and stored in the writer under its utterance id.

    :raises ValueError: if the sampling rate of an input differs from
        ``--fs``
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--fs', type=int,
                        help='Sampling frequency')
    parser.add_argument('--fmax', type=int, default=None, nargs='?',
                        help='Maximum frequency')
    parser.add_argument('--fmin', type=int, default=None, nargs='?',
                        help='Minimum frequency')
    parser.add_argument('--n_mels', type=int, default=80,
                        help='Number of mel basis')
    parser.add_argument('--n_fft', type=int, default=1024,
                        help='FFT length in point')
    parser.add_argument('--n_shift', type=int, default=512,
                        help='Shift length in point')
    parser.add_argument('--win_length', type=int, default=None, nargs='?',
                        help='Analysis window length in point')
    parser.add_argument('--window', type=str, default='hann',
                        choices=['hann', 'hamming'],
                        help='Type of window')
    parser.add_argument('--write-num-frames', type=str,
                        help='Specify wspecifer for utt2num_frames')
    parser.add_argument('--filetype', type=str, default='mat',
                        choices=['mat', 'hdf5'],
                        help='Specify the file format for output. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--compress', type=strtobool, default=False,
                        help='Save in compressed format')
    parser.add_argument(
        '--compression-method', type=int, default=2,
        help='Specify the method(if mat) or gzip-level(if hdf5)')
    parser.add_argument('--verbose', '-V', default=0, type=int,
                        help='Verbose option')
    parser.add_argument('--normalize', choices=[1, 16, 24, 32], type=int,
                        default=None,
                        help='Give the bit depth of the PCM, '
                             'then normalizes data to scale in [-1,1]')
    parser.add_argument('rspecifier', type=str, help='WAV scp file')
    parser.add_argument(
        '--segments', type=str,
        help='segments-file format: each line is either '
             '<segment-id> <recording-id> <start-time> <end-time> '
             'e.g. call-861225-A-0050-0065 call-861225-A 5.0 6.5')
    parser.add_argument('wspecifier', type=str, help='Write specifier')
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader, \
            FileWriterWrapper(
                args.wspecifier,
                filetype=args.filetype,
                write_num_frames=args.write_num_frames,
                compress=args.compress,
                compression_method=args.compression_method) as writer:
        for utt_id, (rate, array) in reader:
            if rate != args.fs:
                # Explicit error: an assert would be stripped under
                # "python -O" and carries no explanation
                raise ValueError(
                    'Sampling rate of {} is {}, but --fs is {}'.format(
                        utt_id, rate, args.fs))
            array = array.astype(numpy.float32)
            if args.normalize is not None and args.normalize != 1:
                array = array / (1 << (args.normalize - 1))

            lmspc = logmelspectrogram(
                x=array,
                fs=args.fs,
                n_mels=args.n_mels,
                n_fft=args.n_fft,
                n_shift=args.n_shift,
                win_length=args.win_length,
                window=args.window,
                fmin=args.fmin,
                fmax=args.fmax)
            writer[utt_id] = lmspc
def main():
    """Read waveforms, optionally preprocess them, and write them out.

    Arguments come from ``get_parser()``.  Arrays are cast to float32 for
    the ``mat`` file type, reshaped from (Time) to (Time, Channel) when
    one-dimensional, and divided by ``2 ** (normalize - 1)`` when
    ``args.normalize`` is given with a value other than 1.  If a
    preprocessing configuration is given, its output is cast back to the
    input dtype and, with ``args.keep_length``, truncated or zero-padded
    along the first axis to the input length.  For the sound file types
    the item is written as a scipy-style ``(rate, array)`` tuple.
    """
    parser = get_parser()
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info('Apply preprocessing: {}'.format(preprocessing))
    else:
        preprocessing = None

    with FileWriterWrapper(
            args.wspecifier,
            filetype=args.filetype,
            write_num_frames=args.write_num_frames,
            compress=args.compress,
            compression_method=args.compression_method,
            pcm_format=args.format) as writer, \
            kaldiio.ReadHelper(args.rspecifier,
                               segments=args.segments) as reader:
        for utt_id, (rate, array) in reader:
            if args.filetype == 'mat':
                # Kaldi-matrix doesn't support integer
                array = array.astype(numpy.float32)

            if array.ndim == 1:
                # (Time) -> (Time, Channel)
                array = array[:, None]

            if args.normalize is not None and args.normalize != 1:
                array = array.astype(numpy.float32)
                array = array / (1 << (args.normalize - 1))

            if preprocessing is not None:
                orgtype = array.dtype
                out = preprocessing(array, uttid_list=utt_id)
                out = out.astype(orgtype)

                if args.keep_length:
                    if len(out) > len(array):
                        # The length can be changed by stft, for example;
                        # drop the extra frames
                        out = out[:len(array)]
                    elif len(out) < len(array):
                        # Zero-pad the tail up to the input length
                        out = numpy.pad(
                            out,
                            [(0, len(array) - len(out))]
                            + [(0, 0) for _ in range(out.ndim - 1)],
                            mode='constant')

                array = out

            # shape = (Time, Channel)
            if args.filetype in ['sound.hdf5', 'sound']:
                # Write Tuple[int, numpy.ndarray] (scipy style)
                writer[utt_id] = (rate, array)
            else:
                writer[utt_id] = array
def main():
    """Compute CMVN statistics and write them out.

    Arguments come from ``get_parser()``.  When the output argument
    contains a colon it is treated as a write specifier and one stats
    array is written per utterance, or per speaker if ``args.spk2utt`` is
    given.  Otherwise a single global stats array is saved to the given
    file name.

    Each stats array has two rows: row 0 holds the feature sums followed
    by the frame count, row 1 holds the sums of squares followed by 0.

    :raises RuntimeError: if the input contains no utterance, or if the
        output file type is not supported in global mode
    """
    args = get_parser().parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    is_wspecifier = ':' in args.wspecifier_or_wxfilename

    if is_wspecifier:
        if args.spk2utt is not None:
            logging.info('Performing as speaker CMVN mode')
            utt2spk_dict = {}
            with open(args.spk2utt) as f:
                for line in f:
                    fields = line.split()
                    if not fields:
                        # Tolerate blank lines instead of failing to unpack
                        continue
                    spk = fields[0]
                    for utt in fields[1:]:
                        utt2spk_dict[utt] = spk

            def utt2spk(x):
                return utt2spk_dict[x]
        else:
            logging.info('Performing as utterance CMVN mode')

            def utt2spk(x):
                return x

        if args.out_filetype == 'npy':
            logging.warning('--out-filetype npy is allowed only for '
                            'Global CMVN mode, changing to hdf5')
            args.out_filetype = 'hdf5'

    else:
        logging.info('Performing as global CMVN mode')
        if args.spk2utt is not None:
            logging.warning('spk2utt is not used for global CMVN mode')

        def utt2spk(x):
            return None

        if args.out_filetype == 'hdf5':
            logging.warning('--out-filetype hdf5 is not allowed for '
                            'Global CMVN mode, changing to npy')
            args.out_filetype = 'npy'

    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info('Apply preprocessing: {}'.format(preprocessing))
    else:
        preprocessing = None

    # Calculate stats for each speaker
    counts = {}
    sum_feats = {}
    square_sum_feats = {}

    idx = 0
    for idx, (utt, matrix) in enumerate(
            FileReaderWrapper(args.rspecifier, args.in_filetype), 1):
        if is_scipy_wav_style(matrix):
            # If data is sound file, then got as Tuple[int, ndarray]
            rate, matrix = matrix
        if preprocessing is not None:
            matrix = preprocessing(matrix, uttid_list=utt)

        spk = utt2spk(utt)

        # Init at the first seen of the spk
        if spk not in counts:
            counts[spk] = 0
            feat_shape = matrix.shape[1:]
            # Accumulate in double precision
            sum_feats[spk] = np.zeros(feat_shape, dtype=np.float64)
            square_sum_feats[spk] = np.zeros(feat_shape, dtype=np.float64)

        counts[spk] += matrix.shape[0]
        sum_feats[spk] += matrix.sum(axis=0)
        square_sum_feats[spk] += (matrix ** 2).sum(axis=0)
    logging.info('Processed {} utterances'.format(idx))
    if idx == 0:
        # Explicit error: an assert would be stripped under "python -O"
        raise RuntimeError(
            'No utterance was read from {}'.format(args.rspecifier))

    cmvn_stats = {}
    for spk in counts:
        feat_shape = sum_feats[spk].shape
        # One extra column holds the frame count
        cmvn_shape = (2, feat_shape[0] + 1) + feat_shape[1:]
        _cmvn_stats = np.empty(cmvn_shape, dtype=np.float64)
        _cmvn_stats[0, :-1] = sum_feats[spk]
        _cmvn_stats[1, :-1] = square_sum_feats[spk]

        _cmvn_stats[0, -1] = counts[spk]
        _cmvn_stats[1, -1] = 0.

        # You can get the mean and std as following,
        # >>> N = _cmvn_stats[0, -1]
        # >>> mean = _cmvn_stats[0, :-1] / N
        # >>> std = np.sqrt(_cmvn_stats[1, :-1] / N - mean ** 2)

        cmvn_stats[spk] = _cmvn_stats

    # Per utterance or speaker CMVN
    if is_wspecifier:
        with FileWriterWrapper(args.wspecifier_or_wxfilename,
                               filetype=args.out_filetype) as writer:
            for spk, mat in cmvn_stats.items():
                writer[spk] = mat

    # Global CMVN
    else:
        matrix = cmvn_stats[None]
        if args.out_filetype == 'npy':
            np.save(args.wspecifier_or_wxfilename, matrix)
        elif args.out_filetype == 'mat':
            # Kaldi supports only matrix or vector
            kaldiio.save_mat(args.wspecifier_or_wxfilename, matrix)
        else:
            raise RuntimeError('Not supporting: --out-filetype {}'.format(
                args.out_filetype))
def main():
    """Apply CMVN to each input item and write the normalized result.

    The stats argument is treated as a read specifier when it contains a
    colon (stats are then looked up by utterance id); otherwise it is
    loaded as a single stats file stored under the key ``None``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--verbose', '-V', default=0, type=int,
                        help='Verbose option')
    parser.add_argument('--in-filetype', type=str, default='mat',
                        choices=['mat', 'hdf5', 'sound.hdf5', 'sound'],
                        help='Specify the file format for the rspecifier. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--stats-filetype', type=str, default='mat',
                        choices=['mat', 'hdf5', 'npy'],
                        help='Specify the file format for the stats. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--out-filetype', type=str, default='mat',
                        choices=['mat', 'hdf5'],
                        help='Specify the file format for the wspecifier. '
                             '"mat" is the matrix format in kaldi')

    parser.add_argument('--norm-means', type=strtobool, default=True,
                        help='Do mean normalization or not.')
    parser.add_argument('--norm-vars', type=strtobool, default=False,
                        help='Do variance normalization or not.')
    parser.add_argument('--reverse', type=strtobool, default=False,
                        help='Do reverse mode or not')
    parser.add_argument('--spk2utt', type=str,
                        help='A text file of speaker to utterance-list map. '
                             '(Don\'t give rspecifier format, such as '
                             '"ark:spk2utt")')
    parser.add_argument('--utt2spk', type=str,
                        help='A text file of utterance to speaker map. '
                             '(Don\'t give rspecifier format, such as '
                             '"ark:utt2spk")')
    parser.add_argument('--write-num-frames', type=str,
                        help='Specify wspecifer for utt2num_frames')
    parser.add_argument('--compress', type=strtobool, default=False,
                        help='Save in compressed format')
    parser.add_argument(
        '--compression-method', type=int, default=2,
        help='Specify the method(if mat) or gzip-level(if hdf5)')
    parser.add_argument('stats_rspecifier_or_rxfilename',
                        help='Input stats. e.g. ark:stats.ark or stats.mat')
    parser.add_argument('rspecifier', type=str,
                        help='Read specifier id. e.g. ark:some.ark')
    parser.add_argument('wspecifier', type=str,
                        help='Write specifier id. e.g. ark:some.ark')
    args = parser.parse_args()

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    if ':' in args.stats_rspecifier_or_rxfilename:
        is_rspcifier = True
        # 'npy' is read as 'hdf5' when the stats come from a read specifier
        if args.stats_filetype == 'npy':
            stats_filetype = 'hdf5'
        else:
            stats_filetype = args.stats_filetype

        stats_dict = dict(
            FileReaderWrapper(args.stats_rspecifier_or_rxfilename,
                              stats_filetype))
    else:
        is_rspcifier = False
        if args.stats_filetype == 'mat':
            stats = kaldiio.load_mat(args.stats_rspecifier_or_rxfilename)
        else:
            stats = numpy.load(args.stats_rspecifier_or_rxfilename)
        # A single stats array is registered under the key None
        stats_dict = {None: stats}

    cmvn = CMVN(stats=stats_dict,
                norm_means=args.norm_means,
                norm_vars=args.norm_vars,
                utt2spk=args.utt2spk,
                spk2utt=args.spk2utt,
                reverse=args.reverse)

    with FileWriterWrapper(
            args.wspecifier,
            filetype=args.out_filetype,
            write_num_frames=args.write_num_frames,
            compress=args.compress,
            compression_method=args.compression_method) as writer:
        for utt, mat in FileReaderWrapper(args.rspecifier, args.in_filetype):
            if is_scipy_wav_style(mat):
                # If data is sound file, then got as Tuple[int, ndarray]
                rate, mat = mat
            # Look up by utterance id only when the stats are keyed
            mat = cmvn(mat, utt if is_rspcifier else None)
            writer[utt] = mat