def main():
    """Compute log-mel filterbank features for every utterance and write them.

    Parses the command line with ``get_parser()``, configures logging from
    ``args.verbose``, then iterates over ``kaldiio.ReadHelper`` and stores one
    ``logmelspectrogram`` result per utterance id in the writer.

    Raises:
        AssertionError: If an utterance's sampling rate differs from
            ``args.fs``.
    """
    parser = get_parser()
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    with kaldiio.ReadHelper(
        args.rspecifier, segments=args.segments
    ) as reader, file_writer_helper(
        args.wspecifier,
        filetype=args.filetype,
        write_num_frames=args.write_num_frames,
        compress=args.compress,
        compression_method=args.compression_method,
    ) as writer:
        for utt_id, (rate, array) in reader:
            # Raised explicitly instead of via `assert` so the check is not
            # stripped under `python -O`; the exception type is unchanged.
            if rate != args.fs:
                raise AssertionError(
                    "sampling rate mismatch for utt_id=`%s`: expected %s, got %s"
                    % (utt_id, args.fs, rate)
                )

            array = array.astype(numpy.float32)
            if args.normalize is not None and args.normalize != 1:
                # Scale by 2^(bit_depth - 1).
                array = array / (1 << (args.normalize - 1))

            lmspc = logmelspectrogram(
                x=array,
                fs=args.fs,
                n_mels=args.n_mels,
                n_fft=args.n_fft,
                n_shift=args.n_shift,
                win_length=args.win_length,
                window=args.window,
                fmin=args.fmin,
                fmax=args.fmax,
            )
            writer[utt_id] = lmspc
def test_preprocessing(tmpdir):
    """Check that a sequential ``Transformation`` matches the manual pipeline.

    The fbank -> cmvn -> delta stages are applied through ``Transformation``
    and, independently, by calling ``logmelspectrogram``, ``CMVN`` and
    ``add_deltas`` one after another with the same options.
    """
    cmvn_ark = str(tmpdir.join("cmvn.ark"))

    fbank_stage = {
        "type": "fbank",
        "n_mels": 80,
        "fs": 16000,
        "n_fft": 1024,
        "n_shift": 512,
    }
    cmvn_stage = {"type": "cmvn", "stats": cmvn_ark, "norm_vars": True}
    delta_stage = {"type": "delta", "window": 2, "order": 2}
    kwargs = {
        "process": [fbank_stage, cmvn_stage, delta_stage],
        "mode": "sequential",
    }

    def _stage_options(stage_index):
        # Copy of the stage config without its "type" key.
        options = dict(kwargs["process"][stage_index])
        options.pop("type")
        return options

    # Creates cmvn_ark
    samples = np.random.randn(100, 80)
    stats = np.empty((2, 81), dtype=np.float32)
    stats[0, :80] = samples.sum(axis=0)
    stats[1, :80] = (samples ** 2).sum(axis=0)
    stats[0, -1] = 100.0
    stats[1, -1] = 0.0
    kaldiio.save_mat(cmvn_ark, stats)

    bs = 1
    xs = [np.random.randn(1000).astype(np.float32) for _ in range(bs)]
    preprocessing = Transformation(kwargs)
    processed_xs = preprocessing(xs)

    for position, waveform in enumerate(xs):
        expected = logmelspectrogram(waveform, **_stage_options(0))
        expected = CMVN(**_stage_options(1))(expected)
        expected = add_deltas(expected, **_stage_options(2))
        np.testing.assert_allclose(processed_xs[position], expected)
def main():
    """Compute log-mel filterbank features with progress logging.

    Parses the command line with ``get_parser()``, configures logging from
    ``args.verbose``, then iterates over ``kaldiio.ReadHelper`` and stores one
    ``logmelspectrogram`` result per utterance id in the writer.

    Processing is best-effort: an utterance that fails (including one whose
    sampling rate differs from ``args.fs``) is logged as a warning with its
    traceback and skipped, and the loop continues with the next utterance.
    """
    parser = get_parser()
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    # Find the number of utterances (one per line of the segments file).
    # Without a segments file the total is unknown, so progress is reported
    # without a percentage instead of crashing on open(None).
    n_utt = None
    if args.segments is not None:
        with open(args.segments) as segments_file:
            n_utt = sum(1 for _ in segments_file)
        logging.info("%d utterances found to be processed.", n_utt)

    # Compute fbank features
    with kaldiio.ReadHelper(
        args.rspecifier, segments=args.segments
    ) as reader, file_writer_helper(
        args.wspecifier,
        filetype=args.filetype,
        write_num_frames=args.write_num_frames,
        compress=args.compress,
        compression_method=args.compression_method,
    ) as writer:
        for i, (utt_id, (rate, array)) in enumerate(reader, start=1):
            if n_utt:
                logging.info(
                    "processing %d/%d(%.2f%%)", i, n_utt, 100 * i / n_utt
                )
            else:
                # Total unknown or zero: avoid dividing by it.
                logging.info("processing %d", i)

            try:
                # Explicit check: an `assert` would vanish under `python -O`.
                if rate != args.fs:
                    raise ValueError(
                        "sampling rate mismatch: expected %s, got %s"
                        % (args.fs, rate)
                    )

                array = array.astype(numpy.float32)
                if args.normalize is not None and args.normalize != 1:
                    # Scale by 2^(bit_depth - 1).
                    array = array / (1 << (args.normalize - 1))

                lmspc = logmelspectrogram(
                    x=array,
                    fs=args.fs,
                    n_mels=args.n_mels,
                    n_fft=args.n_fft,
                    n_shift=args.n_shift,
                    win_length=args.win_length,
                    window=args.window,
                    fmin=args.fmin,
                    fmax=args.fmax,
                )
                writer[utt_id] = lmspc
            except Exception:
                # `Exception` (not a bare except) lets KeyboardInterrupt and
                # SystemExit stop the run; exc_info records the actual cause.
                logging.warning(
                    "failed to compute fbank for utt_id=`%s`",
                    utt_id,
                    exc_info=True,
                )
def test_compatible_with_espnet1():
    """Check ``LogMelFbank`` against ``logmelspectrogram`` on one random signal.

    Both are given the same FFT size, hop, mel count and frequency range; the
    outputs must agree within an absolute tolerance of 1e-5.
    """
    n_fft, hop, n_mels = 16, 4, 4
    fmin, fmax = 80, 7600
    n_samples = 100

    frontend = LogMelFbank(
        n_fft=n_fft,
        hop_length=hop,
        n_mels=n_mels,
        fs="16k",
        fmin=fmin,
        fmax=fmax,
    )
    waveform = torch.randn(1, n_samples)
    feats, _ = frontend(waveform, torch.LongTensor([n_samples]))
    actual = feats.numpy()[0]

    expected = logmelspectrogram(
        waveform[0].numpy(),
        n_fft=n_fft,
        n_shift=hop,
        n_mels=n_mels,
        fs=16000,
        fmin=fmin,
        fmax=fmax,
    )
    np.testing.assert_allclose(actual, expected, rtol=0, atol=1e-5)
def acoustic_features_process_one_utterance(wav_path, args, utt2dur_phn):
    """Extract and save log-mel, duration, pitch and energy features.

    The utterance id is the basename of ``wav_path`` up to its first ``.``.
    Durations are saved under ``{args.feature_root}/durations_MFA`` and the
    mel / pitch / energy arrays under ``mels-ori`` / ``f0-ori`` / ``en-ori``
    in the same root, each as ``{uttid}.npy``.

    Args:
        wav_path: Path of the audio file to read.
        args: Namespace providing ``set_fs``, ``n_mels``, ``n_fft``,
            ``n_shifts``, ``win_length``, ``windows``, ``fmin``, ``fmax`` and
            ``feature_root``.
        utt2dur_phn: Mapping from utterance id to a sequence whose first
            element is the per-segment duration list. NOTE: the last entry of
            that list is adjusted in place so the durations sum to the number
            of mel frames.

    Returns:
        Tuple ``(uttid, mel, f0, energy)`` where ``f0`` and ``energy`` hold
        one averaged value per duration segment.
    """
    uttid = os.path.basename(wav_path).split('.')[0]

    # extract mel-spectrogram (log)
    wav, fs = sf.read(wav_path)
    peak = np.abs(wav).max()
    if peak > 1.0:
        wav /= peak
    if fs != args.set_fs:
        wav = resampy.resample(wav, fs, args.set_fs, axis=0)
        fs = args.set_fs
    mel = logmelspectrogram(
        x=wav,
        fs=fs,
        n_mels=args.n_mels,
        n_fft=args.n_fft,
        n_shift=args.n_shifts,
        win_length=args.win_length,
        window=args.windows,
        fmin=args.fmin,
        fmax=args.fmax,
    )
    tlen = mel.shape[0]

    # make sum(dur) = mel length & save durations
    # NOTE(review): this mutates the list held by the caller's utt2dur_phn.
    durations = utt2dur_phn[uttid][0]
    durations[-1] += tlen - sum(durations)
    durations = np.array(durations, dtype=float).reshape(-1, 1)
    dur_save_root = f'{args.feature_root}/durations_MFA'
    os.makedirs(dur_save_root, exist_ok=True)
    dur_save_path = f'{dur_save_root}/{uttid}.npy'
    np.save(dur_save_path, durations)

    # extract phn-level F0 & energy
    frame_period = args.n_shifts / fs * 1000
    f0, timeaxis = pw.dio(wav.astype('float64'), fs, frame_period=frame_period)
    f0 = pw.stonemask(wav.astype('float64'), f0, timeaxis, fs)
    f0 = f0[:tlen].reshape(-1).astype('float32')
    nonzeros_indices = np.nonzero(f0)
    lf0 = f0.copy()
    # for f0(Hz), lf0 > 0 when f0 != 0
    lf0[nonzeros_indices] = np.log(f0[nonzeros_indices])

    x_mag = np.abs(
        stft(
            wav,
            args.n_fft,
            args.n_shifts,
            win_length=args.win_length,
            window=args.windows,
        )
    )  # T x F
    energy = np.linalg.norm(x_mag, axis=1).reshape(-1)
    assert len(energy) == tlen

    durs = durations.reshape(-1)
    durs_cum = np.cumsum(np.pad(durs, (1, 0)))
    # Builtin `float` replaces the `np.float` alias, which NumPy 1.24 removed.
    pitch_phn = np.zeros((durs.shape[0], ), dtype=float)
    energy_phn = np.zeros((durs.shape[0], ), dtype=float)
    for idx, (a, b) in enumerate(zip(durs_cum[:-1], durs_cum[1:])):
        a = int(a)
        b = int(b)
        # use avg-lf0 instead of avg-f0; frames with f0 == 0 are excluded
        values = lf0[a:b][f0[a:b] != 0.0]
        pitch_phn[idx] = np.mean(values) if len(values) > 0 else 0.0
        values = energy[a:b]
        energy_phn[idx] = np.mean(values) if len(values) > 0 else 0.0
    f0 = pitch_phn
    energy = energy_phn

    mel_save_path = f'{args.feature_root}/mels-ori/{uttid}.npy'
    f0_save_path = f'{args.feature_root}/f0-ori/{uttid}.npy'
    en_save_path = f'{args.feature_root}/en-ori/{uttid}.npy'
    os.makedirs(os.path.dirname(mel_save_path), exist_ok=True)
    os.makedirs(os.path.dirname(f0_save_path), exist_ok=True)
    os.makedirs(os.path.dirname(en_save_path), exist_ok=True)
    np.save(mel_save_path, mel)
    np.save(f0_save_path, f0)
    np.save(en_save_path, energy)

    return uttid, mel, f0, energy
def main():
    """Compute log-mel filterbank features from the command line.

    Builds the argument parser, configures logging from ``--verbose``, then
    iterates over ``kaldiio.ReadHelper`` and stores one ``logmelspectrogram``
    result per utterance id in the writer.

    Raises:
        AssertionError: If an utterance's sampling rate differs from ``--fs``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--fs', type=int,
                        help='Sampling frequency')
    parser.add_argument('--fmax', type=int, default=None, nargs='?',
                        help='Maximum frequency')
    parser.add_argument('--fmin', type=int, default=None, nargs='?',
                        help='Minimum frequency')
    parser.add_argument('--n_mels', type=int, default=80,
                        help='Number of mel basis')
    parser.add_argument('--n_fft', type=int, default=1024,
                        help='FFT length in point')
    parser.add_argument('--n_shift', type=int, default=512,
                        help='Shift length in point')
    parser.add_argument('--win_length', type=int, default=None, nargs='?',
                        help='Analisys window length in point')
    parser.add_argument('--window', type=str, default='hann',
                        choices=['hann', 'hamming'],
                        help='Type of window')
    parser.add_argument('--write-num-frames', type=str,
                        help='Specify wspecifer for utt2num_frames')
    parser.add_argument('--filetype', type=str, default='mat',
                        choices=['mat', 'hdf5'],
                        help='Specify the file format for output. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--compress', type=strtobool, default=False,
                        help='Save in compressed format')
    parser.add_argument(
        '--compression-method', type=int, default=2,
        help='Specify the method(if mat) or gzip-level(if hdf5)')
    parser.add_argument('--verbose', '-V', default=0, type=int,
                        help='Verbose option')
    parser.add_argument('--normalize', choices=[1, 16, 24, 32], type=int,
                        default=None,
                        help='Give the bit depth of the PCM, '
                             'then normalizes data to scale in [-1,1]')
    parser.add_argument('rspecifier', type=str, help='WAV scp file')
    # Adjacent literals are concatenated verbatim, so each piece needs its own
    # trailing space to keep the rendered help text readable.
    parser.add_argument(
        '--segments', type=str,
        help='segments-file format: each line is either '
             '<segment-id> <recording-id> <start-time> <end-time> '
             'e.g. call-861225-A-0050-0065 call-861225-A 5.0 6.5')
    parser.add_argument('wspecifier', type=str, help='Write specifier')
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    with kaldiio.ReadHelper(
        args.rspecifier, segments=args.segments
    ) as reader, FileWriterWrapper(
        args.wspecifier,
        filetype=args.filetype,
        write_num_frames=args.write_num_frames,
        compress=args.compress,
        compression_method=args.compression_method,
    ) as writer:
        for utt_id, (rate, array) in reader:
            # Raised explicitly instead of via `assert` so the check is not
            # stripped under `python -O`; the exception type is unchanged.
            if rate != args.fs:
                raise AssertionError(
                    "sampling rate mismatch for utt_id=`%s`: expected %s, got %s"
                    % (utt_id, args.fs, rate)
                )

            array = array.astype(numpy.float32)
            if args.normalize is not None and args.normalize != 1:
                # Scale by 2^(bit_depth - 1).
                array = array / (1 << (args.normalize - 1))

            lmspc = logmelspectrogram(
                x=array,
                fs=args.fs,
                n_mels=args.n_mels,
                n_fft=args.n_fft,
                n_shift=args.n_shift,
                win_length=args.win_length,
                window=args.window,
                fmin=args.fmin,
                fmax=args.fmax,
            )
            writer[utt_id] = lmspc