def get_plp_dd(wav_fn, norm):
    """Return the PLPs with deltas and delta-deltas for an audio file."""
    audio = Audio.load(wav_fn)
    processor = PlpProcessor(
        sample_rate=audio.sample_rate, window_type="hamming",
        frame_length=0.025, frame_shift=0.01, low_freq=0,
        vtln_low=60, vtln_high=7200, high_freq=audio.sample_rate / 2)
    plp_static = processor.process(audio, vtln_warp=1.0)
    d_processor = DeltaPostProcessor(order=2)
    plp_deltas = d_processor.process(plp_static)
    features = np.float64(plp_deltas.data)
    if norm == "cmvn":
        # per-utterance cepstral mean and variance normalisation
        features = (features - np.mean(features, axis=0)) / np.std(features, axis=0)
    return features
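# Usage sketch for get_plp_dd (the wav path below is hypothetical).
# With PlpProcessor's default of 13 cepstral coefficients, deltas and
# delta-deltas give 13 * 3 = 39 columns; norm="cmvn" standardises each
# column over the utterance:
#
#     feats = get_plp_dd("audio/utt_001.wav", norm="cmvn")
#     assert feats.shape[1] == 39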
def get_mfcc_vtln(wav_fn, f, norm, lang):
    """Return the VTLN-warped MFCCs with deltas and delta-deltas for an audio file."""
    ref = os.path.basename(f).replace(".wav", "")
    if not os.path.isfile("warps_{}.pkl".format(lang)):
        if os.path.isfile("warps_{}.txt".format(lang)):
            # parse the warp factors and cache them as a pickle
            factors = {}
            with open("warps_{}.txt".format(lang), mode="r", encoding="utf-8") as opfile:
                wop = opfile.read().split("\n")
                for line in wop:
                    if len(line) > 1:
                        l_sp = line.split()
                        factors[l_sp[0]] = float(l_sp[1])
            print(factors)
            with open("warps_{}.pkl".format(lang), mode="wb") as opfile:
                pickle.dump(factors, opfile)
        else:
            print("no warp factors found")
            exit()
    with open("warps_{}.pkl".format(lang), mode="rb") as op:
        factors = pickle.load(op)
    warp = float(factors[ref])
    audio = Audio.load(wav_fn)
    processor = MfccProcessor(
        sample_rate=audio.sample_rate, window_type="hamming",
        frame_length=0.025, frame_shift=0.01, cepstral_lifter=26.0,
        low_freq=0, vtln_low=60, vtln_high=7200,
        high_freq=audio.sample_rate / 2)
    d_processor = DeltaPostProcessor(order=2)
    mfcc_static = processor.process(audio, vtln_warp=warp)
    mfcc_deltas = d_processor.process(mfcc_static)
    features = np.float64(mfcc_deltas.data)
    if norm == "cmvn":
        features = (features - np.mean(features, axis=0)) / np.std(features, axis=0)
    return features
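# get_mfcc_vtln looks up the warp factor by the basename of `f` (with
# ".wav" stripped). warps_<lang>.txt is expected to hold one
# "<reference> <warp>" pair per line, e.g. (hypothetical values):
#
#     s01_utt1 0.95
#     s02_utt1 1.10
#
# The first call converts the text file into warps_<lang>.pkl;
# subsequent calls load the pickle directly.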
def test_params():
    d = DeltaPostProcessor()
    d.order = 0
    with pytest.raises(ValueError):
        d.window = 0
    with pytest.raises(ValueError):
        d.window = 2000
    d.window = 1
    assert d.get_params() == {'order': 0, 'window': 1}

    p = {'order': 0, 'window': 1}
    d = DeltaPostProcessor()
    assert d.get_params()['order'] == 2
    d.set_params(**p)
    assert d.get_params() == p
def test_ndims():
    with pytest.raises(ValueError) as err:
        DeltaPostProcessor().ndims
    assert 'output dimension for delta processor depends on input' in str(err)
def test_output(mfcc, order, window):
    delta = DeltaPostProcessor(order=order, window=window).process(mfcc)
    assert delta.shape[0] == mfcc.shape[0]
    assert delta.shape[1] == mfcc.shape[1] * (order + 1)
    assert np.array_equal(delta.times, mfcc.times)
    assert delta.data[:, :mfcc.shape[1]] == pytest.approx(mfcc.data)
def get_features(self, y, sample_rate):
    """Feature extraction

    Parameters
    ----------
    y : (n_samples, 1) numpy array
        Waveform
    sample_rate : int
        Sample rate

    Returns
    -------
    data : (n_frames, n_dimensions) numpy array
        Features
    """
    # scale the audio signal between -1 and 1 before creating the
    # shennong audio object: when pyannote uses "data augmentation" it
    # normalizes the signal, but when loading the data without data
    # augmentation it does not
    y = y / np.max((-np.min(y), np.max(y)))

    # create audio object for shennong
    audio = Audio(data=y, sample_rate=sample_rate)

    # MFCC parameters
    processor = MfccProcessor(sample_rate=sample_rate)
    processor.dither = self.dither
    processor.preemph_coeff = self.preemph_coeff
    processor.remove_dc_offset = self.remove_dc_offset
    processor.window_type = self.window_type
    processor.blackman_coeff = self.blackman_coeff
    processor.vtln_low = self.vtln_low
    processor.vtln_high = self.vtln_high
    processor.energy_floor = self.energy_floor
    processor.raw_energy = self.raw_energy
    processor.cepstral_lifter = self.cepstral_lifter
    processor.htk_compat = self.htk_compat
    processor.low_freq = self.mfccLowFreq
    processor.high_freq = self.mfccHighFreq  # defines it as (nyquist - 100)
    processor.use_energy = self.e
    processor.num_ceps = self.coefs
    processor.snip_edges = False  # end with correct number of frames

    # MFCC extraction
    mfcc = processor.process(audio)

    # compute deltas
    if self.D:
        # first or second order derivative
        if not self.DD:
            derivative_proc = DeltaPostProcessor(order=1)
        else:
            derivative_proc = DeltaPostProcessor(order=2)
        mfcc = derivative_proc.process(mfcc)

    # compute CMVN: accumulate the statistics, then normalize
    if self.with_cmvn:
        postproc = CmvnPostProcessor(self.get_dimension(), stats=None)
        postproc.accumulate(mfcc)
        mfcc = postproc.process(mfcc)

    # compute pitch and concatenate it to the MFCCs
    if self.with_pitch:
        pitch = self.get_pitch(audio, self.pitchFmin, self.pitchFmax)
        mfcc = self.concatenate_with_pitch(mfcc.data, pitch.data)
    else:
        mfcc = mfcc.data

    return mfcc
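# Not part of the original class: a small sketch of the column count
# get_features produces before any pitch columns are appended. Deltas
# multiply the static dimension by (order + 1), mirroring the D/DD
# flags above.
def expected_mfcc_dims(coefs, D, DD):
    order = 2 if (D and DD) else (1 if D else 0)
    return coefs * (order + 1)

# e.g. 13 coefficients with deltas and delta-deltas -> 39 columns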
import argparse
import glob
import os
import pickle

import numpy as np
import pandas as pd
from pathlib import Path

from shennong.audio import Audio
from shennong.features.processor.mfcc import MfccProcessor
from shennong.features.postprocessor.delta import DeltaPostProcessor
from shennong.features.processor.bottleneck import BottleneckProcessor

mfcc_processor = MfccProcessor(sample_rate=8000)
delta_processor = DeltaPostProcessor(order=2)
bnf_processor = BottleneckProcessor(weights='BabelMulti')

parser = argparse.ArgumentParser(
    description='example: python wav_to_shennong-feats.py mfcc wrm-pd',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
    'features',
    help='features to extract using the Shennong library (mfcc or bnf), use _all_ for both')
parser.add_argument(
    'dataset',
    help='name of dataset, use _all_ to iterate over all')
parser.add_argument(
    '--feats_dir', default='data/interim/features',
    help='directory for features')
parser.add_argument(
    '--datasets_dir', default='data/raw/datasets',
    help='directory for raw datasets and labels files')
parser.add_argument(
    '--queries_dir', default='queries',
    help='directory with .wav files for queries')
parser.add_argument(
    '--references_dir', default='references',
    help='directory with .wav files for references')
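# The original extraction loop is not shown in this excerpt; below is a
# minimal sketch of what the preamble presumably leads into. The
# directory layout and the .npy output convention are assumptions, and
# the wav files are assumed to match the 8 kHz rate configured above.
args = parser.parse_args()
wav_glob = os.path.join(args.datasets_dir, args.dataset,
                        args.queries_dir, '*.wav')
for wav_fn in sorted(glob.glob(wav_glob)):
    audio = Audio.load(wav_fn)
    if args.features == 'bnf':
        feats = bnf_processor.process(audio)
    else:  # mfcc: static coefficients plus deltas and delta-deltas
        feats = delta_processor.process(mfcc_processor.process(audio))
    np.save(os.path.join(args.feats_dir, Path(wav_fn).stem + '.npy'),
            feats.data)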