def mel_transform(signal, sample_rate=8000, pre_emphasis=0.97,
                  frame_size=0.025, frame_stride=0.01,
                  window_func=np.hamming, N_FFT=512, nfilt=40,
                  mean_normalised=True):
    """Return the natural log of the filterbank energies of ``signal``.

    All arguments are forwarded to ``fbank``:

    :param signal: audio samples to analyse.
    :param sample_rate: forwarded as ``samplerate``.
    :param pre_emphasis: forwarded as ``preemph``.
    :param frame_size: forwarded as ``winlen``.
    :param frame_stride: forwarded as ``winstep``.
    :param window_func: forwarded as ``winfunc``.
    :param N_FFT: forwarded as ``nfft``.
    :param nfilt: number of filters.
    :param mean_normalised: accepted for interface compatibility; this
        function does not use it.
    :returns: ``np.log`` of the first value returned by ``fbank``.
    """
    # Honour the caller's window instead of always forcing np.hamming.
    feat, _energy = fbank(signal, samplerate=sample_rate, winlen=frame_size,
                          winstep=frame_stride, nfilt=nfilt, nfft=N_FFT,
                          preemph=pre_emphasis, winfunc=window_func)
    return np.log(feat)
def computeLogMelFilterBank(self, file_name):
    """Compute log filterbank features with deltas and double deltas.

    The wav file is read, 40 filterbank energies per frame are computed
    (25 ms window, 10 ms step) and log-scaled, and the frame energy
    returned by ``fbank`` is appended unchanged as one more feature.
    Deltas and double deltas come from ``self.computeDeltas``.

    :param file_name: path of the wav file to read.
    :returns: array with one row per frame and 123 columns
        (41 static + 41 delta + 41 double-delta).
    :raises AssertionError: if the delta shape or the final feature
        width is not the expected one.
    """
    rate, sig = wav.read(file_name)
    fbank_feat, energy = fbank(sig, rate, winlen=0.025, winstep=0.01,
                               nfilt=40)

    # Stack features as rows, then flip back to one row per frame.
    static = np.vstack((np.log(fbank_feat).transpose(), energy)).transpose()

    deltas = self.computeDeltas(static)
    assert deltas.shape == static.shape, (
        "Shapes not equal {0} and {1}".format(deltas.shape, static.shape))

    double_deltas = self.computeDeltas(deltas)
    feat_vec = np.vstack((static.transpose(),
                          deltas.transpose(),
                          double_deltas.transpose())).transpose()

    # Check the width via shape so an empty result fails the assertion
    # instead of raising IndexError.
    assert feat_vec.shape[1] == 123, (
        "Something wrong with feature vector dimensions...")
    return feat_vec
def mfcc_without_dct(signal, samplerate=16000, winlen=0.025, winstep=0.01,
                     numcep=13, nfilt=26, nfft=512, lowfreq=0, highfreq=None,
                     preemph=0.97, ceplifter=22, appendEnergy=True,
                     winfunc=lambda x: numpy.ones((x, ))):
    """Compute log filterbank energies (the MFCC pipeline minus DCT/lifter).

    The DCT and liftering steps are disabled in this variant, so the
    result is the log of what ``fbank`` returns; ``numcep`` and
    ``ceplifter`` are accepted but not used.

    :param signal: the audio signal from which to compute features.
    :param samplerate: the samplerate of the signal.
    :param winlen: analysis window length in seconds (default 0.025).
    :param winstep: step between successive windows in seconds
        (default 0.01).
    :param numcep: unused here (kept for signature compatibility).
    :param nfilt: number of filters in the filterbank (default 26).
    :param nfft: FFT size (default 512).
    :param lowfreq: lowest band edge of the mel filters, forwarded to
        ``fbank``.
    :param highfreq: highest band edge of the mel filters, forwarded to
        ``fbank``.
    :param preemph: pre-emphasis coefficient, forwarded to ``fbank``.
    :param ceplifter: unused here (kept for signature compatibility).
    :param appendEnergy: if true, column 0 of the result is overwritten
        with the log of the frame energy.
    :param winfunc: analysis window applied to each frame; the default
        is an all-ones window.
    :returns: the log filterbank energies, one row per frame.
    """
    fb_energies, frame_energy = fbank(signal, samplerate, winlen, winstep,
                                      nfilt, nfft, lowfreq, highfreq,
                                      preemph, winfunc)
    log_fb = numpy.log(fb_energies)
    if not appendEnergy:
        return log_fb
    # First column is replaced by the log of the total frame energy.
    log_fb[:, 0] = numpy.log(frame_energy)
    return log_fb
def generator():
    """Yield ``(feat, label)`` for every wav file / label pair.

    Files are shuffled first when ``is_training`` is set. The feature
    type is chosen by ``input_feature`` ('fbank', 'logfbank', 'mfcc' or
    'raw'); a fixed number of frames (or raw samples) is then picked by
    ``_random_select`` and optionally normalised by ``_normalize_frames``.

    :raises ValueError: if ``input_feature`` is not one of the supported
        names (previously this surfaced as an unbound-local error).
    """
    if is_training:
        _wav_files, _labels = _shuffle(wav_files, labels)
    else:
        _wav_files, _labels = wav_files, labels

    for wav_file, label in zip(_wav_files, _labels):
        signal, sample_rate, _ = read_audio(wav_file)
        num_frames = ceil(desired_ms / window_stride_ms)
        num_samples = from_ms_to_samples(desired_ms, sample_rate)

        # fbank/logfbank/mfcc take window sizes in seconds.
        winlen = window_size_ms / 1000
        winstep = window_stride_ms / 1000

        if input_feature == 'fbank':
            feat, _ = fbank(signal, sample_rate, winlen=winlen,
                            winstep=winstep, nfilt=input_feature_dim)
            num_selected = num_frames
        elif input_feature == 'logfbank':
            feat = logfbank(signal, sample_rate, winlen=winlen,
                            winstep=winstep, nfilt=input_feature_dim)
            num_selected = num_frames
        elif input_feature == 'mfcc':
            feat = mfcc(signal, sample_rate, winlen=winlen, winstep=winstep,
                        nfilt=input_feature_dim, numcep=input_feature_dim)
            num_selected = num_frames
        elif input_feature == 'raw':
            feat = np.expand_dims(signal, 1)
            num_selected = num_samples
        else:
            raise ValueError(
                'Unsupported input_feature: {!r}'.format(input_feature))

        feat = _random_select(feat, num_selected)

        # norm per dimension across all frames
        if normalize_frames:
            feat = _normalize_frames(feat)
        yield (feat, label)
def run_main():
    """Compare the project's mel filterbank with ``psf.fbank`` on one wav.

    The wav path is taken from ``sys.argv[1]``. Both filterbank results
    have their shape and dtype printed and are dumped as raw binary
    files under ``./tmp``.

    :raises Exception: if no path is given or the path does not exist.
    """
    if len(sys.argv) < 2:
        raise Exception("Need to specify input wav-file to process")

    wavname = sys.argv[1]
    if not os.path.exists(wavname):
        raise Exception(
            "Specified wavfile {0} does not seem to exist!".format(wavname))
    print("Will process file {0}".format(wavname))

    samplerate, samples = wav.read(wavname)
    # Only needed by the time-axis plot, which is currently disabled.
    sampleperiod = 1.0 / samplerate
    samples = samples.reshape((-1, 1))

    fft_size = 256
    nfilters = 15

    # Project implementation: pad, cut into fft_size chunks, then filter.
    samples = utils_sig.pad_to_multiple_of(samples, fft_size, 0.0)
    chunks = utils_sig.cut_sig_into_chunks(samples.T, fft_size)
    envelopes = utils_sp.get_spec_envelopes(chunks)
    fbank_envs = utils_sp.get_mel_fb_curves(envelopes, samplerate, nfilters)

    # Reference implementation, with window and step both set to one
    # fft_size chunk.
    timestep = float(fft_size) / float(samplerate)
    fbank_envs_py, _ = psf.fbank(samples, samplerate=samplerate,
                                 winlen=timestep, winstep=timestep,
                                 nfilt=nfilters, nfft=fft_size, lowfreq=0,
                                 highfreq=None, preemph=0)

    for info in (fbank_envs.shape, fbank_envs_py.shape,
                 fbank_envs.dtype, fbank_envs_py.dtype):
        print(info)

    fbank_envs.tofile('./tmp/my_fbank.bin')
    fbank_envs_py.tofile('./tmp/py_fbank.bin')
def lift(signal, samplerate=16000, winlen=0.08, winstep=0.04, numcep=39,
         nfilt=39, nfft=2048, lowfreq=12.5, highfreq=None, preemph=0.97,
         ceplifter=39, winfunc=lambda x: numpy.ones((x, ))):
    """Return liftered cepstral coefficients of ``signal``.

    Pipeline: ``fbank`` -> natural log -> orthonormal type-2 DCT along
    axis 1 -> keep the first ``numcep`` columns -> ``lifter``.

    :param signal: audio samples, forwarded to ``fbank`` together with
        ``samplerate``, ``winlen``, ``winstep``, ``nfilt``, ``nfft``,
        ``lowfreq``, ``highfreq``, ``preemph`` and ``winfunc``.
    :param numcep: number of cepstral columns kept.
    :param ceplifter: lifter coefficient passed to ``lifter``.
    :returns: the liftered coefficients.
    """
    energies, _ = fbank(signal, samplerate, winlen, winstep, nfilt, nfft,
                        lowfreq, highfreq, preemph, winfunc)
    log_energies = numpy.log(energies)
    # The DCT length is at least numcep so that numcep columns exist.
    dct_len = max(numcep, log_energies.shape[1])
    cepstra = dct(log_energies, n=dct_len, type=2, axis=1, norm='ortho')
    return lifter(cepstra[:, :numcep], ceplifter)
def __extractFeatures(stackedWav, numSteps, numFilt, samplerate, winlen,
                      winstep):
    '''
    Compute log filterbank features for every wave in a stack.

    stackedWav: [number of waves, len(wave)]
    returns:    [number of waves, numSteps, numFilt]
    All waves are assumed to be of fixed length.
    '''
    assert stackedWav.ndim == 2, 'Should be [number of waves, len(wav)]'
    eps = 1e-10

    def _log_fbank(wave):
        # eps is added before the log as a numerical floor.
        energies, _ = fbank(wave, samplerate=samplerate, winlen=winlen,
                            winstep=winstep, nfilt=numFilt,
                            winfunc=np.hamming)
        log_energies = np.log(energies + eps)
        assert log_energies.ndim == 2, 'Should be [numSteps, numFilt]'
        assert log_energies.shape[0] == numSteps, \
            'Should be [numSteps, numFilt]'
        return log_energies

    return np.array([_log_fbank(wave) for wave in stackedWav])
def extract_mfb(filename, feat_dir, mode, count):
    """Extract filterbank features from one audio file and pickle them.

    The audio is loaded at ``c.SR``, ``c.FILTER_BANK`` filterbank
    energies are computed (25 ms window), optionally converted with
    ``20 * log10`` when ``c.USE_LOGSCALE`` is set, and normalised with
    ``normalize_frame``. The target path comes from
    ``convert_wav_to_feature``; an existing file is left untouched.

    :param filename: path of the audio file.
    :param feat_dir: forwarded to ``convert_wav_to_feature``.
    :param mode: forwarded to ``convert_wav_to_feature`` and shown in
        the progress message.
    :param count: step number shown in the progress message.
    """
    audio, sr = librosa.load(filename, sr=c.SR, mono=True)
    features, energies = fbank(signal=audio, samplerate=c.SR,
                               nfilt=c.FILTER_BANK, winlen=0.025)
    if c.USE_LOGSCALE:
        # Clamp before the log so zero energies do not produce -inf.
        features = 20 * np.log10(np.maximum(features, 1e-5))
    features = normalize_frame(features, scale=c.USE_SCALE)
    print(features.shape)  # (# of frames, nfilt)

    output_folder_name, output_file_name = convert_wav_to_feature(
        filename, feat_dir, mode=mode)
    if not os.path.exists(output_folder_name):
        os.makedirs(output_folder_name)

    if os.path.isfile(output_file_name):
        short_out = '/'.join(output_file_name.split('/')[-3:])
        print("'" + short_out + "'" + 'file already extracted!')
        return

    with open(output_file_name, 'wb') as fp:
        pickle.dump(features, fp)
    # NOTE(review): assumes the progress line is only printed after a
    # fresh extraction - confirm against the original layout.
    short_in = '/'.join(filename.split('/')[-3:])
    print('[%s]feature extraction (%s DB). step : %d, file : \'%s\''
          % ('MFB', mode, count, short_in))
def get_features(filename, numcep, numfilt, winlen, winstep, grad):
    """Compute normalised MFCC-style features plus log energy for a file.

    The coefficients are the first ``numcep`` columns of an orthonormal
    type-2 DCT of the log filterbank energies, liftered with ``L=22``,
    then mean-centred and divided by half the standard deviation per
    column. The log frame energy is appended as the last column.
    Gradients along the frame axis are appended according to ``grad``.

    :param filename: path of the sound file opened with ``Sndfile``.
    :param numcep: number of cepstral coefficients to keep.
    :param numfilt: number of filterbank filters.
    :param winlen: forwarded to ``fbank``.
    :param winstep: forwarded to ``fbank``.
    :param grad: ``>= 1`` appends first-order gradients, ``2`` also
        appends second-order gradients.
    :returns: ``(mat, frames, samplerate)`` - the feature matrix, the
        number of frames read from the file and its sample rate.
    """
    f = Sndfile(filename, 'r')
    try:
        frames = f.nframes
        samplerate = f.samplerate
        data = np.asarray(f.read_frames(frames))
    finally:
        # The handle used to be leaked; release it even if reading fails.
        f.close()

    # calc mfcc
    feat_raw, energy = sf.fbank(data, samplerate, winlen, winstep,
                                nfilt=numfilt)
    feat = np.log(feat_raw)
    feat = sf.dct(feat, type=2, axis=1, norm='ortho')[:, :numcep]
    feat = sf.lifter(feat, L=22)
    feat = np.asarray(feat)

    # calc log energy
    log_energy = np.log(energy)
    log_energy = log_energy.reshape([log_energy.shape[0], 1])

    mat = (feat - np.mean(feat, axis=0)) / (0.5 * np.std(feat, axis=0))
    mat = np.concatenate((mat, log_energy), axis=1)

    # calc first order derivatives ([0] selects the gradient along axis 0)
    if grad >= 1:
        gradf = np.gradient(mat)[0]
        mat = np.concatenate((mat, gradf), axis=1)

    # calc second order derivatives
    if grad == 2:
        grad2f = np.gradient(gradf)[0]
        mat = np.concatenate((mat, grad2f), axis=1)

    return mat, frames, samplerate
def get_kaldi_features(wav_, y_, X_):
    '''
    Build frame-level features (MFCC, filterbanks, deltas) per utterance.

    :param wav_: list of trimmed wav signals; empty signals are skipped
    :param y_: accents, one per signal
    :param X_: rows aligned with ``wav_``; element 0 is used as the
        filename key, the remaining elements are appended to every frame
    :return: stacked frame features (mfcc, filter_banks, delta_1,
        delta_2, extra columns), the accent repeated per frame, and a
        DataFrame with columns ``filename``, ``frame_len``, ``accent``
    '''
    n_mfcc = 13
    n_filt = 32
    features = []
    target = []
    f_len = defaultdict(list)

    for wav, accent, x_arr in zip(wav_, y_, np.array(X_)):
        if not len(wav):
            continue

        mfcc_ = mfcc(wav, samplerate=16000, winlen=0.025, winstep=0.01,
                     numcep=n_mfcc)
        filter_banks, energies = fbank(wav, samplerate=16000, nfilt=n_filt)
        filter_banks = 20 * np.log10(np.maximum(filter_banks, 1e-5))

        # Deltas are taken before any stream is normalised.
        delta_1 = delta(filter_banks, N=1)
        delta_2 = delta(delta_1, N=1)
        filter_banks = normalize_frames(filter_banks, Scale=True)
        delta_1 = normalize_frames(delta_1, Scale=True)
        delta_2 = normalize_frames(delta_2, Scale=True)

        n_frames = len(mfcc_)
        accent_ = [accent] * n_frames
        dummies = [x_arr[1:]] * n_frames

        features.append(
            np.hstack([mfcc_, filter_banks, delta_1, delta_2, dummies]))
        target.append(accent_)
        f_len[x_arr[0]] = [n_frames, accent]  # num of frames

    features = np.vstack(features)
    target = np.hstack(target)
    df = pd.DataFrame.from_dict(f_len, orient='index').reset_index()
    df.columns = ['filename', 'frame_len', 'accent']
    return features, target, df
# def get_kaldi_features(wav, accent, x_arr): '''
def extract_mfcc(signal, samplerate=16000, winlen=0.025, winstep=0.01,
                 numcep=13, nfilt=26, nfft=512, lowfreq=0, highfreq=None,
                 preemph=0.97, ceplifter=22, appendEnergy=True,
                 winfunc=lambda x: numpy.ones((x, ))):
    """Compute liftered MFCCs and the log frame energy of ``signal``.

    Pipeline: ``fbank`` -> natural log -> orthonormal type-2 DCT along
    axis 1 -> first ``numcep`` columns -> ``lifter``.

    :param appendEnergy: if true, the log frame energy is appended to
        the coefficients as an extra last column.
    :returns: ``(feat, log_energy)`` - the coefficient matrix and the
        log of the frame energy returned by ``fbank``.
    """
    fb_energies, frame_energy = fbank(signal, samplerate, winlen, winstep,
                                      nfilt, nfft, lowfreq, highfreq,
                                      preemph, winfunc)
    cepstra = dct(numpy.log(fb_energies), type=2, axis=1, norm='ortho')
    cepstra = lifter(cepstra[:, :numcep], ceplifter)
    if appendEnergy:
        # append the log of the frame energy as an extra coefficient
        cepstra = numpy.c_[cepstra, numpy.log(frame_energy)]
    return cepstra, numpy.log(frame_energy)
def mk_MFB(filename, sample_rate=c.SAMPLE_RATE, use_delta=c.USE_DELTA):
    """Compute normalised filterbank features and save them as ``.npy``.

    The output path is ``filename`` with ``'.wav'`` replaced by
    ``'.npy'``.

    :param filename: path of the audio file to load.
    :param sample_rate: rate the audio is loaded at and analysed with.
    :param use_delta: if true, the normalised first and second deltas
        are stacked next to the filterbank features.
    :returns: ``None``; the features are written to disk.
    """
    audio, sr = librosa.load(filename, sr=sample_rate, mono=True)
    audio = audio.flatten()

    filter_banks, energies = fbank(audio, samplerate=sample_rate,
                                   nfilt=c.FILTER_BANK, winlen=0.025)
    # Deltas are derived from the un-normalised filterbank energies.
    first_delta = delta(filter_banks, N=1)
    second_delta = delta(first_delta, N=1)

    streams = [normalize_frames(stream)
               for stream in (filter_banks, first_delta, second_delta)]
    frames_features = np.hstack(streams) if use_delta else streams[0]

    np.save(filename.replace('.wav', '.npy'), frames_features)
def feature_extract(filename, wavpath, tgpath):
    """Build model inputs and one-hot labels for one utterance.

    The extension of ``filename`` is dropped, and the matching ``.wav``
    (under ``wavpath``) and ``.TextGrid`` (under ``tgpath``) are read.
    Features are 40 filterbank energies per frame (20 ms Hamming
    window), passed through ``normalize`` and ``get_martix``.

    :returns: ``(features, labels)``; labels come from ``read_textgrid``
        (given the number of feature items) and ``to_one_hot``.
    """
    stem = os.path.splitext(filename)[0]
    wav_filename = '/'.join((wavpath, stem + '.wav'))
    tg_filename = '/'.join((tgpath, stem + '.TextGrid'))

    y, sr = read_wav(wav_filename)
    fbank_result = fbank(signal=y, samplerate=sr, winfunc=np.hamming,
                         winlen=0.02, nfilt=40)
    feats = fbank_result[0]
    print(feats.shape)

    feats = get_martix(normalize(feats), 30, 10)
    labels = to_one_hot(read_textgrid(tg_filename, len(feats)))
    return feats, labels
def filter(samplerate, signal, winlen=0.02, winstep=0.01, nfilt=40,
           nfft=512, lowfreq=100, highfreq=5000, preemph=0.97):
    """Extract mel filterbank energies from a signal.

    Args:
        samplerate (int): samples taken per second
        signal (1d numpy array): sample values; converted with np.array
        winlen (float): analysis window size in seconds
        winstep (float): step between successive windows in seconds
        nfilt (int): number of mel filters to apply
        nfft (int): size of the discrete fourier transform to use
        lowfreq (int): lowest frequency to collect
        highfreq (int): highest frequency to collect
        preemph (float): pre-emphasis factor
    Returns:
        feat (2d numpy array): the filterbank energies with axes 0 and 1
            swapped relative to what ``fbank`` returns
    """
    samples = np.array(signal)
    energies, _ = speechfeatures.fbank(samples, samplerate, winlen=winlen,
                                       winstep=winstep, nfilt=nfilt,
                                       nfft=nfft, lowfreq=lowfreq,
                                       highfreq=highfreq, preemph=preemph)
    return np.swapaxes(energies, 0, 1)
def logfbank_features(signal, samplerate=44100, fps=24, num_filt=40,
                      num_cepstra=40, nfft=8192, **kwargs):
    """Compute normalised cepstral features plus log energy per frame.

    The window step is ``2 / fps`` seconds and the window length twice
    that. Log filterbank energies go through an orthonormal type-2 DCT
    (first ``num_cepstra`` columns kept) and ``psf.lifter(L=22)``. When
    there is more than one frame the columns are mean-centred and
    divided by half their standard deviation. The log frame energy is
    appended as the last column. ``kwargs`` is accepted and ignored.

    :returns: the feature matrix, one row per frame.
    :raises AssertionError: if the number of rows exceeds the expected
        ``fps * duration`` by more than one.
    """
    winstep = 2 / fps
    winlen = winstep * 2

    fb_energies, frame_energy = psf.fbank(signal=signal,
                                          samplerate=samplerate,
                                          winlen=winlen, winstep=winstep,
                                          nfilt=num_filt, nfft=nfft)
    cepstra = psf.dct(np.log(fb_energies), type=2, axis=1, norm='ortho')
    cepstra = np.asarray(psf.lifter(cepstra[:, :num_cepstra], L=22))

    log_energy = np.log(frame_energy)
    log_energy = log_energy.reshape([log_energy.shape[0], 1])

    mat = cepstra
    if cepstra.shape[0] > 1:
        # Normalisation needs more than one frame to have a spread.
        std = 0.5 * np.std(cepstra, axis=0)
        mat = (cepstra - np.mean(cepstra, axis=0)) / std
    mat = np.concatenate((mat, log_energy), axis=1)

    duration = signal.shape[0] / samplerate
    expected_frames = fps * duration
    assert mat.shape[0] - expected_frames <= 1, \
        "Producted feature number does not match framerate"
    return mat
def mk_MFB(filename, sample_rate=c.SAMPLE_RATE, use_delta=c.USE_DELTA,
           use_scale=c.USE_SCALE, use_logscale=c.USE_LOGSCALE):
    """Return normalised filterbank features for one audio file.

    :param filename: path of the audio file to load.
    :param sample_rate: rate the audio is loaded at and analysed with.
    :param use_delta: accepted but not used; the delta branch is
        disabled in this variant.
    :param use_scale: forwarded to ``normalize_frames`` as ``Scale``.
    :param use_logscale: if true, energies are converted with
        ``20 * log10`` after clamping to ``1e-5``.
    :returns: the normalised filterbank features.
    """
    audio, _sr = librosa.load(filename, sr=sample_rate, mono=True)
    mel_energies, _energies = fbank(audio, samplerate=sample_rate,
                                    nfilt=c.FILTER_BANK, winlen=0.025)
    if use_logscale:
        mel_energies = 20 * np.log10(np.maximum(mel_energies, 1e-5))
    return normalize_frames(mel_energies, Scale=use_scale)
def get_fbanks(audio_file):
    """Return per-frame normalised 64-filter filterbank features.

    The file must be sampled at 16 kHz. Signals shorter than one second
    yield ``None``. A quarter of a second is trimmed from both ends
    before analysis (25 ms window, 10 ms step).

    :returns: array reshaped to ``(frames, 64, 1)``, or ``None`` for
        signals shorter than one second.
    :raises AssertionError: if the sample rate is not 16000.
    """
    def _standardise_rows(rows, epsilon=1e-12):
        # Zero-mean / unit-std per row; epsilon guards against std == 0.
        return np.array([(row - np.mean(row)) / max(np.std(row), epsilon)
                         for row in rows])

    y, sr = librosa.load(audio_file, sr=None)
    assert sr == 16000

    trim_len = int(0.25 * sr)
    if y.shape[0] < 1 * sr:
        return None
    trimmed = y[trim_len:-trim_len]

    filter_banks, energies = psf.fbank(trimmed, samplerate=sr, nfilt=64,
                                       winlen=0.025, winstep=0.01)
    normalised = _standardise_rows(rows=filter_banks)
    return normalised.reshape((normalised.shape[0], 64, 1))
def test_one(self, file_path):
    """Run the model on one wav file and return per-frame probabilities.

    Log filterbank energies (``self.config_file['feat_dim']`` filters,
    Hamming window) of the whole file are fed to ``self.model`` as a
    batch of one. The last output column is dropped before a softmax
    over the remaining columns.

    :param file_path: path of a 16 kHz wav file.
    :returns: ``(softmax, id_to_phone)`` - the probabilities and a dict
        built from ``self.model.phone_to_id`` mapping ``v[0]`` to its key.
    :raises AssertionError: if the sample rate is not 16000.
    """
    rate, sig = wav.read(file_path)
    assert rate == 16000
    # sig ranges from -32768 to +32768 AND NOT -1 to +1
    feat, energy = fbank(sig, samplerate=rate,
                         nfilt=self.config_file['feat_dim'],
                         winfunc=np.hamming)
    tsteps, hidden_dim = feat.shape

    # log filterbank energies for the complete file, as a batch of one
    batch = np.reshape(np.log(feat), (1, tsteps, hidden_dim))
    inputs = torch.from_numpy(np.array(batch)).float()
    lens = torch.from_numpy(np.array(np.array([tsteps]))).long()

    id_to_phone = {}
    for phone, value in self.model.phone_to_id.items():
        id_to_phone[value[0]] = phone

    self.model.eval()
    with torch.no_grad():
        if self.cuda:
            inputs = inputs.cuda()
            lens = lens.cuda()
        # Pass through model and report the elapsed time
        started = time.time()
        outputs = self.model(inputs, lens).cpu().numpy()
        print(time.time() - started)

    # Only one example per batch; ignore the blank token (last column)
    outputs = outputs[0, :, :-1]
    softmax = np.exp(outputs) / np.sum(np.exp(outputs), axis=1,
                                       keepdims=True)
    return softmax, id_to_phone
def graves_2013(self, wav_path):
    """
    Alex Graves, Abdel-rahman Mohamed, Geoffrey E. Hinton:
    Speech recognition with deep recurrent neural networks.
    ICASSP 2013: 6645-6649

    FBANK features : (40 fbank, 1 energy) * 3
        The audio data was encoded using a Fourier-transform-based
        filter-bank with 40 coefficients (plus energy) distributed on a
        mel-scale, together with their first and second temporal
        derivatives. Each input vector was therefore size 123.

    For CMVN
        The data were normalised so that every element of the input
        vectors had zero mean and unit variance over the training set.
        (Not applied in this method.)

    The paper does not describe the window; a Hanning window is used
    here. Other options not mentioned in the paper (nfft, lowfreq,
    highfreq, ...) are left at the library defaults.

    The energy column is the sum of the squared filterbank values of
    each frame and is placed first.

    :param wav_path: wav file path, resolved against ``self.basepath``
    :return: a feature sequence with energy+fbank, delta and
        delta-delta columns concatenated along axis 1
    """
    (rate, sig) = wav.read(Util.get_file_path(self.basepath, wav_path))

    # computing features
    fbank_feat, _ = fbank(signal=sig, samplerate=rate, nfilt=40,
                          winfunc=np.hanning)

    # adding energy
    energy = np.expand_dims(np.sum(np.power(fbank_feat, 2), axis=-1), 1)
    fbank_e_feat = np.concatenate((energy, fbank_feat), axis=-1)

    # concatenating delta vectors
    delta_feat = delta(fbank_e_feat, 1)
    # The second argument of delta() is the window size, not the order:
    # the second derivative is the delta of the delta features.
    delta_delta_feat = delta(delta_feat, 1)

    return np.concatenate((fbank_e_feat, delta_feat, delta_delta_feat),
                          axis=1)
def computeLogMelFilterBank(self, file_name):
    '''
    Compute the log-mel frequency filterbank feature vector with deltas
    and double deltas.

    40 filterbank energies per frame (25 ms window, 10 ms step) are
    log-scaled and the frame energy returned by fbank is appended
    unchanged, giving 41 static features. Deltas and double deltas come
    from self.computeDeltas.

    file_name: path of the wav file to read
    returns:   array with one row per frame and 123 columns
    raises:    AssertionError if the delta shape or the final feature
               width is unexpected
    '''
    (rate, sig) = wav.read(file_name)
    fbank_feat, energy = fbank(sig, rate, winlen=0.025, winstep=0.01,
                               nfilt=40)
    log_fbank = np.log(fbank_feat)

    # Rows are features while stacking; transpose back to frames x 41.
    static_feat = np.vstack((log_fbank.transpose(), energy)).transpose()

    deltas = self.computeDeltas(static_feat)
    assert deltas.shape == static_feat.shape, (
        "Shapes not equal {0} and {1}".format(deltas.shape,
                                              static_feat.shape))

    double_deltas = self.computeDeltas(deltas)
    stacked = np.vstack((static_feat.transpose(),
                         deltas.transpose(),
                         double_deltas.transpose()))
    feat_vec = stacked.transpose()

    # shape[1] keeps this an assertion failure (not an IndexError) when
    # the file produced no frames.
    assert feat_vec.shape[1] == 123, (
        "Something wrong with feature vector dimensions...")
    return feat_vec
def read_wav(wav_path, feature_type='logmelfbank', batch_size=1):
    """Read wav file & convert to MFCC or log mel filterbank features.

    Delta and delta-delta features are appended, the result is copied
    ``batch_size`` times and normalised with the global mean and
    standard deviation.

    Args:
        wav_path: path to a wav file
        feature_type: 'logmelfbank' or 'mfcc'
        batch_size: number of copies of the utterance in the batch
    Returns:
        inputs: `[batch_size, max_time, feature_dim]`
        inputs_seq_len: list of length `batch_size`, each entry the
            number of frames
    Raises:
        ValueError: if `feature_type` is not supported
    """
    # Load wav file
    fs, audio = scipy.io.wavfile.read(wav_path)

    if feature_type == 'mfcc':
        features = mfcc(audio, samplerate=fs)  # `[291, 13]`
    elif feature_type == 'logmelfbank':
        # Use the file's own sample rate, as the mfcc branch does.
        fbank_features, energy = fbank(audio, samplerate=fs, nfilt=40)
        log_fbank = np.log(fbank_features)
        log_energy = np.log(energy)
        # NOTE(review): hz2mel is applied to log energies rather than to
        # frequencies; kept as-is, confirm this is intended.
        logmelfbank = hz2mel(log_fbank)
        features = np.c_[logmelfbank, log_energy]  # `[291, 41]`
    else:
        raise ValueError(
            'feature_type must be "mfcc" or "logmelfbank", got %r'
            % (feature_type,))

    delta1 = delta(features, N=2)
    delta2 = delta(delta1, N=2)
    input_data = np.c_[features, delta1, delta2]  # `[291, 123]`

    # Transform to 3D array
    # `[1, 291, 39]` or `[1, 291, 123]`
    inputs = np.zeros((batch_size, input_data.shape[0],
                       input_data.shape[1]))
    inputs[:] = input_data  # broadcast the utterance into every slot
    inputs_seq_len = [inputs.shape[1]] * batch_size  # `[291]`

    # Normalization
    inputs = (inputs - np.mean(inputs)) / np.std(inputs)

    return inputs, inputs_seq_len
def audio_feature(signal, samplerate=16000, winlen=0.025, winstep=0.01,
                  numcep=13, nfilt=40, nfft=512, lowfreq=0, highfreq=None,
                  preemph=0.97, ceplifter=22, appendEnergy=True,
                  winfunc=np.hamming):
    """Return MFCCs, their deltas and double deltas, plus log energy.

    DCT coefficient 0 is discarded, so ``numcep - 1`` coefficients are
    kept before liftering. The log frame energy is always appended as
    the last column; ``appendEnergy`` is accepted but not consulted.

    :returns: array with columns ``[mfcc, d1_mfcc, d2_mfcc, log energy]``.
    """
    fb_energies, frame_energy = fbank(signal, samplerate, winlen, winstep,
                                      nfilt, nfft, lowfreq, highfreq,
                                      preemph, winfunc)
    cepstra = dct(np.log(fb_energies), type=2, axis=1, norm='ortho')
    # discard the 0-th dct coefficient
    coeffs = lifter(cepstra[:, 1:numcep], ceplifter)

    first_delta = delta(coeffs, 1)
    second_delta = delta(first_delta, 1)

    log_energy = np.reshape(np.log(frame_energy),
                            (frame_energy.shape[0], 1))
    return np.concatenate((coeffs, first_delta, second_delta, log_energy),
                          axis=1)
num += 1 prev_sum = list(map(add, prev_sum, vals)) else: final_lattice.append(list(zip(previous_phones, [x / num for x in prev_sum]))) previous_phones = ids prev_sum = vals num = 1 final_lattice.append(list(zip(previous_phones, [x / num for x in prev_sum]))) return final_lattice if __name__=="__main__": rate, sig = wavfile.read('./SA1.WAV.wav') feat, energy = fbank(sig, samplerate=rate, nfilt=38, winfunc=np.hamming) #feat = np.log(feat) tsteps, hidden_dim = feat.shape feat_log_full = np.reshape(np.log(feat), (1, tsteps, hidden_dim)) lens = np.array([tsteps]) inputs, lens = torch.from_numpy(np.array(feat_log_full)).float(), torch.from_numpy(np.array(lens)).long() dl_model = dl_model("test_one") id_to_phone = {v[0]: k for k, v in dl_model.model.phone_to_id.items()} dl_model.model.eval() with torch.no_grad(): #if cuda: # inputs = inputs.cuda() # lens = lens.cuda() # Pass through model
from python_speech_features import fbank
from scipy.signal import stft
import scipy.io.wavfile as wav


def _extract_features(sig, rate):
    """Compute the feature type named on the command line.

    The first match among "mfcc", "fbank", "logfbank" and "powspec" in
    sys.argv wins; IndexError is raised when none is present.
    """
    if "mfcc" in sys.argv:
        return mfcc(sig, rate)
    if "fbank" in sys.argv:
        return fbank(sig, rate)[0]
    if "logfbank" in sys.argv:
        return logfbank(sig, rate)
    if "powspec" in sys.argv:
        # Squared magnitude of the STFT, one row per time step.
        spectrum = stft(sig)[2].transpose()
        return np.real(spectrum * np.conj(spectrum))
    raise IndexError("Ge mig ett jävla kommandoradsargument för fan!")


# Get data: every wav file in ./sounds/ below the working directory
place = os.getcwd()
sound_path = place + "/sounds/"
data = glob.glob(os.path.join(sound_path, "*.wav"))

patterns = []
for path in data:
    rate, sig = wav.read(path)
    patterns.append(_extract_features(sig, rate))

# Store all feature matrices in numpy_features.npy
patterns = np.array(patterns)
np.save("numpy_features", patterns)
# Walk <audio_path>/<session>/sentences/wav/<dialog>/*.wav, compute 40
# filterbank energies per frame (25 ms Hamming window, 10 ms step,
# 2048-point FFT) and save them under
# <acoustic_features_path>/<session>/<dialog>/<name>.npy.
cnt = 0
for session in os.listdir(audio_path):
    wav_root = os.path.join(audio_path, session, 'sentences/wav')
    for dialog in os.listdir(wav_root):
        if 'Ses' not in dialog:
            continue
        dialog_dir = os.path.join(wav_root, dialog)
        for audio in os.listdir(dialog_dir):
            if not audio.endswith('.wav'):
                continue
            input_path = os.path.join(dialog_dir, audio)
            (rate, sig) = wav.read(input_path)
            feat, energy = fbank(sig, samplerate=rate, winlen=0.025,
                                 winstep=0.01, nfilt=40, nfft=2048,
                                 winfunc=np.hamming)
            output_file = os.path.join(acoustic_features_path, session,
                                       dialog)
            os.makedirs(output_file, exist_ok=True)
            np.save(os.path.join(output_file, audio[:-4]), feat)
            cnt += 1
            # progress report every 200 files
            if cnt % 200 == 0:
                print(cnt)
print(cnt)  # 10039
def wav2feature(wav_paths, feature_type='logfbank', feature_dim=40,
                energy=True, delta1=True, delta2=True):
    """Read wav files & convert to MFCC or log mel filterbank features.

    Every file is read and featurised on its own, normalised with its
    own global mean / standard deviation, and zero-padded at the end to
    the longest utterance in the batch.

    Args:
        wav_paths (list): paths to wav files
        feature_type (string, optional): logmelfbank or logfbank or
            fbank or mfcc; 'logmelfbank' is treated like 'logfbank'
        feature_dim (int, optional): the dimension of each feature
        energy (bool, optional): if True, add energy
        delta1 (bool, optional): if True, add delta features
        delta2 (bool, optional): if True, add delta delta features
            (implies delta1)
    Returns:
        inputs: A tensor of size `[B, T, input_size]`, or None when
            `wav_paths` is empty
        inputs_seq_len: A tensor of size `[B]` with the frame count of
            each utterance
    Raises:
        ValueError: on an unknown `feature_type` or a non-list
            `wav_paths`
    """
    if feature_type not in ['logmelfbank', 'logfbank', 'fbank', 'mfcc']:
        raise ValueError(
            'feature_type is "logmelfbank" or "logfbank" or "fbank" or "mfcc".')
    if not isinstance(wav_paths, list):
        raise ValueError('wav_paths must be a list.')
    if delta2 and not delta1:
        delta1 = True

    batch_size = len(wav_paths)
    feats = []
    for wav_path in wav_paths:
        # Read each wav file here so every batch item gets its own audio.
        fs, audio = scipy.io.wavfile.read(wav_path)

        if feature_type == 'mfcc':
            # NOTE(review): mfcc keeps its default filter count here, so
            # a large feature_dim may yield fewer coefficients - confirm.
            feat = mfcc(audio, samplerate=fs, numcep=feature_dim)
            if energy:
                energy_feat = fbank(audio, samplerate=fs,
                                    nfilt=feature_dim)[1]
                feat = np.c_[feat, energy_feat]
        else:
            fbank_feat, energy_feat = fbank(audio, samplerate=fs,
                                            nfilt=feature_dim)
            if feature_type in ('logfbank', 'logmelfbank'):
                fbank_feat = np.log(fbank_feat)
            feat = fbank_feat
            if energy:
                # energy is appended as returned by fbank (not log-scaled)
                feat = np.c_[feat, energy_feat]

        if delta2:
            delta1_feat = _delta(feat, N=2)
            delta2_feat = _delta(delta1_feat, N=2)
            feat = np.c_[feat, delta1_feat, delta2_feat]
        elif delta1:
            delta1_feat = _delta(feat, N=2)
            feat = np.c_[feat, delta1_feat]

        # Normalize per wav
        feats.append((feat - np.mean(feat)) / np.std(feat))

    inputs_seq_len = np.zeros((batch_size,), dtype=np.int32)
    if not feats:
        return None, inputs_seq_len

    max_time = max(len(feat) for feat in feats)
    input_size = feats[0].shape[-1]
    inputs = np.zeros((batch_size, max_time, input_size))
    for i, feat in enumerate(feats):
        # Shorter utterances stay zero-padded at the end.
        inputs[i, :len(feat)] = feat
        inputs_seq_len[i] = len(feat)

    return inputs, inputs_seq_len
return (s[-1][0], s[-2], s[-1][1] ) # BH/1A_endpt.wav: sort by '1', 'BH', 'A' filelist.sort(key=keyfunc) for i, file in enumerate(filelist): rate, sig = wav.read(file) duration = sig.size / rate winlen = duration / (n_frames * (1 - overlap) + overlap) winstep = winlen * (1 - overlap) feat, energy = fbank(sig, rate, winlen, winstep, nfilt=n_bands, nfft=4096, winfunc=np.hamming) feat = np.log(feat) feat = feat.transpose() # plt.subplot(131) plt.imshow(feat) plt.axis('off') # feat2 = feat.copy() # feat2[feat2 < 4] = 0 # plt.subplot(132) # plt.imshow(feat2)
def get_features(filename, numcep, numfilt, winlen, winstep, method=1,
                 quaternion=False):
    """Compute normalised cepstral features with stacked gradients.

    The base matrix holds the first ``numcep`` liftered DCT coefficients
    of the log filterbank energies (mean-centred, divided by half the
    standard deviation per column) followed by the log frame energy.

    :param method: if truthy, the 1st, 2nd and 3rd order gradients along
        the frame axis are appended; otherwise a zero block is appended
        first and then the 1st and 2nd order gradients of that widened
        matrix.
    :param quaternion: if true, the result is reshaped to
        ``(frames, 4, columns // 4)``.
    :returns: ``(mat, data, samplerate)`` - the features, the raw
        samples read from the wav file and its sample rate.
    """
    samplerate, data = wav.read(filename)

    # Calculate mfcc
    feat_raw, energy = sf.fbank(data, samplerate, winlen, winstep,
                                nfilt=numfilt)
    cepstra = sf.dct(np.log(feat_raw), type=2, axis=1, norm='ortho')
    cepstra = np.asarray(sf.lifter(cepstra[:, :numcep], L=22))

    # calc log energy
    log_energy = np.log(energy)
    log_energy = log_energy.reshape([log_energy.shape[0], 1])

    base = (cepstra - np.mean(cepstra, axis=0)) \
        / (0.5 * np.std(cepstra, axis=0))
    base = np.concatenate((base, log_energy), axis=1)

    # np.gradient(...)[0] is the gradient along axis 0 (frames).
    if method:
        # 1st, 2nd and 3rd order derivatives
        grad1 = np.gradient(base)[0]
        grad2 = np.gradient(grad1)[0]
        grad3 = np.gradient(grad2)[0]
        pieces = (base, grad1, grad2, grad3)
    else:
        padded = np.concatenate((base, np.zeros(shape=base.shape)), axis=1)
        grad1 = np.gradient(padded)[0]
        grad2 = np.gradient(grad1)[0]
        pieces = (padded, grad1, grad2)
    mat = np.concatenate(pieces, axis=1)

    if quaternion:
        mat = np.reshape(mat, (mat.shape[0], 4, mat.shape[1] // 4))

    return mat, data, samplerate
#coding=utf8
from python_speech_features import fbank
from python_speech_features import logfbank
import scipy.io.wavfile as wav

# Small demo: read one wav file and print the shape of its filterbank
# features. print is always called with parentheses so the script also
# parses under Python 3.
path = '/home/sw/Shin/Codes/DL4SS_Keras/Data_with_dev/male_test.wav'
(rate, sig) = wav.read(path)
print(rate, sig)
print(sig.shape)  # 43520

fbank_feat = fbank(sig, rate, winstep=0.01, nfilt=40)
# fbank returns (features, energy); index 0 is the feature matrix.
# The 271 result is obtained like this: 43520 / (0.01 s * 16000)
print(fbank_feat[0].shape)
#aa,bb,cc,dd, plt = get_spectrogram(new_file_name_path)
fd = 2048
fs = 1024
f_size = fd * fs

(rate, sig) = wav.read(new_file_name_path)
x_brahms, sr_brahms = librosa.load(file, duration=30, offset=30)

mfcc_feat = mfcc(sig, samplerate=rate)  # (2992, 13)
# NOTE(review): interactive breakpoint left in place - confirm whether
# it should still be here.
ipdb.set_trace()
# Flatten to a single column; this name is printed below and was
# previously never assigned (NameError). -1 avoids hard-coding the
# 38896 (= 2992 * 13) length of one particular file.
mfcc_one_line = mfcc_feat.reshape(-1, 1)

fbank_feat = fbank(sig, samplerate=rate)
logfbank_feat = logfbank(sig, samplerate=rate)
d_mfcc_feat = delta(mfcc_feat, 2)

#gammatone.gtgram.gtgram(wave, fs, window_time, hop_time, channels, f_min)
gtgram_function = gtgram.gtgram(sig, rate, .250, .125, 1, 20)

print("mfcc_feat.shape:", mfcc_feat.shape)
print("mfcc_one_line.shape", mfcc_one_line.shape)
print("logfbank_feat.shape", logfbank_feat.shape)
print("d_mfcc_feat.shape", d_mfcc_feat.shape)
print("gtgram_function.shape", gtgram_function.shape)
print("gtgram_function.shape.T", gtgram_function.T.shape)

#ssc = ssc(sig,samplerate=rate)
#print(logfbank_feat[1:3,:])
def _read_audio_via_sox(wav_path):
    """Convert ``wav_path`` to wav with sox in a private temp file and read it."""
    import os
    import tempfile

    # A unique temp file avoids clashes between concurrent runs that a
    # fixed path under /tmp would cause.
    handle, tmp_path = tempfile.mkstemp(suffix='.wav')
    os.close(handle)
    try:
        result = subprocess.call(['sox', wav_path, '-t', 'wav', tmp_path])
        if result != 0:
            raise ValueError(
                'sox could not convert {!r} to wav.'.format(wav_path))
        return scipy.io.wavfile.read(tmp_path)
    finally:
        # Always remove the temp file, even when the conversion failed.
        os.remove(tmp_path)


def wav2feature(wav_path, feature_type='fbank', feature_dim=40,
                use_energy=True, use_delta1=True, use_delta2=True,
                window=0.025, slide=0.01, dtype=np.float32):
    """Read wav file & convert to MFCC or log mel filterbank features.

    Args:
        wav_path (string): the path to a wav file; files scipy cannot
            read are converted with sox first (e.g. NIST files)
        feature_type (string, optional): fbank or mfcc; 'fbank' returns
            log filterbank energies
        feature_dim (int, optional): the dimension of each feature
        use_energy (bool, optional): if True, add energy (as returned
            by fbank, not log-scaled)
        use_delta1 (bool, optional): if True, add delta features
        use_delta2 (bool, optional): if True, add delta and delta delta
            features
        window (float, optional): window width to extract features
        slide (float, optional): extract features per 'slide'
        dtype (optional): accepted for interface compatibility; the
            result is not cast to it
    Returns:
        feat (np.ndarray): A tensor of size `[T, input_size]`
    Raises:
        ValueError: on an unknown `feature_type`, or when the file can
            be read neither directly nor through sox
    """
    if feature_type not in ['fbank', 'mfcc']:
        raise ValueError('feature_type is or "fbank" or "mfcc".')

    # Read wav file
    try:
        fs, audio = scipy.io.wavfile.read(wav_path)
    except ValueError:
        # Read NIST file
        fs, audio = _read_audio_via_sox(wav_path)

    if feature_type == 'mfcc':
        # Same window / step in both calls keeps the frame counts aligned.
        feat = mfcc(audio, samplerate=fs, winlen=window, winstep=slide,
                    numcep=feature_dim)
        if use_energy:
            # NOTE: only fbank function returns energy
            energy_feat = fbank(audio, samplerate=fs, winlen=window,
                                winstep=slide, nfilt=feature_dim)[1]
            energy_feat = energy_feat.reshape(-1, 1)
            feat = np.concatenate((feat, energy_feat), axis=1)
    else:
        fbank_feat, energy_feat = fbank(audio, samplerate=fs,
                                        winlen=window, winstep=slide,
                                        nfilt=feature_dim, nfft=512,
                                        lowfreq=0, highfreq=None,
                                        preemph=0.97, winfunc=np.hamming)
        feat = np.log(fbank_feat)
        if use_energy:
            # NOTE: energy_feat may be not log-scale.
            energy_feat = energy_feat.reshape(-1, 1)
            feat = np.concatenate((feat, energy_feat), axis=1)

    if use_delta2:
        delta1_feat = _delta(feat, N=2)
        delta2_feat = _delta(delta1_feat, N=2)
        feat = np.concatenate((feat, delta1_feat, delta2_feat), axis=1)
    elif use_delta1:
        # The old code tested an unbound name here whenever use_delta2
        # was False.
        delta1_feat = _delta(feat, N=2)
        feat = np.concatenate((feat, delta1_feat), axis=1)

    return feat
def logfbank(signal, samplerate=16000, winlen=0.025, winstep=0.01,
             nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97):
    """Return ``(log filterbank energies, frame energy)`` for ``signal``.

    All arguments are passed positionally to ``fbank``; only the
    filterbank energies are log-scaled, the energy is returned as is.
    """
    fbank_args = (signal, samplerate, winlen, winstep, nfilt, nfft,
                  lowfreq, highfreq, preemph)
    energies, frame_energy = fbank(*fbank_args)
    log_energies = np.log(energies)
    return log_energies, frame_energy
def _fbank(*args, **kwargs) -> np.ndarray:
    """Call ``fbank`` with the given arguments and keep only its first result."""
    features, _energy = fbank(*args, **kwargs)
    return features
def make_features(file_path: str, **kwargs) -> np.ndarray:
    """
    Use `python_speech_features` lib to extract log filterbank energies
    from the audio file.

    Extra keyword arguments are forwarded to `fbank`; the file's own
    sample rate is always used.
    """
    sample_rate, samples = wav.read(file_path)
    energies, _frame_energy = python_speech_features.fbank(
        samples, samplerate=sample_rate, **kwargs)
    return np.log(energies)
def get_fbank(signal, target_sample_rate):
    """Return normalised 40-filter filterbank features as an array.

    The FFT size is set to ``int(target_sample_rate * 0.025)``; the
    result of ``fbank`` is passed through ``normalize_frames``.
    """
    fft_size = int(target_sample_rate * 0.025)
    energies, _frame_energy = fbank(signal, samplerate=target_sample_rate,
                                    nfilt=40, nfft=fft_size)
    normalised = normalize_frames(energies)
    return np.array(normalised)