def create_mfcc(filename: str) -> np.ndarray:
    # scipy's wav.read returns (sample_rate, data); the first value is a sample rate, not a bitrate
    sample_rate, signal = wav.read(filename)
    mfcc_data = mfcc(signal, sample_rate,
                     numcep=lingua_franca_config.num_cepstra, nfft=1200)
    return mfcc_data
def mfcc_features(
        wavarr,
        win_len=5,        # window length for feature extraction in secs - run_orig.m
        win_overlap=0,    # overlap between adjacent windows for feature extraction, in percent - run_orig.m
        nfft=0,
        lowfreq=5,
        highfreq=1000,
        kDelta=False,
        logging=False):
    # rate, aud_data = scipy.io.wavfile.read(file)
    rate = wavarr[0]
    signal = wavarr[1]
    d_mfcc_feat = None
    if nfft == 0:
        nfft = fft.calculate_nfft(signal.size)  # FFT size: the padded next power of two
    mfcc_feat = base.mfcc(signal, rate,
                          winlen=win_len,                 # window_length*1000 in extractFeatures.m
                          winstep=win_len - win_overlap,  # 10ms shift; Ts = 10 in extractFeatures.m
                          numcep=13,                      # C=12; in extractFeatures.m
                          nfilt=20,                       # M=20; in extractFeatures.m
                          nfft=nfft,                      # pad to next power of two
                          lowfreq=lowfreq,                # LF=5; in extractFeatures.m
                          highfreq=highfreq,              # HF=1000; in extractFeatures.m
                          preemph=0.97,                   # alpha=0.97; in extractFeatures.m
                          ceplifter=22,                   # L=22; in extractFeatures.m
                          winfunc=np.hamming,             # @hamming
                          appendEnergy=False)             # keep c0; do not replace it with log frame energy
    if kDelta:
        d_mfcc_feat = base.delta(mfcc_feat, 2)  # delta features from the cepstral feature vector
    # fbank_feat = sigproc.logfbank(signal, rate)  # log Mel-filterbank energy features
    return mfcc_feat, d_mfcc_feat
def get_mfcc_pca(sample_rate, signal, num_components):
    '''
    Returns the N largest principal components of the input multivariate
    time series.
    Required input format: each time series arranged in a column vector.
    '''
    mfccs = mfcc(signal, samplerate=sample_rate, appendEnergy=False)
    pca = PCA(n_components=num_components)
    pca.fit(mfccs)
    components = pca.components_  # each row is a component
    return components.flatten()
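# A minimal usage sketch for get_mfcc_pca (hypothetical file name; assumes a
# mono wav readable by scipy.io.wavfile). With the python_speech_features
# default numcep=13, the result is a flat vector of num_components * 13
# PCA loadings.
from scipy.io import wavfile

rate, signal = wavfile.read('example.wav')            # assumed mono recording
flat = get_mfcc_pca(rate, signal, num_components=3)
print(flat.shape)                                     # (39,) = 3 components * 13 cepstra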
# This wrapper shadows the library function of the same name, so the library
# mfcc must be imported under an alias; calling mfcc() inside the body would
# otherwise recurse forever (psf_mfcc is an assumed alias name).
from python_speech_features import mfcc as psf_mfcc, delta


def mfcc(signal, rate=default_rate, filters_number=default_filters_number,
         augmented=default_augmented):
    mfcc_features = psf_mfcc(signal, rate, numcep=filters_number)
    if not augmented:
        return mfcc_features
    # first- and second-order time derivatives of the cepstra
    d_mfcc_features = delta(mfcc_features, 2)
    a_mfcc_features = delta(d_mfcc_features, 2)
    concatenated_features = np.concatenate(
        (mfcc_features, d_mfcc_features, a_mfcc_features), axis=1)
    return concatenated_features
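# Minimal usage sketch for the wrapper above (synthetic audio; default_rate
# and default_filters_number are assumed to be 16000 and 13, which the
# snippet does not show). With augmented=True the output stacks cepstra,
# deltas and delta-deltas: 3 * 13 = 39 columns per frame.
import numpy as np

signal = np.random.randn(16000)                       # one second of noise
feats = mfcc(signal, rate=16000, filters_number=13, augmented=True)
print(feats.shape)                                    # (num_frames, 39)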
def extract_feature(wav_path):
    """Extract 39-dim mfcc feature (13 cepstra + deltas + delta-deltas)."""
    fs, audio = wav.read(wav_path)
    mfcc = base.mfcc(audio, fs, winlen=0.025, winstep=0.01, numcep=13,
                     nfilt=26, preemph=0.97, appendEnergy=True)
    mfcc_d = base.delta(mfcc, N=2)
    mfcc_dd = base.delta(mfcc_d, N=2)
    feat = np.concatenate([mfcc, mfcc_d, mfcc_dd], axis=1)
    return feat
def Features(self, data, rate, dim):
    # spectral statistics from the normalised magnitude spectrum
    spec = np.abs(np.fft.rfft(data))
    freq = np.fft.rfftfreq(len(data), d=1 / dim)
    a = spec / spec.sum()
    meaN = (freq * a).sum()
    std = np.sqrt(np.sum(a * ((freq - meaN) ** 2)))
    a_cumsum = np.cumsum(a)
    mediaN = freq[len(a_cumsum[a_cumsum <= 0.5])]
    modE = freq[a.argmax()]
    q25 = freq[len(a_cumsum[a_cumsum <= 0.25])]
    q75 = freq[len(a_cumsum[a_cumsum <= 0.75])]
    IQR = q75 - q25
    z = a - a.mean()
    w = a.std()
    skewnesS = ((z ** 3).sum() / (len(spec) - 1)) / w ** 3
    kurtosiS = ((z ** 4).sum() / (len(spec) - 1)) / w ** 4

    # python_speech_features descriptors
    m = speech.mfcc(data, rate)
    f = speech.fbank(data, rate)
    l = speech.logfbank(data, rate)
    s = speech.ssc(data, rate)

    # time-domain statistics via pandas
    data = pd.DataFrame(data)
    desc = data.describe()
    mean = desc.loc["mean"].get(0)
    mad = data.mad().get(0)
    sd = desc.loc["std"].get(0)
    median = data.median().get(0)
    minimum = desc.loc["min"].get(0)
    maximum = desc.loc["max"].get(0)
    Q25 = desc.loc["25%"].get(0)
    Q75 = desc.loc["75%"].get(0)
    interquartileR = Q75 - Q25
    skewness = data.skew().get(0)
    kurtosis = data.kurtosis().get(0)

    result = {
        "Mean": mean, "Mad": mad, "deviation": sd, "Median": median,
        "Min": minimum, "Max": maximum, "interquartileR": interquartileR,
        "Skewness": skewness, "Q25": Q25, "Q75": Q75, "Kurtosis": kurtosis,
        "mfcc_mean": np.mean(m), "mfcc_max": np.max(m), "mfcc_min": np.min(m),
        "fbank_mean": np.mean(f[0]), "fbank_max": np.max(f[0]), "fbank_min": np.min(f[0]),
        "energy_mean": np.mean(f[1]), "energy_max": np.max(f[1]), "energy_min": np.min(f[1]),
        "lfbank_mean": np.mean(l), "lfbank_max": np.max(l), "lfbank_min": np.min(l),
        "ssc_mean": np.mean(s), "ssc_max": np.max(s), "ssc_min": np.min(s),
        "meaN": meaN, "deviatioN": std, "mediaN": mediaN, "modE": modE,
        "IQR": IQR, "skewnesS": skewnesS, "q25": q25, "q75": q75,
        "kurtosiS": kurtosiS}
    return result
def audio_read(datafs):
    # scipy.io.wavfile.read returns (sample_rate, samples); the original
    # unpacked them in swapped order under misleading names
    rate, signal = wav.read(datafs)
    # NOTE: both feature calls assume 16 kHz audio (the library default);
    # pass samplerate=rate instead if the input files differ
    ceps = mfcc(signal, numcep=cepCount)
    feat2 = ssc(signal, samplerate=16000, winlen=0.025, winstep=0.01,
                nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97)

    def summarize(mat):
        # per-coefficient statistics over time, plus the mean of each half
        stats = []
        for i in range(mat.shape[1]):
            col = mat[:, i]
            dcol = np.gradient(col)
            half = col.shape[0] // 2  # integer division (a float index raises in Python 3)
            stats.append(np.array([
                np.mean(col), np.var(col), np.amax(col), np.amin(col),
                np.var(dcol), np.mean(col[0:half]), np.mean(col[half:])
            ]))
        return np.array(stats).flatten()

    source = summarize(ceps)
    source = np.append(source, summarize(feat2))
    return source
def get_mfcc(x):
    # 12 cepstra + 1 log filterbank energy = 13 features per frame
    y = np.concatenate([
        mfcc(x, numcep=12, winlen=0.01, winstep=0.005),
        logfbank(x, nfilt=1, winlen=0.01, winstep=0.005)
    ], axis=-1)
    # central-difference time derivatives, zero-padded at the edges
    derivatives = []
    previousf = np.zeros((13,))
    for i in range(len(y)):
        if (i + 1) == len(y):
            nextf = np.zeros((13,))
        else:
            nextf = y[i + 1]
        derivatives.append(((nextf - previousf) / 2).reshape((1, 13)))
        previousf = y[i]
    derivatives = np.concatenate(derivatives, axis=0)
    y = np.concatenate([y, derivatives], axis=1)
    return y
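# Aside: the hand-rolled loop above is, up to edge handling, the N=1 delta
# from python_speech_features; base.delta pads by repeating the edge frames
# rather than with zeros, so only the first and last rows differ. A sketch:
import numpy as np
from python_speech_features import base

feat = np.random.randn(100, 13)            # synthetic feature matrix
lib_delta = base.delta(feat, 1)            # (feat[t+1] - feat[t-1]) / 2, edge-padded
manual = (np.vstack([feat[1:], np.zeros((1, 13))])
          - np.vstack([np.zeros((1, 13)), feat[:-1]])) / 2
print(np.allclose(lib_delta[1:-1], manual[1:-1]))  # True: interior frames match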
def mel(self, name):
    y, sr = librosa.load(name, sr=None)
    y = self.norm(y)
    # plt.figure()
    # plt.plot([x for x in range(y.shape[0])], y)
    # plt.show()
    zero, ener = self.get_feature(y)
    new_y = self.detect(y, zero, ener, name)
    mfcc_feature = mfcc(signal=new_y,
                        samplerate=sr,
                        winlen=self.len_frame,
                        winstep=(1 - self.ratio) * self.len_frame,
                        numcep=self.n_mfcc,
                        nfilt=26,
                        nfft=2000,
                        winfunc=np.hamming)
    # plt.matshow(mfcc_feature)
    # plt.show()
    return mfcc_feature
def compute_mfcc(wav_path, winstep=0.01):
    (rate, sig) = wav.read(wav_path)
    mfcc_feat = mfcc(signal=sig, samplerate=rate, appendEnergy=True,
                     winstep=winstep)
    # deltas
    d_mfcc_feat = delta(mfcc_feat, 2)
    # delta-deltas
    dd_mfcc_feat = delta(d_mfcc_feat, 2)
    # transpose to (coefficients, frames)
    mfcc_feat = np.transpose(mfcc_feat)
    d_mfcc_feat = np.transpose(d_mfcc_feat)
    dd_mfcc_feat = np.transpose(dd_mfcc_feat)
    # concat the three feature blocks along the coefficient axis
    concat_mfcc_feat = np.concatenate((mfcc_feat, d_mfcc_feat, dd_mfcc_feat))
    return concat_mfcc_feat
def extract_mfcc(wave_files, encoded_labels, files_destination,
                 labels_destination, mfcc_type):
    labels_df = pd.DataFrame(columns=['file', 'label'])
    files_num = len(wave_files)
    for i, (wave_file, label) in enumerate(zip(wave_files, encoded_labels)):
        wave_file_name = wave_file.split('/')[-1]
        mfcc_file_path = files_destination + wave_file_name.split('.')[0] + '.npy'
        print('{}/{}\t{}'.format(i + 1, files_num, wave_file_name))
        wave_data, sample_rate = sf.read(wave_file)
        # compute mfcc
        if mfcc_type == 'cnn':
            mfcc = librosa.feature.mfcc(wave_data, sr=sample_rate)
        elif mfcc_type == 'rnn':
            mfcc = base.mfcc(wave_data, samplerate=sample_rate, numcep=13,
                             winstep=0.01, winfunc=np.hamming)
        deltas = base.delta(mfcc, 2)
        # normalize mfcc over all frames
        mfcc_mean = np.mean(mfcc, axis=0)
        mfcc_std = np.std(mfcc, axis=0)
        mfcc = (mfcc - mfcc_mean) / mfcc_std
        # normalize deltas over all frames
        delta_mean = np.mean(deltas, axis=0)
        delta_std = np.std(deltas, axis=0)
        deltas = (deltas - delta_mean) / delta_std
        # save mfcc
        np.save(mfcc_file_path, np.concatenate((mfcc, deltas), axis=1),
                allow_pickle=False)
        labels_df.loc[i] = [wave_file_name, label]
    labels_df.to_csv(labels_destination, sep='\t', index=False)
def get_mfcc(x):
    y = np.concatenate([
        mfcc(x, numcep=12, winlen=0.01, winstep=0.005),
        logfbank(x, nfilt=1, winlen=0.01, winstep=0.005)
    ], axis=-1)
    derivatives = []
    previousf = np.zeros((13,))
    for i in range(len(y)):
        if (i + 1) == len(y):
            nextf = np.zeros((13,))
        else:
            nextf = y[i + 1]
        derivatives.append(((nextf - previousf) / 2).reshape((1, 13)))
        previousf = y[i]
    derivatives = np.concatenate(derivatives, axis=0)
    y = np.concatenate([y, derivatives], axis=1)
    # additive Gaussian noise for augmentation; zero-pad both to a fixed length
    ynoise = np.random.normal(0, 0.6, y.shape)
    orig_len = len(y)
    pad = [np.zeros((1, 26))] * (3150 - y.shape[0])
    ypad = np.concatenate([y] + pad, axis=0)
    noisepad = np.concatenate([ynoise] + pad, axis=0)
    return orig_len, ypad, ypad + noisepad
def get_mfcc(sample_rate, signal):
    '''
    Returns Mel Frequency Cepstral Coefficients.
    Provides information about the sinusoids that constitute the sound wave,
    adjusted to account for the way humans perceive sound.
    '''
    mfccs = mfcc(signal, samplerate=sample_rate, appendEnergy=False)
    mfcc_cov = np.cov(mfccs.T)
    dim = mfcc_cov.shape[0]
    # Get means
    mean = mfccs.mean(axis=0)
    # Get variances (i.e. diagonal of covariance matrix)
    var_mask = np.nonzero(np.eye(dim))
    var = mfcc_cov[var_mask]
    # Get off-diagonal covariances
    cov_mask = np.nonzero(np.tri(dim) - np.eye(dim))
    cov = mfcc_cov[cov_mask]
    # NOTE: librosa also provides an MFCC function, but I believe it
    # requires passing as input some complicated information
    return mean, var, cov
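# Minimal usage sketch: the three returned pieces form a compact Gaussian
# fingerprint of a clip. With the default numcep=13 that is 13 means +
# 13 variances + 78 off-diagonal covariances = 104 values. Synthetic input;
# the shapes are the point here, not the audio.
import numpy as np

signal = np.random.randn(16000)                      # 1 s of fake 16 kHz audio
mean, var, cov = get_mfcc(16000, signal)
fingerprint = np.concatenate([mean, var, cov])
print(mean.shape, var.shape, cov.shape, fingerprint.shape)
# (13,) (13,) (78,) (104,)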
window_size = int(fs * params['t_window'])
# fft size: the smallest power of two that covers the window
exp = 1
while True:
    if np.power(2, exp) - window_size >= 0:
        fft_size = np.power(2, exp)
        break
    else:
        exp += 1

prime_features = base.mfcc(prime_data[1], samplerate=fs,
                           winlen=params['t_window'],
                           winstep=params['t_shift'],
                           numcep=params['ncep'],
                           nfilt=params['nfilters'],
                           nfft=fft_size, lowfreq=0, highfreq=None,
                           preemph=params['alpha'], ceplifter=0,
                           appendEnergy=params['use_energy'],
                           winfunc=params['windowing'])
target_features = base.mfcc(target_data[1], samplerate=fs,
                            winlen=params['t_window'],
                            winstep=params['t_shift'],
                            numcep=params['ncep'],
                            nfilt=params['nfilters'],
                            nfft=fft_size, lowfreq=0, highfreq=None,
                            preemph=params['alpha'], ceplifter=0,
                            appendEnergy=params['use_energy'],
                            winfunc=params['windowing'])
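# Aside: the power-of-two search above can be written without a loop using
# int.bit_length; equivalent to the loop for any window_size >= 2 (the loop
# starts at exp=1, so its smallest possible result is 2). A sketch:
def next_pow2(n: int) -> int:
    """Smallest power of two >= n."""
    return 1 << max(n - 1, 0).bit_length()

assert next_pow2(400) == 512 and next_pow2(512) == 512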
# coding: utf-8
from pydub.audio_segment import AudioSegment  # pydub: a Python library for processing audio files
from scipy.io import wavfile
from python_speech_features.base import mfcc  # Fourier transform + Mel cepstrum
import pandas as pd
import numpy as np
import sys

# mfcc involves two steps: a Fourier transform, then the Mel-cepstral coefficients
song = AudioSegment.from_file('./data/灰姑娘.mp3', format='mp3')  # load the song
# song_split = song[-30 * 1000:]  # take the last 30 seconds
song.export('./data/灰姑娘.wav', format='wav')  # convert MP3 to wav
rate, data = wavfile.read('./data/灰姑娘.wav')  # sample rate and sample data
mf_feat = mfcc(data, rate, numcep=13, nfft=2048)  # frames per second for the Fourier transform
# numcep = 13; larger is slower
# 108 keys; below 1/4 sounds cheerful, above 1/4 sounds sad
print(mf_feat)
print(mf_feat.shape)
sys.exit(0)  # NOTE: everything below is unreachable while this early exit is in place
# df = pd.DataFrame(mf_feat)
# df.to_csv('./mfFeat.csv')
# print(mf_feat)
# print(mf_feat.shape)
mm = np.mean(mf_feat, axis=0)  # implicitly captures correlation over time
mf = np.transpose(mf_feat)
mc = np.cov(mf)  # covariance matrix of the columns of the original mf_feat matrix
# print(mc)
result = mm
def calc_mfcc(pathname):
    samprate, samples = wavfile.read(pathname)
    return mfcc(samples, samplerate=samprate, appendEnergy=False)
def audio_features(params, img_audio, audio_path, append_name, node_list):
    output_file = params['output_file']
    # create pytable atom for the features
    f_atom = tables.Float32Atom()
    count = 1
    # keep track of the nodes for which no features could be made; the places
    # database contains some empty audio files
    invalid = []
    for node in node_list:
        print(f'processing file: {count}')
        count += 1
        # create a group for the desired feature type
        audio_node = output_file.create_group(node, params['feat'])
        # get the base name of the node this feature will be appended to
        base_name = node._v_name.split(append_name)[1]
        # get the caption file names corresponding to the image of this node
        caption_files = img_audio[base_name][1]
        for cap in caption_files:
            # remove extension from the caption filename
            base_capt = cap.split('.')[0]
            # remove folder path from file names (Places/coco database)
            if '/' in base_capt:
                base_capt = base_capt.split('/')[-1]
            if '-' in base_capt:
                base_capt = base_capt.replace('-', '_')
            # read audio samples
            try:
                input_data, fs = librosa.load(os.path.join(audio_path, cap),
                                              sr=None)
                # in the places database some of the audio files are empty
                if len(input_data) == 0:
                    break
            except:
                # try to repair broken files; some files had a wrong header.
                # In Places I found some that could not be fixed however
                try:
                    fix_wav(os.path.join(audio_path, cap))
                    # input_data = read(os.path.join(audio_path, cap))
                    # NOTE: the repaired file is not re-read here (the read
                    # was left commented out), so input_data stays undefined
                except:
                    # the loop will break; if no valid audio features could
                    # be made for this image, the entire node is deleted
                    break
            # set the fft size to the power of two equal to or greater than
            # the window size
            window_size = int(fs * params['t_window'])
            exp = 1
            while True:
                if np.power(2, exp) - window_size >= 0:
                    fft_size = np.power(2, exp)
                    break
                else:
                    exp += 1

            # create audio features
            if params['feat'] == 'raw':
                # calculate the needed frame shift, preemphasize and frame
                # the signal
                frame_shift = int(fs * params['t_shift'])
                input = sigproc.preemphasis(input_data, coeff=params['alpha'])
                # frame the preemphasised signal (the original framed
                # input_data, leaving the preemphasis unused)
                features = sigproc.framesig(input, frame_len=window_size,
                                            frame_step=frame_shift,
                                            winfunc=params['windowing'])
            elif params['feat'] == 'freq_spectrum':
                # calculate the needed frame shift, preemphasize and frame
                # the signal
                frame_shift = int(fs * params['t_shift'])
                input = sigproc.preemphasis(input_data, coeff=params['alpha'])
                frames = sigproc.framesig(input, frame_len=window_size,
                                          frame_step=frame_shift,
                                          winfunc=params['windowing'])
                # create the power spectrum
                features = sigproc.powspec(frames, fft_size)
            elif params['feat'] == 'fbanks':
                # create mel filterbank features
                [features, energy] = base.fbank(input_data, samplerate=fs,
                                                winlen=params['t_window'],
                                                winstep=params['t_shift'],
                                                nfilt=params['nfilters'],
                                                nfft=fft_size, lowfreq=0,
                                                highfreq=None,
                                                preemph=params['alpha'],
                                                winfunc=params['windowing'])
            elif params['feat'] == 'mfcc':
                # create mfcc features
                features = base.mfcc(input_data, samplerate=fs,
                                     winlen=params['t_window'],
                                     winstep=params['t_shift'],
                                     numcep=params['ncep'],
                                     nfilt=params['nfilters'],
                                     nfft=fft_size, lowfreq=0, highfreq=None,
                                     preemph=params['alpha'], ceplifter=0,
                                     appendEnergy=params['use_energy'],
                                     winfunc=params['windowing'])
            # apply cepstral mean variance normalisation
            if params['normalise']:
                features = (features - features.mean(0)) / features.std(0)
            # optionally add the deltas and double deltas
            if params['use_deltas']:
                single_delta = base.delta(features, params['delta_n'])
                double_delta = base.delta(single_delta, params['delta_n'])
                features = np.concatenate(
                    [features, single_delta, double_delta], 1)

            # create a new leaf node in the feature node for the current
            # audio file
            feature_shape = np.shape(features)[1]
            f_table = output_file.create_earray(audio_node,
                                                append_name + base_capt,
                                                f_atom, (0, feature_shape),
                                                expectedrows=5000)
            # append new data to the tables
            f_table.append(features)
        if audio_node._f_list_nodes() == []:
            # keep track of all the invalid nodes for which no features could
            # be made
            invalid.append(node._v_name)
            # remove the top node including all other features if no caption
            # features could be created
            output_file.remove_node(node, recursive=True)
    print(invalid)
    print(f'There were {len(invalid)} files that could not be processed')
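# A hypothetical params dict for audio_features, assembled from the keys the
# function reads above; the file name and all values are illustrative
# assumptions, not part of the original snippet.
import numpy as np
import tables

output_file = tables.open_file('features.h5', mode='a')
params = {
    'output_file': output_file,   # open pytables file the features are written to
    'feat': 'mfcc',               # one of 'raw', 'freq_spectrum', 'fbanks', 'mfcc'
    't_window': 0.025,            # analysis window length in seconds
    't_shift': 0.010,             # window shift in seconds
    'ncep': 13,                   # number of cepstral coefficients
    'nfilters': 26,               # mel filterbank size
    'alpha': 0.97,                # preemphasis coefficient
    'use_energy': True,           # replace c0 with log frame energy
    'windowing': np.hamming,      # window function
    'normalise': True,            # cepstral mean/variance normalisation
    'use_deltas': True,           # append deltas and double deltas
    'delta_n': 2,                 # delta window size
}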
def calcAcousticFeatures(sound, fs, featureMode, speakerType='male',
                         tmpDir=".", speech_sound_type='vowel',
                         octave_binding=None):
    """
    Calculates acoustic features with the given featureMode for a sound with
    audio sampling rate fs.
    sound: 1D np.array or a 2D array containing a single row vector.
    Returns a tuple containing a 2D np.array of numTimeSteps x
    numFeatureParams and a 2 x numFeatureParams array of scaling factors
    that can be used to ensure equal contribution of each feature type.
    """
    # how many Hz the first formant may change within mergeFactor ms; only
    # little variation is allowed for vowels
    if speech_sound_type == 'vowel':
        maxFormantChange = 50
    elif speech_sound_type == 'syllable':
        maxFormantChange = 800

    if np.ndim(sound) == 2:
        sound = sound[0, :]

    if featureMode == 'formants':
        formants = getPraatFormantsMean(sound, fs, speakerType, tmpDir)
        return (np.array(formants).reshape((1, -1)), None)

    elif featureMode == 'formants_full':
        (timePos, formants) = getPraatFormants(sound, fs, speakerType, tmpDir)
        # downsample
        mergeFactor = 10  # how many time steps (= ms) are merged into one value
        newFormants = np.array(np.mean(formants[0:mergeFactor, :], 0), ndmin=2)
        for t in range(mergeFactor, len(timePos), mergeFactor):
            new = np.mean(formants[t:t + mergeFactor, :], 0)
            if abs(newFormants[-1, 0] - new[0]) > maxFormantChange:
                # TODO: this is dangerous if the first detected formant is incorrect!
                pass
            else:
                newFormants = np.vstack((newFormants, new))
        return (newFormants, None)

    elif featureMode == 'mfcc':
        # sound as 1d
        if sound.ndim == 2:
            sound = np.reshape(sound, (-1))
        # returns a (numFrames x numCeps) np array
        window_length = 0.02  # 0.025 * 22050 = ca. 551 frames
        window_step = 0.01    # 0.01 * 22050 = ca. 221 frames
        num_cepstrals = 13
        features = mfcc(sound, fs, window_length, window_step, num_cepstrals)
        return (features, None)

    elif featureMode == 'mfcc_formants':
        # sound as 1d
        if sound.ndim == 2:
            sound = np.reshape(sound, (-1))
        # returns a (numFrames x numCeps) np array
        window_length = 0.02  # 0.025 * 22050 = ca. 551 frames
        window_step = 0.005   # 0.01 * 22050 = ca. 221 frames
        num_cepstrals = 13
        features = mfcc(sound, fs, window_length, window_step, num_cepstrals)
        (timePos, formants) = getPraatFormants(sound, fs, speakerType, tmpDir)
        # downsample
        mergeFactor = 10  # how many time steps (= ms) are merged into one value
        # get a good estimate for the initial formants (ignoring initial
        # perturbations):
        initialFormants = np.median(formants[0:5, :], axis=0)
        newFormants = None
        i = 0
        while newFormants is None:
            if abs(formants[i, 0] - initialFormants[0]) < maxFormantChange:
                newFormants = formants[i, :]
                break
            else:
                i += 1
        newFormants = np.array(np.mean(formants[0:mergeFactor, :], 0), ndmin=2)
        for t in range(mergeFactor, len(timePos), mergeFactor):
            new = np.mean(formants[t:t + mergeFactor, :], 0)
            if abs(newFormants[-1, 0] - new[0]) > maxFormantChange:
                pass
            else:
                newFormants = np.vstack((newFormants, new))
        # resample formants to match the mfcc frames
        # TODO Warning: interp just copies the last element to make
        # trajectories longer!!!
        # alternative: https://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html
        resampledFormants = np.zeros((np.shape(features)[0],
                                      np.shape(newFormants)[1]))
        for i in range(np.shape(newFormants)[1]):
            resampledFormants[:, i] = np.interp(
                range(np.shape(features)[0]),
                range(np.shape(newFormants)[0]),
                newFormants[:, i])
        minmax = np.array([
            np.concatenate((np.repeat([-1 / np.shape(resampledFormants)[1]],
                                      np.shape(resampledFormants)[1]),
                            np.repeat([-1 / np.shape(features)[1]],
                                      np.shape(features)[1]))),
            np.concatenate((np.repeat([1 / np.shape(resampledFormants)[1]],
                                      np.shape(resampledFormants)[1]),
                            np.repeat([1 / np.shape(features)[1]],
                                      np.shape(features)[1])))
        ])
        return (np.concatenate((resampledFormants, features), axis=1), minmax)

    elif featureMode == "fbank":
        # sound as 1d
        if sound.ndim == 2:
            sound = np.reshape(sound, (-1))
        fbank_feat = logfbank(sound, fs, nfft=1024)
        return (fbank_feat,
                np.concatenate(([-1 * np.ones(fbank_feat.shape[1])],
                                [np.ones(fbank_feat.shape[1])])))

    elif featureMode == "logfbank":
        # sound as 1d
        if sound.ndim == 2:
            sound = np.reshape(sound, (-1))
        fbank_feat = logfbank(sound, fs, nfft=1024)
        return (fbank_feat,
                np.concatenate(([-1 * np.ones(fbank_feat.shape[1])],
                                [np.ones(fbank_feat.shape[1])])))

    elif featureMode == 'gbfb':
        # Gabor filter bank features; requires Octave to be installed
        # scaledAudio = np.int16(copiedAudio / maxAmplitude * 32767)
        soundNorm = sound / 32767
        # features = octave_binding.gbfb_feature_extraction(soundNorm, fs)
        features = octave_binding.heq(
            octave_binding.gbfb(
                octave_binding.log_mel_spectrogram(soundNorm, fs)))
        features = features.transpose()
        return (features,
                np.concatenate(([-1 * np.ones(features.shape[1])],
                                [np.ones(features.shape[1])])))

    else:
        print("Feature mode " + featureMode +
              " not yet defined in calcAcousticFeatures()!")
        return None