import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from python_speech_features import logfbank
from scipy.io.wavfile import read


def draw_logmel(wav_file, label, feature_name, logmelband_nums=[4, 5]):
    """Plot a linear-frequency spectrogram and selected log mel-filterbank bands."""
    (y, sr) = librosa.load(wav_file)
    rate, data = read(wav_file)
    plt.figure()

    # Top panel: linear-frequency power spectrogram.
    plt.subplot(2, 1, 1)
    D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
    librosa.display.specshow(D, y_axis='linear', cmap="viridis")
    plt.colorbar(format='%+2.0f dB')
    plt.title('Linear-frequency power spectrogram')
    plt.ylim([0, 8192])

    # Bottom panel: selected log mel-filterbank bands over time.
    plt.subplot(2, 1, 2)
    # Transpose so that rows index mel bands and columns index frames.
    logfbank_energy = logfbank(data, samplerate=rate, nfft=2048).T
    colors = ["r", "g", "b"]
    for i in range(len(logmelband_nums)):
        logmelband_num = logmelband_nums[i]
        X = np.linspace(0, len(logfbank_energy[logmelband_num]),
                        len(logfbank_energy[logmelband_num]))
        plt.plot(X, logfbank_energy[logmelband_num], 'o', markersize=5, color=colors[i],
                 label="logMelFreqBands[{}]".format(logmelband_num))
        # quantile_value = np.quantile(logfbank_energy[logmelband_num], 0.25 * 2)
        # plt.plot(X, [quantile_value] * len(X), markersize=2, color="r", label="quartile2")
        # quantile_value = np.quantile(logfbank_energy[logmelband_num], 0.25 * 3)
        # plt.plot(X, [quantile_value] * len(X), markersize=2, color="g", label="quartile3")
    plt.title('logMelFrequencyBands(de)')
    plt.ylabel("Filterbank")
    plt.xlabel("Frame Idx")
    plt.legend(loc="upper right", prop={"size": 8})
    plt.savefig("/home/jialu/voice_quality_plots/v2/logMelFreqBand/"
                + label + "_" + feature_name + ".png")

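# Hedged usage sketch for draw_logmel: the wav path, label and feature name below are
# placeholders, and the hard-coded output directory inside the function must exist.
draw_logmel("sample.wav", label="neutral", feature_name="logmel", logmelband_nums=[4, 5])
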
# Requires: os, numpy as np, scipy.io.wavfile as wav, and python_speech_features'
# fbank / logfbank (imported at the top of the original file).
def tiqu(path, weidu, logenergy, energy_1):
    """Extract (log) mel-filterbank features for every wav file in each subdirectory.

    `weidu` is the number of mel filters; `logenergy` / `energy_1` select which
    feature type is written out.
    """
    basedir = path
    for mulu in os.listdir(basedir):
        input_dir = os.path.join(basedir, mulu, "wav")         # path to the wav files
        output_dir2 = os.path.join(basedir, mulu, 'log_yuan')  # log mel-filterbank coefficients
        # output_dir3 = r"C:\Users\a7825\Desktop\工作空间\语音数据\UUDB\第一次实验\打标签\第三批\C063L\mfcc"  # mfcc
        muluz.mkdir(output_dir2)  # muluz is a project-local directory helper (not shown here)
        for ad_file in os.listdir(input_dir):
            print(input_dir + "/" + ad_file)
            (fs, audio) = wav.read(input_dir + "/" + ad_file)
            if energy_1:
                feat, energy = fbank(audio, fs, nfilt=weidu)
                np.savetxt(output_dir2 + "/" + ad_file + ".csv", feat, delimiter=',')
            if logenergy:
                log = logfbank(audio, fs, nfilt=weidu)
                np.savetxt(output_dir2 + "/" + ad_file + ".csv", log, delimiter=',')

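# Hedged usage sketch for tiqu: the corpus root is a placeholder path, and the
# project-local muluz helper must be importable for the directory-creation step.
tiqu(r"C:\path\to\corpus", weidu=40, logenergy=True, energy_1=False)
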
from python_speech_features import logfbank
from scipy.io import wavfile


def logfbank_features(fname):
    """Compute log mel-filterbank energy features, averaged over all frames."""
    (rate, signal) = wavfile.read(fname)
    fbank_feat = logfbank(signal, rate)
    # Take the mean of all rows (frames), giving one value per filterbank channel.
    features = fbank_feat.mean(axis=0)
    return features

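# Example call (sketch); "clip.wav" is a placeholder path.
mean_features = logfbank_features("clip.wav")
print(mean_features.shape)  # (26,) with python_speech_features' default nfilt=26
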
import numpy as np
from python_speech_features import delta
from python_speech_features import logfbank as psf_logfbank

# default_rate, default_filters_number and default_augmented are module-level
# constants defined elsewhere in the original file.


def logfbank(signal, rate=default_rate, filters_number=default_filters_number,
             augmented=default_augmented):
    """Log mel-filterbank features, optionally augmented with delta and delta-delta."""
    # Call the python_speech_features implementation under an alias so that this
    # wrapper (which shares its name) does not recurse into itself.
    logfbank_features = psf_logfbank(signal, rate, nfilt=filters_number)
    if not augmented:
        return logfbank_features
    d_logfbank_features = delta(logfbank_features, 2)
    a_logfbank_features = delta(d_logfbank_features, 2)
    concatenated_features = np.concatenate(
        (logfbank_features, d_logfbank_features, a_logfbank_features), axis=1)
    return concatenated_features

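# Hedged usage sketch for the wrapper above: "speech.wav" is a placeholder, and the
# default_* constants are assumed to be defined in the module.
import scipy.io.wavfile as wav

rate, signal = wav.read("speech.wav")                # hypothetical input file
feats = logfbank(signal, rate=rate, augmented=True)  # (frames, 3 * filters_number)
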
# Method of a feature-extraction class (not shown); assumes numpy as np, pandas as pd,
# and python_speech_features imported as `speech`.
def Features(self, data, rate, dim):
    """Spectral, statistical and speech-feature descriptors for one signal."""
    # Spectral statistics computed from the normalized magnitude spectrum.
    spec = np.abs(np.fft.rfft(data))
    freq = np.fft.rfftfreq(len(data), d=1 / dim)
    a = spec / spec.sum()
    meaN = (freq * a).sum()
    std = np.sqrt(np.sum(a * ((freq - meaN) ** 2)))
    a_cumsum = np.cumsum(a)
    mediaN = freq[len(a_cumsum[a_cumsum <= 0.5])]
    modE = freq[a.argmax()]
    q25 = freq[len(a_cumsum[a_cumsum <= 0.25])]
    q75 = freq[len(a_cumsum[a_cumsum <= 0.75])]
    IQR = q75 - q25
    z = a - a.mean()
    w = a.std()
    skewnesS = ((z ** 3).sum() / (len(spec) - 1)) / w ** 3
    kurtosiS = ((z ** 4).sum() / (len(spec) - 1)) / w ** 4

    # Frame-level speech features from python_speech_features.
    m = speech.mfcc(data, rate)
    f = speech.fbank(data, rate)
    l = speech.logfbank(data, rate)
    s = speech.ssc(data, rate)

    # Time-domain descriptive statistics via pandas.
    data = pd.DataFrame(data)
    desc = data.describe()
    mean = desc.loc["mean"].get(0)
    mad = data.mad().get(0)
    sd = desc.loc["std"].get(0)
    median = data.median().get(0)
    minimum = desc.loc["min"].get(0)
    maximum = desc.loc["max"].get(0)
    Q25 = desc.loc["25%"].get(0)
    Q75 = desc.loc["75%"].get(0)
    interquartileR = Q75 - Q25
    skewness = data.skew().get(0)
    kurtosis = data.kurtosis().get(0)

    result = {
        "Mean": mean, "Mad": mad, "deviation": sd, "Median": median,
        "Min": minimum, "Max": maximum, "interquartileR": interquartileR,
        "Skewness": skewness, "Q25": Q25, "Q75": Q75, "Kurtosis": kurtosis,
        "mfcc_mean": np.mean(m), "mfcc_max": np.max(m), "mfcc_min": np.min(m),
        "fbank_mean": np.mean(f[0]), "fbank_max": np.max(f[0]), "fbank_min": np.min(f[0]),
        "energy_mean": np.mean(f[1]), "energy_max": np.max(f[1]), "energy_min": np.min(f[1]),
        "lfbank_mean": np.mean(l), "lfbank_max": np.max(l), "lfbank_min": np.min(l),
        "ssc_mean": np.mean(s), "ssc_max": np.max(s), "ssc_min": np.min(s),
        "meaN": meaN, "deviatioN": std, "mediaN": mediaN, "modE": modE,
        "IQR": IQR, "skewnesS": skewnesS, "q25": q25, "q75": q75, "kurtosiS": kurtosiS,
    }
    return result

import numpy as np
from python_speech_features import delta, logfbank


def get_fbank_feature(wavsignal, fs):
    """
    Given a wav signal (as an array) and its sampling rate, return the speech FBANK
    features together with their first- and second-order deltas.
    :param wavsignal: audio samples
    :param fs: sampling rate
    :return: (num_frames, 120) array of [fbank, delta, delta-delta] features
    """
    feat_fbank = logfbank(wavsignal, fs, nfilt=40, nfft=2048, winstep=0.025, winlen=0.05)
    feat_fbank_d = delta(feat_fbank, 2)
    feat_fbank_dd = delta(feat_fbank_d, 2)
    wav_feature = np.column_stack((feat_fbank, feat_fbank_d, feat_fbank_dd))
    return wav_feature

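# Minimal usage sketch for get_fbank_feature; the file name is a placeholder.
import scipy.io.wavfile as wav

fs, signal = wav.read("example.wav")      # hypothetical mono wav file
features = get_fbank_feature(signal, fs)  # shape (num_frames, 3 * 40)
print(features.shape)
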
import numpy as np
from python_speech_features import logfbank, mfcc


def get_mfcc(x):
    """12 MFCCs + 1 log filterbank energy per frame, plus central-difference deltas."""
    y = np.concatenate([
        mfcc(x, numcep=12, winlen=0.01, winstep=0.005),
        logfbank(x, nfilt=1, winlen=0.01, winstep=0.005)
    ], axis=-1)
    derivatives = []
    previousf = np.zeros((13,))
    for i in range(len(y)):
        if (i + 1) == len(y):
            nextf = np.zeros((13,))
        else:
            nextf = y[i + 1]
        derivatives.append(((nextf - previousf) / 2).reshape((1, 13)))
        previousf = y[i]
    derivatives = np.concatenate(derivatives, axis=0)
    y = np.concatenate([y, derivatives], axis=1)
    return y

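# Hedged usage sketch: get_mfcc relies on python_speech_features' default 16 kHz
# sample rate; the random signal below is only a stand-in for real audio.
x = np.random.randn(16000).astype(np.float32)  # one second of synthetic "audio"
feats = get_mfcc(x)                            # (num_frames, 26): 13 features + 13 deltas
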
# Method of a dataset/preprocessing class (not shown); assumes numpy as np,
# scipy.io.wavfile as wavfile, and python_speech_features' logfbank are imported.
def _compute_sample(self, file, sliceLength, segmentLength):
    """Computes the feature and label vector for a single audio slice."""
    filename = self.dataset_path + file['audio_file_path']
    fs, sound_data = wavfile.read(filename)
    sound_data = sound_data.astype('float32')
    label = file['is_hotword']
    label_vec = np.full(int(sliceLength / segmentLength - 1), 0, dtype='int8')
    if label:
        # Detect audio and subsample to a single label per segment.
        is_audio_subsampled = self._detect_audio(sound_data)[0::segmentLength]
        num_segments = int(sliceLength / segmentLength)
        if len(is_audio_subsampled) >= num_segments:
            # Truncate if longer than the target number of segments.
            is_audio_subsampled = is_audio_subsampled[:num_segments]
        else:
            # Pad if shorter than the target number of segments.
            is_audio_subsampled = np.pad(
                is_audio_subsampled, (num_segments - len(is_audio_subsampled), 0))
        # Overlay the frames where the label is true.
        label_vec = label_vec + is_audio_subsampled[1:]
    if len(sound_data) >= sliceLength:
        sound_data = sound_data[:sliceLength]
    else:
        sound_data = np.pad(sound_data, (sliceLength - len(sound_data), 0))
    # Compute log mel-filterbank energies.
    feature_vec = logfbank(sound_data,
                           samplerate=self.sampling_frequency,
                           winlen=self.window_size,
                           winstep=self.time_step,
                           nfilt=self.num_features)
    return feature_vec, label_vec

import numpy as np
from python_speech_features import logfbank, mfcc


def get_mfcc(x):
    """Variant of get_mfcc that also zero-pads to 3150 frames and returns a noisy copy."""
    y = np.concatenate([
        mfcc(x, numcep=12, winlen=0.01, winstep=0.005),
        logfbank(x, nfilt=1, winlen=0.01, winstep=0.005)
    ], axis=-1)
    derivatives = []
    previousf = np.zeros((13,))
    for i in range(len(y)):
        if (i + 1) == len(y):
            nextf = np.zeros((13,))
        else:
            nextf = y[i + 1]
        derivatives.append(((nextf - previousf) / 2).reshape((1, 13)))
        previousf = y[i]
    derivatives = np.concatenate(derivatives, axis=0)
    y = np.concatenate([y, derivatives], axis=1)

    # Additive Gaussian noise and zero-padding to a fixed length of 3150 frames.
    ynoise = np.random.normal(0, 0.6, y.shape)
    orig_len = len(y)
    pad = [np.zeros((1, 26))] * (3150 - y.shape[0])
    ypad = np.concatenate([y] + pad, axis=0)
    noisepad = np.concatenate([ynoise] + pad, axis=0)
    return orig_len, ypad, ypad + noisepad

import scipy.io.wavfile as wav
from python_speech_features.base import mfcc, fbank, logfbank
import numpy as np
import os

input_dir = r"C:\Users\a7825\Desktop\工作空间\跑代码\打标签\F1\B\wav_16"
output_dir1 = r"C:\Users\a7825\Desktop\工作空间\跑代码\打标签\F1\B\melbank"
output_dir2 = r"C:\Users\a7825\Desktop\工作空间\跑代码\打标签\F1\B\log"

if __name__ == "__main__":
    for ad_file in os.listdir(input_dir):
        (fs, audio) = wav.read(input_dir + "/" + ad_file)
        feature_m, feature_n = fbank(audio, fs, winfunc=np.hamming)
        log = logfbank(audio, fs)
        np.savetxt(output_dir1 + "/" + ad_file + ".fbank.csv", feature_m, delimiter=',')
        np.savetxt(output_dir2 + "/" + ad_file + ".log.csv", log, delimiter=',')

# String concatenation above; oddly, switching single quotes to double quotes made it work.
# fs, audio = wav.read(r"C:\Users\a7825\Desktop\新しいフォルダー/a.wav")
# feature_mfcc = mfcc(audio, samplerate=fs, numcep=40, nfilt=40)
# feature_m, feature_n = fbank(audio, fs)
# feature_log = logfbank(audio, fs)
# print(feature_mel[0].shape)
# print(feature_m)
# print(feature_n.shape)
# np.savetxt('fbank.csv', feature_m, delimiter=',')
# np.savetxt('energy.csv', feature_n, delimiter=',')

import matplotlib.pyplot as plt
import librosa as li
import librosa.display as ds
import librosa
from python_speech_features.base import mfcc, logfbank
import scipy.io.wavfile as wav
import numpy as np
import os

indir = r'C:\Users\a7825\Desktop\工作空间\セミナー\语音\wav/C001L_061.wav'
# indir_1 = r'C:\Users\a7825\Desktop\工作空间\杂物\临时\这个就对了'

# Display the mel filterbank plot.
x, fs = li.load(indir, sr=16000)
# (fs, x) = wav.read(indir)
log = logfbank(x, fs)
# np.savetxt(indir_1 + ".csv", log, delimiter=',')
# print(log.shape)
# os.system('pause')
# for e in range(11, 21):
#     for i in range(26):
#         log[e][i] = log[e][i] + 3.5  # to make the plot a little brighter
# ig, ax = plt.subplots()
# plt.xlim(0, 130)  # set the x-axis range
plt.ylim(0, 25.6)  # set the y-axis range
# new_ticks = np.linspace(0.00, 5.00, 4)
# plt.xticks(new_ticks)

import matplotlib.pyplot as plt
import librosa as li
import librosa.display as ds
import librosa
from python_speech_features.base import mfcc, logfbank, fbank
import scipy.io.wavfile as wav
import numpy as np
import zhengguihua  # project-local module, not used in this snippet
import os

indir = r'C:\Users\a7825\Desktop\工作空间\セミナー\语音\wav/C001L_061.wav'
# indir_1 = r'C:\Users\a7825\Desktop\工作空间\杂物\临时\这个就对了'

# Display the log mel filterbank plot.
(fs, x) = wav.read(indir)
log = logfbank(x, fs, nfilt=40)
np.savetxt(indir + ".csv", log, delimiter=',')
ig, ax = plt.subplots()
# Swap axes so that filters run along the y-axis and frames along the x-axis.
log = np.swapaxes(log, 0, 1)
cax = ax.imshow(log, interpolation='nearest', origin='lower', aspect='auto')
plt.show()

# Assumes numpy as np, python_speech_features' mfcc and logfbank, and the project's
# Praat helpers getPraatFormantsMean / getPraatFormants are imported elsewhere.
def calcAcousticFeatures(sound, fs, featureMode, speakerType='male', tmpDir=".",
                         speech_sound_type='vowel', octave_binding=None):
    """
    Calculates acoustic features with the given featureMode for a sound with audio
    sampling rate fs.

    sound: 1D np.array or 2D array containing a horizontal vector.

    Returns a tuple containing a 2D np.array of numTimeSteps x numFeatureParams and a
    2 x numFeatureParams array of scaling factors that can be used to ensure an equal
    contribution of each feature type.
    """
    # How many Hz the first formant may change within mergeFactor ms; little
    # variation is expected for vowels.
    if speech_sound_type == 'vowel':
        maxFormantChange = 50
    elif speech_sound_type == 'syllable':
        maxFormantChange = 800

    if np.ndim(sound) == 2:
        sound = sound[0, :]

    if featureMode == 'formants':
        formants = getPraatFormantsMean(sound, fs, speakerType, tmpDir)
        return (np.array(formants).reshape((1, -1)), None)

    elif featureMode == 'formants_full':
        (timePos, formants) = getPraatFormants(sound, fs, speakerType, tmpDir)
        # Downsample: merge mergeFactor time steps (= ms) into one value.
        mergeFactor = 10
        newFormants = np.array(np.mean(formants[0:mergeFactor, :], 0), ndmin=2)
        for t in range(mergeFactor, len(timePos), mergeFactor):
            new = np.mean(formants[t:t + mergeFactor, :], 0)
            if abs(newFormants[-1, 0] - new[0]) > maxFormantChange:
                # TODO: this is dangerous if the first detected formant is incorrect!
                pass
            else:
                newFormants = np.vstack((newFormants, new))
        return (newFormants, None)

    elif featureMode == 'mfcc':
        # Flatten the sound to 1D if necessary.
        if sound.ndim == 2:
            sound = np.reshape(sound, (-1))
        # Returns a (numFrames x numCeps) np array.
        window_length = 0.02  # 0.025 * 22050 = ca. 551 frames
        window_step = 0.01    # 0.01 * 22050 = ca. 221 frames
        num_cepstrals = 13
        features = mfcc(sound, fs, window_length, window_step, num_cepstrals)
        return (features, None)

    elif featureMode == 'mfcc_formants':
        # Flatten the sound to 1D if necessary.
        if sound.ndim == 2:
            sound = np.reshape(sound, (-1))
        # Returns a (numFrames x numCeps) np array.
        window_length = 0.02  # 0.025 * 22050 = ca. 551 frames
        window_step = 0.005   # 0.01 * 22050 = ca. 221 frames
        num_cepstrals = 13
        features = mfcc(sound, fs, window_length, window_step, num_cepstrals)
        (timePos, formants) = getPraatFormants(sound, fs, speakerType, tmpDir)
        # Downsample: merge mergeFactor time steps (= ms) into one value.
        mergeFactor = 10
        # Get a good estimate for the initial formants (ignoring initial perturbations):
        initialFormants = np.median(formants[0:5, :], axis=0)
        newFormants = None
        i = 0
        while not newFormants:
            if abs(formants[i, 0] - initialFormants[0]) < maxFormantChange:
                newFormants = formants[i, :]
                break
            else:
                i += 1
        newFormants = np.array(np.mean(formants[0:mergeFactor, :], 0), ndmin=2)
        for t in range(mergeFactor, len(timePos), mergeFactor):
            new = np.mean(formants[t:t + mergeFactor, :], 0)
            if abs(newFormants[-1, 0] - new[0]) > maxFormantChange:
                pass
            else:
                newFormants = np.vstack((newFormants, new))
        # Resample formants according to the mfcc frames.
        # TODO Warning: interp just copies the last element to make trajectories longer!
        # Alternative: https://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html
        resampledFormants = np.zeros((np.shape(features)[0], np.shape(newFormants)[1]))
        for i in range(np.shape(newFormants)[1]):
            resampledFormants[:, i] = np.interp(range(np.shape(features)[0]),
                                                range(np.shape(newFormants)[0]),
                                                newFormants[:, i])
        minmax = np.array([
            np.concatenate((np.repeat([-1 / np.shape(resampledFormants)[1]],
                                      np.shape(resampledFormants)[1]),
                            np.repeat([-1 / np.shape(features)[1]],
                                      np.shape(features)[1]))),
            np.concatenate((np.repeat([1 / np.shape(resampledFormants)[1]],
                                      np.shape(resampledFormants)[1]),
                            np.repeat([1 / np.shape(features)[1]],
                                      np.shape(features)[1])))
        ])
        return (np.concatenate((resampledFormants, features), axis=1), minmax)

    elif featureMode == "fbank":
        # Flatten the sound to 1D if necessary.
        if sound.ndim == 2:
            sound = np.reshape(sound, (-1))
        fbank_feat = logfbank(sound, fs, nfft=1024)
        return (fbank_feat, np.concatenate(([-1 * np.ones(fbank_feat.shape[1])],
                                            [np.ones(fbank_feat.shape[1])])))

    elif featureMode == "logfbank":
        # Flatten the sound to 1D if necessary.
        if sound.ndim == 2:
            sound = np.reshape(sound, (-1))
        fbank_feat = logfbank(sound, fs, nfft=1024)
        return (fbank_feat, np.concatenate(([-1 * np.ones(fbank_feat.shape[1])],
                                            [np.ones(fbank_feat.shape[1])])))

    elif featureMode == 'gbfb':
        # Gabor filter bank features; requires Octave to be installed.
        # scaledAudio = np.int16(copiedAudio / maxAmplitude * 32767)
        soundNorm = sound / 32767
        # features = octave_binding.gbfb_feature_extraction(soundNorm, fs)
        features = octave_binding.heq(
            octave_binding.gbfb(
                octave_binding.log_mel_spectrogram(soundNorm, fs)))
        features = features.transpose()
        return (features, np.concatenate(([-1 * np.ones(features.shape[1])],
                                          [np.ones(features.shape[1])])))

    else:
        print("Feature mode " + featureMode + " not yet defined in calcAcousticFeatures()!")
        return None

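# Hedged usage sketch for the "fbank"/"logfbank" modes of calcAcousticFeatures;
# "vowel.wav" is a placeholder path, and the other modes additionally need the
# Praat helpers or the Octave binding.
import scipy.io.wavfile as wav

fs, sound = wav.read("vowel.wav")
features, scaling = calcAcousticFeatures(sound, fs, featureMode="fbank")
print(features.shape)  # (numTimeSteps, 26) with python_speech_features' default nfilt
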