def preprocess(audio: Audio) -> np.ndarray:
    # feature_type, kwargs, delta_order and delta_window are assumed to be
    # module-level configuration in the original source.
    features = [FEATURE_TYPES[feature_type](audio.data,
                                            samplerate=audio.rate,
                                            **kwargs)]
    for _ in range(delta_order):
        features.append(delta(features[-1], delta_window))
    return np.concatenate(features, axis=1)
def mk_MFB(filename, sample_rate=c.SAMPLE_RATE, use_delta=c.USE_DELTA):
    audio, sr = librosa.load(filename, sr=sample_rate, mono=True)
    audio = audio.flatten()
    filter_banks, energies = fbank(audio, samplerate=sample_rate,
                                   nfilt=c.FILTER_BANK, winlen=0.025)
    delta_1 = delta(filter_banks, N=1)
    delta_2 = delta(delta_1, N=1)

    filter_banks = normalize_frames(filter_banks)
    delta_1 = normalize_frames(delta_1)
    delta_2 = normalize_frames(delta_2)

    if use_delta:
        frames_features = np.hstack([filter_banks, delta_1, delta_2])
    else:
        frames_features = filter_banks

    np.save(filename.replace('.wav', '.npy'), frames_features)
    return
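# `normalize_frames` is not defined in this snippet; several of the functions
# here rely on it. A minimal sketch, assuming it standardizes each frame (row)
# to zero mean and unit variance (the epsilon guard against zero variance is
# an assumption):
import numpy as np

def normalize_frames(m, epsilon=1e-12):
    return np.array([(v - np.mean(v)) / max(np.std(v), epsilon) for v in m])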
def create_mfcc(filename):
    """Perform standard preprocessing, as described by Alex Graves (2012),
    http://www.cs.toronto.edu/~graves/preprint.pdf

    Output consists of 12 MFCC and 1 energy, plus their first and second
    derivatives: [1 energy, 12 MFCC, 13 delta, 13 delta-delta] = 39 features.
    """
    (rate, sample) = wav.read(filename)
    mfcc = features.mfcc(sample, rate, winlen=0.025, winstep=0.01, numcep=13,
                         nfilt=26, preemph=0.97, appendEnergy=True)
    d_mfcc = features.delta(mfcc, 2)
    a_mfcc = features.delta(d_mfcc, 2)
    out = np.concatenate([mfcc, d_mfcc, a_mfcc], axis=1)
    return out, out.shape[0]
def extract_features(samples, sample_rate, win_len, win_shift, win_fun=np.hamming):
    """
    Computes 13 MFCC + delta + delta-delta features for an utterance.

    :param samples: samples of the utterance, numpy array of shape (n_samples,)
    :param sample_rate: sampling rate
    :param win_len: window length (in seconds)
    :param win_shift: window shift (in seconds)
    :param win_fun: window function
    :return: numpy array of shape (n_frames, n_features), where n_features=39
    """
    mfcc = pss.mfcc(samples, sample_rate, winlen=win_len, winstep=win_shift,
                    winfunc=win_fun)
    delta = pss.delta(mfcc, 3)
    delta_delta = pss.delta(delta, 3)
    return np.concatenate((mfcc, delta, delta_delta), axis=1)
def mfcc(wav_path):
    """Grabs MFCC features with energy and derivatives."""
    (rate, sig) = wav.read(wav_path)
    feat = python_speech_features.mfcc(sig, rate, appendEnergy=True)
    delta_feat = python_speech_features.delta(feat, 2)
    all_feats = [feat, delta_feat]
    all_feats = np.array(all_feats)
    # Make time the first dimension for easy length-normalization padding later.
    all_feats = np.swapaxes(all_feats, 0, 1)
    all_feats = np.swapaxes(all_feats, 1, 2)
    feat_fn = wav_path[:-3] + "mfcc13_d.npy"
    np.save(feat_fn, all_feats)
def get_mfcc_v2(y, sr, n_mfcc=13, tgt_sr=16000, win_len=0.025, hop_len=0.010,
                n_fft=512, n_mels=22, fmin=0.0, fmax=None, cep_lifter=22,
                pre_emph=0.97, win_func=lambda x: np.ones((x, )),
                append_energy=True, delta=True, delta_delta=True):
    # Resample to the target rate (the original hardcoded 16000 instead of tgt_sr).
    if sr != tgt_sr:
        y = librosa.core.resample(y, orig_sr=sr, target_sr=tgt_sr)
    mfccs = python_speech_features.mfcc(y, tgt_sr, winlen=win_len,
                                        winstep=hop_len, numcep=n_mfcc,
                                        nfilt=n_mels, nfft=n_fft, lowfreq=fmin,
                                        highfreq=fmax, preemph=pre_emph,
                                        ceplifter=cep_lifter,
                                        appendEnergy=append_energy,
                                        winfunc=win_func)
    features = [mfccs]
    d1 = python_speech_features.delta(mfccs, 1)
    if delta:
        features.append(d1)
    if delta_delta:
        # Acceleration is the delta of the delta features (the original
        # applied delta to the MFCCs again with a wider window).
        features.append(python_speech_features.delta(d1, 1))
    return np.hstack(features)
def extract_feature_for_audio(self, audio_file_path):
    # Load the wav file into an array.
    signal, sr = librosa.load(audio_file_path, mono=True, sr=self.sample_rate)
    # Trim the leading and trailing silence.
    signal_trimed, index = librosa.effects.trim(signal, top_db=self.silence_cutoff)
    # Extract the MFCC features; for details about MFCCs, see:
    # http://www.practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/
    MFCC = mfcc(signal_trimed, self.sample_rate, winlen=self.winlen,
                winstep=self.winlen / 2, winfunc=numpy.hamming,
                nfft=self.nfft, numcep=self.numcep)
    # Do not use the first MFCC coefficient.
    features = MFCC[:, 1:self.numcep]
    # Calculate the delta of the MFCCs and add it to the features.
    Delta = delta(MFCC, 2)
    features = numpy.column_stack((features, Delta))
    # Calculate the delta of the delta (acceleration) and add it to the features.
    Acc = delta(Delta, 2)
    features = numpy.column_stack((features, Acc))
    # The total number of features is the number of columns of `features`.
    self.num_features = features.shape[1]
    # Each audio file is transformed into a numpy array of shape
    # (N, self.num_features), where N is the number of frames extracted
    # from the audio file by the mfcc function.
    return features
def process_mel(self, mel_input):  # mel_input: [80, 344]
    mel_input = mel_input.T  # [344, 80]
    delta1 = ps.delta(mel_input, 2)
    delta2 = ps.delta(delta1, 2)
    # Zero-pad all three channels to 800 frames (assumes time <= 800).
    time = mel_input.shape[0]
    mel = np.pad(mel_input, ((0, 800 - time), (0, 0)), 'constant',
                 constant_values=0)  # [800, 80]
    delta1 = np.pad(delta1, ((0, 800 - time), (0, 0)), 'constant',
                    constant_values=0)
    delta2 = np.pad(delta2, ((0, 800 - time), (0, 0)), 'constant',
                    constant_values=0)
    # Stack mel, delta, and delta-delta as three channels.
    mel_output = np.zeros((3, 800, 80))
    mel_output[0, :, :] = mel
    mel_output[1, :, :] = delta1
    mel_output[2, :, :] = delta2
    return mel_output
def get_mfcc(self, data, fs):
    wav_feature = mfcc(data, fs, numcep=self.numc, winlen=c.FRAME_LEN,
                       winstep=c.FRAME_STEP, nfilt=26, nfft=c.NUM_FFT)
    # Keep a frame count that is a multiple of 100.
    reserve_length = wav_feature.shape[0] - wav_feature.shape[0] % 100
    d_wav_feature_1 = delta(wav_feature, 2)
    d_wav_feature_2 = delta(d_wav_feature_1, 2)
    mfcc_feat_normal = normalize_frames(wav_feature.T)
    d_mfcc_feat_1_normal = normalize_frames(d_wav_feature_1.T)
    d_mfcc_feat_2_normal = normalize_frames(d_wav_feature_2.T)
    mfcc_feature = [mfcc_feat_normal, d_mfcc_feat_1_normal, d_mfcc_feat_2_normal]
    mfcc_feature = torch.tensor(mfcc_feature)
    length = (reserve_length / 100 - 1) / 2
    total_length = (int(length) * 2 + 1) * 2
    index = torch.randperm(total_length)
    if self.mode == "train":
        # Assemble random 300-frame crops from shuffled 50-frame chunks.
        feature = torch.zeros([int(length), 3, self.numc, 300])
        for r in range(int(length)):
            for i in range(6):
                feature[r, :, :, i * 50:(i + 1) * 50] = \
                    mfcc_feature[:, :, index[r * 4 + i] * 50:(index[r * 4 + i] + 1) * 50]
        feature = feature.permute(0, 1, 3, 2)
    else:
        feature = mfcc_feature[:, :, 0:reserve_length]
        feature = feature.unsqueeze(0)
        feature = feature.permute(0, 1, 3, 2)
    return feature
def audio_feature(signal, samplerate=16000, winlen=0.025, winstep=0.01,
                  numcep=13, nfilt=40, nfft=512, lowfreq=0, highfreq=None,
                  preemph=0.97, ceplifter=22, appendEnergy=True,
                  winfunc=np.hamming):
    feat, energy = fbank(signal, samplerate, winlen, winstep, nfilt, nfft,
                         lowfreq, highfreq, preemph, winfunc)
    log_fbank = np.log(feat)
    # Discard the 0-th DCT coefficient.
    mfcc = dct(log_fbank, type=2, axis=1, norm='ortho')[:, 1:numcep]
    mfcc = lifter(mfcc, ceplifter)
    d1_mfcc = delta(mfcc, 1)
    d2_mfcc = delta(d1_mfcc, 1)
    energy = np.reshape(np.log(energy), (energy.shape[0], 1))
    mixed = np.concatenate((mfcc, d1_mfcc, d2_mfcc, energy), axis=1)
    return mixed
def fbank(wav_path, flat=True):
    """Currently grabs log Mel filterbank, deltas and double deltas."""
    (rate, sig) = wav.read(wav_path)
    if len(sig) == 0:
        logger.warning("Empty wav: {}".format(wav_path))
    fbank_feat = python_speech_features.logfbank(sig, rate, nfilt=40)
    energy = extract_energy(rate, sig)
    feat = np.hstack([energy, fbank_feat])
    delta_feat = python_speech_features.delta(feat, 2)
    delta_delta_feat = python_speech_features.delta(delta_feat, 2)
    all_feats = [feat, delta_feat, delta_delta_feat]
    if not flat:
        all_feats = np.array(all_feats)
        # Make time the first dimension for easy length-normalization
        # padding later.
        all_feats = np.swapaxes(all_feats, 0, 1)
        all_feats = np.swapaxes(all_feats, 1, 2)
    else:
        all_feats = np.concatenate(all_feats, axis=1)
    # Log Mel filterbank, with delta and double delta.
    feat_fn = wav_path[:-3] + "fbank.npy"
    np.save(feat_fn, all_feats)
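# `extract_energy` is not defined in this snippet. A minimal sketch, assuming
# it returns a per-frame log-energy column computed with the same framing as
# the logfbank call above (deriving it from python_speech_features.fbank is an
# assumption):
import numpy as np
import python_speech_features

def extract_energy(rate, sig):
    _, energy = python_speech_features.fbank(sig, rate, nfilt=40)
    return np.log(energy).reshape(-1, 1)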
def mfccProc2(self, results_dict):
    (rate, sig) = audioBasicIO.readAudioFile(self.fname)
    # 2-d array of MFCC features (note: the sample rate is hardcoded to
    # 44100 rather than using `rate`).
    mfcc_feat = mfcc(sig, samplerate=44100, nfft=1103)
    # 2-d array of the delta of the MFCC features.
    d_mfcc_feat = delta(mfcc_feat, 2)
    # 2-d array of the log filterbank features.
    fbank_feat = logfbank(sig, rate)
    # "Tone" is the standard deviation of the per-frame MFCC standard deviations.
    dev_array = []
    for i in mfcc_feat:
        dev_array.append(stdev(i))
    tone = stdev(dev_array)
    results_dict["tone"] = tone
    return mfcc_feat
def log_mel_filterbank(audio, sample_rate, window_size, step_size):
    """Returns the log of the mel filterbank energies as well as the first
    and second order deltas. Hanning window used for parity with
    log_spectrogram function.

    Args:
        audio (np.ndarray): audio signal array
        sample_rate (int): sample rate of signal
        window_size (int): window size in milliseconds
        step_size (int): step size in milliseconds

    Returns:
        np.ndarray: log mel filterbank, delta, and delta-deltas
    """
    delta_window = 1
    log_mel = python_speech_features.base.logfbank(audio, sample_rate,
                                                   winlen=window_size / 1000,
                                                   winstep=step_size / 1000,
                                                   winfunc=np.hanning)
    delta = python_speech_features.delta(log_mel, N=delta_window)
    delta_delta = python_speech_features.delta(delta, N=delta_window)
    output = np.concatenate((log_mel, delta, delta_delta), axis=1)
    return output.astype(np.float32)
def mfcc(y, sr, numcep=13, delta=False, delta_delta=False, width=2, **kwargs):
    """
    Compute MFCCs of Audio Signal

    :param y: Audio signal
    :param sr: Original sample rate
    :param numcep: Number of MFCCs to compute
    :param delta: If deltas of MFCCs are required
    :param delta_delta: If acceleration of MFCCs is required
    :param width: Number of frames to consider for computing deltas
    :param kwargs: Other parameters passed on to python_speech_features,
        like hop length etc.
    :return: MFCCs (numpy array of shape n_frames * n_mfccs)
    """
    mfccs = python_speech_features.mfcc(signal=y, samplerate=sr,
                                        numcep=numcep, **kwargs)
    d1 = python_speech_features.delta(mfccs, N=width)
    if delta:
        mfccs = np.hstack((mfccs, d1))
    if delta_delta:
        # Acceleration is the delta of the delta features (the original
        # sliced the stacked array with a float index, a TypeError on
        # Python 3, and assumed the deltas were already appended).
        d2 = python_speech_features.delta(d1, N=width)
        mfccs = np.hstack((mfccs, d2))
    return mfccs
def test_all_files(path, hmm_list):
    global false_list, word_list, false_number, correct_number
    file_list = []
    files = os.listdir(path)
    for file_name in files:
        for u in word_list:
            if u in file_name:
                file_list.append(file_name)
    for file in file_list:
        for labels in word_list:
            if labels in file:
                tested_label = labels
                break
        print("Tested label : ", tested_label)
        # Resample the recording to 16 kHz before feature extraction.
        audio = AudioSegment.from_file(path + "/" + file, format="wav",
                                       frame_rate=32000)
        audio = audio.set_frame_rate(16000)
        audio.export("filtered-talk1.wav", format="wav")
        (rate, sig) = wav.read("filtered-talk1.wav")
        mfcc_feat = mfcc(sig, rate, nfft=1536)
        d_mfcc_feat = delta(mfcc_feat, 2)
        data = np.concatenate((mfcc_feat, d_mfcc_feat), axis=1).tolist()
        for label in word_list:
            if label in file:
                break
        data = scalers[label].transform(data)
        vector = get_score_vector(data, hmm_list)
        vector = nn_scaler.transform([vector])
        print("Vector : ", vector.shape)
        predicted = trained.predict(vector)
        print("type : ", type(predicted[0]))
        print("Res : ", str(predicted[0]))
        print("Prediction ", trained.predict_proba(vector))
        if predicted[0] != tested_label:
            false_list.append(file + " predicted answer : " + predicted[0])
            false_number += 1
        else:
            correct_number += 1
def convert_to_vector(filename):
    (rate, sig) = wav.read(filename)
    mfcc_feat = mfcc(sig, rate)
    d_mfcc_feat = delta(mfcc_feat, 2)
    fbank_feat = logfbank(sig, rate)
    # Take two adjacent filterbank frames and stack them into one vector.
    vector1 = fbank_feat[1:3, :][1]  # frame 2
    vector2 = fbank_feat[1:3, :][0]  # frame 1
    z = np.hstack((vector1, vector2))
    return z
def mfcc_delta_feature_extraction(self, data):
    vectorized_data = []
    hop_length = int(self.wnd_step * self.sample_rate)
    win_length = int(self.wnd_len * self.sample_rate)
    for d in data:
        mfcc_librosa = mfcc(d, sr=self.sample_rate, n_mfcc=self.num_features,
                            hop_length=hop_length, win_length=win_length)
        # librosa returns (n_mfcc, n_frames); transpose to (n_frames, n_mfcc).
        mfcc_f = np.transpose(mfcc_librosa)
        delta_f = delta(mfcc_f, 8)
        v_d = np.append(mfcc_f, delta_f, axis=1)
        vectorized_data.append(v_d)
    return vectorized_data
def get_24_coefficients(filenames,
                        data_dir=os.path.join('Support_CentraleDigitale_Lab_201920',
                                              'Data_Submarin', 'Dataset_J1')):
    """Input: list of audio recording filenames of a same speaker,
    for instance filenames_MJPM = ['MJPM-1', 'MJPM-2', 'MJPM-3'].

    Output: array with 24 columns corresponding to the MFC coefficients 2-13
    and their delta coefficients. It has one row per speaking frame of the
    recordings.
    """
    # The 12 MFC coefficients 2-13.
    mfcc_speaker_speaking = mfcc_locuteur_speaking(filenames, data_dir)
    # The 12 delta coefficients of the previous coefficients.
    d_mfcc_speaker_speaking = delta(mfcc_speaker_speaking, 2)
    mfcc_24_coeffs = np.hstack((mfcc_speaker_speaking, d_mfcc_speaker_speaking))
    return mfcc_24_coeffs
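# `mfcc_locuteur_speaking` is not defined in this snippet. A hypothetical
# minimal sketch, assuming it loads each recording, computes 13 MFCCs with the
# log energy in place of c0, keeps only high-energy ("speaking") frames, and
# drops the first coefficient to leave coefficients 2-13; the file layout and
# the energy threshold are assumptions.
import os
import numpy as np
import scipy.io.wavfile as wav
from python_speech_features import mfcc

def mfcc_locuteur_speaking(filenames, data_dir, energy_threshold=0.0):
    frames = []
    for name in filenames:
        rate, sig = wav.read(os.path.join(data_dir, name + '.wav'))
        feat = mfcc(sig, rate, numcep=13, appendEnergy=True)
        speaking = feat[feat[:, 0] > energy_threshold]  # keep speaking frames
        frames.append(speaking[:, 1:])  # drop c0, keep coefficients 2-13
    return np.vstack(frames)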
def test_features(fname):
    (rate, sig) = wav.read(fname)
    # Frame length in samples for a 25 ms window; nfft must cover it.
    fr_l = math.floor(rate * 0.025)
    mfcc_feat = mfcc(sig, rate, nfft=fr_l + 1)
    d_mfcc_feat = delta(mfcc_feat, 2)
    # 13 MFCCs followed by their 13 deltas.
    feat = np.zeros((len(d_mfcc_feat), 26))
    feat[:, :13] = mfcc_feat
    feat[:, 13:] = d_mfcc_feat
    # Standardize the features before returning them.
    feat_std = preprocessing.scale(feat)
    return feat_std
def compute_mfcc(filename):
    (rate, sig) = wav.read(filename)
    m = mfcc(sig, samplerate=rate, winlen=0.025, winstep=0.01, numcep=13,
             nfilt=40, nfft=512, lowfreq=0, highfreq=None, preemph=0,
             ceplifter=22, appendEnergy=True)
    # Return only the delta features (window of 6 frames on each side).
    m = delta(m, 6)
    return m
def mfcc_with_delta(audio, samplerate, n_features, n_channels, **kwargs):
    # When this function is called, it is attached to the preprocessor
    # function above and invoked through it.
    """Calculate Mel-frequency cepstral coefficients, and calculate delta
    features if requested."""
    # python_speech_features.mfcc: `audio` must be a 1-d array here..
    # when does it get converted?
    tmp = _features.mfcc(audio, samplerate, numcep=n_features, **kwargs)
    # Returns a numpy array containing features; each row holds one feature
    # vector. Mean-normalize per coefficient.
    tmp -= _np.mean(tmp, axis=0) + 1e-8
    result = [tmp]
    for _ in range(1, n_channels):
        tmp = _features.delta(tmp, 2)
        result.append(tmp)
    result = _np.stack(result, axis=2)
    return result
def featureList(self, path):
    # obj = Silence()
    # newpath = os.path.splitext(path)[0] + "_silenced" + os.path.splitext(path)[1]
    # obj.silencemain(path, newpath)
    (rate, sig) = wav.read(path)
    print("___________________path_________________")
    print(sig.shape)
    print(rate)
    print(path)
    mfcc_feat = mfcc(sig, rate)
    d_mfcc_feat = delta(mfcc_feat, 2)
    fbank_feat = logfbank(sig, rate)
    print("file:feature.py line:24")
    print(fbank_feat.shape)
    return fbank_feat
def extract_features(audio, rate):
    """Extracts 26-dim MFCC features from an audio signal and stacks them
    horizontally with their deltas, log filterbank energies, and spectral
    subband centroids (SSC) to form the combined feature vector."""
    mfcc_feature = mfcc.mfcc(audio, rate, 0.025, 0.01, 26, nfft=1200,
                             preemph=0.97, appendEnergy=True)
    # mfcc_feature = preprocessing.scale(mfcc_feature)
    mfcc_feature1 = mfcc.logfbank(audio, rate, 0.025, 0.01, 26, nfft=1200)
    mfcc_feature2 = mfcc.ssc(audio, rate, 0.025, 0.01, 26, nfft=1200)
    delta = mfcc.delta(mfcc_feature, 26)
    combined = np.hstack((mfcc_feature, delta, mfcc_feature1, mfcc_feature2))
    return combined
def extract_feature(self, path):
    fs, y = wavfile.read(path)
    y = y / np.max(abs(y))
    mfcc_feat = mfcc(y, fs)
    mfcc_feat = delta(mfcc_feat, 2)
    data = pad_sequences(mfcc_feat.T, self.max_frames, dtype=float,
                         padding='post', truncating='post').T
    # Map the parent directory name to the class label.
    targets = {
        '0.Background': 0, '1.Bat den': 1, '2.Tat den': 2,
        '3.Bat dieu hoa': 3, '4.Tat dieu hoa': 4, '5.Bat quat': 5,
        '6.Tat quat': 6, '7.Bat tivi': 7, '8.Tat tivi': 8,
        '9.Mo cua': 9, '10.Dong cua': 10, '11.Khoa cua': 11,
        '12.Mo cong': 12, '13.Dong cong': 13, '14.Khoa cong': 14,
        '15.Doremon': 15,
    }
    target = targets[path.split('/')[-2]]
    return data, target
def extract_features(self, signal, rate, winlen=WINLEN, winstep=WINSTEP,
                     nfft=NFFT, n=N):
    # signal = self._scale_signal(signal)   # lowers it by 4% !?
    # signal = self._cut_the_tips(signal)   # lowers it by 1%
    mfcc_values = mfcc(signal=signal, samplerate=rate, winlen=winlen,
                       winstep=winstep, nfft=nfft)
    dmfcc_values = delta(mfcc_values, n)
    result = np.append(mfcc_values, dmfcc_values, axis=1)
    return result
def mfcc_features(self, audio, rate, numcep=20, nfft=2000, N=2):
    """
    Returns the MFCC and delta MFCC features of the given audio,
    stacked together horizontally

    Parameters:
        :audio: The audio file for which MFCC features must be computed
        :rate: The sample rate of the audio file
        :numcep: The number of cepstra to return, default 20
        :nfft: The FFT size, default 2000
        :N: Calculate delta features based on preceding and following N frames, default 2

    Return Value:
        A numpy array which has the MFCC and delta MFCC features, stacked horizontally
    """
    self.mfcc = python_speech_features.mfcc(audio, rate, numcep=numcep, nfft=nfft)
    # self.mfcc = preprocessing.scale(self.mfcc)
    self.delta_mfcc = python_speech_features.delta(self.mfcc, N)
    self.mfcc_feature = np.hstack((self.mfcc, self.delta_mfcc))
    return self.mfcc_feature
def load_all_wav_into_csv():
    metadata = open_file_read('../accents_data/speakers_all.csv')
    count = 0
    for file in metadata:
        if (file[FILE_MISSING_IDX] == "FALSE"
                and file[FIRST_LANGUAGE_IDX] in ACCEPTED_LANGUAGES):
            count += 1
            filename_wav = os.path.join(
                dirname,
                '../accents_data/recordings_wav/' + file[FILE_NAME_IDX] + '.wav')
            rate, sig = make_standard_length(filename_wav)
            # MFCCs and deltas are computed here, but only the log filterbank
            # features are written out below.
            mfcc_feat = mfcc(sig, rate, nfft=1200, nfilt=13)
            d_mfcc_feat = delta(mfcc_feat, 2)
            fbank_feat = logfbank(sig, rate, nfft=1200, nfilt=13)
            write_to_csv('../accents_data/recordings_csv/' + file[3] + '.csv',
                         fbank_feat)
            progress(count, NUMBER_OF_FILES, "Generating MFCCs")
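# `make_standard_length` is not defined in this snippet. A hypothetical sketch,
# assuming it reads the wav and pads or truncates it to a fixed number of
# samples (the target length is an assumption):
import numpy as np
import scipy.io.wavfile as wav

def make_standard_length(filename, target_len=480000):
    rate, sig = wav.read(filename)
    sig = sig[:target_len]
    sig = np.pad(sig, (0, target_len - len(sig)), 'constant')
    return rate, sig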
def run(sec):
    # Multiply the leading no-speech interval by the frame rate to keep
    # only the range to analyze.
    n = int(rate * sec)
    for x, pcmname in enumerate(pcmnames):
        data = read_pcm(pcmname)  # read the pcm file
        data = data[:n]           # keep only the leading segment of `sec` seconds
        print("[%s]\nstart: open %s" % (ctime(), pcmname))
        # Compute the MFCCs and their mean.
        mfcc_feature = mfcc(data, rate, winlen=length, winstep=step,
                            numcep=n_feature)
        mfcc_mean = np.mean(mfcc_feature.T, axis=1).astype(str)
        # Compute the deltas from the MFCCs and their mean.
        d_mfcc_feat = delta(mfcc_feature, 2)
        d_mfcc_mean = np.mean(d_mfcc_feat.T, axis=1).astype(str)
        # Compute delta-delta (disabled).
        # d2_mfcc_feat = delta(d_mfcc_feat, 2)
        # d2_mfcc_mean = np.mean(d2_mfcc_feat.T, axis=1).astype(str)
        # Collect the results (pcm filename, MFCC means, delta means).
        rslt = np.array([pcmname], dtype=str)
        rslt = np.append(rslt, mfcc_mean)
        rslt = np.append(rslt, d_mfcc_mean)
        # Append the result to one big array.
        if x == 0:
            out = np.array([rslt])
        else:
            out = np.append(out, np.array([rslt]), axis=0)
        print("[%s]\ndone: get features from %s" % (ctime(), pcmname))
    # After reading all pcm files, write the output in tsv format.
    ms = sec * 1000
    outname = "mfcc_feature_%sms.tsv" % ms
    np.savetxt(outname, out, delimiter="\t", fmt="%s")
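# `read_pcm` is not defined in this snippet. A minimal sketch, assuming the
# .pcm files are headerless 16-bit little-endian mono PCM (the dtype is an
# assumption):
import numpy as np

def read_pcm(pcmname):
    return np.fromfile(pcmname, dtype='<i2')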
def __mfcc(audio_data, sampling_rate, win_len, win_step, num_features, n_fft,
           f_min, f_max):
    """Convert a wav signal into Mel Frequency Cepstral Coefficients (MFCC).

    Args:
        audio_data (np.ndarray): Wav signal.
        sampling_rate (int): Sampling rate.
        win_len (float): Window length in seconds.
        win_step (float): Window stride in seconds.
        num_features (int): Number of features to generate.
        n_fft (int): FFT size.
        f_min (float): Minimum frequency to consider.
        f_max (float): Maximum frequency to consider.

    Returns:
        np.ndarray: MFCC feature vectors. Shape: [time, num_features]
    """
    if num_features % 2 != 0:
        raise ValueError('num_features is not a multiple of 2.')

    # Compute MFCC features (half of num_features; the deltas supply the rest).
    mfcc = psf.mfcc(signal=audio_data, samplerate=sampling_rate,
                    winlen=win_len, winstep=win_step,
                    numcep=num_features // 2, nfilt=num_features, nfft=n_fft,
                    lowfreq=f_min, highfreq=f_max, preemph=0.97, ceplifter=22,
                    appendEnergy=True)
    # And the first-order differences (delta features).
    mfcc_delta = psf.delta(mfcc, 2)
    # Combine MFCC with MFCC_delta.
    return np.concatenate([mfcc, mfcc_delta], axis=1)
def process_timit_psf(path, output_path):
    for dirpath, dirnames, filenames in os.walk(path):
        for filename in filenames:
            if filename[-4:] == ".wav":
                full_path = os.path.join(dirpath, filename)
                wave, sr = librosa.load(full_path, mono=True, sr=16000)
                mfcc_features = python_speech_features.mfcc(
                    wave, samplerate=sr, numcep=13, nfilt=26,
                    appendEnergy=True, winlen=0.025, winstep=0.01)
                delta_features = python_speech_features.delta(mfcc_features, 9)
                output_filename = os.path.join(dirpath, filename[:-4] + "_mfcc")
                print(output_filename)
                concat_features = np.concatenate(
                    (mfcc_features, delta_features), axis=1).T
                print(concat_features.shape)
                np.save(output_filename, concat_features, allow_pickle=False)
def MFCC(LOCATION, SAVELOCATION):
    # NOTE: Currently using a library for generating MFCCs, to make sure that
    # the implementation of the EM algorithm is based on correct values. Own
    # implementation of MFCCs can be found commented out below.
    data = genfromtxt(LOCATION, delimiter=",")
    signal = data[:64000].reshape(64000, 1)
    # Compute MFCCs as (frames, coefficients), take the deltas along the time
    # axis, then transpose both to (coefficients, frames). (The original
    # transposed first, which made delta run over the coefficient axis.)
    feat = mfcc(signal, 16000)
    mfcc_feat = np.transpose(feat)
    d_mfcc_feat = np.transpose(delta(feat, 2))
    # Concatenate matrices: the MFCC matrix sits on top of the delta matrix.
    mfcc_deltas = np.vstack((mfcc_feat, d_mfcc_feat))
    np.savetxt(SAVELOCATION, mfcc_deltas, delimiter=",")
    return mfcc_deltas
#!/usr/bin/env python
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
import scipy.io.wavfile as wav

(rate, sig) = wav.read("english.wav")
mfcc_feat = mfcc(sig, rate)
d_mfcc_feat = delta(mfcc_feat, 2)
fbank_feat = logfbank(sig, rate)
print(fbank_feat[1:3, :])
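# For reference: in python_speech_features, delta(feat, N) computes the
# standard regression estimate over a window of N frames on each side,
#     d_t = sum_{n=1..N} n * (c_{t+n} - c_{t-n}) / (2 * sum_{n=1..N} n^2),
# with edge frames repeated. A minimal NumPy re-implementation for
# illustration (delta_sketch is not part of the library):
import numpy as np

def delta_sketch(feat, N):
    denom = 2 * sum(n ** 2 for n in range(1, N + 1))
    padded = np.pad(feat, ((N, N), (0, 0)), mode='edge')
    weights = np.arange(-N, N + 1)
    return np.array([np.dot(weights, padded[t:t + 2 * N + 1]) / denom
                     for t in range(len(feat))])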