def test_dtw_symmetric(self):
    """Check that swapping the two sequences does not change the DTW result.

    Uses the MFCCs of utterances 17 and 3 from ``self.tidigits`` and the
    Euclidean (L2) norm as the local frame distance.
    """
    def l2_distance(a, b):
        # Local distance between two feature frames.
        return np.linalg.norm(a - b, ord=2)

    first = mfcc(self.tidigits[17]['samples'])
    second = mfcc(self.tidigits[3]['samples'])

    forward = dtw(first, second, dist=l2_distance, debug=False)
    backward = dtw(second, first, dist=l2_distance, debug=False)

    # Exact equality is intentional: both directions must give the same value.
    self.assertEqual(backward, forward)
def test_dtw(self):
    """Compare the local ``dtw`` against ``dtw_mod.dtw`` on one utterance pair.

    Both implementations are run on the MFCCs of utterances 12 and 22 with the
    same Euclidean (L2) local distance; the global distances must agree up to
    the tolerance of ``assert_almost_equal``.
    """
    def l2_distance(a, b):
        # Local distance between two feature frames.
        return np.linalg.norm(a - b, ord=2)

    seq_a = mfcc(self.tidigits[12]['samples'])
    seq_b = mfcc(self.tidigits[22]['samples'])

    # dtw_mod.dtw returns a 4-tuple; only its first element is compared here.
    expected, _cost, _acc_cost, _path = dtw_mod.dtw(seq_a, seq_b, dist=l2_distance)
    actual = dtw(seq_a, seq_b, dist=l2_distance, debug=False)

    assert_almost_equal(expected, actual)
def extract_feature(path, states):
    """Build one feature/target record for every ``.wav`` file under ``path``.

    For each file the function computes ``mfcc`` and ``mspec`` features, derives
    the phone transcription from the file name, runs ``forcedAlignment`` and
    stores the aligned targets as indices into ``states``.

    Args:
        path: Root directory that is walked recursively with ``os.walk``.
        states: Sequence supporting ``.index``; each aligned target is replaced
            by its position in this sequence.

    Returns:
        ``np.array`` of dicts with the keys ``'filename'``, ``'lmfcc'``,
        ``'mspec'`` and ``'targets'`` (empty array when no ``.wav`` is found).

    Note:
        Relies on the module-level names ``prondict`` and ``phoneHMMs``.
    """
    records = []
    for root, _dirs, files in os.walk(path):
        for name in files:
            if not name.endswith('.wav'):
                continue

            filename = os.path.join(root, name)
            samples, samplingrate = loadAudio(filename)
            print(filename + '... ', end='')

            # Feature extraction (=> inputs for DNN).
            lmfcc = mfcc(samples)
            mspec_ = mspec(samples)

            # Forced alignment (=> targets for DNN).
            # The word transcription is contained in the file name.
            wordTrans = list(path2info(filename)[2])
            # Word transcription => phone transcription.
            phoneTrans = words2phones(wordTrans, prondict)
            aligned = forcedAlignment(lmfcc, phoneHMMs, phoneTrans)
            # Store the targets as indices into `states`.
            targets = np.array([states.index(state) for state in aligned])

            records.append({
                'filename': filename,
                'lmfcc': lmfcc,
                'mspec': mspec_,
                'targets': targets,
            })
            print('done')
    return np.array(records)
def test_dtw_zero(self):
    """Check that the DTW distance of a sequence to itself is exactly zero.

    The local ``dtw`` is also compared with ``dtw_mod.dtw`` on the same input
    (MFCCs of utterance 7, Euclidean local distance).
    """
    def l2_distance(a, b):
        # Local distance between two feature frames.
        return np.linalg.norm(a - b, ord=2)

    features = mfcc(self.tidigits[7]['samples'])

    # dtw_mod.dtw returns a 4-tuple; only its first element is compared here.
    expected, _cost, _acc_cost, _path = dtw_mod.dtw(features, features, dist=l2_distance)
    actual = dtw(features, features, dist=l2_distance, debug=False)

    assert_almost_equal(expected, actual)
    # Exact comparison: aligning a sequence with itself must cost nothing.
    self.assertEqual(actual, 0.0)
def q6_speech_segment_GMM():
    """Fit diagonal-covariance GMMs on all MFCC frames and plot posteriors.

    A ``GaussianMixture`` is trained on the stacked MFCC frames of every
    utterance in the module-level ``data_dict`` for each of 4, 8, 16 and 32
    components. For each model the component posteriors of four fixed test
    utterances (indices 16, 17, 38, 39 -- presumably the digit "seven", going
    by the plot title) are drawn with ``plot_p_color_mesh``.

    Returns:
        None. The function only produces plots.
    """
    # Training data: MFCC frames of *every* utterance. The index must follow the
    # loop variable; always reading data_dict[0] would train on a single
    # utterance repeated N times.
    mfcc_features = np.vstack([
        mfcc(data_dict[i]['samples']) for i in range(data_dict.shape[0])
    ])

    n_components = [4, 8, 16, 32]
    idx_seven = [16, 17, 38, 39]
    seven_list = data_dict[idx_seven]
    test_mfcc_seven = [mfcc(utterance['samples']) for utterance in seven_list]

    for comp in n_components:
        # Train the GMM model.
        gmm = GaussianMixture(n_components=comp, covariance_type='diag')
        gmm.fit(mfcc_features)
        for i, test_data in enumerate(test_mfcc_seven):
            # Posterior probability of each component for every frame.
            prob = gmm.predict_proba(test_data)
            plot_p_color_mesh(
                prob, 'GMM posterior, n_component=%d, seven #%d' % (comp, i))
def q5_feature_correlation():
    """Plot the feature correlation matrices of MFCC and mspec features.

    The features of every utterance in the module-level ``data_dict`` are
    stacked row-wise, the correlation between feature columns is computed
    with ``np.corrcoef(..., rowvar=False)`` and both matrices are drawn with
    ``plot_p_color_mesh``.

    Returns:
        None. The function only produces plots.
    """
    mfcc_per_utterance = []
    mspec_per_utterance = []
    for utterance in data_dict:
        mfcc_per_utterance.append(mfcc(utterance["samples"]))
        mspec_per_utterance.append(mspec(utterance["samples"]))

    stacked_mfcc = np.vstack(mfcc_per_utterance)
    stacked_mspec = np.vstack(mspec_per_utterance)

    # rowvar=False: columns are the variables, rows are the observations.
    mfcc_cor = np.corrcoef(stacked_mfcc, rowvar=False)
    mspec_cor = np.corrcoef(stacked_mspec, rowvar=False)

    plot_p_color_mesh(mfcc_cor, 'MxM Mfcc correlations')
    plot_p_color_mesh(mspec_cor, 'MxM mspec correlations')
def concat_all_features(data, feature="mfcc"):
    """Extract one feature type from every utterance and stack the results.

    Args:
        data: Iterable of dicts with the keys ``'samples'`` and
            ``'samplingrate'``.
        feature: Either ``"mfcc"`` or ``"mspec"``; selects the extractor.

    Returns:
        The per-utterance feature arrays concatenated along axis 0, or
        ``None`` when ``data`` is empty.

    Raises:
        AssertionError: If ``feature`` is not one of the supported names.
    """
    assert feature in ["mfcc", "mspec"]

    # Collect first and concatenate once: concatenating inside the loop copies
    # the whole accumulated array on every iteration.
    per_utterance = []
    for d in data:
        sample = d['samples']
        sampling_rate = d['samplingrate']
        if feature == "mfcc":
            per_utterance.append(mfcc(sample, samplingrate=sampling_rate))
        else:
            per_utterance.append(mspec(sample, samplingrate=sampling_rate))

    if not per_utterance:
        # Nothing to stack; callers receive None for empty input.
        return None
    return np.concatenate(per_utterance, axis=0)
def feature_extraction_and_force_alignment(filepath, nstates, phoneHMMs):
    """Extract features from one ``.wav`` file and align it to HMM states.

    Args:
        filepath: Path of the audio file; the word transcription is taken from
            ``path2info(filepath)[2]``.
        nstates: Mapping from phone name to its number of HMM states.
        phoneHMMs: Phone models handed to ``forcedAlignment``.

    Returns:
        Tuple ``(lmfcc_result, mspec_result, targets)`` where ``targets`` is a
        list of state names of the form ``'<phone>_<stateid>'``, one per entry
        of the path returned by ``forcedAlignment``.

    Note:
        Relies on the module-level pronunciation dictionary ``prondict``.
    """
    samples, _samplingrate = loadAudio(filepath)

    # Words (from the file name) => phones => per-phone state names.
    words = list(path2info(filepath)[2])
    phones = words2phones(words, prondict)
    state_names = []
    for phone in phones:
        for state_id in range(nstates[phone]):
            state_names.append(phone + '_' + str(state_id))

    lmfcc_result = mfcc(samples)
    mspec_result = mspec(samples)

    # The second return value is used as indices into the state names above.
    _, viterbi_path = forcedAlignment(lmfcc_result, phoneHMMs, phones)
    state_indices = viterbi_path.astype(np.int16)
    targets = [state_names[idx] for idx in state_indices]

    return lmfcc_result, mspec_result, targets
def q7_global_distance():
    """Compute, plot and save the pairwise DTW distance matrix.

    The MFCCs of the first 44 utterances of the module-level ``data_dict`` are
    compared pairwise with ``dtw`` using the ``euclidean`` local distance. The
    matrix is treated as symmetric with a zero diagonal, so each unordered
    pair is evaluated once and mirrored.

    Side effects:
        Draws the matrix with ``plot_p_color_mesh`` and writes it to
        ``'global distance.npy'``.
    """
    n_utterances = 44  # fixed number of utterances used by this experiment

    all_mfcc = [mfcc(data_dict[i]['samples']) for i in range(n_utterances)]
    global_distances = np.zeros((n_utterances, n_utterances))

    # Walk the upper triangle only. This avoids using "entry == 0" as a
    # not-yet-computed marker, which would re-run DTW for any pair whose true
    # distance is zero.
    for i in range(n_utterances):
        for j in range(i + 1, n_utterances):
            # dtw returns a 4-tuple; only the global distance is needed.
            global_d, _, _, _ = dtw(all_mfcc[i], all_mfcc[j], euclidean)
            global_distances[i, j] = global_d
            global_distances[j, i] = global_d

    plot_p_color_mesh(global_distances, 'global distance matrix')
    np.save('global distance.npy', global_distances)
nstates = {phone: phoneHMMs[phone]['means'].shape[0] for phone in phones} stateList = [p + '_' + str(i) for p in phones for i in range(nstates[p])] ######################## # 4.2 Forced alignment # ######################## if DEBUG: # load correct example example = np.load('data/lab3_example.npz', allow_pickle=True)['example'].item() # feature extraction filename = 'data/tidigits/disc_4.1.1/tidigits/train/man/nw/z43a.wav' samples, samplingrate = loadAudio(filename) lmfcc = mfcc(samples) # transcription wordTrans = list(path2info(filename) [2]) # word transcription (contained in the filename) phoneTrans = words2phones( wordTrans, prondict) # word transcription => phone transcription stateTrans = [ p + '_' + str(i) for p in phoneTrans for i in range(nstates[p]) ] # phone transcription => state transcription # combined HMM for utterance utteranceHMM = concatHMMs(phoneHMMs, phoneTrans) # Viterbi decoder obsloglik = log_multivariate_normal_density_diag(lmfcc,
# Expand every phone into its state names, e.g. a phone 'ah' with 3 states
# becomes ['ah_0', 'ah_1', 'ah_2'].
stateList = []
for ph in phoneHMMs:
    for i in range(nstates[ph]):
        stateList.append('%s_%d' % (ph, i))

# --------------------------------------------------------------
# Extract features and aligned targets for every .wav file below
# folder_to_extract, then dump all records into a single .npz file.
data = []
for root, dirs, files in walk(folder_to_extract):
    for f in tqdm(files):
        if f.endswith('.wav'):
            filename = os.path.join(root, f)
            sample, srate = loadAudio(filename)
            mspec_x = mspec(sample, samplingrate=srate)
            lmfcc_x = mfcc(sample, samplingrate=srate)

            # The word transcription is derived from the file path.
            wordTrans = list(path2info(filename)[2])
            phoneTrans = words2phones(wordTrans, prondict)
            targets = forcedAlignment(lmfcc_x, phoneHMMs, phoneTrans)

            # Convert the targets from state names to integer indices.
            idx_targets = [stateList.index(t) for t in targets]

            data.append({
                'filename': filename,
                'lmfcc': lmfcc_x,
                'mspec': mspec_x,
                'targets': idx_targets,
            })

# The archive key is chosen at run time through data_type.
kwargs = {data_type: data}
np.savez(dump_file_name, **kwargs)
Calculate a global distance matrix and save the gdist file """ import numpy as np import config from lab1_proto import dtw from lab1_proto import mfcc from scipy.cluster.hierarchy import dendrogram, linkage from matplotlib import pyplot as plt if __name__ == "__main__": data = np.load('data/lab1_data.npz')['data'] # calcualte the global distance matrix ndata = len(data) # global_dist matrix is symmetric # the diagonal terms are zeros global_dist = np.zeros((ndata, ndata)) cnt = 0 for i, j in zip(*np.triu_indices(ndata, k=1)): feature_i = mfcc(data[i]['samples']) feature_j = mfcc(data[j]['samples']) d = dtw(feature_i, feature_j) global_dist[i, j] = d global_dist[j, i] = d cnt += 1 if cnt % 100 == 0: print("Calculated %d global distances" % cnt, flush=True) np.save(config.gdist_npy_file, global_dist)
customPlot(fft_, 'spec:abs(FFT)^2', True) logMel = logMelSpectrum(fft_, sampling_rate, 512) if (np.allclose(example['mspec'], logMel, atol=1e-08)): customPlot(logMel, 'mspec:Mel Filterbank', True) mfcc_ = cepstrum(logMel, 13) if (np.allclose(example['mfcc'], mfcc_, atol=1e-08)): customPlot(mfcc_, 'mfcc:MFCCs', True) lmfcc_ = lifter(mfcc_) if (np.allclose(example['lmfcc'], lmfcc_, atol=1e-08)): customPlot(lmfcc_, 'lmfcc:Liftered MFCCs', True) from lab1_proto import mfcc, mspec data = np.load('lab1_data.npz', allow_pickle=True)['data'] for i in range(data.shape[0]): samples = data[i]['samples'] s = mfcc(samples) t = mspec(samples) if (i == 0): data_mfcc = s data_mspec = t else: data_mfcc = np.append(data_mfcc, s, axis=0) data_mspec = np.append(data_mspec, t, axis=0) plt.pcolormesh(np.corrcoef(data_mfcc.T)) ## how corrcoef works ? plt.pcolormesh(np.corrcoef(data_mspec.T)) from sklearn.mixture import GaussianMixture gmm = GaussianMixture(n_components=4).fit(data_mfcc) labels = gmm.predict(data_mfcc)
verbose=1) # fix the initialization for repetition # train the GMM with all data clf.fit(all_features) for digit in ['1', '4', '7']: test_data = pick_data_by_digit(data, digit=digit) fig, axes = plt.subplots(nrows=len(test_data), ncols=1, sharex=True, sharey=True, figsize=(12, 8)) # prediction and plot the posterior matrix for i, d in enumerate(test_data): features = mfcc(d['samples'], samplingrate=d['samplingrate']) posterior_prob = clf.predict_proba(features) ax = axes[i] title = 'Posterior for digit {digit} by {speaker} ({gender})'.format( **d) ax.set_title(title) im = ax.matshow(posterior_prob.T) ax.xaxis.tick_bottom() ax.set_aspect('auto') for ax in axes.flat: ax.set(xlabel='frame', ylabel='classes') # Hide x labels and tick labels for top plots and y ticks for right plots. for ax in axes.flat: ax.label_outer()