예제 #1
0
 def test_dtw_symmetric(self):
     # Swapping the two input sequences must not change the value returned
     # by dtw() when the local distance itself is symmetric.
     def norm2(a, b):
         # Order-2 norm of the difference, via np.linalg.norm(..., ord=2).
         return np.linalg.norm(a - b, ord=2)

     first = mfcc(self.tidigits[17]['samples'])
     second = mfcc(self.tidigits[3]['samples'])
     forward = dtw(first, second, dist=norm2, debug=False)
     backward = dtw(second, first, dist=norm2, debug=False)
     self.assertEqual(backward, forward)
예제 #2
0
 def test_dtw(self):
     # The local dtw() must agree (up to assert_almost_equal tolerance) with
     # the first value returned by dtw_mod.dtw() on the same inputs.
     # NOTE(review): dtw_mod is presumably a reference implementation.
     def norm2(a, b):
         # Order-2 norm of the difference, via np.linalg.norm(..., ord=2).
         return np.linalg.norm(a - b, ord=2)

     first = mfcc(self.tidigits[12]['samples'])
     second = mfcc(self.tidigits[22]['samples'])
     # dtw_mod.dtw returns four values; only the first one is compared.
     expected, _cost, _acc_cost, _path = dtw_mod.dtw(first, second,
                                                     dist=norm2)
     actual = dtw(first, second, dist=norm2, debug=False)
     assert_almost_equal(expected, actual)
예제 #3
0
def extract_feature(path, states):
    """Build one feature/target record per ``.wav`` file found under *path*.

    The directory tree is traversed with ``os.walk``. For each file whose
    name ends in ``.wav`` the audio is loaded, ``mfcc`` and ``mspec`` features
    are computed, and ``forcedAlignment`` is run against the phone
    transcription derived from the filename. Progress is printed per file.

    Reads the module-level names ``prondict`` and ``phoneHMMs``.

    Args:
        path: Root directory passed to ``os.walk``.
        states: Sequence searched with ``.index()`` to convert each aligned
            target into an integer position.

    Returns:
        ``np.array`` built from a list of dicts with the keys ``'filename'``,
        ``'lmfcc'``, ``'mspec'`` and ``'targets'``; empty when no ``.wav``
        file is found.
    """
    records = []
    for dirpath, _subdirs, names in os.walk(path):
        for name in names:
            if not name.endswith('.wav'):
                continue

            filename = os.path.join(dirpath, name)
            samples, _rate = loadAudio(filename)
            print(filename + '... ', end='')

            # Feature extraction (inputs for the DNN).
            lmfcc = mfcc(samples)
            mspec_ = mspec(samples)

            # Forced alignment (targets for the DNN). Element [2] of
            # path2info() is the word transcription encoded in the filename.
            word_seq = list(path2info(filename)[2])
            phone_seq = words2phones(word_seq, prondict)
            aligned = forcedAlignment(lmfcc, phoneHMMs, phone_seq)
            # Store targets as indices into `states`.
            target_ids = np.array([states.index(label) for label in aligned])

            record = {
                'filename': filename,
                'lmfcc': lmfcc,
                'mspec': mspec_,
                'targets': target_ids,
            }
            records.append(record)
            print('done')
    return np.array(records)
예제 #4
0
 def test_dtw_zero(self):
     # Aligning a sequence with itself: the local dtw() must match the first
     # value returned by dtw_mod.dtw() and must be exactly 0.0.
     def norm2(a, b):
         # Order-2 norm of the difference, via np.linalg.norm(..., ord=2).
         return np.linalg.norm(a - b, ord=2)

     feats = mfcc(self.tidigits[7]['samples'])
     # dtw_mod.dtw returns four values; only the first one is compared.
     expected, _cost, _acc_cost, _path = dtw_mod.dtw(feats, feats,
                                                     dist=norm2)
     actual = dtw(feats, feats, dist=norm2, debug=False)
     assert_almost_equal(expected, actual)
     # Exact comparison on purpose (the original kept an assert_allclose
     # alternative commented out).
     self.assertEqual(actual, 0.0)
def q6_speech_segment_GMM():
    """Fit diagonal-covariance GMMs on pooled MFCCs and plot test posteriors.

    MFCC features of every entry in the module-level ``data_dict`` are
    stacked with ``np.vstack`` and used to fit one ``GaussianMixture`` per
    component count in ``[4, 8, 16, 32]``. For each fitted model the
    component posteriors (``predict_proba``) of the four utterances at
    indices 16, 17, 38 and 39 are drawn with ``plot_p_color_mesh``.

    Takes no arguments and returns nothing; its only output is the plots.
    """
    # Pool the features of ALL utterances. The earlier version indexed
    # data_dict[0] on every iteration, so the model was trained on repeated
    # copies of the first utterance only.
    mfcc_features = np.vstack([
        mfcc(data_dict[i]['samples']) for i in range(data_dict.shape[0])
    ])

    n_components = [4, 8, 16, 32]
    # NOTE(review): presumably the utterances of the digit "seven" (see the
    # plot title) — confirm against the data set.
    idx_seven = [16, 17, 38, 39]
    test_mfcc_seven = [mfcc(data_dict[idx]['samples']) for idx in idx_seven]

    for comp in n_components:
        # Train a GMM with `comp` diagonal-covariance components.
        gmm = GaussianMixture(n_components=comp, covariance_type='diag')
        gmm.fit(mfcc_features)

        for i, test_data in enumerate(test_mfcc_seven):
            prob = gmm.predict_proba(test_data)  # per-sample posteriors
            plot_p_color_mesh(
                prob, 'GMM posterior, n_component=%d, seven #%d' % (comp, i))
def q5_feature_correlation():
    """Plot correlation matrices of the MFCC and mspec features.

    Features are computed for every entry of the module-level ``data_dict``,
    stacked with ``np.vstack``, and passed to ``np.corrcoef`` with
    ``rowvar=False`` so that columns are treated as the variables. Both
    matrices are drawn with ``plot_p_color_mesh``. Returns nothing.
    """
    # Compute both feature types per utterance, in the same order as before
    # (mfcc first, then mspec, utterance by utterance).
    per_utterance = [(mfcc(entry["samples"]), mspec(entry["samples"]))
                     for entry in data_dict]

    stacked_mfcc = np.vstack([pair[0] for pair in per_utterance])
    stacked_mspec = np.vstack([pair[1] for pair in per_utterance])

    mfcc_cor = np.corrcoef(stacked_mfcc, rowvar=False)
    mspec_cor = np.corrcoef(stacked_mspec, rowvar=False)

    plot_p_color_mesh(mfcc_cor, 'MxM Mfcc correlations')
    plot_p_color_mesh(mspec_cor, 'MxM mspec correlations')
예제 #7
0
def concat_all_features(data, feature="mfcc"):
    """Concatenate the chosen feature type of every utterance along axis 0.

    Args:
        data: Iterable of mappings providing ``'samples'`` and
            ``'samplingrate'``.
        feature: ``"mfcc"`` or ``"mspec"``; selects which extractor is called.

    Returns:
        The per-utterance feature arrays joined with ``np.concatenate`` along
        axis 0, or ``None`` when *data* yields no items.

    Raises:
        AssertionError: If *feature* is not one of the two supported names.
    """
    assert feature in ["mfcc", "mspec"]

    # Collect the parts and concatenate once at the end. Concatenating onto
    # a growing array inside the loop recopies all earlier rows each time.
    parts = []
    for d in data:
        sample = d['samples']
        sampling_rate = d['samplingrate']
        if feature == "mfcc":
            parts.append(mfcc(sample, samplingrate=sampling_rate))
        else:
            parts.append(mspec(sample, samplingrate=sampling_rate))

    if not parts:
        # Preserve the historical result for empty input.
        return None
    return np.concatenate(parts, axis=0)
def feature_extraction_and_force_alignment(filepath, nstates, phoneHMMs):
    """Extract features from one .wav file and force-align it.

    Reads the module-level ``prondict`` to turn the word transcription
    (element ``[2]`` of ``path2info(filepath)``) into a phone transcription.

    Args:
        filepath: Path of the audio file passed to ``loadAudio``.
        nstates: Mapping from phone to its number of states.
        phoneHMMs: Phone models handed to ``forcedAlignment``.

    Returns:
        Tuple ``(lmfcc_result, mspec_result, targets)`` where ``targets`` is
        the list of state names (``'<phone>_<stateid>'``) selected by the
        second value returned from ``forcedAlignment``.
    """
    samples, _samplingrate = loadAudio(filepath)

    word_seq = list(path2info(filepath)[2])
    phone_seq = words2phones(word_seq, prondict)

    # Expand every phone into its state names, e.g. 'ah' -> 'ah_0', 'ah_1', ...
    state_names = []
    for phone in phone_seq:
        for stateid in range(nstates[phone]):
            state_names.append(phone + '_' + str(stateid))

    lmfcc_result = mfcc(samples)
    mspec_result = mspec(samples)

    _, viterbi_path = forcedAlignment(lmfcc_result, phoneHMMs, phone_seq)
    # The path is cast to int16 so its entries can index the state-name list.
    state_indices = viterbi_path.astype(np.int16)
    targets = [state_names[idx] for idx in state_indices]

    return lmfcc_result, mspec_result, targets
def q7_global_distance():
    """Compute, plot and save the 44x44 matrix of pairwise DTW distances.

    MFCCs of the first 44 entries of the module-level ``data_dict`` are
    compared pairwise with ``dtw(..., euclidean)``. The matrix is drawn with
    ``plot_p_color_mesh`` and written to ``'global distance.npy'``.
    Returns nothing.
    """
    size = 44
    global_distances = np.zeros((size, size))
    all_mfcc = [mfcc(data_dict[k]['samples']) for k in range(size)]

    for row in range(size):
        for col in range(size):
            if row == col:
                # Diagonal entries keep their initial value of 0.
                continue

            mirrored = global_distances[col, row]
            if mirrored != 0:
                # The transposed entry is already filled in; reuse it.
                global_distances[row, col] = mirrored
                continue

            # dtw returns four values; only the first (global distance)
            # is stored.
            global_d, _, _, _ = dtw(all_mfcc[row], all_mfcc[col], euclidean)
            global_distances[row, col] = global_d

    plot_p_color_mesh(global_distances, 'global distance matrix')
    np.save('global distance.npy', global_distances)
예제 #10
0
# Number of states per phone, taken as the first dimension of each phone
# model's 'means' array.
nstates = {phone: phoneHMMs[phone]['means'].shape[0] for phone in phones}
# Flat list of state names of the form '<phone>_<index>', in the order of
# `phones` and, within a phone, by increasing state index.
stateList = [p + '_' + str(i) for p in phones for i in range(nstates[p])]

########################
# 4.2 Forced alignment #
########################

# Debug-only walk-through of forced alignment on one fixed utterance.
if DEBUG:
    # Load the stored example (presumably known-good values to compare
    # against — TODO confirm). allow_pickle=True is needed because the
    # 'example' entry is unwrapped with .item(), i.e. it is a pickled Python
    # object; only load files from a trusted source this way.
    example = np.load('data/lab3_example.npz',
                      allow_pickle=True)['example'].item()

    # feature extraction
    filename = 'data/tidigits/disc_4.1.1/tidigits/train/man/nw/z43a.wav'
    samples, samplingrate = loadAudio(filename)
    lmfcc = mfcc(samples)

    # transcription
    wordTrans = list(path2info(filename)
                     [2])  # word transcription (contained in the filename)
    phoneTrans = words2phones(
        wordTrans, prondict)  # word transcription => phone transcription
    stateTrans = [
        p + '_' + str(i) for p in phoneTrans for i in range(nstates[p])
    ]  # phone transcription => state transcription

    # combined HMM for the utterance, built from the phone models in the
    # order given by phoneTrans
    utteranceHMM = concatHMMs(phoneHMMs, phoneTrans)

    # Viterbi decoder
    obsloglik = log_multivariate_normal_density_diag(lmfcc,
예제 #11
0
    # E.g. # of states of ah = 3; ah -> ['ah_0', 'ah_1', 'ah_2']
    stateList = list()
    for ph in phoneHMMs.keys():
        for i in range(nstates[ph]):
            stateList.append('%s_%d' % (ph, i))
    # --------------------------------------------------------------
    data = list()
    for root, dirs, files in walk(folder_to_extract):
        for f in tqdm(files):
            if not f.endswith('.wav'):
                continue
            # do our work
            filename = os.path.join(root, f)
            sample, srate = loadAudio(filename)
            mspec_x = mspec(sample, samplingrate=srate)
            lmfcc_x = mfcc(sample, samplingrate=srate)
            wordTrans = list(path2info(filename)[2])
            phoneTrans = words2phones(wordTrans, prondict)
            targets = forcedAlignment(lmfcc_x, phoneHMMs, phoneTrans)
            # convert the targets from str to int
            idx_targets = [stateList.index(t) for t in targets]
            data.append({
                'filename': filename,
                'lmfcc': lmfcc_x,
                'mspec': mspec_x,
                'targets': idx_targets
            })

    kwargs = {data_type: data}
    np.savez(dump_file_name, **kwargs)
예제 #12
0
Calculate a global distance matrix and save the gdist file
"""
import numpy as np
import config
from lab1_proto import dtw
from lab1_proto import mfcc
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt

if __name__ == "__main__":
    # allow_pickle=True: the records are indexed like mappings
    # (data[i]['samples']), which NumPy stores as pickled objects; recent
    # NumPy versions refuse to load those without this flag.
    # NOTE: unpickling executes code from the file — only load trusted data.
    data = np.load('data/lab1_data.npz', allow_pickle=True)['data']
    ndata = len(data)

    # Extract each utterance's MFCCs once, instead of twice per pair inside
    # the O(ndata^2) loop below.
    features = [mfcc(data[k]['samples']) for k in range(ndata)]

    # The global distance matrix is symmetric with a zero diagonal, so only
    # the strict upper triangle is computed and then mirrored.
    global_dist = np.zeros((ndata, ndata))

    pairs = zip(*np.triu_indices(ndata, k=1))
    for cnt, (i, j) in enumerate(pairs, start=1):
        d = dtw(features[i], features[j])
        global_dist[i, j] = d
        global_dist[j, i] = d
        if cnt % 100 == 0:
            print("Calculated %d global distances" % cnt, flush=True)

    np.save(config.gdist_npy_file, global_dist)
    customPlot(fft_, 'spec:abs(FFT)^2', True)
# Each remaining stage is computed from the previous one and plotted only
# when it matches the stored example value within atol=1e-08.

# Log Mel filterbank outputs.
logMel = logMelSpectrum(fft_, sampling_rate, 512)
if np.allclose(example['mspec'], logMel, atol=1e-08):
    customPlot(logMel, 'mspec:Mel Filterbank', True)

# Cepstral coefficients (second argument: 13).
mfcc_ = cepstrum(logMel, 13)
if np.allclose(example['mfcc'], mfcc_, atol=1e-08):
    customPlot(mfcc_, 'mfcc:MFCCs', True)

# Liftered cepstral coefficients.
lmfcc_ = lifter(mfcc_)
if np.allclose(example['lmfcc'], lmfcc_, atol=1e-08):
    customPlot(lmfcc_, 'lmfcc:Liftered MFCCs', True)

from lab1_proto import mfcc, mspec
# NOTE: allow_pickle=True unpickles objects from the file — trusted data only.
data = np.load('lab1_data.npz', allow_pickle=True)['data']

# Gather the per-utterance feature arrays first and join them once.
# Appending to a growing array inside the loop recopies all earlier rows on
# every iteration.
mfcc_parts = []
mspec_parts = []
for i in range(data.shape[0]):
    samples = data[i]['samples']
    mfcc_parts.append(mfcc(samples))
    mspec_parts.append(mspec(samples))
data_mfcc = np.concatenate(mfcc_parts, axis=0)
data_mspec = np.concatenate(mspec_parts, axis=0)

# np.corrcoef treats each ROW as one variable by default, so the pooled
# arrays are transposed to correlate their columns with each other.
# NOTE(review): both meshes are drawn without creating a new figure in
# between — confirm that the second one is not meant to go on its own figure.
plt.pcolormesh(np.corrcoef(data_mfcc.T))
plt.pcolormesh(np.corrcoef(data_mspec.T))

from sklearn.mixture import GaussianMixture
# Fit a 4-component Gaussian mixture on the pooled MFCC data, then assign
# every row to one of its components.
gmm = GaussianMixture(n_components=4)
gmm.fit(data_mfcc)
labels = gmm.predict(data_mfcc)
예제 #14
0
            verbose=1)  # fix the initialization for repetition

        # train the GMM with all data
        clf.fit(all_features)

        for digit in ['1', '4', '7']:
            test_data = pick_data_by_digit(data, digit=digit)

            fig, axes = plt.subplots(nrows=len(test_data),
                                     ncols=1,
                                     sharex=True,
                                     sharey=True,
                                     figsize=(12, 8))
            # prediction and plot the posterior matrix
            for i, d in enumerate(test_data):
                features = mfcc(d['samples'], samplingrate=d['samplingrate'])
                posterior_prob = clf.predict_proba(features)
                ax = axes[i]
                title = 'Posterior for digit {digit} by {speaker} ({gender})'.format(
                    **d)
                ax.set_title(title)
                im = ax.matshow(posterior_prob.T)
                ax.xaxis.tick_bottom()
                ax.set_aspect('auto')

            for ax in axes.flat:
                ax.set(xlabel='frame', ylabel='classes')
            # Hide x labels and tick labels for top plots and y ticks for right plots.
            for ax in axes.flat:
                ax.label_outer()