def mfcc(y1, y2, y3, sr1, sr2, sr3, yTest, srTest):
    """Locate the stretch of a test recording that best matches three references.

    MFCCs are computed for the three reference signals and the test signal,
    every MFCC frame is mean-centred and peak-normalised, and a window as
    long as the first reference is slid over the test features.  For each
    window position the DTW cost against each reference is averaged; the
    position with the lowest average is taken as the match.

    Args:
        y1, y2, y3: reference audio signals.
        sr1, sr2, sr3: sampling rates of the reference signals.
        yTest: audio signal to search in.
        srTest: sampling rate of ``yTest``.

    Returns:
        The slice of ``yTest`` covering the best-matching window.  (The
        original computed this slice but discarded it.)

    Raises:
        ValueError: if the test signal has no more frames than the first
            reference, so no window position exists.

    Side effects:
        Plots the per-window distance curve on the current matplotlib axes.
    """
    import matplotlib.pyplot as plt

    def preprocess_mfcc(mfcc):
        """Centre each column on zero and scale it to a peak magnitude of 1."""
        centred = mfcc - np.mean(mfcc, axis=0, keepdims=True)
        return centred / np.max(np.abs(centred), axis=0, keepdims=True)

    def frame_cost(x, y):
        """Exponentiated L1 distance between two feature frames."""
        return np.exp(np.linalg.norm(x - y, ord=1))

    def dtw_cost(reference, window):
        """Accumulated DTW cost at the end of the optimal path."""
        # librosa.dtw takes features as (n_features, n_frames), names the
        # distance argument ``metric`` and returns (D, wp); the total cost
        # is the bottom-right cell of D.
        accumulated, _ = librosa.dtw(reference, window, metric=frame_cost)
        return accumulated[-1, -1]

    # Convert the data to MFCC (keyword arguments work on every librosa release).
    mfcc1 = preprocess_mfcc(librosa.feature.mfcc(y=y1, sr=sr1, n_mfcc=20))
    mfcc2 = preprocess_mfcc(librosa.feature.mfcc(y=y2, sr=sr2, n_mfcc=20))
    mfcc3 = preprocess_mfcc(librosa.feature.mfcc(y=y3, sr=sr3, n_mfcc=20))
    mfccTest = preprocess_mfcc(librosa.feature.mfcc(y=yTest, sr=srTest))

    window_size = mfcc1.shape[1]
    n_windows = mfccTest.shape[1] - window_size
    if n_windows <= 0:
        raise ValueError(
            "test signal must contain more MFCC frames than the first reference")

    dists = np.zeros(n_windows)
    for i in range(n_windows):
        mfcci = mfccTest[:, i:i + window_size]
        dists[i] = (dtw_cost(mfcc1, mfcci)
                    + dtw_cost(mfcc2, mfcci)
                    + dtw_cost(mfcc3, mfcci)) / 3

    plt.plot(dists)

    # Select the minimum-distance window.
    word_match_idx = int(dists.argmin())

    # Convert MFCC frame indices back to sample positions.
    # NOTE(review): 512 is presumably the MFCC hop length - confirm.
    samples_per_mfcc = 512
    # The original added ``2 / 2`` (i.e. one sample) as offset; kept as-is.
    # Bounds are plain ints because NumPy rejects float slice indices.
    start = 1 + word_match_idx * samples_per_mfcc
    stop = 1 + (word_match_idx + window_size) * samples_per_mfcc
    word = yTest[start:stop]
    return word
def test_dtw_subseq_sym():
    """Subsequence DTW must find the same match whichever input is the query."""
    query = np.arange(4)
    database = np.array([10., 10., 0., 1., 2., 3., 10., 10.])

    # Expected path for (query, database); swapping the inputs swaps the columns.
    expected_qd = np.array([[3, 5], [2, 4], [1, 3], [0, 2]])
    expected_dq = expected_qd[:, ::-1]

    path_qd = librosa.dtw(query, database, subseq=True)[1]
    path_dq = librosa.dtw(database, query, subseq=True)[1]

    assert np.array_equal(expected_qd, path_qd)
    assert np.array_equal(expected_dq, path_dq)
def test_dtw_subseq_sym():
    """Check subsequence DTW warping paths for both argument orders."""
    ramp = np.arange(4)
    padded = np.array([10., 10., 0., 1., 2., 3., 10., 10.])

    # (ramp index, padded index) pairs, listed from path end to path start.
    want_ramp_first = np.array([[3, 5], [2, 4], [1, 3], [0, 2]])
    # The same pairs with the roles of the two sequences exchanged.
    want_padded_first = np.array([[5, 3], [4, 2], [3, 1], [2, 0]])

    got_ramp_first = librosa.dtw(ramp, padded, subseq=True)[1]
    got_padded_first = librosa.dtw(padded, ramp, subseq=True)[1]

    assert np.array_equal(want_ramp_first, got_ramp_first)
    assert np.array_equal(want_padded_first, got_padded_first)
def mfcc_dtw(y, sr, yTest, srTest):
    """Return a DTW cost between two signals' averaged MFCC+delta features.

    For each signal, 24 MFCCs and their deltas are stacked, every frame
    (column) is mean-centred and peak-normalised, and the result is averaged
    over all frames.  The two averaged vectors are then compared with
    ``librosa.dtw`` using the Euclidean metric.

    Args:
        y: reference audio signal.
        sr: sampling rate of ``y``.
        yTest: test audio signal.
        srTest: sampling rate of ``yTest``.

    Returns:
        The bottom-right entry of the accumulated cost matrix.
    """
    def stacked_features(signal, rate):
        """MFCCs with their delta features appended along the feature axis."""
        coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=24,
                                      n_fft=2048, hop_length=512)
        return np.concatenate((coeffs, librosa.feature.delta(coeffs)), axis=0)

    def preprocess_mfcc(mfcc):
        """Remove the mean of each column and scale it to a peak magnitude of 1."""
        centred = mfcc - np.mean(mfcc, axis=0, keepdims=True)
        return centred / np.max(np.abs(centred), axis=0, keepdims=True)

    # Average the normalised features over all frames.
    mfcc = preprocess_mfcc(stacked_features(y, sr)).mean(1)
    mfccTest = preprocess_mfcc(stacked_features(yTest, srTest)).mean(1)

    # Calculate the distance from the test signal.
    d, _ = librosa.dtw(mfccTest, mfcc, metric='euclidean')
    return d[-1, -1]
def plotAndSave(self, Y, Z, fig_name="compare", extension="png"):
    """Plot the subsequence-DTW cost matrix of ``Y`` vs ``Z`` and save it.

    Any existing file with the target name in ``TEMP_FOLDER`` is removed
    first.  The figure shows the accumulated cost matrix with the optimal
    warping path, and below it the matching cost function (last row of the
    cost matrix divided by the path length).

    Args:
        Y: first feature sequence passed to ``librosa.dtw``.
        Z: second feature sequence passed to ``librosa.dtw``.
        fig_name: file name without extension; also used as the figure label.
        extension: image file extension.

    Returns:
        True when the figure was written, False when anything failed.
    """
    fig_name = fig_name + '.' + extension
    try:
        if self.fileExists(TEMP_FOLDER, fig_name):
            os.remove(TEMP_FOLDER + fig_name)
    except Exception as e:
        print(e)

    try:
        D, wp = librosa.dtw(Y, Z, subseq=True)
        plt.figure(fig_name)
        try:
            plt.subplot(2, 1, 1)
            librosa.display.specshow(D, x_axis='frames', y_axis='frames')
            plt.plot(wp[:, 1], wp[:, 0], label='Optimal path', color='y')
            plt.legend()
            plt.subplot(2, 1, 2)
            plt.plot(D[-1, :] / wp.shape[0])
            # The curve has one point per column of D, so the x range comes
            # from D itself rather than from the first input sequence.
            plt.xlim([0, D.shape[1]])
            plt.ylim([0, 2])
            plt.title('Matching cost function')
            plt.tight_layout()
            plt.savefig(TEMP_FOLDER + fig_name)
        finally:
            # Close (not just clear) the figure so repeated calls do not
            # accumulate open figures.
            plt.close(fig_name)
    except Exception as e:
        print('Plotting failed.')
        print(e)
        return False
    return True
def MCC_with_DTW(sample, dest):
    '''
    Compare two sounds with DTW after equalising their peak magnitude.

    Args:
        sample: first sound to compare.
        dest: second sound to compare; it is rescaled so that its largest
            value matches the largest value of ``sample``.

    Returns:
        Absolute value of the last entry of the accumulated DTW cost matrix.
    '''
    # MCC: Magnitude Control Compare.
    # The peak search starts from a tiny positive floor, so the scale factor
    # never divides by zero.
    floor = 0.000000001
    peak_sample = max([floor] + list(sample))
    peak_dest = max([floor] + list(dest))

    # Rescale dest so both signals share the same peak value.
    rescaled = [value * peak_sample / peak_dest for value in dest]

    accumulated, _ = lb.dtw(sample, rescaled, subseq=True)
    return abs(accumulated[-1, -1])
def dtw_score(X, Y):
    """Return the global DTW cost of ``X`` vs ``Y`` divided by the longer length.

    The length of each input is read from ``shape[1]``.
    """
    from librosa import dtw

    accumulated, _ = dtw(X, Y)
    longest = max(X.shape[1], Y.shape[1])
    return float(accumulated[-1, -1] / longest)
def mfcc_dtw(y, sr, yTest, srTest):
    """Calculate MFCCs of test and reference and return their DTW distance.

    For each signal, 24 MFCCs and their deltas are stacked, every frame
    (column) is mean-centred and peak-normalised, and the result is averaged
    over all frames.  The two averaged vectors are compared with
    ``librosa.dtw`` using the Euclidean metric.

    Args:
        y: reference audio signal.
        sr: sampling rate of ``y``.
        yTest: test audio signal.
        srTest: sampling rate of ``yTest``.

    Returns:
        The bottom-right entry of the accumulated cost matrix.
    """
    def stacked_features(signal, rate):
        """MFCCs with their delta features appended along the feature axis."""
        coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=24,
                                      n_fft=2048, hop_length=512)
        return np.concatenate((coeffs, librosa.feature.delta(coeffs)), axis=0)

    def preprocess_mfcc(mfcc):
        """Remove the mean of each column and scale it to a peak magnitude of 1."""
        centred = mfcc - np.mean(mfcc, axis=0, keepdims=True)
        return centred / np.max(np.abs(centred), axis=0, keepdims=True)

    # Average the normalised features over all frames.
    mfcc = preprocess_mfcc(stacked_features(y, sr)).mean(1)
    mfccTest = preprocess_mfcc(stacked_features(yTest, srTest)).mean(1)

    # Calculate the distance from the test signal.
    d, _ = librosa.dtw(mfccTest, mfcc, metric='euclidean')
    return d[-1, -1]
def test_dtw_global():
    """Global DTW must reproduce the textbook accumulated cost matrix.

    Example taken from: Meinard Mueller, Fundamentals of Music Processing.
    """
    seq_a = np.array([[1, 3, 3, 8, 1]])
    seq_b = np.array([[2, 0, 0, 8, 7, 2]])

    expected = np.array([
        [1, 2, 3, 10, 16, 17],
        [2, 4, 5, 8, 12, 13],
        [3, 5, 7, 10, 12, 13],
        [9, 11, 13, 7, 8, 14],
        [10, 10, 11, 14, 13, 9],
    ], dtype=float)

    with_path = librosa.dtw(seq_a, seq_b)
    assert np.array_equal(expected, with_path[0])

    # Check that it works without backtracking.
    cost_only = librosa.dtw(seq_a, seq_b, backtrack=False)
    assert np.array_equal(with_path[0], cost_only)
def test_dtw_global():
    """Compare the accumulated cost of global DTW with a known reference.

    Example taken from: Meinard Mueller, Fundamentals of Music Processing.
    """
    first = np.array([[1, 3, 3, 8, 1]])
    second = np.array([[2, 0, 0, 8, 7, 2]])

    reference_rows = (
        (1., 2., 3., 10., 16., 17.),
        (2., 4., 5., 8., 12., 13.),
        (3., 5., 7., 10., 12., 13.),
        (9., 11., 13., 7., 8., 14.),
        (10., 10., 11., 14., 13., 9.),
    )
    reference = np.array(reference_rows)

    backtracked_cost, _ = librosa.dtw(first, second)
    assert np.array_equal(reference, backtracked_cost)

    # The cost matrix must be the same when backtracking is switched off.
    plain_cost = librosa.dtw(first, second, backtrack=False)
    assert np.array_equal(backtracked_cost, plain_cost)
def mfcc(y1, y2, y3, sr1, sr2, sr3, yTest, srTest):
    """Locate the stretch of a test recording that best matches three references.

    MFCCs are computed for the three reference signals and the test signal,
    every MFCC frame is mean-centred and peak-normalised, and a window as
    long as the first reference is slid over the test features.  For each
    window position the DTW cost against each reference is averaged; the
    position with the lowest average is taken as the match.

    Args:
        y1, y2, y3: reference audio signals.
        sr1, sr2, sr3: sampling rates of the reference signals.
        yTest: audio signal to search in.
        srTest: sampling rate of ``yTest``.

    Returns:
        The slice of ``yTest`` covering the best-matching window.  (The
        original computed this slice but discarded it.)

    Raises:
        ValueError: if the test signal has no more frames than the first
            reference, so no window position exists.

    Side effects:
        Plots the per-window distance curve on the current matplotlib axes.
    """
    import matplotlib.pyplot as plt

    def preprocess_mfcc(mfcc):
        """Centre each column on zero and scale it to a peak magnitude of 1."""
        centred = mfcc - np.mean(mfcc, axis=0, keepdims=True)
        return centred / np.max(np.abs(centred), axis=0, keepdims=True)

    def frame_cost(x, y):
        """Exponentiated L1 distance between two feature frames."""
        return np.exp(np.linalg.norm(x - y, ord=1))

    def dtw_cost(reference, window):
        """Accumulated DTW cost at the end of the optimal path."""
        # librosa.dtw takes features as (n_features, n_frames), names the
        # distance argument ``metric`` and returns (D, wp); the total cost
        # is the bottom-right cell of D.
        accumulated, _ = librosa.dtw(reference, window, metric=frame_cost)
        return accumulated[-1, -1]

    # Convert the data to MFCC (keyword arguments work on every librosa release).
    mfcc1 = preprocess_mfcc(librosa.feature.mfcc(y=y1, sr=sr1, n_mfcc=20))
    mfcc2 = preprocess_mfcc(librosa.feature.mfcc(y=y2, sr=sr2, n_mfcc=20))
    mfcc3 = preprocess_mfcc(librosa.feature.mfcc(y=y3, sr=sr3, n_mfcc=20))
    mfccTest = preprocess_mfcc(librosa.feature.mfcc(y=yTest, sr=srTest))

    window_size = mfcc1.shape[1]
    n_windows = mfccTest.shape[1] - window_size
    if n_windows <= 0:
        raise ValueError(
            "test signal must contain more MFCC frames than the first reference")

    dists = np.zeros(n_windows)
    for i in range(n_windows):
        mfcci = mfccTest[:, i:i + window_size]
        dists[i] = (dtw_cost(mfcc1, mfcci)
                    + dtw_cost(mfcc2, mfcci)
                    + dtw_cost(mfcc3, mfcci)) / 3

    plt.plot(dists)

    # Select the minimum-distance window.
    word_match_idx = int(dists.argmin())

    # Convert MFCC frame indices back to sample positions.
    # NOTE(review): 512 is presumably the MFCC hop length - confirm.
    samples_per_mfcc = 512
    # The original added ``2 / 2`` (i.e. one sample) as offset; kept as-is.
    # Bounds are plain ints because NumPy rejects float slice indices.
    start = 1 + word_match_idx * samples_per_mfcc
    stop = 1 + (word_match_idx + window_size) * samples_per_mfcc
    word = yTest[start:stop]
    return word
def compare_test_record_2_learning_list(self, test_element):
    """Return the name of the learned entry closest to ``test_element``.

    Every entry of ``self.learning_list`` is compared with ``test_element``
    via subsequence DTW on the entry's ``mfcc`` attribute; the last cell of
    the accumulated cost matrix is used as the distance.

    Args:
        test_element: features of the recording to recognise.

    Returns:
        The ``name`` of the closest entry, or ``"nie rozpoznano"`` when the
        smallest distance exceeds ``MIN_DIST`` or nothing has been learned.
    """
    # With nothing to compare against, report "not recognised" instead of
    # failing inside min() on an empty list.
    if len(self.learning_list) == 0:
        return "nie rozpoznano"

    dist = [
        librosa.dtw(test_element, entry.mfcc, subseq=True)[0][-1, -1]
        for entry in self.learning_list
    ]
    best = min(dist)
    if best > MIN_DIST:
        return "nie rozpoznano"
    return self.learning_list[dist.index(best)].name
def test_dtw_global_diagonal():
    """With only the diagonal step allowed, the path must be the diagonal."""
    # Query is a linear ramp, compared against itself.
    ramp = np.linspace(0.1, 1, 10)

    # Diagonal index pairs listed from the end of the path to the start.
    descending = np.arange(9, -1, -1)
    expected_path = np.column_stack((descending, descending))

    _, found_path = librosa.dtw(ramp, ramp,
                                subseq=True,
                                metric='cosine',
                                step_sizes_sigma=np.array([[1, 1]]),
                                weights_mul=np.array([1, ]))

    assert np.array_equal(expected_path, np.asarray(found_path))
def test_dtw_global_diagonal():
    """Check that a diagonal-only step pattern yields the diagonal warping path."""
    # The query is a linear ramp; the database is the very same ramp.
    X = np.linspace(0.1, 1, 10)
    Y = X

    # Pairs (9, 9), (8, 8), ..., (0, 0).
    wanted = [(k, k) for k in reversed(range(10))]

    only_diagonal = np.array([[1, 1]])
    unit_weight = np.array([1, ])
    result = librosa.dtw(X, Y, subseq=True, metric='cosine',
                         step_sizes_sigma=only_diagonal,
                         weights_mul=unit_weight)
    _, got = result

    assert np.array_equal(np.asarray(wanted), np.asarray(got))
def compareto(audio, reference):
    """Score how well ``audio`` aligns with ``reference``.

    Both arguments are ``(samples, sample_rate)`` pairs.  The warping path of
    the first MFCC coefficient is scored with weight 2 and the warping path
    of the CQT chroma features with weight 1; the weighted sum is divided
    by 3.
    """
    xy, xsr = audio
    yy, ysr = reference

    mfccX = feature.mfcc(y=xy, sr=xsr)
    mfccY = feature.mfcc(y=yy, sr=ysr)
    chromaX = feature.chroma_cqt(y=xy, sr=xsr)
    chromaY = feature.chroma_cqt(y=yy, sr=ysr)

    # Only the first MFCC row is aligned; its path counts double.
    _, mfcc_path = dtw(mfccX[0], mfccY[0])
    score = 0 + getscore(mfcc_path) * 2

    _, chroma_path = dtw(chromaX, chromaY)
    score = score + getscore(chroma_path)

    return score / 3
def FindTask(Record_File_Path):
    """Decide which spoken command a recording contains and dispatch it.

    The recording is compared by DTW on 20 MFCCs against six reference
    clips ("translate" and "time" in three languages).  The reference with
    the smallest final accumulated cost selects the action:
    ``FindLanguage(Record_File_Path, code)`` for the translate clips or
    ``FindTime(code)`` for the time clips, with code 0/1/2 for ch/en/jp.
    Pre-emphasis is applied to the recording only.
    """
    Compare_File_Path = './comparing voice data/'
    reference_files = (
        'translate_ch.wav', 'translate_en.wav', 'translate_jp.wav',
        'time_ch.wav', 'time_en.wav', 'time_jp.wav',
    )
    # One action per reference clip, in the same order as reference_files.
    actions = (
        lambda: FindLanguage(Record_File_Path, 0),
        lambda: FindLanguage(Record_File_Path, 1),
        lambda: FindLanguage(Record_File_Path, 2),
        lambda: FindTime(0),
        lambda: FindTime(1),
        lambda: FindTime(2),
    )

    # Load everything first, then extract features, then align.
    recording, recording_rate = lib.load(Record_File_Path)
    references = [lib.load(Compare_File_Path + name) for name in reference_files]

    MFCC_test = lib.feature.mfcc(y=pre_emphasis(signal=recording),
                                 sr=recording_rate, n_mfcc=20)
    reference_mfccs = [lib.feature.mfcc(y=samples, sr=rate, n_mfcc=20)
                       for samples, rate in references]

    cost_matrices = [lib.dtw(MFCC_test, ref)[0] for ref in reference_mfccs]
    final_costs = [matrix[-1, -1] for matrix in cost_matrices]

    Shortest_D = min(final_costs)
    # The first reference whose cost equals the minimum wins.
    for cost, action in zip(final_costs, actions):
        if Shortest_D == cost:
            action()
            break
def test_dtw_subseq():
    """Subsequence DTW must recover a ramp embedded in random noise."""
    # Query is a linear ramp.
    ramp = np.linspace(0, 1, 100)

    # Database is the query surrounded by noise.
    noise = np.random.rand(200)
    database = np.concatenate((noise, noise, ramp, noise))

    path = librosa.dtw(ramp, database, subseq=True)[1]

    # The path runs from end to start, so its last row holds the first
    # matched database index and its first row the last one (inclusive,
    # hence the +1 for slicing).
    match_start = path[-1][1]
    match_stop = path[0][1] + 1
    assert np.array_equal(ramp, database[match_start:match_stop])
def test_dtw_global_constrained():
    """Band-constrained global DTW must leave cells outside the band at inf.

    Example taken from: Meinard Mueller, Fundamentals of Music Processing.
    """
    seq_a = np.array([[1, 3, 3, 8, 1]])
    seq_b = np.array([[2, 0, 0, 8, 7, 2]])

    # With band_rad = 0.5, the expected accumulated cost matrix is:
    inf = np.inf
    expected = np.array([
        [1., 2., 3., inf, inf, inf],
        [2., 4., 5., 8., inf, inf],
        [inf, 5., 7., 10., 12., inf],
        [inf, inf, 13., 7., 8., 14.],
        [inf, inf, inf, 14., 13., 9.],
    ])

    computed = librosa.dtw(seq_a, seq_b,
                           backtrack=False,
                           global_constraints=True,
                           band_rad=0.5)
    assert np.array_equal(expected, computed)
def MCC_with_DTW(sample, dest):
    """Compare two sounds with DTW after equalising their peak magnitude.

    MCC stands for "Magnitude Control Compare": ``dest`` is rescaled so that
    its largest value matches the largest value of ``sample`` before both
    are aligned with subsequence DTW.

    Args:
        sample: first sound to compare.
        dest: second sound to compare.

    Returns:
        Absolute value of the last entry of the accumulated DTW cost matrix.
    """
    # Start the peak search from a tiny positive floor instead of 0, so a
    # ``dest`` without any positive value cannot cause a division by zero.
    floor = 1e-9
    largest_sample = max([floor] + list(sample))
    largest_dest = max([floor] + list(dest))

    temp = [value * largest_sample / largest_dest for value in dest]

    D, _ = librosa.dtw(sample, temp, subseq=True)
    return abs(D[-1, -1])
def test_dtw_global_supplied_distance_matrix():
    """DTW on a precomputed distance matrix must ignore the ``metric`` argument.

    Example taken from: Meinard Mueller, Fundamentals of Music Processing.
    """
    seq_a = np.array([[1, 3, 3, 8, 1]])
    seq_b = np.array([[2, 0, 0, 8, 7, 2]])

    # Precompute the pairwise distance matrix.
    pairwise = cdist(seq_a.T, seq_b.T, metric='euclidean')

    expected = np.array([
        [1, 2, 3, 10, 16, 17],
        [2, 4, 5, 8, 12, 13],
        [3, 5, 7, 10, 12, 13],
        [9, 11, 13, 7, 8, 14],
        [10, 10, 11, 14, 13, 9],
    ], dtype=float)

    # An invalid metric name is passed on purpose: it must not be used when
    # the distance matrix is supplied.
    accumulated = librosa.dtw(C=pairwise, metric='invalid')[0]
    assert np.array_equal(expected, accumulated)
def test_dtw_global_constrained():
    """Check the accumulated cost of global DTW under a band constraint.

    Example taken from: Meinard Mueller, Fundamentals of Music Processing.
    """
    first = np.array([[1, 3, 3, 8, 1]])
    second = np.array([[2, 0, 0, 8, 7, 2]])

    # Reference result for band_rad = 0.5; unreachable cells stay infinite.
    blocked = np.inf
    reference_rows = (
        (1., 2., 3., blocked, blocked, blocked),
        (2., 4., 5., 8., blocked, blocked),
        (blocked, 5., 7., 10., 12., blocked),
        (blocked, blocked, 13., 7., 8., 14.),
        (blocked, blocked, blocked, 14., 13., 9.),
    )
    reference = np.array(reference_rows)

    options = dict(backtrack=False, global_constraints=True, band_rad=0.5)
    result = librosa.dtw(first, second, **options)
    assert np.array_equal(reference, result)
def MCC_with_DTW(sample, dest):
    """Align two sounds with DTW after equalising their peak magnitude.

    MCC stands for "Magnitude Control Compare": ``dest`` is rescaled so that
    its largest value matches the largest value of ``sample`` before both
    are aligned with subsequence DTW.  The scale factor and a progress
    message are printed.

    Args:
        sample: first sound to compare.
        dest: second sound to compare.

    Returns:
        The ``(D, wp)`` pair returned by ``librosa.dtw``.
    """
    # Start the peak search from a tiny positive floor instead of 0, so a
    # ``dest`` without any positive value cannot cause a division by zero.
    floor = 1e-9
    largest_sample = max([floor] + list(sample))
    largest_dest = max([floor] + list(dest))

    print(largest_sample / largest_dest)
    temp = [value * largest_sample / largest_dest for value in dest]

    print("Start Calc DTW.")
    D, wp = librosa.dtw(sample, temp, subseq=True)
    return D, wp
def test_dtw_global_supplied_distance_matrix():
    """Verify global DTW when the caller passes the distance matrix directly.

    Example taken from: Meinard Mueller, Fundamentals of Music Processing.
    """
    X = np.array([[1, 3, 3, 8, 1]])
    Y = np.array([[2, 0, 0, 8, 7, 2]])

    reference_rows = (
        (1., 2., 3., 10., 16., 17.),
        (2., 4., 5., 8., 12., 13.),
        (3., 5., 7., 10., 12., 13.),
        (9., 11., 13., 7., 8., 14.),
        (10., 10., 11., 14., 13., 9.),
    )
    reference = np.array(reference_rows)

    # Distances are computed up front; the bogus metric name below checks
    # that it is not consulted once C is given.
    distances = cdist(X.T, Y.T, metric='euclidean')
    result = librosa.dtw(C=distances, metric='invalid')
    cost, _ = result

    assert np.array_equal(reference, cost)
def FindTask(Record_File_Path):
    """Decide which spoken command a recording contains and dispatch it.

    The recording is compared by DTW on 20 MFCCs against six reference
    clips ("translate" and "time" in three languages).  The reference with
    the smallest final accumulated cost selects the action:
    ``FindLanguage(Record_File_Path, code)`` for the translate clips or
    ``FindTime(code)`` for the time clips, with code 0/1/2 for ch/en/jp.
    When none of the first five costs equals the minimum, ``FindTime(2)``
    is called.  Pre-emphasis is applied to the recording only.
    """
    Compare_File_Path = './comparing voice data/'
    reference_files = (
        'translate_ch.wav', 'translate_en.wav', 'translate_jp.wav',
        'time_ch2.wav', 'time_en.wav', 'time_jp.wav',
    )
    # Actions for the first five references; the sixth is the fallback.
    checked_actions = (
        lambda: FindLanguage(Record_File_Path, 0),
        lambda: FindLanguage(Record_File_Path, 1),
        lambda: FindLanguage(Record_File_Path, 2),
        lambda: FindTime(0),
        lambda: FindTime(1),
    )

    # Load everything first, then extract features, then align.
    recording, recording_rate = lib.load(Record_File_Path)
    references = [lib.load(Compare_File_Path + name) for name in reference_files]

    MFCC_test = lib.feature.mfcc(y=pre_emphasis(signal=recording),
                                 sr=recording_rate, n_mfcc=20)
    reference_mfccs = [lib.feature.mfcc(y=samples, sr=rate, n_mfcc=20)
                       for samples, rate in references]

    cost_matrices = [lib.dtw(MFCC_test, ref)[0] for ref in reference_mfccs]
    final_costs = [matrix[-1, -1] for matrix in cost_matrices]

    Shortest_D = min(final_costs)
    for cost, action in zip(final_costs, checked_actions):
        if Shortest_D == cost:
            action()
            break
    else:
        FindTime(2)
def test_dtw_incompatible_args_02():
    """Call ``librosa.dtw`` with neither a cost matrix nor feature sequences.

    NOTE(review): presumably this call is expected to raise; no assertion or
    raises-decorator is visible here - confirm how the failure is checked.
    """
    nothing_supplied = dict(C=None, X=None, Y=None)
    librosa.dtw(**nothing_supplied)
def test_dtw_incompatible_args_01():
    """Call ``librosa.dtw`` with a cost matrix and both sequences at once.

    NOTE(review): presumably this call is expected to raise; no assertion or
    raises-decorator is visible here - confirm how the failure is checked.
    """
    everything_supplied = dict(C=1, X=1, Y=1)
    librosa.dtw(**everything_supplied)
def librosa_dtw(X, Y):
    """Run subsequence DTW on ``X`` and ``Y``.

    Returns:
        The accumulated cost matrix and the warping path, as a 2-tuple.
    """
    cost_matrix, warping_path = librosa.dtw(X, Y, subseq=True)
    return (cost_matrix, warping_path)
def test_dtw_incompatible_args_01():
    """Pass ``C``, ``X`` and ``Y`` together to ``librosa.dtw``.

    NOTE(review): presumably this call is expected to raise; no assertion or
    raises-decorator is visible here - confirm how the failure is checked.
    """
    conflicting = {name: 1 for name in ('C', 'X', 'Y')}
    librosa.dtw(**conflicting)
def FindLanguage(Record_File_Path, Language_code):
    """Ask for the target language, recognise the answer and continue.

    Plays the language prompts for the speaker's language (0 = ch, 1 = en,
    anything else = jp), records a reply via ``DetectSound``, and compares
    its 20 MFCCs by DTW against reference clips of the two other languages.
    The language whose clip has the smaller final accumulated cost becomes
    the translation target (ties go to the second candidate).  Finally the
    "say number" prompt is played and ``TranslateNumber`` is called.

    Args:
        Record_File_Path: path of the recording to analyse.
        Language_code: code of the speaker's language.
    """
    Response_File_Path = './response voice data/'
    Compare_File_Path = './comparing voice data/'

    # Suffix of the speaker's language plus the two candidate targets as
    # (suffix, language code) pairs.
    if Language_code == 0:
        char_append = 'ch'
        candidates = (('en', 1), ('jp', 2))
    elif Language_code == 1:
        char_append = 'en'
        candidates = (('ch', 0), ('jp', 2))
    else:
        char_append = 'jp'
        candidates = (('ch', 0), ('en', 1))

    playsound(Response_File_Path + 'language_response_' + char_append + '.wav')
    time.sleep(0.5)
    playsound(Response_File_Path + 'language_select_' + char_append + '.wav')

    DetectSound(Record_File_Path)

    Language_test, fs0 = lib.load(Record_File_Path)
    MFCC_test = lib.feature.mfcc(y=pre_emphasis(signal=Language_test),
                                 sr=fs0, n_mfcc=20)

    # Load both candidate clips, then extract features, then align.
    clips = [
        lib.load(Compare_File_Path + 'language_' + suffix + '_' + char_append + '.wav')
        for suffix, _ in candidates
    ]
    clip_mfccs = [lib.feature.mfcc(y=samples, sr=rate, n_mfcc=20)
                  for samples, rate in clips]
    cost_matrices = [lib.dtw(MFCC_test, clip_mfcc)[0] for clip_mfcc in clip_mfccs]
    compare1, compare2 = [matrix[-1, -1] for matrix in cost_matrices]

    if compare1 < compare2:
        translate_lang = candidates[0][1]
    else:
        translate_lang = candidates[1][1]

    playsound(Response_File_Path + 'say_number_' + char_append + '.wav')
    TranslateNumber(Language_code, translate_lang, Record_File_Path)
def alignment_dtw(gt_cens_all,
                  gt_tempo,
                  gt_start_frame,
                  gt_end_frame,
                  input_clip_cens,
                  input_clip_tempo,
                  tempo_adj_man,
                  clip_length,
                  cens_fps,
                  tempo_change_ratio_limit_dtw,
                  tempo_change_ratio_limit_clip,
                  tempo_max_song,
                  tempo_min_song,
                  ):
    """Estimate the tempo ratio of an input clip against a ground-truth excerpt.

    The input features are rescaled with ``align_2_target`` to the length of
    the ground-truth excerpt and aligned to it with band-constrained
    subsequence DTW.  A straight line is fitted to the warping path, a
    trimmed mean of local path slopes gives the final slope, and that slope
    is turned into a tempo ratio, which is clamped, manually adjusted and
    applied to the input tempo.

    Args:
        gt_cens_all: ground-truth features; columns are sliced by frame.
        gt_tempo: unused in this function.
        gt_start_frame: first ground-truth frame of the excerpt.
        gt_end_frame: end frame (exclusive) of the excerpt.
        input_clip_cens: features of the input clip.
        input_clip_tempo: tempo of the input clip.
        tempo_adj_man: manual tempo adjustment in percent.
        clip_length: unused in this function.
        cens_fps: unused in this function.
        tempo_change_ratio_limit_dtw: passed to ``librosa.dtw`` as ``band_rad``.
        tempo_change_ratio_limit_clip: limit applied to the tempo ratio.
        tempo_max_song: upper bound of the output tempo.
        tempo_min_song: lower bound of the output tempo.

    Returns:
        A 12-tuple: path x values, path y values, final slope, line offset,
        least-squares residuals, excerpt length, path cost, estimated input
        tempo, tempo ratio, estimated ground-truth end frame, "input is
        slower" flag, "input is faster" flag.
    """
    # Find the estimated ground-truth frames within the whole song.
    est_gt_cens = gt_cens_all[:, gt_start_frame:gt_end_frame]

    # Calculate the scaled input.
    gt_audio_length = est_gt_cens.shape[1]
    input_audio_row_num = input_clip_cens.shape[0]
    scaled_input = align_2_target(input_clip_cens, input_audio_row_num,
                                  gt_audio_length)

    # Run DTW(scaled input, estimated ground-truth audio).
    cost_matrix, wp = librosa.dtw(scaled_input, est_gt_cens,
                                  global_constraints=True,
                                  band_rad=tempo_change_ratio_limit_dtw,
                                  subseq=True)

    # The warping path is stored end-to-start; reverse it into time order.
    pre_reg_x = wp[:, 1][::-1]
    pre_reg_y = wp[:, 0][::-1]
    pre_reg_x_with_coef = np.vstack([pre_reg_x, np.ones(len(pre_reg_x))]).T

    # Solve the least-squares line fit once and reuse the result.
    lstsq_result = np.linalg.lstsq(pre_reg_x_with_coef, pre_reg_y)
    reg_slope, reg_coef = lstsq_result[0]
    reg_residuals = lstsq_result[1]

    # Find all points where x advances, ignoring the first and last 3 %.
    start_chp = int(0.03 * len(pre_reg_x))
    end_chp = int(0.97 * len(pre_reg_x))
    x_change_point = [i + 1 for i in range(start_chp, end_chp)
                      if pre_reg_x[i + 1] > pre_reg_x[i]]

    # Local slopes measured over a span of 3.2 % of the change points.
    # (Built-in int/float replace the removed np.int/np.float aliases.)
    slope_x_length = int(len(x_change_point) * 0.032)
    slope_list = []
    for j in range(0, len(x_change_point) - slope_x_length - 1):
        span_end = x_change_point[j + slope_x_length]
        span_start = x_change_point[j]
        delta_x = float(pre_reg_x[span_end] - pre_reg_x[span_start])
        delta_y = float(pre_reg_y[span_end] - pre_reg_y[span_start])
        if (delta_x > 0) and (delta_y > 0):
            slope_list.append(delta_y / delta_x)

    # Trimmed mean: average the middle half of the sorted slopes.
    sorted_slope_list = quicksort(slope_list)
    slope_list_len = len(sorted_slope_list)
    start_list = int(slope_list_len * 0.25)
    end_list = int(slope_list_len * 0.75)
    final_slope = np.mean(sorted_slope_list[start_list:end_list])

    # Shift the new line so that it meets the fitted line at the middle x
    # (the last path index with that x value; 0 if none matches).
    middle_x = int((pre_reg_x[len(pre_reg_x) - 1] + pre_reg_x[0]) / 2)
    middle_x_index = 0
    for k in range(0, len(pre_reg_x)):
        if pre_reg_x[k] == middle_x:
            middle_x_index = k

    old_line_y = reg_slope * pre_reg_x[middle_x_index] + reg_coef
    new_line_y = final_slope * pre_reg_x[middle_x_index] + reg_coef
    line_dy_center = old_line_y - new_line_y

    # Line value at the end of the excerpt decides slower/faster.
    line_end_y = final_slope * est_gt_cens.shape[1] + reg_coef + line_dy_center
    zzz_dtw_input_is_faster = 0
    zzz_dtw_input_is_slower = 0
    if line_end_y > est_gt_cens.shape[1]:
        zzz_dtw_input_is_slower = 1
    else:
        zzz_dtw_input_is_faster = 1

    zzz_dtw_cal_tempo_ratio = est_gt_cens.shape[1] / float(line_end_y)

    # Set the change ratio limit.
    zzz_dtw_cal_tempo_ratio = min((1.0 + tempo_change_ratio_limit_clip),
                                  zzz_dtw_cal_tempo_ratio)
    zzz_dtw_cal_tempo_ratio = max(1.0 / (1 + tempo_change_ratio_limit_clip),
                                  zzz_dtw_cal_tempo_ratio)

    # Manually overwrite the tempo.
    zzz_dtw_cal_tempo_ratio = zzz_dtw_cal_tempo_ratio * (
        1.0 + float(tempo_adj_man) / 100.0)
    zzz_dtw_cal_input_tempo = float(input_clip_tempo) * zzz_dtw_cal_tempo_ratio

    # Force the output tempo into the allowed range.
    zzz_dtw_cal_input_tempo = max(tempo_min_song, zzz_dtw_cal_input_tempo)
    zzz_dtw_cal_input_tempo = min(tempo_max_song, zzz_dtw_cal_input_tempo)

    zzz_est_gt_endframe = gt_start_frame + int(
        est_gt_cens.shape[1] * zzz_dtw_cal_tempo_ratio)

    zzz_pre_reg_x = pre_reg_x
    zzz_pre_reg_y = pre_reg_y
    zzz_final_reg_slope = final_slope
    zzz_line_offset = reg_coef + line_dy_center
    zzz_reg_residuals = reg_residuals

    # Calculate the total cost along the best path (end cell minus start cell).
    wp_length = wp.shape[0]
    X_start = wp[wp_length - 1, 0]
    X_end = wp[0, 0]
    Y_start = wp[wp_length - 1, 1]
    Y_end = wp[0, 1]
    total_best_path_cost = abs(cost_matrix[X_end, Y_end]
                               - cost_matrix[X_start, Y_start])
    zzz_dtw_cost = total_best_path_cost

    zzz_gt_length = est_gt_cens.shape[1]

    return (zzz_pre_reg_x,
            zzz_pre_reg_y,
            zzz_final_reg_slope,
            zzz_line_offset,
            zzz_reg_residuals,
            zzz_gt_length,
            zzz_dtw_cost,
            zzz_dtw_cal_input_tempo,
            zzz_dtw_cal_tempo_ratio,
            zzz_est_gt_endframe,
            zzz_dtw_input_is_slower,
            zzz_dtw_input_is_faster)
def get_alignment_from_audio(body, response):
    """Calculate the alignment of an MEI file to an audio file.

    Returns a dictionary containing IDs of rests and notes as keys and their
    corresponding position in the audio file as values.  On invalid input
    ``response.status`` is set and an error message (or traceback text) is
    returned instead.
    """
    provided_fields = tuple(body.keys())
    if not ('mei' in provided_fields and 'audio' in provided_fields):
        response.status = HTTP_BAD_REQUEST
        return 'Please provide MEI and audio file.'

    # Work in a temporary directory.
    with tempfile.TemporaryDirectory() as temp_dir:
        # Write the uploaded audio to a temporary file and read it back as an array.
        audio_path = os.path.join(temp_dir, 'audio')
        with open(audio_path, mode='wb') as audio_file:
            audio_file.write(body['audio'])

        try:
            wave_data, sr = librosa.load(audio_path)
        except NoBackendError:
            response.status = HTTP_UNSUPPORTED_MEDIA_TYPE
            return 'Unsupported audio format.'

        # Generate timestamps for all notes and rests of the MEI file.
        Mei = jpype.JPackage('meico').mei.Mei  # Get Mei class
        try:
            mei_xml = body['mei'].decode('utf-8')  # Extract MEI data from body
            mei = Mei(mei_xml, False)  # Read in MEI data
            mei.addIds()
            # Generate timestamps with ppq=720, no channel 10, no cleanup.
            mei.exportMsm(720, True, False)
            debug_mei_xml = mei.toXML()
        except jpype.JavaException:
            response.status = HTTP_BAD_REQUEST
            return traceback.format_exc()

        # Calculate MEI chroma features.
        chroma_mei, id_to_chroma_index = from_meico(debug_mei_xml)

        # Calculate audio chroma features, with the hop length chosen so
        # that the audio gets about as many frames as the MEI chroma.
        chroma_size = round(len(wave_data) / chroma_mei.shape[1])
        chroma_audio = librosa.feature.chroma_stft(y=wave_data, sr=sr,
                                                   hop_length=chroma_size)

        # Calculate the warping path and index it by MEI chroma frame.
        warping_path = librosa.dtw(chroma_mei, chroma_audio)[1]
        mei_to_audio_frame = dict(warping_path)

        # Extract the mappings: seconds per audio chroma frame times frame index.
        chroma_length = len(wave_data) / sr / chroma_audio.shape[1]
        return {
            identifier: mei_to_audio_frame[id_to_chroma_index[identifier]] * chroma_length
            for identifier in id_to_chroma_index
        }
def test_dtw_incompatible_sigma_diag():
    """Call ``librosa.dtw`` with a diagonal-only step on unequal-length inputs.

    NOTE(review): presumably this call is expected to raise; no assertion or
    raises-decorator is visible here - confirm how the failure is checked.
    """
    six_frames = np.array([[1, 3, 3, 8, 1, 2]])
    five_frames = np.array([[2, 0, 0, 8, 7]])
    diagonal_only = np.ones((1, 2), dtype=int)
    librosa.dtw(X=six_frames, Y=five_frames, step_sizes_sigma=diagonal_only)
def get_alignment_from_yt(body, response):
    """Calculate the alignment of an MEI file to a YouTube video.

    Returns a dictionary containing IDs of rests and notes as keys and their
    corresponding position in the YouTube video as values.  On invalid input
    ``response.status`` is set and an error message (or traceback text) is
    returned instead.
    """
    if 'mei' not in body or 'youtube-url' not in body:
        response.status = HTTP_BAD_REQUEST
        return 'Please provide MEI and a valid YouTube link.'

    # Work in a temporary directory.
    with tempfile.TemporaryDirectory() as temp_dir:
        # Download the YouTube video.
        youtube_url = body['youtube-url']
        video_path = os.path.join(temp_dir, 'youtube.audio')
        # 249 = <Stream: itag="249" mime_type="audio/webm" abr="50kbps" acodec="opus">
        download_video(youtube_url, 249, video_path)

        # Extract audio using FFmpeg.
        audio_path = os.path.join(temp_dir, 'audio.wav')
        cmd = [
            'ffmpeg', '-i', video_path, '-acodec', 'pcm_s16le', '-ac', '2',
            audio_path
        ]
        subprocess.call(cmd)

        # Trim silence in the audio file and read it into a numpy array.
        audio = AudioSegment.from_file(audio_path, format='wav')
        audio = audio.split_to_mono()[0]
        sr = audio.frame_rate
        trim_start = detect_leading_silence(audio)
        trim_end = detect_leading_silence(audio.reverse())
        trimmed = audio[trim_start:len(audio) - trim_end]
        # Built-in float replaces the removed np.float alias (same dtype).
        wave_data = np.asarray(trimmed.get_array_of_samples(), dtype=float)

        # Generate timestamps for all notes and rests of the MEI file.
        Mei = jpype.JPackage('meico').mei.Mei  # Get Mei class
        try:
            mei_xml = body['mei'].decode('utf-8')  # Extract MEI data from body
            mei = Mei(mei_xml, False)  # Read in MEI data
            mei.addIds()
            # Generate timestamps with ppq=720, no channel 10, no cleanup.
            mei.exportMsm(720, True, False)
            debug_mei_xml = mei.toXML()
        except jpype.JavaException:
            response.status = HTTP_BAD_REQUEST
            return traceback.format_exc()

        # Calculate MEI chroma features.
        chroma_mei, id_to_chroma_index = from_meico(debug_mei_xml)

        # Calculate audio chroma features, with the hop length chosen so
        # that the audio gets about as many frames as the MEI chroma.
        chroma_size = round(len(wave_data) / chroma_mei.shape[1])
        chroma_audio = librosa.feature.chroma_stft(y=wave_data, sr=sr,
                                                   hop_length=chroma_size)

        # Calculate the warping path and index it by MEI chroma frame.
        path = librosa.dtw(chroma_mei, chroma_audio)[1]
        path_dict = {key: value for (key, value) in path}

        # Extract the mappings.
        id_to_time = {}
        chroma_length = len(wave_data) / sr / chroma_audio.shape[1]
        for identifier in id_to_chroma_index:
            id_to_time[identifier] = path_dict[
                id_to_chroma_index[identifier]] * chroma_length
            id_to_time[identifier] += trim_start / 1000  # Offset for trimmed audio

        return id_to_time
def dtw(n, m):
    """Return only the warping path of ``librosa.dtw(n, m)``."""
    _cost_matrix, warping_path = librosa.dtw(n, m)
    return warping_path
def TranslateNumber(MotherLan, TransLan, Record_File_Path):
    """Recognise a spoken digit and play it back in the target language.

    A reply is recorded via ``DetectSound``, pre-emphasised, and its 30
    MFCCs are compared by DTW against the digit clips 0-9 of the speaker's
    language.  The digit with the smallest final accumulated cost is played
    in the target language (the lowest digit wins a tie).

    Args:
        MotherLan: speaker's language code (0 = ch, 1 = en, 2 = jp).
        TransLan: target language code (0 = ch, 1 = en, 2 = jp).
        Record_File_Path: path of the recording to analyse.

    Returns:
        A 10-tuple with the final DTW cost for the digits 0 to 9.

    Raises:
        ValueError: if either language code is not 0, 1 or 2.  (The original
            failed later with an unbound-variable error.)
    """
    language_suffixes = {0: 'ch', 1: 'en', 2: 'jp'}
    # Validate both codes up front, before anything is recorded or loaded.
    for code in (MotherLan, TransLan):
        if code not in language_suffixes:
            raise ValueError('unsupported language code: %r' % (code,))
    MotherLanAppend = language_suffixes[MotherLan]
    TransLanAppend = language_suffixes[TransLan]

    DetectSound(Record_File_Path)
    Response_File_Path = './response voice data/'

    Language_test, fs = lib.load(Record_File_Path)
    references = [
        lib.load(Response_File_Path + str(digit) + '_response_'
                 + MotherLanAppend + '.wav')
        for digit in range(10)
    ]

    # The test features do not depend on the digit: compute them once.
    test = pre_emphasis(signal=Language_test)
    test_mfcc = lib.feature.mfcc(y=test, sr=fs, n_mfcc=30)

    costs = []
    for samples, rate in references:
        D_compare, _ = lib.dtw(test_mfcc,
                               lib.feature.mfcc(y=samples, sr=rate, n_mfcc=30))
        costs.append(D_compare[-1, -1])

    Shortest_D = min(costs)
    for digit, cost in enumerate(costs):
        if Shortest_D == cost:
            playsound(Response_File_Path + str(digit) + '_response_'
                      + TransLanAppend + '.wav')
            break

    return tuple(costs)
def test_dtw_incompatible_sigma_add():
    """Call ``librosa.dtw`` with a ten-element ``weights_add`` vector.

    Both inputs are single-row 2-D arrays (five and six columns).  The
    function makes no assertion itself; any expectation about the outcome
    (presumably an error for the mismatched weights -- confirm against the
    test harness) has to be enforced by the caller or a decorator.
    """
    seq_x = np.array([[1, 3, 3, 8, 1]])
    seq_y = np.array([[2, 0, 0, 8, 7, 2]])
    additive_weights = np.arange(10)

    librosa.dtw(X=seq_x, Y=seq_y, weights_add=additive_weights)
def TranslateNumber(MotherLan, TransLan, Record_File_Path):
    """Match a recording against ten digit references and play the translation.

    The recording at ``Record_File_Path`` is compared (MFCC features + DTW)
    with the reference clips ``0_response_ch.wav`` .. ``9_response_ch.wav``.
    The digit whose final accumulated DTW cost is smallest is selected and
    the corresponding response clip in the target language is played.

    Parameters
    ----------
    MotherLan : int
        Source-language selector.  Only ``0`` (the ``*_ch.wav`` references)
        is implemented; any other value prints a message and returns ``None``.
    TransLan : int
        Target-language selector: ``1`` plays the ``*_en.wav`` clip, ``2``
        plays the ``*_jp.wav`` clip.  Any other value plays nothing.
    Record_File_Path : str
        Path of the recording to classify.  It is first passed to
        ``DetectSound`` and then loaded with ``lib.load``.

    Returns
    -------
    tuple or None
        The ten final accumulated costs ``D[-1, -1]`` for digits 0..9, in
        order, or ``None`` when ``MotherLan`` is not supported.
    """
    DetectSound(Record_File_Path)
    Response_File_Path = './response voice data/'

    if MotherLan != 0:
        # Other mother languages are not implemented yet.
        print('Invalid mother_lan paramater')
        return None

    Language_test, fs = lib.load(Record_File_Path)

    # Load every reference clip up front so a missing file fails before any
    # feature extraction is done.
    references = [
        lib.load(Response_File_Path + str(digit) + '_response_ch.wav')
        for digit in range(10)
    ]

    # Only the recording is pre-emphasised; the references are used as loaded.
    MFCC_test = lib.feature.mfcc(
        y=pre_emphasis(signal=Language_test), sr=fs, n_mfcc=20)

    # lib.dtw returns (accumulated cost matrix, warping path); the bottom-right
    # entry of the matrix is the total alignment cost.
    costs = tuple(
        lib.dtw(MFCC_test,
                lib.feature.mfcc(y=clip, sr=clip_sr, n_mfcc=20))[0][-1, -1]
        for clip, clip_sr in references
    )

    shortest = min(costs)
    # First digit whose cost equals the minimum (ties resolve to the lowest
    # digit); ``None`` if no cost compares equal, e.g. when NaNs are present.
    best_digit = next(
        (digit for digit, cost in enumerate(costs) if cost == shortest), None)

    language_suffix = {1: 'en', 2: 'jp'}.get(TransLan)
    if best_digit is not None and language_suffix is not None:
        playsound(Response_File_Path + str(best_digit) + '_response_'
                  + language_suffix + '.wav')

    return costs
from DSPbox import MFCC
import scipy.io.wavfile as wav
import numpy as np
import librosa

# Features of the recording that every candidate file is compared against.
rate, signal = wav.read('./Observation.wav')
obser = MFCC(signal, rate)

result = []
for number in range(1, 6):
    rate, signal = wav.read('./{:d}.wav'.format(number))
    compare = MFCC(signal, rate)

    n_obser = len(obser)
    n_compare = len(compare)

    # Accumulated-cost table with one extra (zero) border row and column.
    acc = np.zeros((n_obser + 1, n_compare + 1))

    # First column: only vertical steps are possible.
    for row in range(1, n_obser + 1):
        acc[row, 1] = abs(compare[0] - obser[row - 1]) + acc[row - 1, 1]

    # First row: only horizontal steps are possible.
    for col in range(1, n_compare + 1):
        acc[1, col] = abs(compare[col - 1] - obser[0]) + acc[1, col - 1]

    # Interior cells: local cost plus the cheapest of the three predecessors.
    for col in range(2, n_compare + 1):
        for row in range(2, n_obser + 1):
            local_cost = abs(compare[col - 1] - obser[row - 1])
            acc[row, col] = local_cost + min(
                acc[row - 1, col], acc[row, col - 1], acc[row - 1, col - 1])

    total = acc[-1, -1]
    result.append(total)
    print(number, "->", total)
    # Cross-check against librosa's own accumulated cost.
    print(number, "->", librosa.dtw(obser, compare)[0][-1, -1], "(by librosa)")

print("最相似:", np.argmin(result) + 1)
def test_1d_input():
    """Smoke-test ``librosa.dtw`` on two column vectors of shape (5, 1) and (6, 1).

    No result is asserted; the test only requires the call to complete.
    """
    column_x = np.array([1, 3, 3, 8, 1]).reshape(-1, 1)
    column_y = np.array([2, 0, 0, 8, 7, 2]).reshape(-1, 1)

    librosa.dtw(X=column_x, Y=column_y)
def DTW(mfcc1, mfcc2):
    """Return the total DTW alignment cost between two feature matrices.

    Runs ``librosa.dtw`` with the ``'euclidean'`` metric and returns the
    bottom-right entry of the accumulated-cost matrix; the warping path is
    discarded.
    """
    accumulated_cost, _ = librosa.dtw(mfcc1, mfcc2, metric='euclidean')
    return accumulated_cost[-1, -1]
import time

if __name__ == '__main__':
    # Compare Observation.wav against 1.wav .. 5.wav with DTW on MFCC features
    # and report which file(s) have the smallest final accumulated cost.
    #
    # NOTE(review): if `wavfile` is scipy.io.wavfile, the second positional
    # argument of read() is `mmap`, not a file mode; 'r' is truthy and so
    # enables memory-mapped reading.  Kept as-is -- confirm this is intended.
    fs, sig = wavfile.read('Observation.wav', 'r')
    obser = dsp.MFCC(sig, fs)

    file = ['1.wav', '2.wav', '3.wav', '4.wav', '5.wav']
    Distance = []

    tStart = time.time()  # start timing
    for index in file:
        fs, sig = wavfile.read(index, 'r')
        obser_ = dsp.MFCC(sig, fs)
        # Both arguments are MFCC feature arrays; D is the accumulated-cost
        # matrix and wp the warping path (unused here).
        D, wp = lb.dtw(obser, obser_)
        A = D[-1, -1]  # final accumulated distance A(N, M)
        # `index` already ends in '.wav', so no extra suffix is appended.
        print(index, ' A(N,M)=', A, sep='')
        Distance.append(A)

    # Python has no `++` operator (`++count` is just +(+count)), so the file
    # number is derived with enumerate instead of a manual counter.
    shortest = min(Distance)
    for count, value in enumerate(Distance, start=1):
        if value == shortest:
            print('Ans = ', count, '.wav', sep='')

    tEnd = time.time()
    print("DTW by python cost %f sec" % (tEnd - tStart))  # %f rounds to 6 decimals
def test_dtw_incompatible_args_02():
    """Call ``librosa.dtw`` with ``C``, ``X`` and ``Y`` all set to ``None``.

    Nothing is asserted here; any expected failure (presumably a parameter
    error -- confirm against the test harness) must be checked externally.
    """
    nothing = None
    librosa.dtw(C=nothing, X=nothing, Y=nothing)
def test_1d_input():
    """Check that ``librosa.dtw`` accepts single-feature column inputs.

    The two sequences have shapes (5, 1) and (6, 1).  The call's return
    value is ignored; the test passes as long as no exception is raised.
    """
    first_sequence = np.array([[1], [3], [3], [8], [1]])
    second_sequence = np.array([[2], [0], [0], [8], [7], [2]])

    librosa.dtw(
        X=first_sequence,
        Y=second_sequence,
    )