def get_aligned_jointdata(orgdata, orgnpow, tardata, tarnpow, cvdata=None):
    """Get alignment between source and target features.

    Parameters
    ----------
    orgdata : array, shape (`T_org`, `dim`)
        Acoustic feature of source speaker
    orgnpow : array, shape (`T_org`)
        Normalized power of source speaker
    tardata : array, shape (`T_tar`, `dim`)
        Acoustic feature of target speaker
    tarnpow : array, shape (`T_tar`)
        Normalized power of target speaker
    cvdata : array, optional, shape (`T_org`, `dim`)
        Converted acoustic feature from source into target.
        When given, the time warping function is estimated between the
        converted features and the target instead of source and target.

    Returns
    -------
    jdata : array with `T_new` rows
        Joint feature matrix: extracted source features and extracted
        target features, both warped by `twf`, concatenated column-wise
    twf : time warping function
        Indexed as `twf[0]` (source frame indices) and `twf[1]`
        (target frame indices)
    mcd : float
        Mel-cepstrum distortion between the aligned features used for the
        alignment (source or converted) and the target

    Raises
    ------
    ValueError
        If `cvdata` is given and its shape differs from `orgdata`.

    """
    # Validate before doing any feature extraction so a bad call fails fast
    # and with a clean, single-line message.
    if cvdata is not None and orgdata.shape != cvdata.shape:
        raise ValueError(
            'Dimension mismatch between orgdata and cvdata: {} {}'.format(
                orgdata.shape, cvdata.shape))

    # extract static + delta features of the frames kept by extfrm
    org_extsddata = extfrm(static_delta(orgdata), orgnpow)
    tar_extsddata = extfrm(static_delta(tardata), tarnpow)

    # features used on the source side of the alignment: the converted
    # features when available, otherwise the source features themselves
    if cvdata is None:
        align_extsddata = org_extsddata
    else:
        align_extsddata = extfrm(static_delta(cvdata), orgnpow)

    # calculate twf and mel-cd
    twf = estimate_twf(align_extsddata, tar_extsddata, distance='melcd')
    mcd = melcd(align_extsddata[twf[0]], tar_extsddata[twf[1]])

    # the joint matrix always pairs the *source* features with the target,
    # even when the warping was estimated from the converted features
    jdata = np.c_[org_extsddata[twf[0]], tar_extsddata[twf[1]]]

    return jdata, twf, mcd
def test_melcd(self):
    """Check that the frame-averaged mel-cd equals the sequence-level mel-cd."""
    org = get_random_peseudo_mcep()
    tar = get_random_peseudo_mcep()

    # Run both DTW variants so each is exercised; only the result of the
    # fast=False call is used for the alignment below.
    estimate_twf(org, tar, fast=True)
    twf = estimate_twf(org, tar, fast=False)

    # align org and tar
    orgmod = org[twf[0]]
    tarmod = tar[twf[1]]
    assert orgmod.shape == tarmod.shape

    # mel-cd averaged frame by frame vs. mel-cd computed on the whole sequence
    flen = len(twf[0])
    mcd1 = sum(melcd(o, t) for o, t in zip(orgmod, tarmod)) / flen
    mcd2 = melcd(orgmod, tarmod)

    # abs() is required: without it any negative difference, however large,
    # would satisfy the tolerance check.
    assert abs(mcd1 - mcd2) < np.exp(-10)
def distance_func(x, y):
    """Return the result of `melcd` for the pair (`x`, `y`)."""
    distortion = melcd(x, y)
    return distortion
# Thin wrapper that forwards both arguments to melcd and returns its result.
def distance_func(x, y): return melcd(x, y)
# Estimate the time warping function between org and tar with fast=True.
# NOTE(review): org and tar are defined outside this excerpt.
twf = estimate_twf(org, tar, fast=True)
def _extract_world_features(data_dir, filenames, sampling_rate, frame_period,
                            num_features):
    """Return per-file F0 contours and coded spectral envelopes."""
    f0_list = []
    mcep_list = []
    for filename in filenames:
        filepath = os.path.join(data_dir, filename)
        wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
        wav = wav_padding(wav=wav, sr=sampling_rate,
                          frame_period=frame_period, multiple=4)
        f0, _, sp, _ = world_decompose(wav=wav, fs=sampling_rate,
                                       frame_period=frame_period)
        f0_list.append(f0)
        mcep_list.append(world_encode_spectral_envelop(
            sp=sp, fs=sampling_rate, dim=num_features))
    return f0_list, mcep_list


def _dtw_align(src, tgt):
    """Warp two feature sequences onto a common time axis with estimate_twf."""
    twf = estimate_twf(src, tgt, fast=True)
    src_aligned = src[twf[0]]
    tgt_aligned = tgt[twf[1]]
    # internal sanity check: both warped sequences must have the same shape
    assert src_aligned.shape == tgt_aligned.shape
    return src_aligned, tgt_aligned


def _average(total, count):
    """Return total / count, or 0.0 when nothing was accumulated."""
    return total / count if count else 0.0


def evaluation(source_data_dir, target_data_dir, sampling_rate=16000,
               frame_period=5.0, num_features=24):
    """Print PCC, MCD and RMSE between paired source and target recordings.

    Files of both directories are listed, sorted by name and paired in that
    order; if the directories hold different numbers of files, only as many
    pairs as the shorter listing are evaluated. For every pair the value of
    each metric is printed, followed by the average over all evaluated pairs.

    Parameters
    ----------
    source_data_dir : str
        Directory containing the source audio files
    target_data_dir : str
        Directory containing the target audio files
    sampling_rate : int, optional
        Sampling rate used for loading and for the WORLD helpers
        Default set to 16000
    frame_period : float, optional
        Frame period passed to `wav_padding` and `world_decompose`
        Default set to 5.0
    num_features : int, optional
        Dimension passed to `world_encode_spectral_envelop`
        Default set to 24

    Returns
    -------
    None
        All results are written to standard output.

    """
    # List both directories up front so a missing directory fails before any
    # audio is processed.
    source_files = sorted(os.listdir(source_data_dir))
    target_files = sorted(os.listdir(target_data_dir))

    f0_source, mceps_source = _extract_world_features(
        source_data_dir, source_files, sampling_rate, frame_period,
        num_features)
    f0_target, mceps_target = _extract_world_features(
        target_data_dir, target_files, sampling_rate, frame_period,
        num_features)

    # Calculate PCC on the continuous F0 contours
    print("Calculating PCC")
    pcc_sum = 0
    num_pairs = 0
    for f0_s, f0_t in zip(f0_source, f0_target):
        _, cont_f0_s = convert_continuos_f0(f0_s)
        _, cont_f0_t = convert_continuos_f0(f0_t)
        # add a trailing feature axis before handing the contours to DTW
        orgmod, tarmod = _dtw_align(cont_f0_s[..., None],
                                    cont_f0_t[..., None])
        pcc, _ = pearsonr(orgmod, tarmod)
        pcc_sum += pcc
        num_pairs += 1
        print(pcc)
    # Average over the pairs actually evaluated rather than a fixed count.
    print("Average PCC is", _average(pcc_sum, num_pairs))

    # Calculate MCD on the coded spectral envelopes
    print("Calculating MCD")
    mcd_sum = 0
    num_pairs = 0
    for mceps_s, mceps_t in zip(mceps_source, mceps_target):
        # the first coefficient is dropped before alignment
        # (presumably the energy term -- confirm against the encoder)
        orgmod, tarmod = _dtw_align(mceps_s[:, 1:], mceps_t[:, 1:])
        mcd = melcd(orgmod, tarmod)
        mcd_sum += mcd
        num_pairs += 1
        print(mcd)
    print("Average MCD is", _average(mcd_sum, num_pairs))

    # Calculate RMSE on the raw F0 contours
    print("Calculating RMSE")
    rmse_sum = 0
    num_pairs = 0
    for f0_s, f0_t in zip(f0_source, f0_target):
        orgmod, tarmod = _dtw_align(f0_s[..., None], f0_t[..., None])
        diff = orgmod - tarmod
        rmse = np.sqrt(np.mean(diff ** 2))
        rmse_sum += rmse
        num_pairs += 1
        print(rmse)
    print("Average RMSE is", _average(rmse_sum, num_pairs))
# Thin wrapper that forwards both arguments to melcd and returns its result.
def distance_func(x, y): return melcd(x, y)
# NOTE(review): this else belongs to a conditional that starts outside this excerpt.
else:
def get_alignment(odata, onpow, tdata, tnpow, opow=-20, tpow=-20, sd=0, cvdata=None, given_twf=None, otflag=None, distance='melcd'):
    """Align original and target features and build their joint data.

    Parameters
    ----------
    odata : array, shape (`T`, `dim`)
        Acoustic feature vector of original
    onpow : array, shape (`T`)
        Normalized power vector of original
    tdata : array, shape (`T`, `dim`)
        Acoustic feature vector of target
    tnpow : array, shape (`T`)
        Normalized power vector of target
    opow : float, optional
        Power threshold of original
        Default set to -20
    tpow : float, optional
        Power threshold of target
        Default set to -20
    sd : int, optional
        Start dimension to be used for alignment; applied to `odata` and
        `tdata` only (`cvdata` is used as passed)
        Default set to 0
    cvdata : array, optional
        Converted original data; when given, it replaces the original on
        the source side of the warping estimation and of the distortion
        Default set to None
    given_twf : optional
        Precomputed time warping function; when given, no estimation is run
        Default set to None
    otflag : str, optional
        Alignment into the length of specification
            'org' : alignment into original length
            'tar' : alignment into target length
        Default set to None
    distance : str
        Distance function to be used
        Default set to 'melcd'

    Returns
    -------
    jdata : array
        Joint static and delta feature data produced by `align_data`
    twf : time warping function
        Indexed as `twf[0]` (original side) and `twf[1]` (target side)
    mcd : float
        Mel-cepstrum distortion between the aligned arrays

    """
    # static + delta features of original and target from dimension `sd` on
    src_feats = extsddata(odata[:, sd:], onpow, power_threshold=opow)
    tgt_feats = extsddata(tdata[:, sd:], tnpow, power_threshold=tpow)

    # source side of the alignment: converted features take precedence
    if cvdata is not None:
        align_feats = extsddata(cvdata, onpow, power_threshold=opow)
    else:
        align_feats = src_feats

    # reuse the supplied warping function, otherwise estimate one
    twf = given_twf
    if twf is None:
        twf = estimate_twf(align_feats, tgt_feats,
                           distance=distance, otflag=otflag)

    # joint data always pairs the original features with the target
    jdata = align_data(src_feats, tgt_feats, twf)
    mcd = melcd(align_feats[twf[0]], tgt_feats[twf[1]])

    return jdata, twf, mcd