示例#1
0
def get_aligned_jointdata(orgdata, orgnpow, tardata, tarnpow, cvdata=None):
    """Get alignment between features

    Parameters
    ----------
    orgdata : array, shape (`T_org`, `dim`)
        Acoustic feature of source speaker
    orgnpow : array, shape (`T_org`)
        Normalized power of source speaker
    tardata : array, shape (`T_tar`, `dim`)
        Acoustic feature of target speaker
    tarnpow : array, shape (`T_tar`)
        Normalized power of target speaker
    cvdata : array, optional, shape (`T_org`, `dim`)
        Converted acoustic feature from source into target.
        When given, the alignment is estimated between the converted
        feature and the target instead of between source and target.

    Returns
    -------
    jdata : array, shape (`T_new`, `dim * 2`)
        Joint feature vector between source and target
    twf : array
        Time warping function; `twf[0]` indexes the frames of the source
        side and `twf[1]` indexes the frames of the target side
    mcd : float,
        Mel-cepstrum distortion between the aligned features that were
        used for the alignment (source, or converted source when `cvdata`
        is given) and the target

    Raises
    ------
    ValueError
        If `cvdata` is given and its shape differs from that of `orgdata`

    """

    # Validate first so that a shape mismatch is reported before any
    # feature extraction is attempted.
    if cvdata is not None and orgdata.shape != cvdata.shape:
        raise ValueError(
            'Dimension mismatch between orgdata and cvdata: '
            '{} {}'.format(orgdata.shape, cvdata.shape))

    # extract extsddata
    org_extsddata = extfrm(static_delta(orgdata), orgnpow)
    tar_extsddata = extfrm(static_delta(tardata), tarnpow)

    # choose the features of the source side used for the alignment;
    # converted data is extracted with the source power (`orgnpow`)
    if cvdata is None:
        align_extsddata = org_extsddata
    else:
        align_extsddata = extfrm(static_delta(cvdata), orgnpow)

    # calculate twf and mel-cd
    twf = estimate_twf(align_extsddata, tar_extsddata, distance='melcd')
    mcd = melcd(align_extsddata[twf[0]], tar_extsddata[twf[1]])

    # concatenate joint feature data into joint feature matrix
    # (always built from the unconverted source features)
    jdata = np.c_[org_extsddata[twf[0]], tar_extsddata[twf[1]]]

    return jdata, twf, mcd
示例#2
0
    def test_melcd(self):
        """Check that frame-wise averaged mel-cd equals matrix-level mel-cd.

        Two pseudo mel-cepstrum sequences are aligned with `estimate_twf`,
        then `melcd` is evaluated once per aligned frame pair and once on
        the whole aligned matrices; both results must agree.
        """
        org = get_random_peseudo_mcep()
        tar = get_random_peseudo_mcep()

        # perform dtw for mel-cd function test; the fast variant is only
        # exercised as a smoke check, the non-fast result is used below
        estimate_twf(org, tar, fast=True)
        twf = estimate_twf(org, tar, fast=False)

        # align org and tar
        orgmod = org[twf[0]]
        tarmod = tar[twf[1]]
        assert orgmod.shape == tarmod.shape

        # test for mel-cd calculation
        flen = len(twf[0])
        mcd1 = sum(melcd(o, t) for o, t in zip(orgmod, tarmod)) / flen
        mcd2 = melcd(orgmod, tarmod)
        # compare the magnitude of the difference; without abs() any
        # negative difference would pass regardless of its size
        assert abs(mcd1 - mcd2) < np.exp(-10)
示例#3
0
 def distance_func(x, y):
     """Distance callback that delegates to `melcd` for the pair (x, y)."""
     distance = melcd(x, y)
     return distance
示例#4
0
 def distance_func(x, y):
     """Distance callback: forward both arguments to `melcd`."""
     return melcd(x, y)
 # estimate the time warping function between `org` and `tar` (fast mode)
 twf = estimate_twf(org, tar, fast=True)
示例#5
0
def _extract_f0_and_mceps(data_dir, sampling_rate, frame_period,
                          num_features):
    """Extract F0 and coded spectral envelope for every file in `data_dir`.

    Files are processed in sorted file-name order. Returns two lists of
    equal length: the F0 contours and the coded spectral envelopes.
    """
    f0_list = []
    mceps_list = []
    for file_name in sorted(os.listdir(data_dir)):
        filepath = os.path.join(data_dir, file_name)
        wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
        wav = wav_padding(wav=wav,
                          sr=sampling_rate,
                          frame_period=frame_period,
                          multiple=4)
        f0, _timeaxis, sp, _ap = world_decompose(wav=wav,
                                                 fs=sampling_rate,
                                                 frame_period=frame_period)
        coded_sp = world_encode_spectral_envelop(sp=sp,
                                                 fs=sampling_rate,
                                                 dim=num_features)
        f0_list.append(f0)
        mceps_list.append(coded_sp)
    return f0_list, mceps_list


def _average(total, count):
    """Return `total / count`, or NaN when nothing was accumulated."""
    return total / count if count else float('nan')


def evaluation(source_data_dir, target_data_dir):
    """Print PCC, MCD and RMSE between paired source and target utterances.

    Every file in `source_data_dir` is paired with the file at the same
    position (after sorting the file names) in `target_data_dir`. If the
    directories hold a different number of files, the surplus files of the
    larger one are ignored. For each pair the two feature sequences are
    time-aligned with `estimate_twf` before the metric is computed.

    Each per-pair value is printed, followed by the average over all
    evaluated pairs. The function returns nothing.

    Parameters
    ----------
    source_data_dir : str
        Directory holding the source audio files
    target_data_dir : str
        Directory holding the target audio files
    """
    sampling_rate = 16000
    frame_period = 5.0
    num_features = 24

    f0_source, mceps_source = _extract_f0_and_mceps(
        source_data_dir, sampling_rate, frame_period, num_features)
    f0_target, mceps_target = _extract_f0_and_mceps(
        target_data_dir, sampling_rate, frame_period, num_features)

    # Calculate PCC on the continuous F0 contours:
    print("Calculating PCC")
    pcc_sum = 0
    pcc_count = 0
    for f0_s, f0_t in zip(f0_source, f0_target):
        _, cont_f0_s = convert_continuos_f0(f0_s)
        _, cont_f0_t = convert_continuos_f0(f0_t)
        # add a trailing feature axis before alignment
        cont_f0_s = cont_f0_s[..., None]
        cont_f0_t = cont_f0_t[..., None]
        # dtw
        twf = estimate_twf(cont_f0_s, cont_f0_t, fast=True)
        orgmod = cont_f0_s[twf[0]]
        tarmod = cont_f0_t[twf[1]]
        assert orgmod.shape == tarmod.shape
        pcc, _ = pearsonr(orgmod, tarmod)
        pcc_sum += pcc
        pcc_count += 1
        print(pcc)
    # average over the pairs actually evaluated, not a fixed count
    print("Average PCC is", _average(pcc_sum, pcc_count))

    # Calculating MCD:
    print("Calculating MCD")
    mcd_sum = 0
    mcd_count = 0
    for mceps_s, mceps_t in zip(mceps_source, mceps_target):
        # drop the 0th coefficient before alignment
        # NOTE(review): presumably the energy term -- confirm
        mceps_s = mceps_s[:, 1:]
        mceps_t = mceps_t[:, 1:]
        # dtw
        twf = estimate_twf(mceps_s, mceps_t, fast=True)
        orgmod = mceps_s[twf[0]]
        tarmod = mceps_t[twf[1]]
        assert orgmod.shape == tarmod.shape
        mcd = melcd(orgmod, tarmod)
        mcd_sum += mcd
        mcd_count += 1
        print(mcd)
    print("Average MCD is", _average(mcd_sum, mcd_count))

    # Calculating RMSE on the raw F0 contours.
    # NOTE(review): an earlier variant (previously left commented out here)
    # used log continuous F0 masked by the voicing flags instead of raw F0.
    print("Calculating RMSE")
    rmse_sum = 0
    rmse_count = 0
    for f0_s, f0_t in zip(f0_source, f0_target):
        f0_s = f0_s[..., None]
        f0_t = f0_t[..., None]
        # dtw
        twf = estimate_twf(f0_s, f0_t, fast=True)
        orgmod = f0_s[twf[0]]
        tarmod = f0_t[twf[1]]
        assert orgmod.shape == tarmod.shape
        diff = orgmod - tarmod
        rmse = np.sqrt(np.mean(diff**2))
        rmse_sum += rmse
        rmse_count += 1
        print(rmse)
    print("Average RMSE is", _average(rmse_sum, rmse_count))
示例#6
0
     def distance_func(x, y):
         """Distance callback: evaluate `melcd` on the pair (x, y)."""
         return melcd(x, y)
 else:
示例#7
0
def get_alignment(odata, onpow, tdata, tnpow, opow=-20, tpow=-20,
                  sd=0, cvdata=None, given_twf=None, otflag=None,
                  distance='melcd'):
    """Align original and target features and build the joint feature.

    Parameters
    ----------
    odata : array, shape (`T`, `dim`)
        Acoustic feature vector of original
    onpow : array, shape (`T`)
        Normalized power vector of original
    tdata : array, shape (`T`, `dim`)
        Acoustic feature vector of target
    tnpow : array, shape (`T`)
        Normalized power vector of target
    opow : float, optional
        Power threshold of original, passed to `extsddata`.
        Default set to -20
    tpow : float, optional
        Power threshold of target, passed to `extsddata`.
        Default set to -20
    sd : int, optional
        Start dimension: columns before `sd` of `odata` and `tdata` are
        dropped before extraction. Not applied to `cvdata`.
        Default set to 0
    cvdata : array, optional
        Converted original data. When given, it replaces the original
        features for estimating the alignment and computing `mcd`.
        Default set to None
    given_twf : array, optional
        Precomputed time warping function; when given it is used as-is
        and no alignment is estimated.
        Default set to None
    otflag : str, optional
        Alignment into the length of specification, forwarded to
        `estimate_twf`
        'org' : alignment into original length
        'tar' : alignment into target length
        Default set to None
    distance : str
        Distance function to be used, forwarded to `estimate_twf`.
        Default set to 'melcd'

    Returns
    -------
    jdata : array
        Joint feature built by `align_data` from the extracted original
        and target features
    twf : array
        Time warping function; `twf[0]` indexes frames of the original
        side and `twf[1]` frames of the target side
    mcd : float
        Mel-cepstrum distortion between the aligned arrays

    """

    org_feats = extsddata(odata[:, sd:], onpow, power_threshold=opow)
    tar_feats = extsddata(tdata[:, sd:], tnpow, power_threshold=tpow)

    # features standing in for the original side during alignment
    if cvdata is not None:
        align_feats = extsddata(cvdata, onpow, power_threshold=opow)
    else:
        align_feats = org_feats

    # reuse the caller's warping function when one is supplied
    twf = given_twf
    if twf is None:
        twf = estimate_twf(align_feats, tar_feats,
                           distance=distance, otflag=otflag)

    # the joint feature is always built from the unconverted original
    jdata = align_data(org_feats, tar_feats, twf)
    mcd = melcd(align_feats[twf[0]], tar_feats[twf[1]])

    return jdata, twf, mcd
# NOTE(review): this repeats the definition of `get_alignment` that appears
# directly above it in this file; being the later one, this definition is
# the one bound to the name after the module is loaded.
def get_alignment(odata, onpow, tdata, tnpow, opow=-20, tpow=-20,
                  sd=0, cvdata=None, given_twf=None, otflag=None,
                  distance='melcd'):
    """Get alignment between original and target

    Parameters
    ----------
    odata : array, shape (`T`, `dim`)
        Acoustic feature vector of original
    onpow : array, shape (`T`)
        Normalized power vector of original
    tdata : array, shape (`T`, `dim`)
        Acoustic feature vector of target
    tnpow : array, shape (`T`)
        Normalized power vector of target
    opow : float, optional
        Power threshold of original (forwarded to `extsddata`)
        Default set to -20
    tpow : float, optional
        Power threshold of target (forwarded to `extsddata`)
        Default set to -20
    sd : int, optional
        Start dimension; `odata` and `tdata` are sliced as `[:, sd:]`
        before extraction, `cvdata` is used unsliced
        Default set to 0
    cvdata : array, optional
        Converted original data; when given it is used in place of the
        original features to estimate the alignment and to compute `mcd`
        Default set to None
    given_twf : array, optional
        Time warping function to use instead of estimating one
        Default set to None
    otflag : str, optional
        Alignment into the length of specification
        (forwarded to `estimate_twf`)
        'org' : alignment into original length
        'tar' : alignment into target length
        Default set to None
    distance : str
        Distance function to be used (forwarded to `estimate_twf`)
        Default set to 'melcd'

    Returns
    -------
    jdata : array
        Joint feature produced by `align_data` from the extracted
        original and target features
    twf : array
        Time warping function; `twf[0]` indexes the original side,
        `twf[1]` the target side
    mcd : float
        Mel-cepstrum distortion between the aligned arrays

    """

    src_ext = extsddata(odata[:, sd:], onpow, power_threshold=opow)
    tgt_ext = extsddata(tdata[:, sd:], tnpow, power_threshold=tpow)

    # alignment reference: the original features, or the converted ones
    # (extracted with the original's power) when `cvdata` is supplied
    ref_ext = (src_ext if cvdata is None
               else extsddata(cvdata, onpow, power_threshold=opow))

    # estimate a warping function only when the caller did not give one
    twf = (given_twf if given_twf is not None
           else estimate_twf(ref_ext, tgt_ext,
                             distance=distance, otflag=otflag))

    jdata = align_data(src_ext, tgt_ext, twf)
    mcd = melcd(ref_ext[twf[0]], tgt_ext[twf[1]])

    return jdata, twf, mcd