Example #1
def get_preprocessed_wav(wav_path, tg_path):
    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    _, duration, start, end = get_alignment(
        textgrid.get_tier_by_name('phones'))

    # Read and trim wav files
    sr, wav = read(wav_path)
    wav = wav[int(hparams.sampling_rate * start):int(hparams.sampling_rate *
                                                     end)].astype(np.float32)
    return wav, sr, duration
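
Every example on this page leans on a get_alignment helper that walks the 'phones' tier of a forced-alignment TextGrid; its implementation is not shown anywhere above. What follows is a minimal sketch, assuming MFA-style silence labels ('sil', 'sp', 'spn'; some variants map these to '$'), durations counted in mel frames, and leading/trailing silences trimmed.

import numpy as np

def get_alignment(tier, sampling_rate=22050, hop_length=256):
    # Sketch only, not the exact upstream code. Returns
    # (phones, durations, start, end): per-phone frame counts plus the
    # start/end of the non-silent region in seconds.
    sil_phones = ['sil', 'sp', 'spn']   # assumed silence labels
    phones, durations = [], []
    start_time = end_time = 0
    end_idx = 0
    for interval in tier._objects:      # tgt keeps a tier's intervals in _objects
        s, e, p = interval.start_time, interval.end_time, interval.text
        if not phones:                  # skip leading silences
            if p in sil_phones:
                continue
            start_time = s
        phones.append(p)
        if p not in sil_phones:         # remember the last non-silent phone
            end_time = e
            end_idx = len(phones)
        durations.append(int(np.round(e * sampling_rate / hop_length)
                             - np.round(s * sampling_rate / hop_length)))
    # drop trailing silences
    return phones[:end_idx], durations[:end_idx], start_time, end_time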
Example #2
def process_utterance(in_dir, out_dir, basename):
    wav_path = os.path.join(in_dir, 'wavn', '{}.wav'.format(basename))
    tg_path = os.path.join(out_dir, 'TextGrid', '{}.TextGrid'.format(basename)) 
    
    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(textgrid.get_tier_by_name('phones'))
    text = '{' + ' '.join(phone) + '}'
    text = text.replace(' $ ', '} {')  # $ represents silent phones
    if start >= end:
        return None
    
    # Read and trim wav files
    _, wav = read(wav_path)
    wav = wav[int(hp.sampling_rate*start):int(hp.sampling_rate*end)].astype(np.float32)
    
    # Compute fundamental frequency
    f0, _ = pw.dio(wav.astype(np.float64), hp.sampling_rate, frame_period=hp.hop_length/hp.sampling_rate*1000)
    f0 = f0[:sum(duration)]

    # Compute mel-scale spectrogram
    mel_spectrogram = Audio.tools.get_mel_from_wav(torch.FloatTensor(wav)).numpy().astype(np.float32)
    mel_spectrogram = mel_spectrogram[:, :sum(duration)]
    if mel_spectrogram.shape[1] >= hp.max_seq_len:
        return None

    # Compute energy
    energy = np.linalg.norm(mel_spectrogram, axis=0)

    # Save alignment
    ali_filename = '{}-ali-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'alignment', ali_filename), duration, allow_pickle=False)

    # Save fundamental frequency
    f0_filename = '{}-f0-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'f0', f0_filename), f0, allow_pickle=False)

    # Save energy
    energy_filename = '{}-energy-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'energy', energy_filename), energy, allow_pickle=False)

    # Save spectrogram
    mel_filename = '{}-mel-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'mel', mel_filename), mel_spectrogram.T, allow_pickle=False)
    
    return '|'.join([basename, text]), max(f0), min([f for f in f0 if f != 0]), \
           max(energy), min(energy), mel_spectrogram.shape[1]
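
Audio.tools.get_mel_from_wav is likewise external to these snippets (here it returns only the mel spectrogram; most later examples unpack an energy track as well). As a rough stand-in, here is a hedged librosa-based sketch of the shapes the callers expect; the repositories almost certainly use a Tacotron-style STFT instead, so treat this as illustrative only.

import librosa
import numpy as np

def get_mel_and_energy(wav, sr=22050, n_fft=1024, hop_length=256, n_mels=80):
    # Stand-in mel/energy extractor: mel has shape (n_mels, T) and
    # energy has shape (T,), matching what the callers slice.
    magnitude = np.abs(librosa.stft(wav, n_fft=n_fft, hop_length=hop_length))
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    mel = np.log(np.clip(mel_basis @ magnitude, 1e-5, None))  # log-mel
    energy = np.linalg.norm(magnitude, axis=0)  # L2 norm per STFT frame
    return mel, energy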
Example #3
def process_utterance(in_dir, out_dir, dirname, basename):
    wav_path = os.path.join(in_dir, dirname, '{}.wav'.format(basename))
    tg_path = os.path.join(out_dir, 'TextGrid', dirname,
                           '{}.TextGrid'.format(basename))

    if not os.path.exists(tg_path):
        return None

    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(
        textgrid.get_tier_by_name('phones'))
    text = '{' + '}{'.join(
        phone) + '}'  # '{A}{B}{$}{C}', $ represents silent phones
    text = text.replace('{$}', ' ')  # '{A}{B} {C}'
    text = text.replace('}{', ' ')  # '{A B} {C}'

    if start >= end:
        return None

    # Read and trim wav files
    wav, _ = librosa.load(wav_path, sr=hp.sampling_rate)
    wav = wav[int(hp.sampling_rate * start):int(hp.sampling_rate *
                                                end)].astype(np.float32)

    # Compute fundamental frequency
    f0, _ = pw.dio(wav.astype(np.float64),
                   hp.sampling_rate,
                   frame_period=hp.hop_length / hp.sampling_rate * 1000)
    f0 = f0[:sum(duration)]

    # Compute mel-scale spectrogram and energy
    mel_spectrogram, energy = Audio.tools.get_mel_from_wav(
        torch.FloatTensor(wav))
    mel_spectrogram = mel_spectrogram.cpu().numpy().astype(
        np.float32)[:, :sum(duration)]
    energy = energy.cpu().numpy().astype(np.float32)[:sum(duration)]
    if mel_spectrogram.shape[1] >= hp.max_seq_len:
        return None

    # If the shapes disagree, check the get_alignment function
    try:
        assert f0.shape[0] == energy.shape[0] == mel_spectrogram.shape[1]
    except AssertionError:
        print("duration problem: {}".format(wav_path))
        return None

    # Save alignment
    ali_filename = '{}-ali-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'alignment', ali_filename),
            duration,
            allow_pickle=False)

    # Save fundamental frequency
    f0_filename = '{}-f0-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'f0', f0_filename), f0, allow_pickle=False)

    # Save energy
    energy_filename = '{}-energy-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'energy', energy_filename),
            energy,
            allow_pickle=False)

    # Save spectrogram
    mel_filename = '{}-mel-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'mel', mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    try:
        return '|'.join([basename, text]), max(f0), min([
            f for f in f0 if f != 0
        ]), max(energy), min(energy), mel_spectrogram.shape[1]
    except ValueError:
        # min() raises when every f0 frame is unvoiced (all zeros)
        return None
Example #4
def process_utterance(in_dir, out_dir, basename):
    wav_path = os.path.join(in_dir, 'wavs', '{}.wav'.format(basename))
    tg_path = os.path.join(out_dir, 'TextGrid', '{}.TextGrid'.format(basename))

    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    # phone: list<112, phone string>, leading/trailing silences already removed
    # duration: list<112, frames per phone>, how many frames each phone lasts
    # start, end: float, the span of the audio left after trimming leading/trailing silence
    phone, duration, start, end = get_alignment(
        textgrid.get_tier_by_name('phones'))
    if start >= end:
        return None

    phone, duration = add_pad_between_word(phone, duration, textgrid)
    sum_duration = sum(duration)
    text = '{' + '}{'.join(
        phone) + '}'  # '{A}{B}{$}{C}', $ represents silent phones
    text = text.replace('{$}', ' ')  # '{A}{B} {C}'
    text = text.replace('}{', ' ')  # '{A B} {C}'

    # Read and trim wav files
    # wav ndarray<212893>
    _, wav = read(wav_path)
    wav = wav[int(hp.sampling_rate * start):int(hp.sampling_rate *
                                                end)].astype(np.float32)

    # Compute mel-scale spectrogram and energy
    # mel_spectrogram: ndarray<80, 831>, mel spectrogram; the 0-8000 Hz range
    # is split into 80 bands (exact binning unclear)
    # energy: ndarray<831>, loudness; here it ranges from 0 to 315
    mel_spectrogram, energy = Audio.tools.get_mel_from_wav(
        torch.FloatTensor(wav))
    mel_spectrogram = mel_spectrogram.numpy().astype(
        np.float32)[:, :sum_duration]
    if mel_spectrogram.shape[1] >= hp.max_seq_len:
        return None

    # energy = energy.numpy().astype(np.float32)[:sum_duration]
    #
    # # Compute fundamental frequency
    # # f0 ndarray<832>
    # f0, _ = pw.dio(wav.astype(np.float64), hp.sampling_rate, frame_period=hp.hop_length / hp.sampling_rate * 1000)
    # # f0 ndarray<831>, fundamental frequency (vocal-fold vibration rate);
    # # typically around 140 Hz for humans, here in the 70-800 Hz range
    # f0 = f0[:sum_duration]

    # Save alignment
    ali_filename = '{}-ali-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'alignment', ali_filename),
            duration,
            allow_pickle=False)

    # # Save fundamental frequency
    # f0_filename = '{}-f0-{}.npy'.format(hp.dataset, basename)
    # np.save(os.path.join(out_dir, 'f0', f0_filename), f0, allow_pickle=False)
    #
    # # Save energy
    # energy_filename = '{}-energy-{}.npy'.format(hp.dataset, basename)
    # np.save(os.path.join(out_dir, 'energy', energy_filename), energy, allow_pickle=False)
    #
    # # Save spectrogram
    # mel_filename = '{}-mel-{}.npy'.format(hp.dataset, basename)
    # np.save(os.path.join(out_dir, 'mel', mel_filename), mel_spectrogram.T, allow_pickle=False)

    # return '|'.join([basename, text]), max(f0), min([f for f in f0 if f != 0]), max(energy), min(energy), \
    #        mel_spectrogram.shape[1]

    return '|'.join([basename, text])
Example #5
def process_utterance(in_dir, out_dir, basename, scalers):
    wav_bak_basename = basename.replace('.wav', '')
    basename = wav_bak_basename[2:]
    wav_bak_path = os.path.join(in_dir, "wavs_bak", "{}.wav".format(wav_bak_basename))
    wav_path = os.path.join(in_dir, 'wavs', '{}.wav'.format(basename))

    # Convert kss data into PCM encoded wavs
    if not os.path.isfile(wav_path):
        os.system("ffmpeg -i {} -ac 1 -ar 22050 {}".format(wav_bak_path, wav_path))    
    tg_path = os.path.join(out_dir, 'TextGrid', '{}.TextGrid'.format(basename)) 
    
    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(textgrid.get_tier_by_name('phones'))
    text = '{' + '}{'.join(phone) + '}'  # '{A}{B}{$}{C}', $ represents silent phones
    text = text.replace('{$}', ' ')      # '{A}{B} {C}'
    text = text.replace('}{', ' ')       # '{A B} {C}'

    if start >= end:
        return None

    # Read and trim wav files
    _, wav = read(wav_path)
    wav = wav[int(hp.sampling_rate*start):int(hp.sampling_rate*end)].astype(np.float32)

    # Compute fundamental frequency
    f0, _ = pw.dio(wav.astype(np.float64), hp.sampling_rate, frame_period=hp.hop_length/hp.sampling_rate*1000)
    f0 = f0[:sum(duration)]

    # Compute mel-scale spectrogram and energy
    mel_spectrogram, energy = Audio.tools.get_mel_from_wav(torch.FloatTensor(wav))
    mel_spectrogram = mel_spectrogram.numpy().astype(np.float32)[:, :sum(duration)]
    energy = energy.numpy().astype(np.float32)[:sum(duration)]

    f0, energy = remove_outlier(f0), remove_outlier(energy)
    f0, energy = average_by_duration(f0, duration), average_by_duration(energy, duration)

    if mel_spectrogram.shape[1] >= hp.max_seq_len:
        return None

    # Save alignment
    ali_filename = '{}-ali-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'alignment', ali_filename), duration, allow_pickle=False)

    # Save fundamental frequency
    f0_filename = '{}-f0-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'f0', f0_filename), f0, allow_pickle=False)

    # Save energy
    energy_filename = '{}-energy-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'energy', energy_filename), energy, allow_pickle=False)

    # Save spectrogram
    mel_filename = '{}-mel-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'mel', mel_filename), mel_spectrogram.T, allow_pickle=False)
   
    mel_scaler, f0_scaler, energy_scaler = scalers

    mel_scaler.partial_fit(mel_spectrogram.T)
    f0_scaler.partial_fit(f0[f0!=0].reshape(-1, 1))
    energy_scaler.partial_fit(energy[energy != 0].reshape(-1, 1))

    return '|'.join([basename, text]), mel_spectrogram.shape[1]
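
remove_outlier and average_by_duration, which this example uses to turn the frame-level f0/energy tracks into per-phone values, are not shown. A plausible sketch, assuming an IQR-based filter and per-phone averaging over the duration boundaries:

import numpy as np

def remove_outlier(values):
    # Zero out values outside the 1.5*IQR whiskers (computed over
    # nonzero frames, since 0 marks unvoiced f0); sketch only.
    values = np.asarray(values, dtype=np.float64)
    nonzero = values[values != 0]
    if nonzero.size == 0:
        return values
    p25, p75 = np.percentile(nonzero, [25, 75])
    lower, upper = p25 - 1.5 * (p75 - p25), p75 + 1.5 * (p75 - p25)
    values[(values < lower) | (values > upper)] = 0.0
    return values

def average_by_duration(values, durations):
    # Collapse frame-level values to one mean per phone: durations[i]
    # frames belong to phone i, so the output has len(durations) entries.
    offsets = np.cumsum([0] + list(durations))
    out = np.zeros(len(durations), dtype=np.float32)
    for i in range(len(durations)):
        seg = values[offsets[i]:offsets[i + 1]]
        seg = seg[seg != 0]  # ignore unvoiced / zeroed frames
        out[i] = seg.mean() if seg.size > 0 else 0.0
    return out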
Example #6
def process_utterance(in_dir, out_dir, basename):
    wav_path = os.path.join(in_dir, 'wavs', '{}.wav'.format(basename))
    tg_path = os.path.join(out_dir, 'TextGrid', '{}.TextGrid'.format(basename))

    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(
        textgrid.get_tier_by_name('phones'))
    '''
    print("basename:",basename)
    print("phone:",phone)
    print("duration:",duration)
    print("start:",start)
    print("end",end)
    '''
    text = '{' + '}{'.join(
        phone) + '}'  # '{A}{B}{$}{C}', $ represents silent phones
    text = text.replace('{$}', ' ')  # '{A}{B} {C}'
    text = text.replace('}{', ' ')  # '{A B} {C}'

    if start >= end:
        return None

    # Read and trim wav files
    _, wav = read(wav_path)
    #print("len of wav(before):", len(wav))
    wav = wav[int(hp.sampling_rate * start):int(hp.sampling_rate *
                                                end)].astype(np.float32)

    # print(np.size(wav, 0))  # added: skip wav files that are too short
    if np.size(wav, 0) < 1024:
        return None
    '''
    print("sum of duration:", sum(duration))
    print("len of wav(after)", len(wav))
    '''
    # Compute fundamental frequency
    f0, _ = pw.dio(wav.astype(np.float64),
                   hp.sampling_rate,
                   frame_period=hp.hop_length / hp.sampling_rate *
                   1000)  #change from dio to harvest
    f0 = f0[:sum(duration)]
    if max(f0) == 0:  # added: skip wav files whose f0 is all zeros
        return None

    # Compute mel-scale spectrogram and energy
    mel_spectrogram, energy = Audio.tools.get_mel_from_wav(
        torch.FloatTensor(wav))
    mel_spectrogram = mel_spectrogram.numpy().astype(
        np.float32)[:, :sum(duration)]
    energy = energy.numpy().astype(np.float32)[:sum(duration)]
    if mel_spectrogram.shape[1] >= hp.max_seq_len:
        return None
    '''
    #added by eric
    print("wav:\n",wav)
    print("f0:\n",f0)
    print("mel_spectrogram:\n",mel_spectrogram)
    print("energy:",energy)
    '''
    # Save alignment
    ali_filename = '{}-ali-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'alignment', ali_filename),
            duration,
            allow_pickle=False)

    # Save fundamental frequency
    f0_filename = '{}-f0-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'f0', f0_filename), f0, allow_pickle=False)

    # Save energy
    energy_filename = '{}-energy-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'energy', energy_filename),
            energy,
            allow_pickle=False)

    # Save spectrogram
    mel_filename = '{}-mel-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'mel', mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    return '|'.join([basename, text]), max(f0), min(
        [f for f in f0 if f > 0]), max(energy), min(
            energy), mel_spectrogram.shape[1]  # changed: skip f0 == 0 (unvoiced) frames
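
The comment on the pw.dio call above mentions switching from dio to harvest. For reference, a sketch of that swap with pyworld; the stonemask refinement step is a common companion to harvest, not something this example does:

import numpy as np
import pyworld as pw

def extract_f0(wav, sr=22050, hop_length=256):
    # harvest is slower than dio but usually more robust on noisy or
    # breathy speech; stonemask then refines the raw f0 track.
    frame_period = hop_length / sr * 1000  # ms between analysis frames
    x = wav.astype(np.float64)
    f0, t = pw.harvest(x, sr, frame_period=frame_period)
    return pw.stonemask(x, f0, t, sr)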
Example #7
def process_utterance(in_dir, out_dir, dirname, basename):
    wav_path = os.path.join(in_dir, dirname, '{}.wav'.format(basename))
    tg_path = os.path.join(out_dir, 'TextGrid', dirname,
                           '{}.TextGrid'.format(basename))

    if not os.path.exists(tg_path):
        return None

    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = utils.get_alignment(
        textgrid.get_tier_by_name('phones'))
    text = '{' + '}{'.join(
        phone) + '}'  # '{A}{B}{$}{C}', $ represents silent phones
    text = text.replace('{$}', ' ')  # '{A}{B} {C}'
    text = text.replace('}{', ' ')  # '{A B} {C}'

    if start >= end:
        return None

    # Read and trim wav files
    sr, wav = read(wav_path)
    wav = wav[int(hp.sampling_rate * start):int(hp.sampling_rate *
                                                end)].astype(np.float32)

    # Compute fundamental frequency
    f0, _ = pw.dio(wav.astype(np.float64),
                   hp.sampling_rate,
                   frame_period=hp.hop_length / hp.sampling_rate * 1000)
    f0 = f0[:sum(duration)]

    # Compute mel-scale spectrogram and energy
    mel_spectrogram, energy, _ = Audio.tools.get_mel_from_wav(
        torch.FloatTensor(wav))
    mel_spectrogram = mel_spectrogram.numpy().astype(
        np.float32)[:, :sum(duration)]
    energy = energy.numpy().astype(np.float32)[:sum(duration)]
    if mel_spectrogram.shape[1] >= hp.max_seq_len:
        return None

    # Save alignment
    ali_filename = '{}-ali-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'alignment', ali_filename),
            duration,
            allow_pickle=False)

    # Save fundamental frequency
    f0_filename = '{}-f0-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'f0', f0_filename), f0, allow_pickle=False)

    # Save normalized fundamental frequency
    f0_norm = utils.f0_normalization(f0)
    np.save(os.path.join(out_dir, 'f0_norm', f0_filename),
            f0_norm,
            allow_pickle=False)

    # Save energy
    energy_filename = '{}-energy-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'energy', energy_filename),
            energy,
            allow_pickle=False)

    # Save rescaled energy
    energy_0to1 = utils.energy_rescaling(energy)
    np.save(os.path.join(out_dir, 'energy_0to1', energy_filename),
            energy_0to1,
            allow_pickle=False)

    # Save spectrogram
    mel_filename = '{}-mel-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'mel_clean', mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    return '|'.join([basename, text]), max(f0), min(
        [f for f in f0
         if f != 0]), max(energy), min(energy), mel_spectrogram.shape[1]
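
utils.f0_normalization and utils.energy_rescaling are not shown either. The following is a hypothetical sketch inferred only from the output directory names (f0_norm, energy_0to1): z-normalizing voiced f0 frames and min-max rescaling energy to [0, 1].

import numpy as np

def f0_normalization(f0):
    # Hypothetical: z-normalize voiced frames, leave zeros (unvoiced) at 0.
    f0 = np.asarray(f0, dtype=np.float32)
    voiced = f0 != 0
    if not voiced.any():
        return f0
    mean, std = f0[voiced].mean(), f0[voiced].std()
    out = np.zeros_like(f0)
    out[voiced] = (f0[voiced] - mean) / max(std, 1e-8)
    return out

def energy_rescaling(energy):
    # Hypothetical: min-max rescale to [0, 1], as 'energy_0to1' suggests.
    energy = np.asarray(energy, dtype=np.float32)
    lo, hi = energy.min(), energy.max()
    return (energy - lo) / max(hi - lo, 1e-8)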
Example #8
def process_utterance(in_dir, out_dir, basename):
    wav_path = os.path.join(in_dir, '{}.wav'.format(basename))
    tg_path = os.path.join(out_dir, 'TextGrid', '{}.TextGrid'.format(basename))
    if not os.path.exists(tg_path):
        print('{} is not found'.format(tg_path))
        return None
    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(
        textgrid.get_tier_by_name('phones'))
    # phone = [i for i in phone if len(i) >= 1]
    for i, v in enumerate(phone):
        if len(v) == 0:
            phone[i] = '{sp}'
        elif 'sp' in v:
            phone[i] = '{sp}'
        elif v == 'sil':
            phone[i] = '{sp}'

    while phone and '{sp}' in phone[-1]:  # guard: utterance may be all silence
        phone = phone[:-1]

    text = ''.join(phone)

    # text = '{' + '}{'.join(phone) + '}'  # '{A}{B}{$}{C}', $ represents silent phones
    # text = text.replace('{$}', ' ')  # '{A}{B} {C}'
    # text = text.replace('}{', ' ')  # '{A B} {C}'
    duration = duration[:len(phone)]

    if start >= end:
        return None

    # Read and trim wav files
    _, wav = read(wav_path)
    # print(_)
    wav = wav[int(hp.sampling_rate * start):int(hp.sampling_rate *
                                                end)].astype(np.float32)
    # wav = wav[:, 0]
    # print(wav.shape, len(wav))
    # Compute fundamental frequency
    f0, _ = pw.dio(wav.astype(np.float64),
                   hp.sampling_rate,
                   frame_period=hp.hop_length / hp.sampling_rate * 1000)
    f0 = f0[:sum(duration)]

    # Compute mel-scale spectrogram and energy
    mel_spectrogram, energy = Audio.tools.get_mel_from_wav(
        torch.FloatTensor(wav))
    mel_spectrogram = mel_spectrogram.numpy().astype(
        np.float32)[:, :sum(duration)]
    energy = energy.numpy().astype(np.float32)[:sum(duration)]
    if mel_spectrogram.shape[1] >= hp.max_seq_len:
        return None

    # Save alignment
    ali_filename = '{}-ali-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'alignment', ali_filename),
            duration,
            allow_pickle=False)

    # Save fundamental frequency
    f0_filename = '{}-f0-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'f0', f0_filename), f0, allow_pickle=False)

    # Save energy
    energy_filename = '{}-energy-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'energy', energy_filename),
            energy,
            allow_pickle=False)

    # Save spectrogram
    mel_filename = '{}-mel-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'mel', mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    return '|'.join([basename, text]), max(f0), min([f for f in f0 if f != 0]), max(energy), min(energy), \
           mel_spectrogram.shape[1]
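
For context, a minimal sketch of the driver loop that typically wraps these process_utterance variants (assumed, not taken from any of the repositories above). Signatures vary (Example #5 threads in scalers; Examples #3 and #7 take a dirname), so this assumes the Example #8 form and its six-element return value.

import os

def build_from_path(in_dir, out_dir, basenames):
    # Create the output layout the examples np.save into.
    for sub in ('alignment', 'f0', 'energy', 'mel'):
        os.makedirs(os.path.join(out_dir, sub), exist_ok=True)

    metadata = []
    f0_min, f0_max = float('inf'), 0.0
    energy_min, energy_max = float('inf'), 0.0
    n_frames = 0
    for basename in basenames:
        result = process_utterance(in_dir, out_dir, basename)
        if result is None:  # skipped: missing TextGrid, too long, all-zero f0, ...
            continue
        line, fmax, fmin, emax, emin, frames = result
        metadata.append(line)
        f0_max, f0_min = max(f0_max, fmax), min(f0_min, fmin)
        energy_max, energy_min = max(energy_max, emax), min(energy_min, emin)
        n_frames += frames

    # One 'basename|text' line per kept utterance.
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(metadata))
    return f0_min, f0_max, energy_min, energy_max, n_frames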