예제 #1
0
def _process_utterance(in_path, out_path, speaker, text):
    """Resample one utterance to the configured rate, trim it, and save it.

    Args:
        in_path: Path of the source wav file, read with ``read_wav_np``.
        out_path: Path the processed audio is written to.
        speaker: Speaker label, passed through unchanged to the result.
        text: Transcript line; trailing newlines are stripped.

    Returns:
        ``(out_path, text_without_trailing_newlines, speaker)``, or ``None``
        when the input file cannot be read.
    """
    try:
        old_samplerate, old_audio = read_wav_np(in_path)
    except Exception:
        # Best-effort: an unreadable input is skipped by returning None.
        # Only Exception is caught so Ctrl-C / SystemExit still propagate.
        return None
    new_samplerate = hparams.sampling_rate

    if old_samplerate != new_samplerate:
        # Resample by interpolating the signal on a new, evenly spaced time
        # grid covering the same duration (interp1d defaults to linear in
        # SciPy — assumes `interpolate` is scipy.interpolate).
        n_old = old_audio.shape[0]
        duration = n_old / old_samplerate
        n_new = int(n_old * new_samplerate / old_samplerate)

        time_old = np.linspace(0, duration, n_old)
        time_new = np.linspace(0, duration, n_new)

        # Transpose so interpolation runs along the sample axis.
        interpolator = interpolate.interp1d(time_old, old_audio.T)
        new_audio = interpolator(time_new).T.astype(np.float32)
    else:
        new_audio = old_audio

    # Trim leading/trailing audio that is more than 25 dB below the peak.
    wav, _ = librosa.effects.trim(new_audio,
                                  top_db=25,
                                  frame_length=2048,
                                  hop_length=512)

    # Write with the rate the audio was actually resampled to; a hard-coded
    # 22050 mislabels the file whenever hparams.sampling_rate differs.
    write(out_path, new_samplerate, wav)

    return (out_path, text.rstrip('\n'), speaker)
예제 #2
0
def _process_utterance(in_path, out_path, spk_name_idx):
    """Convert a raw PCM utterance to a trimmed 22050 Hz wav and read its script.

    Args:
        in_path: Path of the headerless PCM input (given to sox as 44100 Hz,
            mono, signed 16-bit, little-endian raw audio).
        out_path: Path of the wav file to produce.
        spk_name_idx: Index into ``in_path.split('/')`` that holds the
            speaker name.

    Returns:
        ``(out_path, first_script_line_without_trailing_newlines, speaker)``.

    Raises:
        subprocess.CalledProcessError: If sox exits with a non-zero status.
        OSError: If sox cannot be launched or the script file cannot be opened.
    """
    # Function-scope import keeps this block self-contained.
    import subprocess

    target_rate = 22050

    # Pass arguments as a list (no shell) so paths containing spaces or shell
    # metacharacters are handled literally instead of being re-parsed.
    sox_cmd = [
        'sox',
        '-L', '-c', '1', '-e', 'signed', '-b', '16', '-t', 'raw',
        '-r', '44100', in_path,
        '-c', '1', '-e', 'signed', '-b', '16', '-t', 'wav',
        '-r', str(target_rate), out_path,
    ]
    # check=True surfaces a failed conversion here rather than letting a
    # missing or stale out_path be picked up by the read below.
    subprocess.run(sox_cmd, check=True)

    # Per the original note: sox stores int16 and read_wav_np converts it to
    # float32 — TODO confirm against read_wav_np.
    _sampling_rate, audio = read_wav_np(out_path)
    wav, _ = librosa.effects.trim(audio, top_db=25, frame_length=2048, hop_length=512)

    # The script path is derived from the audio path by swapping every 'wav'
    # substring for 'script' and the '.pcm' suffix for '.pron'.
    txt_file = in_path.replace('wav', 'script').replace('.pcm', '.pron')
    with open(txt_file, 'r', encoding='utf-8-sig') as f:
        line = f.readline()

    speaker = in_path.split('/')[spk_name_idx]

    # Overwrite the sox output with the trimmed audio.
    write(out_path, target_rate, wav)
    return (out_path, line.rstrip('\n'), speaker)
예제 #3
0
def _process_utterance(in_path):
    try:
        sr, wav = read_wav_np(in_path)
    except:
        return in_path + '\n'
    return ''