Example #1
def _process_utterance(out_dir, index, wav_path, text, phone):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file
      phone: The phoneme sequence corresponding to the text

    Returns:
      A (spectrogram_filename, encoded_filename, n_frames, text, phone) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if hparams.vocoder == "world":
        spectrogram = audio.spectrogram(wav).astype(np.float32)

        f0, sp, ap = pw.wav2world(wav.astype(np.double), hparams.sample_rate)
        ap_coded = pw.code_aperiodicity(ap, hparams.sample_rate)
        sp_coded = pw.code_spectral_envelope(sp, hparams.sample_rate, hparams.coded_env_dim)

        world_spec = np.hstack([f0[:, np.newaxis], sp_coded, ap_coded])
        n_frames = world_spec.shape[0]
        spectrogram_filename = 'synpaflex-spec-%05d.npy' % index
        encoded_filename = 'synpaflex-world-%05d.npy' % index
        np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, encoded_filename), world_spec, allow_pickle=False)

    else:
        # Compute the linear-scale spectrogram from the wav:
        spectrogram = audio.spectrogram(wav).astype(np.float32)
        n_frames = spectrogram.shape[1]

        # Compute a mel-scale spectrogram from the wav:
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

        # Write the spectrograms to disk:
        spectrogram_filename = 'synpaflex-spec-%05d.npy' % index
        encoded_filename = 'synpaflex-mel-%05d.npy' % index
        np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, encoded_filename), mel_spectrogram.T, allow_pickle=False)


    # Return a tuple describing this training example:
    return (spectrogram_filename, encoded_filename, n_frames, text, phone)
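
A minimal usage sketch for the function above, modeled on the usual Tacotron-style preprocessing driver. The executor-based build_from_path helper, the item layout, and the '|'-separated train.txt format are assumptions, not part of the example:

# Hypothetical driver for _process_utterance; names and metadata format are assumed.
import os
from concurrent.futures import ProcessPoolExecutor
from functools import partial

def build_from_path(out_dir, items, num_workers=4):
    # items: iterable of (wav_path, text, phone) triples
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = [executor.submit(partial(_process_utterance, out_dir, i, wav_path, text, phone))
               for i, (wav_path, text, phone) in enumerate(items)]
    metadata = [f.result() for f in futures]
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for row in metadata:
            f.write('|'.join(map(str, row)) + '\n')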
Example #2
def prepare_data(audio_path):

    rates = [0.25, 0.5, 1, 1.5, 2, 2.5]
    enc_path = './data/emb/'

    # Create directory for encoding
    if not os.path.exists(enc_path):
        os.makedirs(enc_path)

    pattern = audio_path + '*' + '.npz'
    file_list = glob.glob(pattern)

    for item in file_list:
        print(os.path.splitext(os.path.basename(item))[0])
        item_ndname = enc_path + os.path.splitext(
            os.path.basename(item))[0][:4]
        item = np.load(item)
        spec, piece = item['spec'], item['piece']

        # get original piece encoding
        #spec = torch.FloatTensor(torch.from_numpy(spec))
        enc = encode(spec.T)

        for rate in rates:
            s = librosa.effects.time_stretch(piece, rate=rate)
            spec_s = audio.spectrogram(s).astype(np.float32)
            enc_s = encode(spec_s.T)
            enc_o = timestretch(enc.T, (1 / rate))

            #enc_s = torch.FloatTensor(enc_s)
            #enc = torch.FloatTensor(enc.T)
            #print('enc.T = ' , enc_o.T.shape , 'enc_s = ', enc_s.shape )
            new_item = item_ndname + '_' + str(rate)

            np.savez(new_item, input=enc_o.T, target=enc_s)
Example #3
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'vctk-spec-%05d.npy' % index
    mel_filename = 'vctk-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return spectrogram_filename, mel_filename, n_frames, text, speaker_id
Example #4
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'nikl-single-spec-%05d.npy' % index
    mel_filename = 'nikl-single-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #5
def gen_samples(out_dir, wav_path, n_samples):
    wav = audio.load_wav(wav_path)
    hop_size = hparams.hop_length
    seg_len = hparams.seg_len
    spec_len = hparams.spec_len
    # not sure why we have to subtract 1 here?
    wav_len = wav.shape[0] // hop_size * hop_size - 1
    wav = wav[:wav_len]
    spec = audio.spectrogram(wav)
    mel = audio.melspectrogram(wav)
    max_val = spec.shape[1] - 1 - spec_len
    if max_val < 0:
        return []
    idx = np.random.randint(0, max_val, size=n_samples)
    d = []
    i = 0
    for offset in idx:
        i += 1
        w = wav[offset * hop_size:offset * hop_size + seg_len]
        s = spec[:, offset:offset + spec_len]
        m = mel[:, offset:offset + spec_len]
        wav_name = wav_path.split('/')[-1].split('.')[0]
        file_path = "{0}/{1}_{2:03d}.npz".format(out_dir, wav_name, i)
        np.savez(file_path, wav=w, spec=s, mel=m)
        d.append(file_path)
    return d
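
The wav/spectrogram windows above stay aligned only if a segment of seg_len samples spans exactly spec_len frames. A small consistency check makes that assumption explicit (this check is not in the original):

# Sanity-check sketch: each spectrogram frame advances hop_length samples, so
# spec_len frames should cover spec_len * hop_length samples of waveform.
assert hparams.seg_len == hparams.spec_len * hparams.hop_length, \
    "seg_len and spec_len disagree; wav and spectrogram windows would drift"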
Example #6
def _process_utterance(out_dir, in_dir, source_wav_name, target_wav_name,
                       emotion_id):
    '''Preprocesses a single source/target utterance pair.

    This writes the target linear spectrogram and the source and target mel
    spectrograms to disk and returns a tuple to write to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      in_dir: The directory containing the input wav files
      source_wav_name: Filename of the source wav file
      target_wav_name: Filename of the target wav file
      emotion_id: Numeric id of the emotion for this pair

    Returns:
      An (emotion_id, t_spectrogram_filename, smel_filename, tmel_filename,
      s_n_frames, t_n_frames) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    source_wav = audio.load_wav(os.path.join(in_dir, source_wav_name))
    target_wav = audio.load_wav(os.path.join(in_dir, target_wav_name))

    if hparams.rescaling:
        source_wav = source_wav / np.abs(
            source_wav).max() * hparams.rescaling_max
        target_wav = target_wav / np.abs(
            target_wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    #s_spectrogram = audio.spectrogram(source_wav).astype(np.float32)
    t_spectrogram = audio.spectrogram(target_wav).astype(np.float32)

    # Compute a mel-scale spectrogram from the wav:
    smel_spectrogram = audio.melspectrogram(source_wav).astype(np.float32)
    tmel_spectrogram = audio.melspectrogram(target_wav).astype(np.float32)
    s_n_frames = smel_spectrogram.shape[1]
    t_n_frames = tmel_spectrogram.shape[1]

    # Write the spectrograms to disk:
    #s_spectrogram_filename = 'source-spec-{}.npy'.format(source_wav_name)
    t_spectrogram_filename = 'target-spec-{}.npy'.format(
        target_wav_name.replace('.wav', ''))
    smel_filename = 'source-mel-{}.npy'.format(
        source_wav_name.replace('.wav', ''))
    tmel_filename = 'target-mel-{}.npy'.format(
        target_wav_name.replace('.wav', ''))
    #np.save(os.path.join(out_dir, s_spectrogram_filename), s_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, t_spectrogram_filename),
            t_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, smel_filename),
            smel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, tmel_filename),
            tmel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (emotion_id, t_spectrogram_filename, smel_filename, tmel_filename,
            s_n_frames, t_n_frames)
Example #7
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:

    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    #world parameters
    f0, sp, ap = audio.world(wav, hparams.sample_rate)
    f0 = (f0 / hparams.f0_norm).astype(np.float32)  # normalize
    sp = audio._normalize(sp).astype(np.float32)
    ap = ap.astype(np.float32)  # ap only takes values in [0, 1], so no normalization is needed
    world_frames = f0.shape[0]

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index

    f0_filename = 'ljspeech-f0-%05d.npy' % index
    sp_filename = 'ljspeech-sp-%05d.npy' % index
    ap_filename = 'ljspeech-ap-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, f0_filename), f0, allow_pickle=False)
    np.save(os.path.join(out_dir, sp_filename), sp, allow_pickle=False)
    np.save(os.path.join(out_dir, ap_filename), ap, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, f0_filename,
            sp_filename, ap_filename, world_frames, text)
Example #8
def _process_utterance(out_dir, out_path, wav_path, text, stft):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      out_path: Path whose components determine the mel filename and output subdirectories
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file
      stft: STFT helper used to compute the mel spectrogram

    Returns:
      A (mel_filename, n_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    wav = wav / np.abs(wav).max() * 0.999
    #stft = audio.taco_stft()

    # delete the silence in back of the audio file.
    wav = librosa.effects.trim(wav,
                               top_db=23,
                               frame_length=1024,
                               hop_length=256)[0]

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav,
                                           stft).numpy().astype(np.float32)

    # Write the mel spectrogram to disk:
    parts = out_path.strip().split('/')
    mel_filename = parts[4] + parts[5] + parts[6]

    # Recreate the output subdirectories from the path components as needed.
    o_path = os.path.join(parts[0], parts[1], parts[4], parts[5])
    os.makedirs(o_path, exist_ok=True)
    o_path = os.path.join(o_path, parts[6])

    np.save(o_path, mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    # return (spectrogram_filename, mel_filename, n_frames, text)
    return (mel_filename, n_frames, text)
Example #9
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    try:
        # Compute the linear-scale spectrogram from the wav:
        spectrogram = audio.spectrogram(wav).astype(np.float32)
        n_frames = spectrogram.shape[1]

        # Compute a mel-scale spectrogram from the wav:
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    except Exception as e:
        print("Problem with :", wav_path)
        print(e)
        return None  # the spectrograms are undefined past this point

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # mdda added START :
    wav_filename = mel_filename.replace('-mel-', '-audio-')
    #wav_samples = hparams.fft_size + (n_frames-1)*hparams.hop_size  # No : 3 extra frames added : Don't bother chomping
    np.save(os.path.join(out_dir, wav_filename),
            wav.astype(np.float32),
            allow_pickle=False)
    spectrogramraw_filename = 'ljspeech-specraw-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogramraw_filename),
            spectrogram_raw(wav).T,
            allow_pickle=False)
    # mdda added END

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #10
def _process_utterance_single(out_dir, text, wav_path, hparams=hparams):
    # modified version of LJSpeech _process_utterance
    audio.set_hparams(hparams)

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    sr = hparams.sample_rate
    # Added from the multispeaker version
    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if not exists(lab_path):
        lab_path = os.path.splitext(wav_path)[0] + '.lab'

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        wav = clean_by_phoneme(labels, wav, sr)
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        if hparams.process_only_htk_aligned:
            return None
        wav, _ = librosa.effects.trim(wav, top_db=15)
    # End added from the multispeaker version

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if hparams.max_audio_length != 0 and librosa.core.get_duration(
            y=wav, sr=sr) > hparams.max_audio_length:
        return None
    if hparams.min_audio_length != 0 and librosa.core.get_duration(
            y=wav, sr=sr) < hparams.min_audio_length:
        return None

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    # Get filename from wav_path
    wav_name = os.path.basename(wav_path)
    wav_name = os.path.splitext(wav_name)[0]
    spectrogram_filename = 'spec-{}.npy'.format(wav_name)
    mel_filename = 'mel-{}.npy'.format(wav_name)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #11
def _process_utterance(out_dir, text, wav_path, speaker_id=None):

    # check whether singlespeaker_mode
    if speaker_id is None:
        return _process_utterance_single(out_dir, text, wav_path)
    # modified version of VCTK _process_utterance
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if not exists(lab_path):
        lab_path = os.path.splitext(wav_path)[0] + '.lab'

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        if hparams.process_only_htk_aligned:
            return None
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    # Get filename from wav_path
    wav_name = os.path.basename(wav_path)
    wav_name = os.path.splitext(wav_name)[0]

    # Prefix with speaker_id in case wav files across different speakers
    # share the same naming format, e.g. Recording0.wav.
    spectrogram_filename = 'spec-{}-{}.npy'.format(speaker_id, wav_name)
    mel_filename = 'mel-{}-{}.npy'.format(speaker_id, wav_name)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
Example #12
def compute_spectrograms(batches, sample_rate, frame_len, fps, bins=None):
    """
    Computes spectrograms from the signals in `batches` at a given sample rate
    (in Hz), frame length (in samples) and frame rate (in Hz).
    """
    plans = audio.spectrogram_plans(frame_len, dtype=np.float32)
    for wavs, labels in batches:
        spects = [audio.spectrogram(np.asanyarray(wav).ravel(),
                                    sample_rate, frame_len, fps,
                                    dtype=np.float32, bins=bins, plans=plans)
                  for wav in wavs]
        yield spects, labels
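
Since compute_spectrograms is a generator over (wavs, labels) batches, it is consumed lazily. A hedged sketch, where iterate_batches and train_step are placeholders for whatever the surrounding codebase provides:

batches = iterate_batches()  # assumed to yield (list_of_wav_arrays, labels) pairs
for spects, labels in compute_spectrograms(batches, sample_rate=22050,
                                           frame_len=1024, fps=70):
    train_step(spects, labels)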
Example #13
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate
    filename = os.path.basename(wav_path).replace('.wav', '')

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    # Librosa trim seems to cut off the ending part of speech
    else:
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Save trimmed wav
    save_wav_path = re.sub('wav48', 'wav_trim_22050', wav_path)
    wav_dir = os.path.dirname(save_wav_path)
    if not os.path.exists(wav_dir):
        os.makedirs(wav_dir)
    audio.save_wav(wav, save_wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = '{}-spec.npy'.format(filename)
    mel_filename = '{}-mel.npy'.format(filename)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
Example #14
def copy_synthesis(wav_file, out_path):
    """Perform copy synthesis on the wav file and write the synthesized wav to disk at out_path
    """
    filename = os.path.splitext(os.path.basename(wav_file))[0]

    y = audio.load_wav(wav_file)
    if cfg.rescaling:
        y = y / np.abs(y).max() * cfg.rescaling_max

    mag = audio.spectrogram(y)

    y_hat = audio.inv_spectrogram(mag)

    out_path = os.path.join(out_path, filename + "_synthesized.wav")
    print(f"Writing {out_path} to disk")
    audio.save_wav(y_hat, out_path)
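
A typical driver for copy_synthesis, resynthesizing every wav in a folder; the glob pattern and output directory are assumptions:

# Hypothetical batch driver: analysis/resynthesis round-trip for a folder of wavs.
import glob
import os

os.makedirs('synthesized', exist_ok=True)
for wav_file in sorted(glob.glob('wavs/*.wav')):
    copy_synthesis(wav_file, 'synthesized')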
Example #15
def parse_line(line):
    # Parse line from csv
    filename, sentence, duration = line.decode('ascii').split('\t')

    # Audio file
    wav_path = os.path.join(hyperparams.dataset_path, filename + '.wav')
    wave = audio.read_audio(wav_path, hyperparams.sample_rate)
    audio_length = wave.shape[0] / hyperparams.sample_rate

    # Calculate spectrum
    mel, linear = audio.spectrogram(hyperparams, wave)

    # Encode sentence
    tokens = text.encode(sentence)

    return (mel.T, linear.T, tokens, np.int32(tokens.size),
            np.float32(audio_length))
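
parse_line receives raw bytes (note the line.decode('ascii')), which suggests it is mapped over a text dataset with tf.py_func. A TF1-style sketch; both the wiring and the output dtypes, inferred from the return values, are assumptions:

import tensorflow as tf

dataset = tf.data.TextLineDataset('metadata.csv').map(
    lambda line: tf.py_func(
        parse_line, [line],
        [tf.float32, tf.float32, tf.int32, tf.int32, tf.float32]))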
Example #16
def _process_utterance(out_dir,
                       index,
                       speaker_id,
                       wav_path,
                       text,
                       hparams=hparams):
    sr = hparams.sample_rate
    audio.set_hparams(hparams)

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'vctk-spec-%05d.npy' % index
    mel_filename = 'vctk-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
Example #17
def _process_utterance(out_dir, index, wav_path, pinyin):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    wav_path: Path to the audio file containing the speech input
    pinyin: The pinyin of Chinese spoken in the input audio file

    Returns:
    A (spectrogram_filename, mel_filename, n_frames, pinyin) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # rescale wav for unified measure for all clips
    wav = wav / np.abs(wav).max() * 0.999

    # trim silence
    wav = audio.trim_silence(wav)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    if n_frames > hp.max_frame_num:
        return None

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'biaobei-spec-%05d.npy' % index
    mel_filename = 'biaobei-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, pinyin)
Example #18
def generate(model_path, model_name, generate_path, generate_name, piece):
    """Synthesizes audio from a wav file via the autoencoder.

    Args:
      model_path: Directory containing the saved model.
      model_name: Name of the checkpoint file to load.
      generate_path: Directory to write the synthesized wav into.
      generate_name: Base name of the output wav file.
      piece: Path to the input wav file.
    """
    
    # Create directory for the generated audio
    if not os.path.exists(generate_path):
        os.makedirs(generate_path)

    net = AutoEncoder()
    net = load_model(net, model_path, model_name)
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        net = net.cuda()

    net.eval()

    # Load audio for encoding
    piece = audio.load_wav(piece)
    spec = audio.spectrogram(piece).astype(np.float32)
    spec = torch.from_numpy(spec.T)
    spec = torch.FloatTensor(spec)
    
    spec = torch.unsqueeze(spec, 0)
    spec = Variable(spec, volatile=True).contiguous()

    if cuda_available:
        spec = spec.cuda()

    generated_spec = net(spec)
    generated_spec = generated_spec.data.cpu().numpy()
    generated_spec = np.squeeze(generated_spec)

    waveform = audio.inv_spectrogram(generated_spec.T)
    wav_name = generate_path + generate_name + '.wav'

    audio.save_wav(waveform, wav_name)
Example #19
def _process_utterance(wav_path):
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    # n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Return a tuple describing this training example:
    return spectrogram.T, mel_spectrogram.T
Example #20
def _process_utterance(audio_path, data_dir, tokens, loss_coeff):
    audio_name = os.path.basename(audio_path)

    filename = audio_name.rsplit('.', 1)[0] + ".npz"
    numpy_path = os.path.join(data_dir, filename)

    if not os.path.exists(numpy_path):
        wav = load_audio(audio_path)

        try:
            linear_spectrogram = spectrogram(wav).astype(np.float32)
            mel_spectrogram = melspectrogram(wav).astype(np.float32)
        except Exception:
            return 0

        data = {
            "linear": linear_spectrogram.T,
            "mel": mel_spectrogram.T,
            "tokens": tokens,
            "loss_coeff": loss_coeff,
        }

        n_frame = linear_spectrogram.shape[1]

        if hparams.skip_inadequate:
            min_n_frame = hparams.reduction_factor * hparams.min_iters
            max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor

            # Skip examples whose frame count falls outside the usable range
            # or whose token sequence is too short.
            if not (min_n_frame <= n_frame <= max_n_frame) or len(tokens) < hparams.min_tokens:
                return None

        np.savez(numpy_path, **data)  # np.savez has no allow_pickle argument; passing one would just be stored as an extra array
    else:
        try:
            data = np.load(numpy_path)
            n_frame = data["linear"].shape[0]
        except:
            remove_file(numpy_path)
            return _process_utterance(audio_path, data_dir, tokens, loss_coeff)

    return n_frame
Example #21
def _process_utterance(out_dir, in_dir, wav_name):
    '''Preprocesses a single utterance.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      in_dir: The directory containing the input wav file
      wav_name: Filename of the wav file to process

    Returns:
      A (spectrogram_filename, mel_filename, dur_filename, n_frames) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(os.path.join(in_dir, wav_name))

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    spectrogram = audio.spectrogram(wav).astype(np.float32)

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    n_frames = mel_spectrogram.shape[1]

    # Write the spectrograms to disk:
    spectrogram_filename = 'spec-{}.npy'.format(wav_name.replace('.wav', ''))
    mel_filename = 'mel-{}.npy'.format(wav_name.replace('.wav', ''))
    dur_filename = 'dur-{}.npy'.format(wav_name.replace('.wav', ''))
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, dur_filename, n_frames)
Example #22
    def get_mel(self, filename):
        if not self.load_mel_from_disk:
            wav, _ = librosa.load(filename, sr=self.sampling_rate)
            wav = torch.from_numpy(wav).float().unsqueeze(0)
            #audio_norm = wav / self.max_wav_value
            #audio_norm = audio_norm.unsqueeze(0)
            audio_norm = torch.autograd.Variable(wav, requires_grad=False)
            melspec = self.stft.mel_spectrogram(audio_norm)
            melspec = torch.squeeze(melspec, 0)
            wav, sr = librosa.load(filename, sr=self.hparms.se_sample_rate)
            wav, _ = librosa.effects.trim(wav, top_db=20)
            audios = split_audio(
                wav,
                sr=self.hparms.se_sample_rate,
            )
            mels = get_split_mels(
                audios,
                # sr=self.hparms.se_sample_rate,
                # n_fft=self.hparms.se_n_fft,
                # win_length=self.hparms.se_window,
                # hop_length=self.hparms.se_hop,
                mel=self.hparms.num_mel)
            if len(mels) == 0:
                print(filename)
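            # note: an empty mels list is only logged above; np.stack below would still raise on it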

            mels = np.stack(mels)
            mels = torch.from_numpy(mels).float()
            mels = mels.permute(0, 2, 1)
            x, _ = self.speaker_encoder(mels, return_sim=False)
            speaker_encoder = x.mean(0)  # final speaker encode from an audio
            # reference from gst
            spectrogram = audio.spectrogram(wav).astype(np.float32)
            spectrogram = spectrogram.transpose(1, 0)
        else:
            melspec = torch.from_numpy(np.load(filename))
            assert melspec.size(0) == self.stft.n_mel_channels, (
                'Mel dimension mismatch: given {}, expected {}'.format(
                    melspec.size(0), self.stft.n_mel_channels))

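        # note: speaker_encoder and spectrogram are only assigned in the
        # compute branch above; when loading mel from disk they are undefined here.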
        return speaker_encoder, spectrogram, melspec
Example #23
def _process_utterance(mag_dir, mel_dir, wav_path, text):
    """Preprocesses a single utterance audio/text pair.
    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      mag_dir: The directory to write the log magnitude spectrograms into
      mel_dir: The directory to write the mel spectrograms into
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (filename, text, num_frames) tuple to write to train.txt
    """
    filename = os.path.splitext(os.path.basename(wav_path))[0]

    # Load the audio to a numpy array
    wav = audio.load_wav(wav_path)

    if cfg.rescaling:
        wav = wav / np.abs(wav).max() * cfg.rescaling_max

    # Compute the linear-scale spectrogram from the wav
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    num_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk
    np.save(os.path.join(mag_dir, filename + ".npy"),
            spectrogram.T,
            allow_pickle=False)

    np.save(os.path.join(mel_dir, filename + ".npy"),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (filename, text, num_frames)
Example #24
def _process_utterance(out_dir, index, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        assert labels[0][-1] == "silB"
        assert labels[-1][-1] == "silE"
        b = int(labels[0][1] * 1e-7 * sr)
        e = int(labels[-1][0] * 1e-7 * sr)
        wav = wav[b:e]
    else:
        wav, _ = librosa.effects.trim(wav, top_db=30)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'jsut-spec-%05d.npy' % index
    mel_filename = 'jsut-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #25
def _process_utterance(out_dir, index, wav_path, labels_path, text):
    # Load the wav file and trim silence from the ends:
    wav = audio.load_wav(wav_path)
    start_offset, end_offset = _parse_labels(labels_path)
    start = int(start_offset * hp.sr)
    end = int(end_offset * hp.sr) if end_offset is not None else -1
    wav = wav[start:end]
    max_samples = _max_out_length * hp.frame_shift * hp.sr
    if len(wav) > max_samples:
        # print(wav_path + ": wav too long")
        return None
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    spectrogram_filename = 'blizzard-spec-%05d.npy' % index
    mel_filename = 'blizzard-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #26
def test_audio_conv(audio_path):
    # Audio file (audio_path is taken as the filename stem within the dataset)
    wav_path = os.path.join(hyperparams.dataset_path, audio_path + '.wav')
    wave = audio.read_audio(wav_path, hyperparams.sample_rate)
    audio_length = wave.shape[0] / hyperparams.sample_rate

    # Calculate spectrum
    mel, linear = audio.spectrogram(hyperparams, wave)

    #plt.imshow(mel)
    from_mel = audio.mel_to_linear(mel, (hyperparams.num_freq - 1) * 2,
                                   hyperparams.sample_rate,
                                   hyperparams.num_mels)
    plt.imshow(from_mel)
    plt.show()
    plt.imshow(linear)
    plt.show()

    signal = audio.reconstruct(hyperparams, linear)
    audio.write_audio('test.wav', signal, hyperparams.sample_rate)

    signal = audio.reconstruct(hyperparams, mel, from_mel=True)
    audio.write_audio('test_mel.wav', signal, hyperparams.sample_rate)
Example #27
def encode(model_name, piece, encoding_name):

    model_path = './restore/'
    encoding_path = './encoding/'
    
    # Create directory for encoding
    if not os.path.exists(encoding_path):
        os.makedirs(encoding_path)

    net = AutoEncoder()
    net = load_model(net, model_path, model_name)
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        net = net.cuda()
        
    net.eval()
    
    # Load audio for encoding
    piece = audio.load_wav(piece)
    spec = audio.spectrogram(piece).astype(np.float32)

    spec = torch.from_numpy(spec.T)
    spec = torch.FloatTensor(spec)
    spec = torch.unsqueeze(spec, 0)
    spec = Variable(spec, volatile=True).contiguous()

    if cuda_available:
        spec = spec.cuda()

    # Pass the input spectrogram through the encoder half of the network
    encoding = net.encoder(spec)
    encoding = encoding.data.cpu().numpy()
    #encoding = np.squeeze(encoding)

    encoding_ndarray = encoding_path + encoding_name + '.npy'
    np.save(encoding_ndarray, encoding)
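
A short usage sketch for encode; the checkpoint and file names below are placeholders:

# Hypothetical call: encode a wav into the autoencoder's latent space and
# save it under ./encoding/piece_encoding.npy.
encode('model_199.pth', './samples/piece.wav', 'piece_encoding')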
Example #28
def preprocess(audio_dir, ndarray_dir, window_length):
    pattern = audio_dir + '*' + '.wav'
    file_list = glob.glob(pattern)

    for item in file_list:
        item_ndname = ndarray_dir + os.path.splitext(os.path.basename(item))[0]
        item = audio.load_wav(item)

        item_iter = 0

        while len(item) > window_length:
            # Note: this slice keeps window_length - 1 samples, so one sample
            # per window is dropped; preserved as in the original.
            piece = item[: (window_length - 1)]
            spec = audio.spectrogram(piece).astype(np.float32)
            #spec = audio.melspectrogram(piece).astype(np.float32)

            item = item[window_length:]
            # np.savez appends '.npz', so this name yields '...N.npy.npz' files,
            # which the '*.npz' glob in Example #2 still matches.
            new_item_ndname = item_ndname + str(item_iter) + '.npy'

            np.savez(new_item_ndname, piece=piece, spec=spec)
            item_iter += 1
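
Examples #2 and #28 appear to form a pipeline: preprocess writes the (piece, spec) archives that prepare_data later globs as *.npz. A hedged end-to-end sketch, with paths and window length as assumptions:

# Hypothetical wiring of the two steps.
preprocess('./data/wav/', './data/npz/', window_length=16384)
prepare_data('./data/npz/')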
Example #29
def wav2linear_for_ppg_cbhg(wav_arr):
    return spectrogram(wav_arr)['magnitude']
Example #30
#****************************

wav_path = './p225/1.wav'
wav = audio.load_wav(wav_path)
melspectrogram = audio.melspectrogram(wav).astype(np.float32)  # (80, 448)
n_frames = melspectrogram.shape[1]

print("melspectrogram.shape = ", melspectrogram.shape)

print("n_frames = ", n_frames)

mag = audio._mel_to_linear(melspectrogram)

print("mag.shape = ", mag.shape)  # mag.shape =  (1025, 448)

orisp = audio.spectrogram(wav)
print("orisp.shape = ", orisp.shape)  # orisp.shape =  (1025, 448)

# wav = audio.griffin_lim(orisp)
wav = audio._griffin_lim(orisp)
audio.save_wav(wav, './ori-sp-to-wav.wav')

#
# wav = melspectrogram2wav(melspectrogram)
#
# audio.save_wav(wav, './hello-taco.wav')
'''
melspectrogram.shape =  (80, 448)
n_frames =  448
mag.shape =  (1025, 448)
orisp.shape =  (1025, 448)