Example 1
def _get_mfcc_and_spec(wav, preemphasis_coeff, n_fft, win_length, hop_length):

    # Pre-emphasis
    y_preem = preemphasis(wav, coeff=preemphasis_coeff)

    # Get spectrogram
    D = librosa.stft(y=y_preem,
                     n_fft=n_fft,
                     hop_length=hop_length,
                     win_length=win_length)
    mag = np.abs(D)

    # Get mel-spectrogram
    mel_basis = librosa.filters.mel(hp.default.sr, hp.default.n_fft,
                                    hp.default.n_mels)  # (n_mels, 1+n_fft//2)
    mel = np.dot(mel_basis, mag)  # (n_mels, t) # mel spectrogram

    # Get mfccs, amp to db
    mag_db = amp2db(mag)
    mel_db = amp2db(mel)
    mfccs = np.dot(librosa.filters.dct(hp.default.n_mfcc, mel_db.shape[0]),
                   mel_db)

    # Normalization (0 ~ 1)
    mag_db = normalize_0_1(mag_db, hp.default.max_db, hp.default.min_db)
    mel_db = normalize_0_1(mel_db, hp.default.max_db, hp.default.min_db)

    return mfccs.T, mag_db.T, mel_db.T  # (t, n_mfccs), (t, 1+n_fft/2), (t, n_mels)
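The helper functions preemphasis, amp2db and normalize_0_1 are not part of the listing; a minimal sketch of what they typically look like in codebases of this style follows (the 20*log10 conversion and the clipping range are assumptions here, the real constants come from hp.default):

import numpy as np

def preemphasis(wav, coeff=0.97):
    # First-order high-pass filter: y[t] = x[t] - coeff * x[t - 1]
    return np.append(wav[0], wav[1:] - coeff * wav[:-1])

def amp2db(amp, amin=1e-5):
    # Amplitude to decibels, floored to avoid log(0)
    return 20.0 * np.log10(np.maximum(amin, amp))

def normalize_0_1(values_db, max_db, min_db):
    # Map [min_db, max_db] to [0, 1] and clip anything outside that range
    return np.clip((values_db - min_db) / (max_db - min_db), 0.0, 1.0)

The denormalize_db call in Example 4 would simply invert the last mapping.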
Example 2
def wav2mfcc(wav,
             sr,
             n_fft,
             win_length,
             hop_length,
             n_mels,
             n_mfccs,
             preemphasis_coeff=0.97,
             time_first=True,
             **kwargs):

    # Pre-emphasis
    wav_preem = preemphasis(wav, coeff=preemphasis_coeff)

    # Decibel-scaled mel-spectrogram
    mel_db = wav2melspec_db(wav_preem,
                            sr,
                            n_fft,
                            win_length,
                            hop_length,
                            n_mels,
                            time_first=False,
                            **kwargs)

    # MFCCs
    mfccs = np.dot(librosa.filters.dct(n_mfccs, mel_db.shape[0]), mel_db)

    # Time-axis first
    if time_first:
        mfccs = mfccs.T  # (t, n_mfccs)

    return mfccs
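A hedged usage sketch for wav2mfcc, assuming a 16 kHz recording and typical frame settings (the file name and every parameter value below are placeholders, and preemphasis / wav2melspec_db are expected to come from the same module):

import librosa

wav, sr = librosa.load('sample.wav', sr=16000)
mfccs = wav2mfcc(wav,
                 sr=sr,
                 n_fft=512,
                 win_length=400,   # 25 ms at 16 kHz
                 hop_length=80,    # 5 ms at 16 kHz
                 n_mels=80,
                 n_mfccs=40,
                 time_first=True)
print(mfccs.shape)  # (t, n_mfccs)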
Example 3
def _get_mfcc_and_spec(wav, preemphasis_coeff, n_fft, win_length, hop_length):

    # Pre-emphasis
    y_preem = preemphasis(wav, coeff=preemphasis_coeff)

    # Get spectrogram
    D = librosa.stft(y=y_preem,
                     n_fft=n_fft,
                     hop_length=hop_length,
                     win_length=win_length)
    mag = np.abs(D)

    # Get mel-spectrogram
    # Must have librosa==0.6.3
    # https://github.com/librosa/librosa/issues/1160
    # https://blog.csdn.net/Drifter_Galaxy/article/details/105077454
    mel_basis = librosa.filters.mel(hp.default.sr, hp.default.n_fft,
                                    hp.default.n_mels)  # (n_mels, 1+n_fft//2)
    mel = np.dot(mel_basis, mag)  # (n_mels, t) # mel spectrogram

    # Get mfccs, amp to db
    mag_db = amp2db(mag)
    mel_db = amp2db(mel)

    mfccs = np.dot(librosa.filters.dct(hp.default.n_mfcc, mel_db.shape[0]),
                   mel_db)

    # Normalization (0 ~ 1)
    mag_db = normalize_0_1(mag_db, hp.default.max_db, hp.default.min_db)
    mel_db = normalize_0_1(mel_db, hp.default.max_db, hp.default.min_db)

    return mfccs.T, mag_db.T, mel_db.T  # (t, n_mfccs), (t, 1+n_fft/2), (t, n_mels)
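The version pin matters: from librosa 0.10 onward the arguments of librosa.filters.mel are keyword-only, and the librosa.filters.dct helper used above for the DCT basis was dropped in later releases. A sketch of the equivalent mel-filter call under a recent librosa, keeping the same hyperparameters, would be:

import librosa

# librosa >= 0.10: sr, n_fft and n_mels must be passed as keywords
mel_basis = librosa.filters.mel(sr=hp.default.sr,
                                n_fft=hp.default.n_fft,
                                n_mels=hp.default.n_mels)  # (n_mels, 1 + n_fft // 2)

The DCT basis itself would need to be rebuilt with another library (e.g. scipy) if the code is ported off librosa 0.6.x.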
Example 4
def do_convert(predictor, input_name, logdir2):
    convert_s = datetime.datetime.now()

    # Load input audio
    input_audio, _ = librosa.load(input_name, sr=hp.default.sr, dtype=np.float64)

    # Extract F0 from input audio first
    input_f0, t_table = pw.dio(input_audio, hp.default.sr)
    input_f0 = pw.stonemask(input_audio, input_f0, t_table, hp.default.sr)

    # Get MFCC, Spectral Envelope, and Aperiodicity
    mfcc = _get_mfcc(input_audio, hp.default.n_fft, hp.default.win_length, hp.default.hop_length)
    mfcc = np.expand_dims(mfcc, axis=0)

    input_ap = pw.d4c(input_audio, input_f0, t_table, hp.default.sr, fft_size=hp.default.n_fft)

    input_sp_en = _get_spectral_envelope(preemphasis(input_audio, coeff=hp.default.preemphasis), hp.default.n_fft)
    plt.imsave('./converted/debug/input_sp_en_original.png', input_sp_en, cmap='binary')
    input_sp_en = np.expand_dims(input_sp_en, axis=0)

    # Convert Spectral Envelope
    output_sp_en, ppgs = convert_spectral_envelope(predictor, mfcc, input_sp_en)
    output_sp_en = np.squeeze(output_sp_en.astype(np.float64), axis=0)

    preproc_s = datetime.datetime.now()
    # Denormalization
    output_sp_en = denormalize_db(output_sp_en, hp.default.max_db, hp.default.min_db)

    # Db to amp
    output_sp_en = librosa.db_to_amplitude(output_sp_en)

    # Emphasize the magnitude
    output_sp_en = np.power(output_sp_en, hp.convert.emphasis_magnitude)

    preproc_e = datetime.datetime.now()
    preproc_t = preproc_e - preproc_s
    print("Pre-Processing time:{}s".format(preproc_t.seconds))

    # F0 transformation with WORLD Vocoder
    output_f0 = f0_adapt(input_f0, logdir2)

    # Synthesize audio and de-emphasize
    output_audio = pw.synthesize(output_f0, output_sp_en, input_ap, hp.default.sr)
    output_audio = inv_preemphasis(output_audio, coeff=hp.default.preemphasis)

    # Saving output_audio to 32-bit Float wav file
    output_audio = output_audio.astype(np.float32)
    librosa.output.write_wav(path="./converted/" + input_name, y=output_audio, sr=hp.default.sr)

    # Saving PPGS data to Grayscale Image and raw binary file
    ppgs = np.squeeze(ppgs, axis=0)
    plt.imsave('./converted/debug/'+input_name+'.png', ppgs, cmap='binary')
    np.save('./converted/debug/'+input_name+'.npy', ppgs)

    convert_e = datetime.datetime.now()
    convert_time = convert_e - convert_s
    print("Total Converting Time:{}s".format(convert_time.seconds))
Example 5
def _get_mfcc_and_spec(wav, preemphasis_coeff, n_fft, win_length, hop_length):

    # Pre-emphasis
    y_preem = preemphasis(wav, coeff=preemphasis_coeff)

    # Get spectrogram and power energy
    D = librosa.stft(y=y_preem,
                     n_fft=n_fft,
                     hop_length=hop_length,
                     win_length=win_length)
    mag = np.abs(D)
    mag_e = np.sum(mag**2, axis=0)
    mag_e = np.where(mag_e == 0, np.finfo(float).eps, mag_e)

    # Get mel-spectrogram
    mel_basis = librosa.filters.mel(hp.default.sr, hp.default.n_fft,
                                    hp.default.n_mels)  # (n_mels, 1+n_fft//2)
    mel = np.dot(mel_basis, mag)  # (n_mels, t) # mel spectrogram
    mel = np.where(mel == 0, np.finfo(float).eps, mel)

    # Get mfccs, amp to db
    mag_db = amp2db(mag)
    mel_db = amp2db(mel)
    mfccs = np.dot(librosa.filters.dct(hp.default.n_mfcc, mel_db.shape[0]),
                   mel_db)

    mfccs[0, :] = np.log(mag_e)
    """
    # Normalizing mfccs so that they have zero mean and unit variance
    mfccs_norm = (mfccs - np.mean(mfccs)) / np.std(mfccs)
    """

    # Normalization (0 ~ 1)
    mag_db = normalize_0_1(mag_db, hp.default.max_db, hp.default.min_db)
    mel_db = normalize_0_1(mel_db, hp.default.max_db, hp.default.min_db)

    return mfccs.T, mag_db.T, mel_db.T  # (t, n_mfccs), (t, 1+n_fft/2), (t, n_mels)
Example 6
def _process_utterance(mel_dir,
                       linear_dir,
                       wav_dir,
                       index,
                       wav_path,
                       text,
                       hparams,
                       step_factor=1):
    """
	Preprocesses a single utterance wav/text pair

	this writes the mel scale spectogram to disk and return a tuple to write
	to the train.txt file

	Args:
		- mel_dir: the directory to write the mel spectograms into
		- linear_dir: the directory to write the linear spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the numeric index to use in the spectogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file
		- hparams: hyper parameters

	Returns:
		- A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
	"""
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate * step_factor)
        if step_factor > 1: wav = wav[::step_factor]
        audio_time = len(wav) / hparams.sample_rate
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    # Trim leading/trailing silences
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Pre-emphasize
    preem_wav = audio.preemphasis(wav, hparams.preemphasis,
                                  hparams.preemphasize)

    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

        #Assert all audio is in [-1, 1]
        if (wav > 1.).any() or (wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))
        if (preem_wav > 1.).any() or (preem_wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        #[0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        preem_wav = preem_wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        #[-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel-scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(preem_wav,
                                           hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear-scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(preem_wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        #Zero pad audio signal
        out = np.pad(out, (l, r),
                     mode='constant',
                     constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft,
                                            audio.get_hop_size(hparams),
                                            hparams.wavenet_pad_sides)

        # Pad the audio signal with constant values so its framing stays consistent with librosa's
        out = np.pad(out, (l_pad, r_pad),
                     mode='constant',
                     constant_values=constant_values)

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # Time resolution adjustment:
    # ensure the length of the raw audio is a multiple of the hop size so that we can
    # use transposed convolutions to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (wav_path, audio_filename, mel_filename, linear_filename,
            time_steps, mel_frames, audio_time, text, len(text))
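Example 6 follows the Tacotron-2 style preprocessing layout; a hedged sketch of how such a function is usually driven from a metadata CSV (the file layout, the column order and the build_from_metadata name are assumptions, not part of the original code):

import os

def build_from_metadata(input_dir, mel_dir, linear_dir, wav_dir, hparams):
    # Each metadata line is assumed to look like: <wav_id>|<text>
    metadata = []
    with open(os.path.join(input_dir, 'metadata.csv'), encoding='utf-8') as f:
        for index, line in enumerate(f):
            wav_id, text = line.strip().split('|')[:2]
            wav_path = os.path.join(input_dir, 'wavs', wav_id + '.wav')
            example = _process_utterance(mel_dir, linear_dir, wav_dir,
                                         index, wav_path, text, hparams)
            if example is not None:  # None means the wav was missing or too long
                metadata.append(example)
    return metadata

The collected tuples would then be written line by line into train.txt, as the docstring describes.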