def main():
    data_folder = "data"
    wavs = [
        os.path.join(data_folder, file[:-4]) for file in os.listdir(data_folder)
        if file.endswith(".wav")
    ]
    outputs_lws = [file + ".lws.gen.wav" for file in wavs]
    wavs = [
        audio.load_wav(wav_path + ".wav", hparams.sample_rate)
        for wav_path in wavs
    ]

    lws_processor = lws.lws(
        512, 128, mode="speech")  # 512: window length; 128: window shift
    for x, output_path in zip(wavs, outputs_lws):
        X = lws_processor.stft(x)  # x is a single-channel waveform
        X0 = np.abs(X)  # magnitude spectrogram
        print('{:6}: {:5.2f} dB'.format('Abs(X)',
                                        lws_processor.get_consistency(X0)))
        # Reconstruct from the magnitude (in general, one can also start from
        # an initial complex spectrogram)
        X1 = lws_processor.run_lws(X0)
        print(X1.shape)
        print('{:6}: {:5.2f} dB'.format('LWS',
                                        lws_processor.get_consistency(X1)))
        wav = lws_processor.istft(X1).astype(np.float32)

        audio.save_wav(wav, output_path)
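
# A self-contained sketch of the same lws round trip on a synthetic tone; only
# numpy and the lws package are assumed (no audio/hparams helpers).
import lws
import numpy as np

sr = 16000
t = np.arange(sr) / sr
x = 0.5 * np.sin(2 * np.pi * 440.0 * t)           # 1 s, 440 Hz stand-in waveform

lws_processor = lws.lws(512, 128, mode="speech")  # 512: window length; 128: window shift
X = lws_processor.stft(x)                         # complex spectrogram
X0 = np.abs(X)                                    # keep only the magnitude
X1 = lws_processor.run_lws(X0)                    # phase reconstruction from magnitude
y = lws_processor.istft(X1).astype(np.float32)
print(lws_processor.get_consistency(X1))          # consistency in dB; higher is better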
Example #2
def extract_mel(wav_filename, out_wav_path, out_dir, key, hparams, args):
    if not os.path.exists(wav_filename):
        print("Wav file {} doesn't exists.".format(wav_filename))
        return None

    wav = audio.load_wav(wav_filename, sr=hparams.sample_rate)
    # Process wav samples
    wav = audio.trim_silence(wav, hparams)
    n_samples = len(wav)

    # Extract mel spectrogram
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    n_frames = mel_spectrogram.shape[1]
    if n_frames > hparams.max_acoustic_length:
        print(
            "Ignoring wav {} because its {} frames exceed the maximum of {} (max_acoustic_length in hparams.yaml)."
            .format(wav_filename, n_frames, hparams.max_acoustic_length))
        return None

    # Align features
    desired_frames = int(min(n_samples / hparams.hop_size, n_frames))
    wav = wav[:desired_frames * hparams.hop_size]
    mel_spectrogram = mel_spectrogram[:, :desired_frames]
    n_samples = wav.shape[0]
    n_frames = mel_spectrogram.shape[1]
    assert (n_samples / hparams.hop_size == n_frames)

    # Save intermediate acoustic features
    mel_filename = os.path.join(out_dir, key + '.npy')
    np.save(mel_filename, mel_spectrogram.T, allow_pickle=False)
    audio.save_wav(wav, out_wav_path, hparams)

    return (wav_filename, mel_filename, n_samples, n_frames)
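
# A numpy-only toy run of the alignment step above (hop size and shapes are
# made up for illustration).
import numpy as np

hop_size = 256
wav = np.random.randn(10000).astype(np.float32)               # stand-in waveform
mel_spectrogram = np.random.randn(80, 40).astype(np.float32)  # stand-in (n_mels, n_frames)

desired_frames = int(min(len(wav) / hop_size, mel_spectrogram.shape[1]))
wav = wav[:desired_frames * hop_size]
mel_spectrogram = mel_spectrogram[:, :desired_frames]
assert len(wav) / hop_size == mel_spectrogram.shape[1]        # 39 frames, 9984 samples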
Example #3
def _process_utterance(out_dir, index, wav_path, text):
    """
	Preprocesses a single utterance wav/text pair

	this writes the mel scale spectogram to disk and return a tuple to write
	to the train.txt file

	Args:
		- out-dir: the directory to write the spectograms into
		- index: the numeric index to use in the spectogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file

	Returns:
		- A tuple: (mel_filename, n_frames, text)
	"""

    # Load the audio as numpy array
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav to calculate n_frames
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrogram to disk
    mel_filename = 'ljspeech-mel-{:05d}.npy'.format(index)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (mel_filename, n_frames, text)
Example #4
def infer(model, src_pth):
    src = load_wav(src_pth, seg=False)
    mel = melspectrogram(src).astype(np.float32)
    mel = mode(torch.Tensor([mel]))
    with torch.no_grad():
        res = model.infer(mel)[0]
    return [src, to_arr(res)]
Example #5
 def wav2spec(self, wav_path):
     wav = audio.load_wav(wav_path)
     spec = audio.melspectrogram(wav).astype(np.float32)
     spec = spec.transpose()
     feat_size = spec.shape[1]
     pad_spec = np.zeros(
         [(len(spec) + self.outputs_per_step - 1) // self.outputs_per_step *
          self.outputs_per_step, feat_size],
         dtype='float32')
     pad_spec[:len(spec)] = spec
     return pad_spec.reshape([-1, self.outputs_per_step * feat_size])
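
# The padding/reshape above groups outputs_per_step frames into one row; a toy
# numpy run with arbitrary shapes.
import numpy as np

outputs_per_step = 3
spec = np.random.randn(10, 80).astype('float32')   # (n_frames, feat_size)
feat_size = spec.shape[1]

padded_len = (len(spec) + outputs_per_step - 1) // outputs_per_step * outputs_per_step
pad_spec = np.zeros([padded_len, feat_size], dtype='float32')
pad_spec[:len(spec)] = spec
grouped = pad_spec.reshape([-1, outputs_per_step * feat_size])
print(grouped.shape)   # (4, 240): 12 padded frames, 3 per row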
Example #6
def _process_wav(wav_path, audio_path, spc_path):
    wav = audio.load_wav(wav_path)
    wav1, wav2, wav3, wav4 = audio.subband(wav)

    if hparams.feature_type == 'mcc':
        # Extract mcc and f0
        spc = audio.extract_mcc(wav)
    else:
        # Extract mels
        spc = audio.melspectrogram(wav).astype(np.float32)

    # Align audios and mels
    hop_length = int(hparams.frame_shift_ms / 4000 * hparams.sample_rate)
    length_diff_1 = len(spc) * hop_length - len(wav1)
    length_diff_2 = len(spc) * hop_length - len(wav2)
    length_diff_3 = len(spc) * hop_length - len(wav3)
    length_diff_4 = len(spc) * hop_length - len(wav4)
    wav1 = wav1.reshape(-1,1)
    if length_diff_1 > 0:
        wav1 = np.pad(wav1, [[0, length_diff_1], [0, 0]], 'constant')
    elif length_diff_1 < 0:
        wav1 = wav1[: hop_length * spc.shape[0]]
    wav2 = wav2.reshape(-1,1)
    if length_diff_2 > 0:
        wav2 = np.pad(wav2, [[0, length_diff_2], [0, 0]], 'constant')
    elif length_diff_2 < 0:
        wav2 = wav2[: hop_length * spc.shape[0]]
    wav3 = wav3.reshape(-1,1)
    if length_diff_3 > 0:
        wav3 = np.pad(wav3, [[0, length_diff_3], [0, 0]], 'constant')
    elif length_diff_3 < 0:
        wav3 = wav3[: hop_length * spc.shape[0]]
    wav4 = wav4.reshape(-1,1)
    if length_diff_4 > 0:
        wav4 = np.pad(wav4, [[0, length_diff_4], [0, 0]], 'constant')
    elif length_diff_4 < 0:
        wav4 = wav4[: hop_length * spc.shape[0]]
    fid1 = os.path.basename(audio_path).replace('.npy', '_band1.npy')
    fid2 = os.path.basename(audio_path).replace('.npy', '_band2.npy')
    fid3 = os.path.basename(audio_path).replace('.npy', '_band3.npy')
    fid4 = os.path.basename(audio_path).replace('.npy', '_band4.npy')

    fid1 = os.path.join('training_data/audios', fid1)
    fid2 = os.path.join('training_data/audios', fid2)
    fid3 = os.path.join('training_data/audios', fid3)
    fid4 = os.path.join('training_data/audios', fid4)

    np.save(fid1, wav1)
    np.save(fid2, wav2)
    np.save(fid3, wav3)
    np.save(fid4, wav4)
    np.save(spc_path, spc)
    return (fid1, fid2, fid3, fid4, spc_path, spc.shape[0])
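
# The four per-band pad/trim blocks above repeat the same logic; a small
# helper (numpy-only sketch) keeps them consistent.
import numpy as np

def match_length(band, target_len):
    """Zero-pad or trim a sub-band so it has exactly target_len samples."""
    band = band.reshape(-1, 1)
    diff = target_len - len(band)
    if diff > 0:
        return np.pad(band, [[0, diff], [0, 0]], 'constant')
    return band[:target_len]

# e.g. wav1 = match_length(wav1, len(spc) * hop_length), and likewise for wav2..wav4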
Example #7
 def __getitem__(self, index):
     if hps.prep:
         wav, mel = self.f_list[index]
         seg_ml = hps.seg_l // hps.frame_shift + 1
         ms = np.random.randint(0, mel.shape[1] -
                                seg_ml) if mel.shape[1] > seg_ml else 0
         ws = hps.frame_shift * ms
         wav = wav[ws:ws + hps.seg_l]
         mel = mel[:, ms:ms + seg_ml]
     else:
         wav = load_wav(self.f_list[index])
         mel = melspectrogram(wav).astype(np.float32)
     return wav, mel
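
# A toy numpy version of the paired wav/mel crop above (hop and segment
# lengths are made up).
import numpy as np

frame_shift, seg_l = 256, 16 * 256                 # hypothetical hop and segment length
wav = np.random.randn(100 * frame_shift).astype(np.float32)
mel = np.random.randn(80, 100).astype(np.float32)

seg_ml = seg_l // frame_shift + 1
ms = np.random.randint(0, mel.shape[1] - seg_ml) if mel.shape[1] > seg_ml else 0
ws = frame_shift * ms
print(wav[ws:ws + seg_l].shape, mel[:, ms:ms + seg_ml].shape)   # (4096,) (80, 17)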
Example #8
def infer(wav_path, text, model):
	sequence = text_to_sequence(text, hps.text_cleaners)
	sequence = to_var(torch.IntTensor(sequence)[None, :]).long()
	mel = melspectrogram(load_wav(wav_path))
	r = mel.shape[1] % hps.n_frames_per_step
	n = mel.shape[1] - r
	mel_in = to_var(torch.Tensor([mel[:, :n]]))
	if mel_in.shape[2] < 1:
		return None
	sequence = torch.cat([sequence, sequence], 0)
	mel_in = torch.cat([mel_in, mel_in], 0)
	_, mel_outputs_postnet, _, _ = model.teacher_infer(sequence, mel_in)
	ret = mel
	ret[:, :n] = to_arr(mel_outputs_postnet[0])
	return ret
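
# The r-handling above is easy to get wrong when the frame count is already a
# multiple of n_frames_per_step; a numpy-only helper that covers both cases.
import numpy as np

def trim_to_multiple(mel, n_frames_per_step):
    """Drop trailing frames so the frame count is a multiple of n_frames_per_step."""
    r = mel.shape[1] % n_frames_per_step
    return mel if r == 0 else mel[:, :-r]

print(trim_to_multiple(np.zeros((80, 37)), 3).shape)   # (80, 36)
print(trim_to_multiple(np.zeros((80, 36)), 3).shape)   # (80, 36)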
Example #9
def build_mels(corpus_list=None):
    from utils.audio import get_spectrograms, load_wav
    if corpus_list is None:
        corpus_list = glob.iglob(os.path.join(transformed_path, '*'))
    else:
        corpus_list = [os.path.join(transformed_path, c) for c in corpus_list]

    for f in corpus_list:
        os.makedirs(os.path.join(f, 'mels'), exist_ok=True)
        lines = open(os.path.join(f, "metadata.csv"),
                     encoding='utf-8').read().splitlines()
        for l in tqdm.tqdm(lines):
            l = l.split('|')
            wav_path = os.path.join(f, 'proc_wavs', l[0] + '.wav')
            wav = load_wav(wav_path)
            mel = get_spectrograms(wav)
            np.save(os.path.join(f, 'mels', l[0] + '.npy'), mel)
Example #10
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    # print(len(spectrogram))
    # print(len(spectrogram[0]))
    # print(type(spectrogram))
    # print(np.shape(spectrogram))
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    # print(np.shape(mel_spectrogram))
    # print()

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #11
def files_to_list(fdir):
    f_list = []
    with open(os.path.join(fdir, 'metadata.csv'), encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('|')
            wav_path = os.path.join(fdir, 'wavs', '%s.wav' % parts[0])
            if hps.prep:
                wav = load_wav(wav_path, False)
                if wav.shape[0] < hps.seg_l:
                    wav = np.pad(wav, (0, hps.seg_l - wav.shape[0]),
                                 'constant',
                                 constant_values=(0, 0))
                mel = melspectrogram(wav).astype(np.float32)
                f_list.append([wav, mel])
            else:
                f_list.append(wav_path)
    if hps.prep and hps.pth is not None:
        with open(hps.pth, 'wb') as w:
            pickle.dump(f_list, w)
    return f_list
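
# A minimal sketch of the pad-then-cache idea above, with toy lengths and a
# hypothetical cache path.
import pickle
import numpy as np

seg_l = 16                                        # hypothetical segment length
wav = np.random.randn(10).astype(np.float32)
if wav.shape[0] < seg_l:
    wav = np.pad(wav, (0, seg_l - wav.shape[0]), 'constant', constant_values=(0, 0))

f_list = [[wav, None]]                            # mel omitted in this sketch
with open('f_list.pkl', 'wb') as w:               # hypothetical cache path
    pickle.dump(f_list, w)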
Example #12
def _process_wav(wav_path, audio_path, spc_path):
    wav = audio.load_wav(wav_path)
    if hparams.feature_type == 'mcc':
        # Extract mcc and f0
        spc = audio.extract_mcc(wav)
    else:
        # Extract mels
        spc = audio.melspectrogram(wav).astype(np.float32)

    # Align audios and mels
    hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    length_diff = len(spc) * hop_length - len(wav)
    wav = wav.reshape(-1, 1)
    if length_diff > 0:
        wav = np.pad(wav, [[0, length_diff], [0, 0]], 'constant')
    elif length_diff < 0:
        wav = wav[:hop_length * spc.shape[0]]

    np.save(audio_path, wav)
    np.save(spc_path, spc)
    return (audio_path, spc_path, spc.shape[0])
Example #13
def _process_utterance(out_dir, index, wav_path, text):

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'meta_spec_%05d.npy' % index
    mel_filename = 'meta_mel_%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #14
def _process_utterance(mfcc_dir, wav_dir, index, wav_path, hparams, mode):
	"""
	Preprocesses a single utterance wav/text pair

	this writes the mfcc to disk and return a tuple to write
	to the train.txt file

	Args:
		- mfcc_dir: the directory to write the mfcc into
		- linear_dir: the directory to write the linear spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the numeric index to use in the spectrogram filename
		- wav_path: path to the audio file containing the speech input
		- text: text spoken in the input audio file
		- hparams: hyper parameters

	Returns:
		- A tuple: (audio_filename, mfcc_filename, linear_filename, time_steps, mfcc_frames, linear_frames, text)
	"""

	try:
		# Load the audio as numpy array
		wav_full = audio.load_wav(wav_path, sr=hparams.sample_rate)
	except FileNotFoundError: #catch missing wav exception
		print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
			wav_path))
		return None

	#M-AILABS extra silence specific
	if hparams.trim_silence:
		wav_full = audio.trim_silence(wav_full, hparams)

	# Preprocess Audio & Extract MFCC (mfcc + d + a)
	sample_idx = 0
	sample_metadata = []

	if (mode == "train") or (mode == "post_train"):
		# Add the same size slice from the end
		if wav_full.shape[0] >= hparams.sample_size:
			n_slice = int(np.floor(wav_full.shape[0]/hparams.sample_size))
			samples = wav_full[:n_slice * hparams.sample_size].reshape((n_slice, hparams.sample_size))
			if wav_full.shape[0] % hparams.sample_size != 0:
				## FOR UNIT SEARCH : slice each audio by sample_size
				last_slice = wav_full[-hparams.sample_size:]
				samples = np.vstack((samples, last_slice))
		else:
			samples = [wav_full]
	else:
		samples = [wav_full]


	for wav in samples:

		#Pre-emphasize
		preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)

		#rescale wav
		if hparams.rescale:
			wav = wav / np.abs(wav).max() * hparams.rescaling_max
			preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

			#Assert all audio is in [-1, 1]
			if (wav > 1.).any() or (wav < -1.).any():
				raise RuntimeError('wav has invalid value: {}'.format(wav_path))
			if (preem_wav > 1.).any() or (preem_wav < -1.).any():
				raise RuntimeError('wav has invalid value: {}'.format(wav_path))

		#Mu-law quantize
		if is_mulaw_quantize(hparams.input_type):
			#[0, quantize_channels)
			out = mulaw_quantize(wav, hparams.quantize_channels)

			# #Trim silences
			# start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
			# wav = wav[start: end]
			# preem_wav = preem_wav[start: end]
			# out = out[start: end]

			constant_values = mulaw_quantize(0, hparams.quantize_channels)
			out_dtype = np.int16

		elif is_mulaw(hparams.input_type):
			#[-1, 1]
			out = mulaw(wav, hparams.quantize_channels)
			constant_values = mulaw(0., hparams.quantize_channels)
			out_dtype = np.float32

		else:
			#[-1, 1]
			out = wav
			constant_values = 0.
			out_dtype = np.float32

		# Compute mfcc
		mfcc = audio.mfcc(wav, hparams)
		mfcc_frames = mfcc.shape[0]

		# # Compute the mel scale spectrogram from the wav
		# mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32)
		# mel_frames = mel_spectrogram.shape[1]

		if mfcc_frames > hparams.max_mel_frames and hparams.clip_mels_length:
			return None

		#Ensure time resolution adjustement between audio and mel-spectrogram
		l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))
		#Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
		out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)

		assert len(out) >= mfcc_frames * audio.get_hop_size(hparams)

		#time resolution adjustement
		#ensure length of raw audio is multiple of hop size so that we can use
		out = out[:int(np.ceil(mfcc_frames/hparams.vqvae_down_freq) * hparams.vqvae_down_freq * audio.get_hop_size(hparams))]
		assert len(out) % audio.get_hop_size(hparams) == 0
		time_steps = len(out)

		# Write the spectrogram and audio to disk
		audio_filename = os.path.join(wav_dir, 'audio-{}-{}.npy'.format(index, sample_idx))
		mfcc_filename = os.path.join(mfcc_dir, 'mfcc-{}-{}.npy'.format(index, sample_idx))
		np.save(audio_filename, out.astype(out_dtype), allow_pickle=False)
		np.save(mfcc_filename, mfcc, allow_pickle=False)

		#global condition features
		if hparams.gin_channels > 0:
			if (mode == "train") or (mode == "post_train"):
				speaker_id = hparams.speakers.index(index[:4])
			elif mode == "synth":
				speaker_id = 0
			else:
				speaker_id = '<no_g>'
		else:
			speaker_id = '<no_g>'

		sample_metadata.append((audio_filename, mfcc_filename, mfcc_filename, speaker_id, time_steps, mfcc_frames))
		sample_idx += 1


	return sample_metadata
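
# A toy run of the fixed-size slicing above; the trailing slice is taken from
# the end of the signal, matching the comment in the code.
import numpy as np

sample_size = 4                                   # hypothetical slice length
wav_full = np.arange(10, dtype=np.float32)

n_slice = int(np.floor(wav_full.shape[0] / sample_size))
samples = wav_full[:n_slice * sample_size].reshape((n_slice, sample_size))
if wav_full.shape[0] % sample_size != 0:
    samples = np.vstack((samples, wav_full[-sample_size:]))
print(samples.shape)   # (3, 4): two full slices plus one slice from the end
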
def get_mel(filename):
    wav = load_wav(filename)
    mel = melspectrogram(wav).astype(np.float32)
    return mel
Example #16
def _process_utterance(out_dir, index, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    this writes the mel scale spectogram to disk and return a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    # Trim lead/trail silences
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Pre-emphasize
    preem_wav = audio.preemphasis(wav, hparams.preemphasis,
                                  hparams.preemphasize)

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

        # Assert all audio is in [-1, 1]
        if (wav > 1.).any() or (wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))
        if (preem_wav > 1.).any() or (preem_wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # [-1, 1]
    out = wav
    constant_values = 0.

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(preem_wav,
                                           hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if (mel_frames > hparams.max_mel_frames and hparams.clip_mels_length) or (
            hparams.min_text_tokens > len(text)
            or hparams.min_mel_frames > mel_frames):
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(preem_wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustement between audio and mel-spectrogram
    l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.hop_size,
                                        hparams.pad_sides)

    # Reflect pad audio signal on the right (Just like it's done in Librosa to avoid frame inconsistency)
    out = np.pad(out, (l_pad, r_pad),
                 mode='constant',
                 constant_values=constant_values)

    assert len(out) >= mel_frames * hparams.hop_size

    # time resolution adjustement
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * hparams.hop_size]
    assert len(out) % hparams.hop_size == 0
    time_steps = len(out)
    npz_filename = '{}.npz'.format(index)
    mel_spectrogram = mel_spectrogram.T
    linear_spectrogram = linear_spectrogram.T

    r = hparams.reduction_factor
    if hparams.symmetric_mels:
        _pad_value = -hparams.max_abs_value
    else:
        _pad_value = 0.
    target_length = len(linear_spectrogram)
    mel_spectrogram = np.pad(mel_spectrogram, [[r, r], [0, 0]],
                             "constant",
                             constant_values=_pad_value)
    linear_spectrogram = np.pad(linear_spectrogram, [[r, r], [0, 0]],
                                "constant",
                                constant_values=_pad_value)
    target_length = target_length + 2 * r
    padded_target_length = (target_length // r + 1) * r
    num_pad = padded_target_length - target_length
    stop_token_target = np.pad(np.zeros(padded_target_length - 1,
                                        dtype=np.float32), (0, 1),
                               "constant",
                               constant_values=1)
    mel_spectrogram = np.pad(mel_spectrogram, ((0, num_pad), (0, 0)),
                             "constant",
                             constant_values=_pad_value)
    linear_spectrogram = np.pad(linear_spectrogram, ((0, num_pad), (0, 0)),
                                "constant",
                                constant_values=_pad_value)

    data = {
        'mel': mel_spectrogram,
        'linear': linear_spectrogram,
        'input_data': text_to_sequence(text),  # eos(~)
        'time_steps': time_steps,
        'stop_token_target': stop_token_target,
        'mel_frames': padded_target_length,
        'text': text,
    }
    np.savez(os.path.join(out_dir, npz_filename), **data, allow_pickle=False)
    # Return a tuple describing this training example
    return npz_filename, time_steps, padded_target_length, text
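
# A toy computation of the stop-token target above (lengths are made up; the
# target is zero everywhere except the final padded frame).
import numpy as np

r = 3                                             # hypothetical reduction factor
target_length = 10 + 2 * r                        # frames after the r-frame edge padding
padded_target_length = (target_length // r + 1) * r
num_pad = padded_target_length - target_length
stop_token_target = np.pad(np.zeros(padded_target_length - 1, dtype=np.float32),
                           (0, 1), "constant", constant_values=1)
print(padded_target_length, num_pad, stop_token_target[-3:])   # 18 2 [0. 0. 1.]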
Example #17
            print("Completed generation " + str(i + 1) + ".")


# File processing
if __name__ == '__main__':
    random.seed(2018)
    print(
        "\nDue to dependence on open-source libraries, warning messages may appear."
    )

    print("Loading .wav files...")
    filenames = os.listdir("input")
    filenames = [f for f in filenames if f.endswith('.wav')]
    wavs = []
    for f in filenames:
        wav = audio.load_wav("input/" + f, hparams.sample_rate)
        wavs.append(wav)

    species_names = filenames
    GA = genetic_algorithm(wavs)
    GA.iterate()

    print("Saving gene spectrograms...")
    fig = plt.subplots()
    plt.draw()
    for i in range(len(GA.originals)):
        genes = GA.gene_pool[i]
        for n in range(len(genes)):
            w, h = genes[n]
            plt.subplot(len(genes), 2, 2 * n + 1)
            spectrum = np.log10(np.maximum(1e-5, w))
# -*- coding: utf-8 -*-
import numpy as np
from utils import audio
from hparams import hparams as hps

path = r'./data/000001.wav'

# Step 1: load the audio; the data is already in [-1, 1], so no normalization is needed
wav = audio.load_wav(path, hps.sample_rate)

# Step 2: trim leading and trailing silence
if hps.trim_silence:
    wav = audio.trim_silence(wav, hps)

# Step 3: compute the mel spectrogram
mel_spectrogram = audio.melspectrogram(wav, hps).astype(np.float32)

# Step 4: compute the linear (magnitude) spectrogram
linear_spectrogram = audio.linearspectrogram(wav, hps).astype(np.float32)

savename = path.split('/')[-1].split('.')[0]
mel_filename = './data/mel-{}.npy'.format(savename)
linear_filename = './data/linear-{}.npy'.format(savename)

np.save(mel_filename, mel_spectrogram.T, allow_pickle=False)
np.save(linear_filename, linear_spectrogram.T, allow_pickle=False)
from utils import audio
from hparams import hparams
import numpy as np
from griffin_lim import inv_spectrogram, tf
import os

if __name__ == '__main__':
    data_folder = "data"
    wavs = [
        os.path.join(data_folder, file[:-4]) for file in os.listdir(data_folder)
        if file.endswith(".wav")
    ]
    outputs_py = [file + ".py.gen.wav" for file in wavs]
    outputs_tf = [file + ".tf.gen.wav" for file in wavs]
    wavs = [
        audio.load_wav(wav_path + ".wav", hparams.sample_rate)
        for wav_path in wavs
    ]
    spectrogram = [audio.spectrogram(wav).astype(np.float32) for wav in wavs]
    print("Linear spectrograms dim: ")
    print(spectrogram[0].shape)
    # --------------------------------- librosa Version ---------------------------------
    # convert back
    gens = [audio.inv_spectrogram(s) for s in spectrogram]

    for gen, output in zip(gens, outputs_py):
        audio.save_wav(gen, output)

    # --------------------------------- TensorFlow Version ---------------------------------

    samples = [inv_spectrogram(spec) for spec in spectrogram]
Example #20
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array. Resampled if needed
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    # TODO
    if exists(lab_path) and False:
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=20)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=20)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'cmu_arctic-audio-%05d.npy' % index
    mel_filename = 'cmu_arctic-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text, speaker_id)
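
# A numpy-only sketch of mu-law quantization as a stand-in for the
# P.mulaw_quantize call above; this illustrates the transform and is not the
# library's exact implementation.
import numpy as np

def mulaw_quantize_sketch(x, quantize_channels=256):
    """Compand x in [-1, 1] with mu-law, then map to integer bins [0, mu]."""
    mu = quantize_channels - 1
    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
    return ((y + 1) / 2 * mu + 0.5).astype(np.int16)

print(mulaw_quantize_sketch(np.array([-1.0, 0.0, 1.0])))   # [  0 128 255]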
Example #21
_max_out_length = 700

f = open(tdd_file, encoding='utf-8')
ctr = 0
for line in f:
    if len(line) > 2:
        ctr += 1
        line = line.split('\n')[0]

        fname = line.split()[0]
        phones = ' '.join(line.split()[1:])

        if generate_feats_flag:
            wav_fname = wav_dir + '/' + fname + '.wav'
            wav = audio.load_wav(wav_fname)
            max_samples = _max_out_length * 5 / 1000 * 16000
            spectrogram = audio.spectrogram(wav).astype(np.float32)
            n_frames = spectrogram.shape[1]
            mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
            lspec_fname = lspec_dir + '/' + fname + '_lspec.npy'
            mspec_fname = mspec_dir + '/' + fname + '_mspec.npy'
            np.save(lspec_fname, spectrogram.T, allow_pickle=False)
            np.save(mspec_fname, mel_spectrogram.T, allow_pickle=False)

            g = open(data_file, 'a')
            g.write(lspec_fname + '|' + mspec_fname + '|' + str(n_frames) + '| ' + phones + '\n')
            g.close()

            g = open(feats_dir + '/' + fname + '.feats', 'w')
            for phone in phones.split():
Example #22
def extract_audio_mels(audio_path):
    wav = audio.load_wav(audio_path)
    mels = audio.melspectrogram(wav)
    return mels
Example #23
def _process_utterance(out_dir, index, audio_filepath, text):
    # Load the audio to a numpy array:
    wav_whole = audio.load_wav(audio_filepath)

    if hparams.rescaling:
        wav_whole = wav_whole / np.abs(wav_whole).max() * hparams.rescaling_max

    # This is a librivox source, so the audio files are going to be v. long
    # compared to a typical 'utterance' : So split the wav into chunks

    tup_results = []

    n_samples = int(8.0 * hparams.sample_rate)  # All 8 second utterances
    n_chunks = wav_whole.shape[0] // n_samples

    for chunk_idx in range(n_chunks):
        chunk_start, chunk_end = chunk_idx * n_samples, (chunk_idx + 1) * n_samples
        if chunk_idx == n_chunks - 1:  # This is the last chunk - allow it to extend to the end of the file
            chunk_end = None
        wav = wav_whole[chunk_start: chunk_end]

        # Mu-law quantize
        if is_mulaw_quantize(hparams.input_type):
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels)

            # Trim silences
            start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
            wav = wav[start:end]
            out = out[start:end]
            constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
            out_dtype = np.int16
        elif is_mulaw(hparams.input_type):
            # [-1, 1]
            out = P.mulaw(wav, hparams.quantize_channels)
            constant_values = P.mulaw(0.0, hparams.quantize_channels)
            out_dtype = np.float32
        else:
            # [-1, 1]
            out = wav
            constant_values = 0.0
            out_dtype = np.float32

        # Compute a mel-scale spectrogram from the trimmed wav:
        # (N, D)
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
        # lws pads zeros internally before performing stft
        # this is needed to adjust time resolution between audio and mel-spectrogram
        l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

        # zero pad for quantized signal
        out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
        N = mel_spectrogram.shape[0]
        assert len(out) >= N * audio.get_hop_size()

        # time resolution adjustment
        # ensure length of raw audio is multiple of hop_size so that we can use
        # transposed convolution to upsample
        out = out[:N * audio.get_hop_size()]
        assert len(out) % audio.get_hop_size() == 0

        timesteps = len(out)

        # Write the spectrograms to disk:
        audio_filename = 'librivox-audio-%04d-%05d.npy' % (index, chunk_idx,)
        mel_filename = 'librivox-mel-%04d-%05d.npy' % (index, chunk_idx,)
        text_idx = '%s - %05d' % (text, chunk_idx,)
        np.save(os.path.join(out_dir, audio_filename),
                out.astype(out_dtype), allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename),
                mel_spectrogram.astype(np.float32), allow_pickle=False)

        # Add results tuple describing this training example:
        tup_results.append((audio_filename, mel_filename, timesteps, text_idx))

    # Return all the audio results tuples (unpack in caller)
    return tup_results
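
# A toy run of the 8-second chunking above, with a tiny stand-in sample rate
# so the remainder handling is visible.
import numpy as np

sample_rate = 4                                   # toy stand-in sample rate
wav_whole = np.arange(70, dtype=np.float32)       # 17.5 "seconds" of audio
n_samples = int(8.0 * sample_rate)                # samples per chunk

n_chunks = wav_whole.shape[0] // n_samples
for chunk_idx in range(n_chunks):
    chunk_start, chunk_end = chunk_idx * n_samples, (chunk_idx + 1) * n_samples
    if chunk_idx == n_chunks - 1:                 # last chunk runs to the end of the file
        chunk_end = None
    print(chunk_idx, wav_whole[chunk_start:chunk_end].shape)
# 0 (32,)
# 1 (38,)  -- the final chunk absorbs the 6-sample remainder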
Example #24
from utils.audio import melspectrogram, inv_mel_spectrogram, load_wav, save_wav


wav_path = "LJ001-0008.wav"
raw_wav = load_wav(wav_path)
mel_spec = melspectrogram(raw_wav)
inv_wav = inv_mel_spectrogram(mel_spec)
save_wav(inv_wav, "inv.wav")

Example #25
def get_mel(wav_path):
    wav = load_wav(wav_path)
    return torch.Tensor(melspectrogram(wav).astype(np.float32))
Example #26
def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'bznsyp-audio-%05d.npy' % index
    mel_filename = 'bznsyp-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32),
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)
Example #27
def _process_utterance(out_dir, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    this writes the mel scale spectogram to disk and return a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)  # returns a 1-D wav array
        #Load an audio file as a floating point time series.
        #Audio will be automatically resampled to the given rate (default sr=22050).
        #To preserve the native sampling rate of the file, use sr=None. 
        #print('====wav====')
        #print(wav,wav.shape) (240001,)
    except FileNotFoundError: #catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
            wav_path))
        return None

    #rescale wav
    if hparams.rescaling:   # hparams.rescale = True
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
    #We rescale because it is assumed in Wavenet training that wavs are in [-1, 1] when computing the mixture loss. This is mainly coming from PixelCNN implementation.
    #https://github.com/Rayhane-mamah/Tacotron-2/issues/69

    #M-AILABS extra silence specific
    if hparams.trim_silence:  # hparams.trim_silence = True
        wav = audio.trim_silence(wav, hparams)   # Trim leading and trailing silence

    #Mu-law quantize; the default input_type is 'raw'
    #The quantization noise is from the analog to digital conversion. The mu-law compression actually reduces the noise and increases the dynamic range.
    #If you search a little bit in the code you will find that the input is always mu-law encoded here.
    #scalar_input only determines if the model uses a one-hot encoding for every data point of the input waveform, or just uses floating point values for each sample.
    if hparams.input_type=='mulaw-quantize':
        #[0, quantize_channels)
        out = audio.mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start: end]
        out = out[start: end]

        constant_values = audio.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif hparams.input_type=='mulaw':
        #[-1, 1]
        out = audio.mulaw(wav, hparams.quantize_channels)
        constant_values = audio.mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:  # raw
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    #print('====mel_spectrogram====')
    #print(mel_spectrogram,mel_spectrogram.shape) #(80,797),(80,801) ...
    mel_frames = mel_spectrogram.shape[1]
    #print('===mel frame====')
    #print(mel_frames) 801, 797 ,...
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:   # hparams.max_mel_frames = 1000, hparams.clip_mels_length = True
        return None

    #Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    #print('====linear_spectrogram====')
    #print(linear_spectrogram,linear_spectrogram.shape) #(1025,787),(1025,801)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:    # hparams.use_lws = False
        #Ensure time resolution adjustement between audio and mel-spectrogram
        fft_size = hparams.fft_size if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        #Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        #Ensure time resolution adjustement between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.fft_size, audio.get_hop_size(hparams)) #1024 == 2048//2 == fft_size//2
        #print('===pad===')
        #print(pad) 
        #Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
        #print(out,out.shape) #(240001,)
        out = np.pad(out, pad, mode='reflect')  # shape: (242049,) after padding
        #print(out,out.shape) #(242049,)
        #print('===out====')
        #print(out,out.shape)

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    #time resolution adjustement
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]  # trimmed to 240300 samples
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)
    #print(audio.get_hop_size(hparams)) : 300
    #print(out,out.shape) #(240300,) = 801*300
    # Write the spectrogram and audio to disk
    wav_id = os.path.splitext(os.path.basename(wav_path))[0]  # file name without the extension
    #print('====wav_id====')
    #print(wav_id)
    # Write the spectrograms to disk:
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    linear_filename = '{}-linear.npy'.format(wav_id)
    npz_filename = '{}.npz'.format(wav_id)
    npz_flag=True
    if npz_flag:
        # Use the same keys as the Tacotron code.
        data = {
            'audio': out.astype(out_dtype),
            'mel': mel_spectrogram.T,  
            'linear': linear_spectrogram.T,
            'time_steps': time_steps,
            'mel_frames': mel_frames,
            'text': text,
            'tokens': text_to_sequence(text),   # a trailing "1" (the eos token "~") is appended
            'loss_coeff': 1  # For Tacotron
        }
        #print('=====data====')
        #print(data)
        np.savez(os.path.join(out_dir, npz_filename), **data, allow_pickle=False)  # save several arrays into a single uncompressed .npz file
    else:
        np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    #print('====mel_frames====')
    #print(mel_frames)
    #print('====time_steps====')
    #print(time_steps)
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text,npz_filename)
Example #28
 def get_mel(self, filename):
     wav = load_wav(filename)
     mel = melspectrogram(wav).astype(np.float32)
     return torch.Tensor(mel)
Example #29
def _process_utterance(out_dir, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    this writes the mel scale spectogram to disk and return a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    #rescale wav
    if hparams.rescaling:  # hparams.rescale = True
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:  # hparams.trim_silence = True
        wav = audio.trim_silence(wav,
                                 hparams)  # Trim leading and trailing silence

    #Mu-law quantize; the default input_type is 'raw'
    if hparams.input_type == 'mulaw-quantize':
        #[0, quantize_channels)
        out = audio.mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = audio.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif hparams.input_type == 'mulaw':
        #[-1, 1]
        out = audio.mulaw(wav, hparams.quantize_channels)
        constant_values = audio.mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:  # raw
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:  # hparams.max_mel_frames = 1000, hparams.clip_mels_length = True
        return None

    #Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:  # hparams.use_lws = False
        #Ensure time resolution adjustement between audio and mel-spectrogram
        fft_size = hparams.fft_size if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        #Zero pad audio signal
        out = np.pad(out, (l, r),
                     mode='constant',
                     constant_values=constant_values)
    else:
        #Ensure time resolution adjustement between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.fft_size,
                                   audio.get_hop_size(hparams))

        #Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    #time resolution adjustement
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    wav_id = os.path.splitext(os.path.basename(wav_path))[0]

    # Write the spectrograms to disk:
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    linear_filename = '{}-linear.npy'.format(wav_id)
    npz_filename = '{}.npz'.format(wav_id)
    npz_flag = True
    if npz_flag:
        # Use the same keys as the Tacotron code.
        data = {
            'audio': out.astype(out_dtype),
            'mel': mel_spectrogram.T,
            'linear': linear_spectrogram.T,
            'time_steps': time_steps,
            'mel_frames': mel_frames,
            'text': text,
            'tokens': text_to_sequence(text),  # a trailing "1" (the eos token "~") is appended
            'loss_coeff': 1  # For Tacotron
        }

        np.savez(os.path.join(out_dir, npz_filename),
                 **data,
                 allow_pickle=False)
    else:
        np.save(os.path.join(out_dir, audio_filename),
                out.astype(out_dtype),
                allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename),
                mel_spectrogram.T,
                allow_pickle=False)
        np.save(os.path.join(out_dir, linear_filename),
                linear_spectrogram.T,
                allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text, npz_filename)
Example #30
# encoding: utf-8
from utils import audio
from hparams import hparams
import numpy as np
import io
from griffin_lim import inv_spectrogram, tf

if __name__ == '__main__':
    wavs = ["data/000001.wav", "data/000002.wav"]
    outputs_py = ["data/000001.gen.wav", "data/000002.gen.wav"]
    outputs_tf = ["data/000001.gen.tf.wav", "data/000002.gen.tf.wav"]
    wavs = [audio.load_wav(wav_path, hparams.sample_rate) for wav_path in wavs]
    spectrogram = [audio.spectrogram(wav).astype(np.float32) for wav in wavs]
    print(spectrogram[0].shape)
    print(spectrogram[1].shape)

    # --------------------------------- librosa Version ---------------------------------
    # convert back
    gens = [audio.inv_spectrogram(s) for s in spectrogram]

    for gen, output in zip(gens, outputs_py):
        out = io.BytesIO()
        audio.save_wav(gen, out)

        with open(output, "wb") as f:
            f.write(out.getvalue())

    # --------------------------------- TensorFlow Version ---------------------------------

    samples = [inv_spectrogram(spec) for spec in spectrogram]