def main():
    data_folder = "data"
    wavs = [
        os.path.join(data_folder, file[:-4])
        for file in os.listdir(data_folder) if file.endswith(".wav")
    ]
    outputs_lws = [file + ".lws.gen.wav" for file in wavs]
    wavs = [
        audio.load_wav(wav_path + ".wav", hparams.sample_rate)
        for wav_path in wavs
    ]

    lws_processor = lws.lws(512, 128, mode="speech")  # 512: window length; 128: window shift

    for i, x in enumerate(wavs):
        X = lws_processor.stft(x)  # where x is a single-channel waveform
        X0 = np.abs(X)  # Magnitude spectrogram
        print('{:6}: {:5.2f} dB'.format('Abs(X)', lws_processor.get_consistency(X0)))

        # Reconstruction from magnitude (in general, one can reconstruct from an
        # initial complex spectrogram)
        X1 = lws_processor.run_lws(X0)
        print(X1.shape)
        print('{:6}: {:5.2f} dB'.format('LWS', lws_processor.get_consistency(X1)))

        wav = lws_processor.istft(X1).astype(np.float32)
        audio.save_wav(wav, outputs_lws[i])
def extract_mel(wav_filename, out_wav_path, out_dir, key, hparams, args):
    if not os.path.exists(wav_filename):
        print("Wav file {} doesn't exist.".format(wav_filename))
        return None

    wav = audio.load_wav(wav_filename, sr=hparams.sample_rate)

    # Process wav samples
    wav = audio.trim_silence(wav, hparams)
    n_samples = len(wav)

    # Extract mel spectrogram
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    n_frames = mel_spectrogram.shape[1]
    if n_frames > hparams.max_acoustic_length:
        print("Ignore wav {} because the frame number {} is too long (max {} frames in hparams.yaml)."
              .format(wav_filename, n_frames, hparams.max_acoustic_length))
        return None

    # Align features
    desired_frames = int(min(n_samples / hparams.hop_size, n_frames))
    wav = wav[:desired_frames * hparams.hop_size]
    mel_spectrogram = mel_spectrogram[:, :desired_frames]
    n_samples = wav.shape[0]
    n_frames = mel_spectrogram.shape[1]
    assert n_samples / hparams.hop_size == n_frames

    # Save intermediate acoustic features
    mel_filename = os.path.join(out_dir, key + '.npy')
    np.save(mel_filename, mel_spectrogram.T, allow_pickle=False)
    audio.save_wav(wav, out_wav_path, hparams)

    return (wav_filename, mel_filename, n_samples, n_frames)
def _process_utterance(out_dir, index, wav_path, text):
    """
    Preprocesses a single utterance wav/text pair.

    This writes the mel-scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - out_dir: the directory to write the spectrograms into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file

    Returns:
        - A tuple: (mel_filename, n_frames, text)
    """
    # Load the audio as a numpy array
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav to determine n_frames
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute the mel-scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrogram to disk
    mel_filename = 'ljspeech-mel-{:05d}.npy'.format(index)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (mel_filename, n_frames, text)
def infer(model, src_pth):
    src = load_wav(src_pth, seg=False)
    mel = melspectrogram(src).astype(np.float32)
    # mode() is assumed here to move the tensor to the configured device (CPU/GPU)
    mel = mode(torch.Tensor([mel]))
    with torch.no_grad():
        res = model.infer(mel)[0]
    return [src, to_arr(res)]
def wav2spec(self, wav_path):
    wav = audio.load_wav(wav_path)
    spec = audio.melspectrogram(wav).astype(np.float32)
    spec = spec.transpose()
    feat_size = spec.shape[1]
    pad_spec = np.zeros(
        [(len(spec) + self.outputs_per_step - 1) // self.outputs_per_step * self.outputs_per_step,
         feat_size],
        dtype='float32')
    pad_spec[:len(spec)] = spec
    return pad_spec.reshape([-1, self.outputs_per_step * feat_size])
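# A minimal sketch (pure NumPy, made-up shapes) of the pad-and-reshape step used by
# wav2spec above: the frame axis is padded to a multiple of outputs_per_step and the
# frames are then grouped so each row holds outputs_per_step consecutive frames.
# The value outputs_per_step = 3 and the 7x80 spectrogram are illustrative assumptions.
import numpy as np

outputs_per_step = 3                                   # assumed reduction factor
spec = np.random.rand(7, 80).astype('float32')         # 7 frames, 80 mel bins (example)

n_frames, feat_size = spec.shape
padded_len = (n_frames + outputs_per_step - 1) // outputs_per_step * outputs_per_step
pad_spec = np.zeros([padded_len, feat_size], dtype='float32')
pad_spec[:n_frames] = spec

grouped = pad_spec.reshape([-1, outputs_per_step * feat_size])
print(grouped.shape)                                   # (3, 240): ceil(7 / 3) rows of 3 * 80 features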
def _process_wav(wav_path, audio_path, spc_path):
    wav = audio.load_wav(wav_path)
    wav1, wav2, wav3, wav4 = audio.subband(wav)

    if hparams.feature_type == 'mcc':
        # Extract mcc and f0
        spc = audio.extract_mcc(wav)
    else:
        # Extract mels
        spc = audio.melspectrogram(wav).astype(np.float32)

    # Align audios and mels (the subband signals are assumed to be decimated by 4,
    # hence frame_shift_ms / 4000 instead of / 1000)
    hop_length = int(hparams.frame_shift_ms / 4000 * hparams.sample_rate)
    length_diff_1 = len(spc) * hop_length - len(wav1)
    length_diff_2 = len(spc) * hop_length - len(wav2)
    length_diff_3 = len(spc) * hop_length - len(wav3)
    length_diff_4 = len(spc) * hop_length - len(wav4)

    wav1 = wav1.reshape(-1, 1)
    if length_diff_1 > 0:
        wav1 = np.pad(wav1, [[0, length_diff_1], [0, 0]], 'constant')
    elif length_diff_1 < 0:
        wav1 = wav1[:hop_length * spc.shape[0]]

    wav2 = wav2.reshape(-1, 1)
    if length_diff_2 > 0:
        wav2 = np.pad(wav2, [[0, length_diff_2], [0, 0]], 'constant')
    elif length_diff_2 < 0:
        wav2 = wav2[:hop_length * spc.shape[0]]

    wav3 = wav3.reshape(-1, 1)
    if length_diff_3 > 0:
        wav3 = np.pad(wav3, [[0, length_diff_3], [0, 0]], 'constant')
    elif length_diff_3 < 0:
        wav3 = wav3[:hop_length * spc.shape[0]]

    wav4 = wav4.reshape(-1, 1)
    if length_diff_4 > 0:
        wav4 = np.pad(wav4, [[0, length_diff_4], [0, 0]], 'constant')
    elif length_diff_4 < 0:
        wav4 = wav4[:hop_length * spc.shape[0]]

    fid1 = os.path.basename(audio_path).replace('.npy', '_band1.npy')
    fid2 = os.path.basename(audio_path).replace('.npy', '_band2.npy')
    fid3 = os.path.basename(audio_path).replace('.npy', '_band3.npy')
    fid4 = os.path.basename(audio_path).replace('.npy', '_band4.npy')
    fid1 = os.path.join('training_data/audios', fid1)
    fid2 = os.path.join('training_data/audios', fid2)
    fid3 = os.path.join('training_data/audios', fid3)
    fid4 = os.path.join('training_data/audios', fid4)
    np.save(fid1, wav1)
    np.save(fid2, wav2)
    np.save(fid3, wav3)
    np.save(fid4, wav4)
    np.save(spc_path, spc)

    return (fid1, fid2, fid3, fid4, spc_path, spc.shape[0])
def __getitem__(self, index):
    if hps.prep:
        wav, mel = self.f_list[index]
        seg_ml = hps.seg_l // hps.frame_shift + 1
        ms = np.random.randint(0, mel.shape[1] - seg_ml) if mel.shape[1] > seg_ml else 0
        ws = hps.frame_shift * ms
        wav = wav[ws:ws + hps.seg_l]
        mel = mel[:, ms:ms + seg_ml]
    else:
        wav = load_wav(self.f_list[index])
        mel = melspectrogram(wav).astype(np.float32)
    return wav, mel
def infer(wav_path, text, model):
    sequence = text_to_sequence(text, hps.text_cleaners)
    sequence = to_var(torch.IntTensor(sequence)[None, :]).long()
    mel = melspectrogram(load_wav(wav_path))

    # Drop trailing frames so the length is a multiple of n_frames_per_step
    # (guard against r == 0, where mel[:, :-0] would be empty)
    r = mel.shape[1] % hps.n_frames_per_step
    end = mel.shape[1] - r
    mel_in = to_var(torch.Tensor([mel[:, :end]]))
    if mel_in.shape[2] < 1:
        return None

    sequence = torch.cat([sequence, sequence], 0)
    mel_in = torch.cat([mel_in, mel_in], 0)
    _, mel_outputs_postnet, _, _ = model.teacher_infer(sequence, mel_in)

    ret = mel
    ret[:, :end] = to_arr(mel_outputs_postnet[0])
    return ret
def build_mels(corpus_list=None):
    from utils.audio import get_spectrograms, load_wav
    if corpus_list is None:
        corpus_list = glob.iglob(os.path.join(transformed_path, '*'))
    else:
        corpus_list = [os.path.join(transformed_path, c) for c in corpus_list]
    for f in corpus_list:
        os.makedirs(os.path.join(f, 'mels'), exist_ok=True)
        lines = open(os.path.join(f, "metadata.csv"), encoding='utf-8').read().splitlines()
        for l in tqdm.tqdm(lines):
            l = l.split('|')
            wav_path = os.path.join(f, 'proc_wavs', l[0] + '.wav')
            wav = load_wav(wav_path)
            mel = get_spectrograms(wav)
            np.save(os.path.join(f, 'mels', l[0] + '.npy'), mel)
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple
    to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file

    Returns:
        A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
def files_to_list(fdir):
    f_list = []
    with open(os.path.join(fdir, 'metadata.csv'), encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('|')
            wav_path = os.path.join(fdir, 'wavs', '%s.wav' % parts[0])
            if hps.prep:
                wav = load_wav(wav_path, False)
                if wav.shape[0] < hps.seg_l:
                    wav = np.pad(wav, (0, hps.seg_l - wav.shape[0]),
                                 'constant', constant_values=(0, 0))
                mel = melspectrogram(wav).astype(np.float32)
                f_list.append([wav, mel])
            else:
                f_list.append(wav_path)
    if hps.prep and hps.pth is not None:
        with open(hps.pth, 'wb') as w:
            pickle.dump(f_list, w)
    return f_list
def _process_wav(wav_path, audio_path, spc_path):
    wav = audio.load_wav(wav_path)

    if hparams.feature_type == 'mcc':
        # Extract mcc and f0
        spc = audio.extract_mcc(wav)
    else:
        # Extract mels
        spc = audio.melspectrogram(wav).astype(np.float32)

    # Align audios and mels
    hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    length_diff = len(spc) * hop_length - len(wav)
    wav = wav.reshape(-1, 1)
    if length_diff > 0:
        wav = np.pad(wav, [[0, length_diff], [0, 0]], 'constant')
    elif length_diff < 0:
        wav = wav[:hop_length * spc.shape[0]]

    np.save(audio_path, wav)
    np.save(spc_path, spc)
    return (audio_path, spc_path, spc.shape[0])
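# A minimal, self-contained sketch (NumPy only, invented numbers) of the alignment rule
# used in _process_wav above: after padding or trimming, the waveform length must equal
# n_frames * hop_length so each feature frame maps to exactly hop_length samples.
import numpy as np

hop_length = 200                                     # assumed frame shift in samples
n_frames = 5                                         # assumed number of feature frames
wav = np.random.randn(hop_length * n_frames - 37)    # slightly too short on purpose

length_diff = n_frames * hop_length - len(wav)
if length_diff > 0:
    wav = np.pad(wav, (0, length_diff), 'constant')
elif length_diff < 0:
    wav = wav[:n_frames * hop_length]

assert len(wav) == n_frames * hop_length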
def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'meta_spec_%05d.npy' % index
    mel_filename = 'meta_mel_%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
def _process_utterance(mfcc_dir, wav_dir, index, wav_path, hparams, mode):
    """
    Preprocesses a single utterance.

    This writes the mfcc and preprocessed audio to disk and returns the metadata
    tuples to write to the train.txt file.

    Args:
        - mfcc_dir: the directory to write the mfcc into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the output filenames
        - wav_path: path to the audio file containing the speech input
        - hparams: hyper parameters
        - mode: one of "train", "post_train" or "synth"

    Returns:
        - A list of tuples: (audio_filename, mfcc_filename, mfcc_filename, speaker_id, time_steps, mfcc_frames)
    """
    try:
        # Load the audio as numpy array
        wav_full = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav_full = audio.trim_silence(wav_full, hparams)

    # Preprocess audio & extract MFCC (mfcc + d + a)
    sample_idx = 0
    sample_metadata = []
    if (mode == "train") or (mode == "post_train"):
        # FOR UNIT SEARCH: slice each audio into sample_size chunks,
        # adding a same-size slice taken from the end if the length is not a multiple
        if wav_full.shape[0] >= hparams.sample_size:
            n_slice = int(np.floor(wav_full.shape[0] / hparams.sample_size))
            samples = wav_full[:n_slice * hparams.sample_size].reshape((n_slice, hparams.sample_size))
            if wav_full.shape[0] % hparams.sample_size != 0:
                last_slice = wav_full[::-1][:hparams.sample_size]
                samples = np.vstack((samples, last_slice))
        else:
            samples = [wav_full]
    else:
        samples = [wav_full]

    for wav in samples:
        # Pre-emphasize
        preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)

        # Rescale wav
        if hparams.rescale:
            wav = wav / np.abs(wav).max() * hparams.rescaling_max
            preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

        # Assert all audio is in [-1, 1]
        if (wav > 1.).any() or (wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))
        if (preem_wav > 1.).any() or (preem_wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))

        # Mu-law quantize
        if is_mulaw_quantize(hparams.input_type):
            # [0, quantize_channels)
            out = mulaw_quantize(wav, hparams.quantize_channels)
            constant_values = mulaw_quantize(0, hparams.quantize_channels)
            out_dtype = np.int16
        elif is_mulaw(hparams.input_type):
            # [-1, 1]
            out = mulaw(wav, hparams.quantize_channels)
            constant_values = mulaw(0., hparams.quantize_channels)
            out_dtype = np.float32
        else:
            # [-1, 1]
            out = wav
            constant_values = 0.
            out_dtype = np.float32

        # Compute mfcc
        mfcc = audio.mfcc(wav, hparams)
        mfcc_frames = mfcc.shape[0]
        if mfcc_frames > hparams.max_mel_frames and hparams.clip_mels_length:
            return None

        # Ensure time resolution adjustment between audio and features
        l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))

        # Reflect pad audio signal (just like it's done in librosa to avoid frame inconsistency)
        out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)
        assert len(out) >= mfcc_frames * audio.get_hop_size(hparams)

        # Time resolution adjustment:
        # ensure the raw audio length is a multiple of the hop size (rounded up to the
        # VQ-VAE downsampling frequency) so it can be used for upsampling
        out = out[:int(np.ceil(mfcc_frames / hparams.vqvae_down_freq)
                       * hparams.vqvae_down_freq * audio.get_hop_size(hparams))]
        assert len(out) % audio.get_hop_size(hparams) == 0
        time_steps = len(out)

        # Write the mfcc and audio to disk
        audio_filename = os.path.join(wav_dir, 'audio-{}-{}.npy'.format(index, sample_idx))
        mfcc_filename = os.path.join(mfcc_dir, 'mfcc-{}-{}.npy'.format(index, sample_idx))
        np.save(audio_filename, out.astype(out_dtype), allow_pickle=False)
        np.save(mfcc_filename, mfcc, allow_pickle=False)

        # Global condition features
        if hparams.gin_channels > 0:
            if (mode == "train") or (mode == "post_train"):
                speaker_id = hparams.speakers.index(index[:4])
            elif mode == "synth":
                speaker_id = 0
        else:
            speaker_id = '<no_g>'

        sample_metadata.append((audio_filename, mfcc_filename, mfcc_filename,
                                speaker_id, time_steps, mfcc_frames))
        sample_idx += 1

    return sample_metadata
def get_mel(filename):
    wav = load_wav(filename)
    mel = melspectrogram(wav).astype(np.float32)
    return mel
def _process_utterance(out_dir, index, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair.

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - out_dir: the directory to write the features into
        - index: the index to use in the output filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (npz_filename, time_steps, padded_target_length, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # Trim lead/trail silences
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Pre-emphasize
    preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)

    # Rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

    # Assert all audio is in [-1, 1]
    if (wav > 1.).any() or (wav < -1.).any():
        raise RuntimeError('wav has invalid value: {}'.format(wav_path))
    if (preem_wav > 1.).any() or (preem_wav < -1.).any():
        raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # [-1, 1]
    out = wav
    constant_values = 0.

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    if (mel_frames > hparams.max_mel_frames and hparams.clip_mels_length) or (
            hparams.min_text_tokens > len(text) or hparams.min_mel_frames > mel_frames):
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(preem_wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # Sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.hop_size, hparams.pad_sides)

    # Reflect pad audio signal on the right (just like it's done in librosa to avoid frame inconsistency)
    out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * hparams.hop_size

    # Time resolution adjustment:
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * hparams.hop_size]
    assert len(out) % hparams.hop_size == 0
    time_steps = len(out)

    npz_filename = '{}.npz'.format(index)
    mel_spectrogram = mel_spectrogram.T
    linear_spectrogram = linear_spectrogram.T

    r = hparams.reduction_factor
    if hparams.symmetric_mels:
        _pad_value = -hparams.max_abs_value
    else:
        _pad_value = 0.

    # Pad r frames of silence on both ends, then pad up to a multiple of r
    target_length = len(linear_spectrogram)
    mel_spectrogram = np.pad(mel_spectrogram, [[r, r], [0, 0]],
                             "constant", constant_values=_pad_value)
    linear_spectrogram = np.pad(linear_spectrogram, [[r, r], [0, 0]],
                                "constant", constant_values=_pad_value)
    target_length = target_length + 2 * r
    padded_target_length = (target_length // r + 1) * r
    num_pad = padded_target_length - target_length
    stop_token_target = np.pad(np.zeros(padded_target_length - 1, dtype=np.float32),
                               (0, 1), "constant", constant_values=1)
    mel_spectrogram = np.pad(mel_spectrogram, ((0, num_pad), (0, 0)),
                             "constant", constant_values=_pad_value)
    linear_spectrogram = np.pad(linear_spectrogram, ((0, num_pad), (0, 0)),
                                "constant", constant_values=_pad_value)

    data = {
        'mel': mel_spectrogram,
        'linear': linear_spectrogram,
        'input_data': text_to_sequence(text),  # eos(~)
        'time_steps': time_steps,
        'stop_token_target': stop_token_target,
        'mel_frames': padded_target_length,
        'text': text,
    }
    np.savez(os.path.join(out_dir, npz_filename), **data, allow_pickle=False)

    # Return a tuple describing this training example
    return npz_filename, time_steps, padded_target_length, text
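# A tiny worked example (plain NumPy, invented numbers) of the padding arithmetic above:
# with an assumed reduction_factor r = 3 and 10 spectrogram frames, the target grows to
# 10 + 2*r = 16 frames after the silence pads, is then padded to the next multiple of r
# (18), and the stop-token target is all zeros except a 1 on the final frame.
import numpy as np

r = 3
target_length = 10 + 2 * r                             # 16, after the r-frame silence pads
padded_target_length = (target_length // r + 1) * r    # 18
num_pad = padded_target_length - target_length         # 2
stop_token_target = np.pad(np.zeros(padded_target_length - 1, dtype=np.float32),
                           (0, 1), "constant", constant_values=1)
print(padded_target_length, num_pad, stop_token_target[-3:])   # 18 2 [0. 0. 1.]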
print("Completed generation " + str(i + 1) + ".") # File processing if __name__ == '__main__': random.seed(2018) print( "\nDue to dependence on open-source libraries, warning messages may appear." ) print("Loading .wav files...") filenames = os.listdir("input") filenames = [f for f in filenames if f.endswith('.wav')] wavs = [] for f in filenames: wav = audio.load_wav("input/" + f, hparams.sample_rate) wavs.append(wav) species_names = filenames GA = genetic_algorithm(wavs) GA.iterate() print("Saving gene spectrograms...") fig = plt.subplots() plt.draw() for i in range(len(GA.originals)): genes = GA.gene_pool[i] for n in range(len(genes)): w, h = genes[n] plt.subplot(len(genes), 2, 2 * n + 1) spectrum = np.log10(np.maximum(1e-5, w))
# -*- coding: utf-8 -*-
import numpy as np
from utils import audio
from hparams import hparams as hps

path = r'./data/000001.wav'

# Step 1: load the audio; the samples are already in [-1, 1], so no normalization is needed
wav = audio.load_wav(path, hps.sample_rate)

# Step 2: trim leading and trailing silence
if hps.trim_silence:
    wav = audio.trim_silence(wav, hps)

# Step 3: compute the mel spectrogram
mel_spectrogram = audio.melspectrogram(wav, hps).astype(np.float32)

# Step 4: compute the linear spectrogram (magnitude spectrogram)
linear_spectrogram = audio.linearspectrogram(wav, hps).astype(np.float32)

savename = path.split('/')[-1].split('.')[0]
mel_filename = './data/mel-{}.npy'.format(savename)
linear_filename = './data/linear-{}.npy'.format(savename)
np.save(mel_filename, mel_spectrogram.T, allow_pickle=False)
np.save(linear_filename, linear_spectrogram.T, allow_pickle=False)
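# A short follow-up sketch, assuming the script above has already produced its .npy files:
# the features are saved transposed as (frames, bins), so loading them back and
# transposing restores the (bins, frames) layout produced by audio.melspectrogram and
# audio.linearspectrogram.
import numpy as np

mel = np.load('./data/mel-000001.npy').T        # (n_mels, n_frames)
linear = np.load('./data/linear-000001.npy').T  # (n_fft // 2 + 1, n_frames)
print(mel.shape, linear.shape)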
from utils import audio
from hparams import hparams
import numpy as np
from griffin_lim import inv_spectrogram, tf
import os

if __name__ == '__main__':
    data_folder = "data"
    wavs = [
        os.path.join(data_folder, file[:-4])
        for file in os.listdir(data_folder) if file.endswith(".wav")
    ]
    outputs_py = [file + ".py.gen.wav" for file in wavs]
    outputs_tf = [file + ".tf.gen.wav" for file in wavs]
    wavs = [
        audio.load_wav(wav_path + ".wav", hparams.sample_rate)
        for wav_path in wavs
    ]

    spectrogram = [audio.spectrogram(wav).astype(np.float32) for wav in wavs]
    print("Linear spectrograms dim: ")
    print(spectrogram[0].shape)

    # --------------------------------- librosa Version ---------------------------------
    # convert back
    gens = [audio.inv_spectrogram(s) for s in spectrogram]
    for gen, output in zip(gens, outputs_py):
        audio.save_wav(gen, output)

    # --------------------------------- TensorFlow Version ---------------------------------
    samples = [inv_spectrogram(spec) for spec in spectrogram]
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array. Resampled if needed
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    # TODO
    if exists(lab_path) and False:
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=20)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=20)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft;
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment:
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'cmu_arctic-audio-%05d.npy' % index
    mel_filename = 'cmu_arctic-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text, speaker_id)
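# A self-contained sketch (NumPy only) of what mu-law companding and quantization do,
# for readers unfamiliar with the P.mulaw / P.mulaw_quantize calls used above. This is
# the standard textbook formula, not the library's exact implementation, and the helper
# names below are local to this sketch.
import numpy as np

def mulaw(x, quantize_channels=256):
    """Compress x in [-1, 1] with mu-law companding, mu = quantize_channels - 1."""
    mu = quantize_channels - 1
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def mulaw_quantize(x, quantize_channels=256):
    """Map mu-law compressed values to integers in [0, quantize_channels)."""
    mu = quantize_channels - 1
    y = mulaw(x, quantize_channels)
    return ((y + 1) / 2 * mu + 0.5).astype(np.int64)

x = np.array([-1.0, -0.1, 0.0, 0.1, 1.0])
print(mulaw_quantize(x))   # values span [0, 255]; silence (0.0) maps near the middle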
_max_out_length = 700

f = open(tdd_file, encoding='utf-8')
ctr = 0
for line in f:
    if len(line) > 2:
        ctr += 1
        line = line.split('\n')[0]
        fname = line.split()[0]
        phones = ' '.join(k for k in line.split()[1:])

        if generate_feats_flag:
            wav_fname = wav_dir + '/' + fname + '.wav'
            wav = audio.load_wav(wav_fname)
            max_samples = _max_out_length * 5 / 1000 * 16000
            spectrogram = audio.spectrogram(wav).astype(np.float32)
            n_frames = spectrogram.shape[1]
            mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

            lspec_fname = lspec_dir + '/' + fname + '_lspec.npy'
            mspec_fname = mspec_dir + '/' + fname + '_mspec.npy'
            np.save(lspec_fname, spectrogram.T, allow_pickle=False)
            np.save(mspec_fname, mel_spectrogram.T, allow_pickle=False)

            g = open(data_file, 'a')
            g.write(lspec_fname + '|' + mspec_fname + '|' + str(n_frames) + '| ' + phones + '\n')
            g.close()

        g = open(feats_dir + '/' + fname + '.feats', 'w')
        for phone in phones.split():
def extract_audio_mels(audio_path):
    wav = audio.load_wav(audio_path)
    mels = audio.melspectrogram(wav)
    return mels
def _process_utterance(out_dir, index, audio_filepath, text):
    # Load the audio to a numpy array:
    wav_whole = audio.load_wav(audio_filepath)

    if hparams.rescaling:
        wav_whole = wav_whole / np.abs(wav_whole).max() * hparams.rescaling_max

    # This is a librivox source, so the audio files are going to be very long
    # compared to a typical 'utterance': split the wav into chunks
    tup_results = []
    n_samples = int(8.0 * hparams.sample_rate)  # All 8 second utterances
    n_chunks = wav_whole.shape[0] // n_samples

    for chunk_idx in range(n_chunks):
        chunk_start, chunk_end = chunk_idx * n_samples, (chunk_idx + 1) * n_samples
        if chunk_idx == n_chunks - 1:
            # This is the last chunk - allow it to extend to the end of the file
            chunk_end = None
        wav = wav_whole[chunk_start:chunk_end]

        # Mu-law quantize
        if is_mulaw_quantize(hparams.input_type):
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels)

            # Trim silences
            start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
            wav = wav[start:end]
            out = out[start:end]
            constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
            out_dtype = np.int16
        elif is_mulaw(hparams.input_type):
            # [-1, 1]
            out = P.mulaw(wav, hparams.quantize_channels)
            constant_values = P.mulaw(0.0, hparams.quantize_channels)
            out_dtype = np.float32
        else:
            # [-1, 1]
            out = wav
            constant_values = 0.0
            out_dtype = np.float32

        # Compute a mel-scale spectrogram from the trimmed wav:
        # (N, D)
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

        # lws pads zeros internally before performing stft;
        # this is needed to adjust time resolution between audio and mel-spectrogram
        l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

        # zero pad for quantized signal
        out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
        N = mel_spectrogram.shape[0]
        assert len(out) >= N * audio.get_hop_size()

        # time resolution adjustment:
        # ensure length of raw audio is multiple of hop_size so that we can use
        # transposed convolution to upsample
        out = out[:N * audio.get_hop_size()]
        assert len(out) % audio.get_hop_size() == 0
        timesteps = len(out)

        # Write the spectrograms to disk:
        audio_filename = 'librivox-audio-%04d-%05d.npy' % (index, chunk_idx,)
        mel_filename = 'librivox-mel-%04d-%05d.npy' % (index, chunk_idx,)
        text_idx = '%s - %05d' % (text, chunk_idx,)
        np.save(os.path.join(out_dir, audio_filename),
                out.astype(out_dtype), allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename),
                mel_spectrogram.astype(np.float32), allow_pickle=False)

        # Add results tuple describing this training example:
        tup_results.append((audio_filename, mel_filename, timesteps, text_idx))

    # Return all the audio results tuples (unpack in caller)
    return tup_results
from utils.audio import melspectrogram, inv_mel_spectrogram, load_wav, save_wav

wav_path = "LJ001-0008.wav"
raw_wav = load_wav(wav_path)
mel_spec = melspectrogram(raw_wav)
inv_wav = inv_mel_spectrogram(mel_spec)
save_wav(inv_wav, "inv.wav")
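# A rough, self-contained alternative sketch of the same round trip using only librosa
# (not this repo's utils.audio): mel spectrogram -> approximate waveform via Griffin-Lim.
# Note that utils.audio.melspectrogram typically applies log compression and normalization,
# so its output cannot be fed to librosa directly without undoing those steps; the file
# name, rate, and STFT parameters below are assumptions for illustration only.
import librosa
import soundfile as sf

y, sr = librosa.load("LJ001-0008.wav", sr=22050)
mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024, hop_length=256, n_mels=80)
y_inv = librosa.feature.inverse.mel_to_audio(mel, sr=sr, n_fft=1024, hop_length=256)
sf.write("inv_librosa.wav", y_inv, sr)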
def get_mel(wav_path):
    wav = load_wav(wav_path)
    return torch.Tensor(melspectrogram(wav).astype(np.float32))
def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft;
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment:
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'bznsyp-audio-%05d.npy' % index
    mel_filename = 'bznsyp-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)
def _process_utterance(out_dir, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair.

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - out_dir: the directory to write the features into
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text, npz_filename)
    """
    try:
        # Load the audio as a 1-D numpy array (floating point time series).
        # Audio is automatically resampled to the given rate (librosa default sr=22050);
        # pass sr=None to preserve the native sampling rate of the file.
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # Rescale wav. We rescale because Wavenet training assumes wavs are in [-1, 1]
    # when computing the mixture loss (this mainly comes from the PixelCNN implementation).
    # https://github.com/Rayhane-mamah/Tacotron-2/issues/69
    if hparams.rescaling:  # hparams.rescale = True
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:  # hparams.trim_silence = True
        wav = audio.trim_silence(wav, hparams)  # Trim leading and trailing silence

    # Mu-law quantize (default input_type is 'raw').
    # The quantization noise comes from the analog-to-digital conversion; mu-law
    # compression actually reduces the noise and increases the dynamic range.
    # scalar_input only determines whether the model uses a one-hot encoding for every
    # data point of the input waveform, or just floating point values for each sample.
    if hparams.input_type == 'mulaw-quantize':
        # [0, quantize_channels)
        out = audio.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = audio.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif hparams.input_type == 'mulaw':
        # [-1, 1]
        out = audio.mulaw(wav, hparams.quantize_channels)
        constant_values = audio.mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:  # raw
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav, e.g. shape (80, 801)
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        # hparams.max_mel_frames = 1000, hparams.clip_mels_length = True
        return None

    # Compute the linear scale spectrogram from the wav, e.g. shape (1025, 801)
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # Sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:  # hparams.use_lws = False
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.fft_size if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.fft_size, audio.get_hop_size(hparams))  # 1024 == 2048 // 2 == fft_size // 2

        # Reflect pad audio signal (just like it's done in librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # Time resolution adjustment:
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]  # e.g. 801 frames * 300 hop = 240300 samples
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    wav_id = os.path.splitext(os.path.basename(wav_path))[0]  # file name without extension

    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    linear_filename = '{}-linear.npy'.format(wav_id)
    npz_filename = '{}.npz'.format(wav_id)
    npz_flag = True
    if npz_flag:
        # Use the same keys as the Tacotron code for compatibility.
        data = {
            'audio': out.astype(out_dtype),
            'mel': mel_spectrogram.T,
            'linear': linear_spectrogram.T,
            'time_steps': time_steps,
            'mel_frames': mel_frames,
            'text': text,
            'tokens': text_to_sequence(text),  # a trailing "1" corresponding to eos(~) is appended
            'loss_coeff': 1,  # For Tacotron
        }
        # Save several arrays into a single uncompressed .npz file
        np.savez(os.path.join(out_dir, npz_filename), **data, allow_pickle=False)
    else:
        np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text, npz_filename)
def get_mel(self, filename):
    wav = load_wav(filename)
    mel = melspectrogram(wav).astype(np.float32)
    return torch.Tensor(mel)
def _process_utterance(out_dir, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair.

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - out_dir: the directory to write the features into
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text, npz_filename)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # Rescale wav
    if hparams.rescaling:  # hparams.rescale = True
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:  # hparams.trim_silence = True
        wav = audio.trim_silence(wav, hparams)  # Trim leading and trailing silence

    # Mu-law quantize (default input_type is 'raw')
    if hparams.input_type == 'mulaw-quantize':
        # [0, quantize_channels)
        out = audio.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = audio.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif hparams.input_type == 'mulaw':
        # [-1, 1]
        out = audio.mulaw(wav, hparams.quantize_channels)
        constant_values = audio.mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:  # raw
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        # hparams.max_mel_frames = 1000, hparams.clip_mels_length = True
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # Sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:  # hparams.use_lws = False
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.fft_size if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.fft_size, audio.get_hop_size(hparams))

        # Reflect pad audio signal (just like it's done in librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # Time resolution adjustment:
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    wav_id = os.path.splitext(os.path.basename(wav_path))[0]

    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    linear_filename = '{}-linear.npy'.format(wav_id)
    npz_filename = '{}.npz'.format(wav_id)
    npz_flag = True
    if npz_flag:
        # Use the same keys as the Tacotron code for compatibility.
        data = {
            'audio': out.astype(out_dtype),
            'mel': mel_spectrogram.T,
            'linear': linear_spectrogram.T,
            'time_steps': time_steps,
            'mel_frames': mel_frames,
            'text': text,
            'tokens': text_to_sequence(text),  # a trailing "1" corresponding to eos(~) is appended
            'loss_coeff': 1,  # For Tacotron
        }
        np.savez(os.path.join(out_dir, npz_filename), **data, allow_pickle=False)
    else:
        np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text, npz_filename)
# encoding: utf-8
from utils import audio
from hparams import hparams
import numpy as np
import io
from griffin_lim import inv_spectrogram, tf

if __name__ == '__main__':
    wavs = ["data/000001.wav", "data/000002.wav"]
    outputs_py = ["data/000001.gen.wav", "data/000002.gen.wav"]
    outputs_tf = ["data/000001.gen.tf.wav", "data/000002.gen.tf.wav"]
    wavs = [audio.load_wav(wav_path, hparams.sample_rate) for wav_path in wavs]

    spectrogram = [audio.spectrogram(wav).astype(np.float32) for wav in wavs]
    print(spectrogram[0].shape)
    print(spectrogram[1].shape)

    # --------------------------------- librosa Version ---------------------------------
    # convert back
    gens = [audio.inv_spectrogram(s) for s in spectrogram]
    for gen, output in zip(gens, outputs_py):
        out = io.BytesIO()
        audio.save_wav(gen, out)
        with open(output, "wb") as f:
            f.write(out.getvalue())

    # --------------------------------- TensorFlow Version ---------------------------------
    samples = [inv_spectrogram(spec) for spec in spectrogram]