def _adjust_time_resolution(self, batch, local_condition, max_time_steps):
    '''Adjust time resolution between audio and local condition.'''
    if local_condition:
        new_batch = []
        for b in batch:
            x, c, g, l = b
            self._assert_ready_for_upsample(x, c)
            if max_time_steps is not None:
                max_steps = _ensure_divisible(
                    max_time_steps, audio.get_hop_size(self._hparams), True)
                if len(x) > max_time_steps:
                    max_time_frames = max_steps // audio.get_hop_size(self._hparams)
                    start = np.random.randint(0, len(c) - max_time_frames)
                    time_start = start * audio.get_hop_size(self._hparams)
                    # Crop audio and mel frames over the same window so they stay aligned
                    x = x[time_start:time_start
                          + max_time_frames * audio.get_hop_size(self._hparams)]
                    c = c[start:start + max_time_frames, :]
                    self._assert_ready_for_upsample(x, c)
            new_batch.append((x, c, g, l))
        return new_batch
    else:
        new_batch = []
        for b in batch:
            x, c, g, l = b
            x = audio.trim_silence(x, self._hparams)
            if max_time_steps is not None and len(x) > max_time_steps:
                # Random crop in the sample domain (no local condition to keep aligned)
                start = np.random.randint(0, len(x) - max_time_steps)
                x = x[start:start + max_time_steps]
            new_batch.append((x, c, g, l))
        return new_batch
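# The crop above relies on a module-level helper `_ensure_divisible` that rounds the
# sample budget down to a multiple of the hop size, so the waveform crop and the mel
# crop cover exactly the same window. A minimal sketch of the assumed behaviour (the
# actual helper in this codebase may differ in details):
def _ensure_divisible(length, divisible_by=256, lower=True):
    """Round `length` to a multiple of `divisible_by` (down if lower=True, else up)."""
    if length % divisible_by == 0:
        return length
    if lower:
        return length - length % divisible_by
    return length + (divisible_by - length % divisible_by)

# Example: with a hop size of 256, a 13000-sample budget becomes 12800 samples,
# i.e. exactly 50 mel frames worth of audio.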
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text):
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path)
    except FileNotFoundError:
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if hparams.trim_silence:
        wav = audio.trim_silence(wav)

    # Mu-law quantize to [0, quantize_channels)
    out = mulaw_quantize(wav, hparams.quantize_channels)

    # Trim silences below the quantized silence threshold
    start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
    wav = wav[start:end]
    out = out[start:end]

    constant_values = mulaw_quantize(0, hparams.quantize_channels)
    out_dtype = np.int16

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Zero pad the quantized signal for time resolution adjustment
    l, r = audio.pad_lr(wav, hparams.fft_size, audio.get_hop_size())
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size()

    # Ensure the length of the raw audio is a multiple of hop size so that the
    # transposed convolution can upsample the conditioning features exactly
    out = out[:mel_frames * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    time_steps = len(out)

    # Write the spectrograms and audio to disk
    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
def synthesize(self, mel_spectrograms, speaker_ids, basenames, out_dir, log_dir): hparams = self._hparams local_cond, global_cond = self._check_conditions() #Get True length of audio to be synthesized: audio_len = mel_len * hop_size audio_lengths = [len(x) * get_hop_size(self._hparams) for x in mel_spectrograms] #Prepare local condition batch maxlen = max([len(x) for x in mel_spectrograms]) #[-max, max] or [0,max] T2_output_range = (-self._hparams.max_abs_value, self._hparams.max_abs_value) if self._hparams.symmetric_mels else (0, self._hparams.max_abs_value) if self._hparams.clip_for_wavenet: mel_spectrograms = [np.clip(x, T2_output_range[0], T2_output_range[1]) for x in mel_spectrograms] c_batch = np.stack([_pad_inputs(x, maxlen, _pad=T2_output_range[0]) for x in mel_spectrograms]).astype(np.float32) if self._hparams.normalize_for_wavenet: #rerange to [0, 1] c_batch = np.interp(c_batch, T2_output_range, (0, 1)) g = None if speaker_ids is None else np.asarray(speaker_ids, dtype=np.int32).reshape(len(c_batch), 1) feed_dict = {} if local_cond: feed_dict[self.local_conditions] = c_batch else: feed_dict[self.synthesis_length] = 100 if global_cond: feed_dict[self.global_conditions] = g #Generate wavs and clip extra padding to select Real speech parts generated_wavs = self.session.run(self.model.y_hat, feed_dict=feed_dict) generated_wavs = [generated_wav[:length] for generated_wav, length in zip(generated_wavs, audio_lengths)] audio_filenames = [] for i, generated_wav in enumerate(generated_wavs): #Save wav to disk audio_filename = os.path.join(out_dir, 'wavenet-audio-{}.wav'.format(basenames[i])) save_wavenet_wav(generated_wav, audio_filename, sr=hparams.sample_rate) audio_filenames.append(audio_filename) #Save waveplot to disk if log_dir is not None: plot_filename = os.path.join(log_dir, 'wavenet-waveplot-{}.png'.format(basenames[i])) util.waveplot(plot_filename, generated_wav, None, hparams) return audio_filenames
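# `_pad_inputs` above is assumed to right-pad each [T, num_mels] spectrogram along the
# time axis to the batch maximum, using T2_output_range[0] (the Tacotron "silence"
# value) as the pad constant; `np.interp(c_batch, T2_output_range, (0, 1))` then
# linearly rescales that range to [0, 1]. A sketch under those assumptions:
import numpy as np

def _pad_inputs(x, maxlen, _pad=0):
    # x: [T, num_mels] -> [maxlen, num_mels], padded at the end of the time axis
    return np.pad(x, [(0, maxlen - len(x)), (0, 0)], mode='constant', constant_values=_pad)

# e.g. with symmetric mels and max_abs_value = 4, a padded value of -4 maps to 0.0
# and +4 maps to 1.0 after the np.interp rescaling.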
def _adjust_time_resolution(self, batch, local_condition, max_time_steps):
    '''Adjust time resolution between audio and local condition.'''
    if local_condition:
        new_batch = []
        for b in batch:
            x, c, g, l = b
            if len(x) < len(c) * audio.get_hop_size(self._hparams):
                # Audio is shorter than hop_size * frames: pad it (roughly evenly on both sides)
                pad_length = audio.get_hop_size(self._hparams) * len(c) - len(x)
                if pad_length % 2 == 0:
                    x = np.pad(x, (pad_length // 2, pad_length // 2),
                               mode='constant', constant_values=_pad)
                else:
                    x = np.pad(x, (pad_length // 2, (pad_length + 1) // 2),
                               mode='constant', constant_values=_pad)
            else:
                # Audio is longer: pad the spectrogram to len(x) // hop_size frames instead
                c = self._pad_specs(c, len(x) // audio.get_hop_size(self._hparams))

            self._assert_ready_for_upsample(x, c)
            if max_time_steps is not None:
                max_steps = _ensure_divisible(
                    max_time_steps, audio.get_hop_size(self._hparams), True)
                if len(x) > max_time_steps:
                    max_time_frames = max_steps // audio.get_hop_size(self._hparams)
                    start = np.random.randint(0, len(c) - max_time_frames)
                    time_start = start * audio.get_hop_size(self._hparams)
                    x = x[time_start:time_start
                          + max_time_frames * audio.get_hop_size(self._hparams)]
                    c = c[start:start + max_time_frames, :]
                    self._assert_ready_for_upsample(x, c)
            new_batch.append((x, c, g, l))
        return new_batch
    else:
        new_batch = []
        for b in batch:
            x, c, g, l = b
            x = audio.trim(x)
            if max_time_steps is not None and len(x) > max_time_steps:
                start = np.random.randint(0, len(x) - max_time_steps)
                x = x[start:start + max_time_steps]
            new_batch.append((x, c, g, l))
        return new_batch
def _process_utterance(wav_dir, mel_dir, index, wav_path, text, hparams): """ Preprocesses a single utterance wav/text pair this writes the mel scale spectogram to disk and return a tuple to write to the train.txt file Args: - mel_dir: the directory to write the mel spectograms into - wav_dir: the directory to write the preprocessed wav into - index: the numeric index to use in the spectogram filename - wav_path: path to the audio file containing the speech input - text: text spoken in the input audio file - hparams: hyper parameters Returns: - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text) """ try: # Load the audio as numpy array wav = audio.load_wav(wav_path, sr=hparams.sample_rate) except FileNotFoundError: #catch missing wav exception print( 'file {} present in csv metadata is not present in wav folder. skipping!' .format(wav_path)) return None #rescale wav if hparams.rescale: wav = wav / np.abs(wav).max() * hparams.rescaling_max #M-AILABS extra silence specific if hparams.trim_silence: wav = audio.trim_silence(wav, hparams) #[-1, 1] out = encode_mu_law(wav, mu=512) # Compute the mel scale spectrogram from the wav mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) mel_frames = mel_spectrogram.shape[1] if mel_frames > hparams.max_mel_frames or len( text) > hparams.max_text_length: return None #Zero pad for quantized signal #time resolution adjustement #ensure length of raw audio is multiple of hop size so that we can use #transposed convolution to upsample r = mel_frames * audio.get_hop_size(hparams) - len(wav) out = np.pad(out, (0, r), mode='constant', constant_values=0.) assert len(out) == mel_frames * audio.get_hop_size(hparams) time_steps = len(out) # Write the spectrogram and audio to disk filename = '{}.npy'.format(index) np.save(os.path.join(wav_dir, filename), out.astype(np.int16), allow_pickle=False) np.save(os.path.join(mel_dir, filename), mel_spectrogram.T, allow_pickle=False) # Return a tuple describing this training example return (filename, time_steps, mel_frames, text)
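# Worked example of the length bookkeeping above (hop size and lengths are assumed
# for illustration only): the padding r makes len(out) an exact multiple of the hop
# size, which is what the transposed-convolution upsampler needs. It implicitly
# assumes the mel extractor emits ceil(len(wav) / hop_size) frames, so r >= 0.
hop_size = 256                           # assumed value
wav_len = 48900
mel_frames = -(-wav_len // hop_size)     # ceil division -> 192 frames
r = mel_frames * hop_size - wav_len      # 192 * 256 - 48900 = 252 padding samples
assert mel_frames * hop_size == wav_len + r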
def re_save_all(wav_path, audio_filename, mel_filename, linear_filename): try: # Load the audio as numpy array aud = audio.load_audio(wav_path, sr=hparams.sample_rate) except FileNotFoundError: #catch missing wav exception print( 'file {} present in csv metadata is not present in wav folder. skipping!' .format(wav_path)) return None #Trim lead/trail silences if hparams.trim_silence: aud = audio.trim_silence(aud, hparams) #Pre-emphasize preem_aud = audio.preemphasis(aud, hparams.preemphasis, hparams.preemphasize) #rescale audio if hparams.rescale: aud = aud / np.abs(aud).max() * hparams.rescaling_max preem_aud = preem_aud / np.abs(preem_aud).max() * hparams.rescaling_max #Assert all audio is in [-1, 1] if (aud > 1.).any() or (aud < -1.).any(): raise RuntimeError('audio has invalid value: {}'.format(wav_path)) if (preem_aud > 1.).any() or (preem_aud < -1.).any(): raise RuntimeError('audio has invalid value: {}'.format(wav_path)) #[-1, 1] out = aud constant_values = 0. out_dtype = np.float32 # Compute the mel scale spectrogram from the audio mel_spectrogram = audio.melspectrogram(preem_aud, hparams).astype(np.float32) mel_frames = mel_spectrogram.shape[1] #Compute the linear scale spectrogram from the audui linear_spectrogram = audio.linearspectrogram(preem_aud, hparams).astype(np.float32) linear_frames = linear_spectrogram.shape[1] #sanity check assert linear_frames == mel_frames #Ensure time resolution adjustement between audio and mel-spectrogram l_pad, r_pad = audio.librosa_pad_lr(aud, hparams.n_fft, audio.get_hop_size(hparams), hparams.wavenet_pad_sides) #Reflect pad audio signal on the right (Just like it's done in Librosa to avoid frame inconsistency) out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values) assert len(out) >= mel_frames * audio.get_hop_size(hparams) #time resolution adjustement #ensure length of raw audio is multiple of hop size so that we can use #transposed convolution to upsample out = out[:mel_frames * audio.get_hop_size(hparams)] assert len(out) % audio.get_hop_size(hparams) == 0 # Write the spectrogram and audio to disk np.save(audio_filename, out.astype(out_dtype), allow_pickle=False) np.save(mel_filename, mel_spectrogram.T, allow_pickle=False) np.save(linear_filename, linear_spectrogram.T, allow_pickle=False)
def _process_utterance(out_dir, index, wav_path, text, hparams): """ Preprocesses a single utterance wav/text pair this writes the mel scale spectogram to disk and return a tuple to write to the train.txt file Args: - out_dir: the directory to write the msgpack into - index: the numeric index to use in the spectogram filename - wav_path: path to the audio file containing the speech input - text: text spoken in the input audio file - hparams: hyper parameters Returns: - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text) """ try: # Load the audio as numpy array wav = audio.load_wav(wav_path, sr=hparams.sample_rate) except FileNotFoundError: # catch missing wav exception print('file {} present in csv metadata is not present in wav folder. skipping!'.format( wav_path)) return None # Trim lead/trail silences if hparams.trim_silence: wav = audio.trim_silence(wav, hparams) # Pre-emphasize preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize) # rescale wav if hparams.rescale: wav = wav / np.abs(wav).max() * hparams.rescaling_max preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max # Assert all audio is in [-1, 1] if (wav > 1.).any() or (wav < -1.).any(): raise RuntimeError('wav has invalid value: {}'.format(wav_path)) if (preem_wav > 1.).any() or (preem_wav < -1.).any(): raise RuntimeError('wav has invalid value: {}'.format(wav_path)) # [-1, 1] out = wav constant_values = 0. out_dtype = np.float32 # Compute the mel scale spectrogram from the wav mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32) mel_frames = mel_spectrogram.shape[1] if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: return None # Compute the linear scale spectrogram from the wav linear_spectrogram = audio.linearspectrogram(preem_wav, hparams).astype(np.float32) linear_frames = linear_spectrogram.shape[1] # sanity check assert linear_frames == mel_frames # Ensure time resolution adjustement between audio and mel-spectrogram l_pad, r_pad = audio.librosa_pad_lr(wav, audio.get_hop_size(hparams), hparams.pad_sides) # Reflect pad audio signal on the right (Just like it's done in Librosa to avoid frame inconsistency) out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values) assert len(out) >= mel_frames * audio.get_hop_size(hparams) # time resolution adjustement # ensure length of raw audio is multiple of hop size so that we can use # transposed convolution to upsample out = out[:mel_frames * audio.get_hop_size(hparams)] assert len(out) % audio.get_hop_size(hparams) == 0 time_steps = len(out) npz_filename = '{}.npz'.format(index) r = hparams.outputs_per_step if hparams.symmetric_mels: _pad_value = -hparams.max_abs_value else: _pad_value = 0. 
# +2r for head and tail silence mel_spec = np.pad(mel_spectrogram.T, [[r, r], [0, 0]], 'constant', constant_values=_pad_value) linear_spec = np.pad(linear_spectrogram.T, [[r, r], [0, 0]], 'constant', constant_values=_pad_value) target_length = len(linear_spec) target_frames = (target_length // r + 1) * r num_pad = target_frames - target_length if num_pad != 0: linear_spec = np.pad(linear_spec, ((0, num_pad), (0, 0)), "constant", constant_values=_pad_value) mel_spec = np.pad(mel_spec, ((0, num_pad), (0, 0)), "constant", constant_values=_pad_value) stop_token = np.concatenate( [np.zeros(target_frames - 1, dtype=np.float32), np.ones(1, dtype=np.float32)], axis=0) data = { 'mel': mel_spec, 'linear': linear_spec, 'audio': out.astype(out_dtype), 'input_data': np.asarray(text_to_sequence(text)), 'time_steps': time_steps, 'mel_frames': target_frames, 'text': text, 'stop_token': stop_token, } dumps_msgpack(data, os.path.join(out_dir, npz_filename)) # Return a tuple describing this training example return npz_filename, time_steps, mel_frames, text
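# Worked example of the reduction-factor padding above (r and frame counts assumed
# for illustration): with r = outputs_per_step = 3 and 10 spectrogram frames, the
# head/tail silence pads give 10 + 2*3 = 16 frames, target_frames rounds up to 18,
# and the stop token is 17 zeros followed by a single one on the last frame.
r = 3
frames_after_head_tail_pad = 10 + 2 * r                      # 16
target_frames = (frames_after_head_tail_pad // r + 1) * r    # 18
num_pad = target_frames - frames_after_head_tail_pad         # 2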
def _process_utterance(audio_dir, label_dir, index, wav_path, text_path, args): """ Preprocesses a single utterance wav/text_jamo pair this writes the mel scale spectogram to disk and return a tuple to write to the train.txt file Args: - mel_dir: the directory to write the mel spectograms into - linear_dir: the directory to write the linear spectrograms into - wav_dir: the directory to write the preprocessed wav into - index: the numeric index to use in the spectogram filename - wav_path: path to the audio file containing the speech input - text_jamo: text_jamo spoken in the input audio file - hparams: hyper parameters Returns: - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text_jamo) """ try: # Load the audio as numpy array # wav = audio.load_wav(wav_path, sr=args.sample_rate) with open(wav_path, 'rb') as pcmfile: buf = pcmfile.read() wav = np.frombuffer(buf, dtype='int16') except FileNotFoundError: #catch missing wav exception print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path)) return None # rescale wav if args.rescale: wav = wav / np.abs(wav).max() * args.rescaling_max # M-AILABS extra silence specific if args.trim_silence: wav = audio.trim_silence(wav, args) # [-1, 1] out = wav constant_values = 0. out_dtype = np.float32 # Compute the mel scale spectrogram from the wav mel_spectrogram = audio.melspectrogram(wav, args).astype(out_dtype) mel_frames = mel_spectrogram.shape[1] # Ensure time resolution adjustement between audio and mel-spectrogram pad = audio.librosa_pad_lr(wav, args.n_fft, audio.get_hop_size(args)) # Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency) out = np.pad(out, (0, pad), mode='reflect') assert len(out) >= mel_frames * audio.get_hop_size(args) # time resolution adjustment # ensure length of raw audio is multiple of hop size so that we can use # transposed convolution to upsample out = out[:mel_frames * audio.get_hop_size(args)] assert len(out) % audio.get_hop_size(args) == 0 time_steps = len(out) # text_jamo sequence with open(text_path, 'r', encoding='CP949') as f: line = f.readline() # ETRI transcription rule line = sentence_filter(line).upper() label_sequence = normalize(line) print(label_sequence) # Write the spectrogram and audio to disk mel_filename = 'mel-{}.npy'.format(index) label_filename = 'label-{}.txt'.format(index) np.save(os.path.join(audio_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) with open(os.path.join(label_dir, label_filename), 'w', encoding='utf-8') as f_out: f_out.write(label_sequence) # Return a tuple describing this training example return (wav_path, text_path, mel_filename, label_filename, time_steps, mel_frames)
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams): """ Preprocesses a single utterance wav/text pair this writes the mel scale spectogram to disk and return a tuple to write to the train.txt file Args: - mel_dir: the directory to write the mel spectograms into - linear_dir: the directory to write the linear spectrograms into - wav_dir: the directory to write the preprocessed wav into - index: the numeric index to use in the spectogram filename - wav_path: path to the audio file containing the speech input - text: text spoken in the input audio file - hparams: hyper parameters Returns: - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text) """ try: # Load the audio as numpy array wav = audio.load_wav(wav_path, sr=hparams.sample_rate) except FileNotFoundError: #catch missing wav exception print( 'file {} present in csv metadata is not present in wav folder. skipping!' .format(wav_path)) return None #rescale wav if hparams.rescale: wav = wav / np.abs(wav).max() * hparams.rescaling_max #M-AILABS extra silence specific if hparams.trim_silence: wav = audio.trim_silence(wav, hparams) #Mu-law quantize if is_mulaw_quantize(hparams.input_type): #[0, quantize_channels) out = mulaw_quantize(wav, hparams.quantize_channels) #Trim silences start, end = audio.start_and_end_indices(out, hparams.silence_threshold) wav = wav[start:end] out = out[start:end] constant_values = mulaw_quantize(0, hparams.quantize_channels) out_dtype = np.int16 elif is_mulaw(hparams.input_type): #[-1, 1] out = mulaw(wav, hparams.quantize_channels) constant_values = mulaw(0., hparams.quantize_channels) out_dtype = np.float32 else: #[-1, 1] out = wav constant_values = 0. out_dtype = np.float32 # Compute the mel scale spectrogram from the wav mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) mel_frames = mel_spectrogram.shape[1] if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: return None #Compute the linear scale spectrogram from the wav linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32) linear_frames = linear_spectrogram.shape[1] #sanity check assert linear_frames == mel_frames #Ensure time resolution adjustement between audio and mel-spectrogram fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams)) #Zero pad for quantized signal out = np.pad(out, (l, r), mode='constant', constant_values=constant_values) assert len(out) >= mel_frames * audio.get_hop_size(hparams) #time resolution adjustement #ensure length of raw audio is multiple of hop size so that we can use #transposed convolution to upsample out = out[:mel_frames * audio.get_hop_size(hparams)] assert len(out) % audio.get_hop_size(hparams) == 0 time_steps = len(out) # Write the spectrogram and audio to disk audio_filename = 'audio-{}.npy'.format(index) mel_filename = 'mel-{}.npy'.format(index) linear_filename = 'linear-{}.npy'.format(index) np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False) # Return a tuple describing this training example return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
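# The `mulaw` / `mulaw_quantize` helpers imported by the functions above implement
# standard mu-law companding. A self-contained sketch assuming mu = quantize_channels - 1
# (the exact off-by-one convention of the actual utility library may differ):
import numpy as np

def mulaw_sketch(x, quantize_channels=256):
    mu = quantize_channels - 1
    # [-1, 1] float audio -> [-1, 1] companded audio
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def mulaw_quantize_sketch(x, quantize_channels=256):
    # [-1, 1] -> integer classes in [0, quantize_channels)
    y = mulaw_sketch(x, quantize_channels)
    return ((y + 1) / 2 * (quantize_channels - 1)).astype(np.int64)

# mulaw_quantize_sketch(0.0) lands near the middle of the range (~127 for 256
# channels), which is why mulaw_quantize(0, quantize_channels) is used as the
# constant padding value for quantized audio.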
def _process_utterance(mel_dir, linear_dir, wav_dir, spkid, uttid, wav_path, text, hparams): """ Preprocesses a single utterance wav/text pair this writes the mel scale spectogram to disk and return a tuple to write to the train.txt file Args: - mel_dir: the directory to write the mel spectograms into - linear_dir: the directory to write the linear spectrograms into - wav_dir: the directory to write the preprocessed wav into - index: the numeric index to use in the spectogram filename - wav_path: path to the audio file containing the speech input - text: text spoken in the input audio file - hparams: hyper parameters Returns: - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text) """ try: # Load the audio as numpy array wav = audio.load_wav(wav_path, sr=hparams.sample_rate) except FileNotFoundError: #catch missing wav exception print( 'file {} present in csv metadata is not present in wav folder. skipping!' .format(wav_path)) return None #Trim lead/trail silences if hparams.trim_silence: wav = audio.trim_silence(wav, hparams) #Pre-emphasize preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize) #rescale wav if hparams.rescale: wav = wav / np.abs(wav).max() * hparams.rescaling_max preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max #Assert all audio is in [-1, 1] if (wav > 1.).any() or (wav < -1.).any(): raise RuntimeError('wav has invalid value: {}'.format(wav_path)) if (preem_wav > 1.).any() or (preem_wav < -1.).any(): raise RuntimeError('wav has invalid value: {}'.format(wav_path)) #Mu-law quantize if is_mulaw_quantize(hparams.input_type): #[0, quantize_channels) out = mulaw_quantize(wav, hparams.quantize_channels) #Trim silences start, end = audio.start_and_end_indices(out, hparams.silence_threshold) wav = wav[start:end] preem_wav = preem_wav[start:end] out = out[start:end] constant_values = mulaw_quantize(0, hparams.quantize_channels) out_dtype = np.int16 elif is_mulaw(hparams.input_type): #[-1, 1] out = mulaw(wav, hparams.quantize_channels) constant_values = mulaw(0., hparams.quantize_channels) out_dtype = np.float32 else: #[-1, 1] out = wav constant_values = 0. 
out_dtype = np.float32 # Compute the mel scale spectrogram from the wav mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32) mel_frames = mel_spectrogram.shape[1] if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: return None #Compute the linear scale spectrogram from the wav linear_spectrogram = audio.linearspectrogram(preem_wav, hparams).astype(np.float32) linear_frames = linear_spectrogram.shape[1] #sanity check assert linear_frames == mel_frames if hparams.use_lws: #Ensure time resolution adjustement between audio and mel-spectrogram fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams)) #Zero pad audio signal out = np.pad(out, (l, r), mode='constant', constant_values=constant_values) else: #Ensure time resolution adjustement between audio and mel-spectrogram l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams), hparams.wavenet_pad_sides) #Reflect pad audio signal on the right (Just like it's done in Librosa to avoid frame inconsistency) out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values) assert len(out) >= mel_frames * audio.get_hop_size(hparams) #time resolution adjustement #ensure length of raw audio is multiple of hop size so that we can use #transposed convolution to upsample out = out[:mel_frames * audio.get_hop_size(hparams)] assert len(out) % audio.get_hop_size(hparams) == 0 time_steps = len(out) # Write the spectrogram and audio to disk sub_wav_dir = os.path.join(wav_dir, spkid) sub_mel_dir = os.path.join(mel_dir, spkid) sub_linear_dir = os.path.join(linear_dir, spkid) os.makedirs(sub_wav_dir, exist_ok=True) os.makedirs(sub_mel_dir, exist_ok=True) os.makedirs(sub_linear_dir, exist_ok=True) audio_filename = 'audio-{}.npy'.format(uttid) mel_filename = 'mel-{}.npy'.format(uttid) linear_filename = 'linear-{}.npy'.format(uttid) np.save(os.path.join(sub_wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) np.save(os.path.join(sub_mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) np.save(os.path.join(sub_linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False) # Return a tuple describing this training example return (spkid, audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
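# `librosa_pad_lr` is expected to return enough (left, right) padding that the padded
# signal spans mel_frames * hop_size samples, mirroring librosa's framing. Note the
# snippets in this file call it with different signatures (some unpack a tuple, some
# use a single pad value); the sketch below matches the (l_pad, r_pad) usage above and
# is only an assumption about its behaviour:
def librosa_pad_lr_sketch(x, fsize, fshift, pad_sides=1):
    # fsize is kept for signature compatibility; only the hop (fshift) matters here
    assert pad_sides in (1, 2)
    pad = (len(x) // fshift + 1) * fshift - len(x)
    if pad_sides == 1:
        return 0, pad
    return pad // 2, pad // 2 + pad % 2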
    return [None, eliminated]

    if hparams.predict_linear:
        # Compute the linear scale spectrogram from the wav
        linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
        linear_frames = linear_spectrogram.shape[1]

        # sanity check
        assert linear_frames == mel_frames

    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, ppgs, lf0_path, speaker, refer, hparams): """ Preprocesses a single utterance wav/ppgs pair this writes the mel scale spectogram to disk and return a tuple to write to the train.txt file Args: - mel_dir: the directory to write the mel spectograms into - linear_dir: the directory to write the linear spectrograms into - wav_dir: the directory to write the preprocessed wav into - index: the numeric index to use in the spectogram filename - wav_path: path to the audio file containing the speech input - ppgs: ppgs spoken in the input audio file - hparams: hyper parameters Returns: - A tuple: (audio_filename, mel_filename, linear_filename, refer_name ,time_steps, mel_frames, linear_frames, ppgs,speaker,lf0) """ try: # Load the audio as numpy array wav = audio.load_wav(wav_path, sr=hparams.sample_rate) except FileNotFoundError: #catch missing wav exception print( 'file {} present in csv metadata is not present in wav folder. skipping!' .format(wav_path)) return None #rescale wav if hparams.rescale: wav = wav / np.abs(wav).max() * hparams.rescaling_max #M-AILABS extra silence specific if hparams.trim_silence: wav = audio.trim_silence(wav, hparams) out = wav constant_values = 0. out_dtype = np.float32 # Compute the mel scale spectrogram from the wav mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) mel_frames = mel_spectrogram.shape[1] if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: return None #Compute the linear scale spectrogram from the wav linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32) linear_frames = linear_spectrogram.shape[1] #sanity check assert linear_frames == mel_frames if hparams.use_lws: #Ensure time resolution adjustement between audio and mel-spectrogram fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams)) #Zero pad audio signal out = np.pad(out, (l, r), mode='constant', constant_values=constant_values) else: #Ensure time resolution adjustement between audio and mel-spectrogram pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams)) #Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency) out = np.pad(out, pad, mode='reflect') assert len(out) >= mel_frames * audio.get_hop_size(hparams) #time resolution adjustement #ensure length of raw audio is multiple of hop size so that we can use #transposed convolution to upsample out = out[:mel_frames * audio.get_hop_size(hparams)] assert len(out) % audio.get_hop_size(hparams) == 0 time_steps = len(out) # Write the spectrogram and audio to disk audio_filename = 'audio-{}.npy'.format(index) mel_filename = 'mel-{}.npy'.format(index) linear_filename = 'linear-{}.npy'.format(index) np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False) # Return a tuple describing this training example return (audio_filename, mel_filename, linear_filename, refer, time_steps, mel_frames, ppgs, speaker, lf0_path)
def initialize(self, y, c, g, input_lengths, x=None, synthesis_length=None): '''Initialize wavenet graph for train, eval and test cases. ''' hparams = self._hparams self.is_training = x is not None self.is_evaluating = not self.is_training and y is not None #Set all convolutions to corresponding mode self.set_mode(self.is_training) # split_device = '/cpu:0' if self._hparams.wavenet_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:0' # with tf.device(split_device): # hp = self._hparams # lout_int = [tf.int32] * hp.wavenet_num_gpus # lout_float = [tf.float32] * hp.wavenet_num_gpus # # tower_input_lengths = tf.split(input_lengths, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if input_lengths is not None else [input_lengths] * hp.wavenet_num_gpus # # tower_y = tf.split(y, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if y is not None else [y] * hp.wavenet_num_gpus # tower_x = tf.split(x, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if x is not None else [x] * hp.wavenet_num_gpus # tower_c = tf.split(c, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if self.local_conditioning_enabled() else [None] * hp.wavenet_num_gpus # tower_g = tf.split(g, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if self.global_conditioning_enabled() else [None] * hp.wavenet_num_gpus # tower_test_inputs = tf.split(test_inputs, num_or_size_splits=hp.wavenet_num_gpus, axis=0) if test_inputs is not None else [test_inputs] * hp.wavenet_num_gpus # # self.tower_y_hat_q = [] # self.tower_y_hat_train = [] # self.tower_y = [] # self.tower_input_lengths = [] # self.tower_means = [] # self.tower_log_scales = [] # self.tower_y_hat_log = [] # self.tower_y_log = [] # self.tower_c = [] # self.tower_y_eval = [] # self.tower_eval_length = [] # self.tower_y_hat = [] # self.tower_y_target = [] # self.tower_eval_c = [] # self.tower_mask = [] # self.tower_upsampled_local_features = [] # self.tower_eval_upsampled_local_features = [] # self.tower_synth_upsampled_local_features = [] # log('Initializing Wavenet model. Dimensions (? = dynamic shape): ') log(' Train mode: {}'.format(self.is_training)) log(' Eval mode: {}'.format(self.is_evaluating)) log(' Synthesis mode: {}'.format(not ( self.is_training or self.is_evaluating))) #1. 
Declare GPU devices #gpus = ['/gpu:{}'.format(i) for i in range(hp.wavenet_num_gpus)] #for i in range(hp.wavenet_num_gpus): #with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device='/cpu:0', worker_device=gpus[i])): with tf.variable_scope('inference') as scope: #log(' device: {}'.format(i)) #Training if self.is_training: batch_size = tf.shape(x)[0] #[batch_size, time_length, 1] self.mask = self.get_mask( input_lengths, maxlen=tf.shape(x)[-1]) #To be used in loss computation #[batch_size, channels, time_length] y_hat = self.step( x, c, g, softmax=False ) #softmax is automatically computed inside softmax_cross_entropy if needed if is_mulaw_quantize(hparams.input_type): #[batch_size, time_length, channels] self.y_hat_q = tf.transpose(y_hat, [0, 2, 1]) self.y_hat = y_hat self.y = y self.input_lengths = input_lengths #Add mean and scale stats if using Guassian distribution output (there would be too many logistics if using MoL) if self._hparams.out_channels == 2: self.means = self.y_hat[:, 0, :] self.log_scales = y_hat[:, 1, :] else: self.means = None #Graph extension for log saving #[batch_size, time_length] shape_control = (batch_size, tf.shape(x)[-1], 1) with tf.control_dependencies( [tf.assert_equal(tf.shape(y), shape_control)]): y_log = tf.squeeze(y, [-1]) if is_mulaw_quantize(hparams.input_type): self.y = y_log y_hat_log = tf.cond(tf.equal(tf.rank(y_hat), 4), lambda: tf.squeeze(y_hat, [-1]), lambda: y_hat) y_hat_log = tf.reshape(y_hat_log, [batch_size, hparams.out_channels, -1]) if is_mulaw_quantize(hparams.input_type): #[batch_size, time_length] y_hat_log = tf.argmax(tf.nn.softmax(y_hat_log, axis=1), 1) y_hat_log = util.inv_mulaw_quantize( y_hat_log, hparams.quantize_channels) y_log = util.inv_mulaw_quantize(y_log, hparams.quantize_channels) else: #[batch_size, time_length] if hparams.out_channels == 2: y_hat_log = sample_from_gaussian( y_hat_log, log_scale_min_gauss=hparams.log_scale_min_gauss) else: y_hat_log = sample_from_discretized_mix_logistic( y_hat_log, log_scale_min=hparams.log_scale_min) if is_mulaw(hparams.input_type): y_hat_log = util.inv_mulaw(y_hat_log, hparams.quantize_channels) y_log = util.inv_mulaw(y_log, hparams.quantize_channels) self.y_hat_log = y_hat_log self.y_log = y_log # self.tower_c.append(tower_c[i]) # self.tower_upsampled_local_features.append(self.upsampled_local_features) log(' inputs: {}'.format(x.shape)) if self.local_conditioning_enabled(): log(' local_condition: {}'.format(c.shape)) if self.has_speaker_embedding(): log(' global_condition: {}'.format(g.shape)) log(' targets: {}'.format(y_log.shape)) log(' outputs: {}'.format(y_hat_log.shape)) #evaluating elif self.is_evaluating: #[time_length, ] idx = 0 length = input_lengths[idx] y_target = tf.reshape(y[idx], [-1])[:length] #test_inputs = tf.reshape(y_target, [1, -1, 1]) if not hparams.wavenet_natural_eval else None if c is not None: c = tf.expand_dims(c[idx, :, :length], axis=0) with tf.control_dependencies( [tf.assert_equal(tf.rank(c), 3)]): c = tf.identity(c, name='eval_assert_c_rank_op') if g is not None: g = tf.expand_dims(g[idx], axis=0) batch_size = tf.shape(c)[0] #Start silence frame if is_mulaw_quantize(hparams.input_type): initial_value = mulaw_quantize(0, hparams.quantize_channels) elif is_mulaw(hparams.input_type): initial_value = mulaw(0.0, hparams.quantize_channels) else: initial_value = 0.0 #[channels, ] if is_mulaw_quantize(hparams.input_type): initial_input = tf.one_hot(indices=initial_value, depth=hparams.quantize_channels, dtype=tf.float32) initial_input = tf.tile( 
tf.reshape(initial_input, [1, 1, hparams.quantize_channels]), [batch_size, 1, 1]) else: initial_input = tf.ones([batch_size, 1, 1], tf.float32) * initial_value #Fast eval y_hat = self.incremental( initial_input, c=c, g=g, time_length=length, softmax=False, quantize=True, log_scale_min=hparams.log_scale_min, log_scale_min_gauss=hparams.log_scale_min_gauss) #Save targets and length for eval loss computation if is_mulaw_quantize(hparams.input_type): self.y_eval = tf.reshape(y[idx], [1, -1])[:, :length] else: self.y_eval = tf.expand_dims(y[idx], axis=0)[:, :length, :] self.eval_length = length if is_mulaw_quantize(hparams.input_type): y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [-1]) y_hat = inv_mulaw_quantize(y_hat, hparams.quantize_channels) y_target = inv_mulaw_quantize(y_target, hparams.quantize_channels) elif is_mulaw(hparams.input_type): y_hat = inv_mulaw(tf.reshape(y_hat, [-1]), hparams.quantize_channels) y_target = inv_mulaw(y_target, hparams.quantize_channels) else: y_hat = tf.reshape(y_hat, [-1]) self.y_hat = y_hat self.y_target = y_target # self.tower_eval_c.append(tower_c[i][idx]) # self.tower_eval_upsampled_local_features.append(self.upsampled_local_features[idx]) if self.local_conditioning_enabled(): log(' local_condition: {}'.format(c.shape)) if self.has_speaker_embedding(): log(' global_condition: {}'.format(g.shape)) log(' targets: {}'.format(y_target.shape)) log(' outputs: {}'.format(y_hat.shape)) #synthesizing else: batch_size = tf.shape(c)[0] if c is None: assert synthesis_length is not None else: #[batch_size, local_condition_time, local_condition_dimension(num_mels)] message = ( 'Expected 3 dimension shape [batch_size(1), time_length, {}] for local condition features but found {}' .format(hparams.cin_channels, c.shape)) with tf.control_dependencies( [tf.assert_equal(tf.rank(c), 3, message=message)]): c = tf.identity(c, name='synthesis_assert_c_rank_op') Tc = tf.shape(c)[1] upsample_factor = audio.get_hop_size(self._hparams) #Overwrite length with respect to local condition features synthesis_length = Tc * upsample_factor #[batch_size, local_condition_dimension, local_condition_time] #time_length will be corrected using the upsample network c = tf.transpose(c, [0, 2, 1]) if g is not None: assert g.shape == (batch_size, 1) #Start silence frame if is_mulaw_quantize(hparams.input_type): initial_value = mulaw_quantize(0, hparams.quantize_channels) elif is_mulaw(hparams.input_type): initial_value = mulaw(0.0, hparams.quantize_channels) else: initial_value = 0.0 if is_mulaw_quantize(hparams.input_type): assert initial_value >= 0 and initial_value < hparams.quantize_channels initial_input = tf.one_hot(indices=initial_value, depth=hparams.quantize_channels, dtype=tf.float32) initial_input = tf.tile( tf.reshape(initial_input, [1, 1, hparams.quantize_channels]), [batch_size, 1, 1]) else: initial_input = tf.ones([batch_size, 1, 1], tf.float32) * initial_value y_hat = self.incremental( initial_input, c=c, g=g, time_length=synthesis_length, softmax=False, quantize=True, log_scale_min=hparams.log_scale_min, log_scale_min_gauss=hparams.log_scale_min_gauss) if is_mulaw_quantize(hparams.input_type): y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [batch_size, -1]) y_hat = util.inv_mulaw_quantize(y_hat, hparams.quantize_channels) elif is_mulaw(hparams.input_type): y_hat = util.inv_mulaw(tf.reshape(y_hat, [batch_size, -1]), hparams.quantize_channels) else: y_hat = tf.reshape(y_hat, [batch_size, -1]) self.y_hat = y_hat 
#self.tower_synth_upsampled_local_features.append(self.upsampled_local_features) if self.local_conditioning_enabled(): log(' local_condition: {}'.format(c.shape)) if self.has_speaker_embedding(): log(' global_condition: {}'.format(g.shape)) log(' outputs: {}'.format(y_hat.shape)) self.variables = tf.trainable_variables() log(' Receptive Field: ({} samples / {:.1f} ms)'.format( self.receptive_field, self.receptive_field / hparams.sample_rate * 1000.)) #1_000_000 is causing syntax problems for some people?! Python please :) log(' WaveNet Parameters: {:.3f} Million.'.format( np.sum([np.prod(v.get_shape().as_list()) for v in self.variables]) / 1000000)) self.ema = tf.train.ExponentialMovingAverage( decay=hparams.wavenet_ema_decay)
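# The "start silence frame" used for incremental synthesis above is the encoding of
# digital silence: a one-hot vector at mulaw_quantize(0, ...) in the categorical case,
# or a constant mulaw(0.0)/0.0 sample otherwise. A NumPy sketch of the mu-law-quantized
# case, with the silence index approximated as the middle of the range (assumption;
# the exact value comes from the mu-law utility in use):
import numpy as np

def initial_silence_input_sketch(batch_size, quantize_channels=256):
    initial_value = (quantize_channels - 1) // 2           # ~ mulaw_quantize(0, ...)
    one_hot = np.zeros((1, 1, quantize_channels), dtype=np.float32)
    one_hot[0, 0, initial_value] = 1.0
    return np.tile(one_hot, (batch_size, 1, 1))            # [batch_size, 1, channels]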
def _process_utterance(mel_dir, index, wav_path, start, end, hparams): """ Preprocesses a single utterance wav/text pair this writes the mel scale spectogram to disk and return a tuple to write to the train.txt file Args: - mel_dir: the directory to write the mel spectograms into - index: the numeric index to use in the spectogram filename - wav_path: path to the audio file containing the speech input - start, end: start, end points of speech - hparams: hyper parameters Returns: - A tuple: (wav_path, mel_filename, time_steps, mel_frames, start, end) """ try: # Load the audio as numpy array wav = audio.load_wav(wav_path, sr=hparams.sample_rate) except FileNotFoundError: #catch missing wav exception print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path)) return None start += 1 * hparams.sample_rate end += 1 * hparams.sample_rate #rescale wav if hparams.rescale: wav = wav / np.abs(wav).max() * hparams.rescaling_max #M-AILABS extra silence specific if hparams.trim_silence: wav = audio.trim_silence(wav, hparams) #[-1, 1] out = wav out_dtype = np.float32 # Compute the mel scale spectrogram from the wav mel_spectrogram = audio.melspectrogram(wav, hparams).astype(out_dtype) mel_frames = mel_spectrogram.shape[1] # Ensure time resolution adjustement between audio and mel-spectrogram pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams)) # Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency) out = np.pad(out, (0, pad), mode='reflect') assert len(out) >= mel_frames * audio.get_hop_size(hparams) # time resolution adjustement # ensure length of raw audio is multiple of hop size so that we can use # transposed convolution to upsample out = out[:mel_frames * audio.get_hop_size(hparams)] assert len(out) % audio.get_hop_size(hparams) == 0 time_steps = len(out) start = round(start/int(time_steps / mel_frames)) end = round(end/int(time_steps / mel_frames)) # Write the spectrogram and audio to disk mel_filename = 'mel-{}.npy'.format(index) np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) # Return a tuple describing this training example return (wav_path, mel_filename, time_steps, mel_frames, start, end)
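# The start/end conversion above maps sample indices to mel-frame indices by dividing
# by the effective hop (time_steps / mel_frames), after shifting both boundaries by
# one second of samples. A worked example with assumed values (22050 Hz, hop 256):
sample_rate = 22050                                          # assumed for illustration
hop_size = 256                                               # assumed for illustration
boundary_in_samples = 44100 + 1 * sample_rate                # one-second shift applied
boundary_in_frames = round(boundary_in_samples / hop_size)   # 258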
def build_from_path_ispl(hparams, input_dirs, mel_dir, label_dir, tqdm=lambda x: x): """ Preprocesses the speech dataset from a gven input path to given output directories Args: - hparams: hyper parameters - input_dirs: input directory that contains the files to prerocess - mel_dir: output directory of the preprocessed speech mel-spectrogram dataset - label_dir: the directory to write the label into - tqdm: Optional, provides a nice progress bar Returns: - A list of tuple describing the train examples. this should be written to train.txt """ # We use ProcessPoolExecutor to parallelize across processes, this is just for # optimization purposes and it can be omited futures = [] index = 1 for input_dir in input_dirs: files = find_files(os.path.join(input_dir)) for wav_path in files: file_name = wav_path.split("\\")[-1] if int(file_name.split('.')[0]) <= 10: label_path = wav_path.split("\\")[0] + '/label.txt' with open(label_path, encoding='utf-8') as f: lines = f.readlines() for line in lines: if file_name in line: labels = line.replace('[', '').replace(']', '').split(':')[1].replace(',\n', '').split(',') start = [] end = [] for idx in range(0, len(labels), 2): start.append(int(labels[idx])) end.append(int(labels[idx+1])) try: # Load the audio as numpy array wav = audio.load_wav(wav_path, sr=hparams.sample_rate) except FileNotFoundError: # catch missing wav exception print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path)) return None # rescale wav if hparams.rescale: wav = wav / np.abs(wav).max() * hparams.rescaling_max # M-AILABS extra silence specific if hparams.trim_silence: wav = audio.trim_silence(wav, hparams) # [-1, 1] out = wav out_dtype = np.float32 if int(file_name.split('.')[0]) <= 10: label = np.zeros_like(out) for idx in range(len(start)): start[idx] = int(start[idx] / 1000 * hparams.sample_rate) end[idx] = int(end[idx] / 1000 * hparams.sample_rate) label[start[idx]:end[idx]] = 1. 
else: label = wav_path.split('.')[0] + '.label' with open(label, encoding='utf-8') as f: lines = f.readlines() label = np.asarray([int(line.strip('\n')) for line in lines]) # Compute the mel scale spectrogram from the wav mel_spectrogram = audio.melspectrogram(wav, hparams).astype(out_dtype) mel_spectrogram = mel_spectrogram[:, -len(label):] mel_frames = mel_spectrogram.shape[1] # Ensure time resolution adjustement between audio and mel-spectrogram pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams)) if int(file_name.split('.')[0]) <= 10: # Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency) out = np.pad(out, (0, pad), mode='reflect') label = np.pad(label, (0, pad), mode='reflect') assert len(out) >= mel_frames * audio.get_hop_size(hparams) # time resolution adjustement # ensure length of raw audio is multiple of hop size so that we can use # transposed convolution to upsample out = out[:mel_frames * audio.get_hop_size(hparams)] label = label[:mel_frames * audio.get_hop_size(hparams)] assert len(out) % audio.get_hop_size(hparams) == 0 label = label[::audio.get_hop_size(hparams)] time_steps = len(out) else: time_steps = len(out) # Write the spectrogram and audio to disk mel_filename = 'mel-{}.npy'.format(index) label_filename = 'label-{}.npy'.format(index) np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) np.save(os.path.join(label_dir, label_filename), label, allow_pickle=False) futures.append((wav_path, mel_filename, time_steps, mel_frames, label_filename)) index += 1 return [future for future in tqdm(futures)]
def _process_utterance(out_dir, index, wav_path, pinyin, hparams): '''Preprocesses a single utterance audio/text pair. This writes the mel and linear scale spectrograms to disk and returns a tuple to write to the train.txt file. Args: out_dir: The directory to write the spectrograms into index: The numeric index to use in the spectrogram filenames. wav_path: Path to the audio file containing the speech input pinyin: The pinyin of Chinese spoken in the input audio file Returns: A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt ''' mel_dir = out_dir + "/mels" linear_dir = out_dir + "/linear" wav_dir = out_dir + "/audio" # Load the audio to a numpy array: wav = audio.load_wav(wav_path, sr=hparams.sample_rate) print("debug wav_path:", wav_path) #rescale wav if hparams.rescale: wav = wav / np.abs(wav).max() * hparams.rescaling_max #M-AILABS extra silence specific if hparams.trim_silence: wav = audio.trim_silence(wav, hparams) #Mu-law quantize if is_mulaw_quantize(hparams.input_type): #[0, quantize_channels) out = mulaw_quantize(wav, hparams.quantize_channels) #Trim silences start, end = audio.start_and_end_indices(out, hparams.silence_threshold) wav = wav[start:end] out = out[start:end] constant_values = mulaw_quantize(0, hparams.quantize_channels) out_dtype = np.int16 elif is_mulaw(hparams.input_type): #[-1, 1] out = mulaw(wav, hparams.quantize_channels) constant_values = mulaw(0., hparams.quantize_channels) out_dtype = np.float32 else: #[-1, 1] out = wav constant_values = 0. out_dtype = np.float32 # Compute a mel-scale spectrogram from the wav: #mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) mel_frames = mel_spectrogram.shape[1] if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: print("debug --- drop wav_path:", wav_path, "mel_frames:", mel_frames) return None # Compute the linear-scale spectrogram from the wav: #spectrogram = audio.spectrogram(wav).astype(np.float32) #n_frames = spectrogram.shape[1] linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32) linear_frames = linear_spectrogram.shape[1] #sanity check assert linear_frames == mel_frames if hparams.use_lws: #Ensure time resolution adjustement between audio and mel-spectrogram fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams)) #Zero pad audio signal out = np.pad(out, (l, r), mode='constant', constant_values=constant_values) else: #Ensure time resolution adjustement between audio and mel-spectrogram pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams)) #Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency) out = np.pad(out, pad, mode='reflect') assert len(out) >= mel_frames * audio.get_hop_size(hparams) #time resolution adjustement #ensure length of raw audio is multiple of hop size so that we can use #transposed convolution to upsample out = out[:mel_frames * audio.get_hop_size(hparams)] assert len(out) % audio.get_hop_size(hparams) == 0 time_steps = len(out) # Write the spectrograms to disk: #spectrogram_filename = 'thchs30-spec-%05d.npy' % index #mel_filename = 'thchs30-mel-%05d.npy' % index #np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) #np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) audio_filename = 'audio-{}.npy'.format(index) mel_filename = 
'mel-{}.npy'.format(index) linear_filename = 'linear-{}.npy'.format(index) np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False) print("debug save wav file:", os.path.join(wav_dir, audio_filename)) # Return a tuple describing this training example: return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, pinyin)
def __getitem__(self, indexs): if self.data is None: self.data = np.load(self.hparams.ds_name + '.npz', allow_pickle=True) ret = [] if not isinstance(indexs, list): _indexs = [indexs] else: _indexs = indexs for index in _indexs: data = self.data[self.audio_and_text_keys[index]].item() text = np.array(data['text'], np.int) if self.hparams.multispeaker: spk_id = data.get('spk_id', 0) else: spk_id = 0 if self.hparams.add_sil == 2: text = text.reshape(-1, 3 if not self.hparams.use_pinyin else 2) text = np.concatenate( [text, 2 * np.ones([text.shape[0], 1], np.int)], -1) # [L, 4] text = text.reshape(-1) # [L + L//3] elif self.hparams.add_sil == 3: text = np.stack([text, 128 + text, 256 + text], -1) # [L, 3] text = text.reshape( -1, 9 if not self.hparams.use_pinyin else 2) # [L/3, 9] text = np.concatenate( [text, 2 * np.ones([text.shape[0], 1], np.int)], -1) # [L/3, 10] text = text.reshape(-1) # [10L/3] text = torch.from_numpy(text) mel = torch.from_numpy(np.array(data['mels']).reshape(-1, 80).T) if self.hparams.use_linear or self.hparams.linear_directly: linear = torch.from_numpy( np.array(data['linear']).reshape(-1, self.hparams.num_freq).T) else: linear = None if self.hparams.speech and self.type != 'val': mel = mel[:, :1550] text = text[:350] if linear: linear = linear[:1550] pitch = None if self.hparams.use_pitch: pitch_key = 'pitches' if not self.hparams.use_smooth_pitch else 'smooth_pitches' pitch = torch.from_numpy(np.array(data[pitch_key], np.int)) utt_ids = torch.from_numpy(np.array(data['utt_id'], np.int)) if self.hparams.prefix_len > 0: text_len = int(self.hparams.prefix_len * text.shape[0] / mel.shape[1]) text = text[:text_len] mel = mel[:, :self.hparams.prefix_len] pitch = pitch[:self.hparams.prefix_len] attn = None if self.hparams.use_ali or self.hparams.use_ali_mask: attn = np.zeros((mel.shape[1], text.shape[0])) if self.hparams.use_phoneme_align: mel_splits = [ int(x * self.hparams.audio_sample_rate / self.hparams.hop_size) for x in data['splits'] ] last = 0 for t_idx, s in enumerate(mel_splits): attn[last:s, t_idx] = 1 else: splits_begin = np.clip(np.array(data['splits'], np.int), 0, mel.shape[1] - 1) splits_end = np.clip(np.array(data['splits_end'], np.int), 0, mel.shape[1] - 1) splits_begin = [0] + list(splits_begin) splits_end = [0] + list(splits_end) if not self.hparams.use_ali_mask2: # TODO: PINYIN? 
if self.hparams.use_pinyin: for i in range(text.shape[0] // 3): splits_begin_step = (splits_begin[i + 1] - splits_begin[i] - 3) / 2 if self.hparams.attn_step_clip10: splits_begin_step = np.clip( splits_begin_step, 0, 10) attn[int(splits_begin[i] ):int(splits_begin[i] + splits_begin_step), i * 3] += 1 attn[int(splits_begin[i] + splits_begin_step ):int(splits_begin[i] + splits_begin_step * 2), i * 3 + 1] += 1 attn[int(splits_begin[i + 1]) - 3:int(splits_begin[i + 1]), i * 3 + 2] += 1 else: if self.hparams.add_sil == 0: for i in range(text.shape[0] // 3): splits_begin_step = (splits_begin[i + 1] - splits_begin[i]) / 3 splits_end_step = (splits_end[i + 1] - splits_end[i]) / 3 if self.hparams.attn_step_clip10: splits_begin_step = np.clip( splits_begin_step, 0, 10) splits_end_step = np.clip( splits_end_step, 0, 10) attn[int(splits_begin[i] ):int(splits_begin[i] + splits_begin_step), i * 3] += 0.5 attn[int(splits_begin[i] + splits_begin_step ):int(splits_begin[i] + splits_begin_step * 2), i * 3 + 1] += 0.5 attn[int(splits_begin[i] + splits_begin_step * 2):int(splits_begin[i + 1]), i * 3 + 2] += 0.5 attn[int(splits_end[i] ):int(splits_end[i] + splits_end_step), i * 3] += 0.5 attn[int(splits_end[i] + splits_end_step ):int(splits_end[i] + splits_end_step * 2), i * 3 + 1] += 0.5 attn[int(splits_end[i] + splits_end_step * 2):int(splits_end[i + 1]), i * 3 + 2] += 0.5 elif self.hparams.add_sil == 2: for i in range(text.shape[0] // 4): splits_begin_step = (splits_begin[i + 1] - splits_begin[i] - 3) / 3 splits_end_step = (splits_end[i + 1] - splits_end[i] - 3) / 3 if self.hparams.attn_step_clip10: splits_begin_step = np.clip( splits_begin_step, 0, 10) splits_end_step = np.clip( splits_end_step, 0, 10) attn[int(splits_begin[i] ):int(splits_begin[i] + splits_begin_step), i * 4] += 0.5 attn[int(splits_begin[i] + splits_begin_step ):int(splits_begin[i] + splits_begin_step * 2), i * 4 + 1] += 0.5 attn[int(splits_begin[i] + splits_begin_step * 2):int(splits_begin[i + 1]) - 3, i * 4 + 2] += 0.5 attn[int(splits_begin[i + 1]) - 3:int(splits_begin[i + 1]), i * 4 + 3] += 0.5 attn[int(splits_end[i] ):int(splits_end[i] + splits_end_step), i * 4] += 0.5 attn[int(splits_end[i] + splits_end_step ):int(splits_end[i] + splits_end_step * 2), i * 4 + 1] += 0.5 attn[int(splits_end[i] + splits_end_step * 2):int(splits_end[i + 1]) - 3, i * 4 + 2] += 0.5 attn[int(splits_end[i + 1]) - 3:int(splits_end[i + 1]), i * 4 + 3] += 0.5 else: for i in range(text.shape[0] // 3): attn[int(splits_begin[i]):int(splits_begin[i + 1]), i * 3:(i + 1) * 3] = 1 attn[int(splits_end[i]):int(splits_end[i + 1]), i * 3:(i + 1) * 3] = 1 attn = torch.from_numpy(attn) if self.hparams.use_wavenet: wav = torch.from_numpy(np.array(data['raw_wav'])) max_time_steps = self.hparams.wavenet_max_time if wav.shape[0] > max_time_steps: max_time_frames = max_time_steps // audio.get_hop_size( self.hparams) start_cond_idx = torch.randint( mel.shape[1] - max_time_frames, []) else: start_cond_idx = 0 else: wav = None start_cond_idx = None if self.hparams.linear_directly: mel = linear ret.append([ text, mel, pitch, utt_ids, attn, linear, spk_id, wav, start_cond_idx ]) if not isinstance(indexs, list): return ret[0] else: return ret
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams): wav = _trim_wav(audio.load_wav(wav_path, sr=hparams.sample_rate)) # Mu-law quantize if is_mulaw_quantize(hparams.input_type): # [0, quantize_channels) out = mulaw_quantize(wav, hparams.quantize_channels) # Trim silences start, end = audio.start_and_end_indices(out, hparams.silence_threshold) wav = wav[start:end] out = out[start:end] constant_values = mulaw_quantize(0, hparams.quantize_channels) out_dtype = np.int16 elif is_mulaw(hparams.input_type): # [-1, 1] out = mulaw(wav, hparams.quantize_channels) constant_values = mulaw(0., hparams.quantize_channels) out_dtype = np.float32 else: # [-1, 1] out = wav constant_values = 0. out_dtype = np.float32 mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) name = os.path.splitext(os.path.basename(wav_path))[0] speaker_id = _speaker_re.match(name).group(1) mel_frames = mel_spectrogram.shape[1] if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: return None # Compute the linear scale spectrogram from the wav linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32) linear_frames = linear_spectrogram.shape[1] # sanity check assert linear_frames == mel_frames # Ensure time resolution adjustement between audio and mel-spectrogram fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams)) # Zero pad for quantized signal out = np.pad(out, (l, r), mode='constant', constant_values=constant_values) assert len(out) >= mel_frames * audio.get_hop_size(hparams) # time resolution adjustement # ensure length of raw audio is multiple of hop size so that we can use # transposed convolution to upsample out = out[:mel_frames * audio.get_hop_size(hparams)] assert len(out) % audio.get_hop_size(hparams) == 0 time_steps = len(out) # Write the spectrogram and audio to disk audio_filename = 'speech-audio-{:05d}.npy'.format(index) mel_filename = 'speech-mel-{:05d}.npy'.format(index) linear_filename = 'speech-linear-{:05d}.npy'.format(index) np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False) return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, speaker_id, text)
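# `_trim_wav` and `_speaker_re` are module-level helpers assumed by the function above:
# one strips leading/trailing silence, the other extracts a speaker id from the wav
# basename. A hypothetical sketch (the real regex depends on the corpus naming scheme,
# e.g. a VCTK-style "p225_001" basename):
import re
import librosa

_speaker_re_sketch = re.compile(r'p(\d+)_\d+')   # hypothetical pattern: "p225_001" -> "225"

def _trim_wav_sketch(wav, top_db=40):
    # assumed behaviour: drop leading/trailing frames quieter than `top_db` below peak
    trimmed, _ = librosa.effects.trim(wav, top_db=top_db)
    return trimmed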
def _process_utterance(mel_dir, wav_dir, index, wav_path, hparams, speaker_id):
    """
    Preprocesses a single utterance wav file.
    This writes the mel scale spectrogram to disk and returns a tuple to write to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - hparams: hyper parameters
        - speaker_id: id of the speaker, used for global conditioning

    Returns:
        - A tuple: (audio_filename, mel_filename, '_', speaker_id, time_steps, mel_frames)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  #catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    #Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        #[0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        #[-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    if hparams.use_lws:
        #Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        #Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        #Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))

        #Reflect pad audio signal (just like it's done in librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    #Time resolution adjustment: ensure the length of the raw audio is a multiple of the
    #hop size so that we can use transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = os.path.join(wav_dir, 'audio-{}.npy'.format(index))
    mel_filename = os.path.join(mel_dir, 'mel-{}.npy'.format(index))
    np.save(audio_filename, out.astype(out_dtype), allow_pickle=False)
    np.save(mel_filename, mel_spectrogram.T, allow_pickle=False)

    #global condition features
    if hparams.gin_channels > 0:
        # raise RuntimeError('When activating global conditions, please set your speaker_id rules in line 129 of datasets/wavenet_preprocessor.py to use them during training')
        speaker_id = speaker_id  #put the rule to determine how to assign speaker ids (using file names maybe? file basenames are available in "index" variable)
    else:
        speaker_id = speaker_id

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, '_', speaker_id, time_steps, mel_frames)
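# Hypothetical driver (not part of the original preprocessor): a sketch of how the tuples
# returned by the wav-only _process_utterance above could be collected into a pipe-separated
# train.txt metadata file. The folder layout, the '|' delimiter, and the write_metadata name
# are assumptions for illustration.
def write_metadata(wav_folder, mel_dir, wav_dir, hparams, out_file='train.txt'):
    import glob
    metadata = []
    for index, wav_path in enumerate(sorted(glob.glob(os.path.join(wav_folder, '*.wav')))):
        result = _process_utterance(mel_dir, wav_dir, index, wav_path, hparams, speaker_id='_')
        if result is not None:  # skipped utterances (missing file, too many mel frames) return None
            metadata.append(result)
    with open(out_file, 'w', encoding='utf-8') as f:
        for m in metadata:
            f.write('|'.join([str(x) for x in m]) + '\n')
    return metadata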
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
        mel_dir: The directory to write the mel spectrograms into
        linear_dir: The directory to write the linear spectrograms into
        wav_dir: The directory to write the preprocessed wav into
        index: The numeric index to use in the spectrogram filenames
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file
        hparams: Hyper parameters

    Returns:
        A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path, sr=hparams.sample_rate)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
    l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

    # Zero pad for quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # Time resolution adjustment: ensure the length of the raw audio is a multiple of the
    # hop size so that we can use transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
def initialize(self, y, c, g, input_lengths, x=None, synthesis_length=None):
    '''Initialize wavenet graph for train, eval and test cases.
    '''
    hparams = self._hparams
    self.is_training = x is not None
    self.is_evaluating = not self.is_training and y is not None
    #Set all convolutions to corresponding mode
    self.set_mode(self.is_training)

    log('Initializing Wavenet model. Dimensions (? = dynamic shape): ')
    log(' Train mode: {}'.format(self.is_training))
    log(' Eval mode: {}'.format(self.is_evaluating))
    log(' Synthesis mode: {}'.format(not (self.is_training or self.is_evaluating)))

    with tf.variable_scope('inference') as scope:
        #Training
        if self.is_training:
            batch_size = tf.shape(x)[0]
            #[batch_size, time_length, 1]
            self.mask = self.get_mask(input_lengths, maxlen=tf.shape(x)[-1])  #To be used in loss computation
            #[batch_size, channels, time_length]
            y_hat = self.step(x, c, g, softmax=False)  #softmax is automatically computed inside softmax_cross_entropy if needed

            if is_mulaw_quantize(hparams.input_type):
                #[batch_size, time_length, channels]
                self.y_hat_q = tf.transpose(y_hat, [0, 2, 1])

            self.y_hat = y_hat
            self.y = y
            self.input_lengths = input_lengths

            #Add mean and scale stats if using Gaussian distribution output (there would be too many logistics if using MoL)
            if self._hparams.out_channels == 2:
                self.means = self.y_hat[:, 0, :]
                self.log_scales = self.y_hat[:, 1, :]
            else:
                self.means = None

            #Graph extension for log saving
            #[batch_size, time_length]
            shape_control = (batch_size, tf.shape(x)[-1], 1)
            with tf.control_dependencies([tf.assert_equal(tf.shape(y), shape_control)]):
                y_log = tf.squeeze(y, [-1])
                if is_mulaw_quantize(hparams.input_type):
                    self.y = y_log

            y_hat_log = tf.cond(tf.equal(tf.rank(y_hat), 4),
                                lambda: tf.squeeze(y_hat, [-1]),
                                lambda: y_hat)
            y_hat_log = tf.reshape(y_hat_log, [batch_size, hparams.out_channels, -1])

            if is_mulaw_quantize(hparams.input_type):
                #[batch_size, time_length]
                y_hat_log = tf.argmax(tf.nn.softmax(y_hat_log, axis=1), 1)

                y_hat_log = util.inv_mulaw_quantize(y_hat_log, hparams.quantize_channels)
                y_log = util.inv_mulaw_quantize(y_log, hparams.quantize_channels)
            else:
                #[batch_size, time_length]
                if hparams.out_channels == 2:
                    y_hat_log = sample_from_gaussian(
                        y_hat_log, log_scale_min_gauss=hparams.log_scale_min_gauss)
                else:
                    y_hat_log = sample_from_discretized_mix_logistic(
                        y_hat_log, log_scale_min=hparams.log_scale_min)

                if is_mulaw(hparams.input_type):
                    y_hat_log = util.inv_mulaw(y_hat_log, hparams.quantize_channels)
                    y_log = util.inv_mulaw(y_log, hparams.quantize_channels)

            self.y_hat_log = y_hat_log
            self.y_log = y_log

            log(' inputs: {}'.format(x.shape))
            if self.local_conditioning_enabled():
                log(' local_condition: {}'.format(c.shape))
            if self.has_speaker_embedding():
                log(' global_condition: {}'.format(g.shape))
            log(' targets: {}'.format(y_log.shape))
            log(' outputs: {}'.format(y_hat_log.shape))

        #evaluating
        elif self.is_evaluating:
            #[time_length, ]
            idx = 0
            length = input_lengths[idx]
            y_target = tf.reshape(y[idx], [-1])[:length]

            if c is not None:
                c = tf.expand_dims(c[idx, :, :length], axis=0)
                with tf.control_dependencies([tf.assert_equal(tf.rank(c), 3)]):
                    c = tf.identity(c, name='eval_assert_c_rank_op')
            if g is not None:
                g = tf.expand_dims(g[idx], axis=0)

            batch_size = tf.shape(c)[0]

            #Start silence frame
            if is_mulaw_quantize(hparams.input_type):
                initial_value = mulaw_quantize(0, hparams.quantize_channels)
            elif is_mulaw(hparams.input_type):
                initial_value = mulaw(0.0, hparams.quantize_channels)
            else:
                initial_value = 0.0

            #[channels, ]
            if is_mulaw_quantize(hparams.input_type):
                initial_input = tf.one_hot(indices=initial_value,
                                           depth=hparams.quantize_channels,
                                           dtype=tf.float32)
                initial_input = tf.tile(
                    tf.reshape(initial_input, [1, 1, hparams.quantize_channels]),
                    [batch_size, 1, 1])
            else:
                initial_input = tf.ones([batch_size, 1, 1], tf.float32) * initial_value

            #Fast eval
            y_hat = self.incremental(initial_input, c=c, g=g, time_length=length,
                                     softmax=False, quantize=True,
                                     log_scale_min=hparams.log_scale_min)

            #Save targets and length for eval loss computation
            if is_mulaw_quantize(hparams.input_type):
                self.y_eval = tf.reshape(y[idx], [1, -1])[:, :length]
            else:
                self.y_eval = tf.expand_dims(y[idx], axis=0)[:, :length, :]
            self.eval_length = length

            if is_mulaw_quantize(hparams.input_type):
                y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [-1])
                y_hat = inv_mulaw_quantize(y_hat, hparams.quantize_channels)
                y_target = inv_mulaw_quantize(y_target, hparams.quantize_channels)
            elif is_mulaw(hparams.input_type):
                y_hat = inv_mulaw(tf.reshape(y_hat, [-1]), hparams.quantize_channels)
                y_target = inv_mulaw(y_target, hparams.quantize_channels)
            else:
                y_hat = tf.reshape(y_hat, [-1])

            self.y_hat = y_hat
            self.y_target = y_target

            if self.local_conditioning_enabled():
                log(' local_condition: {}'.format(c.shape))
            if self.has_speaker_embedding():
                log(' global_condition: {}'.format(g.shape))
            log(' targets: {}'.format(y_target.shape))
            log(' outputs: {}'.format(y_hat.shape))

        #synthesizing
        else:
            batch_size = tf.shape(c)[0]
            if c is None:
                assert synthesis_length is not None
            else:
                #[batch_size, local_condition_time, local_condition_dimension(num_mels)]
                message = ('Expected 3 dimension shape [batch_size(1), time_length, {}] for local condition features but found {}'.format(
                    hparams.cin_channels, c.shape))
                with tf.control_dependencies([tf.assert_equal(tf.rank(c), 3, message=message)]):
                    c = tf.identity(c, name='synthesis_assert_c_rank_op')

                Tc = tf.shape(c)[1]
                upsample_factor = audio.get_hop_size(self._hparams)

                #Overwrite length with respect to local condition features
                synthesis_length = Tc * upsample_factor

                #[batch_size, local_condition_dimension, local_condition_time]
                #time_length will be corrected using the upsample network
                c = tf.transpose(c, [0, 2, 1])

            #Start silence frame
            if is_mulaw_quantize(hparams.input_type):
                initial_value = mulaw_quantize(0, hparams.quantize_channels)
            elif is_mulaw(hparams.input_type):
                initial_value = mulaw(0.0, hparams.quantize_channels)
            else:
                initial_value = 0.0

            if is_mulaw_quantize(hparams.input_type):
                assert initial_value >= 0 and initial_value < hparams.quantize_channels
                initial_input = tf.one_hot(indices=initial_value,
                                           depth=hparams.quantize_channels,
                                           dtype=tf.float32)
                initial_input = tf.tile(
                    tf.reshape(initial_input, [1, 1, hparams.quantize_channels]),
                    [batch_size, 1, 1])
            else:
                initial_input = tf.ones([batch_size, 1, 1], tf.float32) * initial_value

            y_hat = self.incremental(initial_input, c=c, g=g, time_length=synthesis_length,
                                     softmax=False, quantize=True,
                                     log_scale_min=hparams.log_scale_min)

            if is_mulaw_quantize(hparams.input_type):
                y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [batch_size, -1])
                self.out_node = y_hat
                y_hat = util.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
            elif is_mulaw(hparams.input_type):
                y_hat = util.inv_mulaw(tf.reshape(y_hat, [batch_size, -1]), hparams.quantize_channels)
            else:
                y_hat = tf.reshape(y_hat, [batch_size, -1])

            self.y_hat = y_hat

            if self.local_conditioning_enabled():
                log(' local_condition: {}'.format(c.shape))
            if self.has_speaker_embedding():
                log(' global_condition: {}'.format(g.shape))
            log(' outputs: {}'.format(y_hat.shape))

    self.variables = tf.trainable_variables()
    self.ema = tf.train.ExponentialMovingAverage(decay=hparams.wavenet_ema_decay)
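# The three graph branches in initialize() are selected purely from which tensors are
# supplied (x for training inputs, y for targets). A standalone sketch of that dispatch
# rule; the function name is illustrative and not part of the model class:
def _demo_initialize_mode(x, y):
    is_training = x is not None
    is_evaluating = (not is_training) and (y is not None)
    is_synthesizing = not (is_training or is_evaluating)
    return is_training, is_evaluating, is_synthesizing

# _demo_initialize_mode(x=inputs, y=targets) -> (True, False, False)   train
# _demo_initialize_mode(x=None, y=targets)   -> (False, True, False)   eval
# _demo_initialize_mode(x=None, y=None)      -> (False, False, True)   synthesis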
def initialize(self, y, c, g, input_lengths, x=None, synthesis_length=None):
    '''Initialize wavenet graph for train, eval and test cases.
    '''
    hparams = self._hparams
    self.is_training = x is not None
    self.is_evaluating = not self.is_training and y is not None
    #Set all convolutions to corresponding mode
    self.set_mode(self.is_training)

    log('Initializing Wavenet model. Dimensions (? = dynamic shape): ')
    log(' Train mode: {}'.format(self.is_training))
    log(' Eval mode: {}'.format(self.is_evaluating))
    log(' Synthesis mode: {}'.format(not (self.is_training or self.is_evaluating)))

    with tf.variable_scope('inference') as scope:
        #Training
        if self.is_training:
            batch_size = tf.shape(x)[0]
            #[batch_size, time_length, 1]
            self.mask = self.get_mask(input_lengths, maxlen=tf.shape(x)[-1])  #To be used in loss computation
            #[batch_size, channels, time_length]
            y_hat = self.step(x, c, g, softmax=False)  #softmax is automatically computed inside softmax_cross_entropy if needed

            if is_mulaw_quantize(hparams.input_type):
                #[batch_size, time_length, channels]
                self.y_hat_q = tf.transpose(y_hat, [0, 2, 1])

            self.y_hat = y_hat
            self.y = y
            self.input_lengths = input_lengths

            #Graph extension for log saving
            #[batch_size, time_length]
            shape_control = (batch_size, tf.shape(x)[-1], 1)
            with tf.control_dependencies([tf.assert_equal(tf.shape(y), shape_control)]):
                y_log = tf.squeeze(y, [-1])
                if is_mulaw_quantize(hparams.input_type):
                    self.y = y_log

            y_hat_log = tf.cond(tf.equal(tf.rank(y_hat), 4),
                                lambda: tf.squeeze(y_hat, [-1]),
                                lambda: y_hat)
            y_hat_log = tf.reshape(y_hat_log, [batch_size, hparams.out_channels, -1])

            if is_mulaw_quantize(hparams.input_type):
                #[batch_size, time_length] take the most likely class index
                y_hat_log = tf.argmax(tf.nn.softmax(y_hat_log, axis=1), 1)

                y_hat_log = util.inv_mulaw_quantize(y_hat_log, hparams.quantize_channels)
                y_log = util.inv_mulaw_quantize(y_log, hparams.quantize_channels)
            else:
                #[batch_size, time_length]
                y_hat_log = sample_from_discretized_mix_logistic(
                    y_hat_log, log_scale_min=hparams.log_scale_min)

                if is_mulaw(hparams.input_type):
                    y_hat_log = util.inv_mulaw(y_hat_log, hparams.quantize_channels)
                    y_log = util.inv_mulaw(y_log, hparams.quantize_channels)

            self.y_hat_log = y_hat_log
            self.y_log = y_log

            log(' inputs: {}'.format(x.shape))
            if self.local_conditioning_enabled():
                log(' local_condition: {}'.format(c.shape))
            if self.has_speaker_embedding():
                log(' global_condition: {}'.format(g.shape))
            log(' targets: {}'.format(y_log.shape))
            log(' outputs: {}'.format(y_hat_log.shape))

        #evaluating
        elif self.is_evaluating:
            #[time_length, ]
            idx = 0
            length = input_lengths[idx]
            y_target = tf.reshape(y[idx], [-1])[:length]

            if c is not None:
                c = tf.expand_dims(c[idx, :, :length], axis=0)
                with tf.control_dependencies([tf.assert_equal(tf.rank(c), 3)]):
                    c = tf.identity(c, name='eval_assert_c_rank_op')
            if g is not None:
                g = g[idx]

            #Start silence frame
            if is_mulaw_quantize(hparams.input_type):
                initial_value = mulaw_quantize(0, hparams.quantize_channels)
            elif is_mulaw(hparams.input_type):
                initial_value = mulaw(0.0, hparams.quantize_channels)
            else:
                initial_value = 0.0

            #[channels, ]
            if is_mulaw_quantize(hparams.input_type):
                initial_input = tf.one_hot(indices=initial_value,
                                           depth=hparams.quantize_channels,
                                           dtype=tf.float32)
                initial_input = tf.reshape(initial_input, [1, 1, hparams.quantize_channels])
            else:
                initial_input = tf.ones([1, 1, 1], tf.float32) * initial_value

            #Fast eval
            y_hat = self.incremental(initial_input, c=c, g=g, time_length=length,
                                     softmax=True, quantize=True,
                                     log_scale_min=hparams.log_scale_min)

            #Save targets and length for eval loss computation
            if is_mulaw_quantize(hparams.input_type):
                self.y_eval = tf.reshape(y[idx], [1, -1])[:, :length]
            else:
                self.y_eval = tf.expand_dims(y[idx], axis=0)[:, :length, :]
            self.eval_length = length

            if is_mulaw_quantize(hparams.input_type):
                y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [-1])
                y_hat = inv_mulaw_quantize(y_hat, hparams.quantize_channels)
                y_target = inv_mulaw_quantize(y_target, hparams.quantize_channels)
            elif is_mulaw(hparams.input_type):
                y_hat = inv_mulaw(tf.reshape(y_hat, [-1]), hparams.quantize_channels)
                y_target = inv_mulaw(y_target, hparams.quantize_channels)
            else:
                y_hat = tf.reshape(y_hat, [-1])

            self.y_hat = y_hat
            self.y_target = y_target

            if self.local_conditioning_enabled():
                log(' local_condition: {}'.format(c.shape))
            if self.has_speaker_embedding():
                log(' global_condition: {}'.format(g.shape))
            log(' targets: {}'.format(y_target.shape))
            log(' outputs: {}'.format(y_hat.shape))

        #synthesizing
        else:
            if c is None:
                assert synthesis_length is not None
            else:
                #[batch_size, local_condition_time, local_condition_dimension(num_mels)]
                message = ('Expected 3 dimension shape [batch_size(1), time_length, {}] for local condition features but found {}'.format(
                    hparams.cin_channels, c.shape))
                with tf.control_dependencies([tf.assert_equal(tf.rank(c), 3, message=message)]):
                    c = tf.identity(c, name='synthesis_assert_c_rank_op')

                Tc = tf.shape(c)[1]
                upsample_factor = audio.get_hop_size(self._hparams)

                #Overwrite length with respect to local condition features
                synthesis_length = Tc * upsample_factor

                #[batch_size, local_condition_dimension, local_condition_time]
                #time_length will be corrected using the upsample network
                c = tf.transpose(c, [0, 2, 1])

            #Start silence frame
            if is_mulaw_quantize(hparams.input_type):
                initial_value = mulaw_quantize(0, hparams.quantize_channels)
            elif is_mulaw(hparams.input_type):
                initial_value = mulaw(0.0, hparams.quantize_channels)
            else:
                initial_value = 0.0

            if is_mulaw_quantize(hparams.input_type):
                assert initial_value >= 0 and initial_value < hparams.quantize_channels
                initial_input = tf.one_hot(indices=initial_value,
                                           depth=hparams.quantize_channels,
                                           dtype=tf.float32)
                initial_input = tf.reshape(initial_input, [1, 1, hparams.quantize_channels])
            else:
                initial_input = tf.ones([1, 1, 1], tf.float32) * initial_value

            y_hat = self.incremental(initial_input, c=c, g=g, time_length=synthesis_length,
                                     softmax=True, quantize=True,
                                     log_scale_min=hparams.log_scale_min)

            if is_mulaw_quantize(hparams.input_type):
                y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [-1])
                y_hat = util.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
            elif is_mulaw(hparams.input_type):
                y_hat = util.inv_mulaw(tf.reshape(y_hat, [-1]), hparams.quantize_channels)
            else:
                y_hat = tf.reshape(y_hat, [-1])

            self.y_hat = y_hat

            if self.local_conditioning_enabled():
                log(' local_condition: {}'.format(c.shape))
            if self.has_speaker_embedding():
                log(' global_condition: {}'.format(g.shape))
            log(' outputs: {}'.format(y_hat.shape))

    self.variables = tf.trainable_variables()
    self.ema = tf.train.ExponentialMovingAverage(decay=hparams.wavenet_ema_decay)
def synthesize(self, mel_spectrograms, speaker_ids, basenames, out_dir, log_dir):
    hparams = self._hparams
    local_cond, global_cond = self._check_conditions()

    #Switch mels in case of debug
    # if self.synth_debug:
    #     assert len(hparams.wavenet_debug_mels) == len(hparams.wavenet_debug_wavs)
    #     mel_spectrograms = [np.load(mel_file) for mel_file in hparams.wavenet_debug_mels]

    #Get true length of audio to be synthesized: audio_len = mel_len * hop_size
    audio_lengths = [len(x) * get_hop_size(self._hparams) for x in mel_spectrograms]

    #Prepare local condition batch
    maxlen = max([len(x) for x in mel_spectrograms])
    #[-max, max] or [0, max]
    T2_output_range = (-self._hparams.max_abs_value, self._hparams.max_abs_value) \
        if self._hparams.symmetric_mels else (0, self._hparams.max_abs_value)

    if self._hparams.clip_for_wavenet:
        mel_spectrograms = [np.clip(x, T2_output_range[0], T2_output_range[1])
                            for x in mel_spectrograms]

    c_batch = np.stack([_pad_inputs(x, maxlen, _pad=T2_output_range[0])
                        for x in mel_spectrograms]).astype(np.float32)

    if self._hparams.normalize_for_wavenet:
        #rerange to [0, 1]
        c_batch = np.interp(c_batch, T2_output_range, (0, 1))

    g = None if speaker_ids is None else np.asarray(speaker_ids, dtype=np.int32).reshape(len(c_batch), 1)
    feed_dict = {}

    if local_cond:
        feed_dict[self.local_conditions] = c_batch
    else:
        feed_dict[self.synthesis_length] = 100

    if global_cond:
        feed_dict[self.global_conditions] = g

    # if self.synth_debug:
    #     debug_wavs = hparams.wavenet_debug_wavs
    #     assert len(debug_wavs) % hparams.wavenet_num_gpus == 0
    #     test_wavs = [np.load(debug_wav).reshape(-1, 1) for debug_wav in debug_wavs]
    #
    #     #pad wavs to same length
    #     max_test_len = max([len(x) for x in test_wavs])
    #     test_wavs = np.stack([_pad_inputs(x, max_test_len) for x in test_wavs]).astype(np.float32)
    #
    #     assert len(test_wavs) == len(debug_wavs)
    #     feed_dict[self.targets] = test_wavs.reshape(len(test_wavs), max_test_len, 1)
    #     feed_dict[self.input_lengths] = np.asarray([test_wavs.shape[1]])

    #Generate wavs and clip extra padding to select real speech parts
    # generated_wavs, upsampled_features = self.session.run([self.model.tower_y_hat, self.model.tower_synth_upsampled_local_features], feed_dict=feed_dict)
    #
    # #Linearize outputs (n_gpus -> 1D)
    # generated_wavs = [wav for gpu_wavs in generated_wavs for wav in gpu_wavs]
    # upsampled_features = [feat for gpu_feats in upsampled_features for feat in gpu_feats]
    #
    # generated_wavs = [generated_wav[:length] for generated_wav, length in zip(generated_wavs, audio_lengths)]
    # upsampled_features = [upsampled_feature[:, :length] for upsampled_feature, length in zip(upsampled_features, audio_lengths)]

    generated_wavs = self.session.run(self.model.y_hat, feed_dict=feed_dict)
    generated_wavs = [generated_wav[:length]
                      for generated_wav, length in zip(generated_wavs, audio_lengths)]

    audio_filenames = []
    for i, generated_wav in enumerate(generated_wavs):
        #Save wav to disk
        audio_filename = os.path.join(out_dir, 'wavenet-audio-{}.wav'.format(basenames[i]))
        save_wavenet_wav(generated_wav, audio_filename, sr=hparams.sample_rate)
        audio_filenames.append(audio_filename)

        #Compare the generated wav's mel with the original input mel to evaluate wavenet audio
        #reconstruction performance. Both mels should match on low frequency information; the
        #wavenet mel should contain more high frequency detail than the Tacotron mel.
        # generated_mel = melspectrogram(generated_wav, hparams).T
        # util.plot_spectrogram(generated_mel, os.path.join(log_dir, 'wavenet-mel-spectrogram-{}.png'.format(basenames[i])),
        #     title='Local Condition vs Reconstructed Audio Mel-Spectrogram analysis', target_spectrogram=input_mel)
        # #Save upsampled features to visualize checkerboard artifacts.
        # util.plot_spectrogram(upsampled_feature.T, os.path.join(log_dir, 'wavenet-upsampled_features-{}.png'.format(basenames[i])),
        #     title='Upsampled Local Condition features', auto_aspect=True)

        #Save waveplot to disk
        if log_dir is not None:
            plot_filename = os.path.join(log_dir, 'wavenet-waveplot-{}.png'.format(basenames[i]))
            util.waveplot(plot_filename, generated_wav, None, hparams,
                          title='WaveNet generated Waveform.')

    return audio_filenames
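# Standalone sketch of the local-condition preparation done in synthesize() above: clip to
# the Tacotron output range, pad the batch to a common length, then rerange to [0, 1]. The
# hparams values and the end-padding behaviour (standing in for _pad_inputs) are assumptions
# for illustration only.
def _demo_prepare_local_conditions():
    import numpy as np
    max_abs_value, symmetric_mels = 4.0, True
    T2_output_range = (-max_abs_value, max_abs_value) if symmetric_mels else (0, max_abs_value)

    mels = [np.random.uniform(-5, 5, size=(80, 80)).astype(np.float32),
            np.random.uniform(-5, 5, size=(60, 80)).astype(np.float32)]

    #clip_for_wavenet: keep features inside the range the vocoder was trained on
    mels = [np.clip(m, T2_output_range[0], T2_output_range[1]) for m in mels]

    #pad every mel to the longest one so the batch can be stacked
    maxlen = max(len(m) for m in mels)
    c_batch = np.stack([np.pad(m, ((0, maxlen - len(m)), (0, 0)), mode='constant',
                               constant_values=T2_output_range[0]) for m in mels])

    #normalize_for_wavenet: rerange [-max_abs_value, max_abs_value] -> [0, 1]
    return np.interp(c_batch, T2_output_range, (0, 1))  # shape (2, 80, 80), values in [0, 1]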
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text):
    """
    Preprocesses a single utterance wav/text pair.
    This writes the mel scale spectrogram to disk and returns a tuple to write to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path)
    except:
        print('file {} present in csv not in folder'.format(wav_path))
        return None

    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav)

    #[0, quantize_channels)
    out = mulaw_quantize(wav, hparams.quantize_channels)

    #Trim silences
    start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
    wav = wav[start:end]
    out = out[start:end]

    constant_values = mulaw_quantize(0, hparams.quantize_channels)
    out_dtype = np.int16

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    #Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    #Ensure time resolution adjustment between audio and mel-spectrogram
    l, r = audio.pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    #Zero pad for quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    time_steps = len(out)
    assert time_steps >= mel_frames * audio.get_hop_size()

    #Time resolution adjustment: ensure the length of the raw audio is a multiple of the
    #hop size so that we can use transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size()]
    assert time_steps % audio.get_hop_size() == 0

    # Write the spectrogram and audio to disk
    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
def _assert_ready_for_upsample(self, x, c):
    assert len(x) % len(c) == 0 and len(x) // len(c) == audio.get_hop_size(self._hparams)
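# Standalone illustration of the invariant _assert_ready_for_upsample checks: the raw
# waveform must contain exactly hop_size samples per local-condition (mel) frame. The hop
# size and frame count below are assumed values for illustration.
def _demo_upsample_invariant(hop_size=256, num_frames=40):
    c = [0.0] * num_frames                # local-condition (mel) frames
    x = [0.0] * (num_frames * hop_size)   # waveform samples
    assert len(x) % len(c) == 0 and len(x) // len(c) == hop_size
    return len(x) // len(c)               # == hop_size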
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair.
    This writes the mel scale spectrogram to disk and returns a tuple to write to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  #catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    #Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        #[0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        #[-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    #Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    #Ensure time resolution adjustment between audio and mel-spectrogram
    fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
    l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

    #Zero pad for quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    #Time resolution adjustment: ensure the length of the raw audio is a multiple of the
    #hop size so that we can use transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
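# Standalone sketch of the pad-then-trim contract used by the preprocessors above: after
# padding, the quantized signal is cut so its length is an exact multiple of the hop size,
# which lets transposed convolutions upsample the mel frames back to audio length. The
# hop/FFT sizes and the symmetric padding below are assumptions, not audio.pad_lr itself.
def _demo_time_resolution_adjustment(hop_size=256, n_fft=2048, num_samples=10000):
    import numpy as np
    wav = np.zeros(num_samples, dtype=np.float32)
    mel_frames = int(np.ceil(len(wav) / hop_size))   # frames a hop-aligned STFT would produce
    pad = (n_fft - hop_size) // 2                    # symmetric, librosa-style centre padding
    out = np.pad(wav, (pad, pad), mode='constant')
    out = out[:mel_frames * hop_size]                # trim to an exact multiple of hop_size
    assert len(out) == mel_frames * hop_size and len(out) % hop_size == 0
    return len(out)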