def _adjust_time_resolution(self, batch, local_condition, max_time_steps):
    """Adjust time resolution between audio and local condition."""
    if local_condition:
        new_batch = []
        for b in batch:
            x, c, g, l = b
            self._assert_ready_for_upsample(x, c)
            if max_time_steps is not None:
                max_steps = _ensure_divisible(max_time_steps, audio.get_hop_size(self._hparams), True)
                if len(x) > max_time_steps:
                    max_time_frames = max_steps // audio.get_hop_size(self._hparams)
                    start = np.random.randint(0, len(c) - max_time_frames)
                    time_start = start * audio.get_hop_size(self._hparams)
                    x = x[time_start:time_start + max_time_frames * audio.get_hop_size(self._hparams)]
                    c = c[start:start + max_time_frames, :]
                    self._assert_ready_for_upsample(x, c)
            new_batch.append((x, c, g, l))
        return new_batch
    else:
        new_batch = []
        for b in batch:
            x, c, g, l = b
            x = audio.trim_silence(x, self._hparams)
            if max_time_steps is not None and len(x) > max_time_steps:
                # Crop a random max_time_steps window from the waveform
                start = np.random.randint(0, len(x) - max_time_steps)
                x = x[start:start + max_time_steps]
            new_batch.append((x, c, g, l))
        return new_batch
def ensure_divisible_mel(length, divisible_by=256, lower=True):
    """Round `length` to a multiple of `divisible_by` and convert it to mel frames."""
    if length % divisible_by == 0:
        max_steps = length
        return max_steps // audio.get_hop_size()
    if lower:
        max_steps = length - length % divisible_by
    else:
        max_steps = length + (divisible_by - length % divisible_by)
    return max_steps // audio.get_hop_size()
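# The generic `ensure_divisible` / `_ensure_divisible` helper referenced by the loaders
# above and below is not shown in this section. A minimal sketch of what it is assumed
# to do, based on the mel variant above (the real implementation may differ):
def ensure_divisible(length, divisible_by=256, lower=True):
    """Round `length` down (or up, if lower=False) to the nearest multiple of `divisible_by`."""
    if length % divisible_by == 0:
        return length
    if lower:
        return length - length % divisible_by
    return length + (divisible_by - length % divisible_by)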
def __init__(self, coord, data_dirs, batch_size, receptive_field, gc_enable=False, queue_size=8):
    super(DataFeederWavenet, self).__init__()
    self.data_dirs = data_dirs
    self.coord = coord
    self.batch_size = batch_size
    self.receptive_field = receptive_field
    self.hop_size = audio.get_hop_size(hparams)
    self.sample_size = ensure_divisible(hparams.sample_size, self.hop_size, True)
    self.max_frames = self.sample_size // self.hop_size  # number of frames needed to cover sample_size
    self.queue_size = queue_size
    self.gc_enable = gc_enable
    self.skip_path_filter = hparams.skip_path_filter
    self.rng = np.random.RandomState(123)
    self._offset = defaultdict(lambda: 2)  # missing keys get a default value of 2
    self.data_dir_to_id = {
        data_dir: idx for idx, data_dir in enumerate(self.data_dirs)
    }  # map data_dir <---> speaker_id
    self.path_dict = get_path_dict(
        self.data_dirs,
        np.max([self.sample_size, receptive_field]))  # drop clips shorter than the receptive field, keep the rest

    self._placeholders = [
        tf.placeholder(tf.float32, shape=[None, None, 1], name='input_wav'),
        tf.placeholder(tf.float32, shape=[None, None, hparams.num_mels], name='local_condition')
    ]
    dtypes = [tf.float32, tf.float32]
    if self.gc_enable:
        self._placeholders.append(tf.placeholder(tf.int32, shape=[None], name='speaker_id'))
        dtypes.append(tf.int32)

    queue = tf.FIFOQueue(self.queue_size, dtypes, name='input_queue')
    self.enqueue = queue.enqueue(self._placeholders)
    if self.gc_enable:
        self.inputs_wav, self.local_condition, self.speaker_id = queue.dequeue()
    else:
        self.inputs_wav, self.local_condition = queue.dequeue()

    self.inputs_wav.set_shape(self._placeholders[0].shape)
    self.local_condition.set_shape(self._placeholders[1].shape)
    if self.gc_enable:
        self.speaker_id.set_shape(self._placeholders[2].shape)
def synthesis(checkpoint_path, local_path, global_id, output_dir, hp):
    checkpoint_name = checkpoint_path.split('/')[-1]
    audio_dir = os.path.join(output_dir, checkpoint_name, 'wavs')
    plot_dir = os.path.join(output_dir, checkpoint_name, 'plots')
    os.makedirs(audio_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)

    ph = create_placeholders()
    model = create_model(ph, hp)

    # Apply exponential moving average (EMA) weights to the model variables
    ema = tf.train.ExponentialMovingAverage(decay=hp.ema_decay)

    local_condition = np.load(local_path)
    local_condition = local_condition.reshape([1, -1, hp.num_mels])
    if not hp.upsample_conditional_features:
        local_condition = np.repeat(local_condition, audio.get_hop_size(), axis=1)
    index = local_path.split('-')[-1].split('.')[0]

    saver = tf.train.Saver(ema.variables_to_restore())
    config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(allow_growth=True),
        log_device_placement=False,
    )
    with tf.Session(config=config) as sess:
        saver.restore(sess, checkpoint_path)

        start_time = time.time()
        outputs = sess.run(model.eval_outputs,
                           feed_dict={ph['local_condition']: local_condition})
        duration = time.time() - start_time
        print('Time Evaluation: Generation of {} audio samples took {:.3f} sec ({:.3f} frames/sec)'.format(
            len(outputs), duration, len(outputs) / duration))

        waveform = np.reshape(outputs, [-1])
        audio_path = os.path.join(audio_dir, '{}.wav'.format(index))
        plot_path = os.path.join(plot_dir, '{}.png'.format(index))
        waveplot(plot_path, waveform, None, hp)
        librosa.output.write_wav(audio_path, waveform, sr=hp.sample_rate)
def _adjust_time_step(self, audio_data, local_feature, max_time_steps):
    """Adjust time resolution for local condition."""
    hop_size = audio.get_hop_size()
    if local_feature is not None:
        if self._hparams.upsample_conditional_features:
            self._assert_ready_for_upsample(audio_data, local_feature)
            if max_time_steps is not None:
                max_steps = _ensure_divisible(max_time_steps, hop_size, True)
                if len(audio_data) > max_time_steps:
                    max_time_frames = max_steps // hop_size
                    start = np.random.randint(0, len(local_feature) - max_time_frames)
                    time_start = start * hop_size
                    audio_data = audio_data[time_start:time_start + max_time_frames * hop_size]
                    local_feature = local_feature[start:start + max_time_frames, :]
                    self._assert_ready_for_upsample(audio_data, local_feature)
        else:
            audio_data, local_feature = audio.adjust_time_resolution(audio_data, local_feature)
            if max_time_steps is not None and len(audio_data) > max_time_steps:
                s = np.random.randint(0, len(audio_data) - max_time_steps)
                audio_data, local_feature = audio_data[s:s + max_time_steps], local_feature[s:s + max_time_steps, :]
            assert len(audio_data) == len(local_feature)
    return audio_data, local_feature
def _process_utterance(out_dir, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair.

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - out_dir: the directory to write the output files into
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text, npz_filename)
    """
    try:
        # Load the audio as a numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # Rescale wav
    if hparams.rescaling:  # hparams.rescaling = True
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:  # hparams.trim_silence = True
        wav = audio.trim_silence(wav, hparams)  # Trim leading and trailing silence

    # Mu-law quantize; the default input_type is 'raw'
    if hparams.input_type == 'mulaw-quantize':
        # [0, quantize_channels)
        out = audio.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = audio.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif hparams.input_type == 'mulaw':
        # [-1, 1]
        out = audio.mulaw(wav, hparams.quantize_channels)
        constant_values = audio.mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:  # raw
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        # hparams.max_mel_frames = 1000, hparams.clip_mels_length = True
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # Sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:  # hparams.use_lws = False
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.fft_size if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.fft_size, audio.get_hop_size(hparams))

        # Reflect pad audio signal (just like it's done in librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # Time resolution adjustment:
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrograms and audio to disk
    wav_id = os.path.splitext(os.path.basename(wav_path))[0]
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    linear_filename = '{}-linear.npy'.format(wav_id)
    npz_filename = '{}.npz'.format(wav_id)
    npz_flag = True
    if npz_flag:
        # Use the same keys as the Tacotron code for compatibility.
        data = {
            'audio': out.astype(out_dtype),
            'mel': mel_spectrogram.T,
            'linear': linear_spectrogram.T,
            'time_steps': time_steps,
            'mel_frames': mel_frames,
            'text': text,
            'tokens': text_to_sequence(text),  # "1" (the eos token '~') is appended at the end
            'loss_coeff': 1  # for Tacotron
        }
        np.savez(os.path.join(out_dir, npz_filename), **data, allow_pickle=False)
    else:
        np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text, npz_filename)
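# For reference, a minimal sketch of reading one of the saved .npz files back.
# The path and file name below are illustrative placeholders, not part of the repository.
import os
import numpy as np

npz_path = os.path.join('training_data', 'LJ001-0001.npz')  # hypothetical example file
data = np.load(npz_path)

audio_samples = data['audio']   # (time_steps,) raw or mu-law encoded waveform
mel = data['mel']               # (mel_frames, num_mels)
linear = data['linear']         # (mel_frames, 1 + fft_size // 2)
assert len(audio_samples) == int(data['time_steps'])
assert mel.shape[0] == int(data['mel_frames'])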
def _process_utterance(out_dir, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair.

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - out_dir: the directory to write the output files into
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text, npz_filename)
    """
    try:
        # Load the audio as a 1-D floating point time series, e.g. shape (240001,).
        # Audio is automatically resampled to the given rate (default sr=22050);
        # to preserve the native sampling rate of the file, use sr=None.
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # Rescale wav. We rescale because Wavenet training assumes wavs are in [-1, 1]
    # when computing the mixture loss; this mainly comes from the PixelCNN implementation.
    # https://github.com/Rayhane-mamah/Tacotron-2/issues/69
    if hparams.rescaling:  # hparams.rescaling = True
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:  # hparams.trim_silence = True
        wav = audio.trim_silence(wav, hparams)  # Trim leading and trailing silence

    # Mu-law quantize; the default input_type is 'raw'.
    # The quantization noise comes from the analog-to-digital conversion; mu-law compression
    # actually reduces the noise and increases the dynamic range. The input is always
    # mu-law encoded here; scalar_input only determines whether the model uses a one-hot
    # encoding for every data point of the input waveform or a single float per sample.
    if hparams.input_type == 'mulaw-quantize':
        # [0, quantize_channels)
        out = audio.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = audio.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif hparams.input_type == 'mulaw':
        # [-1, 1]
        out = audio.mulaw(wav, hparams.quantize_channels)
        constant_values = audio.mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:  # raw
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav, shape (num_mels, mel_frames), e.g. (80, 801)
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        # hparams.max_mel_frames = 1000, hparams.clip_mels_length = True
        return None

    # Compute the linear scale spectrogram from the wav, e.g. shape (1025, 801)
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # Sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:  # hparams.use_lws = False
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.fft_size if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.fft_size, audio.get_hop_size(hparams))  # 1024 == 2048 // 2 == fft_size // 2

        # Reflect pad audio signal (just like it's done in librosa to avoid frame inconsistency),
        # e.g. (240001,) -> (242049,)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # Time resolution adjustment:
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample, e.g. trim to 240300 = 801 * 300 with hop size 300
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrograms and audio to disk
    wav_id = os.path.splitext(os.path.basename(wav_path))[0]  # file name without extension
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    linear_filename = '{}-linear.npy'.format(wav_id)
    npz_filename = '{}.npz'.format(wav_id)
    npz_flag = True
    if npz_flag:
        # Use the same keys as the Tacotron code for compatibility.
        data = {
            'audio': out.astype(out_dtype),
            'mel': mel_spectrogram.T,
            'linear': linear_spectrogram.T,
            'time_steps': time_steps,
            'mel_frames': mel_frames,
            'text': text,
            'tokens': text_to_sequence(text),  # "1" (the eos token '~') is appended at the end
            'loss_coeff': 1  # for Tacotron
        }
        # Save several arrays into a single uncompressed *.npz file
        np.savez(os.path.join(out_dir, npz_filename), **data, allow_pickle=False)
    else:
        np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text, npz_filename)
def _process_utterance(mfcc_dir, wav_dir, index, wav_path, hparams, mode):
    """
    Preprocesses a single utterance wav file.

    This writes the mfcc features to disk and returns a list of tuples to write
    to the train.txt file.

    Args:
        - mfcc_dir: the directory to write the mfcc features into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the output filenames
        - wav_path: path to the audio file containing the speech input
        - hparams: hyper parameters
        - mode: one of "train", "post_train" or "synth"

    Returns:
        - A list of tuples: (audio_filename, mfcc_filename, mfcc_filename, speaker_id, time_steps, mfcc_frames)
    """
    try:
        # Load the audio as a numpy array
        wav_full = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav_full = audio.trim_silence(wav_full, hparams)

    # Preprocess audio & extract MFCC (mfcc + delta + delta-delta)
    sample_idx = 0
    sample_metadata = []

    if (mode == "train") or (mode == "post_train"):
        # FOR UNIT SEARCH: slice each audio by sample_size and add a same-size slice from the end
        if wav_full.shape[0] >= hparams.sample_size:
            n_slice = int(np.floor(wav_full.shape[0] / hparams.sample_size))
            samples = wav_full[:n_slice * hparams.sample_size].reshape((n_slice, hparams.sample_size))
            if wav_full.shape[0] % hparams.sample_size != 0:
                last_slice = wav_full[::-1][:hparams.sample_size]  # last sample_size samples (note: this slice is time-reversed)
                samples = np.vstack((samples, last_slice))
        else:
            samples = [wav_full]
    else:
        samples = [wav_full]

    for wav in samples:
        # Pre-emphasize
        preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)

        # Rescale wav
        if hparams.rescale:
            wav = wav / np.abs(wav).max() * hparams.rescaling_max
            preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

        # Assert all audio is in [-1, 1]
        if (wav > 1.).any() or (wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))
        if (preem_wav > 1.).any() or (preem_wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))

        # Mu-law quantize
        if is_mulaw_quantize(hparams.input_type):
            # [0, quantize_channels)
            out = mulaw_quantize(wav, hparams.quantize_channels)
            constant_values = mulaw_quantize(0, hparams.quantize_channels)
            out_dtype = np.int16
        elif is_mulaw(hparams.input_type):
            # [-1, 1]
            out = mulaw(wav, hparams.quantize_channels)
            constant_values = mulaw(0., hparams.quantize_channels)
            out_dtype = np.float32
        else:
            # [-1, 1]
            out = wav
            constant_values = 0.
            out_dtype = np.float32

        # Compute mfcc
        mfcc = audio.mfcc(wav, hparams)
        mfcc_frames = mfcc.shape[0]

        if mfcc_frames > hparams.max_mel_frames and hparams.clip_mels_length:
            return None

        # Ensure time resolution adjustment between audio and features
        l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)
        assert len(out) >= mfcc_frames * audio.get_hop_size(hparams)

        # Time resolution adjustment:
        # ensure length of raw audio is a multiple of hop size (and of the VQ-VAE downsampling factor)
        out = out[:int(np.ceil(mfcc_frames / hparams.vqvae_down_freq) * hparams.vqvae_down_freq * audio.get_hop_size(hparams))]
        assert len(out) % audio.get_hop_size(hparams) == 0
        time_steps = len(out)

        # Write the features and audio to disk
        audio_filename = os.path.join(wav_dir, 'audio-{}-{}.npy'.format(index, sample_idx))
        mfcc_filename = os.path.join(mfcc_dir, 'mfcc-{}-{}.npy'.format(index, sample_idx))
        np.save(audio_filename, out.astype(out_dtype), allow_pickle=False)
        np.save(mfcc_filename, mfcc, allow_pickle=False)

        # Global condition features
        if hparams.gin_channels > 0:
            if (mode == "train") or (mode == "post_train"):
                speaker_id = hparams.speakers.index(index[:4])
            elif mode == "synth":
                speaker_id = 0
        else:
            speaker_id = '<no_g>'

        sample_metadata.append((audio_filename, mfcc_filename, mfcc_filename, speaker_id, time_steps, mfcc_frames))
        sample_idx += 1

    return sample_metadata
def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft;
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'bznsyp-audio-%05d.npy' % index
    mel_filename = 'bznsyp-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)
def assert_ready_for_upsampling(x, c):
    assert len(x) % len(c) == 0 and len(x) // len(c) == audio.get_hop_size()
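# A quick illustration of the invariant checked above, assuming a hop size of 256
# (the value of audio.get_hop_size() depends on the hyper parameters):
import numpy as np

hop_size = 256                 # assumed hop size
c_demo = np.zeros((10, 80))    # 10 conditioning frames, 80 mel channels
x_demo = np.zeros(10 * hop_size)  # matching waveform: exactly 10 * hop_size samples
assert len(x_demo) % len(c_demo) == 0 and len(x_demo) // len(c_demo) == hop_size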
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array. Resampled if needed
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    # TODO
    if exists(lab_path) and False:
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=20)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=20)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft;
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'cmu_arctic-audio-%05d.npy' % index
    mel_filename = 'cmu_arctic-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text, speaker_id)
def eval_model(global_step, writer, device, model, y, c, g, input_lengths, eval_dir, ema=None):
    if ema is not None:
        print("Using averaged model for evaluation")
        model = clone_as_averaged_model(device, model, ema)
        model.make_generation_fast_()

    model.eval()
    idx = np.random.randint(0, len(y))
    length = input_lengths[idx].data.cpu().item()

    # (T,)
    y_target = y[idx].view(-1).data.cpu().numpy()[:length]

    if c is not None:
        if hparams.upsample_conditional_features:
            c = c[idx, :, :length // audio.get_hop_size()].unsqueeze(0)
        else:
            c = c[idx, :, :length].unsqueeze(0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))
    if g is not None:
        # TODO: test
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))

    # Dummy silence
    if is_mulaw_quantize(hparams.input_type):
        initial_value = P.mulaw_quantize(0, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        initial_value = P.mulaw(0.0, hparams.quantize_channels)
    else:
        initial_value = 0.0
    print("Initial value:", initial_value)

    # (C,)
    if is_mulaw_quantize(hparams.input_type):
        initial_input = np_utils.to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = torch.from_numpy(initial_input).view(1, 1, hparams.quantize_channels)
    else:
        initial_input = torch.zeros(1, 1, 1).fill_(initial_value)
    initial_input = initial_input.to(device)

    # Run the model in fast eval mode
    with torch.no_grad():
        y_hat = model.incremental_forward(
            initial_input, c=c, g=g, T=length, softmax=True, quantize=True,
            tqdm=tqdm, log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
        y_target = P.inv_mulaw_quantize(y_target, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels)
        y_target = P.inv_mulaw(y_target, hparams.quantize_channels)
    else:
        y_hat = y_hat.view(-1).cpu().data.numpy()

    # Save audio
    os.makedirs(eval_dir, exist_ok=True)
    path = join(eval_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)

    path = join(eval_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y_target, sr=hparams.sample_rate)

    # Save figure
    path = join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
    save_waveplot(path, y_hat, y_target)
def collate_fn(batch):
    """Create batch

    Args:
        batch(tuple): List of tuples
            - x[0] (ndarray,int) : list of (T,)
            - x[1] (ndarray,int) : list of (T, D)
            - x[2] (ndarray,int) : list of (1,), speaker id
    Returns:
        tuple: Tuple of batch
            - x (FloatTensor) : Network inputs (B, C, T)
            - y (LongTensor)  : Network targets (B, T, 1)
    """
    local_conditioning = len(batch[0]) >= 2 and hparams.cin_channels > 0
    global_conditioning = len(batch[0]) >= 3 and hparams.gin_channels > 0

    if hparams.max_time_sec is not None:
        max_time_steps = int(hparams.max_time_sec * hparams.sample_rate)
    elif hparams.max_time_steps is not None:
        max_time_steps = hparams.max_time_steps
    else:
        max_time_steps = None

    # Time resolution adjustment
    if local_conditioning:
        new_batch = []
        for idx in range(len(batch)):
            x, c, g = batch[idx]
            if hparams.upsample_conditional_features:
                assert_ready_for_upsampling(x, c)
                if max_time_steps is not None:
                    max_steps = ensure_divisible(max_time_steps, audio.get_hop_size(), True)
                    if len(x) > max_steps:
                        max_time_frames = max_steps // audio.get_hop_size()
                        s = np.random.randint(0, len(c) - max_time_frames)
                        ts = s * audio.get_hop_size()
                        x = x[ts:ts + audio.get_hop_size() * max_time_frames]
                        c = c[s:s + max_time_frames, :]
                        assert_ready_for_upsampling(x, c)
            else:
                x, c = audio.adjust_time_resolution(x, c)
                if max_time_steps is not None and len(x) > max_time_steps:
                    s = np.random.randint(0, len(x) - max_time_steps)
                    x, c = x[s:s + max_time_steps], c[s:s + max_time_steps, :]
                assert len(x) == len(c)
            new_batch.append((x, c, g))
        batch = new_batch
    else:
        new_batch = []
        for idx in range(len(batch)):
            x, c, g = batch[idx]
            x = audio.trim(x)
            if max_time_steps is not None and len(x) > max_time_steps:
                s = np.random.randint(0, len(x) - max_time_steps)
                if local_conditioning:
                    x, c = x[s:s + max_time_steps], c[s:s + max_time_steps, :]
                else:
                    x = x[s:s + max_time_steps]
            new_batch.append((x, c, g))
        batch = new_batch

    # Lengths
    input_lengths = [len(x[0]) for x in batch]
    max_input_len = max(input_lengths)

    # (B, T, C)
    # pad for time-axis
    if is_mulaw_quantize(hparams.input_type):
        x_batch = np.array([
            _pad_2d(np_utils.to_categorical(x[0], num_classes=hparams.quantize_channels),
                    max_input_len) for x in batch
        ], dtype=np.float32)
    else:
        x_batch = np.array([_pad_2d(x[0].reshape(-1, 1), max_input_len) for x in batch],
                           dtype=np.float32)
    assert len(x_batch.shape) == 3

    # (B, T)
    if is_mulaw_quantize(hparams.input_type):
        y_batch = np.array([_pad(x[0], max_input_len) for x in batch], dtype=np.int)
    else:
        y_batch = np.array([_pad(x[0], max_input_len) for x in batch], dtype=np.float32)
    assert len(y_batch.shape) == 2

    # (B, T, D)
    if local_conditioning:
        max_len = max([len(x[1]) for x in batch])
        c_batch = np.array([_pad_2d(x[1], max_len) for x in batch], dtype=np.float32)
        assert len(c_batch.shape) == 3
        # (B x C x T)
        c_batch = torch.FloatTensor(c_batch).transpose(1, 2).contiguous()
    else:
        c_batch = None

    if global_conditioning:
        g_batch = torch.LongTensor([x[2] for x in batch])
    else:
        g_batch = None

    # Convert to channel first i.e., (B, C, T)
    x_batch = torch.FloatTensor(x_batch).transpose(1, 2).contiguous()

    # Add extra axis
    if is_mulaw_quantize(hparams.input_type):
        y_batch = torch.LongTensor(y_batch).unsqueeze(-1).contiguous()
    else:
        y_batch = torch.FloatTensor(y_batch).unsqueeze(-1).contiguous()

    input_lengths = torch.LongTensor(input_lengths)

    return x_batch, y_batch, c_batch, g_batch, input_lengths
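# The `_pad` and `_pad_2d` helpers used by collate_fn above (and the variant below) are
# not shown in this section. A minimal sketch of what they are assumed to do: pad a 1-D
# target and a (T, D) feature matrix along the time axis to a common length. The actual
# repository versions may differ.
import numpy as np

def _pad(seq, max_len, constant_values=0):
    """Pad a 1-D array along time to max_len."""
    return np.pad(seq, (0, max_len - len(seq)),
                  mode='constant', constant_values=constant_values)

def _pad_2d(x, max_len, constant_values=0):
    """Pad a (T, D) array along the time axis to max_len."""
    return np.pad(x, ((0, max_len - len(x)), (0, 0)),
                  mode='constant', constant_values=constant_values)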
def collate_fn(batch):
    """Create batch

    Args:
        batch(tuple): List of tuples
            - x[0] (ndarray,int) : list of (T,)
            - x[1] (ndarray,int) : list of (T, D)
            - x[2] (ndarray,int) : list of (1,), speaker id
    Returns:
        tuple: Tuple of batch
            - x (FloatTensor) : Network inputs (B, C, T)
            - y (LongTensor)  : Network targets (B, T, 1)
    """
    local_conditioning = len(batch[0]) >= 2 and hparams.cin_channels > 0
    global_conditioning = len(batch[0]) >= 3 and hparams.file_channel > 0

    if hparams.max_time_sec is not None:
        max_time_steps = int(hparams.max_time_sec * hparams.sample_rate)
    elif hparams.max_time_steps is not None:
        max_time_steps = hparams.max_time_steps
    else:
        max_time_steps = None

    max_time_second = max_time_steps / hparams.sample_rate
    use_image_num = int(np.floor(max_time_second / (0.04 * hparams.image_hope_size)))

    # Time resolution adjustment
    video_block = []
    flow_block = []
    if local_conditioning:
        new_batch = []
        for idx in range(len(batch)):
            x, c, video, flow, start, g, path = batch[idx]
            if hparams.upsample_conditional_features:
                assert_ready_for_upsampling(x, c)
                if max_time_steps is not None:
                    max_steps = ensure_divisible(max_time_steps, audio.get_hop_size(), True)
                    if len(x) > max_steps:
                        for ln in range(hparams.load_num):
                            mel_start = 3 + 4 * start[ln]
                            c1 = c[mel_start:mel_start + use_image_num * 4]
                            x1 = x[mel_start * hparams.hop_size:(mel_start + use_image_num * 4) * hparams.hop_size]
                            new_batch.append((x1, c1, g, os.path.join(path, str(start[ln]))))
            video_block.append(torch.FloatTensor(video))
            flow_block.append(torch.FloatTensor(flow))
        batch = new_batch

    # Lengths
    input_lengths = [len(x[0]) for x in batch]
    max_input_len = max(input_lengths)

    # (B, T, C)
    # pad for time-axis
    if is_mulaw_quantize(hparams.input_type):
        x_batch = np.array([
            _pad_2d(np_utils.to_categorical(x[0], num_classes=hparams.quantize_channels),
                    max_input_len) for x in batch
        ], dtype=np.float32)
    else:
        x_batch = np.array([_pad_2d(x[0].reshape(-1, 1), max_input_len) for x in batch],
                           dtype=np.float32)
    assert len(x_batch.shape) == 3

    # (B, T)
    if is_mulaw_quantize(hparams.input_type):
        y_batch = np.array([_pad(x[0], max_input_len) for x in batch], dtype=np.int)
    else:
        y_batch = np.array([_pad(x[0], max_input_len) for x in batch], dtype=np.float32)
    assert len(y_batch.shape) == 2

    # (B, T, D)
    if local_conditioning:
        max_len = max([len(x[1]) for x in batch])
        c_batch = np.array([_pad_2d(x[1], max_len) for x in batch], dtype=np.float32)
        assert len(c_batch.shape) == 3
        # (B x C x T)
        c_batch = torch.FloatTensor(c_batch).transpose(1, 2).contiguous()
    else:
        c_batch = None

    if global_conditioning:
        g_batch = torch.LongTensor([x[2] for x in batch])
    else:
        g_batch = None

    path_batch = list(x[3] for x in batch)
    video_batch = torch.cat(video_block, 0)
    flow_batch = torch.cat(flow_block, 0)

    # Convert to channel first i.e., (B, C, T)
    x_batch = torch.FloatTensor(x_batch).transpose(1, 2).contiguous()

    # Add extra axis
    if is_mulaw_quantize(hparams.input_type):
        y_batch = torch.LongTensor(y_batch).unsqueeze(-1).contiguous()
    else:
        y_batch = torch.FloatTensor(y_batch).unsqueeze(-1).contiguous()

    input_lengths = torch.LongTensor(input_lengths)

    return video_batch, flow_batch, c_batch, x_batch, y_batch, g_batch, input_lengths, path_batch
def _assert_ready_for_upsample(self, x, c):
    assert len(x) % len(c) == 0 and len(x) // len(c) == audio.get_hop_size()
def _process_utterance(out_dir, index, audio_filepath, text):
    # Load the audio to a numpy array:
    wav_whole = audio.load_wav(audio_filepath)

    if hparams.rescaling:
        wav_whole = wav_whole / np.abs(wav_whole).max() * hparams.rescaling_max

    # This is a librivox source, so the audio files are going to be v. long
    # compared to a typical 'utterance'; so split the wav into chunks
    tup_results = []

    n_samples = int(8.0 * hparams.sample_rate)  # All 8 second utterances
    n_chunks = wav_whole.shape[0] // n_samples

    for chunk_idx in range(n_chunks):
        chunk_start, chunk_end = chunk_idx * n_samples, (chunk_idx + 1) * n_samples
        if chunk_idx == n_chunks - 1:
            # This is the last chunk - allow it to extend to the end of the file
            chunk_end = None
        wav = wav_whole[chunk_start:chunk_end]

        # Mu-law quantize
        if is_mulaw_quantize(hparams.input_type):
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels)

            # Trim silences
            start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
            wav = wav[start:end]
            out = out[start:end]
            constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
            out_dtype = np.int16
        elif is_mulaw(hparams.input_type):
            # [-1, 1]
            out = P.mulaw(wav, hparams.quantize_channels)
            constant_values = P.mulaw(0.0, hparams.quantize_channels)
            out_dtype = np.float32
        else:
            # [-1, 1]
            out = wav
            constant_values = 0.0
            out_dtype = np.float32

        # Compute a mel-scale spectrogram from the trimmed wav:
        # (N, D)
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

        # lws pads zeros internally before performing stft;
        # this is needed to adjust time resolution between audio and mel-spectrogram
        l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

        # zero pad for quantized signal
        out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
        N = mel_spectrogram.shape[0]
        assert len(out) >= N * audio.get_hop_size()

        # time resolution adjustment
        # ensure length of raw audio is multiple of hop_size so that we can use
        # transposed convolution to upsample
        out = out[:N * audio.get_hop_size()]
        assert len(out) % audio.get_hop_size() == 0
        timesteps = len(out)

        # Write the spectrograms to disk:
        audio_filename = 'librivox-audio-%04d-%05d.npy' % (index, chunk_idx,)
        mel_filename = 'librivox-mel-%04d-%05d.npy' % (index, chunk_idx,)
        text_idx = '%s - %05d' % (text, chunk_idx,)
        np.save(os.path.join(out_dir, audio_filename),
                out.astype(out_dtype), allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename),
                mel_spectrogram.astype(np.float32), allow_pickle=False)

        # Add results tuple describing this training example:
        tup_results.append((audio_filename, mel_filename, timesteps, text_idx))

    # Return all the audio results tuples (unpack in caller)
    return tup_results
def _process_utterance(out_dir, index, wav_path, text, silence_threshold, fft_size):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel scale spectrogram and preprocessed audio to disk and
    returns a tuple to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file

    Returns:
        An (audio_path, mel_path, text, timesteps) tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hp.rescaling:
        wav = wav / np.abs(wav).max() * hp.rescaling_max

    if hp.input_type != "raw":
        # Mu-law quantize
        out = P.mulaw_quantize(wav)

        # Trim silences
        start, end = audio.start_and_end_indices(out, silence_threshold)
        out = out[start:end]
        wav = wav[start:end]
        constant_value = P.mulaw_quantize(0, 256)
        out_dtype = np.int16
    else:
        out = wav
        constant_value = 0.
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft;
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_value)
    mel_len = mel_spectrogram.shape[0]
    assert len(out) >= mel_len * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:mel_len * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    timesteps = len(out)

    wav_id = wav_path.split('/')[-1].split('.')[0]

    # Write the spectrograms to disk:
    audio_path = os.path.join(out_dir, '{}-audio.npy'.format(wav_id))
    mel_path = os.path.join(out_dir, '{}-mel.npy'.format(wav_id))
    np.save(audio_path, out.astype(out_dtype), allow_pickle=False)
    np.save(mel_path, mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return os.path.abspath(audio_path), os.path.abspath(mel_path), text, timesteps
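# The tuples returned by the _process_utterance variants above are typically collected
# and written to train.txt. A minimal sketch of such a writer, assuming pipe-separated
# fields; the exact delimiter and field order are repository-specific.
import os

def write_metadata(metadata, out_dir):
    """Write one pipe-separated line per training example (sketch only)."""
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for m in metadata:
            if m is not None:  # _process_utterance returns None for skipped files
                f.write('|'.join([str(x) for x in m]) + '\n')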