def save_states(global_step, mel_outputs, linear_outputs, attn, y,
                checkpoint_dir=None):
    idx = 1  # idx = np.random.randint(0, len(mel_outputs))

    # Alignment
    path = os.path.join(checkpoint_dir, "step{}_alignment.png".format(global_step))
    alignment = attn[idx].cpu().data.numpy()
    # alignment = attn[idx].cpu().data.numpy()[:, :input_length]
    plot_alignment(alignment.T, path, info="tacotron, step={}".format(global_step))

    # Predicted spectrogram
    path = os.path.join(checkpoint_dir, "step{}_predicted_spectrogram.png".format(global_step))
    linear_output = linear_outputs[idx].cpu().data.numpy()
    plot_spectrogram(linear_output, path)

    # Predicted audio signal
    signal = audio.inv_spectrogram(linear_output.T)
    path = os.path.join(checkpoint_dir, "step{}_predicted.wav".format(global_step))
    audio.save_wav(signal, path)

    # Target spectrogram
    path = os.path.join(checkpoint_dir, "step{}_target_spectrogram.png".format(global_step))
    linear_output = y[idx].cpu().data.numpy()
    plot_spectrogram(linear_output, path)
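# The utils.plot helpers used throughout these snippets are not shown. A
# minimal matplotlib stand-in might look like the following sketch; the exact
# signatures (info/title/target_spectrogram keywords) vary between repos, so
# treat this as an assumption rather than the real implementation.
import matplotlib
matplotlib.use('Agg')  # headless backend for training servers
import matplotlib.pyplot as plt

def plot_alignment_sketch(alignment, path, info=None):
    # alignment: (encoder_steps, decoder_steps) attention matrix.
    fig, ax = plt.subplots()
    im = ax.imshow(alignment, aspect='auto', origin='lower', interpolation='none')
    fig.colorbar(im, ax=ax)
    ax.set_xlabel('Decoder timestep' + ('\n' + info if info else ''))
    ax.set_ylabel('Encoder timestep')
    fig.savefig(path, format='png')
    plt.close(fig)

def plot_spectrogram_sketch(spectrogram, path, title=None, target_spectrogram=None):
    # spectrogram: (frames, bins); the optional target is drawn in a second panel.
    n_panels = 2 if target_spectrogram is not None else 1
    fig, axes = plt.subplots(n_panels, 1, squeeze=False)
    axes[0][0].imshow(spectrogram.T, aspect='auto', origin='lower', interpolation='none')
    if target_spectrogram is not None:
        axes[1][0].imshow(target_spectrogram.T, aspect='auto', origin='lower', interpolation='none')
    if title:
        fig.suptitle(title)
    fig.savefig(path, format='png')
    plt.close(fig)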
def synthesize(self, texts, speaker):
    from utils.plot import plot_alignment, plot_spectrogram

    inputs, input_lengths = self.process_sentence(texts)
    # print(inputs, input_lengths)
    speaker = np.array(speaker)

    # Tacotron-2 inference.
    mel_outputs, post_mel_outputs, stop_outputs, alignment_historys = self.model.inference(
        inputs,
        input_lengths,
        speaker_ids=speaker,
        use_window_mask=False,
        win_front=20,
        win_back=20,
        maximum_iterations=1000,
    )
    plot_spectrogram(post_mel_outputs[0].numpy(), './mel.png', 'inference')

    if self.vocoder_type == 'GL':
        target_lengths = self._get_output_lengths(stop_outputs)
        # Take off the batch-wise padding.
        mels = [mel[:target_length, :].numpy()
                for mel, target_length in zip(post_mel_outputs, target_lengths)]
        wavs = [self.vocoder(mel.T, self.hp) for mel in mels]
    elif self.vocoder_type == 'Multi':
        if post_mel_outputs.shape[1] % self.vocoder_window != 0:
            # Trim trailing frames so the length is a multiple of the vocoder window.
            post_mel_outputs = post_mel_outputs[:, :-int(post_mel_outputs.shape[1] % self.vocoder_window)]
        _, wavs = self.vocoder(post_mel_outputs)
    else:
        _, wavs = self.vocoder(post_mel_outputs)
    return wavs
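# A hypothetical driver for synthesize() above; the `synthesizer` instance and
# the 22050 Hz sample rate are assumptions, not part of the snippet.
import soundfile as sf

wavs = synthesizer.synthesize(["Hello world."], speaker=[0])
for i, wav in enumerate(wavs):
    sf.write('out-{}.wav'.format(i), wav, 22050)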
def plot_result(self, pred, target):
    os.makedirs(os.path.join(self.config['outdir'], 'plots'), exist_ok=True)
    plot_spectrogram(
        pred,
        os.path.join(self.config['outdir'], 'plots', 'mel-before-{}.png'.format(self.steps)),
        target_spectrogram=target)
def save_current_model(args, checkpoint_path, global_step, hparams, loss, model,
                       plot_dir, saver, sess, step, wav_dir):
    # Save model and current global step
    saver.save(sess, checkpoint_path, global_step=global_step)

    log('\nSaving alignment, mel-spectrograms and Griffin-Lim inverted waveform..')
    input_seq, mel_prediction, linear_prediction, attention_mask_sample, \
        targets_mel, target_length, linear_target = sess.run([
            model.inputs[0],
            model.post_net_predictions[0],
            model.mag_pred[0],
            model.alignments[0],
            model.targets_mel[0],
            model.targets_length[0],
            model.targets_mag[0],
        ])
    alignments, alignment_titles = get_alignments(attention_mask_sample)

    # Save Griffin-Lim inverted wav for debugging (linear -> wav)
    wav = audio.inv_linear_spectrogram(linear_prediction.T, hparams)
    audio.save_wav(wav, os.path.join(wav_dir, '{}-linear.wav'.format(step)),
                   sr=hparams.sample_rate)

    # Save real and predicted linear-spectrogram plots to disk (control purposes)
    plot.plot_spectrogram(
        linear_prediction,
        os.path.join(plot_dir, '{}-linear-spectrogram.png'.format(step)),
        title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss),
        target_spectrogram=linear_target,
        max_len=target_length,
        auto_aspect=True)

    # Save Griffin-Lim inverted wav for debugging (mel -> wav)
    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
    audio.save_wav(wav, os.path.join(wav_dir, '{}-mel.wav'.format(step)),
                   sr=hparams.sample_rate)

    # Save alignment plots to disk (control purposes)
    for i in range(len(alignments)):
        plot.plot_alignment(
            alignments[i],
            os.path.join(plot_dir, '{}_{}-align.png'.format(step, alignment_titles[i])),
            title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss),
            max_len=target_length // hparams.reduction_factor)

    # Save real and predicted mel-spectrogram plots to disk (control purposes)
    plot.plot_spectrogram(
        mel_prediction,
        os.path.join(plot_dir, '{}-mel-spectrogram.png'.format(step)),
        title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss),
        target_spectrogram=targets_mel,
        max_len=target_length)

    log('Input at step {}: {}'.format(step, sequence_to_text(input_seq)))
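# get_alignments() above is not shown in this section. A plausible sketch,
# assuming it splits a stacked multi-head attention tensor into individually
# plottable matrices with titles:
import numpy as np

def get_alignments_sketch(attention_mask):
    attention_mask = np.asarray(attention_mask)
    if attention_mask.ndim == 2:
        # Single attention matrix: (decoder_steps, encoder_steps)
        return [attention_mask], ['align']
    # Leading axis indexes heads (or layers): (heads, decoder_steps, encoder_steps)
    heads = [attention_mask[h] for h in range(attention_mask.shape[0])]
    titles = ['head_{}'.format(h) for h in range(len(heads))]
    return heads, titles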
def plot_result(self, mel_pred, mel_target, alig):
    os.makedirs(os.path.join(self.config['outdir'], 'plots'), exist_ok=True)
    plot_spectrogram(
        mel_pred,
        os.path.join(self.config['outdir'], 'plots', 'mel-before-{}.png'.format(self.steps)),
        target_spectrogram=mel_target)
    plot_alignment(
        alig,
        os.path.join(self.config['outdir'], 'plots', 'alig-{}.png'.format(self.steps)))
def train(log_dir, args):
    save_dir = os.path.join(log_dir, 'pretrained/')
    checkpoint_path = os.path.join(save_dir, 'model.ckpt')
    input_path = os.path.join(args.base_dir, args.input)
    plot_dir = os.path.join(log_dir, 'plots')
    os.makedirs(plot_dir, exist_ok=True)
    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))
    log(hparams_debug_string())

    # Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, input_path, hparams)

    # Set up model
    step_count = 0
    try:
        # Simple text file keeping count of the global step
        with open(os.path.join(log_dir, 'step_counter.txt'), 'r') as file:
            step_count = int(file.read())
    except Exception:
        print('No step_counter file found, assuming there is no saved checkpoint')

    global_step = tf.Variable(step_count, name='global_step', trainable=False)
    with tf.variable_scope('model') as scope:
        model = create_model(args.model, hparams)
        model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets,
                         feeder.token_targets)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = add_stats(model)

    # Bookkeeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5)

    # Allocate GPU memory as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            # Restore the saved model if the user requested it (default: True)
            if args.restore:
                checkpoint_state = None
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e))
                if checkpoint_state and checkpoint_state.model_checkpoint_path:
                    log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path))
                    saver.restore(sess, checkpoint_state.model_checkpoint_path)
                else:
                    log('No model to load at {}'.format(save_dir))
            else:
                log('Starting new training!')

            # Start the feeder
            feeder.start_in_session(sess)

            # Training loop
            while not coord.should_stop():
                start_time = time.time()
                step, loss, opt = sess.run([global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                    step, time_window.average, loss, loss_window.average)
                log(message, end='\r')

                if loss > 100 or np.isnan(loss):
                    log('Loss exploded to {:.5f} at step {}'.format(loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step: {}'.format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.checkpoint_interval == 0:
                    with open(os.path.join(log_dir, 'step_counter.txt'), 'w') as file:
                        file.write(str(step))
                    log('Saving checkpoint to: {}-{}'.format(checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)

                    # Unlike the original Tacotron, we do not save audio here
                    # because we have yet to use WaveNet as the vocoder.
                    log('Saving alignment and mel-spectrograms..')
                    input_seq, prediction, alignment, target = sess.run([
                        model.inputs[0],
                        model.mel_outputs[0],
                        model.alignments[0],
                        model.mel_targets[0],
                    ])

                    # Save the predicted spectrogram to disk (for plotting and manual evaluation)
                    mel_filename = 'ljspeech-mel-prediction-step-{}.npy'.format(step)
                    np.save(os.path.join(log_dir, mel_filename), prediction,
                            allow_pickle=False)

                    # Save the alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(plot_dir, 'step-{}-align.png'.format(step)),
                        info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss))
                    # Save the real mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        target,
                        os.path.join(plot_dir, 'step-{}-real-mel-spectrogram.png'.format(step)),
                        info='{}, {}, step={}, Real'.format(args.model, time_string(), step))
                    # Save the predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        prediction,
                        os.path.join(plot_dir, 'step-{}-pred-mel-spectrogram.png'.format(step)),
                        info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss))
                    log('Input at step {}: {}'.format(step, sequence_to_text(input_seq)))

        except Exception as e:
            log('Exiting due to exception: {}'.format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
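# ValueWindow used in the training loop above is not defined in this section.
# A minimal sketch of the usual contract (append values, read a running
# average over the most recent window):
class ValueWindow:
    def __init__(self, window_size=100):
        self._window_size = window_size
        self._values = []

    def append(self, x):
        # Keep only the most recent window_size values.
        self._values = self._values[-(self._window_size - 1):] + [x]

    @property
    def average(self):
        return sum(self._values) / max(len(self._values), 1)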
def main():
    # CPU is faster here; targeting the GPU raises an error, and omitting
    # tf.device entirely is slower.
    with tf.device('/cpu:0'):
        config = get_arguments()
        started_datestring = "{0:%Y-%m-%dT%H-%M-%S}".format(datetime.now())
        logdir = os.path.join(config.logdir, 'generate', started_datestring)
        print('logdir0-------------' + logdir)
        if not os.path.exists(logdir):
            os.makedirs(logdir)

        load_hparams(hparams, config.checkpoint_dir)

        sess = tf.Session()
        scalar_input = hparams.scalar_input
        # During training, global_condition_cardinality is inferred by the
        # AudioReader; at generation time it must be supplied explicitly.
        net = WaveNetModel(
            batch_size=config.batch_size,
            dilations=hparams.dilations,
            filter_width=hparams.filter_width,
            residual_channels=hparams.residual_channels,
            dilation_channels=hparams.dilation_channels,
            quantization_channels=hparams.quantization_channels,
            out_channels=hparams.out_channels,
            skip_channels=hparams.skip_channels,
            use_biases=hparams.use_biases,
            scalar_input=hparams.scalar_input,
            global_condition_channels=hparams.gc_channels,
            global_condition_cardinality=config.gc_cardinality,
            local_condition_channels=hparams.num_mels,
            upsample_factor=hparams.upsample_factor,
            legacy=hparams.legacy,
            residual_legacy=hparams.residual_legacy,
            train_mode=False)

        if scalar_input:
            samples = tf.placeholder(tf.float32, shape=[net.batch_size, None])
        else:
            # samples: mu-law encoded values, before one-hot conversion.
            # Shape (batch_size, length).
            samples = tf.placeholder(tf.int32, shape=[net.batch_size, None])

        # The local condition is (N, T, num_mels), but incremental generation
        # feeds one frame at a time: (N, 1, num_mels), squeezed to (N, num_mels).
        upsampled_local_condition = tf.placeholder(
            tf.float32, shape=[net.batch_size, hparams.num_mels])

        # Fast WaveNet generation (arXiv:1611.09482).
        next_sample = net.predict_proba_incremental(
            samples, upsampled_local_condition, [config.gc_id] * net.batch_size)

        # Build the upsampled local-condition data to feed into the
        # upsampled_local_condition placeholder.
        print('logdir0-------------' + logdir)
        mel_input = np.load(config.mel)
        sample_size = mel_input.shape[0] * hparams.hop_size
        mel_input = np.tile(mel_input, (config.batch_size, 1, 1))
        with tf.variable_scope('wavenet', reuse=tf.AUTO_REUSE):
            upsampled_local_condition_data = net.create_upsample(
                mel_input, upsample_type=hparams.upsample_type)

        var_list = [var for var in tf.global_variables() if 'queue' not in var.name]
        saver = tf.train.Saver(var_list)
        print('Restoring model from {}'.format(config.checkpoint_dir))
        load(saver, sess, config.checkpoint_dir)

        # Without this, the variables keep the values restored from the checkpoint.
        init_op = tf.group(tf.initialize_all_variables(), net.queue_initializer)
        sess.run(init_op)

        quantization_channels = hparams.quantization_channels
        if config.wav_seed:
            # If wav_seed is shorter than the receptive field, shouldn't it at
            # least be padded? As written it just returns the short seed, so a
            # seed that is too short causes an error.
            seed = create_seed(config.wav_seed, hparams.sample_rate,
                               quantization_channels, net.receptive_field,
                               scalar_input)  # mu-law encoded
            if scalar_input:
                waveform = seed.tolist()
            else:
                waveform = sess.run(seed).tolist()  # [116, 114, 120, 121, 127, ...]

            print('Priming generation...')
            # The very last sample is fed in the first iteration of the
            # generation loop below.
            for i, x in enumerate(waveform[-net.receptive_field:-1]):
                if i % 100 == 0:
                    print('Priming sample {}/{}'.format(i, net.receptive_field), end='\r')
                sess.run(
                    next_sample,
                    feed_dict={
                        samples: np.array([x] * net.batch_size).reshape(net.batch_size, 1),
                        upsampled_local_condition: np.zeros([net.batch_size, hparams.num_mels])
                    })
            print('Done.')
            waveform = np.array([waveform[-net.receptive_field:]] * net.batch_size)
        else:
            # Silence with a single random sample at the end.
            if scalar_input:
                waveform = [0.0] * (net.receptive_field - 1)
                waveform = np.array(waveform * net.batch_size).reshape(net.batch_size, -1)
                # Append one random number in [-1, 1] to each row.
                # waveform: shape (batch_size, net.receptive_field)
                waveform = np.concatenate(
                    [waveform,
                     2 * np.random.rand(net.batch_size).reshape(net.batch_size, -1) - 1],
                    axis=-1)
            else:
                # Build receptive_field - 1 samples, then append one random sample below.
                waveform = [quantization_channels / 2] * (net.receptive_field - 1)
                waveform = np.array(waveform * net.batch_size).reshape(net.batch_size, -1)
                # Before one-hot conversion. Shape (batch_size, 5117).
                waveform = np.concatenate(
                    [waveform,
                     np.random.randint(quantization_channels,
                                       size=net.batch_size).reshape(net.batch_size, -1)],
                    axis=-1)

        start_time = time.time()
        upsampled_local_condition_data = sess.run(upsampled_local_condition_data)
        last_sample_timestamp = datetime.now()
        # Loop sample_size times to generate the desired length.
        for step in range(sample_size):
            # Feed only the last generated sample. window: shape (N, 1).
            # Without fast generation the window would be the whole history,
            # e.g. [128.0, 128.0, ..., 128.0, 178, 185]; with fast generation
            # it is a single sample.
            window = waveform[:, -1:]

            # Run the WaveNet to predict the next sample.
            # samples are mu-law encoded and converted to one-hot internally
            # --> (batch_size, 256).
            prediction = sess.run(
                next_sample,
                feed_dict={
                    samples: window,
                    upsampled_local_condition: upsampled_local_condition_data[:, step, :]
                })

            if scalar_input:
                # Sampled from a logistic distribution, so there is randomness.
                sample = prediction
            else:
                # Scale the prediction distribution using the temperature.
                # When config.temperature == 1, this just renormalizes an
                # already-softmaxed distribution, so the values are unchanged.
                # Otherwise it divides the log-probabilities by the temperature
                # and rescales them to sum to 1.
                np.seterr(divide='ignore')
                scaled_prediction = np.log(prediction) / config.temperature
                scaled_prediction = (
                    scaled_prediction -
                    np.logaddexp.reduce(scaled_prediction, axis=-1, keepdims=True)
                )  # np.log(np.sum(np.exp(scaled_prediction)))
                scaled_prediction = np.exp(scaled_prediction)
                np.seterr(divide='warn')

                # Prediction distribution at temperature=1.0 should be
                # unchanged after scaling.
                if config.temperature == 1.0:
                    np.testing.assert_allclose(
                        prediction, scaled_prediction, atol=1e-5,
                        err_msg='Prediction scaling at temperature=1.0 is not working as intended.')

                # Because we sample rather than take the argmax, the same input
                # can yield different outputs.
                sample = [[np.random.choice(np.arange(quantization_channels), p=p)]
                          for p in scaled_prediction]  # choose one sample per batch

            waveform = np.concatenate([waveform, sample], axis=-1)  # window.shape: (N, 1)

            # Show progress only once per second.
            current_sample_timestamp = datetime.now()
            time_since_print = current_sample_timestamp - last_sample_timestamp
            if time_since_print.total_seconds() > 1.:
                duration = time.time() - start_time
                print('Sample {:<3d}/{:<3d}, ({:.3f} sec/step)'.format(
                    step + 1, sample_size, duration), end='\r')
                last_sample_timestamp = current_sample_timestamp

        # Introduce a newline to clear the carriage return from the progress.
        print()

        # Save the result as a wav file.
        if hparams.input_type == 'raw':
            out = waveform[:, net.receptive_field:]
        elif hparams.input_type == 'mulaw':
            decode = mu_law_decode(samples, quantization_channels, quantization=False)
            out = sess.run(decode, feed_dict={samples: waveform[:, net.receptive_field:]})
        else:  # 'mulaw-quantize'
            decode = mu_law_decode(samples, quantization_channels, quantization=True)
            out = sess.run(decode, feed_dict={samples: waveform[:, net.receptive_field:]})

        # Save wavs
        for i in range(net.batch_size):
            config.wav_out_path = logdir + '/test-{}.wav'.format(i)
            mel_path = config.wav_out_path.replace(".wav", ".png")
            gen_mel_spectrogram = audio.melspectrogram(out[i], hparams).astype(np.float32).T
            # Note: save_wav mutates out[i] in place.
            audio.save_wav(out[i], config.wav_out_path, hparams.sample_rate)
            plot.plot_spectrogram(gen_mel_spectrogram, mel_path,
                                  title='generated mel spectrogram',
                                  target_spectrogram=mel_input[i])
        print('Finished generating.')
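# The temperature scaling inside the sampling loop above, factored into a
# standalone NumPy helper for clarity (a sketch of the same math, not a
# drop-in replacement for the in-graph code):
import numpy as np

def apply_temperature(probs, temperature):
    """Rescale a softmax distribution by a sampling temperature and renormalize."""
    if temperature == 1.0:
        return probs  # renormalizing an already-normalized distribution is a no-op
    logits = np.log(probs) / temperature
    # Subtract logsumexp so the exponentials sum to 1 (numerically stable).
    logits -= np.logaddexp.reduce(logits, axis=-1, keepdims=True)
    return np.exp(logits)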
def _train_epoch(self, dataloader):
    self.model.train()
    running_loss = 0.0
    running_l1_loss = 0.0
    running_ssim_loss = 0.0
    running_drn_loss = 0.0

    pbar = tqdm(dataloader, unit="audios", unit_scale=dataloader.batch_size,
                disable=self.hparams.trainer.disable_progress_bar)
    for it, batch in enumerate(pbar, start=1):
        self.optimizer.zero_grad()

        mels, mlens, texts, tlens, durations = \
            batch['mels'], batch['mlens'].squeeze(1), batch['texts'].long(), \
            batch['tlens'].squeeze(1), batch['drns'].long()
        mels, mlens, texts, tlens, durations = \
            mels.to(self.device), mlens.to(self.device), texts.to(self.device), \
            tlens.to(self.device), durations.to(self.device)
        mels = self.normalizer(mels)

        melspecs, prd_durans = self.model((texts, tlens, durations, 1.0))
        outputs_and_targets = (melspecs, mels, mlens, tlens, durations, prd_durans)

        loss, l1_loss, ssim_loss, drn_loss = self.compute_metrics(outputs_and_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
        self.optimizer.step()
        self.step += 1

        loss, l1_loss, ssim_loss, drn_loss = \
            loss.item(), l1_loss.item(), ssim_loss.item(), drn_loss.item()
        running_loss += loss
        running_l1_loss += l1_loss
        running_ssim_loss += ssim_loss
        running_drn_loss += drn_loss

        # Update the progress bar
        pbar.set_postfix({
            'l1': "%.05f" % (running_l1_loss / it),
            'ssim': "%.05f" % (running_ssim_loss / it),
            'drn': "%.05f" % (running_drn_loss / it)
        })

        mels, melspecs = mels.cpu().detach(), melspecs.cpu().detach()
        index = -1
        mlen, tlen = mlens[index].item(), tlens[index].item()
        mels_fig = plot_spectrogram(melspecs[index, :mlen, :],
                                    target_spectrogram=mels[index, :mlen, :])
        self.loggers.log_step(
            'train', self.step,
            {'step_l1_loss': l1_loss, 'step_ssim_loss': ssim_loss, 'step_drn_loss': drn_loss},
            {'melspecs': mels_fig})

    epoch_loss = running_loss / it
    epoch_l1_loss = running_l1_loss / it
    epoch_ssim_loss = running_ssim_loss / it
    epoch_drn_loss = running_drn_loss / it
    return epoch_loss, epoch_l1_loss, epoch_ssim_loss, epoch_drn_loss
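# self.normalizer above is not shown. A minimal sketch, assuming it
# standardizes mel frames with precomputed per-bin statistics:
import torch

class MelNormalizer:
    def __init__(self, mean, std):
        # mean/std: tensors of shape (mel_bins,), computed over the training set.
        self.mean, self.std = mean, std

    def __call__(self, mels):
        # mels: (batch, frames, mel_bins)
        return (mels - self.mean) / self.std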
def _train_epoch(self, dataloader=None):
    self.model.train()
    ll = len(dataloader)
    running_loss = 0.0
    running_l1_loss = 0.0
    running_ssim_loss = 0.0
    running_att_loss = 0.0

    pbar = tqdm(dataloader, unit="audios", unit_scale=dataloader.batch_size,
                disable=self.hparams.trainer.disable_progress_bar)
    for it, batch in enumerate(pbar, start=1):
        self.optimizer.zero_grad()

        mels, mlens, texts, tlens = \
            batch['mels'], batch['mlens'].squeeze(1), batch['texts'].long(), batch['tlens'].squeeze(1)
        mels, mlens, texts, tlens = \
            mels.to(self.device), mlens.to(self.device), texts.to(self.device), tlens.to(self.device)
        s = mels = self.normalizer(mels)

        # Spectrogram augmentation (see the frame_dropout sketch after this function)
        if self.hparams.duration.enable_augment:
            s = add_random_noise(mels, self.hparams.duration.noise)
            s = degrade_some(self.model, s, texts, tlens,
                             self.hparams.duration.feed_ratio,
                             repeat=self.hparams.duration.feed_repeat)
            s = frame_dropout(s, self.hparams.duration.replace_ratio)

        melspecs, attns = self.model((texts, tlens, s, True))
        outputs_and_targets = (melspecs, mels, attns, mlens, tlens)

        loss, l1_loss, ssim_loss, att_loss = self.compute_metrics(outputs_and_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
        self.optimizer.step()
        self.step += 1

        loss, l1_loss, ssim_loss, att_loss = \
            loss.item(), l1_loss.item(), ssim_loss.item(), att_loss.item()
        running_loss += loss
        running_l1_loss += l1_loss
        running_ssim_loss += ssim_loss
        running_att_loss += att_loss

        # Update the progress bar
        pbar.set_postfix({
            'l1': "%.05f" % (running_l1_loss / it),
            'ssim': "%.05f" % (running_ssim_loss / it),
            'att': "%.05f" % (running_att_loss / it)
        })

        mels, melspecs, attns = mels.cpu().detach(), melspecs.cpu().detach(), attns.cpu().detach()
        index = -1
        mlen, tlen = mlens[index].item(), tlens[index].item()
        mels_fig = plot_spectrogram(melspecs[index, :mlen, :],
                                    target_spectrogram=mels[index, :mlen, :])
        attn_fig = plot_alignment(attns[index, :mlen, :tlen])
        self.loggers.log_step(
            'train', self.step,
            {'step_l1_loss': l1_loss, 'step_ssim_loss': ssim_loss, 'step_att_loss': att_loss},
            {'melspecs': mels_fig, 'attention': attn_fig})

    epoch_loss = running_loss / ll
    epoch_l1_loss = running_l1_loss / ll
    epoch_ssim_loss = running_ssim_loss / ll
    epoch_att_loss = running_att_loss / ll
    return epoch_loss, epoch_l1_loss, epoch_ssim_loss, epoch_att_loss
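# The augmentation helpers (add_random_noise, degrade_some, frame_dropout)
# are not shown. A sketch of frame_dropout under the assumption that it zeroes
# out a random fraction of mel frames:
import torch

def frame_dropout_sketch(mels, ratio):
    # mels: (batch, frames, mel_bins); drop each frame with probability `ratio`.
    keep = (torch.rand(mels.size(0), mels.size(1), 1, device=mels.device) >= ratio)
    return mels * keep.to(mels.dtype)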
def eval_step(sess, logdir, step, waveform, upsampled_local_condition_data,
              speaker_id_data, mel_input_data, samples, speaker_id,
              upsampled_local_condition, next_sample, temperature=1.0):
    waveform = waveform[:, :1]
    sample_size = upsampled_local_condition_data.shape[1]
    last_sample_timestamp = datetime.now()
    start_time = time.time()
    # Loop sample_size times to generate the desired length.
    for step2 in range(sample_size):
        # Feed only the last generated sample. window: shape (N, 1).
        window = waveform[:, -1:]
        prediction = sess.run(
            next_sample,
            feed_dict={
                samples: window,
                upsampled_local_condition: upsampled_local_condition_data[:, step2, :],
                speaker_id: speaker_id_data
            })

        if hparams.scalar_input:
            # Sampled from a logistic distribution, so there is randomness.
            sample = prediction
        else:
            # Scale the prediction distribution using the temperature. When
            # temperature == 1, this just renormalizes an already-softmaxed
            # distribution, so the values are unchanged. Otherwise it divides the
            # log-probabilities by the temperature and rescales them to sum to 1.
            np.seterr(divide='ignore')
            scaled_prediction = np.log(prediction) / temperature
            scaled_prediction = (scaled_prediction -
                                 np.logaddexp.reduce(scaled_prediction, axis=-1, keepdims=True))
            scaled_prediction = np.exp(scaled_prediction)
            np.seterr(divide='warn')

            # Prediction distribution at temperature=1.0 should be unchanged
            # after scaling.
            if temperature == 1.0:
                np.testing.assert_allclose(
                    prediction, scaled_prediction, atol=1e-5,
                    err_msg='Prediction scaling at temperature=1.0 is not working as intended.')

            # Because we sample rather than take the argmax, the same input can
            # yield different outputs.
            sample = [[np.random.choice(np.arange(hparams.quantization_channels), p=p)]
                      for p in scaled_prediction]  # choose one sample per batch

        waveform = np.concatenate([waveform, sample], axis=-1)  # window.shape: (N, 1)

        # Show progress only once per second.
        current_sample_timestamp = datetime.now()
        time_since_print = current_sample_timestamp - last_sample_timestamp
        if time_since_print.total_seconds() > 1.:
            duration = time.time() - start_time
            print('Sample {:<3d}/{:<3d}, ({:.3f} sec/step)'.format(step2 + 1, sample_size, duration),
                  end='\r')
            last_sample_timestamp = current_sample_timestamp

    print('\n')

    # Save the result as a wav file.
    if hparams.input_type == 'raw':
        out = waveform[:, 1:]
    elif hparams.input_type == 'mulaw':
        decode = mu_law_decode(samples, hparams.quantization_channels, quantization=False)
        out = sess.run(decode, feed_dict={samples: waveform[:, 1:]})
    else:  # 'mulaw-quantize'
        decode = mu_law_decode(samples, hparams.quantization_channels, quantization=True)
        out = sess.run(decode, feed_dict={samples: waveform[:, 1:]})

    # Save wav
    for i in range(1):
        wav_out_path = logdir + '/test-{}-{}.wav'.format(step, i)
        mel_path = wav_out_path.replace(".wav", ".png")
        gen_mel_spectrogram = audio.melspectrogram(out[i], hparams).astype(np.float32).T
        # Note: save_wav mutates out[i] in place.
        audio.save_wav(out[i], wav_out_path, hparams.sample_rate)
        plot.plot_spectrogram(gen_mel_spectrogram, mel_path,
                              title='generated mel spectrogram{}'.format(step),
                              target_spectrogram=mel_input_data[i])
def run_eval(args, eval_dir, eval_model, eval_plot_dir, eval_wav_dir, feeder,
             hparams, sess, step, summary_writer):
    # Run eval and save eval stats
    log('\nRunning evaluation at step {}'.format(step))
    sum_eval_loss = 0.0
    sum_mel_loss = 0.0
    sum_stop_token_loss = 0.0
    sum_linear_loss = 0.0
    count = 0.0
    mel_p = None
    mel_t = None
    t_len = None
    attention_mask_sample = None
    lin_p = None
    lin_t = None

    for _ in tqdm(range(feeder.test_steps)):
        test_eloss, test_mel_loss, test_stop_token_loss, test_linear_loss, \
            mel_p, mel_t, t_len, attention_mask_sample, lin_p, lin_t = sess.run([
                eval_model.loss,
                eval_model.mel_loss,
                eval_model.stop_token_loss,
                eval_model.linear_loss,
                eval_model.post_net_predictions[0],
                eval_model.targets_mel[0],
                eval_model.targets_length[0],
                eval_model.alignments[0],
                eval_model.mag_pred[0],
                eval_model.targets_mag[0],
            ])
        sum_eval_loss += test_eloss
        sum_mel_loss += test_mel_loss
        sum_stop_token_loss += test_stop_token_loss
        sum_linear_loss += test_linear_loss
        count += 1.0

    wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
    audio.save_wav(wav, os.path.join(eval_wav_dir, '{}-eval-linear.wav'.format(step)),
                   sr=hparams.sample_rate)

    if count > 0.0:
        eval_loss = sum_eval_loss / count
        mel_loss = sum_mel_loss / count
        stop_token_loss = sum_stop_token_loss / count
        linear_loss = sum_linear_loss / count
    else:
        eval_loss = sum_eval_loss
        mel_loss = sum_mel_loss
        stop_token_loss = sum_stop_token_loss
        linear_loss = sum_linear_loss

    log('Saving eval log to {}..'.format(eval_dir))
    # Save some logs to monitor the model's improvement on the same unseen sequence
    wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
    audio.save_wav(wav, os.path.join(eval_wav_dir, '{}-eval-mel.wav'.format(step)),
                   sr=hparams.sample_rate)

    alignments, alignment_titles = get_alignments(attention_mask_sample)
    for i in range(len(alignments)):
        plot.plot_alignment(
            alignments[i],
            os.path.join(eval_plot_dir, '{}_{}-eval-align.png'.format(step, alignment_titles[i])),
            title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss),
            max_len=t_len // hparams.reduction_factor)

    plot.plot_spectrogram(
        mel_p,
        os.path.join(eval_plot_dir, '{}-eval-mel-spectrogram.png'.format(step)),
        title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss),
        target_spectrogram=mel_t,
        max_len=t_len)
    plot.plot_spectrogram(
        lin_p,
        os.path.join(eval_plot_dir, '{}-eval-linear-spectrogram.png'.format(step)),
        title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss),
        target_spectrogram=lin_t,
        max_len=t_len,
        auto_aspect=True)

    log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss))
    log('Writing eval summary!')
    add_eval_stats(summary_writer, step, linear_loss, mel_loss, stop_token_loss, eval_loss)
import librosa
import torch

from datasets.audio.stft import TacotronSTFT
from utils.plot import plot_spectrogram

fullpath = '../audios/LJ001-0007.wav'
filter_length = 1024
hop_length = 256
win_length = 1024
n_mel_channels = 80
sampling_rate = 22050
mel_fmin = 0.0      # 80.0
mel_fmax = 8000.0   # 7600.0

stft = TacotronSTFT(filter_length=filter_length,
                    hop_length=hop_length,
                    win_length=win_length,
                    n_mel_channels=n_mel_channels,
                    sampling_rate=sampling_rate,
                    mel_fmin=mel_fmin,
                    mel_fmax=mel_fmax)

wav, sr = librosa.load(fullpath, sr=None)
assert sr == sampling_rate
wav = torch.from_numpy(wav).unsqueeze(0)
mel = stft.mel_spectrogram(wav).squeeze(0).t()
print(mel.size())
plot_spectrogram(pred_spectrogram=mel, save_img=True, path='test.png')
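# An added sanity check (not in the original script): with hop_length=256 at
# 22050 Hz, the mel should have roughly sampling_rate / hop_length ≈ 86 frames
# per second of audio.
print('samples={}, frames={}, ~{:.1f} frames/sec'.format(
    wav.size(1), mel.size(0), mel.size(0) / (wav.size(1) / sampling_rate)))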
def synthesize(self, texts, basenames, log_dir, mel_filenames):
    hparams = self._hparams

    # Repeat the last sample until the number of samples is divisible by the
    # number of GPUs (last-run scenario)
    while len(texts) % hparams.synthesis_batch_size != 0:
        texts.append(texts[-1])
        basenames.append(basenames[-1])
        if mel_filenames is not None:
            mel_filenames.append(mel_filenames[-1])

    sequences = [np.asarray(text_to_sequence(text)) for text in texts]
    input_lengths = [len(seq) for seq in sequences]
    seqs, max_seq_len = self._prepare_inputs(sequences)
    feed_dict = {
        self.inputs: seqs,
        self.input_lengths: np.asarray(input_lengths, dtype=np.int32)
    }

    linears, mels, alignments, audio_length = self.session.run(
        [self.linear_outputs, self.mel_outputs, self.alignments[0], self.audio_length],
        feed_dict=feed_dict)

    # Natural batch synthesis:
    # get mel/linear lengths for the entire batch from the stop-token predictions
    target_lengths = audio_length

    if basenames is None:
        # Generate a wav and play it
        wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
        audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way
        if platform.system() == 'Linux':
            # Linux wav reader
            os.system('aplay temp.wav')
        elif platform.system() == 'Windows':
            # Windows wav reader
            os.system('start /min mplay32 /play /close temp.wav')
        else:
            raise RuntimeError(
                'Your OS type is not supported yet, please add it to '
                '"centaur/synthesizer.py, line-165" and feel free to make a Pull Request ;) Thanks!')
        return

    for i, mel in enumerate(mels):
        if log_dir is not None:
            # Save wav (mel -> wav)
            wav = audio.inv_mel_spectrogram(mel.T, hparams)
            audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])),
                           sr=hparams.sample_rate)

            # Save alignments
            alignments_samples, alignment_titles = self.get_alignments(alignments)
            for idx in range(len(alignments_samples)):
                plot.plot_alignment(
                    alignments_samples[idx],
                    os.path.join(log_dir, 'plots/{}.png'.format(alignment_titles[idx])),
                    title='{}'.format(texts[i]),
                    split_title=True,
                    max_len=target_lengths[i])

            # Save the mel-spectrogram plot
            plot.plot_spectrogram(
                mel,
                os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
                title='{}'.format(texts[i]),
                split_title=True)

            # Save wav (linear -> wav)
            wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
            audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])),
                           sr=hparams.sample_rate)

            # Save the linear-spectrogram plot
            plot.plot_spectrogram(
                linears[i],
                os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
                title='{}'.format(texts[i]),
                split_title=True,
                auto_aspect=True)
def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
    hparams = self._hparams
    # [-max, max] or [0, max]
    t2_output_range = ((-hparams.max_abs_value, hparams.max_abs_value)
                       if hparams.symmetric_mels else (0, hparams.max_abs_value))

    # Repeat the last sample until the number of samples is divisible by the
    # number of GPUs (last-run scenario)
    while len(texts) % hparams.synthesis_batch_size != 0:
        texts.append(texts[-1])
        basenames.append(basenames[-1])
        if mel_filenames is not None:
            mel_filenames.append(mel_filenames[-1])

    seqs = [np.asarray(text_to_sequence(text)) for text in texts]
    input_lengths = [len(seq) for seq in seqs]
    input_seqs, max_seq_len = self._prepare_inputs(seqs)
    feed_dict = {
        self.inputs: input_seqs,
        self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
    }

    if self.gta:
        np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
        target_lengths = [len(np_target) for np_target in np_targets]
        target_seqs, max_target_len = self._prepare_targets(np_targets, self._hparams.outputs_per_step)
        feed_dict[self.targets] = target_seqs
        assert len(np_targets) == len(texts)

    linears = None
    if self.gta or not hparams.predict_linear:
        mels, alignments, stop_tokens = self.session.run(
            [self.mel_outputs, self.alignments, self.stop_token_prediction],
            feed_dict=feed_dict)
        # Natural batch synthesis:
        # get mel lengths for the entire batch from the stop-token predictions
        target_lengths = self._get_output_lengths(stop_tokens)
        # Take off the batch-wise padding
        mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
        assert len(mels) == len(texts)
    else:
        linears, mels, alignments, stop_tokens = self.session.run(
            [self.linear_outputs, self.mel_outputs, self.alignments, self.stop_token_prediction],
            feed_dict=feed_dict)
        # Natural batch synthesis:
        # get mel/linear lengths for the entire batch from the stop-token predictions
        target_lengths = self._get_output_lengths(stop_tokens)
        # Take off the batch-wise padding
        mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
        linears = [linear[:target_length, :] for linear, target_length in zip(linears, target_lengths)]
        linears = np.clip(linears, t2_output_range[0], t2_output_range[1])
        assert len(mels) == len(linears) == len(texts)

    mels = np.clip(mels, t2_output_range[0], t2_output_range[1])

    if basenames is None:
        # Generate a wav and play it
        wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
        audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way
        if platform.system() == 'Linux':
            # Linux wav reader
            os.system('aplay temp.wav')
        elif platform.system() == 'Windows':
            # Windows wav reader
            os.system('start /min mplay32 /play /close temp.wav')
        else:
            raise RuntimeError(
                'Your OS type is not supported yet, please add it to '
                '"synthesizer.py, line-165" and feel free to make a Pull Request ;) Thanks!')
        return

    saved_mels_paths = []
    for i, mel in enumerate(mels):
        # Write the spectrogram to disk.
        # Note: output mel-spectrogram files and target ones have the same
        # names, just different folders.
        mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(basenames[i]))
        np.save(mel_filename, mel, allow_pickle=False)
        saved_mels_paths.append(mel_filename)

        if log_dir is not None:
            # Save wav (mel -> wav)
            wav = audio.inv_mel_spectrogram(mel.T, hparams)
            audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])),
                           sr=hparams.sample_rate)

            # Save alignments
            plot.plot_alignment(
                alignments[i],
                os.path.join(log_dir, 'plots/alignment-{}.png'.format(basenames[i])),
                title='{}'.format(texts[i]),
                split_title=True,
                max_len=target_lengths[i])

            # Save the mel-spectrogram plot
            plot.plot_spectrogram(
                mel,
                os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
                title='{}'.format(texts[i]),
                split_title=True)

            # `linears` is an ndarray after np.clip, so test against None
            # instead of relying on (ambiguous) array truthiness.
            if linears is not None:
                # Save wav (linear -> wav)
                wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])),
                               sr=hparams.sample_rate)

                # Save the linear-spectrogram plot
                plot.plot_spectrogram(
                    linears[i],
                    os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
                    title='{}'.format(texts[i]),
                    split_title=True,
                    auto_aspect=True)

    return saved_mels_paths
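# _get_output_lengths() used above is not shown. A minimal sketch, assuming
# the stop-token outputs are per-frame probabilities and a sequence ends at
# the first frame whose rounded prediction hits 1:
import numpy as np

def get_output_lengths_sketch(stop_tokens):
    # stop_tokens: (batch, max_frames) probabilities in [0, 1].
    rounded = np.round(np.asarray(stop_tokens))
    lengths = []
    for row in rounded:
        hits = np.flatnonzero(row)
        # If the model never emitted a stop token, keep the full length.
        lengths.append(int(hits[0]) if hits.size else row.shape[0])
    return lengths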