def main():
    # Target data
    filename = "120_kmeans_obj.pkl"
    kmeans = k.load_pkl(filename)

    spec, label = load_test_data()
    print("spec", spec.shape)
    print("label", label.shape)

    # Rebuild a spectrogram by replacing each frame with its k-means cluster center
    spec_ = np.empty((513, ), np.float32)
    for i in range(len(label)):
        spec_ = np.vstack((spec_, kmeans.cluster_centers_[label[i]]))
    spec_ = np.delete(spec_, 0, 0)  # drop the uninitialized first row

    print("compare data structure ----")
    print("spec: ", spec.shape)
    print("spec_: ", spec_.shape)
    print("spec data:", spec)
    print("spec_ data:", spec_)
    print("min-max spec_ data:", min_max(spec_))

    waveform = audio.inv_spectrogram(spec)
    waveform_ = audio.inv_spectrogram(spec_)
    waveformmm_ = audio.inv_spectrogram(min_max(spec_))
    audio.save_wav(waveform, 'ideal_out.wav')
    audio.save_wav(waveform_, 'ideal_out_.wav')
    audio.save_wav(waveformmm_, 'ideal_outmm_.wav')
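The `min_max` helper called above is not defined in this snippet. A minimal sketch of what it likely does (per-array min-max normalization) is shown below; the output range [0, 1] is an assumption, not the original definition.

import numpy as np

def min_max(x):
    # Hypothetical helper: scale values into [0, 1]; assumed behavior,
    # the original implementation is not shown in this snippet.
    x = np.asarray(x, dtype=np.float32)
    return (x - x.min()) / (x.max() - x.min() + 1e-8)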
def save_states(self, global_epoch, mel_outputs, linear_outputs, ling, mel,
                linear, lengths):
    print("Save intermediate states at epoch {}".format(global_epoch))

    # idx = np.random.randint(0, len(input_lengths))
    idx = min(1, len(lengths) - 1)

    # Predicted mel spectrogram
    if mel_outputs is not None:
        mel_output = mel_outputs[idx].cpu().data.numpy()
        mel_output = prepare_spec_image(audio._denormalize(mel_output))
        self.writer.add_image("Predicted mel spectrogram", mel_output, global_epoch)

    # Predicted spectrogram
    if linear_outputs is not None:
        linear_output = linear_outputs[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
        self.writer.add_image("Predicted spectrogram", spectrogram, global_epoch)

        # Predicted audio signal
        signal = audio.inv_spectrogram(linear_output.T)
        signal /= np.max(np.abs(signal))
        path = join(self.checkpoint_dir,
                    "epoch{:09d}_predicted.wav".format(global_epoch))
        try:
            self.writer.add_audio("Predicted audio signal", signal,
                                  global_epoch, sample_rate=self.fs)
        except Exception as e:
            warn(str(e))
        audio.save_wav(signal, path)

    # Target mel spectrogram
    if mel_outputs is not None:
        # ling = ling[idx].cpu().data.numpy()
        # mel = prepare_spec_image(audio._denormalize(mel))
        # self.writer.add_image("Source mel spectrogram", ling, global_epoch)
        mel = mel[idx].cpu().data.numpy()
        mel = prepare_spec_image(audio._denormalize(mel))
        self.writer.add_image("Target mel spectrogram", mel, global_epoch)

    # Target spectrogram
    if linear_outputs is not None:
        linear = linear[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear))
        self.writer.add_image("Target spectrogram", spectrogram, global_epoch)

        # Target audio signal
        signal = audio.inv_spectrogram(linear.T)
        signal /= np.max(np.abs(signal))
        try:
            self.writer.add_audio("Target audio signal", signal,
                                  global_epoch, sample_rate=self.fs)
        except Exception as e:
            warn(str(e))
def tts(model, text, p=0):
    """Convert text to speech waveform given a deepvoice3 model.
    """
    if use_cuda:
        model = model.cuda()
    model.eval()

    sequence = np.array(_frontend.text_to_sequence(text, p=p))
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    text_positions = torch.arange(1, sequence.size(-1) + 1).unsqueeze(0).long()
    text_positions = Variable(text_positions)
    if use_cuda:
        sequence = sequence.cuda()
        text_positions = text_positions.cuda()

    # Greedy decoding
    mel_outputs, linear_outputs, alignments, done = model(
        sequence, text_positions=text_positions)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()
    mel = mel_outputs[0].cpu().data.numpy()

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram, mel
def tts(model, text, p=0):
    """Convert text to speech waveform given a deepvoice3 model.

    Args:
        text (str) : Input text to be synthesized
        p (float) : Replace words with their pronunciation if p > 0. Default is 0.
    """
    if use_cuda:
        model = model.cuda()
    model.eval()
    model.make_generation_fast_()

    sequence = np.array(_frontend.text_to_sequence(text, p=p))
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    text_positions = torch.arange(1, sequence.size(-1) + 1).unsqueeze(0).long()
    text_positions = Variable(text_positions)
    if use_cuda:
        sequence = sequence.cuda()
        text_positions = text_positions.cuda()

    # Greedy decoding
    mel_outputs, linear_outputs, alignments, done = model(
        sequence, text_positions=text_positions)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()
    mel = mel_outputs[0].cpu().data.numpy()

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram, mel
def tts(model, text, p=0, speaker_id=None, fast=False):
    """Convert text to speech waveform given a deepvoice3 model.

    Args:
        text (str) : Input text to be synthesized
        p (float) : Replace words with their pronunciation if p > 0. Default is 0.
    """
    model = model.to(device)
    model.eval()
    if fast:
        model.make_generation_fast_()

    sequence = np.array(_frontend.text_to_sequence(text, p=p))
    sequence = torch.from_numpy(sequence).unsqueeze(0).long().to(device)
    text_positions = torch.arange(1, sequence.size(-1) + 1).unsqueeze(0).long().to(device)
    speaker_ids = None if speaker_id is None else torch.LongTensor(
        [speaker_id]).to(device)

    # Greedy decoding
    with torch.no_grad():
        mel_outputs, linear_outputs, alignments, done = model(
            sequence, text_positions=text_positions, speaker_ids=speaker_ids)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()
    mel = mel_outputs[0].cpu().data.numpy()
    mel = audio._denormalize(mel)

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram, mel
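A minimal usage sketch for the `tts` variant above, assuming a loaded deepvoice3 `model` and the same project-level `audio` module; the text and output path are hypothetical.

# Hypothetical driver; `model` and `audio` come from the surrounding project.
text = "Hello world."
waveform, alignment, spectrogram, mel = tts(model, text, p=0., fast=True)
audio.save_wav(waveform, "tts_output.wav")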
def save_and_plot_fn(args, log_dir, step, loss, prefix):
    idx, (seq, spec, align) = args

    audio_path = os.path.join(
        log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, idx))
    align_path = os.path.join(
        log_dir, '{}-step-{:09d}-align{:03d}.png'.format(prefix, step, idx))

    waveform = inv_spectrogram(spec.T)
    save_audio(waveform, audio_path)

    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)
    if 'korean_cleaners' in [x.strip() for x in hparams.cleaners.split(',')]:
        log('Training korean : Use jamo')
        plot.plot_alignment(
            align, align_path, info=info_text,
            text=sequence_to_text(seq, skip_eos_and_pad=True, combine_jamo=True),
            isKorean=True)
    else:
        log('Training non-korean : no jamo')
        plot.plot_alignment(
            align, align_path, info=info_text,
            text=sequence_to_text(seq, skip_eos_and_pad=True, combine_jamo=False),
            isKorean=False)
def synthesize(self, text, speaker_id=0):
    """Convert text to speech waveform given a deepvoice3 model.

    Args:
        text (str) : Input text to be synthesized
        speaker_id (int) : Speaker ID for multi-speaker models. Default is 0.
    """
    sequence = np.array(self._frontend.text_to_sequence(text))
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    text_positions = torch.arange(1, sequence.size(-1) + 1).unsqueeze(0).long()
    text_positions = Variable(text_positions)
    speaker_ids = None if speaker_id is None else Variable(torch.LongTensor([speaker_id]))
    if self.use_cuda:
        sequence = sequence.cuda()
        text_positions = text_positions.cuda()
        speaker_ids = None if speaker_ids is None else speaker_ids.cuda()

    # Greedy decoding
    mel_outputs, linear_outputs, alignments, done = self.model(
        sequence, text_positions=text_positions, speaker_ids=speaker_ids)

    linear_output = linear_outputs[0].cpu().data.numpy()

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)
    out = io.BytesIO()
    audio.save_wav(waveform, out)
    return out
def eval_batch(self, batch_x, batch_xl, batch_ym=None, batch_ys=None, batch_yl=None):
    time_start = time()

    logging.debug('batch_x.shape=%s, batch_xl.shape=%s' % (batch_x.shape, batch_xl.shape))

    # if self.write_debug_files:
    #     np.save('eval_x', self.batch_x[0])
    #     logging.debug('eval_x.npy written.')
    #     np.save('eval_xl', self.batch_xl[0])
    #     logging.debug('eval_xl.npy written.')

    logging.debug(u'%fs self.session.run...' % (time() - time_start))

    if batch_ym is None:
        spectrograms = self.sess.run(fetches=self.linear_outputs,
                                     feed_dict={
                                         self.inputs: batch_x,
                                         self.input_lengths: batch_xl,
                                     })
    else:
        step_out, loss_out, opt_out, spectrograms, alignment = self.sess.run(
            [self.global_step, self.loss, self.optimize,
             self.linear_outputs, self.alignments],
            feed_dict={
                self.inputs: batch_x,
                self.input_lengths: batch_xl,
                self.mel_targets: batch_ym,
                self.linear_targets: batch_ys,
                self.target_lengths: batch_yl
            })
        logging.debug(u'generating wav for %s' % self.decode_input(batch_x[0]))

    spectrogram = spectrograms[0]
    logging.debug('spectrogram.shape=%s' % repr(spectrogram.shape))

    # if self.write_debug_files:
    #     np.save('eval_spectrogram', spectrogram)
    #     logging.debug('eval_spectrogram.npy written.')

    logging.debug(u'%fs audio.inv_spectrogram...' % (time() - time_start))
    wav = audio.inv_spectrogram(spectrogram.T, self.hp)
    logging.debug(u'%fs wav.' % (time() - time_start))

    return wav
def tts(model, text, p=0, speaker_id=None, fast=False, wavenet=None):
    """Convert text to speech waveform given a deepvoice3 model.

    Args:
        text (str) : Input text to be synthesized
        p (float) : Replace words with their pronunciation if p > 0. Default is 0.
    """
    model = model.to(device)
    model.eval()
    if fast:
        model.make_generation_fast_()

    sequence = np.array(_frontend.text_to_sequence(text, p=p))
    print('sequence to synthesize: ', sequence)
    sequence = torch.from_numpy(sequence).unsqueeze(0).long().to(device)
    text_positions = torch.arange(1, sequence.size(-1) + 1).unsqueeze(0).long().to(device)
    speaker_ids = None if speaker_id is None else torch.LongTensor([speaker_id]).to(device)

    # Greedy decoding
    with torch.no_grad():
        mel_outputs, linear_outputs, alignments, done = model(
            sequence, text_positions=text_positions, speaker_ids=speaker_ids)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()
    mel = mel_outputs[0].cpu().data.numpy()
    mel = audio._denormalize(mel)

    # Predicted audio signal
    if wavenet is not None:
        wavenet = wavenet.to(device)
        wavenet.eval()
        if fast:
            wavenet.make_generation_fast_()

        # TODO: assuming scalar input
        initial_value = 0.0
        initial_input = torch.zeros(1, 1, 1).fill_(initial_value).to(device)

        # (B, T, C) -> (B, C, T)
        c = mel_outputs.transpose(1, 2).contiguous()
        g = None
        Tc = c.size(-1)
        length = Tc * 256  # assumes a hop size of 256 samples per mel frame

        initial_input = initial_input.to(device)
        c = c.to(device)
        waveform = wavenet.incremental_forward(
            initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True,
            quantize=True, log_scale_min=float(np.log(1e-14)))
        waveform = waveform.view(-1).cpu().data.numpy()
    else:
        waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram, mel
def say(self, txt, trim_silence=True, dyn_range_compress=True):
    time_start = time()

    logging.debug(u'%fs synthesizing %s' % (time() - time_start, txt))

    input_data = np.zeros((1, self.hp['max_inp_len']), dtype='int32')
    input_lengths = np.zeros((1, ), dtype='int32')
    logging.debug('input_data.shape=%s, input_lengths.shape=%s' %
                  (input_data.shape, input_lengths.shape))

    self._encode_input(txt, 0, input_data, input_lengths)
    logging.debug('input_data=%s input_lengths=%s' % (input_data[0], input_lengths[0]))

    if self.write_debug_files:
        np.save('say_x', input_data[0])
        logging.debug('say_x.npy written.')
        np.save('say_xl', input_lengths[0])
        logging.debug('say_xl.npy written.')

    logging.debug(u'%fs self.session.run...' % (time() - time_start))
    spectrograms = self.sess.run(fetches=self.linear_outputs,
                                 feed_dict={
                                     self.inputs: input_data,
                                     self.input_lengths: input_lengths,
                                 })

    spectrogram = spectrograms[0]
    logging.debug('spectrogram.shape=%s' % repr(spectrogram.shape))

    if self.write_debug_files:
        np.save('say_spectrogram', spectrogram)
        logging.debug('say_spectrogram.npy written.')

    # np.set_printoptions(threshold=np.inf)

    logging.debug(u'%fs audio.inv_spectrogram...' % (time() - time_start))
    wav = audio.inv_spectrogram(spectrogram.T, self.hp, use_fgla=True)

    if dyn_range_compress:
        logging.debug(u'%fs dynamic range compression...' % (time() - time_start))
        wav = audio.dyn_range_compress(wav, self.hp)

    if trim_silence:
        logging.debug(u'%fs trim silence...' % (time() - time_start))
        wav = audio.trim_silence(wav, self.hp)

    logging.debug(u'%fs wav.' % (time() - time_start))
    return wav
def save_states(global_step, mel_outputs, linear_outputs, attn, y,
                input_lengths, checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))

    # idx = np.random.randint(0, len(input_lengths))
    idx = min(1, len(input_lengths) - 1)
    input_length = input_lengths[idx]

    # Alignment
    # Multi-hop attention
    if attn.dim() == 4:
        for i, alignment in enumerate(attn):
            alignment_dir = join(checkpoint_dir, "alignment_layer{}".format(i + 1))
            os.makedirs(alignment_dir, exist_ok=True)
            path = join(alignment_dir,
                        "step{:09d}_layer_{}_alignment.png".format(global_step, i + 1))
            alignment = alignment[idx].cpu().data.numpy()
            save_alignment(path, alignment)

        # Save averaged alignment
        alignment_dir = join(checkpoint_dir, "alignment_ave")
        os.makedirs(alignment_dir, exist_ok=True)
        path = join(alignment_dir, "step{:09d}_alignment.png".format(global_step))
        alignment = attn.mean(0)[idx].cpu().data.numpy()
        save_alignment(path, alignment)
    else:
        assert False

    # Predicted spectrogram
    path = join(checkpoint_dir,
                "step{:09d}_predicted_spectrogram.png".format(global_step))
    linear_output = linear_outputs[idx].cpu().data.numpy()
    save_spectrogram(path, linear_output)

    # Predicted audio signal
    signal = audio.inv_spectrogram(linear_output.T)
    path = join(checkpoint_dir, "step{:09d}_predicted.wav".format(global_step))
    audio.save_wav(signal, path)

    # Target spectrogram
    path = join(checkpoint_dir,
                "step{:09d}_target_spectrogram.png".format(global_step))
    linear_output = y[idx].cpu().data.numpy()
    save_spectrogram(path, linear_output)
def tts(model, text, p=0., speaker_id=None):
    """Convert text to speech waveform given a deepvoice3 model.

    Args:
        model (DeepVoiceTTS): Model used to synthesize waveform.
        text (str): Input text to be synthesized.
        p (float): Replace words with their pronunciation if p > 0. Default is 0.

    Returns:
        waveform (numpy.ndarray): Shape(T_wav, ), predicted waveform, where
            T_wav means the length of the synthesized waveform.
        alignment (numpy.ndarray): Shape(T_dec, T_enc), predicted alignment
            matrix, where T_dec means the time steps of decoder outputs and
            T_enc means the time steps of encoder outputs.
        spectrogram (numpy.ndarray): Shape(T_lin, C_lin), predicted linear
            spectrogram, where T_lin means the time steps of the linear
            spectrogram and C_lin means its channels.
        mel (numpy.ndarray): Shape(T_mel, C_mel), predicted mel spectrogram,
            where T_mel means the time steps of the mel spectrogram and C_mel
            means its channels.
    """
    model.eval()

    sequence = np.array(_frontend.text_to_sequence(text, p=p)).astype("int64")
    sequence = np.reshape(sequence, (1, -1))
    text_positions = np.arange(1, sequence.shape[1] + 1, dtype="int64")
    text_positions = np.reshape(text_positions, (1, -1))

    sequence = dg.to_variable(sequence)
    text_positions = dg.to_variable(text_positions)
    speaker_ids = None if speaker_id is None else fluid.layers.fill_constant(
        shape=[1, 1], value=speaker_id)

    # sequence: shape(1, input_length, 1)
    # text_positions: shape(1, input_length, 1)
    # Greedy decoding
    mel_outputs, linear_outputs, alignments, done = model.transduce(
        sequence, text_positions, speaker_ids)

    # reshape to the desired shape
    linear_output = linear_outputs.numpy().squeeze().T
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments.numpy()[0]
    mel = mel_outputs.numpy().squeeze().T
    mel = audio._denormalize(mel)

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram, mel
def save_and_plot_fn(args, log_dir, step, loss, prefix):
    idx, (seq, spec, align) = args

    audio_path = os.path.join(
        log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, idx))
    align_path = os.path.join(
        log_dir, '{}-step-{:09d}-align{:03d}.png'.format(prefix, step, idx))

    waveform = inv_spectrogram(spec.T)
    save_audio(waveform, audio_path)

    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)
    plot.plot_alignment(
        align, align_path, info=info_text,
        text=sequence_to_text(seq, skip_eos_and_pad=True, combine_jamo=True))
def copy_synthesis(wav_file, out_path):
    """Perform copy synthesis on the wav file and write the synthesized
    wav to disk at out_path.
    """
    filename = os.path.splitext(os.path.basename(wav_file))[0]

    y = audio.load_wav(wav_file)
    if cfg.rescaling:
        y = y / np.abs(y).max() * cfg.rescaling_max

    mag = audio.spectrogram(y)
    y_hat = audio.inv_spectrogram(mag)

    out_path = os.path.join(out_path, filename + "_synthesized.wav")
    print(f"Writing {out_path} to disk")
    audio.save_wav(y_hat, out_path)
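Copy synthesis round-trips a recording through spectrogram extraction and Griffin-Lim inversion, which is useful for sanity-checking the audio hyperparameters independently of any model. A hedged usage sketch with hypothetical paths:

# Hypothetical input file and output directory; the directory must exist.
copy_synthesis("samples/LJ001-0001.wav", "out/")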
def save_states(global_step, attn, linear_outputs, input_lengths, logs_dir):
    """Save intermediate states
    """
    print(f"Save intermediate states at step {global_step:09d}")
    idx = min(1, len(input_lengths) - 1)

    # Alignment
    # Multi-hop attention
    if attn is not None and attn.dim() == 4:
        for i, alignment in enumerate(attn):
            # Save alignment to disk
            alignment = alignment[idx].cpu().data.numpy()
            alignment_dir = join(logs_dir, f"alignment_layer{i + 1}")
            os.makedirs(alignment_dir, exist_ok=True)
            path = join(alignment_dir,
                        f"step{global_step:09d}_layer_{i + 1}_alignment.png")
            save_alignment(path, alignment)

        # Save averaged alignment
        alignment_dir = join(logs_dir, "alignment_ave")
        os.makedirs(alignment_dir, exist_ok=True)
        path = join(alignment_dir, f"step{global_step:09d}_layer_alignment.png")
        alignment = attn.mean(0)[idx].cpu().data.numpy()
        save_alignment(path, alignment)

    linear_output = linear_outputs[idx].cpu().data.numpy()

    # Predicted audio signal
    signal = audio.inv_spectrogram(linear_output.T)
    signal /= np.max(np.abs(signal))
    wavs_dir = join(logs_dir, "wavs")
    os.makedirs(wavs_dir, exist_ok=True)
    path = join(wavs_dir, f"step{global_step:09d}_predicted.wav")
    audio.save_wav(signal, path)
def generate(model_path, model_name, generate_path, generate_name, piece):
    """Resynthesize a wav file through a trained autoencoder.

    Args:
        model_path: Directory containing the pretrained model checkpoint.
        model_name: Name of the checkpoint file to load.
        generate_path: Output directory for the synthesized wav.
        generate_name: Base name of the output wav file.
        piece: Path of the input wav file to encode and resynthesize.
    """
    # Create directory for the output if needed
    if not os.path.exists(generate_path):
        os.makedirs(generate_path)

    net = AutoEncoder()
    net = load_model(net, model_path, model_name)
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        net = net.cuda()
    net.eval()

    # Load audio for encoding
    piece = audio.load_wav(piece)
    spec = audio.spectrogram(piece).astype(np.float32)

    spec = torch.from_numpy(spec.T)
    spec = torch.FloatTensor(spec)
    spec = torch.unsqueeze(spec, 0)
    spec = Variable(spec, volatile=True).contiguous()
    if cuda_available:
        spec = spec.cuda()

    generated_spec = net(spec)

    generated_spec = generated_spec.data.cpu().numpy()
    generated_spec = np.squeeze(generated_spec)

    waveform = audio.inv_spectrogram(generated_spec.T)
    wav_name = generate_path + generate_name + '.wav'
    audio.save_wav(waveform, wav_name)
def tts(model, text, speaker_id=None, fast=False):
    """Convert text to speech waveform given a deepvoice3 model.
    """
    model = model.to(device)
    model.eval()
    if fast:
        model.make_generation_fast_()

    if cfg.frontend == "en":
        sequence = np.array(english.text_to_sequence(text))
    else:
        raise NotImplementedError

    sequence = torch.from_numpy(sequence).unsqueeze(0).long().to(device)
    text_positions = torch.arange(1, sequence.size(-1) + 1).unsqueeze(0).long().to(device)
    speaker_ids = None if speaker_id is None else torch.LongTensor(
        [speaker_id]).to(device)

    # Greedy decoding
    with torch.no_grad():
        mel_outputs, linear_outputs, alignments, done = model(
            sequence, text_positions=text_positions, speaker_ids=speaker_ids)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()
    mel = mel_outputs[0].cpu().data.numpy()
    mel = audio._denormalize(mel)

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram, mel
MEL_DIR = join(train_dir, 'Acoustic_frame/mel')
LINEAR_DIR = join(train_dir, 'Acoustic_frame/linear')

ling_name = ling + '.npy'
ling = np.load(join(LING_DIR, ling_name))
ling = norm_minmax(ling, np.load(join(train_dir, 'stat_linguistic_frame.npy')))
ling = torch.from_numpy(ling).unsqueeze(0).to(device)

speaker_list = ['ema', 'emb', 'emc', 'emd', 'eme']
emotions = [0, 1, 2, 3]
for ref_spk in speaker_list:
    for emo in emotions:
        spk_emo = '{}00{}27.npy'.format(ref_spk, str(emo))
        mel = np.load(join(MEL_DIR, spk_emo))
        mel = torch.from_numpy(mel).unsqueeze(0).to(device)

        _, _, linear_output = model(ling, mel)

        linear_output = linear_output[0].data.cpu().numpy()
        signal = audio.inv_spectrogram(linear_output.T)
        signal /= np.max(np.abs(signal))
        path = join(result_dir, spk_emo.replace('.npy', '.wav'))
        audio.save_wav(signal, path)

        linear = np.load(join(LINEAR_DIR, spk_emo))
        signal = audio.inv_spectrogram(linear.T)
        signal /= np.max(np.abs(signal))
        path = join(result_dir, spk_emo.replace('.npy', '_refer.wav'))
        audio.save_wav(signal, path)

        print('%s' % spk_emo)
def plot_graph_and_save_audio(args,
                              base_path=None,
                              start_of_sentence=None, end_of_sentence=None,
                              pre_word_num=0, post_word_num=0,
                              pre_surplus_idx=0, post_surplus_idx=1,
                              use_short_concat=False,
                              use_manual_attention=False, save_alignment=False,
                              librosa_trim=False, attention_trim=False,
                              time_str=None, isKorean=True):
    idx, (wav, alignment, path, text, sequence) = args

    if base_path:
        plot_path = "{}/{}.png".format(base_path, get_time())
    elif path:
        plot_path = path.rsplit('.', 1)[0] + ".png"
    else:
        plot_path = None

    # plot_path = add_prefix(plot_path, time_str)
    if use_manual_attention:
        plot_path = add_postfix(plot_path, "manual")

    if plot_path:
        plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean)

    if use_short_concat:
        wav = short_concat(
            wav, alignment, text,
            start_of_sentence, end_of_sentence,
            pre_word_num, post_word_num,
            pre_surplus_idx, post_surplus_idx)

    if attention_trim and end_of_sentence:
        # Trim trailing frames once the attention stops advancing past the
        # last attended input index
        end_idx_counter = 0
        attention_argmax = alignment.argmax(0)
        end_idx = min(len(sequence) - 1, max(attention_argmax))
        max_counter = min((attention_argmax == end_idx).sum(), 5)

        for jdx, attend_idx in enumerate(attention_argmax):
            if len(attention_argmax) > jdx + 1:
                if attend_idx == end_idx:
                    end_idx_counter += 1

                if attend_idx == end_idx and attention_argmax[jdx + 1] > end_idx:
                    break

                if end_idx_counter >= max_counter:
                    break
            else:
                break

        spec_end_idx = hparams.reduction_factor * jdx + 3
        wav = wav[:spec_end_idx]

    audio_out = inv_spectrogram(wav.T)

    if librosa_trim and end_of_sentence:
        yt, index = librosa.effects.trim(
            audio_out, frame_length=5120, hop_length=256, top_db=50)
        audio_out = audio_out[:index[-1]]

    if save_alignment:
        alignment_path = "{}/{}.npy".format(base_path, idx)
        np.save(alignment_path, alignment, allow_pickle=False)

    if path or base_path:
        if path:
            current_path = add_postfix(path, idx)
        elif base_path:
            current_path = plot_path.replace(".png", ".wav")

        save_audio(audio_out, current_path)
        return True
    else:
        io_out = io.BytesIO()
        save_audio(audio_out, io_out)
        result = io_out.getvalue()
        return result
def save_waveform_from_spec(spectrogram, filename):
    waveform = audio.inv_spectrogram(spectrogram)
    audio.save_wav(waveform, filename)
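Unlike most snippets in this section, this helper does not transpose its input, so the caller presumably passes the spectrogram already in the layout `audio.inv_spectrogram` expects. A hedged usage sketch with hypothetical file names:

import numpy as np

spec = np.load("predicted_linear.npy")  # hypothetical dump, assumed shape (T, num_freq)
save_waveform_from_spec(spec.T, "predicted.wav")  # transpose to (num_freq, T) first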
processed_to_raw_map = {
    idx: speaker for idx, speaker in enumerate(vctk.available_speakers)
}

for i in range(5):
    # random.randint is inclusive on both ends, so subtract 1 to stay in range
    random_voice_idx = random.randint(0, len(Mel) - 1)
    random_speaker_id = X[random_voice_idx][1]
    lin = Y[random_voice_idx]
    mel = Mel[random_voice_idx]

    speaker_dir = join(current_model_dir, 'speaker{}'.format(random_speaker_id))
    os.makedirs(speaker_dir, exist_ok=True)

    audio.save_wav(audio.inv_spectrogram(lin.T),
                   join(speaker_dir, 'sample_voice.wav'))

    with open(text_list_file_path, "rb") as f:
        lines = f.readlines()
        for idx, line in enumerate(lines):
            text = line.decode("utf-8")[:-1]
            waveform, alignment, _, _ = tts(
                model, mel, text, p=replace_pronunciation_prob, fast=False)
            dst_wav_path = join(speaker_dir, "text{}.wav".format(idx))
            audio.save_wav(waveform, dst_wav_path)
File "/Users/huangshengjie/Desktop/测试lmdb/第三方mel-griffin.py", line 203, in <module> wav = spectrogram2wav(mag) File "/Users/huangshengjie/Desktop/测试lmdb/第三方mel-griffin.py", line 112, in spectrogram2wav wav = griffin_lim(mag) File "/Users/huangshengjie/Desktop/测试lmdb/第三方mel-griffin.py", line 134, in griffin_lim X_t = invert_spectrogram(X_best) File "/Users/huangshengjie/Desktop/测试lmdb/第三方mel-griffin.py", line 155, in invert_spectrogram return librosa.istft(spectrogram, hop_length, win_length=win_length) File "/Users/huangshengjie/opt/anaconda3/envs/py36/lib/python3.6/site-packages/librosa/core/spectrum.py", line 288, in istft ifft_window = util.pad_center(ifft_window, n_fft) File "/Users/huangshengjie/opt/anaconda3/envs/py36/lib/python3.6/site-packages/librosa/util/utils.py", line 304, in pad_center 'at least input size ({:d})').format(size, n)) librosa.util.exceptions.ParameterError: Target size (894) must be at least input size (1000) ''' # wav = spectrogram2wav(mag) # 两个都是空白 wav = audio.inv_spectrogram(mag) audio.save_wav(wav, './hello2.wav') ''' 第三方数据: (318帧, 512维) mel.shape = (318, 512) mel.T.shape = (512, 318) # 当 n_mels = 80: (318, 80) mel.shape = (318, 80) mel.T.shape = (80, 318) ''' '''
def save_states(global_step, mel_outputs, linear_outputs, attn, mel, y,
                input_lengths, checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))

    # idx = np.random.randint(0, len(input_lengths))
    idx = min(1, len(input_lengths) - 1)
    input_length = input_lengths[idx]

    # Alignment
    # Multi-hop attention
    if attn is not None and attn.dim() == 4:
        for i, alignment in enumerate(attn):
            alignment = alignment[idx].cpu().data.numpy()
            tag = "alignment_layer{}".format(i + 1)

            # save files as well for now
            alignment_dir = join(checkpoint_dir, "alignment_layer{}".format(i + 1))
            os.makedirs(alignment_dir, exist_ok=True)
            path = join(alignment_dir,
                        "step{:09d}_layer_{}_alignment.png".format(global_step, i + 1))
            save_alignment(path, alignment)

        # Save averaged alignment
        alignment_dir = join(checkpoint_dir, "alignment_ave")
        os.makedirs(alignment_dir, exist_ok=True)
        path = join(alignment_dir, "step{:09d}_alignment.png".format(global_step))
        alignment = attn.mean(0)[idx].cpu().data.numpy()
        save_alignment(path, alignment)
        tag = "averaged_alignment"

    # Predicted mel spectrogram
    if mel_outputs is not None:
        mel_output = mel_outputs[idx].cpu().data.numpy()
        mel_output = prepare_spec_image(audio._denormalize(mel_output))

    # Predicted spectrogram
    if linear_outputs is not None:
        linear_output = linear_outputs[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))

        # Predicted audio signal
        signal = audio.inv_spectrogram(linear_output.T)
        signal /= np.max(np.abs(signal))
        path = join(checkpoint_dir, "step{:09d}_predicted.wav".format(global_step))
        audio.save_wav(signal, path)

    # Target mel spectrogram
    if mel_outputs is not None:
        mel_output = mel[idx].cpu().data.numpy()
        mel_output = prepare_spec_image(audio._denormalize(mel_output))

    # Target spectrogram
    if linear_outputs is not None:
        linear_output = y[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
def train(self, num_epochs=DEFAULT_NUM_EPOCHS):
    logging.info('counting steps...')
    num_steps = 0
    while True:
        if os.path.exists(DSFN_X % (self.voice, num_steps)):
            num_steps += 1
        else:
            break
    logging.info('counting steps... %d steps found.' % num_steps)

    if DEBUG_LIMIT:
        logging.warn('limiting number of steps to %d for debugging' % DEBUG_LIMIT)
        num_steps = DEBUG_LIMIT

    batch_size = self.hp['batch_size']
    max_inp_len = self.hp['max_inp_len']
    max_num_frames = self.hp['max_iters'] * self.hp['outputs_per_step'] \
        * self.hp['frame_shift_ms'] * self.hp['sample_rate'] / 1000

    n_fft, hop_length, win_length = audio.stft_parameters(self.hp)
    max_mfc_frames = 1 + int((max_num_frames - n_fft) / hop_length)

    batch_x = np.zeros((batch_size, max_inp_len), dtype='int32')
    batch_xl = np.zeros((batch_size, ), dtype='int32')
    batch_ys = np.zeros((batch_size, max_mfc_frames, self.hp['num_freq']), dtype='float32')
    batch_ym = np.zeros((batch_size, max_mfc_frames, self.hp['num_mels']), dtype='float32')
    batch_yl = np.zeros((batch_size, ), dtype='int32')

    # random.shuffle needs a mutable sequence, so materialize the range
    sample_idxs = list(range(0, num_steps))

    for epoch_idx in range(num_epochs):
        epoch = self.epoch_start + epoch_idx
        random.shuffle(sample_idxs)

        epoch_loss = 0
        num_batches = 0

        for i, sample_idx in enumerate(sample_idxs):
            x = np.load(DSFN_X % (self.voice, sample_idx))
            xl = np.load(DSFN_XL % (self.voice, sample_idx))
            ys = np.load(DSFN_YS % (self.voice, sample_idx))
            ym = np.load(DSFN_YM % (self.voice, sample_idx))
            yl = np.load(DSFN_YL % (self.voice, sample_idx))

            batch_x[i % batch_size] = x[0]
            batch_xl[i % batch_size] = xl[0]
            batch_ys[i % batch_size] = ys[0]
            batch_ym[i % batch_size] = ym[0]
            batch_yl[i % batch_size] = yl[0]

            if (i % batch_size) == (batch_size - 1):
                num_batches += 1
                ts = self.decode_input(x[0])
                logging.debug(u'ts %d %s' % (sample_idx, ts))

                step_out, loss_out, opt_out, spectrogram, alignment = self.sess.run(
                    [self.global_step, self.loss, self.optimize,
                     self.linear_outputs, self.alignments],
                    feed_dict={
                        self.inputs: batch_x,
                        self.input_lengths: batch_xl,
                        self.mel_targets: batch_ym,
                        self.linear_targets: batch_ys,
                        self.target_lengths: batch_yl
                    })

                epoch_loss += loss_out
                logging.info('epoch: %5d, step %4d/%4d loss: %7.5f, avg loss: %7.5f' %
                             (epoch, i + 1, num_steps, loss_out,
                              epoch_loss / num_batches))

        cpfn = CHECKPOINT_FN % (self.voice, epoch)
        logging.info('Saving checkpoint to: %s' % cpfn)
        self.saver.save(self.sess, cpfn, global_step=step_out)

        logging.info('Saving audio and alignment...')

        # import pdb; pdb.set_trace()
        # input_seq, spectrogram, alignment = sess.run(
        #     [inputs, input_lengths, linear_outputs, alignments],
        #     feed_dict={inputs: eval_x,
        #                input_lengths: eval_xl,
        #                mel_targets: eval_ym,
        #                linear_targets: eval_ys})

        waveform = audio.inv_spectrogram(spectrogram[0].T, self.hp)

        wavfn = WAV_FN % (self.voice, epoch)
        audio.save_wav(waveform, wavfn, self.hp)
        logging.info('%s written.' % wavfn)

        specfn = SPEC_FN % (self.voice, epoch)
        cmd = 'sox %s -n spectrogram -o %s' % (wavfn, specfn)
        logging.info(cmd)
        os.system(cmd)

        # import pdb; pdb.set_trace()

        plotfn = ALIGN_FN % (self.voice, epoch)
        self._plot_alignment(alignment[0], plotfn,
                             info='epoch=%d, loss=%.5f' % (epoch, loss_out))
        logging.info('alignment %s plotted to %s' % (alignment[0].shape, plotfn))

        # save batch as well so we can debug training later if needed
        np.save(BATCH_X_FN % (self.voice, epoch), batch_x)
        logging.info('%s written.' % (BATCH_X_FN % (self.voice, epoch)))
        np.save(BATCH_XL_FN % (self.voice, epoch), batch_xl)
        logging.info('%s written.' % (BATCH_XL_FN % (self.voice, epoch)))
        np.save(BATCH_YM_FN % (self.voice, epoch), batch_ym)
        logging.info('%s written.' % (BATCH_YM_FN % (self.voice, epoch)))
        np.save(BATCH_YS_FN % (self.voice, epoch), batch_ys)
        logging.info('%s written.' % (BATCH_YS_FN % (self.voice, epoch)))
        np.save(BATCH_YL_FN % (self.voice, epoch), batch_yl)
        logging.info('%s written.' % (BATCH_YL_FN % (self.voice, epoch)))
def save_states(global_step, writer, mel_outputs, linear_outputs, attn, mel, y,
                input_lengths, checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))

    # idx = np.random.randint(0, len(input_lengths))
    idx = min(1, len(input_lengths) - 1)
    input_length = input_lengths[idx]

    # Alignment
    # Multi-hop attention
    if attn is not None and attn.dim() == 4:
        for i, alignment in enumerate(attn):
            alignment = alignment[idx].cpu().data.numpy()
            tag = "alignment_layer{}".format(i + 1)
            writer.add_image(tag, np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255),
                             global_step)

            # save files as well for now
            alignment_dir = join(checkpoint_dir, "alignment_layer{}".format(i + 1))
            os.makedirs(alignment_dir, exist_ok=True)
            path = join(alignment_dir,
                        "step{:09d}_layer_{}_alignment.png".format(global_step, i + 1))
            save_alignment(path, alignment)

        # Save averaged alignment
        alignment_dir = join(checkpoint_dir, "alignment_ave")
        os.makedirs(alignment_dir, exist_ok=True)
        path = join(alignment_dir, "step{:09d}_alignment.png".format(global_step))
        alignment = attn.mean(0)[idx].cpu().data.numpy()
        save_alignment(path, alignment)

        tag = "averaged_alignment"
        writer.add_image(tag, np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255),
                         global_step)

    # Predicted mel spectrogram
    if mel_outputs is not None:
        mel_output = mel_outputs[idx].cpu().data.numpy()
        if hparams.vocoder != "world":
            mel_output = prepare_spec_image(audio._denormalize(mel_output))
            writer.add_image("Predicted mel spectrogram", mel_output, global_step)
        else:
            mel_output_prep = mel_output
            try:
                writer.add_image("Predicted WORLD output", mel_output_prep, global_step)
            except:
                pass
            mel_output = denormalize(mel_output)

            # Decode WORLD features (f0, spectral envelope, aperiodicity) and synthesize
            nfft = pw.get_cheaptrick_fft_size(hparams.sample_rate)
            f0 = mel_output[:, 0].astype(np.float64)
            sp = pw.decode_spectral_envelope(
                mel_output[:, 1:(hparams.coded_env_dim + 1)].astype(np.float64),
                hparams.sample_rate, nfft)
            ap = pw.decode_aperiodicity(
                mel_output[:, (hparams.coded_env_dim + 1):hparams.num_mels].astype(np.float64),
                hparams.sample_rate, nfft)
            signal = pw.synthesize(f0, sp, ap, hparams.sample_rate,
                                   pw.default_frame_period)
            path = join(checkpoint_dir, "step{:09d}_out.wav".format(global_step))
            audio.save_wav(signal, path)
            try:
                signal /= np.max(np.abs(signal))
                writer.add_audio("Predicted audio signal", signal, global_step,
                                 sample_rate=fs)
            except:
                print("Unexpected error :", sys.exc_info())

            mel_tgt = mel[idx].cpu().data.numpy()
            mel_tgt = denormalize(mel_tgt)
            f0 = mel_tgt[:, 0].astype(np.float64)
            sp = pw.decode_spectral_envelope(
                mel_tgt[:, 1:(hparams.coded_env_dim + 1)].astype(np.float64),
                hparams.sample_rate, nfft)
            ap = pw.decode_aperiodicity(
                mel_tgt[:, (hparams.coded_env_dim + 1):hparams.num_mels].astype(np.float64),
                hparams.sample_rate, nfft)
            signal = pw.synthesize(f0, sp, ap, hparams.sample_rate,
                                   pw.default_frame_period)
            try:
                signal /= np.max(np.abs(signal))
                writer.add_audio("Target audio signal", signal, global_step,
                                 sample_rate=hparams.sample_rate)
            except:
                print("Unexpected error :", sys.exc_info())

    # Predicted spectrogram
    if linear_outputs is not None:
        linear_output = linear_outputs[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
        writer.add_image("Predicted linear spectrogram", spectrogram, global_step)

        # Predicted audio signal
        signal = audio.inv_spectrogram(linear_output.T)
        signal /= np.max(np.abs(signal))
        path = join(checkpoint_dir, "step{:09d}_predicted.wav".format(global_step))
        try:
            writer.add_audio("Predicted audio signal", signal, global_step,
                             sample_rate=fs)
        except Exception as e:
            warn(str(e))
        audio.save_wav(signal, path)

    # Target mel spectrogram
    if mel_outputs is not None:
        mel_output = mel[idx].cpu().data.numpy()
        mel_output = prepare_spec_image(audio._denormalize(mel_output))
        writer.add_image("Target mel spectrogram", mel_output, global_step)

    # Target spectrogram
    if linear_outputs is not None:
        linear_output = y[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
        writer.add_image("Target linear spectrogram", spectrogram, global_step)

    # ei
    path = join(checkpoint_dir, "step{:09d}_mel_target.npy".format(global_step))
    mel_output = mel[idx].cpu().data.numpy()
    np.save(path, denormalize(mel_output))

    path = join(checkpoint_dir, "step{:09d}_mel_out.npy".format(global_step))
    mel_output = denormalize(mel_outputs[idx].cpu().data.numpy())
    np.save(path, mel_output)
def save_states(global_step, writer, mel_outputs, converter_outputs, attn, mel, y,
                input_lengths, checkpoint_dir=None):

    def save_world(tuple_outputs, save_str, global_step=global_step):
        _, tar_f0, tar_sp, tar_ap = tuple_outputs

        # f0 save
        fig = plt.figure()
        f0 = tar_f0[idx].cpu().data.numpy() * 400
        ax = fig.add_subplot(1, 1, 1)
        ax.plot(f0)
        save_f0 = save_str + ' f0'
        writer.add_figure(save_f0, fig, global_step)

        # sp save
        sp = tar_sp[idx].cpu().data.numpy()
        s = prepare_spec_image(sp)
        save_sp = save_str + ' sp'
        writer.add_image(save_sp, s.transpose(2, 0, 1), global_step)

        # ap save
        ap = tar_ap[idx].cpu().data.numpy()
        a = prepare_spec_image(ap)
        save_ap = save_str + ' ap'
        writer.add_image(save_ap, a.transpose(2, 0, 1), global_step)

    print("Save intermediate states at step {}".format(global_step))

    # idx = np.random.randint(0, len(input_lengths))
    idx = min(1, len(input_lengths) - 1)
    input_length = input_lengths[idx]

    # Alignment
    # Multi-hop attention
    if attn is not None and attn.dim() == 4:
        for i, alignment in enumerate(attn):
            alignment = alignment[idx].cpu().data.numpy()
            tag = "alignment_layer{}".format(i + 1)
            writer.add_image(tag,
                             np.uint8(cm.viridis(np.flip(alignment, 1)) * 255).T,
                             global_step)  # transpose removed

            # save files as well for now
            alignment_dir = join(checkpoint_dir, "alignment_layer{}".format(i + 1))
            os.makedirs(alignment_dir, exist_ok=True)
            path = join(alignment_dir,
                        "step{:09d}_layer_{}_alignment.png".format(global_step, i + 1))
            save_alignment(path, alignment, global_step)

    # Predicted mel spectrogram
    if mel_outputs is not None:
        mel_output = mel_outputs[idx].cpu().data.numpy()
        mel_output = prepare_spec_image(audio._denormalize(mel_output))
        writer.add_image("Predicted mel spectrogram",
                         mel_output.transpose(2, 0, 1), global_step)

        # target
        mel_output = mel[idx].cpu().data.numpy()
        mel_output = prepare_spec_image(audio._denormalize(mel_output))
        writer.add_image("Target mel spectrogram",
                         mel_output.transpose(2, 0, 1), global_step)

    if converter_outputs is not None:
        # Predicted world parameters
        if type(converter_outputs) is tuple:
            # save predicted
            save_world(converter_outputs, 'Predicted')
            # save target
            save_world(y, 'Target')

            # save world signal
            _, f0s, sps, aps = converter_outputs
            f0 = f0s[idx].cpu().data.numpy() * 400
            sp = sps[idx].cpu().data.numpy()
            ap = aps[idx].cpu().data.numpy()

            # world vocoder
            signal = audio.world_synthesize(f0, sp, ap)
            signal /= np.max(np.abs(signal))

        # Predicted spectrogram
        else:
            linear_output = converter_outputs[idx].cpu().data.numpy()
            spectrogram = prepare_spec_image(audio._denormalize(linear_output))
            writer.add_image("Predicted linear spectrogram",
                             spectrogram.transpose(2, 0, 1), global_step)

            # Predicted audio signal
            signal = audio.inv_spectrogram(linear_output.T)
            signal /= np.max(np.abs(signal))

            # target
            linear_output = y[idx].cpu().data.numpy()
            spectrogram = prepare_spec_image(audio._denormalize(linear_output))
            writer.add_image("Target linear spectrogram",
                             spectrogram.transpose(2, 0, 1), global_step)

        path = join(checkpoint_dir, "step{:09d}_predicted.wav".format(global_step))
        try:
            writer.add_audio("Predicted audio signal", signal, global_step,
                             sample_rate=hparams.sample_rate)
        except Exception as e:
            warn(str(e))
        audio.save_wav(signal, path)
def save_states(global_step, writer, mel_outputs, linear_outputs, attn, mel, y,
                input_lengths, checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))

    # idx = np.random.randint(0, len(input_lengths))
    idx = min(1, len(input_lengths) - 1)
    input_length = input_lengths[idx]

    # Alignment
    # Multi-hop attention
    if attn is not None and attn.dim() == 4:
        for i, alignment in enumerate(attn):
            alignment = alignment[idx].cpu().data.numpy()
            tag = "alignment_layer{}".format(i + 1)
            writer.add_image(
                tag, np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255), global_step)

            # save files as well for now
            alignment_dir = join(checkpoint_dir, "alignment_layer{}".format(i + 1))
            os.makedirs(alignment_dir, exist_ok=True)
            path = join(alignment_dir,
                        "step{:09d}_layer_{}_alignment.png".format(global_step, i + 1))
            save_alignment(path, alignment)

        # Save averaged alignment
        alignment_dir = join(checkpoint_dir, "alignment_ave")
        os.makedirs(alignment_dir, exist_ok=True)
        path = join(alignment_dir, "step{:09d}_alignment.png".format(global_step))
        alignment = attn.mean(0)[idx].cpu().data.numpy()
        save_alignment(path, alignment)

        tag = "averaged_alignment"
        writer.add_image(
            tag, np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255), global_step)

    # Predicted mel spectrogram
    if mel_outputs is not None:
        mel_output = mel_outputs[idx].cpu().data.numpy()
        mel_output = prepare_spec_image(audio._denormalize(mel_output))
        writer.add_image("Predicted mel spectrogram", mel_output, global_step)

    # Predicted spectrogram
    if linear_outputs is not None:
        linear_output = linear_outputs[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
        writer.add_image("Predicted linear spectrogram", spectrogram, global_step)

        # Predicted audio signal
        signal = audio.inv_spectrogram(linear_output.T)
        signal /= np.max(np.abs(signal))
        path = join(checkpoint_dir, "step{:09d}_predicted.wav".format(global_step))
        try:
            writer.add_audio("Predicted audio signal", signal, global_step,
                             sample_rate=hparams.sample_rate)
        except Exception as e:
            warn(str(e))
        audio.save_wav(signal, path)

    # Target mel spectrogram
    if mel_outputs is not None:
        mel_output = mel[idx].cpu().data.numpy()
        mel_output = prepare_spec_image(audio._denormalize(mel_output))
        writer.add_image("Target mel spectrogram", mel_output, global_step)

    # Target spectrogram
    if linear_outputs is not None:
        linear_output = y[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
        writer.add_image("Target linear spectrogram", spectrogram, global_step)
def spec_to_wav(decode, wav_name):
    spec = np.load(decode)
    waveform = audio.inv_spectrogram(spec.T)
    audio.save_wav(waveform, wav_name)
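A hedged usage example for `spec_to_wav`, assuming a `.npy` spectrogram dumped with time on the first axis (the function transposes before inversion); both file names are hypothetical.

spec_to_wav("step000050000_linear_output.npy", "decoded.wav")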
def eval_model(self, global_epoch, train_seq2seq, train_postnet):
    happy_ref = np.load('../feat/Acoustic_frame/mel/emc00103.npy')
    happy_ref = torch.from_numpy(happy_ref).unsqueeze(0)
    sad_ref = np.load('../feat/Acoustic_frame/mel/ema00203.npy')
    sad_ref = torch.from_numpy(sad_ref).unsqueeze(0)
    angry_ref = np.load('../feat/Acoustic_frame/mel/eme00303.npy')
    angry_ref = torch.from_numpy(angry_ref).unsqueeze(0)

    running_loss = 0.
    running_linear_loss = 0.
    running_mel_loss = 0.

    for step, (ling, mel, linear, lengths, speaker_ids) in enumerate(self.valid_loader):
        self.model.eval()
        ismultispeaker = speaker_ids is not None
        if train_seq2seq:
            ling = ling.to(self.device)
            mel = mel.to(self.device)
            happy_ref = happy_ref.to(self.device)
            sad_ref = sad_ref.to(self.device)
            angry_ref = angry_ref.to(self.device)
        if train_postnet:
            linear = linear.to(self.device)
        lengths = lengths.to(self.device)
        speaker_ids = speaker_ids.to(self.device) if ismultispeaker else None

        target_mask = sequence_mask(lengths, max_len=mel.size(1)).unsqueeze(-1)

        with torch.no_grad():
            # Apply model
            if train_seq2seq and train_postnet:
                _, mel_outputs, linear_outputs = self.model(
                    ling, mel, speaker_ids=speaker_ids)
            """
            elif train_seq2seq:
                mel_style = self.model.gst(tmel)
                style_embed = mel_style.expand_as(smel)
                mel_input = smel + style_embed
                mel_outputs = self.model.seq2seq(mel_input)
                linear_outputs = None
            elif train_postnet:
                linear_outputs = self.model.postnet(tmel)
                mel_outputs = None
            """

            # Losses
            if train_seq2seq:
                mel_l1_loss, mel_binary_div = self.spec_loss(mel_outputs, mel, target_mask)
                mel_loss = (1 - self.w) * mel_l1_loss + self.w * mel_binary_div
            if train_postnet:
                linear_l1_loss, linear_binary_div = self.spec_loss(
                    linear_outputs, linear, target_mask)
                linear_loss = (1 - self.w) * linear_l1_loss + self.w * linear_binary_div

            # Combine losses
            if train_seq2seq and train_postnet:
                loss = mel_loss + linear_loss
            elif train_seq2seq:
                loss = mel_loss
            elif train_postnet:
                loss = linear_loss

            running_loss += loss.item()
            running_linear_loss += linear_loss.item()
            running_mel_loss += mel_loss.item()

            B = ling.size(0)
            if ismultispeaker:
                speaker_ids = np.zeros(B)
                speaker_ids = torch.LongTensor(speaker_ids).to(self.device)
            else:
                speaker_ids = None

            _, happy_mel_outputs, happy_linear_outputs = self.model(
                ling, happy_ref, speaker_ids)
            _, sad_mel_outputs, sad_linear_outputs = self.model(
                ling, sad_ref, speaker_ids)
            _, angry_mel_outputs, angry_linear_outputs = self.model(
                ling, angry_ref, speaker_ids)

        if global_epoch % self.eval_interval == 0:
            for idx in range(B):
                if mel_outputs is not None:
                    happy_mel_output = happy_mel_outputs[idx].cpu().data.numpy()
                    happy_mel_output = prepare_spec_image(
                        audio._denormalize(happy_mel_output))
                    self.writer.add_image(
                        "(Eval) Happy mel spectrogram {}".format(idx),
                        happy_mel_output, global_epoch)

                    sad_mel_output = sad_mel_outputs[idx].cpu().data.numpy()
                    sad_mel_output = prepare_spec_image(
                        audio._denormalize(sad_mel_output))
                    self.writer.add_image(
                        "(Eval) Sad mel spectrogram {}".format(idx),
                        sad_mel_output, global_epoch)

                    angry_mel_output = angry_mel_outputs[idx].cpu().data.numpy()
                    angry_mel_output = prepare_spec_image(
                        audio._denormalize(angry_mel_output))
                    self.writer.add_image(
                        "(Eval) Angry mel spectrogram {}".format(idx),
                        angry_mel_output, global_epoch)

                    mel_output = mel_outputs[idx].cpu().data.numpy()
                    mel_output = prepare_spec_image(audio._denormalize(mel_output))
                    self.writer.add_image(
                        "(Eval) Predicted mel spectrogram {}".format(idx),
                        mel_output, global_epoch)

                    mel1 = mel[idx].cpu().data.numpy()
                    mel1 = prepare_spec_image(audio._denormalize(mel1))
                    self.writer.add_image(
                        "(Eval) Source mel spectrogram {}".format(idx),
                        mel1, global_epoch)

                if linear_outputs is not None:
                    linear_output = linear_outputs[idx].cpu().data.numpy()
                    spectrogram = prepare_spec_image(
                        audio._denormalize(linear_output))
                    self.writer.add_image(
                        "(Eval) Predicted spectrogram {}".format(idx),
                        spectrogram, global_epoch)

                    signal = audio.inv_spectrogram(linear_output.T)
                    signal /= np.max(np.abs(signal))
                    path = join(self.checkpoint_dir,
                                "epoch{:09d}_{}_predicted.wav".format(global_epoch, idx))
                    audio.save_wav(signal, path)
                    try:
                        self.writer.add_audio(
                            "(Eval) Predicted audio signal {}".format(idx),
                            signal, global_epoch, sample_rate=self.fs)
                    except Exception as e:
                        warn(str(e))

                    happy_linear_output = happy_linear_outputs[idx].cpu().data.numpy()
                    spectrogram = prepare_spec_image(
                        audio._denormalize(happy_linear_output))
                    self.writer.add_image(
                        "(Eval) Happy spectrogram {}".format(idx),
                        spectrogram, global_epoch)

                    signal = audio.inv_spectrogram(happy_linear_output.T)
                    signal /= np.max(np.abs(signal))
                    path = join(self.checkpoint_dir,
                                "epoch{:09d}_{}_happy.wav".format(global_epoch, idx))
                    audio.save_wav(signal, path)
                    try:
                        self.writer.add_audio(
                            "(Eval) Happy audio signal {}".format(idx),
                            signal, global_epoch, sample_rate=self.fs)
                    except Exception as e:
                        warn(str(e))

                    angry_linear_output = angry_linear_outputs[idx].cpu().data.numpy()
                    spectrogram = prepare_spec_image(
                        audio._denormalize(angry_linear_output))
                    self.writer.add_image(
                        "(Eval) Angry spectrogram {}".format(idx),
                        spectrogram, global_epoch)

                    signal = audio.inv_spectrogram(angry_linear_output.T)
                    signal /= np.max(np.abs(signal))
                    path = join(self.checkpoint_dir,
                                "epoch{:09d}_{}_angry.wav".format(global_epoch, idx))
                    audio.save_wav(signal, path)
                    try:
                        self.writer.add_audio(
                            "(Eval) Angry audio signal {}".format(idx),
                            signal, global_epoch, sample_rate=self.fs)
                    except Exception as e:
                        warn(str(e))

                    sad_linear_output = sad_linear_outputs[idx].cpu().data.numpy()
                    spectrogram = prepare_spec_image(
                        audio._denormalize(sad_linear_output))
                    self.writer.add_image(
                        "(Eval) Sad spectrogram {}".format(idx),
                        spectrogram, global_epoch)

                    signal = audio.inv_spectrogram(sad_linear_output.T)
                    signal /= np.max(np.abs(signal))
                    path = join(self.checkpoint_dir,
                                "epoch{:09d}_{}_sad.wav".format(global_epoch, idx))
                    audio.save_wav(signal, path)
                    try:
                        self.writer.add_audio(
                            "(Eval) Sad audio signal {}".format(idx),
                            signal, global_epoch, sample_rate=self.fs)
                    except Exception as e:
                        warn(str(e))

                    linear1 = linear[idx].cpu().data.numpy()
                    spectrogram = prepare_spec_image(audio._denormalize(linear1))
                    self.writer.add_image(
                        "(Eval) Target spectrogram {}".format(idx),
                        spectrogram, global_epoch)

                    signal = audio.inv_spectrogram(linear1.T)
                    signal /= np.max(np.abs(signal))
                    try:
                        self.writer.add_audio(
                            "(Eval) Target audio signal {}".format(idx),
                            signal, global_epoch, sample_rate=self.fs)
                    except Exception as e:
                        warn(str(e))

    avg_loss = running_loss / len(self.valid_loader)
    avg_linear_loss = running_linear_loss / len(self.valid_loader)
    avg_mel_loss = running_mel_loss / len(self.valid_loader)
    self.writer.add_scalar("valid loss (per epoch)", avg_loss, global_epoch)
    self.writer.add_scalar("valid linear loss (per epoch)", avg_linear_loss, global_epoch)
    self.writer.add_scalar("valid mel loss (per epoch)", avg_mel_loss, global_epoch)
    print("Valid Loss: {}".format(avg_loss))
sad = np.load(join(args.mel_dir, sad_name))
angry = np.load(join(args.mel_dir, angry_name))

lingX = torch.from_numpy(Xling).unsqueeze(0).to(device)
melX = torch.from_numpy(melX).unsqueeze(0).to(device)
happy = torch.from_numpy(happy).unsqueeze(0).to(device)
sad = torch.from_numpy(sad).unsqueeze(0).to(device)
angry = torch.from_numpy(angry).unsqueeze(0).to(device)

style_n, mel_output, linear_output = model(lingX, melX)
style_h, happy_mel_output, happy_linear_output = model(lingX, happy)
style_s, sad_mel_output, sad_linear_output = model(lingX, sad)
style_a, angry_mel_output, angry_linear_output = model(lingX, angry)

linear_output = linear_output[0].data.cpu().numpy()
signal = audio.inv_spectrogram(linear_output.T)
signal /= np.max(np.abs(signal))
path = join(args.result_dir, Xmel_name.replace('.npy', '.wav'))
audio.save_wav(signal, path)

happy_linear_output = happy_linear_output[0].data.cpu().numpy()
signal = audio.inv_spectrogram(happy_linear_output.T)
signal /= np.max(np.abs(signal))
path = join(args.result_dir, happy_name.replace('.npy', '.wav'))
audio.save_wav(signal, path)

sad_linear_output = sad_linear_output[0].data.cpu().numpy()
signal = audio.inv_spectrogram(sad_linear_output.T)
signal /= np.max(np.abs(signal))
path = join(args.result_dir, sad_name.replace('.npy', '.wav'))
audio.save_wav(signal, path)