import os
import wave

import numpy as np
import pyaudio
from flask import jsonify, request
from tqdm import tqdm

# NOTE: project-local helpers (log, hparams_debug_string, Synthesizer, audio, plot,
# is_korean_text, is_korean_char, normalize_number, split_to_jamo, text_to_sequence,
# j2h, generate_audio_response, hparams) are assumed to be importable from the
# surrounding package; their exact module paths are not shown in the original source.


def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model in ('Both', 'Tacotron-2'):
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)  # mels_dir = wavenet_input_dir

    # Create output directories if they don't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())

    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, text in enumerate(tqdm(sentences)):
            if is_korean_text(text):
                text = normalize_number(text)
                # Split Hangul text into individual jamo (consonant/vowel) units.
                text = split_to_jamo(text, hparams.cleaners)
            # NOTE: the original call omitted the speaker_id that synthesize()
            # requires; defaulting to 0 here is an assumption.
            mel_filename = synth.synthesize(text, i + 1, eval_dir, log_dir, None, speaker_id=0)
            file.write('{}|{}\n'.format(text, mel_filename))

    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
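
# A minimal smoke-test sketch for run_eval. The SimpleNamespace stands in for the
# real argparse namespace, and the checkpoint path, output directory, and sentences
# are placeholders (assumptions, not part of the original source). The mels_dir is
# chosen so the eval_dir assertion above holds.
def _smoke_test_run_eval():
    from types import SimpleNamespace
    args = SimpleNamespace(model='Tacotron-2', mels_dir=os.path.join('output', 'eval'))
    sentences = ['안녕하세요.', '123번 버스를 타세요.']
    run_eval(args, 'pretrained/model.ckpt', 'output', hparams, sentences)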
def view_method():
    text = request.args.get('text')
    # Default to speaker 0 when the query parameter is absent (an assumption;
    # the original int() call would raise on a missing parameter).
    speaker_id = int(request.args.get('speaker_id', 0))

    if text:
        if is_korean_text(text):
            text = normalize_number(text)
            text = split_to_jamo(text, hparams.cleaners)
        return generate_audio_response(text, speaker_id)
    return jsonify(success=True), 200
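
# A sketch of how view_method might be wired into a Flask app; the app factory,
# route path, and endpoint name are assumptions, since the original excerpt shows
# only the view body.
def _make_app():
    from flask import Flask
    app = Flask(__name__)
    app.add_url_rule('/generate', view_func=view_method)
    return app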
def _get_next_example(self):
    """Gets a single example (input, mel_target, token_target, linear_target, mel_length) from disk."""
    if self._train_offset >= len(self._train_meta):
        self._train_offset = 0
        np.random.shuffle(self._train_meta)

    meta = self._train_meta[self._train_offset]
    self._train_offset += 1

    speaker_id = int(meta[5])
    text = meta[6]
    if is_korean_text(text):
        text = normalize_number(text)
        # Split Hangul text into individual jamo (consonant/vowel) units.
        text = split_to_jamo(text, self._cleaner_names)

    input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
    mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
    # Create a parallel sequence of zeros to represent a non-finished sequence
    token_target = np.asarray([0.] * (len(mel_target) - 1))
    linear_target = np.load(os.path.join(self._linear_dir, meta[2]))
    return (input_data, mel_target, token_target, linear_target, speaker_id, len(mel_target))
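
# A quick illustration of the jamo decomposition the feeder relies on, assuming
# the `jamo` PyPI package backs split_to_jamo and j2h (an assumption; the project
# may ship its own implementation):
def _jamo_demo():
    from jamo import h2j, j2hcj
    # '한글' decomposes into the compatibility jamo ㅎ ㅏ ㄴ ㄱ ㅡ ㄹ
    assert j2hcj(h2j('한글')) == 'ㅎㅏㄴㄱㅡㄹ'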
def synthesize(self, text, index, out_dir, log_dir, mel_filename, speaker_id):
    hparams = self._hparams
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]

    if is_korean_text(text):
        text = normalize_number(text)
        # Split Hangul text into individual jamo (consonant/vowel) units.
        text = split_to_jamo(text, cleaner_names)
    seq = text_to_sequence(text, cleaner_names)

    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
        self.model.speaker_ids: np.asarray([speaker_id], dtype=np.int32),
    }

    if self.gta:
        # Ground-truth alignment: teacher-force the decoder with the target mels (80 = num_mels)
        feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(1, -1, 80)

    if self.gta or not hparams.predict_linear:
        mels, alignment = self.session.run([self.mel_outputs, self.alignment], feed_dict=feed_dict)
    else:
        linear, mels, alignment = self.session.run(
            [self.linear_outputs, self.mel_outputs, self.alignment], feed_dict=feed_dict)
        linear = linear.reshape(-1, hparams.num_freq)
    mels = mels.reshape(-1, hparams.num_mels)  # Thanks to @imdatsolak for pointing this out

    if index is None:
        # Generate a wav, write it to a temporary file, and play it back
        wav = audio.inv_mel_spectrogram(mels.T, hparams)
        audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # TODO: find a better way than a fixed temp file
        chunk = 512
        f = wave.open('temp.wav', 'rb')
        p = pyaudio.PyAudio()
        stream = p.open(format=p.get_format_from_width(f.getsampwidth()),
                        channels=f.getnchannels(),
                        rate=f.getframerate(),
                        output=True)
        data = f.readframes(chunk)
        while data:
            stream.write(data)
            data = f.readframes(chunk)
        stream.stop_stream()
        stream.close()
        p.terminate()
        return

    # Write the mel spectrogram to disk
    # Note: output mel-spectrogram files and target ones have the same names, just different folders
    mel_filename = os.path.join(out_dir, 'speech-mel-{:05d}.npy'.format(index))
    np.save(mel_filename, mels, allow_pickle=False)

    if log_dir is not None:
        # save wav (mel -> wav)
        wav = audio.inv_mel_spectrogram(mels.T, hparams)
        audio.save_wav(wav, os.path.join(log_dir, 'wavs/speech-wav-{:05d}-mel.wav'.format(index)),
                       sr=hparams.sample_rate)

        if hparams.predict_linear:
            # save wav (linear -> wav)
            wav = audio.inv_linear_spectrogram(linear.T, hparams)
            audio.save_wav(wav, os.path.join(log_dir, 'wavs/speech-wav-{:05d}-linear.wav'.format(index)),
                           sr=hparams.sample_rate)

        if is_korean_char(text):
            # Recompose jamo back into Hangul syllables so plot titles are readable
            text = j2h(text)

        # save the alignment plot
        plot.plot_alignment(alignment,
                            os.path.join(log_dir, 'plots/speech-alignment-{:05d}.png'.format(index)),
                            info='{}'.format(text), split_title=True)
        # save the mel spectrogram plot
        plot.plot_spectrogram(mels,
                              os.path.join(log_dir, 'plots/speech-mel-{:05d}.png'.format(index)),
                              info='{}'.format(text), split_title=True)

    return mel_filename
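
# A minimal usage sketch for Synthesizer.synthesize; the checkpoint path is a
# placeholder. Passing index=None takes the live PyAudio playback branch above
# instead of writing mel/wav/plot files to disk (requires a working audio device).
if __name__ == '__main__':
    synth = Synthesizer()
    synth.load('pretrained/model.ckpt', hparams)
    synth.synthesize('안녕하세요.', None, None, None, None, speaker_id=0)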