def synthesize(self, texts, is_sequence=False, pml_filenames=None, tgt_filenames=None):
    if tgt_filenames:
        pml_filenames = tgt_filenames  # hacky way to handle tgts other than pml

    hp = self._hparams
    cleaner_names = [x.strip() for x in hp.cleaners.split(',')]

    if isinstance(texts, str):
        seqs = [np.asarray(text_to_sequence(texts, cleaner_names), dtype=np.int32)]
    elif is_sequence:
        seqs = [np.asarray(texts, dtype=np.int32)]
    else:
        seqs = [np.asarray(text_to_sequence(text, cleaner_names), dtype=np.int32) for text in texts]

    input_seqs = self._prepare_inputs(seqs)

    feed_dict = {
        self.model.inputs: np.asarray(input_seqs, dtype=np.int32),
        self.model.input_lengths: np.asarray([len(seq) for seq in seqs], dtype=np.int32),
    }

    if self.gta:
        np_targets = [np.load(pml_filename) for pml_filename in pml_filenames]
        prepared_targets = self._prepare_targets(np_targets, hp.outputs_per_step)
        feed_dict[self.targets] = prepared_targets
        assert len(np_targets) == len(texts)

    alignments, = self.session.run([self.alignments], feed_dict=feed_dict)

    if not self.cut_lengths:
        max_length = hp.max_iters
        alignments = self.pad_along_axis(alignments, max_length, axis=2)

    if len(alignments) == 1:
        return alignments[0]

    return alignments

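# The synthesize() above (and synthesize_check() further down) assumes a pad_along_axis
# helper on the class. A minimal sketch, not the original implementation: zero-pad one
# axis of an array up to target_length (the name and behaviour are assumptions).
def pad_along_axis(self, array, target_length, axis=0):
    pad_size = target_length - array.shape[axis]
    if pad_size <= 0:
        return array
    # pad only the chosen axis, leaving all other axes untouched
    npad = [(0, 0)] * array.ndim
    npad[axis] = (0, pad_size)
    return np.pad(array, pad_width=npad, mode='constant', constant_values=0)
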
def eval(self, batch):
    hparams = self._hparams
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in batch]
    input_lengths = [len(seq) for seq in seqs]
    seqs = self._prepare_inputs(seqs)

    feed_dict = {
        self.model.inputs: seqs,
        self.model.input_lengths: np.asarray(input_lengths, dtype=np.int32),
    }

    features, stop_tokens = self.session.run(
        [self.model.final_outputs, self.stop_token_outputs],
        feed_dict=feed_dict)

    # Get feature output lengths for the entire batch from stop_tokens outputs
    output_lengths = self._get_output_lengths(stop_tokens)
    features = [feature[:output_length, :]
                for feature, output_length in zip(features, output_lengths)]
    assert len(features) == len(batch)

    wavs = []
    for i, feature in enumerate(features):
        np.save('tacotron_output/{}.npy'.format(i + 1), feature)
        wavs.append(audio.synthesize(feature, hparams))

    return np.concatenate(wavs)

def _get_next_example(self):
    """Gets a single example (input, mel_target, token_target, linear_target, mel_length) from disk"""
    if self._train_offset >= len(self._train_meta):
        self._train_offset = 0
        np.random.shuffle(self._train_meta)

    meta = self._train_meta[self._train_offset]
    self._train_offset += 1

    text = meta[5]
    input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)

    num = int((meta[1].split('-')[2]).split('.')[0])
    folder_num = str(num // 1000)
    sub_folder_num = str((num % 1000) // 100)

    mel_target = np.load(os.path.join(self._mel_dir, folder_num, sub_folder_num, meta[1]))
    # Create parallel sequences containing zeros to represent a non finished sequence
    token_target = np.asarray([0.] * (len(mel_target) - 1))
    linear_target = np.load(os.path.join(self._linear_dir, folder_num, sub_folder_num, meta[2]))

    return (input_data, mel_target, token_target, linear_target, len(mel_target))

def _get_test_groups(self):
    meta = self._test_meta[self._test_offset]
    self._test_offset += 1

    text = meta[5]

    if self._hparams.preload_spectrogram:
        if meta[1] in self._mel_target:
            mel_target = self._mel_target[meta[1]]
        else:
            mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
            self._mel_target[meta[1]] = mel_target

        # cache the linear target under its own key (the original mixed meta[1]/meta[2])
        if meta[2] in self._linear_target:
            linear_target = self._linear_target[meta[2]]
        else:
            linear_target = np.load(os.path.join(self._linear_dir, meta[2]))
            self._linear_target[meta[2]] = linear_target
    else:
        mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
        linear_target = np.load(os.path.join(self._linear_dir, meta[2]))

    input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
    # Create parallel sequences containing zeros to represent a non finished sequence
    token_target = np.asarray([0.] * (len(mel_target) - 1))

    return (input_data, mel_target, token_target, linear_target, len(mel_target))

def eval(self, batch):
    hparams = self._hparams
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in batch]
    input_lengths = [len(seq) for seq in seqs]
    seqs = self._prepare_inputs(seqs)

    feed_dict = {
        self.model.inputs: seqs,
        self.model.input_lengths: np.asarray(input_lengths, dtype=np.int32),
    }

    linears, stop_tokens = self.session.run(
        [self.linear_outputs, self.stop_token_prediction],
        feed_dict=feed_dict)

    # Get Mel/Linear lengths for the entire batch from stop_tokens predictions
    target_lengths = self._get_output_lengths(stop_tokens)

    # Take off the batch wise padding
    linears = [linear[:target_length, :]
               for linear, target_length in zip(linears, target_lengths)]
    assert len(linears) == len(batch)

    # save wav (linear -> wav)
    results = []
    for i, linear in enumerate(linears):
        linear_wav = self.session.run(self.linear_wav_outputs,
                                      feed_dict={self.linear_spectrograms: linear})
        wav = audio.inv_preemphasis(linear_wav, hparams.preemphasis)
        results.append(wav)

    return np.concatenate(results)

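# Several eval()/synthesize() variants above call a _get_output_lengths helper to turn
# stop-token predictions into per-utterance frame counts. A hedged sketch, assuming
# stop_tokens is a [batch, max_frames] array of probabilities where a rounded value of 1
# marks the first frame past the end of speech:
def _get_output_lengths(self, stop_tokens):
    lengths = []
    for row in stop_tokens:
        fired = np.where(np.round(row) == 1)[0]
        # index of the first fired stop token, or the full length when none fires
        lengths.append(int(fired[0]) if len(fired) else row.shape[0])
    return lengths
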
def inference(self, texts):
    cleaner_names = [x.strip() for x in self._hp.cleaners.split(',')]
    seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts]
    input_lengths = [len(seq) for seq in seqs]

    # split the batch evenly across GPUs (the original omitted the division)
    size_per_device = len(seqs) // self._hp.tacotron_num_gpus

    # Pad inputs according to each GPU max length
    input_seqs = None
    split_infos = []
    for i in range(self._hp.tacotron_num_gpus):
        device_input = seqs[size_per_device * i:size_per_device * (i + 1)]
        device_input, max_seq_len = self._prepare_inputs(device_input)
        input_seqs = np.concatenate((input_seqs, device_input), axis=1) \
            if input_seqs is not None else device_input
        split_infos.append([max_seq_len, 0, 0, 0])

    feed_dict = {
        self.inputs: input_seqs,
        self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
        self.split_infos: np.asarray(split_infos, dtype=np.int32)
    }

    self.sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])

    linear = self.sess.run(self.linear_outputs, feed_dict=feed_dict)
    wav = self.sess.run(self.GLGPU_lin_outputs,
                        feed_dict={self.GLGPU_lin_inputs: linear[0]})

    return wav

def synthesize(self, text, index, out_dir, log_dir, mel_filename):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
    }

    if self.gta:
        feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(1, -1, 80)

    mels, alignment = self.session.run([self.mel_outputs, self.alignment],
                                       feed_dict=feed_dict)

    mels = mels.reshape(-1, 80)  # Thanks to @imdatsolak for pointing this out

    # Write the spectrogram to disk
    # Note: output mel-spectrogram files and target ones have the same names, just different folders
    mel_filename = os.path.join(out_dir, 'ljspeech-mel-{:05d}.npy'.format(index))
    np.save(mel_filename, mels, allow_pickle=False)

    if log_dir is not None:
        # save wav
        wav = audio.inv_mel_spectrogram(mels.T)
        audio.save_wav(wav, os.path.join(log_dir, 'wavs/ljspeech-wav-{:05d}.wav'.format(index)))

        # save alignments
        plot.plot_alignment(alignment,
                            os.path.join(log_dir, 'plots/ljspeech-alignment-{:05d}.png'.format(index)),
                            info='{}'.format(text), split_title=True)

        # save mel spectrogram plot
        plot.plot_spectrogram(mels,
                              os.path.join(log_dir, 'plots/ljspeech-mel-{:05d}.png'.format(index)),
                              info='{}'.format(text), split_title=True)

    return mel_filename

def run(self, text, speaker_id, play=True):
    hparams = self._hparams
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
        self.model.speaker_ids: np.asarray([speaker_id], dtype=np.int32)
    }

    mels, alignment = self.session.run([self.mel_outputs, self.alignment],
                                       feed_dict=feed_dict)
    mels = mels.reshape(-1, hparams.num_mels)  # Thanks to @imdatsolak for pointing this out

    if play:
        # Generate wav and read it
        wav = audio.inv_mel_spectrogram(mels.T, hparams)
        audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way

        chunk = 512
        f = wave.open('temp.wav', 'rb')
        p = pyaudio.PyAudio()
        stream = p.open(format=p.get_format_from_width(f.getsampwidth()),
                        channels=f.getnchannels(),
                        rate=f.getframerate(),
                        output=True)
        data = f.readframes(chunk)

        while data:
            stream.write(data)
            data = f.readframes(chunk)

        stream.stop_stream()
        stream.close()
        p.terminate()

    return mels

def synthesize(self, text, idx, out_dir, mel_filename):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
    }

    mels, alignment = self.session.run([self.mel_outputs, self.alignment],
                                       feed_dict=feed_dict)
    mels = mels.reshape(-1, hparams.num_mels)

    wav = audio.inv_mel_spectrogram(mels.T)
    audio.save_wav(wav, os.path.join(out_dir, 'audio-{:02d}.wav'.format(idx)))

    # save mel spectrogram plot
    plot.plot_spectrogram(mels,
                          os.path.join(out_dir, 'mel-{:02d}.png'.format(idx)),
                          info='{}'.format(text), split_title=True)

    return 1

def _get_next_example(self):
    """Gets a single example (input, mel_target, token_target, linear_target, mel_length) from disk"""
    if self._train_offset >= len(self._train_meta):
        self._train_offset = 0
        np.random.shuffle(self._train_meta)

    meta = self._train_meta[self._train_offset]
    self._train_offset += 1

    # remove the 'mel-' prefix (lstrip('mel-') would strip any leading 'm', 'e', 'l' or '-' characters)
    dur_file = meta[1][len('mel-'):] if meta[1].startswith('mel-') else meta[1]
    dur_file = os.path.join(self.duration_dir, dur_file)
    dur = np.squeeze(np.load(dur_file))
    alignment = convert_dur2alignment(dur)

    text = meta[5]
    input_data = np.asarray(text_to_sequence(text), dtype=np.int32)
    mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
    # Create parallel sequences containing zeros to represent a non finished sequence
    token_target = np.asarray([0.] * (len(mel_target) - 1))
    linear_target = np.load(os.path.join(self._linear_dir, meta[2]))

    if len(dur) != len(input_data):
        raise RuntimeError('wrong dur')

    return (input_data, mel_target, token_target, linear_target, dur, alignment, len(mel_target))

def preprocess_text(texts):
    seqs = [np.array(text_to_sequence(text, ['english_cleaners'])) for text in texts]
    input_lengths = [seq.shape[0] for seq in seqs]
    max_len = max(input_lengths)
    seqs = np.stack([_pad_input(x, max_len) for x in seqs])
    return seqs, input_lengths

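# preprocess_text() above assumes a _pad_input helper. A minimal sketch: right-pad a
# 1-D id sequence up to max_len with a padding id (0 is assumed here).
def _pad_input(x, max_len, pad_value=0):
    return np.pad(x, (0, max_len - x.shape[0]), mode='constant', constant_values=pad_value)
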
def synthesize(self, texts, pml_filenames=None, to_wav=False, num_workers=4, **kwargs):
    hp = self._hparams
    kwargs.setdefault('pp_mcep', self.cfg.pp_mcep)
    kwargs.setdefault('spec_type', hp.spec_type)

    cleaner_names = [x.strip() for x in hp.cleaners.split(',')]
    seqs = [np.asarray(text_to_sequence(text, cleaner_names), dtype=np.int32) for text in texts]
    input_seqs = self._prepare_inputs(seqs)

    feed_dict = {
        self.model.inputs: np.asarray(input_seqs, dtype=np.int32),
        self.model.input_lengths: np.asarray([len(seq) for seq in seqs], dtype=np.int32),
    }

    # if self.gta:
    if self.gta or self.eal:
        np_targets = [np.load(pml_filename) for pml_filename in pml_filenames]
        prepared_targets = self._prepare_targets(np_targets, hp.outputs_per_step)
        feed_dict[self.targets] = prepared_targets
        assert len(np_targets) == len(texts)

    if self.flag_online:
        pml_features_matrix = self.session.run(self.pml_outputs_eal, feed_dict=feed_dict)
    else:
        pml_features_matrix = self.session.run(self.pml_outputs, feed_dict=feed_dict)

    if to_wav:
        executor = ProcessPoolExecutor(max_workers=num_workers)
        futures = []

        for pml_features in pml_features_matrix:
            futures.append(executor.submit(partial(_pml_to_wav, pml_features, self.cfg, **kwargs)))

        wavs = [future.result() for future in futures]
        return wavs

    return pml_features_matrix

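# Several synthesize() variants above batch variable-length arrays with
# _prepare_inputs/_prepare_targets. A hedged sketch of one common convention
# (zero right-padding; targets rounded up to a multiple of outputs_per_step).
# The originals may differ, e.g. some also return the padded max length.
def _prepare_inputs(self, inputs):
    max_len = max(len(x) for x in inputs)
    return np.stack([np.pad(x, (0, max_len - len(x)), mode='constant') for x in inputs])

def _prepare_targets(self, targets, alignment):
    # round the padded frame count up to a multiple of the reduction factor r (= outputs_per_step)
    max_len = max(len(t) for t in targets)
    padded_len = max_len + (-max_len) % alignment
    return np.stack([np.pad(t, ((0, padded_len - len(t)), (0, 0)), mode='constant') for t in targets])
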
def synthesize(self, text, mel, out_dir, idx):
    hparams = self._hparams
    r = hparams.outputs_per_step
    T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) \
        if hparams.symmetric_mels else (0, hparams.max_abs_value)

    target = np.load(mel)
    target = np.clip(target, T2_output_range[0], T2_output_range[1])
    target_length = target.shape[0]
    targets = padding_targets(target, r, T2_output_range[0])
    new_target_length = targets.shape[0]

    pyin, text = get_pyin(text)
    print(text)
    inputs = [np.asarray(text_to_sequence(pyin.split(' ')))]
    print(inputs)
    input_lengths = [len(inputs[0])]

    feed_dict = {
        self.inputs: np.asarray(inputs, dtype=np.int32),
        self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
        self.targets: np.asarray([targets], dtype=np.float32),
        self.target_lengths: np.asarray([new_target_length], dtype=np.int32),
    }

    mels, alignments = self.session.run([self.mel_outputs, self.alignments],
                                        feed_dict=feed_dict)

    mel = mels[0]
    print('pred_mel.shape', mel.shape)
    mel = np.clip(mel, T2_output_range[0], T2_output_range[1])
    mel = mel[:target_length, :]
    mel = (mel + T2_output_range[1]) / (2 * T2_output_range[1])
    mel = np.clip(mel, 0.0, 1.0)  # 0 ~ 1.0
    print(target_length, new_target_length)

    pred_mel_path = os.path.join(out_dir, 'mel-{}-pred.npy'.format(idx))
    np.save(pred_mel_path, mel, allow_pickle=False)
    plot.plot_spectrogram(mel, pred_mel_path.replace('.npy', '.png'), title='')

    alignment = alignments[0]
    alignment_path = os.path.join(out_dir, 'align-{}.png'.format(idx))
    plot.plot_alignment(alignment, alignment_path, title='')
    # alignment_path = os.path.join(out_dir, 'align-{}.npy'.format(idx))
    # np.save(alignment_path, alignment, allow_pickle=False)

    return pred_mel_path, alignment_path

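# synthesize() above calls padding_targets(target, r, pad_value) before feeding the
# reference mel. A minimal sketch, assuming it right-pads the frame axis so the number
# of frames becomes a multiple of the reduction factor r:
def padding_targets(target, r, pad_value):
    extra = (-target.shape[0]) % r
    return np.pad(target, ((0, extra), (0, 0)), mode='constant', constant_values=pad_value)
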
def _get_test_groups(self):
    meta = self._test_meta[self._test_offset]
    self._test_offset += 1

    text = meta[5]
    input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
    mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
    # Create parallel sequences containing zeros to represent a non finished sequence
    token_target = np.asarray([0.] * (len(mel_target) - 1))
    linear_target = np.load(os.path.join(self._linear_dir, meta[2]))

    return (input_data, mel_target, token_target, linear_target, len(mel_target))

def synthesize(self, text, mel_filename=None):
    # mel_filename is only needed for GTA synthesis; the original signature omitted it
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
    }

    if self.gta:
        feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(1, -1, 80)

    if self.gta or not hparams.predict_linear:
        mels, alignment = self.session.run([self.mel_outputs, self.alignment],
                                           feed_dict=feed_dict)
    else:
        linear, mels, alignment = self.session.run(
            [self.linear_outputs, self.mel_outputs, self.alignment],
            feed_dict=feed_dict)
        linear = linear.reshape(-1, hparams.num_freq)

    mels = mels.reshape(-1, hparams.num_mels)  # Thanks to @imdatsolak for pointing this out

    wav = audio.inv_mel_spectrogram(mels.T)
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue()

    # # Write the spectrogram to disk
    # # Note: outputs mel-spectrogram files and target ones have same names, just different folders
    # mel_filename = os.path.join(out_dir, 'speech-mel-{:05d}.npy'.format(index))
    # np.save(mel_filename, mels, allow_pickle=False)

    # if log_dir is not None:
    #     # save wav (mel -> wav)
    #     wav = audio.inv_mel_spectrogram(mels.T)
    #     audio.save_wav(wav, os.path.join(log_dir, 'wavs/speech-wav-{:05d}-mel.wav'.format(index)))

    #     if hparams.predict_linear:
    #         # save wav (linear -> wav)
    #         wav = audio.inv_linear_spectrogram(linear.T)
    #         audio.save_wav(wav, os.path.join(log_dir, 'wavs/speech-wav-{:05d}-linear.wav'.format(index)))

    #     # save alignments
    #     plot.plot_alignment(alignment, os.path.join(log_dir, 'plots/speech-alignment-{:05d}.png'.format(index)),
    #                         info='{}'.format(text), split_title=True)

    #     # save mel spectrogram plot
    #     plot.plot_spectrogram(mels, os.path.join(log_dir, 'plots/speech-mel-{:05d}.png'.format(index)),
    #                           info='{}'.format(text), split_title=True)

    # return mel_filename

def _get_test_groups(self):
    meta = self._test_meta[self._test_offset]
    self._test_offset += 1

    text = meta[4]
    input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
    mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
    # Create parallel sequences containing zeros to represent a non finished sequence
    token_target = np.asarray([0.] * (len(mel_target) - 1))

    return (input_data, mel_target, token_target, len(mel_target))

def reverse_equal_original(text):
    print('text:', text, 'len:', len(text))
    seq_text = text_to_sequence(text, cleaner_names)
    print('seq text:', seq_text, 'len:', len(seq_text))
    textReverse_seq_text = sequence_to_text(seq_text)
    print('textReverse_seq_text:', textReverse_seq_text, 'len:', len(textReverse_seq_text))

    if len(textReverse_seq_text) - 1 != len(text) or textReverse_seq_text[-1] != '~':
        return False
    if len(text) != len(seq_text) - 1:
        return False
    return True

def _get_next_example(self):
    """Gets a single example (input, mel_target, token_target, linear_target, mel_length) from disk"""
    if self._train_offset >= len(self._train_meta):
        self._train_offset = 0
        np.random.shuffle(self._train_meta)

    meta = self._train_meta[self._train_offset]
    self._train_offset += 1

    if not self._hparams.tacotron_phoneme_transcription:
        text = meta[5]
        input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
    # Phoneme transcription
    else:
        '''
        text_as_words = meta[5].split(' ')
        text_as_phonemes = meta[6].split(' ')
        assert len(text_as_words) == len(text_as_phonemes)
        for i in range(0, len(text_as_words)):
            random_number = np.random.random()
            if random_number < self._proba_phoneme:
                text_as_words[i] = text_as_phonemes[i]
        text = " ".join(text_as_words)
        '''
        text = meta[6]
        input_data = np.asarray(ipa_to_articulatory_sequence(text), dtype=np.int32)

    if self._hparams.tacotron_multi_speaker:
        # one-hot speaker vector
        speaker_id = [0 for i in range(int(self._nb_speaker))]
        speaker_id[int(meta[7])] = 1

    mel_reference = np.load(os.path.join(self._mel_dir, meta[2]))

    # input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
    mel_target = np.load(os.path.join(self._mel_dir, meta[8]))
    # Create parallel sequences containing zeros to represent a non finished sequence
    token_target = np.asarray([0.] * (len(mel_target) - 1))
    linear_target = np.load(os.path.join(self._linear_dir, meta[2]))

    if self._hparams.tacotron_multi_speaker:
        return (input_data, mel_target, token_target, linear_target, len(mel_target), speaker_id, mel_reference)
    else:
        return (input_data, mel_target, token_target, linear_target, len(mel_target), mel_reference)

def eval(self, text):
    hparams = self._hparams
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seqs = [np.asarray(text_to_sequence(text, cleaner_names))]
    input_lengths = [len(seq) for seq in seqs]

    feed_dict = {
        self.model.inputs: seqs,
        self.model.input_lengths: np.asarray(input_lengths, dtype=np.int32),
    }

    linear_wavs = self.session.run(self.linear_wav_outputs, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(linear_wavs, hparams.preemphasis)

    out = io.BytesIO()
    audio.save_wav(wav, out, sr=hparams.sample_rate)
    return out.getvalue()

def synthesize_check(self, texts, pml_filenames=None, tgt_filenames=None, to_wav=False, num_workers=4, **kwargs):
    if tgt_filenames is None:
        tgt_filenames = pml_filenames

    hp = self._hparams
    kwargs.setdefault('pp_mcep', self.cfg.pp_mcep)
    kwargs.setdefault('spec_type', hp.spec_type)

    cleaner_names = [x.strip() for x in hp.cleaners.split(',')]
    seqs = [np.asarray(text_to_sequence(text, cleaner_names), dtype=np.int32) for text in texts]
    input_seqs = self._prepare_inputs(seqs)

    feed_dict = {
        self.model.inputs: np.asarray(input_seqs, dtype=np.int32),
        self.model.input_lengths: np.asarray([len(seq) for seq in seqs], dtype=np.int32),
    }

    # if self.gta:
    if self.gta or self.eal:
        np_targets = [np.load(tgt_filename) for tgt_filename in tgt_filenames]
        prepared_targets = self._prepare_targets(np_targets, hp.outputs_per_step)
        feed_dict[self.targets] = prepared_targets
        assert len(np_targets) == len(texts)

    alignments, = self.session.run([self.model.alignments], feed_dict=feed_dict)
    # alignments, pml_intermediates = self.session.run(
    #     [self.model.alignments, self.model.pml_intermediates], feed_dict=feed_dict)

    if True:  # not self.cut_lengths
        max_length = hp.max_iters
        alignments = self.pad_along_axis(alignments, max_length, axis=2)

    if len(alignments) == 1:
        return alignments[0]

    return alignments

def synthesize(self, texts, basenames, out_dir, log_dir):
    hparams = self._hparams
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts]
    input_lengths = [len(seq) for seq in seqs]
    seqs = self._prepare_inputs(seqs)

    feed_dict = {
        self.model.inputs: seqs,
        self.model.input_lengths: np.asarray(input_lengths, dtype=np.int32),
    }

    features, alignments, stop_tokens = self.session.run(
        [self.final_outputs, self.alignments, self.stop_token_outputs],
        feed_dict=feed_dict)

    # Get feature output lengths for the entire batch from stop_tokens outputs
    output_lengths = self._get_output_lengths(stop_tokens)
    features = [feature[:output_length, :]
                for feature, output_length in zip(features, output_lengths)]
    assert len(features) == len(texts)

    for i, feature in enumerate(features):
        # Write the predicted features to disk
        # Note: output files and target ones have the same names, just different folders
        np.save(os.path.join(out_dir, 'feature-{:03d}.npy'.format(i + 1)),
                feature, allow_pickle=False)

        if log_dir is not None:
            # save alignments
            plot.plot_alignment(alignments[i],
                                os.path.join(log_dir, 'plots/alignment-{:03d}.png'.format(i + 1)),
                                info='{}'.format(texts[i]), split_title=True)

            # save wav
            wav = audio.synthesize(feature, hparams)
            audio.save_wav(wav,
                           os.path.join(log_dir, 'wavs/wav-{:03d}.wav'.format(i + 1)),
                           hparams)

def _get_next_example(self):
    if self._offset >= len(self._metadata):
        self._offset = 0
        np.random.shuffle(self._metadata)

    meta = self._metadata[self._offset]
    self._offset += 1

    text = meta[5]
    input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
    mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
    token_target = np.asarray([0.] * len(mel_target))
    linear_target = np.load(os.path.join(self._linear_dir, meta[2]))

    return (input_data, mel_target, token_target, linear_target, len(mel_target))

def _get_test_groups(self):
    meta = self._test_meta[self._test_offset]
    self._test_offset += 1

    # remove the 'mel-' prefix (lstrip('mel-') would strip any leading 'm', 'e', 'l' or '-' characters)
    dur_file = meta[1][len('mel-'):] if meta[1].startswith('mel-') else meta[1]
    dur_file = os.path.join(self.duration_dir, dur_file)
    dur = np.squeeze(np.load(dur_file))
    alignment = convert_dur2alignment(dur)

    text = meta[5]
    input_data = np.asarray(text_to_sequence(text), dtype=np.int32)
    mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
    # Create parallel sequences containing zeros to represent a non finished sequence
    token_target = np.asarray([0.] * (len(mel_target) - 1))
    linear_target = np.load(os.path.join(self._linear_dir, meta[2]))

    return (input_data, mel_target, token_target, linear_target, dur, alignment, len(mel_target))

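# The duration-based loaders above assume a convert_dur2alignment helper that expands
# per-symbol durations into a hard alignment matrix. A hedged sketch: dur[i] consecutive
# frames are assigned to input symbol i, giving a one-hot [num_frames, num_symbols]
# matrix (the axis order is an assumption).
def convert_dur2alignment(dur):
    num_symbols = len(dur)
    num_frames = int(np.sum(dur))
    alignment = np.zeros((num_frames, num_symbols), dtype=np.float32)
    frame = 0
    for symbol_idx, d in enumerate(dur):
        alignment[frame:frame + int(d), symbol_idx] = 1.0
        frame += int(d)
    return alignment
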
def _get_test_groups(self):
    meta = self._test_meta[self._test_offset]
    self._test_offset += 1

    # train.txt layout: meta[0] = audio/f32, meta[1] = mel, meta[2] = linear, meta[5] = text sentence
    text = meta[5]

    input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
    mel_target = np.fromfile(os.path.join(self._audio_dir, meta[0]), dtype='float32')
    # reshape the flat float32 buffer into num_mels-wide frames (np.resize does not accept -1)
    mel_target = mel_target.reshape(-1, self._hparams.num_mels)
    # Create parallel sequences containing zeros to represent a non finished sequence
    token_target = np.asarray([0.] * (len(mel_target) - 1))
    linear_target = np.load(os.path.join(self._linear_dir, meta[2]))

    return (input_data, mel_target, token_target, linear_target, len(mel_target))

def predict(self, text, out_dir, speaker_id):
    hparams = self._hparams
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
        self.model.speaker_ids: np.asarray([speaker_id], dtype=np.int32)
    }

    mels, alignment = self.session.run([self.mel_outputs, self.alignment],
                                       feed_dict=feed_dict)
    mels = mels.reshape(-1, hparams.num_mels)  # Thanks to @imdatsolak for pointing this out

    wav = audio.inv_mel_spectrogram(mels.T, hparams)
    audio.save_wav(wav, out_dir, sr=hparams.sample_rate)

    return out_dir

def _get_test_groups(self):
    meta = self._test_meta[self._test_offset]
    self._test_offset += 1

    dataset = meta[0]
    text = meta[7]
    emt_label = meta[8]
    spk_label = meta[9]

    input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
    mel_target = np.load(os.path.join(self.data_folder, dataset, 'mels', meta[2]))
    # Create parallel sequences containing zeros to represent a non finished sequence
    token_target = np.asarray([0.] * (len(mel_target) - 1))

    # linear_target_path = os.path.join(self.data_folder, dataset, 'linear', meta[3])
    # if hparams.predict_linear:
    #     if os.path.exists(linear_target_path):
    #         linear_target = np.load(linear_target_path)
    #     else:
    #         raise ("linear target does not exist -", linear_target_path)
    # else:
    #     linear_target = np.zeros((1, hparams.num_freq))

    # check for speaker embedding
    # spk_emb_path = os.path.join(self.data_folder, dataset, 'spkemb', meta[4])
    # if os.path.exists(spk_emb_path):
    #     spk_emb = np.load(spk_emb_path)
    # else:
    #     spk_emb = np.zeros(hparams.tacotron_spk_emb_dim)
    # assert spk_emb.shape[0] == hparams.tacotron_spk_emb_dim

    # just use the same sample for the reference when testing
    ref_mel_emt = mel_target
    ref_mel_spk = mel_target
    assert (ref_mel_emt == mel_target).all()

    # using the mel target lengths as the lengths for attention, must adjust accordingly
    # print("in gen", dataset, input_data[0:5], mel_target[0][0:5], emt_label, spk_label)

    # return (input_data, mel_target, token_target, linear_target, spk_emb, emt_label, spk_label,
    #         ref_mel_emt, ref_mel_spk, len(mel_target))
    return (input_data, mel_target, token_target, emt_label, spk_label, ref_mel_emt, ref_mel_spk, len(mel_target))

def _get_next_example(self):
    """Gets a single example (input, mel_target, token_target, mel_length) from disk"""
    if self._train_offset >= len(self._train_meta):
        self._train_offset = 0
        np.random.shuffle(self._train_meta)

    meta = self._train_meta[self._train_offset]
    self._train_offset += 1

    text = meta[-1].strip().split(' ')
    input_data = np.asarray(text_to_sequence(text), dtype=np.int32)
    mel_target = np.load(os.path.join(self._mel_dir, meta[0]))
    # Create parallel sequences containing zeros to represent a non finished sequence
    token_target = np.asarray([0.] * (len(mel_target) - 1))

    return (input_data, mel_target, token_target, len(mel_target))

def eval(self, text):
    hparams = self._hparams
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seqs = [np.asarray(text_to_sequence(text, cleaner_names))]
    input_lengths = [len(seq) for seq in seqs]

    size_per_device = len(seqs) // self._hparams.tacotron_num_gpus

    # Pad inputs according to each GPU max length
    input_seqs = None
    split_infos = []
    for i in range(self._hparams.tacotron_num_gpus):
        device_input = seqs[size_per_device * i:size_per_device * (i + 1)]
        device_input, max_seq_len = self._prepare_inputs(device_input)
        input_seqs = np.concatenate((input_seqs, device_input), axis=1) \
            if input_seqs is not None else device_input
        split_infos.append([max_seq_len, 0, 0, 0])

    feed_dict = {
        self.inputs: input_seqs,
        self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
    }
    feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)

    # linear_wavs = self.session.run(self.linear_wav_outputs, feed_dict=feed_dict)
    linear_wavs, linears, mels, alignments, stop_tokens = self.session.run(
        [self.linear_wav_outputs, self.linear_outputs, self.mel_outputs,
         self.alignments, self.stop_token_prediction],
        feed_dict=feed_dict)

    linear_wavs = [linear_wav for gpu_linear_wav in linear_wavs for linear_wav in gpu_linear_wav]
    wav = audio.inv_preemphasis(linear_wavs, hparams.preemphasis)

    # audio.save_wav(wav, 'wavs/wav-1-linear.wav', sr=hparams.sample_rate)
    out = io.BytesIO()
    audio.save_wav(wav, out, sr=hparams.sample_rate)
    return out.getvalue(), wav

def _get_test_groups(self):
    meta = self._test_meta[self._test_offset]
    self._test_offset += 1

    text = meta[5]

    input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
    mel_target = np.fromfile(
        os.path.join('/mnt/training_data/audio',
                     meta[0].replace('audio-', '').replace('/mels/', '/audio/')),
        dtype='float32')
    # reshape the flat float32 buffer into num_mels-wide frames (np.resize does not accept -1)
    mel_target = mel_target.reshape(-1, self._hparams.num_mels)
    # Create parallel sequences containing zeros to represent a non finished sequence
    token_target = np.asarray([0.] * (len(mel_target) - 1))
    linear_target = np.load(os.path.join(self._linear_dir, meta[2]))

    return (input_data, mel_target, token_target, linear_target, len(mel_target))

def _get_next_example(self):
    """Gets a single example (input, mel_target, token_target, linear_target, mel_length) from disk"""
    if self._train_offset >= len(self._train_meta):
        self._train_offset = 0
        np.random.shuffle(self._train_meta)

    meta = self._train_meta[self._train_offset]
    self._train_offset += 1

    text = meta[5]
    input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
    mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
    # Create parallel sequences containing zeros to represent a non finished sequence
    token_target = np.asarray([0.] * (len(mel_target) - 1))
    linear_target = np.load(os.path.join(self._linear_dir, meta[2]))

    return (input_data, mel_target, token_target, linear_target, len(mel_target))

def _get_next_example(self):
    """Gets a single example (input, mel_target, token_target) from disk"""
    if self._offset >= len(self._metadata):
        self._offset = 0
        np.random.shuffle(self._metadata)

    meta = self._metadata[self._offset]
    self._offset += 1

    text = meta[4]
    input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
    mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
    # Create parallel sequences containing zeros to represent a non finished sequence
    token_target = np.asarray([0.] * len(mel_target))

    return (input_data, mel_target, token_target, len(mel_target))

def synthesize(self, text, index, out_dir, log_dir, mel_filename):
    hparams = self._hparams
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
    }

    if self.gta:
        feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(1, -1, 80)

    if self.gta or not hparams.predict_linear:
        mels, alignment = self.session.run([self.mel_outputs, self.alignment],
                                           feed_dict=feed_dict)
    else:
        linear, mels, alignment = self.session.run(
            [self.linear_outputs, self.mel_outputs, self.alignment],
            feed_dict=feed_dict)
        linear = linear.reshape(-1, hparams.num_freq)

    mels = mels.reshape(-1, hparams.num_mels)  # Thanks to @imdatsolak for pointing this out

    # convert checkpoint to frozen model
    minimal_graph = tf.graph_util.convert_variables_to_constants(
        self.session, self.session.graph_def, ["model/inference/add"])
    tf.train.write_graph(minimal_graph, '.', 'inference_model.pb', as_text=False)

    npy_data = mels.reshape((-1,))
    print(mels)
    print("==============================================")
    print(npy_data)

    text = text.replace(" ", "_")
    text = text.replace("?", ".")
    filename = text + 'f32'
    npy_data.tofile(filename)
    return

def synthesize(self, text, index, out_dir, log_dir, mel_filename):
    hparams = self._hparams
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
    }

    if self.gta:
        feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(1, -1, 80)

    if self.gta or not hparams.predict_linear:
        mels, alignment = self.session.run([self.mel_outputs, self.alignment],
                                           feed_dict=feed_dict)
    else:
        linear, mels, alignment = self.session.run(
            [self.linear_outputs, self.mel_outputs, self.alignment],
            feed_dict=feed_dict)
        linear = linear.reshape(-1, hparams.num_freq)

    mels = mels.reshape(-1, hparams.num_mels)  # Thanks to @imdatsolak for pointing this out

    if index is None:
        # Generate wav and read it
        wav = audio.inv_mel_spectrogram(mels.T, hparams)
        audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way

        chunk = 512
        f = wave.open('temp.wav', 'rb')
        p = pyaudio.PyAudio()
        stream = p.open(format=p.get_format_from_width(f.getsampwidth()),
                        channels=f.getnchannels(),
                        rate=f.getframerate(),
                        output=True)
        data = f.readframes(chunk)

        while data:
            stream.write(data)
            data = f.readframes(chunk)

        stream.stop_stream()
        stream.close()
        p.terminate()
        return

    # Write the spectrogram to disk
    # Note: output mel-spectrogram files and target ones have the same names, just different folders
    mel_filename = os.path.join(out_dir, 'speech-mel-{:05d}.npy'.format(index))
    np.save(mel_filename, mels, allow_pickle=False)

    if log_dir is not None:
        # save wav (mel -> wav)
        wav = audio.inv_mel_spectrogram(mels.T, hparams)
        audio.save_wav(wav,
                       os.path.join(log_dir, 'wavs/speech-wav-{:05d}-mel.wav'.format(index)),
                       sr=hparams.sample_rate)

        if hparams.predict_linear:
            # save wav (linear -> wav)
            wav = audio.inv_linear_spectrogram(linear.T, hparams)
            audio.save_wav(wav,
                           os.path.join(log_dir, 'wavs/speech-wav-{:05d}-linear.wav'.format(index)),
                           sr=hparams.sample_rate)

        # save alignments
        plot.plot_alignment(alignment,
                            os.path.join(log_dir, 'plots/speech-alignment-{:05d}.png'.format(index)),
                            info='{}'.format(text), split_title=True)

        # save mel spectrogram plot
        plot.plot_spectrogram(mels,
                              os.path.join(log_dir, 'plots/speech-mel-{:05d}.png'.format(index)),
                              info='{}'.format(text), split_title=True)

    return mel_filename