def load_data(self, idx):
    if self.cached:
        wav_name = self.items[idx][1]
        mel_name = self.items[idx][2]
        linear_name = self.items[idx][3]
        text = self.items[idx][0]
        text = np.asarray(text_to_sequence(text, [self.cleaners]), dtype=np.int32)
        if wav_name.split('.')[-1] == 'npy':
            wav = self.load_np(wav_name)
        else:
            wav = np.asarray(self.load_wav(wav_name), dtype=np.float32)
        mel = self.load_np(mel_name)
        linear = self.load_np(linear_name)
        sample = {
            'text': text,
            'wav': wav,
            'item_idx': self.items[idx][1],
            'mel': mel,
            'linear': linear
        }
    else:
        text, wav_file = self.items[idx]
        text = np.asarray(text_to_sequence(text, [self.cleaners]), dtype=np.int32)
        wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
        sample = {'text': text, 'wav': wav, 'item_idx': self.items[idx][1]}
    return sample

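# A minimal sketch of feeding a dataset exposing load_data() into a torch
# DataLoader. `MyTTSDataset` and this collate_fn are hypothetical names, not
# part of the snippets here; a real collate function would also pad 'text'
# and 'wav' to a common length per batch.
from torch.utils.data import DataLoader

def collate_fn(batch):
    # batch is a list of sample dicts as returned by load_data()
    return {key: [sample[key] for sample in batch] for key in batch[0]}

loader = DataLoader(MyTTSDataset(...), batch_size=32,
                    shuffle=True, collate_fn=collate_fn)
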
def tsau(input_text, save_path):
    if input_text:
        inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
    else:
        with open('sentences.txt') as f:
            inputs = [text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f]

    if args.vocoder == 'wavernn':
        voc_k = voc_model.get_step() // 1000
        tts_k = tts_model.get_step() // 1000
        simple_table([
            ('Tacotron', str(tts_k) + 'k'),
            ('r', tts_model.r),
            ('Vocoder Type', 'WaveRNN'),
            ('WaveRNN', str(voc_k) + 'k'),
            ('Generation Mode', 'Batched' if batched else 'Unbatched'),
            ('Target Samples', target if batched else 'N/A'),
            ('Overlap Samples', overlap if batched else 'N/A')
        ])
    elif args.vocoder == 'griffinlim':
        tts_k = tts_model.get_step() // 1000
        simple_table([
            ('Tacotron', str(tts_k) + 'k'),
            ('r', tts_model.r),
            ('Vocoder Type', 'Griffin-Lim'),
            ('GL Iters', args.iters)
        ])

    for i, x in enumerate(inputs, 1):
        print(f'\n| Generating {i}/{len(inputs)}')
        _, m, attention = tts_model.generate(x)

        # Fix mel spectrogram scaling to be from 0 to 1
        m = (m + 4) / 8
        np.clip(m, 0, 1, out=m)

        if args.vocoder == 'griffinlim':
            v_type = args.vocoder
        elif args.vocoder == 'wavernn' and args.batched:
            v_type = 'wavernn_batched'
        else:
            v_type = 'wavernn_unbatched'

        if save_attn:
            save_attention(attention, save_path)

        if args.vocoder == 'wavernn':
            m = torch.tensor(m).unsqueeze(0)
            voc_model.generate(m, save_path, batched, target, overlap, hp.mu_law)
        elif args.vocoder == 'griffinlim':
            wav = reconstruct_waveform(m, n_iter=args.iters)
            save_wav(wav, save_path)

    print('\n\nDone.\n')

def load_tts(self, model_path, model_file, model_config, use_cuda):
    tts_config = os.path.join(model_path, model_config)
    self.model_file = os.path.join(model_path, model_file)
    print(" > Loading TTS model ...")
    print(" | > model config: ", tts_config)
    print(" | > model file: ", model_file)
    self.tts_config = load_config(tts_config)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)
    if self.use_phonemes:
        self.input_size = len(phonemes)
        self.input_adapter = lambda sen: phoneme_to_sequence(
            sen, [self.tts_config.text_cleaner],
            self.tts_config.phoneme_language,
            self.tts_config.enable_eos_bos_chars)
    else:
        self.input_size = len(symbols)
        self.input_adapter = lambda sen: text_to_sequence(
            sen, [self.tts_config.text_cleaner])
    self.tts_model = setup_model(self.input_size, self.tts_config)

    # load model state
    if use_cuda:
        cp = torch.load(self.model_file)
    else:
        cp = torch.load(self.model_file,
                        map_location=lambda storage, loc: storage)

    # load the model
    self.tts_model.load_state_dict(cp['model'])
    if use_cuda:
        self.tts_model.cuda()
    self.tts_model.eval()
    self.tts_model.decoder.max_decoder_steps = 3000

def tts(model, text):
    """Convert text to speech waveform given a Tacotron model."""
    if USE_CUDA:
        model = model.cuda()
    # NOTE: dropout in the decoder should be activated for generalization!
    # model.decoder.eval()
    model.encoder.eval()
    model.postnet.eval()

    sequence = np.array(text_to_sequence(text))
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    if USE_CUDA:
        sequence = sequence.cuda()

    # Greedy decoding
    mel_outputs, linear_outputs, gate_outputs, alignments = model(sequence)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram

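# A minimal usage sketch for tts() above. The checkpoint path is an
# assumption; the 'model' key mirrors the load_model snippets elsewhere in
# this section, and audio.save_wav follows the audio helpers used below.
model = Tacotron(...)
model.load_state_dict(torch.load('checkpoint.pth')['model'])
waveform, alignment, spectrogram = tts(model, "Hello, world.")
audio.save_wav(waveform, 'hello.wav')
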
def load_data(self, idx):
    text, wav_file, speaker_name = self.items[idx]
    wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
    if self.use_phonemes:
        text = self.load_phoneme_sequence(wav_file, text)
    else:
        text = np.asarray(text_to_sequence(text, [self.cleaners]), dtype=np.int32)
    assert text.size > 0, self.items[idx][1]
    assert wav.size > 0, self.items[idx][1]
    sample = {
        'text': text,
        'wav': wav,
        'item_idx': self.items[idx][1],
        'speaker_name': speaker_name
    }
    return sample

def tts(model, text):
    """Convert text to speech waveform given a Tacotron model."""
    if USE_CUDA:
        model = model.cuda()
    # TODO: Turning off dropout of decoder's prenet causes serious
    # performance regression, not sure why.
    # model.decoder.eval()
    model.encoder.eval()
    model.postnet.eval()

    sequence = np.array(text_to_sequence(text))
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    if USE_CUDA:
        sequence = sequence.cuda()

    # Greedy decoding
    mel_outputs, linear_outputs, alignments = model(sequence)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram

def synthesis(self, text, speaker_embedding, noise_embedding,
              wave_path="log/synthesis/wave/", plot_path="log/synthesis/plot/"):
    """
    TODO: Provide Batch Synthesis

    :param text: "hello, world"
    :param speaker_embedding: Any[Speaker]
    :param noise_embedding: Any[Noise]
    :param wave_path: "log/synthesis/wave"
    :param plot_path: "log/synthesis/plot"
    :return: FloatTensor [Time] for wave, FloatTensor [Encoder, Decoder] for attention
    """
    with torch.no_grad():
        makedirs(str(wave_path), exist_ok=True)
        makedirs(str(plot_path), exist_ok=True)
        phone = text_to_sequence(text.strip(), hp.cleaner_names)
        mel, linear, attention = self.tacotron.generate(
            phone, speaker_embedding, noise_embedding)
        audio_path = f'{wave_path}_GL_input_{text[:10]}_{self.tts_k}k.wav'
        atten_path = f'{plot_path}_Attention_input_{text[:10]}_{self.tts_k}k'
        save_attention(attention, atten_path)
        print(list(linear.shape))
        wave = self.stft.inverse_linear(linear, iteration=40)[0]
        save_from_torch(wave, audio_path, hp.sampling_rate)
    return wave, attention

def create_attention_guides(fpath):
    dataset_ids = []
    mel_lengths = []
    text_lengths = []
    with open(f'{fpath}/dataset.pkl', 'rb') as f:
        dataset = pickle.load(f)
    for (item_id, l) in dataset:
        dataset_ids += [item_id]
        mel_lengths += [l]
    with open(f'{fpath}/text_dict.pkl', 'rb') as f:
        text_dict = pickle.load(f)
    for item_id in dataset_ids:
        x = text_to_sequence(text_dict[item_id], ['blizz_cleaners'])
        text_lengths += [len(x)]
    for i, id in enumerate(dataset_ids):
        attfile = os.path.join(fpath, 'diagonal_attention_guides', id + '.npy')
        att = get_attention_guide(text_lengths[i], mel_lengths[i], g=0.2)
        np.save(attfile, att)

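# get_attention_guide is not defined in these snippets; a plausible sketch
# follows the guided-attention penalty of Tachibana et al. (2017):
# W[n, t] = 1 - exp(-(n/N - t/T)^2 / (2 g^2)), so positions near the
# diagonal get weights near 0 and positions far from it near 1.
import numpy as np

def get_attention_guide(text_len, mel_len, g=0.2):
    n = np.arange(text_len)[:, None] / text_len   # normalized text position
    t = np.arange(mel_len)[None, :] / mel_len     # normalized mel position
    return 1.0 - np.exp(-((n - t) ** 2) / (2 * g ** 2))
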
def load_data(self, idx):
    if self.cached:
        # name this wav_file in both branches so the phoneme path below
        # does not hit a NameError when self.cached is True
        wav_file = self.items[idx][1]
        mel_name = self.items[idx][2]
        linear_name = self.items[idx][3]
        text = self.items[idx][0]
        if wav_file.split('.')[-1] == 'npy':
            wav = self.load_np(wav_file)
        else:
            wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
        mel = self.load_np(mel_name)
        linear = self.load_np(linear_name)
    else:
        text, wav_file = self.items[idx]
        wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
        mel = None
        linear = None
    if self.use_phonemes:
        text = self.load_phoneme_sequence(wav_file, text)
    else:
        text = np.asarray(text_to_sequence(text, [self.cleaners]), dtype=np.int32)
    sample = {
        'text': text,
        'wav': wav,
        'item_idx': os.path.basename(self.items[idx][1]).split('.')[0],
        'mel': mel,
        'linear': linear
    }
    return sample

def load_model(self, model_path, model_config, wavernn_path, use_cuda):
    self.model_file = model_path
    print(" > Loading model ...")
    print(" | > model config: ", model_config)
    print(" | > model file: ", self.model_file)
    config = load_config(model_config)
    self.config = config
    self.use_cuda = use_cuda
    self.use_phonemes = config.use_phonemes
    self.ap = AudioProcessor(**config.audio)

    if self.use_phonemes:
        self.input_size = len(phonemes)
        self.input_adapter = lambda sen: phoneme_to_sequence(
            sen, [self.config.text_cleaner], self.config.phoneme_language)
    else:
        self.input_size = len(symbols)
        self.input_adapter = lambda sen: text_to_sequence(
            sen, [self.config.text_cleaner])

    self.model = Tacotron(self.input_size, config.embedding_size,
                          self.ap.num_freq, self.ap.num_mels, config.r,
                          attn_windowing=True)
    self.model.decoder.max_decoder_steps = 8000

    # load model state
    if use_cuda:
        cp = torch.load(self.model_file)
    else:
        cp = torch.load(self.model_file,
                        map_location=lambda storage, loc: storage)

    # load the model
    self.model.load_state_dict(cp['model'])
    if use_cuda:
        self.model.cuda()
    self.model.eval()

    self.vocoder = WaveRNNVocoder.Vocoder()
    self.vocoder.loadWeights(wavernn_path)
    self.firwin = signal.firwin(1025, [65, 7600], pass_zero=False, fs=16000)

def load_model(self, model_path, model_name, model_config, use_cuda):
    model_config = os.path.join(model_path, model_config)
    self.model_file = os.path.join(model_path, model_name)
    print(" > Loading model ...")
    print(" | > model config: ", model_config)
    print(" | > model file: ", self.model_file)
    config = load_config(model_config)
    self.config = config
    self.use_cuda = use_cuda
    self.use_phonemes = config.use_phonemes
    self.ap = AudioProcessor(**config.audio)

    if self.use_phonemes:
        self.input_size = len(phonemes)
        self.input_adapter = lambda sen: phoneme_to_sequence(
            sen, [self.config.text_cleaner], self.config.phoneme_language)
    else:
        self.input_size = len(symbols)
        self.input_adapter = lambda sen: text_to_sequence(
            sen, [self.config.text_cleaner])

    self.model = Tacotron(self.input_size, config.embedding_size,
                          self.ap.num_freq, self.ap.num_mels, config.r)

    # load model state
    if use_cuda:
        cp = torch.load(self.model_file)
    else:
        cp = torch.load(self.model_file,
                        map_location=lambda storage, loc: storage)

    # load the model
    self.model.load_state_dict(cp['model'])
    if use_cuda:
        self.model.cuda()
    self.model.eval()

def __getitem__(self, index):
    item_id = self.metadata[index]
    x = text_to_sequence(self.text_dict[item_id], hp.tts_cleaner_names)
    mel = np.load(f'{self.path}mel/{item_id}.npy')
    spk_embed = np.load(f'{self.path}spk_embeds/{item_id}.npy')
    mel_len = mel.shape[-1]
    return x, mel, item_id, mel_len, spk_embed

def __getitem__(self, index):
    item_id = self.metadata[index]
    text = self.text_dict[item_id]
    x = text_to_sequence(text)
    x = np.array(x)

    # Walk the sequence from the end, propagating each punctuation index
    # back over the preceding phonemes as a per-phoneme "punctuation level";
    # the punctuation symbols themselves are dropped from the phoneme channel.
    pad_idx = 10
    punc_level = np.full_like(x, pad_idx)
    new_x = []
    in_quote = False
    for i, ph_idx in enumerate(x[::-1]):
        if ph_idx in PUNCTUATION_INDICES:
            punc_level[:len(x) - i] = ph_idx
            if ph_idx == 3:  # closing bracket
                punc_level[:len(x) - i] = pad_idx
            if ph_idx == 2:  # quote
                if in_quote:
                    punc_level[:len(x) - i] = pad_idx
                else:
                    in_quote = True
        else:  # ph_idx not in PUNCTUATION_INDICES
            new_x.append(ph_idx)
    new_x = np.array(new_x[::-1])
    x = np.stack([new_x, punc_level])
    # print("LENS", len(punc_level), len(new_x))
    # print(new_x)
    # print(punc_level)

    mel = np.load(str(self.path / 'mel' / f'{item_id}.npy'))
    mel_len = mel.shape[-1]
    dur = np.load(str(self.path / 'alg' / f'{item_id}.npy'))
    pitch = np.load(str(self.path / 'phon_pitch' / f'{item_id}.npy'))
    return x, mel, item_id, mel_len, dur, pitch

def __getitem__(self, index):
    item_id = self.metadata[index]
    text = self.text_dict[item_id]
    x = text_to_sequence(text)
    mel = np.load(self.path / 'mel' / f'{item_id}.npy')
    mel_len = mel.shape[-1]
    return x, mel, item_id, mel_len

def __getitem__(self, idx):
    text, wav_file = self.items[idx]
    text = np.asarray(text_to_sequence(text, [self.cleaners]), dtype=np.int32)
    wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
    sample = {'text': text, 'wav': wav, 'item_idx': self.items[idx][1]}
    return sample

def synthesize(self, inputs):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq_input = [text_to_sequence(j, cleaner_names) for j in inputs]
    seq_length = [len(j) for j in seq_input]
    max_len = max(seq_length)
    inputs = [_pad_input(j, max_len) for j in seq_input]
    seq = np.stack(inputs)  # np.stack needs a sequence, not a generator
    # seq = text_to_sequence(text, cleaner_names)

    if not self.model_filename.endswith('.pb'):
        feed_dict = {
            self.model.inputs: np.asarray(seq, dtype=np.int32),
            self.model.input_lengths: np.asarray(seq_length, dtype=np.int32)
        }
    else:
        feed_dict = {
            self.inputs: np.asarray(seq, dtype=np.int32),
            self.input_lengths: np.asarray(seq_length, dtype=np.int32)
        }

    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    output = []
    print('wav.shape:', wav.shape)
    for wav_index in range(wav.shape[0]):
        wav_index_temp = audio.inv_preemphasis(wav[wav_index])
        wav_index_temp = wav_index_temp[:audio.find_endpoint(wav_index_temp)]
        # wav_index_temp = vad_check(wav_index_temp, hparams.sample_rate)
        out = io.BytesIO()
        audio.save_wav(wav_index_temp, out)
        output.append(out)
    return output

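# _pad_input is referenced above but not defined in these snippets; a
# plausible sketch, assuming right-padding with a pad symbol id of 0:
def _pad_input(seq, max_len, pad_value=0):
    return np.pad(seq, (0, max_len - len(seq)),
                  mode='constant', constant_values=pad_value)
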
def __getitem__(self, idx):
    wav_name = os.path.join(self.root_dir, self.frames[idx][0]) + '.wav'
    text = self.frames[idx][1]
    text = np.asarray(text_to_sequence(text, [self.cleaners]), dtype=np.int32)
    wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
    sample = {'text': text, 'wav': wav, 'item_idx': self.frames[idx][0]}
    return sample

def __getitem__(self, index):
    item_id = self.metadata[index]
    text = self.text_dict[item_id]
    x = text_to_sequence(text)
    mel = np.load(str(self.path / 'mel' / f'{item_id}.npy'))
    mel_len = mel.shape[-1]
    dur = np.load(str(self.path / 'alg' / f'{item_id}.npy'))
    pitch = np.load(str(self.path / 'phon_pitch' / f'{item_id}.npy'))
    return x, mel, item_id, mel_len, dur, pitch

def loading_thread(self):
    while True:
        try:
            text, wave, speaker, male, augmented = self.loader.sample()
            phoneme = text_to_sequence(text, hp.cleaner_names)
            phoneme = torch.from_numpy(np.int64(phoneme))
            self.loading_queue.put((phoneme.to(self.device, non_blocking=True),
                                    wave.to(self.device, non_blocking=True),
                                    speaker, augmented))
        except Exception as e:
            print("Loading Thread Error", str(e))

def synthesis(model, ap, text, use_cuda, text_cleaner):
    text_cleaner = [text_cleaner]
    seq = np.array(text_to_sequence(text, text_cleaner))
    chars_var = torch.from_numpy(seq).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda().long()
    _, linear_out, alignments, _ = model.forward(chars_var)
    linear_out = linear_out[0].data.cpu().numpy()
    wav = ap.inv_spectrogram(linear_out.T)
    return wav, linear_out, alignments

def __getitem__(self, index):
    item_id = self.metadata[index]
    x = text_to_sequence(self.text_dict[item_id], hp.tts_cleaner_names)
    mel = np.load(self.path / 'mel' / f'{item_id}.npy')
    mel_len = mel.shape[-1]
    if hp.mode in ['teacher_forcing', 'attention_forcing_online']:
        return x, mel, item_id, mel_len
    elif hp.mode == 'attention_forcing_offline':
        attn_ref = np.load(self.path / hp.attn_ref_path / f'{item_id}.npy')
        return x, mel, item_id, mel_len, attn_ref

def getTTS(input_text, batched, voc_model, tts_model, hp):
    if input_text:
        inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
    else:
        with open('sentences.txt') as f:
            inputs = [text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f]

    voc_k = voc_model.get_step() // 1000
    tts_k = tts_model.get_step() // 1000
    r = tts_model.r
    simple_table([
        ('WaveRNN', str(voc_k) + 'k'),
        (f'Tacotron(r={r})', str(tts_k) + 'k'),
        ('Generation Mode', 'Batched' if batched else 'Unbatched'),
        ('Target Samples', 11_000 if batched else 'N/A'),
        ('Overlap Samples', 550 if batched else 'N/A')
    ])

    wav_list = []
    for i, x in enumerate(inputs, 1):
        print(f'\n| Generating {i}/{len(inputs)}')
        _, m, attention = tts_model.generate(x)
        save_path = './sound/' + str(uuid.uuid4()) + '.wav'
        m = torch.tensor(m).unsqueeze(0)
        m = (m + 4) / 8
        # NOTE: the summary table above reports 11_000 target samples,
        # but generate() is called with a target of 3000
        wav_file = voc_model.generate(m, save_path, batched, 3000, 550, hp.mu_law)
        wav_list.append(wav_file)
        wav_list.append(save_path)

    print('\n\nDone.\n')
    return wav_list

def __getitem__(self, index):
    item_id = self.metadata[index]
    x = text_to_sequence(self.text_dict[item_id], hp.tts_cleaner_names)
    mel = np.load(self.path / 'mel' / f'{item_id}.npy')
    mel_len = mel.shape[-1]
    if self.alignments:
        dur = np.load(self.path / 'alg' / f'{item_id}.npy')
    else:
        # dummy durations to simplify collate func
        dur = np.zeros((mel.shape[0], 1))
    return x, mel, item_id, mel_len, dur

def __getitem__(self, index):
    item_id = self.metadata[index]
    text = self.text_dict[item_id]
    x = text_to_sequence(text)
    # drop punctuation symbols and add an all-zero second channel
    x = np.array([ch for ch in x if ch not in PUNCTUATION_INDICES])
    x = np.stack([x, np.zeros_like(x)])
    mel = np.load(str(self.path / 'mel' / f'{item_id}.npy'))
    mel_len = mel.shape[-1]
    return x, mel, item_id, mel_len

def __getitem__(self, idx):
    sidx = self.frames[idx][0]
    sidx_files = self.wav_files_dict[sidx]
    file_name = random.choice(sidx_files)
    wav_name = os.path.join(self.wav_dir, file_name)
    text = self.frames[idx][2]
    text = np.asarray(text_to_sequence(text, [self.cleaners]), dtype=np.int32)
    wav = np.asarray(self.load_wav(wav_name), dtype=np.float32)
    sample = {'text': text, 'wav': wav, 'item_idx': self.frames[idx][0]}
    return sample

def generate(self, 華, input_text):
    inputs = [text_to_sequence(input_text.strip(), ['basic_cleaners'])]
    if hp.tts_model == 'tacotron2':
        self.gen_tacotron2(華, inputs)
    elif hp.tts_model == 'tacotron':
        self.gen_tacotron(華, inputs)
    else:
        # use hp.tts_model here; the original referenced an undefined
        # tts_model_type
        print(f"Wrong tts model type {{{hp.tts_model}}}")
    print('\n\nDone.\n')

def _get_next_example(self):
    """Gets a single example (input, mel_target, target_length) from disk."""
    if self._offset >= len(self._metadata):
        self._offset = 0
        np.random.shuffle(self._metadata)
    meta = self._metadata[self._offset]
    self._offset += 1
    text = meta[2]
    input_data = np.asarray(text_to_sequence(text, Config.Cleaners), dtype=np.int32)
    mel_target = np.load(os.path.join(self._datadir, meta[0]))
    return (input_data, mel_target, len(mel_target))

def synthesize(self, text, index, out_dir):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    mels = self.session.run(self.mel_outputs, feed_dict=feed_dict)

    # Write the spectrogram to disk
    mel_filename = 'ljspeech-mel-eval-{:05d}.npy'.format(index)
    np.save(os.path.join(out_dir, mel_filename), mels, allow_pickle=False)
    print('mel spectrograms saved under {}'.format(out_dir))

def synthesize(input_text, tts_model, voc_model, alpha=1.0,
               device=torch.device('cuda')):
    text = clean_text(input_text.strip())
    x = text_to_sequence(text)
    _, m, _ = tts_model.generate(x, alpha=alpha)
    if voc_model == 'griffinlim':
        wav = reconstruct_waveform(m, n_iter=32)
    elif isinstance(voc_model, WaveRNN):
        m = torch.tensor(m).unsqueeze(0)
        wav = voc_model.generate(m, '/tmp/sample.wav', True,
                                 hp.voc_target, hp.voc_overlap, hp.mu_law)
    else:
        m = torch.tensor(m).unsqueeze(0).to(device)
        with torch.no_grad():
            wav = voc_model.inference(m).cpu().numpy()
    return wav

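# A minimal usage sketch for synthesize() above, assuming `tts_model` is any
# object exposing generate(x, alpha=...) as used in the snippet; the
# Griffin-Lim path needs no vocoder network. save_wav follows the helper
# used elsewhere in this section.
wav = synthesize("The quick brown fox.", tts_model, 'griffinlim', alpha=1.0)
save_wav(wav, 'fox.wav')
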
def synthesize(input_text, tts_model, voc_model, alpha=1.0):
    x = text_to_sequence(input_text.strip(), ['english_cleaners'])
    m = tts_model.generate(x, alpha=alpha)

    # Fix mel spectrogram scaling to be from 0 to 1
    m = (m + 4) / 8
    np.clip(m, 0, 1, out=m)

    if voc_model == 'griffinlim':
        wav = reconstruct_waveform(m, n_iter=32)
    else:
        m = torch.tensor(m).unsqueeze(0)
        wav = voc_model.generate(m, '/tmp/sample.wav', True,
                                 hp.voc_target, hp.voc_overlap, hp.mu_law)
    print()
    return wav