def create_align_features(model: Tacotron,
                          train_set: DataLoader,
                          val_set: DataLoader,
                          save_path: Path):
    """Write one ``<item_id>.npy`` count vector per item of both data loaders.

    For each batch the attention returned by ``model(x, mels)`` is reduced
    with an argmax over axis 2. Jumps larger than 10 between consecutive
    argmax positions are replaced by the previous position, and the positions
    within the first ``mel_lens[b]`` steps are counted with ``np.bincount``.

    Args:
        model: Model whose ``r`` attribute must equal 1.
        train_set: First loader to iterate; yields ``(x, mels, ids, mel_lens)``.
        val_set: Second loader to iterate, same batch layout.
        save_path: Directory that receives the ``.npy`` files.
    """
    assert model.r == 1, f'Reduction factor of tacotron must be 1 for creating alignment features! ' \
                         f'Reduction factor was: {model.r}'
    model.eval()
    # Run on whatever device the model parameters live on.
    device = next(model.parameters()).device
    iters = len(val_set) + len(train_set)
    batches = itertools.chain(train_set, val_set)

    for batch_no, (x, mels, ids, mel_lens) in enumerate(batches, 1):
        x = x.to(device)
        mels = mels.to(device)
        with torch.no_grad():
            _, _, attn = model(x, mels)
        attn = np_now(attn)

        batch_size = attn.shape[0]
        n_chars = attn.shape[2]
        peaks = np.argmax(attn, axis=2)
        mel_counts = np.zeros(shape=(batch_size, n_chars), dtype=np.int32)

        for b in range(batch_size):
            # Suppress random jumps in the attention: a position more than
            # 10 away from its (already corrected) predecessor is overwritten.
            for t in range(1, peaks.shape[1]):
                if abs(peaks[b, t] - peaks[b, t - 1]) > 10:
                    peaks[b, t] = peaks[b, t - 1]
            counts = np.bincount(peaks[b, :mel_lens[b]])
            mel_counts[b, :len(counts)] = counts

        for row, item_id in enumerate(ids):
            target_file = str(save_path / f'{item_id}.npy')
            np.save(target_file, mel_counts[row, :], allow_pickle=False)

        bar = progbar(batch_no, iters)
        stream(f'{bar} {batch_no}/{iters} Batches ')
def generate_plots(self, model: Tacotron, session: TTSSession) -> None:
    """Log figures and audio for the session's validation sample.

    Two groups are written to ``self.writer``: ``Ground_Truth_Aligned/*``
    from a forward pass that receives the target mels, and ``Generated/*``
    from ``model.generate`` on the first item of the sample.

    Args:
        model: Model to run; it is switched to eval mode.
        session: Provides ``val_sample`` as ``(x, m, ids, x_lens, m_lens)``.
    """
    model.eval()
    device = next(model.parameters()).device
    x, m, ids, x_lens, m_lens = session.val_sample
    x = x.to(device)
    m = m.to(device)

    def log_figures(section, named_figures):
        for name, figure in named_figures:
            self.writer.add_figure(f'{section}/{name}', figure, model.step)

    def log_audio(section, name, wav):
        self.writer.add_audio(tag=f'{section}/{name}',
                              snd_tensor=wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)

    # --- forward pass with the target mels as input ---
    linear_pred, postnet_pred, attention = model(x, m)
    attention = np_now(attention)[0]
    # Only the first item is plotted, cropped to 600 entries along axis 1.
    linear_pred = np_now(linear_pred)[0, :600, :]
    postnet_pred = np_now(postnet_pred)[0, :600, :]
    target = np_now(m)[0, :600, :]

    attention_fig = plot_attention(attention)
    linear_fig = plot_mel(linear_pred)
    postnet_fig = plot_mel(postnet_pred)
    target_fig = plot_mel(target)
    log_figures('Ground_Truth_Aligned', [
        ('attention', attention_fig),
        ('target', target_fig),
        ('linear', linear_fig),
        ('postnet', postnet_fig),
    ])

    postnet_wav = reconstruct_waveform(postnet_pred)
    target_wav = reconstruct_waveform(target)
    log_audio('Ground_Truth_Aligned', 'target_wav', target_wav)
    log_audio('Ground_Truth_Aligned', 'postnet_wav', postnet_wav)

    # --- free generation from the first input sequence ---
    linear_pred, postnet_pred, attention = model.generate(
        x[0].tolist(), steps=m_lens[0] + 20)
    attention_fig = plot_attention(attention)
    linear_fig = plot_mel(linear_pred)
    postnet_fig = plot_mel(postnet_pred)
    # The target figure/audio are logged again under Generated/.
    log_figures('Generated', [
        ('attention', attention_fig),
        ('target', target_fig),
        ('linear', linear_fig),
        ('postnet', postnet_fig),
    ])

    postnet_wav = reconstruct_waveform(postnet_pred)
    log_audio('Generated', 'target_wav', target_wav)
    log_audio('Generated', 'postnet_wav', postnet_wav)
def generate_plots(self, model: Tacotron, session: TTSSession) -> None:
    """Log figures and audio for the session's validation batch.

    ``Ground_Truth_Aligned/*`` entries come from ``model(batch['x'],
    batch['mel'])``; ``Generated/*`` entries come from ``model.generate`` on
    the first input of the batch. Audio is produced with
    ``self.dsp.griffinlim`` and everything is written to ``self.writer``.

    Args:
        model: Model to run; it is switched to eval mode.
        session: Provides ``val_sample``, a batch with the keys ``'x'``,
            ``'mel'`` and ``'mel_len'``.
    """
    model.eval()
    device = next(model.parameters()).device
    batch = to_device(session.val_sample, device=device)

    def log_figures(section, named_figures):
        for name, figure in named_figures:
            self.writer.add_figure(f'{section}/{name}', figure, model.step)

    def log_audio(section, name, wav):
        self.writer.add_audio(tag=f'{section}/{name}',
                              snd_tensor=wav,
                              global_step=model.step,
                              sample_rate=self.dsp.sample_rate)

    # --- forward pass with the target mels as input ---
    linear_pred, postnet_pred, attention = model(batch['x'], batch['mel'])
    attention = np_now(attention)[0]
    # Only the first item is plotted, cropped to 600 entries along axis 1.
    linear_pred = np_now(linear_pred)[0, :600, :]
    postnet_pred = np_now(postnet_pred)[0, :600, :]
    target = np_now(batch['mel'])[0, :600, :]

    attention_fig = plot_attention(attention)
    linear_fig = plot_mel(linear_pred)
    postnet_fig = plot_mel(postnet_pred)
    target_fig = plot_mel(target)
    log_figures('Ground_Truth_Aligned', [
        ('attention', attention_fig),
        ('target', target_fig),
        ('linear', linear_fig),
        ('postnet', postnet_fig),
    ])

    postnet_wav = self.dsp.griffinlim(postnet_pred)
    target_wav = self.dsp.griffinlim(target)
    log_audio('Ground_Truth_Aligned', 'target_wav', target_wav)
    log_audio('Ground_Truth_Aligned', 'postnet_wav', postnet_wav)

    # --- free generation from the first input of the batch ---
    linear_pred, postnet_pred, attention = model.generate(
        batch['x'][0:1], steps=batch['mel_len'][0] + 20)
    attention_fig = plot_attention(attention)
    linear_fig = plot_mel(linear_pred)
    postnet_fig = plot_mel(postnet_pred)
    # The target figure/audio are logged again under Generated/.
    log_figures('Generated', [
        ('attention', attention_fig),
        ('target', target_fig),
        ('linear', linear_fig),
        ('postnet', postnet_fig),
    ])

    postnet_wav = self.dsp.griffinlim(postnet_pred)
    log_audio('Generated', 'target_wav', target_wav)
    log_audio('Generated', 'postnet_wav', postnet_wav)
class Synthesizer(object):
    """Loads a Tacotron checkpoint and turns text into a wav byte buffer."""

    def load_model(self, model_path, model_name, model_config, use_cuda):
        """Load config and checkpoint from ``model_path`` and build the model.

        Args:
            model_path: Directory containing both files.
            model_name: Checkpoint file name inside ``model_path``.
            model_config: Config file name inside ``model_path``.
            use_cuda: When true the model is moved to the GPU; otherwise the
                checkpoint storages are kept where ``torch.load`` reads them.
        """
        config_file = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_name)
        print(" > Loading model ...")
        print(" | > model config: ", config_file)
        print(" | > model file: ", self.model_file)

        self.config = load_config(config_file)
        self.use_cuda = use_cuda
        self.ap = AudioProcessor(**self.config.audio)
        self.model = Tacotron(61, self.config.embedding_size,
                              self.ap.num_freq, self.ap.num_mels,
                              self.config.r)

        # load model state
        load_kwargs = {}
        if not use_cuda:
            load_kwargs['map_location'] = lambda storage, loc: storage
        checkpoint = torch.load(self.model_file, **load_kwargs)

        # load the model
        self.model.load_state_dict(checkpoint['model'])
        if use_cuda:
            self.model.cuda()
        self.model.eval()

    def save_wav(self, wav, path):
        """Convert ``wav`` to a numpy array and hand it to ``self.ap.save_wav``."""
        samples = np.array(wav)
        self.ap.save_wav(samples, path)

    def tts(self, text):
        """Synthesize ``text`` sentence by sentence.

        ``text`` is split on ``'.'``; pieces shorter than 3 characters are
        skipped. After every synthesized sentence 10000 zero samples are
        appended.

        Returns:
            An ``io.BytesIO`` object that was passed to ``save_wav``.
        """
        cleaners = [self.config.text_cleaner]
        samples = []
        for piece in (p for p in text.split('.') if len(p) >= 3):
            sentence = piece.strip() + '.'
            print(sentence)
            token_ids = phoneme_to_sequence(sentence, cleaners,
                                            self.config.phoneme_language)
            chars_var = torch.from_numpy(np.array(token_ids))
            chars_var = chars_var.unsqueeze(0).long()
            if self.use_cuda:
                chars_var = chars_var.cuda()
            mel_out, linear_out, alignments, stop_tokens = self.model.forward(
                chars_var)
            spectrogram = linear_out[0].data.cpu().numpy()
            wav = self.ap.inv_spectrogram(spectrogram.T)
            samples.extend(wav)
            samples.extend([0] * 10000)
        buffer = io.BytesIO()
        self.save_wav(samples, buffer)
        return buffer
def create_align_features(
        model: Tacotron,
        train_set: DataLoader,
        val_set: DataLoader,
        save_path_alg: Path,
):
    """Extract durations for the first item of every batch and save them.

    For each batch the attention returned by ``model(x, mels)`` is scored with
    ``attention_score`` and converted to durations, either by
    ``extract_durations_with_dijkstra`` or ``extract_durations_per_count``
    depending on ``hp.extract_durations_with_dijkstra``. Durations go to
    ``save_path_alg/<item_id>.npy``; the collected scores are pickled to
    ``paths.data / 'att_score_dict.pkl'`` once all batches are done.

    Args:
        model: Model whose ``r`` attribute must equal 1.
        train_set: Loader yielding ``(x, mels, ids, x_lens, mel_lens)``.
        val_set: Optional second loader with the same layout; may be ``None``.
        save_path_alg: Directory that receives the duration files.
    """
    assert model.r == 1, f'Reduction factor of tacotron must be 1 for creating alignment features! ' \
                         f'Reduction factor was: {model.r}'
    model.eval()
    # Run on whatever device the model parameters live on.
    device = next(model.parameters()).device

    if val_set is None:
        iters = len(train_set)
        dataset = itertools.chain(train_set)
    else:
        iters = len(val_set) + len(train_set)
        dataset = itertools.chain(train_set, val_set)

    if hp.extract_durations_with_dijkstra:
        print('Extracting durations using dijkstra...')
        extract_durations = extract_durations_with_dijkstra
    else:
        print('Extracting durations using attention peak counts...')
        extract_durations = extract_durations_per_count

    scores_by_item = {}
    for batch_no, (x, mels, ids, x_lens, mel_lens) in enumerate(dataset, 1):
        x = x.to(device)
        mels = mels.to(device)
        with torch.no_grad():
            _, _, att_batch = model(x, mels)
        align_scores, sharp_scores = attention_score(att_batch, mel_lens, r=1)
        att_batch = np_now(att_batch)

        # Only index 0 of the batch is used.
        item_id = ids[0]
        mel_len = mel_lens[0]
        scores_by_item[item_id] = (float(align_scores[0]),
                                   float(sharp_scores[0]))

        durs = extract_durations(x[0], att_batch[0], mel_len)
        if np.sum(durs) != mel_len:
            print(
                f'WARNINNG: Sum of durations did not match mel length for item {item_id}!'
            )
        np.save(str(save_path_alg / f'{item_id}.npy'),
                durs,
                allow_pickle=False)

        bar = progbar(batch_no, iters)
        stream(f'{bar} {batch_no}/{iters} Batches ')

    pickle_binary(scores_by_item, paths.data / 'att_score_dict.pkl')
def evaluate(self, model: Tacotron, val_set: Dataset) -> float:
    """Return the mean per-batch L1 loss of ``model`` over ``val_set``.

    Each batch contributes ``l1(m1_hat, m) + l1(m2_hat, m)``, where the two
    predictions are the first two outputs of ``model(x, m)``.

    Args:
        model: Model to evaluate; it is switched to eval mode.
        val_set: Iterable of 4-tuples whose first two entries are ``x`` and
            ``m``; must support ``len()``.

    Returns:
        Accumulated loss divided by ``len(val_set)``.
    """
    model.eval()
    device = next(model.parameters()).device
    total_loss = 0
    for x, m, _ids, _unused in val_set:
        x = x.to(device)
        m = m.to(device)
        with torch.no_grad():
            m1_hat, m2_hat, _attention = model(x, m)
            linear_loss = F.l1_loss(m1_hat, m).item()
            postnet_loss = F.l1_loss(m2_hat, m).item()
        total_loss += linear_loss + postnet_loss
    return total_loss / len(val_set)
def evaluate(self, model: Tacotron, val_set: Dataset) -> Tuple[float, float]:
    """Return mean per-batch L1 loss and mean attention score over ``val_set``.

    Each batch contributes ``l1(m1_hat, m) + l1(m2_hat, m)`` to the loss and
    the mean of the second value returned by ``attention_score`` to the score.

    Args:
        model: Model to evaluate; it is switched to eval mode.
        val_set: Iterable of ``(x, m, ids, x_lens, mel_lens)`` batches; must
            support ``len()``.

    Returns:
        ``(loss_sum / len(val_set), score_sum / len(val_set))``.
    """
    model.eval()
    device = next(model.parameters()).device
    loss_sum = 0
    score_sum = 0
    for x, m, _ids, _x_lens, mel_lens in val_set:
        x = x.to(device)
        m = m.to(device)
        with torch.no_grad():
            m1_hat, m2_hat, attention = model(x, m)
            linear_loss = F.l1_loss(m1_hat, m).item()
            postnet_loss = F.l1_loss(m2_hat, m).item()
            loss_sum += linear_loss + postnet_loss
            _, att_score = attention_score(attention, mel_lens)
            score_sum += torch.mean(att_score).item()
    n_batches = len(val_set)
    return loss_sum / n_batches, score_sum / n_batches
def evaluate(self, model: Tacotron, val_set: Dataset) -> Tuple[float, float]:
    """Return mean per-batch L1 loss and mean attention score over ``val_set``.

    Batches are moved to the model's device with ``to_device``. Each batch
    contributes ``l1(m1_hat, mel) + l1(m2_hat, mel)`` to the loss and the mean
    of the second value returned by ``attention_score`` to the score.

    Args:
        model: Model to evaluate; it is switched to eval mode.
        val_set: Iterable of batches with the keys ``'x'``, ``'mel'`` and
            ``'mel_len'``; must support ``len()``.

    Returns:
        ``(loss_sum / len(val_set), score_sum / len(val_set))``.
    """
    model.eval()
    device = next(model.parameters()).device
    loss_sum = 0
    score_sum = 0
    for raw_batch in val_set:
        batch = to_device(raw_batch, device=device)
        mel = batch['mel']
        with torch.no_grad():
            m1_hat, m2_hat, attention = model(batch['x'], mel)
            linear_loss = F.l1_loss(m1_hat, mel).item()
            postnet_loss = F.l1_loss(m2_hat, mel).item()
            loss_sum += linear_loss + postnet_loss
            _, att_score = attention_score(attention, batch['mel_len'])
            score_sum += torch.mean(att_score).item()
    n_batches = len(val_set)
    return loss_sum / n_batches, score_sum / n_batches
def create_gta_features(model: Tacotron,
                        train_set: DataLoader,
                        val_set: DataLoader,
                        save_path: Path):
    """Save the model's second output for every item of both loaders.

    Each batch is fed as ``model(x, mels, dur)``. The second returned value is
    converted to numpy, cut per item to ``mel_lens[j]`` along its last axis
    and written to ``save_path/<item_id>.npy``.

    Args:
        model: Model to run; it is switched to eval mode.
        train_set: First loader; yields ``(x, mels, ids, mel_lens, dur)``.
        val_set: Second loader with the same batch layout.
        save_path: Directory that receives the ``.npy`` files.
    """
    model.eval()
    # Run on whatever device the model parameters live on.
    device = next(model.parameters()).device
    iters = len(train_set) + len(val_set)
    batches = itertools.chain(train_set, val_set)

    for batch_no, (x, mels, ids, mel_lens, dur) in enumerate(batches, 1):
        x = x.to(device)
        mels = mels.to(device)
        dur = dur.to(device)
        with torch.no_grad():
            _, gta, _ = model(x, mels, dur)
        gta = gta.cpu().numpy()

        for row, item_id in enumerate(ids):
            trimmed = gta[row][:, :mel_lens[row]]
            np.save(str(save_path / f'{item_id}.npy'),
                    trimmed,
                    allow_pickle=False)

        bar = progbar(batch_no, iters)
        stream(f'{bar} {batch_no}/{iters} Batches ')
def create_gta_features(model: Tacotron,
                        train_set: DataLoader,
                        val_set: DataLoader,
                        save_path: Path) -> None:
    """Save the model's ``'mel_post'`` prediction for every item of both loaders.

    Each batch is moved to the model's device with ``to_device`` and fed as
    ``model(batch)``. ``pred['mel_post']`` is converted to numpy, cut per item
    to ``batch['mel_len'][j]`` along its last axis and written to
    ``save_path/<item_id>.npy``.

    Args:
        model: Model to run; it is switched to eval mode.
        train_set: First loader; batches carry ``'item_id'`` and ``'mel_len'``.
        val_set: Second loader with the same batch layout.
        save_path: Directory that receives the ``.npy`` files.
    """
    model.eval()
    # Run on whatever device the model parameters live on.
    device = next(model.parameters()).device
    iters = len(train_set) + len(val_set)
    batches = itertools.chain(train_set, val_set)

    for batch_no, raw_batch in enumerate(batches, 1):
        batch = to_device(raw_batch, device=device)
        with torch.no_grad():
            pred = model(batch)
        gta = pred['mel_post'].cpu().numpy()

        for row, item_id in enumerate(batch['item_id']):
            trimmed = gta[row][:, :batch['mel_len'][row]]
            np.save(str(save_path / f'{item_id}.npy'),
                    trimmed,
                    allow_pickle=False)

        bar = progbar(batch_no, iters)
        stream(f'{bar} {batch_no}/{iters} Batches ')
class Synthesizer(object):
    """Tacotron text-to-mel front end paired with a WaveRNN vocoder."""

    def load_model(self, model_path, model_config, wavernn_path, use_cuda):
        """Load the config, the Tacotron checkpoint and the vocoder weights.

        Args:
            model_path: Path of the Tacotron checkpoint file.
            model_config: Path of the config file passed to ``load_config``.
            wavernn_path: Path handed to the vocoder's ``loadWeights``.
            use_cuda: When true the model is moved to the GPU; otherwise the
                checkpoint storages are kept where ``torch.load`` reads them.
        """
        self.model_file = model_path
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)

        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.use_phonemes = config.use_phonemes
        self.ap = AudioProcessor(**config.audio)

        # Pick symbol table size and text front end to match the config.
        if self.use_phonemes:
            def adapter(sen):
                return phoneme_to_sequence(sen, [self.config.text_cleaner],
                                           self.config.phoneme_language)
            self.input_size = len(phonemes)
        else:
            def adapter(sen):
                return text_to_sequence(sen, [self.config.text_cleaner])
            self.input_size = len(symbols)
        self.input_adapter = adapter

        self.model = Tacotron(self.input_size, config.embedding_size,
                              self.ap.num_freq, self.ap.num_mels, config.r,
                              attn_windowing=True)
        self.model.decoder.max_decoder_steps = 8000

        # load model state
        load_kwargs = {}
        if not use_cuda:
            load_kwargs['map_location'] = lambda storage, loc: storage
        checkpoint = torch.load(self.model_file, **load_kwargs)

        # load the model
        self.model.load_state_dict(checkpoint['model'])
        if use_cuda:
            self.model.cuda()
        self.model.eval()

        self.vocoder = WaveRNNVocoder.Vocoder()
        self.vocoder.loadWeights(wavernn_path)
        self.firwin = signal.firwin(1025, [65, 7600], pass_zero=False,
                                    fs=16000)

    def save_wav(self, wav, path):
        """Convert ``wav`` to a numpy array and hand it to ``self.ap.save_wav``."""
        samples = np.array(wav)
        self.ap.save_wav(samples, path)

    def ttmel(self, text):
        """Return the horizontally stacked mel outputs for ``text``.

        ``text`` is cut into chunks by ``split_text(text, maxlen)``; chunks
        shorter than 3 characters are skipped. Each remaining chunk is run
        through the model and its first mel output is transposed before
        stacking.
        """
        mels = []
        for chunk in split_text(text, maxlen):
            if len(chunk) < 3:
                continue
            token_ids = np.array(self.input_adapter(chunk))
            chars_var = torch.from_numpy(token_ids).unsqueeze(0).long()
            if self.use_cuda:
                chars_var = chars_var.cuda()
            mel_out, _, alignments, stop_tokens = self.model.forward(
                chars_var)
            mels.append(mel_out[0].data.cpu().numpy().T)
        return np.hstack(mels)

    def tts(self, mel):
        """Return ``self.vocoder.melToWav(mel)``."""
        return self.vocoder.melToWav(mel)
class TaiwaneseTacotron():
    """Bundles a Tacotron/Tacotron2 model with a vocoder for synthesis.

    The constructor fills an ``argparse`` namespace with fixed settings,
    loads the hyper-parameters from ``hparams.py``, builds and loads the
    vocoder (WaveRNN) and the TTS model selected by ``hp.tts_model``, and
    prints a summary table. ``generate`` synthesizes one text into
    ``output/<name>.wav``.
    """

    def __init__(self):
        # Parse Arguments
        parser = argparse.ArgumentParser(description='TTS')
        self.args = parser.parse_args()
        self.args.vocoder = 'wavernn'
        self.args.hp_file = 'hparams.py'
        self.args.voc_weights = False
        self.args.tts_weights = False
        self.args.save_attn = False
        self.args.batched = True
        self.args.target = None
        self.args.overlap = None
        self.args.force_cpu = False
        # Griffin-Lim iteration count. It was never set before, so every
        # Griffin-Lim code path failed with AttributeError.
        self.args.iters = 32

        # ================ vocoder ================ #
        if self.args.vocoder in ['griffinlim', 'gl']:
            self.args.vocoder = 'griffinlim'
        elif self.args.vocoder in ['wavernn', 'wr']:
            self.args.vocoder = 'wavernn'
        else:
            # ArgumentError takes (argument, message); the one-argument call
            # raised TypeError instead of the intended error.
            raise argparse.ArgumentError(
                None, 'Must provide a valid vocoder type!')

        hp.configure(self.args.hp_file)  # Load hparams from file

        # set defaults for any arguments that depend on hparams
        if self.args.vocoder == 'wavernn':
            if self.args.target is None:
                self.args.target = hp.voc_target
            if self.args.overlap is None:
                self.args.overlap = hp.voc_overlap
            if self.args.batched is None:
                self.args.batched = hp.voc_gen_batched

        # ================ others ================ #
        paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)
        print("hello")
        print(paths.base)

        if not self.args.force_cpu and torch.cuda.is_available():
            device = torch.device('cuda')
        else:
            device = torch.device('cpu')
        print('Using device:', device)
        # Remembered so that inference inputs go to the same device.
        self.device = device

        # === Wavernn === #
        if self.args.vocoder == 'wavernn':
            print('\nInitialising WaveRNN Model...\n')
            self.voc_model = WaveRNN(
                rnn_dims=hp.voc_rnn_dims,
                fc_dims=hp.voc_fc_dims,
                bits=hp.bits,
                pad=hp.voc_pad,
                upsample_factors=hp.voc_upsample_factors,
                feat_dims=hp.num_mels,
                compute_dims=hp.voc_compute_dims,
                res_out_dims=hp.voc_res_out_dims,
                res_blocks=hp.voc_res_blocks,
                hop_length=hp.hop_length,
                sample_rate=hp.sample_rate,
                mode=hp.voc_mode).to(device)
            voc_load_path = (self.args.voc_weights if self.args.voc_weights
                             else paths.voc_latest_weights)
            self.voc_model.load(voc_load_path)

        # === Tacotron === #
        if hp.tts_model == 'tacotron':
            print('\nInitialising Tacotron Model...\n')
            self.tts_model = Tacotron(
                embed_dims=hp.tts_embed_dims,
                num_chars=len(symbols),
                encoder_dims=hp.tts_encoder_dims,
                decoder_dims=hp.tts_decoder_dims,
                n_mels=hp.num_mels,
                fft_bins=hp.num_mels,
                postnet_dims=hp.tts_postnet_dims,
                encoder_K=hp.tts_encoder_K,
                lstm_dims=hp.tts_lstm_dims,
                postnet_K=hp.tts_postnet_K,
                num_highways=hp.tts_num_highways,
                dropout=hp.tts_dropout,
                stop_threshold=hp.tts_stop_threshold).to(device)
            tts_load_path = (self.args.tts_weights if self.args.tts_weights
                             else paths.tts_latest_weights)
            self.tts_model.load(tts_load_path)
        # === Tacotron2 === #
        elif hp.tts_model == 'tacotron2':
            print('\nInitializing Tacotron2 Model...\n')
            self.tts_model = Tacotron2().to(device)
            tts_load_path = (self.args.tts_weights if self.args.tts_weights
                             else paths.tts_latest_weights)
            self.tts_model.load(tts_load_path)

        # === Infomation === #
        self._print_summary()

    def _print_summary(self):
        """Print the model/vocoder overview table for the loaded models."""
        if hp.tts_model == 'tacotron':
            model_label = 'Tacotron'
        elif hp.tts_model == 'tacotron2':
            model_label = 'Tacotron2'
        else:
            # No TTS model was loaded, so there is nothing to report.
            return

        tts_k = self.tts_model.get_step() // 1000
        rows = [(model_label, str(tts_k) + 'k')]
        if hp.tts_model == 'tacotron':
            rows.append(('r', self.tts_model.r))

        if self.args.vocoder == 'wavernn':
            voc_k = self.voc_model.get_step() // 1000
            batched = self.args.batched
            rows += [
                ('Vocoder Type', 'WaveRNN'),
                ('WaveRNN', str(voc_k) + 'k'),
                ('Generation Mode', 'Batched' if batched else 'Unbatched'),
                ('Target Samples', self.args.target if batched else 'N/A'),
                ('Overlap Samples', self.args.overlap if batched else 'N/A'),
            ]
        elif self.args.vocoder == 'griffinlim':
            rows += [
                ('Vocoder Type', 'Griffin-Lim'),
                ('GL Iters', self.args.iters),
            ]
        else:
            return
        simple_table(rows)

    @staticmethod
    def _output_name(華, input_text=''):
        """Derive the output file stem from ``華`` or, if empty, the text.

        * empty ``華``: first token of ``input_text`` split on
          ``, . ! ?`` or space (``'output'`` if that token is empty);
        * 1 to 9 characters: ``華`` without its last character;
        * more than 9 characters: the first 8 characters of ``華``.
        """
        if len(華) == 0:
            first = re.split(r'\,|\.|\!|\?| ', input_text.strip())[0]
            return first or 'output'
        if len(華) <= 9:
            return 華[:-1]
        return 華[:8]

    def _vocode(self, mel, save_path):
        """Turn ``mel`` into audio at ``save_path`` with the WaveRNN vocoder.

        ``mel`` must already be in the form ``self.voc_model.generate``
        expects.
        """
        self.voc_model.generate(mel, save_path, self.args.batched,
                                hp.voc_target, hp.voc_overlap, hp.mu_law)

    def generate(self, 華, input_text):
        """Synthesize ``input_text`` and write it to ``output/<name>.wav``.

        Args:
            華: String used to derive the output file name (see
                ``_output_name``).
            input_text: Text to synthesize.
        """
        inputs = [text_to_sequence(input_text.strip(), ['basic_cleaners'])]
        if hp.tts_model == 'tacotron2':
            self.gen_tacotron2(華, inputs, input_text)
        elif hp.tts_model == 'tacotron':
            self.gen_tacotron(華, inputs, input_text)
        else:
            # Previously referenced an undefined name and raised NameError.
            print(f"Wrong tts model type {{{hp.tts_model}}}")
        print('\n\nDone.\n')

    # custom function
    def gen_tacotron2(self, 華, inputs, input_text=''):
        """Run Tacotron2 inference on ``inputs`` and vocode the result.

        Args:
            華: String used to derive the output file name.
            inputs: List of symbol-id sequences.
            input_text: Original text; only used for the file name when
                ``華`` is empty. Optional for backward compatibility.
        """
        self.tts_model.eval()
        for i, x in enumerate(inputs, 1):
            print(f'\n| Generating {i}/{len(inputs)}')
            print(x)
            x = np.array(x)[None, :]
            # Follow the device chosen in __init__ instead of forcing CUDA.
            x = torch.from_numpy(x).long().to(self.device)
            with torch.no_grad():
                (mel_outputs, mel_outputs_postnet, _,
                 alignments) = self.tts_model.inference(x)

            # == define output name == #
            output_name = self._output_name(華, input_text)
            print(output_name)
            save_path = "output/{}.wav".format(output_name)

            if self.args.vocoder == 'wavernn':
                self._vocode(mel_outputs_postnet, save_path)
            elif self.args.vocoder == 'griffinlim':
                m = torch.squeeze(mel_outputs_postnet).detach().cpu().numpy()
                wav = reconstruct_waveform(m, n_iter=self.args.iters)
                save_wav(wav, save_path)

    # custom function
    def gen_tacotron(self, 華, inputs, input_text=''):
        """Run Tacotron generation on ``inputs`` and vocode the result.

        Args:
            華: String used to derive the output file name.
            inputs: List of symbol-id sequences.
            input_text: Original text; only used for the file name when
                ``華`` is empty. Optional for backward compatibility.
        """
        for i, x in enumerate(inputs, 1):
            print(f'\n| Generating {i}/{len(inputs)}')
            _, m, attention = self.tts_model.generate(x)
            # Fix mel spectrogram scaling to be from 0 to 1
            m = (m + 4) / 8
            np.clip(m, 0, 1, out=m)

            # == define output name == #
            output_name = self._output_name(華, input_text)
            print(output_name)
            save_path = "output/{}.wav".format(output_name)

            if self.args.vocoder == 'wavernn':
                self._vocode(torch.tensor(m).unsqueeze(0), save_path)
            elif self.args.vocoder == 'griffinlim':
                wav = reconstruct_waveform(m, n_iter=self.args.iters)
                save_wav(wav, save_path)
# Batch synthesis script: read one text per line from ``args.text``, load the
# checkpoint into ``model`` and write one wav (and optionally one duration
# file) per input line into OUT_FOLDER.
with open(args.text) as f:
    texts = [line.strip() for line in f]

if use_cuda:
    cp = torch.load(MODEL_PATH)
else:
    # Keep the checkpoint storages where torch.load reads them.
    cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()
model.decoder.max_decoder_steps = 800

batch_size = 32
for n in range(math.ceil(len(texts) / batch_size)):
    # Batch n covers texts[n * batch_size : (n + 1) * batch_size]; slicing
    # clamps at the end of the list. The previous slice started at index n
    # and ran to the end, and the whole ``texts`` list was synthesized on
    # every iteration, so output files were overwritten with wrong content.
    start = n * batch_size
    batch_texts = texts[start:start + batch_size]
    wavs, alignments = text2audio(batch_texts, model, CONFIG, use_cuda, ap)
    for i, wav in enumerate(wavs):
        index = start + i  # position of this text in the input file
        ap.save_wav(
            wav,
            os.path.join(OUT_FOLDER,
                         'CommonVoice_{}_{}.wav'.format(args.step, index)))
        if save_alignment:
            # alignments can be used to train FastSpeech
            alignment = alignments[i]
            duration = get_duration(alignment)
            print(duration)
            np.save(
                os.path.join(OUT_FOLDER, 'duration',
                             'duration_{}.npy'.format(index)), duration)
class Synthesizer(object):
    """Loads a Tacotron checkpoint and turns text into a wav byte buffer."""

    def load_model(self, model_path, model_name, model_config, use_cuda):
        """Load the config and the model from ``model_path`` for inference.

        Args:
            model_path: Directory holding the model and config files.
            model_name: Model file name inside ``model_path``.
            model_config: Config file name inside ``model_path``.
            use_cuda: GPU flag; when true the model is moved to the GPU.
        """
        # Build the config and model paths.
        config_file = os.path.join(model_path, model_config)
        model_file = os.path.join(model_path, model_name)
        print(" > Loading model ...")
        print(" | > Model config path: ", config_file)
        print(" | > Model file path: ", model_file)

        config = load_config(config_file)
        self.use_cuda = use_cuda
        self.use_phonemes = config.use_phonemes
        self.ap = AudioProcessor(**config.audio)

        # Pick symbol table size and text front end to match the config.
        if self.use_phonemes:
            def adapter(sen):
                return phoneme_to_sequence(sen, [config.text_cleaner],
                                           config.phoneme_language)
            self.input_size = len(phonemes)
        else:
            def adapter(sen):
                return text_to_sequence(sen, [config.text_cleaner])
            self.input_size = len(symbols)
        self.input_adapter = adapter

        self.model = Tacotron(num_chars=config['num_chars'],
                              embedding_dim=config['embedding_size'],
                              linear_dim=self.ap.num_freq,
                              mel_dim=self.ap.num_mels,
                              r=config['r'])

        # Load the checkpoint; without CUDA keep the storages as loaded.
        load_kwargs = {}
        if not use_cuda:
            load_kwargs['map_location'] = lambda storage, loc: storage
        checkpoint = torch.load(model_file, **load_kwargs)
        self.model.load_state_dict(checkpoint['model'])

        # Move the model to the GPU when requested, then switch to eval mode.
        if use_cuda:
            self.model.cuda()
        self.model.eval()

    def save_wav(self, wav, path):
        """Save the wav at the given path.

        Args:
            wav: Wav samples; converted to a numpy array first.
            path: Destination handed to ``self.ap.save_wav``.
        """
        samples = np.array(wav)
        self.ap.save_wav(samples, path)

    def tts(self, text, gl_mode=None):
        """Prepare ``text`` for the model and return the synthesized audio.

        ``text`` is split on ``'.'``; pieces shorter than 3 characters are
        skipped. After every synthesized sentence 10000 zero samples are
        appended. The result is saved both to the returned buffer and to
        ``'gla.wav'``.

        Args:
            text: Input sentence(s).
            gl_mode: Passed through to ``self.ap.inv_spectrogram``.

        Returns:
            An ``io.BytesIO`` object that was passed to ``save_wav``.
        """
        samples = []
        # Split the input into sentences.
        for piece in (p for p in text.split('.') if len(p) >= 3):
            sentence = piece.strip() + '.'
            # Text -> index sequence -> (1, n) long tensor.
            token_ids = np.array(self.input_adapter(sentence))
            chars_var = torch.from_numpy(token_ids).unsqueeze(0).long()
            if self.use_cuda:
                chars_var = chars_var.cuda()
            # Run the inference.
            mel_out, linear_out, alignments, stop_tokens = self.model.forward(
                chars_var)
            # Move the output tensor to the CPU.
            spectrogram = linear_out[0].data.cpu().numpy()
            started = time.time()
            wav = self.ap.inv_spectrogram(spectrogram.T, gl_mode)
            elapsed = time.time() - started
            samples.extend(wav)
            samples.extend([0] * 10000)
        buffer = io.BytesIO()
        self.save_wav(samples, buffer)
        self.save_wav(samples, 'gla.wav')
        return buffer