def create_gta_features(model: Tacotron, train_set: DataLoader, val_set: DataLoader, save_path: Path):
    """Write ground-truth-aligned (GTA) mel predictions for every item to disk.

    The model is put in eval mode and run, without gradient tracking, over
    every batch of ``train_set`` followed by every batch of ``val_set``.
    For each item the second model output is cropped along its last axis to
    the item's mel length, rescaled with ``(mel + 4) / 8`` and stored as
    ``save_path / '<item_id>.npy'``.

    Args:
        model: Network called as ``model(x, mels, dur)``; it must return
            three values, of which only the second is used.
        train_set: Loader yielding ``(x, mels, ids, mel_lens, dur)`` batches.
        val_set: Loader with the same batch layout, consumed after
            ``train_set``.
        save_path: Directory receiving the ``.npy`` files.
    """
    model.eval()
    # Inputs are moved to whichever device holds the model parameters.
    model_device = next(model.parameters()).device
    iters = len(train_set) + len(val_set)
    all_batches = itertools.chain(train_set, val_set)

    for i, batch in enumerate(all_batches, 1):
        x, mels, ids, mel_lens, dur = batch
        x = x.to(model_device)
        mels = mels.to(model_device)
        dur = dur.to(model_device)

        with torch.no_grad():
            _, gta, _ = model(x, mels, dur)
        gta = gta.cpu().numpy()

        for j, item_id in enumerate(ids):
            # Drop the padded frames, then rescale.
            # NOTE(review): (mel + 4) / 8 presumably maps [-4, 4] onto [0, 1] - confirm.
            cropped = gta[j][:, :mel_lens[j]]
            target_file = save_path / f'{item_id}.npy'
            np.save(str(target_file), (cropped + 4) / 8, allow_pickle=False)

        bar = progbar(i, iters)
        stream(f'{bar} {i}/{iters} Batches ')
class Synthesizer(object):
    """Load a Tacotron checkpoint and synthesise speech sentence by sentence."""

    def load_model(self, model_path, model_name, model_config, use_cuda):
        """Build the model from its config file and load the checkpoint weights.

        Args:
            model_path: Directory containing both the config and the checkpoint.
            model_name: Checkpoint file name inside ``model_path``.
            model_config: Config file name inside ``model_path``.
            use_cuda: When true the checkpoint is loaded as stored and the
                model is moved to CUDA; otherwise tensors are mapped to CPU.
        """
        model_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_name)
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.ap = AudioProcessor(**config.audio)
        # NOTE(review): 61 is hard-coded as the first Tacotron argument -
        # presumably the input symbol count; confirm against the symbol table.
        self.model = Tacotron(61, config.embedding_size, self.ap.num_freq,
                              self.ap.num_mels, config.r)
        # load model state
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            # Keep every storage on CPU regardless of where it was saved from.
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        # load the model
        self.model.load_state_dict(cp['model'])
        if use_cuda:
            self.model.cuda()
        self.model.eval()

    def save_wav(self, wav, path):
        """Convert ``wav`` to a NumPy array and write it via the audio processor."""
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def tts(self, text):
        """Synthesise ``text`` and return the audio in an ``io.BytesIO`` buffer.

        The text is split on ``'.'``; pieces shorter than three characters are
        skipped. Each remaining piece is stripped, terminated with ``'.'``,
        converted to a phoneme sequence and run through the model. The
        waveform of every sentence is followed by 10000 zero samples.

        Args:
            text: Input text to synthesise.

        Returns:
            The ``io.BytesIO`` object the waveform was written to.
        """
        text_cleaner = [self.config.text_cleaner]
        wavs = []
        for sen in text.split('.'):
            if len(sen) < 3:
                continue
            sen = sen.strip() + '.'
            print(sen)
            seq = np.array(
                phoneme_to_sequence(sen, text_cleaner,
                                    self.config.phoneme_language))
            chars_var = torch.from_numpy(seq).unsqueeze(0).long()
            if self.use_cuda:
                chars_var = chars_var.cuda()
            # Pure inference: skip autograd bookkeeping to save memory.
            with torch.no_grad():
                _, linear_out, _, _ = self.model.forward(chars_var)
            linear_out = linear_out[0].cpu().numpy()
            wav = self.ap.inv_spectrogram(linear_out.T)
            wavs += list(wav)
            # Zero samples appended after each sentence.
            wavs += [0] * 10000
        out = io.BytesIO()
        self.save_wav(wavs, out)
        return out
def test_train_step(self):
    """Check that five optimisation steps change every model parameter.

    Random inputs and targets are generated, a reference copy of the freshly
    built model is taken, the model is trained for five steps on the summed
    mel, linear and stop-token losses, and every parameter is then asserted
    to differ from its reference copy.
    """
    text_input = torch.randint(0, 24, (8, 128)).long().to(device)
    input_lengths = torch.randint(100, 129, (8, )).long().to(device)
    input_lengths[-1] = 128
    mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
    linear_spec = torch.rand(8, 30, c.audio['num_freq']).to(device)
    mel_lengths = torch.randint(20, 30, (8, )).long().to(device)

    stop_targets = torch.zeros(8, 30, 1).float().to(device)
    # Mark the frames past each sample's own length. The earlier loop wrote
    # all rows for every length, so each row used the batch minimum.
    for row, length in enumerate(mel_lengths):
        stop_targets[row, int(length.item()):, 0] = 1.0
    # Group frames by the reduction factor; a group is a stop target if any
    # of its frames is.
    stop_targets = stop_targets.view(text_input.shape[0],
                                     stop_targets.size(1) // c.r, -1)
    stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

    criterion = L1LossMasked().to(device)
    criterion_st = nn.BCEWithLogitsLoss().to(device)
    model = Tacotron(32, c.audio['num_freq'], c.audio['num_mels'], c.r,
                     memory_size=c.memory_size).to(device)
    model.train()
    model_ref = copy.deepcopy(model)

    # Sanity check: the deep copy starts out identical to the model.
    for param, param_ref in zip(model.parameters(), model_ref.parameters()):
        assert (param - param_ref).sum() == 0, param

    optimizer = optim.Adam(model.parameters(), lr=c.lr)
    for _ in range(5):
        mel_out, linear_out, align, stop_tokens = model.forward(
            text_input, input_lengths, mel_spec)
        optimizer.zero_grad()
        loss = criterion(mel_out, mel_spec, mel_lengths)
        stop_loss = criterion_st(stop_tokens, stop_targets)
        loss = loss + criterion(linear_out, linear_spec,
                                mel_lengths) + stop_loss
        loss.backward()
        optimizer.step()

    # check parameter changes
    for count, (param, param_ref) in enumerate(
            zip(model.parameters(), model_ref.parameters())):
        assert (param != param_ref).any(
        ), "param {} with shape {} not updated!! \n{}\n{}".format(
            count, param.shape, param, param_ref)
def create_align_features(model: Tacotron, train_set: DataLoader, val_set: DataLoader, save_path_alg: Path, save_path_pitch: Path):
    """Extract per-item durations and attention scores, then pitch values.

    For each batch of ``train_set`` followed by ``val_set`` the model's
    attention is computed without gradients. Only index 0 of each batch is
    used: its attention scores are recorded, durations are extracted with the
    function selected by ``hp.extract_durations_with_dijkstra`` and saved as
    ``save_path_alg / '<item_id>.npy'``. Afterwards the score dictionary is
    pickled to ``paths.data / 'att_score_dict.pkl'`` and ``extract_pitch`` is
    called with ``save_path_pitch``.

    NOTE(review): only the first item of every batch is processed - this
    presumably relies on the loaders using batch size 1; confirm at the caller.

    Args:
        model: Tacotron model with reduction factor ``r == 1``.
        train_set: Loader yielding ``(x, mels, ids, x_lens, mel_lens)``.
        val_set: Loader with the same batch layout.
        save_path_alg: Directory receiving the duration files.
        save_path_pitch: Passed unchanged to ``extract_pitch``.

    Raises:
        AssertionError: If ``model.r`` is not 1.
    """
    assert model.r == 1, (
        f'Reduction factor of tacotron must be 1 for creating alignment features! '
        f'Reduction factor was: {model.r}')

    model.eval()
    # Inputs are moved to whichever device holds the model parameters.
    model_device = next(model.parameters()).device
    iters = len(val_set) + len(train_set)
    all_batches = itertools.chain(train_set, val_set)
    att_score_dict = {}

    use_dijkstra = hp.extract_durations_with_dijkstra
    if use_dijkstra:
        print('Extracting durations using dijkstra...')
    else:
        print('Extracting durations using attention peak counts...')
    extract_durations = (extract_durations_with_dijkstra if use_dijkstra
                         else extract_durations_per_count)

    for i, (x, mels, ids, x_lens, mel_lens) in enumerate(all_batches, 1):
        x = x.to(model_device)
        mels = mels.to(model_device)
        with torch.no_grad():
            _, _, att_batch = model(x, mels)
        align_scores, sharp_scores = attention_score(att_batch, mel_lens, r=1)
        att_batch = np_now(att_batch)

        # Everything below works on the first item of the batch only.
        item_id = ids[0]
        mel_len = mel_lens[0]
        att_score_dict[item_id] = (float(align_scores[0]),
                                   float(sharp_scores[0]))

        durs = extract_durations(x[0], att_batch[0], mel_len)
        if np.sum(durs) != mel_len:
            print(
                f'WARNINNG: Sum of durations did not match mel length for item {item_id}!'
            )
        np.save(str(save_path_alg / f'{item_id}.npy'), durs,
                allow_pickle=False)

        bar = progbar(i, iters)
        stream(f'{bar} {i}/{iters} Batches ')

    pickle_binary(att_score_dict, paths.data / 'att_score_dict.pkl')
    print('Extracting Pitch Values...')
    extract_pitch(save_path_pitch)
def tacotron(pretrained=True, **kwargs):
    """Construct a Tacotron model from ``hp`` and optionally load weights.

    Args:
        pretrained: When true, a state dict is fetched with
            ``fetch_and_load_state_dict("tacotron")``, its ``"r"`` entry is
            renamed to ``"decoder.r"``, a ``"stop_threshold"`` tensor built
            from ``hp.tts_stop_threshold`` is added, and the result is loaded
            into the model.
        **kwargs: Accepted but not used by this function.

    Returns:
        The constructed (and possibly weight-initialised) model.
    """
    model_settings = dict(
        embed_dims=hp.tts_embed_dims,
        num_chars=len(symbols),
        encoder_dims=hp.tts_encoder_dims,
        decoder_dims=hp.tts_decoder_dims,
        n_mels=hp.num_mels,
        fft_bins=hp.num_mels,
        postnet_dims=hp.tts_postnet_dims,
        encoder_K=hp.tts_encoder_K,
        lstm_dims=hp.tts_lstm_dims,
        postnet_K=hp.tts_postnet_K,
        num_highways=hp.tts_num_highways,
        dropout=hp.tts_dropout,
        stop_threshold=hp.tts_stop_threshold,
    )
    model = Tacotron(**model_settings)
    if not pretrained:
        return model

    weights = fetch_and_load_state_dict("tacotron")
    # The fetched state dict stores the reduction factor under "r"; the model
    # expects it under "decoder.r".
    weights["decoder.r"] = weights.pop("r")
    weights["stop_threshold"] = torch.tensor(hp.tts_stop_threshold,
                                             dtype=torch.float32)
    model.load_state_dict(weights)
    return model
def train(args):
    """Run the TensorFlow training loop, checkpointing under ./checkpoint/1.

    A ``DataFeeder`` supplies the inputs, the model is initialised on them and
    training runs until the coordinator requests a stop. Whenever the global
    step is a multiple of ``checkpoint_step`` a checkpoint is saved and an
    alignment plot for the first sample is written. Any exception is printed
    and forwarded to the coordinator as a stop request.

    Args:
        args: Object with a ``step`` attribute; when truthy, the checkpoint
            ``model.ckpt-<step>`` is restored before training.
    """
    save_dir = './checkpoint/1'
    checkpoint_path = os.path.join(save_dir, 'model.ckpt')

    coord = tf.train.Coordinator()
    feeder = DataFeeder(coord, mode=1)

    model = Tacotron()
    model.initialize(feeder.enc_input, feeder.sequence_length,
                     feeder.dec_input, feeder.mel_target)
    saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2)

    with tf.Session() as session:
        try:
            session.run(tf.global_variables_initializer())
            if args.step:
                saver.restore(session,
                              '{}-{}'.format(checkpoint_path, args.step))
            feeder.start_in_session(session)

            while not coord.should_stop():
                # The third fetch runs the optimiser op; its value is unused.
                step, loss, _ = session.run(
                    [model.global_step, model.loss, model.optimize])
                print('Step: {}, Loss: {:.5f}'.format(step, loss))

                if step % checkpoint_step != 0:
                    continue

                saver.save(session, checkpoint_path, global_step=step)
                input_seq, alignment, pred, target = session.run([
                    model.enc_input[0], model.alignment[0],
                    model.mel_output[0], model.mel_target[0]
                ])
                input_seq = sequence_to_text(input_seq)
                plot_file = os.path.join(
                    save_dir, 'step-{}-align.png'.format(step))
                plot_alignment(alignment, plot_file, input_seq)
        except Exception as err:
            traceback.print_exc()
            coord.request_stop(err)
class Synthesizer:
    """TensorFlow Tacotron wrapper that turns text into WAV bytes."""

    def load(self, checkpoint_path, model_name='tacotron'):
        """Build the inference graph and restore weights from a checkpoint.

        Args:
            checkpoint_path: Checkpoint passed to ``tf.train.Saver.restore``.
            model_name: Only used in the log message.
        """
        print('Constructing model: %s' % model_name)
        # Both placeholders have a leading dimension of 1.
        text_ph = tf.placeholder(tf.int32, [1, None], 'inputs')
        length_ph = tf.placeholder(tf.int32, [1], 'input_lengths')
        with tf.variable_scope('model'):
            self.model = Tacotron(hparams)
            self.model.initialize(text_ph, length_ph)
            first_linear = self.model.linear_outputs[0]
            pprint('>>> Model Linear Ouputs:')
            pprint(first_linear)
            self.wav_output = audio.inv_spectrogram_tensorflow(first_linear)

        print('Loading checkpoint: %s' % checkpoint_path)
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        tf.train.Saver().restore(self.session, checkpoint_path)

    def synthesize(self, text):
        """Synthesise ``text`` and return the encoded WAV data.

        Args:
            text: Input string; cleaned with the cleaners listed in
                ``hparams.cleaners`` (comma separated).

        Returns:
            The bytes written by ``audio.save_wav`` into an in-memory buffer.
        """
        cleaner_names = [name.strip() for name in hparams.cleaners.split(',')]
        seq = text_to_sequence(text, cleaner_names)
        pprint('Text: ' + text)

        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
        }
        pprint(self.wav_output)
        pprint('>>> Getting wav')
        wav = self.session.run(self.wav_output, feed_dict=feed_dict)
        pprint('>>> Gotten wav')

        buffer = io.BytesIO()
        audio.save_wav(wav, buffer)
        return buffer.getvalue()
def load_model(self, model_path, model_name, model_config, use_cuda):
    """Build the Tacotron model from its config and load checkpoint weights.

    Args:
        model_path: Directory containing both the config and the checkpoint.
        model_name: Checkpoint file name inside ``model_path``.
        model_config: Config file name inside ``model_path``.
        use_cuda: When true the checkpoint is loaded as stored and the model
            is moved to CUDA; otherwise tensors are mapped to CPU.
    """
    config_file = os.path.join(model_path, model_config)
    self.model_file = os.path.join(model_path, model_name)
    print(" > Loading model ...")
    print(" | > model config: ", config_file)
    print(" | > model file: ", self.model_file)

    self.config = load_config(config_file)
    self.use_cuda = use_cuda
    self.ap = AudioProcessor(**self.config.audio)
    # NOTE(review): 61 is hard-coded as the first Tacotron argument -
    # presumably the input symbol count; confirm against the symbol table.
    self.model = Tacotron(61, self.config.embedding_size, self.ap.num_freq,
                          self.ap.num_mels, self.config.r)

    # On CPU every storage is kept where torch first deserialises it instead
    # of being moved to the device it was saved from.
    load_options = {}
    if not use_cuda:
        load_options['map_location'] = lambda storage, loc: storage
    checkpoint = torch.load(self.model_file, **load_options)

    self.model.load_state_dict(checkpoint['model'])
    if use_cuda:
        self.model.cuda()
    self.model.eval()
def train(self, model: Tacotron, optimizer: Optimizer) -> None:
    """Run every session of ``hp.tts_schedule`` the model has not completed.

    Each schedule entry is ``(r, lr, max_step, bs)``. Entries whose
    ``max_step`` the model has already reached are skipped; for the others the
    datasets are built and ``train_session`` is invoked.

    Args:
        model: Model to train; ``model.get_step()`` gives its current step.
        optimizer: Optimiser forwarded to ``train_session``.
    """
    for i, (r, lr, max_step, bs) in enumerate(hp.tts_schedule, 1):
        if model.get_step() >= max_step:
            # This session is already finished.
            continue
        train_set, val_set = get_tts_datasets(path=self.paths.data,
                                              batch_size=bs,
                                              r=r,
                                              model_type='tacotron')
        session = TTSSession(index=i,
                             r=r,
                             lr=lr,
                             max_step=max_step,
                             bs=bs,
                             train_set=train_set,
                             val_set=val_set)
        self.train_session(model, optimizer, session)
def create_align_features(model: Tacotron, train_set, save_path: Path):
    """Save, per item, how many attention frames peak on each input position.

    For every batch the attention is computed without gradients, the argmax
    over the last attention axis is taken for each frame, and the number of
    frames per argmax index is counted. The count vector of each item is saved
    as ``save_path / '<item_id>.npy'``.

    NOTE(review): ``mel_lens`` is unpacked but not used, so frames that are
    padding also contribute to the counts - confirm this is intended.

    Args:
        model: Tacotron model with reduction factor ``r == 1``.
        train_set: Loader yielding 5-tuples ``(x, mels, ids, mel_lens, _)``.
        save_path: Directory receiving the ``.npy`` files.

    Raises:
        AssertionError: If ``model.r`` is not 1.
    """
    assert model.r == 1, (
        f'Reduction factor of tacotron must be 1 for creating alignment features! '
        f'Reduction factor was: {model.r}')

    # Inputs are moved to whichever device holds the model parameters.
    model_device = next(model.parameters()).device
    iters = len(train_set)

    for i, (x, mels, ids, mel_lens, _) in enumerate(train_set, 1):
        x = x.to(model_device)
        mels = mels.to(model_device)
        with torch.no_grad():
            _, _, attn = model(x, mels)
        attn = np_now(attn)

        batch_size, n_chars = attn.shape[0], attn.shape[2]
        peaks = np.argmax(attn, axis=2)
        mel_counts = np.zeros(shape=(batch_size, n_chars), dtype=np.int32)
        for b, frame_peaks in enumerate(peaks):
            # Peaks are indices below n_chars, so minlength pads the
            # histogram to exactly one slot per input position.
            mel_counts[b] = np.bincount(frame_peaks, minlength=n_chars)

        for j, item_id in enumerate(ids):
            np.save(save_path / f'{item_id}.npy', mel_counts[j, :],
                    allow_pickle=False)

        bar = progbar(i, iters)
        stream(f'{bar} {i}/{iters} Batches ')
def train(self, model: Tacotron, optimizer: Optimizer) -> None:
    """Run every session of the configured schedule not yet completed.

    The schedule is read from ``self.train_cfg['schedule']`` and parsed with
    ``parse_schedule``; each entry is ``(r, lr, max_step, bs)``. Entries whose
    ``max_step`` the model has already reached are skipped.

    Args:
        model: Model to train; ``model.get_step()`` gives its current step.
        optimizer: Optimiser forwarded to ``train_session``.
    """
    schedule = parse_schedule(self.train_cfg['schedule'])
    for i, (r, lr, max_step, bs) in enumerate(schedule, 1):
        if model.get_step() >= max_step:
            # This session is already finished.
            continue
        train_set, val_set = get_tts_datasets(
            path=self.paths.data,
            batch_size=bs,
            r=r,
            model_type='tacotron',
            max_mel_len=self.train_cfg['max_mel_len'],
            filter_attention=False)
        session = TTSSession(index=i,
                             r=r,
                             lr=lr,
                             max_step=max_step,
                             bs=bs,
                             train_set=train_set,
                             val_set=val_set)
        self.train_session(model, optimizer, session=session)
def create_gta_features(model: Tacotron, train_set, save_path: Path):
    """Write ground-truth-aligned (GTA) mel predictions for every item to disk.

    ``save_path`` is created if missing (its parent must already exist). For
    each batch the model is run without gradient tracking; the second model
    output is cropped along its last axis to each item's mel length, rescaled
    with ``(mel + 4) / 8`` and saved as ``save_path / '<item_id>.npy'``.

    Args:
        model: Network called as ``model(x, mels)``; it must return three
            values, of which only the second is used.
        train_set: Loader yielding ``(x, mels, ids, mel_lens)`` batches.
        save_path: Directory receiving the ``.npy`` files.
    """
    save_path.mkdir(parents=False, exist_ok=True)
    # Inputs are moved to whichever device holds the model parameters.
    model_device = next(model.parameters()).device
    iters = len(train_set)

    for i, batch in enumerate(train_set, 1):
        x, mels, ids, mel_lens = batch
        x = x.to(model_device)
        mels = mels.to(model_device)

        with torch.no_grad():
            _, gta, _ = model(x, mels)
        gta = gta.cpu().numpy()

        for j, item_id in enumerate(ids):
            # Drop the padded frames, then rescale.
            # NOTE(review): (mel + 4) / 8 presumably maps [-4, 4] onto [0, 1] - confirm.
            cropped = gta[j][:, :mel_lens[j]]
            np.save(save_path / f'{item_id}.npy', (cropped + 4) / 8,
                    allow_pickle=False)

        bar = progbar(i, iters)
        stream(f'{bar} {i}/{iters} Batches ')
def main(args):
    """Build the model, optionally restore a checkpoint, and run training.

    When ``args.restore_path`` is set, the model and optimiser states are
    loaded from it. If that fails, the model is partially initialised from
    the checkpoint entries whose names and shapes match the current model.
    Training then runs for ``c.epochs`` epochs, saving the best model after
    each one.

    Args:
        args: Namespace providing ``restore_path`` and, when more than one GPU
            is used, ``rank`` and ``group_id``. ``args.restore_step`` is set
            by this function.
    """
    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)
    model = Tacotron(num_chars=num_chars,
                     embedding_dim=c.embedding_size,
                     linear_dim=ap.num_freq,
                     mel_dim=ap.num_mels,
                     r=c.r,
                     memory_size=c.memory_size)

    optimizer = optim.Adam(model.parameters(), lr=c.lr, weight_decay=0)
    optimizer_st = optim.Adam(model.decoder.stopnet.parameters(),
                              lr=c.lr,
                              weight_decay=0)

    criterion = L1LossMasked()
    criterion_st = nn.BCELoss()

    # Used unless a restored checkpoint supplies a previous best loss.
    best_loss = float('inf')

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        try:
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        except Exception:
            # `Exception` rather than a bare `except` so that Ctrl-C and
            # SystemExit are not mistaken for a checkpoint mismatch.
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            # Partial initialization: keep only checkpoint entries that exist
            # in the current model with exactly the same shape. Equal element
            # counts are not enough, load_state_dict requires equal shapes.
            pretrained_dict = {
                k: v
                for k, v in checkpoint['model'].items()
                if k in model_dict and v.shape == model_dict[k].shape
            }
            # Overwrite the matching entries and load the merged state dict.
            model_dict.update(pretrained_dict)
            model.load_state_dict(model_dict)
            print(" | > {} / {} layers are initialized".format(
                len(pretrained_dict), len(model_dict)))
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()
        # The configured learning rate overrides the one stored in the checkpoint.
        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        best_loss = checkpoint['linear_loss']
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.lr_decay:
        scheduler = NoamLR(optimizer,
                           warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    for epoch in range(0, c.epochs):
        train_loss, current_step = train(model, criterion, criterion_st,
                                         optimizer, optimizer_st, scheduler,
                                         ap, epoch)
        val_loss = evaluate(model, criterion, criterion_st, ap, current_step,
                            epoch)
        print(" | > Training Loss: {:.5f} Validation Loss: {:.5f}".format(
            train_loss, val_loss),
              flush=True)
        # Model selection uses the validation loss only when evaluation is on.
        target_loss = val_loss if c.run_eval else train_loss
        best_loss = save_best_model(model, optimizer, target_loss, best_loss,
                                    OUT_PATH, current_step, epoch)
class Synthesizer(object):
    """Tacotron text-to-mel front end paired with a WaveRNN vocoder."""

    def load_model(self, model_path, model_config, wavernn_path, use_cuda):
        """Load the Tacotron checkpoint and the WaveRNN vocoder weights.

        Args:
            model_path: Path of the Tacotron checkpoint file.
            model_config: Path of the config file read by ``load_config``.
            wavernn_path: Weights passed to the vocoder's ``loadWeights``.
            use_cuda: When true the checkpoint is loaded as stored and the
                model is moved to CUDA; otherwise tensors are mapped to CPU.
        """
        self.model_file = model_path
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.use_phonemes = config.use_phonemes
        self.ap = AudioProcessor(**config.audio)

        # Pick the symbol table and the matching text -> id-sequence adapter.
        if self.use_phonemes:
            self.input_size = len(phonemes)
            self.input_adapter = lambda sen: phoneme_to_sequence(
                sen, [self.config.text_cleaner], self.config.phoneme_language)
        else:
            self.input_size = len(symbols)
            self.input_adapter = lambda sen: text_to_sequence(
                sen, [self.config.text_cleaner])

        self.model = Tacotron(self.input_size,
                              config.embedding_size,
                              self.ap.num_freq,
                              self.ap.num_mels,
                              config.r,
                              attn_windowing=True)
        self.model.decoder.max_decoder_steps = 8000
        # load model state
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            # Keep every storage on CPU regardless of where it was saved from.
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        # load the model
        self.model.load_state_dict(cp['model'])
        if use_cuda:
            self.model.cuda()
        self.model.eval()

        self.vocoder = WaveRNNVocoder.Vocoder()
        self.vocoder.loadWeights(wavernn_path)
        # FIR band-pass coefficients (65 Hz to 7600 Hz at fs=16000); stored
        # on the instance but not applied anywhere in this class.
        self.firwin = signal.firwin(1025, [65, 7600],
                                    pass_zero=False,
                                    fs=16000)

    def save_wav(self, wav, path):
        """Convert ``wav`` to a NumPy array and write it via the audio processor."""
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def ttmel(self, text):
        """Convert ``text`` to one mel spectrogram array.

        The text is split with ``split_text(text, maxlen)``; chunks shorter
        than three characters are skipped. The first model output of every
        remaining chunk is transposed and all chunks are joined with
        ``np.hstack``.

        Args:
            text: Input text.

        Returns:
            The horizontally stacked mel arrays of all chunks.

        Raises:
            ValueError: From ``np.hstack`` when no chunk was synthesised.
        """
        mel_ret = []
        for chunk in split_text(text, maxlen):
            if len(chunk) < 3:
                continue
            seq = np.array(self.input_adapter(chunk))
            chars_var = torch.from_numpy(seq).unsqueeze(0).long()
            if self.use_cuda:
                chars_var = chars_var.cuda()
            # Pure inference: skip autograd bookkeeping to save memory.
            with torch.no_grad():
                mel_out, _, _, _ = self.model.forward(chars_var)
            mel_ret.append(mel_out[0].cpu().numpy().T)
        return np.hstack(mel_ret)

    def tts(self, mel):
        """Return the waveform the vocoder produces for ``mel``."""
        return self.vocoder.melToWav(mel)
loss = tf.reduce_mean(MAE(dec_target, pred)) variables = model.trainable_variables gradients = tape.gradient(loss, variables) optimizer.apply_gradients(zip(gradients, variables)) return loss, pred[0], alignment[0] dataset = tf.data.Dataset.from_generator(generator=DataGenerator, output_types=(tf.float32, tf.float32, tf.float32, tf.int32), output_shapes=(tf.TensorShape([batch_size, None]), tf.TensorShape([batch_size, None, mel_dim]), tf.TensorShape([batch_size, None, mel_dim]), tf.TensorShape([batch_size])))\ .prefetch(tf.data.experimental.AUTOTUNE) model = Tacotron(K=16, conv_dim=[128, 128]) optimizer = Adam() step = tf.Variable(0) checkpoint_dir = './checkpoint/1' os.makedirs(checkpoint_dir, exist_ok=True) checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model, step=step) manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=5) checkpoint.restore(manager.latest_checkpoint) if manager.latest_checkpoint: print('Restore checkpoint from {}'.format(manager.latest_checkpoint)) try: for text, dec, mel, text_length in dataset: loss, pred, alignment = train_step(text, dec, mel, text_length)
res_blocks=hp.voc_res_blocks, hop_length=hp.hop_length, sample_rate=hp.sample_rate).cuda() voc_model.restore('quick_start/voc_weights/latest_weights.pyt') print('\nInitialising Tacotron Model...\n') # Instantiate Tacotron Model tts_model = Tacotron(r=hp.tts_r, embed_dims=hp.tts_embed_dims, num_chars=len(symbols), encoder_dims=hp.tts_encoder_dims, decoder_dims=hp.tts_decoder_dims, n_mels=hp.num_mels, fft_bins=hp.num_mels, postnet_dims=hp.tts_postnet_dims, encoder_K=hp.tts_encoder_K, lstm_dims=hp.tts_lstm_dims, postnet_K=hp.tts_postnet_K, num_highways=hp.tts_num_highways, dropout=hp.tts_dropout).cuda() tts_model.restore('quick_start/tts_weights/latest_weights.pyt') if input_text: inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)] else: with open('sentences.txt') as f: inputs = [ text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f
res_out_dims=hp.voc_res_out_dims, res_blocks=hp.voc_res_blocks, hop_length=hp.hop_length, sample_rate=hp.sample_rate).cuda() voc_model.restore(paths.voc_latest_weights) print('\nInitialising Tacotron Model...\n') # Instantiate Tacotron Model tts_model = Tacotron(embed_dims=hp.tts_embed_dims, num_chars=len(symbols), encoder_dims=hp.tts_encoder_dims, decoder_dims=hp.tts_decoder_dims, n_mels=hp.num_mels, fft_bins=hp.num_mels, postnet_dims=hp.tts_postnet_dims, encoder_K=hp.tts_encoder_K, lstm_dims=hp.tts_lstm_dims, postnet_K=hp.tts_postnet_K, num_highways=hp.tts_num_highways, dropout=hp.tts_dropout).cuda() tts_restore_path = weights_path if weights_path else paths.tts_latest_weights tts_model.restore(tts_restore_path) if input_text: inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)] else: with open('sentences.txt') as f: inputs = [ text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f
def main(args):
    """Build the model, optionally restore a checkpoint, and run training.

    When ``args.restore_path`` is set, the model state is loaded from it. If
    that fails, the model is partially initialised from the checkpoint
    entries whose names and shapes match the current model. The optimiser
    state is then restored and training runs for ``c.epochs`` epochs, saving
    the best model (by training loss) after each one.

    Args:
        args: Namespace providing ``restore_path``. ``args.restore_step`` is
            set by this function.
    """
    model = Tacotron(c.embedding_size, ap.num_freq, ap.num_mels, c.r)
    print(" | > Num output units : {}".format(ap.num_freq), flush=True)

    optimizer = optim.Adam(model.parameters(), lr=c.lr, weight_decay=0)
    optimizer_st = optim.Adam(model.decoder.stopnet.parameters(),
                              lr=c.lr,
                              weight_decay=0)

    criterion = L1LossMasked()
    criterion_st = nn.BCELoss()

    # Used unless a restored checkpoint supplies a previous best loss.
    best_loss = float('inf')

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        try:
            model.load_state_dict(checkpoint['model'])
        except Exception:
            # `Exception` rather than a bare `except` so that Ctrl-C and
            # SystemExit are not mistaken for a checkpoint mismatch.
            model_dict = model.state_dict()
            # Partial initialization: keep only checkpoint entries that exist
            # in the current model with exactly the same shape. Without the
            # shape filter a resized layer makes the retry fail the same way.
            pretrained_dict = {
                k: v
                for k, v in checkpoint['model'].items()
                if k in model_dict and v.shape == model_dict[k].shape
            }
            # Overwrite the matching entries and load the merged state dict.
            model_dict.update(pretrained_dict)
            model.load_state_dict(model_dict)
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()
        # Restored after the model has been moved to its final device.
        optimizer.load_state_dict(checkpoint['optimizer'])
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        best_loss = checkpoint['linear_loss']
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0
        print("\n > Starting a new training", flush=True)
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()

    if c.lr_decay:
        scheduler = NoamLR(optimizer,
                           warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params), flush=True)

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(CHECKPOINT_PATH, exist_ok=True)

    for epoch in range(0, c.epochs):
        train_loss, current_step = train(model, criterion, criterion_st,
                                         optimizer, optimizer_st, scheduler,
                                         ap, epoch)
        val_loss = evaluate(model, criterion, criterion_st, ap, current_step)
        print(" | > Train Loss: {:.5f} Validation Loss: {:.5f}".format(
            train_loss, val_loss),
              flush=True)
        best_loss = save_best_model(model, optimizer, train_loss, best_loss,
                                    OUT_PATH, current_step, epoch)
hop_length=hp.hop_length, sample_rate=hp.sample_rate, mode='MOL').to(device) voc_model.load('quick_start/voc_weights/latest_weights.pyt') print('\nInitialising Tacotron Model...\n') # Instantiate Tacotron Model tts_model = Tacotron(embed_dims=hp.tts_embed_dims, num_chars=len(symbols), encoder_dims=hp.tts_encoder_dims, decoder_dims=hp.tts_decoder_dims, n_mels=hp.num_mels, fft_bins=hp.num_mels, postnet_dims=hp.tts_postnet_dims, encoder_K=hp.tts_encoder_K, lstm_dims=hp.tts_lstm_dims, postnet_K=hp.tts_postnet_K, num_highways=hp.tts_num_highways, dropout=hp.tts_dropout, stop_threshold=hp.tts_stop_threshold).to(device) tts_model.load('quick_start/tts_weights/latest_weights.pyt') if input_text: inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)] else: with open('sentences.txt') as f: inputs = [ text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f
def main():
    """Command-line entry point: train Tacotron, then create GTA features.

    Parses the CLI flags, loads the hyperparameters, selects the device,
    builds the model and restores (or creates) its checkpoint. Unless
    ``--force_gta`` is given, every unfinished session of ``hp.tts_schedule``
    is trained. Finally the ground-truth-aligned dataset is generated.

    Raises:
        ValueError: On CUDA, when a scheduled batch size is not divisible by
            the number of GPUs.
    """
    # Parse Arguments
    parser = argparse.ArgumentParser(description='Train Tacotron TTS')
    parser.add_argument('--force_train', '-f', action='store_true',
                        help='Forces the model to train past total steps')
    parser.add_argument('--force_gta', '-g', action='store_true',
                        help='Force the model to create GTA features')
    parser.add_argument(
        '--force_cpu', '-c', action='store_true',
        help='Forces CPU-only training, even when in CUDA capable environment')
    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                        help='The file to use for the hyperparameters')
    args = parser.parse_args()

    hp.configure(args.hp_file)  # Load hparams from file
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    force_train = args.force_train
    force_gta = args.force_gta

    use_gpu = not args.force_cpu and torch.cuda.is_available()
    if use_gpu:
        device = torch.device('cuda')
        # Every scheduled batch must split evenly across the visible GPUs.
        for *_, scheduled_bs in hp.tts_schedule:
            if scheduled_bs % torch.cuda.device_count() != 0:
                raise ValueError(
                    '`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    # Instantiate Tacotron Model
    print('\nInitialising Tacotron Model...\n')
    model = Tacotron(embed_dims=hp.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hp.tts_encoder_dims,
                     decoder_dims=hp.tts_decoder_dims,
                     n_mels=hp.num_mels,
                     fft_bins=hp.num_mels,
                     postnet_dims=hp.tts_postnet_dims,
                     encoder_K=hp.tts_encoder_K,
                     lstm_dims=hp.tts_lstm_dims,
                     postnet_K=hp.tts_postnet_K,
                     num_highways=hp.tts_num_highways,
                     dropout=hp.tts_dropout,
                     stop_threshold=hp.tts_stop_threshold).to(device)

    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('tts', paths, model, optimizer, create_if_missing=True)

    if not force_gta:
        last_index = len(hp.tts_schedule) - 1
        for i, (r, lr, max_step, batch_size) in enumerate(hp.tts_schedule):
            current_step = model.get_step()
            training_steps = max_step - current_step

            if current_step >= max_step:
                if i != last_index:
                    # This session is done and another one follows.
                    continue
                if not force_train:
                    # Final session is done and training is not forced.
                    break
                # Forced training past the final session: effectively forever.
                training_steps = 999_999_999

            model.r = r

            simple_table([('Steps with r=%s' % (repr1(r)),
                           str(training_steps // 1000) + 'k Steps'),
                          ('Batch Size', batch_size),
                          ('Learning Rate', lr),
                          ('Outputs/Step (r)', model.r)])

            train_set, attn_example = get_tts_datasets(paths.data,
                                                       batch_size, r)
            tts_train_loop(paths, model, optimizer, train_set, lr,
                           training_steps, attn_example)

        print('Training Complete.')
        print(
            'To continue training increase tts_total_steps in hparams.py or use --force_train\n'
        )

    print('Creating Ground Truth Aligned Dataset...\n')

    # GTA features are generated with a fixed batch size of 8.
    train_set, attn_example = get_tts_datasets(paths.data, 8, model.r)
    create_gta_features(model, train_set, paths.gta)

    print(
        '\n\nYou can now train WaveRNN on GTA features - use python train_wavernn.py --gta\n'
    )
def train_session(self, model: Tacotron, optimizer: Optimizer, session: TTSSession) -> None:
    """Train ``model`` for one schedule session and log progress.

    The number of epochs is derived from the steps remaining until
    ``session.max_step``. Every step computes the summed L1 loss of both mel
    outputs, clips gradients, updates the model, logs scalars to
    ``self.writer`` and periodically saves checkpoints and plots. After each
    epoch the validation set is evaluated and the latest checkpoint saved.

    Args:
        model: Model to train; its ``r`` attribute is set to ``session.r``.
        optimizer: Optimiser whose learning rate is set to ``session.lr``.
        session: Session settings and datasets.
    """
    current_step = model.get_step()
    training_steps = session.max_step - current_step
    total_iters = len(session.train_set)
    epochs = training_steps // total_iters + 1
    model.r = session.r

    simple_table([(f'Steps with r={session.r}',
                   str(training_steps // 1000) + 'k Steps'),
                  ('Batch Size', session.bs),
                  ('Learning Rate', session.lr),
                  ('Outputs/Step (r)', model.r)])

    for param_group in optimizer.param_groups:
        param_group['lr'] = session.lr

    loss_avg = Averager()
    duration_avg = Averager()
    # Inputs are moved to whichever device holds the model parameters.
    model_device = next(model.parameters()).device

    for e in range(1, epochs + 1):
        for i, batch in enumerate(session.train_set, 1):
            x, m, ids, x_lens, mel_lens = batch
            step_started = time.time()
            model.train()
            x = x.to(model_device)
            m = m.to(model_device)

            m1_hat, m2_hat, attention = model(x, m)
            # Both mel outputs are trained against the same target.
            loss = F.l1_loss(m1_hat, m) + F.l1_loss(m2_hat, m)

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           hp.tts_clip_grad_norm)
            optimizer.step()

            loss_avg.add(loss.item())
            step = model.get_step()
            k = step // 1000

            duration_avg.add(time.time() - step_started)
            speed = 1. / duration_avg.get()
            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {loss_avg.get():#.4} ' \
                  f'| {speed:#.2} steps/s | Step: {k}k | '

            if step % hp.tts_checkpoint_every == 0:
                save_checkpoint('tts', self.paths, model, optimizer,
                                name=f'taco_step{k}K', is_silent=True)

            if step % hp.tts_plot_every == 0:
                self.generate_plots(model, session)

            _, att_score = attention_score(attention, mel_lens)
            att_score = torch.mean(att_score)
            train_scalars = (
                ('Attention_Score/train', att_score),
                ('Loss/train', loss),
                ('Params/reduction_factor', session.r),
                ('Params/batch_size', session.bs),
                ('Params/learning_rate', session.lr),
            )
            for tag, value in train_scalars:
                self.writer.add_scalar(tag, value, model.get_step())

            stream(msg)

        val_loss, val_att_score = self.evaluate(model, session.val_set)
        self.writer.add_scalar('Loss/val', val_loss, model.get_step())
        self.writer.add_scalar('Attention_Score/val', val_att_score,
                               model.get_step())
        save_checkpoint('tts', self.paths, model, optimizer, is_silent=True)

        # Averages are tracked per epoch.
        loss_avg.reset()
        duration_avg.reset()
        print(' ')
sample_rate=hp.sample_rate, mode=hp.voc_mode).to(device) voc_load_path = args.voc_weights if args.voc_weights else paths.voc_latest_weights voc_model.load(voc_load_path) print('\nInitialising Tacotron Model...\n') # Instantiate Tacotron Model tts_model = Tacotron(embed_dims=hp.tts_embed_dims, num_chars=len(phonemes), encoder_dims=hp.tts_encoder_dims, decoder_dims=hp.tts_decoder_dims, n_mels=hp.num_mels, fft_bins=hp.num_mels, postnet_dims=hp.tts_postnet_dims, encoder_K=hp.tts_encoder_K, lstm_dims=hp.tts_lstm_dims, postnet_K=hp.tts_postnet_K, num_highways=hp.tts_num_highways, dropout=hp.tts_dropout, stop_threshold=hp.tts_stop_threshold).to(device) tts_load_path = tts_weights if tts_weights else paths.tts_latest_weights tts_model.load(tts_load_path) if input_text: text = clean_text(input_text.strip()) inputs = [text_to_sequence(text)] else: with open('sentences.txt') as f:
def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr, train_steps, attn_example):
    """Train ``model`` for roughly ``train_steps`` steps on ``train_set``.

    Each step computes the summed L1 loss of both mel outputs, optionally
    clips gradients, and updates the model. Checkpoints are written every
    ``hp.tts_checkpoint_every`` steps and after every epoch; whenever the
    batch contains ``attn_example`` its attention and spectrogram plots are
    saved.

    Args:
        paths: Project paths (checkpoints, attention/mel plot folders, log).
        model: Model to train.
        optimizer: Optimiser; its learning rate is set to ``lr``.
        train_set: Loader yielding ``(x, m, ids, _)`` batches.
        lr: Learning rate applied to all parameter groups.
        train_steps: Number of steps used to derive the epoch count.
        attn_example: Item id whose plots are saved when present in a batch.
    """
    # Inputs are moved to whichever device holds the model parameters.
    device = next(model.parameters()).device

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0

        # Perform 1 epoch
        for i, (x, m, ids, _) in enumerate(train_set, 1):

            x, m = x.to(device), m.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention = data_parallel_workaround(
                    model, x, m)
            else:
                m1_hat, m2_hat, attention = model(x, m)

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)

            loss = m1_loss + m2_loss

            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.tts_clip_grad_norm)
                # clip_grad_norm_ may return a (possibly CUDA) tensor, which
                # np.isnan cannot convert; torch handles floats and tensors.
                if torch.isnan(torch.as_tensor(grad_norm)):
                    print('grad_norm was NaN!')

            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = 'taco_step%sK' % (repr1(k))
                save_checkpoint('tts', paths, model, optimizer,
                                name=ckpt_name, is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                # `/` and `%` share precedence and bind left to right, so the
                # file name must be formatted before it is joined to the path.
                plot_name = '%s' % (repr1(step))
                save_attention(np_now(attention[idx][:, :160]),
                               paths.tts_attention / plot_name)
                save_spectrogram(np_now(m2_hat[idx]),
                                 paths.tts_mel_plot / plot_name, 600)

            msg = '| Epoch: %s/%s (%s/%s) | Loss: %.4f | %.2f steps/s | Step: %sk | ' % (
                repr1(e), repr1(epochs), repr1(i), repr1(total_iters),
                avg_loss, speed, repr1(k))
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')
def _env_flag(name, default=False):
    """Return environment variable ``name`` interpreted as a boolean.

    An unset variable yields ``default``. The values ``''``, ``0``,
    ``false``, ``no`` and ``off`` (case-insensitive) are false; anything else
    is true. Reading the raw string instead would make ``FORCE_CPU=0`` truthy.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() not in ('', '0', 'false', 'no', 'off')


def thak():
    """Load the vocoder and Tacotron models using environment-variable settings.

    Settings are read from ``FORCE_CPU``, ``VOCODER``, ``BATCHED``,
    ``TARGET``, ``OVERLAP``, ``SAVE_ATTN`` and ``GL_ITERS``; hyperparameters
    come from ``hparams.py``.

    Returns:
        Tuple ``(args, voc_model, tts_model, batched, target, overlap,
        save_attn)``. ``voc_model``, ``batched``, ``target`` and ``overlap``
        are ``None`` when the Griffin-Lim vocoder is selected.

    Raises:
        argparse.ArgumentError: If ``VOCODER`` is not a known vocoder name.
    """
    class Tshamsoo():
        # Attribute bag mirroring the command-line options of the original
        # script; values are resolved from the environment on each call.
        force_cpu = _env_flag('FORCE_CPU', False)
        hp_file = 'hparams.py'
        vocoder = os.getenv('VOCODER', 'wavernn')
        batched = _env_flag('BATCHED', True)
        target = os.getenv('TARGET', None)
        overlap = os.getenv('OVERLAP', None)
        tts_weights = None
        save_attn = _env_flag('SAVE_ATTN', False)
        voc_weights = None
        # Environment values are strings; the iteration count must be an int.
        iters = int(os.getenv('GL_ITERS', 32))

    args = Tshamsoo()

    if args.vocoder in ['griffinlim', 'gl']:
        args.vocoder = 'griffinlim'
    elif args.vocoder in ['wavernn', 'wr']:
        args.vocoder = 'wavernn'
    else:
        # ArgumentError takes (argument, message); there is no argparse
        # action to attach here, hence None.
        raise argparse.ArgumentError(None, 'Must provide a valid vocoder type!')

    hp.configure(args.hp_file)  # Load hparams from file

    tts_weights = args.tts_weights
    save_attn = args.save_attn

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    if args.vocoder == 'wavernn':
        # set defaults for any arguments that depend on hparams
        if args.target is None:
            args.target = hp.voc_target
        if args.overlap is None:
            args.overlap = hp.voc_overlap
        if args.batched is None:
            args.batched = hp.voc_gen_batched

        batched = args.batched
        target = int(args.target)
        overlap = int(args.overlap)

        print('\nInitialising WaveRNN Model...\n')
        # Instantiate WaveRNN Model
        voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                            fc_dims=hp.voc_fc_dims,
                            bits=hp.bits,
                            pad=hp.voc_pad,
                            upsample_factors=hp.voc_upsample_factors,
                            feat_dims=hp.num_mels,
                            compute_dims=hp.voc_compute_dims,
                            res_out_dims=hp.voc_res_out_dims,
                            res_blocks=hp.voc_res_blocks,
                            hop_length=hp.hop_length,
                            sample_rate=hp.sample_rate,
                            mode=hp.voc_mode).to(device)

        voc_load_path = args.voc_weights if args.voc_weights else paths.voc_latest_weights
        voc_model.load(voc_load_path)
    else:
        voc_model = None
        batched = None
        target = None
        overlap = None

    print('\nInitialising Tacotron Model...\n')

    # Instantiate Tacotron Model
    tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                         num_chars=len(symbols),
                         encoder_dims=hp.tts_encoder_dims,
                         decoder_dims=hp.tts_decoder_dims,
                         n_mels=hp.num_mels,
                         fft_bins=hp.num_mels,
                         postnet_dims=hp.tts_postnet_dims,
                         encoder_K=hp.tts_encoder_K,
                         lstm_dims=hp.tts_lstm_dims,
                         postnet_K=hp.tts_postnet_K,
                         num_highways=hp.tts_num_highways,
                         dropout=hp.tts_dropout,
                         stop_threshold=hp.tts_stop_threshold).to(device)

    tts_load_path = tts_weights if tts_weights else paths.tts_latest_weights
    tts_model.load(tts_load_path)

    return args, voc_model, tts_model, batched, target, overlap, save_attn
def main(args):
    """Set up data, model and optimizers, optionally restore, then train.

    Reads the module-level config ``c`` and the globals ``use_cuda``,
    ``CHECKPOINT_PATH`` and ``OUT_PATH``. After every epoch the training loss
    is handed to ``save_best_model`` to track the best model so far.

    Args:
        args: Namespace with ``restore_path`` (checkpoint file or falsy).
            ``args.restore_step`` is set here to the restored step, or 0 for
            a fresh run.
    """
    # Setup the dataset
    train_dataset = LJSpeechDataset(
        os.path.join(c.data_path, 'train_metadata.csv'),
        os.path.join(c.data_path, 'wavs'),
        c.r,
        c.sample_rate,
        c.text_cleaner,
        c.num_mels,
        c.min_level_db,
        c.frame_shift_ms,
        c.frame_length_ms,
        c.preemphasis,
        c.ref_level_db,
        c.num_freq,
        c.power,
        min_seq_len=c.min_seq_len,
    )

    train_loader = DataLoader(train_dataset,
                              batch_size=c.batch_size,
                              shuffle=False,
                              collate_fn=train_dataset.collate_fn,
                              drop_last=False,
                              num_workers=c.num_loader_workers,
                              pin_memory=True)

    # Validation is currently disabled. The original setup was:
    #   val_dataset = LJSpeechDataset(os.path.join(c.data_path, 'valid_metadata.csv'),
    #                                 os.path.join(c.data_path, 'wavs'),
    #                                 c.r, c.sample_rate, c.text_cleaner, c.num_mels,
    #                                 c.min_level_db, c.frame_shift_ms,
    #                                 c.frame_length_ms, c.preemphasis,
    #                                 c.ref_level_db, c.num_freq, c.power)
    #   val_loader = DataLoader(val_dataset, batch_size=c.eval_batch_size,
    #                           shuffle=False, collate_fn=val_dataset.collate_fn,
    #                           drop_last=False, num_workers=4, pin_memory=True)

    model = Tacotron(c.embedding_size, c.num_freq, c.num_mels, c.r)

    optimizer = optim.Adam(model.parameters(), lr=c.lr)
    optimizer_st = optim.Adam(model.decoder.stopnet.parameters(), lr=c.lr)

    criterion = L1LossMasked()
    criterion_st = nn.BCELoss()

    # Default for a fresh run; replaced by the checkpoint value on restore.
    best_loss = float('inf')

    if args.restore_path:
        if use_cuda:
            checkpoint = torch.load(args.restore_path)
        else:
            # Keep tensors on the CPU so GPU-saved checkpoints still load.
            checkpoint = torch.load(
                args.restore_path,
                map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        if use_cuda:
            # The model is moved to the GPU below; the restored optimizer
            # state has to live on the same device.
            for state in optimizer.state.values():
                for k, v in state.items():
                    if torch.is_tensor(v):
                        state[k] = v.cuda()
        print(" > Model restored from step %d" % checkpoint['step'])
        best_loss = checkpoint['linear_loss']
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0
        print("\n > Starting a new training")

    if use_cuda:
        model = nn.DataParallel(model.cuda())
        criterion.cuda()
        criterion_st.cuda()

    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params))

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    for epoch in range(0, c.epochs):
        train_loss, current_step = train(
            model, criterion, criterion_st, train_loader, optimizer,
            optimizer_st, epoch)
        #val_loss = evaluate(model, criterion, criterion_st, val_loader, current_step)
        best_loss = save_best_model(model, optimizer, train_loss, best_loss,
                                    OUT_PATH, current_step, epoch)
class TaiwaneseTacotron():
    """Text-to-speech wrapper pairing a Tacotron/Tacotron2 model with a vocoder.

    The constructor loads hyperparameters from ``hparams.py``, picks the
    device, loads the vocoder and TTS weights and prints a summary table.
    ``generate`` synthesises one utterance into ``output/<name>.wav``.
    """

    def __init__(self):
        # Parse Arguments
        parser = argparse.ArgumentParser(description='TTS')
        self.args = parser.parse_args()
        self.args.vocoder = 'wavernn'
        self.args.hp_file = 'hparams.py'
        self.args.voc_weights = False
        self.args.tts_weights = False
        self.args.save_attn = False
        self.args.batched = True
        self.args.target = None
        self.args.overlap = None
        self.args.force_cpu = False
        # Read by the Griffin-Lim code paths; previously never defined.
        self.args.iters = 32

        #================ vocoder ================#
        if self.args.vocoder in ['griffinlim', 'gl']:
            self.args.vocoder = 'griffinlim'
        elif self.args.vocoder in ['wavernn', 'wr']:
            self.args.vocoder = 'wavernn'
        else:
            # ArgumentError takes (argument, message).
            raise argparse.ArgumentError(
                None, 'Must provide a valid vocoder type!')

        hp.configure(self.args.hp_file)  # Load hparams from file

        # set defaults for any arguments that depend on hparams
        if self.args.vocoder == 'wavernn':
            if self.args.target is None:
                self.args.target = hp.voc_target
            if self.args.overlap is None:
                self.args.overlap = hp.voc_overlap
            if self.args.batched is None:
                self.args.batched = hp.voc_gen_batched

        #================ others ================#
        paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)
        print("hello")
        print(paths.base)

        if not self.args.force_cpu and torch.cuda.is_available():
            device = torch.device('cuda')
        else:
            device = torch.device('cpu')
        print('Using device:', device)
        # Kept so that generation places inputs on the same device.
        self.device = device

        # === Wavernn === #
        if self.args.vocoder == 'wavernn':
            print('\nInitialising WaveRNN Model...\n')
            self.voc_model = WaveRNN(
                rnn_dims=hp.voc_rnn_dims,
                fc_dims=hp.voc_fc_dims,
                bits=hp.bits,
                pad=hp.voc_pad,
                upsample_factors=hp.voc_upsample_factors,
                feat_dims=hp.num_mels,
                compute_dims=hp.voc_compute_dims,
                res_out_dims=hp.voc_res_out_dims,
                res_blocks=hp.voc_res_blocks,
                hop_length=hp.hop_length,
                sample_rate=hp.sample_rate,
                mode=hp.voc_mode).to(device)

            voc_load_path = self.args.voc_weights if self.args.voc_weights else paths.voc_latest_weights
            #print(paths.voc_latest_weights)
            self.voc_model.load(voc_load_path)

        # === Tacotron === #
        if hp.tts_model == 'tacotron':
            print('\nInitialising Tacotron Model...\n')
            self.tts_model = Tacotron(
                embed_dims=hp.tts_embed_dims,
                num_chars=len(symbols),
                encoder_dims=hp.tts_encoder_dims,
                decoder_dims=hp.tts_decoder_dims,
                n_mels=hp.num_mels,
                fft_bins=hp.num_mels,
                postnet_dims=hp.tts_postnet_dims,
                encoder_K=hp.tts_encoder_K,
                lstm_dims=hp.tts_lstm_dims,
                postnet_K=hp.tts_postnet_K,
                num_highways=hp.tts_num_highways,
                dropout=hp.tts_dropout,
                stop_threshold=hp.tts_stop_threshold).to(device)

            tts_load_path = self.args.tts_weights if self.args.tts_weights else paths.tts_latest_weights
            self.tts_model.load(tts_load_path)

        # === Tacotron2 === #
        elif hp.tts_model == 'tacotron2':
            print('\nInitializing Tacotron2 Model...\n')
            self.tts_model = Tacotron2().to(device)
            tts_load_path = self.args.tts_weights if self.args.tts_weights else paths.tts_latest_weights
            self.tts_model.load(tts_load_path)

        # === Infomation === #
        self._print_summary()

    def _print_summary(self):
        """Print a table of the loaded models and the generation settings."""
        if hp.tts_model == 'tacotron':
            label, show_r = 'Tacotron', True
        elif hp.tts_model == 'tacotron2':
            label, show_r = 'Tacotron2', False
        else:
            return
        if self.args.vocoder not in ('wavernn', 'griffinlim'):
            return

        tts_k = self.tts_model.get_step() // 1000
        rows = [(label, str(tts_k) + 'k')]
        if show_r:
            rows.append(('r', self.tts_model.r))

        if self.args.vocoder == 'wavernn':
            voc_k = self.voc_model.get_step() // 1000
            rows += [
                ('Vocoder Type', 'WaveRNN'),
                ('WaveRNN', str(voc_k) + 'k'),
                ('Generation Mode',
                 'Batched' if self.args.batched else 'Unbatched'),
                ('Target Samples',
                 self.args.target if self.args.batched else 'N/A'),
                ('Overlap Samples',
                 self.args.overlap if self.args.batched else 'N/A'),
            ]
        else:
            rows += [('Vocoder Type', 'Griffin-Lim'),
                     ('GL Iters', self.args.iters)]
        simple_table(rows)

    @staticmethod
    def _output_name(name, input_text):
        """Derive the output file stem from ``name`` or, if empty, the text.

        An empty ``name`` falls back to the first chunk of ``input_text``
        before a comma, period, ``!``, ``?`` or space. A name of 1-9
        characters loses its last character; longer names are cut to 8.
        """
        if len(name) == 0:
            return re.split(r'\,|\.|\!|\?| ', input_text)[0]
        if len(name) <= 9:
            return name[:-1]
        return name[:8]

    def generate(self, 華, input_text):
        """Synthesise ``input_text`` with the configured TTS model.

        Args:
            華: Requested output name (see ``_output_name``).
            input_text: Text to synthesise.
        """
        inputs = [text_to_sequence(input_text.strip(), ['basic_cleaners'])]
        if hp.tts_model == 'tacotron2':
            self.gen_tacotron2(華, inputs, input_text)
        elif hp.tts_model == 'tacotron':
            self.gen_tacotron(華, inputs, input_text)
        else:
            print(f"Wrong tts model type {{{hp.tts_model}}}")

        print('\n\nDone.\n')

    # custom function
    def gen_tacotron2(self, 華, inputs, input_text=''):
        """Run Tacotron2 inference on each sequence and vocode the result.

        Args:
            華: Requested output name.
            inputs: List of symbol-id sequences.
            input_text: Source text, used for naming when ``華`` is empty.
        """
        for i, x in enumerate(inputs, 1):
            print(f'\n| Generating {i}/{len(inputs)}')
            print(x)

            x = np.array(x)[None, :]
            # Follow the device chosen in __init__ instead of forcing CUDA.
            x = torch.from_numpy(x).long().to(self.device)

            self.tts_model.eval()
            mel_outputs, mel_outputs_postnet, _, alignments = self.tts_model.inference(
                x)

            # == define output name == #
            output_name = self._output_name(華, input_text)
            print(output_name)
            save_path = "output/{}.wav".format(output_name)
            ##

            if self.args.vocoder == 'wavernn':
                m = mel_outputs_postnet
                self.voc_model.generate(m, save_path, self.args.batched,
                                        hp.voc_target, hp.voc_overlap,
                                        hp.mu_law)
            elif self.args.vocoder == 'griffinlim':
                m = torch.squeeze(mel_outputs_postnet).detach().cpu().numpy()
                wav = reconstruct_waveform(m, n_iter=self.args.iters)
                save_wav(wav, save_path)

    # custom function
    def gen_tacotron(self, 華, inputs, input_text=''):
        """Run Tacotron generation on each sequence and vocode the result.

        Args:
            華: Requested output name.
            inputs: List of symbol-id sequences.
            input_text: Source text, used for naming when ``華`` is empty.
        """
        for i, x in enumerate(inputs, 1):
            print(f'\n| Generating {i}/{len(inputs)}')
            _, m, attention = self.tts_model.generate(x)

            # Fix mel spectrogram scaling to be from 0 to 1
            m = (m + 4) / 8
            np.clip(m, 0, 1, out=m)

            # == define output name == #
            output_name = self._output_name(華, input_text)
            print(output_name)
            save_path = "output/{}.wav".format(output_name)
            ##

            if self.args.vocoder == 'wavernn':
                m = torch.tensor(m).unsqueeze(0)
                self.voc_model.generate(m, save_path, self.args.batched,
                                        hp.voc_target, hp.voc_overlap,
                                        hp.mu_law)
            elif self.args.vocoder == 'griffinlim':
                wav = reconstruct_waveform(m, n_iter=self.args.iters)
                save_wav(wav, save_path)
# Pull the command-line options used by the rest of the script.
args = parser.parse_args()
batch_size = args.batch_size
force_train = args.force_train
force_gta = args.force_gta
lr = args.lr

print('\nInitialising Tacotron Model...\n')

# Instantiate Tacotron Model: collect the hyperparameters first, then build
# the network and move it to the GPU.
_tacotron_kwargs = dict(
    embed_dims=hp.tts_embed_dims,
    num_chars=len(symbols),
    encoder_dims=hp.tts_encoder_dims,
    decoder_dims=hp.tts_decoder_dims,
    n_mels=hp.num_mels,
    fft_bins=hp.num_mels,
    postnet_dims=hp.tts_postnet_dims,
    encoder_K=hp.tts_encoder_K,
    lstm_dims=hp.tts_lstm_dims,
    postnet_K=hp.tts_postnet_K,
    num_highways=hp.tts_num_highways,
    dropout=hp.tts_dropout,
)
model = Tacotron(**_tacotron_kwargs).cuda()

paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

# Resume from the most recent Tacotron weights.
model.restore(paths.tts_latest_weights)

# model.reset_step()

# model.set_r(hp.tts_r)

optimiser = optim.Adam(model.parameters())
def __init__(self):
    """Load hyperparameters, the vocoder and the TTS model, then print a summary.

    Settings are fixed here rather than read from the command line: WaveRNN
    vocoder, batched generation, ``hparams.py`` and the latest saved weights.

    Raises:
        argparse.ArgumentError: If the vocoder name is not recognised.
    """
    # Parse Arguments
    parser = argparse.ArgumentParser(description='TTS')
    self.args = parser.parse_args()
    self.args.vocoder = 'wavernn'
    self.args.hp_file = 'hparams.py'
    self.args.voc_weights = False
    self.args.tts_weights = False
    self.args.save_attn = False
    self.args.batched = True
    self.args.target = None
    self.args.overlap = None
    self.args.force_cpu = False
    # Read by the Griffin-Lim summary branch; previously never defined.
    self.args.iters = 32

    #================ vocoder ================#
    if self.args.vocoder in ['griffinlim', 'gl']:
        self.args.vocoder = 'griffinlim'
    elif self.args.vocoder in ['wavernn', 'wr']:
        self.args.vocoder = 'wavernn'
    else:
        # ArgumentError takes (argument, message).
        raise argparse.ArgumentError(
            None, 'Must provide a valid vocoder type!')

    hp.configure(self.args.hp_file)  # Load hparams from file

    # set defaults for any arguments that depend on hparams
    if self.args.vocoder == 'wavernn':
        if self.args.target is None:
            self.args.target = hp.voc_target
        if self.args.overlap is None:
            self.args.overlap = hp.voc_overlap
        if self.args.batched is None:
            self.args.batched = hp.voc_gen_batched

    #================ others ================#
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)
    print("hello")
    print(paths.base)

    if not self.args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    # === Wavernn === #
    if self.args.vocoder == 'wavernn':
        print('\nInitialising WaveRNN Model...\n')
        self.voc_model = WaveRNN(
            rnn_dims=hp.voc_rnn_dims,
            fc_dims=hp.voc_fc_dims,
            bits=hp.bits,
            pad=hp.voc_pad,
            upsample_factors=hp.voc_upsample_factors,
            feat_dims=hp.num_mels,
            compute_dims=hp.voc_compute_dims,
            res_out_dims=hp.voc_res_out_dims,
            res_blocks=hp.voc_res_blocks,
            hop_length=hp.hop_length,
            sample_rate=hp.sample_rate,
            mode=hp.voc_mode).to(device)

        voc_load_path = self.args.voc_weights if self.args.voc_weights else paths.voc_latest_weights
        #print(paths.voc_latest_weights)
        self.voc_model.load(voc_load_path)

    # === Tacotron === #
    if hp.tts_model == 'tacotron':
        print('\nInitialising Tacotron Model...\n')
        self.tts_model = Tacotron(
            embed_dims=hp.tts_embed_dims,
            num_chars=len(symbols),
            encoder_dims=hp.tts_encoder_dims,
            decoder_dims=hp.tts_decoder_dims,
            n_mels=hp.num_mels,
            fft_bins=hp.num_mels,
            postnet_dims=hp.tts_postnet_dims,
            encoder_K=hp.tts_encoder_K,
            lstm_dims=hp.tts_lstm_dims,
            postnet_K=hp.tts_postnet_K,
            num_highways=hp.tts_num_highways,
            dropout=hp.tts_dropout,
            stop_threshold=hp.tts_stop_threshold).to(device)

        tts_load_path = self.args.tts_weights if self.args.tts_weights else paths.tts_latest_weights
        self.tts_model.load(tts_load_path)

    # === Tacotron2 === #
    elif hp.tts_model == 'tacotron2':
        print('\nInitializing Tacotron2 Model...\n')
        self.tts_model = Tacotron2().to(device)
        tts_load_path = self.args.tts_weights if self.args.tts_weights else paths.tts_latest_weights
        self.tts_model.load(tts_load_path)

    # === Infomation === #
    # Both TTS variants print the same table, except that only Tacotron
    # reports its reduction factor `r`.
    if hp.tts_model == 'tacotron':
        label, show_r = 'Tacotron', True
    elif hp.tts_model == 'tacotron2':
        label, show_r = 'Tacotron2', False
    else:
        label, show_r = None, False

    if label is not None and self.args.vocoder in ('wavernn', 'griffinlim'):
        tts_k = self.tts_model.get_step() // 1000
        rows = [(label, str(tts_k) + 'k')]
        if show_r:
            rows.append(('r', self.tts_model.r))

        if self.args.vocoder == 'wavernn':
            voc_k = self.voc_model.get_step() // 1000
            rows += [
                ('Vocoder Type', 'WaveRNN'),
                ('WaveRNN', str(voc_k) + 'k'),
                ('Generation Mode',
                 'Batched' if self.args.batched else 'Unbatched'),
                ('Target Samples',
                 self.args.target if self.args.batched else 'N/A'),
                ('Overlap Samples',
                 self.args.overlap if self.args.batched else 'N/A'),
            ]
        else:
            rows += [('Vocoder Type', 'Griffin-Lim'),
                     ('GL Iters', self.args.iters)]
        simple_table(rows)
def main(args):
    """Train Tacotron on LJSpeech with TensorBoard logging and checkpoints.

    Creates an experiment folder from the config at ``args.config_path``,
    pickles the config to ``/tmp/<pid>_tts``, installs a Ctrl+C handler that
    removes the experiment folder, then runs ``c.epochs`` epochs. Every
    ``c.save_step`` steps it logs spectrograms, alignment and sample audio
    and, if ``c.checkpoint`` is set, saves a checkpoint. After each epoch the
    average linear loss is passed to ``save_best_model``.

    Args:
        args: Namespace with ``config_path``, ``restore_step`` and
            ``restore_path``.
    """
    # setup output paths and read configs
    c = load_config(args.config_path)
    _ = os.path.dirname(os.path.realpath(__file__))
    OUT_PATH = os.path.join(_, c.output_path)
    OUT_PATH = create_experiment_folder(OUT_PATH)
    CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints')
    shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json'))

    # save config to tmp place to be loaded by subsequent modules.
    file_name = str(os.getpid())
    tmp_path = os.path.join("/tmp/", file_name+'_tts')
    # Context manager so the handle is flushed and closed deterministically.
    with open(tmp_path, "wb") as tmp_file:
        pickle.dump(c, tmp_file)

    # setup tensorboard
    LOG_DIR = OUT_PATH
    tb = SummaryWriter(LOG_DIR)

    # Ctrl+C handler to remove empty experiment folder
    def signal_handler(signum, frame):
        print(" !! Pressed Ctrl+C !!")
        remove_experiment_folder(OUT_PATH)
        sys.exit(1)
    signal.signal(signal.SIGINT, signal_handler)

    # Setup the dataset
    dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata.csv'),
                              os.path.join(c.data_path, 'wavs'),
                              c.r,
                              c.sample_rate,
                              c.text_cleaner,
                              c.num_mels,
                              c.min_level_db,
                              c.frame_shift_ms,
                              c.frame_length_ms,
                              c.preemphasis,
                              c.ref_level_db,
                              c.num_freq,
                              c.power
                              )

    dataloader = DataLoader(dataset, batch_size=c.batch_size,
                            shuffle=True, collate_fn=dataset.collate_fn,
                            drop_last=True, num_workers=c.num_loader_workers)

    # setup the model
    model = Tacotron(c.embedding_size,
                     c.hidden_size,
                     c.num_mels,
                     c.num_freq,
                     c.r)

    # plot model on tensorboard
    # (result currently unused; the call is kept as in the original)
    dummy_input = dataset.get_dummy_data()

    ## TODO: onnx does not support RNN fully yet
    # model_proto_path = os.path.join(OUT_PATH, "model.proto")
    # onnx.export(model, dummy_input, model_proto_path, verbose=True)
    # tb.add_graph_onnx(model_proto_path)

    if use_cuda:
        model = nn.DataParallel(model.cuda())

    optimizer = optim.Adam(model.parameters(), lr=c.lr)

    if args.restore_step:
        checkpoint = torch.load(os.path.join(
            args.restore_path, 'checkpoint_%d.pth.tar' % args.restore_step))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n > Model restored from step %d\n" % args.restore_step)
        # NOTE(review): the original also read `start_epoch` and `best_loss`
        # from the checkpoint here, but the former was never used and the
        # latter was overwritten with inf before the loop. Behaviour is
        # unchanged; confirm whether restoring them was intended.
    else:
        print("\n > Starting a new training")

    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params))

    model = model.train()

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if use_cuda:
        criterion = nn.L1Loss().cuda()
    else:
        criterion = nn.L1Loss()

    # Number of linear bins given extra loss weight; presumably those below
    # ~3 kHz, assuming bins span 0..sample_rate/2 -- TODO confirm.
    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)

    #lr_scheduler = ReduceLROnPlateau(optimizer, factor=c.lr_decay,
    #                                 patience=c.lr_patience, verbose=True)
    epoch_time = 0
    best_loss = float('inf')
    for epoch in range(0, c.epochs):

        print("\n | > Epoch {}/{}".format(epoch, c.epochs))
        progbar = Progbar(len(dataset) / c.batch_size)

        for num_iter, data in enumerate(dataloader):
            start_time = time.time()

            text_input = data[0]
            text_lengths = data[1]
            linear_input = data[2]
            mel_input = data[3]

            current_step = num_iter + args.restore_step + epoch * len(dataloader) + 1

            # setup lr
            current_lr = lr_decay(c.lr, current_step)
            for params_group in optimizer.param_groups:
                params_group['lr'] = current_lr

            optimizer.zero_grad()

            # Add a single frame of zeros to Mel Specs for better end detection
            #try:
            #    mel_input = np.concatenate((np.zeros(
            #        [c.batch_size, 1, c.num_mels], dtype=np.float32),
            #        mel_input[:, 1:, :]), axis=1)
            #except:
            #    raise TypeError("not same dimension")

            # convert inputs to variables
            text_input_var = Variable(text_input)
            mel_spec_var = Variable(mel_input)
            linear_spec_var = Variable(linear_input, volatile=True)

            # sort sequence by length.
            # TODO: might be unnecessary
            sorted_lengths, indices = torch.sort(
                text_lengths.view(-1), dim=0, descending=True)
            sorted_lengths = sorted_lengths.long().numpy()

            text_input_var = text_input_var[indices]
            mel_spec_var = mel_spec_var[indices]
            linear_spec_var = linear_spec_var[indices]

            # Built on the CPU and moved with the other inputs, so the loop
            # also works when use_cuda is false.
            input_lengths_var = Variable(torch.LongTensor(sorted_lengths))

            if use_cuda:
                text_input_var = text_input_var.cuda()
                mel_spec_var = mel_spec_var.cuda()
                linear_spec_var = linear_spec_var.cuda()
                input_lengths_var = input_lengths_var.cuda()

            mel_output, linear_output, alignments =\
                model.forward(text_input_var, mel_spec_var,
                              input_lengths=input_lengths_var)

            mel_loss = criterion(mel_output, mel_spec_var)
            #linear_loss = torch.abs(linear_output - linear_spec_var)
            #linear_loss = 0.5 * \
                #torch.mean(linear_loss) + 0.5 * \
                #torch.mean(linear_loss[:, :n_priority_freq, :])
            linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \
                    + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                                      linear_spec_var[:, :, :n_priority_freq])
            loss = mel_loss + linear_loss
            # loss = loss.cuda()

            loss.backward()
            grad_norm = nn.utils.clip_grad_norm(model.parameters(), 1.)  ## TODO: maybe no need
            optimizer.step()

            step_time = time.time() - start_time
            epoch_time += step_time

            progbar.update(num_iter+1, values=[('total_loss', loss.data[0]),
                                               ('linear_loss', linear_loss.data[0]),
                                               ('mel_loss', mel_loss.data[0]),
                                               ('grad_norm', grad_norm)])

            # Plot Learning Stats
            tb.add_scalar('Loss/TotalLoss', loss.data[0], current_step)
            tb.add_scalar('Loss/LinearLoss', linear_loss.data[0],
                          current_step)
            tb.add_scalar('Loss/MelLoss', mel_loss.data[0], current_step)
            tb.add_scalar('Params/LearningRate', optimizer.param_groups[0]['lr'],
                          current_step)
            tb.add_scalar('Params/GradNorm', grad_norm, current_step)
            tb.add_scalar('Time/StepTime', step_time, current_step)

            align_img = alignments[0].data.cpu().numpy()
            align_img = plot_alignment(align_img)
            tb.add_image('Attn/Alignment', align_img, current_step)

            if current_step % c.save_step == 0:
                if c.checkpoint:
                    # save model
                    save_checkpoint(model, optimizer, linear_loss.data[0],
                                    OUT_PATH, current_step, epoch)

                # Diagnostic visualizations
                const_spec = linear_output[0].data.cpu().numpy()
                gt_spec = linear_spec_var[0].data.cpu().numpy()

                const_spec = plot_spectrogram(const_spec, dataset.ap)
                gt_spec = plot_spectrogram(gt_spec, dataset.ap)
                tb.add_image('Spec/Reconstruction', const_spec, current_step)
                tb.add_image('Spec/GroundTruth', gt_spec, current_step)

                align_img = alignments[0].data.cpu().numpy()
                align_img = plot_alignment(align_img)
                tb.add_image('Attn/Alignment', align_img, current_step)

                # Sample audio
                audio_signal = linear_output[0].data.cpu().numpy()
                dataset.ap.griffin_lim_iters = 60
                audio_signal = dataset.ap.inv_spectrogram(audio_signal.T)
                try:
                    tb.add_audio('SampleAudio', audio_signal, current_step,
                                 sample_rate=c.sample_rate)
                except Exception:
                    # Audio logging is best-effort; `Exception` keeps Ctrl+C
                    # and SystemExit from being swallowed here.
                    print("\n > Error at audio signal on TB!!")
                    print(audio_signal.max())
                    print(audio_signal.min())

        # average loss after the epoch
        avg_epoch_loss = np.mean(
            progbar.sum_values['linear_loss'][0] / max(1, progbar.sum_values['linear_loss'][1]))
        best_loss = save_best_model(model, optimizer, avg_epoch_loss,
                                    best_loss, OUT_PATH,
                                    current_step, epoch)

        #lr_scheduler.step(loss.data[0])
        tb.add_scalar('Time/EpochTime', epoch_time, epoch)
        epoch_time = 0
if batch_size % torch.cuda.device_count() != 0: raise ValueError( '`batch_size` must be evenly divisible by n_gpus!') else: device = torch.device('cpu') print('Using device:', device) # Instantiate Tacotron Model print('\nInitialising Tacotron Model...\n') model = Tacotron(embed_dims=hp.tts_embed_dims, num_chars=len(phonemes), encoder_dims=hp.tts_encoder_dims, decoder_dims=hp.tts_decoder_dims, n_mels=hp.num_mels, fft_bins=hp.num_mels, postnet_dims=hp.tts_postnet_dims, encoder_K=hp.tts_encoder_K, lstm_dims=hp.tts_lstm_dims, postnet_K=hp.tts_postnet_K, num_highways=hp.tts_num_highways, dropout=hp.tts_dropout, stop_threshold=hp.tts_stop_threshold).to(device) model_parameters = filter(lambda p: p.requires_grad, model.parameters()) params = sum([np.prod(p.size()) for p in model_parameters]) print(f'Num Params: {params}') optimizer = optim.Adam(model.parameters()) restore_checkpoint('tts', paths, model, optimizer,