def main():
    """Train a Tacotron model, resuming from the best saved checkpoint if one exists.

    Creates/cleans the checkpoint directory under ``args.logdir/<model.name>``,
    restores model/optimizer/scheduler state from the lowest-loss entry of
    ``ckpt.csv`` when present, then builds the data loaders and calls ``train``.
    """
    model = Tacotron().to(DEVICE)
    print('Model {} is working...'.format(model.name))
    print('{} threads are used...'.format(torch.get_num_threads()))
    ckpt_dir = os.path.join(args.logdir, model.name)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    # 0.933 ** 10 ~= 0.5, so the lr roughly halves every args.lr_decay_step steps.
    scheduler = StepLR(optimizer, step_size=args.lr_decay_step // 10, gamma=0.933)  # around 1/2 per decay step

    if not os.path.exists(ckpt_dir):
        os.makedirs(os.path.join(ckpt_dir, 'A', 'train'))
    elif not os.path.exists(os.path.join(ckpt_dir, 'ckpt.csv')):
        # Stale directory with no checkpoint index: wipe it and start fresh.
        shutil.rmtree(ckpt_dir)
        os.makedirs(os.path.join(ckpt_dir, 'A', 'train'))
    else:
        print('Already exists. Retrain the model.')
        ckpt = pd.read_csv(os.path.join(ckpt_dir, 'ckpt.csv'), sep=',', header=None)
        ckpt.columns = ['models', 'loss']
        ckpt = ckpt.sort_values(by='loss', ascending=True)
        # BUG FIX: sort_values keeps the original index labels, so .loc[0]
        # fetched the row originally labelled 0 — not the best model.
        # .iloc[0] takes the first row of the *sorted* frame (lowest loss).
        state = torch.load(os.path.join(ckpt_dir, ckpt.models.iloc[0]))
        model.load_state_dict(state['model'])
        args.global_step = state['global_step']
        optimizer.load_state_dict(state['optimizer'])
        scheduler.load_state_dict(state['scheduler'])

    # model = torch.nn.DataParallel(model, device_ids=list(range(args.no_gpu))).to(DEVICE)
    dataset = SpeechDataset(args.data_path, args.meta_train, model.name, mem_mode=args.mem_mode)
    validset = SpeechDataset(args.data_path, args.meta_eval, model.name, mem_mode=args.mem_mode)
    data_loader = DataLoader(dataset=dataset, batch_size=args.batch_size,
                             shuffle=True, collate_fn=collate_fn,
                             drop_last=True, pin_memory=True)
    valid_loader = DataLoader(dataset=validset, batch_size=args.test_batch,
                              shuffle=False, collate_fn=collate_fn, pin_memory=True)
    writer = SummaryWriter(ckpt_dir)
    train(model, data_loader, valid_loader, optimizer, scheduler,
          batch_size=args.batch_size, ckpt_dir=ckpt_dir, writer=writer)
    return None
def main():
    """Synthesize speech for every line of the test set using the latest checkpoint.

    Loads the newest ``model-<step>.tar`` from ``args.logdir/<model.name>``,
    restores the model weights, and writes samples under ``args.sampledir/A``.
    """
    testset = TextDataset(args.testset)
    test_loader = DataLoader(dataset=testset, batch_size=args.test_batch,
                             drop_last=False, shuffle=False,
                             collate_fn=synth_collate_fn, pin_memory=True)

    model = Tacotron().to(DEVICE)

    def _ckpt_step(path):
        # Extract the integer step from a 'model-<step>.tar' filename.
        # Assumes the glob pattern below only matches that naming scheme.
        stem = os.path.basename(path)
        return int(stem.split('-')[-1].split('.')[0])

    # BUG FIX: a plain lexicographic sort picks 'model-999.tar' over
    # 'model-1000.tar'. Sort on the numeric step to get the true latest model.
    ckpts = glob.glob(os.path.join(args.logdir, model.name, 'model-*.tar'))
    model_path = sorted(ckpts, key=_ckpt_step)[-1]  # latest model
    state = torch.load(model_path)
    model.load_state_dict(state['model'])
    args.global_step = state['global_step']
    print('The model is loaded. Step: {}'.format(args.global_step))

    model.eval()

    if not os.path.exists(os.path.join(args.sampledir, 'A')):
        os.makedirs(os.path.join(args.sampledir, 'A'))
    synthesize(model, test_loader, args.test_batch)
def main(): ap = AudioProcessor() # load model num_chars = len(phonemes) model = Tacotron(num_chars).to(device) cp = torch.load(args.model_path) model.load_state_dict(cp['model']) model.eval() print('Text: {}'.format(args.text)) wav = tts(model, args.text, ap) file_name = args.text.replace(' ', '_') + '.wav' out_path = os.path.join(args.out_path, file_name) ap.save_wav(wav, out_path)
pin_memory=hparams.pin_memory) valset = PyTorchDataset(X_val, Mel_val, Y_val) val_loader = data_utils.DataLoader(valset, batch_size=hparams.batch_size, num_workers=hparams.num_workers, shuffle=True, collate_fn=collate_fn_phonesNqF0s, pin_memory=hparams.pin_memory) # Model model = Tacotron( n_vocab=1 + len(ph_ids), embedding_dim=256, mel_dim=hparams.num_mels, linear_dim=hparams.num_freq, r=hparams.outputs_per_step, padding_idx=hparams.padding_idx, use_memory_mask=hparams.use_memory_mask, ) model = model.cuda() #model = DataParallelFix(model) optimizer = optim.Adam(model.parameters(), lr=hparams.initial_learning_rate, betas=(hparams.adam_beta1, hparams.adam_beta2), weight_decay=hparams.weight_decay) # Load checkpoint if checkpoint_path: print("Load checkpoint from: {}".format(checkpoint_path))
def main():
    """Train a TensorFlow-1.x Tacotron on LJSpeech with checkpoint restore."""
    # DataSet Loader
    if args.dataset == "ljspeech":
        from datasets.ljspeech import LJSpeech  # LJSpeech-1.1 dataset loader

        ljs = LJSpeech(path=cfg.dataset_path,
                       save_to='npy',
                       # reuse the cached .npy dump when it already exists
                       load_from=None if not os.path.exists(cfg.dataset_path + "/npy") else "npy",
                       verbose=cfg.verbose)
    else:
        raise NotImplementedError("[-] Not Implemented Yet...")

    # Train/Test split (contiguous split, no shuffling of the corpus)
    tr_size = int(len(ljs) * (1. - cfg.test_size))
    tr_text_data, va_text_data = \
        ljs.text_data[:tr_size], ljs.text_data[tr_size:]
    tr_text_len_data, va_text_len_data = \
        ljs.text_len_data[:tr_size], ljs.text_len_data[tr_size:]
    tr_mels, va_mels = ljs.mels[:tr_size], ljs.mels[tr_size:]
    tr_mags, va_mags = ljs.mags[:tr_size], ljs.mags[tr_size:]
    del ljs  # memory release

    # Data Iterator
    di = DataIterator(text=tr_text_data, text_len=tr_text_len_data,
                      mel=tr_mels, mag=tr_mags, batch_size=cfg.batch_size)

    if cfg.verbose:
        print("[*] Train/Test split : %d/%d (%.2f/%.2f)" % (tr_text_data.shape[0], va_text_data.shape[0],
                                                            1. - cfg.test_size, cfg.test_size))
        print(" Train")
        print("\ttext : ", tr_text_data.shape)
        print("\ttext_len : ", tr_text_len_data.shape)
        print("\tmels : ", tr_mels.shape)
        print("\tmags : ", tr_mags.shape)
        print(" Test")
        print("\ttext : ", va_text_data.shape)
        print("\ttext_len : ", va_text_len_data.shape)
        print("\tmels : ", va_mels.shape)
        print("\tmags : ", va_mags.shape)

    # Model Loading
    gpu_config = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False,
                            gpu_options=gpu_config)

    with tf.Session(config=config) as sess:
        if cfg.model == "Tacotron":
            model = Tacotron(sess=sess,
                             mode=args.mode,
                             sample_rate=cfg.sample_rate,
                             vocab_size=cfg.vocab_size,
                             embed_size=cfg.embed_size,
                             n_mels=cfg.n_mels,
                             n_fft=cfg.n_fft,
                             reduction_factor=cfg.reduction_factor,
                             n_encoder_banks=cfg.n_encoder_banks,
                             n_decoder_banks=cfg.n_decoder_banks,
                             n_highway_blocks=cfg.n_highway_blocks,
                             lr=cfg.lr,
                             lr_decay=cfg.lr_decay,
                             optimizer=cfg.optimizer,
                             grad_clip=cfg.grad_clip,
                             model_path=cfg.model_path)
        else:
            raise NotImplementedError("[-] Not Implemented Yet...")

        if cfg.verbose:
            print("[*] %s model is loaded!" % cfg.model)

        # Initializing
        sess.run(tf.global_variables_initializer())

        # Load model & Graph & Weights
        global_step = 0
        ckpt = tf.train.get_checkpoint_state(cfg.model_path)
        if ckpt and ckpt.model_checkpoint_path:
            # Restores from checkpoint
            model.saver.restore(sess, ckpt.model_checkpoint_path)
            # the global step is encoded as the numeric suffix of the checkpoint name
            global_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            print("[+] global step : %d" % global_step, " successfully loaded")
        else:
            print('[-] No checkpoint file found')

        start_time = time.time()
        best_loss = np.inf
        batch_size = cfg.batch_size

        # NOTE(review): this assign op is built but never passed to sess.run(),
        # so the graph-side step variable is not actually updated here — confirm intent.
        model.global_step.assign(tf.constant(global_step))
        # resume the epoch counter from the restored step
        restored_epochs = global_step // (di.text.shape[0] // batch_size)
        for epoch in range(restored_epochs, cfg.epochs):
            for text, text_len, mel, mag in di.iterate():
                batch_start = time.time()
                _, y_loss, z_loss = sess.run([model.train_op, model.y_loss, model.z_loss],
                                             feed_dict={
                                                 model.x: text,
                                                 model.x_len: text_len,
                                                 model.y: mel,
                                                 model.z: mag,
                                             })
                batch_end = time.time()

                # periodic validation / logging (continued below)
                if global_step and global_step % cfg.logging_step == 0:
                    va_y_loss, va_z_loss = 0., 0.
va_batch = 20 va_iter = len(va_text_data) for idx in range(0, va_iter, va_batch): va_y, va_z = sess.run( [model.y_loss, model.z_loss], feed_dict={ model.x: va_text_data[va_batch * idx:va_batch * (idx + 1)], model.x_len: va_text_len_data[va_batch * idx:va_batch * (idx + 1)], model.y: va_mels[va_batch * idx:va_batch * (idx + 1)], model.z: va_mags[va_batch * idx:va_batch * (idx + 1)], }) va_y_loss += va_y va_z_loss += va_z va_y_loss /= (va_iter // va_batch) va_z_loss /= (va_iter // va_batch) print( "[*] epoch %03d global step %07d [%.03f sec/step]" % (epoch, global_step, (batch_end - batch_start)), " Train \n" " y_loss : {:.6f} z_loss : {:.6f}".format( y_loss, z_loss), " Valid \n" " y_loss : {:.6f} z_loss : {:.6f}".format( va_y_loss, va_z_loss)) # summary summary = sess.run(model.merged, feed_dict={ model.x: va_text_data[:batch_size], model.x_len: va_text_len_data[:batch_size], model.y: va_mels[:batch_size], model.z: va_mags[:batch_size], }) # getting/plotting alignment (important) alignment = sess.run(model.alignments, feed_dict={ model.x: va_text_data[:batch_size], model.x_len: va_text_len_data[:batch_size], model.y: va_mels[:batch_size], }) plot_alignment(alignments=alignment, gs=global_step, path=os.path.join(cfg.model_path, "alignments")) # Summary saver model.writer.add_summary(summary, global_step) # Model save model.saver.save(sess, cfg.model_path + '%s.ckpt' % cfg.model, global_step=global_step) if va_y_loss + va_z_loss < best_loss: model.best_saver.save(sess, cfg.model_path + '%s-best_loss.ckpt' % cfg.model, global_step=global_step) best_loss = va_y_loss + va_z_loss model.global_step.assign_add(tf.constant(1)) global_step += 1 end_time = time.time() print("[+] Training Done! Elapsed {:.8f}s".format(end_time - start_time))
# ---- load checkpoint and id maps once, then build the multi-speaker model ----
file_name_suffix = args["--file-name-suffix"]

checkpoint = torch.load(checkpoint_path)
checkpoints_dir = os.path.dirname(checkpoint_path)
# phone-id map; dict(...) also accepts a JSON list of [key, value] pairs
with open(checkpoints_dir + '/ids_phones.json') as f:
    phids = dict(json.load(f))
# speaker-name -> speaker-id map (plain JSON despite the extensionless name)
with open(checkpoints_dir + '/spk_ids') as f:
    speakers_dict = json.load(f)

model = Tacotron(n_vocab=len(phids) + 1,
                 embedding_dim=256,
                 mel_dim=hparams.num_mels,
                 linear_dim=hparams.num_freq,
                 r=hparams.outputs_per_step,
                 padding_idx=hparams.padding_idx,
                 use_memory_mask=hparams.use_memory_mask,
                 num_spk=len(speakers_dict.keys())
                 )
# FIX: the checkpoint was torch.load()ed twice and ids_phones.json parsed
# twice; the redundant second loads are removed (behavior unchanged).
model.load_state_dict(checkpoint["state_dict"])
model.decoder.max_decoder_steps = max_decoder_steps

# invert the speaker map: id -> speaker name
ids2speakers = {v: k for (k, v) in speakers_dict.items()}
collate_fn=collate_fn_spk, pin_memory=hparams.pin_memory) phi_loader = data_utils.DataLoader(phiset, batch_size=hparams.batch_size, num_workers=hparams.num_workers, shuffle=True, collate_fn=collate_fn_spk, pin_memory=hparams.pin_memory) # Model theta_model = learn2learn.algorithms.MAML(Tacotron( n_vocab=1 + len(ph_ids), num_spk=2, embedding_dim=256, mel_dim=hparams.num_mels, linear_dim=hparams.num_freq, r=hparams.outputs_per_step, padding_idx=hparams.padding_idx, use_memory_mask=hparams.use_memory_mask, ), lr=0.01, allow_unused=True) theta_model = theta_model.cuda() phi_model = Tacotron( n_vocab=1 + len(ph_ids), num_spk=2, embedding_dim=256, mel_dim=hparams.num_mels, linear_dim=hparams.num_freq, r=hparams.outputs_per_step,
def main():
    """Train Tacotron on LJSpeech, saving the best model by validation loss.

    NOTE(review): ``r``, ``batch_size``, ``lr``, ``epochs``, ``device`` and
    ``writer`` are read from module scope — confirm they are defined by the
    surrounding script.
    """
    ap = AudioProcessor()
    train_dataset = TTSDataset('data/LJSpeech-1.1', 'train.list', outputs_per_step=r)
    valid_dataset = TTSDataset('data/LJSpeech-1.1', 'valid.list', outputs_per_step=r)
    print('train data:', len(train_dataset))
    print('valid data:', len(valid_dataset))

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True,
        collate_fn=train_dataset.collate_fn, drop_last=False,
        num_workers=0, pin_memory=False)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=batch_size, shuffle=False,
        collate_fn=valid_dataset.collate_fn, drop_last=False,
        num_workers=0, pin_memory=False)

    # Create models
    num_chars = len(phonemes)
    model = Tacotron(num_chars, r=r).to(device)

    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=0.0)
    # StopNet is a binary-classification task, so it gets its own optimizer
    optimizer_st = optim.Adam(model.decoder.stopnet.parameters(), lr=lr, weight_decay=0.0)

    criterion = L1LossMasked()
    criterion_st = nn.BCEWithLogitsLoss()

    num_params = count_parameters(model)
    print('Model has {} parameters'.format(num_params))

    # Training
    best_loss = float('inf')
    global_step = 0
    # NOTE: inclusive upper bound — this runs epochs + 1 iterations
    for epoch in range(0, epochs + 1):
        train_loss, global_step = train(train_loader, model, criterion, criterion_st,
                                        optimizer, optimizer_st, ap, global_step, epoch)
        valid_loss = evaluate(valid_loader, model, criterion, criterion_st, ap, global_step, epoch)
        print('Epoch [{}/{}] train_loss: {:.5f} valid_loss: {:.5f}'.format(
            epoch, epochs, train_loss, valid_loss))
        if valid_loss < best_loss:
            print(' => valid_loss improved from {:.5f} to {:.5f}!'.format(
                best_loss, valid_loss))
            new_state_dict = model.state_dict()
            state = {
                'model': new_state_dict,
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
                'linear_loss': valid_loss
            }
            best_loss = valid_loss
            best_model_path = os.path.join(writer.logdir, 'best_model.pth')
            torch.save(state, best_model_path)
map_location=lambda storage, loc: storage) args = checkpoint['args'] for i in new_args: args.__dict__[i] = new_args[i] torch.manual_seed(args.seed) if args.gpu is None: args.use_gpu = False args.gpu = [] else: args.use_gpu = True torch.cuda.manual_seed(args.seed) torch.cuda.set_device(args.gpu[0]) model = Tacotron(args) if args.init_from: model.load_state_dict(checkpoint['state_dict']) model.reset_decoder_states() print('loaded checkpoint %s' % (args.init_from)) stft = STFT(filter_length=args.n_fft) model = model.eval() if args.use_gpu: model = model.cuda() stft = stft.cuda() def main(): db = TTSDataset() collate = collate_class(use_txt=args.use_txt)
# ---- docopt-driven synthesis setup (phones + tones vocabulary) ----
checkpoint_path = args["<checkpoint>"]
text_list_file_path = args["<text_list_file>"]
dst_dir = args["<dst_dir>"]
max_decoder_steps = int(args["--max-decoder-steps"])
file_name_suffix = args["--file-name-suffix"]

checkpoints_dir = os.path.dirname(checkpoint_path)
# id maps are stored next to the checkpoint
with open(checkpoints_dir + '/ids_phones.json') as f:
    phids = json.load(f)
with open(checkpoints_dir + '/ids_tones.json') as f:
    toneids = json.load(f)
toneids = dict(toneids)

checkpoint = torch.load(checkpoint_path)
# +1 in each vocab size reserves an index (presumably padding — confirm)
model = Tacotron(n_vocab=len(phids) + 1, n_tones=1 + len(toneids))
model.load_state_dict(checkpoint["state_dict"])
#model.decoder.max_decoder_steps = max_decoder_steps
os.makedirs(dst_dir, exist_ok=True)

with open(text_list_file_path, "rb") as f:
    lines = f.readlines()
for idx, line in enumerate(lines):
    # first whitespace-separated token of each line is the utterance id
    fname = line.decode("utf-8").split()[0].zfill(8)
    # NOTE(review): shell command built by string concatenation — fname comes
    # from the input file; unsafe if that file is untrusted.
    cmd = 'cp vox/wav/' + fname + '.wav ' + dst_dir + '/' + fname + '_original.wav'
    print(cmd)
    os.system(cmd)
    # Load phones
def main(): parser = argparse.ArgumentParser(description='training script') # data load parser.add_argument('--data', type=str, default='blizzard', help='blizzard / nancy') parser.add_argument('--batch_size', type=int, default=32, help='batch size') parser.add_argument('--text_limit', type=int, default=1000, help='maximum length of text to include in training set') parser.add_argument('--wave_limit', type=int, default=1400, help='maximum length of spectrogram to include in training set') parser.add_argument('--trunc_size', type=int, default=700, help='used for truncated-BPTT when memory is not enough.') parser.add_argument('--shuffle_data', type=int, default=1, help='whether to shuffle data loader') parser.add_argument('--load_queue_size', type=int, default=8, help='maximum number of batches to load on the memory') parser.add_argument('--n_workers', type=int, default=2, help='number of workers used in data loader') # model parser.add_argument('--charvec_dim', type=int, default=256, help='') parser.add_argument('--hidden_size', type=int, default=128, help='') parser.add_argument('--dec_out_size', type=int, default=80, help='decoder output size') parser.add_argument('--post_out_size', type=int, default=1025, help='should be n_fft / 2 + 1(check n_fft from "input_specL" ') parser.add_argument('--num_filters', type=int, default=16, help='number of filters in filter bank of CBHG') parser.add_argument('--r_factor', type=int, default=5, help='reduction factor(# of multiple output)') parser.add_argument('--dropout', type=float, default=0.5, help='') # optimization parser.add_argument('--max_epochs', type=int, default=100000, help='maximum epoch to train') parser.add_argument('--grad_clip', type=float, default=1, help='gradient clipping') parser.add_argument('--learning_rate', type=float, default=1e-3, help='2e-3 from Ito, I used to use 5e-4') parser.add_argument('--lr_decay_every', type=int, default=25000, help='decay learning rate every...') 
    parser.add_argument('--lr_decay_factor', type=float, default=0.5, help='decay learning rate by this factor')
    parser.add_argument('--teacher_forcing_ratio', type=float, default=1, help='value between 0~1, use this for scheduled sampling')
    # loading
    parser.add_argument('--init_from', type=str, default='', help='load parameters from...')
    parser.add_argument('--resume', type=int, default=0, help='1 for resume from saved epoch')
    # misc
    parser.add_argument('--exp_no', type=int, default=0, help='')
    parser.add_argument('--print_every', type=int, default=-1, help='')
    parser.add_argument('--plot_every', type=int, default=-1, help='')
    parser.add_argument('--save_every', type=int, default=-1, help='')
    parser.add_argument('--save_dir', type=str, default='checkpoint', help='')
    parser.add_argument('--pinned_memory', type=int, default=1, help='1 to use pinned memory')
    parser.add_argument('--gpu', type=int, nargs='+', help='index of gpu machines to run')
    # debug
    parser.add_argument('--debug', type=int, default=0, help='1 for debug mode')
    args = parser.parse_args()

    torch.manual_seed(0)

    # set dataset option
    if args.data == 'blizzard':
        args.dir_bin = '/home/lyg0722/TTS_corpus/blizzard/segmented/bin/'
    elif args.data == 'etri':
        args.dir_bin = '/data2/lyg0722/TTS_corpus/etri/bin/'
    else:
        print('no dataset')
        return

    if args.gpu is None:
        args.use_gpu = False
        args.gpu = []
    else:
        args.use_gpu = True
        torch.cuda.manual_seed(0)
        torch.cuda.set_device(args.gpu[0])

    loader = DataLoader(args)

    # set misc options: -1 means "derive from the loader"
    args.vocab_size = loader.get_num_vocab()
    if args.print_every == -1:
        args.print_every = loader.iter_per_epoch
    if args.plot_every == -1:
        args.plot_every = args.print_every
    if args.save_every == -1:
        args.save_every = loader.iter_per_epoch * 10  # save every 10 epoch by default

    model = Tacotron(args)
    model_optim = optim.Adam(model.parameters(), args.learning_rate)
    # sum (not mean) reduction; normalization is done manually in the loop below.
    # NOTE: size_average is the legacy (pre-0.4) PyTorch reduction flag.
    criterion_mel = nn.L1Loss(size_average=False)
    criterion_lin = nn.L1Loss(size_average=False)

    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    start_epoch = 0
    iter = 1

    if args.init_from:
        checkpoint = torch.load(args.init_from,
                                map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint['state_dict'])
        if args.resume != 0:
            start_epoch = checkpoint['epoch']
            plot_losses = checkpoint['plot_losses']
        print('loaded checkpoint %s (epoch %d)' % (args.init_from, start_epoch))

    model = model.train()
    if args.use_gpu:
        model = model.cuda()
        criterion_mel = criterion_mel.cuda()
        criterion_lin = criterion_lin.cuda()

    print('Start training... (1 epoch = %s iters)' % (loader.iter_per_epoch))
    # NOTE(review): legacy (pre-0.4) PyTorch idioms throughout this loop —
    # Variable, loss.data[0], clip_grad_norm without trailing underscore.
    while iter < args.max_epochs * loader.iter_per_epoch + 1:
        if loader.is_subbatch_end:
            prev_h = (None, None, None)  # set prev_h = h_0 when new sentences are loaded

        enc_input, target_mel, target_lin, wave_lengths, text_lengths = loader.next_batch('train')
        max_wave_len = max(wave_lengths)
        enc_input = Variable(enc_input, requires_grad=False)
        target_mel = Variable(target_mel, requires_grad=False)
        target_lin = Variable(target_lin, requires_grad=False)
        # zero out hidden state entries belonging to finished sentences
        prev_h = loader.mask_prev_h(prev_h)

        model_optim.zero_grad()
        # teacher forcing: feed target_mel shifted by one frame
        pred_mel, pred_lin, prev_h = model(enc_input, target_mel[:, :-1],
                                           wave_lengths, text_lengths, prev_h)

        # manual mean-normalization over time x batch x feature dims
        loss_mel = criterion_mel(pred_mel, target_mel[:, 1:])\
            .div(max_wave_len * args.batch_size * args.dec_out_size)
        loss_linear = criterion_lin(pred_lin, target_lin[:, 1:])\
            .div(max_wave_len * args.batch_size * args.post_out_size)
        loss = torch.sum(loss_mel + loss_linear)

        loss.backward()
        nn.utils.clip_grad_norm(model.parameters(), args.grad_clip)  # gradient clipping
        model_optim.step()

        print_loss_total += loss.data[0]
        plot_loss_total += loss.data[0]

        if iter % args.print_every == 0:
            print_loss_avg = print_loss_total / args.print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / args.max_epochs),
                                         iter, iter / args.max_epochs * 100, print_loss_avg))

        if iter % args.plot_every == 0:
            plot_loss_avg = plot_loss_total / args.plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0
            save_name = '%s/%dth_exp_loss.png' % (args.save_dir, args.exp_no)
            savePlot(plot_losses, save_name)

        if iter % args.save_every == 0:
            epoch = start_epoch + iter // loader.iter_per_epoch
            save_name = '%s/%d_%dth.t7' % (args.save_dir, args.exp_no, epoch)
            state = {
                'epoch': epoch,
                'args': args,
                'state_dict': model.state_dict(),
                'optimizer': model_optim.state_dict(),
                'plot_losses': plot_losses
            }
            torch.save(state, save_name)
            print('model saved to', save_name)
            # if is_best:          # TODO: implement saving best model.
            #     shutil.copyfile(save_name, '%s/%d_best.t7' % (args.save_dir, args.exp_no))

        iter += 1
trainset, batch_size=hparams.batch_size, num_workers=hparams.num_workers, shuffle=True, collate_fn=collate_fn, pin_memory=hparams.pin_memory) valset = PyTorchDataset(X_val, Mel_val, Y_val) val_loader = data_utils.DataLoader( valset, batch_size=hparams.batch_size, num_workers=hparams.num_workers, shuffle=True, collate_fn=collate_fn, pin_memory=hparams.pin_memory) # Model model = Tacotron(n_vocab=1+ len(ph_ids), embedding_dim=256, mel_dim=hparams.num_mels, linear_dim=hparams.num_freq, r=hparams.outputs_per_step, num_attention_heads = 4, num_encoder_layers = 4, padding_idx=hparams.padding_idx, use_memory_mask=hparams.use_memory_mask, ) model = model.cuda() #model = DataParallelFix(model) optimizer = optim.Adam(model.parameters(), lr=hparams.initial_learning_rate, betas=( hparams.adam_beta1, hparams.adam_beta2), weight_decay=hparams.weight_decay) # Load checkpoint if checkpoint_path: print("Load checkpoint from: {}".format(checkpoint_path))
checkpoint_path = args["<checkpoint>"] text_list_file_path = args["<text_list_file>"] dst_dir = args["<dst_dir>"] max_decoder_steps = int(args["--max-decoder-steps"]) file_name_suffix = args["--file-name-suffix"] checkpoint = torch.load(checkpoint_path) checkpoints_dir = os.path.dirname(checkpoint_path) with open(checkpoints_dir + '/ids_phones.json') as f: phids = json.load(f) model = Tacotron(n_vocab=len(phids)+1, embedding_dim=256, mel_dim=80, linear_dim=1025, r=5, padding_idx=hparams.padding_idx, use_memory_mask=hparams.use_memory_mask, ) checkpoint = torch.load(checkpoint_path) checkpoints_dir = os.path.dirname(checkpoint_path) with open(checkpoints_dir + '/ids_phones.json') as f: phids = json.load(f) phids = dict(phids) model.load_state_dict(checkpoint["state_dict"]) model.decoder.max_decoder_steps = max_decoder_steps os.makedirs(dst_dir, exist_ok=True) with open(text_list_file_path, "rb") as f:
def main():
    """Generation script: parse CLI args, restore a trained checkpoint, and
    synthesize either a single ``--caption`` or batches from the dataset
    (the synthesis branches continue below)."""
    parser = argparse.ArgumentParser(description='training script')
    # data load
    parser.add_argument('--data', type=str, default='blizzard', help='blizzard / nancy')
    parser.add_argument('--batch_size', type=int, default=6, help='batch size')
    parser.add_argument('--text_limit', type=int, default=1500, help='maximum length of text to include in training set')
    parser.add_argument('--wave_limit', type=int, default=800, help='maximum length of spectrogram to include in training set')
    parser.add_argument('--shuffle_data', type=int, default=0, help='whether to shuffle data loader')
    parser.add_argument('--batch_idx', type=int, default=0, help='n-th batch of the dataset')
    parser.add_argument('--load_queue_size', type=int, default=1, help='maximum number of batches to load on the memory')
    parser.add_argument('--n_workers', type=int, default=1, help='number of workers used in data loader')
    # generation option
    parser.add_argument('--exp_no', type=int, default=0, help='')
    parser.add_argument('--out_dir', type=str, default='generated', help='')
    parser.add_argument('--init_from', type=str, default='', help='load parameters from...')
    parser.add_argument('--caption', type=str, default='', help='text to generate speech')
    parser.add_argument('--teacher_forcing_ratio', type=float, default=0, help='value between 0~1, use this for scheduled sampling')
    # audio related option
    parser.add_argument('--n_fft', type=int, default=2048, help='fft bin size')
    parser.add_argument('--sample_rate', type=int, default=16000, help='sampling rate')
    parser.add_argument('--frame_len_inMS', type=int, default=50, help='used to determine window size of fft')
    parser.add_argument('--frame_shift_inMS', type=int, default=12.5, help='used to determine stride in sfft')
    parser.add_argument('--num_recon_iters', type=int, default=50, help='# of iteration in griffin-lim recon')
    # misc
    parser.add_argument('--gpu', type=int, nargs='+', help='index of gpu machines to run')
    parser.add_argument('--seed', type=int, default=0, help='random seed')
    new_args = vars(parser.parse_args())

    # load and override some arguments: CLI values win over the ones
    # stored inside the checkpoint
    checkpoint = torch.load(new_args['init_from'],
                            map_location=lambda storage, loc: storage)
    args = checkpoint['args']
    for i in new_args:
        args.__dict__[i] = new_args[i]

    torch.manual_seed(args.seed)

    # set dataset option
    if args.data == 'blizzard':
        args.dir_bin = '/data2/lyg0722/TTS_corpus/blizzard/segmented/bin/'
    elif args.data == 'etri':
        args.dir_bin = '/data2/lyg0722/TTS_corpus/etri/bin/'
    else:
        print('no dataset')
        return

    if args.gpu is None:
        args.use_gpu = False
        args.gpu = []
    else:
        args.use_gpu = True
        # BUG FIX: the CPU RNG is seeded with args.seed but the CUDA RNG was
        # hard-coded to 0, so --seed had no effect on GPU-side sampling.
        torch.cuda.manual_seed(args.seed)
        torch.cuda.set_device(args.gpu[0])

    model = Tacotron(args)
    criterion_mel = nn.L1Loss(size_average=False)
    criterion_lin = nn.L1Loss(size_average=False)

    # STFT window/hop sizes in samples, from their millisecond settings
    window_len = int(np.ceil(args.frame_len_inMS * args.sample_rate / 1000))
    hop_length = int(np.ceil(args.frame_shift_inMS * args.sample_rate / 1000))

    if args.init_from:
        model.load_state_dict(checkpoint['state_dict'])
        print('loaded checkpoint %s' % (args.init_from))

    model = model.eval()
    if args.use_gpu:
        model = model.cuda()
        criterion_mel = criterion_mel.cuda()
        criterion_lin = criterion_lin.cuda()

    if args.caption:
        # single-utterance path: encode the caption with the training vocab
        text_raw = args.caption
        if args.data == 'etri':
            text_raw = decompose_hangul(text_raw)  # For Korean dataset
        vocab_dict = torch.load(args.dir_bin + 'vocab.t7')
        enc_input = [vocab_dict[i] for i in text_raw]
        enc_input = enc_input + [0]  # null-padding at tail
        text_lengths = [len(enc_input)]
        enc_input = Variable(torch.LongTensor(enc_input).view(1, -1))
        dec_input = torch.Tensor(1, 1, args.dec_out_size).fill_(0)  # null-padding for start flag
        dec_input = Variable(dec_input)
        wave_lengths = [args.wave_limit]  # TODO: use <EOS> later...
        prev_h = (None, None, None)  # set prev_h = h_0 when new sentences are loaded
        # NOTE(review): elsewhere this script tests args.use_gpu; here it tests
        # args.gpu (the device-id list) — same truthiness in practice, confirm.
        if args.gpu:
            enc_input = enc_input.cuda()
            dec_input = dec_input.cuda()
        # start generation
        _, pred_lin, prev_h = model(enc_input, dec_input, wave_lengths, text_lengths, prev_h)

        # Griffin-Lim reconstruction from the predicted linear spectrogram
        wave = spectrogram2wav(
            pred_lin.data.view(-1, args.post_out_size).cpu().numpy(),
            n_fft=args.n_fft,
            win_length=window_len,
            hop_length=hop_length,
            num_iters=args.num_recon_iters
        )

        # write to file
        outpath1 = '%s/%s_%s.wav' % (args.out_dir, args.exp_no, args.caption)
        outpath2 = '%s/%s_%s.png' % (args.out_dir, args.exp_no, args.caption)
        librosa.output.write_wav(outpath1, wave, 16000)
        saveAttention(text_raw, torch.cat(model.attn_weights, dim=-1).squeeze(), outpath2)
    else:
        # dataset path: regenerate one batch (args.batch_idx) from the corpus
        loader = DataLoader(args)
        args.vocab_size = loader.get_num_vocab()
        for iter in range(1, loader.iter_per_epoch + 1):
            if loader.is_subbatch_end:
                prev_h = (None, None, None)  # set prev_h = h_0 when new sentences are loaded
            # skip forward to the requested batch
            for i in range(args.batch_idx):
                loader.next_batch('train')
            enc_input, target_mel, target_lin, wave_lengths, text_lengths = loader.next_batch('train')
            # NOTE(review): volatile=True is the legacy (pre-0.4) no-grad flag
            enc_input = Variable(enc_input, volatile=True)
            target_mel = Variable(target_mel, volatile=True)
            target_lin = Variable(target_lin, volatile=True)
            prev_h = loader.mask_prev_h(prev_h)

            if args.gpu:
                enc_input = enc_input.cuda()
                target_mel = target_mel.cuda()
                target_lin = target_lin.cuda()

            pred_mel, pred_lin, prev_h = model(enc_input, target_mel[:, :-1],
                                               wave_lengths, text_lengths, prev_h)

            loss_mel = criterion_mel(pred_mel, target_mel[:, 1:]) \
                .div(max(wave_lengths) * args.batch_size * args.dec_out_size)
            loss_linear = criterion_lin(pred_lin, target_lin[:, 1:]) \
                .div(max(wave_lengths) * args.batch_size * args.post_out_size)
            loss = torch.sum(loss_mel + loss_linear)
            print('loss:' , loss.data[0])

            attentions = torch.cat(model.attn_weights, dim=-1)

            # write each item of the batch to file
            for n in range(enc_input.size(0)):
                wave = spectrogram2wav(
                    pred_lin.data[n].view(-1, args.post_out_size).cpu().numpy(),
                    n_fft=args.n_fft,
                    win_length=window_len,
                    hop_length=hop_length,
                    num_iters=args.num_recon_iters
                )
                outpath1 = '%s/%s_%s_%s.wav' % (args.out_dir, args.exp_no, n, args.caption)
                librosa.output.write_wav(outpath1, wave, 16000)
                outpath2 = '%s/%s_%s_%s.png' % (args.out_dir, args.exp_no, n, args.caption)
                saveAttention(None, attentions[n], outpath2)
            # showPlot(plot_losses)
            break
args = docopt(__doc__) print("Command line args:\n", args) checkpoint_path = args["<checkpoint>"] text_list_file_path = args["<text_list_file>"] dst_dir = args["<dst_dir>"] max_decoder_steps = int(args["--max-decoder-steps"]) file_name_suffix = args["--file-name-suffix"] #checkpoint = torch.load(checkpoint_path) checkpoints_dir = os.path.dirname(checkpoint_path) with open(checkpoints_dir + '/ids_phones.json') as f: phids = json.load(f) model = Tacotron(n_vocab=len(phids) + 1) checkpoint = torch.load(checkpoint_path) #checkpoints_dir = os.path.dirname(checkpoint_path) with open(checkpoints_dir + '/ids_phones.json') as f: phids = json.load(f) phids = dict(phids) model.load_state_dict(checkpoint["state_dict"]) #model.decoder.max_decoder_steps = max_decoder_steps os.makedirs(dst_dir, exist_ok=True) with open(text_list_file_path, "rb") as f: lines = f.readlines() for idx, line in enumerate(lines):
def main():
    """Train the disentangling Tacotron on KEspeech with wandb logging.

    Builds the CLI, prepares the save directory, optionally warm-starts from a
    checkpoint (shape-mismatched weights are kept from the fresh model), and
    runs the epoch/iteration training loop.
    """
    parser = argparse.ArgumentParser(description='training script')
    # Mandatory arguments
    parser.add_argument('--data', type=str, default='KEspeech', help='dataset type')
    parser.add_argument('-m', '--message', type=str, help='')
    # data load
    parser.add_argument('--batch_size', type=int, default=32, help='batch size')
    # model
    parser.add_argument('--charvec_dim', type=int, default=256, help='')
    parser.add_argument('--hidden_size', type=int, default=128, help='')
    parser.add_argument('--dec_out_size', type=int, default=80, help='decoder output size')
    parser.add_argument('--post_out_size', type=int, default=1025, help='should be n_fft / 2 + 1(check n_fft from "input_specL" ')
    parser.add_argument('--style_embed_size', type=int, default=32, help='should be n_fft / 2 + 1(check n_fft from "input_specL" ')
    parser.add_argument('--num_filters', type=int, default=16, help='number of filters in filter bank of CBHG')
    parser.add_argument('--r_factor', type=int, default=5, help='reduction factor(# of multiple output)')
    parser.add_argument('--use_txt', type=float, default=0.5, help='0~1, higher value means y_t batch is more sampled')
    # optimization
    parser.add_argument('--max_epochs', type=int, default=100000, help='maximum epoch to train')
    parser.add_argument('--grad_clip', type=float, default=1., help='gradient clipping')
    parser.add_argument('--learning_rate', type=float, default=1e-3, help='2e-3 from Ito, I used to use 5e-4')
    parser.add_argument('--teacher_forcing_ratio', type=float, default=1, help='value between 0~1, use this for scheduled sampling')
    # loading
    parser.add_argument('--init_from', type=str, default='', help='load parameters from...')
    parser.add_argument('--resume', type=int, default=0, help='1 for resume from saved epoch')
    # misc
    parser.add_argument('--print_every', type=int, default=10, help='')
    parser.add_argument('--save_every', type=int, default=10, help='')
    parser.add_argument('--save_dir', type=str, default='result', help='')
    parser.add_argument('-g', '--gpu', type=int, nargs='+', help='index of gpu machines to run')
    args = parser.parse_args()

    torch.manual_seed(0)
    kwargs = {'num_workers': 0, 'pin_memory': True}

    if args.gpu is None:
        args.use_gpu = False
        args.gpu = []
    else:
        args.use_gpu = True
        torch.cuda.manual_seed(0)
        torch.cuda.set_device(args.gpu[0])

    if args.data == 'KEspeech':
        dataset = TTSDataset()
    print('[*] Dataset: {}'.format(args.data))

    assert args.message is not None, "You have to set message"
    # save directory is keyed by date + run message
    today = time.strftime('%y%m%d')
    savepath = join('result', '{}_{}'.format(today, args.message))
    if not exists(savepath):
        os.makedirs(savepath)
    elif args.message=='test':
        # throwaway test runs are silently wiped
        os.system("rm -rf {}/*".format(savepath))
    else:
        input("Path already exists, wish to continue?")
        os.system("rm -rf {}/*".format(savepath))
        os.system("rm -rf wandb/*{}*{}*".format(today, args.message))

    # NOTE(review): hard-coded 0.5 here ignores the --use_txt CLI option — confirm.
    collate = collate_class(use_txt=0.5)
    loader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, \
                                         shuffle=True, collate_fn=collate.fn, drop_last=True, **kwargs)

    # set misc options
    args.vocab_size = dataset.get_vocab_size()
    args.gender_num = len(dataset.gen_lu)
    args.age_num = len(dataset.age_lu)
    args.emotion_num = len(dataset.emo_lu)

    # model define
    model = Tacotron(args)
    model_optim = optim.Adam(model.parameters(), args.learning_rate)
    scheduler = lr_scheduler.StepLR(model_optim, step_size=10)
    criterion_mel = nn.L1Loss()
    criterion_lin = nn.L1Loss()

    # wandb
    wandb.init(project='disentangle_tts', name=args.message)
    wandb.config['hostname'] = os.uname()[1]
    wandb.config.update(args)
    wandb.watch(model)

    # snapshot the model description, args, and source files into the run dir
    with open(join(savepath, 'model.txt'), 'w') as f:
        f.write(str(model))
    torch.save(args, join(savepath, 'arg.pt'))
    os.system('cp *.py {}'.format(savepath))

    start = time.time()
    iter_per_epoch = len(dataset)//args.batch_size
    losses = []
    loss_total = 0
    start_epoch = 0
    it = 1

    if args.init_from:
        checkpoint = torch.load(args.init_from,
                                map_location=lambda storage, loc: storage)
        # keep checkpoint weights whose name AND shape match the fresh model;
        # shape mismatches fall back to the fresh weights, extras are dropped,
        # and missing keys are filled from the fresh model.
        pretrained_weight = checkpoint['state_dict'].copy()
        our_state_dict = model.state_dict()
        for k, v in checkpoint['state_dict'].items():
            if k in our_state_dict.keys():
                if checkpoint['state_dict'][k].shape != our_state_dict[k].shape:
                    pretrained_weight[k] = our_state_dict[k]
            else:
                del pretrained_weight[k]
        for k, v in our_state_dict.items():
            if k not in pretrained_weight.keys():
                pretrained_weight[k] = v
        for name, param in model.named_parameters():
            print('{}\t{}'.format(name, param.requires_grad))
        if args.resume:
            start_epoch = checkpoint['epoch']
            model_optim.load_state_dict(checkpoint['optimizer'])
            # NOTE(review): restored into `plot_losses` but the loop appends to
            # `losses` — the restored history is never used; confirm intent.
            plot_losses = checkpoint['plot_losses']
        print('loaded checkpoint %s (epoch %d)' % (args.init_from, start_epoch))

    epoch = start_epoch
    model = model.train()
    if args.use_gpu:
        model = model.cuda()
        criterion_mel = criterion_mel.cuda()
        criterion_lin = criterion_lin.cuda()

    print('Start training... {} iter per epoch'.format(iter_per_epoch))
    for epoch in range(args.max_epochs):
        for it, this_batch in enumerate(loader):
            start_it = time.time()
            if args.use_gpu:
                for k, v in this_batch.items():
                    # non-tensor entries (no .cuda()) are left on the CPU as-is
                    try:
                        this_batch[k] = Variable(v.cuda(), requires_grad=False)
                    except AttributeError:
                        pass

            # manual lr schedule (the StepLR scheduler above is unused)
            for param_group in model_optim.param_groups:
                param_group['lr'] = decay_learning_rate(args.learning_rate, it, iter_per_epoch, start_epoch)

            model.reset_decoder_states()
            model.mask_decoder_states()
            model_optim.zero_grad()

            pred_mel, pred_lin, att = model(**this_batch)

            loss_mel = criterion_mel(pred_mel, this_batch['target_mel'])
            loss_linear = criterion_lin(pred_lin, this_batch['lin'])
            loss = loss_mel + loss_linear
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
            model_optim.step()
            #scheduler.step()

            losses.append(loss.data.item())
            loss_total += loss.data.item()

            if it % args.print_every == 0:
                seen_it = iter_per_epoch * epoch + it
                seen_samples = epoch * len(loader.dataset) + it * args.batch_size
                seen_epochs = seen_samples / float(len(loader.dataset))
                print('epoch: {:2d} iter: {:3d} loss: {:5.3f} elapsed: {} periter: {:4.2f}s'.format(
                    epoch, it,
                    np.mean(losses[-args.print_every:]),
                    asHMS(time.time()-start),
                    time.time()-start_it))
                log_dict = {
                    'epoch/train': seen_epochs,
                    'mel_loss/train': loss_mel,
                    'lin_loss/train': loss_linear,
                    'total_loss/train': loss,
                    'att': wandb.Image(torch.cat(att, dim=-1)[0].detach().cpu().numpy().T,
                                       caption='Attention graph'),
                }
                wandb.log(log_dict, step=seen_it)

        if epoch % args.save_every == 0:
            save_name = '{}/model_{}th.pt'.format(savepath, epoch)
            state = {
                'epoch': epoch,
                'args': args,
                'state_dict': model.state_dict(),
                'optimizer': model_optim.state_dict(),
                'plot_losses': losses
            }
            torch.save(state, save_name)
            print('model saved to', save_name)