def main():
    testset = TextDataset(args.testset)
    test_loader = DataLoader(dataset=testset,
                             batch_size=args.test_batch,
                             drop_last=False,
                             shuffle=False,
                             collate_fn=synth_collate_fn,
                             pin_memory=True)

    model = Tacotron().to(DEVICE)
    # Restore the latest checkpoint (lexicographically last 'model-*.tar').
    model_path = sorted(glob.glob(os.path.join(args.logdir, model.name, 'model-*.tar')))[-1]
    state = torch.load(model_path)
    model.load_state_dict(state['model'])
    args.global_step = state['global_step']
    print('The model is loaded. Step: {}'.format(args.global_step))

    model.eval()
    if not os.path.exists(os.path.join(args.sampledir, 'A')):
        os.makedirs(os.path.join(args.sampledir, 'A'))
    synthesize(model, test_loader, args.test_batch)
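# `synth_collate_fn` is defined elsewhere in the repo.  As a rough sketch of
# what a synthesis-time collate function typically does -- zero-pad the
# variable-length text-id sequences of a batch into one LongTensor -- it
# might look like the following (the texts-only batch layout is an
# assumption; the real function may return additional fields):
import numpy as np
import torch

def synth_collate_fn(batch):
    # batch: list of 1-D integer sequences, one per utterance
    max_len = max(len(text) for text in batch)
    texts = np.zeros((len(batch), max_len), dtype=np.int64)  # 0 = pad id
    for i, text in enumerate(batch):
        texts[i, :len(text)] = text
    return torch.from_numpy(texts)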
def main():
    ap = AudioProcessor()

    # load model
    num_chars = len(phonemes)
    model = Tacotron(num_chars).to(device)
    cp = torch.load(args.model_path)
    model.load_state_dict(cp['model'])
    model.eval()

    # Synthesize and write the waveform to disk.
    print('Text: {}'.format(args.text))
    wav = tts(model, args.text, ap)
    file_name = args.text.replace(' ', '_') + '.wav'
    out_path = os.path.join(args.out_path, file_name)
    ap.save_wav(wav, out_path)
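# A rough sketch of the `tts` helper used above, assuming a
# `phoneme_to_sequence` text frontend and an `ap.inv_spectrogram`
# Griffin-Lim inverter (both names and the model's output signature are
# assumptions; adjust to the repo's actual inference API):
import torch

def tts(model, text, ap):
    seq = phoneme_to_sequence(text)  # text -> phoneme ids (assumed helper)
    chars = torch.LongTensor(seq).unsqueeze(0).to(device)  # add batch dim
    with torch.no_grad():
        mel_spec, linear_spec, alignments, stop_tokens = model(chars)
    # Invert the predicted linear spectrogram back to a waveform.
    return ap.inv_spectrogram(linear_spec[0].cpu().numpy().T)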
if args.gpu is None:
    args.use_gpu = False
    args.gpu = []
else:
    args.use_gpu = True
    torch.cuda.manual_seed(args.seed)
    torch.cuda.set_device(args.gpu[0])

model = Tacotron(args)
if args.init_from:
    model.load_state_dict(checkpoint['state_dict'])
    model.reset_decoder_states()
    print('loaded checkpoint %s' % args.init_from)

stft = STFT(filter_length=args.n_fft)
model = model.eval()
if args.use_gpu:
    model = model.cuda()
    stft = stft.cuda()

def main():
    db = TTSDataset()
    collate = collate_class(use_txt=args.use_txt)
    loader = torch.utils.data.DataLoader(db,
                                         batch_size=1,
                                         shuffle=False,
                                         collate_fn=collate.fn,
                                         drop_last=True)
    model_name = args.init_from.split('/')[-1][:-3]  # strip directory and extension
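# The `[:-3]` slice above assumes a three-character suffix such as '.t7' and
# '/' as the path separator.  A hypothetical helper with the same intent,
# but independent of extension length and platform, could use os.path:
import os

def checkpoint_name(path):
    # 'logs/model.t7' -> 'model'
    return os.path.splitext(os.path.basename(path))[0]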
def main():
    parser = argparse.ArgumentParser(description='training script')
    # data load
    parser.add_argument('--data', type=str, default='blizzard', help='blizzard / etri')
    parser.add_argument('--batch_size', type=int, default=6, help='batch size')
    parser.add_argument('--text_limit', type=int, default=1500, help='maximum length of text to include in training set')
    parser.add_argument('--wave_limit', type=int, default=800, help='maximum length of spectrogram to include in training set')
    parser.add_argument('--shuffle_data', type=int, default=0, help='whether to shuffle data loader')
    parser.add_argument('--batch_idx', type=int, default=0, help='n-th batch of the dataset')
    parser.add_argument('--load_queue_size', type=int, default=1, help='maximum number of batches to keep in memory')
    parser.add_argument('--n_workers', type=int, default=1, help='number of workers used in data loader')
    # generation option
    parser.add_argument('--exp_no', type=int, default=0, help='experiment number (used in output file names)')
    parser.add_argument('--out_dir', type=str, default='generated', help='output directory')
    parser.add_argument('--init_from', type=str, default='', help='load parameters from...')
    parser.add_argument('--caption', type=str, default='', help='text to generate speech from')
    parser.add_argument('--teacher_forcing_ratio', type=float, default=0, help='value between 0 and 1; use this for scheduled sampling')
    # audio related option
    parser.add_argument('--n_fft', type=int, default=2048, help='fft bin size')
    parser.add_argument('--sample_rate', type=int, default=16000, help='sampling rate')
    parser.add_argument('--frame_len_inMS', type=int, default=50, help='used to determine window size of fft')
    parser.add_argument('--frame_shift_inMS', type=float, default=12.5, help='used to determine stride in stft')
    parser.add_argument('--num_recon_iters', type=int, default=50, help='# of iterations in Griffin-Lim reconstruction')
    # misc
    parser.add_argument('--gpu', type=int, nargs='+', help='index of gpu machines to run')
    parser.add_argument('--seed', type=int, default=0, help='random seed')
    new_args = vars(parser.parse_args())

    # Load the checkpoint on CPU, then override its stored args with the CLI values.
    checkpoint = torch.load(new_args['init_from'], map_location=lambda storage, loc: storage)
    args = checkpoint['args']
    for i in new_args:
        args.__dict__[i] = new_args[i]

    torch.manual_seed(args.seed)

    # set dataset option
    if args.data == 'blizzard':
        args.dir_bin = '/data2/lyg0722/TTS_corpus/blizzard/segmented/bin/'
    elif args.data == 'etri':
        args.dir_bin = '/data2/lyg0722/TTS_corpus/etri/bin/'
    else:
        print('no dataset')
        return

    if args.gpu is None:
        args.use_gpu = False
        args.gpu = []
    else:
        args.use_gpu = True
        torch.cuda.manual_seed(0)
        torch.cuda.set_device(args.gpu[0])

    model = Tacotron(args)
    criterion_mel = nn.L1Loss(size_average=False)
    criterion_lin = nn.L1Loss(size_average=False)

    window_len = int(np.ceil(args.frame_len_inMS * args.sample_rate / 1000))
    hop_length = int(np.ceil(args.frame_shift_inMS * args.sample_rate / 1000))

    if args.init_from:
        model.load_state_dict(checkpoint['state_dict'])
        print('loaded checkpoint %s' % args.init_from)

    model = model.eval()
    if args.use_gpu:
        model = model.cuda()
        criterion_mel = criterion_mel.cuda()
        criterion_lin = criterion_lin.cuda()

    if args.caption:
        # Free-form generation from a single caption.
        text_raw = args.caption
        if args.data == 'etri':
            text_raw = decompose_hangul(text_raw)  # for the Korean dataset
        vocab_dict = torch.load(args.dir_bin + 'vocab.t7')
        enc_input = [vocab_dict[i] for i in text_raw]
        enc_input = enc_input + [0]  # null-padding at tail
        text_lengths = [len(enc_input)]
        enc_input = Variable(torch.LongTensor(enc_input).view(1, -1))
        dec_input = torch.Tensor(1, 1, args.dec_out_size).fill_(0)  # null frame as start flag
        dec_input = Variable(dec_input)
        wave_lengths = [args.wave_limit]  # TODO: use <EOS> later...
        prev_h = (None, None, None)  # set prev_h = h_0 when new sentences are loaded

        if args.gpu:
            enc_input = enc_input.cuda()
            dec_input = dec_input.cuda()

        # start generation
        _, pred_lin, prev_h = model(enc_input, dec_input, wave_lengths, text_lengths, prev_h)

        wave = spectrogram2wav(
            pred_lin.data.view(-1, args.post_out_size).cpu().numpy(),
            n_fft=args.n_fft,
            win_length=window_len,
            hop_length=hop_length,
            num_iters=args.num_recon_iters)

        # write to file
        outpath1 = '%s/%s_%s.wav' % (args.out_dir, args.exp_no, args.caption)
        outpath2 = '%s/%s_%s.png' % (args.out_dir, args.exp_no, args.caption)
        librosa.output.write_wav(outpath1, wave, 16000)
        saveAttention(text_raw, torch.cat(model.attn_weights, dim=-1).squeeze(), outpath2)
    else:
        # Reconstruction from the training set.
        loader = DataLoader(args)
        args.vocab_size = loader.get_num_vocab()

        for iter in range(1, loader.iter_per_epoch + 1):
            if loader.is_subbatch_end:
                prev_h = (None, None, None)  # set prev_h = h_0 when new sentences are loaded
            for i in range(args.batch_idx):
                loader.next_batch('train')  # skip to the requested batch
            enc_input, target_mel, target_lin, wave_lengths, text_lengths = loader.next_batch('train')
            # Variable/volatile is the legacy (pre-0.4) PyTorch inference API.
            enc_input = Variable(enc_input, volatile=True)
            target_mel = Variable(target_mel, volatile=True)
            target_lin = Variable(target_lin, volatile=True)
            prev_h = loader.mask_prev_h(prev_h)

            if args.gpu:
                enc_input = enc_input.cuda()
                target_mel = target_mel.cuda()
                target_lin = target_lin.cuda()

            pred_mel, pred_lin, prev_h = model(enc_input, target_mel[:, :-1], wave_lengths, text_lengths, prev_h)

            loss_mel = criterion_mel(pred_mel, target_mel[:, 1:]) \
                .div(max(wave_lengths) * args.batch_size * args.dec_out_size)
            loss_linear = criterion_lin(pred_lin, target_lin[:, 1:]) \
                .div(max(wave_lengths) * args.batch_size * args.post_out_size)
            loss = torch.sum(loss_mel + loss_linear)
            print('loss:', loss.data[0])  # .data[0]: legacy scalar access (use .item() on PyTorch >= 0.4)

            attentions = torch.cat(model.attn_weights, dim=-1)

            # write to file
            for n in range(enc_input.size(0)):
                wave = spectrogram2wav(
                    pred_lin.data[n].view(-1, args.post_out_size).cpu().numpy(),
                    n_fft=args.n_fft,
                    win_length=window_len,
                    hop_length=hop_length,
                    num_iters=args.num_recon_iters)
                outpath1 = '%s/%s_%s_%s.wav' % (args.out_dir, args.exp_no, n, args.caption)
                librosa.output.write_wav(outpath1, wave, 16000)
                outpath2 = '%s/%s_%s_%s.png' % (args.out_dir, args.exp_no, n, args.caption)
                saveAttention(None, attentions[n], outpath2)
            # showPlot(plot_losses)
            break  # generate only the first (selected) batch
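# `spectrogram2wav` comes from the repo's utilities.  A minimal sketch using
# librosa's built-in Griffin-Lim, assuming `spec` is a linear-magnitude
# spectrogram of shape (frames, 1 + n_fft // 2) with no dB conversion or
# normalization left to undo:
import librosa

def spectrogram2wav(spec, n_fft, win_length, hop_length, num_iters):
    mag = spec.T  # librosa expects (1 + n_fft // 2, frames)
    # n_fft is implied by the number of frequency bins, so it is not passed.
    return librosa.griffinlim(mag,
                              n_iter=num_iters,
                              hop_length=hop_length,
                              win_length=win_length)

# Note: librosa.output.write_wav was removed in librosa 0.8; on newer
# versions, use soundfile.write(outpath, wave, samplerate) instead.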