Exemplo n.º 1
0
def synthesis(text, args):
    m = Model()
    m_post = ModelPostNet()

    m.load_state_dict(load_checkpoint(args.restore_step1, "transformer"))
    m_post.load_state_dict(load_checkpoint(args.restore_step2, "postnet"))

    text = np.asarray(text_to_sequence(text, [hp.cleaners]))
    text = t.LongTensor(text).unsqueeze(0)
    text = text.cuda()
    mel_input = t.zeros([1,1, 80]).cuda()
    pos_text = t.arange(1, text.size(1)+1).unsqueeze(0)
    pos_text = pos_text.cuda()

    m=m.cuda()
    m_post = m_post.cuda()
    m.train(False)
    m_post.train(False)
    
    pbar = tqdm(range(args.max_len))
    with t.no_grad():
        for i in pbar:
            pos_mel = t.arange(1,mel_input.size(1)+1).unsqueeze(0).cuda()
            mel_pred, postnet_pred, attn, stop_token, _, attn_dec = m.forward(text, mel_input, pos_text, pos_mel)
            mel_input = t.cat([t.zeros([1,1, 80]).cuda(),postnet_pred], dim=1)

        mag_pred = m_post.forward(postnet_pred)
        
    wav = spectrogram2wav(mag_pred.squeeze(0).cpu().numpy())
    write(hp.sample_path + "/test.wav", hp.sr, wav)
Exemplo n.º 2
0
def synthesis(text, num):
    m = Model()
    # m_post = ModelPostNet()

    m.load_state_dict(load_checkpoint(num, "transformer"))
    # m_post.load_state_dict(load_checkpoint(args.restore_step2, "postnet"))

    text = np.asarray(text_to_sequence(text, [hp.cleaners]))
    text = t.LongTensor(text).unsqueeze(0)
    text = text.cuda()
    mel_input = t.zeros([1, 1, 80]).cuda()
    pos_text = t.arange(1, text.size(1) + 1).unsqueeze(0)
    pos_text = pos_text.cuda()

    m = m.cuda()
    # m_post = m_post.cuda()
    m.train(False)
    # m_post.train(False)

    # pbar = tqdm(range(args.max_len))
    with t.no_grad():
        for _ in range(1000):
            pos_mel = t.arange(1, mel_input.size(1) + 1).unsqueeze(0).cuda()
            mel_pred, postnet_pred, attn, stop_token, _, attn_dec = m.forward(
                text, mel_input, pos_text, pos_mel)
            mel_input = t.cat([mel_input, postnet_pred[:, -1:, :]], dim=1)

        # mag_pred = m_post.forward(postnet_pred)

    # wav = spectrogram2wav(mag_pred.squeeze(0).cpu().numpy())
    mel_postnet = postnet_pred[0].cpu().numpy().T
    plot_data([mel_postnet for _ in range(2)])
    wav = audio.inv_mel_spectrogram(mel_postnet)
    wav = wav[0:audio.find_endpoint(wav)]
    audio.save_wav(wav, "result.wav")
Exemplo n.º 3
0
def synthesis(text, args):
    m = Model()
    m.load_state_dict(load_checkpoint(args.restore_path))
    print("[%s][%s] Synthesizing:" % (args.lang, args.spk), text)

    text = np.asarray([1] + list(text.encode('utf-8')) + [2])
    text = t.LongTensor(text).unsqueeze(0)
    text = text
    mel_input = t.zeros([1, 1, 80])
    pos_text = t.arange(1, text.size(1) + 1).unsqueeze(0)
    pos_text = pos_text
    lang_to_id = json.load(open(os.path.join(args.data_path, 'lang_id.json')))
    spk_to_id = json.load(open(os.path.join(args.data_path, 'spk_id.json')))
    lang_id = lang_to_id[args.lang]
    spk_id = spk_to_id[args.spk]

    lang_id = t.LongTensor([lang_id])
    spk_id = t.LongTensor([spk_id])
    m.train(False)
    pbar = tqdm(range(args.max_len))
    with t.no_grad():
        for i in pbar:
            pos_mel = t.arange(1, mel_input.size(1) + 1).unsqueeze(0)
            mel_pred, postnet_pred, attn, stop_token, _, attn_dec = \
                m.forward(text, mel_input, pos_text, pos_mel, lang_id, spk_id)
            mel_input = t.cat([mel_input, mel_pred[:, -1:, :]], dim=1)
            if stop_token[:, -1].item() > 0:
                break

    mel = postnet_pred.squeeze(0).cpu().numpy()
    wav = mel2wav(mel)
    np.save(args.out_path + "_mel.npy", mel)
    write(args.out_path + ".wav", hp.sr, wav)
    plot_mel(args.out_path + "_mel.png", mel)
    plot_attn(attn, args.out_path + '_align.png')
Exemplo n.º 4
0
def synthesis(text, args, num):
    m = Model()
    m_post = ModelPostNet()

    m.load_state_dict(load_checkpoint(args.restore_step1, "transformer"))
    m_post.load_state_dict(load_checkpoint(args.restore_step2, "postnet"))

    text = np.asarray(text_to_sequence(text, [hp.cleaners]))
    text = t.LongTensor(text).unsqueeze(0)
    text = text.cuda()
    mel_input = t.zeros([1, 1, 80]).cuda()
    pos_text = t.arange(1, text.size(1) + 1).unsqueeze(0)
    pos_text = pos_text.cuda()

    m = m.cuda()
    m_post = m_post.cuda()
    m.train(False)
    m_post.train(False)

    pbar = tqdm(range(args.max_len))
    with t.no_grad():
        for i in pbar:
            pos_mel = t.arange(1, mel_input.size(1) + 1).unsqueeze(0).cuda()
            mel_pred, postnet_pred, attn, stop_token, _, attn_dec = m.forward(
                text, mel_input, pos_text, pos_mel)
            # print('mel_pred==================',mel_pred.shape)
            # print('postnet_pred==================', postnet_pred.shape)
            mel_input = t.cat([mel_input, postnet_pred[:, -1:, :]], dim=1)
            #print(postnet_pred[:, -1:, :])
            #print(t.argmax(attn[1][1][i]).item())
            #print('mel_input==================', mel_input.shape)

    # #直接用真实mel测试postnet效果
    #aa = t.from_numpy(np.load('D:\SSHdownload\\000101.pt.npy')).cuda().unsqueeze(0)
    # # print(aa.shape)
    mag_pred = m_post.forward(postnet_pred)
    #real_mag = t.from_numpy((np.load('D:\SSHdownload\\003009.mag.npy'))).cuda().unsqueeze(dim=0)
    #wav = spectrogram2wav(postnet_pred)

    #print('shappe============',attn[2][0].shape)
    # count = 0
    # for j in range(4):
    #     count += 1
    #     attn1 = attn[0][j].cpu()
    #     plot_alignment(attn1, path='./training_loss/'+ str(args.restore_step1)+'_'+str(count)+'_'+'S'+str(num)+'.png', title='sentence'+str(num))

    attn1 = attn[0][1].cpu()
    plot_alignment(attn1,
                   path='./training_loss/' + str(args.restore_step1) + '_' +
                   'S' + str(num) + '.png',
                   title='sentence' + str(num))

    wav = spectrogram2wav(mag_pred.squeeze(0).cpu().detach().numpy())
    write(
        hp.sample_path + '/' + str(args.restore_step1) + '-' + "test" +
        str(num) + ".wav", hp.sr, wav)
Exemplo n.º 5
0
def synthesis(args):
    m = Model()
    m_post = ModelPostNet()
    m_stop = ModelStopToken()
    m.load_state_dict(load_checkpoint(args.restore_step1, "transformer"))
    m_stop.load_state_dict(load_checkpoint(args.restore_step3, "stop_token"))
    m_post.load_state_dict(load_checkpoint(args.restore_step2, "postnet"))

    m=m.cuda()
    m_post = m_post.cuda()
    m_stop = m_stop.cuda()
    m.train(False)
    m_post.train(False)
    m_stop.train(False)
    test_dataset = get_dataset(hp.test_data_csv)
    test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn_transformer, drop_last=True, num_workers=1)
    ref_dataset = get_dataset(hp.test_data_csv)
    ref_dataloader = DataLoader(ref_dataset, batch_size=1, shuffle=True, collate_fn=collate_fn_transformer, drop_last=True, num_workers=1)

    writer = get_writer(hp.checkpoint_path, hp.log_directory)

    ref_dataloader_iter = iter(ref_dataloader)
    for i, data in enumerate(test_dataloader):
        character, mel, mel_input, pos_text, pos_mel, text_length, mel_length, fname = data
        ref_character, ref_mel, ref_mel_input, ref_pos_text, ref_pos_mel, ref_text_length, ref_mel_length, ref_fname = next(ref_dataloader_iter)
        stop_tokens = t.abs(pos_mel.ne(0).type(t.float) - 1)
        mel_input = t.zeros([1,1,80]).cuda()
        stop=[]
        character = character.cuda()
        mel = mel.cuda()
        mel_input = mel_input.cuda()
        pos_text = pos_text.cuda()
        pos_mel = pos_mel.cuda()
        ref_character = ref_character.cuda()
        ref_mel = ref_mel.cuda()
        ref_mel_input = ref_mel_input.cuda()
        ref_pos_text = ref_pos_text.cuda()
        ref_pos_mel = ref_pos_mel.cuda()

        with t.no_grad():
            start=time.time()
            for i in range(args.max_len):
                pos_mel = t.arange(1,mel_input.size(1)+1).unsqueeze(0).cuda()
                mel_pred, postnet_pred, attn_probs, decoder_output, attns_enc, attns_dec, attns_style = m.forward(character, mel_input, pos_text, pos_mel, ref_mel, ref_pos_mel)
                stop_token = m_stop.forward(decoder_output)
                mel_input = t.cat([mel_input, postnet_pred[:,-1:,:]], dim=1)
                stop.append(t.sigmoid(stop_token).squeeze(-1)[0,-1])
                if stop[-1] > 0.5:
                    print("stop token at " + str(i) + " is :", stop[-1])
                    print("model inference time: ", time.time() - start)
                    break
            if stop[-1] == 0:
                continue
            mag_pred = m_post.forward(postnet_pred)
            inf_time = time.time() - start
            print("inference time: ", inf_time)

        wav = spectrogram2wav(mag_pred.squeeze(0).cpu().numpy())
        print("rtx : ", (len(wav)/hp.sr) / inf_time)
        wav_path = os.path.join(hp.sample_path, 'wav')
        if not os.path.exists(wav_path):
            os.makedirs(wav_path)
        write(os.path.join(wav_path, "text_{}_ref_{}_synth.wav".format(fname, ref_fname)), hp.sr, wav)
        print("written as text{}_ref_{}_synth.wav".format(fname, ref_fname))
        attns_enc_new=[]
        attns_dec_new=[]
        attn_probs_new=[]
        attns_style_new=[]
        for i in range(len(attns_enc)):
            attns_enc_new.append(attns_enc[i].unsqueeze(0))
            attns_dec_new.append(attns_dec[i].unsqueeze(0))
            attn_probs_new.append(attn_probs[i].unsqueeze(0))
            attns_style_new.append(attns_style[i].unsqueeze(0))
        attns_enc = t.cat(attns_enc_new, 0)
        attns_dec = t.cat(attns_dec_new, 0)
        attn_probs = t.cat(attn_probs_new, 0)
        attns_style = t.cat(attns_style_new, 0)

        attns_enc = attns_enc.contiguous().view(attns_enc.size(0), 1, hp.n_heads, attns_enc.size(2), attns_enc.size(3))
        attns_enc = attns_enc.permute(1,0,2,3,4)
        attns_dec = attns_dec.contiguous().view(attns_dec.size(0), 1, hp.n_heads, attns_dec.size(2), attns_dec.size(3))
        attns_dec = attns_dec.permute(1,0,2,3,4)
        attn_probs = attn_probs.contiguous().view(attn_probs.size(0), 1, hp.n_heads, attn_probs.size(2), attn_probs.size(3))
        attn_probs = attn_probs.permute(1,0,2,3,4)
        attns_style = attns_style.contiguous().view(attns_style.size(0), 1, hp.n_heads, attns_style.size(2), attns_style.size(3))
        attns_style = attns_style.permute(1,0,2,3,4)

        save_dir = os.path.join(hp.sample_path, 'figure', "text_{}_ref_{}_synth.wav".format(fname, ref_fname))
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        writer.add_alignments(attns_enc.detach().cpu(), attns_dec.detach().cpu(), attn_probs.detach().cpu(), attns_style.detach().cpu(), mel_length, text_length, args.restore_step1, 'Validation', save_dir)