# Shared imports assumed by the snippets below; HParam, load_hparam_str, and
# Generator come from the melgan package (see Example #3), while infer is the
# project-specific TTS inference entry point.
import datetime
import os
import time

import torch
from scipy.io.wavfile import write  # assumed; matches the (path, rate, data) calls below

from melgan.model.generator import Generator
from melgan.utils.hparams import HParam, load_hparam_str
# from inference import infer  # project-specific module; import path assumed


def main(args):
    torch.cuda.manual_seed(13524532)

    print("... Load trained models ...\n")
    print("     Loding checkpoint of document-level TTS model: {}".format(
        tts_ckpt))
    print("     Loding checkpoint of MelGAN TTS model: {}".format(
        args.mel_ckpt))
    start = time.time()

    mel_ckpt = torch.load(args.mel_ckpt)
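    # Prefer an explicit config file when one is given; otherwise fall back
    # to the hyperparameter string stored inside the vocoder checkpoint.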
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(mel_ckpt['hp_str'])

    model = Generator(hp.audio.n_mel_channels).cuda()
    model.load_state_dict(mel_ckpt['model_g'])
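    # In the seungwonpark/melgan repo, Generator.eval takes an `inference`
    # flag: True strips weight normalization for faster synthesis, while
    # False keeps it applied.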
    model.eval(inference=False)
    mel_time = time.time() - start

    print('\n... Generate waveform ...\n')
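    # Inference only: run under no_grad so autograd does not build a graph
    # or retain activations.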
    with torch.no_grad():
        num_of_iter = args.iteration
        texts = []
        with open(args.script_path, "r") as f:
            for line in f:
                line = line.strip()
                if len(line):
                    texts.append(line)

        print("   * input text\n    {} \n".format(texts[0]))

        for i in range(num_of_iter):
            start = time.time()
            mel, length, alignments = infer(args.tts_ckpt, texts[0])
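            # infer() is assumed to return the predicted mel spectrogram, its
            # length, and the attention alignments; a 2-D mel gets a batch
            # dimension before vocoding.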

            if len(mel.shape) == 2:
                mel = mel.unsqueeze(0)
            mel = mel.cuda()

            audio = model.inference(mel)
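            # In the seungwonpark/melgan repo, Generator.inference pads the
            # mel to reduce edge artifacts and returns an int16 waveform
            # tensor scaled to the 16-bit range, ready for a PCM wav write.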
            audio = audio.cpu().detach().numpy()
            save_path = os.path.join(args.out_dir, str(i) + '_audio.wav')
            write(save_path, hp.audio.sampling_rate, audio)
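            # Duration in seconds: number of samples divided by sampling rate.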
            audio_length = len(audio) / hp.audio.sampling_rate

            print("    {}. ".format(i + 1))
            print("     - Path of generated audio file: {}".format(save_path))
            print("     - Length of generated audio file: {}s".format(
                audio_length))
            print(
                "     - Time from text loading to waveform generation: {}s"
                .format(time.time() - start))
            print("     - Total time including vocoder loading: {}s\n".format(
                time.time() - start + mel_time))
        print("finished generation")
def main(args):
    torch.manual_seed(1234)
    torch.cuda.manual_seed(1234)

    text_path = "/media/sh/Workspace/긴문장합성/kor_Document-level_Neural_TTS_length/test/1.txt"
    doc_ckpt_kor = '/media/sh/Workspace/긴문장합성/kor_Document-level_Neural_TTS_length/outdir/checkpoint_29000'
    save_folder = '/media/sh/Workspace/긴문장합성/samples'
    # Timestamp (month-day-hour-minute-second) for the output filename; named
    # `timestamp` so it does not shadow the `time` module.
    today = datetime.datetime.today()
    timestamp = str(today.month) + str(today.day) + str(today.hour) + str(
        today.minute) + str(today.second)
    save_name = 'kor_audio_length_regul_' + doc_ckpt_kor.split(
        '_')[-1] + '_' + timestamp + '.wav'
    save_path = os.path.join(save_folder, save_name)

    checkpoint = torch.load(args.checkpoint_path)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = Generator(hp.audio.n_mel_channels).cuda()
    model.load_state_dict(checkpoint['model_g'])
    model.eval(inference=False)

    with torch.no_grad():
        texts = []
        with open(text_path, "r") as f:
            for line in f:
                line = line.strip()
                if len(line):
                    texts.append(line)

        mel, length, alignments = infer(doc_ckpt_kor, texts[0])

        if len(mel.shape) == 2:
            mel = mel.unsqueeze(0)
        mel = mel.cuda()

        audio = model.inference(mel)
        audio = audio.cpu().detach().numpy()
        write(save_path, hp.audio.sampling_rate, audio)
        print('Synthesis finished')
Example #3
def get_melgan(full_path=None):
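    # `device` is assumed to be defined at module scope, e.g.:
    # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')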
    if not full_path:
        melgan = torch.hub.load('seungwonpark/melgan', 'melgan')
        melgan.eval()
        melgan.to(device)
        return melgan
    
    # Otherwise load a locally trained checkpoint; this requires a clone of
    # seungwonpark/melgan so the imports below resolve.
    print("Using local vocoder checkpoint")
    from melgan.utils.hparams import load_hparam_str
    from melgan.model.generator import Generator
    cp = torch.load(full_path, map_location=device)
    hp = load_hparam_str(cp["hp_str"])
    model = Generator(hp.audio.n_mel_channels)
    if torch.cuda.is_available():
        model = model.cuda()
    model.load_state_dict(cp["model_g"])
    model.eval(inference=False)
    model.to(device)
    return model
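

# Usage sketch for get_melgan (paths and variable names here are
# hypothetical): pull the published generator via torch.hub, or load a
# locally trained checkpoint, then vocode a mel of shape
# (1, n_mel_channels, T).
#
#     melgan = get_melgan()                    # downloads seungwonpark/melgan
#     melgan = get_melgan('chkpt/melgan.pt')   # or a local checkpoint
#     with torch.no_grad():
#         audio = melgan.inference(mel)        # waveform tensor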
Example #4
def main(args):
    checkpoint = torch.load(args.checkpoint_path)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = Generator(hp.audio.n_mel_channels).cuda()
    model.load_state_dict(checkpoint['model_g'])
    model.eval(inference=False)

    with torch.no_grad():
        texts = []
        with open("/media/qw/data/Experiment/Encoder_selfAtt/test/1.txt",
                  "r") as f:
            for line in f:
                line = line.strip()
                if len(line):
                    texts.append(line)
        for i in range(10):
            mel, length, alignments = infer(
                '/media/qw/data/Experiment/Encoder_selfAtt/tacotron2_statedict.pt',
                texts[0])

            if len(mel.shape) == 2:
                mel = mel.unsqueeze(0)
            mel = mel.cuda()

            audio = model.inference(mel)
            audio = audio.cpu().detach().numpy()
            # Write each run to its own indexed file so successive iterations
            # do not overwrite the same wav.
            write('/media/qw/data/Experiment/Encoder_selfAtt/audio_{}.wav'.format(i),
                  hp.audio.sampling_rate, audio)