Example #1
 def _setup_melgan(self, checkpoint):
     checkpoint = torch.load(checkpoint, map_location=self.device)
     hp = HParam("./speech/melgan/config/default.yaml")
     melgan = Generator(hp.audio.n_mel_channels).to(self.device)
     melgan.load_state_dict(checkpoint["model_g"])
     melgan.eval(inference=False)
     return melgan
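
In the seungwonpark/melgan implementation, Generator.eval() is overridden to take an inference flag: inference=True additionally removes weight normalization (safe only once the weights are final), while inference=False keeps the weight-norm wrappers in place. A minimal usage sketch, assuming tts is an instance of the surrounding class and "melgan.pt" is a placeholder checkpoint path:

melgan = tts._setup_melgan("melgan.pt")  # placeholder checkpoint path
with torch.no_grad():
    # mel is assumed to be a (1, n_mel_channels, frames) tensor on tts.device
    audio = melgan.inference(mel)
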
def main(args):
    torch.cuda.manual_seed(13524532)

    print("... Load trained models ...\n")
    print("     Loding checkpoint of document-level TTS model: {}".format(
        tts_ckpt))
    print("     Loding checkpoint of MelGAN TTS model: {}".format(
        args.mel_ckpt))
    start = time.time()

    mel_ckpt = torch.load(args.mel_ckpt)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(mel_ckpt['hp_str'])

    model = Generator(hp.audio.n_mel_channels).cuda()
    model.load_state_dict(mel_ckpt['model_g'])
    model.eval(inference=False)
    mel_time = time.time() - start

    print('\n... Generate waveform ...\n')
    with torch.no_grad():
        num_of_iter = args.iteration
        texts = []
        with open(args.script_path, "r") as f:
            for line in f:
                line = line.strip()
                if len(line):
                    texts.append(line)

        print("   * input text\n    {} \n".format(texts[0]))

        for i in range(num_of_iter):
            start = time.time()
            mel, length, alignments = infer(args.tts_ckpt, texts[0])

            if len(mel.shape) == 2:
                mel = mel.unsqueeze(0)
            mel = mel.cuda()

            audio = model.inference(mel)
            audio = audio.cpu().detach().numpy()
            save_path = os.path.join(args.out_dir, str(i) + '_audio.wav')
            write(save_path, hp.audio.sampling_rate, audio)
            audio_length = len(audio) / hp.audio.sampling_rate

            print("    {}. ".format(i + 1))
            print("     - Path of generated audio file: {}".format(save_path))
            print("     - Length of generated audio file: {}s".format(
                audio_length))
            print(
                "     - Time taken from text loading to spectrogram generation: {}s"
                .format(time.time() - start))
            print("     - Time taken to generate waveform: {}s\n".format(
                time.time() - start + mel_time))
        print("finished generation")
Example #3
    def __init_vocoder(self, vocoder_model_path):

        # load pre trained MelGAN model for mel2audio:
        temp_model = torch.load(vocoder_model_path, map_location='cpu')

        model = Generator(80)  # Number of mel channels
        model.load_state_dict(temp_model['model_g'])
        model.eval(inference=False)

        return model
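
Because the name is double-underscore-mangled, this helper is meant to be called as self.__init_vocoder(...) from inside its own class. The hard-coded 80 matches n_mel_channels in the default MelGAN config; a checkpoint trained with a different mel resolution would need that value changed.
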
Example #4
def main(args):
    torch.manual_seed(1234)
    torch.cuda.manual_seed(1234)

    text_path = "/media/sh/Workspace/긴문장합성/kor_Document-level_Neural_TTS_length/test/1.txt"
    doc_ckpt_kor = '/media/sh/Workspace/긴문장합성/kor_Document-level_Neural_TTS_length/outdir/checkpoint_29000'
    save_folder = '/media/sh/Workspace/긴문장합성/samples'
    today = datetime.datetime.today()
    # use a name other than `time` to avoid shadowing the time module
    timestamp = str(today.month) + str(today.day) + str(today.hour) + str(
        today.minute) + str(today.second)
    save_name = 'kor_audio_length_regul_' + doc_ckpt_kor.split(
        '_')[-1] + '_' + timestamp + '.wav'
    save_path = os.path.join(save_folder, save_name)

    checkpoint = torch.load(args.checkpoint_path)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = Generator(hp.audio.n_mel_channels).cuda()
    model.load_state_dict(checkpoint['model_g'])
    model.eval(inference=False)

    with torch.no_grad():
        # for melpath in tqdm.tqdm(glob.glob(os.path.join(args.input_folder, '*.mel'))):
        #     mel = torch.load(melpath)
        texts = []
        with open(text_path, "r") as f:
            for line in f:
                line = line.strip()
                if len(line):
                    texts.append(line)

        mel, length, alignments = infer(doc_ckpt_kor, texts[0])

        if len(mel.shape) == 2:
            mel = mel.unsqueeze(0)
        mel = mel.cuda()

        audio = model.inference(mel)
        audio = audio.cpu().detach().numpy()
        # out_path = melpath.replace('.mel', '_reconstructed_epoch%04d.wav' % checkpoint['epoch'])
        write(save_path, hp.audio.sampling_rate, audio)
        print('Synthesis finished')
Example #5
def load_tts_model(checkpoint_path=None, melgan_path=None):

    # set-up params
    hparams = create_hparams()

    # load model from checkpoint
    model = load_model(hparams)
    model.load_state_dict(
        torch.load(checkpoint_path, map_location='cpu')['state_dict'])
    _ = model.eval()

    # Load MelGAN for mel2audio synthesis and denoiser
    checkpoint = torch.load(melgan_path, map_location='cpu')
    # TODO: resolve this relative config path against the directory of this file
    hp_melgan = load_hparam("./melgan/config/default.yaml")
    vocoder_model = Generator(80)  # Number of mel channels
    vocoder_model.load_state_dict(checkpoint['model_g'])
    vocoder_model.eval(inference=False)

    return model, vocoder_model, hparams
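
Note that hp_melgan is loaded but never used, and despite the comment no denoiser is constructed: the function returns only the Tacotron 2 model, the MelGAN generator, and the Tacotron 2 hparams.
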
Example #6
def get_melgan(full_path=None):
    if not full_path:
        melgan = torch.hub.load('seungwonpark/melgan', 'melgan')
        melgan.eval()
        melgan.to(device)
        return melgan
    
    # make sure to clone seungwonpark/melgan
    print("use local vocoder")
    from melgan.utils.hparams import load_hparam_str
    from melgan.model.generator import Generator
    cp = torch.load(full_path, map_location=device)
    hp = load_hparam_str(cp["hp_str"])
    model = Generator(hp.audio.n_mel_channels)
    if torch.cuda.is_available():
        model = model.cuda()
    model.load_state_dict(cp["model_g"])
    model.eval(inference=False)
    model.to(device)
    return model
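
Both branches return a generator with the same interface, so callers need not care whether the hub model or the local checkpoint was used. A minimal sketch, assuming mel is a (1, n_mel_channels, frames) tensor already on device:

vocoder = get_melgan()              # no path given: falls back to torch.hub
with torch.no_grad():
    audio = vocoder.inference(mel)  # int16-range waveform tensor
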
Example #7
def main(args):
    checkpoint = torch.load(args.checkpoint_path)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = Generator(hp.audio.n_mel_channels).cuda()
    model.load_state_dict(checkpoint['model_g'])
    model.eval(inference=False)

    with torch.no_grad():
        # for melpath in tqdm.tqdm(glob.glob(os.path.join(args.input_folder, '*.mel'))):
        #     mel = torch.load(melpath)
        texts = []
        with open("/media/qw/data/Experiment/Encoder_selfAtt/test/1.txt",
                  "r") as f:
            for line in f:
                line = line.strip()
                if len(line):
                    texts.append(line)
        for i in range(10):
            mel, length, alignments = infer(
                '/media/qw/data/Experiment/Encoder_selfAtt/tacotron2_statedict.pt',
                texts[0])
            # mel, length, alignments = infer('/media/qw/data/Experiment/Encoder_selfAtt/result/3sentence', texts[0])
            # print('/'*i, '.'*(50-i))
            # plt.figure()
            # plt.imshow(alignments[0].T.cpu())
            # plt.savefig('./align/alignment{}.png'.format(i), dpi=300)
            # mel, length, alignments = infer('/media/qw/data/Experiment/Encoder_selfAtt/tacotron2_statedict.pt', 'Emil Sinclair is the protagonist of the novel. hello my name is sung woong hwang.')

            if len(mel.shape) == 2:
                mel = mel.unsqueeze(0)
            mel = mel.cuda()

            audio = model.inference(mel)
            audio = audio.cpu().detach().numpy()
            # out_path = melpath.replace('.mel', '_reconstructed_epoch%04d.wav' % checkpoint['epoch'])
            write('/media/qw/data/Experiment/Encoder_selfAtt/audio.wav',
                  hp.audio.sampling_rate, audio)
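
Note that all ten iterations write to the same audio.wav, so only the last run survives. Indexing the filename by i (as Example #1 does with its out_dir) keeps every take:

            write('/media/qw/data/Experiment/Encoder_selfAtt/audio_{}.wav'.format(i),
                  hp.audio.sampling_rate, audio)
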
Example #8
    def mel_to_wav(mel_input):

        # TEMPORARY: melgan vocoder
        # melgan_vocoder = torch.hub.load('seungwonpark/melgan', 'melgan').to(config["device"])
        # melgan_vocoder.eval()
        from melgan.model.generator import Generator
        ckpt = torch.load('../runs/melgan_TEMP/librispeech_41cec78_0525.pt')
        melgan_vocoder = Generator(80).to(device)
        melgan_vocoder.load_state_dict(ckpt['model_g'])
        melgan_vocoder.eval()
        # END TEMPORARY

        mean, std = np.load(hparams.mel_mean_std)
        mean = torch.FloatTensor(mean)[:, None].to(device)
        std = torch.FloatTensor(std)[:, None].to(device)
        mel_input = 1.2 * mel_input * std + mean
        mel_input = torch.log(torch.clamp(torch.exp(mel_input), 1e-5))

        audio = melgan_vocoder.inference(mel_input).float() / 32768.0

        return audio.data.cpu().numpy()
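
Three details worth noting here: the mean/std line undoes per-channel mel normalization (the 1.2 factor is an extra gain baked into this snippet); the exp/clamp/log round trip floors the log-mel at log(1e-5) ≈ -11.51, matching the padding value the reference Generator.inference uses for silence; and the division by 32768.0 rescales the generator's int16-range output (the reference inference multiplies by MAX_WAV_VALUE and casts to int16) back to floats in [-1, 1].
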
Example #9
checkpoint_path = "checkpoint_78000"
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
model.to('cuda')
_ = model.eval()
print("Tacotron2 loaded successfully...")

# melgan mel2wav:

from melgan.model.generator import Generator
from melgan.utils.hparams import load_hparam

# load pre trained MelGAN model for mel2audio:
vocoder_checkpoint_path = "nvidia_tacotron2_LJ11_epoch6400.pt"
checkpoint = torch.load(vocoder_checkpoint_path)
hp_melgan = load_hparam("melgan/config/default.yaml")
vocoder_model = Generator(80)  # Number of mel channels
vocoder_model.load_state_dict(checkpoint['model_g'])
vocoder_model = vocoder_model.to('cuda')
vocoder_model.eval(inference=False)

print("MelGAN vocoder loaded successfully.")

# GENERATE TEXT TO SPEECH
torch.manual_seed(1234)
# INPUT:
# where the audio clip will be written:
save_path = 'audio_test.wav'
# input text for the synthesis:
test_text = "the recommended book for natural language interaction is neural network methods from goldberg"
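
The synthesis step itself is not shown; a sketch of what presumably follows, reusing the names defined above (the text_to_sequence import and the Tacotron 2 inference signature follow the standard NVIDIA tacotron2 recipe and are assumptions here, as is scipy.io.wavfile.write):

sequence = torch.LongTensor(
    text_to_sequence(test_text, ['english_cleaners']))[None, :].to('cuda')
with torch.no_grad():
    _, mel_outputs_postnet, _, _ = model.inference(sequence)
    audio = vocoder_model.inference(mel_outputs_postnet)
write(save_path, hp_melgan.audio.sampling_rate, audio.cpu().numpy())
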
Example #10
parser.add_argument("--device",
                    default='cuda' if torch.cuda.is_available() else 'cpu',
                    help="What device to use.")
parser.add_argument("--audio_folder",
                    type=str,
                    default="synthesized_audio",
                    help="Where to save audios")
args = parser.parse_args()

print('Loading model checkpoints')
m = SpeedySpeech(device=args.device).load(args.speedyspeech_checkpoint,
                                          map_location=args.device)
m.eval()

checkpoint = torch.load(args.melgan_checkpoint, map_location=args.device)
hp = HParam("code/melgan/config/default.yaml")
melgan = Generator(hp.audio.n_mel_channels).to(args.device)
melgan.load_state_dict(checkpoint["model_g"])
melgan.eval(inference=False)

print('Processing text')
txt_processor = TextProcessor(HPText.graphemes, phonemize=HPText.use_phonemes)
test = [
    'This is a test.\n',
    'It would probably be best to put this into a text file.\n'
]
text = [t.strip() for t in test]

phonemes, plen = txt_processor(text)
# append more zeros - avoid cutoff at the end of the largest sequence
phonemes = torch.cat((phonemes, torch.zeros(len(phonemes), 5).long()), dim=-1)
phonemes = phonemes.to(args.device)
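
The synthesis loop presumably follows; a minimal sketch under the assumptions that SpeedySpeech's forward takes (phonemes, plen) and returns spectrograms in (batch, frames, n_mel_channels) order, and that os and scipy.io.wavfile.write are imported (none of this is verified against the upstream inference script):

with torch.no_grad():
    spec, durations = m((phonemes, plen))  # assumed SpeedySpeech forward signature
    mel = spec.transpose(2, 1)             # (batch, n_mel_channels, frames) for MelGAN
    audio = melgan(mel).squeeze(1)         # one waveform row per utterance

for i, wav in enumerate(audio.cpu().numpy()):
    write(os.path.join(args.audio_folder, '{}.wav'.format(i)),
          hp.audio.sampling_rate, wav)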