def init_vocoder(self):
    """Load the vocoder model currently selected in the UI.

    A ``None`` path means Griffin-Lim is selected, which requires no
    model, so nothing is loaded. The vocoder family is inferred from
    the model file's parent directory name ("melgan" or "wavernn").
    Progress is reported through the UI log and loading indicator.
    """
    model_fpath = self.ui.current_vocoder_fpath
    # Case of Griffin-Lim: no model file to load.
    if model_fpath is None:
        return

    self.ui.log("Loading the vocoder %s... " % model_fpath)
    self.ui.set_loading(1)
    start = timer()
    # The parent directory's name selects which vocoder backend to use.
    vocoder_type = Path(model_fpath).parent.stem
    if vocoder_type == "melgan":
        vocoder_melgan.load_vocoder_melgan(model_fpath)
    elif vocoder_type == "wavernn":
        vocoder.load_model(model_fpath)
    else:
        # BUG FIX: the original returned here while the loading indicator
        # was still on, leaving the UI stuck in the "loading" state.
        self.ui.set_loading(0)
        return
    self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
    self.ui.set_loading(0)
# Build the STFT/mel-spectrogram front-end from the default hyperparameters.
hparams = create_hparams()
stft = TacotronSTFT(hparams.filter_length, hparams.hop_length, hparams.win_length,
                    hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
                    hparams.mel_fmax)

# Load the Mellotron synthesizer checkpoint onto the GPU in eval mode.
model_path = "models/mellotron_libritts.pt"
mellotron = load_model(hparams).cuda().eval()
mellotron.load_state_dict(torch.load(model_path)['state_dict'])

# WaveGlow vocoder plus its denoiser wrapper, both on GPU in eval mode.
# NOTE(review): torch.load on a full-model checkpoint unpickles arbitrary
# code — only use with trusted checkpoint files.
waveglow_path = 'models/waveglow_256channels_v4.pt'
waveglow = torch.load(waveglow_path)['model'].cuda().eval()
denoiser = Denoiser(waveglow).cuda().eval()

# MelGAN vocoder loaded as an alternative to WaveGlow.
melgan_path = 'models/multi_speaker.pt'
load_vocoder_melgan(melgan_path)

## Setup dataloaders
arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
audio_paths = 'data/examples_filelist.txt'
dataloader = TextMelLoader(audio_paths, hparams)
datacollate = TextMelCollate(1)

## Load data
file_idx = 0
# Each filelist entry unpacks to (audio path, raw text, speaker id).
audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]

# get audio path, encoded text, pitch contour and mel for gst
text_encoded = torch.LongTensor(text_to_sequence(text, hparams.text_cleaners, arpabet_dict))[None, :].cuda()
# Index 3 of a dataloader item is presumably the pitch contour — TODO
# confirm against TextMelLoader.__getitem__; [None] adds a batch dim.
pitch_contour = dataloader[file_idx][3][None].cuda()
mel = load_mel(audio_path)