def __init__(self, tacotron_weights_path, waveglow_cfg_path, waveglow_weights_path, device, sampling_rate=22050):
    """Load the Tacotron and WaveGlow models and build the denoiser.

    Args:
        tacotron_weights_path: path to the Tacotron checkpoint; its
            ``'state_dict'`` entry is loaded into the model.
        waveglow_cfg_path: path to a JSON file whose ``'waveglow_config'``
            object supplies the WaveGlow constructor arguments.
        waveglow_weights_path: path to the WaveGlow state dict.
        device: ``torch.device`` the checkpoints are mapped to and the
            models are moved onto.
        sampling_rate: the rate that audio representations are sampled
            per second.
    """
    hparams = create_hparams()
    hparams.sampling_rate = sampling_rate
    self.device = device

    # NOTE: torch.load unpickles the checkpoint files; only load weights
    # that come from a trusted source.
    self.tacotron = load_model(hparams, device)
    tacotron_checkpoint = torch.load(
        tacotron_weights_path, map_location=device, pickle_module=pickle)
    self.tacotron.load_state_dict(tacotron_checkpoint['state_dict'])
    if device.type == "cpu":
        self.tacotron.cpu()
    else:
        # GPU can handle half. Move to the requested device rather than
        # the default CUDA device so a specific index (e.g. cuda:1) is
        # honoured.
        self.tacotron.half().to(device)
    self.tacotron.eval()

    with open(waveglow_cfg_path, "r", encoding='utf-8') as reader:
        wg_cfg = json.load(reader)['waveglow_config']
    self.waveglow = WaveGlow(
        wg_cfg['n_mel_channels'],
        wg_cfg['n_flows'],
        wg_cfg['n_group'],
        wg_cfg['n_early_every'],
        wg_cfg['n_early_size'],
        wg_cfg['WN_config'],
    )
    self.waveglow.load_state_dict(
        torch.load(waveglow_weights_path, map_location=device, pickle_module=pickle))
    if device.type == "cuda":
        # Same device as the Tacotron model and the denoiser below.
        self.waveglow.to(device).half()
    self.waveglow.eval()
    # The convinv layers are cast back to float32 after the optional
    # half() conversion above.
    for k in self.waveglow.convinv:
        k.float()

    self.denoiser = Denoiser(self.waveglow, device)
def synthesize(text, voice, sigma=0.6, denoiser_strength=0.1, is_fp16=False):
    """Synthesize *text* with the Tacotron checkpoint selected by *voice*.

    Args:
        text: the text to speak; it is converted to a symbol sequence with
            the ``english_cleaners`` cleaner.
        voice: one of ``"papaito"``, ``"constantino"`` or ``"orador"``.
        sigma: value forwarded to ``waveglow.infer``.
        denoiser_strength: strength passed to the denoiser; the denoiser
            is skipped when this is not greater than 0.
        is_fp16: when true, the mel spectrogram is cast to half precision
            before it is handed to WaveGlow.

    Returns:
        A tuple ``(audio, sampling_rate)`` where ``audio`` is a NumPy
        ``int16`` array and ``sampling_rate`` is ``hparams.sampling_rate``.

    Raises:
        ValueError: if *voice* is not one of the known voices.
    """
    voice_models = {
        "papaito": "nvidia_tacotron2_papaito_300",
        "constantino": "tacotron2_Constantino_600",
        "orador": "checkpoint_tacotron2_29000_es",
    }
    # Validate before any model is loaded so a typo fails fast with a
    # clear message instead of an UnboundLocalError further down.
    if voice not in voice_models:
        raise ValueError(
            f"unknown voice {voice!r}; expected one of: "
            f"{', '.join(sorted(voice_models))}"
        )

    hparams = create_hparams()
    hparams.sampling_rate = 22050

    checkpoint_path = "/home/debian/workspace/models/" + voice_models[voice]
    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    model.cuda().eval().half()

    waveglow_path = '/home/debian/workspace/models/waveglow_256channels_ljs_v2.pt'
    waveglow = torch.load(waveglow_path, map_location='cuda')['model']
    waveglow.cuda().eval().half()
    denoiser = Denoiser(waveglow)

    # Add a leading batch axis of size 1 to the symbol ids.
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).cuda().long()

    # Pure inference: no autograd graph is needed for either model.
    with torch.no_grad():
        # NOTE(review): the first (pre-postnet) output is what gets
        # vocoded and the postnet output is discarded, exactly as in the
        # original code -- confirm this is intended.
        mel_outputs, _, _, _ = model.inference(sequence)
        mel = mel_outputs.half() if is_fp16 else mel_outputs
        audio = waveglow.infer(mel, sigma=sigma)
        if denoiser_strength > 0:
            audio = denoiser(audio, denoiser_strength)
        audio = audio * hparams.max_wav_value

    audio = audio.squeeze().cpu().numpy().astype('int16')
    return audio, hparams.sampling_rate
def load_mel_model(checkpoint_path):
    """Build the mel model, load *checkpoint_path* into it, return it in eval mode.

    Args:
        checkpoint_path: path to a checkpoint readable by ``torch.load``
            that contains a ``'state_dict'`` entry.

    Returns:
        The model (unwrapped from ``.module`` if ``load_model`` returned a
        wrapper exposing that attribute) with the weights loaded and
        ``eval()`` applied.
    """
    hparams = create_hparams("distributed_run=False,mask_padding=False")
    hparams.sampling_rate = 22050
    hparams.filter_length = 1024
    hparams.hop_length = 256
    hparams.win_length = 1024

    model = load_model(hparams)
    # Unwrap the inner model when a ``.module`` attribute exists. Only a
    # missing attribute is tolerated; any other error now propagates
    # instead of being silently swallowed by a bare ``except``.
    model = getattr(model, "module", model)

    state_dict = torch.load(checkpoint_path)['state_dict']
    # Drop the 'module.' marker from the checkpoint keys so they match
    # the unwrapped model.
    # NOTE(review): str.replace removes 'module.' anywhere in a key, not
    # only as a prefix -- kept as in the original; confirm no parameter
    # name legitimately contains it.
    model.load_state_dict({
        key.replace('module.', ''): value
        for key, value in state_dict.items()
    })
    model.eval()
    return model
# Remaining command-line options, then parse.
parser.add_argument("--is_fp16", action="store_true")
parser.add_argument(
    "-d",
    "--denoiser_strength",
    type=float,
    default=0.0,
    help='Removes model bias. Start with 0.1 and adjust',
)
args = parser.parse_args()

hparams = create_hparams()

# Tacotron model: load the weights, then move to the GPU in eval mode
# with half precision.
checkpoint_path = args.checkpoint_path
model = load_model(hparams)
tacotron_checkpoint = torch.load(checkpoint_path)
model.load_state_dict(tacotron_checkpoint['state_dict'])
model.cuda().eval().half()

# WaveGlow vocoder (stored under the checkpoint's 'model' key) and the
# denoiser built on top of it.
waveglow_checkpoint = torch.load(args.waveglow_path)
waveglow = waveglow_checkpoint['model']
waveglow.cuda().eval().half()
denoiser = Denoiser(waveglow)

# Read the whole input file as the text to synthesize.
with open(args.filelist_path, encoding='utf-8', mode='r') as f:
    text = f.read()

# Convert the text to symbol ids, add a leading batch axis of size 1 and
# move the result to the GPU as a long tensor.
symbol_ids = text_to_sequence(text, ['english_cleaners'])
sequence = np.array(symbol_ids)[None, :]
sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()