Пример #1
0
    def __init__(self,
                 tacotron_weights_path,
                 waveglow_cfg_path,
                 waveglow_weights_path,
                 device,
                 sampling_rate=22050):
        """
        Load a Tacotron model, a WaveGlow vocoder and a denoiser onto `device`.

        args:
            tacotron_weights_path: path to the tacotron checkpoint; the weights
                are read from its 'state_dict' entry
            waveglow_cfg_path: path to a JSON file whose 'waveglow_config'
                entry holds the WaveGlow constructor settings
            waveglow_weights_path: path to the waveglow weights (a state dict)
            device: torch.device the models are placed on. Tacotron runs in
                half precision on every non-CPU device, WaveGlow on CUDA only.
            sampling_rate: the rate that audio representations are sampled per second
        """
        hparams = create_hparams()
        hparams.sampling_rate = sampling_rate
        self.device = device

        # NOTE: torch.load unpickles the file, which can execute arbitrary
        # code; only point these paths at checkpoints from a trusted source.
        tacotron_checkpoint = torch.load(tacotron_weights_path,
                                         map_location=device,
                                         pickle_module=pickle)
        self.tacotron = load_model(hparams, device)
        self.tacotron.load_state_dict(tacotron_checkpoint['state_dict'])

        if device.type != "cpu":
            self.tacotron.half()  # GPU can handle half
        # .to(device) honours the device index (e.g. cuda:1), whereas a bare
        # .cuda() would always pick the current default CUDA device.
        self.tacotron.to(device)
        self.tacotron.eval()

        with open(waveglow_cfg_path, "r", encoding='utf-8') as reader:
            wg_cfg = json.load(reader)['waveglow_config']
        self.waveglow = WaveGlow(wg_cfg['n_mel_channels'], wg_cfg['n_flows'],
                                 wg_cfg['n_group'], wg_cfg['n_early_every'],
                                 wg_cfg['n_early_size'], wg_cfg['WN_config'])

        self.waveglow.load_state_dict(
            torch.load(waveglow_weights_path,
                       map_location=device,
                       pickle_module=pickle))

        if device.type == "cuda":
            self.waveglow.to(device).half()

        self.waveglow.eval()

        # Cast every module in waveglow.convinv back to float32, whatever
        # precision was chosen above.
        # NOTE(review): presumably these layers are not safe in fp16 - confirm.
        for k in self.waveglow.convinv:
            k.float()

        self.denoiser = Denoiser(self.waveglow, device)
Пример #2
0
def synthesize(text, voice, sigma=0.6, denoiser_strength=0.1, is_fp16=False,
               models_dir="/home/debian/workspace/models/"):
    """Synthesize `text` with the Tacotron2 checkpoint selected by `voice`.

    Both the Tacotron2 model and the WaveGlow vocoder are loaded from
    `models_dir` on every call and run on CUDA in half precision.

    Args:
        text: the text to speak.
        voice: one of "papaito", "constantino" or "orador".
        sigma: forwarded to `waveglow.infer`.
        denoiser_strength: strength passed to the denoiser; the denoiser is
            skipped when this is not greater than 0.
        is_fp16: when true the mel output is cast with `.half()` before
            vocoding.
            NOTE(review): the model itself is always converted with `.half()`,
            so this flag may have no practical effect - confirm.
        models_dir: directory containing the Tacotron2 checkpoints and
            'waveglow_256channels_ljs_v2.pt'.

    Returns:
        A tuple `(audio, sampling_rate)` where `audio` is an int16 numpy array
        and `sampling_rate` is 22050.

    Raises:
        ValueError: if `voice` is not one of the known voices.
    """
    import os

    voice_models = {
        "papaito": "nvidia_tacotron2_papaito_300",
        "constantino": "tacotron2_Constantino_600",
        "orador": "checkpoint_tacotron2_29000_es",
    }
    # Validate before any expensive model loading so a typo fails fast with a
    # readable message instead of an UnboundLocalError further down.
    if voice not in voice_models:
        raise ValueError(
            "Unknown voice %r; expected one of: %s"
            % (voice, ", ".join(sorted(voice_models))))

    checkpoint_path = os.path.join(models_dir, voice_models[voice])
    waveglow_path = os.path.join(models_dir, 'waveglow_256channels_ljs_v2.pt')

    hparams = create_hparams()
    hparams.sampling_rate = 22050

    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    model.cuda().eval().half()

    # The WaveGlow file stores the whole pickled model under the 'model' key.
    waveglow = torch.load(waveglow_path, map_location='cuda')['model']
    waveglow.cuda().eval().half()
    denoiser = Denoiser(waveglow)

    # NOTE(review): the cleaner is called 'english_cleaners' while the voice
    # checkpoints look Spanish - confirm this matches how they were trained.
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).cuda().long()

    # Inference only: no autograd graph is needed for either network.
    with torch.no_grad():
        # NOTE(review): the pre-postnet mel is what gets vocoded here and the
        # postnet output is discarded - confirm this is intentional.
        mel_outputs, _postnet, _, _alignments = model.inference(sequence)
        mel = mel_outputs.half() if is_fp16 else mel_outputs

        audio = waveglow.infer(mel, sigma=sigma)
        if denoiser_strength > 0:
            audio = denoiser(audio, denoiser_strength)
        # Scale by hparams.max_wav_value before the int16 conversion.
        audio = audio * hparams.max_wav_value
        audio = audio.squeeze().cpu().numpy().astype('int16')

    return audio, hparams.sampling_rate
Пример #3
0
def load_mel_model(checkpoint_path):
    """Build a model via `load_model` and load weights from `checkpoint_path`.

    Args:
        checkpoint_path: path to a checkpoint readable by `torch.load` whose
            'state_dict' entry maps parameter names to tensors. A leading
            'module.' on a parameter name is stripped before loading.

    Returns:
        The model, switched to eval mode.
    """
    hparams = create_hparams("distributed_run=False,mask_padding=False")
    hparams.sampling_rate = 22050
    hparams.filter_length = 1024
    hparams.hop_length = 256
    hparams.win_length = 1024

    model = load_model(hparams)
    # If load_model returned a wrapper exposing the real network as `.module`,
    # unwrap it; otherwise keep the model as is.
    model = getattr(model, 'module', model)

    # map_location='cpu' lets the checkpoint load regardless of the device it
    # was saved from; load_state_dict then copies the tensors onto whatever
    # device the model parameters already live on.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')

    # Strip only a *leading* 'module.'; a blanket str.replace would also
    # mangle names that merely contain that text (e.g. 'x_module.weight').
    prefix = 'module.'
    state_dict = {
        (name[len(prefix):] if name.startswith(prefix) else name): tensor
        for name, tensor in checkpoint['state_dict'].items()
    }
    model.load_state_dict(state_dict)

    model.eval()
    return model
Пример #4
0
parser.add_argument("--is_fp16", action="store_true")
parser.add_argument("-d",
                    "--denoiser_strength",
                    default=0.0,
                    type=float,
                    help='Removes model bias. Start with 0.1 and adjust')

args = parser.parse_args()

hparams = create_hparams()

# Load the Tacotron2 weights from the checkpoint's 'state_dict' entry and run
# the model on CUDA in half precision, in eval mode.
checkpoint_path = args.checkpoint_path
model = load_model(hparams)
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
_ = model.cuda().eval().half()

# The WaveGlow file stores the whole pickled model under the 'model' key.
waveglow = torch.load(args.waveglow_path)['model']
_ = waveglow.cuda().eval().half()
denoiser = Denoiser(waveglow)

# The entire file is read as a single utterance.
with open(args.filelist_path, encoding='utf-8', mode='r') as f:
    text = f.read()

# Encode the text and add a leading batch dimension of size 1 before moving
# the ids to the GPU as int64.
sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()