import glob
import json
import os
from math import sqrt

import numpy as np
import torch
from torch import nn


class Tacotron2(nn.Module):
    def __init__(self, hparams):
        super(Tacotron2, self).__init__()
        self.mask_padding = hparams.mask_padding
        self.fp16_run = hparams.fp16_run
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step

        # Symbol embedding with Xavier-style uniform initialization:
        # a uniform distribution on [-val, val] has std = val / sqrt(3),
        # so val = sqrt(3) * std gives the target standard deviation.
        self.embedding = nn.Embedding(
            hparams.n_symbols, hparams.symbols_embedding_dim)
        std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
        val = sqrt(3.0) * std
        self.embedding.weight.data.uniform_(-val, val)

        self.encoder = Encoder(hparams)
        self.decoder = Decoder(hparams)
        self.postnet = Postnet(hparams)

        # Optional Global Style Token module for prosody/style control.
        if hparams.with_gst:
            self.gst = GST(hparams)

        # Trainable per-speaker embedding table for multi-speaker synthesis.
        self.speaker_embedding = nn.Embedding(
            hparams.n_speakers, hparams.speaker_embedding_dim)

        if hparams.pretrained_speaker:
            # Load precomputed 512-dim speaker embeddings (one .npy file per
            # speaker) and register them as a non-trainable buffer so they
            # move with the module across devices but receive no gradients.
            with open('data/VCTK/speaker-dict.json') as f:
                speakers = json.load(f)
            embed_paths = glob.glob(
                os.path.join(hparams.pretrained_speaker_path, '*.npy'))
            embeds = np.zeros((hparams.n_speakers, 1, 512))
            for embed_path in embed_paths:
                # File names follow '...spker_embed-<speaker_id>.npy';
                # slice off the '.npy' suffix to recover the speaker id.
                speaker_id = embed_path.split('spker_embed-')[1][:-4]
                embed = np.load(embed_path)
                embeds[speakers[speaker_id], :, :] = embed
            embeddings = torch.tensor(embeds, dtype=torch.float32)
            self.register_buffer('pretrained_speaker', embeddings)
            # Project the fixed 512-dim pretrained embeddings down to the
            # model's speaker embedding dimension.
            self.speaker_linear = nn.Sequential(
                nn.Linear(512, hparams.speaker_embedding_dim),
                nn.ReLU())
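For context, here is a minimal sketch of how the registered `pretrained_speaker` buffer and the `speaker_linear` projection might be consumed at inference time. The method name `lookup_speaker` and the `speaker_ids` tensor are illustrative assumptions, not code from this repository.

    # Hypothetical helper on Tacotron2: look up pretrained speaker
    # embeddings by id and project them to speaker_embedding_dim.
    def lookup_speaker(self, speaker_ids):
        # self.pretrained_speaker has shape (n_speakers, 1, 512);
        # indexing with a LongTensor of shape (B,) yields (B, 1, 512).
        embeds = self.pretrained_speaker[speaker_ids]
        # Linear + ReLU applied over the last dim:
        # (B, 1, 512) -> (B, 1, speaker_embedding_dim).
        return self.speaker_linear(embeds)

The projected embedding would then be combined with the encoder outputs (e.g. broadcast and concatenated along the time axis) before decoding, though the exact fusion point depends on the rest of the model, which is not shown here.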