def __init__(self,
             in_size,
             out_size,
             projection_size=[512],
             projection_fn='LeakyReLU',
             projection_do=0.0,
             cbhg_cfg={}):
    super().__init__()
    self.in_size = in_size
    self.out_size = out_size
    self.projection_size = projection_size
    self.projection_fn = projection_fn
    self.projection_do = ConfigParser.list_parser(projection_do,
                                                  n=len(projection_size))
    # CBHG front-end; its conv projection ends at in_size for the residual path #
    self.inverter_lyr = CBHG1d(in_size, conv_proj_filter=[256, in_size],
                               **cbhg_cfg)
    # projection MLP: [Linear -> act -> Dropout] x N, then a final Linear #
    _tmp = []
    prev_size = self.inverter_lyr.out_features
    for ii in range(len(projection_size)):
        _tmp.append(nn.Linear(prev_size, self.projection_size[ii]))
        _tmp.append(generator_act_module(self.projection_fn))
        _tmp.append(nn.Dropout(p=self.projection_do[ii]))
        prev_size = self.projection_size[ii]
    _tmp.append(nn.Linear(prev_size, out_size))
    self.projection_lyr = nn.Sequential(*_tmp)
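# Usage sketch (hedged): the enclosing class name and its forward() are not
# shown here; `MelToLinearInverter` below is a hypothetical stand-in. With
# 80 mel bins in and 1025 linear-spec bins out:
#
#   inverter = MelToLinearInverter(in_size=80, out_size=1025,
#                                  projection_size=[512],
#                                  projection_fn='LeakyReLU')
#   # x: (batch, time, 80) -> y: (batch, time, 1025), assuming forward()
#   # chains inverter_lyr then projection_lyr
#   y = inverter(x)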
def __init__(self,
             speaker_emb_dim=256,
             speaker_proj_size=[512],
             speaker_proj_fn='none',
             speaker_proj_do=0.0,
             speaker_integrate_fn='none',
             speaker_emb_scale=1.0,
             *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.speaker_emb_dim = speaker_emb_dim
    self.speaker_proj_size = speaker_proj_size
    self.speaker_proj_fn = speaker_proj_fn
    self.speaker_proj_do = ConfigParser.list_parser(
        speaker_proj_do, n=len(speaker_proj_size))
    self.speaker_emb_scale = speaker_emb_scale  # scalar x spk_vector
    # speaker_integrate_fn is applied before the non-linearity on the decoder layer
    self.speaker_integrate_fn = speaker_integrate_fn
    # speaker projection MLP #
    _tmp = []
    prev_size = speaker_emb_dim
    for ii in range(len(self.speaker_proj_size)):
        _tmp.append(nn.Linear(prev_size, self.speaker_proj_size[ii]))
        _tmp.append(generator_act_module(self.speaker_proj_fn))
        _tmp.append(nn.Dropout(self.speaker_proj_do[ii]))
        prev_size = self.speaker_proj_size[ii]
    self.speaker_proj_lyr = nn.Sequential(*_tmp)
    self.speaker_module_lyr = nn.Module()
    # speaker proj -> decoder prenet (last layer) #
    self.speaker_module_lyr.add_module(
        'dec_proj_prenet_lyr',
        nn.Linear(prev_size, self.dec_prenet_lyr[-1].out_features))
    # speaker proj -> decoder regression core (first layer) #
    assert len(self.dec_core_gen_lyr) >= 1, \
        "dec_core_gen_lyr must have at least 1 layer"
    self.speaker_module_lyr.add_module(
        'dec_proj_core_gen_lyr',
        nn.Linear(prev_size, self.dec_core_gen_lyr[0].out_features))
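# Integration sketch (hedged; the actual wiring lives in a forward pass not
# shown here): the projected speaker vector is mapped by the per-site linear
# layers above and combined with the matching decoder activations, scaled by
# speaker_emb_scale, before the non-linearity, e.g.
#
#   spk_h = self.speaker_proj_lyr(spk_emb)               # (batch, proj_size)
#   prenet_h = prenet_h + self.speaker_emb_scale * \
#       self.speaker_module_lyr.dec_proj_prenet_lyr(spk_h)
#
# How the two streams are merged is governed by speaker_integrate_fn; plain
# addition above is only an assumption.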
def __init__(self,
             dec_bern_end_size=[256],
             dec_bern_end_fn='Tanh',
             dec_bern_end_do=0.0,
             *args, **kwargs):
    super(TACOTRONBernoulliEnd, self).__init__(*args, **kwargs)
    self.dec_bern_end_size = dec_bern_end_size
    self.dec_bern_end_fn = dec_bern_end_fn
    # broadcast a scalar dropout across all layers #
    self.dec_bern_end_do = ConfigParser.list_parser(
        dec_bern_end_do, n=len(dec_bern_end_size))
    # p(t = frame stop | dec_hid[t], y[t]) #
    _tmp = []
    prev_size = self.dec_att_lyr.output_size + self.dec_out_size
    for ii in range(len(dec_bern_end_size)):
        _tmp.append(nn.Linear(prev_size, self.dec_bern_end_size[ii]))
        _tmp.append(generator_act_module(self.dec_bern_end_fn))
        _tmp.append(nn.Dropout(p=self.dec_bern_end_do[ii]))
        prev_size = self.dec_bern_end_size[ii]
    _tmp.append(nn.Linear(prev_size, 1))
    # output is a logit, not transformed with sigmoid #
    self.dec_bernoulli_end_lyr = nn.Sequential(*_tmp)
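# Usage sketch (hedged): at each decoder step the stop probability is read
# from the concatenated attention-decoder state and the emitted frame; since
# the layer returns a logit, training would pair it with a logit-based BCE:
#
#   stop_logit = self.dec_bernoulli_end_lyr(
#       torch.cat([dec_hid_t, y_t], dim=-1))        # (batch, 1)
#   loss = F.binary_cross_entropy_with_logits(stop_logit, stop_label)
#
# (torch.cat / F.binary_cross_entropy_with_logits usage is an assumption;
# the real training loop is outside this snippet.)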
def __init__(self,
             enc_in_size,
             dec_in_size,
             dec_out_size,
             enc_emb_size=256,
             enc_emb_do=0.0,
             enc_prenet_size=[256, 128],
             enc_prenet_do=[0.5, 0.5],
             enc_prenet_fn='leaky_relu',
             dec_prenet_size=[256, 128],
             dec_prenet_do=[0.5, 0.5],
             dec_prenet_fn='leaky_relu',
             dec_rnn_sizes=[256, 256],
             dec_rnn_cfgs={"type": "lstm"},
             dec_rnn_do=0.0,
             dec_cfg={"type": "standard_decoder"},
             att_cfg={"type": "mlp"},
             dec_core_gen_size=[512],
             dec_core_gen_fn='leaky_relu',
             dec_core_gen_do=0.0,
             # CBHG #
             enc_cbhg_cfg={},
             # FRAME ENDING #
             dec_bern_end_size=[256],
             dec_bern_end_fn='LeakyReLU',
             dec_bern_end_do=0.0,
             # OPTIONAL #
             dec_in_range=None):
    """
    Args:
        enc_in_size : size of vocab
        dec_in_size : input (mel) dim size
        dec_out_size : output (mel) dim size (usually same as dec_in_size)
        dec_in_range : pair of integers [x, y] in [0, dec_in_size]; all dims
            outside this range are masked to 0. In the Tacotron paper, only
            the last time-step of each group is used instead of the whole group.
    """
    super().__init__()
    self.enc_in_size = enc_in_size
    self.dec_in_size = dec_in_size
    self.dec_out_size = dec_out_size  # mel spec dim size
    self.enc_emb_size = enc_emb_size
    self.enc_emb_do = enc_emb_do
    self.enc_prenet_size = enc_prenet_size
    self.enc_prenet_do = ConfigParser.list_parser(enc_prenet_do,
                                                  len(enc_prenet_size))
    self.enc_prenet_fn = enc_prenet_fn
    self.dec_prenet_size = dec_prenet_size
    self.dec_prenet_do = ConfigParser.list_parser(dec_prenet_do,
                                                  len(dec_prenet_size))
    self.dec_prenet_fn = dec_prenet_fn
    self.dec_rnn_sizes = dec_rnn_sizes
    self.dec_rnn_cfgs = dec_rnn_cfgs
    self.dec_rnn_do = dec_rnn_do
    self.dec_core_gen_size = dec_core_gen_size
    self.dec_core_gen_fn = dec_core_gen_fn
    self.dec_core_gen_do = ConfigParser.list_parser(dec_core_gen_do,
                                                    len(dec_core_gen_size))
    self.dec_cfg = dec_cfg
    self.att_cfg = att_cfg
    # FRAME ENDING #
    self.dec_bern_end_size = dec_bern_end_size
    self.dec_bern_end_fn = dec_bern_end_fn
    self.dec_bern_end_do = ConfigParser.list_parser(dec_bern_end_do,
                                                    n=len(dec_bern_end_size))
    # OPTIONAL #
    self.dec_in_range = dec_in_range
    if self.dec_in_range is not None:
        assert isinstance(self.dec_in_range, (list, tuple)) \
            and len(self.dec_in_range) == 2
    # CBHG config #
    self.enc_cbhg_cfg = ConfigParser.item_parser(enc_cbhg_cfg)

    self.enc_emb_lyr = nn.Embedding(enc_in_size, enc_emb_size)
    # init enc prenet #
    self.enc_prenet_lyr = nn.ModuleList()
    prev_size = enc_emb_size
    for ii in range(len(self.enc_prenet_size)):
        self.enc_prenet_lyr.append(nn.Linear(prev_size, enc_prenet_size[ii]))
        prev_size = enc_prenet_size[ii]
    # init enc middle #
    self.enc_core_lyr = cbhg.CBHG1d(prev_size, **enc_cbhg_cfg)
    # init dec prenet #
    self.dec_prenet_lyr = nn.ModuleList()
    prev_size = dec_in_size if self.dec_in_range is None else (
        (self.dec_in_range[-1] or 0) - (self.dec_in_range[-2] or 0))
    for ii in range(len(self.dec_prenet_size)):
        self.dec_prenet_lyr.append(nn.Linear(prev_size, dec_prenet_size[ii]))
        prev_size = dec_prenet_size[ii]
    # init dec rnn: force stateful cell variants #
    _dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs, len(dec_rnn_sizes))
    for ii in range(len(dec_rnn_sizes)):
        _type = _dec_rnn_cfgs[ii]['type']
        if re.match('stateful.*cell', _type) is None:
            _dec_rnn_cfgs[ii]['type'] = 'stateful_{}cell'.format(_type)
    # TODO : dec_cfg #
    final_enc_size = self.enc_core_lyr.output_size
    # pass the stateful-cell configs (_dec_rnn_cfgs), not the raw dec_rnn_cfgs #
    self.dec_att_lyr = decoder.StandardDecoder(att_cfg, final_enc_size,
                                               prev_size, dec_rnn_sizes,
                                               _dec_rnn_cfgs, dec_rnn_do)
    # init decoder melspec generator #
    prev_size = self.dec_att_lyr.output_size
    self.dec_core_gen_lyr = nn.ModuleList()
    for ii in range(len(self.dec_core_gen_size)):
        self.dec_core_gen_lyr.append(
            nn.Linear(prev_size, self.dec_core_gen_size[ii]))
        prev_size = self.dec_core_gen_size[ii]
    self.dec_core_gen_lyr.append(nn.Linear(prev_size, self.dec_out_size))
    # init decoder frame-ending predictor #
    # p(t = STOP | dec_hid[t], y[t]) #
    _tmp = []
    prev_size = self.dec_att_lyr.output_size + self.dec_out_size
    for ii in range(len(dec_bern_end_size)):
        _tmp.append(nn.Linear(prev_size, self.dec_bern_end_size[ii]))
        _tmp.append(generator_act_module(self.dec_bern_end_fn))
        _tmp.append(nn.Dropout(p=self.dec_bern_end_do[ii]))
        prev_size = self.dec_bern_end_size[ii]
    _tmp.append(nn.Linear(prev_size, 1))
    # output is a logit, not transformed with sigmoid #
    self.dec_bernoulli_end_lyr = nn.Sequential(*_tmp)
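# Instantiation sketch (hedged; the class name `TACOTRON` is inferred from the
# TACOTRONBernoulliEnd subclass above, and `char_vocab` is hypothetical):
#
#   model = TACOTRON(enc_in_size=len(char_vocab),  # text vocab size
#                    dec_in_size=80,               # mel bins fed back
#                    dec_out_size=80,              # mel bins predicted
#                    dec_in_range=[0, 80])         # optional input-dim mask
#
# With dec_in_range=[x, y], only y - x input dims reach the decoder prenet,
# which mirrors the paper's trick of conditioning on part of the last group.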
def __init__(self,
             enc_in_size,
             dec_in_size,
             dec_out_size,
             enc_emb_size=256,
             enc_emb_do=0.0,
             enc_conv_sizes=[5, 5, 5],
             enc_conv_filter=[256, 256, 256],
             enc_conv_do=0.25,
             enc_conv_fn='LeakyReLU',
             enc_rnn_sizes=[256],
             enc_rnn_cfgs={"type": "lstm", "bi": True},
             enc_rnn_do=0.2,
             dec_prenet_size=[256, 256],
             dec_prenet_fn='leaky_relu',
             dec_prenet_do=0.25,
             dec_rnn_sizes=[512, 512],
             dec_rnn_cfgs={"type": "lstm"},
             dec_rnn_do=0.2,
             dec_proj_size=[512, 512],
             dec_proj_fn='leaky_relu',
             dec_proj_do=0.25,
             dec_bern_end_size=[256],
             dec_bern_end_do=0.0,
             dec_bern_end_fn='LeakyReLU',
             dec_cfg={"type": "standard_decoder"},
             # location-sensitive attention #
             att_cfg={"type": "mlp_history",
                      "kwargs": {"history_conv_ksize": [2, 4, 8]}},
             # OPTIONAL #
             dec_in_range=None,
             use_bn=False,  # Tacotron V2 activates BatchNorm by default
             use_ln=False,  # layer normalization on feedforward layers
             ):
    """
    Tacotron V2

    The decoder generates two outputs (mel + linear spec); the main output
    is used as the conditional input for the next step.

    Args:
        enc_in_size : size of vocab
        dec_in_size : input (mel) dim size
        dec_out_size : output (mel/linear) dim size (usually same as dec_in_size)
        dec_in_range : pair of integers [x, y] in [0, dec_in_size]; all dims
            outside this range are masked to 0. In the Tacotron paper, only
            the last time-step of each group is used instead of the whole group.
    """
    super().__init__()
    self.enc_in_size = enc_in_size
    self.dec_in_size = dec_in_size
    self.dec_out_size = dec_out_size  # output projection -> mel/linear spec #
    self.enc_emb_size = enc_emb_size
    self.enc_emb_do = enc_emb_do
    self.enc_conv_sizes = enc_conv_sizes
    self.enc_conv_filter = enc_conv_filter
    self.enc_conv_do = ConfigParser.list_parser(enc_conv_do,
                                                len(enc_conv_sizes))
    self.enc_conv_fn = enc_conv_fn
    self.enc_rnn_sizes = enc_rnn_sizes
    self.enc_rnn_do = ConfigParser.list_parser(enc_rnn_do, len(enc_rnn_sizes))
    self.enc_rnn_cfgs = ConfigParser.list_parser(enc_rnn_cfgs,
                                                 len(enc_rnn_sizes))
    self.dec_prenet_size = dec_prenet_size
    self.dec_prenet_do = ConfigParser.list_parser(dec_prenet_do,
                                                  len(dec_prenet_size))
    self.dec_prenet_fn = dec_prenet_fn
    self.dec_rnn_sizes = dec_rnn_sizes
    self.dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs,
                                                 len(dec_rnn_sizes))
    self.dec_rnn_do = ConfigParser.list_parser(dec_rnn_do, len(dec_rnn_sizes))
    self.dec_proj_size = dec_proj_size
    self.dec_proj_fn = dec_proj_fn
    self.dec_proj_do = ConfigParser.list_parser(dec_proj_do,
                                                len(dec_proj_size))
    self.dec_bern_end_size = dec_bern_end_size
    self.dec_bern_end_do = ConfigParser.list_parser(dec_bern_end_do,
                                                    len(dec_bern_end_size))
    self.dec_bern_end_fn = dec_bern_end_fn
    self.dec_cfg = dec_cfg
    self.att_cfg = att_cfg
    self.use_bn = use_bn
    self.use_ln = use_ln
    if use_ln:
        raise ValueError("Layer normalization is not supported yet!")
    # OPTIONAL #
    self.dec_in_range = dec_in_range
    if self.dec_in_range is not None:
        assert isinstance(self.dec_in_range, (list, tuple)) \
            and len(self.dec_in_range) == 2

    # init emb layer #
    self.enc_emb_lyr = nn.Embedding(enc_in_size, enc_emb_size)
    # init enc conv #
    _tmp = []
    prev_size = enc_emb_size
    for ii in range(len(self.enc_conv_sizes)):
        _tmp.append(Conv1dEv(prev_size, self.enc_conv_filter[ii],
                             self.enc_conv_sizes[ii], padding='same'))
        _tmp.append(generator_act_module(self.enc_conv_fn))
        if self.use_bn:
            _tmp.append(nn.BatchNorm1d(self.enc_conv_filter[ii]))
        _tmp.append(nn.Dropout(p=self.enc_conv_do[ii]))
        prev_size = self.enc_conv_filter[ii]
    self.enc_conv_lyr = nn.Sequential(*_tmp)
    # init enc rnn #
    self.enc_rnn_lyr = nn.ModuleList()
    _enc_rnn_cfgs = ConfigParser.list_parser(enc_rnn_cfgs, len(enc_rnn_sizes))
    for ii in range(len(self.enc_rnn_sizes)):
        _rnn_cfg = {}
        _rnn_cfg['type'] = _enc_rnn_cfgs[ii]['type']
        # args: input_size, hidden_size, num_layers, bias, batch_first,
        # dropout, bidirectional #
        _rnn_cfg['args'] = [prev_size, enc_rnn_sizes[ii], 1, True, True, 0,
                            _enc_rnn_cfgs[ii]['bi']]
        self.enc_rnn_lyr.append(generator_rnn(_rnn_cfg))
        # account for the doubled feature size of bidirectional layers #
        prev_size = enc_rnn_sizes[ii] * (2 if _enc_rnn_cfgs[ii]['bi'] else 1)
    # init dec prenet #
    _tmp = []
    prev_size = dec_in_size if self.dec_in_range is None else (
        (self.dec_in_range[-1] or 0) - (self.dec_in_range[-2] or 0))
    for ii in range(len(self.dec_prenet_size)):
        _tmp.append(nn.Linear(prev_size, self.dec_prenet_size[ii]))
        prev_size = self.dec_prenet_size[ii]
    self.dec_prenet_lyr = nn.ModuleList(_tmp)
    # init dec rnn: force stateful cell variants #
    _dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs, len(dec_rnn_sizes))
    for ii in range(len(dec_rnn_sizes)):
        _type = _dec_rnn_cfgs[ii]['type']
        if re.match('stateful.*cell', _type) is None:
            _dec_rnn_cfgs[ii]['type'] = 'stateful_{}cell'.format(_type)
    final_enc_size = self.enc_rnn_lyr[-1].hidden_size * (
        2 if self.enc_rnn_lyr[-1].bidirectional else 1)
    assert 'type' in dec_cfg, "decoder type needs to be defined"
    if dec_cfg['type'] == 'standard_decoder':
        _tmp_dec_cfg = dict(dec_cfg)
        del _tmp_dec_cfg['type']
        # pass the stateful-cell configs (_dec_rnn_cfgs), not the raw dec_rnn_cfgs #
        self.dec_att_lyr = decoder.StandardDecoder(att_cfg=att_cfg,
                                                   ctx_size=final_enc_size,
                                                   in_size=prev_size,
                                                   rnn_sizes=dec_rnn_sizes,
                                                   rnn_cfgs=_dec_rnn_cfgs,
                                                   rnn_do=dec_rnn_do,
                                                   **_tmp_dec_cfg)
    else:
        raise NotImplementedError(
            "decoder type {} is not found".format(dec_cfg['type']))
    # init dec lin proj -> mel/linear spec #
    prev_size = self.dec_att_lyr.out_features
    _tmp = []
    for ii in range(len(self.dec_proj_size)):
        _tmp.append(nn.Linear(prev_size, self.dec_proj_size[ii]))
        prev_size = self.dec_proj_size[ii]
    _tmp.append(nn.Linear(prev_size, self.dec_out_size))
    self.dec_proj_lyr = nn.ModuleList(_tmp)
    # init dec bern end layer #
    _tmp = []
    prev_size = self.dec_out_size + self.dec_att_lyr.out_features + (
        self.enc_rnn_lyr[-1].hidden_size *
        (2 if self.enc_rnn_lyr[-1].bidirectional else 1))
    for ii in range(len(self.dec_bern_end_size)):
        _tmp.append(nn.Linear(prev_size, self.dec_bern_end_size[ii]))
        _tmp.append(generator_act_module(dec_bern_end_fn))
        _tmp.append(nn.Dropout(self.dec_bern_end_do[ii]))
        prev_size = self.dec_bern_end_size[ii]
    _tmp.append(nn.Linear(prev_size, 1))  # output is a logit #
    self.dec_bern_end_lyr = nn.Sequential(*_tmp)
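# Instantiation sketch (hedged; the class name `TACOTRONV2` is assumed). The
# default mlp_history attention approximates the location-sensitive attention
# of Tacotron 2:
#
#   model = TACOTRONV2(enc_in_size=len(char_vocab),   # char_vocab hypothetical
#                      dec_in_size=80, dec_out_size=80,
#                      use_bn=True)   # paper-style BatchNorm on enc convs
#
# Note the stop predictor here also sees the final encoder context (its input
# size is dec_out + decoder features + encoder size), unlike the V1 variant.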
def __init__(self,
             enc_in_size,
             dec_in_size,
             dec_out_size,
             enc_fnn_sizes=[512],
             enc_fnn_act='LeakyReLU',
             enc_fnn_do=0.25,
             enc_rnn_sizes=[256, 256, 256],
             enc_rnn_cfgs={"type": "lstm", "bi": True},
             enc_rnn_do=0.25,
             downsampling=[False, True, True],
             dec_emb_size=256,
             dec_emb_do=0.25,
             # tie the char/word embedding weight with the softmax layer #
             dec_emb_tied_weight=True,
             dec_rnn_sizes=[512, 512],
             dec_rnn_cfgs={"type": "lstm"},
             dec_rnn_do=0.25,
             dec_cfg={"type": "standard_decoder"},
             att_cfg={"type": "mlp"},
             use_layernorm=False):
    super().__init__()
    self.enc_in_size = enc_in_size
    self.dec_in_size = dec_in_size
    self.dec_out_size = dec_out_size
    self.enc_fnn_sizes = enc_fnn_sizes
    self.enc_fnn_act = enc_fnn_act
    self.enc_fnn_do = ConfigParser.list_parser(enc_fnn_do, len(enc_fnn_sizes))
    self.enc_rnn_sizes = enc_rnn_sizes
    self.enc_rnn_cfgs = enc_rnn_cfgs
    self.enc_rnn_do = ConfigParser.list_parser(enc_rnn_do, len(enc_rnn_sizes))
    self.downsampling = ConfigParser.list_parser(downsampling,
                                                 len(enc_rnn_sizes))
    self.dec_emb_size = dec_emb_size
    self.dec_emb_do = dec_emb_do
    self.dec_emb_tied_weight = dec_emb_tied_weight
    self.dec_rnn_sizes = dec_rnn_sizes
    self.dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs,
                                                 len(dec_rnn_sizes))
    self.dec_rnn_do = ConfigParser.list_parser(dec_rnn_do, len(dec_rnn_sizes))
    self.dec_cfg = dec_cfg
    self.att_cfg = att_cfg
    self.use_layernorm = use_layernorm
    if self.use_layernorm:
        raise ValueError("LayerNorm is not implemented yet")

    # modules #
    # init encoder #
    prev_size = enc_in_size
    _tmp = []
    for ii in range(len(enc_fnn_sizes)):
        _tmp.append(nn.Linear(prev_size, enc_fnn_sizes[ii]))
        if use_layernorm:
            _tmp.append(LayerNorm(enc_fnn_sizes[ii]))
        _tmp.append(generator_act_module(enc_fnn_act))
        _tmp.append(nn.Dropout(p=self.enc_fnn_do[ii]))
        prev_size = enc_fnn_sizes[ii]
    self.enc_fnn_lyr = nn.Sequential(*_tmp)
    self.enc_rnn_lyr = nn.ModuleList()
    _enc_rnn_cfgs = ConfigParser.list_parser(enc_rnn_cfgs, len(enc_rnn_sizes))
    for ii in range(len(enc_rnn_sizes)):
        _rnn_cfg = {}
        _rnn_cfg['type'] = _enc_rnn_cfgs[ii]['type']
        # args: input_size, hidden_size, num_layers, bias, batch_first,
        # dropout, bidirectional #
        _rnn_cfg['args'] = [prev_size, enc_rnn_sizes[ii], 1, True, True, 0,
                            _enc_rnn_cfgs[ii]['bi']]
        self.enc_rnn_lyr.append(generator_rnn(_rnn_cfg))
        prev_size = enc_rnn_sizes[ii] * (2 if _enc_rnn_cfgs[ii]['bi'] else 1)
    final_enc_size = prev_size
    # init decoder #
    self.dec_emb_lyr = nn.Embedding(self.dec_in_size, dec_emb_size,
                                    padding_idx=None)
    prev_size = dec_emb_size
    # force stateful cell variants for the decoder rnn #
    _dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs, len(dec_rnn_sizes))
    for ii in range(len(dec_rnn_sizes)):
        _type = _dec_rnn_cfgs[ii]['type']
        if re.match('stateful.*cell', _type) is None:
            _dec_rnn_cfgs[ii]['type'] = 'stateful_{}cell'.format(_type)
    # TODO : dec_cfg #
    assert 'type' in dec_cfg, "decoder type needs to be defined"
    if dec_cfg['type'] == 'standard_decoder':
        _tmp_dec_cfg = dict(dec_cfg)
        del _tmp_dec_cfg['type']
        self.dec_att_lyr = decoder.StandardDecoder(att_cfg=att_cfg,
                                                   ctx_size=final_enc_size,
                                                   in_size=dec_emb_size,
                                                   rnn_sizes=dec_rnn_sizes,
                                                   rnn_cfgs=_dec_rnn_cfgs,
                                                   rnn_do=dec_rnn_do,
                                                   **_tmp_dec_cfg)
    else:
        raise NotImplementedError(
            "decoder type {} is not found".format(dec_cfg['type']))
    self.dec_presoftmax_lyr = nn.Linear(self.dec_att_lyr.output_size,
                                        dec_out_size)
    if dec_emb_tied_weight:
        # weight tying requires matching vocab sizes and matching dims #
        assert dec_out_size == dec_in_size \
            and self.dec_emb_lyr.embedding_dim == self.dec_presoftmax_lyr.in_features
        self.dec_presoftmax_lyr.weight = self.dec_emb_lyr.weight
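# Weight-tying sketch (hedged): with dec_emb_tied_weight=True the presoftmax
# Linear reuses the Embedding matrix, so the output and input vocab sizes
# (and the embedding / presoftmax dims) must match:
#
#   model = Seq2SeqAttSpeechRecognizer(   # class name is hypothetical
#       enc_in_size=120,                  # e.g. stacked filterbank features
#       dec_in_size=len(char_vocab),      # char_vocab hypothetical
#       dec_out_size=len(char_vocab))
#   assert model.dec_presoftmax_lyr.weight is model.dec_emb_lyr.weight
#
# Tying halves the embedding/softmax parameter count and typically acts as a
# mild regularizer.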