def __init__(
    self,
    # network structure related
    idim: int,
    odim: int,
    embed_dim: int = 512,
    elayers: int = 1,
    eunits: int = 512,
    econv_layers: int = 3,
    econv_chans: int = 512,
    econv_filts: int = 5,
    atype: str = "location",
    adim: int = 512,
    aconv_chans: int = 32,
    aconv_filts: int = 15,
    cumulate_att_w: bool = True,
    dlayers: int = 2,
    dunits: int = 1024,
    prenet_layers: int = 2,
    prenet_units: int = 256,
    postnet_layers: int = 5,
    postnet_chans: int = 512,
    postnet_filts: int = 5,
    output_activation: str = None,
    use_batch_norm: bool = True,
    use_concate: bool = True,
    use_residual: bool = False,
    reduction_factor: int = 1,
    spk_embed_dim: int = None,
    spk_embed_integration_type: str = "concat",
    use_gst: bool = False,
    gst_tokens: int = 10,
    gst_heads: int = 4,
    gst_conv_layers: int = 6,
    gst_conv_chans_list: Sequence[int] = (32, 32, 64, 64, 128, 128),
    gst_conv_kernel_size: int = 3,
    gst_conv_stride: int = 2,
    gst_gru_layers: int = 1,
    gst_gru_units: int = 128,
    # training related
    dropout_rate: float = 0.5,
    zoneout_rate: float = 0.1,
    use_masking: bool = True,
    use_weighted_masking: bool = False,
    bce_pos_weight: float = 5.0,
    loss_type: str = "L1+L2",
    use_guided_attn_loss: bool = True,
    guided_attn_loss_sigma: float = 0.4,
    guided_attn_loss_lambda: float = 1.0,
):
    """Initialize Tacotron2 module."""
    assert check_argument_types()
    super().__init__()

    # store hyperparameters
    self.idim = idim
    self.odim = odim
    self.eos = idim - 1
    self.spk_embed_dim = spk_embed_dim
    self.cumulate_att_w = cumulate_att_w
    self.reduction_factor = reduction_factor
    self.use_gst = use_gst
    self.use_guided_attn_loss = use_guided_attn_loss
    self.loss_type = loss_type
    if self.spk_embed_dim is not None:
        self.spk_embed_integration_type = spk_embed_integration_type

    # define activation function for the final output
    if output_activation is None:
        self.output_activation_fn = None
    elif hasattr(F, output_activation):
        self.output_activation_fn = getattr(F, output_activation)
    else:
        raise ValueError(
            f"there is no such an activation function. " f"({output_activation})"
        )

    # set padding idx
    padding_idx = 0
    self.padding_idx = padding_idx

    # define network modules
    self.enc = Encoder(
        idim=idim,
        embed_dim=embed_dim,
        elayers=elayers,
        eunits=eunits,
        econv_layers=econv_layers,
        econv_chans=econv_chans,
        econv_filts=econv_filts,
        use_batch_norm=use_batch_norm,
        use_residual=use_residual,
        dropout_rate=dropout_rate,
        padding_idx=padding_idx,
    )

    if self.use_gst:
        self.gst = StyleEncoder(
            idim=odim,  # the input is mel-spectrogram
            gst_tokens=gst_tokens,
            gst_token_dim=eunits,
            gst_heads=gst_heads,
            conv_layers=gst_conv_layers,
            conv_chans_list=gst_conv_chans_list,
            conv_kernel_size=gst_conv_kernel_size,
            conv_stride=gst_conv_stride,
            gru_layers=gst_gru_layers,
            gru_units=gst_gru_units,
        )

    if spk_embed_dim is None:
        dec_idim = eunits
    elif spk_embed_integration_type == "concat":
        dec_idim = eunits + spk_embed_dim
    elif spk_embed_integration_type == "add":
        dec_idim = eunits
        self.projection = torch.nn.Linear(self.spk_embed_dim, eunits)
    else:
        raise ValueError(f"{spk_embed_integration_type} is not supported.")

    if atype == "location":
        att = AttLoc(dec_idim, dunits, adim, aconv_chans, aconv_filts)
    elif atype == "forward":
        att = AttForward(dec_idim, dunits, adim, aconv_chans, aconv_filts)
        if self.cumulate_att_w:
            logging.warning(
                "cumulation of attention weights is disabled in forward attention."
            )
            self.cumulate_att_w = False
    elif atype == "forward_ta":
        att = AttForwardTA(dec_idim, dunits, adim, aconv_chans, aconv_filts, odim)
        if self.cumulate_att_w:
            logging.warning(
                "cumulation of attention weights is disabled in forward attention."
            )
            self.cumulate_att_w = False
    else:
        raise NotImplementedError("Support only location or forward")

    self.dec = Decoder(
        idim=dec_idim,
        odim=odim,
        att=att,
        dlayers=dlayers,
        dunits=dunits,
        prenet_layers=prenet_layers,
        prenet_units=prenet_units,
        postnet_layers=postnet_layers,
        postnet_chans=postnet_chans,
        postnet_filts=postnet_filts,
        output_activation_fn=self.output_activation_fn,
        cumulate_att_w=self.cumulate_att_w,
        use_batch_norm=use_batch_norm,
        use_concate=use_concate,
        dropout_rate=dropout_rate,
        zoneout_rate=zoneout_rate,
        reduction_factor=reduction_factor,
    )
    self.taco2_loss = Tacotron2Loss(
        use_masking=use_masking,
        use_weighted_masking=use_weighted_masking,
        bce_pos_weight=bce_pos_weight,
    )
    if self.use_guided_attn_loss:
        self.attn_loss = GuidedAttentionLoss(
            sigma=guided_attn_loss_sigma,
            alpha=guided_attn_loss_lambda,
        )


def __init__(
    self,
    # network structure related
    idim: int,
    odim: int,
    embed_dim: int = 512,
    elayers: int = 1,
    eunits: int = 512,
    econv_layers: int = 3,
    econv_chans: int = 512,
    econv_filts: int = 5,
    atype: str = "location",
    adim: int = 512,
    aconv_chans: int = 32,
    aconv_filts: int = 15,
    cumulate_att_w: bool = True,
    dlayers: int = 2,
    dunits: int = 1024,
    prenet_layers: int = 2,
    prenet_units: int = 256,
    postnet_layers: int = 5,
    postnet_chans: int = 512,
    postnet_filts: int = 5,
    output_activation: str = None,
    use_batch_norm: bool = True,
    use_concate: bool = True,
    use_residual: bool = False,
    reduction_factor: int = 1,
    # extra embedding related
    spks: int = -1,
    langs: int = -1,
    spk_embed_dim: int = None,
    spk_embed_integration_type: str = "concat",
    use_gst: bool = False,
    gst_tokens: int = 10,
    gst_heads: int = 4,
    gst_conv_layers: int = 6,
    gst_conv_chans_list: Sequence[int] = (32, 32, 64, 64, 128, 128),
    gst_conv_kernel_size: int = 3,
    gst_conv_stride: int = 2,
    gst_gru_layers: int = 1,
    gst_gru_units: int = 128,
    # training related
    dropout_rate: float = 0.5,
    zoneout_rate: float = 0.1,
    use_masking: bool = True,
    use_weighted_masking: bool = False,
    bce_pos_weight: float = 5.0,
    loss_type: str = "L1+L2",
    use_guided_attn_loss: bool = True,
    guided_attn_loss_sigma: float = 0.4,
    guided_attn_loss_lambda: float = 1.0,
):
    """Initialize Tacotron2 module.

    Args:
        idim (int): Dimension of the inputs.
        odim (int): Dimension of the outputs.
        embed_dim (int): Dimension of the token embedding.
        elayers (int): Number of encoder blstm layers.
        eunits (int): Number of encoder blstm units.
        econv_layers (int): Number of encoder conv layers.
        econv_filts (int): Number of encoder conv filter size.
        econv_chans (int): Number of encoder conv filter channels.
        dlayers (int): Number of decoder lstm layers.
        dunits (int): Number of decoder lstm units.
        prenet_layers (int): Number of prenet layers.
        prenet_units (int): Number of prenet units.
        postnet_layers (int): Number of postnet layers.
        postnet_filts (int): Number of postnet filter size.
        postnet_chans (int): Number of postnet filter channels.
        output_activation (str): Name of activation function for outputs.
        adim (int): Number of dimension of mlp in attention.
        aconv_chans (int): Number of attention conv filter channels.
        aconv_filts (int): Number of attention conv filter size.
        cumulate_att_w (bool): Whether to cumulate previous attention weight.
        use_batch_norm (bool): Whether to use batch normalization.
        use_concate (bool): Whether to concat enc outputs w/ dec lstm outputs.
        reduction_factor (int): Reduction factor.
        spks (int): Number of speakers. If set to > 0, speaker ID embedding
            will be used.
        langs (int): Number of languages. If set to > 0, lang ID embedding
            will be used.
        spk_embed_dim (int): Pretrained speaker embedding dimension.
        spk_embed_integration_type (str): How to integrate speaker embedding.
        use_gst (bool): Whether to use global style token.
        gst_tokens (int): Number of GST embeddings.
        gst_heads (int): Number of heads in GST multihead attention.
        gst_conv_layers (int): Number of conv layers in GST.
        gst_conv_chans_list (Sequence[int]): List of the number of channels of
            conv layers in GST.
        gst_conv_kernel_size (int): Kernel size of conv layers in GST.
        gst_conv_stride (int): Stride size of conv layers in GST.
        gst_gru_layers (int): Number of GRU layers in GST.
        gst_gru_units (int): Number of GRU units in GST.
        dropout_rate (float): Dropout rate.
        zoneout_rate (float): Zoneout rate.
        use_masking (bool): Whether to mask padded part in loss calculation.
        use_weighted_masking (bool): Whether to apply weighted masking in
            loss calculation.
        bce_pos_weight (float): Weight of positive sample of stop token
            (only for use_masking=True).
        loss_type (str): Loss function type ("L1", "L2", or "L1+L2").
        use_guided_attn_loss (bool): Whether to use guided attention loss.
        guided_attn_loss_sigma (float): Sigma in guided attention loss.
        guided_attn_loss_lambda (float): Lambda in guided attention loss.

    """
    assert check_argument_types()
    super().__init__()

    # store hyperparameters
    self.idim = idim
    self.odim = odim
    self.eos = idim - 1
    self.spk_embed_dim = spk_embed_dim
    self.cumulate_att_w = cumulate_att_w
    self.reduction_factor = reduction_factor
    self.spks = spks
    self.langs = langs
    self.use_gst = use_gst
    self.use_guided_attn_loss = use_guided_attn_loss
    self.loss_type = loss_type
    if self.spk_embed_dim is not None:
        self.spk_embed_integration_type = spk_embed_integration_type

    # define activation function for the final output
    if output_activation is None:
        self.output_activation_fn = None
    elif hasattr(F, output_activation):
        self.output_activation_fn = getattr(F, output_activation)
    else:
        raise ValueError(
            f"there is no such an activation function. " f"({output_activation})"
        )

    # set padding idx
    padding_idx = 0
    self.padding_idx = padding_idx

    # define network modules
    self.enc = Encoder(
        idim=idim,
        embed_dim=embed_dim,
        elayers=elayers,
        eunits=eunits,
        econv_layers=econv_layers,
        econv_chans=econv_chans,
        econv_filts=econv_filts,
        use_batch_norm=use_batch_norm,
        use_residual=use_residual,
        dropout_rate=dropout_rate,
        padding_idx=padding_idx,
    )

    if self.use_gst:
        self.gst = StyleEncoder(
            idim=odim,  # the input is mel-spectrogram
            gst_tokens=gst_tokens,
            gst_token_dim=eunits,
            gst_heads=gst_heads,
            conv_layers=gst_conv_layers,
            conv_chans_list=gst_conv_chans_list,
            conv_kernel_size=gst_conv_kernel_size,
            conv_stride=gst_conv_stride,
            gru_layers=gst_gru_layers,
            gru_units=gst_gru_units,
        )

    if self.spks > 0:
        self.sid_emb = torch.nn.Embedding(spks, embed_dim)
    if self.langs > 0:
        self.lid_emb = torch.nn.Embedding(langs, embed_dim)

    if spk_embed_dim is None:
        dec_idim = eunits
    elif spk_embed_integration_type == "concat":
        dec_idim = eunits + spk_embed_dim
    elif spk_embed_integration_type == "add":
        dec_idim = eunits
        self.projection = torch.nn.Linear(self.spk_embed_dim, eunits)
    else:
        raise ValueError(f"{spk_embed_integration_type} is not supported.")

    if atype == "location":
        att = AttLoc(dec_idim, dunits, adim, aconv_chans, aconv_filts)
    elif atype == "forward":
        att = AttForward(dec_idim, dunits, adim, aconv_chans, aconv_filts)
        if self.cumulate_att_w:
            logging.warning(
                "cumulation of attention weights is disabled in forward attention."
            )
            self.cumulate_att_w = False
    elif atype == "forward_ta":
        att = AttForwardTA(dec_idim, dunits, adim, aconv_chans, aconv_filts, odim)
        if self.cumulate_att_w:
            logging.warning(
                "cumulation of attention weights is disabled in forward attention."
            )
            self.cumulate_att_w = False
    else:
        raise NotImplementedError("Support only location or forward")

    self.dec = Decoder(
        idim=dec_idim,
        odim=odim,
        att=att,
        dlayers=dlayers,
        dunits=dunits,
        prenet_layers=prenet_layers,
        prenet_units=prenet_units,
        postnet_layers=postnet_layers,
        postnet_chans=postnet_chans,
        postnet_filts=postnet_filts,
        output_activation_fn=self.output_activation_fn,
        cumulate_att_w=self.cumulate_att_w,
        use_batch_norm=use_batch_norm,
        use_concate=use_concate,
        dropout_rate=dropout_rate,
        zoneout_rate=zoneout_rate,
        reduction_factor=reduction_factor,
    )
    self.taco2_loss = Tacotron2Loss(
        use_masking=use_masking,
        use_weighted_masking=use_weighted_masking,
        bce_pos_weight=bce_pos_weight,
    )
    if self.use_guided_attn_loss:
        self.attn_loss = GuidedAttentionLoss(
            sigma=guided_attn_loss_sigma,
            alpha=guided_attn_loss_lambda,
        )
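

# ---------------------------------------------------------------------------
# Sketch (hypothetical helper, not taken from this file): how a pretrained
# speaker embedding is typically folded into the encoder outputs ``hs`` of
# shape (B, Tmax, eunits). The real integration happens in the class's
# forward pass, which is not shown here; the "add" branch uses the
# ``projection`` layer created above, and the "concat" branch is why
# ``dec_idim`` becomes ``eunits + spk_embed_dim``.
def _integrate_spk_embed_sketch(hs, spembs, projection, integration_type="concat"):
    """Combine encoder outputs with a speaker embedding (illustrative only)."""
    import torch
    import torch.nn.functional as F

    if integration_type == "add":
        # project (B, spk_embed_dim) -> (B, eunits) and add it to every frame
        return hs + projection(F.normalize(spembs)).unsqueeze(1)
    # "concat": broadcast over time and append along the feature axis
    spembs = F.normalize(spembs).unsqueeze(1).expand(-1, hs.size(1), -1)
    return torch.cat([hs, spembs], dim=-1)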


def __init__(self, idim, odim, args=None):
    """Initialize Tacotron2 module.

    Args:
        idim (int): Dimension of the inputs.
        odim (int): Dimension of the outputs.
        args (Namespace, optional):
            - spk_embed_dim (int): Dimension of the speaker embedding.
            - elayers (int): The number of encoder blstm layers.
            - eunits (int): The number of encoder blstm units.
            - econv_layers (int): The number of encoder conv layers.
            - econv_filts (int): The number of encoder conv filter size.
            - econv_chans (int): The number of encoder conv filter channels.
            - dlayers (int): The number of decoder lstm layers.
            - dunits (int): The number of decoder lstm units.
            - prenet_layers (int): The number of prenet layers.
            - prenet_units (int): The number of prenet units.
            - postnet_layers (int): The number of postnet layers.
            - postnet_filts (int): The number of postnet filter size.
            - postnet_chans (int): The number of postnet filter channels.
            - output_activation (str): The name of activation function for outputs.
            - adim (int): The number of dimension of mlp in attention.
            - aconv_chans (int): The number of attention conv filter channels.
            - aconv_filts (int): The number of attention conv filter size.
            - cumulate_att_w (bool): Whether to cumulate previous attention weight.
            - use_batch_norm (bool): Whether to use batch normalization.
            - use_concate (bool): Whether to concatenate encoder embedding
              with decoder lstm outputs.
            - dropout_rate (float): Dropout rate.
            - zoneout_rate (float): Zoneout rate.
            - reduction_factor (int): Reduction factor.
            - spk_embed_dim (int): Number of speaker embedding dimensions.
            - spc_dim (int): Number of spectrogram embedding dimensions
              (only for use_cbhg=True).
            - use_cbhg (bool): Whether to use CBHG module.
            - cbhg_conv_bank_layers (int): The number of convolutional banks in CBHG.
            - cbhg_conv_bank_chans (int): The number of channels of convolutional
              bank in CBHG.
            - cbhg_conv_proj_filts (int): The number of filter size of projection
              layer in CBHG.
            - cbhg_conv_proj_chans (int): The number of channels of projection
              layer in CBHG.
            - cbhg_highway_layers (int): The number of layers of highway network
              in CBHG.
            - cbhg_highway_units (int): The number of units of highway network
              in CBHG.
            - cbhg_gru_units (int): The number of units of GRU in CBHG.
            - use_masking (bool): Whether to mask padded part in loss calculation.
            - bce_pos_weight (float): Weight of positive sample of stop token
              (only for use_masking=True).
            - use_guided_attn_loss (bool): Whether to use guided attention loss.
            - guided_attn_loss_sigma (float): Sigma in guided attention loss.
            - guided_attn_loss_lambda (float): Lambda in guided attention loss.

    """
    # initialize base classes
    TTSInterface.__init__(self)
    torch.nn.Module.__init__(self)

    # fill missing arguments
    args = fill_missing_args(args, self.add_arguments)

    # store hyperparameters
    self.idim = idim
    self.odim = odim
    self.adim = args.adim
    self.spk_embed_dim = args.spk_embed_dim
    self.cumulate_att_w = args.cumulate_att_w
    self.reduction_factor = args.reduction_factor
    self.encoder_reduction_factor = args.encoder_reduction_factor
    self.use_cbhg = args.use_cbhg
    self.use_guided_attn_loss = args.use_guided_attn_loss
    self.src_reconstruction_loss_lambda = args.src_reconstruction_loss_lambda
    self.trg_reconstruction_loss_lambda = args.trg_reconstruction_loss_lambda

    # define activation function for the final output
    if args.output_activation is None:
        self.output_activation_fn = None
    elif hasattr(F, args.output_activation):
        self.output_activation_fn = getattr(F, args.output_activation)
    else:
        raise ValueError(
            "there is no such an activation function. (%s)" % args.output_activation
        )

    # define network modules
    self.enc = Encoder(
        idim=idim * args.encoder_reduction_factor,
        input_layer="linear",
        elayers=args.elayers,
        eunits=args.eunits,
        econv_layers=args.econv_layers,
        econv_chans=args.econv_chans,
        econv_filts=args.econv_filts,
        use_batch_norm=args.use_batch_norm,
        use_residual=args.use_residual,
        dropout_rate=args.dropout_rate,
    )
    dec_idim = (
        args.eunits
        if args.spk_embed_dim is None
        else args.eunits + args.spk_embed_dim
    )
    if args.atype == "location":
        att = AttLoc(
            dec_idim, args.dunits, args.adim, args.aconv_chans, args.aconv_filts
        )
    elif args.atype == "forward":
        att = AttForward(
            dec_idim, args.dunits, args.adim, args.aconv_chans, args.aconv_filts
        )
        if self.cumulate_att_w:
            logging.warning(
                "cumulation of attention weights is disabled in forward attention."
            )
            self.cumulate_att_w = False
    elif args.atype == "forward_ta":
        att = AttForwardTA(
            dec_idim,
            args.dunits,
            args.adim,
            args.aconv_chans,
            args.aconv_filts,
            odim,
        )
        if self.cumulate_att_w:
            logging.warning(
                "cumulation of attention weights is disabled in forward attention."
            )
            self.cumulate_att_w = False
    else:
        raise NotImplementedError("Support only location or forward")
    self.dec = Decoder(
        idim=dec_idim,
        odim=odim,
        att=att,
        dlayers=args.dlayers,
        dunits=args.dunits,
        prenet_layers=args.prenet_layers,
        prenet_units=args.prenet_units,
        postnet_layers=args.postnet_layers,
        postnet_chans=args.postnet_chans,
        postnet_filts=args.postnet_filts,
        output_activation_fn=self.output_activation_fn,
        cumulate_att_w=self.cumulate_att_w,
        use_batch_norm=args.use_batch_norm,
        use_concate=args.use_concate,
        dropout_rate=args.dropout_rate,
        zoneout_rate=args.zoneout_rate,
        reduction_factor=args.reduction_factor,
    )
    self.taco2_loss = Tacotron2Loss(
        use_masking=args.use_masking, bce_pos_weight=args.bce_pos_weight
    )
    if self.use_guided_attn_loss:
        self.attn_loss = GuidedAttentionLoss(
            sigma=args.guided_attn_loss_sigma,
            alpha=args.guided_attn_loss_lambda,
        )
    if self.use_cbhg:
        self.cbhg = CBHG(
            idim=odim,
            odim=args.spc_dim,
            conv_bank_layers=args.cbhg_conv_bank_layers,
            conv_bank_chans=args.cbhg_conv_bank_chans,
            conv_proj_filts=args.cbhg_conv_proj_filts,
            conv_proj_chans=args.cbhg_conv_proj_chans,
            highway_layers=args.cbhg_highway_layers,
            highway_units=args.cbhg_highway_units,
            gru_units=args.cbhg_gru_units,
        )
        self.cbhg_loss = CBHGLoss(use_masking=args.use_masking)
    if self.src_reconstruction_loss_lambda > 0:
        self.src_reconstructor = Encoder(
            idim=dec_idim,
            input_layer="linear",
            elayers=args.elayers,
            eunits=args.eunits,
            econv_layers=args.econv_layers,
            econv_chans=args.econv_chans,
            econv_filts=args.econv_filts,
            use_batch_norm=args.use_batch_norm,
            use_residual=args.use_residual,
            dropout_rate=args.dropout_rate,
        )
        self.src_reconstructor_linear = torch.nn.Linear(
            args.econv_chans, idim * args.encoder_reduction_factor
        )
        self.src_reconstruction_loss = CBHGLoss(use_masking=args.use_masking)
    if self.trg_reconstruction_loss_lambda > 0:
        self.trg_reconstructor = Encoder(
            idim=dec_idim,
            input_layer="linear",
            elayers=args.elayers,
            eunits=args.eunits,
            econv_layers=args.econv_layers,
            econv_chans=args.econv_chans,
            econv_filts=args.econv_filts,
            use_batch_norm=args.use_batch_norm,
            use_residual=args.use_residual,
            dropout_rate=args.dropout_rate,
        )
        self.trg_reconstructor_linear = torch.nn.Linear(
            args.econv_chans, odim * args.reduction_factor
        )
        self.trg_reconstruction_loss = CBHGLoss(use_masking=args.use_masking)

    # load pretrained model
    if args.pretrained_model is not None:
        self.load_pretrained_model(args.pretrained_model)
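

# ---------------------------------------------------------------------------
# Sketch (hypothetical helper, not from this file): why the encoder above is
# built with ``idim * args.encoder_reduction_factor``. With an encoder
# reduction factor R, R consecutive input frames are stacked along the
# feature axis, so the encoder sees sequences that are R times shorter but
# R times wider.
def _stack_encoder_frames_sketch(xs, encoder_reduction_factor):
    """Stack every ``encoder_reduction_factor`` frames of (B, Tmax, idim) features."""
    batch_size, tmax, idim = xs.size()
    # drop the tail so the length is divisible by the reduction factor
    tmax = tmax - tmax % encoder_reduction_factor
    xs = xs[:, :tmax]
    return xs.contiguous().view(
        batch_size, tmax // encoder_reduction_factor, idim * encoder_reduction_factor
    )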


def __init__(
    self,
    idim: int,
    odim: int,
    embed_dim: int = 512,
    elayers: int = 1,
    eunits: int = 512,
    econv_layers: int = 3,
    econv_chans: int = 512,
    econv_filts: int = 5,
    atype: str = "location",
    adim: int = 512,
    aconv_chans: int = 32,
    aconv_filts: int = 15,
    cumulate_att_w: bool = True,
    dlayers: int = 2,
    dunits: int = 1024,
    prenet_layers: int = 2,
    prenet_units: int = 256,
    postnet_layers: int = 5,
    postnet_chans: int = 512,
    postnet_filts: int = 5,
    output_activation: str = None,
    use_cbhg: bool = False,
    cbhg_conv_bank_layers: int = 8,
    cbhg_conv_bank_chans: int = 128,
    cbhg_conv_proj_filts: int = 3,
    cbhg_conv_proj_chans: int = 256,
    cbhg_highway_layers: int = 4,
    cbhg_highway_units: int = 128,
    cbhg_gru_units: int = 256,
    use_batch_norm: bool = True,
    use_concate: bool = True,
    use_residual: bool = False,
    dropout_rate: float = 0.5,
    zoneout_rate: float = 0.1,
    reduction_factor: int = 1,
    spk_embed_dim: int = None,
    spc_dim: int = None,
    use_masking: bool = True,
    use_weighted_masking: bool = False,
    bce_pos_weight: float = 5.0,
    use_guided_attn_loss: bool = True,
    guided_attn_loss_sigma: float = 0.4,
    guided_attn_loss_lambda: float = 1.0,
):
    assert check_argument_types()
    super().__init__()

    # store hyperparameters
    self.idim = idim
    self.odim = odim
    self.eos = idim - 1
    self.spk_embed_dim = spk_embed_dim
    self.cumulate_att_w = cumulate_att_w
    self.reduction_factor = reduction_factor
    self.use_cbhg = use_cbhg
    self.use_guided_attn_loss = use_guided_attn_loss

    # define activation function for the final output
    if output_activation is None:
        self.output_activation_fn = None
    elif hasattr(F, output_activation):
        self.output_activation_fn = getattr(F, output_activation)
    else:
        raise ValueError(
            f"there is no such an activation function. " f"({output_activation})"
        )

    # set padding idx
    padding_idx = 0
    self.padding_idx = padding_idx

    # define network modules
    self.enc = Encoder(
        idim=idim,
        embed_dim=embed_dim,
        elayers=elayers,
        eunits=eunits,
        econv_layers=econv_layers,
        econv_chans=econv_chans,
        econv_filts=econv_filts,
        use_batch_norm=use_batch_norm,
        use_residual=use_residual,
        dropout_rate=dropout_rate,
        padding_idx=padding_idx,
    )
    dec_idim = eunits if spk_embed_dim is None else eunits + spk_embed_dim
    if atype == "location":
        att = AttLoc(dec_idim, dunits, adim, aconv_chans, aconv_filts)
    elif atype == "forward":
        att = AttForward(dec_idim, dunits, adim, aconv_chans, aconv_filts)
        if self.cumulate_att_w:
            logging.warning(
                "cumulation of attention weights is disabled in forward attention."
            )
            self.cumulate_att_w = False
    elif atype == "forward_ta":
        att = AttForwardTA(dec_idim, dunits, adim, aconv_chans, aconv_filts, odim)
        if self.cumulate_att_w:
            logging.warning(
                "cumulation of attention weights is disabled in forward attention."
            )
            self.cumulate_att_w = False
    else:
        raise NotImplementedError("Support only location or forward")
    self.dec = Decoder(
        idim=dec_idim,
        odim=odim,
        att=att,
        dlayers=dlayers,
        dunits=dunits,
        prenet_layers=prenet_layers,
        prenet_units=prenet_units,
        postnet_layers=postnet_layers,
        postnet_chans=postnet_chans,
        postnet_filts=postnet_filts,
        output_activation_fn=self.output_activation_fn,
        cumulate_att_w=self.cumulate_att_w,
        use_batch_norm=use_batch_norm,
        use_concate=use_concate,
        dropout_rate=dropout_rate,
        zoneout_rate=zoneout_rate,
        reduction_factor=reduction_factor,
    )
    self.taco2_loss = Tacotron2Loss(
        use_masking=use_masking,
        use_weighted_masking=use_weighted_masking,
        bce_pos_weight=bce_pos_weight,
    )
    if self.use_guided_attn_loss:
        self.attn_loss = GuidedAttentionLoss(
            sigma=guided_attn_loss_sigma,
            alpha=guided_attn_loss_lambda,
        )
    if self.use_cbhg:
        self.cbhg = CBHG(
            idim=odim,
            odim=spc_dim,
            conv_bank_layers=cbhg_conv_bank_layers,
            conv_bank_chans=cbhg_conv_bank_chans,
            conv_proj_filts=cbhg_conv_proj_filts,
            conv_proj_chans=cbhg_conv_proj_chans,
            highway_layers=cbhg_highway_layers,
            highway_units=cbhg_highway_units,
            gru_units=cbhg_gru_units,
        )
        self.cbhg_loss = CBHGLoss(use_masking=use_masking)
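

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original code). When
# ``use_cbhg=True`` the constructor above additionally builds a CBHG module
# that maps the predicted mel-spectrogram (odim) to a linear spectrogram, so
# ``spc_dim`` must then be supplied. The class name and the dimensions below
# are assumptions for illustration.
def _build_example_tacotron2_with_cbhg():
    """Build an example model that also predicts linear spectrograms."""
    return Tacotron2(
        idim=50,  # token vocabulary size
        odim=80,  # mel-spectrogram dimension
        use_cbhg=True,  # enable the CBHG mel-to-linear postnet
        spc_dim=513,  # linear-spectrogram dimension (e.g. n_fft // 2 + 1)
    )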