Example #1
 def __init__(self, size, self_attn, src_attn, feed_forward, dropout_rate,
              normalize_before=True, concat_after=False):
     super(DecoderLayer, self).__init__()
     self.size = size
     self.self_attn = self_attn
     self.src_attn = src_attn
     self.feed_forward = feed_forward
     self.norm1 = LayerNorm(size)
     self.norm2 = LayerNorm(size)
     self.norm3 = LayerNorm(size)
     self.dropout = nn.Dropout(dropout_rate)
     self.normalize_before = normalize_before
     self.concat_after = concat_after
     if self.concat_after:
         self.concat_linear1 = nn.Linear(size + size, size)
         self.concat_linear2 = nn.Linear(size + size, size)
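A minimal instantiation sketch for the constructor above, not a definitive usage: it assumes ESPnet-style MultiHeadedAttention and PositionwiseFeedForward modules (called with the same signatures as in Example #3) are already in scope, and the hyperparameter values are illustrative only.

# Hypothetical hyperparameters; any attention / feed-forward modules with the
# expected call signatures could be plugged in here.
size = 256
layer = DecoderLayer(
    size,
    MultiHeadedAttention(4, size, 0.0),        # masked self-attention
    MultiHeadedAttention(4, size, 0.0),        # encoder-decoder (source) attention
    PositionwiseFeedForward(size, 1024, 0.1),  # position-wise feed-forward
    dropout_rate=0.1,
)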
Example #2
    def __init__(self,
                 idim,
                 n_layers=2,
                 n_chans=256,
                 kernel_size=3,
                 dropout_rate=0.1,
                 offset=1.0):
        """Initilize duration predictor module.

        Args:
            idim (int): Input dimension.
            n_layers (int, optional): Number of convolutional layers.
            n_chans (int, optional): Number of channels of convolutional layers.
            kernel_size (int, optional): Kernel size of convolutional layers.
            dropout_rate (float, optional): Dropout rate.
            offset (float, optional): Offset value to avoid nan in log domain.

        """
        super(DurationPredictor, self).__init__()
        self.offset = offset
        self.conv = torch.nn.ModuleList()
        for idx in range(n_layers):
            in_chans = idim if idx == 0 else n_chans
            self.conv += [
                torch.nn.Sequential(
                    torch.nn.Conv1d(in_chans,
                                    n_chans,
                                    kernel_size,
                                    stride=1,
                                    padding=(kernel_size - 1) // 2),
                    torch.nn.ReLU(), LayerNorm(n_chans, dim=1),
                    torch.nn.Dropout(dropout_rate))
            ]
        self.linear = torch.nn.Linear(n_chans, 1)
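A shape-level usage sketch, assuming the DurationPredictor class above and the ESPnet-style channel-wise LayerNorm it uses are importable; the tensor sizes are made up for illustration. It shows why the stacked Conv1d blocks need the channel axis in dimension 1.

import torch

# Illustrative sizes: 2 utterances, 50 frames, 384-dim encoder states.
predictor = DurationPredictor(idim=384, n_layers=2, n_chans=256, kernel_size=3)

xs = torch.randn(2, 50, 384)        # (B, Tmax, idim)
hs = xs.transpose(1, -1)            # Conv1d expects (B, idim, Tmax)
for block in predictor.conv:
    hs = block(hs)                  # (B, n_chans, Tmax) after each block
out = predictor.linear(hs.transpose(1, -1))  # (B, Tmax, 1) per-frame prediction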
Example #3
 def __init__(self, odim,
              attention_dim=256,
              attention_heads=4,
              linear_units=1024,
              num_blocks=4,
              dropout_rate=0.1,
              positional_dropout_rate=0.1,
              self_attention_dropout_rate=0.0,
              src_attention_dropout_rate=0.0,
              input_layer="linear",
              use_output_layer=True,
              pos_enc_class=PositionalEncoding,
              normalize_before=True,
              concat_after=False):
     torch.nn.Module.__init__(self)
     if input_layer == "embed":
         self.embed = torch.nn.Sequential(
             torch.nn.Embedding(odim, attention_dim),
             pos_enc_class(attention_dim, positional_dropout_rate)
         )
     elif input_layer == "linear":
         self.embed = torch.nn.Sequential(
             torch.nn.Linear(odim, attention_dim),
             torch.nn.LayerNorm(attention_dim),
             torch.nn.Dropout(dropout_rate),
             torch.nn.ReLU(),
             pos_enc_class(attention_dim, positional_dropout_rate)
         )
     elif isinstance(input_layer, torch.nn.Module):
         self.embed = torch.nn.Sequential(
             input_layer,
             pos_enc_class(attention_dim, positional_dropout_rate)
         )
     else:
          raise NotImplementedError("only `embed`, `linear`, or torch.nn.Module is supported.")
     self.normalize_before = normalize_before
     self.decoders = repeat(
          num_blocks,
         lambda: DecoderLayer(
             attention_dim,
             MultiHeadedAttention(attention_heads, attention_dim, self_attention_dropout_rate),
             MultiHeadedAttention(attention_heads, attention_dim, src_attention_dropout_rate),
             PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
             dropout_rate,
             normalize_before,
             concat_after
         )
     )
     if self.normalize_before:
         self.after_norm = LayerNorm(attention_dim)
     if use_output_layer:
         self.output_layer = torch.nn.Linear(attention_dim, odim)
     else:
         self.output_layer = None
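A minimal instantiation sketch under the assumption that this constructor belongs to an ESPnet-style decoder class (named `Decoder` here purely for illustration) with DecoderLayer, MultiHeadedAttention, PositionwiseFeedForward, PositionalEncoding, LayerNorm, and repeat in scope; the values are illustrative.

# Hypothetical configuration; odim=80 stands for an 80-bin mel-spectrogram target.
decoder = Decoder(
    odim=80,
    attention_dim=256,
    attention_heads=4,
    linear_units=1024,
    num_blocks=4,
    input_layer="linear",    # project 80-dim frames into the attention space
    use_output_layer=True,   # map decoder states back to 80-dim frames
)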
Example #4
 def __init__(self,
              idim,
              n_layers=2,
              n_chans=256,
              out=1,
              kernel_size=3,
              dropout_rate=0.5,
              offset=1.0):
     super(VariancePredictor, self).__init__()
     self.offset = offset
     self.conv = torch.nn.ModuleList()
     for idx in range(n_layers):
         in_chans = idim if idx == 0 else n_chans
         self.conv += [
             torch.nn.Sequential(
                 torch.nn.Conv1d(in_chans,
                                 n_chans,
                                 kernel_size,
                                 stride=1,
                                 padding=(kernel_size - 1) // 2),
                 torch.nn.ReLU(), LayerNorm(n_chans, dim=1),
                 torch.nn.Dropout(dropout_rate))
         ]
     self.linear = torch.nn.Linear(n_chans, out)
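This constructor mirrors the duration predictor in Example #2; the only structural difference is the `out` width of the final linear layer, so the transpose-then-convolve usage shown after Example #2 applies here as well, ending in a (B, Tmax, out) projection. A hypothetical instantiation:

# Hypothetical use: one scalar (e.g., pitch or energy) predicted per frame
# from 384-dim encoder states; `out` > 1 would give multi-dimensional targets.
pitch_predictor = VariancePredictor(idim=384, n_chans=256, out=1, dropout_rate=0.5)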
Example #5
 def __init__(self,
              idim,
              attention_dim=256,
              attention_heads=2,
              linear_units=2048,
              num_blocks=4,
              dropout_rate=0.1,
              positional_dropout_rate=0.1,
              attention_dropout_rate=0.0,
              input_layer="conv2d",
              pos_enc_class=PositionalEncoding,
              normalize_before=True,
              concat_after=False,
              positionwise_layer_type="linear",
              positionwise_conv_kernel_size=1,
              padding_idx=-1):
     super(Encoder, self).__init__()
     if input_layer == "linear":
         self.embed = torch.nn.Sequential(
             torch.nn.Linear(idim, attention_dim),
             torch.nn.LayerNorm(attention_dim),
             torch.nn.Dropout(dropout_rate), torch.nn.ReLU(),
             pos_enc_class(attention_dim, positional_dropout_rate))
     elif input_layer == "conv2d":
         self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate)
     elif input_layer == "embed":
         self.embed = torch.nn.Sequential(
             torch.nn.Embedding(idim,
                                attention_dim,
                                padding_idx=padding_idx),
             pos_enc_class(attention_dim, positional_dropout_rate))
     elif isinstance(input_layer, torch.nn.Module):
         self.embed = torch.nn.Sequential(
             input_layer,
             pos_enc_class(attention_dim, positional_dropout_rate),
         )
     elif input_layer is None:
         self.embed = torch.nn.Sequential(
             pos_enc_class(attention_dim, positional_dropout_rate))
     else:
         raise ValueError("unknown input_layer: " + input_layer)
     self.normalize_before = normalize_before
     if positionwise_layer_type == "linear":
         positionwise_layer = PositionwiseFeedForward
         positionwise_layer_args = (attention_dim, linear_units,
                                    dropout_rate)
     elif positionwise_layer_type == "conv1d":
         positionwise_layer = MultiLayeredConv1d
         positionwise_layer_args = (attention_dim, linear_units,
                                    positionwise_conv_kernel_size,
                                    dropout_rate)
     else:
         raise NotImplementedError("Support only linear or conv1d.")
     self.encoders = repeat(
          num_blocks, lambda: EncoderLayer(
             attention_dim,
             MultiHeadedAttention(attention_heads, attention_dim,
                                  attention_dropout_rate),
             positionwise_layer(*positionwise_layer_args), dropout_rate,
             normalize_before, concat_after))
     if self.normalize_before:
         self.after_norm = LayerNorm(attention_dim)
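A minimal instantiation sketch for the Encoder constructor above, assuming its ESPnet-style dependencies (EncoderLayer, MultiHeadedAttention, PositionwiseFeedForward, MultiLayeredConv1d, Conv2dSubsampling, PositionalEncoding, LayerNorm, repeat) are in scope; the feature dimension and kernel size are illustrative.

# Hypothetical configuration: 83-dim filterbank+pitch input features,
# Conv2d subsampling front-end, conv1d position-wise layers.
encoder = Encoder(
    idim=83,
    attention_dim=256,
    attention_heads=2,
    linear_units=2048,
    num_blocks=4,
    input_layer="conv2d",
    positionwise_layer_type="conv1d",
    positionwise_conv_kernel_size=3,
)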