def __init__(self, size, self_attn, src_attn, feed_forward, dropout_rate,
             normalize_before=True, concat_after=False):
    super(DecoderLayer, self).__init__()
    self.size = size
    self.self_attn = self_attn
    self.src_attn = src_attn
    self.feed_forward = feed_forward
    # One LayerNorm per sub-layer: self-attention, source attention, feed-forward.
    self.norm1 = LayerNorm(size)
    self.norm2 = LayerNorm(size)
    self.norm3 = LayerNorm(size)
    self.dropout = nn.Dropout(dropout_rate)
    self.normalize_before = normalize_before
    self.concat_after = concat_after
    if self.concat_after:
        # Projections applied to the concatenation of each attention's input and output.
        self.concat_linear1 = nn.Linear(size + size, size)
        self.concat_linear2 = nn.Linear(size + size, size)
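
# Usage sketch (illustrative, not part of the source): wiring a single decoder
# layer from the sub-modules it expects; the attention and feed-forward
# constructors follow the same signatures used by the transformer decoder below.
# With concat_after=True the two concat_linear projections are instantiated.
def _example_decoder_layer():
    size = 256
    return DecoderLayer(
        size,
        MultiHeadedAttention(4, size, 0.0),       # masked self-attention
        MultiHeadedAttention(4, size, 0.0),       # encoder-decoder attention
        PositionwiseFeedForward(size, 1024, 0.1),
        dropout_rate=0.1,
        normalize_before=True,
        concat_after=True,
    )
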
def __init__(self, idim, n_layers=2, n_chans=256, kernel_size=3, dropout_rate=0.1, offset=1.0):
    """Initialize duration predictor module.

    Args:
        idim (int): Input dimension.
        n_layers (int, optional): Number of convolutional layers.
        n_chans (int, optional): Number of channels of convolutional layers.
        kernel_size (int, optional): Kernel size of convolutional layers.
        dropout_rate (float, optional): Dropout rate.
        offset (float, optional): Offset value to avoid nan in log domain.

    """
    super(DurationPredictor, self).__init__()
    self.offset = offset
    self.conv = torch.nn.ModuleList()
    for idx in range(n_layers):
        in_chans = idim if idx == 0 else n_chans
        self.conv += [
            torch.nn.Sequential(
                torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1,
                                padding=(kernel_size - 1) // 2),
                torch.nn.ReLU(),
                LayerNorm(n_chans, dim=1),
                torch.nn.Dropout(dropout_rate),
            )
        ]
    self.linear = torch.nn.Linear(n_chans, 1)
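
# Usage sketch (illustrative, not part of the source): building the duration
# predictor for 384-dimensional encoder outputs and running a dummy batch
# through it. The call `predictor(xs)` and the (batch, Tmax, idim) input layout
# are assumptions about the forward pass, which is not shown in this fragment.
def _example_duration_predictor():
    import torch
    predictor = DurationPredictor(idim=384, n_layers=2, n_chans=256,
                                  kernel_size=3, dropout_rate=0.1)
    xs = torch.randn(2, 50, 384)  # (batch, Tmax, idim) dummy encoder output
    return predictor(xs)          # assumed to return per-token durations in the log domain
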
def __init__(self, odim, attention_dim=256, attention_heads=4, linear_units=1024,
             num_blocks=4, dropout_rate=0.1, positional_dropout_rate=0.1,
             self_attention_dropout_rate=0.0, src_attention_dropout_rate=0.0,
             input_layer="linear", use_output_layer=True,
             pos_enc_class=PositionalEncoding, normalize_before=True, concat_after=False):
    torch.nn.Module.__init__(self)
    if input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(odim, attention_dim),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(odim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    else:
        raise NotImplementedError("only `embed`, `linear`, or torch.nn.Module is supported.")
    self.normalize_before = normalize_before
    # Stack num_blocks decoder layers sharing the same configuration.
    self.decoders = repeat(
        num_blocks,
        lambda: DecoderLayer(
            attention_dim,
            MultiHeadedAttention(attention_heads, attention_dim, self_attention_dropout_rate),
            MultiHeadedAttention(attention_heads, attention_dim, src_attention_dropout_rate),
            PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
            dropout_rate,
            normalize_before,
            concat_after,
        ),
    )
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
    if use_output_layer:
        self.output_layer = torch.nn.Linear(attention_dim, odim)
    else:
        self.output_layer = None
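
# Usage sketch (illustrative, not part of the source): the class name `Decoder`
# is not visible in this fragment and is assumed here. `odim` acts as the input
# feature dimension when input_layer="linear" and as a vocabulary size when
# input_layer="embed"; it is also the width of the optional output projection.
def _example_decoder():
    return Decoder(odim=80, attention_dim=256, attention_heads=4,
                   linear_units=1024, num_blocks=4, input_layer="linear")
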
def __init__(self, idim, n_layers=2, n_chans=256, out=1, kernel_size=3,
             dropout_rate=0.5, offset=1.0):
    """Initialize variance predictor module.

    Same convolutional stack as the duration predictor, but the final linear
    layer projects to `out` dimensions instead of a single one.
    """
    super(VariancePredictor, self).__init__()
    self.offset = offset
    self.conv = torch.nn.ModuleList()
    for idx in range(n_layers):
        in_chans = idim if idx == 0 else n_chans
        self.conv += [
            torch.nn.Sequential(
                torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1,
                                padding=(kernel_size - 1) // 2),
                torch.nn.ReLU(),
                LayerNorm(n_chans, dim=1),
                torch.nn.Dropout(dropout_rate),
            )
        ]
    self.linear = torch.nn.Linear(n_chans, out)
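
# Usage sketch (illustrative, not part of the source): FastSpeech 2-style pitch
# and energy predictors can be two instances of this module with out=1; the
# idim value below is a placeholder for the encoder's output dimension.
def _example_variance_predictors():
    pitch_predictor = VariancePredictor(idim=384, n_layers=2, n_chans=256, out=1)
    energy_predictor = VariancePredictor(idim=384, n_layers=2, n_chans=256, out=1)
    return pitch_predictor, energy_predictor
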
def __init__(self, idim, attention_dim=256, attention_heads=2, linear_units=2048,
             num_blocks=4, dropout_rate=0.1, positional_dropout_rate=0.1,
             attention_dropout_rate=0.0, input_layer="conv2d",
             pos_enc_class=PositionalEncoding, normalize_before=True, concat_after=False,
             positionwise_layer_type="linear", positionwise_conv_kernel_size=1,
             padding_idx=-1):
    super(Encoder, self).__init__()
    if input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(idim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "conv2d":
        self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate)
    elif input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer is None:
        self.embed = torch.nn.Sequential(
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    else:
        raise ValueError("unknown input_layer: " + input_layer)
    self.normalize_before = normalize_before
    # Select the position-wise layer implementation (fully-connected or Conv1d).
    if positionwise_layer_type == "linear":
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (attention_dim, linear_units, dropout_rate)
    elif positionwise_layer_type == "conv1d":
        positionwise_layer = MultiLayeredConv1d
        positionwise_layer_args = (attention_dim, linear_units,
                                   positionwise_conv_kernel_size, dropout_rate)
    else:
        raise NotImplementedError("Support only linear or conv1d.")
    # Stack num_blocks encoder layers sharing the same configuration.
    self.encoders = repeat(
        num_blocks,
        lambda: EncoderLayer(
            attention_dim,
            MultiHeadedAttention(attention_heads, attention_dim, attention_dropout_rate),
            positionwise_layer(*positionwise_layer_args),
            dropout_rate,
            normalize_before,
            concat_after,
        ),
    )
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
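
# Usage sketch (illustrative, not part of the source): with input_layer="embed"
# the encoder consumes token IDs directly (idim is then a vocabulary size),
# which suits text input; the "conv2d" default instead subsamples frame-level
# features through Conv2dSubsampling. The vocabulary size and padding index
# below are placeholders.
def _example_encoder():
    return Encoder(idim=78, attention_dim=256, attention_heads=2,
                   linear_units=2048, num_blocks=4, input_layer="embed",
                   padding_idx=0)
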