Example #1
    def __init__(self,
                 in_dim,
                 out_dim,
                 convolutions=((256, 5, 1), ) * 4,
                 dropout=0.1):
        super(Converter, self).__init__()
        self.dropout = dropout
        self.in_dim = in_dim
        self.out_dim = out_dim

        # Non-causal convolutions
        in_channels = convolutions[0][0]
        self.fc1 = Linear(in_dim, in_channels)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()

        Conv1dLayer = Conv1d if has_dilation(convolutions) else ConvTBC
        for (out_channels, kernel_size, dilation) in convolutions:
            pad = (kernel_size - 1) // 2 * dilation
            dilation = (dilation, )
            self.projections.append(
                Linear(in_channels, out_channels
                       ) if in_channels != out_channels else None)
            self.convolutions.append(
                Conv1dLayer(in_channels,
                            out_channels * 2,
                            kernel_size,
                            padding=pad,
                            dilation=dilation,
                            dropout=dropout))
            in_channels = out_channels
        self.fc2 = Linear(in_channels, out_dim)
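The helpers referenced above (Linear, Conv1d, ConvTBC, has_dilation) come from the surrounding project and are not shown here. As a minimal, self-contained sketch of the same projection-plus-gated-convolution pattern in plain torch.nn (the class name TinyConverter and its forward pass are illustrative assumptions, not the project's actual code):

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyConverter(nn.Module):
    # Illustrative stand-in for the Converter above, built from plain torch.nn.
    def __init__(self, in_dim, out_dim, convolutions=((256, 5, 1),) * 4, dropout=0.1):
        super().__init__()
        self.dropout = dropout
        in_channels = convolutions[0][0]
        self.fc1 = nn.Linear(in_dim, in_channels)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        for out_channels, kernel_size, dilation in convolutions:
            pad = (kernel_size - 1) // 2 * dilation          # "same" length for odd kernels
            self.projections.append(
                nn.Linear(in_channels, out_channels)
                if in_channels != out_channels else nn.Identity())
            # doubled output channels: F.glu later splits them into value and gate halves
            self.convolutions.append(
                nn.Conv1d(in_channels, out_channels * 2, kernel_size,
                          padding=pad, dilation=dilation))
            in_channels = out_channels
        self.fc2 = nn.Linear(in_channels, out_dim)

    def forward(self, x):                                    # x: (batch, time, in_dim)
        x = self.fc1(x)
        for proj, conv in zip(self.projections, self.convolutions):
            residual = proj(x)
            y = F.dropout(x, p=self.dropout, training=self.training)
            y = conv(y.transpose(1, 2)).transpose(1, 2)      # convolve over the time axis
            y = F.glu(y, dim=-1)                             # gated linear unit
            x = (y + residual) * math.sqrt(0.5)              # scaled residual connection
        return self.fc2(x)

x = torch.randn(2, 50, 80)                                   # e.g. 80-dim mel frames
print(TinyConverter(80, 513)(x).shape)                       # torch.Size([2, 50, 513])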
Example #2
    def __init__(self, conv_channels, embed_dim, dropout=0.1):
        super(AttentionLayer, self).__init__()
        # projects from output of convolution to embedding dimension
        self.in_projection = Linear(conv_channels, embed_dim)
        # projects from embedding dimension to convolution size
        self.out_projection = Linear(embed_dim, conv_channels)
        self.dropout = dropout
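These two projections map between the decoder's convolution channels and the text-embedding space. A rough, self-contained sketch of how an fconv/DeepVoice3-style attention forward pass would use them, with masking and positional terms omitted (TinyAttention and its forward are assumptions for illustration, not the project's exact code):

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyAttention(nn.Module):
    def __init__(self, conv_channels, embed_dim, dropout=0.1):
        super().__init__()
        self.in_projection = nn.Linear(conv_channels, embed_dim)
        self.out_projection = nn.Linear(embed_dim, conv_channels)
        self.dropout = dropout

    def forward(self, query, keys, values):
        # query: (batch, T_dec, conv_channels); keys/values: (batch, T_enc, embed_dim)
        x = self.in_projection(query)
        scores = torch.bmm(x, keys.transpose(1, 2))          # (batch, T_dec, T_enc)
        attn = F.softmax(scores, dim=-1)
        attn = F.dropout(attn, p=self.dropout, training=self.training)
        x = torch.bmm(attn, values)                          # (batch, T_dec, embed_dim)
        x = x * math.sqrt(values.size(1))                    # fconv-style length scaling
        return self.out_projection(x), attn

attn = TinyAttention(conv_channels=128, embed_dim=256)
out, weights = attn(torch.randn(2, 10, 128),
                    torch.randn(2, 30, 256),
                    torch.randn(2, 30, 256))
print(out.shape, weights.shape)  # torch.Size([2, 10, 128]) torch.Size([2, 10, 30])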
Example #3
    def __init__(
        self,
        conv_layers_before=None,
        input_size=83,
        embed_dim=512,
        convolutions=((512, 3), ) * 20,
        dropout=0.1,
    ):
        super(FConvEncoder, self).__init__(None)  # no src dictionary
        self.dropout = dropout
        self.num_attention_layers = None

        self.conv_layers_before = conv_layers_before
        self.fc0 = Linear(input_size, embed_dim, dropout=dropout) \
            if input_size != embed_dim else None

        convolutions = extend_conv_spec(convolutions)
        in_channels = convolutions[0][0]
        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        self.residuals = []

        layer_in_channels = [in_channels]
        for (out_channels, kernel_size, residual) in convolutions:
            if residual == 0:
                residual_dim = out_channels
            else:
                residual_dim = layer_in_channels[-residual]
            self.projections.append(
                Linear(residual_dim, out_channels
                       ) if residual_dim != out_channels else None)
            if kernel_size % 2 == 1:
                padding = kernel_size // 2
            else:
                padding = 0
            self.convolutions.append(
                ConvTBC(in_channels,
                        out_channels * 2,
                        kernel_size,
                        dropout=dropout,
                        padding=padding))
            self.residuals.append(residual)
            in_channels = out_channels
            layer_in_channels.append(out_channels)
        self.fc2 = Linear(in_channels, embed_dim)
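extend_conv_spec is defined in the surrounding fconv code; it is expected to normalize each 2-tuple (out_channels, kernel_size) into a 3-tuple that also carries the residual distance, defaulting to 1 (residual from the previous layer). A rough equivalent for reference (the exact error handling is an assumption):

def extend_conv_spec(convolutions):
    # Normalize (out_channels, kernel_size[, residual]) specs to 3-tuples.
    extended = []
    for spec in convolutions:
        if len(spec) == 3:
            extended.append(spec)
        elif len(spec) == 2:
            extended.append(spec + (1,))      # default: residual from the previous layer
        else:
            raise ValueError("convolution spec must have 2 or 3 entries: %r" % (spec,))
    return tuple(extended)

print(extend_conv_spec(((512, 3),) * 2))      # ((512, 3, 1), (512, 3, 1))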
Example #4
    def __init__(
            self,
            in_dim,
            out_dim,
            convolutions=((256, 5, 1), ) * 6,
            deconvolutions=((256, 5, 1), ) * 2,  # do upsampling
            dropout=0.1):
        super(Converter, self).__init__()
        self.dropout = dropout
        self.in_dim = in_dim
        self.out_dim = out_dim

        # Non-causal convolutions
        in_channels = convolutions[0][0]
        self.fc1 = Linear(in_dim, in_channels)

        # Convolutions (and transposed convolutions for upsampling)
        self.convolutions = nn.ModuleList()
        self.deconvolutions = nn.ModuleList()
        for idx, (out_channels, kernel_size,
                  dilation) in enumerate(convolutions):
            if idx < len(deconvolutions):
                self.deconvolutions.append(
                    ConvTranspose1d(in_channels,
                                    out_channels,
                                    kernel_size=2,
                                    padding=0,
                                    stride=2))
            pad = (kernel_size - 1) // 2 * dilation
            dilation = (dilation, )
            self.convolutions.append(
                Conv1d(in_channels,
                       out_channels * 2,
                       kernel_size,
                       padding=pad,
                       dilation=dilation,
                       dropout=dropout))
            in_channels = out_channels

        self.fc2 = Linear(in_channels, out_dim)
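Each ConvTranspose1d above uses kernel_size=2, stride=2, and no padding, which exactly doubles the length of the time axis; that is how the layers flagged as upsampling raise the frame rate before the remaining convolutions. A quick shape check with plain torch.nn:

import torch
import torch.nn as nn

upsample = nn.ConvTranspose1d(256, 256, kernel_size=2, padding=0, stride=2)
x = torch.randn(4, 256, 100)      # (batch, channels, time)
print(upsample(x).shape)          # torch.Size([4, 256, 200]): time dimension doubled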
Example #5
    def __init__(self,
                 n_vocab,
                 embed_dim,
                 n_speakers,
                 speaker_embed_dim,
                 padding_idx=None,
                 convolutions=((64, 5, 1), ) * 7,
                 max_positions=512,
                 dropout=0.1):
        super(Encoder, self).__init__()
        self.dropout = dropout
        self.num_attention_layers = None

        # Text input embeddings
        self.embed_tokens = Embedding(n_vocab, embed_dim, padding_idx)

        # Text position embedding
        self.embed_text_positions = Embedding(max_positions, embed_dim,
                                              padding_idx)
        self.embed_text_positions.weight.data = position_encoding_init(
            max_positions, embed_dim)

        # Speaker embedding
        if n_speakers > 1:
            self.speaker_fc1 = Linear(speaker_embed_dim, embed_dim)
            self.speaker_fc2 = Linear(speaker_embed_dim, embed_dim)
        self.n_speakers = n_speakers

        # Non-causal convolutions
        in_channels = convolutions[0][0]
        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.speaker_projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()

        Conv1dLayer = Conv1d if has_dilation(convolutions) else ConvTBC

        for (out_channels, kernel_size, dilation) in convolutions:
            pad = (kernel_size - 1) // 2 * dilation
            dilation = (dilation, )
            self.projections.append(
                Linear(in_channels, out_channels
                       ) if in_channels != out_channels else None)
            self.speaker_projections.append(
                Linear(speaker_embed_dim, out_channels
                       ) if n_speakers > 1 else None)
            self.convolutions.append(
                Conv1dLayer(in_channels,
                            out_channels * 2,
                            kernel_size,
                            padding=pad,
                            dilation=dilation,
                            dropout=dropout))
            in_channels = out_channels
        self.fc2 = Linear(in_channels, embed_dim)
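position_encoding_init, used here and again in the decoder below, is expected to build a sinusoidal position table in the DeepVoice3 style, with position_rate stretching the position axis (the decoder passes different rates for queries and keys). A rough sketch of such a function, for reference only; the project's actual implementation may differ in details:

import numpy as np
import torch

def position_encoding_init(n_position, d_model, position_rate=1.0):
    # Sinusoidal table; position_rate rescales the position index before the sin/cos.
    table = np.array([
        [position_rate * pos / np.power(10000, 2 * (i // 2) / d_model)
         for i in range(d_model)]
        if pos != 0 else np.zeros(d_model)
        for pos in range(n_position)])
    table[1:, 0::2] = np.sin(table[1:, 0::2])   # even dimensions
    table[1:, 1::2] = np.cos(table[1:, 1::2])   # odd dimensions
    return torch.from_numpy(table).float()      # row 0 stays zero (padding position)

print(position_encoding_init(512, 256).shape)   # torch.Size([512, 256])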
Example #6
    def __init__(
        self,
        embed_dim,
        n_speakers,
        speaker_embed_dim,
        in_dim=80,
        r=5,
        max_positions=512,
        padding_idx=None,
        convolutions=((128, 5, 1), ) * 4,
        attention=True,
        dropout=0.1,
        use_memory_mask=False,
        force_monotonic_attention=True,
        query_position_rate=1.0,
        key_position_rate=1.29,
    ):
        super(Decoder, self).__init__()
        self.dropout = dropout
        self.in_dim = in_dim
        self.r = r

        in_channels = in_dim * r
        if isinstance(attention, bool):
            # expand True into [True, True, ...] and do the same with False
            attention = [attention] * len(convolutions)

        # Position encodings for queries (decoder states) and keys (encoder states)
        self.embed_query_positions = Embedding(max_positions,
                                               convolutions[0][0], padding_idx)
        self.embed_query_positions.weight.data = position_encoding_init(
            max_positions,
            convolutions[0][0],
            position_rate=query_position_rate)
        self.embed_keys_positions = Embedding(max_positions, embed_dim,
                                              padding_idx)
        self.embed_keys_positions.weight.data = position_encoding_init(
            max_positions, embed_dim, position_rate=key_position_rate)

        self.fc1 = Linear(in_channels, convolutions[0][0], dropout=dropout)
        in_channels = convolutions[0][0]

        # Causal convolutions
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        self.attention = nn.ModuleList()

        Conv1dLayer = Conv1d if has_dilation(
            convolutions) else LinearizedConv1d

        for i, (out_channels, kernel_size,
                dilation) in enumerate(convolutions):
            pad = (kernel_size - 1) * dilation
            dilation = (dilation, )
            self.projections.append(
                Linear(in_channels, out_channels
                       ) if in_channels != out_channels else None)
            self.convolutions.append(
                Conv1dLayer(in_channels,
                            out_channels * 2,
                            kernel_size,
                            padding=pad,
                            dilation=dilation,
                            dropout=dropout))
            self.attention.append(
                AttentionLayer(out_channels, embed_dim, dropout=dropout
                               ) if attention[i] else None)
            in_channels = out_channels
        self.fc2 = Linear(in_channels, in_dim * r)

        # decoder states -> Done binary flag
        self.fc3 = Linear(in_channels, 1)

        self._is_inference_incremental = False
        self.max_decoder_steps = 200
        self.min_decoder_steps = 10
        self.use_memory_mask = use_memory_mask
        if isinstance(force_monotonic_attention, bool):
            self.force_monotonic_attention = \
                [force_monotonic_attention] * len(convolutions)
        else:
            self.force_monotonic_attention = force_monotonic_attention
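Note the padding formula: these causal convolutions pad by (kernel_size - 1) * dilation, whereas the non-causal encoder and converter stacks above pad by (kernel_size - 1) // 2 * dilation. Since nn.Conv1d pads symmetrically, a causal layer is typically followed by trimming the trailing frames contributed by the right-side padding, so each output frame depends only on current and past inputs; the trim itself is assumed to happen in the project's forward pass. A small demonstration with plain torch.nn:

import torch
import torch.nn as nn

kernel_size, dilation = 5, 1
pad = (kernel_size - 1) * dilation
conv = nn.Conv1d(128, 256, kernel_size, padding=pad, dilation=dilation)

x = torch.randn(2, 128, 100)      # (batch, channels, time)
y = conv(x)
print(y.shape)                    # torch.Size([2, 256, 104]): pad extra frames on the right
y = y[:, :, :-pad]                # drop the frames produced by the right-side padding
print(y.shape)                    # torch.Size([2, 256, 100]): causal, same length as input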