Example #1
    def __init__(self,
                 num_layers=2,
                 num_heads=8,
                 filter_size=256,
                 hidden_size=256,
                 dropout=0.1,
                 attention_dropout=0.1,
                 relu_dropout=0.1):
        super(AttnPathEncoder, self).__init__()
        self.layers = num_layers
        self.self_attention_blocks = nn.ModuleList()
        self.ffn_blocks = nn.ModuleList()
        self.norm1_blocks = nn.ModuleList()
        self.norm2_blocks = nn.ModuleList()

        # one self-attention block, one feed-forward block, and a LayerNorm
        # pair per layer, all registered through nn.ModuleList
        for i in range(num_layers):
            self.self_attention_blocks.append(
                MultiheadAttention(hidden_size, hidden_size, hidden_size,
                                   num_heads))
            self.ffn_blocks.append(
                FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
            self.norm1_blocks.append(LayerNormalization(hidden_size))
            self.norm2_blocks.append(LayerNormalization(hidden_size))
        self.out_norm = LayerNormalization(hidden_size)
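
Below is a small self-contained sketch of the same per-layer bookkeeping pattern. It uses stock torch.nn modules (nn.MultiheadAttention, nn.LayerNorm) as stand-ins for the repo's MultiheadAttention / FeedForwardNetwork / LayerNormalization helpers, which are not shown here; the sizes are the constructor defaults above.

import torch.nn as nn

# Stand-ins for the helpers above; the structure (one attention block and a
# LayerNorm pair per layer, all held in nn.ModuleList) is the point.
num_layers, hidden_size, num_heads = 2, 256, 8

self_attention_blocks = nn.ModuleList(
    nn.MultiheadAttention(hidden_size, num_heads) for _ in range(num_layers))
norm1_blocks = nn.ModuleList(nn.LayerNorm(hidden_size) for _ in range(num_layers))
norm2_blocks = nn.ModuleList(nn.LayerNorm(hidden_size) for _ in range(num_layers))

# nn.ModuleList registers every element as a submodule, so the blocks'
# parameters are visible to .parameters(), .to(device), state_dict(), etc.
assert len(self_attention_blocks) == num_layers
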
Example #2
    def __init__(self,
                 dictionary,
                 embed_dim=256,
                 max_positions=1024,
                 pos="learned",
                 num_layers=2,
                 num_heads=8,
                 filter_size=256,
                 hidden_size=256,
                 dropout=0.1,
                 attention_dropout=0.1,
                 relu_dropout=0.1,
                 share_embed=False,
                 rank_scale=0.0):
        super().__init__(dictionary)
        self.register_buffer('version', torch.Tensor([2]))
        assert pos in ("learned", "timing", "nopos")

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        # positional encodings: a learned table, sinusoidal ("timing"),
        # or none at all ("nopos")
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(
                max_positions,
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_TARGET)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_TARGET)

        self.layers = num_layers

        self.self_attention_blocks = nn.ModuleList()
        self.encdec_attention_blocks = nn.ModuleList()
        self.ffn_blocks = nn.ModuleList()
        self.norm1_blocks = nn.ModuleList()
        self.norm2_blocks = nn.ModuleList()
        self.norm3_blocks = nn.ModuleList()
        # per layer: decoder self-attention, a feed-forward block, and
        # encoder-decoder attention, each with its own LayerNorm
        for i in range(num_layers):
            self.self_attention_blocks.append(
                MultiheadAttentionDecoder(hidden_size,
                                          hidden_size,
                                          hidden_size,
                                          num_heads,
                                          rank_scale=rank_scale))
            self.ffn_blocks.append(
                FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
            self.norm1_blocks.append(LayerNormalization(hidden_size))
            self.norm2_blocks.append(LayerNormalization(hidden_size))
            self.norm3_blocks.append(LayerNormalization(hidden_size))
            self.encdec_attention_blocks.append(
                MultiheadAttention(hidden_size,
                                   hidden_size,
                                   hidden_size,
                                   num_heads,
                                   rank_scale=rank_scale))
        self.out_norm = LayerNormalization(hidden_size)
        out_embed_dim = hidden_size
        if share_embed:
            assert out_embed_dim == embed_dim, \
                "Shared embed weights require matching dimensions: " \
                "out_embed_dim={} vs embed_dim={}".format(out_embed_dim, embed_dim)
            self.out_embed = nn.Linear(hidden_size, num_embeddings)
            self.out_embed.weight = self.embed_tokens.weight
        else:
            self.out_embed = Linear(hidden_size,
                                    num_embeddings,
                                    dropout=dropout)
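
The share_embed branch ties the output projection to the token-embedding matrix, which is why the assert requires out_embed_dim == embed_dim. A minimal, self-contained sketch of that weight tying, with placeholder sizes and no fairseq dictionary needed:

import torch.nn as nn

# Placeholder sizes; weight tying only works when hidden_size == embed_dim,
# which is exactly what the assert above enforces.
num_embeddings, embed_dim, hidden_size, padding_idx = 1000, 256, 256, 1

embed_tokens = nn.Embedding(num_embeddings, embed_dim, padding_idx=padding_idx)
out_embed = nn.Linear(hidden_size, num_embeddings)
out_embed.weight = embed_tokens.weight   # both weights are (num_embeddings, embed_dim)

# the projection and the embedding now share one parameter tensor
assert out_embed.weight.data_ptr() == embed_tokens.weight.data_ptr()
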
Example #3
    def __init__(self,
                 dictionary,
                 embed_dim=256,
                 max_positions=1024,
                 pos="learned",
                 num_layers=2,
                 num_heads=8,
                 filter_size=256,
                 hidden_size=256,
                 dropout=0.1,
                 attention_dropout=0.1,
                 relu_dropout=0.1,
                 convolutions=((256, 3), ) * 4):
        super().__init__(dictionary)
        assert pos in ("learned", "timing", "nopos")

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(
                max_positions,
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)

        self.layers = num_layers

        self.self_attention_blocks = nn.ModuleList()
        self.ffn_blocks = nn.ModuleList()
        self.norm1_blocks = nn.ModuleList()
        self.norm2_blocks = nn.ModuleList()
        # same self-attention + feed-forward layer structure as the encoder
        # in Example #1
        for i in range(num_layers):
            self.self_attention_blocks.append(
                MultiheadAttention(hidden_size, hidden_size, hidden_size,
                                   num_heads))
            self.ffn_blocks.append(
                FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
            self.norm1_blocks.append(LayerNormalization(hidden_size))
            self.norm2_blocks.append(LayerNormalization(hidden_size))
        self.out_norm = LayerNormalization(hidden_size)

        # convolutional stack: `convolutions` is a sequence of
        # (out_channels, kernel_size) pairs
        in_channels = convolutions[0][0]
        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        for (out_channels, kernel_size) in convolutions:
            pad = (kernel_size - 1) // 2  # integer division so padding stays an int on Python 3
            self.projections.append(
                Linear(in_channels, out_channels
                       ) if in_channels != out_channels else None)
            self.convolutions.append(
                ConvTBC(in_channels,
                        out_channels * 2,
                        kernel_size,
                        padding=pad,
                        dropout=dropout))
            in_channels = out_channels
        self.fc2 = Linear(in_channels, embed_dim)
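
For the convolutional stack, the padding rule keeps the sequence length constant for odd kernel sizes, and each ConvTBC emits out_channels * 2 channels; in fairseq-style conv models that doubled width is typically consumed by a GLU in the forward pass, which is not shown here. A self-contained sketch of the channel arithmetic, with nn.Conv1d standing in for ConvTBC:

import torch
import torch.nn as nn
import torch.nn.functional as F

kernel_size = 3
pad = (kernel_size - 1) // 2          # integer "same" padding for odd kernels

in_channels, out_channels = 256, 256
# nn.Conv1d used as a stand-in for ConvTBC (same channel arithmetic,
# different batch/time layout).
conv = nn.Conv1d(in_channels, out_channels * 2, kernel_size, padding=pad)

x = torch.randn(8, in_channels, 20)   # (batch, channels, time)
y = F.glu(conv(x), dim=1)             # GLU halves 2*out_channels back to out_channels
assert y.shape == (8, out_channels, 20)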