Example No. 1
    def test_size(self):
        # `Variable` is the pre-0.4 PyTorch wrapper; plain tensors behave the same today.
        # Case 1: q and k disagree on the feature dim (20 vs 21) and neither matches
        # the size the module was built with (100), so the shape assertion must fire.
        q = Variable(torch.FloatTensor(64, 10, 20))
        k = Variable(torch.FloatTensor(64, 10, 21))
        v = Variable(torch.FloatTensor(64, 10, 30))
        with self.assertRaises(AssertionError):
            attention = ScaledDotProductAttention(100)
            attention(q, k, v)

        # Case 2: q and k agree (20), but the module is still built for 100.
        q = Variable(torch.FloatTensor(64, 10, 20))
        k = Variable(torch.FloatTensor(64, 10, 20))
        v = Variable(torch.FloatTensor(64, 10, 30))
        with self.assertRaises(AssertionError):
            attention = ScaledDotProductAttention(100)
            attention(q, k, v)

        # Case 3: still sized 100, plus a 2-D attn_mask where a (64, 10, 10) mask
        # would be expected.
        q = Variable(torch.FloatTensor(64, 10, 20))
        k = Variable(torch.FloatTensor(64, 10, 20))
        v = Variable(torch.FloatTensor(64, 10, 30))
        attn_mask = Variable(torch.FloatTensor(64, 10))
        with self.assertRaises(AssertionError):
            attention = ScaledDotProductAttention(100)
            attention(q, k, v, attn_mask)

        # Case 4: well-formed inputs; the output keeps v's size and the attention map
        # is (batch, len_q, len_k) = (64, 10, 10).
        q = Variable(torch.FloatTensor(64, 10, 20))
        k = Variable(torch.FloatTensor(64, 10, 20))
        v = Variable(torch.FloatTensor(64, 10, 30))
        attention = ScaledDotProductAttention(20)
        o, attn = attention(q, k, v)
        self.assertEqual(o.size(), v.size())
        self.assertEqual(attn.size(), torch.Size([64, 10, 10]))
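
The class under test is not reproduced on this page. A minimal sketch that would satisfy the assertions above (constructor argument = expected last dimension of q and k, output matching v's size, attention map of shape (batch, len_q, len_k)) could look like the following; everything beyond the call signature is an assumption, not the tested code:

import torch
import torch.nn as nn
import torch.nn.functional as F

class ScaledDotProductAttention(nn.Module):
    """Minimal sketch consistent with the test above, not the implementation it tests."""

    def __init__(self, d_k, dropout=0.0):
        super().__init__()
        self.d_k = d_k
        self.dropout = nn.Dropout(dropout)

    def forward(self, q, k, v, attn_mask=None):
        # The test expects an AssertionError when q/k do not end in d_k features.
        assert q.size(-1) == self.d_k and k.size(-1) == self.d_k
        scores = torch.bmm(q, k.transpose(1, 2)) / self.d_k ** 0.5   # (batch, len_q, len_k)
        if attn_mask is not None:
            # The mask must match the attention map's shape.
            assert attn_mask.size() == scores.size()
            scores = scores.masked_fill(attn_mask.bool(), float('-inf'))
        attn = self.dropout(F.softmax(scores, dim=-1))
        return torch.bmm(attn, v), attn   # output has v's size; attn is (batch, len_q, len_k)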
Example No. 2
    def __init__(self, d_k, d_v, d_model, n_heads, dropout):
        super(_MultiHeadAttention, self).__init__()
        self.d_k = d_k
        self.d_v = d_v
        self.d_model = d_model
        self.n_heads = n_heads

        # Query/key/value projections for all heads at once:
        # d_model -> n_heads * d_k (or n_heads * d_v for the values).
        # `Linear` here appears to be a project-local helper (standard nn.Linear
        # takes two positional ints rather than a list).
        self.w_q = Linear([d_model, d_k * n_heads])
        self.w_k = Linear([d_model, d_k * n_heads])
        self.w_v = Linear([d_model, d_v * n_heads])

        self.attention = ScaledDotProductAttention(d_k, dropout)
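
The example stops at the constructor. For orientation only, a typical forward pass over these attributes might split the projected tensors into heads before calling the attention; the method name, the batch-first shapes, and the assumption that the projections map d_model to n_heads * d_k (or n_heads * d_v) are mine, not the source's:

    def forward(self, q, k, v):
        # Assumed shapes: q, k, v are (batch, seq_len, d_model).
        batch, len_q, _ = q.size()
        len_k = k.size(1)

        def split_heads(x, w, d, seq):
            # (batch, seq, n_heads * d) -> (batch * n_heads, seq, d)
            return (w(x).view(batch, seq, self.n_heads, d)
                        .transpose(1, 2)
                        .reshape(batch * self.n_heads, seq, d))

        q_s = split_heads(q, self.w_q, self.d_k, len_q)
        k_s = split_heads(k, self.w_k, self.d_k, len_k)
        v_s = split_heads(v, self.w_v, self.d_v, len_k)

        out, attn = self.attention(q_s, k_s, v_s)   # out: (batch * n_heads, len_q, d_v)
        out = (out.view(batch, self.n_heads, len_q, self.d_v)
                  .transpose(1, 2)
                  .reshape(batch, len_q, self.n_heads * self.d_v))
        return out, attn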
Example No. 3
    def __init__(self, d_k, d_v, d_model, n_heads, dropout):
        super(_MultiHeadAttention, self).__init__()
        self.d_k = d_k
        self.d_v = d_v
        self.d_model = d_model
        self.n_heads = n_heads

        # One (d_model x d_k) projection matrix per head for queries and keys,
        # and one (d_model x d_v) matrix per head for values.
        self.w_q = nn.Parameter(torch.FloatTensor(n_heads, d_model, d_k))
        self.w_k = nn.Parameter(torch.FloatTensor(n_heads, d_model, d_k))
        self.w_v = nn.Parameter(torch.FloatTensor(n_heads, d_model, d_v))

        self.attention = ScaledDotProductAttention(d_k, dropout)

        # xavier_normal is the older, non-in-place spelling; newer PyTorch prefers
        # init.xavier_normal_.
        init.xavier_normal(self.w_q)
        init.xavier_normal(self.w_k)
        init.xavier_normal(self.w_v)
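
With 3-D weight tensors like these, the usual way to apply every head at once is a batched matrix multiply. A small standalone illustration follows; the concrete sizes are arbitrary and not taken from the example:

import torch

n_heads, d_model, d_k = 8, 512, 64
w_q = torch.empty(n_heads, d_model, d_k)
torch.nn.init.xavier_normal_(w_q)

q = torch.randn(64, 10, d_model)                       # (batch, len_q, d_model)
q_rep = q.unsqueeze(0).expand(n_heads, -1, -1, -1)     # one copy of the batch per head
q_rep = q_rep.reshape(n_heads, 64 * 10, d_model)       # (n_heads, batch * len_q, d_model)
q_proj = torch.bmm(q_rep, w_q)                         # (n_heads, batch * len_q, d_k)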
Example No. 4
    def __init__(self,
                 n_layers,
                 d_k,
                 d_v,
                 d_model,
                 d_ff,
                 n_heads,
                 max_seq_len,
                 tgt_vocab_size,
                 dropout=0.1,
                 weighted=False):
        super(tree_encoder, self).__init__()
        self.d_model = d_model
        self.n_layers = 1  # NOTE: hard-coded; the n_layers argument is ignored
        #        self.tgt_emb = nn.Embedding(tgt_vocab_size, d_model, padding_idx=data_utils.PAD,)
        #        self.pos_emb = PosEncoding(25, 300) # TODO: *10 fix

        #        self.dropout_emb = nn.Dropout(dropout)
        #        self.layer_type = DecoderLayer if not weighted else WeightedDecoderLayer
        #        self.layers = nn.ModuleList(
        #            [self.layer_type(d_k, d_v, d_model, d_ff, n_heads, dropout) for _ in range(n_layers)])
        #
        #        self.V = nn.ParameterList([nn.Parameter((-.5 - .5) * torch.rand(300, 300) + .5, requires_grad = True) for _ in range(10)]) # 60x60 type er 30 ta
        self.Wm = nn.Linear(300, 300)
        self.Um = nn.Linear(300, 300)
        #        self.w = nn.Parameter((-.5 - .5) * torch.rand(1, 300) + .5, requires_grad = True)
        self.pos_ffn = PoswiseFeedForwardNet(d_model, d_ff, dropout)
        self.demo = nn.Linear(300, 300)
        #        self.head_attn = MultiHeadAttention(d_k, d_v, d_model, n_heads, dropout)
        #self.layers1 = nn.ModuleList(
        #[ScaledDotProductAttention(300, dropout) for _ in range(self.n_layers)])
        #self.layers = nn.ModuleList(
        #[MultiHeadAttention(d_k, d_v, d_model, n_heads, dropout) for _ in range(self.n_layers)])
        self.layers = nn.ModuleList([
            MultiBranchAttention(d_k, d_v, d_model, d_ff, n_heads, dropout)
            for _ in range(self.n_layers)
        ])
        #self.layers1 = nn.ModuleList(
        #[MultiBranchAttention(d_k, d_v, d_model, d_ff, n_heads, dropout) for _ in range(self.n_layers)])
        self.attention = ScaledDotProductAttention(300, dropout)
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.new_objective = False
        self.proj = nn.Linear(900, 300)
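
How w_1, w_2, and dropout are used is not shown here; in Transformer-style encoders they conventionally form the position-wise feed-forward step, roughly as sketched below (the method name and the ReLU are assumptions, not taken from the example):

    def feed_forward(self, x):
        # x: (batch, seq_len, d_model) -> (batch, seq_len, d_model)
        return self.w_2(self.dropout(torch.relu(self.w_1(x))))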
Example No. 5
    def __init__(self, d_k, d_v, d_model, n_heads, dropout):
        super(_MultiHeadAttention, self).__init__()
        self.d_k = d_k
        self.d_v = d_v
        self.d_model = d_model
        self.n_heads = n_heads

        # Per-head projection matrices, filled uniformly in (-0.5, 0.5]; note that the
        # xavier_normal calls below immediately overwrite this initialization.
        self.w_q = nn.Parameter(
            (-.5 - .5) * torch.rand(n_heads, d_model, d_k) + .5,
            requires_grad=True)
        self.w_k = nn.Parameter(
            (-.5 - .5) * torch.rand(n_heads, d_model, d_k) + .5,
            requires_grad=True)
        self.w_v = nn.Parameter(
            (-.5 - .5) * torch.rand(n_heads, d_model, d_v) + .5,  # d_v (not d_k) for the value projection
            requires_grad=True)

        self.attention = ScaledDotProductAttention(d_k, dropout)

        init.xavier_normal(self.w_q)
        init.xavier_normal(self.w_k)
        init.xavier_normal(self.w_v)
Example No. 6
    def __init__(self, n_heads, d_k, d_v, d_model, dropout=0.1):
        super(MultiHeadAttention, self).__init__()

        assert n_heads * d_k == d_model, ('`n_heads` * `d_k` != `d_model`'
                                          ' ({} x {} != {})'.format(
                                              n_heads, d_k, d_model))

        self.n_heads = n_heads
        self.d_k = d_k
        self.d_v = d_v

        # Per-head projection matrices, as in the examples above.
        self.w_q = nn.Parameter(torch.FloatTensor(n_heads, d_model, d_k))
        self.w_k = nn.Parameter(torch.FloatTensor(n_heads, d_model, d_k))
        self.w_v = nn.Parameter(torch.FloatTensor(n_heads, d_model, d_v))
        self.attn = ScaledDotProductAttention(d_k, attn_dropout=dropout)

        # Output projection back to d_model, followed by dropout and layer norm.
        self.proj = nn.Linear(n_heads * d_v, d_model)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = LayerNormalization(d_model)

        nn.init.xavier_normal(self.w_q)
        nn.init.xavier_normal(self.w_k)
        nn.init.xavier_normal(self.w_v)
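
The module's forward() is not part of the example. A hedged sketch of the usual pattern for this layout, per-head projection via torch.bmm, scaled dot-product attention, head concatenation, output projection, dropout, then a residual connection and layer norm, might read as follows; mask handling is omitted and the real code may differ:

    def forward(self, q, k, v):
        residual = q                                   # (batch, len_q, d_model)
        batch, len_q, d_model = q.size()
        len_k = k.size(1)

        def project(x, w, seq):
            # (batch, seq, d_model) -> (n_heads * batch, seq, d) via one bmm per head
            x = x.repeat(self.n_heads, 1, 1).view(self.n_heads, -1, d_model)
            return torch.bmm(x, w).view(-1, seq, w.size(-1))

        q_s = project(q, self.w_q, len_q)
        k_s = project(k, self.w_k, len_k)
        v_s = project(v, self.w_v, len_k)

        out, attn = self.attn(q_s, k_s, v_s)           # out: (n_heads * batch, len_q, d_v)
        out = torch.cat(torch.split(out, batch, dim=0), dim=-1)   # (batch, len_q, n_heads * d_v)
        out = self.dropout(self.proj(out))
        return self.layer_norm(out + residual), attn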