def test_size(self):
    # q and k feature sizes disagree (20 vs. 21): the attention call is expected to raise.
    q = Variable(torch.FloatTensor(64, 10, 20))
    k = Variable(torch.FloatTensor(64, 10, 21))
    v = Variable(torch.FloatTensor(64, 10, 30))
    with self.assertRaises(AssertionError):
        attention = ScaledDotProductAttention(100)
        attention(q, k, v)

    # d_model passed to the module (100) does not match the inputs' feature size (20).
    q = Variable(torch.FloatTensor(64, 10, 20))
    k = Variable(torch.FloatTensor(64, 10, 20))
    v = Variable(torch.FloatTensor(64, 10, 30))
    with self.assertRaises(AssertionError):
        attention = ScaledDotProductAttention(100)
        attention(q, k, v)

    # An attention mask of the wrong shape is expected to raise as well.
    q = Variable(torch.FloatTensor(64, 10, 20))
    k = Variable(torch.FloatTensor(64, 10, 20))
    v = Variable(torch.FloatTensor(64, 10, 30))
    attn_mask = Variable(torch.FloatTensor(64, 10))
    with self.assertRaises(AssertionError):
        attention = ScaledDotProductAttention(100)
        attention(q, k, v, attn_mask)

    # Consistent sizes: the output matches v, attention weights are (batch, len_q, len_k).
    q = Variable(torch.FloatTensor(64, 10, 20))
    k = Variable(torch.FloatTensor(64, 10, 20))
    v = Variable(torch.FloatTensor(64, 10, 30))
    attention = ScaledDotProductAttention(20)
    o, attn = attention(q, k, v)
    self.assertEqual(o.size(), v.size())
    self.assertEqual(attn.size(), torch.Size([64, 10, 10]))
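# For reference, a minimal sketch (an assumption, not this repo's implementation) of what
# the test above expects ScaledDotProductAttention to compute: softmax(Q K^T / sqrt(d_k)) V,
# returning both the context vectors and the attention weights. The function name and the
# attn_mask handling are hypothetical.
import torch
import torch.nn.functional as F

def scaled_dot_product_attention_sketch(q, k, v, attn_mask=None):
    # q: (batch, len_q, d_k), k: (batch, len_k, d_k), v: (batch, len_k, d_v)
    d_k = k.size(-1)
    scores = torch.bmm(q, k.transpose(1, 2)) / (d_k ** 0.5)    # (batch, len_q, len_k)
    if attn_mask is not None:
        # attn_mask assumed boolean, same shape as scores; True marks positions to block
        scores = scores.masked_fill(attn_mask, float('-inf'))
    attn = F.softmax(scores, dim=-1)
    return torch.bmm(attn, v), attn                             # context: (batch, len_q, d_v)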
def __init__(self, d_k, d_v, d_model, n_heads, dropout):
    super(_MultiHeadAttention, self).__init__()
    self.d_k = d_k
    self.d_v = d_v
    self.d_model = d_model
    self.n_heads = n_heads
    # Shared projections for all heads: d_model -> n_heads * d_k (or n_heads * d_v).
    self.w_q = Linear(d_model, d_k * n_heads)
    self.w_k = Linear(d_model, d_k * n_heads)
    self.w_v = Linear(d_model, d_v * n_heads)
    self.attention = ScaledDotProductAttention(d_k, dropout)
def __init__(self, d_k, d_v, d_model, n_heads, dropout):
    super(_MultiHeadAttention, self).__init__()
    self.d_k = d_k
    self.d_v = d_v
    self.d_model = d_model
    self.n_heads = n_heads
    # One projection matrix per head: (n_heads, d_model, d_k) for q/k, (n_heads, d_model, d_v) for v.
    self.w_q = nn.Parameter(torch.FloatTensor(n_heads, d_model, d_k))
    self.w_k = nn.Parameter(torch.FloatTensor(n_heads, d_model, d_k))
    self.w_v = nn.Parameter(torch.FloatTensor(n_heads, d_model, d_v))
    self.attention = ScaledDotProductAttention(d_k, dropout)
    init.xavier_normal(self.w_q)
    init.xavier_normal(self.w_k)
    init.xavier_normal(self.w_v)
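# A minimal sketch (an assumption, not this repo's forward pass) of how per-head weights of
# shape (n_heads, d_model, d_k) such as w_q above are typically applied: broadcast the
# flattened batch across the head dimension and project with a single batched matmul.
# The helper name is hypothetical.
import torch

def project_per_head_sketch(x, w):
    # x: (batch, seq_len, d_model); w: (n_heads, d_model, d_out)
    n_heads, d_model, _ = w.size()
    batch, seq_len, _ = x.size()
    x_rep = x.view(-1, d_model).unsqueeze(0).expand(n_heads, -1, -1)  # (n_heads, batch*seq_len, d_model)
    return torch.bmm(x_rep, w).view(n_heads, batch, seq_len, -1)      # (n_heads, batch, seq_len, d_out)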
def __init__(self, n_layers, d_k, d_v, d_model, d_ff, n_heads, max_seq_len,
             tgt_vocab_size, dropout=0.1, weighted=False):
    super(tree_encoder, self).__init__()
    self.d_model = d_model
    self.n_layers = 1  # note: hard-coded; the n_layers argument is ignored
    # self.tgt_emb = nn.Embedding(tgt_vocab_size, d_model, padding_idx=data_utils.PAD,)
    # self.pos_emb = PosEncoding(25, 300)  # TODO: *10 fix
    # self.dropout_emb = nn.Dropout(dropout)
    # self.layer_type = DecoderLayer if not weighted else WeightedDecoderLayer
    # self.layers = nn.ModuleList(
    #     [self.layer_type(d_k, d_v, d_model, d_ff, n_heads, dropout) for _ in range(n_layers)])
    #
    # self.V = nn.ParameterList([nn.Parameter((-.5 - .5) * torch.rand(300, 300) + .5, requires_grad=True)
    #                            for _ in range(10)])  # 30 of the 60x60 type
    self.Wm = nn.Linear(300, 300)
    self.Um = nn.Linear(300, 300)
    # self.w = nn.Parameter((-.5 - .5) * torch.rand(1, 300) + .5, requires_grad=True)
    self.pos_ffn = PoswiseFeedForwardNet(d_model, d_ff, dropout)
    self.demo = nn.Linear(300, 300)
    # self.head_attn = MultiHeadAttention(d_k, d_v, d_model, n_heads, dropout)
    # self.layers1 = nn.ModuleList(
    #     [ScaledDotProductAttention(300, dropout) for _ in range(self.n_layers)])
    # self.layers = nn.ModuleList(
    #     [MultiHeadAttention(d_k, d_v, d_model, n_heads, dropout) for _ in range(self.n_layers)])
    self.layers = nn.ModuleList([
        MultiBranchAttention(d_k, d_v, d_model, d_ff, n_heads, dropout)
        for _ in range(self.n_layers)
    ])
    # self.layers1 = nn.ModuleList(
    #     [MultiBranchAttention(d_k, d_v, d_model, d_ff, n_heads, dropout) for _ in range(self.n_layers)])
    self.attention = ScaledDotProductAttention(300, dropout)
    self.w_1 = nn.Linear(d_model, d_ff)
    self.w_2 = nn.Linear(d_ff, d_model)
    self.dropout = nn.Dropout(dropout)
    self.new_objective = False
    self.proj = nn.Linear(900, 300)
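# The w_1, w_2 and dropout members above follow the usual position-wise feed-forward
# pattern. A minimal sketch of that pattern (an assumption; whether this encoder also
# wraps it in a residual connection and layer norm is not shown here):
import torch.nn as nn
import torch.nn.functional as F

class PoswiseFeedForwardSketch(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PoswiseFeedForwardSketch, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x: (batch, seq_len, d_model) -> (batch, seq_len, d_model), applied independently per position
        return self.dropout(self.w_2(F.relu(self.w_1(x))))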
def __init__(self, d_k, d_v, d_model, n_heads, dropout):
    super(_MultiHeadAttention, self).__init__()
    self.d_k = d_k
    self.d_v = d_v
    self.d_model = d_model
    self.n_heads = n_heads
    # Per-head projection weights, first filled uniformly in [-0.5, 0.5] and then
    # re-initialized with Xavier below (the uniform fill is effectively redundant).
    self.w_q = nn.Parameter(
        (-.5 - .5) * torch.rand(n_heads, d_model, d_k) + .5, requires_grad=True)
    self.w_k = nn.Parameter(
        (-.5 - .5) * torch.rand(n_heads, d_model, d_k) + .5, requires_grad=True)
    self.w_v = nn.Parameter(
        (-.5 - .5) * torch.rand(n_heads, d_model, d_v) + .5, requires_grad=True)  # d_v, not d_k
    self.attention = ScaledDotProductAttention(d_k, dropout)
    init.xavier_normal(self.w_q)
    init.xavier_normal(self.w_k)
    init.xavier_normal(self.w_v)
def __init__(self, n_heads, d_k, d_v, d_model, dropout=0.1):
    super(MultiHeadAttention, self).__init__()
    assert n_heads * d_k == d_model, ('`n_heads` * `d_k` != `d_model`'
                                      ' ({} x {} != {})'.format(n_heads, d_k, d_model))
    self.n_heads = n_heads
    self.d_k = d_k
    self.d_v = d_v
    # One projection matrix per head for q, k, and v.
    self.w_q = nn.Parameter(torch.FloatTensor(n_heads, d_model, d_k))
    self.w_k = nn.Parameter(torch.FloatTensor(n_heads, d_model, d_k))
    self.w_v = nn.Parameter(torch.FloatTensor(n_heads, d_model, d_v))
    self.attn = ScaledDotProductAttention(d_k, attn_dropout=dropout)
    self.proj = nn.Linear(n_heads * d_v, d_model)
    self.dropout = nn.Dropout(dropout)
    self.layer_norm = LayerNormalization(d_model)
    nn.init.xavier_normal(self.w_q)
    nn.init.xavier_normal(self.w_k)
    nn.init.xavier_normal(self.w_v)
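# Usage sketch: the assertion above requires n_heads * d_k == d_model. A typical
# configuration (assumed here, following the base Transformer) is d_model = 512 with
# 8 heads of size d_k = d_v = 64, so each head attends in a 64-dimensional subspace.
n_heads, d_k, d_v, d_model = 8, 64, 64, 512
assert n_heads * d_k == d_model
# attention = MultiHeadAttention(n_heads, d_k, d_v, d_model, dropout=0.1)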