import math
from typing import Optional, Tuple

import numpy as np
import oneflow as flow
from oneflow import Tensor


def forward(self, q, k, v, mask=None):
    # (B, Lq, D) x (B, D, Lk) -> (B, Lq, Lk): raw attention scores
    attn = flow.bmm(q, k.transpose(1, 2))
    attn = attn / self.temperature
    if mask is not None:
        attn = attn.masked_fill(mask, -np.inf)
    attn = self.softmax(attn)
    attn = self.dropout(attn)
    # (B, Lq, Lk) x (B, Lk, Dv) -> (B, Lq, Dv): weighted sum of values
    output = flow.bmm(attn, v)
    return output, attn


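# Hedged sketch (assumption, not from the source): the forward above reads like the
# method of a scaled dot-product attention module that owns `temperature`, `softmax`,
# and `dropout`. A minimal owner class could be wired like this, with that forward
# defined as its method; the class name and dropout default are illustrative only.
class _ScaledDotProductAttentionSketch(flow.nn.Module):
    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature          # typically sqrt(d_k)
        self.softmax = flow.nn.Softmax(dim=2)   # normalize scores over the key axis
        self.dropout = flow.nn.Dropout(attn_dropout)

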
def _test_bmm_backward(test_case, device):
    input1 = flow.tensor(
        [
            [
                [-0.0036776792258024216, 1.9946473836898804, -0.423959881067276],
                [1.0892143249511719, 0.04005361348390579, -0.27883127331733704],
            ],
            [
                [-0.970306396484375, 0.017771577462553978, 0.019596196711063385],
                [0.27402883768081665, -0.8192587494850159, -0.3135920464992523],
            ],
        ],
        dtype=flow.float32,
        device=flow.device(device),
        requires_grad=True,
    )
    input2 = flow.tensor(
        [
            [
                [1.118346929550171, -0.930071234703064],
                [1.1238232851028442, 1.373764157295227],
                [0.17178462445735931, -1.1010534763336182],
            ],
            [
                [0.6694859862327576, 0.9250285029411316],
                [-1.0835869312286377, 0.4192655086517334],
                [1.2616937160491943, 0.33809131383895874],
            ],
        ],
        dtype=flow.float32,
        device=flow.device(device),
        requires_grad=True,
    )
    of_out = flow.bmm(input1, input2)
    of_out = of_out.sum()
    of_out.backward()
    np_grad = [
        [
            [0.18827569484710693, 2.4975874423980713, -0.9292688369750977],
            [0.18827569484710693, 2.4975874423980713, -0.9292688369750977],
        ],
        [
            [1.5945144891738892, -0.6643214225769043, 1.5997850894927979],
            [1.5945144891738892, -0.6643214225769043, 1.5997850894927979],
        ],
    ]
    test_case.assertTrue(
        np.allclose(input1.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05)
    )


def test_bmm_exception_dim_not_right(test_case):
    x = flow.tensor((2, 2))
    with test_case.assertRaises(RuntimeError) as ctx:
        y = flow.bmm(x, x)
    test_case.assertTrue(
        "Expected 3-dimensional tensor, but got 1-dimensional tensor for argument #1"
        in str(ctx.exception)
    )


def _scaled_dot_product_attention(
    q: Tensor,
    k: Tensor,
    v: Tensor,
    attn_mask: Optional[Tensor] = None,
    dropout_p: float = 0.0,
) -> Tuple[Tensor, Tensor]:
    B, Nt, E = q.shape
    q = q / math.sqrt(E)
    # (B, Nt, E) x (B, E, Ns) -> (B, Nt, Ns)
    attn = flow.bmm(q, k.transpose(-2, -1))
    if attn_mask is not None:
        attn += attn_mask
    attn = flow.softmax(attn, dim=-1)
    if dropout_p > 0.0:
        attn = flow.nn.functional.dropout(attn, p=dropout_p)
    # (B, Nt, Ns) x (B, Ns, E) -> (B, Nt, E)
    output = flow.bmm(attn, v)
    return output, attn


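# Hedged usage sketch (not from the source): drives _scaled_dot_product_attention with
# random inputs to show its shape contract; the sizes below are arbitrary assumptions.
def _demo_scaled_dot_product_attention():
    B, Nt, Ns, E = 2, 4, 6, 8
    q = flow.randn(B, Nt, E)
    k = flow.randn(B, Ns, E)
    v = flow.randn(B, Ns, E)
    out, attn = _scaled_dot_product_attention(q, k, v, dropout_p=0.0)
    assert tuple(out.shape) == (B, Nt, E)
    assert tuple(attn.shape) == (B, Nt, Ns)

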
def _test_bmm(test_case, device):
    input1 = flow.tensor(
        np.random.randn(10, 3, 4), dtype=flow.float32, device=flow.device(device)
    )
    input2 = flow.tensor(
        np.random.randn(10, 4, 5), dtype=flow.float32, device=flow.device(device)
    )
    of_out = flow.bmm(input1, input2)
    np_out = np.matmul(input1.numpy(), input2.numpy())
    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))


def _bmm(self, other):
    return flow.bmm(self, other)


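# Hedged sketch (assumption, not the library's actual registration code): helpers like
# _bmm above are typically bound onto Tensor so that `x.bmm(y)` forwards to flow.bmm.
# A manual equivalent, for illustration only:
flow.Tensor.bmm = _bmm
a = flow.randn(4, 2, 3)
b = flow.randn(4, 3, 5)
assert tuple(a.bmm(b).shape) == (4, 2, 5)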