import jax
import jax.numpy as jnp
from jax.nn import softmax

import fat  # module from the surrounding codebase providing relu_rff_attn / rff_attn / kl


# Variant 1: ReLU random-feature attention; gradients flow to both the
# scale and the projection.
def loss_relu_rff(q, k, scale, proj, attn_dist):
    qp = q
    kp = k
    ra, _ = fat.relu_rff_attn(qp, kp, scale * proj)
    return fat.kl(attn_dist, ra).mean()
# Variant 2: as variant 1, but the projection is frozen with
# stop_gradient (`scale` is accepted but unused here).
def loss_relu_rff_frozen(q, k, scale, proj, attn_dist):
    qp = q
    kp = k
    ra, _ = fat.relu_rff_attn(qp, kp, jax.lax.stop_gradient(proj))
    return fat.kl(attn_dist, ra).mean()
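# Variants 3 and 4 below renormalize q and k with `renorm`, which is
# referenced but never defined in this snippet. A minimal stand-in,
# assuming L2 normalization along the given axis (this implementation
# is an assumption, not the original helper):
def renorm(x, axis=-1, eps=1e-6):
    # Scale each vector to unit L2 norm along `axis`.
    return x / (jnp.linalg.norm(x, axis=axis, keepdims=True) + eps)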
# Variant 3: rff_attn on renormalized queries and keys, with a frozen
# projection (`scale` again unused).
def loss_rff_renorm_frozen(q, k, scale, proj, attn_dist):
    qp = renorm(q, axis=-1)
    kp = renorm(k, axis=-1)
    ra, _ = fat.rff_attn(qp, kp, jax.lax.stop_gradient(proj))
    return fat.kl(attn_dist, ra).mean()
# Variant 4: as variant 3, but the projection is learned.
def loss_rff_renorm(q, k, scale, proj, attn_dist):
    qp = renorm(q, axis=-1)
    kp = renorm(k, axis=-1)
    ra, _ = fat.rff_attn(qp, kp, proj)
    return fat.kl(attn_dist, ra).mean()
# Baseline: exact softmax attention for reference; `dummy_proj` is
# unused.
def loss_exact_softmax(q, k, dummy_proj, attn):
    logits = q @ k.T
    probs = softmax(logits, axis=-1)
    return fat.kl(attn, probs).mean()
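# Usage sketch (shapes and the training loop are illustrative, and the
# `fat` module is assumed importable): fit the projection by gradient
# descent so that random-feature attention matches a teacher softmax
# attention distribution.
if __name__ == "__main__":
    kq, kk, kp = jax.random.split(jax.random.PRNGKey(0), 3)
    n, d, num_feats = 8, 16, 32  # num_feats (feature count) is a guess
    q = jax.random.normal(kq, (n, d))
    k = jax.random.normal(kk, (n, d))
    proj = jax.random.normal(kp, (d, num_feats))
    attn_dist = softmax(q @ k.T, axis=-1)  # teacher attention
    grad_fn = jax.jit(jax.grad(loss_rff_renorm, argnums=3))
    for _ in range(100):
        proj = proj - 0.1 * grad_fn(q, k, 1.0, proj, attn_dist)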