def __init__(self, embed_dim, num_heads, ff_dim, prenorm=False, act=lambda x: x.relu()):
    """Initialize the parameters of a transformer encoder block.

    Parameters are stored as raw (weight, bias) tensor tuples rather than
    layer objects.

    Args:
        embed_dim: model width; must be divisible by num_heads.
        num_heads: number of attention heads.
        ff_dim: hidden width of the feed-forward sublayer.
        prenorm: if True, presumably LayerNorm is applied before each
            sublayer (pre-LN) in the forward pass — confirm against caller.
        act: activation used in the feed-forward sublayer (default ReLU).
    """
    self.num_heads = num_heads
    self.head_size = embed_dim // num_heads
    assert self.head_size * self.num_heads == embed_dim
    self.prenorm, self.act = prenorm, act

    # (weight, bias) pair for a dense projection in_dim -> out_dim.
    def dense(in_dim, out_dim):
        return (Tensor.uniform(in_dim, out_dim), Tensor.zeros(out_dim))

    # Attention projections (kept in this order to preserve RNG sequence).
    self.query = dense(embed_dim, embed_dim)
    self.key = dense(embed_dim, embed_dim)
    self.value = dense(embed_dim, embed_dim)
    self.out = dense(embed_dim, embed_dim)
    # Position-wise feed-forward network.
    self.ff1 = dense(embed_dim, ff_dim)
    self.ff2 = dense(ff_dim, embed_dim)
    # LayerNorm (scale, shift) pairs for the two sublayers.
    self.ln1 = (Tensor.ones(embed_dim), Tensor.zeros(embed_dim))
    self.ln2 = (Tensor.ones(embed_dim), Tensor.zeros(embed_dim))
def __init__(self, sz, eps=1e-5, track_running_stats=False, training=False, momentum=0.1):
    """Initialize batch-norm state for `sz` channels.

    Args:
        sz: number of features/channels normalized.
        eps: small constant added for numerical stability.
        track_running_stats: whether running statistics are maintained.
        training: training-mode flag stored on the instance.
        momentum: update factor for the running statistics.
    """
    self.eps = eps
    self.track_running_stats = track_running_stats
    self.training = training
    self.momentum = momentum
    # Learnable affine parameters: scale starts at 1, shift at 0.
    self.weight = Tensor.ones(sz)
    self.bias = Tensor.zeros(sz)
    # Running statistics are buffers, excluded from gradient tracking.
    self.running_mean = Tensor.zeros(sz, requires_grad=False)
    self.running_var = Tensor.ones(sz, requires_grad=False)
    self.num_batches_tracked = Tensor.zeros(1, requires_grad=False)
def __init__(self, sz, eps=1e-5, affine=True, track_running_stats=True, momentum=0.1):
    """Initialize BatchNorm2D state for `sz` channels.

    Args:
        sz: number of channels normalized.
        eps: small constant added for numerical stability.
        affine: must be True; the non-affine variant is not implemented.
        track_running_stats: whether running statistics are maintained.
        momentum: update factor for the running statistics.
    """
    # Only the affine form is implemented, so reject anything else early.
    assert affine == True, "BatchNorm2D is only supported with affine"
    self.eps = eps
    self.track_running_stats = track_running_stats
    self.momentum = momentum
    # Learnable affine parameters: scale starts at 1, shift at 0.
    self.weight = Tensor.ones(sz)
    self.bias = Tensor.zeros(sz)
    # Running statistics are buffers, excluded from gradient tracking.
    self.running_mean = Tensor.zeros(sz, requires_grad=False)
    self.running_var = Tensor.ones(sz, requires_grad=False)
    self.num_batches_tracked = Tensor.zeros(1, requires_grad=False)
def __init__(self, sz, eps=0.001):
    """Initialize batch-norm state for `sz` channels.

    Unlike the other variants in this file, `eps` and the constant 2 are
    held as non-trainable tensors — presumably so the normalization math
    stays entirely in tensor ops; confirm against the forward pass.

    Args:
        sz: number of features/channels normalized.
        eps: small constant added for numerical stability.
    """
    self.eps = Tensor([eps], requires_grad=False)
    self.two = Tensor([2], requires_grad=False)
    # Learnable affine parameters: scale starts at 1, shift at 0.
    self.weight = Tensor.ones(sz)
    self.bias = Tensor.zeros(sz)
    # Running statistics are buffers, excluded from gradient tracking.
    self.running_mean = Tensor.zeros(sz, requires_grad=False)
    self.running_var = Tensor.ones(sz, requires_grad=False)
    self.num_batches_tracked = Tensor.zeros(1, requires_grad=False)
def __init__(self, layers=12, embed_dim=192, num_heads=3):
    """Initialize a ViT-style model (defaults match the Tiny configuration).

    Args:
        layers: number of transformer encoder blocks.
        embed_dim: model width.
        num_heads: attention heads per block.
    """
    self.embed_dim = embed_dim
    # Patch embedding: conv-style weight (embed_dim, 3, 16, 16) plus bias,
    # i.e. 16x16 RGB patches projected to embed_dim.
    self.embedding = (Tensor.uniform(embed_dim, 3, 16, 16), Tensor.zeros(embed_dim))
    # Class token and positional embedding; 197 is presumably
    # 196 patches (14x14) + 1 cls token for 224x224 inputs — confirm.
    self.cls = Tensor.ones(1, 1, embed_dim)
    self.pos_embedding = Tensor.ones(1, 197, embed_dim)
    # Pre-LN transformer encoder stack with GELU feed-forward layers.
    self.tbs = [
        TransformerBlock(embed_dim=embed_dim, num_heads=num_heads,
                         ff_dim=embed_dim * 4, prenorm=True,
                         act=lambda x: x.gelu())
        for _ in range(layers)
    ]
    # Final norm and 1000-way classification head.
    self.encoder_norm = (Tensor.uniform(embed_dim), Tensor.zeros(embed_dim))
    self.head = (Tensor.uniform(embed_dim, 1000), Tensor.zeros(1000))
def test_dropout(self):
    """Dropout at rate 0.1 should zero roughly 10% of elements in training mode.

    Fix: the original set the global `Tensor.training = True` and never
    restored it, leaking training mode into every subsequently-run test.
    The flag is now saved and restored in a finally block.
    """
    prev_training = getattr(Tensor, "training", False)
    Tensor.training = True  # dropout is only active in training mode
    try:
        n, rate = 1_000_000, 0.1
        w = Tensor.ones(n).dropout(rate)
        non_zeros = np.count_nonzero(w.cpu().data)
        # With 1M samples the survivor count concentrates tightly around
        # n * (1 - rate); rtol=1e-3 allows normal statistical fluctuation.
        expected = n * (1 - rate)
        np.testing.assert_allclose(non_zeros, expected, rtol=1e-3)
    finally:
        Tensor.training = prev_training