Example No. 1
 def test_conv_backward_pass_options(self):
     lang = CONV_TRAIN
     N, C, H, W, O, kH, kW, sH, sW = 32, 4, 56, 56, 16, 1, 1, 1, 1
     convolution = tc.define(
         lang, training=True, name="convolution",
         backward="convolution_grad", constants={"sh": sH, "sw": sW})
     I = Variable(torch.randn(N, C, H, W).cuda(), requires_grad=True)
     W = Parameter(torch.randn(O, C, kH, kW).cuda())
     out = convolution(I, W, options=[tc.Options("conv"), tc.Options("group_conv")])
     out.sum().backward()
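CONV_TRAIN is not defined in this snippet; Example No. 15 below contains an equivalent forward/backward pair named CONV_LANG, so CONV_TRAIN presumably holds the same TC text. A sketch mirroring Example No. 15 (an assumption, not part of the original test):

# Assumed definition of CONV_TRAIN (same text as CONV_LANG in Example No. 15).
CONV_TRAIN = """
def convolution(float(N,C,H,W) I, float(M,C,KH,KW) W1) -> (O) {{
   O(n, m, h, w) +=! I(n, c, {sh} * h + kh, {sw} * w + kw) * W1(m, c, kh, kw)
}}
def convolution_grad(float(N,C,H,W) I, float(M,C,KH,KW) W1, float(N,M,H,W) O_grad) -> (I_grad, W1_grad) {{
   I_grad(n, c, h, w) +=! O_grad(n, m, {sh} * h - kh, {sw} * w - kw) * W1(m, c, kh, kw)
   W1_grad(m, c, kh, kw) +=! O_grad(n, m, {sh} * h - kh, {sw} * w - kw) * I(n, c, h, w)
}}
"""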
Example No. 2
def test_mv(R, C):
    mat = Variable(torch.randn(R, C).cuda())
    vector = Variable(torch.randn(C).cuda())
    out_tc = mv(mat, vector, options=tc.Options("mlp"))
    out_pt = torch.mv(mat, vector)
    assert out_tc.cpu().data.view(-1).tolist() == approx(
        out_pt.cpu().data.view(-1).tolist(), abs=1e-4)
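The mv layer exercised by this test is defined elsewhere in the test module. A minimal sketch of a definition that would satisfy it, assuming the usual TC matrix-vector product (not part of the original snippet):

import tensor_comprehensions as tc

# Assumed module-level definition of mv used by test_mv.
mv = tc.define("""
def mv(float(R, C) M, float(C) V) -> (O) {
  O(r) +=! M(r, c) * V(c)
}""", name="mv")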
Example No. 3
    def test_train_matmul(self):
        LANG = """
        def matmul(float(M,N) A, float(N,K) B) -> (output) {
          output(i, j) +=! A(i, kk) * B(kk, j)
        }
        def matmul_grad(float(M,N) A, float(N,K) B, float(M,K) O_grad) -> (A_grad, B_grad){
          A_grad(i, j) +=! O_grad(i, kk) * B(j, kk)
          B_grad(i, j) +=! O_grad(kk, j) * A(kk, i)
        }
        """

        matmul = tc.define(LANG, name="matmul", training=True, backward="matmul_grad")
        mat1 = Parameter(torch.randn(3, 4).cuda())
        mat2 = Variable(torch.randn(4, 5).cuda(), requires_grad=True)
        out = matmul(mat1, mat2, options=[tc.Options("mlp"), tc.Options("mlp")])
        out.sum().backward()
Example No. 4
 def test_absolute(self):
     LANG = """
     def abs(float(M, N) A) -> (O1) {
       O1(m, n) = fabs(A(m, n))
     }
     """
     absolute = tc.define(LANG, name="abs")
     A = -1 * torch.randn(3, 4).cuda()
     out = absolute(A, options=tc.Options("pointwise"))
Example No. 5
def test_matmul(M, N, K):
    mat1 = Variable(torch.randn(M, N).cuda())
    mat2 = Variable(torch.randn(N, K).cuda())
    matmul.autotune(mat1,
                    mat2,
                    cache=True,
                    options=tc.Options("mlp"),
                    **tc.autotuner_settings)
    out_tc = matmul(mat1, mat2)
    out_pt = torch.matmul(mat1, mat2)
    assert out_tc.cpu().data.view(-1).tolist() == approx(
        out_pt.cpu().data.view(-1).tolist(), abs=1e-4)
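Here matmul is a module-level TC layer and tc.autotuner_settings is the library's default tuner configuration. The layer is presumably defined from the same matmul TC text that appears in Example No. 15; a sketch under that assumption:

import tensor_comprehensions as tc

# Assumed module-level definition used by test_matmul (same TC text as Example No. 15).
matmul = tc.define("""
def matmul(float(M,N) A, float(N,K) B) -> (output) {
  output(i, j) +=! A(i, kk) * B(kk, j)
}""", name="matmul")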
Example No. 6
def tmm(M, K, N, **compare_kwargs):
    global A, B, tc_tmm
    print('tmm(M={}, K={}, N={})'.format(M, K, N))
    A = Variable(torch.Tensor(M, K).cuda().normal_())
    B = Variable(torch.Tensor(N, K).cuda().normal_())
    tc_tmm = tc.define('''
    def tmm(float(M, K) A, float(N, K) B) -> (C) {
        C(m, n) +=! A(m, kk) * B(n, kk)
    }''',
                       name='tmm')
    tc_tmm.autotune(A, B, options=tc.Options('mlp'), **autotune_kwargs)
    compare('tc_tmm(A, B)', 'torch.mm(A, B.t())', **compare_kwargs)
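autotune_kwargs and compare are benchmark helpers shared by this and the following benchmarks (Examples No. 7 and 10) but not shown here. The sketch below is an assumption: compare is modelled on the pytest.approx assertions used elsewhere in this collection, and the tuner settings are placeholders.

import torch
from pytest import approx

# Assumed tuner settings shared by the benchmarks in Examples No. 6, 7 and 10.
autotune_kwargs = dict(generations=2, pop_size=10, threads=8)

def compare(tc_expr, ref_expr, abs_tol=1e-4):
    # Assumed behaviour: evaluate both expression strings against the module
    # globals set by the benchmark functions and check the flattened results.
    out_tc = eval(tc_expr, globals())
    out_ref = eval(ref_expr, globals())
    assert out_tc.cpu().data.view(-1).tolist() == approx(
        out_ref.cpu().data.view(-1).tolist(), abs=abs_tol)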
Example No. 7
def tbmm(B, M, K, N, **compare_kwargs):
    global X, Y, tc_tbmm
    print('tbmm(B={}, M={}, K={}, N={})'.format(B, M, K, N))
    X = Variable(torch.Tensor(B, N, M).cuda().normal_())
    Y = Variable(torch.Tensor(B, K, M).cuda().normal_())
    tc_tbmm = tc.define('''
    def tbmm(float(B, N, M) X, float(B, K, M) Y) -> (Z) {
        Z(b, n, k) +=! X(b, n, m) * Y(b, k, m)
    }''',
                        name='tbmm')
    tc_tbmm.autotune(X, Y, options=tc.Options('mlp'), **autotune_kwargs)
    compare('tc_tbmm(X, Y)', 'torch.bmm(X, Y.transpose(1, 2))',
            **compare_kwargs)
Example No. 8
def autotune():
    input, running_mean, running_std, weight, bias, params = generate_data()
    input = input.transpose(0, 1).contiguous().view(input.shape[1], -1)
    grad_output = input.clone()
    options = tc.Options("mlp")
    tuner_kwargs = dict(options=options,
                        generations=1,
                        pop_size=10,
                        crossover_rate=80,
                        number_elites=1,
                        threads=20)

    autotune_with_named_cache(BatchReNorm2dTCFunction.calc_mean_std, input,
                              params, **tuner_kwargs)
    batchMean, batchStd = BatchReNorm2dTCFunction.calc_mean_std(input, params)
    autotune_with_named_cache(BatchReNorm2dTCFunction.calc_r_d, batchStd,
                              batchMean, running_mean, running_std, params,
                              **tuner_kwargs)
    r, d = BatchReNorm2dTCFunction.calc_r_d(batchStd, batchMean, running_mean,
                                            running_std, params)
    autotune_with_named_cache(BatchReNorm2dTCFunction.calc_O, input, weight,
                              bias, batchStd, batchMean, r, d, **tuner_kwargs)
    O = BatchReNorm2dTCFunction.calc_O(input, weight, bias, batchStd,
                                       batchMean, r, d)
    autotune_with_named_cache(BatchReNorm2dTCFunction.calc_running_mean_std,
                              batchStd, batchMean, running_mean, running_std,
                              params, **tuner_kwargs)
    rMeanOut, rStdOut = BatchReNorm2dTCFunction.calc_running_mean_std(
        batchStd, batchMean, running_mean, running_std, params)

    autotune_with_named_cache(BatchReNorm2dTCFunction.calc_xHat_grad, weight,
                              grad_output, **tuner_kwargs)
    xHat_grad = BatchReNorm2dTCFunction.calc_xHat_grad(weight, grad_output)
    autotune_with_named_cache(BatchReNorm2dTCFunction.calc_mean_std_grad,
                              input, batchMean, batchStd, r, xHat_grad,
                              **tuner_kwargs)
    batchMean_grad, batchStd_grad = BatchReNorm2dTCFunction.calc_mean_std_grad(
        input, batchMean, batchStd, r, xHat_grad)
    autotune_with_named_cache(BatchReNorm2dTCFunction.calc_xHat, input,
                              batchMean, batchStd, r, d, **tuner_kwargs)
    xHat = BatchReNorm2dTCFunction.calc_xHat(input, batchMean, batchStd, r, d)
    autotune_with_named_cache(BatchReNorm2dTCFunction.calc_weight_bias_grad,
                              grad_output, xHat, **tuner_kwargs)
    weight_grad, bias_grad = BatchReNorm2dTCFunction.calc_weight_bias_grad(
        grad_output, xHat)
    autotune_with_named_cache(BatchReNorm2dTCFunction.calc_I_grad, input,
                              batchMean, batchStd, r, xHat_grad,
                              batchMean_grad, batchStd_grad, **tuner_kwargs)
    I_grad = BatchReNorm2dTCFunction.calc_I_grad(input, batchMean, batchStd, r,
                                                 xHat_grad, batchMean_grad,
                                                 batchStd_grad)
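generate_data, BatchReNorm2dTCFunction and autotune_with_named_cache belong to the surrounding batch-renorm module and are not shown. The tuning helper is presumably a thin wrapper over the autotune call used elsewhere in this collection; a sketch under that assumption:

def autotune_with_named_cache(tc_fn, *inputs, **tuner_kwargs):
    # Assumed behaviour: autotune the TC-defined function on the given inputs
    # and persist the tuned options (cache=True form as in Example No. 5;
    # the real helper may instead use an explicit, per-function cache path).
    return tc_fn.autotune(*inputs, cache=True, **tuner_kwargs)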
Example No. 9
 def test_layernorm(self):
     # NOTE: take note of use of {{ }} below for handling TC with scalars
     lang = """
     def layernorm(float(T, B, C) I) -> (O, mean, centered, var)
     {{
        mean(t, b) +=! I(t, b, c) / C
        centered(t, b, c) = I(t, b, c) - mean(t, b)
        var(t, b) +=! centered(t, b, c) * centered(t, b, c)
        var(t, b) = (var(t, b) + {eps}) / C
        O(t, b, c) = centered(t, b, c) * rsqrt(var(t, b))
     }}
     """
     layernorm = tc.define(lang, name="layernorm", constants={"eps": 1e-5})
     inp = torch.randn(7, 32, 64).cuda()
     options = layernorm.autotune(inp, options=tc.Options("mlp"),
                                  **tc.autotuner_settings)
     out = layernorm(inp, options=options)
Example No. 10
def gconv(N, G, F, C, W, H, KH, KW, **compare_kwargs):
    global tc_I, tc_W1, tc_gconv, nn_I, nn_gconv
    print('gconv(N={}, G={}, F={}, C={}, W={}, H={}, KH={}, KW={})'.format(
        N, G, F, C, W, H, KH, KW))
    tc_I = Variable(torch.Tensor(N, G, C, H, W).cuda().normal_())
    nn_I = tc_I.view(N, G * C, H, W)
    nn_gconv = nn.Conv2d(G * C, G * F, (KH, KW), groups=G, bias=False).cuda()
    tc_W1 = nn_gconv.weight.view(G, F, C, KH, KW)
    tc_gconv = tc.define('''
    def gconv(float(N, G, C, H, W) I, float(G, F, C, KH, KW) W1) -> (O) {
        O(n, g, o, h, w) +=! I(n, g, i, h + kh, w + kw) * W1(g, o, i, kh, kw)
    }''',
                         name='gconv')
    tc_gconv.autotune(tc_I,
                      tc_W1,
                      options=tc.Options('group_conv'),
                      **autotune_kwargs)
    compare('tc_gconv(tc_I, tc_W1)', 'nn_gconv(nn_I)', **compare_kwargs)
Example No. 11
 def build_model_tc(self):
     import tensor_comprehensions as tc
     lang = """
     def convolution(float(N,CI,H,W) I, float(CO,CI,KH,KW) W1) -> (O) {
         O(n, co, h, w) +=! I(n, ci, h + kh, w + kw) * W1(co, ci, kh, kw)
     }
     """
     convolution = tc.define(lang, name="convolution")
     inp, kern = self.get_dataset()
     if self.params.backend_opts['tc_autotune']:
         convolution.autotune(
             inp,
             kern,
             cache=self.get_tc_cache(),
             options=tc.Options("conv"),
             generations=self.params.backend_opts["tc_at_generations"],
             pop_size=self.params.backend_opts["tc_at_population"],
             elites=1,
             threads=8)
     return convolution
Example No. 12
 def build_model_tc(self):
     actmap = {"relu": "fmax(OUT(n, i, j), 0)"}
     import tensor_comprehensions as tc
     lang = """
     def matmul(float(N, I, K) IN, float(K, J) W, float(J) B) -> (OUT) {
         OUT(n, i, j) +=! IN(n, i, k) * W(k, j)
         OUT(n, i, j) = OUT(n, i, j) + B(j)
      """
     if self.activation:
         lang += "OUT(n, i, j) = {}\n".format(actmap[self.activation])
     lang += "}"
     inp, wgt, bias = self.get_dataset()
     matmul = tc.define(lang, name="matmul")
     matmul.autotune(
         inp,
         wgt,
         bias,
         cache=self.get_tc_cache(),
         options=tc.Options("mlp"),
         generations=self.params.backend_opts['tc_at_generations'],
         pop_size=self.params.backend_opts['tc_at_population'],
         elites=1,
         threads=8)
     return matmul
Example No. 13
def test_maxpool(B, C, H, W):
    tensor = Variable(torch.randn(B, C, H, W).cuda())
    out_tc = maxpool(tensor, options=tc.Options("conv"))
    out_pt = F.max_pool2d(tensor, kernel_size=2, stride=1)
    assert out_tc.cpu().data.view(-1).tolist() == approx(
        out_pt.cpu().data.view(-1).tolist())
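The maxpool layer compared against F.max_pool2d (kernel 2, stride 1) is defined outside this snippet. A plausible definition, assuming TC's max reduction and where-clause range syntax:

import tensor_comprehensions as tc

# Assumed module-level definition of maxpool (2x2 window, stride 1), matching
# the F.max_pool2d reference above.
maxpool = tc.define("""
def maxpool(float(B, C, H, W) input) -> (output) {
  output(b, c, h, w) max=! input(b, c, h + kh, w + kw) where kh in 0:2, kw in 0:2
}""", name="maxpool")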
Example No. 14
def test_abs(M, N):
    mat = Variable(torch.randn(M, N).cuda())
    out_tc = abs(mat, options=tc.Options("pointwise"))
    out_pt = torch.abs(mat)
    assert out_tc.cpu().data.view(-1).tolist() == approx(
        out_pt.cpu().data.view(-1).tolist())
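The abs layer used here is not defined in the snippet; it presumably reuses the TC text from Example No. 4 at module level. A sketch:

import tensor_comprehensions as tc

# Assumed module-level definition, identical to the lang in Example No. 4.
# The name shadows Python's built-in abs, as the test's call implies.
abs = tc.define("""
def abs(float(M, N) A) -> (O1) {
  O1(m, n) = fabs(A(m, n))
}""", name="abs")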
Example No. 15
import tensor_comprehensions as tc
import torch
from torch.autograd import Variable
from torch.nn.parameter import Parameter

matmul = tc.define(tc.database['matmul']['lang'], name='matmul')
mat1, mat2 = torch.randn(3, 4).cuda(), torch.randn(4, 5).cuda()
out = matmul(mat1, mat2, options=tc.Options("mlp"))
print(out)

CONV_LANG = """
def convolution(float(N,C,H,W) I, float(M,C,KH,KW) W1) -> (O) {{
   O(n, m, h, w) +=! I(n, c, {sh} * h + kh, {sw} * w + kw) * W1(m, c, kh, kw)
}}
def convolution_grad(float(N,C,H,W) I, float(M,C,KH,KW) W1, float(N,M,H,W) O_grad) -> (I_grad, W1_grad) {{
   I_grad(n, c, h, w) +=! O_grad(n, m, {sh} * h - kh, {sw} * w - kw) * W1(m, c, kh, kw)
   W1_grad(m, c, kh, kw) +=! O_grad(n, m, {sh} * h - kh, {sw} * w - kw) * I(n, c, h, w)
}}
"""
N, C, H, W, O, kH, kW, sH, sW = 32, 4, 56, 56, 16, 1, 1, 1, 1
convolution = tc.define(CONV_LANG, training=True, name="convolution", backward="convolution_grad",
                        constants={"sh": sH, "sw": sW})
I = Variable(torch.randn(N, C, H, W).cuda(), requires_grad=True)
W = Parameter(torch.randn(O, C, kH, kW).cuda())
out = convolution(I, W, options=[tc.Options("conv"), tc.Options("group_conv")])
out[0].sum().backward()

lang = """
def matmul(float(M,N) A, float(N,K) B) -> (output) {
  output(i, j) +=! A(i, kk) * B(kk, j)
}
"""
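The script is cut off mid-definition here; going by Example No. 3 above, it presumably continues with a matmul_grad definition and a training-mode run. A sketch of that continuation (not the original text):

lang += """
def matmul_grad(float(M,N) A, float(N,K) B, float(M,K) O_grad) -> (A_grad, B_grad){
  A_grad(i, j) +=! O_grad(i, kk) * B(j, kk)
  B_grad(i, j) +=! O_grad(kk, j) * A(kk, i)
}
"""
matmul = tc.define(lang, name="matmul", training=True, backward="matmul_grad")
mat1 = Parameter(torch.randn(3, 4).cuda())
mat2 = Variable(torch.randn(4, 5).cuda(), requires_grad=True)
out = matmul(mat1, mat2, options=[tc.Options("mlp"), tc.Options("mlp")])
out.sum().backward()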
Example No. 16
    return times


def autotune(cache_file='tc_cache'):
    print("Starting autotune")
    A, B = torch.randn(M, K).cuda(), torch.randn(K, N).cuda()
    sgemm = tc.define(lang, name="sgemm")
    best_opts = sgemm.autotune(A,
                               B,
                               cache=cache_file,
                               generations=25,
                               pop_size=50,
                               crossover_rate=70,
                               number_elites=5,
                               gpus="1,2,3")
    print("Done autotune")
    print(sorted(test(best_opts, 20))[10])
    return best_opts


def load_cache(cache_file='tc_cache'):
    A, B = torch.randn(M, K).cuda(), torch.randn(K, N).cuda()
    sgemm = tc.define(lang, name="sgemm")
    # Couldn't find a reasonable way to load cache:
    return sgemm.autotune(A, B, cache=cache_file, generations=0)


# autotune()
print("naive:", sorted(test(tc.Options("naive"), 10))[5])
print("autotuned:", sorted(test(load_cache(), 100))[50])