Example #1
def tvm_conv(batch, channel, out_channel, height, width, k_h, k_w, stride, pad, target, devid=0, number=10):
    A = tvm.te.placeholder((batch, channel, height, width), dtype="float32")
    W = tvm.te.placeholder((out_channel, channel, k_h, k_w), dtype="float32")
    Output = conv2d_nchw(A, W, stride=stride, padding=pad)
    s = tvm.te.create_schedule(Output.op)
    bufs = [A, W, Output]
    return evaluate(s, bufs, target, devid, number)
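A minimal usage sketch (not part of the original example), assuming conv2d_nchw and evaluate are the FlexTensor helpers imported by the file this snippet comes from:
# Time the default (unscheduled) conv2d for a YOLO-style 7x7 convolution;
# the shapes and the "cuda" target below are illustrative.
cost = tvm_conv(batch=1, channel=3, out_channel=64, height=448, width=448,
                k_h=7, k_w=7, stride=2, pad=3, target="cuda")
print("baseline conv2d time:", cost)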
Example #2
def try_yolo_conv(config, parameter, fsch):
    # get the compute
    # (1, 3, 448, 448, 64, 3, 7, 7, 1, 2, 3, 1, 1)
    batch, CI, H, W, CO, _, kh, kw, _, st, pad, dilation, group = config
    inputs = tvm.placeholder((batch, CI, H, W), dtype="float32")
    weight = tvm.placeholder((CO, CI, kh, kw), dtype="float32")
    outputs = conv2d_nchw(inputs, weight, stride=st, padding=pad, dilation=dilation, groups=group)
    
    s = tvm.create_schedule(outputs.op)
    fsch(s, outputs, inputs, weight, parameter)

    arg_bufs = [inputs, weight, outputs]
    stmt = tvm.lower(s, arg_bufs, simple_mode=True)
    # print(stmt)
    dev_id = 2
    ctx = tvm.nd.context("cuda", dev_id)
    max_dims = ctx.max_thread_dimensions
    kwargs = {
        "max_shared_memory_per_block": ctx.max_shared_memory_per_block,
        "max_threads_per_block": ctx.max_threads_per_block,
        "max_thread_x": max_dims[0],
        "max_thread_y": max_dims[1],
        "max_thread_z": max_dims[2]
    }
    verify = tvm.ir_pass.VerifyGPUCode(stmt, kwargs)
    # print("config is:\n %s" % (str(config)))
    if verify:
        print("Valid kernel")
        time_cost = _evaluate(s, arg_bufs, "cuda", dev_id, 10)
        print("Yolo conv use", time_cost, "ms\n")
    else:
        print("Invalid kernel")
        time_cost = float("inf")
    return time_cost
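A hypothetical driver for the function above (not from the original source): _naive_gpu_schedule and yolo_conv_config are illustrative; a real fsch would be FlexTensor's tuned schedule function, and parameter is simply ignored here. Note that dev_id is hard-coded to 2 inside try_yolo_conv.
def _naive_gpu_schedule(s, outputs, inputs, weight, parameter):
    # Illustrative stand-in for a real schedule function: inline the padding
    # stage and bind one fused output loop to CUDA blocks/threads.
    pad_data = s[outputs].op.input_tensors[0]
    s[pad_data].compute_inline()
    n, c, h, w = s[outputs].op.axis
    fused = s[outputs].fuse(n, c, h, w)
    bx, tx = s[outputs].split(fused, factor=256)
    s[outputs].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[outputs].bind(tx, tvm.thread_axis("threadIdx.x"))

yolo_conv_config = (1, 3, 448, 448, 64, 3, 7, 7, 1, 2, 3, 1, 1)
try_yolo_conv(yolo_conv_config, parameter=None, fsch=_naive_gpu_schedule)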
Example #3
def conv2d(N,
           C,
           H,
           W,
           K,
           kernel_size,
           stride=1,
           padding=0,
           dilation=1,
           groups=1):
    Img = tvm.te.placeholder((N, C, H, W))
    W = tvm.te.placeholder((K, C // groups, kernel_size, kernel_size))
    Output = conv2d_nchw(Img,
                         W,
                         stride=stride,
                         padding=padding,
                         dilation=dilation,
                         groups=groups)
    return [Output.op], [Img, W, Output]
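This variant only packages the compute as an (ops, bufs) task definition. A hypothetical consumer (not from the original source), assuming flextensor.nn.conv2d_nchw is importable:
ops, bufs = conv2d(1, 64, 56, 56, 128, kernel_size=3, stride=1, padding=1)
s = tvm.te.create_schedule(ops)               # default (untuned) schedule
print(tvm.lower(s, bufs, simple_mode=True))   # inspect the lowered loop nest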
Example #4
def conv2d_batching(N, H, W, CO, CI, KH, KW, stride, padding):
    data = tvm.placeholder((N, CI, H, W), name='data', dtype="float32")
    kernel = tvm.placeholder((CO, CI, KH, KW), name='kernel', dtype="float32")
    conv = conv2d_nchw(data, kernel, stride=stride, padding=padding)
    s = tvm.create_schedule([conv.op])

    ##### space definition begin #####
    n, f, y, x = s[conv].op.axis
    rc, ry, rx = s[conv].op.reduce_axis
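    # fusing y and x here, before cache_write, makes the local cache stage
    # below adopt the fused (n, f, y*x) layout, which is why OL is unpacked
    # into only three axes later on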
    fused = s[conv].fuse(y, x)
    cfg = autotvm.get_config()
    cfg.define_split("tile_n", n, num_outputs=4)
    cfg.define_split("tile_f", f, num_outputs=4)
    cfg.define_split("tile_rc", rc, num_outputs=3)
    cfg.define_split("tile_ry", ry, num_outputs=3)
    cfg.define_split("tile_rx", rx, num_outputs=3)
    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
    cfg.define_knob("unroll_explicit", [0, 1])
    ##### space definition end #####

    # inline padding
    pad_data = s[conv].op.input_tensors[0]
    s[pad_data].compute_inline()
    data, raw_data = pad_data, data

    output = conv
    OL = s.cache_write(conv, 'local')

    # create cache stage
    AA = s.cache_read(data, 'shared', [OL])
    WW = s.cache_read(kernel, 'shared', [OL])
    AL = s.cache_read(AA, 'local', [OL])
    WL = s.cache_read(WW, 'local', [OL])

    # tile and bind spatial axes
    n, f, y, x = s[output].op.axis
    yx = s[output].fuse(y, x)
    bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n)
    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
    kernel_scope = yx

    s[output].bind(yx, tvm.thread_axis("blockIdx.z"))
    s[output].bind(bn, tvm.thread_axis("blockIdx.y"))
    s[output].bind(bf, tvm.thread_axis("blockIdx.x"))
    s[output].bind(vn, tvm.thread_axis("vthread"))
    s[output].bind(vf, tvm.thread_axis("vthread"))
    s[output].bind(tn, tvm.thread_axis("threadIdx.y"))
    s[output].bind(tf, tvm.thread_axis("threadIdx.x"))
    s[output].reorder(yx, bn, bf, vn, vf, tn, tf, ni, fi)
    s[OL].compute_at(s[output], tf)

    # tile reduction axes
    n, f, yx = s[OL].op.axis
    rc, ry, rx = s[OL].op.reduce_axis
    rco, rcm, rci = cfg['tile_rc'].apply(s, OL, rc)
    ryo, rym, ryi = cfg['tile_ry'].apply(s, OL, ry)
    rxo, rxm, rxi = cfg['tile_rx'].apply(s, OL, rx)
    s[OL].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, n, f, yx)

    s[AA].compute_at(s[OL], rxo)
    s[WW].compute_at(s[OL], rxo)
    s[AL].compute_at(s[OL], rxm)
    s[WL].compute_at(s[OL], rxm)

    # cooperative fetching
    for load in [AA, WW]:
        n, f, y, x = s[load].op.axis
        fused = s[load].fuse(n, f, y, x)
        ty, fused = s[load].split(fused, nparts=cfg["tile_n"].size[2])
        tx, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2])
        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))

    # tune unroll
    s[output].pragma(kernel_scope, 'auto_unroll_max_step',
                     cfg['auto_unroll_max_step'].val)
    s[output].pragma(kernel_scope, 'unroll_explicit',
                     cfg['unroll_explicit'].val)

    return s, [raw_data, kernel, conv]
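The template above mirrors the classic AutoTVM CUDA conv2d tutorial. A hedged sketch of how such a template is typically registered and tuned with the same pre-0.7 autotvm API the example uses (the task arguments and log-file name below are illustrative, not from the original source):
from tvm import autotvm

# Register the function above as an AutoTVM template and launch a short tuning run.
conv2d_batching_tmpl = autotvm.template(conv2d_batching)
task = autotvm.task.create(conv2d_batching_tmpl,
                           args=(1, 224, 224, 64, 3, 7, 7, 2, 3),
                           target="cuda")
measure_option = autotvm.measure_option(
    builder=autotvm.LocalBuilder(),
    runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4))
tuner = autotvm.tuner.XGBTuner(task)
tuner.tune(n_trial=20,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file("conv2d_batching.log")])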
Example #5
            data.set_op_relation(op)

    return data


if __name__ == "__main__":
    from flextensor.nn import conv2d_nchw
    N = 16
    C = 256
    H = 14
    W = 14
    K = 512
    k = 3
    Img = tvm.te.placeholder([N, C, H, W], dtype="float32")
    Kernel = tvm.te.placeholder([K, C, k, k], dtype="float32")
    Outputs = conv2d_nchw(Img, Kernel, None, 1, 1, 1, 1)

    a = tvm.te.placeholder([4, 4])
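    # NOTE: the index expression below algebraically simplifies to a[i, j];
    # it presumably exercises the index simplification/analysis machinery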
    b = tvm.te.compute([4, 4], lambda i, j: a[
        (3 * i + j) * 4 - 12 * i - 4 * j + i, j])

    data = build(Outputs.op)
    var1 = IndexNode(Outputs.op, 0, "spatial")
    var2 = Outputs.op.axis[0].var
    var3 = Outputs.op.body[0].source[0].a.args[0]
    print(var1, var2, var3)
    tmp = {}
    tmp[var1] = 999
    print(tmp[var2], tmp[var3])
    print(str(data))
Example #6
def main():
    batch = 2
    dtype = "float64"
    img = tvm.te.placeholder([batch, 1, 32, 32], dtype=dtype, name="img")
    label = tvm.te.placeholder([batch, 10], dtype=dtype, name="label")
    weight_1 = tvm.te.placeholder([6, 1, 5, 5], dtype=dtype, name="w1")
    weight_2 = tvm.te.placeholder([16, 6, 5, 5], dtype=dtype, name="w2")
    weight_3 = tvm.te.placeholder([120, 16, 5, 5], dtype=dtype, name="w3")
    weight_4 = tvm.te.placeholder([120, 84], dtype=dtype, name="w4")
    weight_5 = tvm.te.placeholder([84, 10], dtype=dtype, name="w5")

    act = tanh  # alternative activation: ReLU

    t1 = conv2d_nchw(img, weight_1, None, 1, 0, 1, 1)
    t2 = act(t1)
    t3 = avgpool(t2)

    t4 = conv2d_nchw(t3, weight_2, None, 1, 0, 1, 1)
    t5 = act(t4)
    t6 = avgpool(t5)

    t7 = conv2d_nchw(t6, weight_3, None, 1, 0, 1, 1)
    t8 = act(t7)
    # t9 = avgpool(t8)

    t10 = flatten_gemm(t8, weight_4)

    t11 = gemm(t10, weight_5)

    t12 = softmax(t11)

    # t13 = sum_all(t12)
    t13 = mse_loss(t12, label)

    d1, d2, d3, d4, d5 = tvm.te.mygradient(
        t13, [weight_1, weight_2, weight_3, weight_4, weight_5])

    s = tvm.te.create_schedule([t13.op, d1.op, d2.op, d3.op, d4.op, d5.op])

    func = tvm.build(s, [
        img, label, weight_1, weight_2, weight_3, weight_4, weight_5, t13, d1,
        d2, d3, d4, d5
    ],
                     target="llvm")

    free_vars = [weight_1, weight_2, weight_3, weight_4, weight_5]
    gradients = [d1, d2, d3, d4, d5]
    params = []
    for var in free_vars:
        shape = to_tuple(var.shape)
        var_np = np.random.uniform(-100, 100, shape).astype(dtype)
        params.append(var_np)
    img_np = np.random.uniform(-10, 10, to_tuple(img.shape)).astype(dtype)
    label_np = np.random.uniform(-10, 10, to_tuple(label.shape)).astype(dtype)
    ret_np = np.zeros(to_tuple(t13.shape)).astype(dtype)
    inits = []
    for var in gradients:
        shape = to_tuple(var.shape)
        var_np = np.zeros(shape).astype(dtype)
        inits.append(var_np)

    ctx = tvm.device("llvm")

    img_tvm = tvm.nd.array(img_np, ctx)
    label_tvm = tvm.nd.array(label_np, ctx)
    ret_tvm = tvm.nd.array(ret_np, ctx)
    free_vars_tvm = [tvm.nd.array(x, ctx) for x in params]
    gradients_tvm = [tvm.nd.array(x, ctx) for x in inits]

    func(img_tvm, label_tvm, *free_vars_tvm, ret_tvm, *gradients_tvm)

    ret_torch, grad_torch = pytorch_result(img_np, label_np, params)

    print(ret_tvm)
    print(ret_torch)

    tvm.testing.assert_allclose(ret_tvm.asnumpy(),
                                ret_torch.detach().numpy(),
                                atol=1e-3,
                                rtol=1e-5)
    for i in range(len(gradients_tvm)):
        print("grad_torch", i, grad_torch[i].detach().T.numpy())
        if i > 2:
            tvm.testing.assert_allclose(gradients_tvm[i].asnumpy(),
                                        grad_torch[i].detach().T.numpy(),
                                        atol=1e-3,
                                        rtol=1e-5)
        else:
            tvm.testing.assert_allclose(gradients_tvm[i].asnumpy(),
                                        grad_torch[i].detach().numpy(),
                                        atol=1e-3,
                                        rtol=1e-5)

    print("Compare to Pytorch success!")