Example #1
 def check_cuda(dtype, n, lanes):
     if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
         print("skip because cuda is not enabled..")
         return
     if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
         print("skip because gpu does not support int8")
         return
     A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
     B = tvm.placeholder((n,), name='B', dtype="%sx%d" % (dtype, lanes))
     C = tvm.placeholder((n,), name='C', dtype="int32")
     D = tvm.compute((n,),
                     lambda i: tvm.call_pure_extern("int32", "__dp4a", A[i], B[i], C[i]), name='D')
     s = tvm.create_schedule(D.op)
     xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
     s[D].bind(xo, tvm.thread_axis("blockIdx.x"))
     s[D].bind(xi, tvm.thread_axis("threadIdx.x"))
     fun = tvm.build(s, [A, B, C, D], "cuda")
     np_a = np.random.randint(low=-128, high=127, size=(n,lanes))
     np_b = np.random.randint(low=-128, high=127, size=(n,lanes))
     np_c = np.random.randint(low=0, high=127, size=(n,))
     np_d = [sum(x * y) + z for x, y, z in zip(np_a, np_b, np_c)]
     ctx = tvm.gpu(0)
     a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a)
     b = tvm.nd.empty((n,), B.dtype, ctx).copyfrom(np_b)
     c = tvm.nd.empty((n,), C.dtype, ctx).copyfrom(np_c)
     d = tvm.nd.empty((n,), D.dtype, ctx)
     fun(a, b, c, d)
     tvm.testing.assert_allclose(d.asnumpy(), np_d)
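A minimal way to exercise the helper above (hypothetical values; `num_thread` and `have_int8` are assumed to be defined elsewhere in the test module, e.g. via tvm.contrib.nvcc):

num_thread = 64            # assumed module-level split factor used by the schedule above
check_cuda("int8", 128, 4) # __dp4a consumes int8x4 vectors, so lanes must be 4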
Example #2
File: lstm.py  Project: gwli/tvm
 def check_device(target):
     num_step = n_num_step
     flstm = tvm.build(s, [Xi2h, Wh2h, scan_h, scan_c],
                       target)
     ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0)
     # launch the kernel.
     scan_h_np = np.zeros(
         (num_step, batch_size, num_hidden)).astype("float32")
     scan_c_np = np.zeros(
         (num_step, batch_size, num_hidden)).astype("float32")
     Xi2h_np = np.random.normal(
         size=(num_step, batch_size, 4, num_hidden)).astype("float32")
     Wh2h_np = np.random.normal(
         size=(4, num_hidden, num_hidden)).astype("float32")
     scan_h_a = tvm.nd.array(scan_h_np, ctx)
     scan_c_a = tvm.nd.array(scan_c_np, ctx)
     Xi2h_a = tvm.nd.array(Xi2h_np, ctx)
     Wh2h_a = tvm.nd.array(Wh2h_np, ctx)
     flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
     ctx.sync()
     # measure time cost of second step.
     tstart = time.time()
     flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
     ctx.sync()
     tgap = time.time() - tstart
     print("Time cost=%g" % tgap)
Example #3
File: run_onnx_tvm.py  Project: shinh/test
def run(args):
    onnx_model = onnx.load_model(os.path.join(args.test_dir, 'model.onnx'))
    symbol, params = nnvm.frontend.from_onnx(onnx_model)
    input_names = symbol.list_input_names()
    output_names = symbol.list_output_names()

    test_data_dir = os.path.join(args.test_dir, 'test_data_set_0')
    inputs, outputs = load_test_data(test_data_dir, input_names, output_names)
    inputs = dict(inputs)

    # assert len(input_names) == len(inputs) + len(params)
    # assert len(output_names) == len(outputs)

    graph, lib, params = compile(
        symbol, args.target, input_names, inputs, params,
        args.opt_level, args.autotvm_log)

    if args.dump_nnvm:
        print(graph.ir())
        print(graph.json())

    ctx = tvm.gpu()

    # Prepare inputs.
    tvm_inputs = {}
    for name, value in inputs.items():
        tvm_inputs[name] = tvm.nd.array(value, ctx=ctx)
    for name, value in params.items():
        tvm_inputs[name] = tvm.nd.array(value, ctx=ctx)

    graph_module = None
    if args.debug:
        try:
            graph_module = debug_runtime.create(graph, lib, ctx)
        except Exception:
            print('debug_runtime is disabled. '
                  'Set USE_GRAPH_RUNTIME_DEBUG=ON and rebuild TVM')
    if graph_module is None:
        graph_module = graph_runtime.create(graph, lib, ctx)

    graph_module.set_input(**tvm_inputs)

    graph_module.run()

    for i, (name, expected) in enumerate(outputs):
        tvm_output = tvm.nd.empty(expected.shape, expected.dtype, ctx=ctx)
        actual = graph_module.get_output(i, tvm_output).asnumpy()
        np.testing.assert_allclose(expected, actual,
                                   rtol=1e-3, atol=1e-4, err_msg=name)
        print('%s: OK' % name)
    print('ALL OK')

    if args.iterations > 1:
        num_iterations = args.iterations - 1
        start = time.time()
        for t in range(num_iterations):
            graph_module.run()
            cupy.cuda.device.Device().synchronize()
        elapsed = time.time() - start
        print('Elapsed: %.3f msec' % (elapsed * 1000 / num_iterations))
Example #4
 def check_cuda(n, value):
     if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
         print("skip because cuda is not enabled..")
         return
     lanes = 4
     dtype = 'int8'
     ctx = tvm.gpu(0)
     A = tvm.compute((n, lanes), lambda i,j: tvm.const(value, dtype=dtype))
     s = tvm.create_schedule(A.op)
     y, x = s[A].op.axis
     s[A].vectorize(x)
     s[A].bind(y, tvm.thread_axis("blockIdx.x"))
     fun = tvm.build(s, [A], "cuda", name="make_int8x4")
     np_a = np.full((n, lanes), value, dtype=dtype)
     a = tvm.nd.empty(np_a.shape, dtype, ctx)
     fun(a)
     np.testing.assert_equal(a.asnumpy(), np_a)
Example #5
 def check_cuda(dtype, n, lanes):
     if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
         print("skip because cuda is not enabled..")
         return
     ctx = tvm.gpu(0)
     A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
     B = tvm.compute((n,), lambda i: A[i], name='B')
     s = tvm.create_schedule(B.op)
     bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
     s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
     s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
     fun = tvm.build(s, [A, B], "cuda", name="vector_load")
     np_a = np.random.randint(low=-128, high=127, size=(n,lanes))
     a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a)
     b = tvm.nd.empty((n,), B.dtype, ctx)
     fun(a,b)
     tvm.testing.assert_allclose(a.asnumpy(), b.asnumpy())
Example #6
File: config.py  Project: masa-ito-fj/nnvm
def ctx_list():
    """Get context list for testcases"""
    device_list = os.environ.get("NNVM_TEST_TARGETS", "")
    device_list = (device_list.split(",") if device_list
                   else ["llvm", "cuda"])
    device_list = set(device_list)
    res = [("llvm", tvm.cpu(0)), ("cuda", tvm.gpu(0))]
    return [x for x in res if x[1].exist and x[0] in device_list]
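A short, hypothetical usage sketch showing how this helper is typically consumed in a test body:

for target, ctx in ctx_list():
    print("Running on %s" % target)
    # build and run the workload for this target/context pair here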
Example #7
def test_broadcast_binary_op(lhs_shape, rhs_shape, typ="add"):
    global TASK
    TASK = "bcast_binary_" + typ + "_lhs" +\
           "_".join([str(ele) for ele in lhs_shape]) +\
           "rhs" + "_".join([str(ele) for ele in rhs_shape])
    A = tvm.placeholder(shape=lhs_shape, name="A")
    B = tvm.placeholder(shape=rhs_shape, name="B")
    if typ == "add":
        C = topi.broadcast_add(A, B)
    elif typ == "sub":
        C = topi.broadcast_sub(A, B)
    elif typ == "div":
        C = topi.broadcast_div(A, B)
    elif typ == "mul":
        C = topi.broadcast_mul(A, B)
    elif typ == "maximum":
        C = topi.broadcast_maximum(A, B)
    elif typ == "minimum":
        C = topi.broadcast_minimum(A, B)
    else:
        raise NotImplementedError
    s = topi.cuda.schedule_broadcast(C)
    fcuda = tvm.build(s, [A, B, C], "cuda", name="broadcast_binary" + "_" + typ)

    lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype)
    rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype)
    if typ == "add":
        out_npy = lhs_npy + rhs_npy
    elif typ == "sub":
        out_npy = lhs_npy - rhs_npy
    elif typ == "div":
        rhs_npy = np.abs(rhs_npy) + 0.001
        out_npy = lhs_npy / rhs_npy
    elif typ == "mul":
        out_npy = lhs_npy * rhs_npy
    elif typ == "maximum":
        out_npy = np.maximum(lhs_npy, rhs_npy)
    elif typ == "minimum":
        out_npy = np.minimum(lhs_npy, rhs_npy)
    lhs_nd = tvm.nd.array(lhs_npy, tvm.gpu())
    rhs_nd = tvm.nd.array(rhs_npy, tvm.gpu())
    out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), tvm.gpu())
    for _ in range(2):
        fcuda(lhs_nd, rhs_nd, out_nd)
    np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
Example #8
def test_ctx():
    def test_ctx_func(ctx):
        assert tvm.gpu(7) == ctx
        return tvm.cpu(0)
    x = test_ctx_func(tvm.gpu(7))
    assert x == tvm.cpu(0)
    x = tvm.opencl(10)
    x = tvm._api_internal._context_test(x, x.device_type, x.device_id)
    assert x == tvm.opencl(10)
Example #9
def test_broadcast_to(in_shape, out_shape):
    global TASK
    TASK = "bcast_to_i" + "_".join([str(ele) for ele in in_shape])\
           + "o" + "_".join([str(ele) for ele in out_shape])
    # Build the logic and compile the function
    A = tvm.placeholder(shape=in_shape, name="A")
    B = topi.broadcast_to(A, out_shape)
    s = topi.cuda.schedule_broadcast(B)
    fcuda = tvm.build(s, [A, B], "cuda", name="broadcast_to")

    data_npy = np.random.uniform(size=in_shape).astype(A.dtype)
    out_npy = np.broadcast_to(data_npy, out_shape)

    data_nd = tvm.nd.array(data_npy, tvm.gpu())
    out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), tvm.gpu())
    for _ in range(2):
        fcuda(data_nd, out_nd)
    np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
Example #10
File: test_topi_clip.py  Project: gwli/tvm
 def check_device(device):
     if not tvm.module.enabled(device):
         print("Skip because %s is not enabled" % device)
         return
     ctx = tvm.cpu(0) if device == "llvm" else tvm.gpu(0)
     a = tvm.nd.array(a_np, ctx)
     b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
     f = tvm.build(s, [A, B], device, name="clip")
     f(a, b)
     np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
Example #11
File: test_cudnn.py  Project: bddppq/tvm
 def verify():
     ctx = tvm.gpu(0)
     f = tvm.build(s, [X, W, Y], "cuda", target_host="llvm", name="conv2d")
     x = tvm.nd.array(np.random.uniform(-1, 1, xshape).astype(np.float32),
                      ctx)
     w = tvm.nd.array(np.random.uniform(-1, 1, wshape).astype(np.float32),
                      ctx)
     y = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32),
                      ctx)
     f(x, w, y)
Example #12
def enabled_ctx_list():
    ctx_list = [('cpu', tvm.cpu(0)),
                ('gpu', tvm.gpu(0)),
                ('cl', tvm.opencl(0)),
                ('metal', tvm.metal(0)),
                ('rocm', tvm.rocm(0)),
                ('vulkan', tvm.vulkan(0)),
                ('vpi', tvm.vpi(0))]
    for k, v  in ctx_list:
        assert tvm.context(k, 0) == v
    ctx_list = [x[1] for x in ctx_list if x[1].exist]
    return ctx_list
Example #13
File: test_topi_clip.py  Project: gwli/tvm
 def check_device(device):
     if not tvm.module.enabled(device):
         print("Skip because %s is not enabled" % device)
         return
     target = topi.cpp.TEST_create_target(device)
     s = topi.cpp.generic.default_schedule(target, [B], False)
     ctx = tvm.cpu(0) if device == "llvm" else tvm.gpu(0)
     a = tvm.nd.array(a_np, ctx)
     b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
     f = tvm.build(s, [A, B], device, name="clip")
     f(a, b)
     np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
Example #14
 def check_cuda(dtype, n, lanes):
     if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
         print("skip because cuda is not enabled..")
         return
     if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
         print("skip because gpu does not support fp16")
         return
     if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
         print("skip because gpu does not support int8")
         return
     A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
     B = tvm.compute((n,), lambda i: A[i]+tvm.const(1, A.dtype), name='B')
     s = tvm.create_schedule(B.op)
     xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
     s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
     s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
     fun = tvm.build(s, [A, B], "cuda")
     ctx = tvm.gpu(0)
     a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(
         np.random.uniform(size=(n, lanes)))
     c = tvm.nd.empty((n,), B.dtype, ctx)
     fun(a, c)
     tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
Example #15
def test_reduce_map(in_shape, axis, keepdims, type="sum", test_id=0):
    global TASK
    # Build the logic and compile the function
    A = tvm.placeholder(shape=in_shape, name="A")
    if type == "sum":
        TASK = "sum_map_id%d" %test_id
        B = topi.sum(A, axis=axis, keepdims=keepdims)
    elif type == "max":
        TASK = "max_map_id%d" %test_id
        B = topi.max(A, axis=axis, keepdims=keepdims)
    elif type == "min":
        TASK = "min_map_id%d" %test_id
        B = topi.min(A, axis=axis, keepdims=keepdims)
    else:
        raise NotImplementedError
    s = topi.cuda.schedule_reduce(B)
    with tvm.build_config(auto_unroll_max_step=16,
                          auto_unroll_min_depth=0):
        fcuda = tvm.build(s, [A, B], "cuda", name="sum")

    # Test
    in_npy = np.random.normal(size=in_shape).astype(np.float32)
    if type == "sum":
        out_npy = in_npy.sum(axis=axis, keepdims=keepdims)
    elif type == "max":
        out_npy = in_npy.max(axis=axis, keepdims=keepdims)
    elif type == "min":
        out_npy = in_npy.min(axis=axis, keepdims=keepdims)
    else:
        raise NotImplementedError

    data_tvm = tvm.nd.array(in_npy, ctx=tvm.gpu())
    out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=tvm.gpu())

    for _ in range(2):
        fcuda(data_tvm, out_tvm)
    tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy, rtol=4e-4, atol=4e-4)
Example #16
 def verify(target="cuda"):
     if not tvm.module.enabled(target):
         print("skip because %s is not enabled..." % target)
         return
     if not tvm.get_global_func("tvm.contrib.cublas.matmul", True):
         print("skip because extern function is not available")
         return
     ctx = tvm.gpu(0)
     f = tvm.build(s, [A, B, C], target)
     a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
     b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
     c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
     f(a, b, c)
     tvm.testing.assert_allclose(
         c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5)
Example #17
def test_allocate():
    @tvm.hybrid.script
    def blur2d(a):
        b = output_tensor((30, 30), 'float32')
        for i in range(30):
            ha = allocate((3, 30), 'float32')
            for j in range(3):
                for k in range(30):
                    ha[j, k] = a[i+j, k] + a[i+j, k+1] + a[i+j, k+2]
            for j in range(30):
                b[i, j] = (ha[0, j] + ha[1, j] + ha[2, j]) / 9.0
        return b

    a = tvm.placeholder((32, 32), 'float32', 'a')
    b = blur2d(a)
    sch = tvm.create_schedule(b.op)
    func, ins, outs = run_and_check(blur2d, [a])
    run_and_check(func, ins, outs=outs)

    if tvm.gpu().exist:
        @tvm.hybrid.script
        def share_vec_add(a, b):
            c = output_tensor((256, ), 'float32')
            shared = allocate((256, ), 'float32', 'shared')
            for i in bind("threadIdx.x", 256):
                shared[i] = a[i]
            local = allocate((256, ), 'float32', 'local')
            for i in bind("threadIdx.x", 256):
                local[i] = b[i]
            for i in bind("threadIdx.x", 256):
                c[i] = shared[i] + local[i]
            return c

        a = tvm.placeholder((256, ), dtype='float32', name='a')
        b = tvm.placeholder((256, ), dtype='float32', name='b')
        c = share_vec_add(a, b)
        func, ins, outs = run_and_check(share_vec_add, [a, b], target='cuda')
        run_and_check(func, ins, outs=outs, target='cuda')
    else:
        print('[Warning] No GPU found! Skip shared mem test!')
Example #18
File: matexp.py  Project: bddppq/tvm
    def check_device(target):
        with tvm.build_config(
                detect_global_barrier=detect_global_barrier,
                auto_unroll_max_step=128,
                unroll_explicit=False):
            f = tvm.build(s, [s_scan, Whh], target)
        ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0)
        # launch the kernel.
        res_np = np.zeros(
            (n_num_step, n_batch_size, n_num_hidden)).astype("float32")
        Whh_np = np.zeros((n_num_hidden, n_num_hidden)).astype("float32")
        Whh_np[:] = 2.0 / n_num_hidden
        Whh_np[:, n_num_hidden//2:] = 0

        res_a = tvm.nd.array(res_np, ctx)
        Whh_a = tvm.nd.array(Whh_np, ctx)
        # Skip first pass as it is compilation
        f(res_a, Whh_a)
        ctx.sync()
        # measure time cost of second step.
        tstart = time.time()
        f(res_a, Whh_a)
        ctx.sync()
        tgap = time.time() - tstart
        print("Time cost=%g" % tgap)
        # correctness
        if not SKIP_CHECK:
            res_gpu = res_a.asnumpy()
            res_cmp = np.ones_like(res_np).astype("float64")
            Whh_np = Whh_np.astype("float64")
            for t in range(1, n_num_step):
                res_cmp[t][:] = np.dot(res_cmp[t - 1], Whh_np)
            for i  in range(n_num_step):
                for j in range(n_num_hidden):
                    if abs(res_cmp[i,0,j] - res_gpu[i,0,j]) > 1e-5:
                        print("%d, %d: %g vs %g" % (i,j, res_cmp[i,0,j], res_gpu[i,0,j]))
            tvm.testing.assert_allclose(res_gpu, res_cmp, rtol=1e-3)
Example #19
File: lstm.py  Project: bddppq/tvm
 def check_device(target):
     num_step = n_num_step
     flstm = tvm.build(s, [Xi2h, Wh2h, scan_h, scan_c],
                       target)
     ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0)
     # launch the kernel.
     scan_h_np = np.zeros(
         (num_step, batch_size, num_hidden)).astype("float32")
     scan_c_np = np.zeros(
         (num_step, batch_size, num_hidden)).astype("float32")
     Xi2h_np = np.random.normal(
         size=(num_step, batch_size, 4, num_hidden)).astype("float32")
     Wh2h_np = np.random.normal(
         size=(4, num_hidden, num_hidden)).astype("float32")
     scan_h_a = tvm.nd.array(scan_h_np, ctx)
     scan_c_a = tvm.nd.array(scan_c_np, ctx)
     Xi2h_a = tvm.nd.array(Xi2h_np, ctx)
     Wh2h_a = tvm.nd.array(Wh2h_np, ctx)
     flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
     ctx.sync()
     # measure time cost of second step.
     evaluator = flstm.time_evaluator(flstm.entry_name, ctx, 1, repeat=1000)
     eval_result = evaluator(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
     print("Time cost=%g" % eval_result.mean)
Example #20
    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
        # Build the kernel
        f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device)
        f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device)
        f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device)
        # Prepare data
        input_tvm = tvm.nd.array(input_np, ctx)
        filter_tvm = tvm.nd.array(filter_np, ctx)
        scale_tvm = tvm.nd.array(scale_np, ctx)
        shift_tvm = tvm.nd.array(shift_np, ctx)

        depthwise_conv2d_tvm = tvm.nd.array(
            np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape),
                     dtype=DepthwiseConv2d.dtype), ctx)
        scale_shift_tvm = tvm.nd.array(
            np.zeros(shape=get_const_tuple(ScaleShift.shape),
                     dtype=ScaleShift.dtype), ctx)
        relu_tvm = tvm.nd.array(
            np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx)
        # Measure time cost of kernel 1 (depthwise_conv2d)
        timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1000)
        tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean
        # Measure time cost of kernel 2 (depthwise_conv2d + scale_shift)
        timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1000)
        tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm,
                          scale_shift_tvm).mean
        # Measure time cost of kernel 3 (depthwise_conv2d + scale_shift + relu)
        timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1000)
        tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm,
                          relu_tvm).mean
        print("Input shape = " + str(get_const_tuple(Input.shape)))
        print("Filter shape = " + str(get_const_tuple(Filter.shape)))
        print("Stride = (%d, %d)" % (stride_h, stride_w))
        print("padding = %s\n" % padding)
        print("Output shape = " + str(get_const_tuple(DepthwiseConv2d.shape)))
        print("average time cost of 1000 runs (depthwise_conv2d) = %g us" %
              (tcost_1 * 1e6))
        print(
            "average time cost of 1000 runs (depthwise_conv2d + scale_shift) = %g us"
            % (tcost_2 * 1e6))
        print(
            "average time cost of 1000 runs (depthwise_conv2d + scale_shift + relu) = %g us"
            % (tcost_3 * 1e6))
        # correctness
        depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw(
            input_np, filter_np, stride=[stride_h, stride_w], padding=padding)
        scale_shift_scipy = np.zeros(shape=get_const_tuple(ScaleShift.shape))
        for c in range(in_channel * channel_multiplier):
            scale_shift_scipy[:,
                              c, :, :] = depthwise_conv2d_scipy[:, c, :, :] * scale_np[
                                  c] + shift_np[c]
        relu_scipy = np.maximum(scale_shift_scipy, 0)
        np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(),
                                   depthwise_conv2d_scipy,
                                   rtol=1e-5)
        np.testing.assert_allclose(scale_shift_tvm.asnumpy(),
                                   scale_shift_scipy,
                                   rtol=1e-5)
        np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
        print("success")
Example #21
 def test_ctx_func(ctx):
     assert tvm.gpu(7) == ctx
     return tvm.cpu(0)
Example #22
File: nnvm.py  Project: mbrukman/myia
    def convert(self, lst, *, target='cpu', dev_id=0):
        """Converts the list of nodes to a runnable form.

        All the nodes in the list must represent linear flow (no calls,
        branches, ...)

        Returns:
            (fn, inputs, outputs):

            - fn: A callable function
            - inputs: the list of inputs nodes whose values should be
                      provided to the function
            - outputs: the list of output nodes corresponding to the
                       outputs of the function

        Notes:
            This implementation converts the nodes to NNVM and compiles it.

        """
        self.c = count()
        self.eqv = {}
        self.inputs = []
        self.input_names = []
        self.constants = {}
        self.constant_vars = {}
        self.shapes = {}
        self.types = {}

        for n in lst:
            assert n.is_apply()
            assert n.inputs[0].is_constant(Primitive)
            fn = n.inputs[0].value
            conv = self.mapping.get(fn, None)
            if conv is not None:
                self.eqv[n] = conv(self, *n.inputs[1:])
            else:
                raise NotImplementedError(fn)

        outputs = get_outputs(lst, lst[0].graph.manager.uses,
                              set(self.eqv.keys()))

        inmap = dict((self.eqv[i], i) for i in self.inputs)

        # Check for empty functions
        if all(self.eqv[o] in inmap for o in outputs):
            return None, [inmap[self.eqv[o]] for o in outputs], outputs

        if target == 'cpu':
            target = 'llvm'

        g = nnvm.graph.create(sym.Group(list(self.eqv[o] for o in outputs)))
        dg, lib, params = nnvm.compiler.build(g,
                                              target=target,
                                              shape=self.shapes,
                                              dtype=self.types,
                                              params=self.constants)

        shape = dg.json_attr('shape')
        types = dg.json_attr('dtype')
        index = dg.index

        def spec(entry_id):
            return (shape[entry_id],
                    graph_attr.TCODE_TO_DTYPE[types[entry_id]])

        output_specs = [spec(index.entry_id(x)) for x in index.output_entries]
        assert len(output_specs) == len(outputs)

        if target == 'llvm':
            context = tvm.cpu(dev_id)
        elif target == 'cuda':  # pragma: no cover
            context = tvm.gpu(dev_id)
        else:  # pragma: no cover
            raise Exception(f"Unsupported target: {target}")

        module = graph_runtime.create(dg, lib, context)

        for n, p in params.items():
            module.set_input(n, p)

        input_types = [self.types[i] for i in self.input_names]
        return (NNVMRunner(module, self.input_names, input_types, output_specs,
                           context), self.inputs, outputs)
Example #23
# nnvm.compiler.build() lowers the optimized graph into the intrinsic IR of the
# specified target backend, which is CUDA in this example. Then the machine code
# is generated as the module library.

opt_level = 3
target = tvm.target.cuda()
with nnvm.compiler.build_config(opt_level=opt_level):
    graph, lib, params = nnvm.compiler.build(
        net, target, shape={"data": data_shape}, params=params)

#####################################################################
# Run the generated library
# --------------------------
# Now we can create the graph runtime and run the module on an Nvidia GPU.

# create random input
ctx = tvm.gpu()
data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
# create module
module = graph_runtime.create(graph, lib, ctx)
# set input and parameters
module.set_input("data", data)
module.set_input(**params)
# run
module.run()
# get output
out = module.get_output(0, tvm.nd.empty(out_shape))
# convert to numpy
out.asnumpy()

# Print first 10 elements of output
print(out.asnumpy().flatten()[0:10])
Example #24
######################################################################
# Compile the Graph
# -----------------
# Now we would like to port the Gluon model to a portable computational graph.
# It takes only a few lines.
# We support both the MXNet static graph (symbol) and HybridBlock in mxnet.gluon.
input_shape = (1, 3, 224, 224)
dtype = 'float32'
net, params = relay.frontend.from_mxnet(block,
                                        shape={'data': input_shape},
                                        dtype=dtype)
# we want a probability so add a softmax operator
net = relay.Function(net.params, relay.nn.softmax(net.body), None,
                     net.type_params, net.attrs)

######################################################################
# now compile the graph
target = 'cuda'
shape_dict = {'data': x.shape}
with relay.build_config(opt_level=3):
    intrp = relay.build_module.create_executor('graph', net, tvm.gpu(0),
                                               target)

######################################################################
# Execute the portable graph on TVM
# ---------------------------------
# Now, we would like to reproduce the same forward computation using TVM.
tvm_output = intrp.evaluate(net)(tvm.nd.array(x.astype(dtype)), **params)
top1 = np.argmax(tvm_output.asnumpy()[0])
print('TVM prediction top-1:', top1, synset[top1])
Example #25
def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
    target = tvm.target.Target.current(allow_none=False)
    dispatch_ctx = autotvm.task.DispatchContext.current

    _, outs = relay.backend.compile_engine.select_implementation(
        relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target)
    workload = autotvm.task.get_workload(outs)
    if workload is None:
        # The best implementation is not an AutoTVM template,
        # we then assume it's not necessary to alter this op.
        return None
    cfg = dispatch_ctx.query(target, workload)
    if cfg.is_fallback:  # if is fallback, clear query cache and return None
        autotvm.task.clear_fallback_cache(target, workload)
        return None

    topi_tmpl = workload[0]
    new_attrs = {k: attrs[k] for k in attrs.keys()}

    strides = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    data_layout = attrs["data_layout"]
    kernel_layout = attrs["kernel_layout"]
    data, kernel = tinfos
    out_dtype = out_type.dtype

    if topi_tmpl == "conv2d_NCHWc_int8.cuda":
        assert data_layout == "NCHW" and kernel_layout == "OIHW"
        N, CI, H, W = get_const_tuple(data.shape)
        CO, _, KH, KW = get_const_tuple(kernel.shape)

        new_layout = "NCHW4c"
        new_attrs["channels"] = CO
        new_attrs["data_layout"] = new_layout
        new_attrs["out_layout"] = new_layout
        new_attrs["kernel_layout"] = "OIHW4o4i"
        ic_block_factor = oc_block_factor = 4

        # Store the same config for the altered operator (workload)
        new_data = te.placeholder(
            (N, CI // ic_block_factor, H, W, ic_block_factor),
            dtype=data.dtype)
        new_kernel = te.placeholder(
            (
                CO // oc_block_factor,
                CI // ic_block_factor,
                KH,
                KW,
                oc_block_factor,
                ic_block_factor,
            ),
            dtype=kernel.dtype,
        )
        new_workload = autotvm.task.args_to_workload(
            [
                new_data, new_kernel, strides, padding, dilation, new_layout,
                out_dtype
            ],
            "conv2d_NCHWc_int8.cuda",
        )
        dispatch_ctx.update(target, new_workload, cfg)
        return relay.nn.conv2d(*inputs, **new_attrs)

    if topi_tmpl == "conv2d_nchw_winograd.cuda":
        if dilation != (1, 1):
            logger.warning(
                "Does not support weight pre-transform for dilated convolution."
            )
            return None

        assert data_layout == "NCHW" and kernel_layout == "OIHW"
        N, CI, H, W = get_const_tuple(data.shape)
        CO, _, KH, KW = get_const_tuple(kernel.shape)

        # pre-compute weight transformation in winograd
        tile_size = _infer_tile_size(tinfos[0], tinfos[1])

        weight = relay.nn.contrib_conv2d_winograd_weight_transform(
            inputs[1], tile_size=tile_size)
        weight = relay.transpose(weight, axes=[0, 1, 3, 2])
        new_attrs["tile_size"] = tile_size
        new_attrs["channels"] = CO

        # Store the same config for the altered operator (workload)
        new_data = data
        new_weight = te.placeholder(
            (KH + tile_size - 1, KW + tile_size - 1, CI, CO),
            dtype=kernel.dtype)
        new_workload = autotvm.task.args_to_workload(
            [new_data, new_weight, strides, padding, dilation, out_dtype],
            "conv2d_nchw_winograd_without_weight_transform.cuda",
        )
        dispatch_ctx.update(target, new_workload, cfg)
        return relay.nn.contrib_conv2d_winograd_without_weight_transform(
            inputs[0], weight, **new_attrs)

    if topi_tmpl in ("conv2d_nhwc_winograd_direct.cuda",
                     "conv2d_nhwc_winograd_tensorcore.cuda"):
        if dilation != (1, 1):
            logger.warning(
                "Does not support weight pre-transform for dilated convolution."
            )
            return None

        assert data_layout == "NHWC" and kernel_layout == "HWIO"
        N, H, W, CI = get_const_tuple(data.shape)
        KH, KW, _, CO = get_const_tuple(kernel.shape)

        # Pre-compute weight transformation in winograd
        if H % 8 == 0:
            tile_size = 4
        else:
            tile_size = 2
        kernel_transform = relay.transpose(inputs[1], axes=[3, 2, 0, 1])
        weight = relay.nn.contrib_conv2d_winograd_weight_transform(
            kernel_transform, tile_size=tile_size)
        weight = relay.transpose(weight, axes=[0, 1, 3, 2])
        new_attrs["tile_size"] = tile_size
        new_attrs["channels"] = CO
        # Store the same config for the altered operator (workload)
        new_data = data
        new_weight = te.placeholder(
            (KH + tile_size - 1, KW + tile_size - 1, CI, CO),
            dtype=kernel.dtype)
        if topi_tmpl == "conv2d_nhwc_winograd_direct.cuda":
            new_workload = autotvm.task.args_to_workload(
                [new_data, new_weight, strides, padding, dilation, out_dtype],
                "conv2d_nhwc_winograd_direct_without_weight_transform.cuda",
            )
        elif topi_tmpl == "conv2d_nhwc_winograd_tensorcore.cuda":
            new_workload = autotvm.task.args_to_workload(
                [new_data, new_weight, strides, padding, dilation, out_dtype],
                "conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda",
            )
        dispatch_ctx.update(target, new_workload, cfg)
        return relay.nn.contrib_conv2d_winograd_without_weight_transform(
            inputs[0], weight, **new_attrs)

    if topi_tmpl == "group_conv2d_NCHWc_int8.cuda":
        assert data_layout == "NCHW" and kernel_layout == "OIHW"
        N, CI, H, W = get_const_tuple(data.shape)
        CO, _, KH, KW = get_const_tuple(kernel.shape)

        new_layout = "NCHW4c"
        new_attrs["channels"] = CO
        new_attrs["data_layout"] = new_layout
        new_attrs["out_layout"] = new_layout
        new_attrs["kernel_layout"] = "OIHW4o4i"
        ic_block_factor = oc_block_factor = 4

        # Store the same config for the altered operator (workload)
        new_data = te.placeholder(
            (N, CI // ic_block_factor, H, W, ic_block_factor),
            dtype=data.dtype)
        new_kernel = te.placeholder(
            (
                CO // oc_block_factor,
                CI // ic_block_factor // groups,
                KH,
                KW,
                oc_block_factor,
                ic_block_factor,
            ),
            dtype=kernel.dtype,
        )
        new_workload = autotvm.task.args_to_workload(
            [
                new_data, new_kernel, strides, padding, dilation, groups,
                out_dtype
            ],
            "group_conv2d_NCHWc_int8.cuda",
        )
        dispatch_ctx.update(target, new_workload, cfg)
        return relay.nn.conv2d(*inputs, **new_attrs)

    if topi_tmpl == "conv2d_HWNCnc_tensorcore.cuda":
        assert data_layout == "HWNC" and kernel_layout == "HWOI"
        assert float(tvm.gpu(0).compute_version) >= 7.5
        H, W, N, CI = get_const_tuple(data.shape)
        KH, KW, CO, _ = get_const_tuple(kernel.shape)

        if (kernel.dtype in ["int4", "uint4"] and (CI % 32 != 0 or CO % 8 != 0)
                or kernel.dtype in ["int8", "uint8"] and
            (CI % 16 != 0 or CO % 32 != 0)):
            return relay.nn.conv2d(*inputs, **new_attrs)

        new_attrs["channels"] = CO
        if kernel.dtype in ["int4", "uint4"]:
            new_attrs["kernel_layout"] = "HWOI8o32i"
            ic_block_factor = 32
            oc_block_factor = 8
        else:
            new_attrs["kernel_layout"] = "HWOI32o16i"
            ic_block_factor = 16
            oc_block_factor = 32

        new_kernel = te.placeholder(
            (
                KH,
                KW,
                CO // oc_block_factor,
                CI // ic_block_factor,
                oc_block_factor,
                ic_block_factor,
            ),
            dtype=kernel.dtype,
        )

        new_workload = autotvm.task.args_to_workload(
            [data, new_kernel, strides, padding, dilation, out_dtype],
            "conv2d_HWNCnc_tensorcore.cuda",
        )

        dispatch_ctx.update(target, new_workload, cfg)
        return relay.nn.conv2d(*inputs, **new_attrs)

    return None
Example #26
def tracer(module, info, is_before):
    pass
    #global timing
    #if bool(is_before):
    #    timing = time.time()
    #else:
    #    print('Executes: ', info.name, (time.time() - timing) * 1000)


passes = [(1, tensorizer.rewrite)]
with tvm.transform.PassContext(opt_level=4,
                               trace=tracer,
                               config={'tir.add_lower_pass': passes}):
    #with tvm.transform.PassContext(opt_level=4, trace=tracer):
    #graph, lib, params = tvm.relay.build(module, target='cuda -libs=cublas,cudnn')
    graph, lib, params = tvm.relay.build(module, target='nvptx')
    module = runtime.create(graph, lib, tvm.gpu())

    x_ = (np.random.randn(n, c, h, w) * 128).astype('float32')
    module.set_input('x', x_)

    timer = module.module.time_evaluator('run',
                                         ctx=tvm.gpu(),
                                         number=1,
                                         repeat=1)
    timed = timer()

    print((n * oc * (h - kh + 1) * (w - kw + 1)) * (kh * kw * ic) /
          timed.mean / 1e9)
    print('%d us' % int(timed.mean * 1e6))
Example #27
File: sddmm.py  Project: yzh119/FeatGraph
 def _register(self):
     self._target = 'cuda'
     self._ctx = tvm.gpu(0)
     self._compute_func = vanilla_sddmm
     self._schedule_func = schedule_vanilla_sddmm_cuda_tree_reduce
Example #28
import tvm.contrib.graph_runtime as graph_runtime

data_shape = (1, 3, 224, 224)

# load the module back.

loaded_lib = tvm.module.load('deploy_lib.tar')
#dev_lib = tvm.module.load("deploy_cuda.ptx")
#loaded_lib.import_module(dev_lib)

loaded_graph = open("deploy_graph.json").read()

loaded_params = bytearray(open("deploy_param.params", "rb").read())

cuda = True
ctx = tvm.gpu(0) if cuda else tvm.cpu(0)

print("=> [TVM on tune_run.py] creating TVM runtime module")
fcreate = tvm.get_global_func("tvm.graph_runtime.create")

gmodule = fcreate(loaded_graph, loaded_lib, ctx.device_type, ctx.device_id)

set_input, get_output, run = gmodule["set_input"], gmodule[
    "get_output"], gmodule["run"]

print("=> [TVM] feeding inputs and params into TVM module")
x = np.ones([1, 3, 224, 224])
set_input('0', tvm.nd.array(x.astype('float32')))
gmodule["load_params"](loaded_params)

print("=> [TVM] running TVM module, saving output")
Example #29
    print("Tuning...")
    mod.tune_tvm(log_file=log_file, n_trial=20)

print("Building...")
tvm_mod = mod.build_tvm(export_dir)
pytorch_mod = mod.build_pytorch_module(num_inputs=2, num_outputs=1)


## Or you can load from a prebuilt tvm module
# mod = PyTorchTVMModule()
# tvm_mod = mod.load_tvm(export_dir)
# pytorch_mod = mod.build_pytorch_module(num_inputs=2, num_outputs=1, input_infos=input_shapes)


print("Run TVM...")
tvm_x = tvm.nd.array(x.cpu().numpy().astype(dtype), device=tvm.gpu(0))
tvm_y = tvm.nd.array(y.cpu().numpy().astype(dtype), device=tvm.gpu(0))
for i in range(20):
    t = time.time()
    tvm_mod.run(x=tvm_x, y=tvm_y)
    print(1000 * (time.time() - t))
tvm_output = tvm_mod.get_output(0)
print(tvm_output.shape)


print("Run PyTorch...")
for i in range(20):
    t = time.time()
    outputs = pytorch_mod.forward([x, y])
    torch.cuda.synchronize()
    print(1000 * (time.time() - t))
Example #30
def run(name, N, H, W, CO, CI, KH, KW, stride, pad):
    strides, padding = (stride, stride), (pad, pad)
    task = autotvm.task.create(conv2d_no_batching,
                               args=(N, H, W, CO, CI, KH, KW, strides,
                                     padding),
                               target='cuda')
    print(task.config_space)
    logfile = "conv2d_" + name + ".log"

    # Use local gpu, measure 10 times for every config to reduce variance
    # The timeout of compiling a program is 10 seconds, the timeout for running is 4 seconds
    measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(),
                                            runner=autotvm.LocalRunner(
                                                repeat=3,
                                                min_repeat_ms=100,
                                                timeout=4))

    # Begin tuning, log records to file `conv2d.log`
    # During tuning we will also try many invalid configs, so you are expected to
    # see many error reports. As long as you can see non-zero GFLOPS, it is okay.
    tuner = autotvm.tuner.XGBTuner(task)
    # tuner.tune(n_trial=1000,
    #            measure_option=measure_option,
    #            callbacks=[autotvm.callback.log_to_file(logfile)])

    #########################################################################
    # Finally we can inspect the best config from log file, check correctness,
    # and measure running time.

    # inspect the best config
    dispatch_context = autotvm.apply_history_best(logfile)
    best_config = dispatch_context.query(task.target, task.workload)
    print("\nBest config:")
    print(best_config)

    # apply history best from log file
    with autotvm.apply_history_best(logfile):
        with tvm.target.create("cuda"):
            s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides,
                                             padding)
            func = tvm.build(s, arg_bufs)

    # check correctness
    a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    # c_np = conv2d_nchw_python(a_np, w_np, strides, padding)

    ctx = tvm.gpu()
    a_tvm = tvm.nd.array(a_np, ctx=ctx)
    w_tvm = tvm.nd.array(w_np, ctx=ctx)
    c_tvm = tvm.nd.empty((N, CO, (H + 2 * pad - KH) // stride + 1,
                          (W + 2 * pad - KW) // stride + 1),
                         ctx=ctx)
    # func(a_tvm, w_tvm, c_tvm)

    # tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

    # Evaluate running time. Here we choose a large repeat number (400) to reduce the noise
    # and the overhead of kernel launch. You can also use nvprof to validate the result.
    evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
    cost = evaluator(a_tvm, w_tvm, c_tvm).mean * 1e3
    print('Time cost of this operator: %f' % cost)
    with open("autotvm_conv_nchw.txt", "a") as f:
        f.write("name, {}\n".format(cost))
Example #31
def test_tensor_core_batch_conv():
    # The sizes of inputs and filters
    batch_size = 32
    height = 14
    width = 14
    in_channels = 32
    out_channels = 64
    kernel_h = 3
    kernel_w = 3
    pad_h = 1
    pad_w = 1
    stride_h = 1
    stride_w = 1
    block_size = 16

    block_row_warps = 2
    block_col_warps = 4
    warp_row_tiles = 4
    warp_col_tiles = 2
    warp_size = 32
    chunk = 2

    # Input feature map: (N, H, W, IC, n, ic)
    data_shape = (
        batch_size // block_size,
        height,
        width,
        in_channels // block_size,
        block_size,
        block_size,
    )
    # Kernel: (H, W, IC, OC, ic, oc)
    kernel_shape = (
        kernel_h,
        kernel_w,
        in_channels // block_size,
        out_channels // block_size,
        block_size,
        block_size,
    )

    # Output feature map: (N, H, W, OC, n, oc)
    output_shape = (
        batch_size // block_size,
        height,
        width,
        out_channels // block_size,
        block_size,
        block_size,
    )

    assert batch_size % block_size == 0
    assert in_channels % block_size == 0
    assert out_channels % block_size == 0

    kh = te.reduce_axis((0, kernel_h), name="kh")
    kw = te.reduce_axis((0, kernel_w), name="kw")
    ic = te.reduce_axis((0, in_channels // block_size), name="ic")
    ii = te.reduce_axis((0, block_size), name="ii")

    # Algorithm
    A = te.placeholder(data_shape, name="A", dtype="float16")
    W = te.placeholder(kernel_shape, name="W", dtype="float16")
    Apad = te.compute(
        (
            batch_size // block_size,
            height + 2 * pad_h,
            width + 2 * pad_w,
            in_channels // block_size,
            block_size,
            block_size,
        ),
        lambda n, h, w, i, nn, ii: tvm.tir.if_then_else(
            tvm.tir.all(h >= pad_h, h - pad_h < height, w >= pad_w, w - pad_w < width),
            A[n, h - pad_h, w - pad_w, i, nn, ii],
            tvm.tir.const(0.0, "float16"),
        ),
        name="Apad",
    )
    Conv = te.compute(
        output_shape,
        lambda n, h, w, o, nn, oo: te.sum(
            Apad[n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype("float32")
            * W[kh, kw, ic, o, ii, oo].astype("float32"),
            axis=[ic, kh, kw, ii],
        ),
        name="Conv",
    )

    s = te.create_schedule(Conv.op)
    s[Apad].compute_inline()

    AS = s.cache_read(Apad, "shared", [Conv])
    WS = s.cache_read(W, "shared", [Conv])
    AF = s.cache_read(AS, "wmma.matrix_a", [Conv])
    WF = s.cache_read(WS, "wmma.matrix_b", [Conv])
    ConvF = s.cache_write(Conv, "wmma.accumulator")

    block_x = te.thread_axis("blockIdx.x")
    block_y = te.thread_axis("blockIdx.y")
    block_z = te.thread_axis("blockIdx.z")
    thread_x = te.thread_axis("threadIdx.x")
    thread_y = te.thread_axis("threadIdx.y")
    thread_z = te.thread_axis("threadIdx.z")

    nc, hc, wc, oc, nnc, ooc = Conv.op.axis
    block_k = s[Conv].fuse(hc, wc)
    s[Conv].bind(block_k, block_z)
    nc, nci = s[Conv].split(nc, factor=warp_row_tiles)
    block_i, nc = s[Conv].split(nc, factor=block_row_warps)
    oc, oci = s[Conv].split(oc, factor=warp_col_tiles)
    block_j, oc = s[Conv].split(oc, factor=block_col_warps)
    s[Conv].reorder(block_k, block_i, block_j, nc, oc, nci, oci, nnc, ooc)
    s[Conv].bind(block_i, block_x)
    s[Conv].bind(block_j, block_y)
    s[Conv].bind(nc, thread_y)
    s[Conv].bind(oc, thread_z)

    s[ConvF].compute_at(s[Conv], oc)
    n, h, w, o, nnf, oof = ConvF.op.axis
    ko, ki = s[ConvF].split(ic, factor=chunk)
    s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii)

    s[AF].compute_at(s[ConvF], kw)
    s[WF].compute_at(s[ConvF], kw)

    s[WS].compute_at(s[ConvF], kh)
    s[AS].compute_at(s[ConvF], kh)

    n, h, w, i, nn, ii = AS.op.axis
    tx, xo = s[AS].split(n, nparts=block_row_warps)
    ty, yo = s[AS].split(xo, nparts=block_col_warps)
    t = s[AS].fuse(nn, ii)
    to, ti = s[AS].split(t, factor=warp_size)
    s[AS].bind(tx, thread_y)
    s[AS].bind(ty, thread_z)
    s[AS].bind(ti, thread_x)

    kh, kw, ic, o, ii, oo = WS.op.axis
    tx, xo = s[WS].split(o, nparts=block_row_warps)
    ty, yo = s[WS].split(xo, nparts=block_col_warps)
    t = s[WS].fuse(ii, oo)
    to, ti = s[WS].split(t, nparts=warp_size)
    s[WS].bind(tx, thread_y)
    s[WS].bind(ty, thread_z)
    s[WS].bind(to, thread_x)
    s[WS].vectorize(ti)

    s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix((16, 16, 16), "wmma.matrix_a"))
    s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix((16, 16, 16), "wmma.matrix_b"))
    s[Conv].tensorize(nnc, intrin_wmma_store_matrix((16, 16, 16)))
    s[ConvF].tensorize(nnf, intrin_wmma_gemm((16, 16, 16)))

    func = tvm.build(s, [A, W, Conv], "cuda")

    dev = tvm.gpu(0)
    a_np = np.random.uniform(size=data_shape).astype(A.dtype)
    w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)
    a = tvm.nd.array(a_np, dev)
    w = tvm.nd.array(w_np, dev)
    c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), dev)
    evaluator = func.time_evaluator(func.entry_name, dev, number=3)
    print("conv2d with tensor core: %f ms" % (evaluator(a, w, c).mean * 1e3))

    if VERIFY:
        func(a, w, c)
        a_np = a_np.transpose(0, 4, 1, 2, 3, 5).reshape(batch_size, height, width, in_channels)
        w_np = w_np.transpose(0, 1, 2, 4, 3, 5).reshape(
            kernel_h, kernel_w, in_channels, out_channels
        )
        c_np = (
            c.asnumpy()
            .transpose((0, 4, 1, 2, 3, 5))
            .reshape(batch_size, height, width, out_channels)
        )
        c_std = conv2d_nhwc_python(
            a_np.astype(Conv.dtype), w_np.astype(Conv.dtype), (stride_h, stride_w), (pad_h, pad_w)
        ).astype(Conv.dtype)
        np.testing.assert_allclose(c_np, c_std, rtol=1e-4, atol=1e-4)
Example #32
 def setUp(self):
     self.ctx = tvm.gpu()
Example #33
def conv2d_strategy_cuda(attrs, inputs, out_type, target):
    """conv2d cuda strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1:
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            if data.dtype in ("int8", "uint8") and kernel.dtype in ("int8",
                                                                    "uint8"):
                assert data.dtype == kernel.dtype
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_int8),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_int8),
                    name="conv2d_nchw_int8.cuda",
                )
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw),
                    name="conv2d_nchw.cuda",
                )
            _, _, kh, kw = get_const_tuple(kernel.shape)
            if ((2 < kh < 8 and 2 < kw < 8 and kh == kw)
                    and (stride_h == 1 and stride_w == 1)
                    and (dilation_h == 1 and dilation_w == 1)):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_nchw_winograd),
                    name="conv2d_nchw_winograd.cuda",
                    plevel=5,
                )
        elif layout == "HWCN":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_hwcn),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn),
                name="conv2d_hwcn.cuda",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc),
                name="conv2d_nhwc.cuda",
            )

            N, H, W, _ = get_const_tuple(data.shape)
            KH, KW, CI, CO = get_const_tuple(kernel.shape)
            # Winograd shape related judgment
            (
                judge_winograd_tensorcore,
                judge_winograd_autotvm,
                judge_winograd_auto_scheduler,
            ) = judge_winograd(
                N,
                H,
                W,
                KH,
                KW,
                CI,
                CO,
                padding,
                stride_h,
                stride_w,
                dilation_h,
                dilation_w,
                data.dtype,
                kernel.dtype,
                pre_flag=False,
            )
            if judge_winograd_autotvm:
                if (target.kind.name == "cuda"
                        and nvcc.have_tensorcore(tvm.gpu(0).compute_version)
                        and judge_winograd_tensorcore):
                    strategy.add_implementation(
                        wrap_compute_conv2d(
                            topi.cuda.conv2d_nhwc_winograd_tensorcore),
                        wrap_topi_schedule(
                            topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore
                        ),
                        name="conv2d_nhwc_winograd_tensorcore.cuda",
                        plevel=5,
                    )
                else:
                    strategy.add_implementation(
                        wrap_compute_conv2d(
                            topi.cuda.conv2d_nhwc_winograd_direct),
                        wrap_topi_schedule(
                            topi.cuda.schedule_conv2d_nhwc_winograd_direct),
                        name="conv2d_nhwc_winograd_direct.cuda",
                        plevel=5,
                    )
            if (target.kind.name == "cuda"
                    and nvcc.have_tensorcore(tvm.gpu(0).compute_version)
                    and ((N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or
                         (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or
                         (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0))):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nhwc_tensorcore),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_nhwc_tensorcore),
                    name="conv2d_nhwc_tensorcore.cuda",
                    plevel=20,
                )

            # register auto-scheduler implementations
            use_auto_scheduler = PassContext.current().config.get(
                "relay.backend.use_auto_scheduler", False)
            if use_auto_scheduler and judge_winograd_auto_scheduler:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc),
                    naive_schedule,  # this implementation should never be picked by autotvm
                    name="conv2d_nhwc.winograd",
                    plevel=15,
                )

        elif layout == "HWNC":
            assert kernel_layout in [
                "HWOI", "HWOI16o16i", "HWOI8o32i", "HWOI32o16i"
            ]
            _, _, N, in_channels = get_const_tuple(data.shape)
            pre_computed = len(kernel.shape) == 6
            if pre_computed:
                _, _, oc_chunk, _, oc_block_factor, _ = get_const_tuple(
                    kernel.shape)
                out_channels = oc_chunk * oc_block_factor
            else:
                _, _, out_channels, _ = get_const_tuple(kernel.shape)

            tensorcore_dtypes = ["int4", "uint4", "int8", "uint8"]
            if ((N % 16 == 0 and in_channels % 16 == 0
                 and out_channels % 16 == 0)
                    or (N % 8 == 0 and in_channels % 16 == 0
                        and out_channels % 32 == 0)
                    or (N % 32 == 0 and in_channels % 16 == 0
                        and out_channels % 8 == 0) and
                (data.dtype in tensorcore_dtypes
                 and kernel.dtype in tensorcore_dtypes)):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_hwnc_tensorcore),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_hwnc_tensorcore),
                    name="conv2d_hwnc_tensorcore_direct.cuda",
                    plevel=20,
                )
            else:
                raise RuntimeError("Unsupported shape for conv2d HWNC.\
                                    Need to satisfy tensor core schedule.")
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8),
                name="conv2d_NCHWc_int8.cuda",
            )
        else:
            raise RuntimeError(
                "Unsupported conv2d layout {} for CUDA".format(layout))
        # add cudnn implementation
        if target.kind.name == "cuda" and "cudnn" in target.libs:
            if layout in [
                    "NCHW", "NHWC"
            ] and padding[0] == padding[2] and padding[1] == padding[3]:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
                                        need_data_layout=True,
                                        has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25,
                )
    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout,
                             groups):
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw),
                name="depthwise_conv2d_nchw.cuda",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWOI"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc),
                name="depthwise_conv2d_nhwc.cuda",
            )
        else:
            raise RuntimeError(
                "Unsupported depthwise_conv2d layout {}".format(layout))
    else:  # group_conv2d
        # add cudnn implementation, if any
        cudnn_impl = False
        if target.kind.name == "cuda" and "cudnn" in target.libs:
            if layout in [
                    "NCHW", "NHWC"
            ] and padding[0] == padding[2] and padding[1] == padding[3]:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
                                        need_data_layout=True,
                                        has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25,
                )
                cudnn_impl = True

        if layout == "NCHW":
            # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8.
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_nchw,
                                    has_groups=True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw),
                name="group_conv2d_nchw.cuda",
            )
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8),
                name="group_conv2d_NCHWc_int8.cuda",
            )
        elif not cudnn_impl:
            raise RuntimeError(
                "Unsupported group_conv2d layout {}".format(layout))
    return strategy
Example #34
0
def convert(topology, backend, device, extra_config={}):
    """
    This function is used to convert a `onnxconverter_common.topology.Topology` object into a *backend* model.

    Args:
        topology: The `onnxconverter_common.topology.Topology` object that will be converted into a backend model
        backend: Which backend the model should be run on
        device: Which device the translated model will be run on
        extra_config: Extra configurations to be used by individual operator converters

    Returns:
        A model implemented in the selected backend
    """
    assert topology is not None, "Cannot convert a Topology object of type None."
    assert backend is not None, "Cannot convert a Topology object into backend None."
    assert device is not None, "Cannot convert a Topology object into device None."

    tvm_backend = None
    operator_map = {}

    if tvm_installed():
        import tvm
        from tvm import relay
        from tvm.contrib import graph_runtime

        tvm_backend = tvm.__name__

    for operator in topology.topological_operator_iterator():
        try:
            converter = get_converter(operator.type)

            if backend == onnx.__name__:
                # vers = LooseVersion(torch.__version__)
                # allowed_min = LooseVersion("1.6.0")
                # Pytorch <= 1.6.0 has a bug with exporting GEMM into ONNX.
                # For the moment only tree_trav is enabled for pytorch <= 1.6.0
                # if vers < allowed_min:
                extra_config[constants.TREE_IMPLEMENTATION] = "tree_trav"

            operator_map[operator.full_name] = converter(
                operator, device, extra_config)
        except ValueError:
            raise MissingConverter(
                "Unable to find converter for {} type {} with extra config: {}."
                .format(operator.type,
                        type(getattr(operator, "raw_model", None)),
                        extra_config))
        except Exception as e:
            raise e

    # Set the parameters for the model / container
    n_threads = None if constants.N_THREADS not in extra_config else extra_config[
        constants.N_THREADS]
    batch_size = None if constants.BATCH_SIZE not in extra_config else extra_config[
        constants.BATCH_SIZE]

    # We set the number of threads for torch here to avoid errors in case we JIT.
    # We set the intra-op thread count while forcing inter-op execution to be sequential.
    # We can revise this later, but in general we don't have graphs requiring inter-op parallelism.
    if n_threads is not None:
        if torch.get_num_interop_threads() != 1:
            torch.set_num_interop_threads(1)
        torch.set_num_threads(n_threads)

    operators = list(topology.topological_operator_iterator())
    torch_model = _PyTorchBackendModel(topology.raw_model.input_names,
                                       topology.raw_model.output_names,
                                       operator_map, operators,
                                       extra_config).eval()

    if backend == onnx.__name__:
        onnx_model_name = output_model_name = None
        target_opset = 11

        # Set optional configuration options for ONNX if any.
        if constants.ONNX_OUTPUT_MODEL_NAME in extra_config:
            onnx_model_name = extra_config[constants.ONNX_OUTPUT_MODEL_NAME]
            output_model_name = onnx_model_name + ".onnx"
        if constants.ONNX_TARGET_OPSET in extra_config:
            target_opset = extra_config[constants.ONNX_TARGET_OPSET]
        if output_model_name is None:
            output_model_name = str(uuid4().hex) + ".onnx"

        # Put the tracing test input into the right format.
        batch_trace_input, _ = _get_trace_input_from_test_input(
            extra_config[constants.TEST_INPUT], batch_size)

        # Generate the ONNX models
        torch.onnx.export(
            torch_model,
            batch_trace_input,
            output_model_name,
            input_names=topology.raw_model.input_names,
            output_names=topology.raw_model.output_names,
            keep_initializers_as_inputs=False,
            opset_version=target_opset,
            do_constant_folding=True,
        )
        hb_model = onnx.load(output_model_name)
        os.remove(output_model_name)

        # Set the ONNX model name if any.
        if onnx_model_name is not None:
            hb_model.graph.name = onnx_model_name

        # Fix the model to use arbitrary batch dimensions
        def fix_dim(dim):
            updated = False
            if dim.HasField("dim_value"):
                dim.Clear()
                updated = True
                dim.dim_param = "sym"

            return updated

        def fix_value_info(value):
            num_fixed = 0
            if value.type.HasField("tensor_type"):
                shape = value.type.tensor_type.shape
                if shape:
                    dim = shape.dim[0]
                    if fix_dim(dim):
                        num_fixed += 1

            return num_fixed

        def fix_graph(graph):
            num_fixed = 0
            for input in graph.input:
                num_fixed += fix_value_info(input)

            for output in graph.output:
                num_fixed += fix_value_info(output)

            for node in graph.node:
                for attr in node.attribute:
                    if attr.HasField("g"):
                        num_fixed += fix_graph(attr.g)

            return num_fixed

        fix_graph(hb_model.graph)
    elif backend == tvm_backend:
        # First we need to generate the torchscript model.
        batch_trace_input, remainder_trace_input = _get_trace_input_from_test_input(
            extra_config[constants.TEST_INPUT], batch_size)
        ts_model = _jit_model(torch_model, batch_trace_input, "cpu",
                              extra_config)
        if remainder_trace_input is not None:
            remainder_ts_model = _jit_model(torch_model, remainder_trace_input,
                                            "cpu", extra_config)

        # Generate the test input in the TVM format. In case we have a remainder beyond the batch, generate a remainder test input as well.
        test_input = [(
            topology.raw_model.input_names[i],
            batch_trace_input[i].shape
            if type(batch_trace_input) is tuple else batch_trace_input.shape,
        ) for i in range(len(topology.raw_model.input_names))]
        if remainder_trace_input is not None:
            remainder_test_input = [(
                topology.raw_model.input_names[i],
                remainder_trace_input[i].shape
                if type(remainder_trace_input) is tuple else
                remainder_trace_input.shape,
            ) for i in range(len(topology.raw_model.input_names))]

        # Pick the proper target.
        if device == "cuda":
            target = tvm.target.cuda()
            ctx = tvm.gpu()
        elif device == "cpu":
            target = "llvm"
            ctx = tvm.cpu()
        elif "llvm" in device:
            target = device
            ctx = tvm.cpu()
        else:
            raise RuntimeError("Device {} not recognized".format(device))

        # Get configuration parameters.
        config = {}
        if constants.TVM_MAX_FUSE_DEPTH in extra_config:
            config["relay.FuseOps.max_depth"] = extra_config[
                constants.TVM_MAX_FUSE_DEPTH]
        else:
            # 50 is a good depth for operator fusion. More than that will probably hurt performance.
            # https://github.com/microsoft/hummingbird/issues/232#issuecomment-697979508
            config["relay.FuseOps.max_depth"] = 50

        # Create the relay version of the model.
        model, params = relay.frontend.from_pytorch(ts_model, test_input)
        if remainder_trace_input is not None:
            remainder_model, remainder_params = relay.frontend.from_pytorch(
                remainder_ts_model, remainder_test_input)

        # Generate the model. We set opt_level=3 to enable all optimizations.
        with tvm.transform.PassContext(opt_level=3, config=config):
            graph, lib, params = relay.build(model,
                                             target=target,
                                             params=params)
        tvm_model = graph_runtime.create(graph, lib, ctx)
        tvm_model.set_input(**params)
        if remainder_trace_input is not None:
            with tvm.transform.PassContext(opt_level=3, config=config):
                graph, lib, params = relay.build(remainder_model,
                                                 target=target,
                                                 params=remainder_params)
            tvm_remainder_model = graph_runtime.create(graph, lib, ctx)
            tvm_remainder_model.set_input(**params)

        # In the container we will be using the context to properly configure the input tensors.
        extra_config[constants.TVM_CONTEXT] = ctx
        extra_config[
            constants.TVM_INPUT_NAMES] = topology.raw_model.input_names
        if remainder_trace_input is not None:
            extra_config[constants.TVM_REMAINDER_MODEL] = tvm_remainder_model

        hb_model = tvm_model
    else:
        # Set the device for the model.
        if device != "cpu":
            if backend in (torch.__name__, torch.jit.__name__):
                torch_model = torch_model.to(device)

        # If the backend is torchscript, JIT the model.
        if backend == torch.jit.__name__:
            trace_input, _ = _get_trace_input_from_test_input(
                extra_config[constants.TEST_INPUT], batch_size)
            if device != "cpu":
                trace_input = trace_input.to(device)  # .to() returns a new tensor; keep the result
            torch_model = torch.jit.trace(torch_model, trace_input).eval()
            torch.jit.optimized_execution(torch_model)

        hb_model = torch_model

    # Return if the container is not needed.
    if constants.CONTAINER in extra_config and not extra_config[
            constants.CONTAINER]:
        return hb_model

    # We scan the operators backwards until we find an operator with a defined type.
    # This is necessary because ONNX models can have arbitrary operators doing casting, reshaping etc.
    idx = len(operators) - 1
    while (idx >= 0 and not operator_map[operators[idx].full_name].regression
           and not operator_map[operators[idx].full_name].classification
           and not operator_map[operators[idx].full_name].anomaly_detection
           and not operator_map[operators[idx].full_name].transformer):
        idx -= 1

    assert idx >= 0, "Cannot detect container type. Please file an issue at https://github.com/microsoft/hummingbird."

    # If it is a transformer, we need to check whether there is another operator type before it.
    # E.g., normalization after classification.
    tmp_idx = idx
    if operator_map[operators[idx].full_name].transformer:
        while (idx >= 0
               and not operator_map[operators[idx].full_name].regression
               and not operator_map[operators[idx].full_name].classification
               and
               not operator_map[operators[idx].full_name].anomaly_detection):
            idx -= 1
        if idx < 0:
            idx = tmp_idx

    # Get the proper container type.
    if operator_map[operators[idx].full_name].regression:
        # We are doing a regression task.
        if backend == torch.jit.__name__:
            container = TorchScriptSklearnContainerRegression
        elif backend == onnx.__name__:
            container = ONNXSklearnContainerRegression
        elif backend == tvm_backend:
            container = TVMSklearnContainerRegression
        else:
            container = PyTorchSklearnContainerRegression
    elif operator_map[operators[idx].full_name].anomaly_detection:
        # We are doing anomaly detection.
        if backend == torch.jit.__name__:
            container = TorchScriptSklearnContainerAnomalyDetection
        elif backend == onnx.__name__:
            container = ONNXSklearnContainerAnomalyDetection
        elif backend == tvm_backend:
            container = TVMSklearnContainerAnomalyDetection
        else:
            container = PyTorchSklearnContainerAnomalyDetection
    elif operator_map[operators[idx].full_name].transformer:
        # We are just transforming the input data.
        if backend == torch.jit.__name__:
            container = TorchScriptSklearnContainerTransformer
        elif backend == onnx.__name__:
            container = ONNXSklearnContainerTransformer
        elif backend == tvm_backend:
            container = TVMSklearnContainerTransformer
        else:
            container = PyTorchSklearnContainerTransformer
    else:
        # We are doing a classification task.
        if backend == torch.jit.__name__:
            container = TorchScriptSklearnContainerClassification
        elif backend == onnx.__name__:
            container = ONNXSklearnContainerClassification
        elif backend == tvm_backend:
            container = TVMSklearnContainerClassification
        else:
            container = PyTorchSklearnContainerClassification

    n_threads = None if constants.N_THREADS not in extra_config else extra_config[
        constants.N_THREADS]
    batch_size = None if constants.BATCH_SIZE not in extra_config else extra_config[
        constants.BATCH_SIZE]
    hb_model = container(hb_model,
                         n_threads,
                         batch_size,
                         extra_config=extra_config)

    return hb_model
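
The docstring above describes the `convert` API; the following is a minimal, hypothetical usage sketch (not part of the original example). It assumes a `Topology` object `my_topology` has already been produced by Hummingbird's parsing stage, that test data `X_test` exists, and that the `constants` import path shown is correct.

# Hypothetical usage sketch -- `my_topology`, `X_test` and the import path are
# assumptions for illustration only.
from hummingbird.ml.operator_converters import constants  # assumed import path

extra_config = {
    constants.N_THREADS: 4,      # intra-op threads used by torch
    constants.BATCH_SIZE: 1024,  # fixed batch size used when tracing/compiling
}
container = convert(my_topology, backend="torch", device="cpu",
                    extra_config=extra_config)
predictions = container.predict(X_test)  # containers expose a sklearn-like API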
Example #35
0
    with tvm.target.create("cuda"):
        s, arg_bufs = conv2d(
            N, H, W, CO, CI, KH, KW, strides, padding, scaling_factor)
        print(tvm.lower(s, arg_bufs, simple_mode=True))
        func = tvm.build(s, arg_bufs)
        print(func.imported_modules[0].get_source())

# check correctness
a_np = np.random.randint(size=(N, CI//BI, H, W, BI), low=-128, high=127, dtype='int8')
w_np = np.random.randint(
    size=(CO//BO, CI//BI, KH, KW, BO, BI), low=-128, high=127, dtype='int8')
a_np_ = a_np.transpose((0, 1, 4, 2, 3)).ravel().reshape(N, CI, H, W)
w_np_ = w_np.transpose((0, 4, 1, 5, 2, 3)).ravel().reshape(CO, CI, KH, KW)
#c_np = conv2d_nchw_python(a_np_, w_np_, strides, padding).astype('int8')
#c_np = c_np.reshape(N, CO//BO, BO, *c_np.shape[2:]).transpose(0, 1, 3, 4, 2)
c_np = np.zeros((N, CO//BO, H, W, BO), dtype='int8')

ctx = tvm.gpu()
a_tvm = tvm.nd.empty(a_np.shape, dtype='int8', ctx=ctx).copyfrom(a_np)
w_tvm = tvm.nd.empty(w_np.shape, dtype='int8', ctx=ctx).copyfrom(w_np)
c_tvm = tvm.nd.empty(c_np.shape, dtype='int8', ctx=ctx)
func(a_tvm, w_tvm, c_tvm)

#np.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

evaluator = func.time_evaluator(func.entry_name, ctx, number=1000)
t = evaluator(a_tvm, w_tvm, c_tvm).mean
num_flops = N*c_np.shape[-2] * c_np.shape[-3] * CO*CI*KH*KW*2
GFLOPS = num_flops / (t * 1e3) / 1e6
print('Time cost of this operator: %f, %g GFLOPS' % (t, GFLOPS))
Example #36
0
def tensor_core_matmul(warp_tile_m=16, m=64, n=32, l=96):
    A = te.placeholder((n, l), name='A', dtype='float16')
    B = te.placeholder((l, m), name='B', dtype='float16')
    k = te.reduce_axis((0, l), name='k')
    C = te.compute((n, m), lambda i, j: te.sum(
        A[i, k].astype('float32') * B[k, j].astype('float32'), axis=k))
    s = te.create_schedule(C.op)
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    AA = s.cache_read(A, "shared", [C])
    AL = s.cache_read(AA, "local", [C])
    BB = s.cache_read(B, "shared", [C])
    BL = s.cache_read(BB, "local", [C])
    CL = s.cache_write(C, "local")

    bx = 4
    by = 32
    step_k = 8
    v = 4
    TX = 8
    TY = 1
    tile_x = bx * TX
    tile_y = by * TY
    WX = min(warp_tile_m, tile_x)
    tile_k = 16
    vthread = 1

    yo, ty = s[C].split(y, tile_y * vthread)
    vy, ty = s[C].split(ty, tile_y)
    ty, yi = s[C].split(ty, TY)

    xo, xi = s[C].split(x, tile_x)
    tz, xi = s[C].split(xi, WX)
    tx, xi = s[C].split(xi, TX)
    ko, ki = s[CL].split(k, step_k * tile_k)
    kl, ki = s[CL].split(ki, tile_k)

    s[C].reorder(yo, xo, tz, ty, tx, yi, xi)
    s[C].bind(yo, te.thread_axis("blockIdx.y"))
    s[C].bind(xo, te.thread_axis("blockIdx.x"))
    s[C].bind(ty, te.thread_axis("threadIdx.y"))
    s[C].bind(tz, te.thread_axis("threadIdx.z"))
    s[C].bind(tx, te.thread_axis("threadIdx.x"))
    s[C].bind(vy, te.thread_axis((0, vthread), "vthread", name="vy"))
    s[CL].compute_at(s[C], tx)
    yo, xo = CL.op.axis
    s[CL].reorder(ko, kl, ki, yo, xo)

    s[AA].compute_at(s[CL], ko)
    xo, xi = s[AA].split(s[AA].op.axis[1], factor=bx * v)
    tz, tx = s[AA].split(xi, factor=(WX // TX) * v)
    tx, vec = s[AA].split(tx, factor=v)
    fused = s[AA].fuse(s[AA].op.axis[0], xo)
    _, ty = s[AA].split(fused, factor=by)
    s[AA].bind(ty, te.thread_axis("threadIdx.y"))
    s[AA].bind(tz, te.thread_axis("threadIdx.z"))
    s[AA].bind(tx, te.thread_axis("threadIdx.x"))
    s[AA].vectorize(vec)

    s[BB].compute_at(s[CL], ko)
    xo, xi = s[BB].split(s[BB].op.axis[1], factor=bx * v)
    tz, tx = s[BB].split(xi, factor=(WX // TX) * v)
    tx, vec = s[BB].split(tx, factor=v)
    fused = s[BB].fuse(s[BB].op.axis[0], xo)
    _, ty = s[BB].split(fused, factor=by)
    s[BB].bind(ty, te.thread_axis("threadIdx.y"))
    s[BB].bind(tz, te.thread_axis("threadIdx.z"))
    s[BB].bind(tx, te.thread_axis("threadIdx.x"))
    s[BB].vectorize(vec)

    s[AL].compute_at(s[CL], kl)
    s[BL].compute_at(s[CL], kl)

    s[CL].pragma(ko, 'tensor_core')

    func = tvm.build(s, [A, B, C], 'cuda')

    ctx = tvm.gpu(0)
    a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
    b_np = np.random.uniform(size=(l, m)).astype(B.dtype)
    c_np = np.zeros((n, m), dtype=np.float32)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
    func(a, b, c)
    evaluator = func.time_evaluator(func.entry_name, ctx, number=3)
    print('gemm m=%d n=%d k=%d: %f ms' %
          (m, n, l, evaluator(a, b, c).mean * 1e3))

    c_np = np.dot(a_np, b_np)
    np.testing.assert_allclose(c_np, c.asnumpy(), rtol=1e-3)
Example #37
0
def test_ctx_func(ctx):
    assert tvm.gpu(7) == ctx
    return tvm.cpu(0)
Example #38
0
def conv2d_winograd_without_weight_transfrom_strategy_cuda(
        attrs, inputs, out_type, target):
    """conv2d_winograd_without_weight_transfrom cuda strategy"""
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    layout = attrs.data_layout
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    assert dilation == (1, 1), "Do not support dilation now"
    assert groups == 1, "Do not support arbitrary group number"
    strategy = _op.OpStrategy()
    if layout == "NCHW":
        strategy.add_implementation(
            wrap_compute_conv2d(
                topi.cuda.conv2d_nchw_winograd_without_weight_transform),
            wrap_topi_schedule(
                topi.cuda.
                schedule_conv2d_nchw_winograd_without_weight_transform),
            name="conv2d_nchw_winograd_without_weight_transform.cuda",
        )
    elif layout == "NHWC":
        N, H, W, _ = get_const_tuple(data.shape)
        alpha, _, CI, CO = get_const_tuple(kernel.shape)
        dilation_h, dilation_w = dilation
        judge_winograd_tensorcore, _, _ = judge_winograd(
            N,
            H,
            W,
            alpha,
            alpha,
            CI,
            CO,
            padding,
            stride_h,
            stride_w,
            dilation_h,
            dilation_w,
            data.dtype,
            kernel.dtype,
            pre_flag=True,
        )
        if (target.kind.name == "cuda"
                and nvcc.have_tensorcore(tvm.gpu(0).compute_version)
                and judge_winograd_tensorcore):
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.
                    conv2d_nhwc_winograd_tensorcore_without_weight_transform),
                wrap_topi_schedule(
                    topi.cuda.
                    schedule_conv2d_nhwc_winograd_tensorcore_without_weight_transform
                ),
                name=
                "conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda",
            )
        else:
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.
                    conv2d_nhwc_winograd_direct_without_weight_transform),
                wrap_topi_schedule(
                    topi.cuda.
                    schedule_conv2d_nhwc_winograd_direct_without_weight_transform
                ),
                name=
                "conv2d_nhwc_winograd_direct_without_weight_transform.cuda",
            )

        if PassContext.current().config.get("relay.backend.use_auto_scheduler",
                                            False):
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.nn.conv2d_winograd_nhwc_without_weight_transform),
                naive_schedule,  # this implementation should never be picked by autotvm
                name="conv2d_nhwc_winograd_without_weight_transform",
                plevel=15,
            )
    else:
        raise RuntimeError(
            "Unsupported conv2d_winograd_without_weight_transfrom layout {}".
            format(layout))
    return strategy
Example #39
0
    s[B].bind(s[B].op.reduce_axis[0], tx)
    s[B].bind(s[B].op.axis[0], bx)

    s[BF].compute_at(s[B], s[B].op.axis[0])

    _, noi = s[BF].split(s[BF].op.reduce_axis[0], factor=2)

    BF2 = s.rfactor(BF, noi, 0)

    s[BF].bind(s[BF].op.axis[0], tx)
    s[BF2].compute_at(s[BF], s[BF].op.axis[1])

    fcuda = tvm.build(s, [A, B], "cuda")


@unittest.skipIf(not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"),
                 "skip because cuda is not enabled..")
def test_cuda_const_float_to_half():
    # This import is required to use nvcc to perform code gen;
    # otherwise it is found that the code gen is done by nvrtc.
    from tvm import autotvm
    shape = (2, 3, 4)
    a = te.placeholder(shape, dtype='float16', name='a')
    b = tvm.tir.const(0.5, dtype='float16')
    c = te.compute(shape, lambda i, j, k: a[i, j, k] > b, name='c')
    s = te.create_schedule(c.op)
    axes = [axis for axis in c.op.axis]
    fused = s[c].fuse(*axes)
    bx, tx = s[c].split(fused, factor=64)
    s[c].bind(bx, te.thread_axis('blockIdx.x'))
    s[c].bind(tx, te.thread_axis('threadIdx.x'))
Example #40
0
def run_case(dtype, image, target):
    # Check image
    import os
    import json
    import sys

    STAT_REPEAT = os.environ.get('STAT_REPEAT', '')
    if STAT_REPEAT == '' or STAT_REPEAT is None:
        STAT_REPEAT = 10
    STAT_REPEAT = int(STAT_REPEAT)

    # FGG: set model files via CK env
    CATEG_FILE = '../synset.txt'
    synset = eval(open(os.path.join(CATEG_FILE)).read())

    files = []
    val = {}

    if image is not None and image != '':
        files = [image]
    else:
        ipath = os.environ.get('CK_ENV_DATASET_IMAGENET_VAL', '')
        if ipath == '':
            print('Error: path to ImageNet dataset is not set!')
            exit(1)
        if not os.path.isdir(ipath):
            print('Error: path to ImageNet dataset was not found!')
            exit(1)

        # get all files
        d = os.listdir(ipath)
        for x in d:
            x1 = x.lower()
            if x1.startswith('ilsvrc2012_val_'):
                files.append(os.path.join(ipath, x))

        files = sorted(files)

        STAT_REPEAT = 1

        # Get correct labels
        ival = os.environ.get('CK_CAFFE_IMAGENET_VAL_TXT', '')
        fval = open(ival).read().split('\n')

        val = {}
        for x in fval:
            x = x.strip()
            if x != '':
                y = x.split(' ')
                val[y[0]] = int(y[1])

    # FGG: set timers
    import time
    timers = {}

    # Get first shape (expect that will be the same for all)
    dt = time.time()
    image = Image.open(os.path.join(files[0])).resize((224, 224))
    if image.mode != 'RGB': image = image.convert('RGB')
    timers['execution_time_load_image'] = time.time() - dt

    dt = time.time()
    img = transform_image(image)
    timers['execution_time_transform_image'] = time.time() - dt

    # load model
    from mxnet.gluon.model_zoo.vision import get_model
    from mxnet.gluon.utils import download

    model_path = os.environ['CK_ENV_MODEL_MXNET']
    model_id = os.environ['MXNET_MODEL_ID']
    block = get_model(model_id, pretrained=True, root=model_path)

    # We support MXNet static graph (symbol) and HybridBlock in mxnet.gluon
    net, params = nnvm.frontend.from_mxnet(block)
    # we want a probability so add a softmax operator
    net = nnvm.sym.softmax(net)

    # convert to wanted dtype (https://github.com/merrymercy/tvm-mali/issues/3)
    if dtype != 'float32':
        params = {
            k: tvm.nd.array(v.asnumpy().astype(dtype))
            for k, v in params.items()
        }

    # compile
    if target is None or target == 'cpu':
        xtarget = 'llvm'
    elif target == 'cuda':
        xtarget = 'cuda'

    opt_level = 2 if dtype == 'float32' else 1
    with nnvm.compiler.build_config(opt_level=opt_level):
        graph, lib, params = nnvm.compiler.build(net,
                                                 target=xtarget,
                                                 shape={"data": data_shape},
                                                 params=params,
                                                 dtype=dtype,
                                                 target_host=None)

    # upload model to remote device
    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.tar')
    lib.export_library(lib_fname)

    if target is None or target == 'cpu':
        ctx = tvm.cpu(0)
    elif target == 'cuda':
        ctx = tvm.gpu(0)
    rlib = lib
    rparams = params

    # create graph runtime
    dt = time.time()
    module = runtime.create(graph, rlib, ctx)
    module.set_input(
        'data',
        tvm.nd.array(np.random.uniform(size=(data_shape)).astype(dtype)))
    module.set_input(**rparams)
    timers['execution_time_create_run_time_graph'] = (time.time() - dt)

    total_images = 0
    correct_images_top1 = 0
    correct_images_top5 = 0

    # Shuffle files and pre-read the JSON with accuracy stats so we can continue aggregating them;
    # otherwise, if the FPGA board hangs, we can continue checking random images ...

    import random
    random.shuffle(files)

    if len(files) > 1 and os.path.isfile('aggregate-ck-timer.json'):
        x = json.load(open('aggregate-ck-timer.json'))

        if 'total_images' in x:
            total_images = x['total_images']
        if 'correct_images_top1' in x:
            correct_images_top1 = x['correct_images_top1']
        if 'correct_images_top5' in x:
            correct_images_top5 = x['correct_images_top5']

    dt1 = time.time()
    for f in files:
        total_images += 1

        print(
            '==============================================================================='
        )
        print('Image ' + str(total_images) + ' of ' + str(len(files)) + ' : ' +
              f)

        image = Image.open(os.path.join(f)).resize((224, 224))
        if image.mode != 'RGB': image = image.convert('RGB')
        img = transform_image(image)

        # set inputs
        module.set_input('data', tvm.nd.array(img.astype(dtype)))
        module.set_input(**rparams)

        # perform some warm up runs
        # print("warm up..")
        warm_up_timer = module.module.time_evaluator("run", ctx, 1)
        warm_up_timer()

        # execute
        print('')
        print("run (" + str(STAT_REPEAT) + " statistical repetitions)")
        dt = time.time()
        timer = module.module.time_evaluator("run", ctx, number=STAT_REPEAT)
        tcost = timer()
        timers['execution_time_classify'] = (time.time() - dt) / STAT_REPEAT

        # get outputs
        tvm_output = module.get_output(0, tvm.nd.empty((1000, ), dtype, ctx))

        top1 = np.argmax(tvm_output.asnumpy())

        top5 = []
        atop5 = get_top5(tvm_output.asnumpy())

        print('')
        print('TVM prediction Top1:', top1, synset[top1])

        print('')
        print('TVM prediction Top5:')
        for q in atop5:
            x = q[1]
            y = synset[x]
            top5.append(x)
            print(x, y)

        print('')
        print("Internal T-cost: %g" % tcost.mean)

        # Check correctness if available
        if len(val) > 0:
            top = val[os.path.basename(f)]

            correct_top1 = False
            if top == top1:
                correct_top1 = True
                correct_images_top1 += 1

            print('')
            if correct_top1:
                print('Current prediction Top1: CORRECT')
            else:
                print('Current prediction Top1: INCORRECT +(' + str(top) + ')')

            accuracy_top1 = float(correct_images_top1) / float(total_images)
            print('Current accuracy Top1:   ' + ('%.5f' % accuracy_top1))

            correct_top5 = False
            if top in top5:
                correct_top5 = True
                correct_images_top5 += 1

            print('')
            if correct_top5:
                print('Current prediction Top5: CORRECT')
            else:
                print('Current prediction Top5: INCORRECT +(' + str(top) + ')')

            accuracy_top5 = float(correct_images_top5) / float(total_images)
            print('Current accuracy Top5:   ' + ('%.5f' % accuracy_top5))

            print('')
            print('Total elapsed time: ' + ('%.1f' % (time.time() - dt1)) +
                  ' sec.')

            timers['total_images'] = total_images
            timers['correct_images_top1'] = correct_images_top1
            timers['accuracy_top1'] = accuracy_top1
            timers['correct_images_top5'] = correct_images_top5
            timers['accuracy_top5'] = accuracy_top5

        timers['execution_time_classify_internal'] = tcost.mean
        timers['execution_time'] = tcost.mean

        with open('tmp-ck-timer.json', 'w') as ftimers:
            json.dump(timers, ftimers, indent=2)

        with open('aggregate-ck-timer.json', 'w') as ftimers:
            json.dump(timers, ftimers, indent=2)

        sys.stdout.flush()

    return
Example #41
0
# To generate the module library, TVM first lowers the high-level IR
# into the intrinsic IR of the specified target backend, which is CUDA
# in this example. Then the machine code is generated as the module library.

opt_level = 3
target = tvm.target.cuda()
with tvm.transform.PassContext(opt_level=opt_level):
    lib = relay.build(mod, target, params=params)

#####################################################################
# Run the generated library
# --------------------------
# Now we can create the graph runtime and run the module on an Nvidia GPU.

# create random input
dev = tvm.gpu()
data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
# create module
module = graph_runtime.GraphModule(lib["default"](dev))
# set input and parameters
module.set_input("data", data)
# run
module.run()
# get output
out = module.get_output(0, tvm.nd.empty(out_shape)).asnumpy()

# Print first 10 elements of output
print(out.flatten()[0:10])

######################################################################
# Save and Load Compiled Module
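
The original example is cut off at this heading. As a rough continuation, here is a minimal sketch of saving the compiled library and loading it back; the file name `deploy_lib.tar` is an arbitrary choice, and it is assumed that `lib`, `dev`, `data` and `out_shape` refer to the objects defined above.

# Minimal save/load sketch (assumptions noted in the lead-in above).
path_lib = "deploy_lib.tar"          # arbitrary file name for this sketch
lib.export_library(path_lib)         # serialize kernels, graph and params

loaded_lib = tvm.runtime.load_module(path_lib)
loaded_module = graph_runtime.GraphModule(loaded_lib["default"](dev))
loaded_module.set_input("data", data)
loaded_module.run()
print(loaded_module.get_output(0, tvm.nd.empty(out_shape)).asnumpy().flatten()[0:10])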
Example #42
0
File: conv-i2cfused.py  Project: were/UNIT
        timing = time.time()
    else:
        print('Executes: ', info.name, (time.time() - timing) * 1000)


np_a = np.random.randn(n, c // 16, h, w, 16).astype('float16')
np_b = np.random.randn(ko // 16, ic // 16, kh, kw, 16, 16).astype('float16')
#np_a = (np.arange(n * (c // 16) * h * w * 16) % 7).astype('float16')
#np_b = (np.arange((ko // 16) * kh * kw * ic * 16) % 7).astype('float16')
#np_a.shape = (n, c // 16, h, w, 16)
#np_b.shape = (ko // 16, ic // 16, kh, kw, 16, 16)

np_c = np.random.randn(n, ko // 16, (h - kh) // stride_h + 1,
                       (w - kw) // stride_w + 1, 16).astype('float32')

nd_a = tvm.nd.array(np_a, tvm.gpu())
nd_b = tvm.nd.array(np_b, tvm.gpu())
nd_c = tvm.nd.array(np_c, tvm.gpu())

import tensorizer
passes = [(1, tensorizer.loop_swizzle), (1, tensorizer.rewrite),
          (1, tensorizer.inject_sync), (1, tensorizer.sliding_window)]
with tvm.transform.PassContext(opt_level=4,
                               config={'tir.add_lower_pass': passes}):
    #with tvm.transform.PassContext(opt_level=4):
    module = tvm.build(sch, [a, b, conv], 'nvptx')
    fte = module.time_evaluator(module.entry_name,
                                ctx=tvm.gpu(),
                                number=3,
                                repeat=10)
    res = fte(nd_a, nd_b, nd_c).results
Example #43
0
print(tvm.lower(s, [A, A_ch], simple_mode=True))
"""
blockdim, threaddim = 32, 32
n, c, h, w = s[A_ch].op.axis
hw = s[A_ch].fuse(h, w)
no, ni = s[A_ch].split(n, nparts=blockdim)
co, ci = s[A_ch].split(c, nparts=blockdim)
hwo, hwi = s[A_ch].split(hw, nparts=32*32)
s[A_ch].reorder(no, co, hwo, ni, ci, hwi)
s[A_ch].bind(no, block_y)
s[A_ch].bind(co, block_x)
s[A_ch].bind(hwo, thread_x)
s[A_ch].vectorize(hwi)
"""
func = tvm.build(s, [A, A_ch], 'cuda')
ctx = tvm.gpu(0)
a_np = np.random.uniform(size=input_tensor).astype(A.dtype)
a = tvm.nd.array(a_np, ctx)
a_ch = tvm.nd.array(np.zeros(output_tensor, dtype=A_ch.dtype), ctx)
func(a, a_ch)
evaluator = func.time_evaluator(func.entry_name, ctx, number=1)
conv_time = evaluator(a, a_ch).mean * 1e3
tot_byte = batch * in_channel * in_size * in_size * 4 / 1024 / 1024 / 1024  # GB
print('Convolution: %f ms, Bandwidth: %f GB/s' %
      (conv_time, tot_byte / conv_time * 1000 * 2))

dev_module = func.imported_modules[0]
print(dev_module)
print("----GPU code----")
print(dev_module.get_source())
Example #44
0
def test_bind():
    if not tvm.gpu(0).exist:
        print('[Warning] No GPU found! Skip bind test!')
        return

    @script
    def vec_add(a, b):
        c = output_tensor((1000, ), 'float32')
        for tx in bind('threadIdx.x', 1000):
            c[tx] = a[tx] + b[tx]
        return c

    a = tvm.placeholder((1000, ), dtype='float32', name='a')
    b = tvm.placeholder((1000, ), dtype='float32', name='b')
    func, ins, outs = run_and_check(vec_add, [a, b], target='cuda')
    run_and_check(func, ins, outs=outs, target='cuda')

    @script
    def raw(a, b):
        c = output_tensor((1000, ), 'float32')
        for i in range(1000):
            c[i] = a[i] + b[i]
        return c

    c = raw(a, b)
    sch = tvm.create_schedule(c.op)
    x = tvm.thread_axis('threadIdx.x')
    sch[c].bind(c.op.axis[0], x)
    func, ins, outs = run_and_check(raw, [a, b], sch=sch, outs=[c], target='cuda')
    run_and_check(func, ins, outs=outs, target='cuda')


    @tvm.hybrid.script
    def foo(a):
        c = output_tensor((a.shape[0],), a.dtype)
        total = allocate((1,), a.dtype, 'local')
        len_i = a.shape[0]
        len_j = a.shape[1]
        for i in bind('threadIdx.x', len_i):
            total[0] = 0.
            for k in const_range(len_j):
                total[0] += a[i, k]
            c[i] = total[0]

        return c

    a = tvm.placeholder((8, 4), 'float32')
    c = foo(a)
    s = tvm.create_schedule(c.op)
    ir = tvm.lower(s, [a, c], simple_mode=True)
    assert not isinstance(ir, tvm.stmt.AttrStmt)
    func, ins, outs = run_and_check(foo, [a], target='cuda')
    run_and_check(func, ins, outs=outs, target='cuda')

    @tvm.hybrid.script
    def max_threads(a):
        b = output_tensor(a.shape, a.dtype)
        n = a.shape[0]
        m = max_num_threads(True)
        for i in bind('threadIdx.x', m):
            for j in bind('blockIdx.x', ceil_div(n, m)):
                if i * m + j < n:
                    b[i * m + j] = a[i * m + j] + a[i * m + j]
        return b

    a = tvm.placeholder((10000, ), 'float32')
    with tvm.target.create('cuda'):
        func, ins, outs = run_and_check(max_threads, [a], target='cuda')
        run_and_check(func, ins, outs=outs, target='cuda')
Example #45
0
with open(temp.relpath("deploy_param.params"), "wb") as fo:
    fo.write(nnvm.compiler.save_param_dict(params))
print(temp.listdir())

######################################################################
# Deploy locally to Nvidia GPU
# ------------------------------
# Now we can load the module back.

import numpy as np
from tvm.contrib import graph_runtime

loaded_lib = tvm.module.load(path_lib)
loaded_json = open(temp.relpath("deploy_graph.json")).read()
loaded_params = bytearray(open(temp.relpath("deploy_param.params"), "rb").read())
module = graph_runtime.create(loaded_json, loaded_lib, tvm.gpu(0))
module.load_params(loaded_params)

input_data = tvm.nd.array(np.random.uniform(size=data_shape).astype("float32"))
module.run(data=input_data)
out = module.get_output(0, out=tvm.nd.empty(out_shape))
# Print first 10 elements of output
print(out.asnumpy()[0][0:10])

######################################################################
# Compile and Deploy the Model to Raspberry Pi Remotely with RPC
# ------------------------------
# Following the steps above, we can also compile the model for Raspberry Pi.
# TVM provides rpc module to help with remote deploying.
#
# For demonstration, we simply start an RPC server on the same machine,
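
The comment above is truncated. As a rough illustration of the RPC flow it refers to, here is a minimal sketch that starts a server on the local machine and runs the module through it; it assumes the library was (re)compiled for the remote target and exported to `path_lib` earlier in the elided tutorial code, and that `deploy_lib.tar` is its base name.

# Minimal RPC sketch -- `path_lib`, `loaded_json`, `loaded_params` and `input_data`
# are the objects from the tutorial above; the address, port and uploaded file
# name are assumptions for illustration.
from tvm import rpc

server = rpc.Server(host="0.0.0.0", port=9090)   # demo server on the same machine
remote = rpc.connect("127.0.0.1", 9090)
remote.upload(path_lib)                           # ship the compiled library to the remote
rlib = remote.load_module("deploy_lib.tar")       # assumed base name of path_lib
rctx = remote.cpu(0)                              # a Raspberry Pi target is CPU-only
rmodule = graph_runtime.create(loaded_json, rlib, rctx)
rmodule.load_params(loaded_params)
rmodule.run(data=input_data)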
Example #46
0
File: get_started.py  Project: bddppq/tvm
######################################################################
# Deploy and Run
# --------------
# Now that we have a compiled module, let us run it.
# We can use :any:`graph_runtime <tvm.contrib.graph_runtime.create>`
# in tvm to create a deployable :any:`GraphModule <tvm.contrib.graph_runtime.GraphModule>`.
# We can use the :any:`set_input <tvm.contrib.graph_runtime.GraphModule.set_input>`,
# :any:`run <tvm.contrib.graph_runtime.GraphModule.run>` and
# :any:`get_output <tvm.contrib.graph_runtime.GraphModule.get_output>` function
# to set the input, execute the graph and get the output we need.
#
import tvm
import numpy as np
from tvm.contrib import graph_runtime, util

module = graph_runtime.create(deploy_graph, lib, tvm.gpu(0))
x_np = np.array([1, 2, 3, 4]).astype("float32")
y_np = np.array([4, 4, 4, 4]).astype("float32")
# set input to the graph module
module.set_input(x=x_np, y=y_np)
# run forward computation
module.run()
# get the first output
out = module.get_output(0, out=tvm.nd.empty(shape))
print(out.asnumpy())

######################################################################
# Provide Model Parameters
# ------------------------
# Most deep learning models contain two types of inputs: parameters
# that remain fixed during inference and data inputs that need to
Example #47
0
File: from_onnx.py  Project: LANHUIYING/tvm
# ---------------------------------------------
# We should be familiar with the process by now.
import nnvm.compiler
target = 'cuda'
# assume first input name is data
input_name = sym.list_input_names()[0]
shape_dict = {input_name: x.shape}
with nnvm.compiler.build_config(opt_level=3):
    graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params)

######################################################################
# Execute on TVM
# ---------------------------------------------
# The process is no different from the other examples.
from tvm.contrib import graph_runtime
ctx = tvm.gpu(0)
dtype = 'float32'
m = graph_runtime.create(graph, lib, ctx)
# set inputs
m.set_input(input_name, tvm.nd.array(x.astype(dtype)))
m.set_input(**params)
# execute
m.run()
# get outputs
output_shape = (1, 1, 672, 672)
tvm_output = m.get_output(0, tvm.nd.empty(output_shape, dtype)).asnumpy()

######################################################################
# Display results
# ---------------------------------------------
# We put the input and output images side by side
Example #48
0
def test_tensor_core_batch_matmal():
    batch_size = 4
    n = 512
    m, l = n, n
    assert n % 32 == 0
    assert m % 8 == 0
    assert l % 16 == 0
    nn, mm, ll = n // 32, m // 8, l // 16
    A = te.placeholder((batch_size, nn, ll, 32, 16), name="A", dtype="float16")
    B = te.placeholder((batch_size, ll, mm, 16, 8), name="B", dtype="float16")
    k1 = te.reduce_axis((0, ll), name="k1")
    k2 = te.reduce_axis((0, 16), name="k2")
    C = te.compute(
        (batch_size, nn, mm, 32, 8),
        lambda b, i, j, ii, jj: te.sum(
            A[b, i, k1, ii, k2].astype("float") * B[b, k1, j, k2, jj].astype("float"), axis=[k1, k2]
        ),
        name="Fragment_C",
    )
    s = te.create_schedule(C.op)

    warp_size = 32
    kernel_size = 16
    block_row_warps = 2
    block_col_warps = 4
    warp_row_tiles = 4
    warp_col_tiles = 2
    chunk = 4

    block_x = te.thread_axis("blockIdx.x")
    block_y = te.thread_axis("blockIdx.y")
    block_z = te.thread_axis("blockIdx.z")
    thread_x = te.thread_axis("threadIdx.x")
    thread_y = te.thread_axis("threadIdx.y")
    thread_z = te.thread_axis("threadIdx.z")

    AS = s.cache_read(A, "shared", [C])
    BS = s.cache_read(B, "shared", [C])
    AF = s.cache_read(AS, "wmma.matrix_a", [C])
    BF = s.cache_read(BS, "wmma.matrix_b", [C])
    CF = s.cache_write(C, "wmma.accumulator")

    b, i, j, kernel_i, kernel_j = s[C].op.axis
    i, ii = s[C].split(i, factor=warp_row_tiles)
    block_i, i = s[C].split(i, factor=block_row_warps)
    j, jj = s[C].split(j, factor=warp_col_tiles)
    block_j, j = s[C].split(j, factor=block_col_warps)
    s[C].reorder(block_i, block_j, i, j, ii, jj, kernel_i, kernel_j)
    s[C].bind(b, block_z)
    s[C].bind(block_i, block_x)
    s[C].bind(block_j, block_y)
    s[C].bind(i, thread_y)
    s[C].bind(j, thread_z)

    s[CF].compute_at(s[C], j)
    b, warp_i, warp_j, _i, _j = s[CF].op.axis
    k, _k = CF.op.reduce_axis
    ko, ki = s[CF].split(k, factor=chunk)
    s[CF].reorder(ko, ki, warp_i, warp_j, _i, _j, _k)

    s[AF].compute_at(s[CF], ki)
    s[BF].compute_at(s[CF], ki)

    s[AS].compute_at(s[CF], ko)
    b, xo, yo, xi, yi = AS.op.axis
    tx, xo = s[AS].split(xo, nparts=block_row_warps)
    ty, yo = s[AS].split(yo, nparts=block_col_warps)
    t = s[AS].fuse(xi, yi)
    to, ti = s[AS].split(t, nparts=warp_size)
    s[AS].bind(tx, thread_y)
    s[AS].bind(ty, thread_z)
    s[AS].bind(to, thread_x)

    s[BS].compute_at(s[CF], ko)
    b, xo, yo, xi, yi = BS.op.axis
    tx, xo = s[BS].split(xo, nparts=block_row_warps)
    ty, yo = s[BS].split(yo, nparts=block_col_warps)
    t = s[BS].fuse(xi, yi)
    to, ti = s[BS].split(t, nparts=warp_size)
    s[BS].bind(tx, thread_y)
    s[BS].bind(ty, thread_z)
    s[BS].bind(to, thread_x)

    s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix((32, 8, 16), "wmma.matrix_a"))
    s[BF].tensorize(BF.op.axis[-2], intrin_wmma_load_matrix((32, 8, 16), "wmma.matrix_b"))
    s[C].tensorize(kernel_i, intrin_wmma_store_matrix((32, 8, 16)))
    s[CF].tensorize(_i, intrin_wmma_gemm((32, 8, 16)))

    func = tvm.build(s, [A, B, C], "cuda")

    dev = tvm.gpu(0)
    a_np = np.random.uniform(size=(batch_size, nn, ll, 32, 16)).astype(A.dtype)
    b_np = np.random.uniform(size=(batch_size, ll, mm, 16, 8)).astype(B.dtype)
    a = tvm.nd.array(a_np, dev)
    b = tvm.nd.array(b_np, dev)
    c = tvm.nd.array(np.zeros((batch_size, nn, mm, 32, 8), dtype=C.dtype), dev)
    func(a, b, c)
    evaluator = func.time_evaluator(func.entry_name, dev, number=3)
    print("gemm with tensor core: %f ms" % (evaluator(a, b, c).mean * 1e3))

    if VERIFY:
        func(a, b, c)
        a_np = a_np.transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
        b_np = b_np.transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
        c_np = c.asnumpy().transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
        np.testing.assert_allclose(
            c_np, np.matmul(a_np.astype(C.dtype), b_np.astype(C.dtype)), rtol=1e-4, atol=1e-4
        )
Example #49
0
with open(temp.relpath("deploy_param.params"), "wb") as fo:
    fo.write(nnvm.compiler.save_param_dict(params))
print(temp.listdir())

######################################################################
# Deploy locally to Nvidia GPU
# ------------------------------
# Now we can load the module back.

import numpy as np
from tvm.contrib import graph_runtime

loaded_lib = tvm.module.load(path_lib)
loaded_json = open(temp.relpath("deploy_graph.json")).read()
loaded_params = bytearray(open(temp.relpath("deploy_param.params"), "rb").read())
module = graph_runtime.create(loaded_json, loaded_lib, tvm.gpu(0))
module.load_params(loaded_params)

input_data = tvm.nd.array(np.random.uniform(size=data_shape).astype("float32"))
module.run(data=input_data)
out = module.get_output(0, out=tvm.nd.empty(out_shape))
# Print first 10 elements of output
print(out.asnumpy()[0][0:10])

######################################################################
# Compile and Deploy the Model to Raspberry Pi Remotely with RPC
# ------------------------------
# Following the steps above, we can also compile the model for Raspberry Pi.
# TVM provides rpc module to help with remote deploying.
#
# For demonstration, we simply start an RPC server on the same machine,
Example #50
0
File: cuda.py  Project: chisuhua/tvm-ppflow
def conv2d_strategy_cuda(attrs, inputs, out_type, target):
    """conv2d cuda strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1:
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            if data.dtype in ('int8', 'uint8') and kernel.dtype in ('int8',
                                                                    'uint8'):
                assert data.dtype == kernel.dtype
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_int8),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_int8),
                    name="conv2d_nchw_int8.cuda")
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw),
                    name="conv2d_nchw.cuda")
            _, _, kh, kw = get_const_tuple(kernel.shape)
            if 2 < kh < 8 and 2 < kw < 8 and kh == kw and stride_h == 1 and stride_w == 1 and \
                dilation_h == 1 and dilation_w == 1:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_nchw_winograd),
                    name="conv2d_nchw_winograd.cuda",
                    plevel=5)
        elif layout == "HWCN":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_hwcn),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn),
                name="conv2d_hwcn.cuda")
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc),
                name="conv2d_nhwc.cuda")
            N, H, W, _ = get_const_tuple(data.shape)
            KH, KW, CI, CO = get_const_tuple(kernel.shape)
            # Winograd shape related judgment
            judge_winograd_tensorcore, judge_winograd_shape = winograd_judge(
                N,
                H,
                W,
                KH,
                KW,
                CI,
                CO,
                padding,
                stride_h,
                stride_w,
                dilation_h,
                dilation_w,
                pre_flag=False)
            if judge_winograd_shape:
                if target.kind.name == "cuda" and \
                    nvcc.have_tensorcore(tvm.gpu(0).compute_version) and \
                    judge_winograd_tensorcore:
                    strategy.add_implementation(
                        wrap_compute_conv2d(
                            topi.cuda.conv2d_nhwc_winograd_tensorcore),
                        wrap_topi_schedule(
                            topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore
                        ),
                        name="conv2d_nhwc_winograd_tensorcore.cuda",
                        plevel=5)
                else:
                    strategy.add_implementation(
                        wrap_compute_conv2d(
                            topi.cuda.conv2d_nhwc_winograd_direct),
                        wrap_topi_schedule(
                            topi.cuda.schedule_conv2d_nhwc_winograd_direct),
                        name="conv2d_nhwc_winograd_direct.cuda",
                        plevel=5)
            if target.kind.name == "cuda":
                if nvcc.have_tensorcore(tvm.gpu(0).compute_version):
                    if (N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or \
                            (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or \
                            (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0):
                        strategy.add_implementation(
                            wrap_compute_conv2d(
                                topi.cuda.conv2d_nhwc_tensorcore),
                            wrap_topi_schedule(
                                topi.cuda.schedule_conv2d_nhwc_tensorcore),
                            name="conv2d_nhwc_tensorcore.cuda",
                            plevel=20)
        elif layout == "HWNC":
            assert kernel_layout in [
                "HWOI", "HWOI16o16i", "HWOI8o32i", "HWOI32o16i"
            ]
            _, _, N, in_channels = get_const_tuple(data.shape)
            pre_computed = len(kernel.shape) == 6
            if pre_computed:
                _, _, oc_chunk, _, oc_block_factor, _ = get_const_tuple(
                    kernel.shape)
                out_channels = oc_chunk * oc_block_factor
            else:
                _, _, out_channels, _ = get_const_tuple(kernel.shape)
            if topi.cuda.is_shape_tensorcore_direct_qualified(
                    batch=N,
                    in_channels=in_channels,
                    num_filter=out_channels,
                    in_dtype=data.dtype):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_hwnc_tensorcore),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_hwnc_tensorcore),
                    name="conv2d_hwnc_tensorcore_direct.cuda",
                    plevel=20)
            else:
                raise RuntimeError("Unsupported shape for conv2d HWNC.\
                                    Need to satisfy tensor core schedule.")
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8),
                name="conv2d_NCHWc_int8.cuda")
        else:
            raise RuntimeError(
                "Unsupported conv2d layout {} for CUDA".format(layout))
        # add cudnn implementation
        if target.kind.name == "cuda" and "cudnn" in target.libs:
            if layout in ["NCHW", "NHWC"] and padding[0] == padding[2] and \
                    padding[1] == padding[3]:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
                                        need_data_layout=True,
                                        has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25)
    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout,
                             groups):
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw),
                name="depthwise_conv2d_nchw.cuda")
        elif layout == "NHWC":
            assert kernel_layout == "HWOI"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc),
                name="depthwise_conv2d_nhwc.cuda")
        else:
            raise RuntimeError(
                "Unsupported depthwise_conv2d layout {}".format(layout))
    else:  # group_conv2d
        # add cudnn implementation, if any
        cudnn_impl = False
        if target.kind.name == "cuda" and "cudnn" in target.libs:
            if layout in ["NCHW", "NHWC"] and padding[0] == padding[2] and \
                    padding[1] == padding[3]:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
                                        need_data_layout=True,
                                        has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25)
                cudnn_impl = True

        if layout == 'NCHW':
            # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8.
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_nchw,
                                    has_groups=True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw),
                name="group_conv2d_nchw.cuda")
        elif layout == 'NCHW4c' and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8),
                name="group_conv2d_NCHWc_int8.cuda")
        elif not cudnn_impl:
            raise RuntimeError(
                "Unsupported group_conv2d layout {}".format(layout))
    return strategy
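
The strategy function above registers several (compute, schedule) pairs and lets the highest plevel win when more than one candidate matches the layout, dtype and target. A minimal, self-contained sketch of that priority-based dispatch pattern; ToyStrategy and pick_conv2d_impl are illustrative names, not TVM APIs, and the real selection also consults tuning records:

class ToyStrategy:
    def __init__(self):
        self.impls = []

    def add_implementation(self, compute, schedule, name, plevel=10):
        # Higher plevel means this implementation is preferred when several match.
        self.impls.append((plevel, name, compute, schedule))

    def pick_impl(self):
        # Return the registered implementation with the highest priority level.
        return max(self.impls, key=lambda item: item[0])


def pick_conv2d_impl(layout, dtype, has_tensorcore):
    strategy = ToyStrategy()
    if layout in ("NCHW", "NHWC"):
        strategy.add_implementation("conv2d_direct", "schedule_conv2d_direct",
                                    name="conv2d_direct", plevel=10)
    if layout == "NHWC" and has_tensorcore and dtype == "float16":
        strategy.add_implementation("conv2d_nhwc_tensorcore",
                                    "schedule_conv2d_nhwc_tensorcore",
                                    name="conv2d_nhwc_tensorcore", plevel=20)
    if not strategy.impls:
        raise RuntimeError("Unsupported conv2d layout {}".format(layout))
    return strategy.pick_impl()[1]


print(pick_conv2d_impl("NHWC", "float16", has_tensorcore=True))
# -> conv2d_nhwc_tensorcore (plevel 20 beats the direct implementation)
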
Example #51
0
File: cuda.py Project: chisuhua/tvm-ppflow
def conv2d_winograd_without_weight_transfrom_strategy_cuda(
        attrs, inputs, out_type, target):
    """conv2d_winograd_without_weight_transfrom cuda strategy"""
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    layout = attrs.data_layout
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    assert dilation == (1, 1), "Do not support dilate now"
    assert groups == 1, "Do not supoort arbitrary group number"
    strategy = _op.OpStrategy()
    if layout == "NCHW":
        strategy.add_implementation(
            wrap_compute_conv2d(
                topi.cuda.conv2d_nchw_winograd_without_weight_transform),
            wrap_topi_schedule(
                topi.cuda.
                schedule_conv2d_nchw_winograd_without_weight_transform),
            name="conv2d_nchw_winograd_without_weight_transform.cuda")
    elif layout == "NHWC":
        N, H, W, _ = get_const_tuple(data.shape)
        alpha, _, CI, CO = get_const_tuple(kernel.shape)
        dilation_h, dilation_w = dilation
        judge_winograd_tensorcore, _ = winograd_judge(N,
                                                      H,
                                                      W,
                                                      alpha,
                                                      alpha,
                                                      CI,
                                                      CO,
                                                      padding,
                                                      stride_h,
                                                      stride_w,
                                                      dilation_h,
                                                      dilation_w,
                                                      pre_flag=True)
        if target.kind.name == "cuda" and \
            nvcc.have_tensorcore(tvm.gpu(0).compute_version) and \
            judge_winograd_tensorcore:
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.
                    conv2d_nhwc_winograd_tensorcore_without_weight_transform),
                wrap_topi_schedule(
                    topi.cuda.
                    schedule_conv2d_nhwc_winograd_tensorcore_without_weight_transform
                ),
                name=
                "conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda"
            )
        else:
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.
                    conv2d_nhwc_winograd_direct_without_weight_transform),
                wrap_topi_schedule(
                    topi.cuda.
                    schedule_conv2d_nhwc_winograd_direct_without_weight_transform
                ),
                name="conv2d_nhwc_winograd_direct_without_weight_transform.cuda"
            )
    else:
        raise RuntimeError(
            "Unsupported conv2d_winograd_without_weight_transfrom layout {}".
            format(layout))
    return strategy
Example #52
0
File: utils.py Project: Liubusy/tvmt_v2
def create_ctx(device, did=0):
    if device == "x86":
        ctx = tvm.cpu(did)
    elif device == "gpu":
        ctx = tvm.gpu(did)
    else:
        # Fail loudly instead of falling through with an undefined `ctx`.
        raise ValueError("Unsupported device: {}".format(device))
    return ctx
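
A short usage sketch for create_ctx, assuming the pre-0.7 tvm.cpu / tvm.gpu context API that the snippet itself uses:

import tvm

# Pick the GPU when one is present, otherwise fall back to the CPU.
ctx = create_ctx("gpu", did=0) if tvm.gpu(0).exist else create_ctx("x86")
print(ctx)
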
    s[B].bind(s[B].op.reduce_axis[0], tx)
    s[B].bind(s[B].op.axis[0], bx)

    s[BF].compute_at(s[B], s[B].op.axis[0])

    _, noi = s[BF].split(s[BF].op.reduce_axis[0], factor=2)

    BF2 = s.rfactor(BF, noi, 0)

    s[BF].bind(s[BF].op.axis[0], tx)
    s[BF2].compute_at(s[BF], s[BF].op.axis[1])

    fcuda = tvm.build(s, [A, B], "cuda")


@unittest.skipIf(not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"), "skip because cuda is not enabled..")
def test_cuda_const_float_to_half():
    # This import is required to use nvcc to perform code gen;
    # otherwise it is found that the code gen is done by nvrtc.
    from tvm import autotvm
    shape = (2, 3, 4)
    a = te.placeholder(shape, dtype='float16', name='a')
    b = tvm.tir.const(0.5, dtype='float16')
    c = te.compute(shape, lambda i, j, k: a[i, j, k] > b, name='c')
    s = te.create_schedule(c.op)
    axes = [axis for axis in c.op.axis]
    fused = s[c].fuse(*axes)
    bx, tx = s[c].split(fused, factor=64)
    s[c].bind(bx, te.thread_axis('blockIdx.x'))
    s[c].bind(tx, te.thread_axis('threadIdx.x'))
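
The snippet above stops right after binding the fused axis to CUDA threads. A hedged continuation sketch of how such a schedule is typically built and checked, reusing the snippet's s, a, c and shape; fcuda, a_np and c_nd are illustrative names that do not appear in the original test:

    import numpy as np

    # Build through nvcc (the autotvm import above forces nvcc-based codegen),
    # run the kernel on the GPU and compare against the NumPy result.
    fcuda = tvm.build(s, [a, c], "cuda")
    ctx = tvm.gpu(0)
    a_np = np.random.uniform(size=shape).astype("float16")
    a_nd = tvm.nd.array(a_np, ctx)
    c_nd = tvm.nd.array(np.zeros(shape, dtype=c.dtype), ctx)
    fcuda(a_nd, c_nd)
    np.testing.assert_equal(c_nd.asnumpy(), a_np > np.float16(0.5))
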
Example #54
0
def conv2d_strategy_cuda(attrs, inputs, out_type, target):
    """conv2d cuda strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1:
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            if data.dtype in ('int8', 'uint8') and kernel.dtype in ('int8',
                                                                    'uint8'):
                assert data.dtype == kernel.dtype
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_int8),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_int8),
                    name="conv2d_nchw_int8.cuda")
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw),
                    name="conv2d_nchw.cuda")
            _, _, kh, kw = get_const_tuple(kernel.shape)
            if 2 < kh < 8 and 2 < kw < 8 and kh == kw and stride_h == 1 and stride_w == 1 and \
                dilation_h == 1 and dilation_w == 1:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_nchw_winograd),
                    name="conv2d_nchw_winograd.cuda",
                    plevel=5)
        elif layout == "HWCN":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_hwcn),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn),
                name="conv2d_hwcn.cuda")
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc),
                name="conv2d_nhwc.cuda")
            N, _, _, _ = get_const_tuple(data.shape)
            _, _, CI, CO = get_const_tuple(kernel.shape)
            if target.target_name == "cuda":
                if nvcc.have_tensorcore(tvm.gpu(0).compute_version):
                    if (N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or \
                            (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or \
                            (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0):
                        strategy.add_implementation(
                            wrap_compute_conv2d(
                                topi.cuda.conv2d_nhwc_tensorcore),
                            wrap_topi_schedule(
                                topi.cuda.schedule_conv2d_nhwc_tensorcore),
                            name="conv2d_nhwc_tensorcore.cuda",
                            plevel=20)
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8),
                name="conv2d_NCHWc_int8.cuda")
        else:
            raise RuntimeError(
                "Unsupported conv2d layout {} for CUDA".format(layout))
        # add cudnn implementation
        if target.target_name == "cuda" and "cudnn" in target.libs:
            if layout in ["NCHW", "NHWC"] and padding[0] == padding[2] and \
                    padding[1] == padding[3]:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn, True),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=15)
    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout,
                             groups):
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw),
                name="depthwise_conv2d_nchw.cuda")
        elif layout == "NHWC":
            assert kernel_layout == "HWOI"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc),
                name="depthwise_conv2d_nhwc.cuda")
        else:
            raise RuntimeError(
                "Unsupported depthwise_conv2d layout {}".format(layout))
    else:  # group_conv2d
        if layout == 'NCHW':
            # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8.
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_nchw,
                                    has_groups=True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw),
                name="group_conv2d_nchw.cuda")
        elif layout == 'NCHW4c' and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8),
                name="group_conv2d_NCHWc_int8.cuda")
        else:
            raise RuntimeError(
                "Unsupported group_conv2d layout {}".format(layout))
    return strategy
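
The NHWC branch above turns on the tensor core implementation only when the batch and channel sizes match one of three divisibility patterns. A small hedged helper spelling out that inline check; tensorcore_shape_ok is a hypothetical name, not a TVM function:

def tensorcore_shape_ok(N, CI, CO):
    # wmma fragments need (batch, in-channel, out-channel) divisible by
    # 16/16/16, 8/16/32 or 32/16/8, mirroring the condition in the strategy.
    return ((N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or
            (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or
            (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0))


print(tensorcore_shape_ok(16, 64, 256))   # True: 16/16/16 pattern
print(tensorcore_shape_ok(1, 64, 256))    # False: a batch of 1 fails every pattern
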