def test_fp16_build():
    dtype = "float16"
    dev = tvm.cuda(0)
    if dtype == "float16" and not have_fp16(dev.compute_version):
        print("skip because gpu does not support fp16")
        return

    x = relay.var("x", dtype=dtype, shape=(4, 4))
    y = relay.var("y", dtype=dtype, shape=(4, 4))
    z = x + y
    func = relay.Function([x, y], z)
    X = tvm.nd.array(np.random.uniform(-1, 1, (4, 4)).astype(dtype), device=dev)
    Y = tvm.nd.array(np.random.uniform(-1, 1, (4, 4)).astype(dtype), device=dev)
    params = {
        "x": X,
        "y": Y,
    }

    # build
    g_json, mmod, params = relay.build(func, "cuda", params=params)

    # test
    rt = tvm.contrib.graph_executor.create(g_json, mmod, dev)
    rt.load_params(runtime.save_param_dict(params))
    rt.run()
    out = rt.get_output(0)

    np.testing.assert_allclose(out.asnumpy(), X.asnumpy() + Y.asnumpy(), atol=1e-5, rtol=1e-5)
def check_cuda(dtype, m=32, n=32):
    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
        print("Skip because gpu does not have fp16 support")
        return
    a = te.placeholder((m, n), name="a", dtype=dtype)
    b = te.placeholder((m, n), name="b", dtype=dtype)
    c = a + b
    d = a * b
    e = topi.elemwise_sum([c, d])
    g = topi.sum(e)
    with tvm.target.cuda():
        sg = topi.cuda.schedule_reduce(g)
        ctx = tvm.gpu(0)
        func = tvm.build(sg, [a, b, g], 'cuda')
        a_np = np.random.uniform(size=(m, n)).astype(a.dtype)
        b_np = np.random.uniform(size=(m, n)).astype(b.dtype)
        g_np = np.sum(np.add(a_np * b_np, a_np + b_np))
        a_nd = tvm.nd.array(a_np, ctx)
        b_nd = tvm.nd.array(b_np, ctx)
        g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), ctx)
        func(a_nd, b_nd, g_nd)
        tvm.testing.assert_allclose(g_nd.asnumpy(), g_np, rtol=1e-3)
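# A minimal driver sketch for the reduce check above; the wrapper name and the
# dtype sweep are assumptions, not from the source:
def test_cuda_reduction():  # hypothetical wrapper
    for dtype in ["float32", "float16"]:
        check_cuda(dtype)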
def check_single_op(opfunc, ref, dtype):
    shape = (10, 4)
    tp = relay.TensorType(shape)
    x = relay.var("x", tp, dtype=dtype)
    y = opfunc(x)
    # test printer
    assert ("{}(%x)".format(y.op.name)) in y.astext()
    # test type inference
    yy = run_infer_type(y)
    assert yy.checked_type == tp

    if ref is not None:
        data = np.random.rand(*shape).astype(dtype)
        ref_res = ref(data)
        func = relay.Function([x], y)
        for target, ctx in tvm.testing.enabled_targets():
            # use the graph executor by default for testing; we create the
            # function explicitly to avoid constant folding.
            if dtype == 'float16' and target == 'cuda' and not have_fp16(tvm.gpu(0).compute_version):
                continue
            intrp = relay.create_executor("graph", ctx=ctx, target=target)
            op_res = intrp.evaluate(func)(data)
            np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
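# Sample invocations of `check_single_op`, pairing a Relay unary op with its
# NumPy reference (the op/reference pairs are illustrative, not from the source):
for op, ref in [(relay.exp, np.exp), (relay.sqrt, np.sqrt)]:
    for dtype in ["float32", "float16"]:
        check_single_op(op, ref, dtype)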
def check_cuda(dtype):
    if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
        print("Skip because gpu does not have fp16 support")
        return

    n, m = 16, 16
    A = te.placeholder((n, m), name='A', dtype=dtype)
    B = te.compute((n, m), lambda j, i: A[j, (i + 1) % m], name='B')

    cuda_target = tvm.target.create("cuda")
    assert cuda_target.thread_warp_size == 2 * m
    with cuda_target:
        s = te.create_schedule(B.op)
        tx = te.thread_axis("threadIdx.x")
        ty = te.thread_axis("threadIdx.y")
        bx = te.thread_axis("blockIdx.x")

        AA = s.cache_read(A, "warp", [B])
        y, x = B.op.axis
        z, y = s[B].split(y, nparts=2)
        s[B].bind(x, tx)
        s[B].bind(y, ty)
        s[B].bind(z, bx)
        s[AA].compute_at(s[B], y)
        _, x = AA.op.axis
        s[AA].bind(x, tx)

        ctx = tvm.gpu(0)
        func = tvm.build(s, [A, B], "cuda")
        A_np = np.array([list(range(i, m + i)) for i in range(n)], dtype=dtype)
        B_np = np.array([list(range(1 + i, m + i)) + [i] for i in range(n)], dtype=dtype)
        A_nd = tvm.nd.array(A_np, ctx)
        B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), ctx)
        func(A_nd, B_nd)
        tvm.testing.assert_allclose(B_nd.asnumpy(), B_np, rtol=1e-3)
def check_conversion(tgt, ctx):
    if not tvm.runtime.enabled(tgt):
        print("skip because {} is not enabled.".format(tgt))
        return
    elif tgt == "cuda" and ctx.exist and not have_fp16(ctx.compute_version):
        print("skip because gpu does not support fp16")
        return

    n = 10
    for (src, dst) in [('float32', 'float16'), ('float16', 'float32')]:
        x = relay.var("x", relay.TensorType((n,), src))
        y = x.astype(dst)
        func = relay.Function([x], y)

        # init input
        X = tvm.nd.array(n * np.random.randn(n).astype(src) - n / 2)

        # build
        with relay.build_config(opt_level=1):
            g_json, mmod, params = relay.build(tvm.IRModule.from_expr(func), tgt)

        # test
        rt = tvm.contrib.graph_runtime.create(g_json, mmod, ctx)
        rt.set_input("x", X)
        rt.run()
        out = rt.get_output(0)

        np.testing.assert_allclose(out.asnumpy(), X.asnumpy().astype(dst),
                                   atol=1e-5, rtol=1e-5)
def check(t0, t1, factor):
    if (t0 == "float16" or t1 == "float16") and not have_fp16(tvm.cuda(0).compute_version):
        print("Skip because gpu does not have fp16 support")
        return

    # compute
    n = 128
    A = te.placeholder((n,), dtype=t0, name="A")
    B = te.placeholder((n,), dtype=t1, name="B")
    C = te.compute((n,), lambda i: A[i] + topi.cast(B[i], A.dtype), name="C")

    # schedule
    s = tvm.te.create_schedule(C.op)
    ob, ib = s[C].split(s[C].op.axis[0], factor=factor)
    s[C].vectorize(ib)
    s[C].bind(ob, tx)
    func = tvm.build(s, [A, B, C], "cuda")

    # correctness
    dev = tvm.cuda(0)
    low, high = (0, 20) if t0.startswith("u") or t1.startswith("u") else (-10, 10)
    a_np = np.random.randint(low, high, size=n).astype(A.dtype)
    b_np = np.random.randint(low, high, size=n).astype(B.dtype)
    c_np = (a_np + b_np).astype(A.dtype)
    a_nd = tvm.nd.array(a_np, dev)
    b_nd = tvm.nd.array(b_np, dev)
    c_nd = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np.dtype), dev)
    func(a_nd, b_nd, c_nd)
    tvm.testing.assert_allclose(c_nd.numpy(), c_np, rtol=1e-3)
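# `check` binds its outer loop to a thread axis `tx` taken from the enclosing
# scope. A minimal sketch of that assumed context, plus sample invocations
# (the dtype pairs and factor are illustrative, not from the source):
tx = te.thread_axis("threadIdx.x")
check("float16", "float32", factor=4)  # mixed fp16/fp32 add, 4 lanes per vector
check("uint8", "int8", factor=4)       # unsigned operand selects the (0, 20) range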
def check_binary_op(opfunc, ref, dtype):
    # TODO(@jroesch): this piece of code improperly uses type variables.
    n = tvm.var("n")
    s1 = (5, n, 5)
    s2 = (n, 1)
    t1 = relay.TensorType(s1)
    t2 = relay.TensorType(s2)
    x = relay.var("x", t1, dtype=dtype)
    y = relay.var("y", t2, dtype=dtype)
    z = opfunc(x, y)
    # test printer
    assert ("{}(%x, %y)".format(z.op.name)) in z.astext()
    zz = run_infer_type(z)
    assert zz.checked_type == t1

    if ref is not None:
        t1 = relay.TensorType((5, 10, 5))
        t2 = relay.TensorType((5, 10, 5))
        x = relay.var("x", t1, dtype=dtype)
        y = relay.var("y", t2, dtype=dtype)
        z = opfunc(x, y)
        x_data = np.random.rand(5, 10, 5).astype(dtype)
        y_data = np.random.rand(5, 10, 5).astype(dtype)
        ref_res = ref(x_data, y_data)
        func = relay.Function([x, y], z)
        for target, ctx in ctx_list():
            # use the graph executor by default for testing; we create the
            # function explicitly to avoid constant folding.
            if dtype == 'float16' and target == 'cuda' and not have_fp16(tvm.gpu(0).compute_version):
                continue
            intrp = relay.create_executor("graph", ctx=ctx, target=target)
            op_res = intrp.evaluate(func)(x_data, y_data)
            np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
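# Sample invocations of `check_binary_op`, pairing a Relay binary op with its
# NumPy reference (the pairs are illustrative, not from the source):
for op, ref in [(relay.add, np.add), (relay.multiply, np.multiply)]:
    check_binary_op(op, ref, "float16")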
def test_bias_add(): for dtype in ["float16", "float32"]: xshape = (10, 2, 3, 4) bshape = (2,) rtol = 1e-2 if dtype == "float16" else 1e-5 x = relay.var("x", shape=xshape, dtype=dtype) bias = relay.var("bias", dtype=dtype) z = relay.nn.bias_add(x, bias) zz = run_infer_type(z) assert "axis=" not in zz.astext() assert zz.args[1].checked_type == relay.TensorType(bshape, dtype) func = relay.Function([x, bias], z) x_data = np.random.uniform(size=xshape).astype(dtype) y_data = np.random.uniform(size=bshape).astype(dtype) ref_res = x_data + y_data.reshape((2, 1, 1)) for target, dev in tvm.testing.enabled_targets(): if ( dtype == "float16" and target == "cuda" and not have_fp16(tvm.cuda(0).compute_version) ): continue op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)( x_data, y_data ) np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=rtol)
def run_test(tvm_intrin, np_func, dtype):
    if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version):
        print("Skip because gpu does not have fp16 support")
        return
    # this set of intrinsics does not support fp16 yet.
    skip_set = {
        tvm.tir.abs,
        tvm.tir.round,
        tvm.tir.tan,
        tvm.tir.atan,
        tvm.tir.tanh,
        tvm.tir.cosh,
        tvm.tir.sinh,
    }
    if dtype == "float16" and tvm_intrin in skip_set:
        print("Skip because '{0}' does not support fp16 yet".format(tvm_intrin.__name__))
        return

    n = 128
    A = te.placeholder((n,), dtype=dtype, name="A")
    B = te.compute((n,), lambda *i: tvm_intrin(A(*i)), name="B")
    s = sched(B)
    f = tvm.build(s, [A, B], "cuda")
    dev = tvm.cuda(0)
    a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev)
    b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), dev)
    f(a, b)
    tvm.testing.assert_allclose(b.numpy(), np_func(a.numpy()), atol=1e-3, rtol=1e-3)
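# `run_test` relies on a schedule helper `sched` from its enclosing scope. A
# minimal sketch of what such a helper could look like, plus a sample call
# (both are assumptions, not the original definitions):
def sched(b):
    s = te.create_schedule(b.op)
    xo, xi = s[b].split(b.op.axis[0], factor=64)
    s[b].bind(xo, te.thread_axis("blockIdx.x"))
    s[b].bind(xi, te.thread_axis("threadIdx.x"))
    return s

run_test(tvm.tir.exp, np.exp, "float32")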
def check_cuda(dtype, n, l, padding, lanes):
    if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version):
        print("Skip because gpu does not have fp16 support")
        return

    dev = tvm.cuda(0)
    A = tvm.te.placeholder((n, l), name="A", dtype=dtype)
    B = tvm.te.compute(
        (n // lanes, l + 2 * padding, lanes),
        lambda i, j, k: tvm.te.if_then_else(
            tvm.te.any(j < padding, j >= l + padding),
            tvm.runtime.convert(0).astype(dtype),
            A[i * lanes + k, j - padding],
        ),
        name="B",
    )
    s = te.create_schedule(B.op)
    block, thread, vectorize = s[B].op.axis
    s[B].bind(block, bx)
    s[B].bind(thread, tx)
    s[B].vectorize(vectorize)
    fun = tvm.build(s, [A, B], "cuda", name="vector_load_permute_pad")
    np_a = np.random.randint(low=-128, high=127, size=(n, l)).astype(A.dtype)
    a = tvm.nd.empty((n, l), A.dtype, dev).copyfrom(np_a)
    b = tvm.nd.empty((n // lanes, l + padding * 2, lanes), B.dtype, dev)
    fun(a, b)
    np_a_reshape = np_a.reshape(n // lanes, lanes, l).transpose(0, 2, 1)
    ref = np.pad(
        np_a_reshape, ((0, 0), (padding, padding), (0, 0)), mode="constant", constant_values=0
    )
    tvm.testing.assert_allclose(b.numpy(), ref)
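# `check_cuda` uses thread axes `bx` and `tx` from the enclosing scope. A
# minimal sketch of that assumed context and a sample invocation (`lanes`
# must divide `n`; the values are illustrative):
bx = te.thread_axis("blockIdx.x")
tx = te.thread_axis("threadIdx.x")
check_cuda("int8", n=64, l=16, padding=2, lanes=4)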
def test_fp16_conversion(target, dev):
    if target == "cuda" and not have_fp16(dev.compute_version):
        print("skip because gpu does not support fp16")
        return

    n = 10
    for (src, dst) in [("float32", "float16"), ("float16", "float32")]:
        x = relay.var("x", relay.TensorType((n,), src))
        y = x.astype(dst)
        func = relay.Function([x], y)

        # init input
        X = tvm.nd.array(n * np.random.randn(n).astype(src) - n / 2)

        # build
        with tvm.transform.PassContext(opt_level=1):
            g_json, mmod, params = relay.build(tvm.IRModule.from_expr(func), target)

        # test
        rt = tvm.contrib.graph_executor.create(g_json, mmod, dev)
        rt.set_input("x", X)
        rt.run()
        out = rt.get_output(0)

        np.testing.assert_allclose(out.asnumpy(), X.asnumpy().astype(dst), atol=1e-5, rtol=1e-5)
def check_cuda(dtype, n, lanes):
    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
        print("skip because gpu does not support fp16")
        return
    if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
        print("skip because gpu does not support int8")
        return
    A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
    B = tvm.compute((n,), lambda i: A[i] + tvm.const(1, A.dtype), name='B')
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
    s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
    fun = tvm.build(s, [A, B], "cuda")
    ctx = tvm.gpu(0)
    a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np.random.uniform(size=(n, lanes)))
    c = tvm.nd.empty((n,), B.dtype, ctx)
    fun(a, c)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
def test_concatenate():
    for dtype in ["float16", "float32"]:
        n, t, d = te.size_var("n"), te.size_var("t"), 100
        x = relay.var("x", shape=(n, t, d))
        y = relay.var("y", shape=(n, t, d))
        z = relay.concatenate((x, y), axis=-1)
        assert "axis=" in z.astext()
        zz = run_infer_type(z)
        assert zz.checked_type == relay.TensorType((n, t, 200))

        x = relay.exp(x)
        z = relay.concatenate((x, y), axis=2)
        zz = run_infer_type(z)
        assert zz.checked_type == relay.TensorType((n, t, 200))

        z = relay.concatenate((x, y), axis=1)
        zz = run_infer_type(z)
        assert zz.checked_type == relay.TensorType((n, t + t, 100))

        # check shape mismatches (the following case is expected to raise
        # tvm._ffi.base.TVMError)
        try:
            x = relay.var("p1", shape=(2, 5))
            y = relay.var("p2", shape=(2, 3))
            c = relay.concatenate([x, y], axis=0)
            func = relay.Function([x, y], c)
            zz = run_infer_type(func)
        except tvm._ffi.base.TVMError:
            pass
        else:
            assert False

        x = relay.var("x", shape=(10, 5), dtype=dtype)
        y = relay.var("y", shape=(10, 5), dtype=dtype)
        t = relay.var("z", shape=(), dtype=dtype)
        z = relay.concatenate((x, y), axis=1)
        z = relay.add(z, t)
        # Check result.
        func = relay.Function([x, y, t], z)
        x_data = np.random.rand(10, 5).astype(dtype)
        y_data = np.random.rand(10, 5).astype(dtype)
        t_data = np.random.uniform(size=()).astype(dtype)
        ref_res = np.concatenate((x_data, y_data), axis=1) + t_data

        for target, dev in tvm.testing.enabled_targets():
            if (
                dtype == "float16"
                and target == "cuda"
                and not have_fp16(tvm.cuda(0).compute_version)
            ):
                continue
            op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
                x_data, y_data, t_data
            )
            tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=0.01)
            op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(
                x_data, y_data, t_data
            )
            tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=0.01)
def verify_expand_dims(dshape, dtype, oshape, axis, num_newaxis): x = relay.Var("x", relay.TensorType(dshape, dtype)) func = relay.Function([x], relay.expand_dims(x, axis, num_newaxis)) for target, ctx in ctx_list(): if dtype == 'float16' and target == 'cuda' and not have_fp16(tvm.gpu(0).compute_version): continue data = np.random.uniform(size=dshape).astype(dtype) ref_res = data.reshape(oshape) intrp = relay.create_executor("graph", ctx=ctx, target=target) op_res = intrp.evaluate(func)(data) np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
def verify_expand_dims(dshape, dtype, oshape, axis, num_newaxis): x = relay.Var("x", relay.TensorType(dshape, dtype)) func = relay.Function([x], relay.expand_dims(x, axis, num_newaxis)) for target, dev in tvm.testing.enabled_targets(): if (dtype == "float16" and target == "cuda" and not have_fp16(tvm.cuda(0).compute_version)): continue data = np.random.uniform(size=dshape).astype(dtype) ref_res = data.reshape(oshape) op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)(data) np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=0.01)
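# Sample invocations of `verify_expand_dims`: the expected `oshape` is `dshape`
# with `num_newaxis` unit axes inserted at position `axis` (values illustrative):
verify_expand_dims((3, 10), "float32", (3, 10, 1, 1), 2, 2)
verify_expand_dims((3, 10), "float16", (1, 3, 10), 0, 1)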
def check_target(target, dev):
    if dtype == "float16" and target == "cuda" and not have_fp16(tvm.gpu(0).compute_version):
        print("Skip because %s does not have fp16 support" % target)
        return
    print("Running on target: %s" % target)
    with tvm.target.Target(target):
        s = tvm.topi.testing.get_elemwise_schedule(target)(B)

    a = tvm.nd.array(a_np, dev)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
    foo = tvm.build(s, [A, B], target, name="relu")
    foo(a, b)
    tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def check_cuda(dtype): if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version): print("Skip because gpu does not have fp16 support") return n, m = 16, 16 A = te.placeholder( ( n, m, ), name="A", dtype=dtype, ) B = te.compute( ( n, m, ), lambda j, i: A[j, (i + 1) % m], name="B", ) cuda_target = tvm.target.Target("cuda") assert cuda_target.thread_warp_size == 2 * m with cuda_target: s = te.create_schedule(B.op) tx = te.thread_axis("threadIdx.x") ty = te.thread_axis("threadIdx.y") bx = te.thread_axis("blockIdx.x") AA = s.cache_read(A, "warp", [B]) y, x = B.op.axis z, y = s[B].split(y, nparts=2) s[B].bind(x, tx) s[B].bind(y, ty) s[B].bind(z, bx) s[AA].compute_at(s[B], y) _, x = AA.op.axis s[AA].bind(x, tx) dev = tvm.cuda(0) # building with the CSE pass disabled as otherwise it would do some commoning with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]): func = tvm.build(s, [A, B], "cuda") A_np = np.array([list(range(i, m + i)) for i in range(n)], dtype=dtype) B_np = np.array([list(range(1 + i, m + i)) + [i] for i in range(n)], dtype=dtype) A_nd = tvm.nd.array(A_np, dev) B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), dev) func(A_nd, B_nd) tvm.testing.assert_allclose(B_nd.numpy(), B_np, rtol=1e-3)
def check_device(device, ctx):
    if in_dtype == "float16" and device == "cuda" and not have_fp16(ctx.compute_version):
        print("Skip because %s does not have fp16 support" % device)
        return
    print("Running on target: %s" % device)
    with tvm.target.Target(device):
        s = tvm.topi.testing.get_elemwise_schedule(device)(B)

    foo = tvm.build(s, [A, B], device, name="reinterpret")
    data_npy = generator(in_shape).astype(in_dtype)
    out_npy = data_npy.view(B.dtype)
    data_nd = tvm.nd.array(data_npy, ctx)
    out_nd = tvm.nd.array(np.empty(in_shape).astype(B.dtype), ctx)
    foo(data_nd, out_nd)
    np.testing.assert_equal(out_nd.asnumpy(), out_npy)
def test_unary_op(self, target, dev, relay_op, ref_func, supports_fp16, dtype):
    target = tvm.target.Target(target)
    if dtype == "float16":
        if target.kind.name == "cuda":
            if not have_fp16(tvm.cuda(0).compute_version):
                pytest.xfail(
                    "No float16 support on local cuda device (compute_version != 5.3 and < 6.0)"
                )
        elif target.kind.name == "vulkan" and not target.attrs.get("supports_float16", False):
            pytest.xfail("No float16 support on vulkan target (supports_float16=False)")
        elif not supports_fp16:
            pytest.xfail(f"No float16 support on {target.kind.name} target")

    if target.kind.name == "vulkan" and relay_op in [
        tvm.relay.erf,
        tvm.relay.tan,
        tvm.relay.atan,
    ]:
        pytest.xfail(f"Vulkan runtime doesn't yet support {relay_op}")

    shape = (10, 4)
    tp = relay.TensorType(shape, dtype=dtype)
    x = relay.var("x", type_annotation=tp)
    y = relay_op(x)
    # test printer
    assert ("{}(%x)".format(y.op.name)) in y.astext()
    # test type inference
    yy = run_infer_type(y)
    assert yy.checked_type == tp

    if ref_func is not None:
        data = np.random.rand(*shape).astype(dtype)
        ref_res = ref_func(data).astype(dtype)
        func = relay.Function([x], y)
        # use the graph executor by default for testing; we create the
        # function explicitly to avoid constant folding.
        op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)(data)
        tolerance = 1e-2 if dtype == "float16" else 1e-5
        np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=tolerance)
def check_cuda(dtype): if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"): print("skip because cuda is not enabled..") return if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version): print("Skip because gpu does not have fp16 support") return m = 32 A = te.placeholder((m, ), name='A', dtype=dtype) B = te.placeholder((m, ), name='B', dtype=dtype) C = te.compute((m, ), lambda i: A[(i + 1) % m] + B[(i + 1) % m], name='C') cuda_target = tvm.target.create("cuda") assert m <= cuda_target.thread_warp_size with cuda_target: s = te.create_schedule(C.op) tx = te.thread_axis("threadIdx.x") bx = te.thread_axis("blockIdx.x") AA = s.cache_read(A, "warp", [C]) BB = s.cache_read(B, "warp", [C]) xo, xi = s[C].split(C.op.axis[0], nparts=1) s[C].bind(xi, tx) s[C].bind(xo, bx) s[AA].compute_at(s[C], xo) s[BB].compute_at(s[C], xo) xo, xi = s[AA].split(s[AA].op.axis[0], nparts=1) s[AA].bind(xo, bx) s[AA].bind(xi, tx) xo, xi = s[BB].split(s[BB].op.axis[0], nparts=1) s[BB].bind(xo, bx) s[BB].bind(xi, tx) ctx = tvm.gpu(0) func = tvm.build(s, [A, B, C], "cuda") AB_np = np.array(list(range(m)), dtype=dtype) C_np = np.array(list(range(1, m)) + [0], dtype=dtype) * 2 A_nd = tvm.nd.array(AB_np, ctx) B_nd = tvm.nd.array(AB_np, ctx) C_nd = tvm.nd.array(np.zeros(C_np.shape, dtype=C_np.dtype), ctx) func(A_nd, B_nd, C_nd) tvm.testing.assert_allclose(C_nd.asnumpy(), C_np, rtol=1e-3)
def check_device(device):
    if not tvm.testing.device_enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    if dtype == "float16" and device == "cuda" and not have_fp16(tvm.gpu(0).compute_version):
        print("Skip because gpu does not have fp16 support")
        return
    with tvm.target.Target(device):
        ctx = tvm.context(device, 0)
        A = te.placeholder((n, m), name="A", dtype=dtype)
        B = te.compute((n, m), lambda i, j: A[i, j] + tvm.tir.const(1, A.dtype), name="B")
        S = tvm.topi.testing.get_elemwise_schedule(device)(B)
        fun = tvm.build(S, [A, B], device)
        np_A = tvm.nd.empty((n, m), A.dtype, ctx).copyfrom(np.random.uniform(size=(n, m)))
        np_B = tvm.nd.empty((n, m), B.dtype, ctx)
        fun(np_A, np_B)
        tvm.testing.assert_allclose(np_B.asnumpy(), np_A.asnumpy() + 1, rtol=1e-5)
def check_cuda(dtype, n, lanes): if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version): print("Skip because gpu does not have fp16 support") return if dtype == "int8" and not have_int8(tvm.cuda(0).compute_version): print("skip because gpu does not support int8") return A = te.placeholder((n,), name="A", dtype="%sx%d" % (dtype, lanes)) B = te.compute((n,), lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B") s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=num_thread) s[B].bind(xo, bx) s[B].bind(xi, tx) fun = tvm.build(s, [A, B], "cuda") dev = tvm.cuda(0) a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n, lanes))) c = tvm.nd.empty((n,), B.dtype, dev) fun(a, c) tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1)
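# `check_cuda` reads `num_thread`, `bx`, and `tx` from the enclosing test. A
# minimal sketch of that assumed context and a sample invocation (values are
# illustrative; dtype "float16" with lanes=4 yields the vector type "float16x4"):
num_thread = 8
bx = te.thread_axis("blockIdx.x")
tx = te.thread_axis("threadIdx.x")
check_cuda("float16", n=64, lanes=4)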
def check_device(device):
    ctx = tvm.context(device, 0)
    if not ctx.exist:
        print("Skip because %s is not enabled" % device)
        return
    if dtype == "float16" and device == "cuda" and not have_fp16(tvm.gpu(0).compute_version):
        print("Skip because %s does not have fp16 support" % device)
        return
    print("Running on target: %s" % device)
    with tvm.target.create(device):
        s = topi.testing.get_elemwise_schedule(device)(B)

    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
    foo = tvm.build(s, [A, B], device, name="relu")
    foo(a, b)
    tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def test_relu(target, dev, m, n, dtype):
    A = te.placeholder((m, n), name="A", dtype=dtype)
    B = topi.nn.relu(A)
    a_np = np.random.uniform(low=-1.0, high=1.0, size=get_const_tuple(A.shape)).astype(A.dtype)
    b_np = a_np * (a_np > 0)

    if dtype == "float16" and target == "cuda" and not have_fp16(tvm.cuda(0).compute_version):
        pytest.skip("Skip because %s does not have fp16 support" % target)

    print("Running on target: %s" % target)
    with tvm.target.Target(target):
        s = tvm.topi.testing.get_elemwise_schedule(target)(B)

    a = tvm.nd.array(a_np, dev)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
    foo = tvm.build(s, [A, B], target, name="relu")
    foo(a, b)
    tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def check_cuda(dtype): if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version): print("Skip because gpu does not have fp16 support") return m = 128 A = te.placeholder((m,), name="A", dtype=dtype) B = te.compute((m,), lambda i: A[i // 32 * 32 + (i + 1) % 32], name="B") cuda_target = tvm.target.Target("cuda") assert cuda_target.thread_warp_size == 32 with cuda_target: s = te.create_schedule(B.op) AA = s.cache_read(A, "warp", [B]) xo, xi = s[B].split(B.op.axis[0], 64) xi0, xi1 = s[B].split(xi, factor=32) tx = te.thread_axis("threadIdx.x") s[B].bind(xi1, tx) s[B].bind(xo, te.thread_axis("blockIdx.x")) s[AA].compute_at(s[B], xo) xo, xi = s[AA].split(s[AA].op.axis[0], 32) s[AA].bind(xi, tx) dev = tvm.cuda(0) # building with the CSE pass disabled as otherwise it would do some commoning with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]): func = tvm.build(s, [A, B], "cuda") A_np = np.array(list(range(m)), dtype=dtype) B_np = np.array( list(range(1, 32)) + [0] + list(range(33, 64)) + [32] + list(range(65, 96)) + [64] + list(range(97, 128)) + [96], dtype=dtype, ) A_nd = tvm.nd.array(A_np, dev) B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), dev) func(A_nd, B_nd) tvm.testing.assert_allclose(B_nd.numpy(), B_np, rtol=1e-3)
def check_cuda(dtype): if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version): print("Skip because gpu does not have fp16 support") return m = 32 A = te.placeholder((m,), name="A", dtype=dtype) B = te.placeholder((m,), name="B", dtype=dtype) C = te.compute((m,), lambda i: A[(i + 1) % m] + B[(i + 1) % m], name="C") cuda_target = tvm.target.Target("cuda") assert m <= cuda_target.thread_warp_size with cuda_target: s = te.create_schedule(C.op) tx = te.thread_axis("threadIdx.x") bx = te.thread_axis("blockIdx.x") AA = s.cache_read(A, "warp", [C]) BB = s.cache_read(B, "warp", [C]) xo, xi = s[C].split(C.op.axis[0], nparts=1) s[C].bind(xi, tx) s[C].bind(xo, bx) s[AA].compute_at(s[C], xo) s[BB].compute_at(s[C], xo) xo, xi = s[AA].split(s[AA].op.axis[0], nparts=1) s[AA].bind(xo, bx) s[AA].bind(xi, tx) xo, xi = s[BB].split(s[BB].op.axis[0], nparts=1) s[BB].bind(xo, bx) s[BB].bind(xi, tx) dev = tvm.cuda(0) # building with the CSE pass disabled as otherwise it would do some commoning with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]): func = tvm.build(s, [A, B, C], "cuda") AB_np = np.array(list(range(m)), dtype=dtype) C_np = np.array(list(range(1, m)) + [0], dtype=dtype) * 2 A_nd = tvm.nd.array(AB_np, dev) B_nd = tvm.nd.array(AB_np, dev) C_nd = tvm.nd.array(np.zeros(C_np.shape, dtype=C_np.dtype), dev) func(A_nd, B_nd, C_nd) tvm.testing.assert_allclose(C_nd.numpy(), C_np, rtol=1e-3)
def check(device, dtype, m=32, n=32):
    if not tvm.testing.device_enabled(device):
        print("Skipping", device)
        return
    dev = tvm.device(device, 0)
    if dtype == "float16" and not have_fp16(dev.compute_version):
        print("Skip because gpu does not have fp16 support")
        return

    a = tvm.te.placeholder((m, n), name="a", dtype=dtype)
    b = topi.sum(a)
    with tvm.target.Target(device):
        sb = tvm.te.create_schedule(b.op)
        i, _ = b.op.reduce_axis
        sb[b].bind(i, tvm.te.thread_axis("threadIdx.x"))
        func = tvm.build(sb, [a, b], device)
        a_np = np.random.uniform(size=(m, n)).astype(a.dtype)
        b_np = np.sum(a_np)
        a_nd = tvm.nd.array(a_np, dev)
        b_nd = tvm.nd.array(np.zeros(b_np.shape, dtype=b_np.dtype), dev)
        func(a_nd, b_nd)
        tvm.testing.assert_allclose(b_nd.asnumpy(), b_np, rtol=1e-3)
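# A minimal driver sketch for `check`, sweeping devices and dtypes
# (the device/dtype combinations are illustrative, not from the source):
for device in ["cuda", "rocm"]:
    for dtype in ["float32", "float16"]:
        check(device, dtype)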
def test_bias_add():
    for dtype in ['float16', 'float32']:
        xshape = (10, 2, 3, 4)
        bshape = (2,)
        rtol = 1e-2 if dtype == 'float16' else 1e-5
        x = relay.var("x", shape=xshape, dtype=dtype)
        bias = relay.var("bias", dtype=dtype)
        z = relay.nn.bias_add(x, bias)
        zz = run_infer_type(z)
        assert "axis=" not in zz.astext()
        assert zz.args[1].checked_type == relay.TensorType(bshape, dtype)

        func = relay.Function([x, bias], z)
        x_data = np.random.uniform(size=xshape).astype(dtype)
        y_data = np.random.uniform(size=bshape).astype(dtype)
        ref_res = x_data + y_data.reshape((2, 1, 1))
        for target, ctx in ctx_list():
            if dtype == 'float16' and target == 'cuda' and not have_fp16(tvm.gpu(0).compute_version):
                continue
            intrp = relay.create_executor("graph", ctx=ctx, target=target)
            op_res = intrp.evaluate(func)(x_data, y_data)
            np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=rtol)
def check_cuda(dtype): if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"): print("skip because cuda is not enabled..") return if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version): print("Skip because gpu does not have fp16 support") return m = 128 A = te.placeholder((m, ), name='A', dtype=dtype) B = te.compute((m, ), lambda i: A[i // 32 * 32 + (i + 1) % 32], name='B') cuda_target = tvm.target.create("cuda") assert cuda_target.thread_warp_size == 32 with cuda_target: s = te.create_schedule(B.op) AA = s.cache_read(A, "warp", [B]) xo, xi = s[B].split(B.op.axis[0], 64) xi0, xi1 = s[B].split(xi, factor=32) tx = te.thread_axis("threadIdx.x") s[B].bind(xi1, tx) s[B].bind(xo, te.thread_axis("blockIdx.x")) s[AA].compute_at(s[B], xo) xo, xi = s[AA].split(s[AA].op.axis[0], 32) s[AA].bind(xi, tx) ctx = tvm.gpu(0) func = tvm.build(s, [A, B], "cuda") A_np = np.array(list(range(m)), dtype=dtype) B_np = np.array(list(range(1, 32)) + [0] + list(range(33, 64)) + [32] + list(range(65, 96)) + [64] + list(range(97, 128)) + [96], dtype=dtype) A_nd = tvm.nd.array(A_np, ctx) B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), ctx) func(A_nd, B_nd) tvm.testing.assert_allclose(B_nd.asnumpy(), B_np, rtol=1e-3)
def check_cuda(dtype, m=32, n=32):
    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
        print("Skip because gpu does not have fp16 support")
        return
    a = tvm.te.placeholder((m, n), name="a", dtype=dtype)
    b = topi.sum(a)
    with tvm.target.cuda():
        sb = tvm.te.create_schedule(b.op)
        i, _ = b.op.reduce_axis
        sb[b].bind(i, tvm.te.thread_axis("threadIdx.x"))
        ctx = tvm.gpu(0)
        func = tvm.build(sb, [a, b], 'cuda')
        a_np = np.random.uniform(size=(m, n)).astype(a.dtype)
        b_np = np.sum(a_np)
        a_nd = tvm.nd.array(a_np, ctx)
        b_nd = tvm.nd.array(np.zeros(b_np.shape, dtype=b_np.dtype), ctx)
        func(a_nd, b_nd)
        tvm.testing.assert_allclose(b_nd.asnumpy(), b_np, rtol=1e-3)
def check_cuda(dtype, n, lanes): if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"): print("skip because cuda is not enabled..") return if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version): print("skip because gpu does not support fp16") return if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version): print("skip because gpu does not support int8") return A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes)) B = tvm.compute((n,), lambda i: A[i]+tvm.const(1, A.dtype), name='B') s = tvm.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=num_thread) s[B].bind(xo, tvm.thread_axis("blockIdx.x")) s[B].bind(xi, tvm.thread_axis("threadIdx.x")) fun = tvm.build(s, [A, B], "cuda") ctx = tvm.gpu(0) a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom( np.random.uniform(size=(n, lanes))) c = tvm.nd.empty((n,), B.dtype, ctx) fun(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)