def my_cuda_mylog_rule(op): """CUDA lowering rule for log""" if op.dtype == "float32": return tvm.call_pure_extern("float32", "logf", op.args[0]) elif op.dtype == "float64": return tvm.call_pure_extern("float32", "log", op.args[0]) else: return op
def my_cuda_log_rule(op):
    if op.dtype == "float32":
        # logf: the extern function name; it should be a CUDA built-in function
        return tvm.call_pure_extern("float32", "logf", op.args[0])
    elif op.dtype == "float64":
        return tvm.call_pure_extern("float64", "log", op.args[0])
    else:
        return op
def my_cuda_mylog_rule(op):
    """CUDA lowering rule for log"""
    if op.dtype == "float32":
        return tvm.call_pure_extern("float32", "logf", op.args[0])
    elif op.dtype == "float64":
        return tvm.call_pure_extern("float64", "log", op.args[0])
    else:
        return op
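# Hedged usage sketch (assumed context, not shown in the snippets above): a
# lowering rule only takes effect once it is registered for the target; the
# intrinsic name "log" and the override flag below are assumptions.
tvm.register_intrin_rule("cuda", "log", my_cuda_mylog_rule, override=True)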
def cuda_atomic_add_rule(op):
    if op.dtype == "float32":
        return tvm.call_pure_extern("float32", "atomicAdd", op.args[0], op.args[1])
    if op.dtype == "float64":
        return tvm.call_pure_extern("float64", "atomicAdd", op.args[0], op.args[1])
    if op.dtype == "int32":
        return tvm.call_pure_extern("int32", "atomicAdd", op.args[0], op.args[1])
    raise RuntimeError("only support int32, float32 and float64")
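# Hedged registration sketch (assumed context): expose the rule under a custom
# intrinsic name so IR that calls it lowers to CUDA's atomicAdd. The name
# "atomic_add" is an assumption, not shown in the original snippet.
tvm.register_intrin_rule("cuda", "atomic_add", cuda_atomic_add_rule, override=True)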
def my_cuda_math_rule(op): """Customized CUDA intrinsic lowering rule""" assert isinstance(op, tvm.tir.Call) if op.dtype == "float32": # call float function return tvm.call_pure_extern("float32", "%sf" % op.name, op.args[0]) elif op.dtype == "float64": # call double function return tvm.call_pure_extern("float32", op.name, op.args[0]) else: # cannot do translation, return self. return op
def my_cuda_math_rule(op): """Customized CUDA intrinsic lowering rule""" assert isinstance(op, tvm.expr.Call) if op.dtype == "float32": # call float function return tvm.call_pure_extern("float32", "%sf" % op.name, op.args[0]) elif op.dtype == "float64": # call double function return tvm.call_pure_extern("float32", op.name, op.args[0]) else: # cannot do translation, return self. return op
def check_cuda(dtype, n, lanes):
    # num_thread and have_int8 are defined at module level in the enclosing
    # test file.
    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
        print("skip because gpu does not support int8")
        return
    A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
    B = tvm.placeholder((n,), name='B', dtype="%sx%d" % (dtype, lanes))
    C = tvm.placeholder((n,), name='C', dtype="int32")
    D = tvm.compute((n,),
                    lambda i: tvm.call_pure_extern("int32", "__dp4a", A[i], B[i], C[i]),
                    name='D')
    s = tvm.create_schedule(D.op)
    xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
    s[D].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[D].bind(xi, tvm.thread_axis("threadIdx.x"))
    fun = tvm.build(s, [A, B, C, D], "cuda")
    np_a = np.random.randint(low=-128, high=127, size=(n, lanes))
    np_b = np.random.randint(low=-128, high=127, size=(n, lanes))
    np_c = np.random.randint(low=0, high=127, size=(n,))
    np_d = [sum(x * y) + z for x, y, z in zip(np_a, np_b, np_c)]
    ctx = tvm.gpu(0)
    a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a)
    b = tvm.nd.empty((n,), B.dtype, ctx).copyfrom(np_b)
    c = tvm.nd.empty((n,), C.dtype, ctx).copyfrom(np_c)
    d = tvm.nd.empty((n,), D.dtype, ctx)
    fun(a, b, c, d)
    tvm.testing.assert_allclose(d.asnumpy(), np_d)
def check_cuda(dtype, n, lanes):
    # bx, tx (thread axes), num_thread and have_int8 are defined at module
    # level in the enclosing test file.
    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
        print("skip because gpu does not support int8")
        return
    A = tvm.placeholder((n, ), name='A', dtype="%sx%d" % (dtype, lanes))
    B = tvm.placeholder((n, ), name='B', dtype="%sx%d" % (dtype, lanes))
    C = tvm.placeholder((n, ), name='C', dtype="int32")
    D = tvm.compute((n, ),
                    lambda i: tvm.call_pure_extern("int32", "__dp4a", A[i], B[i], C[i]),
                    name='D')
    s = tvm.create_schedule(D.op)
    xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
    s[D].bind(xo, bx)
    s[D].bind(xi, tx)
    fun = tvm.build(s, [A, B, C, D], "cuda")
    np_a = np.random.randint(low=-128, high=127, size=(n, lanes))
    np_b = np.random.randint(low=-128, high=127, size=(n, lanes))
    np_c = np.random.randint(low=0, high=127, size=(n, ))
    np_d = [sum(x * y) + z for x, y, z in zip(np_a, np_b, np_c)]
    ctx = tvm.gpu(0)
    a = tvm.nd.empty((n, ), A.dtype, ctx).copyfrom(np_a)
    b = tvm.nd.empty((n, ), B.dtype, ctx).copyfrom(np_b)
    c = tvm.nd.empty((n, ), C.dtype, ctx).copyfrom(np_c)
    d = tvm.nd.empty((n, ), D.dtype, ctx)
    fun(a, b, c, d)
    tvm.testing.assert_allclose(d.asnumpy(), np_d)
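# Hedged driver (the concrete shapes below are an assumption about how the
# original test invokes check_cuda).
check_cuda("int8", 64, 4)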
def intrin_func(ins, outs):
    xx, yy = ins
    zz = outs[0]
    ib = tvm.ir_builder.create()
    dp4a = zz.vstore(0, tvm.call_pure_extern('int32', '__dp4a',
                                             xx.vload(0, dtype='int8x4'),
                                             yy.vload(0, dtype='int8x4'),
                                             zz.vload(0)))
    ib.emit(dp4a)
    body = ib.get()
    # (body, reset, update) triple expected by decl_tensor_intrin / tensorize.
    return body, zz.vstore(0, 0), body
def _instr(index):
    # ins and outs are captured from the enclosing intrin_func closure;
    # index selects among body (0), reset (1) and update (2).
    xx, yy = ins
    zz = outs[0]
    if index == 1:
        return zz.vstore(0, 0)
    ib = tvm.ir_builder.create()
    vec_x = xx.vload(0, dtype='int8x4')
    vec_y = yy.vload(0, dtype='int8x4')
    prev_z = 0 if index == 0 else zz.vload(0)
    new_z = tvm.call_pure_extern('int32', '__dp4a', vec_x, vec_y, prev_z)
    ib.emit(zz.vstore(0, new_z))
    return ib.get()
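# Minimal declaration sketch (assumed context, modeled on TVM's topi dp4a
# helper, not taken from the original snippets): the compute below is the
# 4-element int8 dot product that intrin_func above implements, and
# decl_tensor_intrin packages both so a schedule can tensorize with it.
n = 4
x = tvm.placeholder((n, ), name='x', dtype='int8')
y = tvm.placeholder((n, ), name='y', dtype='int8')
k = tvm.reduce_axis((0, n), name='k')
z = tvm.compute((1, ),
                lambda i: tvm.sum(x[k].astype('int32') * y[k].astype('int32'), axis=k),
                name='z')
dp4a_intrin = tvm.decl_tensor_intrin(z.op, intrin_func)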
def lesson1(): """ The most straight-forward way to call target specific function is via extern function call construct in tvm. In th following example, we use :any:`tvm.call_pure_extern` to call :code:`__expf` function, which is only available under CUDA. """ n = tvm.var("n") A = tvm.placeholder((n,), name='A') B = tvm.compute(A.shape, lambda i: tvm.call_pure_extern("float32", "__expf", A[i]), name="B") s = tvm.create_schedule(B.op) num_thread = 64 bx, tx = s[B].split(B.op.axis[0], factor=num_thread) s[B].bind(bx, tvm.thread_axis("blockIdx.x")) s[B].bind(tx, tvm.thread_axis("threadIdx.x")) f = tvm.build(s, [A, B], "cuda", name="myexp") print(f.imported_modules[0].get_source())
import tvm
import numpy as np
from tvm.contrib import clang, util


def test_llvm_import():
    # extern "C" is necessary to get the correct signature
    cc_code = """
    extern "C" float my_add(float x, float y) {
      return x + y;
    }
    """
    n = 10
    A = tvm.placeholder((n, ), name='A')
    B = tvm.compute((n, ),
                    lambda *i: tvm.call_pure_extern("float32", "my_add", A(*i), 1.0),
                    name='B')

    def check_llvm(use_file):
        if not tvm.module.enabled("llvm"):
            return
        if not clang.find_clang(required=False):
            print("skip because clang is not available")
            return
        temp = util.tempdir()
        ll_path = temp.relpath("temp.ll")
        ll_code = clang.create_llvm(cc_code, output=ll_path)
        s = tvm.create_schedule(B.op)
        if use_file:
            s[B].pragma(s[B].op.axis[0], "import_llvm", ll_path)
        else:
            s[B].pragma(s[B].op.axis[0], "import_llvm", ll_code)
        # BUILD and invoke the kernel.
        f = tvm.build(s, [A, B], "llvm")
        ctx = tvm.cpu(0)
        # launch the kernel.
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        f(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1.0)

    check_llvm(use_file=True)
    check_llvm(use_file=False)
def test_llvm_import():
    # extern "C" is necessary to get the correct signature
    cc_code = """
    extern "C" float my_add(float x, float y) {
      return x + y;
    }
    """
    n = 10
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute((n,),
                    lambda *i: tvm.call_pure_extern("float32", "my_add", A(*i), 1.0),
                    name='B')

    def check_llvm(use_file):
        if not tvm.module.enabled("llvm"):
            return
        if not clang.find_clang(required=False):
            print("skip because clang is not available")
            return
        temp = util.tempdir()
        ll_path = temp.relpath("temp.ll")
        ll_code = clang.create_llvm(cc_code, output=ll_path)
        s = tvm.create_schedule(B.op)
        if use_file:
            s[B].pragma(s[B].op.axis[0], "import_llvm", ll_path)
        else:
            s[B].pragma(s[B].op.axis[0], "import_llvm", ll_code)
        # BUILD and invoke the kernel.
        f = tvm.build(s, [A, B], "llvm")
        ctx = tvm.cpu(0)
        # launch the kernel.
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        f(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1.0)

    check_llvm(use_file=True)
    check_llvm(use_file=False)
# Declare Variable
# n, tgt and tgt_host are assumed to be defined earlier in the original script.
A0 = tvm.placeholder((n, ), name='A0', dtype='float32')
A1 = tvm.placeholder((n, ), name='A1', dtype='float32')
A2 = tvm.placeholder((n, ), name='A2', dtype='float32')
B0 = tvm.placeholder((n, ), name='B0', dtype='float32')
B1 = tvm.placeholder((n, ), name='B1', dtype='float32')
B2 = tvm.placeholder((n, ), name='B2', dtype='float32')
D = tvm.placeholder((n, ), name='D', dtype='float32')

# j is the reduction axis summed over in C0; it is not shown in the original
# snippet, so this definition is an assumption.
j = tvm.reduce_axis((0, n), name='j')

D_ij = lambda i: (A0[i] - B0[j]) * (B0[j] - A0[i]) \
                 + (A1[i] - B1[j]) * (B1[j] - A1[i]) \
                 + (A2[i] - B2[j]) * (B2[j] - A2[i])
K_ij = lambda i: tvm.call_pure_extern("float32", "__expf", D_ij(i))
C0 = tvm.compute((n, ), lambda i: tvm.sum(K_ij(i) * D[j], axis=j), name="C0")

# Schedule the computation
s0 = tvm.create_schedule(C0.op)
bx, tx = s0[C0].split(C0.op.axis[0], factor=192)
s0[C0].bind(bx, tvm.thread_axis("blockIdx.x"))
s0[C0].bind(tx, tvm.thread_axis("threadIdx.x"))

# Actually build the binary
fconv0 = tvm.build(s0, [A0, A1, A2, B0, B1, B2, D, C0], tgt,
                   target_host=tgt_host, name="myconv0")
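# Hypothetical launch sketch (assuming tgt = "cuda"; the concrete size and all
# array names below are assumptions, not from the original snippet).
ctx = tvm.context("cuda", 0)
size = 1024
args_np = [np.random.uniform(size=size).astype("float32") for _ in range(7)]
a0, a1, a2, b0, b1, b2, d = [tvm.nd.array(arr, ctx) for arr in args_np]
c0 = tvm.nd.array(np.zeros(size, dtype="float32"), ctx)
fconv0(a0, a1, a2, b0, b1, b2, d, c0)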
import tvm
import numpy as np

# direct declare extern math call
n = tvm.var('n')
A = tvm.placeholder((n, ), name='A')
B = tvm.compute(A.shape,
                lambda i: tvm.call_pure_extern('float32', '__expf', A[i]),
                name='B')
s = tvm.create_schedule(B.op)
num_thread = 64
bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
s[B].bind(bx, tvm.thread_axis('blockIdx.x'))
s[B].bind(tx, tvm.thread_axis('threadIdx.x'))
f = tvm.build(s, [A, B], 'cuda', name='my_exp')
# print(f.imported_modules[0].get_source())

# unified intrinsic call
n = tvm.var('n')
A = tvm.placeholder((n, ), name='A')
B = tvm.compute(A.shape, lambda i: tvm.exp(A[i]), name='B')
s = tvm.create_schedule(B.op)
num_thread = 64
bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
s[B].bind(bx, tvm.thread_axis('blockIdx.x'))
s[B].bind(tx, tvm.thread_axis('threadIdx.x'))
f_cuda = tvm.build(s, [A, B], 'cuda', name='my_exp')
print(f_cuda.imported_modules[0].get_source())
fopencl = tvm.build(s, [A, B], "opencl", name="myexp")
def reduce_op(x, y):
    # name and args are captured from the enclosing scope in the original source.
    assert x.dtype == y.dtype, \
        "Reducing elements that don't have same data type: %s v.s. %s" % (
            x.dtype, y.dtype)
    return tvm.call_pure_extern(x.dtype, name, x, y, *args[1:])
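# Hedged usage sketch (assumed context, not from the original source): a
# combiner like reduce_op can be wrapped into a commutative reducer for use
# inside tvm.compute; the zero identity below is illustrative only.
my_reduce = tvm.comm_reducer(reduce_op, lambda t: tvm.const(0, dtype=t), name="my_reduce")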
import tvm
import numpy as np

######################################################################
# Direct Declare Extern Math Call
# -------------------------------
# The most straight-forward way to call target specific function is via
# extern function call construct in tvm.
# In the following example, we use :any:`tvm.call_pure_extern` to call
# :code:`__expf` function, which is only available under CUDA.
#
n = tvm.var("n")
A = tvm.placeholder((n, ), name='A')
B = tvm.compute(A.shape,
                lambda i: tvm.call_pure_extern("float32", "__expf", A[i]),
                name="B")
s = tvm.create_schedule(B.op)
num_thread = 64
bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
f = tvm.build(s, [A, B], "cuda", name="myexp")
print(f.imported_modules[0].get_source())

######################################################################
# Unified Intrinsic Call
# ----------------------
# The above code verifies that direct external call can be used to
# call into device specific functions.
# However, the above way only works for CUDA target with float type.
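######################################################################
# A sketch of the step the text above leads into (assumed continuation, not
# part of the original snippet): use the unified intrinsic tvm.exp, which TVM
# lowers to the appropriate device function for each target.
n = tvm.var("n")
A = tvm.placeholder((n, ), name='A')
B = tvm.compute(A.shape, lambda i: tvm.exp(A[i]), name="B")
s = tvm.create_schedule(B.op)
num_thread = 64
bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
fcuda = tvm.build(s, [A, B], "cuda", name="myexp")
print(fcuda.imported_modules[0].get_source())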
import tvm
import numpy as np

######################################################################
# Direct Declare Extern Math Call
# -------------------------------
# The most straight-forward way to call target specific function is via
# extern function call construct in tvm.
# In the following example, we use :any:`tvm.call_pure_extern` to call
# :code:`__expf` function, which is only available under CUDA.
#
n = tvm.var("n")
A = tvm.placeholder((n,), name='A')
B = tvm.compute(A.shape,
                lambda i: tvm.call_pure_extern("float32", "__expf", A[i]),
                name="B")
s = tvm.create_schedule(B.op)
num_thread = 64
bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
f = tvm.build(s, [A, B], "cuda", name="myexp")
print(f.imported_modules[0].get_source())

######################################################################
# Unified Intrinsic Call
# ----------------------
# The above code verifies that direct external call can be used to
# call into device specific functions.
# However, the above way only works for CUDA target with float type.