def test_multilevel_splitting_with_indivisble_factors():
    from tvm import topi

    A = te.placeholder((130,), dtype="float32")
    B = topi.nn.relu(A)
    s = te.create_schedule(B.op)
    (y,) = s[B].op.axis
    (yo, yi) = s[B].split(y, factor=8)
    (yoo, yoi) = s[B].split(yo, factor=16)
    s[B].reorder(yoo, yoi, yi)
    s[B].unroll(yi)

    ## But this does the right thing.
    with tvm.transform.PassContext(
        config={"tir.LoopPartition": {"partition_const_loop": True}}
    ):
        lowered_body = tvm.lower(s, [A, B], name="x")["x"].body

        def visit_stmt(op):
            return isinstance(op, tvm.tir.Max)

        num_max = collect_visit(lowered_body, visit_stmt)
        assert num_max.count(True) == 10
def check_cuda(dtype):
    if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
        print("Skip because gpu does not have fp16 support")
        return
    m = 128
    A = te.placeholder((m,), name='A', dtype=dtype)
    B = te.compute((m,), lambda i: A[i // 32 * 32 + (i + 1) % 32], name='B')
    cuda_target = tvm.target.create("cuda")
    assert cuda_target.thread_warp_size == 32
    with cuda_target:
        s = te.create_schedule(B.op)
        AA = s.cache_read(A, "warp", [B])
        xo, xi = s[B].split(B.op.axis[0], 64)
        xi0, xi1 = s[B].split(xi, factor=32)
        tx = te.thread_axis("threadIdx.x")
        s[B].bind(xi1, tx)
        s[B].bind(xo, te.thread_axis("blockIdx.x"))
        s[AA].compute_at(s[B], xo)
        xo, xi = s[AA].split(s[AA].op.axis[0], 32)
        s[AA].bind(xi, tx)

        ctx = tvm.gpu(0)
        func = tvm.build(s, [A, B], "cuda")
        A_np = np.array(list(range(m)), dtype=dtype)
        B_np = np.array(
            list(range(1, 32)) + [0]
            + list(range(33, 64)) + [32]
            + list(range(65, 96)) + [64]
            + list(range(97, 128)) + [96],
            dtype=dtype,
        )
        A_nd = tvm.nd.array(A_np, ctx)
        B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), ctx)
        func(A_nd, B_nd)
        tvm.testing.assert_allclose(B_nd.asnumpy(), B_np, rtol=1e-3)
def test_sort():
    n = 2
    l = 5
    m = 3
    data = te.placeholder((n, l, m), name="data")
    sort_num = te.placeholder((n, m), name="sort_num", dtype="int32")
    axis = 1
    is_ascend = False
    out = te.extern(
        data.shape,
        [data, sort_num],
        lambda ins, outs: tvm.tir.call_packed(
            "tvm.contrib.sort.argsort_nms", ins[0], ins[1], outs[0], axis, is_ascend
        ),
        dtype="int32",
        name="sort_tensor",
    )
    input = [
        [[1, 2, 3], [2, 4.5, 3.5], [1.1, 0.5, 1], [3.2, -5, 0.5], [1.5, 0, 0]],
        [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]],
    ]
    sort_num_input = [[1, 2, 3], [4, 5, 5]]
    sorted_index = [
        [[0, 1, 1], [1, 0, 0], [2, 2, 2], [3, 3, 3], [4, 4, 4]],
        [[3, 4, 4], [2, 3, 3], [1, 2, 2], [0, 1, 1], [4, 0, 0]],
    ]

    ctx = tvm.cpu(0)
    target = "llvm"
    s = te.create_schedule(out.op)
    f = tvm.build(s, [data, sort_num, out], target)
    a = tvm.nd.array(np.array(input).astype(data.dtype), ctx)
    b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx)
    c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx)
    f(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), np.array(sorted_index).astype(out.dtype), rtol=1e-5)
def test_in_bounds_vectorize_llvm():
    n = 512
    lanes = 2
    A = te.placeholder((n,), name="A", dtype="float32x%d" % lanes)
    B = te.compute((n,), lambda i: A[i], name="B")
    C = te.compute((n,), lambda i: B[i] + tvm.tir.const(1, A.dtype), name="C")
    s = te.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], nparts=2)
    _, xi = s[C].split(xi, factor=2)
    s[C].parallel(xo)
    s[C].vectorize(xi)
    s[B].compute_at(s[C], xo)
    xo, xi = s[B].split(B.op.axis[0], factor=2)
    s[B].vectorize(xi)
    # build and invoke the kernel.
    lowered_func = tvm.lower(s, [A, C], "llvm", simple_mode=False)
    f = tvm.build(s, [A, C], "llvm")
    dev = tvm.cpu(0)
    # launch the kernel.
    a = tvm.nd.empty((n,), A.dtype).copyfrom(
        np.random.uniform(size=[n] + ([] if lanes == 1 else [lanes]))
    )
    c = tvm.nd.empty((n,), C.dtype, dev)
    f(a, c)
    tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1)
def check_rfactor_no_reset_multi_reduction(factor, rfactor):
    s = te.create_schedule(C.op)
    x, y = C.op.axis
    rk = C.op.reduce_axis[0]
    yo, yi = s[C].split(y, factor=factor)
    ro, ri = s[C].split(rk, factor=rfactor)
    roo, roi = s[C].split(ro, factor=2)
    s[C].reorder(yo, roo, roi, yi, ri)
    gemv = intrin_gemv_no_reset(factor, rfactor)
    s[C].tensorize(yi, gemv)
    s = s.normalize()
    dom_map = tvm.te.schedule.InferBound(s)
    finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
    out_dom, in_dom = finfer(s[C], dom_map)
    assert tvm.ir.structural_equal(out_dom[x].extent, 1)
    assert tvm.ir.structural_equal(out_dom[y].extent, factor)
    assert tvm.ir.structural_equal(out_dom[y].min, yo * factor)
    fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
    body = fmatch(s[C], out_dom, in_dom, gemv)
    ana = tvm.arith.Analyzer()
    assert tvm.ir.structural_equal(ana.simplify(body[0]), ana.simplify(gemv.op.body[0]))
    stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
    tvm.lower(s, [A, B, C])
def test_tensor_scalar():
    # test te with scalar shape
    a = np.array(np.random.uniform(size=(1))[0], "float32")
    b = np.array(0.0, "float32")

    @tvm.register_func("tvm.test_tensor_scalar_copy")
    def mycopy(x, y):
        x.copyto(y)

    A = te.placeholder(a.shape, name="A")
    B = te.extern(
        a.shape,
        [A],
        lambda ins, outs: tvm.tir.call_packed("tvm.test_tensor_scalar_copy", ins[0], outs[0]),
        name="B",
    )
    s = te.create_schedule(B.op)
    f = tvm.build(s, [A, B], "llvm")
    ta = tvm.nd.array(a)
    tb = tvm.nd.array(b)
    f(ta, tb)
    tvm.testing.assert_allclose(ta.numpy(), tb.numpy())
def mod(self, target, load_type, store_type, indirect_indices):
    target = tvm.target.Target(target)

    n = 4
    dtype = "int32"
    A = te.placeholder((n,), dtype=dtype, name="A")
    R = te.placeholder((n,), dtype=dtype, name="R")

    def do_compute(ins, outs):
        ib = tvm.tir.ir_builder.create()
        A, R = map(ib.buffer_ptr, ins)
        B = ib.buffer_ptr(outs[0])

        if "gpu" in target.keys:
            ib.scope_attr(te.thread_axis("blockIdx.x"), "thread_extent", 0)

        index_map = {
            "ramp": tvm.tir.Ramp(0, 1, 4),
            "broadcast": tvm.tir.Broadcast(0, 4),
        }

        load_index = index_map[load_type]
        store_index = index_map[store_type]

        if indirect_indices:
            load_index = tvm.tir.expr.Load("int32x4", R, load_index)

        transfer = tvm.tir.expr.Load("int32x4", A, load_index)
        ib.emit(tvm.tir.stmt.Store(B, transfer, store_index))

        return ib.get()

    B = te.extern(A.shape, [A, R], do_compute, dtype="int32")
    s = te.create_schedule(B.op)

    return tvm.lower(s, [A, R, B])
def test_large_input():
    @tvm.hybrid.script
    def compute(a, b):
        n = 16384
        c = output_tensor((n, n), 'int32')
        for i in range(n):
            for j in range(n):
                c[i, j] = a[i, j] - b[i, j]
        return c

    n = 16384
    shape = (n, n)
    a = te.placeholder(shape, name='a', dtype='int32')
    b = te.placeholder(shape, name='b', dtype='int32')
    c = te.compute(shape, lambda i, j: compute(a, b)[i, j])
    c = te.compute(shape, lambda i, j: 1 + c[i, j])
    s = te.create_schedule(c.op)
    stmt = tvm.lower(s, [a, b, c], simple_mode=True)

    def verify(n):
        if isinstance(n, tvm.tir.Allocate):
            assert n.extents[0].value == 268435456

    tvm.tir.ir_pass.PostOrderVisit(stmt, verify)
def try_warp_memory():
    """Skipped in the default test suite because it requires a higher GPU arch."""
    m = 128
    A = te.placeholder((m,), name='A')
    B = te.compute((m,), lambda i: A[i] + 3, name='B')
    warp_size = 32
    s = te.create_schedule(B.op)
    AA = s.cache_read(A, "warp", [B])
    xo, xi = s[B].split(B.op.axis[0], warp_size * 2)
    xi0, xi1 = s[B].split(xi, factor=warp_size)
    tx = te.thread_axis("threadIdx.x")
    s[B].bind(xi1, tx)
    s[B].bind(xo, te.thread_axis("blockIdx.x"))
    s[AA].compute_at(s[B], xo)
    xo, xi = s[AA].split(s[AA].op.axis[0], warp_size)
    s[AA].bind(xi, tx)

    @tvm.register_func
    def tvm_callback_cuda_compile(code):
        ptx = nvcc.compile_cuda(code, target="ptx")
        return ptx

    # one line to build the function.
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("skip because %s is not enabled.." % device)
            return
        f = tvm.build(s, [A, B], device)
        a = tvm.nd.array((np.random.uniform(size=m) * 256).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(m, dtype=B.dtype), ctx)
        f(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 3, rtol=1e-6)

    check_device("cuda")
def test_large_input():
    @te.hybrid.script
    def compute(a, b):
        n = 16384
        c = output_tensor((n, n), "int32")
        for i in range(n):
            for j in range(n):
                c[i, j] = a[i, j] - b[i, j]
        return c

    n = 16384
    shape = (n, n)
    a = te.placeholder(shape, name="a", dtype="int32")
    b = te.placeholder(shape, name="b", dtype="int32")
    c = te.compute(shape, lambda i, j: compute(a, b)[i, j])
    c = te.compute(shape, lambda i, j: 1 + c[i, j])
    s = te.create_schedule(c.op)
    stmt = tvm.lower(s, [a, b, c])["main"].body

    def verify(n):
        if isinstance(n, tvm.tir.Allocate):
            assert n.extents[0].value == 268435456

    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
def test_storage_share_gpu():
    m = te.var('m')
    A = [te.placeholder((m), name='A')]
    num_stage = 5
    for t in range(num_stage):
        A.append(te.compute((m,), lambda i: A[-1][i] + (t + 1), name='A%d_s' % t))
        A.append(te.compute((m,), lambda i: A[-1][i], name='A%d' % t))
    s = te.create_schedule(A[-1].op)
    for t in range(num_stage):
        x = A[2 * t + 2].op.axis[0]
        bx, tx = s[A[2 * t + 2]].split(x, factor=32)
        s[A[2 * t + 2]].bind(bx, te.thread_axis("blockIdx.x"))
        s[A[2 * t + 2]].bind(tx, te.thread_axis("threadIdx.x"))
        s[A[2 * t + 1]].compute_at(s[A[2 * t + 2]], tx)
        s[A[2 * t + 1]].set_scope("shared")

    bounds = tvm.te.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
    Ab = tvm.tir.decl_buffer(A[0].shape, A[0].dtype, name='A')
    Bb = tvm.tir.decl_buffer(A[0].shape, A[0].dtype, name='B')
    stmt = tvm.tir.ir_pass.StorageFlatten(stmt, {A[0]: Ab, A[-1]: Bb}, 64)
    stmt = tvm.tir.ir_pass.CanonicalSimplify(stmt)
    stmt = tvm.tir.ir_pass.Simplify(stmt)
    stmt = tvm.tir.ir_pass.StorageRewrite(stmt)
    alloc_stats = {"global": 0, "shared": 0}

    def verify(n):
        if isinstance(n, tvm.tir.AttrStmt):
            if n.attr_key == "storage_scope":
                alloc_stats[n.value.value] += 1

    tvm.tir.ir_pass.PostOrderVisit(stmt, verify)
    assert alloc_stats["global"] == 2
    assert alloc_stats["shared"] == num_stage
def test_tensor_intrin_scalar_params():
    n = te.size_var("n")
    x = te.placeholder((n,), name='x')
    v = te.size_var("v")
    w = te.size_var("w")
    z = te.compute((n,), lambda i: x[i] * v + w, name='z')

    def intrin_func(ins, outs, sp):
        assert isinstance(ins[0], tvm.te.schedule.Buffer)
        assert ins[0].shape[0] == n
        assert sp[0] == v
        assert sp[1] == w
        return tvm.tir.call_packed("hw_func", ins[0].data, outs[0].data, sp[0], sp[1])

    intrin = te.decl_tensor_intrin(
        z.op, intrin_func, scalar_params=[v, w], default_buffer_params={"offset_factor": 1}
    )
    assert intrin.op == z.op
    assert intrin.reduce_init is None
    assert tuple(intrin.inputs) == tuple(z.op.input_tensors)
    assert intrin.buffers[0].shape[0] == n
    assert tuple(intrin.scalar_params) == tuple((v, w))

    A = te.placeholder((10, 10), name='A')
    # Pass scalar inputs to the TensorIntrin, interleaved with tensor inputs
    C = te.compute((10, 10), lambda i, j: intrin(i * i, A[i, j], i + j), name="C")
    s = te.create_schedule(C.op)
    stmt = tvm.lower(s, [A, C])["main"].body
    assert isinstance(stmt.body.body, tvm.tir.Evaluate)
    assert len(stmt.body.body.value.args) == 5
    assert str(stmt.body.body.value.args[3]) == "(i: int32*i)"
    assert str(stmt.body.body.value.args[4]) == "(i: int32 + j: int32)"
def check_value(expr, vx, vy, data, fref):
    n = len(data)
    A = te.placeholder((n,), name="A", dtype=expr.dtype)
    B = te.placeholder((n,), name="B", dtype=expr.dtype)

    def make_binds(i):
        x = expr
        x = tvm.tir.Let(vx, A[i], x)
        x = tvm.tir.Let(vy, B[i], x)
        return x

    C = te.compute((n,), make_binds)
    s = te.create_schedule([C.op])

    if not tvm.runtime.enabled("llvm"):
        return

    f = tvm.build(s, [A, B, C], "llvm")
    a = tvm.nd.array(np.array([x for x, y in data], dtype=expr.dtype))
    b = tvm.nd.array(np.array([y for x, y in data], dtype=expr.dtype))
    c = tvm.nd.array(np.zeros(len(data), dtype=expr.dtype))
    f(a, b, c)
    cref = np.array([fref(x, y) for x, y in data])
    np.testing.assert_equal(c.asnumpy(), cref)
def test_matmul():
    n = 1024
    l = 128
    m = 235
    A = te.placeholder((n, l), name="A")
    B = te.placeholder((l, m), name="B")
    C = rocblas.matmul(A, B)
    s = te.create_schedule(C.op)

    def verify(target="rocm"):
        if not tvm.get_global_func("tvm.contrib.rocblas.matmul", True):
            print("skip because extern function is not available")
            return
        ctx = tvm.rocm(0)
        f = tvm.build(s, [A, B, C], target)
        a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
        f(a, b, c)
        tvm.testing.assert_allclose(c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5)

    verify()
def test_inline_multi_reduce():
    def argmax_comp(x, y):
        idx = tvm.tir.Select((x[1] >= y[1]), x[0], y[0])
        val = tvm.tir.Select((x[1] >= y[1]), x[1], y[1])
        return idx, val

    def argmax_init(idx_typ, val_typ):
        return tvm.tir.const(-1, idx_typ), tvm.te.min_value(val_typ)

    argmax = te.comm_reducer(argmax_comp, argmax_init, name="argmax")
    m = te.var("m")
    n = te.var("n")
    val = te.placeholder((m, n), name="val", dtype="float32")
    val1 = te.compute((m, n), lambda i, j: val[i, j] + 1, name="val1")
    val2 = te.compute((m, n), lambda i, j: te.exp(val1[i, j]), name="val2")
    k = te.reduce_axis((0, n), "k")
    T_idx, T_val = te.compute((m,), lambda i: argmax((k.var, val2[i, k]), axis=k), name="T")
    s = te.create_schedule(T_idx.op)
    s[val1].compute_inline()
    s = s.normalize()
    bounds = tvm.te.schedule.InferBound(s)
    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
def test_large_uint_imm():
    value = (1 << 63) + 123
    other = tvm.tir.const(3, "uint64")
    n = 12
    num_thread = 2

    A = te.compute((n,), lambda *i: tvm.tir.const(value, "uint64") + other, name='A')
    s = te.create_schedule(A.op)
    xo, xi = s[A].split(A.op.axis[0], factor=num_thread)
    s[A].bind(xi, te.thread_axis("threadIdx.x"))
    s[A].bind(xo, te.thread_axis("blockIdx.x"))

    def check_target(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            return
        f = tvm.build(s, [A], device)
        # launch the kernel.
        a = tvm.nd.empty((n,), dtype=A.dtype, ctx=ctx)
        f(a)
        assert a.asnumpy()[0] == value + 3

    check_target("cuda")
    check_target("vulkan")
def test_multi_kernel():
    # graph
    n = tvm.runtime.convert(1024)
    A = te.placeholder((n,), name='A')
    B = te.placeholder((n,), name='B')
    C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    D = te.compute(A.shape, lambda *i: A(*i) + C(*i), name='D')
    s = te.create_schedule(D.op)
    # create iter var and assign them tags.
    px, x = s[C].split(C.op.axis[0], nparts=1)
    s[C].bind(px, te.thread_axis("pipeline"))
    px, x = s[D].split(D.op.axis[0], nparts=1)
    s[D].bind(px, te.thread_axis("pipeline"))

    # one line to build the function.
    def check_device(device, host="llvm"):
        if not tvm.runtime.enabled(host):
            return
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            return
        fadd = tvm.build(s, [A, B, C, D], device, host, name="myadd")
        ctx = tvm.context(device, 0)
        # launch the kernel.
        n = 1024
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.random.uniform(size=n).astype(C.dtype), ctx)
        d = tvm.nd.array(np.random.uniform(size=n).astype(D.dtype), ctx)
        fadd(a, b, c, d)
        tvm.testing.assert_allclose(d.asnumpy(), a.asnumpy() * 2 + b.asnumpy(), rtol=1e-5)

    check_device("sdaccel")
    check_device("aocl_sw_emu")
def test_matmul():
    n = 1024
    l = 128
    m = 256
    A = te.placeholder((n, l), name="A")
    B = te.placeholder((l, m), name="B")
    C = mps.matmul(A, B)
    D = te.compute(C.shape, lambda *i: C(*i) + 1.0)
    s = te.create_schedule(D.op)
    yo, xo = D.op.axis
    block_y = te.thread_axis("blockIdx.y")
    block_x = te.thread_axis("blockIdx.x")
    thread_y = te.thread_axis("threadIdx.y")
    thread_x = te.thread_axis("threadIdx.x")
    by, ty = s[D].split(yo, factor=16)
    bx, tx = s[D].split(xo, factor=16)
    s[D].bind(by, block_y)
    s[D].bind(bx, block_x)
    s[D].bind(ty, thread_y)
    s[D].bind(tx, thread_x)

    def verify(A, B, D, s, target="metal"):
        if not tvm.get_global_func("tvm.contrib.mps.matmul", True):
            print("skip because extern function is not available")
            return
        dev = tvm.metal(0)
        f = tvm.build(s, [A, B, D], "metal")
        a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), dev)
        b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), dev)
        c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev)
        f(a, b, c)
        tvm.testing.assert_allclose(c.numpy(), np.dot(a.numpy(), b.numpy()) + 1, rtol=1e-5)

    verify(A, B, D, s)
def verify_dense_sw(batch, in_dim, out_dim, use_bias=True, dtype='float32'):
    nonzeros = te.var('nonzeros')
    A = te.placeholder((batch, in_dim), dtype=dtype, name='A')
    B = tvmsp.placeholder(shape=(out_dim, in_dim), nonzeros=nonzeros, dtype=dtype, name='B')
    C = te.placeholder((out_dim,), dtype=dtype, name='C')
    D = topi.sparse.dense(A, B, C if use_bias else None)
    s = te.create_schedule(D.op)

    # get the test data
    def get_ref_data():
        mag = 10.
        a_np = (mag * (np.random.uniform(size=(batch, in_dim)).astype('float32') - .5)).astype(dtype)
        b_np = np.maximum(
            mag * (np.random.uniform(size=(out_dim, in_dim)).astype('float32') - 0.5), 0.
        ).astype(dtype)
        c_np = (mag * (np.random.uniform(size=(out_dim,)).astype('float32') - .5)).astype(dtype)
        if use_bias:
            d_np = np.dot(a_np, b_np.T) + c_np
        else:
            d_np = np.dot(a_np, b_np.T)
        return (a_np, b_np, c_np, d_np)

    a_np, b_np, c_np, d_np = get_ref_data()

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        a = tvm.nd.array(a_np, ctx)
        b = tvmsp.array(b_np, ctx)
        c = tvm.nd.array(c_np, ctx)
        d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx)
        f = tvm.build(s, [A, B.data, B.indices, B.indptr, C, D], device, name="dense")
        f(a, b.data, b.indices, b.indptr, c, d)
        tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-4, atol=1e-4)

    check_device('llvm')
def test_add():
    """Test a module which performs addition."""
    if not tvm.runtime.enabled("micro_dev"):
        return
    shape = (1024,)
    dtype = "float32"

    reset_gdbinit()

    # Construct TVM expression.
    tvm_shape = tvm.runtime.convert(shape)
    A = te.placeholder(tvm_shape, name="A", dtype=dtype)
    B = te.placeholder(tvm_shape, name="B", dtype=dtype)
    C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C")
    s = te.create_schedule(C.op)

    func_name = "fadd"
    c_mod = tvm.build(s, [A, B, C], target="c", name=func_name)

    with micro.Session(DEV_CONFIG_A) as sess:
        micro_mod = micro.create_micro_mod(c_mod, DEV_CONFIG_A)
        micro_func = micro_mod[func_name]
        ctx = tvm.micro_dev(0)

        a_np = np.random.uniform(size=shape).astype(dtype)
        a = tvm.nd.array(a_np, ctx)
        b_np = np.random.uniform(size=shape).astype(dtype)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx)
        micro_func(a, b, c)

        # ensure inputs weren't corrupted
        tvm.testing.assert_allclose(a.asnumpy(), a_np)
        tvm.testing.assert_allclose(b.asnumpy(), b_np)
        # ensure output is correct
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
def test_cuda_const_float_to_half():
    # This import is required to use nvcc to perform code gen;
    # otherwise it is found that the code gen is done by nvrtc.
    from tvm import autotvm

    shape = (2, 3, 4)
    a = te.placeholder(shape, dtype="float16", name="a")
    b = tvm.tir.const(0.5, dtype="float16")
    c = te.compute(shape, lambda i, j, k: a[i, j, k] > b, name="c")
    s = te.create_schedule(c.op)
    axes = [axis for axis in c.op.axis]
    fused = s[c].fuse(*axes)
    bx, tx = s[c].split(fused, factor=64)
    s[c].bind(bx, te.thread_axis("blockIdx.x"))
    s[c].bind(tx, te.thread_axis("threadIdx.x"))

    func = tvm.build(s, [a, c], "cuda")
    dev = tvm.gpu(0)
    a_np = np.random.uniform(size=shape).astype(a.dtype)
    c_np = np.zeros(shape=shape, dtype=c.dtype)
    a = tvm.nd.array(a_np, dev)
    c = tvm.nd.array(c_np, dev)
    func(a, c)
    np.testing.assert_equal(c.asnumpy(), a_np > b.value)
def test_sort_by_key_gpu():
    size = 6
    keys = te.placeholder((size,), name="keys", dtype="int32")
    values = te.placeholder((size,), name="values", dtype="int32")

    for target in ["cuda", "nvptx", "opencl", "rocm"]:
        if not tvm.testing.device_enabled(target):
            print("Skip because %s is not enabled" % target)
            continue

        with tvm.target.Target(target):
            keys_out, values_out = sort_by_key(keys, values)

        ctx = tvm.context(target)
        s = te.create_schedule([keys_out.op, values_out.op])
        f = tvm.build(s, [keys, values, keys_out, values_out], target)

        keys_np = np.array([1, 4, 2, 8, 2, 7], np.int32)
        values_np = np.random.randint(0, 10, size=(size,)).astype(np.int32)
        keys_np_out = np.zeros(keys_np.shape, np.int32)
        values_np_out = np.zeros(values_np.shape, np.int32)
        keys_in = tvm.nd.array(keys_np, ctx)
        values_in = tvm.nd.array(values_np, ctx)
        keys_out = tvm.nd.array(keys_np_out, ctx)
        values_out = tvm.nd.array(values_np_out, ctx)
        f(keys_in, values_in, keys_out, values_out)

        ref_keys_out = np.sort(keys_np)
        ref_values_out = np.array([values_np[i] for i in np.argsort(keys_np)])
        tvm.testing.assert_allclose(keys_out.asnumpy(), ref_keys_out, rtol=1e-5)
        tvm.testing.assert_allclose(values_out.asnumpy(), ref_values_out, rtol=1e-5)
def check_correct_assembly(dtype):
    n = (1024,)
    A = te.placeholder(n, dtype=dtype, name='A')
    B = te.compute(
        A.shape,
        lambda i: tvm.tir.Select(
            A[i] >= 0, A[i] + tvm.tir.const(1, dtype), tvm.tir.const(0, dtype)
        ),
        name='B',
    )
    s = te.create_schedule(B.op)

    (bx, tx) = s[B].split(s[B].op.axis[0], factor=128)
    (tx, vx) = s[B].split(tx, factor=4)
    s[B].bind(bx, te.thread_axis("blockIdx.x"))
    s[B].bind(tx, te.thread_axis("threadIdx.x"))
    s[B].vectorize(vx)
    f = tvm.build(s, [A, B], target)

    # Verify we generate the boolx4 type declaration and the OpSelect
    # v4{float,half,int} instruction
    assembly = f.imported_modules[0].get_source()
    matches = re.findall("%v4bool = OpTypeVector %bool 4", assembly)
    assert len(matches) == 1
    matches = re.findall("OpSelect %v4.*", assembly)
    assert len(matches) == 1
def test_scan_inline2():
    m = te.var("m")
    n = te.var("n")
    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
    s_state1 = te.placeholder((m, n))
    s_state2 = te.placeholder((m, n))
    s_init1 = te.compute((1, n), lambda _, i: x[0, i])
    s_init2 = te.compute((1, n), lambda _, i: x[0, i])
    s_xx = te.compute((m, n), lambda t, i: s_state1[t - 1, i] + x[t, i], name="xx")
    s_x1 = te.compute((m, n), lambda t, i: s_xx[t, i] + 1, name="x1")
    s_x2 = te.compute((m, n), lambda t, i: s_xx[t, i] + s_state2[t - 1, 2], name="x2")
    s_update1 = te.compute((m, n), lambda t, i: s_x1[t, i], "u1")
    s_update2 = te.compute((m, n), lambda t, i: s_x2[t, i], "u2")
    res1, res2 = tvm.te.scan(
        [s_init1, s_init2], [s_update1, s_update2], [s_state1, s_state2]
    )
    s = te.create_schedule(res1.op)
    s[s_xx].compute_inline()
    s[s_x1].compute_inline()
    s[s_x2].compute_inline()
    stmt = tvm.lower(s, [x, res1, res2])
def test_single_likely():
    n = 60
    A = te.placeholder((n,), name="A")
    B = te.placeholder((n,), name="B")
    T = te.compute((n,), lambda i: A[i] + B[i])
    s = te.create_schedule(T.op)
    x = T.op.axis[0]
    xo, xi = s[T].split(x, factor=16)

    bounds = tvm.te.schedule.InferBound(s)
    stmt = tvm.te.schedule.ScheduleOps(s, bounds)

    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt))
    with tvm.transform.PassContext(
        config={"tir.LoopPartition": {"partition_const_loop": True}}
    ):
        mod = tvm.tir.transform.LoopPartition()(mod)
        stmt = tvm.tir.transform.Simplify()(mod)["main"].body

    assert not any(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse)))
def test_tensor_core_batch_conv():
    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
        print("skip because gpu does not support tensor core")
        return

    # The sizes of inputs and filters
    batch_size = 32
    height = 14
    width = 14
    in_channels = 32
    out_channels = 64
    kernel_h = 3
    kernel_w = 3
    pad_h = 1
    pad_w = 1
    stride_h = 1
    stride_w = 1
    block_size = 16

    block_row_warps = 2
    block_col_warps = 4
    warp_row_tiles = 4
    warp_col_tiles = 2
    warp_size = 32
    chunk = 2

    # Input feature map: (N, H, W, IC, n, ic)
    data_shape = (batch_size // block_size, height, width,
                  in_channels // block_size, block_size, block_size)
    # Kernel: (H, W, IC, OC, ic, oc)
    kernel_shape = (kernel_h, kernel_w, in_channels // block_size,
                    out_channels // block_size, block_size, block_size)
    # Output feature map: (N, H, W, OC, n, oc)
    output_shape = (batch_size // block_size, height, width,
                    out_channels // block_size, block_size, block_size)

    assert batch_size % block_size == 0
    assert in_channels % block_size == 0
    assert out_channels % block_size == 0

    kh = te.reduce_axis((0, kernel_h), name='kh')
    kw = te.reduce_axis((0, kernel_w), name='kw')
    ic = te.reduce_axis((0, in_channels // block_size), name='ic')
    ii = te.reduce_axis((0, block_size), name='ii')

    # Algorithm
    A = te.placeholder(data_shape, name='A', dtype="float16")
    W = te.placeholder(kernel_shape, name='W', dtype="float16")
    Apad = te.compute(
        (batch_size // block_size, height + 2 * pad_h, width + 2 * pad_w,
         in_channels // block_size, block_size, block_size),
        lambda n, h, w, i, nn, ii: tvm.tir.if_then_else(
            tvm.tir.all(h >= pad_h, h - pad_h < height, w >= pad_w, w - pad_w < width),
            A[n, h - pad_h, w - pad_w, i, nn, ii],
            tvm.tir.const(0., "float16")),
        name='Apad')
    Conv = te.compute(
        output_shape,
        lambda n, h, w, o, nn, oo: te.sum(
            Apad[n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype("float32")
            * W[kh, kw, ic, o, ii, oo].astype("float32"),
            axis=[ic, kh, kw, ii]),
        name="Conv")

    s = te.create_schedule(Conv.op)
    s[Apad].compute_inline()

    AS = s.cache_read(Apad, 'shared', [Conv])
    WS = s.cache_read(W, 'shared', [Conv])
    AF = s.cache_read(AS, 'wmma.matrix_a', [Conv])
    WF = s.cache_read(WS, 'wmma.matrix_b', [Conv])
    ConvF = s.cache_write(Conv, 'wmma.accumulator')

    block_x = te.thread_axis('blockIdx.x')
    block_y = te.thread_axis('blockIdx.y')
    block_z = te.thread_axis('blockIdx.z')
    thread_x = te.thread_axis('threadIdx.x')
    thread_y = te.thread_axis('threadIdx.y')
    thread_z = te.thread_axis('threadIdx.z')

    nc, hc, wc, oc, nnc, ooc = Conv.op.axis
    block_k = s[Conv].fuse(hc, wc)
    s[Conv].bind(block_k, block_z)
    nc, nci = s[Conv].split(nc, factor=warp_row_tiles)
    block_i, nc = s[Conv].split(nc, factor=block_row_warps)
    oc, oci = s[Conv].split(oc, factor=warp_col_tiles)
    block_j, oc = s[Conv].split(oc, factor=block_col_warps)
    s[Conv].reorder(block_k, block_i, block_j, nc, oc, nci, oci, nnc, ooc)
    s[Conv].bind(block_i, block_x)
    s[Conv].bind(block_j, block_y)
    s[Conv].bind(nc, thread_y)
    s[Conv].bind(oc, thread_z)

    s[ConvF].compute_at(s[Conv], oc)
    n, h, w, o, nnf, oof = ConvF.op.axis
    ko, ki = s[ConvF].split(ic, factor=chunk)
    s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii)

    s[AF].compute_at(s[ConvF], kw)
    s[WF].compute_at(s[ConvF], kw)

    s[WS].compute_at(s[ConvF], kh)
    s[AS].compute_at(s[ConvF], kh)

    n, h, w, i, nn, ii = AS.op.axis
    tx, xo = s[AS].split(n, nparts=block_row_warps)
    ty, yo = s[AS].split(xo, nparts=block_col_warps)
    t = s[AS].fuse(nn, ii)
    to, ti = s[AS].split(t, factor=warp_size)
    s[AS].bind(tx, thread_y)
    s[AS].bind(ty, thread_z)
    s[AS].bind(ti, thread_x)

    kh, kw, ic, o, ii, oo = WS.op.axis
    tx, xo = s[WS].split(o, nparts=block_row_warps)
    ty, yo = s[WS].split(xo, nparts=block_col_warps)
    t = s[WS].fuse(ii, oo)
    to, ti = s[WS].split(t, nparts=warp_size)
    s[WS].bind(tx, thread_y)
    s[WS].bind(ty, thread_z)
    s[WS].bind(to, thread_x)
    s[WS].vectorize(ti)

    s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix((16, 16, 16), 'wmma.matrix_a'))
    s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix((16, 16, 16), 'wmma.matrix_b'))
    s[Conv].tensorize(nnc, intrin_wmma_store_matrix((16, 16, 16)))
    s[ConvF].tensorize(nnf, intrin_wmma_gemm((16, 16, 16)))

    func = tvm.build(s, [A, W, Conv], 'cuda')

    ctx = tvm.gpu(0)
    a_np = np.random.uniform(size=data_shape).astype(A.dtype)
    w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), ctx)
    evaluator = func.time_evaluator(func.entry_name, ctx, number=3)
    print('conv2d with tensor core: %f ms' % (evaluator(a, w, c).mean * 1e3))

    if VERIFY:
        func(a, w, c)
        a_np = a_np.transpose(0, 4, 1, 2, 3, 5).reshape(batch_size, height, width, in_channels)
        w_np = w_np.transpose(0, 1, 2, 4, 3, 5).reshape(kernel_h, kernel_w, in_channels, out_channels)
        c_np = c.asnumpy().transpose((0, 4, 1, 2, 3, 5)).reshape(
            batch_size, height, width, out_channels)
        c_std = conv2d_nhwc_python(
            a_np.astype(Conv.dtype), w_np.astype(Conv.dtype),
            (stride_h, stride_w), (pad_h, pad_w)).astype(Conv.dtype)
        np.testing.assert_allclose(c_np, c_std, rtol=1e-4, atol=1e-4)
def test_tensor_core_batch_matmal():
    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
        print("skip because gpu does not support tensor core")
        return

    batch_size = 4
    n = 512
    m, l = n, n
    assert n % 32 == 0
    assert m % 8 == 0
    assert l % 16 == 0
    nn, mm, ll = n // 32, m // 8, l // 16
    A = te.placeholder((batch_size, nn, ll, 32, 16), name='A', dtype='float16')
    B = te.placeholder((batch_size, ll, mm, 16, 8), name='B', dtype='float16')
    k1 = te.reduce_axis((0, ll), name='k1')
    k2 = te.reduce_axis((0, 16), name='k2')
    C = te.compute(
        (batch_size, nn, mm, 32, 8),
        lambda b, i, j, ii, jj: te.sum(
            A[b, i, k1, ii, k2].astype('float') * B[b, k1, j, k2, jj].astype('float'),
            axis=[k1, k2]),
        name='Fragment_C')
    s = te.create_schedule(C.op)

    warp_size = 32
    kernel_size = 16
    block_row_warps = 2
    block_col_warps = 4
    warp_row_tiles = 4
    warp_col_tiles = 2
    chunk = 4

    block_x = te.thread_axis('blockIdx.x')
    block_y = te.thread_axis('blockIdx.y')
    block_z = te.thread_axis('blockIdx.z')
    thread_x = te.thread_axis('threadIdx.x')
    thread_y = te.thread_axis('threadIdx.y')
    thread_z = te.thread_axis('threadIdx.z')

    AS = s.cache_read(A, 'shared', [C])
    BS = s.cache_read(B, 'shared', [C])
    AF = s.cache_read(AS, 'wmma.matrix_a', [C])
    BF = s.cache_read(BS, 'wmma.matrix_b', [C])
    CF = s.cache_write(C, 'wmma.accumulator')

    b, i, j, kernel_i, kernel_j = s[C].op.axis
    i, ii = s[C].split(i, factor=warp_row_tiles)
    block_i, i = s[C].split(i, factor=block_row_warps)
    j, jj = s[C].split(j, factor=warp_col_tiles)
    block_j, j = s[C].split(j, factor=block_col_warps)
    s[C].reorder(block_i, block_j, i, j, ii, jj, kernel_i, kernel_j)
    s[C].bind(b, block_z)
    s[C].bind(block_i, block_x)
    s[C].bind(block_j, block_y)
    s[C].bind(i, thread_y)
    s[C].bind(j, thread_z)

    s[CF].compute_at(s[C], j)
    b, warp_i, warp_j, _i, _j = s[CF].op.axis
    k, _k = CF.op.reduce_axis
    ko, ki = s[CF].split(k, factor=chunk)
    s[CF].reorder(ko, ki, warp_i, warp_j, _i, _j, _k)

    s[AF].compute_at(s[CF], ki)
    s[BF].compute_at(s[CF], ki)

    s[AS].compute_at(s[CF], ko)
    b, xo, yo, xi, yi = AS.op.axis
    tx, xo = s[AS].split(xo, nparts=block_row_warps)
    ty, yo = s[AS].split(yo, nparts=block_col_warps)
    t = s[AS].fuse(xi, yi)
    to, ti = s[AS].split(t, nparts=warp_size)
    s[AS].bind(tx, thread_y)
    s[AS].bind(ty, thread_z)
    s[AS].bind(to, thread_x)

    s[BS].compute_at(s[CF], ko)
    b, xo, yo, xi, yi = BS.op.axis
    tx, xo = s[BS].split(xo, nparts=block_row_warps)
    ty, yo = s[BS].split(yo, nparts=block_col_warps)
    t = s[BS].fuse(xi, yi)
    to, ti = s[BS].split(t, nparts=warp_size)
    s[BS].bind(tx, thread_y)
    s[BS].bind(ty, thread_z)
    s[BS].bind(to, thread_x)

    s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix((32, 8, 16), 'wmma.matrix_a'))
    s[BF].tensorize(BF.op.axis[-2], intrin_wmma_load_matrix((32, 8, 16), 'wmma.matrix_b'))
    s[C].tensorize(kernel_i, intrin_wmma_store_matrix((32, 8, 16)))
    s[CF].tensorize(_i, intrin_wmma_gemm((32, 8, 16)))

    func = tvm.build(s, [A, B, C], 'cuda')

    ctx = tvm.gpu(0)
    a_np = np.random.uniform(size=(batch_size, nn, ll, 32, 16)).astype(A.dtype)
    b_np = np.random.uniform(size=(batch_size, ll, mm, 16, 8)).astype(B.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros((batch_size, nn, mm, 32, 8), dtype=C.dtype), ctx)
    func(a, b, c)
    evaluator = func.time_evaluator(func.entry_name, ctx, number=3)
    print('gemm with tensor core: %f ms' % (evaluator(a, b, c).mean * 1e3))

    if VERIFY:
        func(a, b, c)
        a_np = a_np.transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
        b_np = b_np.transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
        c_np = c.asnumpy().transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
        np.testing.assert_allclose(
            c_np, np.matmul(a_np.astype(C.dtype), b_np.astype(C.dtype)), rtol=1e-4, atol=1e-4)
def tensor_core_matmul(warp_tile_m=16, m=64, n=32, l=96):
    A = te.placeholder((n, l), name="A", dtype="float16")
    B = te.placeholder((l, m), name="B", dtype="float16")
    k = te.reduce_axis((0, l), name="k")
    C = te.compute(
        (n, m), lambda i, j: te.sum(A[i, k].astype("float32") * B[k, j].astype("float32"), axis=k)
    )
    s = te.create_schedule(C.op)
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    AA = s.cache_read(A, "shared", [C])
    AL = s.cache_read(AA, "local", [C])
    BB = s.cache_read(B, "shared", [C])
    BL = s.cache_read(BB, "local", [C])
    CL = s.cache_write(C, "local")

    bx = 4
    by = 32
    step_k = 8
    v = 4

    TX = 8
    TY = 1
    tile_x = bx * TX
    tile_y = by * TY
    WX = min(warp_tile_m, tile_x)
    tile_k = 16
    vthread = 1

    yo, ty = s[C].split(y, tile_y * vthread)
    vy, ty = s[C].split(ty, tile_y)
    ty, yi = s[C].split(ty, TY)

    xo, xi = s[C].split(x, tile_x)
    tz, xi = s[C].split(xi, WX)
    tx, xi = s[C].split(xi, TX)
    ko, ki = s[CL].split(k, step_k * tile_k)
    kl, ki = s[CL].split(ki, tile_k)

    s[C].reorder(yo, xo, tz, ty, tx, yi, xi)
    s[C].bind(yo, te.thread_axis("blockIdx.y"))
    s[C].bind(xo, te.thread_axis("blockIdx.x"))
    s[C].bind(ty, te.thread_axis("threadIdx.y"))
    s[C].bind(tz, te.thread_axis("threadIdx.z"))
    s[C].bind(tx, te.thread_axis("threadIdx.x"))
    s[C].bind(vy, te.thread_axis((0, vthread), "vthread", name="vy"))
    s[CL].compute_at(s[C], tx)
    yo, xo = CL.op.axis
    s[CL].reorder(ko, kl, ki, yo, xo)

    s[AA].compute_at(s[CL], ko)
    xo, xi = s[AA].split(s[AA].op.axis[1], factor=bx * v)
    tz, tx = s[AA].split(xi, factor=(WX // TX) * v)
    tx, vec = s[AA].split(tx, factor=v)
    fused = s[AA].fuse(s[AA].op.axis[0], xo)
    _, ty = s[AA].split(fused, factor=by)
    s[AA].bind(ty, te.thread_axis("threadIdx.y"))
    s[AA].bind(tz, te.thread_axis("threadIdx.z"))
    s[AA].bind(tx, te.thread_axis("threadIdx.x"))
    s[AA].vectorize(vec)

    s[BB].compute_at(s[CL], ko)
    xo, xi = s[BB].split(s[BB].op.axis[1], factor=bx * v)
    tz, tx = s[BB].split(xi, factor=(WX // TX) * v)
    tx, vec = s[BB].split(tx, factor=v)
    fused = s[BB].fuse(s[BB].op.axis[0], xo)
    _, ty = s[BB].split(fused, factor=by)
    s[BB].bind(ty, te.thread_axis("threadIdx.y"))
    s[BB].bind(tz, te.thread_axis("threadIdx.z"))
    s[BB].bind(tx, te.thread_axis("threadIdx.x"))
    s[BB].vectorize(vec)

    s[AL].compute_at(s[CL], kl)
    s[BL].compute_at(s[CL], kl)

    s[CL].pragma(ko, "tensor_core")

    func = tvm.build(s, [A, B, C], "cuda")

    dev = tvm.cuda(0)
    a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
    b_np = np.random.uniform(size=(l, m)).astype(B.dtype)
    c_np = np.zeros((n, m), dtype=np.float32)
    a = tvm.nd.array(a_np, dev)
    b = tvm.nd.array(b_np, dev)
    c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev)
    func(a, b, c)
    evaluator = func.time_evaluator(func.entry_name, dev, number=3)
    print("gemm m=%d n=%d k=%d: %f ms" % (m, n, l, evaluator(a, b, c).mean * 1e3))

    c_np = np.dot(a_np, b_np)
    np.testing.assert_allclose(c_np, c.numpy(), rtol=1e-3)
def rnn_matexp():
    n_num_step = 128
    n_num_hidden = 1152
    n_batch_size = 4
    detect_global_barrier = DETECT_GLOBAL_BARRIER

    num_step = te.var("num_step")
    num_hidden = tvm.runtime.convert(n_num_hidden)
    batch_size = tvm.runtime.convert(n_batch_size)
    num_thread_y = 8
    num_thread_x = 16 * 3
    num_sm = 24

    Whh = te.placeholder((num_hidden, num_hidden), name="Whh")
    s_init = te.compute((1, batch_size, num_hidden), lambda _, i, j: 1.0, name="init")
    s_state = te.placeholder((num_step, batch_size, num_hidden))
    kh = te.reduce_axis((0, num_hidden), name="kh")
    s_update = te.compute(
        (num_step, batch_size, num_hidden),
        lambda t, i, j: te.sum(s_state[t - 1, i, kh] * Whh[kh, j], axis=kh),
        name="update",
    )
    s_scan = tvm.te.scan(s_init, s_update, s_state)
    # schedule
    s = te.create_schedule(s_scan.op)
    CL = s_update
    SS = s.cache_read(s_state, "shared", [CL])
    SL = s.cache_read(SS, "local", [CL])
    WhhL = s.cache_read(Whh, "local", [CL])
    ko, ki = s[CL].split(s[CL].op.reduce_axis[0], nparts=num_thread_y)
    CLF = s.rfactor(CL, ko)

    block_x = te.thread_axis((0, num_sm), "blockIdx.x")
    thread_x = te.thread_axis((0, num_thread_x), "threadIdx.x")
    thread_y = te.thread_axis((0, num_thread_y), "threadIdx.y")
    if PERSIST_KERNEL:
        s[s_scan.op].env_threads([block_x, thread_y, thread_x])

    bx, xi = s[s_init].split(s_init.op.axis[2], nparts=num_sm)
    tx, xi = s[s_init].split(xi, nparts=num_thread_x)
    s[s_init].bind(bx, block_x)
    s[s_init].bind(tx, thread_x)

    bx, xi = s[s_update].split(s[CL].op.axis[2], nparts=num_sm)
    tx, xi = s[s_update].split(xi, nparts=num_thread_x)
    s[s_update].bind(bx, block_x)
    s[s_update].bind(tx, thread_x)
    s[CL].bind(s[CL].op.reduce_axis[0], thread_y)
    s[CLF].compute_at(s[CL], s[CL].op.reduce_axis[0])
    # Duplicate store predicate.
    s[CL].set_store_predicate(thread_y.equal(0))

    if PERSIST_KERNEL:
        s[WhhL].compute_at(s[s_scan], thread_x)
        s[WhhL].unroll(WhhL.op.axis[0])
    else:
        s[WhhL].compute_at(s[CLF], CLF.op.axis[3])

    kr, ki = s[CLF].split(CLF.op.reduce_axis[0], nparts=1)
    ko, ki = s[CLF].split(ki, factor=4)
    s[SS].compute_at(s[CLF], kr)
    s[SL].compute_at(s[CLF], ko)

    xo, xi = s[SS].split(SS.op.axis[2], factor=num_thread_x * num_thread_y * 3)
    ty, xi = s[SS].split(xi, nparts=num_thread_y)
    tx, xi = s[SS].split(xi, nparts=num_thread_x)
    s[SS].bind(ty, thread_y)
    s[SS].bind(tx, thread_x)

    def check_device(target):
        with tvm.target.build_config(
            detect_global_barrier=detect_global_barrier,
            auto_unroll_max_step=128,
            unroll_explicit=False,
        ):
            f = tvm.build(s, [s_scan, Whh], target)
        ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0)
        # launch the kernel.
        res_np = np.zeros((n_num_step, n_batch_size, n_num_hidden)).astype("float32")
        Whh_np = np.zeros((n_num_hidden, n_num_hidden)).astype("float32")
        Whh_np[:] = 2.0 / n_num_hidden
        Whh_np[:, n_num_hidden // 2:] = 0

        res_a = tvm.nd.array(res_np, ctx)
        Whh_a = tvm.nd.array(Whh_np, ctx)
        # Skip first pass as it is compilation
        f(res_a, Whh_a)
        ctx.sync()
        # measure time cost of second step.
        tstart = time.time()
        f(res_a, Whh_a)
        ctx.sync()
        tgap = time.time() - tstart
        print("Time cost=%g" % tgap)
        # correctness
        if not SKIP_CHECK:
            res_gpu = res_a.asnumpy()
            res_cmp = np.ones_like(res_np).astype("float64")
            Whh_np = Whh_np.astype("float64")
            for t in range(1, n_num_step):
                res_cmp[t][:] = np.dot(res_cmp[t - 1], Whh_np)
            for i in range(n_num_step):
                for j in range(n_num_hidden):
                    if abs(res_cmp[i, 0, j] - res_gpu[i, 0, j]) > 1e-5:
                        print("%d, %d: %g vs %g" % (i, j, res_cmp[i, 0, j], res_gpu[i, 0, j]))
            tvm.testing.assert_allclose(res_gpu, res_cmp, rtol=1e-3)

    check_device("cuda")
from __future__ import absolute_import, print_function

import tvm
from tvm import te
import numpy as np

######################################################################
# We first write a very simple vector add and build it with the default schedule. Then, we use
# our customized lowering pass to manipulate the IR directly instead of using schedule primitives.
#

n = tvm.tir.const(128, "int32")
a = te.placeholder((n,), name="a")
b = te.placeholder((n,), name="b")
c = te.compute((n,), lambda i: a[i] + b[i], name='c')

sch = te.create_schedule(c.op)
ir = tvm.lower(sch, [a, b, c], simple_mode=True)
print(ir)

######################################################################
# Writing a Pass
# --------------
# Essentially, an "IR transformation pass" is a function which maps a statement to a new statement.
# Thus, we define this vectorize function and implement it step by step.
#

######################################################################
# TVM already provides two classes for users to both analyze and transform IR.
#
# IR Visitor
# ~~~~~~~~~~
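######################################################################
# As a small, hedged illustration of the visitor idea (not part of the original
# tutorial text), the sketch below uses ``tvm.tir.stmt_functor.post_order_visit``
# to collect every ``For`` node in the module lowered above. It assumes that
# ``tvm.lower`` returned an IRModule whose entry function is named "main", as in
# the other snippets in this section.

loops = []


def find_for_loops(op):
    # Record each For node we encounter; a visitor only inspects, it never mutates the IR.
    if isinstance(op, tvm.tir.For):
        loops.append(op)


tvm.tir.stmt_functor.post_order_visit(ir["main"].body, find_for_loops)
print("found %d loop(s) in the lowered body" % len(loops))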