# Common imports assumed by the tests below (this file is an excerpt).
from collections import namedtuple
import ctypes
import numpy as np
import tvm
import tvm.contrib.sparse as tvmsp
from tvm.contrib import util
from tvm.schedule import Buffer


def test_inplace_rule2(scope_tb="local_TB2", max_bits=1024 * 1024 * 1024):
    # Test Buffer
    register_mem(scope_tb, max_bits)
    m = 10
    A = tvm.placeholder((m,), name='A')
    C = tvm.placeholder((m,), name='C')
    D = tvm.placeholder((m,), name='D')
    A0 = tvm.compute((m,), lambda i: A[i] + C[i], name='A0')
    A1 = tvm.compute((m,), lambda i: D[i] * D[i], name='A1')
    A2 = tvm.compute((m,), lambda i: A0[i] + A1[i], name='A2')
    B = tvm.compute((m,), lambda i: A2[i], name='B')
    s = tvm.create_schedule(B.op)
    A0L = s.cache_read(A0, scope_tb, [A2])
    A1L = s.cache_read(A1, scope_tb, [A2])
    A2L = s.cache_read(A2, scope_tb, [B])
    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    Cc = tvm.decl_buffer(C.shape, B.dtype, name='C')
    Dd = tvm.decl_buffer(D.shape, B.dtype, name='D')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb, C: Cc, D: Dd}, 64)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.StorageRewrite(stmt)
    # verify inplace folding works: only two allocations remain
    num_alloc = [0]

    def verify(n):
        if isinstance(n, tvm.stmt.Allocate):
            num_alloc[0] += 1

    tvm.ir_pass.PostOrderVisit(stmt, verify)
    assert num_alloc[0] == 2
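# `register_mem` is a file-level helper not shown in this excerpt; a minimal
# sketch of what it presumably does -- registering memory info for a custom
# storage scope so StorageRewrite knows the scope's capacity. Field values
# here are illustrative assumptions.
def register_mem(scope_tb, max_bits):
    @tvm.register_func("tvm.info.mem.%s" % scope_tb)
    def mem_info_test_buffer():
        return tvm.make.node("MemoryInfo",
                             unit_bits=16,
                             max_simd_bits=32,
                             max_num_bits=max_bits,
                             head_address=None)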
def test_storage_combine():
    n = 8
    A = tvm.placeholder((4,), name='A')
    num_stage = 5
    B = A
    stages = []
    for t in range(num_stage):
        B = tvm.compute((n,), lambda i: B[i] + B[0] + (t+1), name='A%d' % t)
        stages.append(B)
    s = tvm.create_schedule(B.op)
    for S in stages[:-1]:
        s[S].set_scope("global:tag")
    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.StorageRewrite(stmt)
    num_alloc = [0]

    def verify(n):
        if isinstance(n, tvm.stmt.Allocate):
            num_alloc[0] += 1
            assert n.extents[0].value == 16
    tvm.ir_pass.PostOrderVisit(stmt, verify)
    assert num_alloc[0] == 1
def test_buffer_index_merge_mult_mod():
    m = tvm.var('m')
    n = tvm.var('n')
    s = tvm.var('s')
    k0 = tvm.var('k0')
    k1 = tvm.var('k1')
    A = tvm.decl_buffer((m, n), tvm.float32)
    A_stride = tvm.decl_buffer((m, n), tvm.float32, strides=(s, 1))

    def assert_simplified_equal(index_simplified, index_direct):
        assert tvm.ir_pass.Equal(index_simplified, index_direct),\
            "index_simplified=%s, index_direct=%s" % (index_simplified, index_direct)
    # Test Case1
    index_simplified = A_stride.vload(
        ((k0 % k1) / s, (k0 % k1) % s + (k0 / k1) * k1))
    index_direct = A_stride.vload((0, k0))
    assert_simplified_equal(index_simplified, index_direct)
    # Test Case2
    index_simplified = A.vload(((k0 % (k1 / s)) / n,
                                (k0 % (k1 / s)) % n + (k0 % k1)))
    index_direct = A.vload((0, k0 % k1 + k0 % (k1 / s)))
    assert_simplified_equal(index_simplified, index_direct)
    # Test Case3
    index_simplified = A.vload((((k0 / (k1 / s)) * (k1 / s)) / n + (k0 % (k1 / s)) / n,
                                ((k0 / (k1 / s)) * (k1 / s)) % n + (k0 % (k1 / s)) % n))
    index_direct = A.vload((0, k0))
    assert_simplified_equal(index_simplified, index_direct)
    # Test Case4 (not able to simplify)
    index_simplified = A.vload(((k0 % (k1 / s)) / n,
                                (k0 % (k1 / n)) % n + (k0 % k1)))
    index_direct = A.vload((0, ((k0 % (k1 / s)) / n) * n + ((k0 % (k1 / n)) % n + (k0 % k1))))
    assert_simplified_equal(index_simplified, index_direct)
def test_storage_share():
    m = tvm.var('m')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')
    num_stage = 5
    B = A
    for t in range(num_stage):
        B = tvm.compute((m, l), lambda i, j: B[i, j] + (t+1), name='A%d' % t)
    s = tvm.create_schedule(B.op)
    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.StorageRewrite(stmt)
    # verify inplace folding works: only one allocation remains
    num_alloc = [0]

    def verify(n):
        if isinstance(n, tvm.stmt.Allocate):
            num_alloc[0] += 1
    tvm.ir_pass.PostOrderVisit(stmt, verify)
    assert num_alloc[0] == 1
def test_dynamic_tensor():
    dtype = 'float32'
    stype = 'csr'
    target = 'llvm'
    ctx = tvm.context(target, 0)
    nr, nc, n = tvm.var('nr'), tvm.var('nc'), tvm.var('n')
    A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name='A', dtype=dtype)
    assert(A.stype == 'csr')
    C = tvm.compute(A.data.shape, lambda i: A.data[i] * 2., tag='cs_scatter')
    s = tvm.create_schedule(C.op)
    _nr, _nc = 3, 5
    a = np.maximum(np.random.uniform(size=(_nr, _nc)).astype(dtype) - .6, 0.)
    a = tvmsp.array(a, ctx)
    assert a.data.dtype == a.dtype
    Ab = namedtuple('CSRBuffer', ['data', 'indices', 'indptr'])
    Ab.data = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_data')
    Ab.indices = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_indices')
    binds = {A.data: Ab.data, A.indices: Ab.indices}
    f = tvm.build(s, [nr, A.data, C], target, binds=binds)
    c = tvmsp.array(np.zeros((_nr, _nc), dtype), ctx)
    c.data = tvm.nd.empty(a.data.shape, dtype)
    c.indices = a.indices
    c.indptr = a.indptr
    f(a.data.shape[0], a.data, c.data)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2., rtol=1e-5)
def test_inplace_rule():
    m = 10
    A = tvm.placeholder((m,), name='A')
    A0 = tvm.compute((m,), lambda i: A[i], name='A0')
    A1 = tvm.compute((m,), lambda i: A[i] + 1, name='A1')
    AA = tvm.compute((m,), lambda i: A0[i] + A1[i] + A1[0], name='AA')
    B = tvm.compute((m,), lambda i: AA[i] + 1, name='B')
    s = tvm.create_schedule(B.op)
    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.StorageRewrite(stmt)
    # verify inplace folding works: only two allocations remain
    num_alloc = [0]

    def verify(n):
        if isinstance(n, tvm.stmt.Allocate):
            num_alloc[0] += 1
    tvm.ir_pass.PostOrderVisit(stmt, verify)
    assert num_alloc[0] == 2
def intrin_gemv(m, l):
    a = tvm.placeholder((l,), name='a')
    b = tvm.placeholder((m, l), name='b')
    k = tvm.reduce_axis((0, l), name='k')
    c = tvm.compute((m,), lambda i: tvm.sum(a[k] * b[i, k], axis=k), name='c')
    Ab = tvm.decl_buffer(a.shape, a.dtype,
                         name="A", offset_factor=1, strides=[1])
    Bb = tvm.decl_buffer(b.shape, b.dtype,
                         name="B", offset_factor=1, strides=[tvm.var("s1"), 1])
    Cb = tvm.decl_buffer(c.shape, c.dtype,
                         name="C", offset_factor=1, strides=[1])

    def intrin_func(ins, outs):
        ib = tvm.ir_builder.create()
        aa, bb = ins
        cc = outs[0]
        ib.emit(tvm.call_extern("int32", "gemv_update",
                                cc.access_ptr("w"),
                                aa.access_ptr("r"),
                                bb.access_ptr("r"),
                                m, l, bb.strides[0]))
        return ib.get()

    with tvm.build_config(offset_factor=1):
        return tvm.decl_tensor_intrin(c.op, intrin_func,
                                      binds={a: Ab, b: Bb, c: Cb})
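# Usage sketch (not part of the original tests): tensorizing a GEMV-shaped
# reduction with the intrinsic above. The sizes N, M, L and the split factor
# are illustrative assumptions.
def _example_tensorize_gemv():
    N, M, L = 1024, 512, 64
    A = tvm.placeholder((N, L), name='A')
    B = tvm.placeholder((M, L), name='B')
    k = tvm.reduce_axis((0, L), name='k')
    C = tvm.compute((N, M),
                    lambda i, j: tvm.sum(A[i, k] * B[j, k], axis=k), name='C')
    s = tvm.create_schedule(C.op)
    factor = 16
    x, y = C.op.axis
    yo, yi = s[C].split(y, factor=factor)
    gemv = intrin_gemv(factor, L)
    # the inner loop nest now matches the intrinsic's compute pattern
    s[C].tensorize(yi, gemv)
    return tvm.lower(s, [A, B, C], simple_mode=True)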
def test_storage_sync():
    m = tvm.var('m')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')
    A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1')
    A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2')
    s = tvm.create_schedule(A2.op)
    xo, xi = s[A2].split(A2.op.axis[0], factor=8)
    s[A2].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[A1].compute_at(s[A2], xo)
    s[A1].set_scope("shared")
    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    A2b = tvm.decl_buffer(A2.shape, A2.dtype, name='A2')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, A2: A2b}, 64)
    f = tvm.ir_pass.MakeAPI(stmt, "test", [Ab, A2b], 0, True)
    flist = tvm.ir_pass.SplitHostDevice(f)
    f = flist[1]
    f = tvm.ir_pass.ThreadSync(f, "shared")
    body_list = tvm.make.stmt_list(f.body.body.body.body)
    assert(body_list[1].value.name == "tvm_storage_sync")
def test_storage_share_gpu():
    m = tvm.var('m')
    A = [tvm.placeholder((m,), name='A')]
    num_stage = 5
    for t in range(num_stage):
        A.append(tvm.compute((m,), lambda i: A[-1][i] + (t+1), name='A%d_s' % t))
        A.append(tvm.compute((m,), lambda i: A[-1][i], name='A%d' % t))
    s = tvm.create_schedule(A[-1].op)
    for t in range(num_stage):
        x = A[2*t+2].op.axis[0]
        bx, tx = s[A[2*t+2]].split(x, factor=32)
        s[A[2*t+2]].bind(bx, tvm.thread_axis("blockIdx.x"))
        s[A[2*t+2]].bind(tx, tvm.thread_axis("threadIdx.x"))
        s[A[2*t+1]].compute_at(s[A[2*t+2]], tx)
        s[A[2*t+1]].set_scope("shared")
    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A[0].shape, A[0].dtype, name='A')
    Bb = tvm.decl_buffer(A[0].shape, A[0].dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A[0]: Ab, A[-1]: Bb}, 64)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.StorageRewrite(stmt)
    alloc_stats = {"global": 0, "shared": 0}

    def verify(n):
        if isinstance(n, tvm.stmt.AttrStmt):
            if n.attr_key == "storage_scope":
                alloc_stats[n.value.value] += 1
    tvm.ir_pass.PostOrderVisit(stmt, verify)
    assert alloc_stats["global"] == 2
    assert alloc_stats["shared"] == num_stage
def test_copy_pad_split():
    m = 4 * 3
    A = tvm.placeholder((m,), name="A")
    Apad = tvm.compute((m + 2,),
                       lambda i: tvm.select(tvm.all(i >= 1, i <= m),
                                            A[i - 1], 0.0), "Apad")
    B = tvm.compute((m,), lambda i: Apad[i] + Apad[i + 1] + Apad[i + 2])
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=4)
    s[Apad].compute_at(s[B], xo)
    s[Apad].pragma(s[Apad].op.axis[0], "memcpy")
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)

    def cb(src, dst, pad_before, pad_after, pad_value):
        assert(dst.elem_offset.value == 0)
        assert_expr_equal(src.elem_offset, tvm.max(xo * 4, 1) - 1)
        rpad_before = tvm.max(1 - xo * 4, 0)
        rpad_after = tvm.max(xo * 4 - 7, 0)
        assert_expr_equal(pad_before[0], rpad_before)
        assert_expr_equal(pad_after[0], rpad_after)
        assert_expr_equal(src.shape[0], 6 - rpad_before - rpad_after)
        return tvm.make.Evaluate(0)

    stmt = tvm.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb)
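# `assert_expr_equal` is a file-level helper not shown in this excerpt; a
# minimal sketch of the usual pattern: two expressions are treated as equal
# when their difference simplifies to the constant zero.
def assert_expr_equal(a, b):
    res = tvm.ir_pass.Simplify(a - b)
    assert res.value == 0, "%s vs %s" % (a, b)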
def test_buffer():
    m = tvm.var('m')
    n = tvm.var('n')
    l = tvm.var('l')
    Ab = tvm.decl_buffer((m, n), tvm.float32)
    Bb = tvm.decl_buffer((n, l), tvm.float32)
    assert isinstance(Ab, tvm.schedule.Buffer)
    assert Ab.dtype == tvm.float32
    assert tuple(Ab.shape) == (m, n)
def test_buffer_access_ptr_extent():
    m = tvm.var('m')
    n = tvm.var('n')
    Ab = tvm.decl_buffer((m, n), tvm.float32)
    aptr = Ab.access_ptr("rw")
    assert tvm.ir_pass.Equal(aptr.args[3], m * n)
    aptr = Ab.access_ptr("rw", offset=100)
    assert tvm.ir_pass.Equal(aptr.args[3], m * n - 100)
    Ab = tvm.decl_buffer((m, n), tvm.float32, strides=[n + 1, 1])
    aptr = Ab.access_ptr("rw", offset=100)
    assert tvm.ir_pass.Equal(aptr.args[3], Ab.strides[0] * m - 100)
def test_unroll_loop():
    ib = tvm.ir_builder.create()
    dtype = 'int64'
    n = tvm.var('n')
    Ab = tvm.decl_buffer((n,), dtype)
    Aptr = ib.buffer_ptr(Ab)
    # outer loop over [n, n + 2); the inner 8-iteration loop is marked unroll
    with ib.for_range(n, n + 2, name="i") as i:
        with ib.for_range(0, 8, name="j", for_type="unroll") as j:
            Aptr[j + 1] = Aptr[i] + 1
    stmt = ib.get()
    assert isinstance(stmt, tvm.stmt.For)
    ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, 0, True)
    assert not isinstance(ret, tvm.stmt.For)
    ret = tvm.ir_pass.UnrollLoop(stmt, 15, 8, 0, True)
    assert isinstance(ret, tvm.stmt.For)
    ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, 0, False)
    assert isinstance(ret, tvm.stmt.For)
    assert ret.for_type == tvm.stmt.For.Unrolled

    ib = tvm.ir_builder.create()
    ib.scope_attr(tvm.const(0, "int32"), "pragma_auto_unroll_max_step", 16)
    ib.emit(stmt)
    wrapped = ib.get()
    wrapped = tvm.make.Block(wrapped, stmt)
    assert isinstance(ret, tvm.stmt.For)
    ret = tvm.ir_pass.UnrollLoop(wrapped, 0, 8, 0, False)
    assert isinstance(ret.first, tvm.stmt.For)
    assert ret.first.for_type == tvm.stmt.For.Unrolled
    assert isinstance(ret.rest, tvm.stmt.For)
    assert ret.rest.for_type != tvm.stmt.For.Unrolled
def intrin_gemv(m, n):
    w = tvm.placeholder((m, n), name='w')
    x = tvm.placeholder((n,), name='x')
    k = tvm.reduce_axis((0, n), name='k')
    z = tvm.compute((m,), lambda i: tvm.sum(w[i, k] * x[k], axis=k), name='z')
    Wb = tvm.decl_buffer(w.shape, w.dtype,
                         name="W", offset_factor=16,
                         strides=[tvm.var('ldw'), 1])

    def intrin_func(ins, outs):
        ww, xx = ins
        zz = outs[0]
        ww_ptr = ww.access_ptr("r")
        xx_ptr = xx.access_ptr("r")
        zz_ptr = zz.access_ptr("w")
        body = tvm.call_packed(
            "gemm", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
        reset = tvm.call_packed(
            "fill_zero", zz_ptr, n)
        update = tvm.call_packed(
            "gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
        return body, reset, update

    with tvm.build_config(data_alignment=16, offset_factor=16):
        return tvm.decl_tensor_intrin(z.op, intrin_func, binds={w: Wb})
def check_c():
    # nested helper: A, B, C, s and nn come from the enclosing test's scope
    if not tvm.module.enabled("llvm"):
        return
    # Specifically allow offset to test codepath when offset is available
    Ab = tvm.decl_buffer(
        A.shape, A.dtype,
        elem_offset=tvm.var('Aoffset'),
        offset_factor=8,
        name='A')
    binds = {A: Ab}
    # BUILD and invoke the kernel.
    f1 = tvm.lower(s, [A, B, C], name="fadd_pipeline")
    fsplits = [x for x in tvm.ir_pass.SplitHostDevice(f1)]
    fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0])
    mhost = tvm.codegen.build_module(fsplits[0], "c")
    temp = util.tempdir()
    path_dso = temp.relpath("temp.so")
    mhost.export_library(path_dso)
    m = tvm.module.load(path_dso)
    fadd = m["fadd_pipeline"]
    ctx = tvm.cpu(0)
    # launch the kernel.
    n = nn
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
    fadd(a, b, c)
    tvm.testing.assert_allclose(
        c.asnumpy(), a.asnumpy() + b.asnumpy())
def test_buffer_vload():
    m = tvm.var('m')
    n = tvm.var('n')
    Ab = tvm.decl_buffer((m, n), tvm.float32, elem_offset=100)
    load = Ab.vload([2, 3])
    offset = tvm.ir_pass.Simplify(load.index)
    assert tvm.ir_pass.Equal(offset, n * 2 + 103)
def test_equal_compute():
    x = tvm.var('x')
    y = tvm.var('y')
    n = 128
    A = tvm.placeholder((n, n), name='A')
    B = tvm.placeholder((n, n), name='B')
    ii = tvm.var('i')
    jj = tvm.var('j')

    def func1():
        k = tvm.reduce_axis((0, n), name='k')
        return tvm.sum(A[ii, k] * B[jj, k], axis=k)

    Ab = tvm.decl_buffer((n,), name='A')
    n = tvm.var("n")

    def func2():
        ib = tvm.ir_builder.create()
        A = ib.buffer_ptr(Ab)
        with ib.for_range(0, n, name="i") as i:
            A[i] = A[i] + 1
            with ib.for_range(0, 10, name="j") as j:
                A[j] = A[j] + 2
        return ib.get()

    assert tvm.ir_pass.Equal(func1(), func1())
    assert tvm.ir_pass.Equal(func2(), func2())
def test_flatten_storage_align():
    m = 8
    l = 16
    A = tvm.placeholder((m, l), name='A')
    A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1')
    A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2')
    s = tvm.create_schedule(A2.op)
    s[A1].storage_align(A1.op.axis[0], 2, 1)
    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    A2b = tvm.decl_buffer(A2.shape, A2.dtype, name='A2')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, A2: A2b}, 64)
    stmt = tvm.ir_pass.Simplify(stmt)
    assert(stmt.body.extents[0].value == 17 * 8)
def dp4a(x_scope='local', y_scope='local', z_scope='local'):
    """Int8 dot product reduced by every 4 elements using __dp4a

    Parameters
    ----------
    x_scope : str, optional
        The storage scope of buffer for lhs
    y_scope : str, optional
        The storage scope of buffer for rhs
    z_scope : str, optional
        The storage scope of buffer for result

    Returns
    -------
    intrin : TensorIntrin
        The dp4a TensorIntrin that can be used in tensorizing schedule.
    """
    n = 4  # dp4a requires operands packed by 4
    x = tvm.placeholder((n,), name='x', dtype='int8')
    y = tvm.placeholder((n,), name='y', dtype='int8')
    k = tvm.reduce_axis((0, n), name='rc')
    z = tvm.compute((1,), lambda i: tvm.sum(
        x[k].astype('int32') * y[k].astype('int32'), axis=[k]))

    def _intrin_func(ins, outs):
        def _instr(index):
            xx, yy = ins
            zz = outs[0]
            if index == 1:  # reset: zero the accumulator
                return zz.vstore(0, 0)
            ib = tvm.ir_builder.create()
            vec_x = xx.vload(0, dtype='int8x4')
            vec_y = yy.vload(0, dtype='int8x4')
            prev_z = 0 if index == 0 else zz.vload(0)
            new_z = tvm.call_pure_extern('int32', '__dp4a', vec_x, vec_y, prev_z)
            ib.emit(zz.vstore(0, new_z))
            return ib.get()
        return _instr(0), _instr(1), _instr(2)  # body, reset, update

    with tvm.build_config(data_alignment=4, offset_factor=1) as cfg:
        scopes = {x: x_scope, y: y_scope, z: z_scope}
        binds = {t: tvm.decl_buffer(t.shape, t.dtype, t.op.name,
                                    data_alignment=cfg.data_alignment,
                                    offset_factor=cfg.offset_factor,
                                    scope=scopes[t]) for t in [x, y, z]}
        return tvm.decl_tensor_intrin(z.op, _intrin_func, binds=binds)
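# Usage note (a sketch, not part of the original): dp4a is typically
# instantiated per memory scope and tensorized over a factor-4 split of the
# reduction axis, e.g.
#
#   _dp4a = dp4a('shared', 'shared', 'local')
#   ko, ki = s[C].split(s[C].op.reduce_axis[0], factor=4)
#   s[C].tensorize(ki, _dp4a)
#
# where the operands have been staged into the matching scopes beforehand
# (via cache_read/cache_write).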
def test_single_point_test():
    A = tvm.placeholder((1,), name='A')
    B = tvm.compute((1,), lambda i: A[i], name='B')
    s = tvm.create_schedule(B.op)
    s[B].pragma(B.op.axis[0], "memcpy")
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64)

    def cb(src, dst, pad_before, pad_after, pad_value):
        assert tvm.ir_pass.Simplify(src.elem_offset).value == 0
        assert tvm.ir_pass.Simplify(dst.elem_offset).value == 0
        assert tvm.ir_pass.Simplify(src.strides[0]).value == 1
        assert tvm.ir_pass.Simplify(dst.strides[0]).value == 1
        return tvm.make.Evaluate(0)

    stmt = tvm.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb)
def test_flatten2():
    m = tvm.var('m')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')
    A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1')
    A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2')
    s = tvm.create_schedule(A2.op)
    xo, xi = s[A2].split(A2.op.axis[0], 8)
    s[A1].compute_at(s[A2], xo)
    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    A2b = tvm.decl_buffer(A2.shape, A2.dtype, name='A2')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, A2: A2b}, 64)
    stmt = tvm.ir_pass.Simplify(stmt)
def test_buffer_access_ptr():
    m = tvm.var('m')
    n = tvm.var('n')
    Ab = tvm.decl_buffer((m, n), tvm.float32, strides=[n + 1, 1])
    aptr = Ab.access_ptr("rw")
    assert tvm.ir_pass.Equal(aptr.args[3], Ab.strides[0] * m)
    assert aptr.args[0].dtype == Ab.dtype
    assert aptr.args[4].value == Buffer.READ | Buffer.WRITE
    aptr = Ab.access_ptr("w")
    assert aptr.args[4].value == Buffer.WRITE
def intrin_test():
    m1 = tvm.var("m1")
    n1 = tvm.var("n1")
    a = tvm.placeholder((m1, n1), name='a')
    c = tvm.compute((1, n1), lambda i, j: a[0, j] + a[1, j] + a[2, j], name='c')
    Ab = tvm.decl_buffer(a.shape, name="Abuf", offset_factor=1)
    Cb = tvm.decl_buffer(c.shape, name="Cbuf", offset_factor=1)

    def intrin_func(ins, outs):
        aa = ins[0]
        cc = outs[0]

        def _body():
            ib = tvm.ir_builder.create()
            ib.emit(tvm.call_extern("int32", "test",
                                    cc.access_ptr("w"),
                                    aa.access_ptr("r")))
            return ib.get()
        return _body()

    with tvm.build_config(offset_factor=1):
        return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, c: Cb})
def test_copy2d():
    m = tvm.var('m')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')
    B = tvm.compute((m, l), lambda i, j: A[i, j], name='B')
    s = tvm.create_schedule(B.op)
    s[B].pragma(B.op.axis[0], "memcpy")
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64)

    def cb(src, dst, pad_before, pad_after, pad_value):
        assert dst.strides[0] == l
        assert dst.strides[1].value == 1
        assert src.strides[0] == l
        assert tuple(src.shape) == (m, l)
        return tvm.make.Evaluate(0)

    stmt = tvm.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb)
def test_cce_loop_1():
    ib = tvm.ir_builder.create()
    dtype = 'float16'
    n = 514
    m = 514
    _A = tvm.placeholder((n*m,), name='A')
    Ab = tvm.decl_buffer((n*m,), dtype, name="A")
    A = ib.buffer_ptr(Ab)
    _B = tvm.placeholder((n*m,), name='B')
    Bb = tvm.decl_buffer((n*m,), dtype, name="B")
    B = ib.buffer_ptr(Bb)
    # the likely() guard should be eliminated by LoopPartition below
    with ib.for_range(0, 11, name="i") as i:
        with ib.for_range(0, 160, name="j") as j:
            with ib.if_scope(ib.likely(((i*160) + j) < 1600)):
                A[(i+1)*m+j+1] = B[(i)*m+j+1] + B[(i+1)*m+j+1] + B[(i+2)*m+j+1]
    stmt = ib.get()
    stmt = tvm.ir_pass.LoopPartition(stmt, True)
    stmt = tvm.ir_pass.Simplify(stmt)
    assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse))))
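# `collect_visit` is a file-level helper not shown in this excerpt; a minimal
# sketch: apply f to every node in post order and collect the results.
def collect_visit(stmt, f):
    ret = []
    tvm.ir_pass.PostOrderVisit(stmt, lambda x: ret.append(f(x)))
    return ret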
def get_vthread(name):
    # nested helper: n, m and nthread come from the enclosing test's scope
    tx = tvm.thread_axis(name)
    ty = tvm.thread_axis(name)
    ib = tvm.ir_builder.create()
    with ib.for_range(0, n) as i:
        ib.scope_attr(tx, "virtual_thread", nthread)
        ib.scope_attr(ty, "virtual_thread", nthread)
        A = ib.allocate("float32", m, name="A", scope="shared")
        B = ib.allocate("float32", m, name="B", scope="shared")
        C = ib.allocate("float32", m, name="C", scope="shared")
        cbuffer = tvm.decl_buffer((m,), dtype=C.dtype, data=C.asnode())
        abuffer = tvm.decl_buffer((m,), dtype=A.dtype, data=A.asnode())
        bbuffer = tvm.decl_buffer((m,), dtype=B.dtype, data=B.asnode())
        A[tx] = tx + 1.0
        B[ty] = ty + 1.0
        ib.emit(tvm.call_extern("int32", "Run",
                                abuffer.access_ptr("r"),
                                bbuffer.access_ptr("r"),
                                cbuffer.access_ptr("rw")))
    return ib.get()
def test_flatten_prefetch():
    A = tvm.placeholder((25, 100, 4), name='A')
    _A = tvm.decl_buffer(A.shape, A.dtype, name='A')
    i = tvm.var('i')
    j = tvm.var('j')
    region = [tvm.make.range_by_min_extent(i[0], i[1])
              for i in [(i, 2), (j, 8), (0, 4)]]
    stmt = tvm.make.Prefetch(A.op, 0, A.dtype, region)
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: _A}, 64)
    stmt = tvm.ir_pass.Simplify(stmt)
    assert stmt.extent.value == 2
    assert isinstance(stmt.body, tvm.stmt.For)
    assert stmt.body.extent.value == 2
def intrin_multivadd(n):
    n_a = tvm.var("n_a")
    Ab = tvm.decl_buffer((n,), tvm.float32, strides=[n_a])
    n_b = tvm.var("n_b")
    Bb = tvm.decl_buffer((n,), tvm.float32, strides=[n_b])
    n_c = tvm.var("n_c")
    Cb = tvm.decl_buffer((n,), tvm.float32, strides=[n_c])
    z = tvm.compute((n,),
                    lambda i: tvm.call_extern(
                        "float32", 'vadd',
                        Ab.access_ptr("w", offset=n_a*i),
                        Bb.access_ptr("r", offset=n_b*i),
                        Cb.access_ptr("r", offset=n_c*i)))

    # replace the pattern with the multivadd call. I need to figure out
    # how to pass it the right parameters.
    def intrin_func(ins, outs):
        return tvm.call_packed("multivadd")

    with tvm.build_config():
        return tvm.decl_tensor_intrin(z.op, intrin_func, name="multivadd")
def test_stack_vm_basic():
    a = tvm.nd.array(np.zeros(10, dtype='float32'))

    @tvm.register_func
    def tvm_call_back_get_shape(shape0):
        print(shape0)
        assert shape0 == a.shape[0]

    n = tvm.var('n')
    Ab = tvm.decl_buffer((n,), tvm.float32)
    stmt = tvm.make.Evaluate(
        tvm.call_packed("tvm_call_back_get_shape", Ab.shape[0]))
    fapi = tvm.ir_pass.MakeAPI(stmt, "print_shape", [Ab], 0, True)
    fapi = tvm.ir_pass.LowerTVMBuiltin(fapi)
    run_jit(fapi, lambda f: f(a))
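# `run_jit` is a file-level helper not shown in this excerpt; a minimal
# sketch of the usual pattern: build the lowered function for each enabled
# target and run the supplied check against it.
def run_jit(fapi, check):
    for target in ["llvm", "stackvm"]:
        if not tvm.module.enabled(target):
            continue
        f = tvm.codegen.build_module(fapi, target)
        check(f)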
def test_makeapi():
    """Not yet working, mock design"""
    n = tvm.var('n')
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    s = tvm.create_schedule(C.op)
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    Cb = tvm.decl_buffer(C.shape, C.dtype, name='C')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb, C: Cb}, 64)
    num_unpacked_args = 2
    f = tvm.ir_pass.MakeAPI(
        stmt, "myadd", [n, Ab, Bb, Cb], num_unpacked_args, True)
    assert(f.handle_data_type[Ab.data].dtype == Ab.dtype)
    assert(len(f.args) == 5)
    output_ssa = False
def test_unroll_fake_loop():
    ib = tvm.ir_builder.create()
    dtype = 'int32'
    n = tvm.size_var('n')
    Ab = tvm.decl_buffer((n,), dtype)
    Aptr = ib.buffer_ptr(Ab)
    # extent-1 outer loop: UnrollLoop should fold it away entirely
    with ib.for_range(0, 1, name="i") as i:
        Aptr[i*2] = 3
        with ib.for_range(0, 10, name="j") as j:
            Aptr[j + 1] = Aptr[i] + 1
    stmt = ib.get()
    ret = tvm.ir_pass.UnrollLoop(stmt, 8, 0, 1, True)
    assert isinstance(ret[0], tvm.stmt.Store)
def test_inplace_rule2(scope_tb="local_TB2", max_bits=1024 * 1024 * 1024): #Test Buffer register_mem(scope_tb, max_bits) m = 10 A = tvm.placeholder((m, ), name='A') C = tvm.placeholder((m, ), name='C') D = tvm.placeholder((m, ), name='D') A0 = tvm.compute((m, ), lambda i: A[i] + C[i], name='A0') A1 = tvm.compute((m, ), lambda i: D[i] * D[i], name='A1') A2 = tvm.compute((m, ), lambda i: A0[i] + A1[i], name='A2') B = tvm.compute((m, ), lambda i: A2[i], name='B') s = tvm.create_schedule(B.op) A0L = s.cache_read(A0, scope_tb, [A2]) A1L = s.cache_read(A1, scope_tb, [A2]) A2L = s.cache_read(A2, scope_tb, [B]) bounds = tvm.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) stmt = tvm.schedule.ScheduleOps(s, bounds) Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') Bb = tvm.decl_buffer(B.shape, B.dtype, name='B') Cc = tvm.decl_buffer(C.shape, B.dtype, name='C') Dd = tvm.decl_buffer(D.shape, B.dtype, name='D') stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb, C: Cc, D: Dd}, 64) stmt = tvm.ir_pass.CanonicalSimplify(stmt) stmt = tvm.ir_pass.Simplify(stmt) stmt = tvm.ir_pass.StorageRewrite(stmt) # verify only have one allocations. # verify inplace folding works num_alloc = [0] def verify(n): if isinstance(n, tvm.stmt.Allocate): num_alloc[0] += 1 tvm.ir_pass.PostOrderVisit(stmt, verify) assert num_alloc[0] == 2
def save_object(names):
    # nested helper: `dtype` comes from the enclosing test's scope (e.g. 'int64')
    n = tvm.var('n')
    Ab = tvm.decl_buffer((n,), dtype)
    i = tvm.var('i')
    # for i in 0 to n-1:
    stmt = tvm.make.For(
        i, 0, n - 1, 0, 0,
        tvm.make.Store(Ab.data,
                       tvm.make.Load(dtype, Ab.data, i) + 1,
                       i + 1))
    fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True)
    fapi = tvm.ir_pass.LowerTVMBuiltin(fapi)
    m = tvm.codegen.build_module(fapi, "llvm")
    for name in names:
        m.save(name)
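# Usage sketch (an assumption, not from the original): save_object writes the
# compiled module to each given path, e.g. an object file plus LLVM IR.
def _example_save_object():
    temp = util.tempdir()
    path_obj = temp.relpath("test.o")
    path_ll = temp.relpath("test.ll")
    save_object([path_obj, path_ll])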
def lower(s, args):
    binds = {}
    arg_list = []
    for x in args:
        assert isinstance(x, tvm.tensor.Tensor)
        buf = tvm.decl_buffer(x.shape, dtype=x.dtype, name=x.op.name)
        binds[x] = buf
        arg_list.append(buf)
    s = s.normalize()  # normalize() returns a new schedule; keep the result
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    stmt = tvm.ir_pass.StorageFlatten(stmt, binds, 64)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)
    stmt = tvm.ir_pass.Simplify(stmt)
    return stmt
def test_buffer_access_ptr_offset():
    m = tvm.var('m')
    n = tvm.var('n')
    Ab = tvm.decl_buffer((m, n), tvm.float32)
    aptr = Ab.access_ptr("rw", offset=100)
    offset = tvm.ir_pass.Simplify(aptr.args[2])
    assert tvm.ir_pass.Equal(offset, 100)
    assert aptr.args[4].value == Buffer.READ | Buffer.WRITE
    v = tvm.var('int32')
    aptr = Ab.access_ptr("rw", offset=100 + 100 + v)
    offset = tvm.ir_pass.Simplify(aptr.args[2])
    assert tvm.ir_pass.Equal(offset, 200 + v)
    assert aptr.args[4].value == Buffer.READ | Buffer.WRITE
    aptr = Ab.access_ptr("rw", offset=tvm.call_extern('int32', "test_call", 100 + 100 + v))
    offset = tvm.ir_pass.Simplify(aptr.args[2])
    assert tvm.ir_pass.Equal(offset, tvm.call_extern('int32', "test_call", 200 + v))
    assert aptr.args[4].value == Buffer.READ | Buffer.WRITE
def get_vthread(name):
    tx = tvm.thread_axis(name)
    ty = tvm.thread_axis(name)
    ib = tvm.ir_builder.create()
    A = ib.pointer("float32", name="A")
    C = ib.pointer("float32", name="C")
    with ib.for_range(0, n) as i:
        ib.scope_attr(tx, "virtual_thread", nthread)
        ib.scope_attr(ty, "virtual_thread", nthread)
        B = ib.allocate("float32", m, name="B", scope="shared")
        B[i] = A[i * nthread + tx]
        bbuffer = tvm.decl_buffer((m,), dtype=B.dtype, data=B.asnode())
        ib.emit(tvm.call_extern("int32", "Run",
                                bbuffer.access_ptr("r"),
                                tvm.call_pure_intrin("int32", "tvm_context_id")))
        C[i * nthread + tx] = B[i] + 1
    return ib.get()
def intrin_conv(in_h, in_w, kern_h, kern_w):
    in_height = in_h
    in_width = in_w
    kernel_h = kern_h
    kernel_w = kern_w
    stride_h = 1
    stride_w = 1
    out_h = (in_height - kernel_h) // stride_h + 1
    out_w = (in_width - kernel_w) // stride_w + 1
    Input = tvm.placeholder((in_height, in_width), name='input')
    Filter = tvm.placeholder((kernel_h, kernel_w), name='filter')
    kh = tvm.reduce_axis((0, kernel_h), name='kh')
    kw = tvm.reduce_axis((0, kernel_w), name='kw')
    conv = tvm.compute(
        (out_h, out_w),
        lambda oh, ow: tvm.sum(Filter[kh, kw] * Input[oh + kh, ow + kw],
                               axis=[kh, kw]),
        name='c')

    def intrin_func(ins, outs):
        ib = tvm.ir_builder.create()
        inp, filt = ins
        outp = outs[0]
        ib.emit(tvm.call_extern(
            "int32", "inst_conv",
            outp.access_ptr("w"),
            inp.access_ptr("r"),
            filt.access_ptr("r"),
        ))
        return ib.get()

    with tvm.build_config(offset_factor=1) as cfg:
        scopes = {Input: "local", Filter: "local", conv: "local"}
        binds = {
            t: tvm.decl_buffer(t.shape, t.dtype, t.op.name, offset_factor=1)
            for t in [Input, Filter, conv]
        }
        return tvm.decl_tensor_intrin(conv.op, intrin_func, binds=binds)
def lower(s, args, name="mydot"): binds = {} arg_list = [] for x in args: assert isinstance(x, tvm.tensor.Tensor) buf = tvm.decl_buffer(x.shape, dtype=x.dtype, name=x.op.name) binds[x] = buf arg_list.append(buf) s = s.normalize() bounds = tvm.schedule.InferBound(s) stmt = tvm.schedule.ScheduleOps(s, bounds) stmt = tvm.ir_pass.StorageFlatten(stmt, binds, 16) stmt = tvm.ir_pass.CanonicalSimplify(stmt) stmt = tvm.ir_pass.Simplify(stmt) fapi = tvm.ir_pass.MakeAPI(stmt, name, arg_list, 0, True) fapi = tvm.ir_pass.LowerTVMBuiltin(fapi) return fapi
def lower(sch, args):
    binds = {}
    arg_list = []
    for x in args:
        if isinstance(x, tvm.tensor.Tensor):
            buf = tvm.decl_buffer(x.shape, dtype=x.dtype, name=x.name)
            assert x not in binds
            binds[x] = buf
            arg_list.append(buf)
        else:
            raise ValueError("args must be Tensor, Buffer or Var")
    sch = sch.normalize()
    bounds = tvm.schedule.InferBound(sch)
    stmt = tvm.schedule.ScheduleOps(sch, bounds)
    stmt = tvm.ir_pass.LoopPartition(stmt, False)
    stmt = tvm.ir_pass.StorageFlatten(stmt, binds, 64)
    func = tvm.ir_pass.MakeAPI(stmt, "myadd", arg_list, 0, True)
    return func
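# Usage sketch (an illustrative assumption, not from the original): lowering
# a trivial elementwise schedule through the helper above and building the
# resulting LoweredFunc for LLVM.
def _example_lower_usage():
    n = 1024
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute((n,), lambda i: A[i] + 1, name='B')
    s = tvm.create_schedule(B.op)
    func = lower(s, [A, B])  # LoweredFunc named "myadd"
    func = tvm.ir_pass.LowerTVMBuiltin(func)
    return tvm.codegen.build_module(func, "llvm")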
def test_vm_parallel():
    dtype = 'int64'
    n = tvm.size_var('n')
    Ab = tvm.decl_buffer((n,), dtype)
    i = tvm.size_var('i')
    ib = tvm.ir_builder.create()
    A = ib.buffer_ptr(Ab)
    with ib.for_range(0, n, "i", for_type="parallel") as i:
        A[i] = A[i] + 1
    stmt = ib.get()
    fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True)
    fapi = tvm.ir_pass.LowerTVMBuiltin(fapi)

    def check(f):
        a = tvm.nd.array(np.zeros(10, dtype=dtype))
        f(a)
        np.testing.assert_equal(a.asnumpy(), np.ones(a.shape[0]))
    run_jit(fapi, check)
def trace1():
    @tvm.register_func
    def my_debug(x):
        print("array=", x.asnumpy())
        return 0

    x = tvm.placeholder((4,), name="x", dtype="int32")
    xbuffer = tvm.decl_buffer(x.shape, dtype=x.dtype)
    y = tvm.compute(x.shape, lambda i: tvm.call_packed("my_debug", xbuffer))
    s = tvm.create_schedule(y.op)
    print(tvm.lower(s, [x, y], binds={x: xbuffer}, simple_mode=True))
    f = tvm.build(s, [xbuffer, y], binds={x: xbuffer})
    xnd = tvm.nd.array(np.ones((4,), dtype=x.dtype))
    ynd = tvm.nd.array(np.zeros((4,), dtype=y.dtype))
    f(xnd, ynd)
    print(ynd)
def test_static_init():
    dtype = 'int64'
    n = tvm.var('n')
    Ab = tvm.decl_buffer((n,), dtype)
    i = tvm.var('i')
    ib = tvm.ir_builder.create()
    handle = tvm.call_intrin("handle", "tvm_static_handle")
    ib.emit(tvm.call_packed("test_static_callback", handle, Ab))

    @tvm.register_func("test_static_callback")
    def test_cb(sh, A):
        assert isinstance(sh, ctypes.c_void_p)
        return sh

    stmt = ib.get()
    fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True)
    fapi = tvm.ir_pass.LowerTVMBuiltin(fapi)
    f = tvm.codegen.build_module(fapi, "llvm")
    a = tvm.nd.array(np.zeros(10, dtype=dtype))
    f(a)
def lower(sch, args):
    binds = {}
    arg_list = []
    for x in args:
        if isinstance(x, tvm.tensor.Tensor):
            buf = tvm.decl_buffer(x.shape, dtype=x.dtype, name=x.name)
            assert x not in binds
            binds[x] = buf
            arg_list.append(buf)
        else:
            raise ValueError("args must be Tensor, Buffer or Var")
    sch = sch.normalize()
    bounds = tvm.schedule.InferBound(sch)
    stmt = tvm.schedule.ScheduleOps(sch, bounds)
    stmt = tvm.ir_pass.LoopPartition(stmt, False)
    stmt = tvm.ir_pass.StorageFlatten(stmt, binds, 64)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)
    stmt = tvm.ir_pass.VectorizeLoop(stmt)
    stmt = tvm.ir_pass.Simplify(stmt)
    return stmt
def test_stack_vm_loop():
    dtype = 'int64'
    n = tvm.size_var('n')
    Ab = tvm.decl_buffer((n,), dtype)
    i = tvm.size_var('i')
    ib = tvm.ir_builder.create()
    A = ib.buffer_ptr(Ab)
    with ib.for_range(0, n - 1, "i") as i:
        A[i + 1] = A[i] + 1
        ib.emit(tvm.call_packed("tvm_stack_vm_print", i))
    stmt = ib.get()
    fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True)
    fapi = tvm.ir_pass.LowerTVMBuiltin(fapi)
    a = tvm.nd.array(np.zeros(10, dtype=dtype))

    def check(f):
        f(a)
        np.testing.assert_equal(a.asnumpy(), np.arange(a.shape[0]))
    run_jit(fapi, check)
def test_static_tensor():
    dtype = 'float32'
    stype = 'csr'
    target = 'llvm'
    ctx = tvm.context(target, 0)
    m = tvm.var('m')
    n = tvm.var('n')
    A = tvmsp.placeholder(shape=(m, n), name='A', dtype=dtype)
    assert(A.stype == 'csr')
    n = 3
    a = np.maximum(np.random.uniform(size=(n, n)).astype(dtype) - .6, 0.)
    a = tvmsp.array(a, ctx)
    A.data = tvm.placeholder(a.data.shape, dtype, name='A_data')
    Ab = tvm.decl_buffer(a.data.shape, dtype, name='A_data')
    binds = {A.data: Ab}
    C = tvm.compute(A.data.shape, lambda i: A.data[i] * 2., tag='cs_scatter')
    s = tvm.create_schedule(C.op)
    f = tvm.build(s, [A.data, C], target, binds=binds)
    c = tvmsp.array(np.zeros((n, n), dtype), ctx)
    c.data = tvm.nd.empty(a.data.shape, dtype)
    c.indices = a.indices
    c.indptr = a.indptr
    f(a.data, c.data)
    np.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2., rtol=1e-5)