def test_bound_tensor_compute_op():
    def intrin_test():
        m1 = tvm.var("m1")
        n1 = tvm.var("n1")
        a = tvm.placeholder((m1, n1), name='a')
        c = tvm.compute((1, n1), lambda i, j: a[0, j] + a[1, j] + a[2, j], name='c')

        Ab = tvm.decl_buffer(a.shape, name="Abuf", offset_factor=1)
        Cb = tvm.decl_buffer(c.shape, name="Cbuf", offset_factor=1)

        def intrin_func(ins, outs):
            aa = ins[0]
            cc = outs[0]
            def _body():
                ib = tvm.ir_builder.create()
                ib.emit(tvm.call_extern("int32", "test",
                                        cc.access_ptr("w"),
                                        aa.access_ptr("r")))
                return ib.get()
            return _body()

        with tvm.build_config(offset_factor=1):
            return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, c: Cb})

    test_func = intrin_test()
    A = tvm.placeholder((20, 20), name='A')
    B = tvm.compute(A.shape, lambda i, j: A[i, j], name='B')
    C = tvm.compute((10, 20), lambda i: test_func(B[i:10, 0:20]), name='C')
    s = tvm.create_schedule(C.op)
    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    assert(bounds[B.op.axis[0]].extent.value == 10)
def test_scan():
    m = tvm.var("m")
    n = tvm.var("n")
    x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x")
    s_state = tvm.placeholder((m, n))
    s_init = tvm.compute((1, n), lambda _, i: x[0, i], name="s_init")
    x_trans = tvm.compute((m, n), lambda i, j: x[i, j] + 1, name="x_trans")
    s_up1 = tvm.compute((m, n), lambda t, i: s_state[t - 1, i] + 1, name="up1")
    s_update = tvm.compute((m, n), lambda t, i: s_up1[t, i] + x_trans[t, i], name="update")
    s_scan = tvm.scan(s_init, s_update, s_state)

    def test_getbody():
        body = tvm.schedule.ScanGetBody(s_scan.op)
        assert set(body) == set([s_scan.op, s_update.op, s_up1.op])

    def test_attach_path():
        s = tvm.create_schedule(s_scan.op)
        s[x_trans].compute_at(s[s_update], s_update.op.axis[0])
        apath = tvm.schedule.CreateAttachPath(s)
        assert(tuple(apath[s_update.op]) == tuple([s_scan.op.scan_axis]))
        assert(tuple(apath[x_trans.op]) == tuple([s_update.op.axis[0], s_scan.op.scan_axis]))

    def test_fix_pt():
        body = tvm.schedule.ScanGetBody(s_scan.op)
        fxpt = tvm.schedule.ScanFixPointAnalysis(s_scan.op, body)
        # spatial_axis_ lives on the scan op, not on the output tensor
        assert(fxpt[s_scan.op.spatial_axis_[0]].value != 0)
def test_storage_sync():
    m = tvm.var('m')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')
    A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1')
    A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2')

    s = tvm.create_schedule(A2.op)
    xo, xi = s[A2].split(A2.op.axis[0], factor=8)
    s[A2].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[A1].compute_at(s[A2], xo)
    s[A1].set_scope("shared")

    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    A2b = tvm.decl_buffer(A2.shape, A2.dtype, name='A2')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, A2: A2b}, 64)
    f = tvm.ir_pass.MakeAPI(stmt, "test", [Ab, A2b], 0, True)
    flist = tvm.ir_pass.SplitHostDevice(f)
    f = flist[1]
    f = tvm.ir_pass.ThreadSync(f, "shared")
    body_list = tvm.make.stmt_list(f.body.body.body.body)
    assert(body_list[1].value.name == "tvm_storage_sync")
def test_storage_share_gpu():
    m = tvm.var('m')
    A = [tvm.placeholder((m,), name='A')]
    num_stage = 5
    for t in range(num_stage):
        A.append(tvm.compute((m,), lambda i: A[-1][i] + (t+1), name='A%d_s' % t))
        A.append(tvm.compute((m,), lambda i: A[-1][i], name='A%d' % t))
    s = tvm.create_schedule(A[-1].op)
    for t in range(num_stage):
        x = A[2*t+2].op.axis[0]
        bx, tx = s[A[2*t+2]].split(x, factor=32)
        s[A[2*t+2]].bind(bx, tvm.thread_axis("blockIdx.x"))
        s[A[2*t+2]].bind(tx, tvm.thread_axis("threadIdx.x"))
        s[A[2*t+1]].compute_at(s[A[2*t+2]], tx)
        s[A[2*t+1]].set_scope("shared")

    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A[0].shape, A[0].dtype, name='A')
    Bb = tvm.decl_buffer(A[0].shape, A[0].dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A[0]: Ab, A[-1]: Bb}, 64)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.StorageRewrite(stmt)
    alloc_stats = {"global": 0, "shared": 0}

    def verify(n):
        if isinstance(n, tvm.stmt.AttrStmt):
            if n.attr_key == "storage_scope":
                alloc_stats[n.value.value] += 1
    tvm.ir_pass.PostOrderVisit(stmt, verify)
    assert alloc_stats["global"] == 2
    assert alloc_stats["shared"] == num_stage
def test_inplace_rule():
    m = 10
    A = tvm.placeholder((m,), name='A')
    A0 = tvm.compute((m,), lambda i: A[i], name='A0')
    A1 = tvm.compute((m,), lambda i: A[i] + 1, name='A1')
    AA = tvm.compute((m,), lambda i: A0[i] + A1[i] + A1[0], name='AA')
    B = tvm.compute((m,), lambda i: AA[i] + 1, name='B')
    s = tvm.create_schedule(B.op)
    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.StorageRewrite(stmt)
    # verify that inplace folding works: only two allocations remain
    num_alloc = [0]

    def verify(n):
        if isinstance(n, tvm.stmt.Allocate):
            num_alloc[0] += 1
    tvm.ir_pass.PostOrderVisit(stmt, verify)
    assert num_alloc[0] == 2
def my_clip(x, a_min, a_max):
    """Unlike topi's current clip, put min and max into two stages."""
    const_min = tvm.const(a_min, x.dtype)
    const_max = tvm.const(a_max, x.dtype)
    x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA")
    x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB")
    return x
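# A minimal usage sketch for my_clip (names and shapes are illustrative,
# assuming the TVM 0.x API used throughout this file). It shows the point of
# the two-stage split: "clipA" and "clipB" are separate stages that can be
# scheduled (e.g. inlined or fused) independently.
import tvm
import numpy as np

x = tvm.placeholder((16,), name="x")
y = my_clip(x, a_min=0.0, a_max=1.0)
s = tvm.create_schedule(y.op)
f = tvm.build(s, [x, y], "llvm")
ctx = tvm.cpu(0)
a = tvm.nd.array(np.random.uniform(-2, 2, size=16).astype(x.dtype), ctx)
b = tvm.nd.empty((16,), y.dtype, ctx)
f(a, b)
# max(min(x, a_max), a_min) is exactly np.clip
np.testing.assert_allclose(b.asnumpy(), np.clip(a.asnumpy(), 0.0, 1.0))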
def test_inplace_rule2(scope_tb="local_TB2", max_bits=1024 * 1024 * 1024):
    # Test Buffer
    register_mem(scope_tb, max_bits)
    m = 10
    A = tvm.placeholder((m,), name='A')
    C = tvm.placeholder((m,), name='C')
    D = tvm.placeholder((m,), name='D')
    A0 = tvm.compute((m,), lambda i: A[i] + C[i], name='A0')
    A1 = tvm.compute((m,), lambda i: D[i] * D[i], name='A1')
    A2 = tvm.compute((m,), lambda i: A0[i] + A1[i], name='A2')
    B = tvm.compute((m,), lambda i: A2[i], name='B')
    s = tvm.create_schedule(B.op)
    A0L = s.cache_read(A0, scope_tb, [A2])
    A1L = s.cache_read(A1, scope_tb, [A2])
    A2L = s.cache_read(A2, scope_tb, [B])
    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    Cc = tvm.decl_buffer(C.shape, B.dtype, name='C')
    Dd = tvm.decl_buffer(D.shape, B.dtype, name='D')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb, C: Cc, D: Dd}, 64)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.StorageRewrite(stmt)
    # verify that inplace folding works: only two allocations remain
    num_alloc = [0]

    def verify(n):
        if isinstance(n, tvm.stmt.Allocate):
            num_alloc[0] += 1
    tvm.ir_pass.PostOrderVisit(stmt, verify)
    assert num_alloc[0] == 2
def binary_dense(data, weight):
    """Binary matrix multiplication using xor and bit-count.

    Parameters
    ----------
    data : tvm.Tensor
        2-D with shape [batch, in_dim], dtype is uint32.

    weight : tvm.Tensor
        2-D with shape [out_dim, in_dim], dtype is uint32.

    Returns
    -------
    output : tvm.Tensor
        2-D with shape [batch, out_dim], dtype is float32.
    """
    assert data.dtype == 'uint32' and weight.dtype == 'uint32', \
        "dtype of data and weight should be uint32"
    assert len(data.shape) == 2 and len(weight.shape) == 2, \
        "only support 2-dim binary dense"
    batch, in_dim = data.shape
    out_dim, _ = weight.shape
    k = tvm.reduce_axis((0, in_dim), name='k')
    matmul = tvm.compute((batch, out_dim), lambda i, j: \
                         tvm.sum(tvm.popcount(data[i, k] ^ weight[j, k]), axis=k), \
                         tag='binary_dense')

    return tvm.compute((batch, out_dim), lambda i, j: \
                       32 * in_dim - 2. * matmul(i, j), \
                       tag=tag.ELEMWISE)
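# Hedged usage sketch for binary_dense: the inputs are assumed to already be
# bitpacked uint32 tensors (in a real pipeline they would come from a bitpack
# step like the one used by the bitserial functions elsewhere in this file);
# the sizes are illustrative and only the default schedule is built.
import tvm

data = tvm.placeholder((8, 4), dtype='uint32', name='data')      # [batch, in_dim]
weight = tvm.placeholder((16, 4), dtype='uint32', name='weight') # [out_dim, in_dim]
out = binary_dense(data, weight)
s = tvm.create_schedule(out.op)
f = tvm.build(s, [data, weight, out], "llvm")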
def test_bound_nest_thread():
    m = tvm.var('m')
    A = tvm.placeholder((m,), name='A')
    A1 = tvm.compute((m,), lambda i: A[i], name='A1')
    A2 = tvm.compute((m,), lambda i: A1[i] + 2, name='A2')
    A3 = tvm.compute((m,), lambda i: A2[i] + 3, name='A3')

    s = tvm.create_schedule(A3.op)
    s[A2].set_scope("shared")
    s[A1].set_scope("local")

    block_x = tvm.thread_axis("blockIdx.x")
    thread_x = tvm.thread_axis("threadIdx.x")
    bx, tx = s[A3].split(A3.op.axis[0], factor=32)
    s[A3].bind(bx, block_x)
    s[A3].bind(tx, thread_x)
    s[A2].compute_at(s[A3], tx)
    _, xi = s[A2].split(A2.op.axis[0], nparts=1)
    s[A2].bind(xi, thread_x)
    s[A1].compute_at(s[A3], tx)
    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    assert(bounds[A1.op.axis[0]].extent.value == 1)
    assert(bounds[A2.op.axis[0]].extent.value == 32)
    assert(bounds[A3.op.axis[0]].extent == m)
def test_double_splitting_with_indivisible_factors():
    m = 48
    dtype = "float32"
    A = tvm.placeholder((m,), name='A', dtype=dtype)
    C = tvm.compute((m,), lambda i: A[i], name='C')
    D = tvm.compute((m,), lambda i: C[i], name='D')

    s = tvm.create_schedule(D.op)
    co, ci = s[C].split(C.op.axis[0], factor=10)
    do, di = s[D].split(D.op.axis[0], 32)
    s[C].compute_at(s[D], do)

    target = 'llvm'
    with tvm.build_config(partition_const_loop=True):
        f = tvm.lower(s, [A, C, D], name="fadd1", simple_mode=False)
        func = tvm.build(f, target=target)

    # find the beginning of the Halide IR corresponding to kernel code
    # and make sure no if statements are left in it
    top_produce = find_top_produce(f.body)
    assert(not any(collect_visit(top_produce,
                                 lambda x: isinstance(x, tvm.stmt.IfThenElse))))

    # check functional correctness of generated code
    ctx = tvm.context(target, 0)
    a = tvm.nd.array(numpy.ones(m,).astype(dtype), ctx)
    c = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx)
    d = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx)
    func(a, c, d)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy(), rtol=1e-5)
    tvm.testing.assert_allclose(d.asnumpy(), a.asnumpy(), rtol=1e-5)
def test_llvm_persist_parallel():
    n = 128
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1, name='B')
    C = tvm.compute(A.shape, lambda *i: tvm.sqrt(B(*i)) * 2 + 2, name='C')
    s = tvm.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], factor=8)
    xo1, xo2 = s[C].split(xo, nparts=1)
    s[B].compute_at(s[C], xo1)
    s[B].parallel(s[B].op.axis[0])
    s[B].pragma(s[B].op.axis[0], "parallel_barrier_when_finish")
    s[C].parallel(xi)
    s[C].pragma(xo1, "parallel_launch_point")
    s[C].pragma(xi, "parallel_stride_pattern")

    def check_llvm():
        if not tvm.module.enabled("llvm"):
            return
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], "llvm")
        ctx = tvm.cpu(0)
        # launch the kernel.
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, c)
        tvm.testing.assert_allclose(c.asnumpy(),
                                    np.sqrt(a.asnumpy() + 1) * 2 + 2,
                                    rtol=1e-5)
    check_llvm()
def _declaration_dense_nopack(cfg, data, weight, bias=None, out_dtype=None):
    if out_dtype is None:
        out_dtype = data.dtype
    batch, in_dim = get_const_tuple(data.shape)
    out_dim, _ = get_const_tuple(weight.shape)
    # create tuning space
    cfg.define_split("tile_x", out_dim, num_outputs=2)
    cfg.define_split("tile_y", batch, num_outputs=2)
    cfg.define_split("tile_k", in_dim, num_outputs=2)
    if cfg.is_fallback:
        _default_dense_nopack_config(cfg, batch, out_dim, in_dim)

    vec = cfg["tile_k"].size[-1]
    k = tvm.reduce_axis((0, in_dim // vec), "k")
    CC = tvm.compute((batch, out_dim, vec),
                     lambda z, y, x: tvm.sum(
                         data[z, k * vec + x].astype(out_dtype) *
                         weight[y, k * vec + x].astype(out_dtype), axis=k))

    kk = tvm.reduce_axis((0, vec), "kk")
    C = tvm.compute((batch, out_dim),
                    lambda y, x: tvm.sum(CC[y, x, kk], axis=kk),
                    tag="dense_nopack")
    if bias is not None:
        C = tvm.compute((batch, out_dim),
                        lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                        tag=tag.BROADCAST)
    return C
def _declaration_dense_pack(cfg, data, weight, bias=None, out_dtype=None):
    if out_dtype is None:
        out_dtype = data.dtype
    batch, in_dim = get_const_tuple(data.shape)
    out_dim, _ = get_const_tuple(weight.shape)
    # create tuning space
    cfg.define_split("tile_y", batch, num_outputs=3)
    cfg.define_split("tile_x", out_dim, num_outputs=3)
    cfg.define_split("tile_k", in_dim, num_outputs=2)
    if cfg.is_fallback:
        _default_dense_pack_config(cfg, batch, out_dim, in_dim)

    packw_bn = cfg["tile_x"].size[-1]
    packw_shape = (out_dim // packw_bn, in_dim, packw_bn)
    packw = tvm.compute(packw_shape,
                        lambda z, y, x: weight[z * packw_bn + x, y],
                        name="packed_weight")

    k = tvm.reduce_axis((0, in_dim), name="k")
    C = tvm.compute((batch, out_dim),
                    lambda y, x: tvm.sum(
                        data[y, k].astype(out_dtype) *
                        packw[x // packw_bn, k, x % packw_bn].astype(out_dtype),
                        axis=k),
                    tag="dense_pack")
    if bias is not None:
        C = tvm.compute((batch, out_dim),
                        lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                        tag=tag.BROADCAST)
    return C
def dense_default(data, weight, bias=None):
    """The default implementation of dense in topi.

    Parameters
    ----------
    data : tvm.Tensor
        2-D with shape [batch, in_dim]

    weight : tvm.Tensor
        2-D with shape [out_dim, in_dim]

    bias : tvm.Tensor, optional
        1-D with shape [out_dim]

    Returns
    -------
    output : tvm.Tensor
        2-D with shape [batch, out_dim]
    """
    assert len(data.shape) == 2 and len(weight.shape) == 2, \
        "only support 2-dim dense"
    if bias is not None:
        assert len(bias.shape) == 1
    batch, in_dim = data.shape
    out_dim, _ = weight.shape
    k = tvm.reduce_axis((0, in_dim), name='k')
    matmul = tvm.compute((batch, out_dim), \
                         lambda i, j: tvm.sum(data[i, k] * weight[j, k], axis=k), \
                         tag='dense')
    if bias is not None:
        matmul = tvm.compute((batch, out_dim), \
                             lambda i, j: matmul[i, j] + bias[j], \
                             tag=tag.BROADCAST)
    return matmul
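# A small end-to-end sketch for dense_default, assuming an LLVM-enabled TVM
# build; shapes are illustrative and the default schedule is used as-is.
import tvm
import numpy as np

data = tvm.placeholder((2, 8), name='data')
weight = tvm.placeholder((3, 8), name='weight')
out = dense_default(data, weight)
s = tvm.create_schedule(out.op)
f = tvm.build(s, [data, weight, out], "llvm")
ctx = tvm.cpu(0)
a = tvm.nd.array(np.random.uniform(size=(2, 8)).astype('float32'), ctx)
w = tvm.nd.array(np.random.uniform(size=(3, 8)).astype('float32'), ctx)
c = tvm.nd.empty((2, 3), out.dtype, ctx)
f(a, w, c)
# dense computes data . weight^T
np.testing.assert_allclose(c.asnumpy(), a.asnumpy().dot(w.asnumpy().T), rtol=1e-5)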
def test_multiple_kernels():
    N = 1024
    A = tvm.placeholder((N, N), name='A')
    B = tvm.compute((N, N), lambda i, j: A[i, j])
    C = tvm.compute((N, N), lambda i, j: B[i, j])
    s = tvm.create_schedule([C.op])
    s[C].bind(s[C].op.axis[1], tvm.thread_axis("threadIdx.x"))
    s[B].bind(s[B].op.axis[1], tvm.thread_axis("threadIdx.x"))
    # shared memory usage: 0
    # thread usage: N
    for target in ['opencl', 'cuda']:
        if not tvm.context(target).exist:
            continue
        valid = [None]
        with tvm.build_config(**{"add_lower_pass": [
                (2, get_verify_pass(valid,
                                    max_shared_memory_per_block=0,
                                    max_threads_per_block=N - 1))]}):
            tvm.build(s, [A, C], target)
        assert not valid[0]

        with tvm.build_config(**{"add_lower_pass": [
                (2, get_verify_pass(valid,
                                    max_shared_memory_per_block=0,
                                    max_threads_per_block=N))]}):
            tvm.build(s, [A, C], target)
        assert valid[0]
def test_in_bounds_vectorize_llvm():
    n = 512
    lanes = 2
    A = tvm.placeholder((n,), name='A', dtype="float32x%d" % lanes)
    B = tvm.compute((n,), lambda i: A[i], name='B')
    C = tvm.compute((n,), lambda i: B[i] + tvm.const(1, A.dtype), name='C')
    s = tvm.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], nparts=2)
    _, xi = s[C].split(xi, factor=2)
    s[C].parallel(xo)
    s[C].vectorize(xi)
    s[B].compute_at(s[C], xo)
    xo, xi = s[B].split(B.op.axis[0], factor=2)
    s[B].vectorize(xi)
    # build and invoke the kernel.
    lowered_func = tvm.lower(s, [A, C], "llvm", simple_mode=False)
    print(lowered_func.body)
    f = tvm.build(s, [A, C], "llvm")
    ctx = tvm.cpu(0)
    # launch the kernel.
    a = tvm.nd.empty((n,), A.dtype).copyfrom(np.random.uniform(size=(n, lanes)))
    c = tvm.nd.empty((n,), C.dtype, ctx)
    f(a, c)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
def test_copy_pad_split():
    m = 4 * 3
    A = tvm.placeholder((m,), name="A")
    Apad = tvm.compute((m + 2,),
                       lambda i: tvm.select(tvm.all(i >= 1, i <= m), A[i - 1], 0.0),
                       "Apad")
    B = tvm.compute((m,), lambda i: Apad[i] + Apad[i + 1] + Apad[i + 2])
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=4)
    s[Apad].compute_at(s[B], xo)
    s[Apad].pragma(s[Apad].op.axis[0], "memcpy")
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)

    def cb(src, dst, pad_before, pad_after, pad_value):
        assert(dst.elem_offset.value == 0)
        assert_expr_equal(src.elem_offset, tvm.max(xo * 4, 1) - 1)
        rpad_before = tvm.max(1 - xo * 4, 0)
        rpad_after = tvm.max(xo * 4 - 7, 0)
        assert_expr_equal(pad_before[0], rpad_before)
        assert_expr_equal(pad_after[0], rpad_after)
        assert_expr_equal(src.shape[0], 6 - rpad_before - rpad_after)
        return tvm.make.Evaluate(0)

    stmt = tvm.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb)
def test_schedule_create():
    m = tvm.var('m')
    n = tvm.var('n')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')
    B = tvm.placeholder((n, l), name='B')
    AA = tvm.compute((m, l), lambda i, j: A[i, j])
    T = tvm.compute((m, n, l), lambda i, j, k: AA(i, k) * B(j, k))
    s = tvm.create_schedule(T.op)
    s[AA].set_scope("shared")
    xo, xi = s[T].split(T.op.axis[0], factor=10)
    xi1, xi2 = s[T].split(xi, factor=2)
    s[AA].compute_at(s[T], xi1)
    xo, xi = s[AA].split(AA.op.axis[0], factor=10)
    s[T].reorder(xi2, xi1)
    assert T.op.axis[1] in s[T].leaf_iter_vars

    # save load json
    json_str = tvm.save_json(s)
    s_loaded = tvm.load_json(json_str)
    assert isinstance(s_loaded, tvm.schedule.Schedule)
    assert(str(s_loaded.outputs[0].body) == str(s.outputs[0].body))

    # pickle unpickle
    dump = pkl.dumps(s)
    s_loaded = pkl.loads(dump)
    assert isinstance(s_loaded, tvm.schedule.Schedule)
    assert(str(s_loaded.outputs[0].body) == str(s.outputs[0].body))
def global_pool(data, pool_type):
    """Perform global pooling on the data

    Parameters
    ----------
    data : tvm.Tensor
        4-D with shape [batch, channel, in_height, in_width]

    pool_type : str
        Pool type, 'max' or 'avg'

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [batch, channel, 1, 1]
    """
    assert len(data.shape) == 4, "only support 4-dim pooling"
    batch, channel, height, width = data.shape
    dheight = tvm.reduce_axis((0, height))
    dwidth = tvm.reduce_axis((0, width))

    if pool_type == 'max':
        return tvm.compute((batch, channel, 1, 1), lambda n, c, h, w: \
            tvm.max(data[n, c, dheight, dwidth], axis=[dheight, dwidth]), \
            tag="global_pool_max")
    elif pool_type == 'avg':
        tsum = tvm.compute((batch, channel, 1, 1), lambda n, c, h, w: \
            tvm.sum(data[n, c, dheight, dwidth], axis=[dheight, dwidth]), \
            tag="global_pool_sum")
        return tvm.compute((batch, channel, 1, 1), lambda n, c, h, w: \
            tsum[n, c, h, w] / (height*width).astype(tsum.dtype), \
            tag=tag.ELEMWISE)
    else:
        raise ValueError("Pool type should be 'avg' or 'max'.")
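# Hedged usage sketch for the 'avg' branch of global_pool, assuming LLVM is
# enabled; the NCHW shape is illustrative.
import tvm
import numpy as np

data = tvm.placeholder((1, 3, 8, 8), name='data')
out = global_pool(data, 'avg')
s = tvm.create_schedule(out.op)
f = tvm.build(s, [data, out], "llvm")
ctx = tvm.cpu(0)
a = tvm.nd.array(np.random.uniform(size=(1, 3, 8, 8)).astype('float32'), ctx)
b = tvm.nd.empty((1, 3, 1, 1), out.dtype, ctx)
f(a, b)
np.testing.assert_allclose(b.asnumpy(),
                           a.asnumpy().mean(axis=(2, 3), keepdims=True),
                           rtol=1e-5)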
def test_scan_group():
    m = tvm.var("m")
    n = tvm.var("n")
    x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x")
    s_state = tvm.placeholder((m, n))
    s_init = tvm.compute((1, n), lambda _, i: x[0, i])

    s_update1 = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + x[t, i])
    s_update2 = tvm.compute((m, n), lambda t, i: s_update1[t, i] + 1)
    s_update3 = tvm.compute((m, n), lambda t, i: s_update2[t, i] + 1)
    res = tvm.scan(s_init, s_update3, s_state, inputs=x)

    s = tvm.create_schedule(res.op)
    assert s[s_update1].group is not None
    assert s[s_update2].group == s[s_update1].group
    # assigning within the group is valid
    s[s_update1].compute_at(s[s_update2], s_update2.op.axis[1])
    # create a new group for [s_update2, s_update1]
    g2 = s.create_group(outputs=s_update2, inputs=[s_state, x])
    assert g2.group is not None
    assert g2.group == s[s_update3].group
    assert s[s_update2].group == g2
    assert s[s_update1].group == g2
    g2.compute_at(s[s_update3], s_update3.op.axis[1])
    assert g2.attach_stage == s[s_update3]
    try:
        # computing outside the group is an error
        s[s_update2].compute_at(s[s_init], s_init.op.axis[0])
        assert False
    except tvm.TVMError:
        pass
def _spatial_pack(data, kernel, stride, padding, out_dtype):
    """ Compute convolution with pack on spatial axes. """
    assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
    sch = _get_schedule(wkl)

    H, W = wkl.height, wkl.width
    CI, CO = wkl.in_filter, wkl.out_filter
    KH, KW = wkl.hkernel, wkl.wkernel
    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride
    HCAT, WCAT = KH-1, KW-1

    VH = sch.vh
    VW = sch.vw
    VC = sch.vc
    UNROLL = sch.unroll

    TH = H + 2*HPAD
    TW = W + 2*WPAD
    OH = (H + 2*HPAD - KH) // HSTR + 1
    OW = (W + 2*WPAD - KW) // WSTR + 1

    dshape = (1, CI, H, W)
    dpshape = (1, CI, TH, TW)
    dvshape = (1, TH//(VH*HSTR), TW//(VW*WSTR), CI, VH*HSTR+HCAT, VW*WSTR+WCAT)

    kshape = (CO, CI, KH, KW)
    kvshape = (CO//VC, CI, KH, KW, VC)  # integer division: CO/VC would yield a float shape

    ovshape = (1, CO // VC, OH // VH, OW // VW, VH, VW, VC)
    oshape = (1, CO, OH, OW)

    DOPAD = (HPAD != 0 and WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data

    data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw: \
        data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], name='data_vec')

    kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, vc: \
        kernel[co*VC+vc][ci][dh][dw], name='kernel_vec')

    ci = tvm.reduce_axis((0, CI), name='ci')
    dh = tvm.reduce_axis((0, KH), name='dh')
    dw = tvm.reduce_axis((0, KW), name='dw')

    conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
        tvm.sum(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw].astype(out_dtype) *
                kernel_vec[co, ci, dh, dw, vc].astype(out_dtype),
                axis=[ci, dh, dw]), name='conv')

    # h//VH (not h/VH) so the unpack index stays integral
    output = tvm.compute(oshape, lambda n, co, h, w:
                         conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
                         name='output_unpack', tag='spatial_conv_output')

    return output
def test_tensor_reduce_multi_axis():
    m = tvm.var('m')
    n = tvm.var('n')
    A = tvm.placeholder((m, n), name='A')
    k1 = tvm.reduce_axis((0, n), "k")
    k2 = tvm.reduce_axis((0, m), "k")
    # the axis argument accepts both a tuple and a list
    C = tvm.compute((1,), lambda _: tvm.sum(A[k1, k2], axis=(k1, k2)))
    C = tvm.compute((1,), lambda _: tvm.sum(A[k1, k2], axis=[k1, k2]))
def test_scan3_not_exact_reach():
    s_h1 = tvm.compute((l, n, m), lambda t, j, i: s_state[t-1, i, j], name="h1")
    s_h2 = tvm.compute((l, m, n), lambda t, i, j: s_state[t-1, i, 10] * 2, name="h2")
    s_update = tvm.compute((l, m, n), lambda t, i, j: s_h1[t, j, i] + s_h2[t, i, j], name="update")
    s_scan = tvm.scan(s_init, s_update, s_state)
    body = tvm.schedule.ScanGetBody(s_scan.op)
    fxpt = tvm.schedule.ScanFixPointAnalysis(s_scan.op)
    assert(fxpt[s_scan.op.spatial_axis_[0]].value == 1)
    assert(fxpt[s_scan.op.spatial_axis_[1]].value == 0)
def test_scan4_reach_other():
    s_h1 = tvm.compute((l, n, m), lambda t, j, i: s_state[t-1, j, j], name="h1")
    s_h2 = tvm.compute((l, m, n), lambda t, i, j: s_state[t-1, i, j] * 2, name="h2")
    s_update = tvm.compute((l, m, n), lambda t, i, j: s_h1[t, j, i] + s_h2[t, i, j], name="update")
    s_scan = tvm.scan(s_init, s_update, s_state)
    fxpt = tvm.schedule.ScanFixPointAnalysis(s_scan.op)
    assert(fxpt[s_scan.op.spatial_axis_[0]].value == 0)
    assert(fxpt[s_scan.op.spatial_axis_[1]].value == 0)
def test_tensor_scan():
    m = tvm.var("m")
    n = tvm.var("n")
    x = tvm.placeholder((m, n))
    s = tvm.placeholder((m, n))
    res = tvm.scan(tvm.compute((1, n), lambda _, i: x[0, i]),
                   tvm.compute((m, n), lambda t, i: s[t-1, i] + x[t, i]),
                   s)
    assert tuple(res.shape) == (m, n)
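# The scan above is a row-wise cumulative sum over axis 0. A hedged sketch of
# actually running it with concrete sizes (assuming LLVM is enabled):
import tvm
import numpy as np

m, n = 4, 5
X = tvm.placeholder((m, n), name="X")
state = tvm.placeholder((m, n))
init = tvm.compute((1, n), lambda _, i: X[0, i])
update = tvm.compute((m, n), lambda t, i: state[t-1, i] + X[t, i])
res = tvm.scan(init, update, state, inputs=[X])
s = tvm.create_schedule(res.op)
f = tvm.build(s, [X, res], "llvm")
ctx = tvm.cpu(0)
a = tvm.nd.array(np.random.uniform(size=(m, n)).astype("float32"), ctx)
b = tvm.nd.empty((m, n), res.dtype, ctx)
f(a, b)
np.testing.assert_allclose(b.asnumpy(), np.cumsum(a.asnumpy(), axis=0), rtol=1e-5)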
def _spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, out_dtype):
    """ Compute convolution with pack on spatial axes. """
    assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1"
    wkl = _get_workload(data, kernel, stride, padding, out_dtype, "NHWC")
    sch = _get_schedule(wkl, "NHWC")
    VH = sch.vh
    VW = sch.vw
    VC = sch.vc

    data_q = bitpack(data, activation_bits, pack_axis=3, bit_axis=3, pack_type='uint8')
    kernel_vec = _kernel_vec_spatial_pack_nhwc(kernel, weight_bits, VC)
    N, H, W, IB, CI = data_q.shape
    OCO, KH, KW, KB, VC, _ = kernel_vec.shape
    CO = OCO * VC
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)

    if isinstance(stride, (tuple, list)):
        HSTR, WSTR = stride
    else:
        HSTR, WSTR = stride, stride
    HCAT, WCAT = KH-1, KW-1

    PAD_H = H + 2*HPAD
    PAD_W = W + 2*WPAD
    OH = (H + 2*HPAD - KH) // HSTR + 1
    OW = (W + 2*WPAD - KW) // WSTR + 1
    dvshape = (N, PAD_H//(VH*HSTR), PAD_W//(VW*WSTR), VH*HSTR+HCAT, VW*WSTR+WCAT, IB, CI)
    ovshape = (1, OH // VH, OW // VW, CO // VC, VH, VW, VC)
    oshape = (1, OH, OW, CO)

    if (HPAD != 0 and WPAD != 0):
        data_pad = pad(data_q, (0, HPAD, WPAD, 0, 0), name="data_pad")
    else:
        data_pad = data_q

    data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, b, ci: \
        data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][b][ci], name='data_vec')

    ci = tvm.reduce_axis((0, CI), name='ci')
    dh = tvm.reduce_axis((0, KH), name='dh')
    dw = tvm.reduce_axis((0, KW), name='dw')
    ib = tvm.reduce_axis((0, IB), name='ib')
    kb = tvm.reduce_axis((0, KB), name='kb')

    def _conv(n, h, w, co, vh, vw, vc):
        return tvm.sum((tvm.popcount(
            kernel_vec[co, dh, dw, kb, vc, ci].astype('uint16') &
            data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ib, ci].astype('uint16'))
                        << (kb + ib).astype('uint16')),
                       axis=[dh, dw, kb, ib, ci])

    conv = tvm.compute(ovshape, _conv, name='conv')

    return tvm.compute(oshape, lambda n, h, w, co:
                       conv[n][h//VH][w//VW][co//VC][h%VH][w%VW][co%VC].astype(out_dtype),
                       name='output_vec', tag='spatial_bitserial_conv_nhwc')
def test_schedule_bound_condition():
    A = tvm.placeholder((64,), name='A', dtype="float32")
    Apad = tvm.compute((66,), lambda i: tvm.select(
        tvm.all(i > 0, i < 65), A[i-1], tvm.const(0.)), name='Apad')
    Apad2 = tvm.compute((66,), lambda i: Apad[i]*2, name='Apad2')
    s = tvm.create_schedule(Apad2.op)
    AL1 = s.cache_read(A, "local", [Apad])
    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    stmt = tvm.ir_pass.Simplify(stmt)
    assert(isinstance(stmt.body.body.first.body.body.then_case, tvm.stmt.IfThenElse))
def test_bound_conv1d():
    n = tvm.var('n')
    A = tvm.compute((n+2), lambda i: 1, name='A')

    def computeB(ii):
        i = ii + 1
        return A[i-1] + A[i] + A[i+1]

    B = tvm.compute(n, computeB, name='B')
    s = tvm.create_schedule(B.op)
    s[A].compute_at(s[B], B.op.axis[0])
    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    assert(bounds[A.op.axis[0]].extent.value == 3)
def test_replace_dataflow():
    shape = (255,)
    A = tvm.placeholder(shape, name="A")
    B = tvm.compute(shape, lambda i: A[i] + A[i], name="B")
    C = tvm.compute(shape, lambda i: A[i] + B[i], name="C")
    D = tvm.compute(shape, lambda i: A[i] + C[i], name="D")
    E = tvm.compute(shape, lambda i: A[i] + D[i], name="E")

    s = tvm.create_schedule(E.op)
    s.cache_read(A, "local", [B, C, D, E])
    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
def test_bound1():
    m = tvm.var('m')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')
    A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1')
    A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2')

    s = tvm.create_schedule([A2.op])
    xo, xi = s[A2].split(s[A2].op.axis[0], 8)
    s[A1].compute_at(s[A2], xo)
    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    assert(bounds[A1.op.axis[0]].extent.value == 8)
def _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, num_tile):
    assert layout == "NCHW", "Only support NCHW"
    # create workload according to raw arguments
    out_dtype = out_dtype or data.dtype
    N, CI, IH, IW = get_const_tuple(data.shape)

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    if len(kernel.shape) == 4:
        pre_packed = False
        CO, _, KH, KW = get_const_tuple(kernel.shape)
    else:  # kernel tensor is pre packed
        pre_packed = True
        CO, _, KH, KW, VC = get_const_tuple(kernel.shape)
        CO = CO * VC

    dilated_kernel_h = (KH - 1) * dilation_h + 1
    dilated_kernel_w = (KW - 1) * dilation_w + 1
    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)

    OH = (IH + pad_top + pad_bottom - dilated_kernel_h) // HSTR + 1
    OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1
    data_pad = pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_bottom, pad_right])

    # ==================== define configuration space ====================
    n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW)
    ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW)

    if num_tile == 2:     # for arm cpu
        co, vc = cfg.define_split('tile_co', co, num_outputs=2)
        oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2)
        ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2)
    elif num_tile == 3:   # for mali gpu
        co, _, vc = cfg.define_split('tile_co', co, num_outputs=3)
        oh, _, vh = cfg.define_split('tile_oh', oh, num_outputs=3)
        ow, _, vw = cfg.define_split('tile_ow', ow, num_outputs=3)
    else:
        raise RuntimeError("Invalid num_tile")

    cfg.define_reorder("reorder_0",
                       [n, co, oh, ow, ci, kh, kw, vh, vw, vc],
                       policy='candidate', candidate=[
                           [n, co, oh, ow, ci, kh, kw, vh, vw, vc],
                           [n, co, oh, ow, ci, kh, kw, vc, vh, vw]])

    cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll')
    cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec')

    # fallback support
    if cfg.is_fallback:
        if num_tile == 2:     # arm cpu
            ref_log = autotvm.tophub.load_reference_log('arm_cpu', 'rk3399', 'conv2d', 'direct')
            cfg.fallback_with_reference_log(ref_log)
        elif num_tile == 3:  # mali gpu
            ref_log = autotvm.tophub.load_reference_log('mali', 'rk3399', 'conv2d', 'direct')
            cfg.fallback_with_reference_log(ref_log)
    # ====================================================================

    VC = cfg["tile_co"].size[-1]
    VH = cfg["tile_oh"].size[-1]
    VW = cfg["tile_ow"].size[-1]

    kvshape = (CO // VC, CI, KH, KW, VC)
    ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC)
    oshape = (N, CO, OH, OW)

    if dilation_h != 1 or dilation_w != 1:
        # undilate input data
        dvshape = (N, OH // VH, OW // VW, CI, KH, KW, VH, VW)
        data_vec = tvm.compute(dvshape, lambda n, h, w, ci, kh, kw, vh, vw:
                               data_pad[n][ci][(h*VH+vh)*HSTR+kh*dilation_h]
                               [(w*VW+vw)*WSTR+kw*dilation_w],
                               name='data_vec_undilated')
    else:
        dvshape = (N, OH // VH, OW // VW, CI, VH*HSTR + KH-1, VW*WSTR + KW-1)
        data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw:
                               data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw],
                               name='data_vec')

    if pre_packed:
        kernel_vec = kernel
    else:
        kernel_vec = tvm.compute(kvshape, lambda co, ci, kh, kw, vc:
                                 kernel[co*VC+vc][ci][kh][kw],
                                 name='kernel_vec')

    ci = tvm.reduce_axis((0, CI), name='ci')
    kh = tvm.reduce_axis((0, KH), name='kh')
    kw = tvm.reduce_axis((0, KW), name='kw')

    if dilation_h != 1 or dilation_w != 1:
        conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
            tvm.sum(data_vec[n, h, w, ci, kh, kw, vh, vw].astype(out_dtype) *
                    kernel_vec[co, ci, kh, kw, vc].astype(out_dtype),
                    axis=[ci, kh, kw]), name='conv')
    else:
        conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
            tvm.sum(data_vec[n, h, w, ci, vh*HSTR+kh, vw*WSTR+kw].astype(out_dtype) *
                    kernel_vec[co, ci, kh, kw, vc].astype(out_dtype),
                    axis=[ci, kh, kw]), name='conv')

    output = tvm.compute(oshape, lambda n, co, h, w:
                         conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
                         name='output_unpack', tag='spatial_conv2d_output')
    return output
import nnpu
import tvm
import topi
from nnpu.utils import ScheduleProcHelper
import numpy as np

with ScheduleProcHelper():
    env = nnpu.get_env()
    nnpu.set_device(env, type='S0')

    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    a = tvm.placeholder((2, 4, 16), dtype_n, 'a')
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a')

    pad_buf = tvm.compute((2, 6, 16),
                          lambda i, j, k: tvm.expr.Select(
                              j >= 2, a_buf[i, j - 2, k], tvm.const(0, dtype_n)),
                          'pad')
    nnpu.utils.MarkScope(pad_buf)
    nnpu.utils.PragmaCopy(pad_buf)
    tile_host, _ = nnpu.utils.CopyBufToH(pad_buf, 'tile')

    s = nnpu.create_schedule([tile_host.op])
    print(tvm.lower(s, [a, tile_host], simple_mode=True))
    print(nnpu.lower(s, [a, tile_host], simple_mode=True))
    # exit(0)
    func = nnpu.build(s, [a, tile_host], 'nnpu', 'llvm', name='nnpu_func')

    ctx = tvm.nd.TVMContext(13, 0)
    # the original snippet breaks off mid-call here; high=127 is an assumed
    # completion that matches the int8-style low=-128 bound
    a_np = np.random.randint(size=(2, 4, 16), dtype=a.dtype, low=-128,
                             high=127)
def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size):
    N, CI, IH, IW = get_const_tuple(data.shape)

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    if len(kernel.shape) == 4:
        if dilation_h != 1 or dilation_w != 1:
            kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))
        pre_computed = False
        CO, _, KH, KW = get_const_tuple(kernel.shape)
    else:
        assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
        pre_computed = True
        H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
        CO *= VC
        KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)

    assert layout == 'NCHW'
    assert KH == 3 and KW == 3 and HPAD == 1 and WPAD == 1 and HSTR == 1 and WSTR == 1
    data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")

    if tile_size == 4:
        G_data = np.array([
            [1 / 4.0, 0, 0],
            [-1 / 6.0, -1 / 6.0, -1 / 6.0],
            [-1 / 6.0, 1 / 6.0, -1 / 6.0],
            [1 / 24.0, 1 / 12.0, 1 / 6.0],
            [1 / 24.0, -1 / 12.0, 1 / 6.0],
            [0, 0, 1]], dtype=np.float32)

        B_data = np.array([
            [4, 0, 0, 0, 0, 0],
            [0, -4, 4, -2, 2, 4],
            [-5, -4, -4, -1, -1, 0],
            [0, 1, -1, 2, -2, -5],
            [1, 1, 1, 1, 1, 0],
            [0, 0, 0, 0, 0, 1]], out_dtype)

        A_data = np.array([
            [1, 0, 0, 0],
            [1, 1, 1, 1],
            [1, -1, 1, -1],
            [1, 2, 4, 8],
            [1, -2, 4, -8],
            [0, 0, 0, 1]], out_dtype)
    elif tile_size == 2:
        G_data = np.array([
            [1, 0, 0],
            [1.0/2, 1.0/2, 1.0/2],
            [1.0/2, -1.0/2, 1.0/2],
            [0, 0, 1]], np.float32)

        B_data = np.array([
            [1, 0, 0, 0],
            [0, 1, -1, 1],
            [-1, 1, 1, 0],
            [0, 0, 0, -1]], out_dtype)

        A_data = np.array([
            [1, 0],
            [1, 1],
            [1, -1],
            [0, -1]], out_dtype)
    else:
        raise ValueError("Unsupported tile size for winograd: " + str(tile_size))

    m = A_data.shape[1]
    r = 3
    alpha = m + r - 1
    K = CO
    C = CI

    H = (IH + 2 * HPAD - 3) // HSTR + 1
    W = (IW + 2 * WPAD - 3) // WSTR + 1
    nH, nW = (H + m-1) // m, (W + m-1) // m
    P = N * nH * nW

    cfg.define_split('tile_p', cfg.axis(P), num_outputs=2, filter=lambda x: x.size[-1] <= 16)
    cfg.define_split('tile_k', cfg.axis(K), num_outputs=2, filter=lambda x: x.size[-1] <= 16)
    VP = cfg['tile_p'].size[-1]
    VK = cfg['tile_k'].size[-1]

    # pack input tile
    input_tile = tvm.compute((C, P // VP, alpha, alpha, VP),
                             lambda c, b, eps, nu, bb:
                             data_pad[(b*VP+bb) // (nH*nW)][c]
                             [(b*VP+bb) // nW % nH * m + eps]
                             [(b*VP+bb) % nW * m + nu],
                             name='d')

    # transform kernel
    if pre_computed:
        U = kernel
    else:
        G = const_matrix(G_data, 'G')
        r_kh = tvm.reduce_axis((0, KH), 'r_kh')
        r_kw = tvm.reduce_axis((0, KW), 'r_kw')
        U = tvm.compute((alpha, alpha, K // VK, C, VK),
                        lambda eps, nu, k, c, kk:
                        tvm.sum(kernel[k * VK + kk][c][r_kh][r_kw].astype(out_dtype) *
                                G[eps][r_kh] * G[nu][r_kw],
                                axis=[r_kh, r_kw]),
                        name='U')

    # transform image
    B = const_matrix(B_data, 'B')
    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
    V = tvm.compute((alpha, alpha, P // VP, C, VP),
                    lambda eps, nu, b, c, bb:
                    tvm.sum(input_tile[c][b][r_eps][r_nu][bb].astype(out_dtype) *
                            B[r_eps][eps] * B[r_nu][nu],
                            axis=[r_eps, r_nu]),
                    name='V')

    # batch gemm
    c = tvm.reduce_axis((0, C), name='c')
    M = tvm.compute((alpha, alpha, K, P),
                    lambda eps, nu, k, b:
                    tvm.sum(U[eps][nu][k // VK][c][k % VK] *
                            V[eps][nu][b // VP][c][b % VP],
                            axis=c),
                    name='M')

    # inverse transform
    A = const_matrix(A_data, 'A')
    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
    Y = tvm.compute((K, P, m, m),
                    lambda k, b, vh, vw:
                    tvm.sum(M[r_eps][r_nu][k][b] * A[r_eps][vh] * A[r_nu][vw],
                            axis=[r_eps, r_nu]),
                    name='Y')

    # unpack output
    output = tvm.compute((N, K, H, W),
                         lambda n, k, h, w:
                         Y[k][n * nH * nW + (h//m) * nW + w//m][h % m][w % m],
                         name='output', tag='winograd_conv2d_output')

    # we have to manually assign effective GFLOP for winograd
    cfg.add_flop(2 * N * K * H * W * KH * KW * C)
    return output
def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size):
    N, CI, IH, IW = get_const_tuple(data.shape)

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    if len(kernel.shape) == 4:
        if dilation_h != 1 or dilation_w != 1:
            kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))
        pre_computed = False
        CO, _, KH, KW = get_const_tuple(kernel.shape)
    else:
        assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
        pre_computed = True
        H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
        CO *= VC
        KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)

    assert layout == 'NCHW'
    assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1
    data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")

    idxd = tvm.indexdiv
    idxm = tvm.indexmod

    r = KW
    m = tile_size
    alpha = m + r - 1
    A, B, G = winograd_transform_matrices(m, r, out_dtype)

    K = CO
    C = CI

    H = (IH + 2 * HPAD - 3) // HSTR + 1
    W = (IW + 2 * WPAD - 3) // WSTR + 1
    nH, nW = (H + m - 1) // m, (W + m - 1) // m
    P = N * nH * nW

    cfg.define_split('tile_p', cfg.axis(P), num_outputs=2, filter=lambda x: x.size[-1] <= 16)
    cfg.define_split('tile_k', cfg.axis(K), num_outputs=2, filter=lambda x: x.size[-1] <= 16)
    VP = cfg['tile_p'].size[-1]
    VK = cfg['tile_k'].size[-1]

    # pack input tile
    input_tile = tvm.compute((C, idxd(P, VP), alpha, alpha, VP),
                             lambda c, b, eps, nu, bb:
                             data_pad[idxd(b * VP + bb, nH * nW), c,
                                      idxm(idxd(b * VP + bb, nW), nH) * m + eps,
                                      idxm(b * VP + bb, nW) * m + nu],
                             name='d')

    # transform kernel
    if pre_computed:
        U = kernel
    else:
        r_kh = tvm.reduce_axis((0, KH), 'r_kh')
        r_kw = tvm.reduce_axis((0, KW), 'r_kw')
        U = tvm.compute((alpha, alpha, idxd(K, VK), C, VK),
                        lambda eps, nu, k, c, kk:
                        tvm.sum(kernel[k * VK + kk][c][r_kh][r_kw].astype(out_dtype) *
                                G[eps][r_kh] * G[nu][r_kw],
                                axis=[r_kh, r_kw]),
                        name='U')

    # transform image
    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
    V = tvm.compute((alpha, alpha, idxd(P, VP), C, VP),
                    lambda eps, nu, b, c, bb:
                    tvm.sum(input_tile[c][b][r_eps][r_nu][bb].astype(out_dtype) *
                            B[r_eps][eps] * B[r_nu][nu],
                            axis=[r_eps, r_nu]),
                    name='V')

    # batch gemm
    c = tvm.reduce_axis((0, C), name='c')
    M = tvm.compute((alpha, alpha, K, P),
                    lambda eps, nu, k, b:
                    tvm.sum(U[eps][nu][idxd(k, VK)][c][idxm(k, VK)] *
                            V[eps][nu][idxd(b, VP)][c][idxm(b, VP)],
                            axis=c),
                    name='M')

    # inverse transform
    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
    Y = tvm.compute((K, P, m, m),
                    lambda k, b, vh, vw:
                    tvm.sum(M[r_eps][r_nu][k][b] * A[r_eps][vh] * A[r_nu][vw],
                            axis=[r_eps, r_nu]),
                    name='Y')

    # unpack output
    output = tvm.compute((N, K, H, W),
                         lambda n, k, h, w:
                         Y[k][n * nH * nW + idxd(h, m) * nW + idxd(w, m),
                              idxm(h, m), idxm(w, m)],
                         name='output', tag='winograd_conv2d_output')

    # we have to manually assign effective GFLOP for winograd
    cfg.add_flop(2 * N * K * H * W * KH * KW * C)
    return output
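# For reference, both _decl_winograd variants above implement the standard
# Winograd identity for F(m, r) convolution: each output tile is
#     Y = A^T [ (G g G^T) (*) (B^T d B) ] A
# where g is the r x r kernel tile, d the alpha x alpha input tile
# (alpha = m + r - 1), (*) elementwise multiplication, and G, B, A the kernel,
# input and inverse transforms (the U, V, M and Y stages in the code).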
def fused_convs(input_data, filters, resnet_block=False):

    out_dtype = input_data.dtype

    Input = None
    nodes = [input_data]
    params = [input_data]

    for f in filters:
        Input = nodes[-1]
        Filter = f.placeholder
        layout = f.layout
        depthwise = f.depthwise
        kernel = f.kernel
        stride = f.stride
        padding = f.padding
        dilation = f.dilation

        assert not (depthwise and kernel == 1)  # don't consider 1x1 depthwise

        padded_count = 0
        conv_count = 0
        depthwise_count = 0

        if isinstance(stride, int):
            stride_h = stride_w = stride
        else:
            stride_h, stride_w = stride

        if isinstance(dilation, int):
            dilation_h = dilation_w = dilation
        else:
            dilation_h, dilation_w = dilation

        batch, in_height, in_width, in_channel = Input.shape
        if f.NHWC_transpose:  # HWOI
            kernel_h, kernel_w, tmp, kernel_channel = Filter.shape
        else:  # HWIO
            kernel_h, kernel_w, kernel_channel, tmp = Filter.shape
        if depthwise:
            channel_multiplier = tmp
        else:
            num_filter = tmp

        # compute the output shape
        dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
        dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
        pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
            padding, (dilated_kernel_h, dilated_kernel_w))

        out_channel = simplify(in_channel * channel_multiplier) if depthwise else num_filter
        out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
        out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

        if f.kernel > 1:
            print("Padding is needed!")

            pad_before = [0, pad_top, pad_left, 0]
            pad_after = [0, pad_down, pad_right, 0]

            PaddedInput = pad(Input, pad_before, pad_after,
                              name="PaddedInput_{}".format(padded_count))
            padded_count += 1
            nodes.append(PaddedInput)

            # update Input
            Input = PaddedInput
            batch, in_height, in_width, in_channel = Input.shape

        if not depthwise:
            rc = tvm.reduce_axis((0, in_channel), name='rc')
        if kernel > 1:
            ry = tvm.reduce_axis((0, kernel_h), name='ry')
            rx = tvm.reduce_axis((0, kernel_w), name='rx')

        if not depthwise:  # normal convolution
            if kernel > 1:
                Output = tvm.compute(
                    (batch, out_height, out_width, out_channel),
                    lambda nn, yy, xx, ff: tvm.sum(
                        Input[nn, yy * stride_h + ry * dilation_h,
                              xx * stride_w + rx * dilation_w, rc].astype(out_dtype) *
                        (Filter[ry, rx, ff, rc] if f.NHWC_transpose
                         else Filter[ry, rx, rc, ff]).astype(out_dtype),
                        axis=[ry, rx, rc]),
                    name="Conv2dOutput_{}".format(conv_count), tag="conv2d_nhwc")
            else:  # 1x1 kernel: only reduce the rc axis
                Output = tvm.compute(
                    (batch, out_height, out_width, out_channel),
                    lambda nn, yy, xx, ff: tvm.sum(
                        Input[nn, yy * stride_h, xx * stride_w, rc].astype(out_dtype) *
                        (Filter[0, 0, ff, rc] if f.NHWC_transpose
                         else Filter[0, 0, rc, ff]).astype(out_dtype),
                        axis=[rc]),
                    name="Conv2dOutput_{}".format(conv_count), tag="conv2d_nhwc")
            conv_count += 1
        else:  # depthwise convolution (kernel > 1)
            Output = tvm.compute(
                (batch, out_height, out_width, out_channel),
                lambda b, i, j, c: tvm.sum(
                    (Input[b, i*stride_h + ry*dilation_h, j*stride_w + rx*dilation_w,
                           tvm.indexdiv(c, channel_multiplier)].astype(out_dtype) *
                     (Filter[ry, rx, tvm.indexmod(c, channel_multiplier),
                             tvm.indexdiv(c, channel_multiplier)]
                      if f.NHWC_transpose else
                      Filter[ry, rx, tvm.indexdiv(c, channel_multiplier),
                             tvm.indexmod(c, channel_multiplier)]).astype(out_dtype)),
                    axis=[ry, rx]),
                name='DepthwiseConv2dOutput_{}'.format(depthwise_count), tag="depthwise_nhwc")
            depthwise_count += 1

        nodes.append(Output)
        params.append(Filter)

    if resnet_block:
        First = nodes[0]
        Last = nodes[-1]
        assert(First.shape == Last.shape)  # was first/last, which are undefined

        # elementwise residual add: there is no reduction axis, so tvm.sum
        # (which the original wrapped around this expression) is not needed
        Output = tvm.compute(
            (batch, out_height, out_width, out_channel),
            lambda b, i, j, c:
                First[b, i, j, c].astype(out_dtype) + Last[b, i, j, c].astype(out_dtype),
            name='ElementwiseAddOutput_{}'.format(depthwise_count), tag="elem_nhwc")
        nodes.append(Output)
        params.append(nodes[-1])

    # final output
    return nodes, params
def _spatial_conv_all(wkl, sch, data, kernel, out_dtype):
    H, W = wkl.height, wkl.width
    CI, CO = wkl.in_filter, wkl.out_filter
    KH, KW = wkl.hkernel, wkl.wkernel
    HPAD, WPAD = wkl.hpad, wkl.wpad
    HSTR, WSTR = wkl.hstride, wkl.wstride
    HCAT, WCAT = KH - 1, KW - 1

    VH = sch.vh
    VW = sch.vw
    VC = sch.vc
    UNROLL = sch.unroll

    TH = H + 2 * HPAD
    TW = W + 2 * WPAD
    OH = (H + 2 * HPAD - KH) // HSTR + 1
    OW = (W + 2 * WPAD - KW) // WSTR + 1

    dshape = (1, CI, H, W)
    dpshape = (1, CI, TH, TW)
    dvshape = (1, TH // (VH * HSTR), TW // (VW * WSTR), CI, VH * HSTR + HCAT, VW * WSTR + WCAT)

    DOPAD = (HPAD != 0 and WPAD != 0)
    if DOPAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data

    data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw: \
        data_pad[n][ci][h * VH * HSTR + vh][w * VW * WSTR + vw], name='data_vec')

    kshape = (CO, CI, KH, KW)
    kvshape = (CO // VC, CI, KH, KW, VC)

    kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, vc: \
        kernel[co * VC + vc][ci][dh][dw], name='kernel_vec')

    ci = tvm.reduce_axis((0, CI), name='ci')
    dh = tvm.reduce_axis((0, KH), name='dh')
    dw = tvm.reduce_axis((0, KW), name='dw')

    ovshape = (1, CO // VC, OH // VH, OW // VW, VH, VW, VC)
    oshape = (1, CO, OH, OW)

    conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
        tvm.sum(data_vec[n, h, w, ci, vh * HSTR + dh, vw * WSTR + dw].astype(out_dtype) *
                kernel_vec[co, ci, dh, dw, vc].astype(out_dtype),
                axis=[ci, dh, dw]), name='conv')

    output = tvm.compute(oshape, lambda n, co, h, w:
                         conv[n][co // VC][h // VH][w // VW][h % VH][w % VW][co % VC],
                         name='output_unpack', tag='spatial_conv_output')

    s = tvm.create_schedule(conv.op)
    traverse(s, conv.op)

    # schedule for data_vec
    A0, A1 = data_pad, data_vec
    if DOPAD:
        s[A0].compute_inline()
    _, h, _, _, _, _ = s[A1].op.axis
    if sch.ba == 1:
        oaxis = h
        paxis = h
    else:
        oh, ih = s[A1].split(h, sch.ba)
        oaxis = oh
        paxis = ih
    s[A1].parallel(paxis)
    s[A1].pragma(oaxis, "parallel_launch_point")
    s[A1].pragma(paxis, "parallel_stride_pattern")
    s[A1].pragma(oaxis, "parallel_barrier_when_finish")

    # schedule for kernel_vec
    B, B0 = kernel, kernel_vec
    co, _, _, _, _ = s[B0].op.axis
    if sch.bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[B0].split(co, sch.bc)
        oaxis = oco
        paxis = ico
    s[B0].parallel(paxis)
    s[B0].pragma(oaxis, "parallel_launch_point")
    s[B0].pragma(paxis, "parallel_stride_pattern")
    s[B0].pragma(oaxis, "parallel_barrier_when_finish")

    # schedule for conv & unpack
    C0, C = conv, output
    s = tvm.create_schedule(C.op)
    traverse(s, C.op)
    CC = s.cache_write(C0, "global")
    _, co, oh, ow, vh, vw, vc = s[C0].op.axis
    if UNROLL:
        s[C0].unroll(vw)
    s[C0].vectorize(vc)

    s[CC].compute_at(s[C0], ow)
    _, co, oh, ow, vh, vw, vc = s[CC].op.axis
    ci, dh, dw = s[CC].op.reduce_axis
    s[CC].reorder(ci, dh, vh, dw, vw, vc)
    if UNROLL:
        s[CC].unroll(vw)
    s[CC].vectorize(vc)

    n, co, h, w = s[C].op.axis
    co, vc = s[C].split(co, VC)
    oh, ow, vh, vw = s[C].tile(h, w, VH, VW)
    s[C].reorder(n, co, oh, ow, vh, vw, vc)
    # if C != C1:
    #     s[C1].compute_inline()
    s[C0].compute_at(s[C], ow)

    if sch.bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[C].split(co, sch.bc)
        oaxis = oco
        paxis = ico

    s[C].parallel(paxis)
    s[C].pragma(oaxis, "parallel_launch_point")
    s[C].pragma(paxis, "parallel_stride_pattern")
    s[C].pragma(oaxis, "parallel_barrier_when_finish")

    return C, s
def _compile_function(dtype: str, device: str, b0: int = 4, b1: int = 4, b2: int = 16):
    '''Compiles a tvm function that computes diagonal_mm

    args:
        dtype: str in ['float64', 'float32', 'float16']
        device: str in ['cpu', 'cuda']
        b0, b1, b2: size of tensor tiles. Very important for good performance
    '''
    import tvm  # import the full tvm library here for compilation. Don't import at the top of the file in case we don't need to compile
    from tvm.contrib import nvcc

    @tvm.register_func
    def tvm_callback_cuda_compile(code):
        """Use nvcc compiler for better perf."""
        ptx = nvcc.compile_cuda(code, target="ptx", arch='sm_52')  # use old arch for this to work on old GPUs
        return ptx

    assert dtype in ['float16', 'float32', 'float64']
    assert device in ['cpu', 'cuda']
    device = None if device == 'cpu' else device
    tgt_host = "llvm"

    b = tvm.var('b')  # batch size
    n = tvm.var('n')  # sequence length
    h = tvm.var('h')  # number of heads
    m = tvm.var('m')  # hidden dimension
    w = tvm.var('w')  # window size
    w_upper = tvm.var('w_upper')  # window size to the right of the word. Should be `0` or `w`
    padding = tvm.var('padding')  # padding
    transpose_t1 = tvm.var('transpose_t1')  # t1 should be transposed
    t1d3 = tvm.var('t1d3')  # last dimension of t1
    t3d3 = tvm.var('t3d3')  # last dimension of t3 (the result tensor)
    X = tvm.placeholder((b, n, h, t1d3), name='X', dtype=dtype)  # first tensor
    Y = tvm.placeholder((b, n, h, m), name='Y', dtype=dtype)  # second tensor
    k = tvm.reduce_axis((0, t1d3), name='k')  # dimension to sum over
    D = tvm.placeholder((h), name='D', dtype='int')  # dilation per head
    output_shape = (b, n, h, t3d3)  # shape of the result tensor

    algorithm = lambda l, i, q, j: tvm.sum(
        tvm.if_then_else(
            t3d3 == m,  # if output dimension == m, then t1 is diagonaled (FIXME: This breaks if t3d3 == m == t1d3)
            tvm.if_then_else(
                transpose_t1 == 0,
                tvm.if_then_else(
                    tvm.all(
                        i + D[q] * (k - w) >= 0,
                        i + D[q] * (k - w) < n,
                    ),
                    X[l, i, q, k] * Y[l, i + D[q] * (k - w), q, j],  # t1 is diagonaled
                    padding),
                tvm.if_then_else(
                    tvm.all(
                        i + D[q] * (k - w_upper) >= 0,  # `w_upper` to handle the case `autoregressive=True`
                        i + D[q] * (k - w_upper) < n,
                    ),
                    X[l, i + D[q] * (k - w_upper), q, (w_upper + w) - k] *
                    Y[l, i + D[q] * (k - w_upper), q, j],  # t1 is diagonaled and should be transposed
                    padding),
            ),
            tvm.if_then_else(
                tvm.all(
                    i + D[q] * (j - w) >= 0,
                    i + D[q] * (j - w) < n,
                ),
                X[l, i, q, k] * Y[l, i + D[q] * (j - w), q, k],  # t1 is not diagonaled, but the output tensor is going to be
                padding)),
        axis=k)

    Z = tvm.compute(output_shape, algorithm, name='Z')  # automatically generate cuda code
    s = tvm.create_schedule(Z.op)

    print('Lowering: \n ===================== \n{}'.format(
        tvm.lower(s, [X, Y, D], simple_mode=True)))

    # split long axis into smaller chunks and assign each one to a separate GPU thread/block
    ko, ki = s[Z].split(Z.op.reduce_axis[0], factor=b0)
    ZF = s.rfactor(Z, ki)

    j_outer, j_inner = s[Z].split(s[Z].op.axis[-1], factor=b1)
    i_outer, i_inner = s[Z].split(s[Z].op.axis[1], factor=b2)

    s[Z].bind(j_outer, tvm.thread_axis("blockIdx.x"))
    s[Z].bind(j_inner, tvm.thread_axis("threadIdx.y"))

    s[Z].bind(i_outer, tvm.thread_axis("blockIdx.y"))
    s[Z].bind(i_inner, tvm.thread_axis("threadIdx.z"))

    tx = tvm.thread_axis("threadIdx.x")
    s[Z].bind(s[Z].op.reduce_axis[0], tx)
    s[ZF].compute_at(s[Z], s[Z].op.reduce_axis[0])
    s[Z].set_store_predicate(tx.var.equal(0))

    print('Lowering with GPU splits: \n ===================== \n{}'.format(
        tvm.lower(s, [X, Y, D], simple_mode=True)))

    # compiling the automatically generated cuda code
    diagonaled_mm = tvm.build(s, [X, Y, Z, D, w, w_upper, padding, transpose_t1, t3d3],
                              target=device, target_host=tgt_host, name='diagonaled_mm')
    return diagonaled_mm
def test_gemm_gpu(N, times, bn, num_block, num_thread):
    assert bn <= N
    assert num_thread * num_thread * 16 <= N
    assert num_block * num_block * 2 <= N
    A = tvm.placeholder((N, N), name='A')
    B = tvm.placeholder((N, N), name='Btmp')
    k = tvm.reduce_axis((0, N), name='k')

    # integer division (//): N / bn would produce a float shape in Python 3
    packedB = tvm.compute((N, N // bn, bn),
                          lambda x, y, z: B[x, y * bn + z], name='B')

    C = tvm.compute(
        (N, N),
        lambda ii, jj: tvm.sum(A[ii, k] * packedB[k, jj // bn, jj % bn], axis=k),
        name='C')

    s = tvm.create_schedule(C.op)
    CC = s.cache_write(C, "local")

    block_x = tvm.thread_axis("blockIdx.x")
    block_y = tvm.thread_axis("blockIdx.y")
    thread_x = tvm.thread_axis("threadIdx.x")
    thread_y = tvm.thread_axis("threadIdx.y")

    thread_xz = tvm.thread_axis((0, 2), "vthread", name="vx")
    thread_yz = tvm.thread_axis((0, 2), "vthread", name="vy")

    pby, pbi = s[packedB].split(packedB.op.axis[0], nparts=num_thread)
    pbx, pbj = s[packedB].split(packedB.op.axis[1], nparts=num_thread)
    s[packedB].bind(pby, thread_y)
    s[packedB].bind(pbx, thread_x)
    pbz, pbk = s[packedB].split(packedB.op.axis[2], factor=8)
    s[packedB].vectorize(pbk)

    by, yi = s[C].split(C.op.axis[0], nparts=num_block)
    bx, xi = s[C].split(C.op.axis[1], nparts=num_thread)

    s[C].bind(by, block_y)
    s[C].bind(bx, thread_y)
    s[C].reorder(by, bx, yi, xi)

    tyz, yi = s[C].split(yi, nparts=2)
    ty, yi = s[C].split(yi, nparts=num_block)
    txz, xi = s[C].split(xi, nparts=2)
    tx, xi = s[C].split(xi, nparts=num_thread)

    s[C].reorder(tyz, txz, ty, tx, yi, xi)
    s[C].bind(tyz, thread_yz)
    s[C].bind(txz, thread_xz)

    s[C].bind(ty, block_x)
    s[C].bind(tx, thread_x)

    xyi, xxi = s[C].split(xi, factor=8)
    s[C].reorder(tyz, txz, ty, tx, yi, xyi, xxi)
    s[C].vectorize(xxi)

    s[CC].compute_at(s[C], yi)
    yo, xo = CC.op.axis
    s[CC].reorder(k, yo, xo)
    xo, xi = s[CC].split(xo, factor=8)
    s[CC].vectorize(xi)

    ko, ki = s[CC].split(k, factor=2)
    s[CC].unroll(ki)

    print(tvm.lower(s, [A, B, C], simple_mode=True))

    f = tvm.build(s, [A, B, C], "opencl", target_host=target, name="gemm_gpu")
    temp = util.tempdir()
    path_dso = temp.relpath("gemm_gpu.so")
    f.export_library(path_dso, ndk.create_shared)

    # connect to the proxy
    remote = rpc.connect(proxy_host, proxy_port, key=key)
    ctx = remote.cl(0)
    remote.upload(path_dso)
    f = remote.load_module("gemm_gpu.so")

    evaluate(f, ctx, N, times)
def test_rpc_remote_module():
    if not tvm.module.enabled("rpc"):
        return
    server = rpc.Server("localhost")
    client = rpc.connect(server.host, server.port)
    # graph
    n = tvm.convert(1024)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    s = tvm.create_schedule(B.op)

    def check_remote(remote):
        if not tvm.module.enabled("llvm"):
            print("Skip because llvm is not enabled")
            return
        temp = util.tempdir()
        ctx = remote.cpu(0)
        f = tvm.build(s, [A, B], "llvm", name="myadd")
        path_dso = temp.relpath("dev_lib.so")
        f.export_library(path_dso)
        remote.upload(path_dso)
        f1 = remote.load_module("dev_lib.so")
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        time_f = f1.time_evaluator(f1.entry_name, remote.cpu(0), number=10)
        cost = time_f(a, b).mean
        print('%g secs/op' % cost)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    def check_remote_link_cl(remote):
        """Test function to run remote code such as cl

        This is not enabled because there is a forking issue
        of TVM runtime when the server launches after the OpenCL
        runtime initializes. We leave it as an example
        of how to do rpc when we want to do linking on remote.
        """
        if not tvm.module.enabled("llvm"):
            print("Skip because llvm is not enabled")
            return
        if not tvm.module.enabled("opencl"):
            print("Skip because opencl is not enabled")
            return
        temp = util.tempdir()
        ctx = remote.cl(0)
        s = tvm.create_schedule(B.op)
        xo, xi = s[B].split(B.op.axis[0], factor=32)
        s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
        s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
        f = tvm.build(s, [A, B], "opencl", target_host="llvm", name="myadd")
        # Option 1: save modules separately and rely on remote compiler
        path_o = temp.relpath("myadd.o")
        path_cl = temp.relpath("myadd.cl")
        path_json = temp.relpath("myadd.tvm_meta.json")
        f.save(path_o)
        f.imported_modules[0].save(path_cl)
        remote.upload(path_o)
        remote.upload(path_cl)
        # upload meta data
        remote.upload(path_json)
        fhost = remote.load_module("myadd.o")
        fdev = remote.load_module("myadd.cl")
        fhost.import_module(fdev)
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        fhost(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
        # Option 2: export library as a tar ball then handled by remote compiler
        path_tar = temp.relpath("myadd.tar")
        f.export_library(path_tar)
        remote.upload(path_tar)
        fhost = remote.load_module("myadd.tar")
        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
        fhost(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    check_remote(client)
    check_remote(rpc.LocalSession())
def test_tensorize_tensor_compute_op():
    # an intrinsic called "multivadd" whose definition (pattern)
    # is a loop of another intrinsic called "vadd"
    def intrin_multivadd(n):
        n_a = tvm.var("n_a")
        Ab = tvm.decl_buffer((n,), tvm.float32, strides=[n_a])
        n_b = tvm.var("n_b")
        Bb = tvm.decl_buffer((n,), tvm.float32, strides=[n_b])
        n_c = tvm.var("n_c")
        Cb = tvm.decl_buffer((n,), tvm.float32, strides=[n_c])
        z = tvm.compute((n,), lambda i: tvm.call_extern("float32", 'vadd',
                                                        Ab.access_ptr("w", offset=n_a*i),
                                                        Bb.access_ptr("r", offset=n_b*i),
                                                        Cb.access_ptr("r", offset=n_c*i)))

        # replace the pattern with the multivadd call. I need to figure out
        # how to pass it the right parameters.
        def intrin_func(ins, outs):
            return tvm.call_packed("multivadd")

        with tvm.build_config():
            return tvm.decl_tensor_intrin(z.op, intrin_func, name="multivadd")

    def intrin_vadd(n):
        dtype = 'float32'
        x = tvm.placeholder((n,), dtype=dtype, name='vx')
        y = tvm.placeholder((n,), dtype=dtype, name='vy')
        z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z')
        s = tvm.create_schedule(z.op)

        def create_buffer(t):
            return tvm.decl_buffer(t.shape, t.dtype,
                                   name='W'+t.name, offset_factor=16)

        def intrin_func(ins, outs):
            ib = tvm.ir_builder.create()
            ib.emit(tvm.call_extern("float32", 'vadd',
                                    ins[0].access_ptr("r"), ins[1].access_ptr('r'),
                                    outs[0].access_ptr('wr')))
            return ib.get()

        with tvm.build_config(offset_factor=16):
            return tvm.decl_tensor_intrin(z.op, intrin_func,
                                          binds={x: create_buffer(x),
                                                 y: create_buffer(y),
                                                 z: create_buffer(z)})

    # cache_read, cache_write
    M = 1024
    factor = 16
    dtype = 'float32'

    A = tvm.placeholder((M // factor, factor), name="A", dtype=dtype)
    B = tvm.placeholder((M // factor, factor), name="B", dtype=dtype)

    vadd = intrin_vadd(factor)
    C = tvm.compute((M // factor, factor),
                    lambda i: vadd(A[i, 0:factor], B[i, 0:factor]), name='C')

    s = tvm.create_schedule(C.op)
    multivadd = intrin_multivadd(64)
    s[C].tensorize(C.op.axis[0], multivadd)
    s = s.normalize()
    dom_map = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, dom_map)
    # The loop that we tried to tensorize still exists in the code.
    # That means tensorize didn't work as expected.
    assert isinstance(stmt.body.body.body, tvm.stmt.For)
    assert stmt.body.body.body.loop_var.name == C.op.axis[0].var.name
def test_add_pipeline(): n = tvm.var('n') A = tvm.placeholder((n, ), name='A') B = tvm.placeholder((n, ), name='B') C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') D = tvm.compute(A.shape, lambda *i: C(*i) + 1, name='D') s = tvm.create_schedule(D.op) # GPU schedules have to be split across blockIdx and threadIdx num_thread = 256 xo, xi = s[C].split(C.op.axis[0], factor=num_thread) s[C].bind(xo, tvm.thread_axis("threadIdx.x")) s[C].bind(xi, tvm.thread_axis("blockIdx.x")) xo, xi = s[D].split(D.op.axis[0], factor=num_thread) s[D].bind(xo, tvm.thread_axis("threadIdx.x")) s[D].bind(xi, tvm.thread_axis("blockIdx.x")) # compile to IR s = s.normalize() bounds = tvm.schedule.InferBound(s) stmt = tvm.schedule.ScheduleOps(s, bounds) Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') Bb = tvm.decl_buffer(B.shape, B.dtype, name='B') Cb = tvm.decl_buffer(C.shape, C.dtype, name='C') stmt = tvm.ir_pass.LoopPartition(stmt) stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb, C: Cb}, 64) stmt = tvm.ir_pass.Simplify(stmt) fapi = tvm.ir_pass.MakeAPI(stmt, "myadd", [Ab, Bb, Cb], 0, True) fsplits = [x for x in tvm.ir_pass.SplitHostDevice(fapi)] fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0]) def check_target(device, host="stackvm"): if not tvm.module.enabled(host): return if not tvm.module.enabled(device): return ctx = tvm.context(device, 0) mhost = tvm.codegen.build_module(fsplits[0], host) mdev = tvm.codegen.build_module(fsplits[1:], device) mhost.import_module(mdev) code = mdev.get_source() f = mhost.entry_func # launch the kernel. n = 1027 a = tvm.nd.array(np.random.uniform(size=n).astype(Ab.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=n).astype(Bb.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=Cb.dtype), ctx) f(a, b, c) np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) def check_module_save(device, host="stackvm"): if not tvm.module.enabled(host): return if not tvm.module.enabled(device): return ctx = tvm.context(device, 0) fmt = "ptx" if device == "cuda" else "cl" mhost = tvm.codegen.build_module(fsplits[0], host) mdev = tvm.codegen.build_module(fsplits[1:], device) temp = util.tempdir() mpath = temp.relpath("test.%s" % fmt) mdev.save(mpath) mdev2 = tvm.module.load(mpath) mhost.import_module(mdev2) f = mhost.entry_func # launch the kernel. n = 1027 a = tvm.nd.array(np.random.uniform(size=n).astype(Ab.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=n).astype(Bb.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=Cb.dtype), ctx) f(a, b, c) np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) check_target("cuda", host="stackvm") check_target("cuda", host="llvm") check_module_save("cuda", host="stackvm") check_target("nvptx", host="llvm")
def conv1d_ncw(data, kernel, strides=1, padding='VALID', dilation=1, out_dtype=None): """ 1D convolution forward operator for NCW layout. Parameters ---------- data : tvm.Tensor 3-D with shape [batch, in_channel, in_width] kernel : tvm.Tensor 3-D with shape [num_filter, in_channel, filter_size] strides : int or tuple The spatial stride along width padding : int, tuple, or str Padding size can be an integer for equal padding, a tuple of (left, right) or a string in ['VALID', 'SAME']. dilation : int or tuple Dilation rate if convolution should be dilated. out_dtype : str The output data type. If None, the output is the same type as the input. Returns ------- output : tvm.Tensor 3-D with shape [batch, out_channels, out_width] """ if out_dtype is None: out_dtype = data.dtype if isinstance(strides, (tuple, list)): strides = strides[0] if isinstance(dilation, (tuple, list)): dilation = dilation[0] batch, in_channels, data_width = data.shape out_channels, _, kernel_size = kernel.shape # Compute the output shape dilated_kernel_size = (kernel_size - 1) * dilation + 1 pad_left, pad_right = get_pad_tuple1d(padding, (dilated_kernel_size, )) out_channels = simplify(out_channels) out_width = simplify( (data_width - dilated_kernel_size + pad_left + pad_right) // strides + 1) # Apply padding pad_before = [0, 0, pad_left] pad_after = [0, 0, pad_right] temp = pad(data, pad_before, pad_after, name='pad_temp') # Compute graph rc = tvm.reduce_axis((0, in_channels), name='rc') rw = tvm.reduce_axis((0, kernel_size), name='rw') return tvm.compute( (batch, out_channels, out_width), lambda b, c, w: tvm.sum( temp[b, rc, w * strides + rw * dilation].astype(out_dtype) * kernel[c, rc, rw].astype(out_dtype), axis=[rc, rw]), tag="conv1d_ncw")
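# A hypothetical usage sketch (names and shapes are illustrative, not from
# the original file); it assumes conv1d_ncw and the topi helpers it calls
# (pad, get_pad_tuple1d, simplify) are in scope in this module.
import tvm

data = tvm.placeholder((1, 16, 128), name='data')     # [batch, in_channel, in_width]
kernel = tvm.placeholder((32, 16, 3), name='kernel')  # [num_filter, in_channel, filter_size]
out = conv1d_ncw(data, kernel, strides=1, padding='SAME', dilation=1)
s = tvm.create_schedule(out.op)
# Inspect the generated loop nest before applying a real schedule.
print(tvm.lower(s, [data, kernel, out], simple_mode=True))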
def check_device(device, target_device): if not tvm.runtime.enabled(target_device): print("Skip test because {} is not enabled.".format(target_device)) return device_ctx = tvm.context(device) graph = get_duplex_graph(host_ctx.device_type, device_ctx.device_type) shape = (4, ) # Insert copy nodes for data transferring between add and sub nodes. # Transfers data from gpu to cpu. copy_add_sub = tvm.placeholder(shape, name="__copy0") # Transfers data from cpu to gpu. copy_sub_add = tvm.placeholder(shape, name="__copy1") # Create a module containing adds on the device. tensor_a = tvm.placeholder(shape, name="A") tensor_b = tvm.placeholder(shape, name="B") tensor_d = tvm.placeholder(shape, name="D") elemwise_add0 = tvm.compute(shape, lambda *i: tensor_a(*i) + tensor_b(*i), name="elemwise_add0") elemwise_add1 = tvm.compute(shape, lambda *i: copy_sub_add(*i) + tensor_d(*i), name="elemwise_add1") target = topi.cpp.TEST_create_target(device) add_schedule0 = topi.cpp.cuda.schedule_injective( target, [elemwise_add0]) lower_add0 = tvm.lower(add_schedule0, [tensor_a, tensor_b, elemwise_add0], name="elemwise_add0") add_schedule1 = topi.cpp.cuda.schedule_injective( target, [elemwise_add1]) lower_add1 = tvm.lower(add_schedule1, [tensor_d, copy_sub_add, elemwise_add1], name="elemwise_add1") # Create module for sub whose target is the host. tensor_c = tvm.placeholder(shape, name="C") elemwise_sub = tvm.compute(shape, lambda *i: copy_add_sub(*i) - tensor_c(*i), name="elemwise_sub") sub_schedule = tvm.create_schedule(elemwise_sub.op) lower_sub = tvm.lower(sub_schedule, [copy_add_sub, tensor_c, elemwise_sub], name="elemwise_sub") target_flist = { target_device: [lower_add0, lower_add1], target_host: [lower_sub] } mhost = tvm.build(target_flist, target_host=target_host) ctx = [host_ctx, device_ctx] params = {} params["A"] = tensor_a = np.random.uniform(size=shape).astype( tensor_a.dtype) params["B"] = tensor_b = np.random.uniform(size=shape).astype( tensor_b.dtype) params["C"] = tensor_c = np.random.uniform(size=shape).astype( tensor_c.dtype) params["D"] = tensor_d = np.random.uniform(size=shape).astype( tensor_d.dtype) def check_verify(): mod = graph_runtime.create(graph, mhost, ctx) mod.set_input(**params) mod.run() out = mod.get_output(0, tvm.nd.empty(shape)) np.testing.assert_equal(out.asnumpy(), tensor_a + tensor_b - tensor_c + tensor_d) def check_load_module(): temp = util.tempdir() path_lib = temp.relpath("deploy.so") mhost.export_library(path_lib) with open(temp.relpath("deploy.json"), "w") as out_file: out_file.write(graph) loaded_lib = tvm.runtime.load_module(path_lib) loaded_graph = open(temp.relpath("deploy.json")).read() mod = graph_runtime.create(loaded_graph, loaded_lib, ctx) mod.set_input(**params) mod.run() out = mod.get_output(0, tvm.nd.empty(shape)) np.testing.assert_equal(out.asnumpy(), tensor_a + tensor_b - tensor_c + tensor_d) check_verify() check_load_module()
def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float16'): batch, in_channel, in_height, in_width = [util.get_const_int(x) for x in data.shape] num_filter, channel, kernel_h, kernel_w = [util.get_const_int(x) for x in kernel.shape] pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, kernel) if isinstance(stride, (tuple, list)): stride_h, stride_w = stride else: stride_h, stride_w = stride, stride out_channel = num_filter out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) oshape = (batch, out_channel, out_height, out_width) rc = tvm.reduce_axis((0, in_channel), name='rc') ry = tvm.reduce_axis((0, kernel_h), name='ry') rx = tvm.reduce_axis((0, kernel_w), name='rx') block_w = 1 block_h = 1 if stride_h == 2: if num_filter + kernel_h == 515: block_h = 4 block_w = 4 else: block_h = 4 block_w = 5 elif kernel_h == 3: if num_filter == 512: block_h = 2 block_w = 7 else: block_h = 2 block_w = 14 elif kernel_h == 7 and padding == 3 and stride == 1: block_h = 3 block_w = 4 else: block_h = 1 block_w = 16 attrs = {'block_h': block_h, 'block_w' : block_w} c_h = out_height c_w = out_width if not out_width % block_w == 0: c_w = (out_width // block_w + 1) * block_w if not out_height % block_h == 0: c_h = (out_height // block_h + 1) * block_h pad_before = [0, 0, pad_top, pad_left] pad_after = [0, 0, pad_down + c_h - block_h, pad_right + c_w - block_w] temp = pad(data, pad_before, pad_after, name="pad_temp") nv = 16 if not num_filter % nv == 0: num_filter = (num_filter // nv + 1) * nv out_channel = num_filter cshape = (batch, out_channel // nv, c_h, c_w, nv) kvshape = (num_filter // nv, channel, kernel_h, kernel_w, nv) kernel_vec = tvm.compute( kvshape, lambda co, ci, kh, kw, vc: kernel[co*nv + vc][ci][kh][kw], name='kernel_vec') conv = tvm.compute( cshape, lambda nn, ff, yy, xx, vc:\ tvm.sum( temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype), axis=[rc, ry, rx]), name='conv', attrs=attrs) output = tvm.compute( oshape, lambda nn, ff, yy, xx: conv[nn][ff//nv][yy][xx][ff%nv], name='output_unpack', tag='conv2d') return output
def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile): assert layout == "NCHW", "Only support NCHW" out_dtype = out_dtype or data.dtype N, CI, IH, IW = get_const_tuple(data.shape) _, CO, KH, KW = get_const_tuple(kernel.shape) pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (KH, KW)) bpad_top, bpad_bottom = KH - 1 - pad_top, KH - 1 - pad_bottom bpad_left, bpad_right = KW - 1 - pad_left, KW - 1 - pad_right HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) OH = (IH - 1) * HSTR - pad_top - pad_bottom + KH OW = (IW - 1) * WSTR - pad_left - pad_right + KW dilated_input = dilate(data, [1, 1, HSTR, WSTR]) data_pad = pad(dilated_input, [0, 0, bpad_top, bpad_left], [0, 0, bpad_bottom, bpad_right]) # ==================== define configuration space ==================== n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW) ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW) if num_tile == 2: # for arm cpu co, vc = cfg.define_split('tile_co', co, num_outputs=2) oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2) ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2) elif num_tile == 3: # for mali gpu co, _, vc = cfg.define_split('tile_co', co, num_outputs=3) oh, _, vh = cfg.define_split('tile_oh', oh, num_outputs=3) ow, _, vw = cfg.define_split('tile_ow', ow, num_outputs=3) else: raise RuntimeError("Invalid num_tile") cfg.define_reorder("reorder_0", [n, co, oh, ow, ci, kh, kw, vh, vw, vc], policy='candidate', candidate=[[n, co, oh, ow, ci, kh, kw, vh, vw, vc], [n, co, oh, ow, ci, kh, kw, vc, vh, vw]]) cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll') cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec') # ==================================================================== VC = cfg["tile_co"].size[-1] VH = cfg["tile_oh"].size[-1] VW = cfg["tile_ow"].size[-1] dvshape = (N, OH // VH, OW // VW, CI, VH + KH - 1, VW + KW - 1) kvshape = (CO // VC, CI, KH, KW, VC) ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC) oshape = (N, CO, OH, OW) data_vec = tvm.compute( dvshape, lambda n, h, w, ci, vh, vw: data_pad[n][ci][h * VH + vh][w * VW + vw], name='data_vec') kernel_vec = tvm.compute( kvshape, lambda co, ci, kh, kw, vc: kernel[ci][co * VC + vc][kh][kw], name='kernel_vec_conv2d_transpose') ci = tvm.reduce_axis((0, CI), name='ci') kh = tvm.reduce_axis((0, KH), name='kh') kw = tvm.reduce_axis((0, KW), name='kw') conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \ tvm.sum(data_vec[n, h, w, ci, vh + kh, vw + kw].astype(out_dtype) * kernel_vec[co, ci, KH - 1 - kh, KW - 1 - kw, vc].astype(out_dtype), axis=[ci, kh, kw]), name='conv') output = tvm.compute(oshape, lambda n, co, h, w: conv[n][co // VC][h // VH][ w // VW][h % VH][w % VW][co % VC], name='output_unpack', tag='spatial_conv2d_transpose_output') return output
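# The compute above realizes transposed convolution as: dilate the input by
# the stride, pad with (K - 1 - pad) on each side, then run an ordinary
# convolution against the spatially flipped kernel (note the
# kernel_vec[co, ci, KH - 1 - kh, KW - 1 - kw] indexing). A 1-D numpy sketch
# of that identity, not part of the original file:
import numpy as np

def conv1d_transpose_ref(x, w, stride, pad):
    K = len(w)
    xd = np.zeros((len(x) - 1) * stride + 1, dtype=x.dtype)
    xd[::stride] = x                               # dilate: insert stride-1 zeros
    xp = np.pad(xd, K - 1 - pad, mode='constant')  # bpad = K - 1 - pad per side
    wf = w[::-1]                                   # flip the kernel
    # plain correlation; output length is (len(x) - 1) * stride - 2*pad + K
    return np.array([(xp[i:i + K] * wf).sum() for i in range(len(xp) - K + 1)])

# Tiny check: transpose conv of [1, 2, 3] with [1, 10, 100], stride 2, pad 0.
assert list(conv1d_transpose_ref(np.array([1., 2., 3.]),
                                 np.array([1., 10., 100.]), 2, 0)) == \
    [1, 10, 102, 20, 203, 30, 300]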
def _im2col_pack(wkl, sch, data, kernel, stride, padding, out_dtype): """ Compute convolution with im2col pack layout. """ assert data.shape[0].value == 1, "im2col pack convolution only supports batch size=1" N = 1 H, W = wkl.height, wkl.width CI = wkl.in_filter CO = wkl.out_filter KH, KW = wkl.hkernel, wkl.wkernel HPAD, WPAD = wkl.hpad, wkl.wpad HSTR, WSTR = wkl.hstride, wkl.wstride OH = (H + 2 * HPAD - KH) // HSTR + 1 OW = (W + 2 * WPAD - KW) // WSTR + 1 P = sch.vp Q = sch.vq UNROLL = sch.unroll dshape = (N, CI, H, W) dpshape = (N, CI, H + 2 * HPAD, W + 2 * WPAD) dcshape = (N, OH, OW, CI, KH, KW) dvshape = (N, OH * OW // P, CI, KH, KW, P) kshape = (CO, CI, KH, KW) kvshape = (CO // Q, CI, KH, KW, Q) ovshape = (N, CO // Q, OH * OW // P, P, Q) oshape = (N, CO, OH, OW) ############### declaration DO_PAD = (wkl.hpad != 0 or wkl.wpad != 0) if DO_PAD: data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") else: data_pad = data data_col = tvm.compute(dcshape, lambda n, oh, ow, ci, hk, wk: \ data_pad[n][ci][oh*HSTR+hk][ow*WSTR+wk], name='data_col') data_vec = tvm.compute(dvshape, lambda n, im, ci, hk, wk, vim: \ data_col[n][(im*P+vim)//OW][(im*P+vim)%OW][ci][hk][wk], name='data_vec') kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, vc: \ kernel[co*Q+vc][ci][dh][dw], name='kernel_vec') ci = tvm.reduce_axis((0, CI), name='ci') hk = tvm.reduce_axis((0, KH), name='hk') wk = tvm.reduce_axis((0, KW), name='wk') conv = tvm.compute(ovshape, lambda n, co, im, vim, vco: \ tvm.sum(data_vec[n][im][ci][hk][wk][vim].astype(out_dtype) * kernel_vec[co][ci][hk][wk][vco].astype(out_dtype), axis=[ci, hk, wk]), name='conv') output = tvm.compute(oshape, lambda n, co, h, w: \ conv[n][co//Q][(h*OW+w)//P][(h*OW+w)%P][co%Q], name='output_vec', tag='im2col_conv_output') return output
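# A minimal numpy sketch (not from the original file) of the im2col idea the
# packing above builds on: unfold every KHxKW receptive field into a column,
# after which convolution becomes a single matrix multiplication.
import numpy as np

def im2col_conv(data, kernel, stride=1):
    # data: (CI, H, W), kernel: (CO, CI, KH, KW)
    CI, H, W = data.shape
    CO, _, KH, KW = kernel.shape
    OH = (H - KH) // stride + 1
    OW = (W - KW) // stride + 1
    cols = np.empty((CI * KH * KW, OH * OW), dtype=data.dtype)
    for oh in range(OH):
        for ow in range(OW):
            patch = data[:, oh*stride:oh*stride+KH, ow*stride:ow*stride+KW]
            cols[:, oh * OW + ow] = patch.ravel()
    out = kernel.reshape(CO, -1) @ cols  # one GEMM replaces the conv loops
    return out.reshape(CO, OH, OW)

x = np.random.rand(2, 5, 5).astype('float32')
w = np.random.rand(3, 2, 3, 3).astype('float32')
ref = np.array([[[(x[:, i:i+3, j:j+3] * w[co]).sum() for j in range(3)]
                 for i in range(3)] for co in range(3)])
np.testing.assert_allclose(im2col_conv(x, w), ref, rtol=1e-5)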
def test_tensorize_matmul(): n = 1024 m = n l = n A = tvm.placeholder((n, l), name='A') B = tvm.placeholder((m, l), name='B') k = tvm.reduce_axis((0, l), name='k') C = tvm.compute((n, m), lambda i, j: tvm.sum(B[j, k] * A[i, k], axis=k), name='C') def check(factor): s = tvm.create_schedule(C.op) x, y = C.op.axis yo, yi = s[C].split(y, factor=factor) gemv = intrin_gemv(factor, l) s[C].tensorize(yi, gemv) s = s.normalize() dom_map = tvm.schedule.InferBound(s) finfer = tvm.get_global_func("test.op.InferTensorizeRegion") out_dom, in_dom = finfer(s[C], dom_map) assert tvm.ir_pass.Equal(out_dom[x].extent, 1) assert tvm.ir_pass.Equal(out_dom[y].extent, factor) assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor) fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") body = fmatch(s[C], out_dom, in_dom, gemv) assert tvm.ir_pass.Equal( tvm.ir_pass.CanonicalSimplify(body[0]), tvm.ir_pass.CanonicalSimplify(gemv.op.body[0])) stmt = tvm.schedule.ScheduleOps(s, dom_map) tvm.lower(s, [A, B, C]) def check_rfactor(factor, rfactor): s = tvm.create_schedule(C.op) x, y = C.op.axis rk = C.op.reduce_axis[0] yo, yi = s[C].split(y, factor=factor) ro, ri = s[C].split(rk, factor=rfactor) s[C].reorder(yo, ro, yi, ri) gemv = intrin_gemv(factor, rfactor) s[C].tensorize(yi, gemv) s = s.normalize() dom_map = tvm.schedule.InferBound(s) finfer = tvm.get_global_func("test.op.InferTensorizeRegion") out_dom, in_dom = finfer(s[C], dom_map) assert tvm.ir_pass.Equal(out_dom[x].extent, 1) assert tvm.ir_pass.Equal(out_dom[y].extent, factor) assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor) fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") body = fmatch(s[C], out_dom, in_dom, gemv) assert tvm.ir_pass.Equal( tvm.ir_pass.CanonicalSimplify(body[0]), tvm.ir_pass.CanonicalSimplify(gemv.op.body[0])) stmt = tvm.schedule.ScheduleOps(s, dom_map) tvm.lower(s, [A, B, C]) def check_rfactor_no_reset(factor, rfactor): s = tvm.create_schedule(C.op) x, y = C.op.axis rk = C.op.reduce_axis[0] yo, yi = s[C].split(y, factor=factor) ro, ri = s[C].split(rk, factor=rfactor) s[C].reorder(yo, ro, yi, ri) gemv = intrin_gemv_no_reset(factor, rfactor) s[C].tensorize(yi, gemv) s = s.normalize() dom_map = tvm.schedule.InferBound(s) finfer = tvm.get_global_func("test.op.InferTensorizeRegion") out_dom, in_dom = finfer(s[C], dom_map) assert tvm.ir_pass.Equal(out_dom[x].extent, 1) assert tvm.ir_pass.Equal(out_dom[y].extent, factor) assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor) fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") body = fmatch(s[C], out_dom, in_dom, gemv) assert tvm.ir_pass.Equal( tvm.ir_pass.CanonicalSimplify(body[0]), tvm.ir_pass.CanonicalSimplify(gemv.op.body[0])) stmt = tvm.schedule.ScheduleOps(s, dom_map) tvm.lower(s, [A, B, C]) def check_rfactor_no_reset_multi_reduction(factor, rfactor): s = tvm.create_schedule(C.op) x, y = C.op.axis rk = C.op.reduce_axis[0] yo, yi = s[C].split(y, factor=factor) ro, ri = s[C].split(rk, factor=rfactor) roo, roi = s[C].split(ro, factor=2) s[C].reorder(yo, roo, roi, yi, ri) gemv = intrin_gemv_no_reset(factor, rfactor) s[C].tensorize(yi, gemv) s = s.normalize() dom_map = tvm.schedule.InferBound(s) finfer = tvm.get_global_func("test.op.InferTensorizeRegion") out_dom, in_dom = finfer(s[C], dom_map) assert tvm.ir_pass.Equal(out_dom[x].extent, 1) assert tvm.ir_pass.Equal(out_dom[y].extent, factor) assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor) fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") body = fmatch(s[C], out_dom, in_dom, gemv) assert tvm.ir_pass.Equal( 
tvm.ir_pass.CanonicalSimplify(body[0]), tvm.ir_pass.CanonicalSimplify(gemv.op.body[0])) stmt = tvm.schedule.ScheduleOps(s, dom_map) tvm.lower(s, [A, B, C]) check(16) check_rfactor(16, 16) check_rfactor_no_reset(16, 16) check_rfactor_no_reset_multi_reduction(16, 16)
def _spatial_conv_only(wkl, sch, data_vec, kernel_vec, out_dtype): H, W = wkl.height, wkl.width CI, CO = wkl.in_filter, wkl.out_filter KH, KW = wkl.hkernel, wkl.wkernel HPAD, WPAD = wkl.hpad, wkl.wpad HSTR, WSTR = wkl.hstride, wkl.wstride HCAT, WCAT = KH - 1, KW - 1 VH = sch.vh VW = sch.vw VC = sch.vc UNROLL = sch.unroll TH = H + 2 * HPAD TW = W + 2 * WPAD OH = (H + 2 * HPAD - KH) // HSTR + 1 OW = (W + 2 * WPAD - KW) // WSTR + 1 ci = tvm.reduce_axis((0, CI), name='ci') dh = tvm.reduce_axis((0, KH), name='dh') dw = tvm.reduce_axis((0, KW), name='dw') ovshape = (1, CO // VC, OH // VH, OW // VW, VH, VW, VC) oshape = (1, CO, OH, OW) conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \ tvm.sum(data_vec[n, h, w, ci, vh * HSTR + dh, vw * WSTR + dw].astype(out_dtype) * kernel_vec[co, ci, dh, dw, vc].astype(out_dtype), axis=[ci, dh, dw]), name='conv') output = tvm.compute(oshape, lambda n, co, h, w: conv[n][co // VC][h // VH][ w // VW][h % VH][w % VW][co % VC], name='output_unpack', tag='spatial_conv_output') C0, C = conv, output s = tvm.create_schedule(C.op) traverse(s, C.op) CC = s.cache_write(C0, "global") _, co, oh, ow, vh, vw, vc = s[C0].op.axis if UNROLL: s[C0].unroll(vw) s[C0].vectorize(vc) s[CC].compute_at(s[C0], ow) _, co, oh, ow, vh, vw, vc = s[CC].op.axis ci, dh, dw = s[CC].op.reduce_axis s[CC].reorder(ci, dh, vh, dw, vw, vc) if UNROLL: s[CC].unroll(vw) s[CC].vectorize(vc) n, co, h, w = s[C].op.axis co, vc = s[C].split(co, VC) oh, ow, vh, vw = s[C].tile(h, w, VH, VW) s[C].reorder(n, co, oh, ow, vh, vw, vc) # if C != C1: # s[C1].compute_inline() s[C0].compute_at(s[C], ow) if sch.bc == 1: oaxis = co paxis = co else: oco, ico = s[C].split(co, sch.bc) oaxis = oco paxis = ico s[C].parallel(paxis) s[C].pragma(oaxis, "parallel_launch_point") s[C].pragma(paxis, "parallel_stride_pattern") s[C].pragma(oaxis, "parallel_barrier_when_finish") return C, s
def _compute(attrs, x, _): # NOTE: `f` is not defined here; it is assumed to be the scalar binary op # captured from the enclosing registration scope. x = x[0] scalar = attrs.get_float("scalar") scalar = tvm.const(scalar, x.dtype) return tvm.compute(x.shape, lambda *i: f(x(*i), scalar))
def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, pre_computed): """Compute declaration for winograd""" assert layout == 'NCHW' tile_size = _infer_tile_size(data, kernel) N, CI, H, W = get_const_tuple(data.shape) if not pre_computed: # kernel tensor is raw tensor, do strict check if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation if dilation_h != 1 or dilation_w != 1: kernel = dilate(kernel, (1, 1, dilation_h, dilation_w)) CO, CI, KH, KW = get_const_tuple(kernel.shape) HPAD, WPAD, _, _ = nn.get_pad_tuple(padding, kernel) HSTR, WSTR = (strides, strides) if isinstance(strides, int) else strides assert HSTR == 1 and WSTR == 1 and HPAD == 1 and WPAD == 1 and KH == 3 and KW == 3 else: # kernel tensor is pre-transformed. This op is created by # alter op layout, do not check # dilation is not supported HSTR = WSTR = 1 HPAD = WPAD = 1 KH = KW = 3 _, _, CI, CO = get_const_tuple(kernel.shape) data_pad = nn.pad(data, (0, 0, HPAD, WPAD), (0, 0, HPAD, WPAD), name="data_pad") if tile_size == 4: G_data = np.array([ [1 / 4.0, 0, 0], [-1 / 6.0, -1 / 6.0, -1 / 6.0], [-1 / 6.0, 1 / 6.0, -1 / 6.0], [1 / 24.0, 1 / 12.0, 1 / 6.0], [1 / 24.0, -1 / 12.0, 1 / 6.0], [0, 0, 1]], dtype=np.float32) B_data = np.array([ [4, 0, 0, 0, 0, 0], [0, -4, 4, -2, 2, 4], [-5, -4, -4, -1, -1, 0], [0, 1, -1, 2, -2, -5], [1, 1, 1, 1, 1, 0], [0, 0, 0, 0, 0, 1]], out_dtype) A_data = np.array([ [1, 0, 0, 0], [1, 1, 1, 1], [1, -1, 1, -1], [1, 2, 4, 8], [1, -2, 4, -8], [0, 0, 0, 1]], out_dtype) elif tile_size == 2: G_data = np.array([ [1, 0, 0], [1.0/2, 1.0/2, 1.0/2], [1.0/2, -1.0/2, 1.0/2], [0, 0, 1]], np.float32) B_data = np.array([ [1, 0, 0, 0], [0, 1, -1, 1], [-1, 1, 1, 0], [0, 0, 0, -1]], out_dtype) A_data = np.array([ [1, 0], [1, 1], [1, -1], [0, -1]], out_dtype) else: raise ValueError("Unsupported tile size for winograd: " + str(tile_size)) m = A_data.shape[1] r = 3 alpha = m + r - 1 H = (H + 2 * HPAD - KH) // HSTR + 1 W = (W + 2 * WPAD - KW) // WSTR + 1 nH, nW = (H + m-1) // m, (W + m-1) // m P = N * nH * nW # transform kernel if not pre_computed: G = const_matrix(G_data, 'G') r_kh = tvm.reduce_axis((0, KH), name='r_kh') r_kw = tvm.reduce_axis((0, KW), name='r_kw') kernel_pack = tvm.compute((alpha, alpha, CI, CO), lambda eps, nu, ci, co: tvm.sum(kernel[co][ci][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name='kernel_pack') else: kernel_pack = kernel # pack input tile input_tile = tvm.compute((CI, P, alpha, alpha), lambda c, p, eps, nu: data_pad[p // (nH * nW)][c][p // nW % nH * m + eps] [p % nW * m + nu], name='d') # transform data B = const_matrix(B_data, 'B') r_a = tvm.reduce_axis((0, alpha), 'r_a') r_b = tvm.reduce_axis((0, alpha), 'r_b') data_pack = tvm.compute((alpha, alpha, CI, P), lambda eps, nu, ci, p: tvm.sum(input_tile[ci][p][r_a][r_b] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b]), name='data_pack') # do batch gemm ci = tvm.reduce_axis((0, CI), name='ci') bgemm = tvm.compute((alpha, alpha, CO, P), lambda eps, nu, co, p: tvm.sum(kernel_pack[eps][nu][ci][co] * data_pack[eps][nu][ci][p], axis=[ci]), name='bgemm') # inverse transform A = const_matrix(A_data, 'A') r_a = tvm.reduce_axis((0, alpha), 'r_a') r_b = tvm.reduce_axis((0, alpha), 'r_b') inverse = tvm.compute((CO, P, m, m), lambda co, p, vh, vw: tvm.sum(bgemm[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]), name='inverse') # output output = tvm.compute((N, CO, H, W), lambda n, co, h, w: inverse[co][n * nH * nW + (h // m) * nW + w // m][h % m][w % m], 
name='output', tag='conv2d_nchw_winograd') cfg.add_flop(2 * N * CO * H * W * CI * KH * KW) return output
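# A numpy sanity check (added here, not part of the original file) of the
# F(2x2, 3x3) identity the kernel uses when tile_size == 2. The G, B, A
# matrices are copied from that branch; Y = A^T [(G g G^T) * (B^T d B)] A
# should equal direct 3x3 cross-correlation of a 4x4 tile d with kernel g.
import numpy as np

G = np.array([[1, 0, 0], [0.5, 0.5, 0.5], [0.5, -0.5, 0.5], [0, 0, 1]])
B = np.array([[1, 0, 0, 0], [0, 1, -1, 1], [-1, 1, 1, 0], [0, 0, 0, -1]])
A = np.array([[1, 0], [1, 1], [1, -1], [0, -1]])

d = np.random.rand(4, 4)
g = np.random.rand(3, 3)
winograd = A.T @ ((G @ g @ G.T) * (B.T @ d @ B)) @ A
direct = np.array([[(d[i:i+3, j:j+3] * g).sum() for j in range(2)]
                   for i in range(2)])
np.testing.assert_allclose(winograd, direct)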
import tvm import numpy as np ###################################################################### # Direct Declare Extern Math Call # ------------------------------- # The most straightforward way to call a target-specific function is via # the extern function call construct in TVM. # In the following example, we use :any:`tvm.call_pure_extern` to call # the :code:`__expf` function, which is only available under CUDA. # n = tvm.var("n") A = tvm.placeholder((n, ), name='A') B = tvm.compute(A.shape, lambda i: tvm.call_pure_extern("float32", "__expf", A[i]), name="B") s = tvm.create_schedule(B.op) num_thread = 64 bx, tx = s[B].split(B.op.axis[0], factor=num_thread) s[B].bind(bx, tvm.thread_axis("blockIdx.x")) s[B].bind(tx, tvm.thread_axis("threadIdx.x")) f = tvm.build(s, [A, B], "cuda", name="myexp") print(f.get_source()) ###################################################################### # Unified Intrinsic Call # ---------------------- # The above code verifies that a direct external call can be used to # invoke device-specific functions. # However, this approach only works for the CUDA target with float input.
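# A minimal sketch of the unified alternative: tvm.exp is a target-independent
# intrinsic that lowers to the appropriate device function (e.g. __expf for
# float32 on CUDA) instead of hard-coding an extern name.
n = tvm.var("n")
A = tvm.placeholder((n, ), name='A')
B = tvm.compute(A.shape, lambda i: tvm.exp(A[i]), name="B")
s = tvm.create_schedule(B.op)
bx, tx = s[B].split(B.op.axis[0], factor=64)
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
fcuda = tvm.build(s, [A, B], "cuda", name="myexp")
print(fcuda.imported_modules[0].get_source())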
def _spatial_pack(data, kernel, stride, padding, out_dtype=None): """ Compute convolution with pack on spatial axes. """ if out_dtype is None: out_dtype = data.dtype assert data.shape[0].value == 1, "spatial pack convolution only supports batch size=1" wkl = _get_workload(data, kernel, stride, padding, out_dtype) sch = _get_schedule(wkl) H, W = wkl.height, wkl.width CI, CO = wkl.in_filter, wkl.out_filter KH, KW = wkl.hkernel, wkl.wkernel HPAD, WPAD = wkl.hpad, wkl.wpad HSTR, WSTR = wkl.hstride, wkl.wstride HCAT, WCAT = KH - 1, KW - 1 VH = sch.vh VW = sch.vw VC = sch.vc UNROLL = sch.unroll TH = H + 2 * HPAD TW = W + 2 * WPAD OH = (H + 2 * HPAD - KH) // HSTR + 1 OW = (W + 2 * WPAD - KW) // WSTR + 1 dshape = (1, CI, H, W) dpshape = (1, CI, TH, TW) dvshape = (1, TH // (VH * HSTR), TW // (VW * WSTR), CI, VH * HSTR + HCAT, VW * WSTR + WCAT) kshape = (CO, CI, KH, KW) kvshape = (CO // VC, CI, KH, KW, VC) ovshape = (1, CO // VC, OH // VH, OW // VW, VH, VW, VC) oshape = (1, CO, OH, OW) DOPAD = (HPAD != 0 or WPAD != 0) if DOPAD: data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") else: data_pad = data data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw: \ data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], name='data_vec') kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, vc: \ kernel[co*VC+vc][ci][dh][dw], name='kernel_vec') ci = tvm.reduce_axis((0, CI), name='ci') dh = tvm.reduce_axis((0, KH), name='dh') dw = tvm.reduce_axis((0, KW), name='dw') conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \ tvm.sum(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw].astype(out_dtype) * kernel_vec[co, ci, dh, dw, vc].astype(out_dtype), axis=[ci, dh, dw]), name='conv') output = tvm.compute(oshape, lambda n, co, h, w: conv[n][co // VC][h // VH][w // VW] [h % VH][w % VW][co % VC], name='output_unpack', tag='spatial_conv_output') return output
def _declaration_conv_impl(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): out_dtype = data.dtype if out_dtype is None else out_dtype assert layout == 'NCHW', "only supports NCHW convolution for AVX" assert isinstance(dilation, int) or len(dilation) == 2 if isinstance(dilation, int): dilation_h, dilation_w = dilation, dilation else: dilation_h, dilation_w = dilation HPAD, WPAD = padding HSTR, WSTR = strides batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape) pad_height = in_height + 2 * HPAD pad_width = in_width + 2 * WPAD dilated_kernel_h = (kernel_height - 1) * dilation_h + 1 dilated_kernel_w = (kernel_width - 1) * dilation_w + 1 out_height = (in_height + 2 * HPAD - dilated_kernel_h) // HSTR + 1 out_width = (in_width + 2 * WPAD - dilated_kernel_w) // WSTR + 1 # pack data DOPAD = (HPAD != 0 or WPAD != 0) if DOPAD: data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") else: data_pad = data # fetch schedule ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] shape = (batch_size, in_channel // ic_bn, pad_height, ic_bn, pad_width) data_vec = tvm.compute( shape, lambda n, C, h, c, w: data_pad[n, C * ic_bn + c, h, w], name='data_vec') # pack kernel shape = (num_filter // oc_bn, in_channel // ic_bn, kernel_height, kernel_width, ic_bn, oc_bn) kernel_vec = tvm.compute(shape, lambda CO, CI, h, w, ci, co: kernel[ CO * oc_bn + co, CI * ic_bn + ci, h, w], name='kernel_vec') # convolution oshape = (batch_size, num_filter // oc_bn, out_height, out_width, oc_bn) unpack_shape = (batch_size, num_filter, out_height, out_width) ic = tvm.reduce_axis((0, in_channel), name='ic') kh = tvm.reduce_axis((0, kernel_height), name='kh') kw = tvm.reduce_axis((0, kernel_width), name='kw') conv = tvm.compute( oshape, lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(data_vec[ n, ic // ic_bn, oh * HSTR + kh * dilation_h, ic % ic_bn, ow * WSTR + kw * dilation_w].astype(out_dtype) * kernel_vec[ oc_chunk, ic // ic_bn, kh, kw, ic % ic_bn, oc_block].astype( out_dtype), axis=[ic, kh, kw]), name='conv') unpack = tvm.compute(unpack_shape, lambda n, c, h, w: conv[n, c // oc_bn, h, w, c % oc_bn ].astype(out_dtype), name='output_unpack', tag='conv2d_nchw') return unpack
def non_max_suppression_gpu(data, valid_count, max_output_size=-1, iou_threshold=0.5, force_suppress=False, top_k=-1, coord_start=2, score_index=1, id_index=0, return_indices=True, invalid_to_bottom=False): """Non-maximum suppression operator for object detection. Parameters ---------- data : tvm.Tensor 3-D tensor with shape [batch_size, num_anchors, elem_length]. The last dimension should be in format of [class_id, score, box_left, box_top, box_right, box_bottom]. valid_count : tvm.Tensor 1-D tensor for valid number of boxes. max_output_size : optional, int Max number of output valid boxes for each instance. By default all valid boxes are returned. iou_threshold : optional, float Non-maximum suppression threshold. force_suppress : optional, boolean Whether to suppress all detections regardless of class_id. top_k : optional, int Keep maximum top k detections before nms, -1 for no limit. coord_start : required, int Start index of the consecutive 4 coordinates. score_index : optional, int Index of the scores/confidence of boxes. id_index : optional, int Index of the class categories, -1 to disable. return_indices : boolean Whether to return box indices in input data. invalid_to_bottom : optional, boolean Whether to move all valid bounding boxes to the top. Returns ------- out : tvm.Tensor 3-D tensor with shape [batch_size, num_anchors, elem_length]. Example -------- .. code-block:: python # An example to use nms dshape = (1, 5, 6) data = tvm.placeholder(dshape, name="data") valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count") iou_threshold = 0.7 force_suppress = True top_k = -1 out = non_max_suppression(data=data, valid_count=valid_count, iou_threshold=iou_threshold, force_suppress=force_suppress, top_k=top_k, return_indices=False) np_data = np.random.uniform(size=dshape) np_valid_count = np.array([4]) s = topi.generic.schedule_nms(out) f = tvm.build(s, [data, valid_count, out], "cuda") ctx = tvm.gpu(0) tvm_data = tvm.nd.array(np_data, ctx) tvm_valid_count = tvm.nd.array(np_valid_count, ctx) tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx) f(tvm_data, tvm_valid_count, tvm_out) """ batch_size = data.shape[0] num_anchors = data.shape[1] valid_count_dtype = "int32" valid_count_buf = api.decl_buffer(valid_count.shape, valid_count_dtype, "valid_count_buf", data_alignment=4) score_axis = score_index score_shape = (batch_size, num_anchors) score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis], tag=tag.ELEMWISE) sort_tensor = argsort(score_tensor, valid_count=valid_count, axis=1, is_ascend=False) sort_tensor_buf = api.decl_buffer(sort_tensor.shape, sort_tensor.dtype, "sort_tensor_buf", data_alignment=8) data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) out_buf = api.decl_buffer(data.shape, data.dtype, "out_buf", data_alignment=8) out, box_indices = \ tvm.extern([data.shape, score_shape], [data, sort_tensor, valid_count], lambda ins, outs: nms_ir( ins[0], ins[1], ins[2], outs[0], outs[1], max_output_size, iou_threshold, force_suppress, top_k, coord_start, id_index, score_index), dtype=[data.dtype, "int32"], in_buffers=[data_buf, sort_tensor_buf, valid_count_buf], name="nms", tag="nms") if return_indices: return box_indices if invalid_to_bottom: output_buf = api.decl_buffer(data.shape, data.dtype, "output_buf", data_alignment=8) temp_flag_buf = api.decl_buffer(score_shape, valid_count_dtype, "temp_flag", data_alignment=8) temp_idx_buf = api.decl_buffer(score_shape, valid_count_dtype, "temp_idx", data_alignment=8) 
temp_flag, temp_idx = tvm.extern( [score_shape, score_shape], [out], lambda ins, outs: invalid_to_bottom_pre(ins[0], outs[0], outs[1]), dtype=["int32", "int32"], in_buffers=[out_buf], out_buffers=[temp_flag_buf, temp_idx_buf], name="invalid_to_bottom_phase_one") output = tvm.extern([data.shape], [out, temp_flag, temp_idx], lambda ins, outs: invalid_to_bottom_ir( ins[0], ins[1], ins[2], outs[0]), dtype=[data.dtype], in_buffers=[out_buf, temp_flag_buf, temp_idx_buf], out_buffers=[output_buf], name="invalid_to_bottom", tag="invalid_to_bottom") return output return out
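# A reference numpy sketch (not from the original file) of the core greedy
# loop that the nms_ir above implements on the GPU: repeatedly keep the
# highest-scoring box and discard any remaining box whose IoU with a kept
# box exceeds the threshold.
import numpy as np

def nms_ref(boxes, scores, iou_threshold=0.5):
    # boxes: (N, 4) as [left, top, right, bottom]; returns kept indices
    order = scores.argsort()[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(boxes[i, 0], boxes[order[1:], 0])
        yy1 = np.maximum(boxes[i, 1], boxes[order[1:], 1])
        xx2 = np.minimum(boxes[i, 2], boxes[order[1:], 2])
        yy2 = np.minimum(boxes[i, 3], boxes[order[1:], 3])
        inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
        area_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
        area_r = (boxes[order[1:], 2] - boxes[order[1:], 0]) * \
                 (boxes[order[1:], 3] - boxes[order[1:], 1])
        iou = inter / (area_i + area_r - inter)
        order = order[1:][iou <= iou_threshold]
    return keep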
import tvm n = 1024 A = tvm.placeholder((n,), name='A') k = tvm.reduce_axis((0, n), name='k') B = tvm.compute((1,), lambda i: tvm.sum(A[k], axis=k), name='B') s = tvm.create_schedule(B.op) print(tvm.lower(s, [A, B], simple_mode=True)) print("---------cutting line---------") ko, ki = s[B].split(B.op.reduce_axis[0], factor=32) print(tvm.lower(s, [A, B], simple_mode=True))
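# A short follow-up sketch (not in the original snippet): after splitting the
# reduction, s.rfactor can materialize the partial sums along the inner axis
# ki as a separate stage, exposing parallelism across the outer axis ko.
BF = s.rfactor(B, ki)
print(tvm.lower(s, [A, B], simple_mode=True))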
import tvm import numpy as np ###################################################################### # Define Matrix Multiplication # ---------------------------- # Take matrix multiplication as our example. # Matmul first multiplies the corresponding elements of the two matrices, # then accumulates across a certain axis. # The following lines describe the computation :code:`A * B^T` in TVM. # N, M, L = 1024, 512, 64 A = tvm.placeholder((N, L), name='A') B = tvm.placeholder((M, L), name='B') k = tvm.reduce_axis((0, L), name='k') C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[j, k], axis=k), name='C') s = tvm.create_schedule(C.op) print(tvm.lower(s, [A, B, C], simple_mode=True)) ###################################################################### # Schedule the Matmul # ------------------- # Now, suppose we have an accelerator that supports # matrix-vector multiplication (GEMV) as a hardware primitive, # which can take an arbitrarily sized reduce axis, # but the other axis needs to be no larger than 16. # Thus we break down the matmul loops so that the innermost loops form a (16x64) GEMV. # factor = 16 x, y = C.op.axis yo, yi = s[C].split(y, factor=factor)
# .. note:: # # Now we go back to the local machine, which has a full TVM installed # (with LLVM). # # Here we will declare a simple kernel on the local machine: import numpy as np import tvm from tvm import rpc from tvm.contrib import util n = tvm.convert(1024) A = tvm.placeholder((n, ), name='A') B = tvm.compute((n, ), lambda i: A[i] + 1.0, name='B') s = tvm.create_schedule(B.op) ###################################################################### # Then we cross compile the kernel. # The target should be 'llvm -target=armv7l-linux-gnueabihf' for # Raspberry Pi 3B, but we use 'llvm' here to make this tutorial runnable # on our webpage building server. See the detailed note in the following block. local_demo = True if local_demo: target = 'llvm' else: target = 'llvm -target=armv7l-linux-gnueabihf'
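# A minimal follow-up sketch: build the kernel for the chosen target and
# export it as a library, which is what later gets uploaded through the RPC
# session.
func = tvm.build(s, [A, B], target=target, name='add_one')
temp = util.tempdir()
path = temp.relpath('add_one.tar')
func.export_library(path)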
def intrinsic_gemm(i, j, k, il, jl, kl, ic, jc, kc): """ (i, k) * (k, j) i, j, k: normal iteration size il, jl, kl: last iteration size ic, jc, kc: last iteration condition """ assert i * k + k * j <= 256 * 1024, 'input too large for scratchpad' assert 4 * (i * j) <= 64 * 1024, 'input too large for accumulator' a = tvm.placeholder((i, k), name='a', dtype=dtype) b = tvm.placeholder((k, j), name='b', dtype=dtype) kk = tvm.reduce_axis((0, k), name='k') c = tvm.compute((i, j), lambda ii, jj: tvm.sum(a[ii, kk] * b[kk, jj], axis=kk), name='c') strideA = tvm.var("sA") Ab = tvm.decl_buffer(a.shape, a.dtype, name="A", offset_factor=1, strides=[strideA, 1]) strideB = tvm.var("sB") Bb = tvm.decl_buffer(b.shape, b.dtype, name="B", offset_factor=1, strides=[strideB, 1]) strideC = tvm.var("sC") Cb = tvm.decl_buffer(c.shape, c.dtype, name="C", offset_factor=1, strides=[strideC, 1]) II = i // DIM + (0 if i % DIM == 0 else 1) JJ = j // DIM + (0 if j % DIM == 0 else 1) KK = k // DIM + (0 if k % DIM == 0 else 1) pad_I = 0 if i % DIM == 0 else (DIM - i % DIM) pad_J = 0 if j % DIM == 0 else (DIM - j % DIM) pad_K = 0 if k % DIM == 0 else (DIM - k % DIM) IIl = il // DIM + (0 if il % DIM == 0 else 1) JJl = jl // DIM + (0 if jl % DIM == 0 else 1) KKl = kl // DIM + (0 if kl % DIM == 0 else 1) pad_Il = 0 if il % DIM == 0 else (DIM - il % DIM) pad_Jl = 0 if jl % DIM == 0 else (DIM - jl % DIM) pad_Kl = 0 if kl % DIM == 0 else (DIM - kl % DIM) II = tvm.if_then_else(ic, IIl, II) JJ = tvm.if_then_else(jc, JJl, JJ) KK = tvm.if_then_else(kc, KKl, KK) pad_I = tvm.if_then_else(ic, pad_Il, pad_I) pad_J = tvm.if_then_else(jc, pad_Jl, pad_J) pad_K = tvm.if_then_else(kc, pad_Kl, pad_K) # reset-update-finalize def intrin_func(ins, outs): aa, bb = ins cc, = outs def _body(): ib = tvm.ir_builder.create() # int32_t matmul_kernel(const elem_t *A, const elem_t *B, const acc_t *D, # elem_t *C, int32_t I, int32_t J, int32_t K, int32_t pad_I, # int32_t pad_J, int32_t pad_K, int32_t A_row_len, # int32_t B_row_len, int32_t D_row_len, int32_t C_row_len, # bool no_bias, bool repeating_bias); # D is set to a dummy address 1 to determine whether to overwrite # accumulator contents: on the first run, 1 will be retained and # overwrite the value in the accumulator; on subsequent runs D will be # replaced by NULL and C will accumulate on top of the accumulator's contents # This is controlled via bit 1 << (ADDR_LEN - 2) - see kernel source ib.emit( tvm.call_extern("int32", "matmul_kernel", aa.access_ptr("r"), bb.access_ptr("r"), 1, cc.access_ptr("rw"), II, JJ, KK, pad_I, pad_J, pad_K, strideA, strideB, 0, strideC, True, False)) return ib.get() def _reset(): ib = tvm.ir_builder.create() # int32_t matmul_reset(elem_t *C, int32_t I, int32_t J, int32_t pad_I, # int32_t pad_J, int32_t C_row_len); ib.emit( tvm.call_extern("int32", "matmul_reset", cc.access_ptr("w"), II, JJ, pad_I, pad_J, strideC)) return ib.get() def _finalize(): ib = tvm.ir_builder.create() # Move out C from accumulator # int32_t matmul_finalize(elem_t *C, int32_t I, int32_t J, int32_t pad_I, # int32_t pad_J, int32_t C_row_len); ib.emit( tvm.call_extern("int32", "matmul_finalize", cc.access_ptr("rw"), II, JJ, pad_I, pad_J, strideC)) return ib.get() # standalone (without reduce axis split), reset, update return None, _reset(), _body(), _finalize() with tvm.build_config(offset_factor=1): return tvm.decl_tensor_intrin(c.op, intrin_func, binds={ a: Ab, b: Bb, c: Cb }, name="sp_gemm")
ctx = tvm.context(target, 0) # ground truth a = tvm.nd.array(np.random.rand(M, K).astype(dtype), ctx) b = tvm.nd.array(np.random.rand(K, N).astype(dtype), ctx) c = tvm.nd.array(np.zeros((M, N), dtype=dtype), ctx) answer = np.dot(a.asnumpy(), b.asnumpy()) ################### # TVM part # Algorithm k = tvm.reduce_axis((0, K), 'k') A = tvm.placeholder((M, K), name='A') B = tvm.placeholder((K, N), name='B') C = tvm.compute((M, N), lambda x, y: tvm.sum(A[x, k] * B[k, y], axis=k), name='C') s = tvm.create_schedule(C.op) # Blocking by loop tiling bn = 64 xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], x_factor=bn, y_factor=bn) k, = s[C].op.reduce_axis ko, ki = s[C].split(k, factor=8) # Hoist reduction domain outside the blocking loop s[C].reorder(xo, yo, ko, ki, xi, yi) # Vectorization s[C].vectorize(yi)
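# A hypothetical follow-up (not part of the original snippet): build the
# tiled schedule, verify it against the numpy ground truth computed above,
# and time it with the module's built-in evaluator.
func = tvm.build(s, [A, B, C], target=target, name='mmult')
func(a, b, c)
np.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
print('Opt: %f s' % evaluator(a, b, c).mean)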