def test_rfactor(): n = tvm.var('n') k1 = tvm.reduce_axis((0, n), name="k1") k2 = tvm.reduce_axis((0, n), name="k2") A = tvm.placeholder((n, n, n), name='A') B = tvm.compute((n, ), lambda i: tvm.sum(A[i, k1, k2], axis=[k1, k2])) # normal schedule s = tvm.create_schedule(B.op) BF = s.rfactor(B, k1) assert(tuple(BF.shape) == (n, n)) assert(set(BF.op.body[0].axis) == set([k2])) assert(s[B].op.body[0].axis[0].dom.extent == n) assert(len(s[B].all_iter_vars) == 2) # schedule with splot s = tvm.create_schedule(B.op) ko, ki = s[B].split(k1, factor=4) xo, xi = s[B].split(B.op.axis[0], factor=8) BF = s.rfactor(B, ki) assert(BF.shape[0].value == 4) assert(BF.shape[1] == n) assert(BF.op.body[0].axis[0] == k2) assert(BF.op.body[0].axis[1].var == ko.var) assert(s[B].op.body[0].axis[0].dom.extent.value == 4) # schedule with factor_axis s = tvm.create_schedule(B.op) ko, ki = s[B].split(k1, factor=4) xo, xi = s[B].split(B.op.axis[0], factor=8) BF = s.rfactor(B, ki, 1) assert(n == BF.shape[0]) assert(BF.shape[1].value == 4) assert(BF.op.body[0].axis[0] == k2) assert(BF.op.body[0].axis[1].var == ko.var) assert(s[B].op.body[0].axis[0].dom.extent.value == 4)
def verify_full(shape, dtype, fill_value): A = tvm.placeholder(shape, dtype=dtype, name="A") B = topi.cpp.full_like(A, fill_value) C = topi.cpp.full(shape, dtype, fill_value) s1 = tvm.create_schedule([B.op]) s2 = tvm.create_schedule([C.op]) def get_ref_data(): return np.full(shape, fill_value, dtype) np_nd = get_ref_data() def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return target = topi.cpp.TEST_create_target(device) ctx = tvm.context(device, 0) out = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx) f = tvm.build(s1, [A, B], device, name="full_like") f(tvm.nd.array(np.zeros(shape, dtype), ctx), out) tvm.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5) f = tvm.build(s2, [C], device, name="full") f(out) tvm.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5) for device in ["llvm"]: check_device(device)
def test_rpc_module(): # graph n = tvm.convert(1024) A = tvm.placeholder((n,), name='A') B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') temp = util.tempdir() s = tvm.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) s[B].bind(xi, tvm.thread_axis("threadIdx.x")) s[B].bind(xo, tvm.thread_axis("blockIdx.x")) # Build the dynamic lib. # If we don't want to do metal and only use cpu, just set target to be target f = tvm.build(s, [A, B], "metal", target_host=target, name="myadd") path_dso1 = temp.relpath("dev_lib.dylib") f.export_library(path_dso1, xcode.create_dylib, arch=arch, sdk=sdk) xcode.codesign(path_dso1) s = tvm.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) s[B].parallel(xi) s[B].pragma(xo, "parallel_launch_point") s[B].pragma(xi, "parallel_barrier_when_finish") f = tvm.build(s, [A, B], target, name="myadd_cpu") path_dso2 = temp.relpath("cpu_lib.dylib") f.export_library(path_dso2, xcode.create_dylib, arch=arch, sdk=sdk) xcode.codesign(path_dso2) # Start RPC test server that contains the compiled library. server = xcode.popen_test_rpc(proxy_host, proxy_port, key, destination=destination, libs=[path_dso1, path_dso2]) # connect to the proxy remote = rpc.connect(proxy_host, proxy_port, key=key) ctx = remote.metal(0) f1 = remote.load_module("dev_lib.dylib") a_np = np.random.uniform(size=1024).astype(A.dtype) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) time_f = f1.time_evaluator(f1.entry_name, ctx, number=10) cost = time_f(a, b).mean print('%g secs/op' % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) # CPU ctx = remote.cpu(0) f2 = remote.load_module("cpu_lib.dylib") a_np = np.random.uniform(size=1024).astype(A.dtype) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) time_f = f2.time_evaluator(f1.entry_name, ctx, number=10) cost = time_f(a, b).mean print('%g secs/op' % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
def test_add_pipeline(): nn = 64 max_threads = 4 n = tvm.convert(nn) A = tvm.placeholder((n,), name='A') def extern_generator(ins, outs): """Manually write the IR for the extern function, add pipeline""" ib = tvm.ir_builder.create() with ib.for_range(0, (n+1) // 2) as i: ib.emit(outs[0].vstore(i*2, ins[0].vload(i*2, "float32x2") + tvm.const(1, "float32x2"))) return ib.get() def extern_generator_gpu(ins, outs): """Manually write the IR for the extern function, add pipeline""" ib = tvm.ir_builder.create() bx = tvm.thread_axis("blockIdx.x") tx = tvm.thread_axis("threadIdx.x") ib.scope_attr(bx, "thread_extent", (nn+max_threads-1) // max_threads) ib.scope_attr(tx, "thread_extent", max_threads) idx = bx.var * max_threads + tx.var with ib.if_scope(ib.likely(idx < n)): ib.emit(outs[0].vstore(idx*2, ins[0].vload(idx*2, "float32x2") + tvm.const(1, "float32x2"))) return ib.get() C_cpu = tvm.extern(A.shape, [A], extern_generator, name='C') C_gpu = tvm.extern(A.shape, [A], extern_generator_gpu, name='C') s_cpu = tvm.create_schedule(C_cpu.op) s_gpu = tvm.create_schedule(C_gpu.op) print(tvm.lower(s_cpu, [A, C_cpu], simple_mode=True)) print(tvm.lower(s_gpu, [A, C_gpu], simple_mode=True)) def check_target(target): if not tvm.module.enabled(target): return s = s_gpu if target in ['opencl', 'cuda'] else s_cpu C = C_gpu if target in ['opencl', 'cuda'] else C_cpu # build and invoke the kernel. f = tvm.build(s, [A, C], target) ctx = tvm.context(target, 0) # launch the kernel. n = nn a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1) check_target("llvm") check_target("opencl") check_target("cuda")
def test_bound_tensor_compute_op(): def intrin_test(): m1 = tvm.var("m1") n1 = tvm.var("n1") a = tvm.placeholder((m1, n1), name='a') c = tvm.compute((1, n1), lambda i, j : a[0, j] + a[1, j] + a[2, j], name='c') Ab = tvm.decl_buffer(a.shape, name="Abuf", offset_factor=1) Cb = tvm.decl_buffer(c.shape, name="Cbuf", offset_factor=1) def intrin_func(ins, outs): aa = ins[0] cc = outs[0] def _body(): ib = tvm.ir_builder.create() ib.emit(tvm.call_extern("int32", "test", cc.access_ptr("w"), aa.access_ptr("r"))) return ib.get() return _body() with tvm.build_config(offset_factor=1): return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a : Ab, c : Cb}) test_func = intrin_test() A = tvm.placeholder((20,20), name='A') B = tvm.compute(A.shape, lambda i,j : A[i,j], name='B') C = tvm.compute((10, 20), lambda i : test_func(B[i:10, 0:20]), name='C') s = tvm.create_schedule(C.op) bounds = tvm.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) assert(bounds[B.op.axis[0]].extent.value == 10)
def test_scan_group(): m = tvm.var("m") n = tvm.var("n") x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x") s_state = tvm.placeholder((m, n)) s_init = tvm.compute((1, n), lambda _, i: x[0, i]) s_update1 = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + x[t, i]) s_update2 = tvm.compute((m, n), lambda t, i: s_update1[t, i] + 1) s_update3 = tvm.compute((m, n), lambda t, i: s_update2[t, i] + 1) res = tvm.scan(s_init, s_update3, s_state, inputs=x) s = tvm.create_schedule(res.op) assert s[s_update1].group is not None assert s[s_update2].group == s[s_update1].group # Assign within group, is valid s[s_update1].compute_at(s[s_update2], s_update2.op.axis[1]) # create a new group, for [s_update2 and s_update1] g2 = s.create_group(outputs=s_update2, inputs=[s_state, x]) assert g2.group is not None assert g2.group == s[s_update3].group assert s[s_update2].group == g2 assert s[s_update1].group == g2 g2.compute_at(s[s_update3], s_update3.op.axis[1]) assert g2.attach_stage == s[s_update3] try: # compute outside group error. s[s_update2].compute_at(s[s_init], s_init.op.axis[0]) assert False except tvm.TVMError: pass
def schedule_conv2d_nchw(outs): """Schedule for conv2d_nchw for Intel Graphics Parameters ---------- outs: Array of Tensor The computation graph description of conv2d_nchw in the format of an array of tensors. Returns ------- s: Schedule The computation schedule for conv2d_nchw. """ outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) scheduled_ops = [] def traverse(op): """inline all one-to-one-mapping operators except the last stage (output)""" if tag.is_broadcast(op.tag): if op not in s.outputs: s[op].compute_inline() for tensor in op.input_tensors: if tensor.op.input_tensors and tensor.op not in scheduled_ops: traverse(tensor.op) if 'conv2d' in op.tag: _schedule_cl_spatialpack(s, op) scheduled_ops.append(op) traverse(outs[0].op) return s
def test_bound_nest_thread(): m = tvm.var('m') A = tvm.placeholder((m), name='A') A1 = tvm.compute((m,), lambda i: A[i], name='A1') A2 = tvm.compute((m,), lambda i: A1[i] + 2, name='A2') A3 = tvm.compute((m,), lambda i: A2[i] + 3, name='A3') s = tvm.create_schedule(A3.op) s[A2].set_scope("shared") s[A1].set_scope("local") block_x = tvm.thread_axis("blockIdx.x") thread_x = tvm.thread_axis("threadIdx.x") bx, tx = s[A3].split(A3.op.axis[0], factor=32) s[A3].bind(bx, block_x) s[A3].bind(tx, thread_x) s[A2].compute_at(s[A3], tx) _, xi = s[A2].split(A2.op.axis[0], nparts=1) s[A2].bind(xi, thread_x) s[A1].compute_at(s[A3], tx) s = s.normalize() bounds = tvm.schedule.InferBound(s) assert(bounds[A1.op.axis[0]].extent.value==1) assert(bounds[A2.op.axis[0]].extent.value==32) assert(bounds[A3.op.axis[0]].extent == m)
def schedule_conv2d_nchw_cuda(cfg, outs): """TOPI schedule callback of conv2d for cuda gpu Parameters ---------- cfg: ConfigEntity The config for this template outs: Array of Tensor The computation graph description of conv2d in the format of an array of tensors. Returns ------- s: Schedule The computation schedule for conv2d. """ target = tvm.target.current_target() if 'cudnn' in target.libs: return generic.schedule_extern(outs) outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'conv2d_nchw': schedule_direct_cuda(cfg, s, op.output(0)) if op.tag == 'conv2d_nchw_winograd': schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=False) if op.tag == "conv2d_NCHWc_int8": schedule_conv2d_NCHWc_int8(cfg, s, op.output(0)) traverse_inline(s, outs[0].op, _callback) return s
def test_min_repeat_ms(): tmp = tempdir() filename = tmp.relpath("log") @tvm.register_func def my_debug(filename): """one call lasts for 100 ms and writes one character to a file""" time.sleep(0.1) with open(filename, "a") as fout: fout.write("c") X = tvm.compute((), lambda : tvm.call_packed("my_debug", filename)) s = tvm.create_schedule(X.op) func = tvm.build(s, [X]) x = tvm.nd.empty((), dtype="int32") ftimer = func.time_evaluator(func.entry_name, tvm.cpu(), number=1, repeat=1) ftimer(x) with open(filename, "r") as fin: ct = len(fin.readline()) assert ct == 2 ftimer = func.time_evaluator(func.entry_name, tvm.cpu(), number=1, repeat=1, min_repeat_ms=1000) ftimer(x) # make sure we get more than 10 calls with open(filename, "r") as fin: ct = len(fin.readline()) assert ct > 10 + 2
def verify_log_softmax(m, n): A = tvm.placeholder((m, n), name='A') B = topi.nn.log_softmax(A) # confirm lower works s = tvm.create_schedule([B.op]) tvm.lower(s, [A, B], simple_mode=True) a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) b_np = topi.testing.log_softmax_python(a_np) def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): s = topi.generic.schedule_softmax(B) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) foo = tvm.build(s, [A, B], device, name="log_softmax") foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) for device in ["opengl"]: check_device(device)
def test_matmul_add(): n = 1024 l = 128 m = 235 A = tvm.placeholder((n, l), name='A') B = tvm.placeholder((l, m), name='B') C = rocblas.matmul(A, B) s = tvm.create_schedule(C.op) def verify(target="rocm"): if not tvm.module.enabled(target): print("skip because %s is not enabled..." % target) return if not tvm.get_global_func("tvm.contrib.rocblas.matmul", True): print("skip because extern function is not available") return ctx = tvm.rocm(0) f = tvm.build(s, [A, B, C], target) a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx) c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) f(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5) verify()
def _schedule_conv2d(outs): outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) tvm.schedule.AutoInlineInjective(s) def traverse(OP): """Internal travserse function""" # inline all one-to-one-mapping operators except the last stage (output) if tag.is_injective(OP.tag): if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: if tensor.op.input_tensors: traverse(tensor.op) # schedule conv2d elif OP.tag.find("conv2d") >= 0: Conv2d = OP.output(0) if not Conv2d.op in s.outputs: Out = outs[0].op.output(0) s[Conv2d].compute_at(s[Out], s[Out].op.axis[1]) else: raise RuntimeError("Unsupported operator: %s" % OP.tag) traverse(outs[0].op) px, x = s[outs[0]].split(outs[0].op.axis[0], nparts=1) s[outs[0]].bind(px, tvm.thread_axis("pipeline")) return s
def test_add(): nn = 1024 n = tvm.convert(nn) A = tvm.placeholder((n,), name='A') B = tvm.placeholder((n,), name='B') C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') s = tvm.create_schedule(C.op) def check_c(): mhost = tvm.build(s, [A, B, C], "c", name="fadd") temp = util.tempdir() path_dso = temp.relpath("temp.so") mhost.export_library(path_dso) m = tvm.module.load(path_dso) fadd = m['fadd'] ctx = tvm.cpu(0) # launch the kernel. n = nn a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) fadd(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), a.asnumpy() + b.asnumpy()) check_c()
def test_double_splitting_with_indivisible_factors(): m = 48 dtype="float32" A = tvm.placeholder((m,), name='A', dtype=dtype) C = tvm.compute((m,), lambda i: A[i], name='C') D = tvm.compute((m,), lambda i: C[i], name='D') s = tvm.create_schedule(D.op) co, ci = s[C].split(C.op.axis[0], factor=10) do, di = s[D].split(D.op.axis[0], 32) s[C].compute_at(s[D], do) target = 'llvm' with tvm.build_config(partition_const_loop=True): f = tvm.lower(s, [A, C, D], name="fadd1", simple_mode=False) func = tvm.build(f, target=target) # Find the beginning of the Halide IR corresponding to kernel code # and make sure it doesn't have an if statements left top_produce = find_top_produce(f.body) assert(not any(collect_visit(top_produce, lambda x: isinstance(x, tvm.stmt.IfThenElse)))) # check functional correctness of generated code ctx = tvm.context(target, 0) a = tvm.nd.array(numpy.ones(m,).astype(dtype), ctx) c = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx) d = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx) func(a, c, d) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy(), rtol=1e-5) tvm.testing.assert_allclose(d.asnumpy(), a.asnumpy(), rtol=1e-5)
def schedule_softmax(outs): """Schedule for softmax Parameters ---------- outs: Array of Tensor The computation graph description of softmax in the format of an array of tensors. Returns ------- sch: Schedule The computation schedule for the op. """ outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) tvm.schedule.AutoInlineInjective(s) softmax = outs[0] max_elem = softmax.op.input_tensors[1] expsum = softmax.op.input_tensors[2] s[expsum].compute_at(s[softmax], s[softmax].op.axis[1]) s[max_elem].compute_at(s[softmax], s[softmax].op.axis[1]) px, x = s[softmax].split(softmax.op.axis[0], nparts=1) s[softmax].bind(px, tvm.thread_axis("pipeline")) return s
def dump_graph_lib(target_dir): dim = 4 A = tvm.placeholder((dim,), name='A') B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') sched = tvm.create_schedule(B.op) node0 = {"op": "null", "name": "x", "inputs": []} node1 = {"op": "tvm_op", "name": "add", "inputs": [[0, 0, 0]], "attrs": {"func_name": "myadd", "flatten_data": "1", "num_inputs" : "1", "num_outputs" : "1"}} nodes = [node0, node1] arg_nodes = [0] node_row_ptr = [0, 1, 2] outputs = [[1, 0, 0]] shape = (4,) attrs = { "shape" : ["list_shape", [shape, shape]], "dltype" : ["list_str", ["float32", "float32"]], "storage_id" : ["list_int", [0, 1]], } graph = {"nodes": nodes, "arg_nodes": arg_nodes, "node_row_ptr": node_row_ptr, "heads": outputs, "attrs": attrs} graph = json.dumps(graph) mlib = tvm.build(sched, [A, B], "llvm", name="myadd") mlib.export_library(os.path.join(target_dir, "graph_addone_lib.so")) with open(os.path.join(target_dir, "graph_addone.json"), "w") as fo: fo.write(graph)
def test_conv_tiling(): HSTR = WSTR = 1 in_channel = 128 kernel_height = kernel_width = 3 out_channel = 64 batch_size = 1 in_height = in_width = 64 out_height = out_width = in_height - kernel_height + 1 data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data') kernel = tvm.placeholder((kernel_height, kernel_width, in_channel, out_channel), name='kernel') ic = tvm.reduce_axis((0, in_channel), name='ic') kh = tvm.reduce_axis((0, kernel_height), name='kh') kw = tvm.reduce_axis((0, kernel_width), name='kw') conv = tvm.compute((batch_size, out_channel, out_height, out_width), lambda n, oc, oh, ow: tvm.sum(data[n, ic, oh*HSTR + kh, ow*WSTR + kw] * kernel[kh, kw, ic, oc], axis=[ic, kh, kw]), name="conv2d") s = tvm.create_schedule(conv.op) n, oc, oh, ow = conv.op.axis oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16) bounds = tvm.schedule.InferBound(s) stmt = tvm.schedule.ScheduleOps(s, bounds) stmt = tvm.ir_pass.LoopPartition(stmt, True) stmt = tvm.ir_pass.Simplify(stmt) assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse))))
def verify_clip(N, a_min, a_max, dtype): A = tvm.placeholder((N, N), dtype=dtype, name='A') B = topi.clip(A, a_min, a_max) s = tvm.create_schedule([B.op]) # use memoize to pickle the test data for next time use @memoize("topi.tests.test_topi_clip") def get_ref_data(): a_np = np.random.uniform(a_min*2, a_max*2, size=(N, N)).astype(dtype) b_np = np.clip(a_np, a_min, a_max) return a_np, b_np a_np, b_np = get_ref_data() def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): s = topi.generic.schedule_injective(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) f = tvm.build(s, [A, B], device, name="clip") f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) for device in get_all_backend(): check_device(device)
def intrin_vadd(n, cache_read=False, cache_write=False): scope_ubuf = 'local' dtype = 'float32' x = tvm.placeholder((n,), dtype=dtype, name='vx') y = tvm.placeholder((n,), dtype=dtype, name='vy') z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z') s = tvm.create_schedule(z.op) def create_buffer(t): return tvm.decl_buffer(t.shape, t.dtype, name='W'+t.name, scope=scope_ubuf, offset_factor=16) binds = {} if cache_read: binds[x] = create_buffer(x) binds[y] = create_buffer(y) if cache_write: binds[z] = create_buffer(z) def intrin_func(ins, outs): ib = tvm.ir_builder.create() ib.emit(tvm.call_extern(outs[0].dtype, 'vadd', ins[0].access_ptr("r"), ins[1].access_ptr('r'), outs[0].access_ptr('wr'))) return ib.get() with tvm.build_config(offset_factor=16): return tvm.decl_tensor_intrin(z.op, intrin_func, binds=binds)
def test_llvm_persist_parallel(): n = 128 A = tvm.placeholder((n,), name='A') B = tvm.compute(A.shape, lambda *i: A(*i) + 1, name='B') C = tvm.compute(A.shape, lambda *i: tvm.sqrt(B(*i)) * 2 + 2, name='C') s = tvm.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], factor=8) xo1, xo2 = s[C].split(xo, nparts=1) s[B].compute_at(s[C], xo1) s[B].parallel(s[B].op.axis[0]) s[B].pragma(s[B].op.axis[0], "parallel_barrier_when_finish") s[C].parallel(xi) s[C].pragma(xo1, "parallel_launch_point") s[C].pragma(xi, "parallel_stride_pattern") def check_llvm(): if not tvm.module.enabled("llvm"): return # BUILD and invoke the kernel. f = tvm.build(s, [A, C], "llvm") ctx = tvm.cpu(0) # launch the kernel. a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), np.sqrt(a.asnumpy() + 1) * 2 + 2, rtol=1e-5) check_llvm()
def test_multiple_func(): nn = 1024 n = tvm.convert(nn) A = tvm.placeholder((n,), name='A') B = tvm.placeholder((n,), name='B') C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') s = tvm.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], factor=4) s[C].parallel(xo) s[C].vectorize(xi) def check_llvm(): if not tvm.module.enabled("llvm"): return # build two functions f2 = tvm.lower(s, [A, B, C], name="fadd1") f1 = tvm.lower(s, [A, B, C], name="fadd2") m = tvm.build([f1, f2], "llvm") fadd1 = m['fadd1'] fadd2 = m['fadd2'] ctx = tvm.cpu(0) # launch the kernel. n = nn a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) fadd1(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), a.asnumpy() + b.asnumpy()) fadd2(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), a.asnumpy() + b.asnumpy()) check_llvm()
def test_dynamic_tensor(): dtype = 'float32' stype = 'csr' target = 'llvm' ctx = tvm.context(target, 0) nr, nc, n = tvm.var('nr'), tvm.var('nc'), tvm.var('n') A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name='A', dtype=dtype) assert(A.stype == 'csr') C = tvm.compute(A.data.shape, lambda i: A.data[i] * 2., tag='cs_scatter') s = tvm.create_schedule(C.op) _nr, _nc = 3, 5 a = np.maximum(np.random.uniform(size=(_nr, _nc)).astype(dtype)-.6, 0.) a = tvmsp.array(a, ctx) assert a.data.dtype == a.dtype Ab = namedtuple('CSRBuffer', ['data', 'indices', 'indptr']) Ab.data = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_data') Ab.indices = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_indices') binds = {A.data: Ab.data, A.indices: Ab.indices} f = tvm.build(s, [nr, A.data, C], target, binds=binds) c = tvmsp.array(np.zeros((_nr, _nc), dtype), ctx) c.data = tvm.nd.empty(a.data.shape, dtype) c.indices = a.indices c.indptr = a.indptr f(a.data.shape[0], a.data, c.data) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2., rtol=1e-5)
def test_sort_np(): dshape = (1, 2, 3, 4, 5, 6) axis = 4 reduced_shape = (1, 2, 3, 4, 6) is_descend = False data = tvm.placeholder(dshape, name='data') sort_num = tvm.placeholder(reduced_shape, name="sort_num", dtype="int32") out = tvm.extern(data.shape, [data, sort_num], lambda ins, outs: tvm.call_packed( "tvm.contrib.sort.argsort", ins[0], ins[1], outs[0], axis, is_descend), dtype='int32', name="sort_tensor") ctx = tvm.cpu(0) target = "llvm" s = tvm.create_schedule(out.op) f = tvm.build(s, [data, sort_num, out], target) np_data = np.random.uniform(size=dshape) np_out = np.argsort(np_data, axis=axis) sort_num_input = np.full(reduced_shape, dshape[axis]) a = tvm.nd.array(np.array(np_data).astype(data.dtype), ctx) b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx) c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx) f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np_out, rtol=1e-5)
def test_add_pipeline(): nn = 1024 n = tvm.convert(nn) A = tvm.placeholder((n,), name='A') def extern_generator(ins, outs): """Manually write the IR for the extern function, add pipeline""" ib = tvm.ir_builder.create() with ib.for_range(0, n/2) as i: ib.emit(outs[0].vstore(i*2, ins[0].vload(i*2, "float32x2") + tvm.const(1, "float32x2"))) return ib.get() C = tvm.extern(A.shape, [A], extern_generator, name='C') s = tvm.create_schedule(C.op) print(tvm.lower(s, [A, C], simple_mode=True)) def check_llvm(): if not tvm.module.enabled("llvm"): return # build and invoke the kernel. f = tvm.build(s, [A, C], "llvm") ctx = tvm.cpu(0) # launch the kernel. n = nn a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) f(a, c) np.testing.assert_allclose( c.asnumpy(), a.asnumpy() + 1) check_llvm()
def test_schedule_create(): m = tvm.var('m') n = tvm.var('n') l = tvm.var('l') A = tvm.placeholder((m, l), name='A') B = tvm.placeholder((n, l), name='B') AA = tvm.compute((m, l), lambda i, j: A[i, j]) T = tvm.compute((m, n, l), lambda i, j, k: AA(i, k) * B(j, k)) s = tvm.create_schedule(T.op) s[AA].set_scope("shared") xo, xi = s[T].split(T.op.axis[0], factor=10) xi1, xi2 = s[T].split(xi, factor=2) s[AA].compute_at(s[T], xi1) xo, xi = s[AA].split(AA.op.axis[0], factor=10) s[T].reorder(xi2, xi1) assert T.op.axis[1] in s[T].leaf_iter_vars # save load json json_str = tvm.save_json(s) s_loaded = tvm.load_json(json_str) assert isinstance(s_loaded, tvm.schedule.Schedule) assert(str(s_loaded.outputs[0].body) == str(s.outputs[0].body)) # pickle unpickle dump = pkl.dumps(s) s_loaded = pkl.loads(dump) assert isinstance(s_loaded, tvm.schedule.Schedule) assert(str(s_loaded.outputs[0].body) == str(s.outputs[0].body))
def test_sort(): n = 2 l = 5 m = 3 data = tvm.placeholder((n, l, m), name='data') sort_num = tvm.placeholder((n, m), name="sort_num", dtype="int32") axis = 1 is_descend = True out = tvm.extern(data.shape, [data, sort_num], lambda ins, outs: tvm.call_packed( "tvm.contrib.sort.argsort", ins[0], ins[1], outs[0], axis, is_descend), dtype='int32', name="sort_tensor") input = [[[1, 2, 3], [2, 4.5, 3.5], [1.1, 0.5, 1], [3.2, -5, 0.5], [1.5, 0, 0]], [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]]] sort_num_input = [[1, 2, 3], [4, 5, 5]] sorted_index = [[[0, 1, 1], [1, 0, 0], [2, 2, 2], [3, 3, 3], [4, 4, 4]], [[3, 4, 4], [2, 3, 3], [1, 2, 2], [0, 1, 1], [4, 0, 0]]] ctx = tvm.cpu(0) target = "llvm" s = tvm.create_schedule(out.op) f = tvm.build(s, [data, sort_num, out], target) a = tvm.nd.array(np.array(input).astype(data.dtype), ctx) b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx) c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx) f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np.array(sorted_index).astype(out.dtype), rtol=1e-5)
def test_lstm_cell_inline(): num_step = 128 num_input = 256 num_hidden = 1152 batch_size = 4 # Global transition matrix X = tvm.placeholder((num_step - 1, batch_size, num_input), name="X") Wi2h = tvm.placeholder((4, num_hidden, num_input), name="Wi2h") Wh2h = tvm.placeholder((4, num_hidden, num_hidden), name="Wh2h") # h: output hidden state, c: cell state. s_state_h = tvm.placeholder((num_step, batch_size, num_hidden)) s_state_c = tvm.placeholder((num_step, batch_size, num_hidden)) s_init_c = tvm.compute((1, batch_size, num_hidden), lambda *i: 0.0, name="init_c") s_init_h = tvm.compute((1, batch_size, num_hidden), lambda *i: 0.0, name="init_h") # LSTM transition k = tvm.reduce_axis((0, num_input), name="ki2h") s_i2h = tvm.compute( (num_step, 4, batch_size, num_hidden), lambda t, x, i, j: tvm.sum(X[t - 1, i, k] * Wi2h[x, j, k], axis=k), name="s_i2h") k = tvm.reduce_axis((0, num_hidden), name="ki2h") s_h2h = tvm.compute( (num_step, 4, batch_size, num_hidden), lambda t, x, i, j: tvm.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k], axis=k), name="s_h2h") # Gate rules gates = tvm.compute(s_i2h.shape, lambda *i: s_i2h(*i) + s_h2h(*i), name="gates") gshape = (num_step, batch_size, num_hidden) in_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 0, i, j]), name="in_gate") in_transform = tvm.compute(gshape, lambda t, i, j: tvm.tanh(gates[t, 1, i, j]), name="in_transform") forget_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 2, i, j]), name="forget_gate") out_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 3, i, j]), name="out_gate") next_c = tvm.compute(gshape, lambda t, i, j: forget_gate[t, i, j] * s_state_c[t - 1, i, j] + in_gate[t, i, j] * in_transform[t, i, j], name="next_c") next_h = tvm.compute(gshape, lambda t, i, j: out_gate[t, i, j] * tvm.tanh(next_c[t, i, j]), name="next_h") update_c = tvm.compute(gshape, lambda *i: next_c(*i), name="update_c") update_h = tvm.compute(gshape, lambda *i: next_h(*i), name="update_h") # schedule scan_h, scan_c = tvm.scan( [s_init_h, s_init_c], [update_h, update_c], [s_state_h, s_state_c], inputs=[X], name="lstm_scan") # schedule s = tvm.create_schedule(scan_h.op) # Inline gate computations s[gates].compute_inline() s[in_gate].compute_inline() s[in_transform].compute_inline() s[forget_gate].compute_inline() s[out_gate].compute_inline() # verify we can lower correctly tvm.lower(s, [X, Wi2h, Wh2h, scan_h, scan_c])
def test_pack_buffer_intermediate(): nn = 1024 n = tvm.convert(nn) A = tvm.placeholder((n,), name='A') B = tvm.compute((n,), lambda i: A[i] + 1, name="B") def extern_generator(ins, outs): """Manually write the IR for the extern function, add pipeline.""" return tvm.call_packed("my_extern_array_func2", ins[0], outs[0]) C = tvm.extern(B.shape, [B], extern_generator, name='C') s = tvm.create_schedule(C.op) def check_target(target): if not tvm.module.enabled(target): return # build and invoke the kernel. f = tvm.build(s, [A, C], target) ctx = tvm.cpu(0) # launch the kernel. n = nn a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) @tvm.register_func def my_extern_array_func2(aa, bb): assert aa.shape == a.shape tvm.testing.assert_allclose( aa.asnumpy(), a.asnumpy() + 1) aa.copyto(bb) f(a, c) tvm.testing.assert_allclose( c.asnumpy(), a.asnumpy() + 1) check_target("llvm")
def make_reduce_sum_axis_zero(shape, tgt, tgt_host, func_name, dtype="float32"): A = tvm.placeholder(shape, dtype=dtype, name="A") C = topi.sum(A, axis=0, keepdims=False) s = tvm.create_schedule(C.op) f = tvm.build(s, [A, C], tgt, target_host=tgt_host, name=func_name) return f
def visit_call(node, ret): ret.append(node) print(type(node), " dtype=", node.dtype) print(type(node), " name=", node.name) for arg in node.args: visit(arg, ret) print(type(node), " call_type=", node.call_type) print(type(node), " func=", node.func) print(type(node), " value_index=", node.value_index) def visit_let(node, ret): ret.append(node) visit(node.var, ret) visit(node.value, ret) visit(node.body, ret) if __name__ == "__main__": from auto_schedule.examples import FUNC_TABLE func = FUNC_TABLE["conv3d_channel_batch"].func args = FUNC_TABLE["conv3d_channel_batch"].args op, bufs = func(*args) s = tvm.create_schedule(op) stmt = tvm.lower(s, bufs, simple_mode=True) print(stmt) ret = [] visit(stmt, ret)
def schedule_dense(cfg, outs): """Schedule for dense operator. Parameters ---------- cfg: ConfigEntity The config entity for this template outs: Array of Tensor The computation graph description of dense in the format of an array of tensors. Returns ------- s: Schedule The computation schedule for dense. """ outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'dense': vec_size = [1, 2, 4, 8, 16] max_unroll = 32 dense = op.output(0) output = outs[0] y, x = s[output].op.axis c = s[dense].op.reduce_axis[0] ##### space definition begin ##### cfg.define_split('tile_y', y, num_outputs=3) cfg.define_split('tile_x', x, num_outputs=3) cfg.define_split('c_unroll', c, num_outputs=2, max_factor=64) # fallback support if cfg.is_fallback: ref_log = autotvm.tophub.load_reference_log( 'mali', 'rk3399', 'dense', 'direct') cfg.fallback_with_reference_log(ref_log) ##### space definition end ##### if dense.op in s.outputs: dense = s.cache_write(output, 'local') by, ty, yi = cfg['tile_y'].apply(s, output, y) bx, tx, xi = cfg['tile_x'].apply(s, output, x) s[output].bind(by, tvm.thread_axis('blockIdx.y')) s[output].bind(bx, tvm.thread_axis('blockIdx.x')) s[output].bind(ty, tvm.thread_axis('threadIdx.y')) s[output].bind(tx, tvm.thread_axis('threadIdx.x')) if cfg['tile_y'].size[-1] < max_unroll: s[output].unroll(yi) if cfg['tile_x'].size[-1] in vec_size: s[output].vectorize(xi) s[dense].compute_at(s[output], tx) k = s[dense].op.reduce_axis[0] y, x = s[dense].op.axis k, k_unroll = cfg['c_unroll'].apply(s, dense, k) s[dense].reorder(k, k_unroll, y, x) s[dense].unroll(k_unroll) if cfg['tile_y'].size[-1] < max_unroll: s[dense].unroll(y) if cfg['tile_x'].size[-1] in vec_size: s[dense].vectorize(x) traverse_inline(s, outs[0].op, _callback) return s
def measure_bandwidth_sum(total_item, item_per_thread, stride, base_type, bits, lanes, target, target_host, remote, ctx, n_times): """ measure memory bandwidth of gpu by product reduction for a given type The IR for measurement is for each thread for i in 1..num_per_thread: y[global_id] = y[global_id] * x[base + i * stride] Parameters ---------- total_item: int number of elements in input array item_per_thread: int number of elements each thread accumulates stride: int stride in memory access base_type: str can be "int", "float" bits: int can be 16, 32 lanes: int lane of the vector type, can be 1, 2, 4, 8, 16 target: :any:`tvm.target.Target` the target and option of the compilation. target_host : str or :any:`tvm.target.Target` host compilation target ctx: TVMcontext the context of array remote: tvm.contrib.rpc.RPCSession remote rpc session n_times: int number of runs for taking mean Returns ------- GBPS: float gigabyte per second """ n, m = total_item, item_per_thread n //= lanes base_type = str(base_type) + str(bits) dtype = base_type if lanes == 1 else base_type + "x" + str(lanes) k = tvm.reduce_axis((0, m), name="k") x = tvm.placeholder((n,), dtype=dtype, name="x") op = tvm.comm_reducer(lambda x, y: x*y, lambda t: tvm.const(1, dtype=t), name="sum") y = tvm.compute((n // m,), lambda i: op(x[i // stride * stride * m + i % stride + k * stride], axis=k)) s = tvm.create_schedule(y.op) yo, yi = s[y].split(y.op.axis[0], target.max_num_threads) s[y].bind(yo, tvm.thread_axis("blockIdx.x")) s[y].bind(yi, tvm.thread_axis("threadIdx.x")) s[y].unroll(k) try: func = tvm.build(s, [x, y], target, target_host=target_host) x = tvm.nd.empty((n,), dtype=dtype, ctx=ctx) y = tvm.nd.empty((n // m,), dtype=dtype, ctx=ctx) func = _convert_to_remote(func, remote) time_f = func.time_evaluator(func.entry_name, ctx, number=n_times) time = time_f(x, y).mean except tvm._ffi.base.TVMError: # build error (occur when device does not support half) return -1 return 1.0 * (total_item * bits / 8) / 1e9 / time
def train_op_schedule_cpu_general_dx(entities, epoch, batch_size, path, loop_num=100, loop_size=16, stack_size=20, logfile="temp.log", device="cuda:0"): dim = 5 timeout = 15.0 num_sample = len(entities) device = torch.device(device) model = OpScheduleCPUd5(3, 128, device) # load or initialize parameter file if os.path.exists(path) and os.path.isfile(path): state_dict = torch.load(path) model.load_state_dict(state_dict) else: torch.save(model.state_dict(), path) model.to(device) optimizer = torch.optim.Adadelta(model.parameters(), lr=LR) model.train() # maintain a dataset for each function datasets = [[] for i in range(num_sample)] train_beg_time = time.time() with open(logfile, "a") as f: f.write("New log\ntime: {}".format(train_beg_time)) perf_before = dict() perf_before_dump = False model.train() print("Scheduling begins...parameters in path {}\n logs to{}".format( path, logfile)) for i in range(epoch): optimizer.zero_grad() for batch in range(batch_size): for p in range(num_sample): func_name = entities[p].func_name func = FUNC_TABLE[func_name].func args = entities[p].args ops, bufs = func(*args) s = tvm.create_schedule(ops) # get the performance before scheduling # only run one time entity_key = "{}:{}".format(func_name, args) if entity_key not in perf_before: pre_cost = serial_evaluate(s, bufs, "llvm", np.random.randint(0, MAX_CPU), 10, timeout=timeout) perf_before[entity_key] = pre_cost if not isinstance(ops, (list, tuple)): ops = [ops] bfs_order, down_graph = graph_analysis(ops) group_points = [] for op in bfs_order: if not isinstance(op, tvm.tensor.ComputeOp): continue if able_inline(op, down_graph): s[op].compute_inline() else: group_points.append(op) if len(group_points) > 1: raise RuntimeError("Not support more than one compute") for j, point in enumerate(group_points): y_dict, y_diary = op_schedule_cpu_general_dx( dim, s, point, model, random=np.random.random() < 0.2, sampling=True) post_cost = serial_evaluate(s, bufs, "llvm", np.random.randint(0, MAX_CPU), 10, timeout=timeout) data = dict() for name, value in y_dict.items(): if isinstance(value, list): tmp = [] for v in value: tmp.append(v.detach()) data[name] = ( tmp, y_diary[name] ) # the data record schedule decisions else: data[name] = (value.detach(), y_diary[name]) # record (point No. , sch data, time cost) datasets[p].append((j, data, post_cost)) # record performance before scheduling # only run one time if not perf_before_dump: with open(logfile, "a") as f: logs = "performance before scheduling:\n" f.write(logs) for key, perf in perf_before.items(): logs = "{}: {}\n".format(key, perf) f.write(logs) f.write("\n") perf_before_dump = True # control the size of dataset and record best cases cur_time = time.time() with open(logfile, "a") as f: for j in range(num_sample): datasets[j] = heapq.nsmallest(stack_size, datasets[j], key=lambda x: x[-1]) entity_key = "{}:{}".format(entities[j].func_name, entities[j].args) duration = cur_time - train_beg_time logs = "epoch {}/{}| {} best perf {}| [{}s]\n".format( i + 1, epoch, entity_key, datasets[j][0][-1], duration) f.write(logs) logs = "schedule {}\n".format(entity_key) for name, val in datasets[j][0][1].items( ): # find the diary, this is ugly now, change later logs = logs + "{}: {}\n".format(name, val[1]) logs = logs + "\n" f.write(logs) # train the parameters for r in range(loop_num): acc_loss = 0.0 for inner in range(loop_size): for q in range(num_sample): func_name = entities[q].func_name func = FUNC_TABLE[func_name].func args = entities[q].args for (point_num, data, time_cost) in datasets[q][:1]: ops, bufs = func(*args) s = tvm.create_schedule(ops) if not isinstance(ops, (list, tuple)): ops = [ops] bfs_order, down_graph = graph_analysis(ops) group_points = [] for op in bfs_order: if not isinstance(op, tvm.tensor.ComputeOp): continue if able_inline(op, down_graph): s[op].compute_inline() else: group_points.append(op) y_dict, _ = op_schedule_cpu_general_dx( dim, s, group_points[point_num], model, random=False, sampling=False) # spatial loss spatial_loss = 0.0 for j in range(dim): spatial_loss = spatial_loss + torch.nn.functional\ .binary_cross_entropy(y_dict["spatial"][j], data["spatial"][0][j]) # reduce_loss reduce_loss = 0.0 for j in range(dim): reduce_loss = reduce_loss + torch.nn.functional\ .binary_cross_entropy(y_dict["reduce"][j], data["reduce"][0][j]) # parallel_loss parallel_loss = torch.nn.functional\ .binary_cross_entropy(y_dict["parallel"], data["parallel"][0]) # reorder_one loss reorder_one_loss = torch.nn.functional\ .binary_cross_entropy(y_dict["reorder_one"], data["reorder_one"][0]) # reorder_two loss reorder_two_loss = torch.nn.functional\ .binary_cross_entropy(y_dict["reorder_two"], data["reorder_two"][0]) # reorder_three loss reorder_three_loss = torch.nn.functional\ .binary_cross_entropy(y_dict["reorder_three"], data["reorder_three"][0]) # accumulate loss acc_loss = acc_loss + spatial_loss + reduce_loss + parallel_loss + reorder_one_loss \ + reorder_two_loss + reorder_three_loss acc_loss.backward() if r % 10 == 0: torch.save(model.state_dict(), path) logs = "epoch={}, r={}, loss={}\n".format( i + 1, r, float(acc_loss.detach())) with open(logfile, "a") as f: f.write(logs) optimizer.step() with open(logfile, "a") as f: f.write("\n") print("All done.")
def schedule_conv2d_hwcn(outs): """Schedule for conv2d_hwcn and any element-wise operations. Parameters ---------- outs: Array of Tensor The computation graph description of conv2d_hwcn in the format of an array of tensors. Returns ------- s: Schedule The computation schedule for conv2d_hwcn. """ outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs sch = tvm.create_schedule([x.op for x in outs]) def schedule(Apad, W, B): """Schedule conv2d_hwcn""" sch[Apad].compute_inline() AA = sch.cache_read(Apad, "shared", [B]) WW = sch.cache_read(W, "shared", [B]) AL = sch.cache_read(AA, "local", [B]) WL = sch.cache_read(WW, "local", [B]) if B.op in sch.outputs: Out = B BL = sch.cache_write(Out, "local") else: Out = sch.outputs[0].output(0) sch[B].set_scope("local") BL = B tile = 8 num_thread = 8 block_factor = tile * num_thread step = 8 vthread = 2 block_x = tvm.thread_axis("blockIdx.x") block_y = tvm.thread_axis("blockIdx.y") block_z = tvm.thread_axis("blockIdx.z") thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x") thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y") thread_xz = tvm.thread_axis((0, vthread), "vthread", name="vx") thread_yz = tvm.thread_axis((0, vthread), "vthread", name="vy") hi, wi, fi, ni = sch[Out].op.axis bz = sch[Out].fuse(hi, wi) by, fi = sch[Out].split(fi, factor=block_factor) bx, ni = sch[Out].split(ni, factor=block_factor) tyz, fi = sch[Out].split(fi, nparts=vthread) txz, ni = sch[Out].split(ni, nparts=vthread) ty, fi = sch[Out].split(fi, nparts=num_thread) tx, ni = sch[Out].split(ni, nparts=num_thread) sch[Out].reorder(bz, by, bx, tyz, txz, ty, tx, fi, ni) sch[Out].bind(bz, block_z) sch[Out].bind(by, block_y) sch[Out].bind(bx, block_x) sch[Out].bind(tyz, thread_yz) sch[Out].bind(txz, thread_xz) sch[Out].bind(ty, thread_y) sch[Out].bind(tx, thread_x) # Schedule BL local write sch[BL].compute_at(sch[Out], tx) yi, xi, fi, ni = sch[BL].op.axis ry, rx, rc = sch[BL].op.reduce_axis rco, rci = sch[BL].split(rc, factor=step) sch[BL].reorder(rco, ry, rx, rci, fi, ni) fuse_index = sch[BL].fuse(ry, rx) fuse_index = sch[BL].fuse(fuse_index, rco) rx = fuse_index sch[AA].compute_at(sch[BL], rx) sch[WW].compute_at(sch[BL], rx) sch[AL].compute_at(sch[BL], rci) sch[WL].compute_at(sch[BL], rci) # Schedule for A's shared memory load yi, xi, ci, ni = sch[AA].op.axis ty, ci = sch[AA].split(ci, nparts=num_thread) tx, ni = sch[AA].split(ni, nparts=num_thread) _, ni = sch[AA].split(ni, factor=4) sch[AA].reorder(ty, tx, yi, xi, ci, ni) sch[AA].bind(ty, thread_y) sch[AA].bind(tx, thread_x) sch[AA].vectorize(ni) # Schedule for W's shared memory load yi, xi, ci, fi = sch[WW].op.axis ty, ci = sch[WW].split(ci, nparts=num_thread) tx, fi = sch[WW].split(fi, nparts=num_thread) _, fi = sch[WW].split(fi, factor=4) sch[WW].reorder(ty, tx, yi, xi, ci, fi) sch[WW].bind(ty, thread_y) sch[WW].bind(tx, thread_x) sch[WW].vectorize(fi) def traverse(operator): """Traverse operators from computation graph""" if operator.tag == 'ewise' or operator.tag == 'scale_shift': if operator not in sch.outputs: sch[operator].compute_inline() for tensor in operator.input_tensors: if tensor.op.input_tensors: traverse(tensor.op) elif operator.tag == 'conv2d_hwcn': Apad = operator.input_tensors[0] W = operator.input_tensors[1] B = operator.output(0) schedule(Apad, W, B) else: raise RuntimeError("Unsupported operator: %s" % operator.tag) traverse(outs[0].op) return sch
def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding): assert N == 1, "Only consider batch_size = 1 in this template" data = tvm.placeholder((N, CI, H, W), name='data') kernel = tvm.placeholder((CO, CI, KH, KW), name='kernel') conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, 'float32') s = tvm.create_schedule([conv.op]) # inline padding pad_data = s[conv].op.input_tensors[0] s[pad_data].compute_inline() data, raw_data = pad_data, data output = conv OL = s.cache_write(conv, 'local') # create cache stage AA = s.cache_read(data, 'shared', [OL]) WW = s.cache_read(kernel, 'shared', [OL]) AL = s.cache_read(AA, 'local', [OL]) WL = s.cache_read(WW, 'local', [OL]) # tile and bind spatial axes n, f, y, x = s[output].op.axis cfg = autotvm.get_config() cfg.define_split("tile_f", cfg.axis(f), num_outputs=4) cfg.define_split("tile_y", cfg.axis(y), num_outputs=4) cfg.define_split("tile_x", cfg.axis(x), num_outputs=4) bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) kernel_scope = n # this is the scope to attach global config inside this kernel s[output].bind(bf, tvm.thread_axis("blockIdx.z")) s[output].bind(by, tvm.thread_axis("blockIdx.y")) s[output].bind(bx, tvm.thread_axis("blockIdx.x")) s[output].bind(vf, tvm.thread_axis("vthread")) s[output].bind(vy, tvm.thread_axis("vthread")) s[output].bind(vx, tvm.thread_axis("vthread")) s[output].bind(tf, tvm.thread_axis("threadIdx.z")) s[output].bind(ty, tvm.thread_axis("threadIdx.y")) s[output].bind(tx, tvm.thread_axis("threadIdx.x")) s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) s[OL].compute_at(s[output], tx) # tile and bind reduction axes n, f, y, x = s[OL].op.axis rc, ry, rx = s[OL].op.reduce_axis cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3) cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=3) cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=3) rco, rcm, rci = cfg['tile_rc'].apply(s, OL, rc) ryo, rym, ryi = cfg['tile_rx'].apply(s, OL, ry) rxo, rxm, rxi = cfg['tile_ry'].apply(s, OL, rx) s[OL].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, n, f, y, x) s[AA].compute_at(s[OL], rxo) s[WW].compute_at(s[OL], rxo) s[AL].compute_at(s[OL], rxm) s[WL].compute_at(s[OL], rxm) # cooperative fetching for load in [AA, WW]: n, f, y, x = s[load].op.axis fused = s[load].fuse(n, f, y, x) tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2]) ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) s[load].bind(tz, tvm.thread_axis("threadIdx.z")) s[load].bind(ty, tvm.thread_axis("threadIdx.y")) s[load].bind(tx, tvm.thread_axis("threadIdx.x")) # tune unroll cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) cfg.define_knob("unroll_explicit", [0, 1]) s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val) return s, [raw_data, kernel, conv]
def gemm_int8(n, m, l): cfg = autotvm.get_config() A = tvm.placeholder((n, l), name='A', dtype='int8') B = tvm.placeholder((m, l), name='B', dtype='int8') cfg.define_split('tile_y', cfg.axis(m), num_outputs=3) cfg.define_split('tile_x', cfg.axis(m), num_outputs=3) y_chunk = cfg['tile_y'].size[0] y_block = functools.reduce(operator.mul, cfg['tile_y'].size[1:]) x_chunk = cfg['tile_x'].size[0] x_block = functools.reduce(operator.mul, cfg['tile_x'].size[1:]) k_chunk = l // 16 k_block = 16 A_packed = tvm.compute( (y_chunk, k_chunk, y_block, k_block), lambda yo, ko, yi, ki: A[yo * y_block + yi, ko * k_block + ki], name='A_packed') B_packed = tvm.compute( (x_chunk, k_chunk, x_block, k_block), lambda xo, ko, xi, ki: B[xo * x_block + xi, ko * k_block + ki], name='B_packed') ko = tvm.reduce_axis((0, k_chunk)) ki = tvm.reduce_axis((0, k_block)) C = tvm.compute( (n, m), lambda i, j: tvm.sum(A_packed[ i // y_block, ko, i % y_block, ki].astype('int32') * B_packed[ j // x_block, ko, j % x_block, ki].astype('int32'), axis=[ko, ki]), name='C') s = tvm.create_schedule([t.op for t in [A_packed, B_packed, C]]) block_x = tvm.thread_axis('blockIdx.x') block_y = tvm.thread_axis('blockIdx.y') thread_x = tvm.thread_axis('threadIdx.x') thread_y = tvm.thread_axis('threadIdx.y') s[A_packed].compute_inline() s[B_packed].compute_inline() AA = s.cache_read(A_packed, 'shared', [C]) BB = s.cache_read(B_packed, 'shared', [C]) AL = s.cache_read(AA, 'local', [C]) BL = s.cache_read(BB, 'local', [C]) CC = s.cache_write(C, 'local') ko, ki = CC.op.reduce_axis cfg.define_split('tile_k', cfg.axis(ko), num_outputs=2) ko, kmo = cfg['tile_k'].apply(s, CC, ko) kmi, ki = s[CC].split(ki, factor=4) y, x = CC.op.axis s[CC].reorder(ko, kmo, kmi, y, x, ki) km = s[CC].fuse(kmo, kmi) s[CC].tensorize(ki, dot) y, x = C.op.axis by, tyz, ty = cfg['tile_y'].apply(s, C, y) bx, txz, tx = cfg['tile_x'].apply(s, C, x) s[C].bind(by, block_y) s[C].bind(bx, block_x) s[C].bind(tyz, tvm.thread_axis('vthread')) s[C].bind(txz, tvm.thread_axis('vthread')) s[C].bind(ty, thread_y) s[C].bind(tx, thread_x) s[C].reorder(by, bx, tyz, txz, ty, tx) s[CC].compute_at(s[C], tx) yo, xo = CC.op.axis cfg.define_knob('local_double_buffer', [0, 1]) for load in [AL, BL]: s[load].compute_at(s[CC], km) ki = load.op.axis[3] s[load].vectorize(ki) if cfg['local_double_buffer'].val: s[load].double_buffer() cfg.define_knob('shared_double_buffer', [0, 1]) for load in [AA, BB]: s[load].compute_at(s[CC], ko) fused = s[load].fuse(load.op.axis[2], load.op.axis[3]) ty, tx = s[load].split(fused, nparts=cfg['tile_y'].size[2]) tx, xi = s[load].split(tx, nparts=cfg['tile_x'].size[2]) _, xi = s[load].split(xi, factor=16) s[load].bind(ty, thread_y) s[load].bind(tx, thread_x) s[load].vectorize(xi) if cfg['shared_double_buffer'].val: s[load].double_buffer cfg.define_knob('auto_unroll_max_step', [512, 1500]) s[C].pragma(by, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) s[C].pragma(by, 'unroll_explicit', False) cfg.add_flop(n * m * l * 2) return s, [A, B, C]
# # Now we back to the local machine, which has a full TVM installed # (with LLVM). # # Here we will declare a simple kernel on the local machine: import numpy as np import tvm from tvm import rpc from tvm.contrib import util n = tvm.convert(1024) A = tvm.placeholder((n, ), name='A') B = tvm.compute((n, ), lambda i: A[i] + 1.0, name='B') s = tvm.create_schedule(B.op) ###################################################################### # Then we cross compile the kernel. # The target should be 'llvm -target=armv7l-linux-gnueabihf' for # Raspberry Pi 3B, but we use 'llvm' here to make this tutorial runnable # on our webpage building server. See the detailed note in the following block. local_demo = True if local_demo: target = 'llvm' else: target = 'llvm -target=armv7l-linux-gnueabihf' func = tvm.build(s, [A, B], target=target, name='add_one')
def schedule_depthwise_conv2d_nchw_cuda(cfg, outs): """Schedule for depthwise_conv2d nchw forward. Parameters ---------- outs: Array of Tensor The computation graph description of depthwise_conv2d in the format of an array of tensors. Returns ------- s: Schedule The computation schedule for depthwise_conv2d nchw. """ outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'depthwise_conv2d_nchw': pad_data = op.input_tensors[0] kernel = op.input_tensors[1] conv = op.output(0) ##### space definition begin ##### n, f, y, x = s[conv].op.axis cfg.define_split("tile_f", f, num_outputs=4) cfg.define_split("tile_y", y, num_outputs=4) cfg.define_split("tile_x", x, num_outputs=4) cfg.define_knob("auto_unroll_max_step", [0, 256, 1500]) target = tvm.target.current_target() if target.target_name in ['nvptx', 'rocm']: cfg.define_knob("unroll_explicit", [1]) else: cfg.define_knob("unroll_explicit", [0, 1]) # fallback support if cfg.is_fallback: ref_log = autotvm.tophub.load_reference_log( target.target_name, target.model, 'depthwise_conv2d_nchw', 'direct') cfg.fallback_with_reference_log(ref_log) # TODO(lmzheng): A bug here, set unroll_explicit to False as workaround cfg['unroll_explicit'].val = 0 ##### space definition end ##### s[pad_data].compute_inline() if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag: s[kernel].compute_inline() if conv.op in s.outputs: output = conv OL = s.cache_write(conv, 'local') else: output = s.outputs[0].output(0) s[conv].set_scope('local') OL = conv # create cache stage AA = s.cache_read(pad_data, 'shared', [OL]) WW = s.cache_read(kernel, 'shared', [OL]) AL = s.cache_read(AA, 'local', [OL]) WL = s.cache_read(WW, 'local', [OL]) # tile and bind spatial axes n, f, y, x = s[output].op.axis bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) kernel_scope, n = s[output].split(n, nparts=1) bf = s[output].fuse(n, bf) s[output].bind(bf, tvm.thread_axis("blockIdx.z")) s[output].bind(by, tvm.thread_axis("blockIdx.y")) s[output].bind(bx, tvm.thread_axis("blockIdx.x")) s[output].bind(vf, tvm.thread_axis("vthread")) s[output].bind(vy, tvm.thread_axis("vthread")) s[output].bind(vx, tvm.thread_axis("vthread")) s[output].bind(tf, tvm.thread_axis("threadIdx.z")) s[output].bind(ty, tvm.thread_axis("threadIdx.y")) s[output].bind(tx, tvm.thread_axis("threadIdx.x")) s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) s[OL].compute_at(s[output], tx) # cooperative fetching s[AA].compute_at(s[output], bx) s[WW].compute_at(s[output], bx) s[AL].compute_at(s[output], tx) s[WL].compute_at(s[output], tx) for load in [AA, WW]: fused = s[load].fuse(*list(s[load].op.axis)) fused, tx = s[load].split(fused, cfg["tile_x"].size[2]) fused, ty = s[load].split(fused, cfg["tile_y"].size[2]) fused, tz = s[load].split(fused, cfg["tile_f"].size[2]) s[load].bind(tz, tvm.thread_axis("threadIdx.z")) s[load].bind(ty, tvm.thread_axis("threadIdx.y")) s[load].bind(tx, tvm.thread_axis("threadIdx.x")) s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val) traverse_inline(s, outs[0].op, _callback) return s
def schedule_depthwise_conv2d_nhwc(outs): """Schedule for depthwise_conv2d nhwc forward. Parameters ---------- outs: Array of Tensor The computation graph description of depthwise_conv2d in the format of an array of tensors. Returns ------- s: Schedule The computation schedule for depthwise_conv2d nhwc. """ outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) def _schedule(temp, Filter, DepthwiseConv2d): s[temp].compute_inline() FS = s.cache_read(Filter, "shared", [DepthwiseConv2d]) if DepthwiseConv2d.op in s.outputs: Output = DepthwiseConv2d CL = s.cache_write(DepthwiseConv2d, "local") else: Output = outs[0].op.output(0) s[DepthwiseConv2d].set_scope("local") block_x = tvm.thread_axis("blockIdx.x") thread_x = tvm.thread_axis("threadIdx.x") b, h, w, c = s[Output].op.axis # num_thread here could be 728, it is larger than cuda.max_num_threads num_thread = tvm.ir_pass.Simplify(temp.shape[3]).value target = tvm.target.current_target() if target and (target.target_name not in ["cuda", "nvptx"]): num_thread = target.max_num_threads xoc, xic = s[Output].split(c, factor=num_thread) s[Output].reorder(xoc, b, h, w, xic) xo, yo, _, _ = s[Output].tile(h, w, x_factor=2, y_factor=2) fused = s[Output].fuse(yo, xo) fused = s[Output].fuse(fused, b) fused = s[Output].fuse(fused, xoc) s[Output].bind(fused, block_x) s[Output].bind(xic, thread_x) if DepthwiseConv2d.op in s.outputs: s[CL].compute_at(s[Output], xic) else: s[DepthwiseConv2d].compute_at(s[Output], xic) _, _, ci, fi = s[FS].op.axis s[FS].compute_at(s[Output], fused) fused = s[FS].fuse(fi, ci) s[FS].bind(fused, thread_x) scheduled_ops = [] def traverse(OP): """Internal travserse function""" # inline all one-to-one-mapping operators except the last stage (output) if tag.is_broadcast(OP.tag): if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: if tensor.op.input_tensors and tensor.op not in scheduled_ops: traverse(tensor.op) # schedule depthwise_conv2d if OP.tag == 'depthwise_conv2d_nhwc': PaddedInput = OP.input_tensors[0] Filter = OP.input_tensors[1] if isinstance(Filter.op, tvm.tensor.ComputeOp) and 'dilate' in Filter.op.tag: s[Filter].compute_inline() DepthwiseConv2d = OP.output(0) _schedule(PaddedInput, Filter, DepthwiseConv2d) scheduled_ops.append(OP) traverse(outs[0].op) return s
def conv(iw, ih, fw, fh, fi, fo, batch, dtype): img = tvm.placeholder((batch, fi, iw, ih), dtype=dtype, name='img') fil = tvm.placeholder((fi, fo, fw, fh), dtype=dtype, name='fil') conv = topi.nn.conv2d_nchw(img, fil, (1, 1), 'VALID') cfg = autotvm.template.DispatchContext.current.query(None, None) cfg.add_flop(iw * ih * fw * fh * fi * fo * batch * 2) s = tvm.create_schedule(conv.op) temp = conv.op.input_tensors[0] sch[temp].compute_inline() shared_cache = [] local_cache = [] # Space definition for buf in conv.op.input_tensors: shared_cache.append(s.cache_read(buf, "shared", [conv])) local_cache.append(s.cache_read(shared_cache[-1], "local", [conv])) write_cache = s.cache_write(conv, "local") spatial_axes = [cfg.axis(x) for x in s[conv].op.axis] spatial_chs = [ cfg.define_split("tile_" + x.name, x, num_outputs=4) for x in spatial_axes ] re_axes = cfg.define_reorder("re", reduce(list.__add__, spatial_chs), policy='interleave', spatial=spatial_chs, reduce=[]) cfg.define_annotate('bind', re_axes[:sum([len(ch) - 1 for ch in spatial_chs])], policy='bind_gpu_virtual') reduce_axes = [cfg.axis(x) for x in s[write_cache].op.reduce_axis] reduce_chs = [ cfg.define_split("tile_reduce_" + x.name, x, num_outputs=2) for x in reduce_axes ] cfg.define_annotate("cache_anchor", reduce(list.__add__, reduce_chs), policy='locate_cache', num_anchor=2) # Apply on schedule spatial_axes = s[conv].op.axis spatial_chs = [ cfg["tile_" + x.var.name].apply(s, conv, x) for x in spatial_axes ] spatial_lens = [cfg["tile_" + x.var.name].size for x in spatial_axes] re_axes = cfg["re"].apply(s, conv, reduce(list.__add__, spatial_chs)) bind_axes = re_axes[:sum([len(ch) - 1 for ch in spatial_chs])] cfg['bind'].apply(s, conv, bind_axes) # Cache anchor s[write_cache].compute_at(s[conv], bind_axes[-1]) local_axes = s[write_cache].op.axis reduce_axes = s[write_cache].op.reduce_axis reduce_chs = [ cfg["tile_reduce_" + x.var.name].apply(s, write_cache, x) for x in reduce_axes ] s[write_cache].reorder(*(reduce(list.__add__, reduce_chs) + list(local_axes))) cfg['cache_anchor'].apply(s, write_cache, reduce(list.__add__, reduce_chs), source=[shared_cache, local_cache]) re_lens = [reduce(list.__add__, spatial_lens)[x] for x in cfg["re"].perm] bind_lens = re_lens[:sum([len(ch) - 1 for ch in spatial_chs])] thread_info = [] for ann, length in zip(cfg['bind'].anns, bind_lens): if 'threadIdx' in ann: thread_info.append((ann, length)) thread_info.sort(key=lambda x: x[0]) for i, cache in enumerate(shared_cache): axes = list(s[cache].op.axis) fused = s[cache].fuse(*axes) for name, length in reversed(thread_info): t, fused = s[cache].split(fused, nparts=length) s[cache].bind(t, tvm.thread_axis(name)) return s, [img, fil, conv]
import tvm import numpy as np # 同一个计算有多种不同的计算方式,更会有不同的性能 # Schedule来决定如何计算,schedule是一组计算转换,用于转化程序中的循环计算 # schedule 是由一组opts组成 # 默认情况下,以行优先的串行方式计算 n = tvm.var('n') m = tvm.var('m') A = tvm.placeholder((m, n), name='A') B = tvm.placeholder((m, n), name='B') C = tvm.compute((m, n), lambda i, j: A[i, j] * B[i, j], name='C') s = tvm.create_schedule([C.op]) # lower会将计算从定义转换为真正的可调用函数。 使用参数`simple_mode = True`, # 它将返回一个可读的C like语句,我们在此处使用它来打印计划结果。 # print(tvm.lower(s, [A, B, C], simple_mode=True)) # 一个schedule由多个stage组成,一个stage代表一个opt # 每个stage提供多种方法 # split # 将特定的一维拆成两维 A = tvm.placeholder((m, ), name='A') B = tvm.compute((m, ), lambda i: A[i] * 2, name='B') s = tvm.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=32) # print(tvm.lower(s, [A, B], simple_mode=True)) s = tvm.create_schedule(B.op) bx, tx = s[B].split(B.op.axis[0], nparts=32)
def check(start, end, dstart, dend, dtype, floor_div=False): div = tvm.floordiv if floor_div else tvm.truncdiv mod = tvm.floormod if floor_div else tvm.truncmod # A are dividends, B are divisors. Note that we add 1 to make include end in the range. A = tvm.placeholder((end - start + 1, ), name="A", dtype=dtype) B = tvm.placeholder((dend - dstart + 1, ), name="B", dtype=dtype) # We clip values with min and max so that simplifiers know the ranges of values clipa = lambda x: tvm.min(tvm.const(end, dtype), tvm.max(tvm.const(start, dtype), x)) clipb = lambda x: tvm.min(tvm.const(dend, dtype), tvm.max(tvm.const(dstart, dtype), x)) # If the range is just a single point, use the constant itself if start == end: clipa = lambda x: tvm.const(start, dtype) if dstart == dend: clipb = lambda x: tvm.const(dstart, dtype) # D are division results and M are modulo results [D, M] = tvm.compute( (end - start + 1, dend - dstart + 1), lambda i, j: (div(clipa(A[i]), clipb(B[j])), mod(clipa(A[i]), clipb(B[j])))) s = tvm.create_schedule([D.op, M.op]) f = tvm.build(s, [A, B, D, M], "llvm") # Fill input arrays with values A_arr = tvm.nd.empty((end - start + 1, ), dtype) B_arr = tvm.nd.empty((dend - dstart + 1, ), dtype) A_arr.copyfrom(np.arange(start, end + 1, dtype=dtype)) B_np = np.arange(dstart, dend + 1, dtype=dtype) # If the range of the divisor contains 0, replace it with 1 to avoid division by zero if dend >= 0 and dstart <= 0: B_np[-dstart] = 1 B_arr.copyfrom(B_np) D_arr = tvm.nd.empty((end - start + 1, dend - dstart + 1), dtype) M_arr = tvm.nd.empty((end - start + 1, dend - dstart + 1), dtype) # Run the function and convert the results to numpy f(A_arr, B_arr, D_arr, M_arr) D_arr = D_arr.asnumpy() M_arr = M_arr.asnumpy() # This helper just prints additional info on failure def _show_info(): print("dtype: {}".format(dtype)) print("dividend range: [{}, {}]".format(start, end)) print("divisor range: [{}, {}]".format(dstart, dend)) lowered = tvm.lower(s, [A, B, D, M], simple_mode=True) print("Lowered code:") print(lowered) # Check that the computed values are correct for i in range(start, end + 1): for j in range(dstart, dend + 1): if j == 0: continue if floor_div: dref = i // j mref = i % j else: dref = int(float(i) / j) mref = int(math.fmod(i, j)) if D_arr[i - start, j - dstart] != dref: _show_info() raise AssertionError( "Incorrect division result: {}({}, {}) is {} " "but should be {}".format(div.__name__, i, j, D_arr[i - start, j - dstart], dref)) if M_arr[i - start, j - dstart] != mref: _show_info() raise AssertionError( "Incorrect modulo result: {}({}, {}) is {} " "but should be {}".format(mod.__name__, i, j, M_arr[i - start, j - dstart], mref))
def test_dwarf_debug_information(): nn = 1024 n = tvm.convert(nn) A = tvm.placeholder((n, ), name='A') B = tvm.placeholder((n, ), name='B') C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') s = tvm.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], factor=4) s[C].parallel(xo) s[C].vectorize(xi) def check_llvm_object(): if not tvm.runtime.enabled("llvm"): return if tvm.target.codegen.llvm_version_major() < 5: return if tvm.target.codegen.llvm_version_major() > 6: return # build two functions f2 = tvm.lower(s, [A, B, C], name="fadd1") f1 = tvm.lower(s, [A, B, C], name="fadd2") m = tvm.build([f1, f2], "llvm") temp = util.tempdir() o_path = temp.relpath("temp.o") m.save(o_path) import re import shutil import subprocess import sys # Try the dwarfdump utility (OS X) if shutil.which("dwarfdump"): output = subprocess.check_output(["dwarfdump", o_path]) assert re.search(r"""DW_AT_name\\t\("fadd1"\)""", str(output)) assert re.search(r"""DW_AT_name\\t\("fadd2"\)""", str(output)) # Try gobjdump (OS X) if shutil.which("gobjdump"): output = subprocess.check_output(["gobjdump", "--dwarf", o_path]) assert re.search(r"""DW_AT_name.*fadd1""", str(output)) assert re.search(r"""DW_AT_name.*fadd2""", str(output)) # Try objdump (Linux) - Darwin objdump has different DWARF syntax. if shutil.which("objdump") and sys.platform != 'darwin': output = subprocess.check_output(["objdump", "--dwarf", o_path]) assert re.search(r"""DW_AT_name.*fadd1""", str(output)) assert re.search(r"""DW_AT_name.*fadd2""", str(output)) def check_llvm_ir(): if not tvm.runtime.enabled("llvm"): return if tvm.target.codegen.llvm_version_major() < 5: return if tvm.target.codegen.llvm_version_major() > 6: return # build two functions f2 = tvm.lower(s, [A, B, C], name="fadd1") f1 = tvm.lower(s, [A, B, C], name="fadd2") m = tvm.build([f1, f2], target="llvm -target=aarch64-linux-gnu") ll = m.get_source("ll") # On non-Darwin OS, don't explicitly specify DWARF version. import re assert not re.search(r""""Dwarf Version""" "", ll) assert re.search(r"""llvm.dbg.value""", ll) # Try Darwin, require DWARF-2 m = tvm.build([f1, f2], target="llvm -target=x86_64-apple-darwin-macho") ll = m.get_source("ll") assert re.search(r"""i32 4, !"Dwarf Version", i32 2""", ll) assert re.search(r"""llvm.dbg.value""", ll) check_llvm_object() check_llvm_ir()
def schedule_pool(outs, layout): """Schedule for pool. Parameters ---------- outs: Array of Tensor The computation graph description of pool in the format of an array of tensors. layout: str Data layout. Returns ------- s: Schedule The computation schedule for pool. """ outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) def _schedule(PaddedInput, Pool): if isinstance(PaddedInput.op, tvm.tensor.ComputeOp): s[PaddedInput].compute_inline() num_thread = tvm.target.Target.current(allow_none=False).max_num_threads if Pool.op in s.outputs: Out = Pool OL = s.cache_write(Pool, "local") else: Out = outs[0].op.output(0) s[Pool].set_scope("local") fused = s[Out].fuse(*s[Out].op.axis) bx, tx = s[Out].split(fused, factor=num_thread) s[Out].bind(bx, tvm.thread_axis("blockIdx.x")) s[Out].bind(tx, tvm.thread_axis("threadIdx.x")) if Pool.op in s.outputs: s[OL].compute_at(s[Out], tx) else: s[Pool].compute_at(s[Out], tx) scheduled_ops = [] def traverse(OP): """Internal traverse function""" # inline all one-to-one-mapping operators except the last stage (output) if tag.is_broadcast(OP.tag): if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: traverse(tensor.op) # schedule pool elif OP.tag.startswith('pool'): PaddedInput = OP.input_tensors[0] Pool = OP.output(0) _schedule(PaddedInput, Pool) else: raise RuntimeError("Unsupported operator: %s" % OP.tag) scheduled_ops.append(OP) traverse(outs[0].op) return s
def schedule_conv1d_transpose_ncw_cuda(cfg, outs): """TOPI Schedule callback for conv1d_transpose operator. Parameters ---------- cfg: ConfigEntity The parameters for this template outs: Array of Tensor The computation graph description of conv1d transpose in the format of an array of tensors. Returns ------- s: Schedule The computation schedule for conv1d transpose. """ outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'conv1d_transpose_ncw': pad_data = op.input_tensors[0] kernel = op.input_tensors[1] conv = op.output(0) ##### space definition begin ##### n, f, x = s[conv].op.axis rc = s[conv].op.reduce_axis[0] cfg.define_split("tile_n", cfg.axis(n), num_outputs=4) cfg.define_split("tile_f", cfg.axis(f), num_outputs=4) cfg.define_split("tile_x", cfg.axis(x), num_outputs=4) cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3) cfg.define_knob("auto_unroll_max_step", [64, 512, 1500]) target = tvm.target.current_target() if target.target_name in ['nvptx', 'rocm']: cfg.define_knob("unroll_explicit", [1]) else: cfg.define_knob("unroll_explicit", [0, 1]) ##### space definition end ##### if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag: s[kernel].compute_inline() if conv.op in s.outputs: output = conv OL = s.cache_write(conv, 'local') else: output = s.outputs[0].output(0) s[conv].set_scope('local') OL = conv # create cache stage s[pad_data].set_scope('shared') AA = pad_data WW = s.cache_read(kernel, 'shared', [OL]) # tile and bind spatial axes n, f, x = s[output].op.axis kernel_scope, n = s[output].split(n, nparts=1) bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n) bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) s[output].reorder(bn, bf, bx, vn, vf, vx, tn, tf, tx, ni, fi, xi) s[output].bind(bn, tvm.thread_axis("blockIdx.z")) s[output].bind(bf, tvm.thread_axis("blockIdx.y")) s[output].bind(bx, tvm.thread_axis("blockIdx.x")) s[output].bind(vn, tvm.thread_axis("vthread")) s[output].bind(vf, tvm.thread_axis("vthread")) s[output].bind(vx, tvm.thread_axis("vthread")) s[output].bind(tx, tvm.thread_axis("threadIdx.x")) s[OL].compute_at(s[output], tx) # number of threads n_tz = cfg["tile_n"].size[2] * cfg["tile_f"].size[2] n_tx = cfg["tile_x"].size[2] # tile reduction axes n, f, x = s[OL].op.axis rc, rx = s[OL].op.reduce_axis rco, rcm, rci = cfg['tile_rc'].apply(s, OL, rc) s[OL].reorder(rco, rcm, rx, rci, n, f, x) s[AA].compute_at(s[OL], rx) s[WW].compute_at(s[OL], rx) # cooperative fetching for load in [AA, WW]: n, f, x = s[load].op.axis fused = s[load].fuse(f, x) tz, fused = s[load].split(fused, nparts=n_tz) tx, fused = s[load].split(fused, nparts=n_tx) s[load].bind(tz, tvm.thread_axis("threadIdx.y")) s[load].bind(tx, tvm.thread_axis("threadIdx.x")) s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val) traverse_inline(s, outs[0].op, _callback) return s
import tvm import numpy as np m = tvm.var('m') n = tvm.var('n') X = tvm.placeholder((m, n), name='X') s_state = tvm.placeholder((m, n)) s_init = tvm.compute((1, n), lambda _, i: X[0, i]) s_update = tvm.compute((m, n), lambda t, i: s_state[t - 1, i] + X[t, i]) s_scan = tvm.scan(s_init, s_update, s_state, inputs=[X]) # Schedule the Scan Cell s = tvm.create_schedule(s_scan.op) num_thread = 256 block_x = tvm.thread_axis('blockIdx.x') thread_x = tvm.thread_axis('threadIdx.x') xo, xi = s[s_init].split(s_init.op.axis[1], factor=num_thread) s[s_init].bind(xo, block_x) s[s_init].bind(xi, thread_x) xo, xi = s[s_update].split(s_update.op.axis[1], factor=num_thread) s[s_update].bind(xo, block_x) s[s_update].bind(xi, thread_x) print(tvm.lower(s, [X, s_scan], simple_mode=True)) # Build and Verify f_scan = tvm.build(s, [X, s_scan], 'cuda', name='my_scan') ctx = tvm.gpu(0) n = 1024 m = 10 a_np = np.random.uniform(size=(m, n)).astype(s_scan.dtype) a = tvm.nd.array(a_np, ctx=ctx)
def schedule_adaptive_pool(outs): """Schedule for adaptive_pool. Parameters ---------- outs: Array of Tensor The computation graph description of adaptive_pool in the format of an array of tensors. Returns ------- s: Schedule The computation schedule for adaptive_pool. """ outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) def _schedule(Pool): num_thread = 8 block_x = tvm.thread_axis("blockIdx.x") block_y = tvm.thread_axis("blockIdx.y") thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x") thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y") if Pool.op in s.outputs: Out = Pool OL = s.cache_write(Pool, "local") else: Out = outs[0].op.output(0) s[Pool].set_scope("local") by, ty = s[Out].split(s[Out].op.axis[0], factor=num_thread) bx, tx = s[Out].split(s[Out].op.axis[1], factor=num_thread) s[Out].reorder(by, bx, ty, tx) s[Out].bind(ty, thread_y) s[Out].bind(tx, thread_x) s[Out].bind(by, block_y) s[Out].bind(bx, block_x) if Pool.op in s.outputs: s[OL].compute_at(s[Out], tx) else: s[Pool].compute_at(s[Out], tx) scheduled_ops = [] def traverse(OP): """Internal traverse function""" # inline all one-to-one-mapping operators except the last stage (output) if tag.is_broadcast(OP.tag): if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: traverse(tensor.op) # schedule global_pool elif OP.tag.startswith('adaptive_pool'): Pool = OP.output(0) _schedule(Pool) else: raise RuntimeError("Unsupported operator: %s" % OP.tag) scheduled_ops.append(OP) traverse(outs[0].op) return s
def schedule_conv2d(outs): """Create schedule for tensors""" s = tvm.create_schedule([x.op for x in outs]) target = tvm.target.current_target(allow_none=False) def default_schedule(op): """NCHW conv2d schedule for non imagenet workloads""" conv = op.output(0) kernel = op.input_tensors[1] data = op.input_tensors[0] data_pad = None if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] n_pad, c_pad, h_pad, w_pad = data_pad.op.axis pad_fused = s[data_pad].fuse(n_pad, c_pad) s[data_pad].parallel(pad_fused) C = conv n, c, h, w = C.op.axis rc, ry, rx = C.op.reduce_axis fused = s[C].fuse(n, c) s[C].parallel(fused) wo, wi = s[C].split(w, factor=16) s[C].reorder(fused, rc, h, wo, ry, rx, wi) # move rc to outer loop s[C].unroll(rx) s[C].unroll(ry) s[C].vectorize(wi) def traverse(op): """Traverse operators from computation graph""" # inline all one-to-one-mapping operators except the last stage (output) if tag.is_broadcast(op.tag): if op not in s.outputs: s[op].compute_inline() else: # inject custom schedule if len(op.axis) == 4 and 'avx' not in str( target): # schedule bias + bn + relu n, c, h, w = op.axis fused = s[op].fuse(n, c) s[op].parallel(fused) s[op].vectorize(w) for tensor in op.input_tensors: if tensor.op.input_tensors: traverse(tensor.op) if 'conv2d_nchw' in op.tag: if 'avx' in str(target): try: output = op.output(0) conv_out = op.input_tensors[0] kernel_vec = conv_out.op.input_tensors[1] kernel = kernel_vec.op.input_tensors[0] data_vec = conv_out.op.input_tensors[0] data = data_vec.op.input_tensors[0] data_pad = None if isinstance( data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] padding = infer_pad(data, data_pad) if data_pad is None: stride = infer_stride(data, kernel, output) else: stride = infer_stride(data_pad, kernel, output) wkl = _get_workload(data, kernel, stride, padding, output.dtype) sch = _get_schedule(wkl) _AVX_SCH_TO_SCH_FUNC[type(sch)](s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, output, outs[0]) except IndexError: default_schedule(op) else: default_schedule(op) traverse(outs[0].op) return s
def test_gemm(): # graph nn = 1024 n = tvm.var('n') n = tvm.convert(nn) m = n l = n A = tvm.placeholder((n, l), name='A') B = tvm.placeholder((m, l), name='B') k = tvm.reduce_axis((0, l), name='k') C = tvm.compute((n, m), lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k), name='CC') # schedule s = tvm.create_schedule(C.op) xtile, ytile = 32, 32 scale = 8 num_thread = 8 block_factor = scale * num_thread block_x = tvm.thread_axis("blockIdx.x") thread_x = tvm.thread_axis("threadIdx.x") block_y = tvm.thread_axis("blockIdx.y") thread_y = tvm.thread_axis("threadIdx.y") CC = s.cache_write(C, "local") AA = s.cache_read(A, "shared", [CC]) BB = s.cache_read(B, "shared", [CC]) by, yi = s[C].split(C.op.axis[0], factor=block_factor) bx, xi = s[C].split(C.op.axis[1], factor=block_factor) s[C].reorder(by, bx, yi, xi) s[C].bind(by, block_y) s[C].bind(bx, block_x) ty, yi = s[C].split(yi, nparts=num_thread) tx, xi = s[C].split(xi, nparts=num_thread) s[C].reorder(ty, tx, yi, xi) s[C].bind(ty, thread_y) s[C].bind(tx, thread_x) yo, xo = CC.op.axis s[CC].reorder(k, yo, xo) s[CC].compute_at(s[C], tx) s[AA].compute_at(s[CC], k) s[BB].compute_at(s[CC], k) ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread) tx, xi = s[AA].split(xi, nparts=num_thread) s[AA].bind(ty, thread_y) s[AA].bind(tx, thread_x) ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread) tx, xi = s[BB].split(xi, nparts=num_thread) s[BB].bind(ty, thread_y) s[BB].bind(tx, thread_x) # lowering test s = s.normalize() # one line to build the function. def check_device(device): if not tvm.module.enabled(device): print("skip because %s is not enabled.." % device) return f = tvm.build(s, [A, B, C], device) ctx = tvm.context(device, 0) # launch the kernel. n = nn m = n l = n a_np = np.random.uniform(size=(n, l)).astype(A.dtype) b_np = np.random.uniform(size=(m, l)).astype(B.dtype) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) ftimer = f.time_evaluator(f.entry_name, ctx, number=1) tcost = ftimer(a, b, c).mean print("%s: exec=%g sec/op" % (ctx, tcost)) np.testing.assert_allclose(c.asnumpy(), np.dot(a_np, b_np.T), rtol=1e-5) check_device("nvptx -mcpu=sm_20") check_device("metal") check_device("opencl") check_device("cuda")
def test_llvm_add_pipeline(): nn = 1024 n = tvm.convert(nn) A = tvm.placeholder((n, ), name='A') B = tvm.placeholder((n, ), name='B') C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') s = tvm.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], factor=4) s[C].parallel(xo) s[C].vectorize(xi) def verify_elf(path, e_machine): with open(path, "rb") as fi: arr = fi.read(20) assert struct.unpack('ccc', arr[1:4]) == (b'E', b'L', b'F') endian = struct.unpack('b', arr[0x5:0x6])[0] endian = '<' if endian == 1 else '>' assert struct.unpack(endian + 'h', arr[0x12:0x14])[0] == e_machine def build_i386(): if not tvm.module.enabled("llvm"): print("Skip because llvm is not enabled..") return temp = util.tempdir() target = "llvm -target=i386-pc-linux-gnu" f = tvm.build(s, [A, B, C], target) path = temp.relpath("myadd.o") f.save(path) verify_elf(path, 0x03) def build_arm(): target = "llvm -target=armv7-none-linux-gnueabihf" if not tvm.module.enabled(target): print("Skip because %s is not enabled.." % target) return temp = util.tempdir() f = tvm.build(s, [A, B, C], target) path = temp.relpath("myadd.o") f.save(path) verify_elf(path, 0x28) asm_path = temp.relpath("myadd.asm") f.save(asm_path) # Do a RPC verification, launch kernel on Arm Board if available. host = os.environ.get('TVM_RPC_ARM_HOST', None) remote = None if host: port = int(os.environ['TVM_RPC_ARM_PORT']) try: remote = rpc.connect(host, port) except tvm.TVMError as e: pass if remote: remote.upload(path) farm = remote.load_module("myadd.o") ctx = remote.cpu(0) n = nn a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) farm(a, b, c) np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) print("Verification finish on remote..") build_i386() build_arm()
def test(): env = nnpu.get_env() shape = (16, 16) a_host = tvm.placeholder(shape, env.cfg['dtype_n'], 'a_host') a = tvm.compute(shape, lambda *i: a_host(*i), name='a') a_buf = tvm.compute(shape, lambda *i: a(*i), name='a_buf') vctr_shape = (16, ) b_host = tvm.placeholder(vctr_shape, env.cfg['dtype_n'], 'b_host') b = tvm.compute(vctr_shape, lambda *i: b_host(*i), name='b') b_buf = tvm.compute(vctr_shape, lambda *i: b(*i), name='b_buf') dtype_w = env.cfg['dtype_w'] out_shape = (16, ) k = tvm.reduce_axis((0, 16), 'k') c_buf = tvm.compute( out_shape, lambda i: tvm.sum( a_buf[i, k].astype(dtype_w) * b_buf[k].astype(dtype_w), axis=k)) bias_host = tvm.placeholder(out_shape, env.cfg['dtype_w'], 'bias_host') bias = tvm.compute(out_shape, lambda *i: bias_host(*i), 'bias') bias_buf = tvm.compute(out_shape, lambda *i: bias(*i), 'bias_buf') #c = tvm.compute(out_shape, lambda *i: c_buf(*i), name='c') #c_host = tvm.compute(out_shape, lambda *i: c(*i), name='c_host') out_buf = tvm.compute(out_shape, lambda i: c_buf[i] + bias_buf[i], 'out_buf') out = tvm.compute(out_shape, lambda *i: out_buf(*i), 'out') out_host = tvm.compute(out_shape, lambda *i: out(*i), 'out_host') s = tvm.create_schedule(out_host.op) # mark variable scopes s[a].set_scope(env.dram_scope) s[b].set_scope(env.dram_scope) s[bias].set_scope(env.dram_scope) s[out].set_scope(env.dram_scope) s[a_buf].set_scope(env.uni_scratchpad_scope) s[b_buf].set_scope(env.uni_scratchpad_scope) s[c_buf].set_scope(env.uni_scratchpad_scope) s[bias_buf].set_scope(env.uni_scratchpad_scope) s[out_buf].set_scope(env.uni_scratchpad_scope) #print(dir(s[b].op.body)) # mark compiler pragmas s[a].pragma(s[a].op.axis[0], env.dma_copy_pragma) s[b].pragma(s[b].op.axis[0], env.dma_copy_pragma) s[bias].pragma(s[bias].op.axis[0], env.dma_copy_pragma) s[out_host].pragma(s[out_host].op.axis[0], env.dma_copy_pragma) s[a_buf].pragma(s[a_buf].op.axis[0], env.scratchpad_ls) s[b_buf].pragma(s[b_buf].op.axis[0], env.scratchpad_ls) s[bias_buf].pragma(s[bias_buf].op.axis[0], env.scratchpad_ls) s[out].pragma(s[out].op.axis[0], env.scratchpad_ls) #s[a_buf].compute_at(s[b_buf], b_buf.op.axis[0]) # tensorize #s[b_buf].tensorize(s[b_buf].op.axis[1], env.intrins.get('VEXP', mode='inc')) s[c_buf].tensorize( s[c_buf].op.axis[0], env.intrins.get('GEMM', shape=(16, 16, 1), mode='inc', reduce=True)) #outer, inner = out_buf.op.axis #s[out_buf].reorder(inner, outer) #print(outer) #print(tvm.lower(s, [a_host, b_host, bias_host, out_host], simple_mode=True)) s[out_buf].tensorize(s[out_buf].op.axis[0], env.intrins.get('VAddV', mode='w')) # build print(tvm.lower(s, [a_host, b_host, bias_host, out_host], simple_mode=True)) print( nnpu.lower(s, [a_host, b_host, bias_host, out_host], simple_mode=True)) #exit() func = nnpu.build(s, [a_host, b_host, bias_host, out_host], 'nnpu', 'llvm', name='nnpu_exp') print('function built: ') print('------------------- device module 1 asm code: ') print(func.imported_modules[0].get_source('asm')) #print(func.get_source()) # prepare data ctx = tvm.nd.TVMContext(13, 0) a_np = np.random.randint(size=shape, dtype=a_host.dtype, low=0, high=64) #a_np = np.random.random(size=shape).astype(a_host.dtype) a_nd = tvm.nd.array(a_np, ctx) b_np = np.random.randint(size=vctr_shape, dtype=b_host.dtype, low=0, high=64) #b_np = np.random.random(size=vctr_shape).astype(b_host.dtype) b_nd = tvm.nd.array(b_np, ctx) bias_np = np.random.randint(size=out_shape, dtype=bias_host.dtype, low=0, high=10000) #bias_np = np.random.random(size=out_shape).astype(bias_host.dtype) bias_nd = tvm.nd.array(bias_np, ctx) out_nd = tvm.nd.array(np.zeros(out_shape).astype(out_host.dtype), ctx) # run func(a_nd, b_nd, bias_nd, out_nd) print('run finished') print('a=') print(a_np) print('b=') print(b_np) print('bias=') print(bias_np) print('out=') print(out_nd.asnumpy()) print('numpy ground truth is: ') gt = np.dot(a_np.astype(dtype_w), b_np.astype(dtype_w)) + bias_np #gt = np.greater(np.dot(a_np.astype(dtype_w), b_np.astype(dtype_w)), bias_np) print(gt) np.testing.assert_allclose(out_nd.asnumpy(), gt)
def test(): env = nnpu.get_env() a = tvm.placeholder((32, ), env.cfg['dtype_w'], 'a') sph = ScheduleProcHelper() Imm = tvm.const(5, env.cfg['dtype_w']) a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph) #c_buf = tvm.compute((32,), lambda i: tvm.select(a_buf[i]>Imm,a_buf[i],Imm), 'c_buf') c_buf = tvm.compute((32, ), lambda i: Imm + a_buf[i], 'c_buf') sph.MarkScope(c_buf) c_host, c_dram = nnpu.utils.CopyBufToH(c_buf, 'c', sph) sub_buf = tvm.compute((32, ), lambda i: a_buf[i] - Imm, 'sub_buf') sph.MarkScope(sub_buf) sub_host, sub_dram = nnpu.utils.CopyBufToH(sub_buf, 'sub', sph) mul_buf = tvm.compute((32, ), lambda i: a_buf[i] * Imm, 'mul_buf') sph.MarkScope(mul_buf) mul_host, mul_dram = nnpu.utils.CopyBufToH(mul_buf, 'mul', sph) div_buf = tvm.compute((32, ), lambda i: a_buf[i] / Imm, 'rdiv_buf') sph.MarkScope(div_buf) div_host, div_dram = nnpu.utils.CopyBufToH(div_buf, 'rdiv', sph) gtm_buf = tvm.compute((32, ), lambda i: tvm.max(a_buf[i], Imm), 'gtm_buf') sph.MarkScope(gtm_buf) gtm_host, gtm_dram = nnpu.utils.CopyBufToH(gtm_buf, 'gtm', sph) rsub_buf = tvm.compute((32, ), lambda i: Imm - a_buf[i], 'rsub_buf') sph.MarkScope(rsub_buf) rsub_host, rsub_dram = nnpu.utils.CopyBufToH(rsub_buf, 'rsub', sph) s = tvm.create_schedule([ c_host.op, sub_host.op, mul_host.op, div_host.op, gtm_host.op, rsub_host.op ]) sph.Transform(s) s[c_buf].tensorize(s[c_buf].op.axis[0], env.intrins.get('VAddI', imm_value=Imm.value, mode='w')) s[sub_buf].tensorize( s[sub_buf].op.axis[0], env.intrins.get('VSubI', imm_value=Imm.value, mode='w')) s[mul_buf].tensorize( s[mul_buf].op.axis[0], env.intrins.get('VMulI', imm_value=Imm.value, mode='w')) s[div_buf].tensorize( s[div_buf].op.axis[0], env.intrins.get('VDivI', imm_value=Imm.value, mode='w')) s[gtm_buf].tensorize( s[gtm_buf].op.axis[0], env.intrins.get('VGTMI', imm_value=Imm.value, mode='w')) s[rsub_buf].tensorize( s[rsub_buf].op.axis[0], env.intrins.get('ISubV', imm_value=Imm.value, mode='w')) print( nnpu.lower( s, [a, c_host, sub_host, mul_host, div_host, gtm_host, rsub_host], simple_mode=True)) func = nnpu.build( s, [a, c_host, sub_host, mul_host, div_host, gtm_host, rsub_host], 'nnpu', 'llvm', name='nnpu_vmuli') print('------------------- device module 1 IR: ') print(func.imported_modules[0].get_source('ir')) print('------------------- device module 1 uop code: ') print(func.imported_modules[0].get_source('uop')) ctx = tvm.nd.TVMContext(13, 0) a_np = np.random.randint(size=(32, ), dtype=a.dtype, low=3, high=122) #a_np = np.random.random(size=shape).astype(a_host.dtype) a_nd = tvm.nd.array(a_np, ctx) c_nd = tvm.nd.array(np.zeros((32, )).astype(c_host.dtype), ctx) sub_nd = tvm.nd.array(np.zeros((32, )).astype(c_host.dtype), ctx) mul_nd = tvm.nd.array(np.zeros((32, )).astype(c_host.dtype), ctx) div_nd = tvm.nd.array(np.zeros((32, )).astype(c_host.dtype), ctx) gtm_nd = tvm.nd.array(np.zeros((32, )).astype(c_host.dtype), ctx) rsub_nd = tvm.nd.array(np.zeros((32, )).astype(c_host.dtype), ctx) func(a_nd, c_nd, sub_nd, mul_nd, div_nd, gtm_nd, rsub_nd) print('a = ') print(a_nd.asnumpy()) print('a + {0} = '.format(Imm.value)) print(c_nd.asnumpy()) print('numpy ground truth =') gt = a_np + Imm.value print(gt) np.testing.assert_allclose(c_nd.asnumpy(), gt) print('a - {0} = '.format(Imm.value)) print(sub_nd.asnumpy()) np.testing.assert_allclose(sub_nd.asnumpy(), a_np - Imm.value) print('a * {0} = '.format(Imm.value)) print(mul_nd.asnumpy()) np.testing.assert_allclose(mul_nd.asnumpy(), a_np * Imm.value) print('a > {0} ? a : {0} = '.format(Imm.value)) print(gtm_nd.asnumpy()) #np.testing.assert_allclose(gtm_nd.asnumpy(), a_np Imm.value) print('{0} - a = '.format(Imm.value)) print(rsub_nd.asnumpy()) np.testing.assert_allclose(rsub_nd.asnumpy(), Imm.value - a_np) print('test passed')
# parallel manner. TVM asks the user to provide a description of the # computation called a schedule. # # A schedule is a set of transformation of computation that transforms # the loop of computations in the program. # # After we construct the schedule, by default the schedule computes # C in a serial manner in a row-major order. # # .. code-block:: c # # for (int i = 0; i < n; ++i) { # C[i] = A[i] + B[i]; # } # s = tvm.create_schedule(C.op) ###################################################################### # We used the split construct to split the first axis of C, # this will split the original iteration axis into product of # two iterations. This is equivalent to the following code. # # .. code-block:: c # # for (int bx = 0; bx < ceil(n / 64); ++bx) { # for (int tx = 0; tx < 64; ++tx) { # int i = bx * 64 + tx; # if (i < n) { # C[i] = A[i] + B[i]; # } # }
def gemm_tuning(batch, N, L, M): bn = 32 A = tvm.placeholder((batch, N, L), name='A', dtype='float32') B = tvm.placeholder((batch, L, M), name='B', dtype='float32') packedB = tvm.compute((batch, N / bn, L, bn), lambda b, x, y, z: B[b, y, x * bn + z], name='packedB') k = tvm.reduce_axis((0, L), name='k') C = tvm.compute( (batch, M, N), lambda b, x, y: tvm.sum(A[b, x, k] * packedB[b, y / bn, k, y % bn], axis=k), name='C') s = tvm.create_schedule(C.op) ##### define space and schedule cfg = autotvm.get_config() bn = 32 CC = s.cache_write(C, 'global') factor_range = [2, 4, 8, 16, 32, 64] cfg.define_knob('tile_factor_x', factor_range) cfg.define_knob('tile_factor_y', factor_range) bx = cfg['tile_factor_x'].val by = cfg['tile_factor_y'].val xo, yo, xi, yi = s[C].tile(C.op.axis[1], C.op.axis[2], bx, by) s[CC].compute_at(s[C], yo) b, xc, yc = s[CC].op.axis k, = s[CC].op.reduce_axis """cfg.define_split("split_k", k, num_outputs=2) ko, ki = cfg["split_k"].apply(s, CC, k)""" k_num_outputs_range = [2, 3, 4, 5, 6, 7, 8] cfg.define_knob('k_outputs', k_num_outputs_range) k_outputs = cfg['k_outputs'].val cfg.define_split("split_k", k, policy='all', num_outputs=k_outputs) k_list = cfg["split_k"].apply(s, CC, k) cfg.define_reorder("reorder_k", axes=[xc, yc] + k_list, policy='all') cfg["reorder_k"].apply(s, CC, [xc, yc] + k_list) """cfg.define_reorder("reorder_k", [ko, xc, ki, yc], policy='all') cfg["reorder_k"].apply(s, CC, s[CC].op.axis)""" # s[CC].reorder(ko, xc, ki, yc) k_unroll_id = list(range(k_outputs)) # print(len(k_list)) cfg.define_knob('k_unroll', k_unroll_id) k_id = cfg['k_unroll'].val # print(type(k_id)) s[CC].unroll(k_list[k_id]) # s[CC].unroll(ki) cfg.define_knob('vector_dim', [0, 1]) vector_id = cfg['vector_dim'].val if vector_id == 0: s[CC].vectorize(yc) else: s[CC].vectorize(xc) # s[CC].vectorize(yc) parallel_list = [xo, yo, xi, yi] cfg.define_knob('parallel_C', list(range(len(parallel_list)))) parallel_C_id = cfg['parallel_C'].val # print(len(parallel_list)) s[C].parallel(parallel_list[parallel_C_id]) # s[C].parallel(xo) return s, [A, B, C]
def single_lstm(): num_gate = 4 hidden_size = tvm.var('hidden_size') batch_size = tvm.var('batch_size') input_size = tvm.var('input_size') # A single LSTM block operations without unrolling # '*' linear transformation # '(*)' elementwise multiplication # F_t = sigmoid( W_f * x_t + R_f * h_t-1 + b_f ) # I_t = sigmoid( W_i * x_t + R_i * h_t-1 + b_i ) # O_t = sigmoid( W_o * x_t + R_o * h_t-1 + b_o ) # C'_t = tanh( W_c * x_t + R_c * h_t-1 + b_c ) # C_t = F_t (*) C_t-1 + I_t (*) C'_t # h_t = O_t (*) tanh( C_t ) # Global transition matrix # input X[0..t-1] X = tvm.placeholder((batch_size, input_size), name="X") Prev_h = tvm.placeholder((batch_size, hidden_size), name="Prev_h") Prev_c = tvm.placeholder((batch_size, hidden_size), name="Prev_c") # Parameters # Weight matrices [W_i, W_f, W_o, W_c]: 4 * hidden_size * input_size # Bias: 4 * hidden_size Wi2h = tvm.placeholder((num_gate, hidden_size, input_size), name="Wi2h") Bi2h = tvm.placeholder((num_gate, hidden_size), name="Bi2h") # Weight matrices [R_i, R_f, R_o, R_c]: 4 * hidden_size * hidden_size # Only handle hidden transition, saves space. Wh2h = tvm.placeholder((num_gate, hidden_size, hidden_size), name="Wh2h") Bh2h = tvm.placeholder((num_gate, hidden_size), name="Bh2h") # LSTM transition # [W_i, W_f, W_o, W_c] * X_t: 4 * num_hidden l = tvm.reduce_axis((0, input_size), name="li2h") i2h = tvm.compute((batch_size, num_gate, hidden_size), lambda i, x, j: tvm.sum(X[i, l] * Wi2h[x, j, l], axis=l), name="i2h") # [R_i, R_f, R_o, R_c] * h_t-1: 4 * hidden_size # R: hidden_size * hidden_size, h: hidden_size * 1 k = tvm.reduce_axis((0, hidden_size), name="ki2h") h2h = tvm.compute( (batch_size, num_gate, hidden_size), lambda i, x, j: tvm.sum(Prev_h[i, k] * Wh2h[x, j, k], axis=k), name="h2h") gates = tvm.compute( (batch_size, num_gate, hidden_size), lambda i, j, k: i2h[i, j, k] + h2h[i, j, k] + Bi2h[j, k] + Bh2h[j, k], name="gates") gshape = (batch_size, hidden_size) in_gate = tvm.compute(gshape, lambda i, j: tvm.sigmoid(gates[i, 0, j]), name="in_gate") forget_gate = tvm.compute(gshape, lambda i, j: tvm.sigmoid(gates[i, 1, j]), name="forget_gate") out_gate = tvm.compute(gshape, lambda i, j: tvm.sigmoid(gates[i, 2, j]), name="out_gate") in_transform = tvm.compute(gshape, lambda i, j: tvm.tanh(gates[i, 3, j]), name="in_transform") # C_t = F_t o C_t-1 + I_t o C'_t state_c = tvm.compute((batch_size, hidden_size), lambda i, j: forget_gate[i, j] * Prev_c[i, j] + in_gate[i, j] * in_transform[i, j], name="state_c") # h_t = O_t o tanh( C_t ) # state_h = tvm.compute((batch_size, hidden_size), # lambda i, j: out_gate[i, j] * tvm.tanh(state_c[i, j]), name="state_h") out_c, out_h = tvm.compute( (batch_size, hidden_size), lambda i, j: (state_c[i, j], out_gate[i, j] * tvm.tanh(state_c[i, j])), name="outputs_c_h") # schedule s = tvm.create_schedule(out_h.op) print( tvm.lower(s, [X, Prev_h, Prev_c, Wi2h, Bi2h, Wh2h, Bh2h, out_c, out_h], simple_mode=True)) lstm = tvm.build(s, [X, Prev_h, Prev_c, Wi2h, Bi2h, Wh2h, Bh2h, out_c, out_h], name="single_lstm") print(lstm) lstm.save("remy_single_lstm.o") print(lstm.imported_modules) cc.create_shared("remy_single_lstm.so", ["remy_single_lstm.o"])
def schedule_bitserial_dense(cfg, outs): """Schedule for binary_dense. Parameters ---------- outs: Array of Tensor The computation graph description of bitserial dense operator. in the format of an array of tensors. Returns ------- s: Schedule The computation schedule for bitserial_dense. """ outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) def _schedule(cfg, s, data_vec, weight_vec, output, unipolar): z, k, _, y, x = s[weight_vec].op.axis s[weight_vec].parallel(z) s[weight_vec].vectorize(x) x, y = s[output].op.axis wb, db, k = s[output].op.reduce_axis _, DB, _ = get_const_tuple(data_vec.shape) _, _, WB, _, _ = get_const_tuple(weight_vec.shape) yo, yi = cfg["tile_y"].apply(s, output, y) xo, xi = cfg["tile_x"].apply(s, output, x) ko, ki = cfg["tile_k"].apply(s, output, k) cfg["reorder_0"].apply(s, output, [yo, xo, ko, xi, wb, db, yi, ki]) fused = s[output].fuse(xo, yo) s[output].parallel(fused) nfactor = cfg['tile_y'].size[-1] kfactor = cfg['tile_k'].size[-1] if nfactor % 8 == 0: pc = _intrin_popcount(nfactor, kfactor, WB, DB, unipolar) s[output].tensorize(wb, pc) return s def traverse(op): """Internal travserse function""" # inline all one-to-one-mapping operators except the last stage (output) if tag.is_broadcast(op.tag) or 'elemwise' in op.tag: if op not in s.outputs: s[op].compute_inline() for tensor in op.input_tensors: if isinstance(tensor.op, tvm.tensor.ComputeOp): traverse(tensor.op) elif op.tag == 'bitserial_dense' or 'bitserial_dense_unipolar': output = op.output(0) weight_vec = op.input_tensors[0] data_vec = op.input_tensors[1] data = data_vec.op.input_tensors[0] if "QuantizeInput" in data.op.name: data = data.op.input_tensors[0] unipolar = (output.op.tag == 'bitserial_dense_unipolar') _schedule(cfg, s, data_vec, weight_vec, output, unipolar) else: raise RuntimeError("Unsupported operator: %s" % op.tag) traverse(outs[0].op) return s
def show_lowered(outputs, inputs): sout = tvm.create_schedule([o.op for o in outputs]) mout = tvm.lower(sout, outputs + inputs, simple_mode=True) print(mout)
def measure_compute_mad(total_item, item_per_thread, base_type, bits, lanes, target, target_host, remote, ctx, n_times): """ measure peak compute speed by computing mad for a type The IR for measurement is for each thread for i in 1..item_per_thread x = mad(x, x, y) y = mad(y, y, x) Parameters ---------- total_item: int number of elements in input array item_per_thread: int number of operations each thread does base_type: str can be "int", "float" bits: int can be 16, 32 lanes: int lane of the vector type, can be 1, 2, 4, 8, 16 target: :any:`tvm.target.Target` the target and option of the compilation. target_host : str or :any:`tvm.target.Target` host compilation target remote: tvm.contrib.rpc.RPCSession if it is not None, use remote rpc session ctx: TVMcontext the context of array n_times: int number of runs for taking mean Returns ------- GOPS: float giga operation per second """ n = total_item if bits >= 64 or lanes >= 16: n //= 2 max_threads = target.max_num_threads base_type = str(base_type) + str(bits) dtype = base_type if lanes == 1 else base_type + "x" + str(lanes) def extern(ins, outs): # pylint: disable=unused-argument """construct measurement function by building IR directly""" ib = tvm.ir_builder.create() bx = tvm.thread_axis("blockIdx.x") tx = tvm.thread_axis("threadIdx.x") ib.scope_attr(bx, "thread_extent", n // max_threads) ib.scope_attr(tx, "thread_extent", max_threads) idx = bx.var * max_threads + tx.var a = ib.allocate(dtype, (1), name='a', scope='local') b = ib.allocate(dtype, (1), name='b', scope='local') a[0] = outs[0].vload(idx, dtype) b[0] = outs[0].vload(idx, dtype) if base_type.find('float') != -1: mad_func = lambda x, y: (x * x + y) else: mad_func = lambda x, y: y * y + x for _ in range(item_per_thread // 4 // lanes): a[0] = mad_func(a[0], b[0]) b[0] = mad_func(b[0], a[0]) ib.emit(outs[0].vstore(idx, b[0])) return ib.get() y = tvm.extern((n,), [], extern, name="y", dtype=dtype) s = tvm.create_schedule(y.op) try: func = tvm.build(s, [y], target, target_host=target_host) func = _convert_to_remote(func, remote) time_f = func.time_evaluator(func.entry_name, ctx, number=n_times) y = tvm.nd.empty((n,), dtype=dtype, ctx=ctx) time = time_f(y).mean except tvm._ffi.base.TVMError: # build error (occur when device does not support half) return -1 return 1.0 * (n * item_per_thread) / 1e9 / time
import numpy as np N = tvm.var('N') # Data set size V = tvm.var('V') # Feature number C = tvm.var('C') # Center number data = tvm.placeholder((N, V), name='data') center = tvm.placeholder((C, V), name='center') # === Start computation # Compute distances rv = tvm.reduce_axis((0, V), name='rv') dis = tvm.compute((N, C), lambda n, c: tvm.sum( (data[n, rv]-center[c, rv]).astype('float64')* (data[n, rv]-center[c, rv]).astype('float64'), axis=rv), name='dis') rc = tvm.reduce_axis((0, C), name='rc') mse_n = tvm.compute((N,), lambda n: tvm.sum(dis[n, rc], axis=rc), name='mse_n') rn = tvm.reduce_axis((0, N), name='rn') mse = tvm.compute((1,), lambda i: tvm.sum(mse_n[rn], axis=rn), name='mse') # === End computation # Scheduling s = tvm.create_schedule(mse.op) # Compilation calc = tvm.build(s, [data, center, mse]) assert calc