def verify_log_softmax(m, n): A = tvm.placeholder((m, n), name='A') B = topi.nn.log_softmax(A) # confirm lower works s = tvm.create_schedule([B.op]) tvm.lower(s, [A, B], simple_mode=True) a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) b_np = topi.testing.log_softmax_python(a_np) def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): s = topi.generic.schedule_softmax(B) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) foo = tvm.build(s, [A, B], device, name="log_softmax") foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) for device in ["opengl"]: check_device(device)
def verify_softmax(m, n): A = tvm.placeholder((m, n), name='A') B = topi.cpp.nn.softmax(A, 1) # confirm lower works s = tvm.create_schedule([B.op]) tvm.lower(s, [A, B], simple_mode=True) a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) b_np = topi.testing.softmax_python(a_np) def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) target = topi.cpp.TEST_create_target(device) if device == "llvm": s = topi.cpp.generic.default_schedule(target, [B], False) else: s = topi.cpp.cuda.schedule_softmax(target, [B]) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) foo = tvm.build(s, [A, B], device, name="softmax") foo(a, b) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) for device in ['cuda', 'opencl', 'metal', 'rocm']: check_device(device)
def verify_softmax(m, n, dtype="float32"): A = tvm.placeholder((m, n), dtype=dtype, name='A') B = topi.nn.softmax(A) # confirm lower works s = tvm.create_schedule([B.op]) tvm.lower(s, [A, B], simple_mode=True) a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) b_np = topi.testing.softmax_python(a_np) def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): s = topi.generic.schedule_softmax(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) foo = tvm.build(s, [A, B], device, name="softmax") foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']: check_device(device)
def test_lstm_cell_inline(): num_step = 128 num_input = 256 num_hidden = 1152 batch_size = 4 # Global transition matrix X = tvm.placeholder((num_step - 1, batch_size, num_input), name="X") Wi2h = tvm.placeholder((4, num_hidden, num_input), name="Wi2h") Wh2h = tvm.placeholder((4, num_hidden, num_hidden), name="Wh2h") # h: output hidden state, c: cell state. s_state_h = tvm.placeholder((num_step, batch_size, num_hidden)) s_state_c = tvm.placeholder((num_step, batch_size, num_hidden)) s_init_c = tvm.compute((1, batch_size, num_hidden), lambda *i: 0.0, name="init_c") s_init_h = tvm.compute((1, batch_size, num_hidden), lambda *i: 0.0, name="init_h") # LSTM transition k = tvm.reduce_axis((0, num_input), name="ki2h") s_i2h = tvm.compute( (num_step, 4, batch_size, num_hidden), lambda t, x, i, j: tvm.sum(X[t - 1, i, k] * Wi2h[x, j, k], axis=k), name="s_i2h") k = tvm.reduce_axis((0, num_hidden), name="ki2h") s_h2h = tvm.compute( (num_step, 4, batch_size, num_hidden), lambda t, x, i, j: tvm.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k], axis=k), name="s_h2h") # Gate rules gates = tvm.compute(s_i2h.shape, lambda *i: s_i2h(*i) + s_h2h(*i), name="gates") gshape = (num_step, batch_size, num_hidden) in_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 0, i, j]), name="in_gate") in_transform = tvm.compute(gshape, lambda t, i, j: tvm.tanh(gates[t, 1, i, j]), name="in_transform") forget_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 2, i, j]), name="forget_gate") out_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 3, i, j]), name="out_gate") next_c = tvm.compute(gshape, lambda t, i, j: forget_gate[t, i, j] * s_state_c[t - 1, i, j] + in_gate[t, i, j] * in_transform[t, i, j], name="next_c") next_h = tvm.compute(gshape, lambda t, i, j: out_gate[t, i, j] * tvm.tanh(next_c[t, i, j]), name="next_h") update_c = tvm.compute(gshape, lambda *i: next_c(*i), name="update_c") update_h = tvm.compute(gshape, lambda *i: next_h(*i), name="update_h") # schedule scan_h, scan_c = tvm.scan( [s_init_h, s_init_c], [update_h, update_c], [s_state_h, s_state_c], inputs=[X], name="lstm_scan") # schedule s = tvm.create_schedule(scan_h.op) # Inline gate computations s[gates].compute_inline() s[in_gate].compute_inline() s[in_transform].compute_inline() s[forget_gate].compute_inline() s[out_gate].compute_inline() # verify we can lower correctly tvm.lower(s, [X, Wi2h, Wh2h, scan_h, scan_c])
def test_loop_dependent_allocate():
    N = tvm.var("N")
    A = tvm.placeholder((2*N,), "float32", "A")
    C = tvm.compute((N, ), lambda i: A[2*i] + A[i+1], name='C')
    s = tvm.create_schedule(C.op)
    AA = s.cache_read(A, "local", [C])
    s[AA].compute_at(s[C], s[C].op.axis[0])
    # this line should fail because IRUseDefAnalysis sees an allocate statement
    # referencing an undefined variable
    tvm.lower(s, [A, C])
def verify_log_softmax(m, n, dtype="float32"):
    A = tvm.placeholder((m, n), dtype=dtype, name='A')
    B = topi.nn.log_softmax(A)
    # confirm lower works
    s = tvm.create_schedule([B.op])
    tvm.lower(s, [A, B], simple_mode=True)

    a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
    b_np = topi.testing.log_softmax_python(a_np)

    for device in get_all_backend():
        check_device(A, B, a_np, b_np, device, "log_softmax")
def verify_softmax(m, n, dtype="float32"):
    A = tvm.placeholder((m, n), dtype=dtype, name='A')
    B = topi.nn.softmax(A)
    # confirm lower works
    s = tvm.create_schedule([B.op])
    tvm.lower(s, [A, B], simple_mode=True)

    a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
    b_np = topi.testing.softmax_python(a_np)

    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
        check_device(A, B, a_np, b_np, device, "softmax")
def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_filter,
                  out_filter, k_h, k_w, hpad, wpad, hstride, wstride):
    """
    Runs the inference and checks the functional correctness between
    compute and schedule outputs
    """
    (data_shape, kernel_shape, o_shape) = get_shape(im_height, im_width, in_filter,
                                                    out_filter, k_h, k_w, hpad, wpad,
                                                    hstride, wstride, out_dtype)

    # Create TVM placeholders
    data = tvm.placeholder(data_shape, name='data', dtype=data_dtype)
    kernel = tvm.placeholder(kernel_shape, name='kernel', dtype=kernel_dtype)

    # Create the numpy arrays to be used for executing conv models
    if data_dtype == 'float32':
        data_array = tvm.nd.array(np.random.rand(*data_shape).astype(dtype=data_dtype), CTX)
        kernel_array = tvm.nd.array(np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), CTX)
    else:
        data_array = tvm.nd.array(np.random.randint(100, size=data_shape).astype(data_dtype))
        kernel_array = tvm.nd.array(np.random.randint(100, size=kernel_shape).astype(kernel_dtype))

    # c_orig will be used for declaration output
    # c_sch will be used for scheduled computation output
    c_orig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX)
    c_sch = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX)

    with tvm.target.create(TARGET_NAME):
        conv = topi.nn.conv2d_NCHWc(data, kernel, stride=hstride, padding=hpad,
                                    layout='NCHWc', out_layout='NCHWc', out_dtype=out_dtype)
        out = topi.nn.relu(conv)
        sch = tvm.create_schedule(out.op)
        func = tvm.build(sch, [data, kernel, out], target=TARGET_NAME, name='out')
        func(data_array, kernel_array, c_orig)
        LOGGER.debug(tvm.lower(sch, [data, kernel], simple_mode=True))

        # Generate and run the optimized schedule
        sconv = topi.generic.nn.schedule_conv2d_NCHWc(outs=[out])
        func = tvm.build(sconv, [data, kernel, out], target=TARGET_NAME, name='conv')
        func(data_array, kernel_array, c_sch)

        # Functional check
        if data_dtype == 'uint8':
            np.testing.assert_equal(c_orig.asnumpy(), c_sch.asnumpy())
        else:
            assert np.allclose(c_orig.asnumpy(), c_sch.asnumpy())

        evaluator = func.time_evaluator(func.entry_name, CTX, number=1000)
        LOGGER.debug(tvm.lower(sconv, [data, kernel], simple_mode=True))
        return evaluator(data_array, kernel_array, c_sch).mean
def test_add_pipeline(): nn = 64 max_threads = 4 n = tvm.convert(nn) A = tvm.placeholder((n,), name='A') def extern_generator(ins, outs): """Manually write the IR for the extern function, add pipeline""" ib = tvm.ir_builder.create() with ib.for_range(0, (n+1) // 2) as i: ib.emit(outs[0].vstore(i*2, ins[0].vload(i*2, "float32x2") + tvm.const(1, "float32x2"))) return ib.get() def extern_generator_gpu(ins, outs): """Manually write the IR for the extern function, add pipeline""" ib = tvm.ir_builder.create() bx = tvm.thread_axis("blockIdx.x") tx = tvm.thread_axis("threadIdx.x") ib.scope_attr(bx, "thread_extent", (nn+max_threads-1) // max_threads) ib.scope_attr(tx, "thread_extent", max_threads) idx = bx.var * max_threads + tx.var with ib.if_scope(ib.likely(idx < n)): ib.emit(outs[0].vstore(idx*2, ins[0].vload(idx*2, "float32x2") + tvm.const(1, "float32x2"))) return ib.get() C_cpu = tvm.extern(A.shape, [A], extern_generator, name='C') C_gpu = tvm.extern(A.shape, [A], extern_generator_gpu, name='C') s_cpu = tvm.create_schedule(C_cpu.op) s_gpu = tvm.create_schedule(C_gpu.op) print(tvm.lower(s_cpu, [A, C_cpu], simple_mode=True)) print(tvm.lower(s_gpu, [A, C_gpu], simple_mode=True)) def check_target(target): if not tvm.module.enabled(target): return s = s_gpu if target in ['opencl', 'cuda'] else s_cpu C = C_gpu if target in ['opencl', 'cuda'] else C_cpu # build and invoke the kernel. f = tvm.build(s, [A, C], target) ctx = tvm.context(target, 0) # launch the kernel. n = nn a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1) check_target("llvm") check_target("opencl") check_target("cuda")
def check_device(device, target_device): if not tvm.module.enabled(target_device): print("Skip test because {} is not enabled.".format(target_device)) return device_ctx = tvm.context(device) graph = get_simplex_graph(host_ctx.device_type, device_ctx.device_type) shape = (4,) # Create module for add whose target is the device. tensor_a = tvm.placeholder(shape, name="A") tensor_b = tvm.placeholder(shape, name="B") elemwise_add = tvm.compute(shape, lambda *i: tensor_a(*i) + tensor_b(*i), name="elemwise_add") target = topi.cpp.TEST_create_target(device) schedule_add = topi.cpp.cuda.schedule_injective(target, [elemwise_add]) lower_add = tvm.lower(schedule_add, [tensor_a, tensor_b, elemwise_add], name="elemwise_add") # Insert copy. Neither compute nor schedule is required for the copy # node. The compute will be performed at runtime which is just data # copy from the input to the output. tensor_copy = tvm.placeholder(shape, name="__copy") # Create module for sub whose target is the host. tensor_c = tvm.placeholder(shape, name="C") elemwise_sub = tvm.compute(shape, lambda *i: tensor_copy(*i) - tensor_c(*i), name="elemwise_sub") schedule_sub = tvm.create_schedule(elemwise_sub.op) lower_sub = tvm.lower(schedule_sub, [tensor_copy, tensor_c, elemwise_sub], name="elemwise_sub") target_flist = {target_device: [lower_add], target_host: [lower_sub]} mhost = tvm.build(target_flist, target_host=target_host) ctx = [host_ctx, device_ctx] mod = graph_runtime.create(graph, mhost, ctx) params = {} params["A"] = tensor_a = np.random.uniform( size=shape).astype(tensor_a.dtype) params["B"] = tensor_b = np.random.uniform( size=shape).astype(tensor_b.dtype) params["C"] = tensor_c = np.random.uniform( size=shape).astype(tensor_c.dtype) mod.set_input(**params) mod.run() out = mod.get_output(0, tvm.nd.empty(shape)) np.testing.assert_equal( out.asnumpy(), (tensor_a + tensor_b) - tensor_c)
def _lower(sch, inputs, func_name, graph):
    import traceback
    # pylint: disable=broad-except
    try:
        f = tvm.lower(sch, inputs, name=func_name)
        logging.debug("lower function %s", func_name)
        logging.debug("%s", tvm.lower(sch, inputs, simple_mode=True))
    except Exception:
        msg = traceback.format_exc()
        msg += "Error during compile graph\n"
        msg += "--------------------------\n"
        msg += graph.ir(join_entry_attrs=["shape"])
        raise RuntimeError(msg)
    return f if isinstance(
        f, (tvm.container.Array, tuple, list)) else [f]
def lower(*args, **kwargs):
    """Thin wrapper of tvm.lower

    This wrapper automatically applies VTA's build_config
    if there is no user specified build_config in context.

    See Also
    --------
    tvm.lower : The original TVM's lower function
    """
    cfg = tvm.build_module.current_build_config()
    if not cfg.add_lower_pass:
        with build_config():
            return tvm.lower(*args, **kwargs)
    return tvm.lower(*args, **kwargs)
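# A minimal usage sketch of the wrapper above (illustrative only: it assumes the
# wrapper is exposed as ``vta.lower`` and that a schedule ``s`` and tensors ``A``,
# ``B`` already exist in the surrounding code):
#
#     import vta
#
#     # No build_config in context: VTA's default build_config is applied implicitly.
#     mod = vta.lower(s, [A, B], name="vadd")
#
#     # A user-supplied config in context takes precedence; the wrapper leaves it alone.
#     with vta.build_config():
#         mod = vta.lower(s, [A, B], name="vadd")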
def test_local_gemm(): if not tvm.module.enabled("opengl"): return if not tvm.module.enabled("llvm"): return nn = 1024 n = tvm.var('n') n = tvm.convert(nn) m = n l = n A = tvm.placeholder((n, l), name='A', dtype='int32') B = tvm.placeholder((m, l), name='B', dtype='int32') k = tvm.reduce_axis((0, l), name='k') C = tvm.compute((n, m), lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k), name='CC') s = tvm.create_schedule(C.op) s[C].opengl() print(tvm.lower(s, [A, B, C], simple_mode=True)) f = tvm.build(s, [A, B, C], "opengl", name="gemm") print("------opengl code------") print(f.imported_modules[0].get_source(fmt="gl")) ctx = tvm.opengl() n, m, l = nn, nn, nn a_np = np.random.uniform(low=0, high=10, size=(n, l)).astype(A.dtype) b_np = np.random.uniform(low=0, high=10, size=(m, l)).astype(B.dtype) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(a_np, b_np.T))
def test_in_bounds_conv_llvm(loop_tiling=False):
    HSTR = WSTR = 1
    in_channel = 128
    kernel_height = kernel_width = 3
    out_channel = 64
    batch_size = 1
    in_height = in_width = 64
    out_height = out_width = in_height - kernel_height + 1
    data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data')
    kernel = tvm.placeholder((kernel_height, kernel_width, in_channel, out_channel), name='kernel')
    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')
    conv = tvm.compute((batch_size, out_channel, out_height, out_width),
                       lambda n, oc, oh, ow: tvm.sum(data[n, ic, oh*HSTR + kh, ow*WSTR + kw] *
                                                     kernel[kh, kw, ic, oc],
                                                     axis=[ic, kh, kw]),
                       name="conv2d")
    s = tvm.create_schedule(conv.op)
    n, oc, oh, ow = conv.op.axis
    if loop_tiling:
        oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
    lowered_func = tvm.lower(s, [data, kernel, conv], simple_mode=True)
    print(lowered_func.body)
    ctx = tvm.cpu(0)

    f = tvm.build(s, [data, kernel, conv], "llvm")
    data_input = tvm.nd.array(np.random.uniform(
        size=(batch_size, in_channel, in_height, in_width)).astype(tvm.float32), ctx)
    kernel_input = tvm.nd.array(np.random.uniform(
        size=(kernel_height, kernel_width, in_channel, out_channel)).astype(tvm.float32), ctx)
    conv_out = tvm.nd.empty((batch_size, out_channel, out_height, out_width), tvm.float32, ctx)
    f(data_input, kernel_input, conv_out)
def test_in_bounds_vectorize_llvm():
    n = 512
    lanes = 2
    A = tvm.placeholder((n,), name='A', dtype="float32x%d" % lanes)
    B = tvm.compute((n,), lambda i: A[i], name='B')
    C = tvm.compute((n,), lambda i: B[i] + tvm.const(1, A.dtype), name='C')
    s = tvm.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], nparts=2)
    _, xi = s[C].split(xi, factor=2)
    s[C].parallel(xo)
    s[C].vectorize(xi)
    s[B].compute_at(s[C], xo)
    xo, xi = s[B].split(B.op.axis[0], factor=2)
    s[B].vectorize(xi)
    # build and invoke the kernel.
    lowered_func = tvm.lower(s, [A, C], "llvm", simple_mode=False)
    print(lowered_func.body)
    f = tvm.build(s, [A, C], "llvm")
    ctx = tvm.cpu(0)
    # launch the kernel.
    a = tvm.nd.empty((n,), A.dtype).copyfrom(
        np.random.uniform(size=(n, lanes)))
    c = tvm.nd.empty((n,), C.dtype, ctx)
    f(a, c)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
def test_upstream(): @tvm.hybrid.script def upstream(a): b = output_tensor((20, ), 'float32') for i in range(20): b[i] = a[i] * i return b a = tvm.placeholder((20, ), 'float32') b = tvm.placeholder((20, ), 'float32') c = tvm.compute((20, ), lambda x: a[x] + b[x]) d = upstream(c) sch = tvm.create_schedule([c.op, d.op]) ir = tvm.lower(sch, [a, b, d], simple_mode=True) func = tvm.build(sch, [a, b, d]) assert(func) a = numpy.random.randn(20).astype('float32') b = numpy.random.randn(20).astype('float32') ref = numpy.zeros((20, ), 'float32') for i in range(20): ref[i] = (a[i] + b[i]) * i tvm_a = tvm.nd.array(a) tvm_b = tvm.nd.array(b) tvm_d = tvm.nd.array(numpy.zeros((20, )).astype('float32')) func(tvm_a, tvm_b, tvm_d) tvm.testing.assert_allclose(tvm_d.asnumpy(), ref, 1e-5, 1e-5)
def test_add_pipeline(): nn = 1024 n = tvm.convert(nn) A = tvm.placeholder((n,), name='A') def extern_generator(ins, outs): """Manually write the IR for the extern function, add pipeline""" ib = tvm.ir_builder.create() with ib.for_range(0, n/2) as i: ib.emit(outs[0].vstore(i*2, ins[0].vload(i*2, "float32x2") + tvm.const(1, "float32x2"))) return ib.get() C = tvm.extern(A.shape, [A], extern_generator, name='C') s = tvm.create_schedule(C.op) print(tvm.lower(s, [A, C], simple_mode=True)) def check_llvm(): if not tvm.module.enabled("llvm"): return # build and invoke the kernel. f = tvm.build(s, [A, C], "llvm") ctx = tvm.cpu(0) # launch the kernel. n = nn a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) f(a, c) np.testing.assert_allclose( c.asnumpy(), a.asnumpy() + 1) check_llvm()
def test_double_splitting_with_indivisible_factors():
    m = 48
    dtype = "float32"
    A = tvm.placeholder((m,), name='A', dtype=dtype)
    C = tvm.compute((m,), lambda i: A[i], name='C')
    D = tvm.compute((m,), lambda i: C[i], name='D')

    s = tvm.create_schedule(D.op)
    co, ci = s[C].split(C.op.axis[0], factor=10)
    do, di = s[D].split(D.op.axis[0], 32)
    s[C].compute_at(s[D], do)

    target = 'llvm'
    with tvm.build_config(partition_const_loop=True):
        f = tvm.lower(s, [A, C, D], name="fadd1", simple_mode=False)
        func = tvm.build(f, target=target)

    # Find the beginning of the Halide IR corresponding to the kernel code
    # and make sure it doesn't have any if statements left.
    top_produce = find_top_produce(f.body)
    assert(not any(collect_visit(top_produce, lambda x: isinstance(x, tvm.stmt.IfThenElse))))

    # check functional correctness of generated code
    ctx = tvm.context(target, 0)
    a = tvm.nd.array(numpy.ones(m,).astype(dtype), ctx)
    c = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx)
    d = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx)
    func(a, c, d)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy(), rtol=1e-5)
    tvm.testing.assert_allclose(d.asnumpy(), a.asnumpy(), rtol=1e-5)
def check_c():
    if not tvm.module.enabled("llvm"):
        return
    # Specifically allow an offset to exercise the code path where an elem_offset is available
    Ab = tvm.decl_buffer(
        A.shape, A.dtype,
        elem_offset=tvm.var('Aoffset'),
        offset_factor=8,
        name='A')
    binds = {A: Ab}
    # Build and invoke the kernel.
    f1 = tvm.lower(s, [A, B, C], name="fadd_pipeline")
    fsplits = [x for x in tvm.ir_pass.SplitHostDevice(f1)]
    fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0])
    mhost = tvm.codegen.build_module(fsplits[0], "c")
    temp = util.tempdir()
    path_dso = temp.relpath("temp.so")
    mhost.export_library(path_dso)
    m = tvm.module.load(path_dso)
    fadd = m["fadd_pipeline"]
    ctx = tvm.cpu(0)
    # launch the kernel.
    n = nn
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
    fadd(a, b, c)
    tvm.testing.assert_allclose(
        c.asnumpy(), a.asnumpy() + b.asnumpy())
def conv_normal(print_ir):
    print("----- CONV2D CPU End-to-End Test-------")
    s = topi.generic.schedule_conv2d_nchw([res])
    if print_ir:
        print(tvm.lower(s, [data, kernel, res], simple_mode=True))
    cost = verify(s, True)
    gops = (num_ops / cost.mean) / float(10 ** 9)
    print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops))
def run_schedule(load_inp, load_wgt, gemm, alu, store_out, print_ir, check_correctness): s = tvm.create_schedule(res.op) s[data_buf].set_scope(env.inp_scope) s[weight_buf].set_scope(env.wgt_scope) s[res_gem].set_scope(env.acc_scope) s[res_shf].set_scope(env.acc_scope) s[res_min].set_scope(env.acc_scope) s[res_max].set_scope(env.acc_scope) if block: bblock = block // env.BATCH iblock = block // env.BLOCK_IN oblock = block // env.BLOCK_OUT xbo, xco, xbi, xci = s[res].op.axis xb1, xco1, xb2, xco2 = s[res].tile(xbo, xco, bblock, oblock) store_pt = xb2 s[res_gem].compute_at(s[res], xco1) s[res_shf].compute_at(s[res], xco1) s[res_min].compute_at(s[res], xco1) s[res_max].compute_at(s[res], xco1) xbo, xco, xbi, xci = s[res_gem].op.axis # Compute one line at a time ko1, ko2 = s[res_gem].split(ko, iblock) s[res_gem].reorder(ko1, ko2, xbo, xco, xbi, xci, ki) s[data_buf].compute_at(s[res_gem], ko1) s[weight_buf].compute_at(s[res_gem], ko1) # Use VTA instructions s[data_buf].pragma(s[data_buf].op.axis[0], load_inp) s[weight_buf].pragma(s[weight_buf].op.axis[0], load_wgt) s[res_gem].tensorize(xbi, gemm) s[res_shf].pragma(s[res_shf].op.axis[0], alu) s[res_min].pragma(s[res_min].op.axis[0], alu) s[res_max].pragma(s[res_max].op.axis[0], alu) s[res].pragma(store_pt, store_out) else: xbo, xco, xbi, xci = s[res_gem].op.axis s[res_gem].reorder(ko, xbo, xco, xbi, xci, ki) # Use VTA instructions s[data_buf].pragma(s[data_buf].op.axis[0], load_inp) s[weight_buf].pragma(s[weight_buf].op.axis[0], load_wgt) s[res_gem].tensorize(xbi, gemm) s[res_shf].pragma(s[res_shf].op.axis[0], alu) s[res_min].pragma(s[res_min].op.axis[0], alu) s[res_max].pragma(s[res_max].op.axis[0], alu) s[res].pragma(s[res].op.axis[0], store_out) if print_ir: print(tvm.lower(s, [data, weight, res], simple_mode=True)) return verify(s, check_correctness)
def main():
    n = tvm.var('n')
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    s = tvm.create_schedule(C.op)
    s[C].parallel(s[C].op.axis[0])
    print(tvm.lower(s, [A, B, C], simple_mode=True))
    tvm.build(s, [A, B, C], 'llvm --system-lib').save(osp.join(sys.argv[1], 'test.o'))
def check(factor): s = tvm.create_schedule(z.op) xo, xi = s[z].split(z.op.axis[0], factor=factor) vadd = intrin_vadd(factor) s[z].tensorize(xi, vadd) s = s.normalize() dom_map = tvm.schedule.InferBound(s) finfer = tvm.get_global_func("test.op.InferTensorizeRegion") out_dom, in_dom = finfer(s[z], dom_map) assert tvm.ir_pass.Equal(out_dom[z.op.axis[0]].extent, factor) assert tvm.ir_pass.Equal(out_dom[z.op.axis[0]].min, xo * factor) assert tvm.ir_pass.Equal(in_dom.items()[0][1][0].extent, factor) fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") body = fmatch(s[z], out_dom, in_dom, vadd) assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]), tvm.ir_pass.CanonicalSimplify(vadd.op.body[0])) stmt = tvm.schedule.ScheduleOps(s, dom_map) tvm.lower(s, [x, y, z])
def check(factor): s = tvm.create_schedule(C.op) x, y = C.op.axis yo, yi = s[C].split(y, factor=factor) gemv = intrin_gemv(factor, l) s[C].tensorize(yi, gemv) s = s.normalize() dom_map = tvm.schedule.InferBound(s) finfer = tvm.get_global_func("test.op.InferTensorizeRegion") out_dom, in_dom = finfer(s[C], dom_map) assert tvm.ir_pass.Equal(out_dom[x].extent, 1) assert tvm.ir_pass.Equal(out_dom[y].extent, factor) assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor) fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") body = fmatch(s[C], out_dom, in_dom, gemv) assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]), tvm.ir_pass.CanonicalSimplify(gemv.op.body[0])) stmt = tvm.schedule.ScheduleOps(s, dom_map) tvm.lower(s, [A, B, C])
def prepare_test_libs(base_path):
    n = tvm.var('n')
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1, name='B')
    s = tvm.create_schedule(B.op)
    s[B].parallel(s[B].op.axis[0])
    print(tvm.lower(s, [A, B], simple_mode=True))
    # Compile library in system library mode
    fadd_syslib = tvm.build(s, [A, B], 'llvm --system-lib', name='addonesys')
    syslib_path = osp.join(base_path, 'test_addone_sys.o')
    fadd_syslib.save(syslib_path)
def check_llvm(): if not tvm.module.enabled("llvm"): return # build two functions f2 = tvm.lower(s, [A, B, C], name="fadd1") f1 = tvm.lower(s, [A, B, C], name="fadd2") m = tvm.build([f1, f2], "llvm") fadd1 = m['fadd1'] fadd2 = m['fadd2'] ctx = tvm.cpu(0) # launch the kernel. n = nn a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) fadd1(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), a.asnumpy() + b.asnumpy()) fadd2(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), a.asnumpy() + b.asnumpy())
def check_rfactor_no_reset_multi_reduction(factor, rfactor): s = tvm.create_schedule(C.op) x, y = C.op.axis rk = C.op.reduce_axis[0] yo, yi = s[C].split(y, factor=factor) ro, ri = s[C].split(rk, factor=rfactor) roo, roi = s[C].split(ro, factor=2) s[C].reorder(yo, roo, roi, yi, ri) gemv = intrin_gemv_no_reset(factor, rfactor) s[C].tensorize(yi, gemv) s = s.normalize() dom_map = tvm.schedule.InferBound(s) finfer = tvm.get_global_func("test.op.InferTensorizeRegion") out_dom, in_dom = finfer(s[C], dom_map) assert tvm.ir_pass.Equal(out_dom[x].extent, 1) assert tvm.ir_pass.Equal(out_dom[y].extent, factor) assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor) fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") body = fmatch(s[C], out_dom, in_dom, gemv) assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]), tvm.ir_pass.CanonicalSimplify(gemv.op.body[0])) stmt = tvm.schedule.ScheduleOps(s, dom_map) tvm.lower(s, [A, B, C])
def _lower(sch, inputs, func_name, graph):
    import traceback
    # pylint: disable=broad-except
    try:
        f = tvm.lower(sch, inputs, name=func_name)
        if "quantized_conv2d" in func_name:
            logging.info(graph.ir(join_entry_attrs=["shape"]))
    except Exception:
        msg = traceback.format_exc()
        msg += "Error during compile graph\n"
        msg += "--------------------------\n"
        msg += graph.ir(join_entry_attrs=["shape"])
        raise RuntimeError(msg)
    return f if isinstance(
        f, (tvm.container.Array, tuple, list)) else [f]
def check_device(device):
    if not tvm.module.enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    print("Running on target: %s" % device)
    with tvm.target.create(device):
        s = topi.generic.schedule_pool(B)
    ctx = tvm.context(device, 0)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
    print(tvm.lower(s, [A, B], simple_mode=True))
    f = tvm.build(s, [A, B], device)
    f(a, b)
    tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def test_lower_rfactor(): n = tvm.var("n") m = tvm.var("m") A = tvm.placeholder((n, m), name='A') k = tvm.reduce_axis((0, m), "k") B = tvm.compute((n,), lambda i: tvm.sum(A[i, k], axis=k), name="B") s = tvm.create_schedule(B.op) ko, ki = s[B].split(B.op.reduce_axis[0], factor=16) BF = s.rfactor(B, ki) xo, xi = s[B].split(s[B].op.axis[0], factor=32) s[B.op].bind(xo, tvm.thread_axis("blockIdx.x")) s[B.op].bind(xi, tvm.thread_axis("threadIdx.y")) s[B].bind(s[B].op.reduce_axis[0], tvm.thread_axis("threadIdx.x")) s[BF].compute_at(s[B], s[B].op.reduce_axis[0]) fapi = tvm.lower(s, [A, B])
def test_tensor_intrin_scalar_params(): n = te.size_var("n") x = te.placeholder((n, ), name="x") v = te.size_var("v") w = te.size_var("w") z = te.compute((n, ), lambda i: x[i] * v + w, name="z") def intrin_func(ins, outs, sp): assert isinstance(ins[0], tvm.te.schedule.Buffer) assert ins[0].shape[0] == n assert sp[0] == v assert sp[1] == w return tvm.tir.call_packed("hw_func", ins[0].data, outs[0].data, sp[0], sp[1]) intrin = te.decl_tensor_intrin(z.op, intrin_func, scalar_params=[v, w], default_buffer_params={"offset_factor": 1}) assert intrin.op == z.op assert intrin.reduce_init is None assert tuple(intrin.inputs) == tuple(z.op.input_tensors) assert intrin.buffers[0].shape[0] == n assert tuple(intrin.scalar_params) == tuple((v, w)) A = te.placeholder((10, 10), name="A") # Pass scalar inputs to the TensorIntrin, interleaved with tensor inputs C = te.compute((10, 10), lambda i, j: intrin(i * i, A[i, j], i + j), name="C") s = te.create_schedule(C.op) stmt = tvm.lower(s, [A, C])["main"].body assert isinstance(stmt.body.body, tvm.tir.Evaluate) assert len(stmt.body.body.value.args) == 5 assert str(stmt.body.body.value.args[3]) == "(i: int32*i)" assert str(stmt.body.body.value.args[4]) == "(i: int32 + j: int32)"
def mod(self, target, load_type, store_type, indirect_indices): target = tvm.target.Target(target) n = 4 dtype = "int32" A = te.placeholder((n, ), dtype=dtype, name="A") R = te.placeholder((n, ), dtype=dtype, name="R") def do_compute(ins, outs): ib = tvm.tir.ir_builder.create() A, R = map(ib.buffer_ptr, ins) B = ib.buffer_ptr(outs[0]) if "gpu" in target.keys: ib.scope_attr(te.thread_axis("blockIdx.x"), "thread_extent", 0) index_map = { "ramp": tvm.tir.Ramp(0, 1, 4), "broadcast": tvm.tir.Broadcast(0, 4), } load_index = index_map[load_type] store_index = index_map[store_type] if indirect_indices: load_index = tvm.tir.expr.Load("int32x4", R, load_index) transfer = tvm.tir.expr.Load("int32x4", A, load_index) ib.emit(tvm.tir.stmt.Store(B, transfer, store_index)) return ib.get() B = te.extern(A.shape, [A, R], do_compute, dtype="int32") s = te.create_schedule(B.op) return tvm.lower(s, [A, R, B])
def test_lower_warp_memory_same_thread(): m = n = 128 A = te.placeholder((m, n), name="A") k = te.reduce_axis((0, n), name="k") B = te.compute((m,), lambda i: te.sum(A[i, k], axis=[k])) s = te.create_schedule(B.op) BB = s.cache_write(B, "warp") tx = te.thread_axis("threadIdx.x") xo, xi = s[B].split(B.op.axis[0], factor=32) s[B].bind(xi, tx) s[B].bind(xo, te.thread_axis("blockIdx.x")) s[BB].compute_at(s[B], xo) xo, xi = s[BB].split(s[BB].op.axis[0], factor=32) s[BB].bind(xi, tx) cuda_target = tvm.target.Target("cuda") assert cuda_target.thread_warp_size == 32 mod = tvm.lower(s, [A, B], name="f") mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", cuda_target))(mod) fdevice = tvm.tir.transform.SplitHostDevice()(mod)["f_kernel0"] mod = tvm.IRModule.from_expr(fdevice) fdevice = tvm.tir.transform.LowerWarpMemory()(mod)["f_kernel0"] assert "tvm_warp_shuffle" not in fdevice.astext()
def test_multilevel_splitting_with_indivisible_factors():
    from tvm import topi
    A = te.placeholder((130,), dtype="float32")
    B = topi.nn.relu(A)
    s = te.create_schedule(B.op)
    (y,) = s[B].op.axis
    (yo, yi) = s[B].split(y, factor=8)
    (yoo, yoi) = s[B].split(yo, factor=16)
    s[B].reorder(yoo, yoi, yi)
    s[B].unroll(yi)

    ## But this does the right thing.
    with tvm.transform.PassContext(
            config={"tir.LoopPartition": {"partition_const_loop": True}}):
        lowered_body = tvm.lower(s, [A, B], name="x")["x"].body

        def visit_stmt(op):
            return isinstance(op, tvm.tir.Max)

        num_max = collect_visit(lowered_body, visit_stmt)
        assert num_max.count(True) == 10
def test_in_bounds_vectorize_llvm(): n = 512 lanes = 2 A = te.placeholder((n, ), name='A', dtype="float32x%d" % lanes) B = te.compute((n, ), lambda i: A[i], name='B') C = te.compute((n, ), lambda i: B[i] + tvm.tir.const(1, A.dtype), name='C') s = te.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], nparts=2) _, xi = s[C].split(xi, factor=2) s[C].parallel(xo) s[C].vectorize(xi) s[B].compute_at(s[C], xo) xo, xi = s[B].split(B.op.axis[0], factor=2) s[B].vectorize(xi) # build and invoke the kernel. lowered_func = tvm.lower(s, [A, C], "llvm", simple_mode=False) f = tvm.build(s, [A, C], "llvm") ctx = tvm.cpu(0) # launch the kernel. a = tvm.nd.empty((n, ), A.dtype).copyfrom(np.random.uniform(size=(n, lanes))) c = tvm.nd.empty((n, ), C.dtype, ctx) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
def test_tensor_intrin_scalar_params(): n = tvm.size_var("n") x = tvm.placeholder((n, ), name='x') v = tvm.size_var("v") w = tvm.size_var("w") z = tvm.compute((n, ), lambda i: x[i] * v + w, name='z') def intrin_func(ins, outs, sp): assert (isinstance(ins[0], tvm.schedule.Buffer)) assert (ins[0].shape[0] == n) assert (sp[0] == v) assert (sp[1] == w) return tvm.call_packed("hw_func", ins[0].data, outs[0].data, sp[0], sp[1]) with tvm.build_config(offset_factor=1): intrin = tvm.decl_tensor_intrin(z.op, intrin_func, scalar_params=[v, w]) assert intrin.op == z.op assert intrin.reduce_init is None assert tuple(intrin.inputs) == tuple(z.op.input_tensors) assert (intrin.buffers[0].shape[0] == n) assert tuple(intrin.scalar_params) == tuple((v, w)) A = tvm.placeholder((10, 10), name='A') # Pass scalar inputs to the TensorIntrin, interleaved with tensor inputs C = tvm.compute((10, 10), lambda i, j: intrin(i * i, A[i, j], i + j), name="C") s = tvm.create_schedule(C.op) stmt = tvm.lower(s, [A, C], simple_mode=True) assert isinstance(stmt.body.body.body, tvm.tir.Evaluate) assert len(stmt.body.body.body.value.args) == 5 assert str(stmt.body.body.body.value.args[3]) == "(i*i)" assert str(stmt.body.body.body.value.args[4]) == "(i + j)"
def test_local_gemm(): if not tvm.module.enabled("opengl"): return if not tvm.module.enabled("llvm"): return nn = 1024 n = tvm.var('n') n = tvm.convert(nn) m = n l = n A = tvm.placeholder((n, l), name='A', dtype='int32') B = tvm.placeholder((m, l), name='B', dtype='int32') k = tvm.reduce_axis((0, l), name='k') C = tvm.compute((n, m), lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k), name='CC') s = tvm.create_schedule(C.op) s[C].opengl() print(tvm.lower(s, [A, B, C], simple_mode=True)) f = tvm.build(s, [A, B, C], "opengl", name="gemm") print("------opengl code------") print(f.imported_modules[0].get_source(fmt="gl")) ctx = tvm.opengl() n, m, l = nn, nn, nn a_np = np.random.uniform(low=0, high=10, size=(n, l)).astype(A.dtype) b_np = np.random.uniform(low=0, high=10, size=(m, l)).astype(B.dtype) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) f(a, b, c) np.testing.assert_allclose(c.asnumpy(), np.dot(a_np, b_np.T))
def test_large_input(): @te.hybrid.script def compute(a, b): n = 16384 c = output_tensor((n, n), "int32") for i in range(n): for j in range(n): c[i, j] = a[i, j] - b[i, j] return c n = 16384 shape = (n, n) a = te.placeholder(shape, name="a", dtype="int32") b = te.placeholder(shape, name="b", dtype="int32") c = te.compute(shape, lambda i, j: compute(a, b)[i, j]) c = te.compute(shape, lambda i, j: 1 + c[i, j]) s = te.create_schedule(c.op) stmt = tvm.lower(s, [a, b, c])["main"].body def verify(n): if isinstance(n, tvm.tir.Allocate): assert n.extents[0].value == 268435456 tvm.tir.stmt_functor.post_order_visit(stmt, verify)
def test_scan_inline2(): m = te.var("m") n = te.var("n") x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x") s_state1 = te.placeholder((m, n)) s_state2 = te.placeholder((m, n)) s_init1 = te.compute((1, n), lambda _, i: x[0, i]) s_init2 = te.compute((1, n), lambda _, i: x[0, i]) s_xx = te.compute((m, n), lambda t, i: s_state1[t - 1, i] + x[t, i], name="xx") s_x1 = te.compute((m, n), lambda t, i: s_xx[t, i] + 1, name="x1") s_x2 = te.compute((m, n), lambda t, i: s_xx[t, i] + s_state2[t - 1, 2], name="x2") s_update1 = te.compute((m, n), lambda t, i: s_x1[t, i], "u1") s_update2 = te.compute((m, n), lambda t, i: s_x2[t, i], "u2") res1, res2 = tvm.te.scan([s_init1, s_init2], [s_update1, s_update2], [s_state1, s_state2]) s = te.create_schedule(res1.op) s[s_xx].compute_inline() s[s_x1].compute_inline() s[s_x2].compute_inline() stmt = tvm.lower(s, [x, res1, res2])
def try_yolo_conv(batch_size, config): global __COUNTER__ __COUNTER__ += 1 # get the compute yolo_conv = YoloConvLayer17() input_shape = yolo_conv.get_intput_shape() inputs = tvm.placeholder((batch_size, *input_shape), dtype="float32") weight = yolo_conv.get_weight() outputs = yolo_conv(inputs) s = tvm.create_schedule(outputs.op) schedule_yolo_conv_cuda(s, outputs, inputs, weight, config) arg_bufs = [inputs, weight, outputs] stmt = tvm.lower(s, arg_bufs, simple_mode=True) # print(stmt) dev_id = 0 ctx = tvm.nd.context("cuda", dev_id) max_dims = ctx.max_thread_dimensions kwargs = { "max_shared_memory_per_block": ctx.max_shared_memory_per_block, "max_threads_per_block": ctx.max_threads_per_block, "max_thread_x": max_dims[0], "max_thread_y": max_dims[1], "max_thread_z": max_dims[2] } verify = tvm.ir_pass.VerifyGPUCode(stmt, kwargs) print("%d. config is:\n %s" % (__COUNTER__, str(config))) if verify: print("Valid kernel") time_cost = _evaluate(s, arg_bufs, "cuda", dev_id, 10) print("Yolo conv17 use", time_cost, "ms\n") else: print("Invalid kernel") time_cost = float("inf") return time_cost
def _get_gaussian_map_sum_tvm_mod(): rows, cols = tvm.var('rows'), tvm.var('cols') # the shape of output n = tvm.var('n') # the number of samples data = tvm.placeholder((n, 3), name='data') ni = tvm.reduce_axis((0, n), name='ni') pi = tvm.const(np.pi) def _gaussian_map_sum(i, j): # i is row, j is col x, y = data[ni, 0], data[ni, 1] sigma = data[ni, 2] sigma2 = sigma * sigma v = tvm.if_then_else( tvm.all(x >= 0, x < cols, y >= 0, y < rows), tvm.exp(-(topi.power((x - j), 2) + topi.power( (y - i), 2)) / (2 * sigma2)) / (2 * pi * sigma2), 0) return tvm.sum(v, axis=ni) out = tvm.compute((rows, cols), _gaussian_map_sum, name='out') s = tvm.create_schedule(out.op) out_i = s[out].fuse(*out.op.axis) s[out].parallel(out_i) print(tvm.lower(s, [data], simple_mode=True)) return tvm.build(s, [data, out])
def check_device(target): num_step = n_num_step print(tvm.lower(s, [Xi2h, Wh2h, scan_h, scan_c], simple_mode=True)) flstm = tvm.build(s, [Xi2h, Wh2h, scan_h, scan_c], target) ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0) # launch the kernel. scan_h_np = np.zeros( (num_step, batch_size, num_hidden)).astype("float32") scan_c_np = np.zeros( (num_step, batch_size, num_hidden)).astype("float32") Xi2h_np = np.random.normal(size=(num_step, batch_size, 4, num_hidden)).astype("float32") Wh2h_np = np.random.normal(size=(4, num_hidden, num_hidden)).astype("float32") scan_h_a = tvm.nd.array(scan_h_np, ctx) scan_c_a = tvm.nd.array(scan_c_np, ctx) Xi2h_a = tvm.nd.array(Xi2h_np, ctx) Wh2h_a = tvm.nd.array(Wh2h_np, ctx) flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a) ctx.sync() # measure time cost of second step. evaluator = flstm.time_evaluator(flstm.entry_name, ctx, 1, repeat=1000) eval_result = evaluator(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a) print("Time cost=%g" % eval_result.mean)
def schedule_gpu1_1(four): neuron_i, synapse, temp, neuron_n = four sch = tvm.create_schedule(neuron_n.op) b, n = sch[neuron_n].op.axis print(sch[neuron_n].op.axis) no, ni = sch[neuron_n].split(n, nparts=49) noo, noi = sch[neuron_n].split(no, nparts=7) nio, nii = sch[neuron_n].split(ni, nparts=16) sch[temp].compute_at(sch[neuron_n], nii) block_x = tvm.thread_axis("blockIdx.x") block_y = tvm.thread_axis("blockIdx.y") block_z = tvm.thread_axis("blockIdx.z") thread_x = tvm.thread_axis((0, 8), "threadIdx.x") thread_y = tvm.thread_axis((0, 8), "threadIdx.y") thread_z = tvm.thread_axis((0, 8), "threadIdx.z") ro, ri = sch[temp].split(temp.op.reduce_axis[0], 4) roo, roi = sch[temp].split(ro, 4) sch[temp].vectorize(ri) sch[temp].unroll(roi) sch[neuron_n].bind(noo, block_y) sch[neuron_n].bind(noi, block_x) sch[neuron_n].bind(nio, thread_y) sch[neuron_n].bind(nii, thread_x) #sch[neuron_n].reorder(y, x, ky, kx, i, n, b) print(tvm.lower(sch, four, simple_mode=True)) func = tvm.build(sch, [neuron_i, synapse, neuron_n], target='cuda') assert func print('GPU compilation done...') return func
def run_and_check(func, args, outs, var_dict={}, target='llvm'):
    def tvm_val_2_py_val(val):
        val = tvm.ir_pass.Substitute(val, var_dict)
        val = tvm.ir_pass.Simplify(val)
        assert isinstance(val, (tvm.expr.IntImm, tvm.expr.UIntImm))
        return val.value

    ctx = tvm.context(target, 0)

    emu_args = []
    nd_args = []
    to_check = []
    for i in args:
        if isinstance(i, tvm.tensor.Tensor):
            shape = [tvm_val_2_py_val(j) for j in i.shape]
            if i in outs:
                emu_args.append(numpy.zeros(shape).astype(i.dtype))
                nd_args.append(tvm.nd.array(emu_args[-1], ctx))
                to_check.append((nd_args[-1], emu_args[-1]))
            else:
                emu_args.append(numpy.random.randn(*shape).astype(i.dtype))
                nd_args.append(tvm.nd.array(emu_args[-1], ctx))
        else:
            assert isinstance(i, tvm.expr.Var)
            emu_args.append(tvm_val_2_py_val(i))
            nd_args.append(emu_args[-1])

    func(*emu_args)

    lowered_func = tvm.lower(func(*args), args)
    module = tvm.build(lowered_func, target=target)
    assert module
    module(*nd_args)

    for nd, np in to_check:
        numpy.testing.assert_allclose(nd.asnumpy(), np, rtol=1e-5, atol=1e-5)
def test_large_input(): @tvm.hybrid.script def compute(a, b): n = 16384 c = output_tensor((n, n), 'int32') for i in range(n): for j in range(n): c[i, j] = a[i, j] - b[i, j] return c n = 16384 shape = (n, n) a = te.placeholder(shape, name='a', dtype='int32') b = te.placeholder(shape, name='b', dtype='int32') c = te.compute(shape, lambda i, j: compute(a, b)[i, j]) c = te.compute(shape, lambda i, j: 1 + c[i, j]) s = te.create_schedule(c.op) stmt = tvm.lower(s, [a, b, c], simple_mode=True) def verify(n): if isinstance(n, tvm.tir.Allocate): assert n.extents[0].value == 268435456 tvm.tir.ir_pass.PostOrderVisit(stmt, verify)
def test_ib(): print('aaaa') env = nnpu.get_env() nnpu.set_device(env) shape = (16, ) dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w'] a = tvm.placeholder(shape, dtype_w, name='a') w = shape[0] e = 16 def build_nms_ir(ten_in, ten_out): ib = tvm.ir_builder.create() imm_value = 10 ib.scope_attr(env.nnpu_axis, "coproc_scope", 0) p_in = ib.buffer_ptr(ten_in[0]) p_out = ib.buffer_ptr(ten_out[0]) #with ib.for_range(0,w, name="k") as k: with ib.for_range(0, w / e, name="i") as i: ib.emit( make_intrin_call( "void", 'VAddI', ten_out[0].access_ptr("w", 'uint32') + i * dtype_bytes(dtype_w), ten_in[0].access_ptr("r", 'uint32') + i * dtype_bytes(dtype_w), tvm.const(imm_value, 'float64'), env.cfg['vector_unit']['size'], 3)) stmt = ib.get() return stmt sph = ScheduleProcHelper() a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph) sph.MarkScope(a_buf) out = tvm.extern(a_buf.shape, [a_buf], build_nms_ir, in_buffers=[ tvm.decl_buffer(a_buf.shape, dtype_w, data_alignment=dtype_bytes(dtype_w), scope='local.nnpu_scratchpad0') ], out_buffers=[ tvm.decl_buffer(a_buf.shape, dtype_w, data_alignment=dtype_bytes(dtype_w), scope='local.nnpu_scratchpad0') ], dtype=dtype_w, name="test_ir") sph.MarkScope(out) out_host, out_dram = nnpu.utils.CopyBufToH(out, 'out', sph) s = tvm.create_schedule([out_host.op]) sph.Transform(s) print(tvm.lower(s, [a, out_host], simple_mode=True)) print(nnpu.lower(s, [a, out_host], simple_mode=True)) # exit(0) func = nnpu.build(s, [a, out_host], 'nnpu', 'llvm', name='nnpu_test') ctx = tvm.nd.TVMContext(13, 0) a_np = np.random.randint(size=(16, ), dtype=a.dtype, low=0, high=127) a_nd = tvm.nd.array(a_np, ctx) b_nd = tvm.nd.array(np.zeros(16, ).astype(out_host.dtype), ctx) func(a_nd, b_nd) print('a = ') print(a_np) print('xjb sum = ') print(b_nd.asnumpy()) return
# Run auto-tuning (search)
task.tune(tune_option)
# Apply the best schedule
sch, args = task.apply_best(log_file)

################################################################################
# Inspecting the Optimized Schedule
# ---------------------------------
# We can lower the schedule to see the IR after auto-scheduling.  The
# auto-scheduler correctly performs optimizations including multi-level tiling,
# layout transformation, parallelization, vectorization, unrolling, and
# operator fusion.

print("Lowered TIR:")
print(tvm.lower(sch, args, simple_mode=True))

################################################################################
# Check correctness and evaluate performance
# ------------------------------------------
# We build the binary and check its correctness and performance.

func = tvm.build(sch, args, target)
a_np = np.random.uniform(size=(N, L)).astype(np.float32)
b_np = np.random.uniform(size=(L, M)).astype(np.float32)
c_np = np.random.uniform(size=(N, M)).astype(np.float32)
out_np = a_np.dot(b_np) + c_np

dev = tvm.cpu()
a_tvm = tvm.nd.array(a_np, device=dev)
b_tvm = tvm.nd.array(b_np, device=dev)
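################################################################################
# The excerpt above stops before the actual check. A plausible continuation,
# following the same pattern (the ``c_tvm`` buffer and the evaluator settings
# below are illustrative, not part of the original listing):

c_tvm = tvm.nd.empty(out_np.shape, device=dev)
func(a_tvm, b_tvm, c_tvm)

# Compare against the numpy reference result.
np.testing.assert_allclose(out_np, c_tvm.numpy(), rtol=1e-3)

# Time the compiled function; the median over repeated runs smooths out noise.
evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500)
print("Execution time: %.3f ms" % (np.median(evaluator(a_tvm, b_tvm, c_tvm).results) * 1000))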
import tvm
from tvm import te
import numpy as np

######################################################################
# We first write a very simple vector add and build it with the default schedule.  Then, we use
# our customized lowering pass to manipulate the IR directly instead of using schedule primitives.
#

n = tvm.tir.const(128, "int32")
a = te.placeholder((n,), name="a")
b = te.placeholder((n,), name="b")
c = te.compute((n,), lambda i: a[i] + b[i], name='c')

sch = te.create_schedule(c.op)
ir = tvm.lower(sch, [a, b, c], simple_mode=True)
print(ir)

######################################################################
# Writing a Pass
# --------------
# Essentially, an "IR transformation pass" is a function which maps a statement to a new statement.
# Thus, we define this vectorize function and implement it step by step.
#

######################################################################
# TVM already provides two classes for users to both analyze and transform IR.
#
# IR Visitor
# ~~~~~~~~~~
# We can use ``tvm.tir.ir_pass.PostOrderVisit(stmt, func)`` to gather information from the Halide IR.
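######################################################################
# As a sketch of the visitor idea (the exact entry point has moved between TVM
# versions; recent releases expose it as ``tvm.tir.stmt_functor.post_order_visit``,
# and ``ir`` is then an IRModule rather than a bare statement), we can collect
# every ``For`` node whose extent is a compile-time constant:

loops = []

def find_loops(op):
    """Record every For node whose extent is a known constant."""
    if isinstance(op, tvm.tir.For) and isinstance(op.extent, tvm.tir.IntImm):
        loops.append(op)

# ``ir`` is the result of tvm.lower above; visit the body of its main function.
tvm.tir.stmt_functor.post_order_visit(ir["main"].body, find_loops)
print("%d constant-extent loops found" % len(loops))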
import tvm

n = 1024
m = 1024
A = tvm.placeholder((n, m), name='A')
k = tvm.reduce_axis((0, n), name='k')
l = tvm.reduce_axis((0, m), name='l')

B = tvm.compute((n,), lambda i: tvm.sum(A[i, l], axis=l), name='B')

s = tvm.create_schedule(B.op)
ko, ki = s[B].split(B.op.reduce_axis[0], factor=4)
print(tvm.lower(s, [A, B], simple_mode=True))
print("---------cutting line---------")

s[B].pragma(ki, "unroll")
print(tvm.lower(s, [A, B], simple_mode=True))
#b_shared = sch.cache_read(b, 'shared', [c_acc]) b_shared = b b_frag = sch.cache_read(b_shared, 'wmma.matrix_b', [c_acc]) sch[b_frag].compute_at(sch[c_acc], c_rio) bxo, bxi = sch[b_frag].split(sch[b_frag].op.axis[0], 16) byo, byi = sch[b_frag].split(sch[b_frag].op.axis[1], 16) sch[b_frag].reorder(bxo, byo, bxi, byi) sch[b_frag].pragma(bxo, 'tensorize', 'tensorcore.load_b') #sch[b_shared].compute_at(sch[c_acc], c_roi) import tensorizer with tvm.transform.PassContext( opt_level=4, config={'tir.add_lower_pass': [(1, tensorizer.rewrite)]}): #with tvm.transform.PassContext(opt_level=4): ir = tvm.lower(sch, [a, b, c]) module = tvm.build(sch, [a, b, c], 'nvptx') print(ir) #print(module.imported_modules[0].get_source()) np_a = np.random.randn(n, k).astype('float16') np_b = np.random.randn(k, m).astype('float16') np_c = np.random.randn(n, m).astype('float32') #np_a = np.ones((n, k)).astype('float16') #np_b = np.ones((k, m)).astype('float16') #np_c = np.ones((n, m)).astype('float32') #np_a = np.array(np.array(list(range(k)) * n) % 3).astype('float16') #np_a.shape = (n, k) #np_b = np.array(np.array(list(range(k)) * m) % 3).astype('float16')
from __future__ import absolute_import, print_function

import tvm
import topi
import numpy as np

if __name__ == '__main__':
    x, y = 100, 10
    a = tvm.placeholder((x, y, y), name='a')
    b = tvm.placeholder((y, y), name='b')
    c = a + b
    d = a * b
    e = topi.elemwise_sum([c, d])
    f = e / 2.0
    g = topi.sum(f)
    with tvm.target.cuda():
        sg = topi.generic.schedule_reduce(g)
        print(tvm.lower(sg, [a, b], simple_mode=True))
# -------------
# Let's revisit the sum of rows operation (equivalent to :code:`B = numpy.sum(A, axis=1)`).
# To compute the sum of rows of a two dimensional TVM tensor A, we should
# specify the symbolic operation as well as schedule as follows
#
n = tvm.var("n")
m = tvm.var("m")
A = tvm.placeholder((n, m), name='A')
k = tvm.reduce_axis((0, m), "k")
B = tvm.compute((n,), lambda i: tvm.sum(A[i, k], axis=k), name="B")
s = tvm.create_schedule(B.op)

######################################################################
# and to examine the IR code in human readable format, we can do
#
print(tvm.lower(s, [A], simple_mode=True))

######################################################################
# However, for such a common operation we had to define the reduce axis ourselves as well as explicit computation with
# :code:`tvm.compute`. Imagine for more complicated operations how many details we need to provide.
# Fortunately, we can replace those two lines with simple :code:`topi.sum`, much like :code:`numpy.sum`
#
C = topi.sum(A, axis=1)
ts = tvm.create_schedule(C.op)
print(tvm.lower(ts, [A], simple_mode=True))

######################################################################
# Numpy-style operator overloading
# --------------------------------
# We can add two tensors using :code:`topi.broadcast_add` that have correct (broadcastable with specific) shapes.
# Even shorter, TOPI provides operator overloading for such common operations. For example,
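######################################################################
# (A short sketch of what that overloading looks like in practice; the tensor
# shapes below are illustrative and mirror the standalone example elsewhere in
# this collection.)

x, y = 100, 10
a = tvm.placeholder((x, y, y), name='a')
b = tvm.placeholder((y, y), name='b')

c = topi.broadcast_add(a, b)  # explicit broadcasting add
d = a + b                     # the same computation via operator overloading
e = a * b                     # likewise for a broadcast multiply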
def test_gemm_gpu(N, times, bn, num_block, num_thread): assert (bn <= N) assert (num_thread * num_thread * 16 <= N) assert (num_block * num_block * 2 <= N) A = tvm.placeholder((N, N), name='A') B = tvm.placeholder((N, N), name='Btmp') k = tvm.reduce_axis((0, N), name='k') packedB = tvm.compute((N, N / bn, bn), lambda x, y, z: B[x, y * bn + z], name='B') C = tvm.compute((N, N), lambda ii, jj: tvm.sum( A[ii, k] * packedB[k, jj / bn, jj % bn], axis=k), name='C') s = tvm.create_schedule(C.op) CC = s.cache_write(C, "local") block_x = tvm.thread_axis("blockIdx.x") block_y = tvm.thread_axis("blockIdx.y") thread_x = tvm.thread_axis("threadIdx.x") thread_y = tvm.thread_axis("threadIdx.y") thread_xz = tvm.thread_axis((0, 2), "vthread", name="vx") thread_yz = tvm.thread_axis((0, 2), "vthread", name="vy") pby, pbi = s[packedB].split(packedB.op.axis[0], nparts=num_thread) pbx, pbj = s[packedB].split(packedB.op.axis[1], nparts=num_thread) s[packedB].bind(pby, thread_y) s[packedB].bind(pbx, thread_x) pbz, pbk = s[packedB].split(packedB.op.axis[2], factor=8) s[packedB].vectorize(pbk) by, yi = s[C].split(C.op.axis[0], nparts=num_block) bx, xi = s[C].split(C.op.axis[1], nparts=num_thread) s[C].bind(by, block_y) s[C].bind(bx, thread_y) s[C].reorder(by, bx, yi, xi) tyz, yi = s[C].split(yi, nparts=2) ty, yi = s[C].split(yi, nparts=num_block) txz, xi = s[C].split(xi, nparts=2) tx, xi = s[C].split(xi, nparts=num_thread) s[C].reorder(tyz, txz, ty, tx, yi, xi) s[C].bind(tyz, thread_yz) s[C].bind(txz, thread_xz) s[C].bind(ty, block_x) s[C].bind(tx, thread_x) xyi, xxi = s[C].split(xi, factor=8) s[C].reorder(tyz, txz, ty, tx, yi, xyi, xxi) s[C].vectorize(xxi) s[CC].compute_at(s[C], yi) yo, xo = CC.op.axis s[CC].reorder(k, yo, xo) xo, xi = s[CC].split(xo, factor=8) s[CC].vectorize(xi) ko, ki = s[CC].split(k, factor=2) s[CC].unroll(ki) print(tvm.lower(s, [A, B, C], simple_mode=True)) f = tvm.build(s, [A, B, C], "opencl", target_host=target, name="gemm_gpu") temp = util.tempdir() path_dso = temp.relpath("gemm_gpu.so") f.export_library(path_dso, ndk.create_shared) # connect to the proxy remote = rpc.connect(proxy_host, proxy_port, key=key) ctx = remote.cl(0) remote.upload(path_dso) f = remote.load_module("gemm_gpu.so") evaluate(f, ctx, N, times)
dispatch_context = autotvm.apply_history_best(log_file) best_config = dispatch_context.query(task.target, task.workload) print("\nBest config:") print(best_config) else: config = task.config_space.get(PRETUNED_INDEX) dispatch_context = autotvm.task.ApplyConfig(config) print("Using pretuned config:") print(config) with dispatch_context: with tvm.target.create("cuda"): s, arg_bufs = conv2d(N, H, W, CO, CI, KH, KW, strides, padding, scaling_factor) print(tvm.lower(s, arg_bufs, simple_mode=True)) func = tvm.build(s, arg_bufs) print(func.imported_modules[0].get_source()) # check correctness a_np = np.random.randint(size=(N, CI // BI, H, W, BI), low=-128, high=127, dtype='int8') w_np = np.random.randint(size=(CO // BO, CI // BI, KH, KW, BO, BI), low=-128, high=127, dtype='int8') a_np_ = a_np.transpose((0, 1, 4, 2, 3)).ravel().reshape(N, CI, H, W) w_np_ = w_np.transpose((0, 4, 1, 5, 2, 3)).ravel().reshape(CO, CI, KH, KW) #c_np = conv2d_nchw_python(a_np_, w_np_, strides, padding).astype('int8')
output = op.output(0) conv_out = op.input_tensors[0] kernel_pack = conv_out.op.input_tensors[1] kernel = kernel_pack.op.input_tensors[0] data_vec = conv_out.op.input_tensors[0] data = data_vec.op.input_tensors[0] data_pad = None if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.name: data_pad = data data = data_pad.op.input_tensors[0] s = _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_pack, conv_out, output, output) print(tvm.lower(s, [A, W, Conv], simple_mode=True)) conv_unpack = tvm.nd.array( np.zeros(get_const_tuple(Conv.shape), dtype=dtype), ctx) func = tvm.build(s, [A, W, Conv], device) time_f = func.time_evaluator(func.entry_name, ctx, number=2000) cost_unpack = time_f(tvm.nd.array(a_np), tvm.nd.array(w_np), conv_unpack).mean print('conv: %g ms/op' % (cost_unpack * 1000.0)) # W0 # batch_size, in_channel, in_size, num_filter, kernel_size, stride, padding = 1, 3, 224, 64, 7, 2, 3 # ic_bn, oc_bn, ur_w = 3, 16, 28 # verify(1, 3, 224, 64, 7, 2, 3) # W1 # batch_size, in_channel, in_size, num_filter, kernel_size, stride, padding = 1, 64, 56, 64, 3, 1, 1
def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_filter,
                  out_filter, k_h, k_w, hpad, wpad, hstride, wstride):
    """
    Runs the inference and checks the functional correctness between
    compute and schedule outputs
    """
    (data_shape, kernel_shape, o_shape) = get_shape(im_height, im_width, in_filter,
                                                    out_filter, k_h, k_w, hpad, wpad,
                                                    hstride, wstride, out_dtype)

    # Create TVM placeholders
    data = te.placeholder(data_shape, name='data', dtype=data_dtype)
    kernel = te.placeholder(kernel_shape, name='kernel', dtype=kernel_dtype)

    # Create the numpy arrays to be used for executing conv models
    if data_dtype == 'float32':
        data_array = tvm.nd.array(
            np.random.rand(*data_shape).astype(dtype=data_dtype), CTX)
        kernel_array = tvm.nd.array(
            np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), CTX)
    else:
        data_array = tvm.nd.array(
            np.random.randint(100, size=data_shape).astype(data_dtype))
        kernel_array = tvm.nd.array(
            np.random.randint(100, size=kernel_shape).astype(kernel_dtype))

    # c_orig will be used for declaration output
    # c_sch will be used for scheduled computation output
    c_orig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX)
    c_sch = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX)

    with tvm.target.Target(TARGET_NAME):
        conv = topi.nn.conv2d_NCHWc(data, kernel, stride=hstride, padding=hpad,
                                    dilation=(1, 1), layout='NCHWc',
                                    out_layout='NCHWc', out_dtype=out_dtype)
        out = topi.nn.relu(conv)
        sch = te.create_schedule(out.op)
        func = tvm.build(sch, [data, kernel, out], target=TARGET_NAME, name='out')
        func(data_array, kernel_array, c_orig)
        LOGGER.debug(tvm.lower(sch, [data, kernel], simple_mode=True))

        # Generate and run the optimized schedule
        sconv = topi.generic.nn.schedule_conv2d_NCHWc(outs=[out])
        func = tvm.build(sconv, [data, kernel, out], target=TARGET_NAME, name='conv')
        func(data_array, kernel_array, c_sch)

        # Functional check
        if data_dtype == 'uint8':
            np.testing.assert_equal(c_orig.asnumpy(), c_sch.asnumpy())
        else:
            assert np.allclose(c_orig.asnumpy(), c_sch.asnumpy())

        evaluator = func.time_evaluator(func.entry_name, CTX, number=1000)
        LOGGER.debug(tvm.lower(sconv, [data, kernel], simple_mode=True))
        return evaluator(data_array, kernel_array, c_sch).mean
type=str, default=None, dest='cuda_arch', help='The cuda arch for compiling kernels for') arguments = parser.parse_args() func_list_llvm = [] func_list_cuda = [] # TODO: attach instruction features to the library, e.g., avx-512, etc. for operator_def in __OP_DEF__: for sch, args, name in operator_def.invoke_all(): if tvm.module.enabled(get_target(operator_def.target)): func_list = func_list_llvm if operator_def.target == "cpu" else func_list_cuda func_lower = tvm.lower(sch, args, name=name, binds=operator_def.get_binds(args)) func_list.append(func_lower) lowered_funcs = {get_target("cpu"): func_list_llvm} if len(func_list_cuda) > 0: lowered_funcs[get_target("cuda")] = func_list_cuda cuda_arch = get_cuda_arch(arguments.cuda_arch) if cuda_arch is None: logging.info( 'No cuda arch specified. TVM will try to detect it from the build platform.' ) else: logging.info( 'Cuda arch {} set for compiling TVM operator kernels.'.format( cuda_arch))
name="res") ###################################################################### # Scheduling the Computation # -------------------------- # We'll look at a set of schedule transformations necessary to map the # matrix multiplications onto VTA in an efficient fashion. # Those include: # # - Computation blocking # - Lowering to VTA hardware intrinsics # Create TVM schedule s = te.create_schedule(res.op) # Let's look at the default TVM schedule print(tvm.lower(s, [data, weight, res], simple_mode=True)) ###################################################################### # Blocking the Computation # ~~~~~~~~~~~~~~~~~~~~~~~~ # The matrix multiplication is by default too large for activations or weights # to fit on VTA's on-chip buffers all at once. # We block the (1, 1024) by (1024, 1024) matrix multiplication into # smaller (1, 256) by (256, 256) matrix multiplications so the intermediate # tensors can fit on the accelerator's on-chip SRAM. # This approach is similar to blocking techniques applied to CPUs and GPUs in # order to increase cache hit rate. # # We perform blocking along each axes (the batch axis being untouched since # we are performing singe-batch inference). # We also leave the inner-most tensorization axes as-is in order to allow
func = tvm.build(sg, [a, b, g], 'cuda')
ctx = tvm.gpu(0)
a_np = np.random.uniform(size=(x, y, y)).astype(a.dtype)
b_np = np.random.uniform(size=(y, y)).astype(b.dtype)
g_np = np.sum(np.add(a_np + b_np, a_np * b_np) / 2.0)
a_nd = tvm.nd.array(a_np, ctx)
b_nd = tvm.nd.array(b_np, ctx)
g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), ctx)
func(a_nd, b_nd, g_nd)
np.testing.assert_allclose(g_nd.asnumpy(), g_np, rtol=1e-5)

# common neural nets
tarray = tvm.placeholder((512, 512), name="tarray")
softmax_topi = topi.nn.softmax(tarray)
with tvm.target.cuda():
    sst = topi.generic.schedule_softmax(softmax_topi)
    # print(tvm.lower(sst, [tarray], simple_mode=True))

# fusing conv
# fuse topi.nn.conv2d and topi.nn.relu together
data = tvm.placeholder((1, 3, 224, 224))
kernel = tvm.placeholder((10, 3, 5, 5))

conv = topi.nn.conv2d(data, kernel, strides=1, padding=2)
out = topi.nn.relu(conv)
with tvm.target.create('cuda'):
    # does every kind of operation really have its own dedicated schedule?
    sconv = topi.generic.nn.schedule_conv2d_nchw(out)
    print(tvm.lower(sconv, [data, kernel], simple_mode=True))
s = tvm.te.reduce_axis([0, S], "s") D = tvm.te.compute( [P, Q], lambda i, j: tvm.te.sum(A[i * R + r, j * S + s] * C[i * R + r, j * S + s], axis=[r, s]), name="D", requires_grad=True) E = mse_loss(D, label) dA, = tvm.te.mygradient(E, [A]) s = tvm.te.create_schedule([E.op, dA.op]) print(tvm.lower(s, [A, label, E, dA], simple_mode=True)) func = tvm.build(s, [A, label, E, dA], target="llvm") A_np = np.random.uniform(-10, 10, [H, W]).astype("float32") label_np = np.random.uniform(-10, 10, [P, Q]).astype("float32") E_np = np.zeros([1]).astype("float32") dA_np = np.zeros([H, W]).astype("float32") ctx = tvm.context("llvm", 0) A_tvm = tvm.nd.array(A_np, ctx) label_tvm = tvm.nd.array(label_np, ctx) E_tvm = tvm.nd.array(E_np, ctx) dA_tvm = tvm.nd.array(dA_np, ctx) func(A_tvm, label_tvm, E_tvm, dA_tvm)