def check_cuda(dtype, n, lanes):
    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
        print("skip because gpu does not support int8")
        return
    A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
    B = tvm.placeholder((n,), name='B', dtype="%sx%d" % (dtype, lanes))
    C = tvm.placeholder((n,), name='C', dtype="int32")
    D = tvm.compute((n,),
                    lambda i: tvm.call_pure_extern("int32", "__dp4a", A[i], B[i], C[i]),
                    name='D')
    s = tvm.create_schedule(D.op)
    # num_thread is assumed to be defined at module scope in the original test file.
    xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
    s[D].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[D].bind(xi, tvm.thread_axis("threadIdx.x"))
    fun = tvm.build(s, [A, B, C, D], "cuda")
    np_a = np.random.randint(low=-128, high=127, size=(n, lanes))
    np_b = np.random.randint(low=-128, high=127, size=(n, lanes))
    np_c = np.random.randint(low=0, high=127, size=(n,))
    np_d = [sum(x * y) + z for x, y, z in zip(np_a, np_b, np_c)]
    ctx = tvm.gpu(0)
    a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a)
    b = tvm.nd.empty((n,), B.dtype, ctx).copyfrom(np_b)
    c = tvm.nd.empty((n,), C.dtype, ctx).copyfrom(np_c)
    d = tvm.nd.empty((n,), D.dtype, ctx)
    fun(a, b, c, d)
    tvm.testing.assert_allclose(d.asnumpy(), np_d)
def test_matmul_add():
    n = 1024
    l = 128
    m = 235
    bias = tvm.var('bias', dtype=tvm.float32)
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((l, m), name='B')
    C = cblas.matmul(A, B)
    D = tvm.compute(C.shape, lambda i, j: C[i, j] + bias, name="D")
    s = tvm.create_schedule(D.op)

    def verify(target="llvm"):
        if not tvm.module.enabled(target):
            print("skip because %s is not enabled..." % target)
            return
        if not tvm.get_global_func("tvm.contrib.cblas.matmul", True):
            print("skip because extern function is not available")
            return
        ctx = tvm.cpu(0)
        f = tvm.build(s, [A, B, D, bias], target)
        a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
        d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
        bb = 10.0
        f(a, b, d, bb)
        tvm.testing.assert_allclose(
            d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + bb, rtol=1e-5)
    verify()
def verify_conv2d(batch, in_size, in_channel, num_filter, kernel, stride, padding):
    in_height = in_width = in_size
    with tvm.target.rasp():
        A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
        W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')
        B = topi.nn.conv2d(A, W, stride, padding)
        s = topi.generic.schedule_conv2d_nchw([B])

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_conv2d.verify_conv2d")
    def get_ref_data():
        a_np = np.random.uniform(size=a_shape).astype(dtype)
        w_np = np.random.uniform(size=w_shape).astype(dtype)
        b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding)
        return a_np, w_np, b_np
    a_np, w_np, b_np = get_ref_data()

    ctx = tvm.cpu(0)
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
    func = tvm.build(s, [A, W, B], "llvm")
    func(a, w, b)
    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def _topi_nn_depthwise_conv2d_NCHWc(*args, **kwargs):
    assert not kwargs, "Do not support kwargs in template function call"
    data, kernel, strides, padding, dilation, dtype = deserialize_args(args)
    batch, in_channel, height, width = get_const_tuple(data.shape)
    filter_channel, channel_multiplier, kh, kw = get_const_tuple(kernel.shape)
    ph, pw = padding if isinstance(padding, (tuple, list)) else (padding, padding)
    sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides)
    out_height = (height - kh + 2 * ph) // sh + 1
    out_width = (width - kw + 2 * pw) // sw + 1
    out_channel = filter_channel * channel_multiplier

    # get config here
    cfg = get_config()
    cfg.define_split("tile_ic", in_channel, num_outputs=2)
    cfg.define_split("tile_oc", out_channel, num_outputs=2)
    cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64)

    # change shape with the value in config
    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
    new_data_shape = (batch, in_channel // ic_bn, height, width, ic_bn)
    new_kernel_shape = (out_channel // oc_bn, kh, kw, oc_bn)
    new_data = tvm.placeholder(new_data_shape, data.dtype)
    new_kernel = tvm.placeholder(new_kernel_shape, kernel.dtype)

    data_layout = "NCHW%dc" % ic_bn
    out_layout = "NCHW%dc" % oc_bn

    C = _depthwise_conv2d_NCHWc_cpu(cfg, new_data, new_kernel, strides, padding, dilation,
                                    data_layout, out_layout, dtype)
    s = schedule_depthwise_conv2d_NCHWc(cfg, [C])
    return s, [new_data, new_kernel, C]
def verify_gather_nd(src_shape, indices_src, indices_dtype):
    src_dtype = "float32"
    indices_src = np.array(indices_src, dtype=indices_dtype)
    A = tvm.placeholder(shape=src_shape, dtype=src_dtype, name="A")
    indices = tvm.placeholder(shape=indices_src.shape, dtype=indices_dtype, name="indices")
    out_tensor = topi.gather_nd(a=A, indices=indices)

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_injective(out_tensor)

        func = tvm.build(s, [A, indices, out_tensor], device, name="take")
        shape_size = 1
        for i in range(len(src_shape)):
            shape_size = shape_size * src_shape[i]
        data_npy = np.arange(shape_size, dtype=src_dtype).reshape(src_shape)
        out_npys = topi.testing.gather_nd_python(data_npy, indices_src)

        data_nd = tvm.nd.array(data_npy, ctx)
        indices_nd = tvm.nd.array(indices_src, ctx)
        out_nd = tvm.nd.empty(out_npys.shape, ctx=ctx, dtype=src_dtype)
        func(data_nd, indices_nd, out_nd)
        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npys)

    for device in get_all_backend():
        check_device(device)
def verify_expand_like(in_shape, out_shape, axis):
    A = tvm.placeholder(shape=in_shape, name="A")
    B = tvm.placeholder(shape=out_shape, name="B")
    C = topi.expand_like(A, B, axis)
    s = tvm.create_schedule([C.op])

    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        ctx = tvm.context(device, 0)
        f = tvm.build(s, [A, B, C], device, name="expand_like")
        input = np.random.uniform(size=in_shape).astype(A.dtype)
        tvm_input = tvm.nd.array(input, ctx)

        odim = len(out_shape)
        real_axis = [x if x >= 0 else x + odim for x in axis]
        real_axis = sorted(real_axis)
        for x in real_axis:
            input = np.expand_dims(input, x).astype(A.dtype)
        for x in real_axis:
            input = np.concatenate([input] * out_shape[x], axis=x).astype(A.dtype)
        assert input.shape == out_shape

        tvm_shape_like = tvm.nd.array(np.zeros(out_shape).astype(B.dtype), ctx)
        out = tvm.nd.array(np.zeros(out_shape).astype(A.dtype), ctx)
        f(tvm_input, tvm_shape_like, out)
        tvm.testing.assert_allclose(out.asnumpy(), input)

    for device in ["llvm"]:
        check_device(device)
def verify_bitserial_dense(batch, in_dim, out_dim, activation_bits, weight_bits, unipolar):
    input_dtype = 'uint32'
    out_dtype = 'int16'

    with tvm.target.create('llvm'):
        A = tvm.placeholder((batch, in_dim), dtype=input_dtype, name='A')
        B = tvm.placeholder((out_dim, in_dim), dtype=input_dtype, name='B')
        C = topi.nn.bitserial_dense(A, B, activation_bits, weight_bits,
                                    out_dtype=out_dtype, unipolar=unipolar)
        s = topi.generic.schedule_bitserial_dense([C])

    a_shape = get_const_tuple(A.shape)
    b_shape = get_const_tuple(B.shape)

    @memoize("topi.tests.test_topi_bitserial_dense")
    def get_ref_data():
        a_np = generate_quantized_np(get_const_tuple(a_shape), activation_bits, input_dtype)
        b_np = generate_quantized_np(get_const_tuple(b_shape), weight_bits, input_dtype)
        if unipolar:
            b_ = np.copy(b_np).astype(out_dtype)
            for x in np.nditer(b_, op_flags=['readwrite']):
                x[...] = 1 if x == 1 else -1
            c_np = np.dot(a_np, b_.T)
        else:
            c_np = np.dot(a_np, b_np.T)
        return a_np, b_np, c_np
    a_np, b_np, c_np = get_ref_data()

    ctx = tvm.cpu(0)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
    func = tvm.build(s, [A, B, C], "llvm")
    func(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
def intrin_gemv(m, l):
    a = tvm.placeholder((l,), name='a')
    b = tvm.placeholder((m, l), name='b')
    k = tvm.reduce_axis((0, l), name='k')
    c = tvm.compute((m,), lambda i: tvm.sum(a[k] * b[i, k], axis=k), name='c')
    Ab = tvm.decl_buffer(a.shape, a.dtype,
                         name="A",
                         offset_factor=1,
                         strides=[1])
    Bb = tvm.decl_buffer(b.shape, b.dtype,
                         name="B",
                         offset_factor=1,
                         strides=[tvm.var("s1"), 1])
    Cb = tvm.decl_buffer(c.shape, c.dtype,
                         name="C",
                         offset_factor=1,
                         strides=[1])

    def intrin_func(ins, outs):
        ib = tvm.ir_builder.create()
        aa, bb = ins
        cc = outs[0]
        ib.emit(tvm.call_extern("int32", "gemv_update",
                                cc.access_ptr("w"),
                                aa.access_ptr("r"),
                                bb.access_ptr("r"),
                                m, l, bb.strides[0]))
        return ib.get()

    with tvm.build_config(offset_factor=1):
        return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})
def test_conv2d():
    if not tvm.module.enabled("metal"):
        print("skip because %s is not enabled..." % "metal")
        return
    n = 1
    h = 14
    w = 14
    ci = 2
    co = 4
    kh = 3
    kw = 3
    stride = 2
    A = tvm.placeholder((n, h, w, ci), name="x")
    B = tvm.placeholder((co, kh, kw, ci), name="w")
    C = mps.conv2d(A, B, 'SAME', stride)
    s1 = tvm.create_schedule(C.op)

    def verify(A, B, C, target="llvm"):
        if not tvm.get_global_func("tvm.contrib.mps.conv2d", True):
            print("skip because extern function is not available")
            return
        ctx = tvm.metal(0)
        f = tvm.build(s1, [A, B, C], "metal")
        a = tvm.nd.array(np.random.uniform(size=(n, h, w, ci)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(co, kh, kw, ci)).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros((n, h // stride, w // stride, co), dtype=C.dtype), ctx)
        f(a, b, c)
        # print(c.asnumpy())
        # print(c.shape)

    verify(A, B, C)
def intrin_vadd(n, cache_read=False, cache_write=False):
    scope_ubuf = 'local'
    dtype = 'float32'
    x = tvm.placeholder((n,), dtype=dtype, name='vx')
    y = tvm.placeholder((n,), dtype=dtype, name='vy')
    z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z')
    s = tvm.create_schedule(z.op)

    def create_buffer(t):
        return tvm.decl_buffer(t.shape, t.dtype,
                               name='W' + t.name,
                               scope=scope_ubuf,
                               offset_factor=16)

    binds = {}
    if cache_read:
        binds[x] = create_buffer(x)
        binds[y] = create_buffer(y)
    if cache_write:
        binds[z] = create_buffer(z)

    def intrin_func(ins, outs):
        ib = tvm.ir_builder.create()
        ib.emit(tvm.call_extern(outs[0].dtype, 'vadd',
                                ins[0].access_ptr("r"),
                                ins[1].access_ptr('r'),
                                outs[0].access_ptr('wr')))
        return ib.get()

    with tvm.build_config(offset_factor=16):
        return tvm.decl_tensor_intrin(z.op, intrin_func, binds=binds)
def test_add():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    s = tvm.create_schedule(C.op)

    def check_c():
        mhost = tvm.build(s, [A, B, C], "c", name="fadd")
        temp = util.tempdir()
        path_dso = temp.relpath("temp.so")
        mhost.export_library(path_dso)
        m = tvm.module.load(path_dso)
        fadd = m['fadd']
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        fadd(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), a.asnumpy() + b.asnumpy())
    check_c()
def test_conv_tiling():
    HSTR = WSTR = 1
    in_channel = 128
    kernel_height = kernel_width = 3
    out_channel = 64
    batch_size = 1
    in_height = in_width = 64
    out_height = out_width = in_height - kernel_height + 1
    data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data')
    kernel = tvm.placeholder((kernel_height, kernel_width, in_channel, out_channel),
                             name='kernel')
    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')
    conv = tvm.compute((batch_size, out_channel, out_height, out_width),
                       lambda n, oc, oh, ow: tvm.sum(data[n, ic, oh * HSTR + kh, ow * WSTR + kw] *
                                                     kernel[kh, kw, ic, oc],
                                                     axis=[ic, kh, kw]),
                       name="conv2d")
    s = tvm.create_schedule(conv.op)

    n, oc, oh, ow = conv.op.axis
    oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    stmt = tvm.ir_pass.LoopPartition(stmt, True)
    stmt = tvm.ir_pass.Simplify(stmt)
    assert not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse)))
def test_multiple_func():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    s = tvm.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], factor=4)
    s[C].parallel(xo)
    s[C].vectorize(xi)

    def check_llvm():
        if not tvm.module.enabled("llvm"):
            return
        # build two functions
        f2 = tvm.lower(s, [A, B, C], name="fadd1")
        f1 = tvm.lower(s, [A, B, C], name="fadd2")
        m = tvm.build([f1, f2], "llvm")
        fadd1 = m['fadd1']
        fadd2 = m['fadd2']
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        fadd1(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), a.asnumpy() + b.asnumpy())
        fadd2(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), a.asnumpy() + b.asnumpy())
    check_llvm()
def intrin_gemv(m, n):
    w = tvm.placeholder((m, n), name='w')
    x = tvm.placeholder((n,), name='x')
    k = tvm.reduce_axis((0, n), name='k')
    z = tvm.compute((m,), lambda i: tvm.sum(w[i, k] * x[k], axis=k), name='z')
    Wb = tvm.decl_buffer(w.shape, w.dtype,
                         name="W",
                         offset_factor=16,
                         strides=[tvm.var('ldw'), 1])

    def intrin_func(ins, outs):
        ww, xx = ins
        zz = outs[0]
        ww_ptr = ww.access_ptr("r")
        xx_ptr = xx.access_ptr("r")
        zz_ptr = zz.access_ptr("w")
        body = tvm.call_packed(
            "gemm", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
        reset = tvm.call_packed(
            "fill_zero", zz_ptr, n)
        update = tvm.call_packed(
            "gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
        return body, reset, update

    with tvm.build_config(data_alignment=16, offset_factor=16):
        return tvm.decl_tensor_intrin(z.op, intrin_func, binds={w: Wb})
def test_sort_np():
    dshape = (1, 2, 3, 4, 5, 6)
    axis = 4
    reduced_shape = (1, 2, 3, 4, 6)
    is_descend = False
    data = tvm.placeholder(dshape, name='data')
    sort_num = tvm.placeholder(reduced_shape, name="sort_num", dtype="int32")
    out = tvm.extern(data.shape, [data, sort_num],
                     lambda ins, outs: tvm.call_packed(
                         "tvm.contrib.sort.argsort", ins[0], ins[1], outs[0], axis, is_descend),
                     dtype='int32', name="sort_tensor")

    ctx = tvm.cpu(0)
    target = "llvm"
    s = tvm.create_schedule(out.op)
    f = tvm.build(s, [data, sort_num, out], target)

    np_data = np.random.uniform(size=dshape)
    np_out = np.argsort(np_data, axis=axis)
    sort_num_input = np.full(reduced_shape, dshape[axis])
    a = tvm.nd.array(np.array(np_data).astype(data.dtype), ctx)
    b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx)
    c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx)
    f(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), np_out, rtol=1e-5)
def test_lstm_cell_inline():
    num_step = 128
    num_input = 256
    num_hidden = 1152
    batch_size = 4
    # Global transition matrix
    X = tvm.placeholder((num_step - 1, batch_size, num_input), name="X")
    Wi2h = tvm.placeholder((4, num_hidden, num_input), name="Wi2h")
    Wh2h = tvm.placeholder((4, num_hidden, num_hidden), name="Wh2h")
    # h: output hidden state, c: cell state.
    s_state_h = tvm.placeholder((num_step, batch_size, num_hidden))
    s_state_c = tvm.placeholder((num_step, batch_size, num_hidden))
    s_init_c = tvm.compute((1, batch_size, num_hidden),
                           lambda *i: 0.0, name="init_c")
    s_init_h = tvm.compute((1, batch_size, num_hidden),
                           lambda *i: 0.0, name="init_h")
    # LSTM transition
    k = tvm.reduce_axis((0, num_input), name="ki2h")
    s_i2h = tvm.compute(
        (num_step, 4, batch_size, num_hidden),
        lambda t, x, i, j: tvm.sum(X[t - 1, i, k] * Wi2h[x, j, k], axis=k),
        name="s_i2h")
    k = tvm.reduce_axis((0, num_hidden), name="ki2h")
    s_h2h = tvm.compute(
        (num_step, 4, batch_size, num_hidden),
        lambda t, x, i, j: tvm.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k], axis=k),
        name="s_h2h")
    # Gate rules
    gates = tvm.compute(s_i2h.shape, lambda *i: s_i2h(*i) + s_h2h(*i), name="gates")
    gshape = (num_step, batch_size, num_hidden)
    in_gate = tvm.compute(gshape,
                          lambda t, i, j: tvm.sigmoid(gates[t, 0, i, j]),
                          name="in_gate")
    in_transform = tvm.compute(gshape,
                               lambda t, i, j: tvm.tanh(gates[t, 1, i, j]),
                               name="in_transform")
    forget_gate = tvm.compute(gshape,
                              lambda t, i, j: tvm.sigmoid(gates[t, 2, i, j]),
                              name="forget_gate")
    out_gate = tvm.compute(gshape,
                           lambda t, i, j: tvm.sigmoid(gates[t, 3, i, j]),
                           name="out_gate")
    next_c = tvm.compute(gshape,
                         lambda t, i, j: forget_gate[t, i, j] * s_state_c[t - 1, i, j] +
                         in_gate[t, i, j] * in_transform[t, i, j],
                         name="next_c")
    next_h = tvm.compute(gshape,
                         lambda t, i, j: out_gate[t, i, j] * tvm.tanh(next_c[t, i, j]),
                         name="next_h")
    update_c = tvm.compute(gshape, lambda *i: next_c(*i), name="update_c")
    update_h = tvm.compute(gshape, lambda *i: next_h(*i), name="update_h")
    # scan over the time steps
    scan_h, scan_c = tvm.scan(
        [s_init_h, s_init_c],
        [update_h, update_c],
        [s_state_h, s_state_c],
        inputs=[X],
        name="lstm_scan")
    # schedule
    s = tvm.create_schedule(scan_h.op)
    # Inline gate computations
    s[gates].compute_inline()
    s[in_gate].compute_inline()
    s[in_transform].compute_inline()
    s[forget_gate].compute_inline()
    s[out_gate].compute_inline()
    # verify we can lower correctly
    tvm.lower(s, [X, Wi2h, Wh2h, scan_h, scan_c])
def test_argsort():
    dshape = (1, 8)
    valid_count_shape = (2,)
    data = tvm.placeholder(dshape, name="data", dtype="float32")
    valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count")
    np_data = np.random.rand(dshape[0], dshape[1]).astype(data.dtype)
    np_valid_count = np.array([4]).astype(valid_count.dtype)
    np_result = np.argsort(-np_data)

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            out = argsort(data, valid_count, axis=-1, is_ascend=False, flag=False)
            s = topi.generic.schedule_argsort(out)
        tvm_data = tvm.nd.array(np_data, ctx)
        tvm_valid_count = tvm.nd.array(np_valid_count, ctx)
        tvm_out = tvm.nd.array(np.zeros(dshape, dtype="float32"), ctx)
        f = tvm.build(s, [data, valid_count, out], device)
        f(tvm_data, tvm_valid_count, tvm_out)
        tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result.astype("float32"), rtol=1e0)

    for device in ['llvm', 'cuda', 'opencl']:
        check_device(device)
def test_sort():
    n = 2
    l = 5
    m = 3
    data = tvm.placeholder((n, l, m), name='data')
    sort_num = tvm.placeholder((n, m), name="sort_num", dtype="int32")
    axis = 1
    is_descend = True
    out = tvm.extern(data.shape, [data, sort_num],
                     lambda ins, outs: tvm.call_packed(
                         "tvm.contrib.sort.argsort", ins[0], ins[1], outs[0], axis, is_descend),
                     dtype='int32', name="sort_tensor")
    input = [[[1, 2, 3], [2, 4.5, 3.5], [1.1, 0.5, 1], [3.2, -5, 0.5], [1.5, 0, 0]],
             [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]]]
    sort_num_input = [[1, 2, 3], [4, 5, 5]]
    sorted_index = [[[0, 1, 1], [1, 0, 0], [2, 2, 2], [3, 3, 3], [4, 4, 4]],
                    [[3, 4, 4], [2, 3, 3], [1, 2, 2], [0, 1, 1], [4, 0, 0]]]

    ctx = tvm.cpu(0)
    target = "llvm"
    s = tvm.create_schedule(out.op)
    f = tvm.build(s, [data, sort_num, out], target)
    a = tvm.nd.array(np.array(input).astype(data.dtype), ctx)
    b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx)
    c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx)
    f(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), np.array(sorted_index).astype(out.dtype), rtol=1e-5)
def test_inplace_rule2(scope_tb="local_TB2", max_bits=1024 * 1024 * 1024):
    # Test Buffer
    register_mem(scope_tb, max_bits)
    m = 10
    A = tvm.placeholder((m,), name='A')
    C = tvm.placeholder((m,), name='C')
    D = tvm.placeholder((m,), name='D')
    A0 = tvm.compute((m,), lambda i: A[i] + C[i], name='A0')
    A1 = tvm.compute((m,), lambda i: D[i] * D[i], name='A1')
    A2 = tvm.compute((m,), lambda i: A0[i] + A1[i], name='A2')
    B = tvm.compute((m,), lambda i: A2[i], name='B')
    s = tvm.create_schedule(B.op)
    A0L = s.cache_read(A0, scope_tb, [A2])
    A1L = s.cache_read(A1, scope_tb, [A2])
    A2L = s.cache_read(A2, scope_tb, [B])
    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    Cc = tvm.decl_buffer(C.shape, B.dtype, name='C')
    Dd = tvm.decl_buffer(D.shape, B.dtype, name='D')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb, C: Cc, D: Dd}, 64)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.StorageRewrite(stmt)
    # verify that inplace folding works: only two allocations should remain
    num_alloc = [0]

    def verify(n):
        if isinstance(n, tvm.stmt.Allocate):
            num_alloc[0] += 1
    tvm.ir_pass.PostOrderVisit(stmt, verify)
    assert num_alloc[0] == 2
def test_schedule_create():
    m = tvm.var('m')
    n = tvm.var('n')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')
    B = tvm.placeholder((n, l), name='B')
    AA = tvm.compute((m, l), lambda i, j: A[i, j])
    T = tvm.compute((m, n, l), lambda i, j, k: AA(i, k) * B(j, k))
    s = tvm.create_schedule(T.op)
    s[AA].set_scope("shared")
    xo, xi = s[T].split(T.op.axis[0], factor=10)
    xi1, xi2 = s[T].split(xi, factor=2)
    s[AA].compute_at(s[T], xi1)
    xo, xi = s[AA].split(AA.op.axis[0], factor=10)
    s[T].reorder(xi2, xi1)
    assert T.op.axis[1] in s[T].leaf_iter_vars

    # save load json
    json_str = tvm.save_json(s)
    s_loaded = tvm.load_json(json_str)
    assert isinstance(s_loaded, tvm.schedule.Schedule)
    assert str(s_loaded.outputs[0].body) == str(s.outputs[0].body)

    # pickle unpickle
    dump = pkl.dumps(s)
    s_loaded = pkl.loads(dump)
    assert isinstance(s_loaded, tvm.schedule.Schedule)
    assert str(s_loaded.outputs[0].body) == str(s.outputs[0].body)
def test_upstream():
    @tvm.hybrid.script
    def upstream(a):
        b = output_tensor((20, ), 'float32')
        for i in range(20):
            b[i] = a[i] * i
        return b

    a = tvm.placeholder((20, ), 'float32')
    b = tvm.placeholder((20, ), 'float32')
    c = tvm.compute((20, ), lambda x: a[x] + b[x])
    d = upstream(c)
    sch = tvm.create_schedule([c.op, d.op])
    ir = tvm.lower(sch, [a, b, d], simple_mode=True)
    func = tvm.build(sch, [a, b, d])
    assert func

    a = numpy.random.randn(20).astype('float32')
    b = numpy.random.randn(20).astype('float32')
    ref = numpy.zeros((20, ), 'float32')
    for i in range(20):
        ref[i] = (a[i] + b[i]) * i

    tvm_a = tvm.nd.array(a)
    tvm_b = tvm.nd.array(b)
    tvm_d = tvm.nd.array(numpy.zeros((20, )).astype('float32'))
    func(tvm_a, tvm_b, tvm_d)
    tvm.testing.assert_allclose(tvm_d.asnumpy(), ref, 1e-5, 1e-5)
def test_compile_cache():
    x = sym.Variable("x")
    y = sym.Variable("y")
    z = sym.exp(y + x)
    shape = (10, 1)
    dtype = tvm.float32
    shape_dict = {"x": shape, "y": shape}

    def verify(graph, lib):
        m = graph_runtime.create(graph, lib, tvm.cpu(0))
        # set inputs and run
        na = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
        nb = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
        m.run(x=na, y=nb)
        # get outputs
        out = m.get_output(0, tvm.nd.empty(shape, dtype))
        tvm.testing.assert_allclose(
            out.asnumpy(), np.exp(na.asnumpy() + nb.asnumpy()))

    engine = nnvm.compiler.engine
    graph, lib, _ = nnvm.compiler.build(z, "llvm", shape_dict)
    inputs = [tvm.placeholder((10,)), tvm.placeholder((10,))]

    gkey = nnvm.compiler.graph_key(nnvm.graph.create(z), inputs, "llvm")
    gkey2 = nnvm.compiler.graph_key(nnvm.graph.create(z), inputs + inputs, "llvm")
    gf = engine[gkey]
    assert gf is not None
    assert engine[gkey2] is None

    graph, lib, _ = nnvm.compiler.build(z, "llvm", shape_dict)
    assert graph.index.num_nodes == 3
    verify(graph, lib)

    # Test setting the external cache explicitly
    engine.clear_cache()
    engine[gkey] = gf
def test_looptype():
    @script
    def looptype(a, b, c):
        d = output_tensor((16, ), 'int32')
        e = output_tensor((16, ), 'int32')
        f = output_tensor((16, ), 'int32')
        for i in parallel(16):
            d[i] = a[i]
        for j in vectorize(16):
            e[j] = b[j]
        for k in unroll(16):
            f[k] = c[k]
        return d, e, f

    a = tvm.placeholder((16, ), name='a', dtype='int32')
    b = tvm.placeholder((16, ), name='b', dtype='int32')
    c = tvm.placeholder((16, ), name='c', dtype='int32')
    try:
        d, e, f = looptype(a, b, c)
        ir = d.op.body
    except:
        return
    iloop = ir.first
    jloop = ir.rest.first
    kloop = ir.rest.rest
    assert iloop.for_type == tvm.stmt.For.Parallel
    assert jloop.for_type == tvm.stmt.For.Vectorized
    assert kloop.for_type == tvm.stmt.For.Unrolled

    func, ins, outs = run_and_check(looptype, [a, b, c])
    run_and_check(func, ins, outs=outs)
def test_non_zero():
    @tvm.hybrid.script
    def blur(a):
        b = output_tensor((30, 30), 'float32')
        for i in range(2, 32):
            for j in range(2, 32):
                s = 0.0
                for di in range(3):
                    for dj in range(3):
                        s += a[i - di, j - dj]
                b[i - 2, j - 2] = s / 9.0
        return b

    a = tvm.placeholder((32, 32), 'float32', 'a')
    func, ins, outs = run_and_check(blur, [a])
    run_and_check(func, ins, outs=outs)

    @tvm.hybrid.script
    def triangle(a, b):
        c = output_tensor((10, 10), dtype='float32')
        for i in range(10):
            for j in range(i, 10):
                c[i, j] = a[i] * b[j]
        return c

    a = tvm.placeholder((10, ), dtype='float32', name='a')
    b = tvm.placeholder((10, ), dtype='float32', name='b')
    func, ins, outs = run_and_check(triangle, [a, b])
    run_and_check(func, ins, outs=outs)
def test_dot():
    nn = 12
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    k = tvm.reduce_axis((0, n), 'k')
    C = tvm.compute((1,), lambda _: tvm.sum(A[k] * B[k], axis=k), name='C')
    s = tvm.create_schedule(C.op)
    fapi = lower(s, [A, B, C])

    def verify(target):
        if not tvm.module.enabled(target):
            print("Target %s is not enabled" % target)
            return
        f = tvm.codegen.build_module(fapi, target)
        # verify
        ctx = tvm.cpu(0)
        a = tvm.nd.array(np.random.uniform(size=(nn,)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(nn,)).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros((1,), dtype=C.dtype), ctx)
        f(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-4)

    verify("llvm")
def get_gemm_feature(target):
    k = tvm.reduce_axis((0, N), 'k')
    A = tvm.placeholder((N, N), name='A')
    B = tvm.placeholder((N, N), name='B')
    C = tvm.compute(A.shape, lambda y, x: tvm.sum(A[y, k] * B[k, x], axis=k), name='C')

    s = tvm.create_schedule(C.op)
    y, x = s[C].op.axis
    axes = list(s[C].tile(y, x, 8, 8)) + [k]
    perm = np.random.permutation(5)
    axes = [axes[x] for x in perm]
    s[C].reorder(*axes)

    if "gpu" in target.keys:
        pick = []
        # filter out reduction axis
        for i in range(len(perm)):
            if perm[i] != 4:
                pick.append(axes[i])
        s[C].bind(pick[0], tvm.thread_axis("blockIdx.x"))
        s[C].bind(pick[1], tvm.thread_axis("vthread"))
        s[C].bind(pick[2], tvm.thread_axis("threadIdx.y"))

    with target:
        feas = feature.get_itervar_feature(s, [A, B, C])
        feas = feature.flatten_itervar_feature(feas)
    return feas
def matmul(N, L, M, dtype):
    A = tvm.placeholder((N, L), name='A', dtype=dtype)
    B = tvm.placeholder((L, M), name='B', dtype=dtype)

    k = tvm.reduce_axis((0, L), name='k')
    C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
    s = tvm.create_schedule(C.op)

    # schedule
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    ##### define space begin #####
    cfg = autotvm.get_config()
    cfg.define_split("tile_y", y, num_outputs=2)
    cfg.define_split("tile_x", x, num_outputs=2)
    ##### define space end #####

    # schedule according to config
    yo, yi = cfg["tile_y"].apply(s, C, y)
    xo, xi = cfg["tile_x"].apply(s, C, x)
    s[C].reorder(yo, xo, k, yi, xi)

    return s, [A, B, C]
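# A minimal usage sketch, assuming the old TVM 0.x autotvm API as in the
# "tune_simple_template" tutorial: it shows roughly how a template like `matmul`
# above would be registered and tuned. The matrix sizes, trial count, and log-file
# name below are illustrative assumptions, not values from the original snippet.
from tvm import autotvm

matmul_tmpl = autotvm.template(matmul)  # same effect as decorating matmul with @autotvm.template
task = autotvm.task.create(matmul_tmpl, args=(512, 512, 512, 'float32'), target='llvm')
print(task.config_space)  # the space spanned by the define_split calls above

measure_option = autotvm.measure_option(
    builder='local',
    runner=autotvm.LocalRunner(number=5))  # compile locally, time each config 5 times

tuner = autotvm.tuner.RandomTuner(task)  # random search over the config space
tuner.tune(n_trial=10,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('matmul.log')])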
def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding,
                       dilation=1, add_bias=False, add_relu=False):
    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" %
          (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))

    in_height = in_width = in_size

    A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
    W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')
    bias = tvm.placeholder((num_filter, 1, 1), name='bias')

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    bias_shape = get_const_tuple(bias.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_conv2d_nchw.verify_conv2d_nchw")
    def get_ref_data():
        a_np = np.random.uniform(size=a_shape).astype(dtype)
        w_np = np.random.uniform(size=w_shape).astype(dtype)
        b_np = np.random.uniform(size=bias_shape).astype(dtype)
        dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
        c_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding)
        if add_bias:
            b_np = np.random.uniform(size=bias_shape).astype(dtype)
            c_np += b_np
        if add_relu:
            c_np = np.maximum(c_np, 0)
        return a_np, w_np, b_np, c_np
    a_np, w_np, b_np, c_np = get_ref_data()

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            C = topi.nn.conv2d(A, W, (stride, stride), (padding, padding),
                               (dilation, dilation), layout='NCHW', out_dtype=dtype)
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = topi.generic.schedule_conv2d_nchw([C])

        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
        if add_bias:
            func = tvm.build(s, [A, W, bias, C], device,
                             name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                             (batch, in_channel, in_size, num_filter, kernel, stride,
                              padding, dilation))
            func(a, w, b, c)
        else:
            func = tvm.build(s, [A, W, C], device,
                             name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                             (batch, in_channel, in_size, num_filter, kernel, stride,
                              padding, dilation))
            func(a, w, c)
        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-4)

    for device in get_all_backend():
        with autotvm.tophub.context(device):  # load tophub pre-tuned parameters
            check_device(device)
def test_cpu():
    n = 1024
    dtype = "float32"
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')

    def test_device_ir(A, B, C):
        n = A.shape[0]
        max_threads = 8
        ib = tvm.ir_builder.create()
        Aptr = ib.buffer_ptr(A)
        Bptr = ib.buffer_ptr(B)
        Cptr = ib.buffer_ptr(C)
        with ib.for_range(0, n, name="i") as i:
            Cptr[i] = Aptr[i] + Bptr[i]
        body = ib.get()
        return body

    C = tvm.extern(A.shape, [A, B],
                   lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
                   name="vector_add", dtype=dtype)
    s = tvm.create_schedule(C.op)

    def check_target(target):
        if not tvm.module.enabled(target):
            return
        # build and invoke the kernel.
        fadd = tvm.build(s, [A, B, C], target)
        ctx = tvm.context(target, 0)
        # launch the kernel.
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        fadd(a, b, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

    check_target("llvm")
def test_matmul_add():
    n = 1024
    l = 128
    m = 235
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((l, m), name='B')
    C = rocblas.matmul(A, B)
    s = tvm.create_schedule(C.op)

    def verify(target="rocm"):
        if not tvm.module.enabled(target):
            print("skip because %s is not enabled..." % target)
            return
        if not tvm.get_global_func("tvm.contrib.rocblas.matmul", True):
            print("skip because extern function is not available")
            return
        ctx = tvm.rocm(0)
        f = tvm.build(s, [A, B, C], target)
        a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
        f(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5)
    verify()
def test_add_pipeline():
    n = tvm.var('n')
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    D = tvm.compute(A.shape, lambda *i: C(*i) + 1, name='D')
    s = tvm.create_schedule(D.op)

    # GPU schedules have to split the axis into blockIdx and threadIdx parts.
    num_thread = 256
    xo, xi = s[C].split(C.op.axis[0], factor=num_thread)
    s[C].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[C].bind(xi, tvm.thread_axis("threadIdx.x"))
    xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
    s[D].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[D].bind(xi, tvm.thread_axis("threadIdx.x"))

    # compile to IR
    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    Cb = tvm.decl_buffer(C.shape, C.dtype, name='C')
    stmt = tvm.ir_pass.LoopPartition(stmt)
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb, C: Cb}, 64)
    stmt = tvm.ir_pass.Simplify(stmt)
    fapi = tvm.ir_pass.MakeAPI(stmt, "myadd", [Ab, Bb, Cb], 0, True)
    fsplits = [x for x in tvm.ir_pass.SplitHostDevice(fapi)]
    fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0])

    def check_target(device, host="stackvm"):
        if not tvm.module.enabled(host):
            return
        if not tvm.module.enabled(device):
            return
        ctx = tvm.context(device, 0)
        mhost = tvm.codegen.build_module(fsplits[0], host)
        mdev = tvm.codegen.build_module(fsplits[1:], device)
        mhost.import_module(mdev)
        code = mdev.get_source()
        f = mhost.entry_func
        # launch the kernel.
        n = 1027
        a = tvm.nd.array(np.random.uniform(size=n).astype(Ab.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(Bb.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=Cb.dtype), ctx)
        f(a, b, c)
        np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

    def check_module_save(device, host="stackvm"):
        if not tvm.module.enabled(host):
            return
        if not tvm.module.enabled(device):
            return
        ctx = tvm.context(device, 0)
        fmt = "ptx" if device == "cuda" else "cl"
        mhost = tvm.codegen.build_module(fsplits[0], host)
        mdev = tvm.codegen.build_module(fsplits[1:], device)
        temp = util.tempdir()
        mpath = temp.relpath("test.%s" % fmt)
        mdev.save(mpath)
        mdev2 = tvm.module.load(mpath)
        mhost.import_module(mdev2)
        f = mhost.entry_func
        # launch the kernel.
        n = 1027
        a = tvm.nd.array(np.random.uniform(size=n).astype(Ab.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(Bb.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=Cb.dtype), ctx)
        f(a, b, c)
        np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

    check_target("cuda", host="stackvm")
    check_target("cuda", host="llvm")
    check_module_save("cuda", host="stackvm")
    check_target("nvptx", host="llvm")
    check_target("rocm", host="llvm")
def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum", dtype="float32"):
    # Build the logic and compile the function
    A = tvm.placeholder(shape=in_shape, name="A", dtype=dtype)
    A1 = topi.sqrt(topi.exp(A))
    out_dtype = dtype
    if type == "sum":
        B = topi.sum(A1, axis=axis, keepdims=keepdims)
    elif type == "max":
        B = topi.max(A1, axis=axis, keepdims=keepdims)
    elif type == "min":
        B = topi.min(A1, axis=axis, keepdims=keepdims)
    elif type == "argmax":
        B = topi.argmax(A1, axis=axis, keepdims=keepdims)
        out_dtype = "int32"
    elif type == "argmin":
        B = topi.argmin(A1, axis=axis, keepdims=keepdims)
        out_dtype = "int32"
    else:
        raise NotImplementedError

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_reduce(B)

        foo = tvm.build(s, [A, B], device, name=type)
        # Test
        in_npy = np.random.uniform(size=in_shape).astype(dtype)
        in_npy_map = np.sqrt(np.exp(in_npy)).astype(dtype)
        if type == "sum":
            out_npy = in_npy_map.sum(axis=axis, keepdims=keepdims)
        elif type == "max":
            out_npy = in_npy_map.max(axis=axis, keepdims=keepdims)
        elif type == "min":
            out_npy = in_npy_map.min(axis=axis, keepdims=keepdims)
        elif type == "argmax":
            out_npy = _my_npy_argmax(in_npy_map, axis=axis, keepdims=keepdims)
        elif type == "argmin":
            out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims)
        else:
            raise NotImplementedError

        data_tvm = tvm.nd.array(in_npy, ctx=ctx)
        out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx, dtype=out_dtype)
        for _ in range(1):
            foo(data_tvm, out_tvm)
        if type == "argmax" or type == "argmin":
            out_tvm_indices = out_tvm.asnumpy()
            if keepdims:
                out_tvm_indices = np.take(out_tvm_indices, indices=0, axis=axis)
            if axis is None:
                out_tvm_val = in_npy_map.ravel()[out_tvm_indices]
            else:
                other_indices = tuple(np.indices(in_shape[0:axis] + in_shape[(axis + 1):]))
                sel_indices = other_indices[0:axis] + (out_tvm_indices,) + other_indices[axis:]
                out_tvm_val = in_npy_map[sel_indices]
            if type == "argmax":
                np.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1E-3, 1E-3)
            elif type == "argmin":
                np.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1E-3, 1E-3)
        else:
            np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)

    for device in ["cuda", "opencl", "metal", "llvm", "rocm", "vulkan", "nvptx"]:
        check_device(device)
                               'b = np.random.rand(K, N).astype(dtype)\n',
                               stmt='answer = np.dot(a, b)',
                               number=np_repeat)
print("Numpy running time: %f" % (np_runing_time / np_repeat))

# ground truth
a = tvm.nd.array(np.random.rand(M, K).astype(dtype), ctx)
b = tvm.nd.array(np.random.rand(K, N).astype(dtype), ctx)
c = tvm.nd.array(np.zeros((M, N), dtype=dtype), ctx)
answer = np.dot(a.asnumpy(), b.asnumpy())

###################
# TVM part
# Algorithm
k = tvm.reduce_axis((0, K), 'k')
A = tvm.placeholder((M, K), name='A')
B = tvm.placeholder((K, N), name='B')
C = tvm.compute((M, N),
                lambda x, y: tvm.sum(A[x, k] * B[k, y], axis=k),
                name='C')

# Default schedule
s = tvm.create_schedule(C.op)
func = tvm.build(s, [A, B, C], target=target, name='mmult')
print(tvm.lower(s, [A, B, C], simple_mode=True))

func(a, b, c)
tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)

evaluator = func.time_evaluator(func.entry_name, ctx, number=1)
print('Baseline: %f' % evaluator(a, b, c).mean)
def visit_Assign(self, node):
    """Visit targets = value

    Returns
    -------
    Stmt : Store node, tvm.var, tvm.buffer, or tvm.compute IR
    """
    # Currently, we only allow one output target
    target = node.targets[0]
    index = 0
    content = None
    is_tvm = False
    dtype = "float32"

    # Analyze right hand side first
    if isinstance(node.value, ast.Call):
        call = node.value
        call_type = self.check_call_type(call)
        if len(call_type) == 1:
            # External function call. We do not support it right now
            content = self.visit(call)
        else:
            args = call.args
            keywords = call.keywords
            # Currently we only support tvm calls
            if call_type[0] == "tvm":
                is_tvm = True
                if call_type[1] == "var":  # tvm.var
                    assert isinstance(target, ast.Name), "target of tvm.var must be a name"
                    for keyword in keywords:  # check every keyword in tvm.var
                        if keyword.arg == "dtype":
                            dtype = keyword.value.s
                        elif keyword.arg == "name":
                            pass
                        else:
                            raise ValueError(
                                "Unknown/Unsupported keywords to tvm.var: " + str(keyword[0]))
                    name = target.id
                    tvm_var = tvm.var(name, dtype=dtype)
                    var = {'var': tvm_var, 'type': 'tvm', 'allocated': False}
                    if name in self.arg_list:  # check whether this var belongs to io
                        self.io_dict[name] = {'arg': tvm_var}
                        var['allocated'] = True
                    self.insert_var(name, var)
                    content = None
                elif call_type[1] == "placeholder":  # tvm.placeholder
                    assert isinstance(target, ast.Name), \
                        "target of tvm.placeholder must be a name"
                    for keyword in keywords:  # check every keyword in tvm.placeholder
                        if keyword.arg == "dtype":
                            dtype = keyword.value.s
                        elif keyword.arg == "name":
                            pass
                        else:
                            raise ValueError(
                                "Unknown/Unsupported keywords to tvm.placeholder: "
                                + str(keyword[0]))
                    name = target.id
                    shape = self.get_shape(call.args[0])
                    placeholder = tvm.placeholder(shape, name=name, dtype=dtype)
                    buff = tvm.decl_buffer(placeholder.shape, placeholder.dtype,
                                           placeholder.name)
                    buffer = {
                        'tensor': placeholder,
                        'buffer': buff,
                        'type': 'input',
                        'ast': node,
                        'shape': shape,
                        'allocated': False
                    }
                    if name in self.arg_list:
                        self.io_dict[name] = {'arg': buff}
                        buffer['allocated'] = True
                    self.insert_buffer(name, buffer)
                    content = None
                elif call_type[1] == "compute":  # tvm.compute
                    name = target.id
                    shape = self.get_shape(call.args[0])
                    placeholder = tvm.placeholder(shape, name=name, dtype=dtype)
                    buff = tvm.decl_buffer(placeholder.shape, placeholder.dtype,
                                           placeholder.name)
                    buffer = {
                        'tensor': placeholder,
                        'buffer': buff,
                        'type': 'compute',
                        'ast': node,
                        'shape': shape,
                        'allocated': False
                    }
                    if name in self.arg_list:
                        self.io_dict[name] = {'arg': buff}
                        buffer['allocated'] = True
                    self.insert_buffer(name, buffer)
                    lamb = call.args[1]
                    assert isinstance(lamb, ast.Lambda), \
                        "The second argument to tvm.compute must be a lambda function"
                    self.scope += 1
                    ret = self.visit(lamb)[0]
                    args = lamb.args.args
                    if len(shape) == 1:
                        var_name = args[0].id
                        var = tvm.var(var_name, "int32")
                        st = tvm.make.Store(buff.data, ret, var, self.true)
                        if not isinstance(ret, tuple):
                            ret = self.ReplaceVar(var_name, var).mutate(ret)
                            st = tvm.make.Store(buff.data, ret, var, self.true)
                            content = tvm.make.For(var, 0, shape[0], 0, 0, st)
                        else:
                            ret[0] = self.ReplaceVar(var_name, var).mutate(ret[0])
                            ret[1] = self.ReplaceVar(var_name, var).mutate(ret[1])
                            st = tvm.make.Store(buff.data, ret[1], var, self.true)
                            content = tvm.make.For(
                                var, 0, shape[0], 0, 0, tvm.make.Block(ret[0], st))
                    else:
                        var_name1 = args[0].id
                        var_name2 = args[1].id
                        var1 = tvm.var(var_name1, "int32")
                        var2 = tvm.var(var_name2, "int32")
                        if not isinstance(ret, tuple):
                            ret = self.ReplaceVar(var_name1, var1).mutate(ret)
                            ret = self.ReplaceVar(var_name2, var2).mutate(ret)
                            st = tvm.make.Store(buff.data, ret,
                                                (var1 * shape[1] + var2), self.true)
                            expr = tvm.make.For(var2, 0, shape[1], 0, 0, st)
                        else:
                            if ret[0] is not None:
                                ret0 = self.ReplaceVar(var_name1, var1).mutate(ret[0])
                                ret0 = self.ReplaceVar(var_name2, var2).mutate(ret0)
                            ret1 = self.ReplaceVar(var_name1, var1).mutate(ret[1])
                            ret1 = self.ReplaceVar(var_name2, var2).mutate(ret1)
                            st = tvm.make.Store(buff.data, ret1,
                                                (var1 * shape[1] + var2), self.true)
                            if ret[0] is not None:
                                expr = tvm.make.For(
                                    var2, 0, shape[1], 0, 0, tvm.make.Block(ret0, st))
                            else:
                                expr = tvm.make.For(var2, 0, shape[1], 0, 0, st)
                        content = tvm.make.For(var1, 0, shape[0], 0, 0, expr)
                    self.scope -= 1
                else:
                    raise ValueError(
                        "Unknown/Unsupported tvm function: tvm." + call_type[1])
                return content
            else:  # call_type[0] != "tvm"
                raise ValueError("Currently we only support tvm functions")
    else:  # not isinstance(node.value, ast.Call)
        content = self.visit(node.value)

    # left hand side
    var, name, _type = self.get_target(target)
    if _type == 'name':
        if var is None:
            if isinstance(content, int):
                var = tvm.var(name, "int32")
            elif isinstance(content, tvm.expr.Load):
                var = tvm.var(name, content.dtype)
            else:
                var = tvm.var(name, "float32")
            self.insert_var(name, {
                'var': var,
                'type': 'intermediate',
                'allocated': False
            })
        else:
            var = var['var']
    else:
        index = self.visit(target)
        var = var['buffer'].data

    assert not is_tvm
    if isinstance(node.value, ast.IfExp):
        then = tvm.make.Store(var, content[1], index)
        orelse = tvm.make.Store(var, content[2], index)
        return tvm.make.IfThenElse(content[0], then, orelse)
    else:
        return tvm.make.Store(var, content, index)
def test_schedule():
    @script
    def outer_product(a, b):
        c = output_tensor((64, 64), a.dtype)
        for i in range(64):
            for j in range(64):
                c[i, j] = a[i] * b[j]
        return c

    a = tvm.placeholder((64, ), name='a', dtype='float32')
    b = tvm.placeholder((64, ), name='b', dtype='float32')
    c = outer_product(a, b)

    # Test perfect loop split
    # Test loop reorder
    # Test loop annotation
    sch = tvm.create_schedule(c.op)
    i, j = c.op.axis
    io, ii = sch[c].split(i, 4)
    sch[c].parallel(ii)
    jo, ji = sch[c].split(j, 4)
    joo, joi = sch[c].split(jo, 4)
    sch[c].vectorize(ji)
    sch[c].reorder(ii, io, joo, joi, ji)
    ir = tvm.lower(sch, [a, b, c], simple_mode=True)
    assert isinstance(ir, tvm.stmt.ProducerConsumer)
    ir = ir.body
    assert isinstance(ir, tvm.stmt.AttrStmt)
    ir = ir.body
    assert isinstance(ir, tvm.stmt.For)
    assert ir.loop_var.name == 'i.inner'
    ir = ir.body
    assert isinstance(ir, tvm.stmt.For)
    assert ir.loop_var.name == 'i.outer'
    ir = ir.body
    assert isinstance(ir, tvm.stmt.For)
    assert ir.loop_var.name == 'j.outer.outer'
    ir = ir.body
    assert isinstance(ir, tvm.stmt.For)
    assert ir.loop_var.name == 'j.outer.inner'
    ir = ir.body
    func, ins, outs = run_and_check(outer_product, [a, b], sch=sch, outs=[c])
    run_and_check(func, ins, outs=outs)

    # Test fuse
    sch = tvm.create_schedule(c.op)
    sch[c].fuse(c.op.axis[0], c.op.axis[1])
    ir = tvm.lower(sch, [a, b, c], simple_mode=True)
    assert isinstance(ir, tvm.stmt.ProducerConsumer)
    ir = ir.body
    assert isinstance(ir, tvm.stmt.AttrStmt)
    ir = ir.body
    assert isinstance(ir, tvm.stmt.For)
    assert ir.loop_var.name == 'i.j.fused'
    func, ins, outs = run_and_check(outer_product, [a, b], sch=sch, outs=[c])
    run_and_check(func, ins, outs=outs)

    # Test imperfect loop split
    sch = tvm.create_schedule(c.op)
    sch[c].split(c.op.axis[0], 3)
    ir = tvm.lower(sch, [a, b, c], simple_mode=True)
    func, ins, outs = run_and_check(outer_product, [a, b], sch=sch, outs=[c])
    run_and_check(func, ins, outs=outs)
def _alter_conv2d_layout(attrs, inputs, tinfos):
    """Alter op layout for pre-computing kernel transformation"""
    if 'cudnn' in tvm.target.current_target().libs or \
            'miopen' in tvm.target.current_target().libs:
        return None

    import nnvm.symbol as sym

    copy_inputs = [s for s in inputs]
    new_attrs = {k: attrs[k] for k in attrs.keys()}

    strides = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int('groups')
    layout = attrs["layout"]
    out_dtype = attrs["out_dtype"]
    out_dtype = tinfos[0].dtype if out_dtype == "same" else out_dtype

    data, kernel = tinfos[0:2]
    N, CI, H, W = get_const_tuple(data.shape)
    CO, _, KH, KW = get_const_tuple(kernel.shape)

    dispatch_ctx = autotvm.DispatchContext.current
    target = tvm.target.current_target()

    if groups == 1:
        # query config of this workload
        workload = autotvm.task.args_to_workload(
            [tinfos[0], tinfos[1], strides, padding, dilation, layout, out_dtype], conv2d)
        cfg = autotvm.DispatchContext.current.query(target, workload)

        if cfg.is_fallback:  # if is fallback, clear query cache and return None
            autotvm.task.clear_fallback_cache(target, workload)
            return None

        if cfg.template_key == 'direct':
            return None

        if cfg.template_key == 'int8':
            assert 'cuda' in target.keys
            new_layout = 'NCHW4c'
            new_attrs['layout'] = new_layout
            new_attrs['out_layout'] = new_layout
            new_attrs['kernel_layout'] = 'OIHW4o4i'
            ic_block_factor = oc_block_factor = 4

            # Store the same config for the altered operator (workload)
            new_data = tvm.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor),
                                       dtype=data.dtype)
            new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor, KH, KW,
                                          oc_block_factor, ic_block_factor),
                                         dtype=kernel.dtype)
            new_workload = autotvm.task.args_to_workload(
                [new_data, new_kernel, strides, padding, dilation, new_layout, out_dtype],
                conv2d)
            dispatch_ctx.update(target, new_workload, cfg)
            return sym.conv2d(*copy_inputs, **new_attrs)

        if attrs.get_int_tuple("dilation") != (1, 1):
            warnings.warn("Does not support weight pre-transform for dilated convolution.")
            return None

        # pre-compute weight transformation in winograd
        tile_size = _infer_tile_size(tinfos[0], tinfos[1])

        weight = sym.contrib.conv2d_winograd_weight_transform(copy_inputs[1],
                                                              tile_size=tile_size)
        weight = sym.transpose(weight, axes=[0, 1, 3, 2])
        copy_inputs[1] = weight
        new_attrs['tile_size'] = tile_size

        # Store the same config for the altered operator (workload)
        new_data = data
        new_weight = tvm.placeholder((KH + tile_size - 1, KW + tile_size - 1, CI, CO),
                                     dtype=kernel.dtype)
        new_workload = autotvm.task.args_to_workload(
            [new_data, new_weight, strides, padding, dilation, layout, out_dtype, tile_size],
            conv2d_winograd_without_weight_transform)
        dispatch_ctx.update(target, new_workload, cfg)
        return sym.contrib.conv2d_winograd_without_weight_transform(*copy_inputs, **new_attrs)

    elif groups != CI:
        workload = autotvm.task.args_to_workload(
            [tinfos[0], tinfos[1], strides, padding, dilation, groups, out_dtype],
            group_conv2d_nchw)
        cfg = autotvm.DispatchContext.current.query(target, workload)

        if cfg.is_fallback:  # if is fallback, clear query cache and return None
            autotvm.task.clear_fallback_cache(target, workload)
            return None

        if cfg.template_key == 'int8':
            assert 'cuda' in target.keys
            new_layout = 'NCHW4c'
            new_attrs['layout'] = new_layout
            new_attrs['out_layout'] = new_layout
            new_attrs['kernel_layout'] = 'OIHW4o4i'
            ic_block_factor = oc_block_factor = 4

            # Store the same config for the altered operator (workload)
            new_data = tvm.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor),
                                       dtype=data.dtype)
            new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor // groups,
                                          KH, KW, oc_block_factor, ic_block_factor),
                                         dtype=kernel.dtype)
            new_workload = autotvm.task.args_to_workload(
                [new_data, new_kernel, strides, padding, dilation, groups, out_dtype],
                group_conv2d_nchw)
            dispatch_ctx.update(target, new_workload, cfg)
            return sym.conv2d(*copy_inputs, **new_attrs)

    # do nothing for depthwise convolution
    return None
def test_tensor_inputs():
    x = tvm.placeholder((1,), name='x')
    y = tvm.compute(x.shape, lambda i: x[i] + x[i])
    assert tuple(y.op.input_tensors) == (x,)
def matvec(n, m, l):
    wei = tvm.placeholder((n, m), dtype='float32')
    data = tvm.placeholder((l, m), dtype='float32')
    res = topi.nn.dense(data, wei)

    cfg = autotvm.get_config()
    s = tvm.create_schedule(res.op)
    if not tvm.gpu(0).exist:
        raise ValueError('a CUDA GPU is required for this schedule')

    n, k = get_const_tuple(data.shape)
    m, _ = get_const_tuple(wei.shape)
    cfg.add_flop(2 * n * l * m)

    output = res
    OL = s.cache_write(res, 'local')

    # create cache stages
    AA = s.cache_read(data, 'shared', [OL])
    WW = s.cache_read(wei, 'shared', [OL])

    # bind
    y, x = s[output].op.axis
    cfg.define_split("tile_y", cfg.axis(y), num_outputs=4)
    cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)

    scope, y = s[output].split(y, nparts=1)
    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)

    s[output].bind(by, tvm.thread_axis("blockIdx.y"))
    s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[output].bind(vy, tvm.thread_axis("vthread"))
    s[output].bind(vx, tvm.thread_axis("vthread"))
    s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[output].reorder(scope, by, bx, vy, vx, ty, tx, yi, xi)
    s[OL].compute_at(s[output], tx)

    # tile and bind reduction axes
    y, x = s[OL].op.axis
    r, = s[OL].op.reduce_axis
    cfg.define_split("tile_r", cfg.axis(r), num_outputs=3)
    ro, rm, ri = cfg['tile_r'].apply(s, OL, r)
    s[OL].reorder(ro, rm, ri, y, x)

    s[AA].compute_at(s[OL], ro)
    s[WW].compute_at(s[OL], rm)
    # s[AL].compute_at(s[OL], rxm)
    # s[WL].compute_at(s[OL], rxm)

    for load in [AA, WW]:
        fused = s[load].fuse(*list(s[load].op.axis))
        fused, tx = s[load].split(fused, cfg["tile_x"].size[2])
        fused, ty = s[load].split(fused, cfg["tile_y"].size[2])
        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))

    cfg.other_option("auto_unroll_max_step", [0, 512, 1500])
    cfg.other_option("unroll_explicit", [0, 1])
    s[output].pragma(scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
    s[output].pragma(scope, 'unroll_explicit', cfg['unroll_explicit'].val)

    return s, [data, wei, res]
""" from __future__ import absolute_import, print_function import tvm import numpy as np ###################################################################### # Define Matrix Multiplication # ---------------------------- # Take matrix multiplication as our example. # Matmul first multiply the corresponding elements between two matrix, # then accumulate across a certain axis. # The following lines describe the computation :code:`A * B^T` in TVM. # N, M, L = 1024, 512, 64 A = tvm.placeholder((N, L), name='A') B = tvm.placeholder((M, L), name='B') k = tvm.reduce_axis((0, L), name='k') C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[j, k], axis=k), name='C') s = tvm.create_schedule(C.op) print(tvm.lower(s, [A, B, C], simple_mode=True)) ###################################################################### # Schedule the Matmul # ------------------- # Now, suppose we have an accelerator that supports # matrix-vector multiplication (GEMV) as a hardware primitive, # which can take arbitrary size of reduce axis, # but another axis needs to be no larger than 16.
env = nnpu.get_env()
nnpu.set_device(env, type=args.sim)
with ScheduleProcHelper():
    env = nnpu.get_env()
    shape = (32, 64)    # (32, 64) -> (32, )
    rshape = (16, 16)   # the shape that the MReduceSum insn accepts
    assert shape[0] % rshape[0] == 0, \
        'height must be divisible by {0}'.format(rshape[0])
    assert shape[0] % env.cfg['vector_unit']['size'] == 0, \
        'height must be divisible by {0}'.format(env.cfg['vector_unit']['size'])
    assert shape[1] % rshape[1] == 0, \
        'width must be divisible by {0}'.format(rshape[1])
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']

    a = tvm.placeholder(shape, dtype_n, 'a')
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a')

    k = tvm.reduce_axis((0, shape[1]), 'k0')
    re_shape = (shape[0],)
    re_buf = tvm.compute(
        re_shape,
        lambda i: tvm.sum(a_buf[i, k].astype(dtype_w), axis=k),
        're_buf')
    nnpu.utils.MarkScope(re_buf, 'acc')
    res_buf = nnpu.utils.CopyAccToBuf(re_buf, 'res')
    res_host, _ = nnpu.utils.CopyBufToH(res_buf, 'res')

    s = nnpu.create_schedule(res_host.op)
def test_gemm():
    # graph
    nn = 1024
    n = tvm.var('n')
    n = tvm.convert(nn)
    m = n
    l = n
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((m, l), name='B')
    k = tvm.reduce_axis((0, l), name='k')
    C = tvm.compute(
        (n, m),
        lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k),
        name='CC')
    # schedule
    s = tvm.create_schedule(C.op)
    xtile, ytile = 32, 32
    scale = 8
    num_thread = 8
    block_factor = scale * num_thread
    block_x = tvm.thread_axis("blockIdx.x")
    thread_x = tvm.thread_axis("threadIdx.x")
    block_y = tvm.thread_axis("blockIdx.y")
    thread_y = tvm.thread_axis("threadIdx.y")

    CC = s.cache_write(C, "local")
    AA = s.cache_read(A, "shared", [CC])
    BB = s.cache_read(B, "shared", [CC])
    by, yi = s[C].split(C.op.axis[0], factor=block_factor)
    bx, xi = s[C].split(C.op.axis[1], factor=block_factor)
    s[C].reorder(by, bx, yi, xi)
    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    ty, yi = s[C].split(yi, nparts=num_thread)
    tx, xi = s[C].split(xi, nparts=num_thread)
    s[C].reorder(ty, tx, yi, xi)
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    yo, xo = CC.op.axis
    s[CC].reorder(k, yo, xo)
    s[CC].compute_at(s[C], tx)
    s[AA].compute_at(s[CC], k)
    s[BB].compute_at(s[CC], k)
    s[AA].double_buffer()
    s[BB].double_buffer()
    ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread)
    tx, xi = s[AA].split(xi, nparts=num_thread)
    s[AA].bind(ty, thread_y)
    s[AA].bind(tx, thread_x)
    ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread)
    tx, xi = s[BB].split(xi, nparts=num_thread)
    s[BB].bind(ty, thread_y)
    s[BB].bind(tx, thread_x)

    # lowering test
    s = s.normalize()

    # one line to build the function.
    def check_device(device):
        if not tvm.module.enabled(device):
            print("skip because %s is not enabled.." % device)
            return
        f = tvm.build(s, [A, B, C], device)
        ctx = tvm.context(device, 0)
        # launch the kernel.
        n = nn
        m = n
        l = n
        a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
        b_np = np.random.uniform(size=(m, l)).astype(B.dtype)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
        ftimer = f.time_evaluator(f.entry_name, ctx, number=1)
        tcost = ftimer(a, b, c).mean
        print("%s: exec=%g sec/op" % (ctx, tcost))
        np.testing.assert_allclose(
            c.asnumpy(), np.dot(a_np, b_np.T), rtol=1e-5)

    check_device("nvptx -mcpu=sm_20")
    check_device("rocm")
    check_device("metal")
    check_device("opencl")
    check_device("cuda")
def verify_resize3d(batch, in_channel, in_depth, in_height, in_width,
                    out_depth, out_height, out_width, layout='NCDHW',
                    coordinate_transformation_mode="half_pixel", method="trilinear"):
    if layout == 'NCDHW':
        A = tvm.placeholder((batch, in_channel, in_depth, in_height, in_width),
                            name='A', dtype='float32')
        dtype = A.dtype
        out_shape = (batch, in_channel, out_depth, out_height, out_width)
        a_np = np.random.uniform(size=(batch, in_channel, in_depth, in_height,
                                       in_width)).astype(dtype)
    elif layout == 'NDHWC':
        A = tvm.placeholder((batch, in_depth, in_height, in_width, in_channel),
                            name='A', dtype='float32')
        dtype = A.dtype
        out_shape = (batch, out_depth, out_height, out_width, in_channel)
        a_np = np.random.uniform(size=(batch, in_depth, in_height, in_width,
                                       in_channel)).astype(dtype)
    else:
        raise NotImplementedError('Layout not supported {}'.format(layout))

    B = topi.image.resize3d(A, (out_depth, out_height, out_width), layout=layout,
                            coordinate_transformation_mode=coordinate_transformation_mode,
                            method=method)

    if method == "trilinear":
        b_np = topi.testing.trilinear_resize3d_python(
            a_np, (out_depth, out_height, out_width), layout,
            coordinate_transformation_mode)
    else:
        scale_d = out_depth / in_depth
        scale_h = out_height / in_height
        scale_w = out_width / in_width
        b_np = topi.testing.upsampling3d_python(a_np, (scale_d, scale_h, scale_w), layout)

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_injective(B)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx)
        f = tvm.build(s, [A, B], device)
        f(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3)

    for device in get_all_backend():
        check_device(device)
def gemm_int8(n, m, l): A = tvm.placeholder((n, l), name='A', dtype='int8') B = tvm.placeholder((m, l), name='B', dtype='int8') k = tvm.reduce_axis((0, l), name='k') C = tvm.compute((n, m), lambda i, j: tvm.sum(A[i, k].astype('int32') * B[j, k].astype( 'int32'), axis=k), name='C') cfg = autotvm.get_config() s = tvm.create_schedule(C.op) y, x = C.op.axis AA = s.cache_read(A, 'shared', [C]) BB = s.cache_read(B, 'shared', [C]) AL = s.cache_read(AA, 'local', [C]) BL = s.cache_read(BB, 'local', [C]) CC = s.cache_write(C, 'local') k = CC.op.reduce_axis[0] cfg.define_split('tile_k', cfg.axis(k), num_outputs=3, filter=lambda entity: entity.size[2] == 4 and \ entity.size[0] * 2 >= entity.size[1]) ko, kt, ki = cfg['tile_k'].apply(s, CC, k) s[CC].tensorize(ki, intrin_dp4a) block_x = tvm.thread_axis('blockIdx.x') block_y = tvm.thread_axis('blockIdx.y') thread_x = tvm.thread_axis('threadIdx.x') thread_y = tvm.thread_axis('threadIdx.y') def block_size_filter(entity): return entity.size[0] * 2 >= entity.size[1] * 2 and \ entity.size[1] <= 16 and entity.size[3] <= 4 cfg.define_split('tile_y', cfg.axis(y), num_outputs=4, filter=block_size_filter) cfg.define_split('tile_x', cfg.axis(x), num_outputs=4, filter=block_size_filter) by, tyz, ty, yi = cfg['tile_y'].apply(s, C, y) bx, txz, tx, xi = cfg['tile_x'].apply(s, C, x) s[C].bind(by, block_y) s[C].bind(bx, block_x) s[C].bind(tyz, tvm.thread_axis('vthread')) s[C].bind(txz, tvm.thread_axis('vthread')) s[C].bind(ty, thread_y) s[C].bind(tx, thread_x) s[C].reorder(by, bx, tyz, txz, ty, tx, yi, xi) s[CC].compute_at(s[C], tx) yo, xo = CC.op.axis s[CC].reorder(ko, kt, yo, xo, ki) s[CC].unroll(kt) for stage in [AL, BL]: s[stage].compute_at(s[CC], kt) _, xi = s[stage].split(stage.op.axis[1], factor=4) s[stage].vectorize(xi) s[stage].double_buffer() cfg.define_knob('storage_align', [16, 48]) for stage in [AA, BB]: s[stage].storage_align(s[stage].op.axis[0], cfg['storage_align'].val, 0) s[stage].compute_at(s[CC], ko) fused = s[stage].fuse(*s[stage].op.axis) ty, tx = s[stage].split(fused, nparts=cfg['tile_y'].size[2]) tx, xi = s[stage].split(tx, nparts=cfg['tile_x'].size[2]) _, xi = s[stage].split(xi, factor=16) s[stage].bind(ty, thread_y) s[stage].bind(tx, thread_x) s[stage].vectorize(xi) cfg.define_knob('auto_unroll_max_step', [512, 1500]) s[C].pragma(by, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) s[C].pragma(by, 'unroll_explicit', False) cfg.add_flop(n*m*l*2) return s, [A, B, C]
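A hedged sketch of how the gemm_int8 template above could be tuned with autotvm follows; it assumes the TVM 0.x interface where autotvm.task.create accepts the template callable directly, and that intrin_dp4a (the CUDA dp4a intrinsic referenced by the template) is in scope. Problem sizes and tuner settings are illustrative.

import logging
import sys
from tvm import autotvm

def tune_gemm_int8(n=2048, m=2048, l=2048, n_trial=1000):
    logging.basicConfig(level=logging.INFO, stream=sys.stdout)
    # wrap the simple template into an autotvm task for the CUDA target
    task = autotvm.task.create(gemm_int8, args=(n, m, l), target='cuda')
    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4))
    tuner = autotvm.tuner.XGBTuner(task)
    tuner.tune(n_trial=n_trial, measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file('gemm_int8.log')])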
def test_stmt_constructor(): v = tvm.var("aa") buffer_var = tvm.var("buf", dtype="handle") nop = tvm.stmt.Evaluate(1) x = tvm.stmt.LetStmt(v, 1, tvm.stmt.Evaluate(1)) assert isinstance(x, tvm.stmt.LetStmt) assert x.var == v assert x.value.value == 1 assert isinstance(x.body, tvm.stmt.Evaluate) x = tvm.stmt.AttrStmt(v == 1, "xx", 1, tvm.stmt.Evaluate(1)) assert isinstance(x, tvm.stmt.AttrStmt) assert x.value.value == 1 x = tvm.stmt.AssertStmt(tvm.const(1, "uint1"), tvm.convert("hellow"), nop) assert isinstance(x, tvm.stmt.AssertStmt) assert x.body == nop x = tvm.stmt.ProducerConsumer(None, True, nop) assert isinstance(x, tvm.stmt.ProducerConsumer) assert x.body == nop x = tvm.stmt.For(tvm.var("x"), 0, 10, 0, 0, nop) assert isinstance(x, tvm.stmt.For) assert x.min.value == 0 assert x.extent.value == 10 assert x.body == nop x = tvm.stmt.Store(buffer_var, 1, 10, tvm.const(1, "uint1")) assert isinstance(x, tvm.stmt.Store) assert x.buffer_var == buffer_var assert x.index.value == 10 assert x.value.value == 1 tensor = tvm.placeholder((), dtype="float32") x = tvm.stmt.Provide(tensor.op, 0, 10, []) assert isinstance(x, tvm.stmt.Provide) assert x.value_index == 0 assert x.value.value == 10 x = tvm.stmt.Allocate(buffer_var, "float32", [10], tvm.const(1, "uint1"), nop) assert isinstance(x, tvm.stmt.Allocate) assert x.dtype == "float32" assert x.buffer_var == buffer_var assert x.body == nop x = tvm.stmt.AttrStmt(buffer_var, "xyz", 1, nop) assert isinstance(x, tvm.stmt.AttrStmt) assert x.node == buffer_var assert x.attr_key == "xyz" assert x.body == nop x = tvm.stmt.Free(buffer_var) assert isinstance(x, tvm.stmt.Free) assert x.buffer_var == buffer_var x = tvm.stmt.Realize(None, 0, "float", [], tvm.const(1, "uint1"), nop) assert isinstance(x, tvm.stmt.Realize) assert x.body == nop x = tvm.stmt.IfThenElse(tvm.const(1, "uint1"), tvm.stmt.Evaluate(11), nop) assert isinstance(x, tvm.stmt.IfThenElse) assert x.then_case.value.value == 11 assert x.else_case == nop x = tvm.stmt.Prefetch(None, 1, "float32", []) assert isinstance(x, tvm.stmt.Prefetch) assert x.value_index == 1
def test_dwarf_debug_information(): nn = 1024 n = tvm.convert(nn) A = tvm.placeholder((n, ), name='A') B = tvm.placeholder((n, ), name='B') C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') s = tvm.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], factor=4) s[C].parallel(xo) s[C].vectorize(xi) def check_llvm_object(): if not tvm.module.enabled("llvm"): return if tvm.codegen.llvm_version_major() < 5: return if tvm.codegen.llvm_version_major() > 6: return # build two functions f2 = tvm.lower(s, [A, B, C], name="fadd1") f1 = tvm.lower(s, [A, B, C], name="fadd2") m = tvm.build([f1, f2], "llvm") temp = util.tempdir() o_path = temp.relpath("temp.o") m.save(o_path) import re import shutil import subprocess import sys # Try the dwarfdump utility (OS X) if shutil.which("dwarfdump"): output = subprocess.check_output(["dwarfdump", o_path]) assert re.search(r"""DW_AT_name\\t\("fadd1"\)""", str(output)) assert re.search(r"""DW_AT_name\\t\("fadd2"\)""", str(output)) # Try gobjdump (OS X) if shutil.which("gobjdump"): output = subprocess.check_output(["gobjdump", "--dwarf", o_path]) assert re.search(r"""DW_AT_name.*fadd1""", str(output)) assert re.search(r"""DW_AT_name.*fadd2""", str(output)) # Try objdump (Linux) - Darwin objdump has different DWARF syntax. if shutil.which("objdump") and sys.platform != 'darwin': output = subprocess.check_output(["objdump", "--dwarf", o_path]) assert re.search(r"""DW_AT_name.*fadd1""", str(output)) assert re.search(r"""DW_AT_name.*fadd2""", str(output)) def check_llvm_ir(): if not tvm.module.enabled("llvm"): return if tvm.codegen.llvm_version_major() < 5: return if tvm.codegen.llvm_version_major() > 6: return # build two functions f2 = tvm.lower(s, [A, B, C], name="fadd1") f1 = tvm.lower(s, [A, B, C], name="fadd2") m = tvm.build([f1, f2], target="llvm -target=aarch64-linux-gnu") ll = m.get_source("ll") # On non-Darwin OS, don't explicitly specify DWARF version. import re assert not re.search(r""""Dwarf Version""" "", ll) assert re.search(r"""llvm.dbg.value""", ll) # Try Darwin, require DWARF-2 m = tvm.build([f1, f2], target="llvm -target=x86_64-apple-darwin-macho") ll = m.get_source("ll") assert re.search(r"""i32 4, !"Dwarf Version", i32 2""", ll) assert re.search(r"""llvm.dbg.value""", ll) check_llvm_object() check_llvm_ir()
def test(): env = nnpu.get_env() nnpu.set_device(env) shape = (2, 2, 16) dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w'] a = tvm.placeholder(shape, dtype_w, 'a') sph = ScheduleProcHelper() a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph) k = tvm.reduce_axis((0, 2), 'k') add_buf = tvm.compute( (2, 16), lambda i, j: tvm.sum(a_buf[k, i, j], axis=k), 'add_buf') sph.MarkScope(add_buf) add_host, add_dram = nnpu.utils.CopyBufToH(add_buf, 'add', sph) k1 = tvm.reduce_axis((0, 2), 'k1') mul_buf = tvm.compute( (2, 16), lambda i, j: tvm.sum(a_buf[k1, i, j], axis=k1), 'mul_buf') sph.MarkScope(mul_buf) mul_host, mul_dram = nnpu.utils.CopyBufToH(mul_buf, 'mul', sph) s = tvm.create_schedule([add_host.op, mul_host.op]) sph.Transform(s) ko, ki = s[add_buf].split(add_buf.op.reduce_axis[0], factor=1) s[add_buf].reorder(ko, ki, *(s[add_buf].op.axis)) s[add_buf].tensorize(ki, env.intrins.get('MAddMerge', shape=shape, mode='w')) ko1, ki1 = s[mul_buf].split(mul_buf.op.reduce_axis[0], factor=1) s[mul_buf].reorder(ko1, ki1, *(s[mul_buf].op.axis)) s[mul_buf].tensorize(ki1, env.intrins.get('MMulMerge', shape=shape, mode='w')) print(nnpu.lower(s, [a, add_host, mul_host], simple_mode=True)) func = nnpu.build(s, [a, add_host, mul_host], 'nnpu', 'llvm', name='nnpu_func') #exit() ctx = tvm.nd.TVMContext(13, 0) a_np = np.random.randint(size=(2, 2, 16), dtype=a.dtype, low=-16, high=16) a_nd = tvm.nd.array(a_np, ctx) add_nd = tvm.nd.array(np.zeros((2, 16)).astype(add_host.dtype), ctx) mul_nd = tvm.nd.array(np.zeros((2, 16)).astype(mul_host.dtype), ctx) func(a_nd, add_nd, mul_nd) print('a = ') print(a_np) print('reduce sum row = ') print(add_nd.asnumpy()) print('ground truth is: ') gt = np.sum(a_np, axis=0) print(gt) np.testing.assert_allclose(add_nd.asnumpy(), gt) print('reduce mul row = ') print(mul_nd.asnumpy()) gt = np.multiply.reduce(a_np, axis=0, dtype=a.dtype) print(gt) np.testing.assert_allclose(mul_nd.asnumpy(), gt)
def test_llvm_add_pipeline(): nn = 1024 n = tvm.convert(nn) A = tvm.placeholder((n,), name='A') B = tvm.placeholder((n,), name='B') C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') s = tvm.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], factor=4) s[C].parallel(xo) s[C].vectorize(xi) def verify_elf(path, e_machine): with open(path, "rb") as fi: arr = fi.read(20) assert struct.unpack('ccc', arr[1:4]) == (b'E',b'L',b'F') endian = struct.unpack('b', arr[0x5:0x6])[0] endian = '<' if endian == 1 else '>' assert struct.unpack(endian + 'h', arr[0x12:0x14])[0] == e_machine def build_i386(): if not tvm.runtime.enabled("llvm"): print("Skip because llvm is not enabled..") return temp = util.tempdir() target = "llvm -target=i386-pc-linux-gnu" f = tvm.build(s, [A, B, C], target) path = temp.relpath("myadd.o") f.save(path) verify_elf(path, 0x03) def build_arm(): target = "llvm -target=armv7-none-linux-gnueabihf" if not tvm.runtime.enabled(target): print("Skip because %s is not enabled.." % target) return temp = util.tempdir() f = tvm.build(s, [A, B, C], target) path = temp.relpath("myadd.o") f.save(path) verify_elf(path, 0x28) asm_path = temp.relpath("myadd.asm") f.save(asm_path) # Do a RPC verification, launch kernel on Arm Board if available. host = os.environ.get('TVM_RPC_ARM_HOST', None) remote = None if host: port = int(os.environ['TVM_RPC_ARM_PORT']) try: remote = rpc.connect(host, port) except tvm.TVMError as e: pass if remote: remote.upload(path) farm = remote.load_module("myadd.o") ctx = remote.cpu(0) n = nn a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) farm(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), a.asnumpy() + b.asnumpy()) print("Verification finish on remote..") build_i386() build_arm()
def verify_conv2d_NCHWc_int8(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False): print("Workload: (%d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding)) in_height = in_width = in_size A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A', dtype='int8') W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W', dtype='int8') bias = tvm.placeholder((num_filter // oc_block_factor, 1, 1, oc_block_factor), name='bias', dtype='int8') a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) bias_shape = get_const_tuple(bias.shape) dtype = A.dtype @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") def get_ref_data(): a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype) w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype) b_np = np.random.uniform(size=bias_shape).astype(dtype) dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) c_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype(dtype) # convert to NCHWc _, _, out_height, out_width = c_np.shape c_np = c_np.reshape((batch, num_filter // oc_block_factor, oc_block_factor, \ out_height, out_width)).transpose(0, 1, 3, 4, 2) if add_bias: b_np = np.random.uniform(size=bias_shape).astype(dtype) c_np += b_np if add_relu: c_np = np.maximum(c_np, 0) return a_np, w_np, b_np, c_np a_np, w_np, b_np, c_np = get_ref_data() def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version): print("Skip because int8 intrinsics are not available") return print("Running on target: %s" % device) with tvm.target.create(device): dW = topi.nn.dilate(W, (1, 1, dilation, dilation)) C = topi.nn.conv2d(A, dW, (stride, stride), (padding, padding), layout='NCHW', out_dtype=dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = topi.generic.schedule_conv2d_nchw([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) if add_bias: tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) func(a, w, b, c) else: func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) for device in ["cuda"]: check_device(device)
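The verifier above reads oc_block_factor from module scope; the value below (4, matching common int8 GPU schedules) and the example workload are assumptions added for illustration.

oc_block_factor = 4

def test_conv2d_nchw_int8():
    # a ResNet-style 3x3 workload: batch 1, 64 -> 64 channels, 56x56, stride 1, pad 1
    verify_conv2d_NCHWc_int8(1, 64, 56, 64, 3, 1, 1)
    verify_conv2d_NCHWc_int8(1, 64, 56, 64, 3, 1, 1, add_relu=True)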
def tensor_core_matmul(warp_tile_m=16, m=64, n=32, l=96): A = tvm.placeholder((n, l), name='A', dtype='float16') B = tvm.placeholder((l, m), name='B', dtype='float16') k = tvm.reduce_axis((0, l), name='k') C = tvm.compute((n, m), lambda i, j: tvm.sum( A[i, k].astype('float32') * B[k, j].astype('float32'), axis=k)) s = tvm.create_schedule(C.op) y, x = s[C].op.axis k = s[C].op.reduce_axis[0] AA = s.cache_read(A, "shared", [C]) AL = s.cache_read(AA, "local", [C]) BB = s.cache_read(B, "shared", [C]) BL = s.cache_read(BB, "local", [C]) CL = s.cache_write(C, "local") bx = 4 by = 32 step_k = 8 v = 4 TX = 8 TY = 1 tile_x = bx * TX tile_y = by * TY WX = min(warp_tile_m, tile_x) tile_k = 16 vthread = 1 yo, ty = s[C].split(y, tile_y * vthread) vy, ty = s[C].split(ty, tile_y) ty, yi = s[C].split(ty, TY) xo, xi = s[C].split(x, tile_x) tz, xi = s[C].split(xi, WX) tx, xi = s[C].split(xi, TX) ko, ki = s[CL].split(k, step_k * tile_k) kl, ki = s[CL].split(ki, tile_k) s[C].reorder(yo, xo, tz, ty, tx, yi, xi) s[C].bind(yo, tvm.thread_axis("blockIdx.y")) s[C].bind(xo, tvm.thread_axis("blockIdx.x")) s[C].bind(ty, tvm.thread_axis("threadIdx.y")) s[C].bind(tz, tvm.thread_axis("threadIdx.z")) s[C].bind(tx, tvm.thread_axis("threadIdx.x")) s[C].bind(vy, tvm.thread_axis((0, vthread), "vthread", name="vy")) s[CL].compute_at(s[C], tx) yo, xo = CL.op.axis s[CL].reorder(ko, kl, ki, yo, xo) s[AA].compute_at(s[CL], ko) xo, xi = s[AA].split(s[AA].op.axis[1], factor=bx * v) tz, tx = s[AA].split(xi, factor=(WX // TX) * v) tx, vec = s[AA].split(tx, factor=v) fused = s[AA].fuse(s[AA].op.axis[0], xo) _, ty = s[AA].split(fused, factor=by) s[AA].bind(ty, tvm.thread_axis("threadIdx.y")) s[AA].bind(tz, tvm.thread_axis("threadIdx.z")) s[AA].bind(tx, tvm.thread_axis("threadIdx.x")) s[AA].vectorize(vec) s[BB].compute_at(s[CL], ko) xo, xi = s[BB].split(s[BB].op.axis[1], factor=bx * v) tz, tx = s[BB].split(xi, factor=(WX // TX) * v) tx, vec = s[BB].split(tx, factor=v) fused = s[BB].fuse(s[BB].op.axis[0], xo) _, ty = s[BB].split(fused, factor=by) s[BB].bind(ty, tvm.thread_axis("threadIdx.y")) s[BB].bind(tz, tvm.thread_axis("threadIdx.z")) s[BB].bind(tx, tvm.thread_axis("threadIdx.x")) s[BB].vectorize(vec) s[AL].compute_at(s[CL], kl) s[BL].compute_at(s[CL], kl) s[CL].pragma(ko, 'tensor_core') func = tvm.build(s, [A, B, C], 'cuda') ctx = tvm.gpu(0) a_np = np.random.uniform(size=(n, l)).astype(A.dtype) b_np = np.random.uniform(size=(l, m)).astype(B.dtype) c_np = np.zeros((n, m), dtype=np.float32) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) func(a, b, c) evaluator = func.time_evaluator(func.entry_name, ctx, number=3) print('gemm m=%d n=%d k=%d: %f ms' % (m, n, l, evaluator(a, b, c).mean * 1e3)) c_np = np.dot(a_np, b_np) np.testing.assert_allclose(c_np, c.asnumpy(), rtol=1e-3)
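A small driver for tensor_core_matmul, guarding on CUDA availability and tensor core support via tvm.contrib.nvcc; the warp tile sizes exercised are illustrative.

import tvm
from tvm.contrib import nvcc

def test_tensor_core_matmul():
    if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
        print("skip because gpu does not support tensor cores")
        return
    for warp_tile_m in (16, 8, 32):
        tensor_core_matmul(warp_tile_m)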
def test_rpc_remote_module(): if not tvm.module.enabled("rpc"): return server = rpc.Server("localhost") remote = rpc.connect(server.host, server.port) # graph n = tvm.convert(1024) A = tvm.placeholder((n, ), name='A') B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') s = tvm.create_schedule(B.op) def check_remote(): if not tvm.module.enabled("llvm"): print("Skip because llvm is not enabled") return temp = util.tempdir() ctx = remote.cpu(0) f = tvm.build(s, [A, B], "llvm", name="myadd") path_dso = temp.relpath("dev_lib.so") f.export_library(path_dso) remote.upload(path_dso) f1 = remote.load_module("dev_lib.so") a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) time_f = f1.time_evaluator(f1.entry_name, remote.cpu(0), number=10) cost = time_f(a, b).mean print('%g secs/op' % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) def check_remote_link_cl(): """Test function to run remote code such as cl This is not enabled because there is forking issue of TVM runtime when server launches after OpenCL runtime initializes. We leave it as an example on how to do rpc when we want to do linking on remote. """ if not tvm.module.enabled("llvm"): print("Skip because llvm is not enabled") return if not tvm.module.enabled("opencl"): print("Skip because opencl is not enabled") return temp = util.tempdir() ctx = remote.cl(0) s = tvm.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=32) s[B].bind(xo, tvm.thread_axis("blockIdx.x")) s[B].bind(xi, tvm.thread_axis("threadIdx.x")) f = tvm.build(s, [A, B], "opencl", target_host="llvm", name="myadd") # Option 1: save modules separately and rely on remote compiler path_o = temp.relpath("myadd.o") path_cl = temp.relpath("myadd.cl") path_json = temp.relpath("myadd.tvm_meta.json") f.save(path_o) f.imported_modules[0].save(path_cl) remote.upload(path_o) remote.upload(path_cl) # upload meta data remote.upload(path_json) fhost = remote.load_module("myadd.o") fdev = remote.load_module("myadd.cl") fhost.import_module(fdev) a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) fhost(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) # Option 2: export library as a tar ball then handled by remote compiler path_tar = temp.relpath("myadd.tar") f.export_library(path_tar) remote.upload(path_tar) fhost = remote.load_module("myadd.tar") a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) fhost(a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) check_remote()
def test_convolution_inference(): BATCH = 8 IH = 48 IW = 48 IC = 16 OC = 16 K = 3 PAD = 1 STRIDE = 1 OH = (IH + 2 * PAD - K) + 1 OW = (IW + 2 * PAD - K) + 1 dshape = (BATCH, IC, IH, IW) kshape = (OC, IC, K, K) bshape = (OC, ) oshape = (BATCH, OC, OH, OW) data = tvm.placeholder(dshape, name='data') kernel = tvm.placeholder(kshape, name='kernel') bias = tvm.placeholder(bshape, name='bias') def verify(target="llvm", algorithm=nnpack.ConvolutionAlgorithm.AUTO, with_bias=True): if not tvm.module.enabled(target): pytest.skip("%s is not enabled..." % target) if not tvm.get_global_func( "tvm.contrib.nnpack.fully_connected_inference", True): pytest.skip("extern function is not available") if not nnpack.is_available(): pytest.skip("nnpack is not available") ctx = tvm.cpu(0) output = nnpack.convolution_inference(data, kernel, bias if with_bias else None, [PAD, PAD, PAD, PAD], [STRIDE, STRIDE], algorithm=algorithm) s = tvm.create_schedule(output.op) f = tvm.build(s, [data, kernel, bias, output], target) na = np.random.uniform(size=dshape).astype(data.dtype) nb = np.random.uniform(size=kshape).astype(kernel.dtype) nc = np.zeros(bshape, dtype=bias.dtype) ta = tvm.nd.array(na, ctx) tb = tvm.nd.array(nb, ctx) tc = tvm.nd.array(nc, ctx) td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), ctx) f(ta, tb, tc, td) nd = np_conv(np.reshape(na, (BATCH, IC, IH, IW)), nb, PAD, STRIDE) + nc.reshape(1, bshape[0], 1, 1) tvm.testing.assert_allclose(td.asnumpy(), nd.reshape(BATCH, IC, IH, IW), rtol=1e-5) for algorithm in [ nnpack.ConvolutionAlgorithm.AUTO, nnpack.ConvolutionAlgorithm.FFT_8x8, nnpack.ConvolutionAlgorithm.FFT_16x16, nnpack.ConvolutionAlgorithm.WT_8x8, nnpack.ConvolutionAlgorithm.IMPLICIT_GEMM, nnpack.ConvolutionAlgorithm.WT_8x8_FP16, ]: for with_bias in [True, False]: verify(algorithm=algorithm, with_bias=with_bias)
def verify_depthwise_conv2d_back_input(batch, in_channel, in_h, channel_multiplier, filter_h, stride_h, padding_h): in_w = in_h filter_channel = in_channel filter_w = filter_h stride_w = stride_h padding_w = padding_h out_h = np.int((in_h + 2 * padding_h - filter_h) / stride_h + 1) out_w = np.int((in_w + 2 * padding_w - filter_w) / stride_w + 1) out_channel = in_channel * channel_multiplier ishape = [batch, in_h, in_w, in_channel] oshape = [batch, out_h, out_w, out_channel] # placeholder Out_grad = tvm.placeholder(oshape, name='Out_grad') Filter = tvm.placeholder( (filter_h, filter_w, filter_channel, channel_multiplier)) # declare In_grad = topi.nn.depthwise_conv2d_backward_input_nhwc( Filter, Out_grad, oshape, ishape, stride=[stride_h, stride_w], padding=[padding_h, padding_w]) # schedule schedule = schedule_depthwise_conv2d_backward_input_nhwc(In_grad) def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) # build the kernel f = tvm.build(schedule, [Filter, Out_grad, In_grad], device) # prepare pod type for test data closure dtype = Out_grad.dtype out_grad_shape = get_const_tuple(Out_grad.shape) filter_shape = get_const_tuple(Filter.shape) # use memoize to pickle the test data for next time use @memoize("topi.tests.test_topi_depthwise_conv2d_backward_input.nhwc") def get_ref_data(): out_grad_np = np.random.uniform(size=out_grad_shape).astype(dtype) filter_np = np.random.uniform(size=filter_shape).astype(dtype) dilated_out_grad_np = topi.testing.dilate_python( out_grad_np, [1, stride_h, stride_w, 1]) # padding params in forward propagation fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple( [padding_h, padding_w], (filter_h, filter_w)) # padding params in backward propagation bpad_top = filter_h - 1 - fpad_top bpad_bottom = (filter_h - 1 - fpad_bottom) + (stride_h - 1) bpad_left = filter_w - 1 - fpad_left bpad_right = (filter_w - 1 - fpad_right) + (stride_w - 1) padded_out_grad = np.zeros( (batch, dilated_out_grad_np.shape[1] + bpad_top + bpad_bottom, dilated_out_grad_np.shape[2] + bpad_left + bpad_right, out_channel)) padded_out_grad[:, bpad_top:dilated_out_grad_np.shape[1] + bpad_top, bpad_left:dilated_out_grad_np.shape[2] + bpad_left, :] = dilated_out_grad_np in_grad_np = np.zeros((batch, in_h, in_w, in_channel)) for b in range(batch): for c in range(in_channel): for m in range(channel_multiplier): in_grad_np[b, :, :, c] += signal.convolve2d(padded_out_grad[b, :, :, c*channel_multiplier+m], \ filter_np[:, :, c, m], mode='valid')[0:in_h, 0:in_w] return (out_grad_np, filter_np, in_grad_np) (out_grad_np, filter_np, in_grad_np) = get_ref_data() out_grad_tvm = tvm.nd.array(out_grad_np, ctx) filter_tvm = tvm.nd.array(filter_np, ctx) in_grad_tvm = tvm.nd.array(np.zeros(shape=ishape, dtype=dtype), ctx) # launch the kernel timer = f.time_evaluator(f.entry_name, ctx, number=1) tcost = timer(filter_tvm, out_grad_tvm, in_grad_tvm).mean np.testing.assert_allclose(in_grad_np, in_grad_tvm.asnumpy(), rtol=1e-5) check_device("opencl") check_device("cuda") check_device("metal") check_device("rocm") check_device("vulkan")
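Illustrative workloads for the backward-input verifier above; the shapes are assumptions rather than the original test cases.

def test_topi_depthwise_conv2d_backward_input_nhwc():
    # batch, in_channel, in_h, channel_multiplier, filter_h, stride_h, padding_h
    verify_depthwise_conv2d_back_input(16, 32, 56, 1, 3, 1, 1)
    verify_depthwise_conv2d_back_input(16, 32, 56, 2, 3, 2, 1)
    verify_depthwise_conv2d_back_input(4, 64, 28, 1, 5, 1, 2)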
def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False): print("Workload: (%d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding)) in_height = in_width = in_size A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') bias = tvm.placeholder((num_filter, 1, 1), name='bias') a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) bias_shape = get_const_tuple(bias.shape) dtype = A.dtype @memoize("topi.tests.test_topi_conv2d_nchw.verify_conv2d_nchw") def get_ref_data(): a_np = np.random.uniform(size=a_shape).astype(dtype) w_np = np.random.uniform(size=w_shape).astype(dtype) b_np = np.random.uniform(size=bias_shape).astype(dtype) dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) c_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding) if add_bias: b_np = np.random.uniform(size=bias_shape).astype(dtype) c_np += b_np if add_relu: c_np = np.maximum(c_np, 0) return a_np, w_np, b_np, c_np a_np, w_np, b_np, c_np = get_ref_data() def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): dW = topi.nn.dilate(W, (1, 1, dilation, dilation)) C = topi.nn.conv2d(A, dW, (stride, stride), (padding, padding), layout='NCHW', out_dtype=dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = topi.generic.schedule_conv2d_nchw([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) if add_bias: func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) func(a, w, b, c) else: func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) func(a, w, c) np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) for device in get_all_backend(): check_device(device)
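A short driver for verify_conv2d_nchw; the ResNet-like workloads below are illustrative.

def test_conv2d_nchw():
    verify_conv2d_nchw(1, 3, 224, 64, 7, 2, 3)
    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_bias=True, add_relu=True)
    verify_conv2d_nchw(1, 64, 56, 128, 1, 2, 0)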
def col2im_manual_schedule(shape, kernel, stride, pad, dtype, output_H_W, polyhedral=True, attrs=None): """ Col2im operation with manual schedule. Args: shape (Union[list, tuple]): seven int numbers for the input's image size. kernel (Union[list, tuple]): two int numbers for the sliding window's size. stride (Union[list, tuple]): two int numbers for the sliding window's stride. pad (Union[list, tuple]): four int numbers for padding's sizes: top, bottom, left, and right. dtype (str): parameters' type. output_H_W (Union[list, tuple]): two int numbers for the output's height and width. polyhedral (bool): If True, use auto-schedule; otherwise use the manual schedule. Default value is True. attrs (dict): Specifies parameters used in the manual schedule. Returns: The akg module built for the col2im operation. """ N, C1, KH, KW, OH, OW, C0 = shape H, W = output_H_W output_shape = (N, C1, H, W, C0) kernel_h, kernel_w = kernel stride_h, stride_w = stride pad_t, pad_b, pad_l, pad_r = pad assert H == (OH - 1) * stride_h + kernel_h - (pad_t + pad_b), "Height of input and output do not match" assert W == (OW - 1) * stride_w + kernel_w - (pad_l + pad_r), "Width of input and output do not match" col2im = intrin_col2im(shape, output_shape, kernel, stride, pad, dtype) # tensor for the input data data = tvm.placeholder(shape, dtype, name="input_data") # the compute below only provides a placeholder access pattern; the actual col2im computation is supplied by the intrinsic during tensorize res = tvm.compute( output_shape, lambda b, c1, h, w, c0: data(b, c1, h % KH, w % KW, h % OH, w % OW, c0), name="col2im_intrinsic" ) # schedule for the col2im operation s = tvm.create_schedule([res.op]) res_ub = s.cache_write(res, "local.UB") data_ub = s.cache_read(data, "local.UB", [res_ub]) b, c1, h, w, c0 = res.op.axis s[data_ub].compute_at(s[res], c1) s[res_ub].compute_at(s[res], c1) s[res_ub].tensorize(res_ub.op.axis[0], col2im) with akg.build_config(add_lower_pass=utils.debug_mode(0), dump_pass_ir=True): mod = akg.build(s, [data, res], "cce", name="col2im_manual_schedule", attrs=attrs, polyhedral=polyhedral) source_code = mod.imported_modules[0].get_source() kernel_name = "col2im_manual_schedule" utils.create_code(kernel_name, "./", source_code) return mod
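A hypothetical invocation of col2im_manual_schedule; it assumes intrin_col2im and an AKG/CCE build environment are available, and the NC1HWC0 shapes (C0 = 16, float16) are chosen only to satisfy the size assertions in the function.

# shape = (N, C1, KH, KW, OH, OW, C0); with kernel 3, stride 2, pad (1, 1, 1, 1)
# the asserts require H = (30 - 1) * 2 + 3 - 2 = 59, and likewise W = 59.
mod = col2im_manual_schedule(
    shape=(1, 1, 3, 3, 30, 30, 16),
    kernel=(3, 3),
    stride=(2, 2),
    pad=(1, 1, 1, 1),
    dtype="float16",
    output_H_W=(59, 59),
    polyhedral=False)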
""" from __future__ import absolute_import, print_function import tvm import numpy as np ###################################################################### # Direct Declare Extern Math Call # ------------------------------- # The most straight-forward way to call target specific function is via # extern function call construct in tvm. # In th following example, we use :any:`tvm.call_pure_extern` to call # :code:`__expf` function, which is only available under CUDA. # n = tvm.var("n") A = tvm.placeholder((n, ), name='A') B = tvm.compute(A.shape, lambda i: tvm.call_pure_extern("float32", "__expf", A[i]), name="B") s = tvm.create_schedule(B.op) num_thread = 64 bx, tx = s[B].split(B.op.axis[0], factor=num_thread) s[B].bind(bx, tvm.thread_axis("blockIdx.x")) s[B].bind(tx, tvm.thread_axis("threadIdx.x")) f = tvm.build(s, [A, B], "cuda", name="myexp") print(f.imported_modules[0].get_source()) ###################################################################### # Unified Intrinsic Call # ---------------------- # The above code verifies that direct external call can be used to
def test(): env = nnpu.get_env() nnpu.set_device(env) shape = (2, 16) a_host = tvm.placeholder(shape, env.cfg['dtype_n'], 'a_host') print('a host ' + str(a_host)) a = tvm.compute(shape, lambda *i: a_host(*i), name='a') a_buf = tvm.compute(shape, lambda *i: a(*i), name='a_buf') b_buf = tvm.compute( shape, lambda i, j: tvm.log(a_buf[i, j].astype(env.cfg['dtype_w'])), name='b_buf') b = tvm.compute(shape, lambda *i: b_buf(*i), name='b') b_host = tvm.compute(shape, lambda *i: b(*i), name='b_host') s = tvm.create_schedule(b_host.op) # mark variable scopes s[a].set_scope(env.dram_scope) s[b].set_scope(env.dram_scope) s[a_buf].set_scope(env.uni_scratchpad_scope) s[b_buf].set_scope(env.uni_scratchpad_scope) #print # (dir(s[b].op.body)) # mark compiler pragmas s[a].pragma(s[a].op.axis[0], env.dma_copy_pragma) s[b_host].pragma(s[b_host].op.axis[0], env.dma_copy_pragma) s[a_buf].pragma(s[a_buf].op.axis[0], env.scratchpad_ls) s[b].pragma(s[b].op.axis[0], env.scratchpad_ls) s[a_buf].compute_at(s[b_buf], b_buf.op.axis[0]) # tensorize s[b_buf].tensorize(s[b_buf].op.axis[1], env.intrins.get('VLOG', mode='inc')) # build print(tvm.lower(s, [a_host, b_host], simple_mode=True)) print(nnpu.lower(s, [a_host, b_host], simple_mode=True)) #exit() func = nnpu.build(s, [a_host, b_host], 'nnpu', 'llvm', name='nnpu_log') print('function built: ') #print(func.get_source()) # prepare data ctx = tvm.nd.TVMContext(13, 0) #??? print('i want to know:') print(ctx.exist) a_np = np.random.randint(size=shape, dtype=a_host.dtype, low=1, high=20) a_nd = tvm.nd.array(a_np, ctx) b_nd = tvm.nd.array(np.zeros(shape).astype(b_host.dtype), ctx) # run func(a_nd, b_nd) print('run finished') b_np = b_nd.asnumpy() print('a=') print(a_np) print('b=') print(b_np) print('ground truth =') gt = np.log(a_np, dtype=b_host.dtype) print(gt) np.testing.assert_allclose(b_np, gt)
def _alter_conv2d_layout(attrs, inputs, tinfo, F): import nnvm.symbol as sym copy_inputs = [s for s in inputs] new_attrs = {k: attrs[k] for k in attrs.keys()} data, kernel = tinfo[0], tinfo[1] batch_size, in_channel, height, width = get_const_tuple(data.shape) groups = attrs.get_int("groups") out_channel = attrs.get_int("channels") if F == sym else attrs.get_int( "channels").value padding = attrs.get_int_tuple("padding") strides = attrs.get_int_tuple("strides") dilation = attrs.get_int_tuple("dilation") out_dtype = attrs["out_dtype"] layout_name = 'layout' if F == sym else 'data_layout' layout = attrs[layout_name] kh, kw = attrs.get_int_tuple("kernel_size") dtype = data.dtype out_dtype = dtype if out_dtype in ("same", "") else out_dtype is_depthwise = groups == in_channel and groups == out_channel # only optimize for NCHW if layout != 'NCHW': return None if groups != 1 and not is_depthwise: return None dispatch_ctx = autotvm.task.DispatchContext.current target = tvm.target.current_target() # query schedule and fallback if necessary workload = autotvm.task.args_to_workload( [data, kernel, strides, padding, dilation, out_dtype], depthwise_conv2d_nchw) \ if is_depthwise else \ autotvm.task.args_to_workload( [data, kernel, strides, padding, dilation, layout, out_dtype], conv2d) cfg = dispatch_ctx.query(target, workload) if cfg.is_fallback: _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise) ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] new_attrs[layout_name] = 'NCHW%dc' % ic_bn new_attrs['out_layout'] = 'NCHW%dc' % oc_bn new_data = tvm.placeholder( (batch_size, in_channel // ic_bn, height, width, ic_bn), dtype=data.dtype) if is_depthwise: new_attrs['kernel_layout'] = 'OIHW1i%do' % oc_bn # Store altered operator's config new_kernel = tvm.placeholder( (out_channel // oc_bn, 1, kh, kw, 1, oc_bn), dtype=kernel.dtype) new_workload = autotvm.task.args_to_workload([ new_data, new_kernel, strides, padding, dilation, new_attrs[layout_name], new_attrs['out_layout'], out_dtype ], depthwise_conv2d_NCHWc) else: out_channel, _, kh, kw = get_const_tuple(kernel.shape) # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc) new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn) # Store altered operator's config new_kernel = tvm.placeholder( (out_channel // oc_bn, in_channel // ic_bn, kh, kw, ic_bn, oc_bn), dtype=kernel.dtype) new_workload = autotvm.task.args_to_workload([ new_data, new_kernel, strides, padding, dilation, new_attrs[layout_name], new_attrs['out_layout'], out_dtype ], conv2d_NCHWc) dispatch_ctx.update(target, new_workload, cfg) if is_depthwise: if F == sym: logging.warning( "Use native layout for depthwise convolution on NNVM.") return None return F.nn.contrib_depthwise_conv2d_nchwc(*copy_inputs, **new_attrs) else: if F == sym: return F.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs) return F.nn.contrib_conv2d_nchwc(*copy_inputs, **new_attrs)
def _declaration_conv_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, out_layout, out_dtype): # layout and out_layout are not used here, # we keep them for debug convenience when dumping autotvm workload HPAD, WPAD = padding if isinstance(padding, (tuple, list)) else (padding, padding) HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) assert (dh, dw) == (1, 1), "Does not support dilation" n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) in_channel = ic_chunk * ic_bn if data.dtype == 'uint8': oc_chunk, _, kernel_height, kernel_width, _, oc_bn, _ = get_const_tuple( kernel.shape) else: oc_chunk, _, kernel_height, kernel_width, _, oc_bn = get_const_tuple( kernel.shape) num_filter = oc_chunk * oc_bn if cfg.is_fallback: _get_default_config( cfg, tvm.placeholder((n, in_channel, ih, iw), dtype=data.dtype), tvm.placeholder( (num_filter, in_channel, kernel_height, kernel_width), dtype=kernel.dtype), strides, padding, out_dtype) # output shape out_height = (ih + 2 * HPAD - kernel_height) // HSTR + 1 out_width = (iw + 2 * WPAD - kernel_width) // WSTR + 1 oshape = (n, oc_chunk, out_height, out_width, oc_bn) # DOPAD DOPAD = (HPAD != 0 or WPAD != 0) if DOPAD: data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad") else: data_pad = data ic = tvm.reduce_axis((0, in_channel), name='ic') kh = tvm.reduce_axis((0, kernel_height), name='kh') kw = tvm.reduce_axis((0, kernel_width), name='kw') if data.dtype == 'uint8': assert out_dtype == "int32", \ "INT8 convolution requires input dtype = uint8 and output dtype=int32" # Intel performs dot product of 2 "4" Int8 values # Current implementation requires ic_bn to be a multiple of 4 n_elems = 4 assert ic_bn % n_elems == 0 ic_outer = tvm.reduce_axis((0, in_channel // ic_bn), name='ic_outer') ic_f_inner = tvm.reduce_axis((0, ic_bn // n_elems), name='ic_f_inner') ic_s_inner = tvm.reduce_axis((0, n_elems), name='ic_s_inner') return tvm.compute( oshape, lambda n, oc_chunk, oh, ow, oc_block: tvm.sum( data_pad[n, ic_outer, oh * HSTR + kh, ow * WSTR + kw, ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) * kernel[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner].astype(out_dtype), axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner]), name='conv2d_NCHWc_int8', tag="conv2d_NCHWc_int8") # else: fp implementation return tvm.compute( oshape, lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(data_pad[ n, ic // ic_bn, oh * HSTR + kh, ow * WSTR + kw, ic % ic_bn].astype( out_dtype) * kernel[oc_chunk, ic // ic_bn, kh, kw, ic % ic_bn, oc_block], axis=[ic, kh, kw]), name='conv2d_NCHWc', tag="conv2d_NCHWc")
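A small worked example of the packed tensors the NCHWc compute above expects, assuming ic_bn = oc_bn = 16; on the uint8/int32 path the kernel carries an extra innermost axis of n_elems = 4 values that feed one dot-product instruction. The concrete sizes are illustrative only.

import tvm

ic_bn, oc_bn, n_elems = 16, 16, 4
# fp32 path: data is laid out as NCHW16c, kernel as OIHW16i16o
data_fp = tvm.placeholder((1, 64 // ic_bn, 56, 56, ic_bn),
                          dtype='float32', name='data_fp')
kernel_fp = tvm.placeholder((64 // oc_bn, 64 // ic_bn, 3, 3, ic_bn, oc_bn),
                            dtype='float32', name='kernel_fp')
# uint8 path: the kernel's input-channel block is split as (ic_bn // n_elems, ..., n_elems)
data_q = tvm.placeholder((1, 64 // ic_bn, 56, 56, ic_bn),
                         dtype='uint8', name='data_q')
kernel_q = tvm.placeholder((64 // oc_bn, 64 // ic_bn, 3, 3,
                            ic_bn // n_elems, oc_bn, n_elems),
                           dtype='int8', name='kernel_q')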
def test_ewise(): m = tvm.var('m') l = tvm.var('l') A = tvm.placeholder((m, l), name='A') shape = (20, 3) def test_apply(func, name, f_numpy, low, high, check_round=False, skip_name_check=False): B = func(A) assert tuple(B.shape) == tuple(A.shape) if not skip_name_check: assert B.op.body[0].name == name a_np = np.random.uniform(low=low, high=high, size=shape).astype( A.dtype) * 10 # avoid round check too close to boundary if check_round: a_np += ((np.fmod(a_np, 1) - 0.5) < 1e-6) * 1e-5 b_np = f_numpy(a_np) def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): s = topi.generic.schedule_injective(B) foo = tvm.build(s, [A, B], device, name=name) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros_like(b_np), ctx) foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5) for device in [ 'cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'llvm', 'nvptx', 'sdaccel', 'aocl_sw_emu' ]: check_device(device) test_apply(topi.floor, "floor", np.floor, -100, 100) test_apply(topi.ceil, "ceil", np.ceil, -100, 100) test_apply(topi.sign, "sign", np.sign, -100, 100, skip_name_check=True) test_apply(topi.trunc, "trunc", np.trunc, -100, 100) test_apply(topi.abs, "fabs", np.abs, -100, 100) test_apply(topi.round, "round", np.round, -100, 100, check_round=True) test_apply(topi.exp, "exp", np.exp, -1, 1) test_apply(topi.tanh, "tanh", np.tanh, -10, 10) test_apply(topi.sigmoid, "sigmoid", lambda x: 1 / (1 + np.exp(-x)), -1, 1) test_apply(topi.log, "log", np.log, 0, 100) test_apply(topi.sqrt, "sqrt", np.sqrt, 0, 100) test_apply(topi.rsqrt, "rsqrt", lambda x: np.ones_like(x) / np.sqrt(x), 0, 100, skip_name_check=True)
def test_bind(): if not tvm.gpu(0).exist: print('[Warning] No GPU found! Skip bind test!') return @script def vec_add(a, b): c = output_tensor((1000, ), 'float32') for tx in bind('threadIdx.x', 1000): c[tx] = a[tx] + b[tx] return c a = tvm.placeholder((1000, ), dtype='float32', name='a') b = tvm.placeholder((1000, ), dtype='float32', name='b') func, ins, outs = run_and_check(vec_add, [a, b], target='cuda') run_and_check(func, ins, outs=outs, target='cuda') @script def raw(a, b): c = output_tensor((1000, ), 'float32') for i in range(1000): c[i] = a[i] + b[i] return c c = raw(a, b) sch = tvm.create_schedule(c.op) x = tvm.thread_axis('threadIdx.x') sch[c].bind(c.op.axis[0], x) func, ins, outs = run_and_check(raw, [a, b], sch=sch, outs=[c], target='cuda') run_and_check(func, ins, outs=outs, target='cuda') @tvm.hybrid.script def foo(a): c = output_tensor((a.shape[0], ), a.dtype) total = allocate((1, ), a.dtype, 'local') len_i = a.shape[0] len_j = a.shape[1] for i in bind('threadIdx.x', len_i): total[0] = 0. for k in const_range(len_j): total[0] += a[i, k] c[i] = total[0] return c a = tvm.placeholder((8, 4), 'float32') c = foo(a) s = tvm.create_schedule(c.op) ir = tvm.lower(s, [a, c], simple_mode=True) assert not isinstance(ir, tvm.stmt.AttrStmt) func, ins, outs = run_and_check(foo, [a], target='cuda') run_and_check(func, ins, outs=outs, target='cuda') @tvm.hybrid.script def max_threads(a): b = output_tensor(a.shape, a.dtype) n = a.shape[0] m = max_num_threads(True) for i in bind('threadIdx.x', m): for j in bind('blockIdx.x', ceil_div(n, m)): if i * m + j < n: b[i * m + j] = a[i * m + j] + a[i * m + j] return b a = tvm.placeholder((10000, ), 'float32') with tvm.target.create('cuda'): func, ins, outs = run_and_check(max_threads, [a], target='cuda') run_and_check(func, ins, outs=outs, target='cuda')