def test_buffer_index_merge_mult_mod(): m = tvm.var('m') n = tvm.var('n') s = tvm.var('s') k0 = tvm.var('k0') k1 = tvm.var('k1') A = tvm.decl_buffer((m, n), tvm.float32) A_stride = tvm.decl_buffer((m, n), tvm.float32, strides=(s, 1)) def assert_simplified_equal(index_simplified, index_direct): assert tvm.ir_pass.Equal(index_simplified, index_direct),\ "index_simplified=%s, index_direct=%s" %(index_simplified, index_direct) # Test Case1 index_simplified = A_stride.vload(((k0 % k1) / s, (k0 % k1) % s + (k0 / k1) * k1)) index_direct = A_stride.vload((0, k0)) assert_simplified_equal(index_simplified, index_direct) # Test Case2 index_simplified = A.vload(((k0 % (k1 / s)) / n, (k0 % (k1 / s)) % n + (k0 % k1))) index_direct = A.vload((0, k0 % k1 + k0 % (k1 / s))) assert_simplified_equal(index_simplified, index_direct) # Test Case3 index_simplified = A.vload((((k0 / (k1 / s)) * (k1 / s)) / n + (k0 % (k1 / s)) / n, ((k0 / (k1 / s)) * (k1 / s)) % n + (k0 % (k1 / s)) % n)) index_direct = A.vload((0, k0)) assert_simplified_equal(index_simplified, index_direct) # Test Case4 (not able to simplify) index_simplified = A.vload(((k0 % (k1 / s)) / n, (k0 % (k1 / n)) % n + (k0 % k1))) index_direct = A.vload((0, ((k0 % (k1 / s)) / n) * n + ((k0 % (k1 / n)) % n + (k0 % k1)))) assert_simplified_equal(index_simplified, index_direct)
def test_lrn(): n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w") x = relay.var("x", shape=(n, c , h, w)) y = relay.nn.lrn(x, size=10, axis=2, bias=0.5, alpha=.00001, beta=0.75) "alpha=" in y.astext() yy = relay.ir_pass.infer_type(y) assert yy.checked_type == relay.TensorType((n, c , h, w)) shape = (1, 5, 10, 10) dtype = "float32" x = relay.var("x", relay.TensorType(shape, dtype)) size=5 axis=1 bias=0.5 alpha=.00001 beta=0.75 z = relay.nn.lrn(x, size=size, axis=axis, bias=bias, alpha=alpha, beta=beta) yy = relay.ir_pass.infer_type(z) assert yy.checked_type == relay.TensorType(shape, dtype) func = relay.Function([x], z) x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype) ref_res = topi.testing.lrn_python(x_data, size, axis, bias, alpha, beta) for target, ctx in ctx_list(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) op_res2 = intrp2.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
def test_l2_normalize(): n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w") x = relay.var("x", shape=(n, c , h, w)) y = relay.nn.l2_normalize(x, eps=0.001, axis=[1]) "axis=" in y.astext() yy = relay.ir_pass.infer_type(y) assert yy.checked_type == relay.TensorType((n, c , h, w)) shape = (1, 5, 10, 10) dtype = "float32" x = relay.var("x", relay.TensorType(shape, dtype)) eps=0.001 axis=1 z = relay.nn.l2_normalize(x, eps=0.001, axis=[axis]) yy = relay.ir_pass.infer_type(z) assert yy.checked_type == relay.TensorType(shape, dtype) func = relay.Function([x], z) x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype) ref_res = topi.testing.l2_normalize_python(x_data, eps, axis) for target, ctx in ctx_list(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) op_res2 = intrp2.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
def test_conv2d_transpose_infer_type(): # symbolic in batch dimension n, c, h, w = tvm.var("n"), 10, 10, 12 x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) w = relay.var("w", relay.IncompleteType()) y = relay.nn.conv2d_transpose(x, w, kernel_size=(3, 3), padding=(1, 1), channels=15) assert "channels=15" in y.astext() yy = relay.ir_pass.infer_type(y) assert yy.checked_type == relay.TensorType( (n, 15, 10, 12), "float32") assert yy.args[1].checked_type == relay.TensorType( (10, 15, 3, 3), "float32") # infer by shape of w, mixed precision n, c, h, w = tvm.var("n"), 10, 10, 12 x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) w = relay.var("w", relay.TensorType((12, 11, 5, 5), "float32")) y = relay.nn.conv2d_transpose(x, w, output_padding=(1, 1), channels=11, data_layout="NHWC") yy = relay.ir_pass.infer_type(y) assert yy.checked_type == relay.TensorType( (n, 15, 15, 11), "float32")
def test_flatten_infer_type(): d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4") x = relay.var("x", relay.TensorType((d1, d2, d3, d4), "float32")) y = relay.nn.batch_flatten(x) yy = relay.ir_pass.infer_type(y) assert yy.checked_type == relay.TensorType((d1, ((d2*d3)*d4)), "float32") x = relay.var("x", relay.TensorType((3, 2, 4, 3), "float32")) y = relay.nn.batch_flatten(x) yy = relay.ir_pass.infer_type(y) assert yy.checked_type == relay.TensorType((3, 24), "float32") x = relay.var("x", relay.TensorType((d1, 2, d3, 3), "float32")) y = relay.nn.batch_flatten(x) yy = relay.ir_pass.infer_type(y) assert yy.checked_type == relay.TensorType((d1, ((2*d3)*3)), "float32") shape = (1, 5, 10, 10) o_shape = (1, 500) dtype = "float32" x = relay.var("x", relay.TensorType(shape, dtype)) z = relay.nn.batch_flatten(x) yy = relay.ir_pass.infer_type(z) assert yy.checked_type == relay.TensorType(o_shape, dtype) func = relay.Function([x], z) x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype) ref_res = x_data.flatten().reshape(o_shape) for target, ctx in ctx_list(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) op_res2 = intrp2.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
def test_reduce_functions(): def _with_keepdims(func): def _wrapper(data, axis=None, keepdims=False): if not keepdims: return func(data, axis=axis) else: if axis is not None: axis = axis[0] out_shape = list(data.shape) out_shape[axis] = 1 else: out_shape = [1 for _ in range(len(data.shape))] return func(data, axis=axis).reshape(out_shape) return _wrapper d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4") for func in [[relay.sum, np.sum], [relay.max, np.max], [relay.min, np.min], [relay.mean, np.mean], [relay.prod, np.prod], [relay.argmin, _with_keepdims(np.argmin)], [relay.argmax, _with_keepdims(np.argmax)]]: verify_reduce(func, (d1, d2, d3, d4), (2,), True, False, (d1, d2, 1, d4)) verify_reduce(func, (d1, d2, d3), (1,), True, False, (d1, 1, d3)) verify_reduce(func, (d1, d2, d3), None, True, False, (1, 1, 1)) verify_reduce(func, (d1, d2, d3), (0, 1), True, False, (1, 1, d3)) verify_reduce(func, (2, 3, 4), (1,), True, False, (2, 1, 4)) verify_reduce(func, (2, 3, 4), (0, 1, 2), False, False, ()) verify_reduce(func, (4, 4, 3), None, False, True, ()) verify_reduce(func, (4, 4, 3), (0, 2), False, False, (4,)) verify_reduce(func, (128, 24, 128), (0, 1), False, False, (128,)) verify_reduce(func, (128, 24, 128), (0, 2), False, False, (24,)) verify_reduce(func, (128, 24, 128), (0, 1), True, False, (1, 1, 128)) verify_reduce(func, (128, 24, 128), (0, 2), True, False, (1, 24, 1))
def test_strided_slice(): def verify(dshape, begin, end, strides, output, test_ref=True): x = relay.var("x", relay.TensorType(dshape, "float32")) z = relay.strided_slice(x, begin=begin, end=end, strides=strides) func = relay.Function([x], z) func = relay.ir_pass.infer_type(func) text = func.astext() assert "begin=" in text assert "end=" in text if output: assert func.body.checked_type == relay.ty.TensorType(output, "float32") if not test_ref: return x_data = np.random.uniform(size=dshape).astype("float32") ref_res = topi.testing.strided_slice_python( x_data, begin, end, strides) for target, ctx in ctx_list(): intrp = relay.create_executor("graph", ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4") verify((d1, d2, 3), [None, None, 1], [None, None, 2], None, (d1, d2, 1), False) verify((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2], (3, 1, 2)) verify((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1], (1, 3, 3)) verify((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], (1, 4, 3)) verify((3, 4, 3), [1, 0, 0], [2, 2, 3], [1, 1, 2], (1, 2, 2)) verify((3, 4, 3), [1, -1, 0], [2, -3, 3], [1, -1, 1], (1, 2, 3)) verify((3, 4, 3), [1, 1, 0], [4, 4, 3], None, (2, 3, 3)) verify((3, 4, 3), [1, 1, 0], [4, 1000, 3], None, (2, 3, 3)) verify((3, 4, 3), [1, 1, 0], [4, 4], None, (2, 3, 3)) verify((3, 4, 3), [1, 1], [4, 4, 3], None, (2, 3, 3))
def test_min_max_bound(): analyzer = tvm.arith.Analyzer() x, y = tvm.var("x"), tvm.var("y") analyzer.update(x, tvm.arith.ConstIntBound(-9, 11)) analyzer.update(y, tvm.arith.ConstIntBound(4, 10)) bd = analyzer.const_int_bound(tvm.min(x, y)) assert bd.min_value == -9 assert bd.max_value == 10 analyzer.update(x, tvm.arith.ConstIntBound(bd.NEG_INF, bd.POS_INF), override=True) analyzer.update(y, tvm.arith.ConstIntBound(4, 10), override=True) bd = analyzer.const_int_bound(tvm.min(x, y)) assert bd.min_value == bd.NEG_INF assert bd.max_value == 10 bd = analyzer.const_int_bound(tvm.max(x, y)) assert bd.min_value == 4 assert bd.max_value == bd.POS_INF analyzer.update(x, tvm.arith.ConstIntBound(1, bd.POS_INF), override=True) analyzer.update(y, tvm.arith.ConstIntBound(4, 10), override=True) bd = analyzer.const_int_bound(tvm.max(x, y)) assert bd.min_value == 4 assert bd.max_value == bd.POS_INF
def test_add_sub_bound(): analyzer = tvm.arith.Analyzer() x, y = tvm.var("x", "int64"), tvm.var("y", "int64") bd = analyzer.const_int_bound(x + y) assert bd.min_value == bd.NEG_INF assert bd.max_value == bd.POS_INF analyzer.update(x, tvm.arith.ConstIntBound(0, 4)) analyzer.update(y, tvm.arith.ConstIntBound(1, 10)) bd = analyzer.const_int_bound(x + y) assert bd.min_value == 1 assert bd.max_value == 14 bd = analyzer.const_int_bound(x - y) assert bd.min_value == -10 assert bd.max_value == 3 analyzer.update(x, tvm.arith.ConstIntBound(0, bd.POS_INF), override=True) bd = analyzer.const_int_bound(x - y) assert bd.min_value == -10 assert bd.max_value == bd.POS_INF bd = analyzer.const_int_bound(1 - x) assert bd.min_value == bd.NEG_INF assert bd.max_value == 1
def test_vectorize_if_then_else(): n = tvm.var('n') x = tvm.var('x') ib = tvm.ir_builder.create() A = ib.pointer("float32", name="A") with ib.for_range(0, 4, for_type="vectorize") as i: A[i] = tvm.call_intrin("float32", "tvm_if_then_else", i > 0, A[i] + 1, A[i]) stmt = ib.get() stmt = tvm.ir_pass.VectorizeLoop(stmt) assert isinstance(stmt, tvm.stmt.For) ib = tvm.ir_builder.create() A = ib.pointer("float32", name="A") with ib.for_range(0, n) as k: with ib.for_range(0, 4, for_type="vectorize") as i: A[k * 4 + i] = tvm.call_intrin("float32", "tvm_if_then_else", k > 0, A[k * 4 + i], 0) stmt = ib.get() assert isinstance(stmt.body, tvm.stmt.For) stmt = tvm.ir_pass.VectorizeLoop(stmt) assert not isinstance(stmt.body, tvm.stmt.For) assert isinstance(stmt.body.value.args[2], tvm.expr.Broadcast)
def test_buffer_vload(): m = tvm.var('m') n = tvm.var('n') Ab = tvm.decl_buffer((m, n), tvm.float32, elem_offset=100) load = Ab.vload([2, 3]) offset = tvm.ir_pass.Simplify(load.index) assert tvm.ir_pass.Equal(offset, n * 2 + 103)
def test_scan(): m = tvm.var("m") n = tvm.var("n") x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x") s_state = tvm.placeholder((m, n)) s_init = tvm.compute((1, n), lambda _, i: x[0, i], name="s_init") x_trans = tvm.compute((m, n), lambda i, j: x[i, j] + 1, name="x_trans") s_up1 = tvm.compute((m, n), lambda t, i: s_state[t - 1, i] + 1, name="up1") s_update = tvm.compute((m, n), lambda t, i: s_up1[t, i] + x_trans[t, i], name="update") s_scan = tvm.scan(s_init, s_update, s_state) def test_getbody(): body = tvm.schedule.ScanGetBody(s_scan.op) assert set(body) == set([s_scan.op, s_update.op, s_up1.op]) def test_attach_path(): s = tvm.create_schedule(s_scan.op) s[x_trans].compute_at(s[s_update], s_update.op.axis[0]) apath = tvm.schedule.CreateAttachPath(s) assert(tuple(apath[s_update.op]) == tuple([s_scan.op.scan_axis])) assert(tuple(apath[x_trans.op]) == tuple([s_update.op.axis[0], s_scan.op.scan_axis])) def test_fix_pt(): body = tvm.schedule.ScanGetBody(s_scan.op) fxpt = tvm.schedule.ScanFixPointAnalysis(s_scan.op, body) assert(fxpt[s_scan.spatial_axis_[0]].value != 0)
def test_expand_dims_infer_type(): n, t, d = tvm.var("n"), tvm.var("t"), 100 x = relay.var("x", shape=(n, t, d)) y = relay.expand_dims(x, axis=2) assert "axis=2" in y.astext() checked = relay.ir_pass.infer_type(y) assert checked.checked_type == relay.TensorType((n, t, 1, 100))
def test_nms(): num_anchors = 60 overlap_threshold = 0.5 force_suppress = True nms_topk = 10 n = tvm.var("n") x0 = relay.var("x0", relay.ty.TensorType((n, num_anchors, 6), "float32")) x1 = relay.var("x1", relay.ty.TensorType((n,), "int")) z = relay.vision.nms(x0, x1, overlap_threshold, force_suppress, nms_topk) assert "overlap_threshold" in z.astext() zz = relay.ir_pass.infer_type(z) assert zz.checked_type == relay.ty.TensorType( (n, num_anchors, 6), "float32") n = tvm.var("n") x0 = relay.var("x0", relay.ty.TensorType((n, num_anchors, 6), "float32")) x1 = relay.var("x1", relay.ty.TensorType((n,), "int")) z = relay.vision.nms(x0, x1) zz = relay.ir_pass.infer_type(z) assert zz.checked_type == relay.ty.TensorType( (n, num_anchors, 6), "float32")
def test_equal_compute(): x = tvm.var('x') y = tvm.var('y') n = 128 A = tvm.placeholder((n, n), name='A') B = tvm.placeholder((n, n), name='B') ii = tvm.var('i') jj = tvm.var('j') def func1(): k = tvm.reduce_axis((0, n), name='k') return tvm.sum(A[ii, k] * B[jj, k], axis=k) Ab = tvm.decl_buffer((n,), name='A') n = tvm.var("n") def func2(): ib = tvm.ir_builder.create() A = ib.buffer_ptr(Ab) with ib.for_range(0, n, name="i") as i: A[i] = A[i] + 1 with ib.for_range(0, 10, name="j") as j: A[j] = A[j] + 2 return ib.get() assert tvm.ir_pass.Equal(func1(), func1()) assert tvm.ir_pass.Equal(func2(), func2())
def test_deduce(): a = tvm.var('a') b = tvm.var('b') c = tvm.var('c') d = tvm.var('d') b_s = tvm.arith.intset_interval(2, 3) c_s = tvm.arith.intset_interval(10, 15) d_s = tvm.arith.intset_interval(-3, -1) e0 = (-b)*a+c-d res0 = tvm.arith.DeduceBound(a, e0>=0, {b: b_s, c: c_s, d: d_s}, {}) ans0 = ((d - c) /(b*-1)) assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0) e0 = d*a+c-d res0 = tvm.arith.DeduceBound(a, e0>=0, {b: b_s, c: c_s, d: d_s}, {}) ans0 = ((0-c)/d + 1) assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0) e1 = (a*4+b < c) res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {}) ans1 = (((c - b) + -1)/4) assert str(tvm.ir_pass.Simplify(res1.max())) == str(ans1) e2 = (tvm.max(5, a * 4) < 0) res2 = tvm.arith.DeduceBound(a, e2, {b: b_s, c: c_s, d: d_s}, {}) assert str(res2.max()) == "neg_inf" assert str(res2.min()) == "pos_inf" e3 = (-b)+a*c-d res3 = tvm.arith.DeduceBound(a, e3>=0, {b: b_s, c: c_s, d: d_s}, {b: b_s, d: d_s}) ans3 = 2/c+1 assert str(tvm.ir_pass.Simplify(res3.min())) == str(ans3)
def test_dynamic_tensor(): dtype = 'float32' stype = 'csr' target = 'llvm' ctx = tvm.context(target, 0) nr, nc, n = tvm.var('nr'), tvm.var('nc'), tvm.var('n') A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name='A', dtype=dtype) assert(A.stype == 'csr') C = tvm.compute(A.data.shape, lambda i: A.data[i] * 2., tag='cs_scatter') s = tvm.create_schedule(C.op) _nr, _nc = 3, 5 a = np.maximum(np.random.uniform(size=(_nr, _nc)).astype(dtype)-.6, 0.) a = tvmsp.array(a, ctx) assert a.data.dtype == a.dtype Ab = namedtuple('CSRBuffer', ['data', 'indices', 'indptr']) Ab.data = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_data') Ab.indices = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_indices') binds = {A.data: Ab.data, A.indices: Ab.indices} f = tvm.build(s, [nr, A.data, C], target, binds=binds) c = tvmsp.array(np.zeros((_nr, _nc), dtype), ctx) c.data = tvm.nd.empty(a.data.shape, dtype) c.indices = a.indices c.indptr = a.indptr f(a.data.shape[0], a.data, c.data) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2., rtol=1e-5)
def test_equality(): a = tvm.var('a') b = tvm.var('b') c = (a == b) assert not c d = (c != c) assert not d
def test_schedule_create(): m = tvm.var('m') n = tvm.var('n') l = tvm.var('l') A = tvm.placeholder((m, l), name='A') B = tvm.placeholder((n, l), name='B') AA = tvm.compute((m, l), lambda i, j: A[i, j]) T = tvm.compute((m, n, l), lambda i, j, k: AA(i, k) * B(j, k)) s = tvm.create_schedule(T.op) s[AA].set_scope("shared") xo, xi = s[T].split(T.op.axis[0], factor=10) xi1, xi2 = s[T].split(xi, factor=2) s[AA].compute_at(s[T], xi1) xo, xi = s[AA].split(AA.op.axis[0], factor=10) s[T].reorder(xi2, xi1) assert T.op.axis[1] in s[T].leaf_iter_vars # save load json json_str = tvm.save_json(s) s_loaded = tvm.load_json(json_str) assert isinstance(s_loaded, tvm.schedule.Schedule) assert(str(s_loaded.outputs[0].body) == str(s.outputs[0].body)) # pickle unpickle dump = pkl.dumps(s) s_loaded = pkl.loads(dump) assert isinstance(s_loaded, tvm.schedule.Schedule) assert(str(s_loaded.outputs[0].body) == str(s.outputs[0].body))
def test_infer_type_leaky_relu(): n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w") x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) y = relay.nn.leaky_relu(x, alpha=0.1) "alpha=0.1" in y.astext() yy = relay.ir_pass.infer_type(y) assert yy.checked_type == relay.TensorType((n, c, h, w), "float32") shape = (1, 5, 10, 10) dtype = "float32" x = relay.var("x", relay.TensorType(shape, dtype)) z = relay.nn.leaky_relu(x, alpha=0.1) assert "alpha=0.1" in z.astext() yy = relay.ir_pass.infer_type(z) assert yy.checked_type == relay.TensorType(shape, dtype) func = relay.Function([x], z) x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype) ref_res = np.where(x_data > 0, x_data, x_data * 0.1) for target, ctx in ctx_list(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) op_res2 = intrp2.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
def test_basic(): a = tvm.var() b = tvm.var() m = tvm.arith.EvalModular(a * 4 + b * 6 + 7) assert m.coeff == 2 assert m.base == 1 m = tvm.arith.EvalModular((a * 4 + 1) * (b * 8 + 3)) assert m.coeff == 4 assert m.base == 3 m = tvm.arith.EvalModular((a * 4 + 1) / (b * 8 + 3)) assert m.coeff == 1 assert m.base == 0 m = tvm.arith.EvalModular((a * 4 + 1) * (b * 8 / 4)) assert m.coeff == 2 assert m.base == 0 m = tvm.arith.EvalModular((a * 12 + 1) - (b * 3 * 7 + 2)) assert m.coeff == 3 assert m.base == 2 m = tvm.arith.EvalModular(a * 12 + tvm.min(b * 3 * 7, 2)) assert m.coeff == 1 assert m.base == 0
def test_scan_group(): m = tvm.var("m") n = tvm.var("n") x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x") s_state = tvm.placeholder((m, n)) s_init = tvm.compute((1, n), lambda _, i: x[0, i]) s_update1 = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + x[t, i]) s_update2 = tvm.compute((m, n), lambda t, i: s_update1[t, i] + 1) s_update3 = tvm.compute((m, n), lambda t, i: s_update2[t, i] + 1) res = tvm.scan(s_init, s_update3, s_state, inputs=x) s = tvm.create_schedule(res.op) assert s[s_update1].group is not None assert s[s_update2].group == s[s_update1].group # Assign within group, is valid s[s_update1].compute_at(s[s_update2], s_update2.op.axis[1]) # create a new group, for [s_update2 and s_update1] g2 = s.create_group(outputs=s_update2, inputs=[s_state, x]) assert g2.group is not None assert g2.group == s[s_update3].group assert s[s_update2].group == g2 assert s[s_update1].group == g2 g2.compute_at(s[s_update3], s_update3.op.axis[1]) assert g2.attach_stage == s[s_update3] try: # compute outside group error. s[s_update2].compute_at(s[s_init], s_init.op.axis[0]) assert False except tvm.TVMError: pass
def test_storage_share(): m = tvm.var('m') l = tvm.var('l') A = tvm.placeholder((m, l), name='A') num_stage = 5 B = A for t in range(num_stage): B = tvm.compute((m, l), lambda i, j: B[i, j] + (t+1), name='A%d' % t) s = tvm.create_schedule(B.op) bounds = tvm.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) stmt = tvm.schedule.ScheduleOps(s, bounds) Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') Bb = tvm.decl_buffer(B.shape, B.dtype, name='B') stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64) stmt = tvm.ir_pass.CanonicalSimplify(stmt) stmt = tvm.ir_pass.Simplify(stmt) stmt = tvm.ir_pass.StorageRewrite(stmt) # verify only have one allocations. # verify inplace folding works num_alloc = [0] def verify(n): if isinstance(n, tvm.stmt.Allocate): num_alloc[0] += 1 tvm.ir_pass.PostOrderVisit(stmt, verify) assert num_alloc[0] == 1
def test_parallel_alloc(): ib = tvm.ir_builder.create() n = tvm.var("n") with ib.for_range(0, n, name="i", for_type="parallel") as i: with ib.for_range(0, 10, name="j") as j: A = ib.allocate("float32", n, name="A", scope="global") A[j] = A[j] + 2 body = ib.get() body = tvm.ir_pass.StorageRewrite(body) assert (isinstance(body.body.body, tvm.stmt.Allocate)) ib = tvm.ir_builder.create() n = tvm.var("n") with ib.for_range(0, n, name="t") as i: ib.scope_attr( tvm.const(1, "int32") , "pragma_scope", tvm.make.StringImm("parallel_launch_point")) with ib.for_range(0, n, name="i", for_type="parallel") as i: with ib.for_range(0, 10, name="j") as j: A = ib.allocate("float32", n, name="A", scope="global") A[j] = A[j] + 2 body = ib.get() body = tvm.ir_pass.StorageRewrite(body) assert(isinstance(body.body.body.body.body, tvm.stmt.Allocate))
def test_storage_sync(): m = tvm.var('m') l = tvm.var('l') A = tvm.placeholder((m, l), name='A') A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1') A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2') s = tvm.create_schedule(A2.op) xo, xi = s[A2].split(A2.op.axis[0], factor=8) s[A2].bind(xo, tvm.thread_axis("blockIdx.x")) s[A1].compute_at(s[A2], xo) s[A1].set_scope("shared") bounds = tvm.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) stmt = tvm.schedule.ScheduleOps(s, bounds) Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') A2b = tvm.decl_buffer(A2.shape, A2.dtype, name='A2') stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, A2: A2b}, 64) f = tvm.ir_pass.MakeAPI(stmt, "test", [Ab, A2b], 0, True) flist = tvm.ir_pass.SplitHostDevice(f) f = flist[1] f = tvm.ir_pass.ThreadSync(f, "shared") body_list = tvm.make.stmt_list(f.body.body.body.body) assert(body_list[1].value.name == "tvm_storage_sync")
def test_multivariate(): v = [tvm.var("v%d" % i) for i in range(4)] b = tvm.var("b") m = tvm.arith.DetectLinearEquation(v[0] * (b + 4) + v[0] + v[1] * 8, v) assert(tvm.ir_pass.Equal(tvm.ir_pass.Simplify(m[0]), b + 5)) assert(m[1].value == 8) m = tvm.arith.DetectLinearEquation(v[0] * (b + 4) + v[0] + v[1] * 8 * v[2], v) assert(len(m) == 0) m = tvm.arith.DetectLinearEquation(v[0] * (b + 4) + v[0] + v[1] * 8 * v[1] + v[3], v) assert(len(m) == 0) m = tvm.arith.DetectLinearEquation(((v[0] * b + v[1]) * 8 + v[2] + 1) * 2, v) assert(m[1].value == 16) assert(m[2].value == 2) assert(m[len(m)-1].value == 2) m = tvm.arith.DetectLinearEquation((v[0] - v[1]), [v[2]]) assert(m[0].value == 0) assert(tvm.ir_pass.Simplify(m[1] - (v[0] - v[1])).value == 0) m = tvm.arith.DetectLinearEquation((v[0] - v[1]), []) assert(len(m) == 1) assert(tvm.ir_pass.Simplify(m[0] - (v[0] - v[1])).value == 0)
def _post_order(op): if isinstance(op, tvm.stmt.Allocate): buffer_var = op.buffer_var if not buffer_var in rw_info: return None new_var = rw_info[buffer_var] let_stmt = tvm.make.LetStmt( new_var, tvm.call_extern( "handle", "VTABufferCPUPtr", env.dev.command_handle, buffer_var), op.body) alloc = tvm.make.Allocate( buffer_var, op.dtype, op.extents, op.condition, let_stmt) del rw_info[buffer_var] return alloc if isinstance(op, tvm.expr.Load): buffer_var = op.buffer_var if not buffer_var in rw_info: rw_info[buffer_var] = tvm.var( buffer_var.name + "_ptr", "handle") new_var = rw_info[buffer_var] return tvm.make.Load(op.dtype, new_var, op.index) if isinstance(op, tvm.stmt.Store): buffer_var = op.buffer_var if not buffer_var in rw_info: rw_info[buffer_var] = tvm.var( buffer_var.name + "_ptr", "handle") new_var = rw_info[buffer_var] return tvm.make.Store(new_var, op.value, op.index) raise RuntimeError("not reached")
def test_tensor_comm_reducer(): m = tvm.var('m') n = tvm.var('n') A = tvm.placeholder((m, n), name='A') k = tvm.reduce_axis((0, n), "k") mysum = tvm.comm_reducer(lambda x, y: x+y, lambda t: tvm.const(0, dtype=t)) C = tvm.compute((m,), lambda i: mysum(A[i, k], axis=k))
def test_simplify_if_then_else(): ck = CanonicalChecker() x = tvm.var("x") y = tvm.var("y") # simplification that takes condition into account. res = tvm.if_then_else((x * 4 + y) >= 466036, tvm.if_then_else(24512 <= ((((x*4) + y) - 466036) % 24528), (((((x*4) + y) - 466036) % 24528) -24512) % 16, x), y) expected = tvm.if_then_else( tvm.expr.LE(466036, (x * 4 + y)), tvm.if_then_else(tvm.expr.LE(24512, ((((x*4) + y) - 4) % 24528)), (((x*4) + y) - 4) % 16, x), y) ck.verify(res, expected) # can only simplify if condition res = tvm.expr.Select(tvm.all(x >= -1, y >= 0), (x + y + 100) % 3, (x + 100) % 3) expected = tvm.expr.Select(tvm.all(x >= -1, y >= 0), (x + y + 1) % 3, (x + 100) % 3) ck.verify(res, ck.analyzer.canonical_simplify(expected)) res = tvm.expr.Select(x >= 10, tvm.if_then_else(x / 3 > 2, x, 0), 0) expected = tvm.expr.Select(x >= 10, x, 0) ck.verify(res, ck.analyzer.canonical_simplify(expected)) res = tvm.expr.Select(x >= 10, tvm.if_then_else(x / 3 < 2, x, 0), 0) ck.verify(res, 0)
def test_mix_index(): a = tvm.var("a") b = tvm.var("b") analyzer = tvm.arith.Analyzer() m = analyzer.modular_set(a * 4 + b * 6 + 7) assert m.coeff == 2 assert m.base == 1 m = analyzer.modular_set((a * 4 + 1) * (b * 8 + 3)) assert m.coeff == 4 assert m.base == 3 m = analyzer.modular_set((a * 4 + 1) / (b * 8 + 3)) assert m.coeff == 1 assert m.base == 0 m = analyzer.modular_set((a * 4 + 1) * (b * 8 / 4)) assert m.coeff == 2 assert m.base == 0 m = analyzer.modular_set((a * 12 + 1) - (b * 3 * 7 + 2)) assert m.coeff == 3 assert m.base == 2 m = analyzer.modular_set(a * 12 + tvm.min(b * 3 * 7, 2)) assert m.coeff == 1 assert m.base == 0
def test_let(): x = tvm.var('x') y = tvm.var('y') stmt = tvm.make.LetStmt(x, 10, tvm.make.Evaluate(x + 1))
def test_ir2(): x = tvm.var("n") a = tvm.var("array", tvm.handle) st = tvm.make.Store(a, x + 1, 1) assert isinstance(st, tvm.stmt.Store) assert (st.buffer_var == a)
def _intrin_popcount(m, k_i, w_b, x_b): dtype = 'uint8' w = tvm.placeholder((w_b, m, k_i), dtype=dtype, name='w') x = tvm.placeholder(( x_b, k_i, ), dtype=dtype, name='x') k = tvm.reduce_axis((0, k_i), name='k') bw = tvm.reduce_axis((0, w_b), name='bw') bx = tvm.reduce_axis((0, x_b), name='bx') z = tvm.compute((m, ), lambda i: tvm.sum(tvm.popcount(w[bw, i, k].astype( 'uint16') & x[bx, k].astype('uint16')) << (bw + bx).astype('uint16'), axis=[bw, bx, k]), name='z') Wb = tvm.decl_buffer(w.shape, w.dtype, name="W", offset_factor=k_i, strides=[tvm.var('ldw'), tvm.var('ldw'), 1]) Xb = tvm.decl_buffer(x.shape, x.dtype, name="X", offset_factor=k_i, strides=[tvm.var('ldw'), 1]) def _intrin_func(ins, outs): ww, xx = ins zz = outs[0] vpadd = "llvm.arm.neon.vpadd.v8u8" vpadalu = "llvm.arm.neon.vpadalu.v16u8.v8u16" args_1 = tvm.const(1, 'uint32') args_2 = tvm.const(2, 'uint32') def _instr(index): irb = tvm.ir_builder.create() if index == 1: irb.emit(zz.vstore(0, tvm.const(0, 'uint16x8'))) return irb.get() cnts8 = [None] * 8 cnts4 = [None] * 4 cnts2 = [None] * 2 for bw in range(w_b): for bx in range(x_b): if k_i == 16: for i in range(m): ands = ww.vload([bw, i, 0], 'uint8x16') & xx.vload( [bx, 0], 'uint8x16') cnts = tvm.popcount(ands) upper_half = tvm.call_pure_intrin( 'uint8x8', 'vectorhigh', cnts) lower_half = tvm.call_pure_intrin( 'uint8x8', 'vectorlow', cnts) cnts8[i] = upper_half + lower_half for i in range(m // 2): cnts4[i] = tvm.call_llvm_intrin( 'uint8x8', vpadd, args_1, cnts8[i * 2], cnts8[i * 2 + 1]) for i in range(m // 4): cnts2[i] = tvm.call_llvm_intrin( 'uint8x8', vpadd, args_1, cnts4[i * 2], cnts4[i * 2 + 1]) cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) shifted_cnts = cnts << tvm.const(bw + bx, dtype) out = tvm.call_llvm_intrin('uint16x8', vpadalu, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) else: # ki == 8 for i in range(m): ands = ww.vload([bw, i, 0], 'uint8x8') & xx.vload( [bx, 0], 'uint8x8') cnts8[i] = tvm.popcount(ands) for i in range(m // 2): cnts4[i] = tvm.call_llvm_intrin( 'uint8x8', vpadd, args_1, cnts8[i * 2], cnts8[i * 2 + 1]) for i in range(m // 4): cnts2[i] = tvm.call_llvm_intrin( 'uint8x8', vpadd, args_1, cnts4[i * 2], cnts4[i * 2 + 1]) cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) shifted_cnts = cnts << tvm.const(bw + bx, dtype) out = tvm.call_llvm_intrin('uint16x8', vpadalu, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) irb.emit(zz.vstore(0, out)) return irb.get() # body, reset, update return _instr(0), _instr(1), _instr(2) with tvm.build_config(offset_factor=1, partition_const_loop=True): return tvm.decl_tensor_intrin(z.op, _intrin_func, binds={w: Wb, x: Xb})
import pdb import tvm import numpy as np import timeit import os from tvm.contrib import util input(os.getpid()) tgt_host="cpu" #device = "cpu" M = tvm.var("n") K = tvm.var("n") N = tvm.var("n") A = tvm.placeholder((M, K), name='A',dtype='float32') B = tvm.placeholder((K, N), name='B',dtype='float32') k = tvm.reduce_axis((0, K), 'k') C = tvm.compute((M, N), lambda i, j: 0, name='C') s = tvm.create_schedule(C.op) func = tvm.build(s, [A, B, C], "c",name='DPUGemm') print(tvm.lower(s, [A, B, C], simple_mode=True)) #print(func.get_source()) batch = 2 in_channels = 3 out_channels = 2 in_height = 6 in_width = 6
def test_select(): ck = IntSetChecker() x, y = tvm.var("x"), tvm.var("y") ck.verify(tvm.tir.Select(x > 0, x - 1, x + 1), {x: tvm.arith.IntervalSet(0, 10)}, (-1, 11))
def test_dir(): x = tvm.var('x') dir(x)
def rnn_matexp(): n_num_step = 128 n_num_hidden = 1152 n_batch_size = 4 detect_global_barrier = DETECT_GLOBAL_BARRIER num_step = tvm.var("num_step") num_hidden = tvm.convert(n_num_hidden) batch_size = tvm.convert(n_batch_size) num_thread_y = 8 num_thread_x = 16 * 3 num_sm = 24 Whh = tvm.placeholder((num_hidden, num_hidden), name="Whh") s_init = tvm.compute((1, batch_size, num_hidden), lambda _, i, j: 1.0, name="init") s_state = tvm.placeholder((num_step, batch_size, num_hidden)) kh = tvm.reduce_axis((0, num_hidden), name="kh") s_update = tvm.compute( (num_step, batch_size, num_hidden), lambda t, i, j: tvm.sum(s_state[t - 1, i, kh] * Whh[kh, j], axis=kh), name="update") s_scan = tvm.scan(s_init, s_update, s_state) # schedule s = tvm.create_schedule(s_scan.op) CL = s_update SS = s.cache_read(s_state, "shared", [CL]) SL = s.cache_read(SS, "local", [CL]) WhhL = s.cache_read(Whh, "local", [CL]) ko, ki = s[CL].split(s[CL].op.reduce_axis[0], nparts=num_thread_y) CLF = s.rfactor(CL, ko) block_x = tvm.thread_axis((0, num_sm), "blockIdx.x") thread_x = tvm.thread_axis((0, num_thread_x), "threadIdx.x") thread_y = tvm.thread_axis((0, num_thread_y), "threadIdx.y") if PERSIST_KERNEL: s[s_scan.op].env_threads([block_x, thread_y, thread_x]) bx, xi = s[s_init].split(s_init.op.axis[2], nparts=num_sm) tx, xi = s[s_init].split(xi, nparts=num_thread_x) s[s_init].bind(bx, block_x) s[s_init].bind(tx, thread_x) bx, xi = s[s_update].split(s[CL].op.axis[2], nparts=num_sm) tx, xi = s[s_update].split(xi, nparts=num_thread_x) s[s_update].bind(bx, block_x) s[s_update].bind(tx, thread_x) s[CL].bind(s[CL].op.reduce_axis[0], thread_y) s[CLF].compute_at(s[CL], s[CL].op.reduce_axis[0]) # Duplicate store predicate. s[CL].set_store_predicate(thread_y.equal(0)) if PERSIST_KERNEL: s[WhhL].compute_at(s[s_scan], thread_x) s[WhhL].unroll(WhhL.op.axis[0]) else: s[WhhL].compute_at(s[CLF], CLF.op.axis[3]) kr, ki = s[CLF].split(CLF.op.reduce_axis[0], nparts=1) ko, ki = s[CLF].split(ki, factor=4) s[SS].compute_at(s[CLF], kr) s[SL].compute_at(s[CLF], ko) xo, xi = s[SS].split(SS.op.axis[2], factor=num_thread_x * num_thread_y * 3) ty, xi = s[SS].split(xi, nparts=num_thread_y) tx, xi = s[SS].split(xi, nparts=num_thread_x) s[SS].bind(ty, thread_y) s[SS].bind(tx, thread_x) def check_device(target): with tvm.build_config(detect_global_barrier=detect_global_barrier, auto_unroll_min_depth=2, auto_unroll_max_step=128, unroll_explicit=False): f = tvm.build(s, [s_scan, Whh], target) ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0) # launch the kernel. res_np = np.zeros( (n_num_step, n_batch_size, n_num_hidden)).astype("float32") Whh_np = np.zeros((n_num_hidden, n_num_hidden)).astype("float32") Whh_np[:] = 2.0 / n_num_hidden Whh_np[:, n_num_hidden // 2:] = 0 res_a = tvm.nd.array(res_np, ctx) Whh_a = tvm.nd.array(Whh_np, ctx) # Skip first pass as it is compilation f(res_a, Whh_a) ctx.sync() # measure time cost of second step. tstart = time.time() f(res_a, Whh_a) ctx.sync() tgap = time.time() - tstart print("Time cost=%g" % tgap) # correctness if not SKIP_CHECK: res_gpu = res_a.asnumpy() res_cmp = np.ones_like(res_np).astype("float64") Whh_np = Whh_np.astype("float64") for t in range(1, n_num_step): res_cmp[t][:] = np.dot(res_cmp[t - 1], Whh_np) for i in range(n_num_step): for j in range(n_num_hidden): if abs(res_cmp[i, 0, j] - res_gpu[i, 0, j]) > 1e-5: print("%d, %d: %g vs %g" % (i, j, res_cmp[i, 0, j], res_gpu[i, 0, j])) np.testing.assert_allclose(res_gpu, res_cmp, rtol=1e-3) check_device("cuda")
def test_multibox_prior(): def get_ref_result(dshape, sizes=(1.0, ), ratios=(1.0, ), steps=(-1.0, -1.0), offsets=(0.5, 0.5), clip=True): in_height = dshape[2] in_width = dshape[3] num_sizes = len(sizes) num_ratios = len(ratios) size_ratio_concat = sizes + ratios steps_h = steps[0] if steps[0] > 0 else 1.0 / in_height steps_w = steps[1] if steps[1] > 0 else 1.0 / in_width offset_h = offsets[0] offset_w = offsets[1] oshape = (1, in_height * in_width * (num_sizes + num_ratios - 1), 4) dtype = "float32" np_out = np.zeros(oshape).astype(dtype) for i in range(in_height): center_h = (i + offset_h) * steps_h for j in range(in_width): center_w = (j + offset_w) * steps_w for k in range(num_sizes + num_ratios - 1): w = size_ratio_concat[k] * in_height / in_width / 2.0 if k < num_sizes else \ size_ratio_concat[0] * in_height / in_width * math.sqrt(size_ratio_concat[k + 1]) / 2.0 h = size_ratio_concat[k] / 2.0 if k < num_sizes else \ size_ratio_concat[0] / math.sqrt(size_ratio_concat[k + 1]) / 2.0 count = i * in_width * (num_sizes + num_ratios - 1) + j * ( num_sizes + num_ratios - 1) + k np_out[0][count][0] = center_w - w np_out[0][count][1] = center_h - h np_out[0][count][2] = center_w + w np_out[0][count][3] = center_h + h if clip: np_out = np.clip(np_out, 0, 1) return np_out def verify_multibox_prior(x, dshape, ref_res, sizes=(1.0, ), ratios=(1.0, ), steps=(-1.0, -1.0), offsets=(0.5, 0.5), clip=True, check_size=False, check_type_only=False): z = relay.vision.multibox_prior(x, sizes, ratios, steps, offsets, clip) zz = run_infer_type(z) if check_size: assert "sizes=" in z.astext() assert zz.checked_type == relay.TensorType( (1, dshape[2] * dshape[3] * (len(sizes) + len(ratios) - 1), 4), "float32") if check_type_only: return data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32") func = relay.Function([x], z) func = run_infer_type(func) for target, ctx in ctx_list(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res2 = intrp2.evaluate(func)(data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) sizes = (0.3, 1.5, 0.7) ratios = (1.3, 2.4) steps = (2.0, 1.5) offsets = (0.2, 0.3) dshape = (1, 3, 56, 56) ref_res = get_ref_result(dshape, sizes, ratios, steps, offsets) x = relay.var("x", relay.TensorType(dshape, "float32")) verify_multibox_prior(x, dshape, ref_res, sizes, ratios, steps, offsets, check_size=True) y = relay.var("y", relay.TensorType((tvm.var("n"), 3, 56, 56), "float32")) verify_multibox_prior(x, dshape, ref_res, sizes, ratios, steps, offsets, check_size=True, check_type_only=True) dshape = (1, 24, 32, 32) ref_res = get_ref_result(dshape, clip=False) x = relay.var("x", relay.TensorType(dshape, "float32")) verify_multibox_prior(x, dshape, ref_res, clip=False) y = relay.var("y", relay.TensorType((tvm.var("n"), 24, 32, 32), "float32")) verify_multibox_prior(x, dshape, ref_res, clip=False, check_type_only=True)
def test_non_max_suppression(): def verify_nms(x0_data, x1_data, dshape, ref_res, ref_indices_res, iou_threshold=0.5, force_suppress=False, top_k=-1, check_type_only=False): x0 = relay.var("x0", relay.ty.TensorType(dshape, "float32")) x1 = relay.var("x1", relay.ty.TensorType((dshape[0], ), "int32")) z = relay.vision.non_max_suppression(x0, x1, max_output_size = -1, \ iou_threshold = iou_threshold, force_suppress = force_suppress, \ top_k = top_k, return_indices=False) z_indices = relay.vision.non_max_suppression(x0, x1, max_output_size = -1, \ iou_threshold = iou_threshold, force_suppress = force_suppress, \ top_k = top_k) assert "iou_threshold" in z.astext() assert "iou_threshold" in z_indices.astext() zz = run_infer_type(z) zz_indices = run_infer_type(z_indices) assert zz.checked_type == relay.ty.TensorType(dshape, "float32") assert zz_indices.checked_type == relay.ty.TensorType( (dshape[0], dshape[1]), "int32") if check_type_only: return func = relay.Function([x0, x1], z) func = run_infer_type(func) func_indices = relay.Function([x0, x1], z_indices) func_indices = run_infer_type(func_indices) for target, ctx in ctx_list(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(x0_data, x1_data) op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) tvm.testing.assert_allclose(op_indices_res1.asnumpy(), ref_indices_res, rtol=1e-5) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res2 = intrp2.evaluate(func)(x0_data, x1_data) op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) tvm.testing.assert_allclose(op_indices_res2.asnumpy(), ref_indices_res, rtol=1e-5) np_data = np.array([[[0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80], [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79], [1, 0.5, 100, 60, 70, 110]]]).astype("float32") np_valid_count = np.array([4]).astype("int32") np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) np_indices_result = np.array([[3, 0, -1, -1, -1]]) num_anchors = 5 dshape = (tvm.var("n"), num_anchors, 6) verify_nms(np_data, np_valid_count, dshape, np_result, np_indices_result, force_suppress=True, top_k=2, check_type_only=True) dshape = (1, num_anchors, 6) verify_nms(np_data, np_valid_count, dshape, np_result, np_indices_result, force_suppress=True, top_k=2, check_type_only=False) np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) np_indices_result = np.array([[3, 0, 1, -1, -1]]) dshape = (tvm.var("n"), num_anchors, 6) verify_nms(np_data, np_valid_count, dshape, np_result, np_indices_result, check_type_only=True) dshape = (1, num_anchors, 6) verify_nms(np_data, np_valid_count, dshape, np_result, np_indices_result, top_k=3)
def lstm(): if not PERSIST_KERNEL: raise ValueError("Non persist LSTM not yet supported") num_thread_y = 8 num_thread_x = 16 * 3 // 2 num_sm = 24 n_num_step = 128 num_step = tvm.var('num_step') num_hidden = 1152 // 2 # cell的个数 batch_size = 1 # 处理对象是图片时,表示同时处理这么多张图片 # Global transition matrix # Input hidden channel can be pre-caculated by a gemm Xi2h = tvm.placeholder((num_step, batch_size, 4, num_hidden), name="Xi2h") # Only handle hidden transition, saves space. Wh2h = tvm.placeholder((4, num_hidden, num_hidden), name="Wh2h") # h: output hidden state, c: cell state. s_state_h = tvm.placeholder((num_step, batch_size, num_hidden)) # h(t-1) s_state_c = tvm.placeholder((num_step, batch_size, num_hidden)) # c(t-1) s_init_c = tvm.compute((1, batch_size, num_hidden), lambda *i: 0.0, name="init_c") # 初始c状态为0 s_init_h = tvm.compute((1, batch_size, num_hidden), lambda *i: 0.0, name="init_h") # 初始h状态为0 # LSTM transition k = tvm.reduce_axis((0, num_hidden), name="ki2h") # H*W,axis = k s_h2h = tvm.compute( (num_step, batch_size, 4, num_hidden), lambda t, i, x, j: tvm.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k], axis=k), name="s_h2h") # Gate rules # gates = w*[h(t-1),x(t)] gates = tvm.compute(Xi2h.shape, lambda *i: Xi2h(*i) + s_h2h(*i), name="gates") # 把4个门一个个的拆开,这样就能降到3维? gshape = (num_step, batch_size, num_hidden) # 输入门:in_gate = i(t) = sigmoid(w0*[h(t-1),x(t)]) w0=wi in_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, i, 0, j]), name="in_gate") # 候选值:in_transform = \tilde{C}(t) = tanh(w1*[h(t-1),x(t)]) w1=wc in_transform = tvm.compute(gshape, lambda t, i, j: tvm.tanh(gates[t, i, 1, j]), name="in_transform") # 忘记门:forget_gate = f(t) = sigmoid(w2*[h(t-1),x(t)]) w2=wf forget_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, i, 2, j]), name="forget_gate") # 输出门:out_gate = o(t)= sigmoid(w3*[h(t-1),x(t)]) w3=wo out_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, i, 3, j]), name="out_gate") # 更新细胞状态:next_c=c(t)=f(t)*c(t-1)+i(t)*\tilde{C}(t) next_c = tvm.compute( gshape, lambda t, i, j: forget_gate[t, i, j] * s_state_c[ t - 1, i, j] + in_gate[t, i, j] * in_transform[t, i, j], name="next_c") # 最终的输出:next_h=h(t)=o(t)*tanh(c(t)) next_h = tvm.compute( gshape, lambda t, i, j: out_gate[t, i, j] * tvm.tanh(next_c[t, i, j]), name="next_h") update_c = tvm.compute(gshape, lambda *i: next_c(*i), name="update_c") # 更新细胞状态 update_h = tvm.compute(gshape, lambda *i: next_h(*i), name="update_h") # 更新输出 # schedule:h和c都是时间序列的形式,tvm中用scan来描述 scan_h, scan_c = tvm.scan([s_init_h, s_init_c], [update_h, update_c], [s_state_h, s_state_c], inputs=[Xi2h], name="lstm_scan") # schedule s = tvm.create_schedule(scan_h.op) # Inline gate computations s[gates].compute_inline() s[in_gate].compute_inline() s[in_transform].compute_inline() s[forget_gate].compute_inline() s[out_gate].compute_inline() block_x = tvm.thread_axis((0, num_sm), "blockIdx.x") thread_x = tvm.thread_axis((0, num_thread_x), "threadIdx.x") thread_y = tvm.thread_axis((0, num_thread_y), "threadIdx.y") s_state_h_S = s.cache_read(s_state_h, "shared", [s_h2h]) s_state_c_S = s.cache_read(s_state_c, "shared", [next_c]) Wh2hL = s.cache_read(Wh2h, "local", [s_h2h]) ko, ki = s[s_h2h].split(s[s_h2h].op.reduce_axis[0], nparts=num_thread_y) s_h2h_rf = s.rfactor(s_h2h, ko) s[s_h2h].bind(s[s_h2h].op.reduce_axis[0], thread_y) s[s_h2h_rf].compute_at(s[s_h2h], s[s_h2h].op.reduce_axis[0]) if PERSIST_KERNEL: s[scan_h.op].env_threads([block_x, thread_y, thread_x]) s[Wh2hL].compute_at(s[scan_h.op], thread_x) else: s[Wh2hL].compute_at(s[s_h2h], s[s_h2h].op.axis[3]) if UNROLL_WLOAD: s[Wh2hL].unroll(Wh2hL.op.axis[0]) s[Wh2hL].unroll(Wh2hL.op.axis[2]) s[s_state_h_S].compute_at(s[s_h2h_rf], s[s_h2h_rf].op.axis[3]) s[s_state_c_S].compute_at(s[scan_h.op], s[scan_h].op.scan_axis) for ss in [s_state_h_S]: xo, xi = s[ss].split(ss.op.axis[2], factor=num_thread_x * num_thread_y) ty, xi = s[ss].split(xi, nparts=num_thread_y) tx, xi = s[ss].split(xi, nparts=num_thread_x) s[ss].bind(ty, thread_y) s[ss].bind(tx, thread_x) for init in [s_init_c, s_init_h]: bx, xi = s[init].split(init.op.axis[2], nparts=num_sm) tx, xi = s[init].split(xi, nparts=num_thread_x) s[init].bind(bx, block_x) s[init].bind(tx, thread_x) s[next_c].set_store_predicate(thread_y.equal(0)) s[next_h].set_store_predicate(thread_y.equal(0)) for update in [update_c, update_h]: bx, xi = s[update].split(s[update].op.axis[2], nparts=num_sm) tx, xi = s[update].split(xi, nparts=num_thread_x) s[update].bind(bx, block_x) s[update].bind(tx, thread_x) s[update].set_store_predicate(thread_y.equal(0)) # verify we can lower correctly def check_device(target): num_step = n_num_step flstm = tvm.build(s, [Xi2h, Wh2h, scan_h, scan_c], target) ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0) # launch the kernel. scan_h_np = np.zeros( (num_step, batch_size, num_hidden)).astype("float32") scan_c_np = np.zeros( (num_step, batch_size, num_hidden)).astype("float32") Xi2h_np = np.random.normal(size=(num_step, batch_size, 4, num_hidden)).astype("float32") Wh2h_np = np.random.normal(size=(4, num_hidden, num_hidden)).astype("float32") scan_h_a = tvm.nd.array(scan_h_np, ctx) scan_c_a = tvm.nd.array(scan_c_np, ctx) Xi2h_a = tvm.nd.array(Xi2h_np, ctx) Wh2h_a = tvm.nd.array(Wh2h_np, ctx) flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a) ctx.sync() # measure time cost of second step. evaluator = flstm.time_evaluator(flstm.entry_name, ctx, 1, repeat=1000) eval_result = evaluator(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a) print("Time cost=%g" % eval_result.mean) # set unroll_explicit for more readable code. with tvm.build_config(detect_global_barrier=DETECT_GLOBAL_BARRIER, auto_unroll_max_step=128, unroll_explicit=False): check_device("cuda")
def test_basic(): m = tvm.var('m') ret = tvm.ir_pass.CanonicalSimplify(tvm.make.Evaluate(m - 1)) assert str(ret.value) == "(m - 1)"
def test_basic(): a = tvm.var('a') b = tvm.var('b') c = a + b assert str(c) == '(%s + %s)' % (a.name, b.name)
def test_stmt(): x = tvm.make.Evaluate(0) tvm.make.For(tvm.var('i'), 0, 1, tvm.stmt.For.Serial, 0, x)
def test_add_pipeline(): n = tvm.var('n') A = tvm.placeholder((n, ), name='A') B = tvm.placeholder((), name='B') C = tvm.compute(A.shape, lambda *i: A(*i) + B(), name='C') D = tvm.compute(A.shape, lambda *i: C(*i) + 1, name='D') s = tvm.create_schedule(D.op) # GPU schedule have to split by gridIdx and threadIdx num_thread = 256 xo, xi = s[C].split(C.op.axis[0], factor=num_thread) s[C].bind(xi, tvm.thread_axis("threadIdx.x")) s[C].bind(xo, tvm.thread_axis("blockIdx.x")) xo, xi = s[D].split(D.op.axis[0], factor=num_thread) s[D].bind(xi, tvm.thread_axis("threadIdx.x")) s[D].bind(xo, tvm.thread_axis("blockIdx.x")) # compile to IR s = s.normalize() bounds = tvm.schedule.InferBound(s) stmt = tvm.schedule.ScheduleOps(s, bounds) Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') Bb = tvm.decl_buffer(B.shape, B.dtype, name='B') Db = tvm.decl_buffer(D.shape, D.dtype, name='D') stmt = tvm.ir_pass.LoopPartition(stmt, False) stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb, D: Db}, 64) stmt = tvm.ir_pass.Simplify(stmt) fapi = tvm.ir_pass.MakeAPI(stmt, "myadd", [Ab, Bb, Db], 0, True) fsplits = [x for x in tvm.ir_pass.SplitHostDevice(fapi)] fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0]) def check_target(device, host="stackvm"): ctx = tvm.context(device, 0) if not ctx.exist: return if not tvm.module.enabled(host): return mhost = tvm.codegen.build_module(fsplits[0], host) mdev = tvm.codegen.build_module(fsplits[1:], device) mhost.import_module(mdev) code = mdev.get_source() f = mhost.entry_func # launch the kernel. n = 1027 a = tvm.nd.array(np.random.uniform(size=n).astype(Ab.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=()).astype(Bb.dtype), ctx) d = tvm.nd.array(np.zeros(n, dtype=Db.dtype), ctx) f(a, b, d) np.testing.assert_allclose(d.asnumpy(), a.asnumpy() + b.asnumpy() + 1) def check_module_save(device, host="stackvm"): ctx = tvm.context(device, 0) if not ctx.exist: return if not tvm.module.enabled(host): return fmt = "ptx" if device == "cuda" else device mhost = tvm.codegen.build_module(fsplits[0], host) mdev = tvm.codegen.build_module(fsplits[1:], device) temp = util.tempdir() mpath = temp.relpath("test.%s" % fmt) mdev.save(mpath) mdev2 = tvm.module.load(mpath) mhost.import_module(mdev2) f = mhost.entry_func # launch the kernel. n = 1027 a = tvm.nd.array(np.random.uniform(size=n).astype(Ab.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=()).astype(Bb.dtype), ctx) d = tvm.nd.array(np.zeros(n, dtype=Db.dtype), ctx) f(a, b, d) np.testing.assert_allclose(d.asnumpy(), a.asnumpy() + b.asnumpy() + 1) check_target("cuda", host="stackvm") check_target("cuda", host="llvm") check_module_save("cuda", host="stackvm") check_target("nvptx", host="llvm") check_target("vulkan", host="llvm") check_target("rocm", host="llvm") check_module_save("vulkan", host="stackvm")
def test_dtype(): x = tvm.var('x') assert x.dtype == 'int32' y = tvm.var('y') assert (x > y).dtype == 'uint1'
def test_deduce(): a = tvm.var('a') b = tvm.var('b') c = tvm.var('c') d = tvm.var('d') b_s = tvm.arith.IntervalSet(2, 3) c_s = tvm.arith.IntervalSet(10, 15) d_s = tvm.arith.IntervalSet(-3, -1) zero = tvm.const(0, "int32") fdiv = tvm.floordiv e0 = (-b) * a + c - d res0 = tvm.arith.deduce_bound(a, e0 >= 0, {b: b_s, c: c_s, d: d_s}, {}) ans0 = fdiv(d - c, b * -1) assert_expr_equal(res0.max_value, ans0) # expression containing variable a is on rhs res0 = tvm.arith.deduce_bound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {}) assert_expr_equal(res0.max_value, ans0) e0 = d * a + c - d res0 = tvm.arith.deduce_bound(a, e0 >= 0, {b: b_s, c: c_s, d: d_s}, {}) ans0 = fdiv(d - c, d) assert_expr_equal(res0.max_value, ans0) # expression containing variable a is on rhs res0 = tvm.arith.deduce_bound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {}) assert_expr_equal(res0.max_value, ans0) e1 = (a * 4 + b < c) res1 = tvm.arith.deduce_bound(a, e1, {b: b_s, c: c_s, d: d_s}, {}) ans1 = fdiv(c - 1 - b, 4) assert_expr_equal(res1.max_value, ans1) # expression containing variable a is on rhs e1 = (c > a * 4 + b) res1 = tvm.arith.deduce_bound(a, e1, {b: b_s, c: c_s, d: d_s}, {}) assert_expr_equal(res1.max_value, ans1) e2 = (tvm.max(5, a * 4) < 0) res2 = tvm.arith.deduce_bound(a, e2, {b: b_s, c: c_s, d: d_s}, {}) assert str(res2.max_value) == "neg_inf" assert str(res2.min_value) == "pos_inf" # expression containing variable a is on rhs e2 = (zero < tvm.max(5, a * 4)) res2 = tvm.arith.deduce_bound(a, e2, {b: b_s, c: c_s, d: d_s}, {}) assert str(res2.max_value) == "neg_inf" assert str(res2.min_value) == "pos_inf" e3 = (-b) + a * c - d res3 = tvm.arith.deduce_bound(a, e3 >= 0, { b: b_s, c: c_s, d: d_s }, { b: b_s, d: d_s }) ans3 = fdiv(2, c) + 1 assert str(tvm.ir_pass.Simplify(res3.min_value)) == str(ans3) res3 = tvm.arith.deduce_bound(a, zero <= e3, { b: b_s, c: c_s, d: d_s }, { b: b_s, d: d_s }) assert str(tvm.ir_pass.Simplify(res3.min_value)) == str(ans3) # tests for `EQ` op res4 = tvm.arith.deduce_bound(a, a == b, {}, {}) assert_expr_equal(res4.max_value, b) assert_expr_equal(res4.min_value, b) # Unsatisfiable `EQ`, variable as one of the Operand res5 = tvm.arith.deduce_bound(a, (a == b), {b: b_s}, {b: b_s}) assert str(res5.max_value) == "neg_inf" assert str(res5.min_value) == "pos_inf" # variable `a` on the RHS side res6 = tvm.arith.deduce_bound(a, 10 == a, {}, {}) assert_expr_equal(res6.max_value, 10) assert_expr_equal(res6.min_value, 10) # Add, Sub in `EQ` e4 = ((a - c) == (b + d)) ans4 = (b + d + c) res7 = tvm.arith.deduce_bound(a, e4, {b: b_s, c: c_s, d: d_s}, {}) assert_expr_equal(res7.max_value, ans4) assert_expr_equal(res7.min_value, ans4) # Satisfiable Mul in `EQ` with negative sign res8 = tvm.arith.deduce_bound(a, (5 * a == -10), {}, {}) assert_expr_equal(res8.max_value, -2) assert_expr_equal(res8.min_value, -2) # Unsatisfiable Mul in `EQ` e5 = (4 * a == b) res9 = tvm.arith.deduce_bound(a, e5, {b: b_s}, {}) assert str(res9.max_value) == "neg_inf" assert str(res9.min_value) == "pos_inf" # Unsatisfiable Mul in `EQ` res10 = tvm.arith.deduce_bound( a, (b * a == b), {b: b_s}, {}) # simplifier is not able to prove that (b % b == 0) assert str(res10.max_value) == "neg_inf" assert str(res10.min_value) == "pos_inf"
def test_sym_add(): a = tvm.var('a') b = tvm.var('b') c = tvm_ext.sym_add(a, b) assert c.a == a and c.b == b
# As a first step, we need to describe our computation. # TVM adopts tensor semantics, with each intermediate result # represented as multi-dimensional array. The user need to describe # the computation rule that generate the tensors. # # We first define a symbolic variable n to represent the shape. # We then define two placeholder Tensors, A and B, with given shape (n,) # # We then describe the result tensor C, with a compute operation. # The compute function takes the shape of the tensor, as well as a lambda function # that describes the computation rule for each position of the tensor. # # No computation happens during this phase, as we are only declaring how # the computation should be done. # n = tvm.var("n") A = tvm.placeholder((n, ), name='A') B = tvm.placeholder((n, ), name='B') C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C") print(type(C)) ###################################################################### # Schedule the Computation # ------------------------ # While the above lines describes the computation rule, we can compute # C in many ways since the axis of C can be computed in data parallel manner. # TVM asks user to provide a description of computation called schedule. # # A schedule is a set of transformation of computation that transforms # the loop of computations in the program. #
import tvm import numpy as np # 同一个计算有多种不同的计算方式,更会有不同的性能 # Schedule来决定如何计算,schedule是一组计算转换,用于转化程序中的循环计算 # schedule 是由一组opts组成 # 默认情况下,以行优先的串行方式计算 n = tvm.var('n') m = tvm.var('m') A = tvm.placeholder((m, n), name='A') B = tvm.placeholder((m, n), name='B') C = tvm.compute((m, n), lambda i, j: A[i, j] * B[i, j], name='C') s = tvm.create_schedule([C.op]) # lower会将计算从定义转换为真正的可调用函数。 使用参数`simple_mode = True`, # 它将返回一个可读的C like语句,我们在此处使用它来打印计划结果。 # print(tvm.lower(s, [A, B, C], simple_mode=True)) # 一个schedule由多个stage组成,一个stage代表一个opt # 每个stage提供多种方法 # split # 将特定的一维拆成两维 A = tvm.placeholder((m, ), name='A') B = tvm.compute((m, ), lambda i: A[i] * 2, name='B') s = tvm.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=32) # print(tvm.lower(s, [A, B], simple_mode=True)) s = tvm.create_schedule(B.op) bx, tx = s[B].split(B.op.axis[0], nparts=32)
def dot_16x1x16_int8_int8_int32(): """ Int8 dot product by every 4 elements using AVX2 Skylake instructions. This function takes two arrays of int8 datatype -- data[4] and kernel[16][4] -- and computes a dot product of data[4] with every 4 elements of kernels, resulting in output[16] of int32 datatype. The pseudo code is as follows. .. code-block:: c void dot_16x1x16_int8_int8_int32(int8 data[4], int8 kernel[16][4], int32 output[16]){ for (int i = 0; i < 16; i++){ out[i] = 0; for (int k = 0; k < 4; k++){ out[i] += data[k] * kernel[i][k] } } } Physically, the kernel array sits in an AVX512 vector register and the data[4] is broadcasted to another AVX512 vector register. This function returns a TensorIntrin that can be used to tensorize a schedule. Returns ------- intrin : TensorIntrin The Skylake int8 TensorIntrin that can be used in tensorizing schedule """ int32_lanes = 16 # 16 int32 lanes in AVX512 num_int8_elements = 4 # 4 int8 elements in int32 data = tvm.placeholder((num_int8_elements, ), dtype='uint8', name='data') kernel = tvm.placeholder((int32_lanes, num_int8_elements), dtype='int8', name='kernel') k = tvm.reduce_axis((0, num_int8_elements), name='k') C = tvm.compute( (int32_lanes, ), lambda i: tvm.sum( data[k].astype('int32') * kernel[i, k].astype('int32'), axis=k), name="C") a_buffer = tvm.decl_buffer(data.shape, dtype='uint8', name="a_buffer", offset_factor=1, strides=[1]) b_buffer = tvm.decl_buffer(kernel.shape, dtype='int8', name="b_buffer", offset_factor=1, strides=[tvm.var('ldw'), 1]) def _intrin_func(ins, outs): def _instr(index): ib = tvm.ir_builder.create() if index == 1: ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16'))) return ib.get() a_int8 = ins[0].vload([0], "uint8x4") re_int32 = tvm.call_pure_intrin('int32', 'reinterpret', a_int8) vec_ai32 = re_int32.astype('int32x16') vec_a = tvm.call_pure_intrin('int8x64', 'reinterpret', vec_ai32) vec_b = ins[1].vload([0, 0], "int8x64") vec_one = tvm.const(1, "int16x32") pair_reduction = tvm.call_llvm_intrin( 'int16x32', 'llvm.x86.avx512.pmaddubs.w.512', tvm.const(0, 'uint32'), vec_a, vec_b) quad_reduction = tvm.call_llvm_intrin( 'int32x16', 'llvm.x86.avx512.pmaddw.d.512', tvm.const(0, 'uint32'), pair_reduction, vec_one) if index == 0: ib.emit(outs[0].vstore(0, quad_reduction)) else: ib.emit(outs[0].vstore( 0, quad_reduction + outs[0].vload([0], 'int32x16'))) return ib.get() # body, reset, update return _instr(0), _instr(1), _instr(2) with tvm.build_config(offset_factor=1, partition_const_loop=True): return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={ data: a_buffer, kernel: b_buffer })
def test_gemm(): # graph nn = 1024 n = tvm.var('n') n = tvm.convert(nn) m = n l = n A = tvm.placeholder((n, l), name='A') B = tvm.placeholder((m, l), name='B') k = tvm.reduce_axis((0, l), name='k') C = tvm.compute((n, m), lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k), name='CC') # schedule s = tvm.create_schedule(C.op) xtile, ytile = 32, 32 scale = 8 num_thread = 8 block_factor = scale * num_thread block_x = tvm.thread_axis("blockIdx.x") thread_x = tvm.thread_axis("threadIdx.x") block_y = tvm.thread_axis("blockIdx.y") thread_y = tvm.thread_axis("threadIdx.y") CC = s.cache_write(C, "local") AA = s.cache_read(A, "shared", [CC]) BB = s.cache_read(B, "shared", [CC]) by, yi = s[C].split(C.op.axis[0], factor=block_factor) bx, xi = s[C].split(C.op.axis[1], factor=block_factor) s[C].reorder(by, bx, yi, xi) s[C].bind(by, block_y) s[C].bind(bx, block_x) ty, yi = s[C].split(yi, nparts=num_thread) tx, xi = s[C].split(xi, nparts=num_thread) s[C].reorder(ty, tx, yi, xi) s[C].bind(ty, thread_y) s[C].bind(tx, thread_x) yo, xo = CC.op.axis s[CC].reorder(k, yo, xo) s[CC].compute_at(s[C], tx) s[AA].compute_at(s[CC], k) s[BB].compute_at(s[CC], k) ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread) tx, xi = s[AA].split(xi, nparts=num_thread) s[AA].bind(ty, thread_y) s[AA].bind(tx, thread_x) ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread) tx, xi = s[BB].split(xi, nparts=num_thread) s[BB].bind(ty, thread_y) s[BB].bind(tx, thread_x) # lowering test s = s.normalize() # one line to build the function. def check_device(device): if not tvm.module.enabled(device): print("skip because %s is not enabled.." % device) return f = tvm.build(s, [A, B, C], device) ctx = tvm.context(device, 0) # launch the kernel. n = nn m = n l = n a_np = np.random.uniform(size=(n, l)).astype(A.dtype) b_np = np.random.uniform(size=(m, l)).astype(B.dtype) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) ftimer = f.time_evaluator(f.entry_name, ctx, number=1) tcost = ftimer(a, b, c).mean print("%s: exec=%g sec/op" % (ctx, tcost)) np.testing.assert_allclose(c.asnumpy(), np.dot(a_np, b_np.T), rtol=1e-5) check_device("nvptx -mcpu=sm_20") check_device("metal") check_device("opencl") check_device("cuda")
def test_canonical_mixed(): ck = CanonicalChecker() x = tvm.var("x") z = tvm.const(3, "int32") ck.verify(x / (z * z) - x / (z * z), 0) ck.verify(x / (z + z) - x / (z + z), 0)
def single_lstm(): num_gate = 4 hidden_size = tvm.var('hidden_size') batch_size = tvm.var('batch_size') input_size = tvm.var('input_size') # A single LSTM block operations without unrolling # '*' linear transformation # '(*)' elementwise multiplication # F_t = sigmoid( W_f * x_t + R_f * h_t-1 + b_f ) # I_t = sigmoid( W_i * x_t + R_i * h_t-1 + b_i ) # O_t = sigmoid( W_o * x_t + R_o * h_t-1 + b_o ) # C'_t = tanh( W_c * x_t + R_c * h_t-1 + b_c ) # C_t = F_t (*) C_t-1 + I_t (*) C'_t # h_t = O_t (*) tanh( C_t ) # Global transition matrix # input X[0..t-1] X = tvm.placeholder((batch_size, input_size), name="X") Prev_h = tvm.placeholder((batch_size, hidden_size), name="Prev_h") Prev_c = tvm.placeholder((batch_size, hidden_size), name="Prev_c") # Parameters # Weight matrices [W_i, W_f, W_o, W_c]: 4 * hidden_size * input_size # Bias: 4 * hidden_size Wi2h = tvm.placeholder((num_gate, hidden_size, input_size), name="Wi2h") Bi2h = tvm.placeholder((num_gate, hidden_size), name="Bi2h") # Weight matrices [R_i, R_f, R_o, R_c]: 4 * hidden_size * hidden_size # Only handle hidden transition, saves space. Wh2h = tvm.placeholder((num_gate, hidden_size, hidden_size), name="Wh2h") Bh2h = tvm.placeholder((num_gate, hidden_size), name="Bh2h") # LSTM transition # [W_i, W_f, W_o, W_c] * X_t: 4 * num_hidden l = tvm.reduce_axis((0, input_size), name="li2h") i2h = tvm.compute((batch_size, num_gate, hidden_size), lambda i, x, j: tvm.sum(X[i, l] * Wi2h[x, j, l], axis=l), name="i2h") # [R_i, R_f, R_o, R_c] * h_t-1: 4 * hidden_size # R: hidden_size * hidden_size, h: hidden_size * 1 k = tvm.reduce_axis((0, hidden_size), name="ki2h") h2h = tvm.compute( (batch_size, num_gate, hidden_size), lambda i, x, j: tvm.sum(Prev_h[i, k] * Wh2h[x, j, k], axis=k), name="h2h") gates = tvm.compute( (batch_size, num_gate, hidden_size), lambda i, j, k: i2h[i, j, k] + h2h[i, j, k] + Bi2h[j, k] + Bh2h[j, k], name="gates") gshape = (batch_size, hidden_size) in_gate = tvm.compute(gshape, lambda i, j: tvm.sigmoid(gates[i, 0, j]), name="in_gate") forget_gate = tvm.compute(gshape, lambda i, j: tvm.sigmoid(gates[i, 1, j]), name="forget_gate") out_gate = tvm.compute(gshape, lambda i, j: tvm.sigmoid(gates[i, 2, j]), name="out_gate") in_transform = tvm.compute(gshape, lambda i, j: tvm.tanh(gates[i, 3, j]), name="in_transform") # C_t = F_t o C_t-1 + I_t o C'_t state_c = tvm.compute((batch_size, hidden_size), lambda i, j: forget_gate[i, j] * Prev_c[i, j] + in_gate[i, j] * in_transform[i, j], name="state_c") # h_t = O_t o tanh( C_t ) # state_h = tvm.compute((batch_size, hidden_size), # lambda i, j: out_gate[i, j] * tvm.tanh(state_c[i, j]), name="state_h") out_c, out_h = tvm.compute( (batch_size, hidden_size), lambda i, j: (state_c[i, j], out_gate[i, j] * tvm.tanh(state_c[i, j])), name="outputs_c_h") # schedule s = tvm.create_schedule(out_h.op) print( tvm.lower(s, [X, Prev_h, Prev_c, Wi2h, Bi2h, Wh2h, Bh2h, out_c, out_h], simple_mode=True)) lstm = tvm.build(s, [X, Prev_h, Prev_c, Wi2h, Bi2h, Wh2h, Bh2h, out_c, out_h], name="single_lstm") print(lstm) lstm.save("remy_single_lstm.o") print(lstm.imported_modules) cc.create_shared("remy_single_lstm.so", ["remy_single_lstm.o"])
import tvm import numpy as np m = tvm.var('m') n = tvm.var('n') X = tvm.placeholder((m, n), name='X') s_state = tvm.placeholder((m, n)) s_init = tvm.compute((1, n), lambda _, i: X[0, i]) s_update = tvm.compute((m, n), lambda t, i: s_state[t - 1, i] + X[t, i]) s_scan = tvm.scan(s_init, s_update, s_state, inputs=[X]) # Schedule the Scan Cell s = tvm.create_schedule(s_scan.op) num_thread = 256 block_x = tvm.thread_axis('blockIdx.x') thread_x = tvm.thread_axis('threadIdx.x') xo, xi = s[s_init].split(s_init.op.axis[1], factor=num_thread) s[s_init].bind(xo, block_x) s[s_init].bind(xi, thread_x) xo, xi = s[s_update].split(s_update.op.axis[1], factor=num_thread) s[s_update].bind(xo, block_x) s[s_update].bind(xi, thread_x) print(tvm.lower(s, [X, s_scan], simple_mode=True)) # Build and Verify f_scan = tvm.build(s, [X, s_scan], 'cuda', name='my_scan') ctx = tvm.gpu(0) n = 1024 m = 10 a_np = np.random.uniform(size=(m, n)).astype(s_scan.dtype) a = tvm.nd.array(a_np, ctx=ctx)
from __future__ import absolute_import, print_function import tvm import numpy as np N = tvm.var('N') # Data set size V = tvm.var('V') # Feature number C = tvm.var('C') # Center number data = tvm.placeholder((N, V), name='data') center = tvm.placeholder((C, V), name='center') # === Start computation # Compute distances rv = tvm.reduce_axis((0, V), name='rv') dis = tvm.compute((N, C), lambda n, c: tvm.sum( (data[n, rv]-center[c, rv]).astype('float64')* (data[n, rv]-center[c, rv]).astype('float64'), axis=rv), name='dis') rc = tvm.reduce_axis((0, C), name='rc') mse_n = tvm.compute((N,), lambda n: tvm.sum(dis[n, rc], axis=rc), name='mse_n') rn = tvm.reduce_axis((0, N), name='rn') mse = tvm.compute((1,), lambda i: tvm.sum(mse_n[rn], axis=rn), name='mse') # === End computation # Scheduling s = tvm.create_schedule(mse.op) # Compilation
def test_make(): x = tvm.const(1, "int32") y = tvm.var("x") z = x + y assert isinstance(tvm.max(x, y), tvm.expr.Max) assert isinstance(tvm.min(x, y), tvm.expr.Min)
def test_vector_simplify(): ck = RewriteChecker() x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") # Add rules ck.verify( tvm.expr.Ramp(x, 1, 4) + tvm.expr.Ramp(y, 2, 4), tvm.expr.Ramp(x + y, 3, 4)) ck.verify(tvm.expr.Ramp(x, 1, 2) + y, tvm.expr.Ramp(x + y, 1, 2)) ck.verify(y + tvm.expr.Ramp(x, 1, 2), tvm.expr.Ramp(y + x, 1, 2)) ck.verify( y.astype("int32x2") + x.astype("int32x2"), (y + x).astype("int32x2")) # Sub rules ck.verify( tvm.expr.Ramp(x, 4, 4) - tvm.expr.Ramp(y, 2, 4), tvm.expr.Ramp(x - y, 2, 4)) ck.verify(tvm.expr.Ramp(x, 1, 2) - y, tvm.expr.Ramp(x - y, 1, 2)) ck.verify(y - tvm.expr.Ramp(x, 1, 2), tvm.expr.Ramp(y - x, -1, 2)) ck.verify( y.astype("int32x2") - x.astype("int32x2"), (y - x).astype("int32x2")) # Mul rules ck.verify( y.astype("int32x2") * x.astype("int32x2"), (y * x).astype("int32x2")) ck.verify(tvm.expr.Ramp(x, 4, 4) * 2, tvm.expr.Ramp(x * 2, 8, 4)) ck.verify(2 * tvm.expr.Ramp(x, 4, 4), tvm.expr.Ramp(x * 2, 8, 4)) ## Div rules ck.verify( y.astype("int32x2") / x.astype("int32x2"), (y / x).astype("int32x2")) ck.verify(tvm.expr.Ramp(x, 4, 4) / 2, tvm.expr.Ramp(x / 2, 2, 4)) ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 1000), override=True) ck.verify(tvm.expr.Ramp(x * 8 + 1, 1, 4) / 8, (x).astype("int32x4")) ck.verify( tvm.expr.Ramp(x * 8 + 15, 1, 4) / 8, tvm.expr.Ramp(x * 8 + 15, 1, 4) / 8) ## Mod rules ck.verify( y.astype("int32x2") % x.astype("int32x2"), (y % x).astype("int32x2")) ck.verify(tvm.expr.Ramp(x, 4, 4) % 2, tvm.expr.Broadcast(x % 2, 4)) ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 1000), override=True) ck.verify(tvm.expr.Ramp(x * 8 + 1, 1, 4) % 8, tvm.expr.Ramp(1, 1, 4)) ck.verify(tvm.expr.Ramp(x * 8 + 1, 15, 4) % 8, tvm.expr.Ramp(1, 15, 4) % 8) # Min/Max rules vx = tvm.var("vx", dtype="int32x2") vc = tvm.var("vc", dtype="uint1") ck.verify(tvm.min(y.astype("int32x2"), x.astype("int32x2")), tvm.min(y, x).astype("int32x2")) ck.verify(tvm.min(tvm.min(vx, y.astype("int32x2")), x.astype("int32x2")), tvm.min(vx, tvm.min(y, x).astype("int32x2"))) ck.verify(tvm.max(y.astype("int32x2"), x.astype("int32x2")), tvm.max(y, x).astype("int32x2")) ck.verify(tvm.max(tvm.max(vx, y.astype("int32x2")), x.astype("int32x2")), tvm.max(vx, tvm.max(y, x).astype("int32x2"))) ## Logical rules ck.verify( y.astype("int32x2").equal(x.astype("int32x2")), (y.equal(x)).astype("uint1x2")) ck.verify(tvm.expr.NE(y.astype("int32x2"), (x.astype("int32x2"))), (tvm.expr.NE(y, x)).astype("uint1x2")) ck.verify( y.astype("int32x2") > x.astype("int32x2"), (x < y).astype("uint1x2")) ck.verify( y.astype("int32x2") >= x.astype("int32x2"), (x <= y).astype("uint1x2")) ck.verify( y.astype("int32x2") < x.astype("int32x2"), (y < x).astype("uint1x2")) ck.verify( y.astype("int32x2") <= x.astype("int32x2"), (y <= x).astype("uint1x2")) ck.verify( tvm.expr.And( y.astype("int32x2") <= x.astype("int32x2"), vc.astype("uint1x2")), (tvm.expr.And(y <= x, vc)).astype("uint1x2")) ck.verify( tvm.expr.Or( y.astype("int32x2") <= x.astype("int32x2"), vc.astype("uint1x2")), (tvm.expr.Or(y <= x, vc)).astype("uint1x2"))
lstm = tvm.build(s, [X, Prev_h, Prev_c, Wi2h, Bi2h, Wh2h, Bh2h, out_c, out_h], name="single_lstm") print(lstm) lstm.save("remy_single_lstm.o") print(lstm.imported_modules) cc.create_shared("remy_single_lstm.so", ["remy_single_lstm.o"]) if __name__ == "__main__": #single_lstm() num_gate = 4 batch_size = 1 hidden_size = 2 input_size = tvm.var('input_size') lstm = tvm.module.load("./remy_single_lstm.so") x_np = np.array([[5]], dtype='float32') Wi2h_np = np.array([[[1], [3]], [[-5], [7]], [[1], [1]], [[1], [1]]], dtype='float32') Bi2h_np = np.array([[0, 0], [0, 0], [0, 0], [0, 0]], dtype='float32') Wh2h_np = np.array([[[0, 0], [0, 0]], [[0, 0], [0, 0]], [[0, 0], [0, 0]], [[0, 0], [0, 0]]], dtype='float32') Bh2h_np = np.array([[0, 0], [0, 0], [0, 0], [0, 0]], dtype='float32') scan_h_np = np.zeros(shape=(batch_size, hidden_size)).astype("float32") scan_c_np = np.zeros(shape=(batch_size, hidden_size)).astype("float32") x = tvm.nd.array(x_np) Wi2h = tvm.nd.array(Wi2h_np)
def test_bound(): m = tvm.var('m') vrange = tvm.convert( {m: tvm.Range(tvm.const(0, "int32"), tvm.const(10, "int32"))}) ret = tvm.ir_pass.Simplify(m % 10, vrange) assert ret == m
def test_deduce(): a = tvm.var('a') b = tvm.var('b') c = tvm.var('c') d = tvm.var('d') b_s = tvm.arith.intset_interval(2, 3) c_s = tvm.arith.intset_interval(10, 15) d_s = tvm.arith.intset_interval(-3, -1) zero = tvm.const(0, "int32") e0 = (-b) * a + c - d res0 = tvm.arith.DeduceBound(a, e0 >= 0, {b: b_s, c: c_s, d: d_s}, {}) ans0 = ((d - c) / (b * -1)) assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0) # expression containing variable a is on rhs res0 = tvm.arith.DeduceBound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {}) assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0) e0 = d * a + c - d res0 = tvm.arith.DeduceBound(a, e0 >= 0, {b: b_s, c: c_s, d: d_s}, {}) ans0 = ((0 - c) / d + 1) assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0) # expression containing variable a is on rhs res0 = tvm.arith.DeduceBound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {}) assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0) e1 = (a * 4 + b < c) res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {}) ans1 = (((c - b) + -1) / 4) assert str(tvm.ir_pass.Simplify(res1.max())) == str(ans1) # expression containing variable a is on rhs e1 = (c > a * 4 + b) res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {}) assert str(tvm.ir_pass.Simplify(res1.max())) == str(ans1) e2 = (tvm.max(5, a * 4) < 0) res2 = tvm.arith.DeduceBound(a, e2, {b: b_s, c: c_s, d: d_s}, {}) assert str(res2.max()) == "neg_inf" assert str(res2.min()) == "pos_inf" # expression containing variable a is on rhs e2 = (zero < tvm.max(5, a * 4)) res2 = tvm.arith.DeduceBound(a, e2, {b: b_s, c: c_s, d: d_s}, {}) assert str(res2.max()) == "neg_inf" assert str(res2.min()) == "pos_inf" e3 = (-b) + a * c - d res3 = tvm.arith.DeduceBound(a, e3 >= 0, { b: b_s, c: c_s, d: d_s }, { b: b_s, d: d_s }) ans3 = 2 / c + 1 assert str(tvm.ir_pass.Simplify(res3.min())) == str(ans3) res3 = tvm.arith.DeduceBound(a, zero <= e3, { b: b_s, c: c_s, d: d_s }, { b: b_s, d: d_s }) assert str(tvm.ir_pass.Simplify(res3.min())) == str(ans3)