def _instr(index): ib = tvm.ir_builder.create() if index == 1: ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16'))) return ib.get() a_int8 = ins[0].vload([0], "uint8x4") re_int32 = tvm.call_pure_intrin('int32', 'reinterpret', a_int8) vec_ai32 = re_int32.astype('int32x16') vec_a = tvm.call_pure_intrin('int8x64', 'reinterpret', vec_ai32) vec_b = ins[1].vload([0, 0], "int8x64") vec_one = tvm.const(1, "int16x32") pair_reduction = tvm.call_llvm_intrin('int16x32', 'llvm.x86.avx512.pmaddubs.w.512', tvm.const(0, 'uint32'), vec_a, vec_b) quad_reduction = tvm.call_llvm_intrin('int32x16', 'llvm.x86.avx512.pmaddw.d.512', tvm.const(0, 'uint32'), pair_reduction, vec_one) if index == 0: ib.emit(outs[0].vstore(0, quad_reduction)) else: ib.emit(outs[0].vstore(0, quad_reduction + outs[0].vload([0], 'int32x16'))) return ib.get()
def my_clip(x, a_min, a_max): """Unlike topi's current clip, put min and max into two stages.""" const_min = tvm.const(a_min, x.dtype) const_max = tvm.const(a_max, x.dtype) x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") return x
def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, variances=(0.1, 0.1, 0.2, 0.2)): """Location transformation for multibox detection Parameters ---------- cls_prob : tvm.Tensor Class probabilities. loc_pred : tvm.Tensor Location regression predictions. anchor : tvm.Tensor Prior anchor boxes. clip : boolean Whether to clip out-of-boundary boxes. threshold : float Threshold to be a positive prediction. variances : tuple of float Variances to be decoded from box regression output. Returns ------- ret : tuple of tvm.Tensor """ return hybrid_multibox_transform_loc(cls_prob, loc_pred, anchor, tvm.const(clip, "bool"), tvm.const(threshold, "float32"), tvm.convert(variances))
def stmt_generater(dtype_list, length): ib = tvm.ir_builder.create() base_dtype = dtype_list[0] global_a = tvm.placeholder((length,), name = "global_a", dtype = base_dtype) assert len(dtype_list) == 4 with ib.for_range(0, length, name="j") as j: dtype = dtype_list[0] A = ib.allocate(dtype, length, name="A", scope="local.L0A") A[j] = tvm.const(1, dtype = dtype) with ib.for_range(0, length, name="j") as j: dtype = dtype_list[1] B = ib.allocate(dtype, length, name="B", scope="local.L0A") B[j] = tvm.const(1, dtype = dtype) with ib.for_range(0, length, name="j") as j: dtype = dtype_list[2] C = ib.allocate(dtype, length, name="C", scope="local.L0A") C[j] = tvm.const(1, dtype = dtype) with ib.for_range(0, length, name="j") as j: dtype = dtype_list[3] D = ib.allocate(dtype, length, name="D", scope="local.L0A") D[j] = tvm.const(1, dtype = dtype) with ib.for_range(0, length, name="j") as j: dtype = "int8" E = ib.allocate(dtype, length, name="E", scope="local.L0A") E[j] = A[j].astype(dtype) + B[j].astype(dtype) + C[j].astype(dtype) + D[j].astype(dtype) return ib.get()
def test_reuse_small_buffer(): ib = tvm.ir_builder.create() n = tvm.var("n") with ib.for_range(0, n, name="i") as i: with ib.for_range(0, 10, name="j") as j: A = ib.allocate("int16", 200, name="A", scope="local.L0A") A[j] = tvm.const(1, "int16") B = ib.allocate("int16", 200, name="B", scope="local.L0A") B[j] = tvm.const(1, "int16") B1 = ib.allocate("int16", 200, name="B1", scope="local.L0A") B1[j] = A[j] + B[j] C = ib.allocate("int16", 400, name="C", scope="local.L0A") C[j] = tvm.const(1, "int16") D = ib.allocate("int16", 400, name="D", scope="local.L0A") D[j] = tvm.const(1, "int16") E = ib.allocate("int16", 400, name="E", scope="local.L0A") E[j] = C[j] body = ib.get() body = tvm.ir_pass.StorageRewrite(body) num_alloc = [0] def verify(n): if isinstance(n, tvm.stmt.Allocate): num_alloc[0] += 1 assert n.extents[0].value == 800 tvm.ir_pass.PostOrderVisit(body, verify) assert num_alloc[0] == 1
def test_alloc_seq_type(): ib = tvm.ir_builder.create() n = tvm.var("n") with ib.for_range(0, n, name="i") as i: with ib.for_range(0, 10, name="j") as j: A = ib.allocate("float32", 200, name="A", scope="local.L0A") A1 = ib.allocate("float32", 200, name="A1", scope="local.L0A") A[j] = 1.2 A1[j] = 1.3 B = ib.allocate("int16", 200, name="B", scope="local.L0A") B[j] = tvm.const(1, "int16") C = ib.allocate("int16", 200, name="C", scope="local.L0A") C[j] = tvm.const(1, "int16") D = ib.allocate("int16", 200, name="D", scope="local.L0A") D[j] = B[j] + C[j] A2 = ib.allocate("float32", 200, name="A2", scope="local.L0A") A2[j] = A[j] body = ib.get() body = tvm.ir_pass.StorageRewrite(body) num_alloc = [0] def verify(n): if isinstance(n, tvm.stmt.Allocate): num_alloc[0] += 1 assert n.extents[0].value == 500 tvm.ir_pass.PostOrderVisit(body, verify) assert num_alloc[0] == 1
def _sample(i, c, ph, pw): roi = rois[i] batch_index = roi[0].astype('int32') roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[3], roi[4] roi_start_h *= spatial_scale roi_end_h *= spatial_scale roi_start_w *= spatial_scale roi_end_w *= spatial_scale # force malformed ROIs to be 1x1 roi_h = tvm.max(roi_end_h - roi_start_h, tvm.const(1.0, dtype)) roi_w = tvm.max(roi_end_w - roi_start_w, tvm.const(1.0, dtype)) bin_h = roi_h / pooled_size_h bin_w = roi_w / pooled_size_w if sample_ratio > 0: roi_bin_grid_h = roi_bin_grid_w = tvm.const(sample_ratio, 'int32') else: roi_bin_grid_h = tvm.ceil(roi_h / pooled_size_h).astype('int32') roi_bin_grid_w = tvm.ceil(roi_w / pooled_size_w).astype('int32') count = roi_bin_grid_h * roi_bin_grid_w rh = tvm.reduce_axis((0, roi_bin_grid_h)) rw = tvm.reduce_axis((0, roi_bin_grid_w)) roi_start_h += ph * bin_h roi_start_w += pw * bin_w return tvm.sum(_bilinear(batch_index, c, roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h, roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w) / count, axis=[rh, rw])
def _bitpack(*indices): packed_data = [tvm.const(0, pack_type)] * bits for k in range(data_width): # Translate indices for packed data back to original idx = [0] * n j = 0 for i in range(n+1): if i == bit_axis: continue elif i == pack_axis: idx[j] = indices[i] * data_width + k else: idx[j] = indices[i] j += 1 element = data(*idx) for b in range(bits): extracted_bit = ((element & tvm.const(masks[b], "int32")) >> b).astype(pack_type) packed_data[b] = (packed_data[b] | extracted_bit) if k < data_width - 1: packed_data[b] = packed_data[b] << 1 if k == data_width - 1: return tuple(packed_data) return tuple(packed_data)
def select_array(i, j): now = tvm.const(0.0, dtype) for ii in range(row): for jj in range(col): now = tvm.expr.Select(tvm.all(i % row == ii, j % col == jj), tvm.const(matrix[ii][jj], dtype), now) return now
def test_const_saveload_json(): # save load json x = tvm.const(1, "int32") y = tvm.const(10, "int32") z = x + y z = z + z json_str = tvm.save_json(z) zz = tvm.load_json(json_str) assert tvm.save_json(zz) == tvm.save_json(z)
def test_make_smap(): # save load json x = tvm.const(1, "int32") y = tvm.const(10, "int32") z = tvm.expr.Add(x, y) smap = tvm.convert({"z": z, "x": x}) json_str = tvm.save_json(tvm.convert([smap])) arr = tvm.load_json(json_str) assert len(arr) == 1 assert arr[0]["z"].a == arr[0]["x"]
def _intrin_func(ins, outs): ww, xx = ins zz = outs[0] vpadd = "llvm.arm.neon.vpadd.v8u8" vpadalu = "llvm.arm.neon.vpadalu.v16u8.v8u16" args_1 = tvm.const(1, 'uint32') args_2 = tvm.const(2, 'uint32') def _instr(index): irb = tvm.ir_builder.create() if index == 1: irb.emit(zz.vstore(0, tvm.const(0, 'uint16x8'))) return irb.get() cnts8 = [None] * 8 cnts4 = [None] * 4 cnts2 = [None] * 2 for bw in range(w_b): for bx in range(x_b): if k_i == 16: for i in range(m): ands = ww.vload([bw, i, 0], 'uint8x16') & xx.vload([bx, 0], 'uint8x16') cnts = tvm.popcount(ands) upper_half = tvm.call_pure_intrin('uint8x8', 'vectorhigh', cnts) lower_half = tvm.call_pure_intrin('uint8x8', 'vectorlow', cnts) cnts8[i] = upper_half + lower_half for i in range(m//2): cnts4[i] = tvm.call_llvm_intrin('uint8x8', vpadd, args_1, cnts8[i*2], cnts8[i*2+1]) for i in range(m//4): cnts2[i] = tvm.call_llvm_intrin('uint8x8', vpadd, args_1, cnts4[i*2], cnts4[i*2+1]) cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) shifted_cnts = cnts << tvm.const(bw+bx, dtype) out = tvm.call_llvm_intrin('uint16x8', vpadalu, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) else: # ki == 8 for i in range(m): ands = ww.vload([bw, i, 0], 'uint8x8') & xx.vload([bx, 0], 'uint8x8') cnts8[i] = tvm.popcount(ands) for i in range(m//2): cnts4[i] = tvm.call_llvm_intrin('uint8x8', vpadd, args_1, cnts8[i*2], cnts8[i*2+1]) for i in range(m//4): cnts2[i] = tvm.call_llvm_intrin('uint8x8', vpadd, args_1, cnts4[i*2], cnts4[i*2+1]) cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) shifted_cnts = cnts << tvm.const(bw+bx, dtype) out = tvm.call_llvm_intrin('uint16x8', vpadalu, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) irb.emit(zz.vstore(0, out)) return irb.get() # body, reset, update return _instr(0), _instr(1), _instr(2)
def test_bitwise(): x = tvm.var('x') y = tvm.var('y') assert str(x << y) == 'shift_left(x, y)' assert str(x >> y) == 'shift_right(x, y)' assert str(x & y) == 'bitwise_and(x, y)' assert str(x | y) == 'bitwise_or(x, y)' assert str(x ^ y) == 'bitwise_xor(x, y)' assert str(~x) == 'bitwise_not(x)' assert(tvm.const(1, "int8x2") >> 1).dtype == "int8x2" assert(x >> tvm.const(1, "int32x2")).dtype == "int32x2" assert(tvm.var("z", "int8x2") << tvm.const(1, "int8x2")).dtype == "int8x2"
def _decl_im2col(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'): """declare the Im2Col method for conv2d""" _, CI, IH, IW = [x.value for x in data.shape] CO, _, KH, KW = [x.value for x in kernel.shape] HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) if isinstance(stride, (tuple, list)): HSTR, WSTR = stride else: HSTR, WSTR = stride, stride N = 1 OH = (IH + 2*HPAD - KH) // HSTR + 1 OW = (IW + 2*WPAD - KW) // WSTR + 1 DO_PAD = (HPAD != 0 and WPAD != 0) if DO_PAD: data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") else: data_pad = data ALIGN = 16 def upround(x, align): return (x + align - 1) // align * align # A [CO, CI * KH * KW] reduce_len = upround(CI * KH * KW, ALIGN) A = tvm.compute((upround(CO, ALIGN), reduce_len), lambda i, j: kernel[i][j // KW // KH][j // KW % KH][j % KW], name='A') # B [CI * KH * KW, N * OH * OW] B = tvm.compute((reduce_len, upround(N * OH * OW, ALIGN)), lambda i, j:\ tvm.select(tvm.all(i < CI * KH * KW, j < N * OH * OW), data_pad[j // (OH*OW)][i // (KH*KW)][j // OW % OH*HSTR + i // KW % KH] [j % OW*WSTR + i % KW], tvm.const(0, data_pad.dtype)), name='B') gemm_n, gemm_l, gemm_m = A.shape[0], reduce_len, B.shape[1] # C [CO, N * OH * OW] k = tvm.reduce_axis((0, gemm_l), name='k') C = tvm.compute((gemm_n, gemm_m), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') # output # the last term C[gemm_n-1, gemm_m-1] is for enabling the alignment, # otherwise the alignment above will be eliminated by bound inference output = tvm.compute((N, CO, OH, OW), lambda n, co, h, w:\ C[co][n * OW * OW + h * OW + w] + tvm.const(0, C.dtype) * C[gemm_n-1, gemm_m-1], name='output', tag='im2col_conv_output') return output
def test_const_propagation(): x1 = tvm.const(4, "int32") x2 = x1 + 5 assert isinstance(x2, tvm.expr.IntImm) and x2.value == 9 x3 = x2 / 3 assert isinstance(x3, tvm.expr.IntImm) and x3.value == 3 x4 = x3 + 0.5 assert isinstance(x4, tvm.expr.FloatImm) and x4.value == 3.5 x5 = tvm.ceil(x4) assert isinstance(x5, tvm.expr.FloatImm) and x5.value == 4 x6 = x5.astype('int') assert isinstance(x6, tvm.expr.IntImm) and x6.value == 4 y = (tvm.round((tvm.const(6.5, 'float32') - 1) / 1.5) + 2).astype('int') assert isinstance(y, tvm.expr.IntImm) and y.value == 6
def compute_clip(attrs, inputs, _): """ Clip operator. """ x = inputs[0] a_min = attrs.get_float("a_min") a_max = attrs.get_float("a_max") const_min = tvm.const(a_min, x.dtype) const_max = tvm.const(a_max, x.dtype) with tvm.tag_scope(topi.tag.ELEMWISE): x = tvm.compute( x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") x = tvm.compute( x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") return x
def check_select(ctx, n, dtype): A = tvm.placeholder((n,), name='A', dtype=dtype) true_value = tvm.const(1, dtype=dtype) false_value = tvm.const(3, dtype=dtype) max_lhs = tvm.const(2, dtype=dtype) max_rhs = tvm.expr.Select(A[0] > 0, true_value, false_value) C = tvm.compute((n,), lambda i: tvm.max(max_lhs, max_rhs), name='C') s = tvm.create_schedule(C.op) s[C].bind(s[C].op.axis[0], tvm.thread_axis("threadIdx.x")) fun = tvm.build(s, [A, C], target) a = tvm.nd.empty((n,), A.dtype, ctx) c = tvm.nd.empty((n,), A.dtype, ctx) # Only need to test compiling here fun(a, c)
def test_scan(): m = tvm.var("m") n = tvm.var("n") x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x") s_state = tvm.placeholder((m, n)) s_init = tvm.compute((1, n), lambda _, i: x[0, i], name="s_init") x_trans = tvm.compute((m, n), lambda i, j: x[i, j] + 1, name="x_trans") s_up1 = tvm.compute((m, n), lambda t, i: s_state[t - 1, i] + 1, name="up1") s_update = tvm.compute((m, n), lambda t, i: s_up1[t, i] + x_trans[t, i], name="update") s_scan = tvm.scan(s_init, s_update, s_state) def test_getbody(): body = tvm.schedule.ScanGetBody(s_scan.op) assert set(body) == set([s_scan.op, s_update.op, s_up1.op]) def test_attach_path(): s = tvm.create_schedule(s_scan.op) s[x_trans].compute_at(s[s_update], s_update.op.axis[0]) apath = tvm.schedule.CreateAttachPath(s) assert(tuple(apath[s_update.op]) == tuple([s_scan.op.scan_axis])) assert(tuple(apath[x_trans.op]) == tuple([s_update.op.axis[0], s_scan.op.scan_axis])) def test_fix_pt(): body = tvm.schedule.ScanGetBody(s_scan.op) fxpt = tvm.schedule.ScanFixPointAnalysis(s_scan.op, body) assert(fxpt[s_scan.spatial_axis_[0]].value != 0)
def test_unroll_loop(): ib = tvm.ir_builder.create() dtype = 'int64' n = tvm.var('n') Ab = tvm.decl_buffer((n, ), dtype) Aptr = ib.buffer_ptr(Ab) # for i in 0 to n-1: with ib.for_range(n, n + 2, name="i") as i: with ib.for_range(0, 8, name="i", for_type="unroll") as j: Aptr[j + 1] = Aptr[i] + 1 stmt = ib.get() assert isinstance(stmt, tvm.stmt.For) ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, 0, True) assert not isinstance(ret, tvm.stmt.For) ret = tvm.ir_pass.UnrollLoop(stmt, 15, 8, 0, True) assert isinstance(ret, tvm.stmt.For) ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, 0, False) assert isinstance(ret, tvm.stmt.For) assert ret.for_type == tvm.stmt.For.Unrolled ib = tvm.ir_builder.create() ib.scope_attr(tvm.const(0, "int32"), "pragma_auto_unroll_max_step", 16) ib.emit(stmt) wrapped = ib.get() wrapped = tvm.make.Block(wrapped, stmt) assert isinstance(ret, tvm.stmt.For) ret = tvm.ir_pass.UnrollLoop(wrapped, 0, 8, 0, False) assert isinstance(ret.first, tvm.stmt.For) assert ret.first.for_type == tvm.stmt.For.Unrolled assert isinstance(ret.rest, tvm.stmt.For) assert ret.rest.for_type != tvm.stmt.For.Unrolled
def test_tensor_comm_reducer(): m = tvm.var('m') n = tvm.var('n') A = tvm.placeholder((m, n), name='A') k = tvm.reduce_axis((0, n), "k") mysum = tvm.comm_reducer(lambda x, y: x+y, lambda t: tvm.const(0, dtype=t)) C = tvm.compute((m,), lambda i: mysum(A[i, k], axis=k))
def test_const_param(): @tvm.hybrid.script def add_something(a, b): c = output_tensor((11, ), 'int32') for i in range(11): c[i] = a[i] + b return c a = tvm.placeholder((11, ), dtype='int32', name='a') b = tvm.const(11, 'int32') c = add_something(a, b) sch = tvm.create_schedule(c.op) module = tvm.build(sch, [a, c], 'llvm') assert(module) np_a = numpy.arange(11).astype('int32') np_b = 11 np_c = numpy.zeros((11, )).astype('int32') nd_a = tvm.ndarray.array(np_a) nd_c = tvm.ndarray.array(numpy.zeros((11, )).astype('int32')) module(nd_a, nd_c) ref = add_something(np_a, 11) tvm.testing.assert_allclose(nd_c.asnumpy(), ref, 1e-5, 1e-5)
def test_parallel_alloc(): ib = tvm.ir_builder.create() n = tvm.var("n") with ib.for_range(0, n, name="i", for_type="parallel") as i: with ib.for_range(0, 10, name="j") as j: A = ib.allocate("float32", n, name="A", scope="global") A[j] = A[j] + 2 body = ib.get() body = tvm.ir_pass.StorageRewrite(body) assert (isinstance(body.body.body, tvm.stmt.Allocate)) ib = tvm.ir_builder.create() n = tvm.var("n") with ib.for_range(0, n, name="t") as i: ib.scope_attr( tvm.const(1, "int32") , "pragma_scope", tvm.make.StringImm("parallel_launch_point")) with ib.for_range(0, n, name="i", for_type="parallel") as i: with ib.for_range(0, 10, name="j") as j: A = ib.allocate("float32", n, name="A", scope="global") A[j] = A[j] + 2 body = ib.get() body = tvm.ir_pass.StorageRewrite(body) assert(isinstance(body.body.body.body.body, tvm.stmt.Allocate))
def test_alloc_seq_type2(): scope_tb = "local.L0A2" max_bits=1024 * 1024 * 1024 register_mem(scope_tb, max_bits) ib = tvm.ir_builder.create() n = tvm.var("n") with ib.for_range(0, n, name="i") as i: with ib.for_range(0, 10, name="j") as j: A = ib.allocate("float32", 200, name="A", scope=scope_tb) A[j] = 1.2 with ib.for_range(0, 20, name="j") as j: B = ib.allocate("int16", 400, name="B", scope=scope_tb) B[j] = tvm.const(1, "int16") with ib.for_range(0, 10, name="j") as j: C = ib.allocate("float32", 200, name="C", scope=scope_tb) C[j] = 1.2 body = ib.get() body = tvm.ir_pass.StorageRewrite(body) num_alloc = [0] def verify(n): if isinstance(n, tvm.stmt.Allocate): num_alloc[0] += 1 assert n.extents[0].value == 200 tvm.ir_pass.PostOrderVisit(body, verify) assert num_alloc[0] == 1
def test_scan_group(): m = tvm.var("m") n = tvm.var("n") x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x") s_state = tvm.placeholder((m, n)) s_init = tvm.compute((1, n), lambda _, i: x[0, i]) s_update1 = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + x[t, i]) s_update2 = tvm.compute((m, n), lambda t, i: s_update1[t, i] + 1) s_update3 = tvm.compute((m, n), lambda t, i: s_update2[t, i] + 1) res = tvm.scan(s_init, s_update3, s_state, inputs=x) s = tvm.create_schedule(res.op) assert s[s_update1].group is not None assert s[s_update2].group == s[s_update1].group # Assign within group, is valid s[s_update1].compute_at(s[s_update2], s_update2.op.axis[1]) # create a new group, for [s_update2 and s_update1] g2 = s.create_group(outputs=s_update2, inputs=[s_state, x]) assert g2.group is not None assert g2.group == s[s_update3].group assert s[s_update2].group == g2 assert s[s_update1].group == g2 g2.compute_at(s[s_update3], s_update3.op.axis[1]) assert g2.attach_stage == s[s_update3] try: # compute outside group error. s[s_update2].compute_at(s[s_init], s_init.op.axis[0]) assert False except tvm.TVMError: pass
def test_in_bounds_vectorize_llvm(): n = 512 lanes = 2 A = tvm.placeholder((n,), name='A', dtype="float32x%d" % lanes) B = tvm.compute((n,), lambda i: A[i], name='B') C = tvm.compute((n,), lambda i: B[i] + tvm.const(1, A.dtype), name='C') s = tvm.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], nparts=2) _, xi = s[C].split(xi, factor=2) s[C].parallel(xo) s[C].vectorize(xi) s[B].compute_at(s[C], xo) xo, xi = s[B].split(B.op.axis[0], factor=2) s[B].vectorize(xi) # build and invoke the kernel. lowered_func = tvm.lower (s, [A, C], "llvm", simple_mode=False) print (lowered_func.body) f = tvm.build(s, [A, C], "llvm") ctx = tvm.cpu(0) # launch the kernel. a = tvm.nd.empty((n,), A.dtype).copyfrom( np.random.uniform(size=(n, lanes))) c = tvm.nd.empty((n,), C.dtype, ctx) f(a, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
def _check_compact(buf): ndim = len(buf.shape) size = tvm.const(1, buf.shape[0].dtype) for i in reversed(range(ndim)): if not util.equal_const_int(size - buf.strides[i], 0): raise RuntimeError( "Cannot prove compact: shape=%s, strides=%s" % (buf.shape, buf.strides)) size = size * buf.shape[i]
def test_deduce(): a = tvm.var('a') b = tvm.var('b') c = tvm.var('c') d = tvm.var('d') b_s = tvm.arith.intset_interval(2, 3) c_s = tvm.arith.intset_interval(10, 15) d_s = tvm.arith.intset_interval(-3, -1) zero = tvm.const(0, "int32") e0 = (-b)*a+c-d res0 = tvm.arith.DeduceBound(a, e0>=0, {b: b_s, c: c_s, d: d_s}, {}) ans0 = ((d - c) /(b*-1)) assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0) # expression containing variable a is on rhs res0 = tvm.arith.DeduceBound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {}) assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0) e0 = d*a+c-d res0 = tvm.arith.DeduceBound(a, e0>=0, {b: b_s, c: c_s, d: d_s}, {}) ans0 = ((0-c)/d + 1) assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0) # expression containing variable a is on rhs res0 = tvm.arith.DeduceBound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {}) assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0) e1 = (a*4+b < c) res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {}) ans1 = (((c - b) + -1)/4) assert str(tvm.ir_pass.Simplify(res1.max())) == str(ans1) # expression containing variable a is on rhs e1 = (c > a*4+b) res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {}) assert str(tvm.ir_pass.Simplify(res1.max())) == str(ans1) e2 = (tvm.max(5, a * 4) < 0) res2 = tvm.arith.DeduceBound(a, e2, {b: b_s, c: c_s, d: d_s}, {}) assert str(res2.max()) == "neg_inf" assert str(res2.min()) == "pos_inf" # expression containing variable a is on rhs e2 = (zero < tvm.max(5, a * 4)) res2 = tvm.arith.DeduceBound(a, e2, {b: b_s, c: c_s, d: d_s}, {}) assert str(res2.max()) == "neg_inf" assert str(res2.min()) == "pos_inf" e3 = (-b)+a*c-d res3 = tvm.arith.DeduceBound(a, e3>=0, {b: b_s, c: c_s, d: d_s}, {b: b_s, d: d_s}) ans3 = 2/c+1 assert str(tvm.ir_pass.Simplify(res3.min())) == str(ans3) res3 = tvm.arith.DeduceBound(a, zero <= e3, {b: b_s, c: c_s, d: d_s}, {b: b_s, d: d_s}) assert str(tvm.ir_pass.Simplify(res3.min())) == str(ans3)
def check_mod(start, end, divisor, dtype): T = tvm.compute((end - start,), lambda i: tvm.expr.Cast(dtype, (start + i)) % tvm.const(divisor, dtype)) s = tvm.create_schedule([T.op]) f = tvm.build(s, [T], "llvm") a = tvm.nd.empty((end - start,), dtype) f(a) ref = [int(math.fmod(i, divisor)) for i in range(start, end)] tvm.testing.assert_allclose(a.asnumpy(), ref)
def test_llvm_lookup_intrin(): ib = tvm.ir_builder.create() m = tvm.var("m") A = ib.pointer("uint8x8", name="A") x = tvm.call_llvm_intrin("uint8x8", "llvm.ctpop.i8", tvm.const(1, 'uint32'), A) ib.emit(x) body = ib.get() func = tvm.ir_pass.MakeAPI(body, "ctpop", [A], 1, True) fcode = tvm.build(func, None, "llvm")
def _instr(index): irb = tvm.ir_builder.create() if index == 1: irb.emit(zz.vstore(0, tvm.const(0, 'uint16x8'))) return irb.get() cnts8 = [None] * 8 cnts4 = [None] * 4 cnts2 = [None] * 2 for bw in range(w_b): for bx in range(x_b): if k_i == 16: for i in range(m): ands = ww.vload([bw, i, 0], 'uint8x16') & xx.vload([bx, 0], 'uint8x16') cnts = tvm.popcount(ands) upper_half = tvm.call_pure_intrin('uint8x8', 'vectorhigh', cnts) lower_half = tvm.call_pure_intrin('uint8x8', 'vectorlow', cnts) cnts8[i] = upper_half + lower_half for i in range(m//2): cnts4[i] = tvm.call_llvm_intrin('uint8x8', vpadd, args_1, cnts8[i*2], cnts8[i*2+1]) for i in range(m//4): cnts2[i] = tvm.call_llvm_intrin('uint8x8', vpadd, args_1, cnts4[i*2], cnts4[i*2+1]) cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) shifted_cnts = cnts << tvm.const(bw+bx, dtype) out = tvm.call_llvm_intrin('uint16x8', vpadalu, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) else: # ki == 8 for i in range(m): ands = ww.vload([bw, i, 0], 'uint8x8') & xx.vload([bx, 0], 'uint8x8') cnts8[i] = tvm.popcount(ands) for i in range(m//2): cnts4[i] = tvm.call_llvm_intrin('uint8x8', vpadd, args_1, cnts8[i*2], cnts8[i*2+1]) for i in range(m//4): cnts2[i] = tvm.call_llvm_intrin('uint8x8', vpadd, args_1, cnts4[i*2], cnts4[i*2+1]) cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) shifted_cnts = cnts << tvm.const(bw+bx, dtype) out = tvm.call_llvm_intrin('uint16x8', vpadalu, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) irb.emit(zz.vstore(0, out)) return irb.get()
def conv2d_transpose_nchw_cuda(cfg, data, kernel, stride, padding, out_dtype): """Transposed 2D convolution nchw forward operator. Parameters ---------- cfg: ConfigEntity The config for this template Input : tvm.Tensor 4-D with shape [batch, in_channel, in_height, in_width] Filter : tvm.Tensor 4-D with shape [in_channel, num_filter, filter_height, filter_width] strides : tuple of two ints The spatial stride along height and width padding : int or str Padding size, or ['VALID', 'SAME'] out_dtype: str The output type. This is used in mixed precision Returns ------- Output : tvm.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ batch, inp_channels, inp_height, inp_width = get_const_tuple(data.shape) _, out_channels, kernel_height, kernel_width = get_const_tuple( kernel.shape) stride_height, stride_width = stride cfg.stride = stride pad_top, pad_left, pad_bottom, pad_right = nn.get_pad_tuple( padding, (kernel_height, kernel_width)) out_width = (inp_width - 1) * stride_width + \ kernel_width - pad_left - pad_right pad_left = kernel_width - 1 - pad_left pad_right = kernel_width - 1 - pad_right dilated_width = stride_width * (inp_width - 1) + 1 out_height = (inp_height - 1) * stride_height + \ kernel_height - pad_top - pad_bottom pad_top = kernel_height - 1 - pad_top pad_bottom = kernel_height - 1 - pad_bottom dilated_height = stride_height * (inp_height - 1) + 1 # compute pad data = tvm.compute( (batch, inp_channels, pad_top + dilated_height + pad_bottom, pad_left + dilated_width + pad_right), lambda n, c, y, x: tvm.if_then_else( tvm.all(x >= pad_left, x < pad_left + dilated_width, tvm.indexmod(x - pad_left, stride_width).equal(0), y >= pad_top, y < pad_top + dilated_height, tvm.indexmod(y - pad_top, stride_height).equal(0)), data[ n, c, tvm.indexdiv(y - pad_top, stride_height), tvm.indexdiv(x - pad_left, stride_width)], tvm.const(0., "float32")), name='data_pad') # compute transposed conv dc = tvm.reduce_axis((0, inp_channels), name='dc') dh = tvm.reduce_axis((0, kernel_height), name='dh') dw = tvm.reduce_axis((0, kernel_width), name='dw') data_out = tvm.compute( (batch, out_channels, out_height, out_width), lambda b, c, h, w: tvm.sum(data[b, dc, h + dh, w + dw].astype( out_dtype) * kernel[dc, c, kernel_height - 1 - dh, kernel_width - 1 - dw].astype(out_dtype), axis=[dc, dh, dw]), tag="conv2d_transpose_nchw") return data_out
def test_tensor_comm_reducer_overload(): m = tvm.var('m') n = tvm.var('n') mysum = tvm.comm_reducer(lambda x, y: x + y, lambda t: tvm.const(0, dtype=t)) sum_res = mysum(m, n)
def test_bound(): m = tvm.var('m') vrange = tvm.convert({m: tvm.Range(tvm.const(0), tvm.const(10))}) ret = tvm.ir_pass.Simplify(m % 10, vrange) assert ret == m
out_channel = 512 in_size = 14 kernel = 3 pad = 1 stride = 1 # Algorithm A = tvm.placeholder((in_size, in_size, in_channel, batch), name='A') W = tvm.placeholder((kernel, kernel, in_channel, out_channel), name='W') out_size = (in_size - kernel + 2 * pad) // stride + 1 # Pad input Apad = tvm.compute( (in_size + 2 * pad, in_size + 2 * pad, in_channel, batch), lambda yy, xx, cc, nn: tvm.select( tvm.all(yy >= pad, yy - pad < in_size, xx >= pad, xx - pad < in_size), A[yy - pad, xx - pad, cc, nn], tvm.const(0.)), name='Apad') # Create reduction variables rc = tvm.reduce_axis((0, in_channel), name='rc') ry = tvm.reduce_axis((0, kernel), name='ry') rx = tvm.reduce_axis((0, kernel), name='rx') # Compute the convolution B = tvm.compute( (out_size, out_size, out_channel, batch), lambda yy, xx, ff, nn: tvm.sum(Apad[yy * stride + ry, xx * stride + rx, rc, nn] * W[ry, rx, rc, ff], axis=[ry, rx, rc]), name='B') ############################################################################### # Memory Hierarchy
def test_deduce(): a = tvm.var('a') b = tvm.var('b') c = tvm.var('c') d = tvm.var('d') b_s = tvm.arith.IntervalSet(2, 3) c_s = tvm.arith.IntervalSet(10, 15) d_s = tvm.arith.IntervalSet(-3, -1) zero = tvm.const(0, "int32") e0 = (-b) * a + c - d res0 = tvm.arith.DeduceBound(a, e0 >= 0, {b: b_s, c: c_s, d: d_s}, {}) ans0 = ((d - c) / (b * -1) + (-1)) assert_expr_equal(res0.max_value, ans0) # expression containing variable a is on rhs res0 = tvm.arith.DeduceBound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {}) assert_expr_equal(res0.max_value, ans0) e0 = d * a + c - d res0 = tvm.arith.DeduceBound(a, e0 >= 0, {b: b_s, c: c_s, d: d_s}, {}) ans0 = ((d - c) / d - 1) assert_expr_equal(res0.max_value, ans0) # expression containing variable a is on rhs res0 = tvm.arith.DeduceBound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {}) assert_expr_equal(res0.max_value, ans0) e1 = (a * 4 + b < c) res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {}) ans1 = (((c - b) + -1) / 4 - 1) assert_expr_equal(res1.max_value, ans1) # expression containing variable a is on rhs e1 = (c > a * 4 + b) res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {}) assert_expr_equal(res1.max_value, ans1) e2 = (tvm.max(5, a * 4) < 0) res2 = tvm.arith.DeduceBound(a, e2, {b: b_s, c: c_s, d: d_s}, {}) assert str(res2.max_value) == "neg_inf" assert str(res2.min_value) == "pos_inf" # expression containing variable a is on rhs e2 = (zero < tvm.max(5, a * 4)) res2 = tvm.arith.DeduceBound(a, e2, {b: b_s, c: c_s, d: d_s}, {}) assert str(res2.max_value) == "neg_inf" assert str(res2.min_value) == "pos_inf" e3 = (-b) + a * c - d res3 = tvm.arith.DeduceBound(a, e3 >= 0, { b: b_s, c: c_s, d: d_s }, { b: b_s, d: d_s }) ans3 = 2 / c + 1 assert str(tvm.ir_pass.Simplify(res3.min_value)) == str(ans3) res3 = tvm.arith.DeduceBound(a, zero <= e3, { b: b_s, c: c_s, d: d_s }, { b: b_s, d: d_s }) assert str(tvm.ir_pass.Simplify(res3.min_value)) == str(ans3)
def _compute(attrs, x, _): x = x[0] scalar = attrs.get_float("scalar") scalar = tvm.const(scalar, x.dtype) return tvm.compute(x.shape, lambda *i: f(x(*i), scalar))
def f(n): rv = tvm.reduce_axis((0, n)) init = lambda dtype: tvm.expr.Select(n > 1, tvm.const(0, dtype), n.astype(dtype)) sum = tvm.comm_reducer(lambda x, y: tvm.max(x + y, n.astype('float32')), init, name='sum') return sum(X[rv], axis=rv)
def check(start, end, dstart, dend, dtype, floor_div=False): div = tvm.floordiv if floor_div else tvm.truncdiv mod = tvm.floormod if floor_div else tvm.truncmod # A are dividends, B are divisors. Note that we add 1 to make include end in the range. A = tvm.placeholder((end - start + 1,), name="A", dtype=dtype) B = tvm.placeholder((dend - dstart + 1,), name="B", dtype=dtype) # We clip values with min and max so that simplifiers know the ranges of values clipa = lambda x: tvm.min(tvm.const(end, dtype), tvm.max(tvm.const(start, dtype), x)) clipb = lambda x: tvm.min(tvm.const(dend, dtype), tvm.max(tvm.const(dstart, dtype), x)) # If the range is just a single point, use the constant itself if start == end: clipa = lambda x: tvm.const(start, dtype) if dstart == dend: clipb = lambda x: tvm.const(dstart, dtype) # D are division results and M are modulo results [D, M] = tvm.compute((end - start + 1, dend - dstart + 1), lambda i, j: (div(clipa(A[i]), clipb(B[j])), mod(clipa(A[i]), clipb(B[j])))) s = tvm.create_schedule([D.op, M.op]) f = tvm.build(s, [A, B, D, M], "llvm") # Fill input arrays with values A_arr = tvm.nd.empty((end - start + 1,), dtype) B_arr = tvm.nd.empty((dend - dstart + 1,), dtype) A_arr.copyfrom(np.arange(start, end + 1, dtype=dtype)) B_np = np.arange(dstart, dend + 1, dtype=dtype) # If the range of the divisor contains 0, replace it with 1 to avoid division by zero if dend >= 0 and dstart <= 0: B_np[-dstart] = 1 B_arr.copyfrom(B_np) D_arr = tvm.nd.empty((end - start + 1, dend - dstart + 1), dtype) M_arr = tvm.nd.empty((end - start + 1, dend - dstart + 1), dtype) # Run the function and convert the results to numpy f(A_arr, B_arr, D_arr, M_arr) D_arr = D_arr.asnumpy() M_arr = M_arr.asnumpy() # This helper just prints additional info on failure def _show_info(): print("dtype: {}".format(dtype)) print("dividend range: [{}, {}]".format(start, end)) print("divisor range: [{}, {}]".format(dstart, dend)) lowered = tvm.lower(s, [A, B, D, M], simple_mode=True) print("Lowered code:") print(lowered) # Check that the computed values are correct for i in range(start, end + 1): for j in range(dstart, dend + 1): if j == 0: continue if floor_div: dref = i // j mref = i % j else: dref = int(float(i) / j) mref = int(math.fmod(i, j)) if D_arr[i - start, j - dstart] != dref: _show_info() raise AssertionError("Incorrect division result: {}({}, {}) is {} " "but should be {}".format(div.__name__, i, j, D_arr[i - start, j - dstart], dref)) if M_arr[i - start, j - dstart] != mref: _show_info() raise AssertionError("Incorrect modulo result: {}({}, {}) is {} " "but should be {}".format(mod.__name__, i, j, M_arr[i - start, j - dstart], mref))
def pool(data, kernel, stride, padding, pool_type, ceil_mode=False): """Perform pooling on the data Parameters ---------- data : tvm.Tensor 4-D with shape [batch, channel, in_height, in_width] kernel : list/tuple of two ints Kernel size, [kernel_height, kernel_width] stride : list/tuple of two ints Stride size, [stride_height, stride_width] paddding : list/tuple of two ints Pad size, [pad_height, pad_width] pool_type : str Pool type, 'max' or 'avg' ceil_mode : bool Whether to use ceil when caculate output size. Returns ------- output : tvm.Tensor 4-D with shape [batch, channel, out_height, out_width] """ assert len(data.shape) == 4, "only support 4-dim pooling" assert len(stride) == 2, "only support 2-dim stride" kernel_height, kernel_width = kernel stride_height, stride_width = stride batch, channel, height, width = data.shape pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (kernel_height, kernel_width)) pad_before = [0, 0, pad_top, pad_left] pad_after = [0, 0, pad_down, pad_right] if ceil_mode: # Additional padding to ensure we do ceil instead of floor when divide stride. pad_down += stride_height - 1 pad_right += stride_width - 1 out_height = util.simplify((height - kernel_height + pad_top + pad_down) // stride_height + 1) out_width = util.simplify((width - kernel_width + pad_left + pad_right) // stride_width + 1) dheight = tvm.reduce_axis((0, kernel_height)) dwidth = tvm.reduce_axis((0, kernel_width)) if pool_type == 'max': temp = pad(data, pad_before, pad_after, name="pad_temp", \ pad_value=tvm.min_value(data.dtype)) return tvm.compute((batch, channel, out_height, out_width), \ lambda n, c, h, w: \ tvm.max(temp[n, c, h*stride_height+dheight, w*stride_width+dwidth], \ axis=[dheight, dwidth]), \ tag="pool_max") elif pool_type == 'avg': temp = pad(data, pad_before, pad_after, name="pad_temp", \ pad_value=tvm.const(0.).astype(data.dtype)) tsum = tvm.compute((batch, channel, out_height, out_width), \ lambda n, c, h, w: \ tvm.sum(temp[n, c, h*stride_height+dheight, w*stride_width+dwidth], \ axis=[dheight, dwidth]), \ tag="pool_avg") return tvm.compute((batch, channel, out_height, out_width), \ lambda n, c, h, w: \ tsum[n, c, h, w] / (kernel_height*kernel_width), \ tag=tag.ELEMWISE) else: raise ValueError("Pool type should be 'avg' or 'max'.")
def test_logical_simplify(): ck = RewriteChecker() x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") ck.verify(tvm.expr.And(tvm.expr.EQ(x, y), tvm.expr.NE(x, y)), tvm.const(False, "bool")) ck.verify(tvm.expr.And(tvm.expr.NE(x, y), tvm.expr.EQ(x, y)), tvm.const(False, "bool")) ck.verify(tvm.expr.And(x > 1, tvm.expr.Not(x > 1)), tvm.const(False, "bool")) ck.verify(tvm.expr.And(x <= y, y < x), tvm.const(False, "bool")) ck.verify(tvm.expr.And(y < x, y <= x), tvm.const(False, "bool")) ck.verify(tvm.expr.And(x < 1, 0 < x), tvm.const(False, "bool")) ck.verify(tvm.expr.And(x < 0, 1 < x), tvm.const(False, "bool")) ck.verify(tvm.expr.And(x < 1, 1 <= x), tvm.const(False, "bool")) ck.verify(tvm.expr.And(x <= 1, 1 < x), tvm.const(False, "bool")) ck.verify(tvm.expr.And(1 <= x, x < 1), tvm.const(False, "bool")) ck.verify(tvm.expr.And(1 < x, x <= 1), tvm.const(False, "bool")) ck.verify(tvm.expr.And(x <= 1, 2 <= x), tvm.const(False, "bool")) ck.verify(tvm.expr.And(2 <= x, x <= 1), tvm.const(False, "bool")) ck.verify(tvm.expr.And(x == 1, x != 2), x == 1) ck.verify(tvm.expr.Or(tvm.expr.EQ(x, y), tvm.expr.NE(x, y)), tvm.const(True, "bool")) ck.verify(tvm.expr.Or(tvm.expr.NE(x, y), tvm.expr.EQ(x, y)), tvm.const(True, "bool")) ck.verify(tvm.expr.Or(x > y, tvm.expr.Not(x < y)), tvm.const(True, "bool")) ck.verify(tvm.expr.Or(x <= y, y < x), tvm.const(True, "bool")) ck.verify(tvm.expr.Or(y < x, y <= x), tvm.const(True, "bool")) ck.verify(tvm.expr.Or(x < 1, 0 < x), tvm.const(True, "bool")) ck.verify(tvm.expr.Or(0 < x, x < 1), tvm.const(True, "bool")) ck.verify(tvm.expr.Or(x < 1, 1 <= x), tvm.const(True, "bool")) ck.verify(tvm.expr.Or(x <= 1, 1 < x), tvm.const(True, "bool")) ck.verify(tvm.expr.Or(1 <= x, x < 1), tvm.const(True, "bool")) ck.verify(tvm.expr.Or(1 < x, x <= 1), tvm.const(True, "bool")) ck.verify(tvm.expr.Or(x <= 1, 2 <= x), tvm.const(True, "bool")) ck.verify(tvm.expr.Or(2 <= x, x <= 1), tvm.const(True, "bool")) ck.verify(tvm.expr.Or(x != 1, x == 2), x != 1)
def test_cmp_simplify(): ck = RewriteChecker() x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") # const int bound ck.verify((x % 2 + 10).equal(0), tvm.const(0, "bool")) ck.verify(tvm.expr.NE(x % 2 + 10, 0), tvm.const(1, "bool")) ck.verify(x % 2 + 10 > 1, tvm.const(1, "bool")) ck.verify(x % 2 + 10 <= 1, tvm.const(0, "bool")) ck.verify(x * 3 + 10 == 0, tvm.const(0, "bool")) ck.verify(x * 3 + 10 != 0, tvm.const(1, "bool")) # canonicalization ck.verify((x - 10).equal(0), x.equal(10)) ck.verify((10 - x).equal(0), x.equal(10)) ck.verify((x * y).equal(0), tvm.expr.Or(x.equal(0), y.equal(0))) # cmp bound ck.verify(x + y < x + z, y < z) ck.verify(x + y < z + x, y < z) ck.verify(y + x < x + z, y < z) ck.verify(y + x < z + x, y < z) ck.verify(y - x < z - x, y < z) ck.verify(x - y < x - z, z < y) ck.verify(x < z + x, tvm.expr.LT(0, z)) ck.verify(x < x + z, tvm.expr.LT(0, z)) ck.verify(100 < x + 1, tvm.expr.LT(99, x)) ck.verify(1 < 100 - x, tvm.expr.LT(x, 99)) ck.verify(x * 3 < y * 3, x < y) ck.verify(x * (-3) < y * (-3), y < x) ck.verify(x * 3 >= y * 3, y <= x) ck.verify(x * 4 >= 2, tvm.expr.LE(1, x)) ck.verify(x * 2 >= 50, tvm.expr.LE(25, x)) ck.verify(x / 2 < 3, x < 6) ck.verify(x * 4 <= 2, x <= 0) ck.verify(3 < x / 2, tvm.expr.LT(7, x)) ck.verify(x / 4 * 4 < x, tvm.expr.LT(0, x % 4)) ck.verify(x / 4 * 4 >= x, tvm.expr.LE(x % 4, 0)) ck.verify(x / 4 * 4 < x + y, tvm.expr.LT(0, x % 4 + y)) ck.verify(x / 4 * 4 < x - y, tvm.expr.LT(y, x % 4)) ck.verify((x + 2) / 4 * 4 >= x, tvm.expr.LE((x + 2) % 4, 2)) ck.verify((x + 2) / 4 * 4 >= x + y, tvm.expr.LE((x + 2) % 4 + y, 2)) ck.verify((x + 2) / 4 * 4 >= x - y, tvm.expr.LE((x + 2) % 4 + (-2), y)) ck.verify(tvm.min(x, 11) < 10, x < 10) ck.verify(tvm.min(x, 8) < 10, tvm.const(1, "bool")) ck.verify(tvm.max(8, x) > 10, tvm.expr.LT(10, x)) ck.verify(x + 1 < tvm.max(8, x), x < 7) ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 10), override=True) ck.analyzer.update(y, tvm.arith.ConstIntBound(-10, 0), override=True) ck.analyzer.update(z, tvm.arith.ConstIntBound(-5, 5), override=True) ck.verify(x < 11, tvm.const(1, "bool")) ck.verify(x <= 10, tvm.const(1, "bool")) ck.verify(z <= 5, tvm.const(1, "bool")) ck.verify(x + y <= 10, tvm.const(1, "bool")) ck.verify(x + y >= -10, tvm.const(1, "bool")) ck.verify(z - 5 <= y + 10, tvm.const(1, "bool")) ck.verify(tvm.all(x > -1, z <= x + 5), tvm.const(1, "bool")) ck.verify(x * y <= 0, tvm.const(1, "bool")) ck.verify((x + 1) * (y - 1) < 0, tvm.const(1, "bool")) ck.verify(y * y >= 0, tvm.const(1, "bool"))
def test_util(): x = tvm.const(100, "int32") assert util.get_const_int(x) == 100 assert util.get_const_tuple((x, x)) == (100, 100)
def fidentity(t0, t1): return tvm.const(-1, t0), tvm.min_value(t1)
def test_deduce(): a = tvm.var('a') b = tvm.var('b') c = tvm.var('c') d = tvm.var('d') b_s = tvm.arith.IntervalSet(2, 3) c_s = tvm.arith.IntervalSet(10, 15) d_s = tvm.arith.IntervalSet(-3, -1) zero = tvm.const(0, "int32") e0 = (-b) * a + c - d res0 = tvm.arith.DeduceBound(a, e0 >= 0, {b: b_s, c: c_s, d: d_s}, {}) ans0 = ((d - c) / (b * -1) + (-1)) assert_expr_equal(res0.max_value, ans0) # expression containing variable a is on rhs res0 = tvm.arith.DeduceBound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {}) assert_expr_equal(res0.max_value, ans0) e0 = d * a + c - d res0 = tvm.arith.DeduceBound(a, e0 >= 0, {b: b_s, c: c_s, d: d_s}, {}) ans0 = ((d - c) / d - 1) assert_expr_equal(res0.max_value, ans0) # expression containing variable a is on rhs res0 = tvm.arith.DeduceBound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {}) assert_expr_equal(res0.max_value, ans0) e1 = (a * 4 + b < c) res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {}) ans1 = (((c - b) + -1) / 4 - 1) assert_expr_equal(res1.max_value, ans1) # expression containing variable a is on rhs e1 = (c > a * 4 + b) res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {}) assert_expr_equal(res1.max_value, ans1) e2 = (tvm.max(5, a * 4) < 0) res2 = tvm.arith.DeduceBound(a, e2, {b: b_s, c: c_s, d: d_s}, {}) assert str(res2.max_value) == "neg_inf" assert str(res2.min_value) == "pos_inf" # expression containing variable a is on rhs e2 = (zero < tvm.max(5, a * 4)) res2 = tvm.arith.DeduceBound(a, e2, {b: b_s, c: c_s, d: d_s}, {}) assert str(res2.max_value) == "neg_inf" assert str(res2.min_value) == "pos_inf" e3 = (-b) + a * c - d res3 = tvm.arith.DeduceBound(a, e3 >= 0, { b: b_s, c: c_s, d: d_s }, { b: b_s, d: d_s }) ans3 = 2 / c + 1 assert str(tvm.ir_pass.Simplify(res3.min_value)) == str(ans3) res3 = tvm.arith.DeduceBound(a, zero <= e3, { b: b_s, c: c_s, d: d_s }, { b: b_s, d: d_s }) assert str(tvm.ir_pass.Simplify(res3.min_value)) == str(ans3) # tests for `EQ` op res4 = tvm.arith.DeduceBound(a, a == b, {}, {}) assert_expr_equal(res4.max_value, b) assert_expr_equal(res4.min_value, b) # Unsatisfiable `EQ`, variable as one of the Operand res5 = tvm.arith.DeduceBound(a, (a == b), {b: b_s}, {b: b_s}) assert str(res5.max_value) == "neg_inf" assert str(res5.min_value) == "pos_inf" # variable `a` on the RHS side res6 = tvm.arith.DeduceBound(a, 10 == a, {}, {}) assert_expr_equal(res6.max_value, 10) assert_expr_equal(res6.min_value, 10) # Add, Sub in `EQ` e4 = ((a - c) == (b + d)) ans4 = (b + d + c) res7 = tvm.arith.DeduceBound(a, e4, {b: b_s, c: c_s, d: d_s}, {}) assert_expr_equal(res7.max_value, ans4) assert_expr_equal(res7.min_value, ans4) # Satisfiable Mul in `EQ` with negative sign res8 = tvm.arith.DeduceBound(a, (5 * a == -10), {}, {}) assert_expr_equal(res8.max_value, -2) assert_expr_equal(res8.min_value, -2) # Unsatisfiable Mul in `EQ` e5 = (4 * a == b) res9 = tvm.arith.DeduceBound(a, e5, {b: b_s}, {}) assert str(res9.max_value) == "neg_inf" assert str(res9.min_value) == "pos_inf" # Unsatisfiable Mul in `EQ` res10 = tvm.arith.DeduceBound( a, (b * a == b), {b: b_s}, {}) # simplifier is not able to prove that (b % b == 0) assert str(res10.max_value) == "neg_inf" assert str(res10.min_value) == "pos_inf"
def _intrin_func(ins, outs): ww, xx = ins zz = outs[0] vpadd = "llvm.arm.neon.vpadd.v8u8" vpadalu = "llvm.arm.neon.vpadalu.v16u8.v8u16" args_1 = tvm.const(1, 'uint32') args_2 = tvm.const(2, 'uint32') def _instr(index): irb = tvm.ir_builder.create() if index == 1: irb.emit(zz.vstore(0, tvm.const(0, 'uint16x8'))) return irb.get() cnts8 = [None] * 8 cnts4 = [None] * 4 cnts2 = [None] * 2 for bw in range(w_b): for bx in range(x_b): if k_i == 16: for i in range(m): ands = ww.vload([bw, i, 0], 'uint8x16') & xx.vload( [bx, 0], 'uint8x16') cnts = tvm.popcount(ands) upper_half = tvm.call_pure_intrin( 'uint8x8', 'vectorhigh', cnts) lower_half = tvm.call_pure_intrin( 'uint8x8', 'vectorlow', cnts) cnts8[i] = upper_half + lower_half for i in range(m // 2): cnts4[i] = tvm.call_llvm_intrin( 'uint8x8', vpadd, args_1, cnts8[i * 2], cnts8[i * 2 + 1]) for i in range(m // 4): cnts2[i] = tvm.call_llvm_intrin( 'uint8x8', vpadd, args_1, cnts4[i * 2], cnts4[i * 2 + 1]) cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) shifted_cnts = cnts << tvm.const(bw + bx, dtype) out = tvm.call_llvm_intrin('uint16x8', vpadalu, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) else: # ki == 8 for i in range(m): ands = ww.vload([bw, i, 0], 'uint8x8') & xx.vload( [bx, 0], 'uint8x8') cnts8[i] = tvm.popcount(ands) for i in range(m // 2): cnts4[i] = tvm.call_llvm_intrin( 'uint8x8', vpadd, args_1, cnts8[i * 2], cnts8[i * 2 + 1]) for i in range(m // 4): cnts2[i] = tvm.call_llvm_intrin( 'uint8x8', vpadd, args_1, cnts4[i * 2], cnts4[i * 2 + 1]) cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) shifted_cnts = cnts << tvm.const(bw + bx, dtype) out = tvm.call_llvm_intrin('uint16x8', vpadalu, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) irb.emit(zz.vstore(0, out)) return irb.get() # body, reset, update return _instr(0), _instr(1), _instr(2)
def argmax_init(idx_typ, val_typ): return tvm.const(-1, idx_typ), tvm.min_value(val_typ)
def _compute(*indices): value = x(*indices) calpha = tvm.const(alpha, value.dtype) return tvm.select(value > 0, value, value * calpha)
def test_const(): x = tvm.const(1, "int32") print(x.dtype) assert x.dtype == tvm.int32 assert isinstance(x, tvm.tir.IntImm)
def test_cmp_simplify(): ck = RewriteChecker() x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") flm = tvm.floormod fld = tvm.floordiv # const int bound ck.verify((x % 2 + 10).equal(0), tvm.const(0, "bool")) ck.verify(tvm.expr.NE(x % 2 + 10, 0), tvm.const(1, "bool")) ck.verify(x % 2 + 10 > 1, tvm.const(1, "bool")) ck.verify(x % 2 + 10 <= 1, tvm.const(0, "bool")) ck.verify(flm(x, 2) + 2 > 1, tvm.const(1, "bool")) ck.verify(flm(x, 2) + 10 <= 1, tvm.const(0, "bool")) ck.verify(x * 3 + 10 == 0, tvm.const(0, "bool")) ck.verify(x * 3 + 10 != 0, tvm.const(1, "bool")) # canonicalization ck.verify((x - 10).equal(0), x.equal(10)) ck.verify((10 - x).equal(0), x.equal(10)) ck.verify((x * y).equal(0), tvm.expr.Or(x.equal(0), y.equal(0))) # cmp bound ck.verify(x + y < x + z, y < z) ck.verify(x + y < z + x, y < z) ck.verify(y + x < x + z, y < z) ck.verify(y + x < z + x, y < z) ck.verify(y - x < z - x, y < z) ck.verify(x - y < x - z, z < y) ck.verify(x < z + x, tvm.expr.LT(0, z)) ck.verify(x < x + z, tvm.expr.LT(0, z)) ck.verify(100 < x + 1, tvm.expr.LT(99, x)) ck.verify(1 < 100 - x, tvm.expr.LT(x, 99)) ck.verify(x * 3 < y * 3, x < y) ck.verify(x * (-3) < y * (-3), y < x) ck.verify(x * 3 >= y * 3, y <= x) ck.verify(x * 4 >= 2, tvm.expr.LE(1, x)) ck.verify(x * 2 >= 50, tvm.expr.LE(25, x)) ck.verify(x * 4 <= 2, x <= 0) ck.verify((0 - x * 3) <= 0, tvm.expr.LE(0, x)) ck.verify((0 - x * 3) >= 0, tvm.expr.LE(x, 0)) ck.verify(2 * x <= 0, x <= 0) ck.verify(x * 2 >= 3, tvm.expr.LE(2, x)) ck.verify(x * 2 >= 2, tvm.expr.LE(1, x)) ck.verify(x * 2 >= 1, tvm.expr.LE(1, x)) ck.verify(x * 2 >= 0, tvm.expr.LE(0, x)) ck.verify(x * 2 >= -1, tvm.expr.LE(0, x)) ck.verify(x * 2 >= -2, tvm.expr.LE(-1, x)) ck.verify(x * 2 >= -3, tvm.expr.LE(-1, x)) ck.verify(x * 2 <= 3, tvm.expr.LE(x, 1)) ck.verify(x * 2 <= 2, tvm.expr.LE(x, 1)) ck.verify(x * 2 <= 1, tvm.expr.LE(x, 0)) ck.verify(x * 2 <= 0, tvm.expr.LE(x, 0)) ck.verify(x * 2 <= -1, tvm.expr.LE(x, -1)) ck.verify(x * 2 <= -2, tvm.expr.LE(x, -1)) ck.verify(x * 2 <= -3, tvm.expr.LE(x, -2)) ck.verify(x * (-2) >= 3, tvm.expr.LE(x, -2)) ck.verify(x * (-2) >= 2, tvm.expr.LE(x, -1)) ck.verify(x * (-2) >= 1, tvm.expr.LE(x, -1)) ck.verify(x * (-2) >= 0, tvm.expr.LE(x, 0)) ck.verify(x * (-2) >= -1, tvm.expr.LE(x, 0)) ck.verify(x * (-2) >= -2, tvm.expr.LE(x, 1)) ck.verify(x * (-2) >= -3, tvm.expr.LE(x, 1)) ck.verify(x * (-2) <= 3, tvm.expr.LE(-1, x)) ck.verify(x * (-2) <= 2, tvm.expr.LE(-1, x)) ck.verify(x * (-2) <= 1, tvm.expr.LE(0, x)) ck.verify(x * (-2) <= 0, tvm.expr.LE(0, x)) ck.verify(x * (-2) <= -1, tvm.expr.LE(1, x)) ck.verify(x * (-2) <= -2, tvm.expr.LE(1, x)) ck.verify(x * (-2) <= -3, tvm.expr.LE(2, x)) # DivMod rules # truc div ck.verify(x / 2 < 3, x < 6) ck.verify(3 < x / 2, tvm.expr.LT(7, x)) ck.verify(x / 3 >= 0, tvm.expr.LE(-2, x)) ck.verify(x / 2 >= 1, tvm.expr.LE(2, x)) ck.verify(x / 2 >= 0, tvm.expr.LE(-1, x)) ck.verify(x / 2 >= -1, tvm.expr.LE(-3, x)) ck.verify(x / 2 <= 1, tvm.expr.LE(x, 3)) ck.verify(x / 2 <= 0, tvm.expr.LE(x, 1)) ck.verify(x / 2 <= -1, tvm.expr.LE(x, -2)) ck.verify(x / 4 * 4 < x, tvm.expr.LT(0, x % 4)) ck.verify(x / 4 * 4 >= x, tvm.expr.LE(x % 4, 0)) ck.verify(x / 4 * 4 < x + y, tvm.expr.LT(0, x % 4 + y)) ck.verify(x / 4 * 4 < x - y, tvm.expr.LT(y, x % 4)) ck.verify((x + 2) / 4 * 4 >= x, tvm.expr.LE((x + 2) % 4, 2)) ck.verify((x + 2) / 4 * 4 >= x + y, tvm.expr.LE((x + 2) % 4 + y, 2)) ck.verify((x + 2) / 4 * 4 >= x - y, tvm.expr.LE((x + 2) % 4 + (-2), y)) # floor div ck.verify(fld(x, 2) < 3, x < 6) ck.verify(3 < fld(x, 2), tvm.expr.LT(7, x)) ck.verify(-3 
< fld(x, 2), tvm.expr.LT(-5, x)) ck.verify(fld(x, 3) >= 0, tvm.expr.LE(0, x)) ck.verify(fld(x, 2) >= 1, tvm.expr.LE(2, x)) ck.verify(fld(x, 2) >= 0, tvm.expr.LE(0, x)) ck.verify(fld(x, 2) >= -1, tvm.expr.LE(-2, x)) ck.verify(fld(x, 2) <= 1, tvm.expr.LE(x, 3)) ck.verify(fld(x, 2) <= 0, tvm.expr.LE(x, 1)) ck.verify(fld(x, 2) <= -1, tvm.expr.LE(x, -1)) ck.verify(fld(x, 4) * 4 < x, tvm.expr.LT(0, flm(x, 4))) ck.verify(fld(x, 4) * 4 >= x, tvm.expr.LE(flm(x, 4), 0)) ck.verify(fld(x, 4) * 4 < x + y, tvm.expr.LT(0, flm(x, 4) + y)) ck.verify(fld(x, 4) * 4 < x - y, tvm.expr.LT(y, flm(x, 4))) ck.verify(fld(x + 2, 4) * 4 >= x, tvm.expr.LE(flm(x + 2, 4), 2)) ck.verify(fld(x + 2, 4) * 4 >= x + y, tvm.expr.LE(flm(x + 2, 4) + y, 2)) ck.verify(fld(x + 2, 4) * 4 >= x - y, tvm.expr.LE(flm(x + 2, 4) + (-2), y)) # End DivMod Rules ck.verify(tvm.min(x, 11) < 10, x < 10) ck.verify(tvm.min(x, 8) < 10, tvm.const(1, "bool")) ck.verify(tvm.max(8, x) > 10, tvm.expr.LT(10, x)) ck.verify(x + 1 < tvm.max(8, x), x < 7) ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 10), override=True) ck.analyzer.update(y, tvm.arith.ConstIntBound(-10, 0), override=True) ck.analyzer.update(z, tvm.arith.ConstIntBound(-5, 5), override=True) ck.verify(x < 11, tvm.const(1, "bool")) ck.verify(x <= 10, tvm.const(1, "bool")) ck.verify(z <= 5, tvm.const(1, "bool")) ck.verify(x + y <= 10, tvm.const(1, "bool")) ck.verify(x + y >= -10, tvm.const(1, "bool")) ck.verify(z - 5 <= y + 10, tvm.const(1, "bool")) ck.verify(tvm.all(x > -1, z <= x + 5), tvm.const(1, "bool")) ck.verify(x*y <= 0, tvm.const(1, "bool")) ck.verify((x + 1)*(y - 1) < 0, tvm.const(1, "bool")) ck.verify(y*y >= 0, tvm.const(1, "bool")) ck.verify(x*6 <= -3, tvm.const(0, "bool")) ck.verify((y - 1) % 3 == 0, (y + (-1)) % 3 == 0)
def deformable_conv2d_nchw(data, offset, kernel, strides, padding, dilation, deformable_groups, groups, out_dtype): """Deformable conv2D operator in NCHW layout. The deformable convolution operation is described in https://arxiv.org/abs/1703.06211 Parameters ---------- data : tvm.Tensor 4-D with shape [batch, in_channel, in_height, in_width] offset : tvm.Tensor 4-D with shape [batch, deformable_groups * filter_height * filter_width * 2, out_height, out_width]. kernel : tvm.Tensor 4-D with shape [num_filter, in_channel, filter_height, filter_width] strides : int or a list/tuple of two ints stride size, or [stride_height, stride_width] padding : int or a list/tuple of two ints padding size, or [pad_height, pad_width] dilation : int or a list/tuple of two ints dilation size, or [dilation_height, dilation_width] deformable_groups : int number of deformable groups groups : int number of groups Returns ------- output : tvm.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ if out_dtype is None: out_dtype = data.dtype if isinstance(strides, int): stride_h = stride_w = strides else: stride_h, stride_w = strides if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation batch, in_channel, in_height, in_width = get_const_tuple(data.shape) out_channel, channel, kernel_h, kernel_w = get_const_tuple(kernel.shape) _, _, out_height, out_width = get_const_tuple(offset.shape) assert in_channel % deformable_groups == 0, "Input cahnnels must divide deformable group size" assert groups == 1, "deformable_conv2d_nchw does not support groups > 1" ic_per_dgroup = channel // deformable_groups dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 pad_top, pad_left, _, _ = get_pad_tuple( padding, (dilated_kernel_h, dilated_kernel_w)) rc = tvm.reduce_axis((0, in_channel), name='rc') ry = tvm.reduce_axis((0, kernel_h), name='ry') rx = tvm.reduce_axis((0, kernel_w), name='rx') zero = tvm.const(0.0, data.dtype) def _bilinear(n, c, h, w): outside = tvm.any(h < 0, w < 0, h >= in_height, w >= in_width) val = bilinear_sample_nchw(data, (n, c, h, w), in_height - 1, in_width - 1) return tvm.if_then_else(outside, zero, val) data_deform = \ tvm.compute((batch, in_channel, kernel_h, kernel_w, out_height, out_width), lambda n, c, kh, kw, y, x: _bilinear(n, c, y * stride_h - pad_top + kh * dilation_h + offset[n, c // ic_per_dgroup * (kernel_w*kernel_h*2) + (kh * kernel_w + kw) * 2, y, x], x * stride_w - pad_left + kw * dilation_w + offset[n, c // ic_per_dgroup * (kernel_w*kernel_h*2) + (kh * kernel_w + kw) * 2 + 1, y, x])) return tvm.compute( (batch, out_channel, out_height, out_width), lambda n, f, y, x: tvm.sum(data_deform[n, rc, ry, rx, y, x].astype( out_dtype) * kernel[f, rc, ry, rx].astype(out_dtype), axis=[rc, ry, rx]), tag="deformable_conv2d_nchw")
def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size): N, CI, IH, IW = get_const_tuple(data.shape) if len(kernel.shape) == 4: pre_computed = False CO, _, KH, KW = get_const_tuple(kernel.shape) else: pre_computed = True H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape) CO *= VC KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1 HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) assert layout == 'NCHW' assert KH == 3 and KW == 3 and HPAD == 1 and WPAD == 1 and HSTR == 1 and WSTR == 1 data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") if tile_size == 4: G_data = np.array( [[1 / 4.0, 0, 0], [-1 / 6.0, -1 / 6.0, -1 / 6.0], [-1 / 6.0, 1 / 6.0, -1 / 6.0], [1 / 24.0, 1 / 12.0, 1 / 6.0], [1 / 24.0, -1 / 12.0, 1 / 6.0], [0, 0, 1]], out_dtype) B_data = np.array( [[4, 0, 0, 0, 0, 0], [0, -4, 4, -2, 2, 4], [-5, -4, -4, -1, -1, 0], [0, 1, -1, 2, -2, -5], [1, 1, 1, 1, 1, 0], [0, 0, 0, 0, 0, 1]], out_dtype) A_data = np.array([[1, 0, 0, 0], [1, 1, 1, 1], [1, -1, 1, -1], [1, 2, 4, 8], [1, -2, 4, -8], [0, 0, 0, 1]], out_dtype) elif tile_size == 2: G_data = np.array([[1, 0, 0], [1.0 / 2, 1.0 / 2, 1.0 / 2], [1.0 / 2, -1.0 / 2, 1.0 / 2], [0, 0, 1]], out_dtype) B_data = np.array( [[1, 0, 0, 0], [0, 1, -1, 1], [-1, 1, 1, 0], [0, 0, 0, -1]], out_dtype) A_data = np.array([[1, 0], [1, 1], [1, -1], [0, -1]], out_dtype) else: raise ValueError("Unsupported tile size for winograd: " + str(tile_size)) m = A_data.shape[1] r = 3 alpha = m + r - 1 H = (IH + 2 * HPAD - 3) // HSTR + 1 W = (IW + 2 * WPAD - 3) // WSTR + 1 nH, nW = (H + m - 1) // m, (W + m - 1) // m P = N * nH * nW ##### space definition begin ##### tile_bna_candidates = [1, 2, 4, 8, 16] factors = get_factors(CO) cfg.define_knob('tile_bna', [x for x in tile_bna_candidates if x in factors]) cfg.define_knob('tile_bnb', [1, 2, 4, 8, 16]) cfg.define_split('tile_t1', CI, num_outputs=2, max_factor=128) cfg.define_split('tile_t2', CO, num_outputs=2, max_factor=128) cfg.define_split('c_unroll', CI, num_outputs=2, max_factor=8) cfg.define_knob('yt', [1, 2, 4, 8, 16, 32]) ##### space definition end ##### if cfg.is_fallback: cfg['tile_bnb'].val = 4 cfg['tile_bna'].val = 4 while CO % cfg['tile_bna'].val != 0: cfg['tile_bna'].val //= 2 cfg['yt'].val = 8 cfg.fallback_split('tile_t1', [-1, 128]) cfg.fallback_split('tile_t2', [-1, 128]) cfg.fallback_split('c_unroll', [-1, 8]) bna = cfg['tile_bna'].val bnb = cfg['tile_bnb'].val P_round = (P + bnb - 1) // bnb * bnb assert CO % bna == 0 and P_round % bnb == 0 # pack input tile input_tile = tvm.compute((CI, P_round // bnb, alpha, alpha, bnb), lambda ci, b, eps, nu, bb: \ tvm.select(b * bnb + bb < P, data_pad[(b*bnb+bb) // (nH*nW)][ci][(b*bnb+bb) // nW % nH * m + eps] [(b*bnb+bb) % nW * m + nu], tvm.const(0, data_pad.dtype)), name='d') # transform kernel if pre_computed: U = kernel else: G = const_matrix(G_data, 'G') r_kh = tvm.reduce_axis((0, KH), 'r_kh') r_kw = tvm.reduce_axis((0, KW), 'r_kw') U = tvm.compute( (alpha, alpha, CO // bna, CI, bna), lambda eps, nu, co, ci, vco: tvm.sum(kernel[co * bna + vco][ci][ r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name='U') # transform image B = const_matrix(B_data, 'B') r_a = tvm.reduce_axis((0, alpha), 'r_a') r_b = tvm.reduce_axis((0, alpha), 'r_b') V = tvm.compute((alpha, alpha, P_round // bnb, CI, bnb), lambda eps, nu, p, ci, vp: tvm.sum(input_tile[ci][p][r_a][ r_b][vp] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b]), name='V') # batch gemm ci = 
tvm.reduce_axis((0, CI), name='c') M = tvm.compute((alpha, alpha, CO, P_round), lambda eps, nu, co, p: tvm.sum(U[eps][nu][co // bna][ci][ co % bna] * V[eps][nu][p // bnb][ci][p % bnb], axis=ci), name='M') A = const_matrix(A_data, 'A') r_a = tvm.reduce_axis((0, alpha), 'r_a') r_b = tvm.reduce_axis((0, alpha), 'r_b') Y = tvm.compute( (CO, P, m, m), lambda co, p, vh, vw: tvm.sum( M[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]), name='Y') # unpack output output = tvm.compute( (N, CO, H, W), lambda n, co, h, w: Y[co][n * nH * nW + (h // m) * nW + w // m][h % m][w % m] # thw following term is used to make the padding effective, # otherwise the padding will be eliminated by bound inference + tvm.const(0, out_dtype) * M[alpha - 1][alpha - 1][CO - 1][P_round - 1 ], name='output', tag='winograd_conv2d_output', attrs={ 'workload': _winograd_conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype, tile_size) }) # we have to manually assign effective GFLOP for winograd cfg.add_flop(2 * N * CO * H * W * KH * KW * CI) return output
def _do_fold(stmt):
    def _equal(x, y):
        return tvm.ir_pass.Equal(tvm.ir_pass.Simplify(x - y), 0)

    def _flatten_loop(src_coeff, dst_coeff, extents):
        src_coeff = list(src_coeff)
        dst_coeff = list(dst_coeff)
        extents = list(extents)

        rev_src_coeff = [src_coeff.pop()]
        rev_dst_coeff = [dst_coeff.pop()]
        rev_extents = []
        assert src_coeff
        vsrc = src_coeff.pop()
        vdst = dst_coeff.pop()
        vext = extents.pop()

        while src_coeff:
            next_src = src_coeff.pop()
            next_dst = dst_coeff.pop()
            next_ext = extents.pop()

            if _equal(next_src, vsrc * vext) and _equal(next_dst, vdst * vext):
                vext = tvm.ir_pass.Simplify(vext * next_ext)
            else:
                rev_src_coeff.append(vsrc)
                rev_dst_coeff.append(vdst)
                rev_extents.append(vext)
                vsrc = next_src
                vdst = next_dst
                vext = next_ext

        rev_src_coeff.append(vsrc)
        rev_dst_coeff.append(vdst)
        rev_extents.append(vext)
        rev_src_coeff.reverse()
        rev_dst_coeff.reverse()
        rev_extents.reverse()

        return rev_src_coeff, rev_dst_coeff, rev_extents

    if _match_pragma(stmt, "alu"):
        # Get to the innermost loop body
        loop_body = stmt.body
        nest_size = 0
        while isinstance(loop_body, tvm.stmt.For):
            loop_body = loop_body.body
            nest_size += 1
        # Get the src/dst arguments
        dst_var = loop_body.buffer_var
        dst_idx = loop_body.index
        # Derive loop variables and extents
        tmp_body = stmt.body
        indices = []
        extents = []
        for _ in range(nest_size):
            indices.append(tmp_body.loop_var)
            extents.append(tmp_body.extent)
            tmp_body = tmp_body.body
        # Derive opcode
        if isinstance(loop_body.value, tvm.expr.Add):
            alu_opcode = env.dev.ALU_OPCODE_ADD
            lhs = loop_body.value.a
            rhs = loop_body.value.b
        elif isinstance(loop_body.value, tvm.expr.Sub):
            alu_opcode = env.dev.ALU_OPCODE_SUB
            lhs = loop_body.value.a
            rhs = loop_body.value.b
        elif isinstance(loop_body.value, tvm.expr.Mul):
            alu_opcode = env.dev.ALU_OPCODE_MUL
            lhs = loop_body.value.a
            rhs = loop_body.value.b
        elif isinstance(loop_body.value, tvm.expr.Min):
            alu_opcode = env.dev.ALU_OPCODE_MIN
            lhs = loop_body.value.a
            rhs = loop_body.value.b
        elif isinstance(loop_body.value, tvm.expr.Max):
            alu_opcode = env.dev.ALU_OPCODE_MAX
            lhs = loop_body.value.a
            rhs = loop_body.value.b
        elif isinstance(loop_body.value, tvm.expr.Call):
            if loop_body.value.name == 'shift_left':
                alu_opcode = env.dev.ALU_OPCODE_SHR
                lhs = loop_body.value.args[0]
                rhs = tvm.ir_pass.Simplify(-loop_body.value.args[1])
            elif loop_body.value.name == 'shift_right':
                alu_opcode = env.dev.ALU_OPCODE_SHR
                lhs = loop_body.value.args[0]
                rhs = loop_body.value.args[1]
            else:
                raise RuntimeError(
                    "Function call not recognized %s" % (loop_body.value.name))
        elif isinstance(loop_body.value, tvm.expr.Load):
            alu_opcode = env.dev.ALU_OPCODE_SHR
            lhs = loop_body.value
            rhs = tvm.const(0, "int32")
        else:
            raise RuntimeError(
                "Expression not recognized %s, %s, %s" %
                (type(loop_body.value), str(loop_body.value), str(stmt)))

        # Derive array index coefficients
        dst_coeff = tvm.arith.DetectLinearEquation(dst_idx, indices)
        # Check if lhs/rhs is immediate
        use_imm = False
        imm_val = None
        if isinstance(rhs, tvm.expr.IntImm):
            assert lhs.buffer_var.same_as(dst_var)
            src_coeff = tvm.arith.DetectLinearEquation(lhs.index, indices)
            use_imm = True
            imm_val = rhs
        if isinstance(lhs, tvm.expr.IntImm):
            assert rhs.buffer_var.same_as(dst_var)
            src_coeff = tvm.arith.DetectLinearEquation(rhs.index, indices)
            use_imm = True
            imm_val = lhs
        if imm_val is None:
            imm_val = 0
            assert lhs.buffer_var.same_as(dst_var) and rhs.buffer_var.same_as(dst_var)
            src_lhs_coeff = tvm.arith.DetectLinearEquation(lhs.index, indices)
            src_rhs_coeff = tvm.arith.DetectLinearEquation(rhs.index, indices)
            # Determine which side has the same coefficients as the destination
            lhs_equal = True
            rhs_equal = True
            for i, coef in enumerate(dst_coeff):
                if not tvm.ir_pass.Equal(coef, src_lhs_coeff[i]):
                    lhs_equal = False
                if not tvm.ir_pass.Equal(coef, src_rhs_coeff[i]):
                    rhs_equal = False
            # Make sure at least one of the sources is identical to the
            # destination (in-place computation)
            assert lhs_equal or rhs_equal
            # Assign the source coefficients
            if lhs_equal:
                src_coeff = src_rhs_coeff
            else:
                src_coeff = src_lhs_coeff

        # Ensure that we have the proper tensor dimensions in the
        # innermost loop (pattern match)
        src_coeff = list(src_coeff)
        dst_coeff = list(dst_coeff)
        extents = list(extents)
        assert len(src_coeff) > 1
        assert len(dst_coeff) > 1
        assert len(extents) != 0
        assert tvm.ir_pass.Equal(
            tvm.ir_pass.Simplify(src_coeff[-1] % (env.BATCH * env.BLOCK_OUT)), 0)
        assert tvm.ir_pass.Equal(
            tvm.ir_pass.Simplify(dst_coeff[-1] % (env.BATCH * env.BLOCK_OUT)), 0)
        assert tvm.ir_pass.Equal(src_coeff[-2], 1)
        assert tvm.ir_pass.Equal(dst_coeff[-2], 1)
        if env.BATCH > 1:
            assert len(src_coeff) > 2
            assert len(dst_coeff) > 2
            assert len(extents) > 1
            assert tvm.ir_pass.Equal(src_coeff[-3], env.BLOCK_OUT)
            assert tvm.ir_pass.Equal(dst_coeff[-3], env.BLOCK_OUT)

        # Apply tensorization of the loop coefficients
        src_offset = src_coeff[-1]
        dst_offset = dst_coeff[-1]
        if env.BATCH == 1:
            src_coeff = src_coeff[:-2]
            dst_coeff = dst_coeff[:-2]
            extents = extents[:-1]
        else:
            src_coeff = src_coeff[:-3]
            dst_coeff = dst_coeff[:-3]
            extents = extents[:-2]
        src_coeff.append(src_offset)
        dst_coeff.append(dst_offset)
        src_coeff = [
            tvm.ir_pass.Simplify(c // (env.BATCH * env.BLOCK_OUT))
            for c in src_coeff]
        dst_coeff = [
            tvm.ir_pass.Simplify(c // (env.BATCH * env.BLOCK_OUT))
            for c in dst_coeff]

        # Flatten the outer loops
        if extents:
            src_coeff, dst_coeff, extents = _flatten_loop(src_coeff, dst_coeff, extents)

        # Insert ALU micro-ops
        irb = tvm.ir_builder.create()
        for idx, extent in enumerate(extents):
            irb.emit(tvm.call_extern(
                "int32", "VTAUopLoopBegin",
                extent, dst_coeff[idx], src_coeff[idx], 0))
        use_imm = int(use_imm)
        irb.emit(tvm.call_extern(
            "int32", "VTAUopPush",
            1, 0,
            dst_coeff[len(dst_coeff) - 1],
            src_coeff[len(src_coeff) - 1],
            0, alu_opcode, use_imm, imm_val))
        for extent in extents:
            irb.emit(tvm.call_extern("int32", "VTAUopLoopEnd"))
        return irb.get()
    return stmt
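The ALU folding above leans on tvm.arith.DetectLinearEquation to recover per-loop strides from a flattened buffer index. A minimal, self-contained illustration of what it returns, using the same old-style tvm.var API as the surrounding code; the concrete index expression is made up.

# DetectLinearEquation returns the coefficient of each loop variable followed
# by the constant offset. For 16*i + j + 4 over (i, j) the result is a list of
# expressions equal to [16, 1, 4].
import tvm

i = tvm.var("i")
j = tvm.var("j")
coeffs = tvm.arith.DetectLinearEquation(16 * i + j + 4, [i, j])
assert len(coeffs) == 3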
def test_ir():
    x = tvm.const(1, "int32")
    y = tvm.tir.IntImm('int32', 1)
    z = x + y
    stmt = tvm.tir.Evaluate(z)
    assert isinstance(stmt, tvm.tir.Evaluate)
def argmin_identity(t0, t1):
    """Identity element for an argmin reducer: index -1 paired with the maximum value of t1."""
    return tvm.const(-1, t0), tvm.max_value(t1)
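argmin_identity is the identity half of a commutative reducer. A plausible pairing, sketched here and not taken from the source, combines it with a select-based combiner via tvm.comm_reducer.

# Sketch: keep the (index, value) pair with the smaller value.
def argmin_combine(lhs, rhs):
    idx = tvm.expr.Select(lhs[1] <= rhs[1], lhs[0], rhs[0])
    val = tvm.expr.Select(lhs[1] <= rhs[1], lhs[1], rhs[1])
    return idx, val

argmin = tvm.comm_reducer(argmin_combine, argmin_identity, name='argmin')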
import nnpu
import tvm
import topi
from nnpu.utils import ScheduleProcHelper
import numpy as np

with ScheduleProcHelper():
    env = nnpu.get_env()
    nnpu.set_device(env, type='S0')
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']

    a = tvm.placeholder((2, 4, 16), dtype_n, 'a')
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a')

    pad_buf = tvm.compute(
        (2, 6, 16),
        lambda i, j, k: tvm.expr.Select(
            j >= 2, a_buf[i, j - 2, k], tvm.const(0, dtype_n)),
        'pad')
    nnpu.utils.MarkScope(pad_buf)
    nnpu.utils.PragmaCopy(pad_buf)
    tile_host, _ = nnpu.utils.CopyBufToH(pad_buf, 'tile')

    s = nnpu.create_schedule([tile_host.op])
    print(tvm.lower(s, [a, tile_host], simple_mode=True))
    print(nnpu.lower(s, [a, tile_host], simple_mode=True))
    # exit(0)
    func = nnpu.build(s, [a, tile_host], 'nnpu', 'llvm', name='nnpu_func')

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(2, 4, 16), dtype=a.dtype, low=-128,
def _do_fold(op):
    if _match_pragma(op, "conv2d_transpose_gemm"):
        is_init = ".init" in str(op)
        tvm.ir_pass.PostOrderVisit(op, _find_basics)

        if is_init:
            # create inner most block
            irb = tvm.ir_builder.create()
            dev = env.dev
            irb.scope_attr(dev.vta_axis, "coproc_scope",
                           dev.get_task_qid(dev.QID_COMPUTE))
            irb.scope_attr(dev.vta_axis, "coproc_uop_scope", dev.vta_push_uop)
            irb.emit(tvm.call_extern("int32", "VTAUopPush",
                                     0, 1,
                                     dout.access_ptr("rw", "int32"),
                                     0, 0,
                                     0, 0, 0))
            inner = irb.get()

            args = op.body.body.args
            res_tensor = op.body.body.func.output(0)
            tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 1, 0, env.BLOCK_OUT)
            inner = tvm.make.AttrStmt(
                [dout, res_tensor], 'buffer_bind_scope',
                tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner)
            return inner
        else:
            conv_call, data_call, kernel_call = calls[-3:]
            pad_data_tensor = data_call.func.output(0)
            kernel_tensor = kernel_call.func.output(0)
            res_tensor = conv_call.func.output(0)

            if selects:
                condition = selects[0].condition
            else:
                condition = tvm.const(1, 'int')

            # create inner most block
            irb = tvm.ir_builder.create()
            with irb.if_scope(condition):
                dev = env.dev
                irb.scope_attr(dev.vta_axis, "coproc_scope",
                               dev.get_task_qid(dev.QID_COMPUTE))
                irb.scope_attr(dev.vta_axis, "coproc_uop_scope", dev.vta_push_uop)
                irb.emit(tvm.call_extern("int32", "VTAUopPush",
                                         0, 0,
                                         dout.access_ptr("rw", "int32"),
                                         dinp.access_ptr("r", "int32"),
                                         dwgt.access_ptr("r", "int32"),
                                         0, 0, 0))
            inner = irb.get()

            args = conv_call.args
            tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 1, 0, env.BLOCK_OUT)
            inner = tvm.make.AttrStmt(
                [dout, res_tensor], 'buffer_bind_scope',
                tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner)
            args = kernel_call.args
            tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, env.BLOCK_OUT, 0, env.BLOCK_IN)
            inner = tvm.make.AttrStmt(
                [dwgt, kernel_tensor], 'buffer_bind_scope',
                tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner)
            args = data_call.args
            tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 1, 0, env.BLOCK_IN)
            inner = tvm.make.AttrStmt(
                [dinp, pad_data_tensor], 'buffer_bind_scope',
                tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner)
            return inner
    return None
def test_make():
    x = tvm.const(1, "int32")
    y = tvm.var("x")
    z = x + y
    assert isinstance(tvm.max(x, y), tvm.tir.Max)
    assert isinstance(tvm.min(x, y), tvm.tir.Min)
def _compute(*indices):
    value = x(*indices)
    const_min = tvm.const(a_min, value.dtype)
    const_max = tvm.const(a_max, value.dtype)
    return tvm.max(tvm.min(value, const_max), const_min)
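A hedged usage sketch for the _compute closure above; the placeholder shape and clip bounds are made up. Wrapping it in tvm.compute clamps every element of x to [a_min, a_max].

# Hypothetical usage: x, a_min and a_max must be in scope when tvm.compute
# calls _compute for each output index.
a_min, a_max = 0.0, 6.0
x = tvm.placeholder((16, 16), name="x")
y = tvm.compute(x.shape, _compute, name="clip")
s = tvm.create_schedule(y.op)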
# NOTE: data_shape, kernel_shape, output_shape, batch_size, height, width,
# in_channels, block_size and the pad/stride values are assumed to be defined
# earlier in the tutorial this excerpt is taken from.

# Reduction axes
kh = tvm.reduce_axis((0, kernel_h), name='kh')
kw = tvm.reduce_axis((0, kernel_w), name='kw')
ic = tvm.reduce_axis((0, in_channels // block_size), name='ic')
ii = tvm.reduce_axis((0, block_size), name='ii')

# Algorithm
A = tvm.placeholder(data_shape, name='A', dtype="float16")
W = tvm.placeholder(kernel_shape, name='W', dtype="float16")
Apad = tvm.compute(
    (batch_size // block_size, height + 2 * pad_h, width + 2 * pad_w,
     in_channels // block_size, block_size, block_size),
    lambda n, h, w, i, nn, ii: tvm.if_then_else(
        tvm.all(h >= pad_h, h - pad_h < height, w >= pad_w, w - pad_w < width),
        A[n, h - pad_h, w - pad_w, i, nn, ii],
        tvm.const(0., "float16")),
    name='Apad')
Conv = tvm.compute(
    output_shape,
    lambda n, h, w, o, nn, oo: tvm.sum(
        Apad[n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype("float32") *
        W[kh, kw, ic, o, ii, oo].astype("float32"),
        axis=[ic, kh, kw, ii]),
    name="Conv")

s = tvm.create_schedule(Conv.op)
s[Apad].compute_inline()

###############################################################################
# Memory Scope
# ----------------
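A sketch of the memory-scope staging that typically follows this section, assuming the standard cache_read/cache_write pattern for TensorCores; it is not copied from this document.

# Stage the padded input and weights through shared memory, then into wmma
# fragments, and accumulate Conv in the wmma accumulator scope.
AS = s.cache_read(Apad, 'shared', [Conv])
WS = s.cache_read(W, 'shared', [Conv])
AF = s.cache_read(AS, 'wmma.matrix_a', [Conv])
WF = s.cache_read(WS, 'wmma.matrix_b', [Conv])
ConvF = s.cache_write(Conv, 'wmma.accumulator')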
# NOTE: batch and in_channel are assumed to be defined earlier in the tutorial
# this excerpt is taken from.
out_channel = 512
in_size = 14
kernel = 3
pad = 1
stride = 1

# Algorithm
A = tvm.placeholder((in_size, in_size, in_channel, batch), name='A')
W = tvm.placeholder((kernel, kernel, in_channel, out_channel), name='W')
out_size = (in_size - kernel + 2 * pad) // stride + 1
# Pad input
Apad = tvm.compute(
    (in_size + 2 * pad, in_size + 2 * pad, in_channel, batch),
    lambda yy, xx, cc, nn: tvm.if_then_else(
        tvm.all(yy >= pad, yy - pad < in_size, xx >= pad, xx - pad < in_size),
        A[yy - pad, xx - pad, cc, nn],
        tvm.const(0., "float32")),
    name='Apad')
# Create reduction variables
rc = tvm.reduce_axis((0, in_channel), name='rc')
ry = tvm.reduce_axis((0, kernel), name='ry')
rx = tvm.reduce_axis((0, kernel), name='rx')
# Compute the convolution
B = tvm.compute(
    (out_size, out_size, out_channel, batch),
    lambda yy, xx, ff, nn: tvm.sum(
        Apad[yy * stride + ry, xx * stride + rx, rc, nn] * W[ry, rx, rc, ff],
        axis=[ry, rx, rc]),
    name='B')

# Designate the memory hierarchy
s = tvm.create_schedule(B.op)
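To make "Designate the memory hierarchy" concrete, a plausible continuation, following the usual TVM GPU convolution recipe rather than this document, stages the inputs through shared and local memory.

# Fold the padding into its consumers, cache the padded input and weights in
# shared memory and per-thread local registers, and accumulate B locally.
s[Apad].compute_inline()
AA = s.cache_read(Apad, 'shared', [B])
WW = s.cache_read(W, 'shared', [B])
AL = s.cache_read(AA, 'local', [B])
WL = s.cache_read(WW, 'local', [B])
BL = s.cache_write(B, 'local')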