def _bilinear_sample(n, c, h, w):
    y, x = _compute_source_index(n, h, w)
    y0 = te.floor(y).astype("int32")
    x0 = te.floor(x).astype("int32")
    y1 = y0 + tir.const(1, "int32")
    x1 = x0 + tir.const(1, "int32")
    return (
        _get_pixel_value(n, c, y0, x0) * (1.0 - (y - y0)) * (1.0 - (x - x0))
        + _get_pixel_value(n, c, y0, x1) * (1.0 - (y - y0)) * (x - x0)
        + _get_pixel_value(n, c, y1, x0) * (y - y0) * (1.0 - (x - x0))
        + _get_pixel_value(n, c, y1, x1) * (y - y0) * (x - x0)
    )
def _bilinear_sample(n, c, h, w):
    x = grid[n, 0, h, w]
    y = grid[n, 1, h, w]
    # unnormalize from [-1, 1] to pixel coordinates in [0, size - 1]
    y = (y + 1) * (in_height - 1) / 2
    x = (x + 1) * (in_width - 1) / 2
    x0 = te.floor(x).astype("int32")
    y0 = te.floor(y).astype("int32")
    x1 = x0 + tir.const(1, "int32")
    y1 = y0 + tir.const(1, "int32")
    return (
        _get_pixel_value(n, c, y0, x0) * (1.0 - (y - y0)) * (1.0 - (x - x0))
        + _get_pixel_value(n, c, y0, x1) * (1.0 - (y - y0)) * (x - x0)
        + _get_pixel_value(n, c, y1, x0) * (y - y0) * (1.0 - (x - x0))
        + _get_pixel_value(n, c, y1, x1) * (y - y0) * (x - x0)
    )
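# For reference, the same bilinear sampling in plain NumPy, using the zero-padding
# variant of _get_pixel_value. This is a minimal sketch for checking the math, not
# part of the TVM source; all names below are illustrative.
import numpy as np

def bilinear_sample_np(data, grid, n, c, h, w):
    # data: (N, C, H, W); grid: (N, 2, H_out, W_out) with x/y in [-1, 1]
    in_height, in_width = data.shape[2], data.shape[3]
    x = (grid[n, 0, h, w] + 1) * (in_width - 1) / 2
    y = (grid[n, 1, h, w] + 1) * (in_height - 1) / 2
    x0, y0 = int(np.floor(x)), int(np.floor(y))
    x1, y1 = x0 + 1, y0 + 1

    def pixel(yy, xx):
        # zero padding outside the image
        if 0 <= yy < in_height and 0 <= xx < in_width:
            return data[n, c, yy, xx]
        return 0.0

    return (pixel(y0, x0) * (1 - (y - y0)) * (1 - (x - x0))
            + pixel(y0, x1) * (1 - (y - y0)) * (x - x0)
            + pixel(y1, x0) * (y - y0) * (1 - (x - x0))
            + pixel(y1, x1) * (y - y0) * (x - x0))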
def common_reduce(name, args=(0,)):
    if not isinstance(args, (tuple, list)):
        args = (args,)

    def reduce_op(x, y):
        assert x.dtype == y.dtype, (
            "Reducing elements that don't have the same data type: %s vs. %s" % (x.dtype, y.dtype)
        )
        return tir.call_pure_extern(x.dtype, name, x, y, *args[1:])

    return te.comm_reducer(reduce_op, lambda t: tir.const(args[0], dtype=t), name=name)
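# A minimal usage sketch. The extern name "max" and the identity value 0 are
# assumptions for illustration only; the named extern must actually exist on
# the chosen target for the lowered code to link.
reduce_max = common_reduce("max", args=(0,))
A = te.placeholder((16,), dtype="int32", name="A")
k = te.reduce_axis((0, 16), "k")
B = te.compute((), lambda *idx: reduce_max(A[k], axis=k), name="B")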
def test_ret_const():
    a = tir.const(0)
    b = tir.ret(a)
    b = tir.Evaluate(b)
    func = tir.PrimFunc([], b)
    func = build_tir_func(func)
    out = func()
    assert out == 0
def test_underflow():
    int_min = tir.const(-(1 << 31), "int32")
    constant_fold_res = (int_min - 1).value
    int_min_np = np.int32(-(1 << 31))
    res_np = int_min_np - np.int32(1)
    print(constant_fold_res, res_np)
    assert constant_fold_res == res_np
def _trilinear_sample(n, c, d, h, w):
    z, y, x = _compute_source_index(n, d, h, w)
    z0 = te.floor(z).astype("int32")
    y0 = te.floor(y).astype("int32")
    x0 = te.floor(x).astype("int32")
    z1 = z0 + tir.const(1, "int32")
    y1 = y0 + tir.const(1, "int32")
    x1 = x0 + tir.const(1, "int32")
    return (
        _get_pixel_value(n, c, z0, y0, x0) * (1 - (x - x0)) * (1 - (y - y0)) * (1 - (z - z0))
        + _get_pixel_value(n, c, z0, y0, x1) * (x - x0) * (1 - (y - y0)) * (1 - (z - z0))
        + _get_pixel_value(n, c, z1, y1, x0) * (1 - (x - x0)) * (y - y0) * (z - z0)
        + _get_pixel_value(n, c, z1, y1, x1) * (x - x0) * (y - y0) * (z - z0)
        + _get_pixel_value(n, c, z0, y1, x0) * (1 - (x - x0)) * (y - y0) * (1 - (z - z0))
        + _get_pixel_value(n, c, z1, y0, x1) * (x - x0) * (1 - (y - y0)) * (z - z0)
        + _get_pixel_value(n, c, z1, y0, x0) * (1 - (x - x0)) * (1 - (y - y0)) * (z - z0)
        + _get_pixel_value(n, c, z0, y1, x1) * (x - x0) * (y - y0) * (1 - (z - z0))
    )
def test_reduce():
    def check(m, target_bits, target_dtype):
        A = te.placeholder((m,), name="A", dtype="float32")
        k = te.reduce_axis((0, m), "k")
        B = te.compute((), lambda *idx: te.sum(A[k], axis=k), name="B")
        s = te.create_schedule(B.op)
        stmt = lower_sch(s, [A, B], target_bits)
        assert stmt[1].loop_var.dtype == target_dtype

    # i32 -> i32
    check(const(64, dtype="int32"), 32, "int32")
    # i64 -> i32
    check(const(64, dtype="int64"), 32, "int32")
    # i32 -> i16
    check(const(64, dtype="int32"), 16, "int16")
    check(const(2**16, dtype="int32"), 16, "int32")
    # symbolic
    check(te.var("n", dtype="int32"), 32, "int32")
    check(te.var("n", dtype="int64"), 32, "int64")
def test_reduce():
    def check(m, target_bits, target_dtype):
        A = te.placeholder((m,), name='A', dtype='float32')
        k = te.reduce_axis((0, m), "k")
        B = te.compute((), lambda *idx: te.sum(A[k], axis=k), name='B')
        s = te.create_schedule(B.op)
        stmt = lower_sch(s, [A, B], target_bits)
        assert stmt.body[1].loop_var.dtype == target_dtype

    # i32 -> i32
    check(const(64, dtype='int32'), 32, 'int32')
    # i64 -> i32
    check(const(64, dtype='int64'), 32, 'int32')
    # i32 -> i16
    check(const(64, dtype='int32'), 16, 'int16')
    check(const(2**16, dtype='int32'), 16, 'int32')
    # symbolic
    check(te.var('n', dtype='int32'), 32, 'int32')
    check(te.var('n', dtype='int64'), 32, 'int64')
def gen_ir(
    data_ptr,
    n_fft,
    hop_length,
    win_length,
    window_ptr,
    normalized,
    onesided,
    output_ptr,
    loop_kind,
):
    ib = tir.ir_builder.create()
    data = ib.buffer_ptr(data_ptr)
    window = ib.buffer_ptr(window_ptr)
    output = ib.buffer_ptr(output_ptr)
    # https://librosa.org/doc/0.7.2/_modules/librosa/core/spectrum.html#stft
    with ib.for_range(
        0, output_ptr.shape[0] * output_ptr.shape[1], kind="parallel"
    ) as batch_row:
        with ib.for_range(0, output_ptr.shape[2], kind=loop_kind) as col:
            batch = ib.allocate("int32", (1), name="batch", scope="local")
            row = ib.allocate("int32", (1), name="row", scope="local")
            batch = tir.floordiv(batch_row, output_ptr.shape[1])
            row = tir.floormod(batch_row, output_ptr.shape[1])
            output[batch, row, col, 0] = tir.Cast(data_ptr.dtype, 0)
            output[batch, row, col, 1] = tir.Cast(data_ptr.dtype, 0)
            with ib.for_range(0, win_length) as wlen:
                output[batch, row, col, 0] += (
                    window[wlen]
                    * data[batch, col * hop_length + wlen]
                    * tir.cos(2 * pi * row * wlen / win_length)
                )
                output[batch, row, col, 1] -= (
                    window[wlen]
                    * data[batch, col * hop_length + wlen]
                    * tir.sin(2 * pi * row * wlen / win_length)
                )
            with ib.if_scope(normalized):
                output[batch, row, col, 0] /= tir.sqrt(tir.const(n_fft, "float32"))
                output[batch, row, col, 1] /= tir.sqrt(tir.const(n_fft, "float32"))
    return ib.get()
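# For reference, a plain-NumPy version of the innermost computation above: one
# complex STFT bin at (row, col) for a single batch element. A minimal sketch to
# make the IR readable, not part of the TVM source; names are illustrative.
import numpy as np

def stft_bin(data, window, row, col, hop_length, win_length):
    # data: 1-D signal for one batch element; window: 1-D of length win_length
    wlen = np.arange(win_length)
    frame = window * data[col * hop_length : col * hop_length + win_length]
    angle = 2 * np.pi * row * wlen / win_length
    real = np.sum(frame * np.cos(angle))
    imag = -np.sum(frame * np.sin(angle))
    return real, imag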
def _check(variables, formulas, coef=(-5, 5), bounds=(-20, 20)):
    vs = [te.var("x" + str(i)) for i in range(variables)]

    fs = []
    for i in range(formulas):
        s1 = sum([v * random.randint(coef[0], coef[1]) for v in vs])
        s1 += random.randint(coef[0], coef[1])
        s2 = sum([v * random.randint(coef[0], coef[1]) for v in vs])
        s2 += random.randint(coef[0], coef[1])
        op = random.choice([tir.expr.EQ, tir.expr.LE, tir.expr.LT, tir.expr.GE, tir.expr.GT])
        fs.append(op(s1, s2))

    vranges = {v: tvm.ir.expr.Range(bounds[0], bounds[1] + 1) for v in vs}
    before = te.all(tir.const(1, "bool"), *fs)

    after = arith._ffi_api.SolveInequalitiesAsCondition(vs, vranges, fs)
    after = te.all(tir.const(1, "bool"), *after)

    testing.check_bool_expr_is_true(before == after, vranges)

    solution = arith.solve_linear_inequalities(fs, vs, vranges, deskew_range=True)
    testing.check_int_constraints_trans_consistency(solution)
def test_convert_ssa():
    zero = tir.const(0)
    nop = tir.Evaluate(zero)
    v = tir.Var("i1", "int32")
    for_stmt = tir.For(v, zero, zero, tir.ForKind.SERIAL, nop)
    load = tir.Evaluate(tir.Load("int32", v, zero))
    seq = tir.SeqStmt([for_stmt, for_stmt, load])
    func = tir.PrimFunc([], seq)
    mod = tvm.IRModule({"main": func})
    # Use pass InjectVirtualThread to invoke ConvertSSA
    mod = tir.transform.InjectVirtualThread()(mod)
def test_relay_take():
    engine = relay.backend.compile_engine.get()

    def check(shape, index, target_bits, target_dtype):
        x = relay.var("x", shape=shape)
        y = relay.op.take(x, indices=index)
        func = relay.Function([x], y)
        mod = tvm.IRModule.from_expr(func)
        func = mod["main"]
        z = engine.lower(func, "llvm")
        stmt = lower_sch(z.schedule, tuple(z.inputs) + tuple(z.outputs), 32)
        assert stmt.value.index.dtype == target_dtype

    check(
        (const(2**16, "int64"), const(2**15 + 1, "int64")),
        relay.const(0, dtype="int64"),
        target_bits=32,
        target_dtype="int32",
    )
    check(
        (const(2**16, "int64"), const(2**15 + 1, "int64")),
        relay.const(2**31, dtype="int64"),
        target_bits=32,
        target_dtype="int64",
    )
def test_relay_basic():
    engine = relay.backend.compile_engine.get()

    def check(shapex, shapey, target_bits, target_dtype):
        x = relay.var("x", shape=shapex)
        y = relay.var("y", shape=shapey)
        z = relay.add(x, y)
        func = relay.Function([x, y], z)
        mod = tvm.IRModule.from_expr(func)
        mod = relay.transform.InferType()(mod)
        func = mod["main"]
        z = engine.lower(func, "llvm")
        stmt = lower_sch(z.schedule, tuple(z.inputs) + tuple(z.outputs), 32)
        # outer loop
        assert stmt.loop_var.dtype == target_dtype
        # inner loop
        if len(shapex) > 1 or len(shapey) > 1:
            assert stmt.body.loop_var.dtype == target_dtype

    check(
        (const(2**16, "int64"), const(2**15 + 1, "int64")),
        (1, const(2**15 + 1, "int64")),
        target_bits=32,
        target_dtype="int64",
    )
    check(
        (const(2**16, "int64"), const(2**15, "int64")),
        (1, const(2**15, "int64")),
        target_bits=32,
        target_dtype="int32",
    )
    check(
        (const(2**31, "int64"),),
        (const(2**31, "int64"),),
        target_bits=32,
        target_dtype="int32",
    )
    check(
        (const(2**31 + 1, "int64"),),
        (const(2**31 + 1, "int64"),),
        target_bits=32,
        target_dtype="int64",
    )
def test_multilanes():
    def check(m, lanes, target_bits, target_dtype):
        ib = tvm.tir.ir_builder.create()
        Ab = tvm.tir.decl_buffer((m,), dtype="float32x{}".format(lanes), name="A")
        A = ib.buffer_ptr(Ab)
        Bb = tvm.tir.decl_buffer((m,), dtype="float32x{}".format(lanes), name="B")
        B = ib.buffer_ptr(Bb)
        with ib.for_range(0, m, name="i", dtype=m.dtype) as i:
            B[i] = A[i] + 1
        stmt = ib.get()
        stmt = lower_stmt([Ab, Bb], stmt, target_bits)
        assert stmt.loop_var.dtype == target_dtype

    # i32 -> i32
    check(const(2**10, dtype="int32"), 2, target_bits=32, target_dtype="int32")
    check(const(2**32, dtype="int32"), 2, target_bits=32, target_dtype="int32")
    # i64 -> i32
    check(const(2**10, dtype="int64"), 2, target_bits=32, target_dtype="int32")
    check(const(2**32, dtype="int64"), 2, target_bits=32, target_dtype="int64")
    # i32 -> i16
    check(const(2**10, dtype="int32"), 2, target_bits=16, target_dtype="int16")
    check(const(2**16, dtype="int32"), 2, target_bits=16, target_dtype="int32")
def test_thread_axis():
    def check(m, n, target_bits, target_dtype):
        ib = tvm.tir.ir_builder.create()
        Ab = tvm.tir.decl_buffer((m, n), name="A")
        A = ib.buffer_ptr(Ab)
        Bb = tvm.tir.decl_buffer((m, n), name="B")
        B = ib.buffer_ptr(Bb)
        bx = te.thread_axis("blockIdx.x")
        tx = te.thread_axis("threadIdx.x")
        ib.scope_attr(bx, "thread_extent", m)
        ib.scope_attr(tx, "thread_extent", n)
        B[bx * n + tx] = A[bx * n + tx] + 1
        stmt = ib.get()
        stmt = lower_stmt([Ab, Bb], stmt, target_bits)
        assert stmt.node.var.dtype == target_dtype
        assert stmt.body.node.var.dtype == target_dtype

    # i32 -> i32
    check(2, 32, target_bits=32, target_dtype="int32")
    # i32 + i32 is not promoted to i64 even in the case of overflow
    check(2**30, 32, target_bits=32, target_dtype="int32")
    # i64 -> i32
    check(const(2, dtype="int64"), const(32, dtype="int64"), target_bits=32, target_dtype="int32")
    check(
        const(2**30, dtype="int64"),
        const(32, dtype="int64"),
        target_bits=32,
        target_dtype="int64",
    )
    # i32 -> i16
    check(2, 32, target_bits=16, target_dtype="int16")
    check(2**14, 32, target_bits=16, target_dtype="int32")
def _get_pixel_value(n, c, h, w):
    if padding_mode == "zeros":
        return te.if_then_else(
            te.all(h >= 0, w >= 0, h < in_height, w < in_width),
            data[n, c, h, w],
            tir.const(0.0, dtype=data.dtype),
        )
    if padding_mode == "border":
        h_b = te.max(te.min(h, in_height - 1), 0)
        w_b = te.max(te.min(w, in_width - 1), 0)
        return data[n, c, h_b, w_b]
    raise AssertionError("unsupported padding_mode")
def affine_grid(data, target_shape):
    """affine_grid operator that generates 2D sampling grid.

    This operation is described in https://arxiv.org/pdf/1506.02025.pdf. It generates
    a uniform sampling grid within the target shape and normalizes it to [-1, 1]. The
    provided affine transformation is then applied on the sampling grid.

    Parameters
    ----------
    data : tvm.Tensor
        3-D with shape [batch, 2, 3]. The affine matrix.

    target_shape : list/tuple of two int
        Specifies the output shape (H, W).

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, 2, target_height, target_width]
    """
    assert target_shape is not None
    assert len(target_shape) == 2
    assert (
        target_shape[0] > 1 and target_shape[1] > 1
    ), "target height/width should be greater than 1"

    dtype = data.dtype
    y_step = tir.const((2.0 - 1e-7) / (target_shape[0] - 1), dtype=dtype)
    x_step = tir.const((2.0 - 1e-7) / (target_shape[1] - 1), dtype=dtype)
    start = tir.const(-1.0, dtype=dtype)

    def _compute(n, dim, i, j):
        y = start + i * y_step
        x = start + j * x_step
        return data[n, dim, 0] * x + data[n, dim, 1] * y + data[n, dim, 2]

    oshape = (data.shape[0], len(target_shape), *target_shape)
    return te.compute(oshape, _compute, tag="affine_grid")
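# A minimal usage sketch; the batch size and target shape below are illustrative.
import tvm
from tvm import te

data = te.placeholder((4, 2, 3), dtype="float32", name="affine")  # [batch, 2, 3] affine matrices
grid = affine_grid(data, (16, 16))  # -> te.Tensor of shape (4, 2, 16, 16)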
def test_convert_ssa():
    dtype = "int32"
    zero = tir.const(0)
    nop = tir.Evaluate(zero)
    var_type = ir.PointerType(ir.PrimType(dtype))
    v = tir.Var("i1", var_type)
    buf = tir.decl_buffer([16], dtype=dtype, data=v)
    let = tir.LetStmt(v, v, nop)
    load = tir.Evaluate(tir.BufferLoad(buf, [zero]))
    seq = tir.SeqStmt([let, let, load])
    func = tir.PrimFunc([], seq)
    mod = tvm.IRModule({"main": func})
    # Use pass InjectVirtualThread to invoke ConvertSSA
    mod = tir.transform.InjectVirtualThread()(mod)
def _dilate(*indices):
    not_zero = []
    index_tuple = []
    for i in range(n):
        if not strides[i] == 1:
            index_tuple.append(idx_div(indices[i], strides[i]))
            not_zero.append(idx_mod(indices[i], strides[i]).equal(0))
        else:
            index_tuple.append(indices[i])
    if not_zero:
        not_zero = te.all(*not_zero)
        return te.if_then_else(not_zero, padded(*index_tuple), tir.const(0.0, padded.dtype))
    return padded(*index_tuple)
def _check_forward(constraints1, constraints2, varmap, backvarmap):
    ana = tvm.arith.Analyzer()
    all_vranges = vranges.copy()
    all_vranges.update({v: r for v, r in constraints1.ranges.items()})

    # Check that the transformation is injective
    cond_on_vars = tir.const(1, 'bool')
    for v in constraints1.variables:
        # variable mapping is consistent
        v_back = ana.simplify(tir.stmt_functor.substitute(varmap[v], backvarmap))
        cond_on_vars = te.all(cond_on_vars, v == v_back)
    # Also we have to check that the new relations are true when old relations are true
    cond_subst = tir.stmt_functor.substitute(
        te.all(tir.const(1, 'bool'), *constraints2.relations), backvarmap)
    # We have to include relations from vranges too
    for v in constraints2.variables:
        if v in constraints2.ranges:
            r = constraints2.ranges[v]
            range_cond = te.all(v >= r.min, v < r.min + r.extent)
            range_cond = tir.stmt_functor.substitute(range_cond, backvarmap)
            cond_subst = te.all(cond_subst, range_cond)
    cond_subst = ana.simplify(cond_subst)
    check_bruteforce(
        te.all(cond_subst, cond_on_vars),
        all_vranges,
        cond=te.all(tir.const(1, 'bool'), *constraints1.relations),
    )
    result : int or float or bool or str
        result of evaluation, or "Runtime Exception" if there was an error
        during evaluation
    """
    try:
        return expr()
    except Exception:
        traceback.print_exc()
        return "Runtime Exception"


def compare_results(result_one, result_two):
    if isinstance(result_one, (float, np.float32)) or isinstance(result_two, (float, np.float32)):
        if np.isnan(result_one) and np.isnan(result_two):
            return True
        elif np.isinf(result_one) and np.isinf(result_two):
            # infinities compare equal only when their signs match
            if result_one > 0 and result_two > 0:
                return True
            elif result_one < 0 and result_two < 0:
                return True
            return False
        return np.isclose(result_one, result_two)
    return result_one == result_two


if __name__ == "__main__":
    a = te.var(name="a", dtype="int32")
    assert evaluate_tvm_expr(tir.const(5) + a, [a], {"a": 5}) == 10
def _get_pixel_value(n, c, h, w):
    return te.if_then_else(
        te.all(h >= 0, w >= 0, h < in_height, w < in_width),
        data[n, c, h, w],
        tir.const(0.0, dtype=data.dtype),
    )
# 2/14/2020
# Bug description: Bitshifting by a float is the identity function
# PR: https://github.com/apache/incubator-tvm/pull/4892
import tvm
from tvm import tir, te
import numpy as np
import sys

hadError = False
try:
    shape = (1, 1)
    a = tir.const(dtype='int32', value=10)
    # this should either be impossible or materialize as a * (2 ** 2.0)
    c = te.compute(shape, lambda i, j: a << 2.0)
    s = tvm.create_schedule([c.op])
    f = tvm.build(s, [c])
    c_tvm = tvm.nd.array(np.zeros(shape, dtype='float32'))
    f(c_tvm)
    print(c_tvm)
    assert False
except tvm.TVMError:
    pass
# 2/15/2020
# Bug description: And'ing by a float causes a codegen crash
# PR: https://github.com/apache/incubator-tvm/pull/4892
import tvm
from tvm import tir, te
import numpy as np
import sys

try:
    shape = (5, 5)
    a = tir.const(dtype='float32', value=10)
    # Also affects |, &, ^, ~
    c = te.compute(shape, lambda i, j: a & 1.5 ^ a | ~a)
    s = tvm.create_schedule([c.op])
    f = tvm.build(s, [c])
    c_tvm = tvm.nd.array(np.zeros(shape, dtype='float32'))
    f(c_tvm)
    print(c_tvm)
    assert False
except tvm.TVMError:
    pass