def gen_ir( data_ptr, n_fft, hop_length, win_length, window_ptr, normalized, onesided, output_ptr, ): ib = tir.ir_builder.create() data = ib.buffer_ptr(data_ptr) window = ib.buffer_ptr(window_ptr) output = ib.buffer_ptr(output_ptr) max_threads = _get_max_threads(output_ptr.shape[0] * output_ptr.shape[1]) output_size = output_ptr.shape[0] * output_ptr.shape[ 1] * output_ptr.shape[2] with ib.new_scope(): nthread_tx = max_threads nthread_bx = ceil_div(output_size, max_threads) tx = te.thread_axis("threadIdx.x") bx = te.thread_axis("blockIdx.x") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx with ib.if_scope(tid < output_size): matrix_size = output_ptr.shape[1] * output_ptr.shape[2] batch = tir.floordiv(tid, matrix_size) row = tir.floordiv(tir.indexmod(tid, matrix_size), output_ptr.shape[2]) col = tir.indexmod(tir.indexmod(tid, matrix_size), output_ptr.shape[2]) output[batch, row, col, 0] = tir.Cast(data_ptr.dtype, 0) output[batch, row, col, 1] = tir.Cast(data_ptr.dtype, 0) with ib.for_range(0, win_length) as wlen: output[batch, row, col, 0] += (window[wlen] * data[batch, col * hop_length + wlen] * tir.cos(2 * pi * row * wlen / win_length)) output[batch, row, col, 1] -= (window[wlen] * data[batch, col * hop_length + wlen] * tir.sin(2 * pi * row * wlen / win_length)) with ib.if_scope(normalized): output[batch, row, col, 0] /= tir.sqrt(tir.const(n_fft, "float32")) output[batch, row, col, 1] /= tir.sqrt(tir.const(n_fft, "float32")) return ib.get()
def _calc_adjacent_diff_ir(data, output, binop=tir.Sub): """Low level IR to calculate adjacent difference in an 1-D array. Parameters ---------- data : Buffer Input 1-D Buffer. output: Buffer A buffer to store adjacent difference, of the same shape as data. The adjacent difference is defined as: output[0] = 0, output[i] = binop(data[i], data[i-1]) where i > 0 and i < len(data). binop: function, optional A binary associative op to use for calculating adjacent difference. The function takes two TIR expressions and produce a new TIR expression. By default it uses tvm.tir.Sub to compute the adjacent difference. """ ib = tir.ir_builder.create() data_ptr = ib.buffer_ptr(data) output_ptr = ib.buffer_ptr(output) with ib.for_range(0, data.shape[0], kind="parallel") as i: with ib.if_scope(i == 0): output_ptr[0] = 0 with ib.else_scope(): output_ptr[i] = tir.Cast(output.dtype, binop(data_ptr[i], data_ptr[i - 1])) return ib.get()
def test_illegal_cast(): analyzer = tvm.arith.Analyzer() try: analyzer.rewrite_simplify(1 * tir.Cast('bool', 77)) assert False except tvm.TVMError: pass try: analyzer.rewrite_simplify(1 * tir.Cast('int8', 171)) assert False except tvm.TVMError: pass try: analyzer.rewrite_simplify(1 * tir.Cast('int32', 2**32)) assert False except tvm.TVMError: pass
def batch_matmul_nkkm_f16( # pylint: disable=invalid-name,missing-docstring B: int, N: int, M: int, K: int, ) -> Tuple[te.Tensor, te.Tensor, te.Tensor]: x = te.placeholder((B, N, K), name="X", dtype="float16") y = te.placeholder((B, K, M), name="Y", dtype="float16") k = te.reduce_axis((0, K), name="k") z = te.compute( # pylint: disable=invalid-name (B, N, M), lambda b, i, j: te.sum(tir.Cast("float32", x[b][i][k]) * tir.Cast( "float32", y[b][k][j]), axis=[k]), name="Z", ) return (x, y, z)
def gen_ir( data_ptr, n_fft, hop_length, win_length, window_ptr, normalized, onesided, output_ptr, loop_kind, ): ib = tir.ir_builder.create() data = ib.buffer_ptr(data_ptr) window = ib.buffer_ptr(window_ptr) output = ib.buffer_ptr(output_ptr) # https://librosa.org/doc/0.7.2/_modules/librosa/core/spectrum.html#stft with ib.for_range(0, output_ptr.shape[0] * output_ptr.shape[1], kind="parallel") as batch_row: with ib.for_range(0, output_ptr.shape[2], kind=loop_kind) as col: batch = ib.allocate("int32", (1), name="batch", scope="local") row = ib.allocate("int32", (1), name="row", scope="local") batch = tir.floordiv(batch_row, output_ptr.shape[1]) row = tir.floormod(batch_row, output_ptr.shape[1]) output[batch, row, col, 0] = tir.Cast(data_ptr.dtype, 0) output[batch, row, col, 1] = tir.Cast(data_ptr.dtype, 0) with ib.for_range(0, win_length) as wlen: output[batch, row, col, 0] += (window[wlen] * data[batch, col * hop_length + wlen] * tir.cos(2 * pi * row * wlen / win_length)) output[batch, row, col, 1] -= (window[wlen] * data[batch, col * hop_length + wlen] * tir.sin(2 * pi * row * wlen / win_length)) with ib.if_scope(normalized): output[batch, row, col, 0] /= tir.sqrt(tir.const(n_fft, "float32")) output[batch, row, col, 1] /= tir.sqrt(tir.const(n_fft, "float32")) return ib.get()
def test_scalar_add(): # All these types should be interchangeable with each other # E.g. float16 + float32 upconverts the float16 --> float32 # Meanwhile if an int or float or together the int will be # cast to the float type. lhs_types = ["float32", "float16", "int32", "int64"] rhs_types = ["float32", "float16"] for lhs_type, rhs_type in itertools.product(lhs_types, rhs_types): # Input vars should be float32, we will cast to test for upcasting between them lhs_input = tir.Var("lhs", "float32") rhs_input = tir.Var("rhs", "float32") lhs = tir.Cast(lhs_type, lhs_input) rhs = tir.Cast(rhs_type, rhs_input) output = lhs + rhs output = tir.ret(output) output = tir.Evaluate(output) func = tir.PrimFunc([lhs_input, rhs_input], output) func = build_tir_func(func) out = func(1.0, 2.0) assert out == 3.0
def conv2d_nhwc_f16( # pylint: disable=invalid-name,missing-docstring N: int, H: int, W: int, CI: int, CO: int, kernel_size: int, stride: int = 1, padding: int = 0, dilation: int = 1, groups: int = 1, ): inputs = te.placeholder((N, H, W, CI), name="inputs", dtype="float16") weight = te.placeholder((kernel_size, kernel_size, CI // groups, CO), name="weight", dtype="float16") batch_size, in_h, in_w, _ = inputs.shape k_h, k_w, channel_per_group, out_channel = weight.shape out_channel_per_group = out_channel // groups out_h = (in_h + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1 out_w = (in_w + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1 rh = te.reduce_axis((0, k_h), name="rh") rw = te.reduce_axis((0, k_w), name="rw") rc = te.reduce_axis((0, channel_per_group), name="rc") padded = topi.nn.pad(inputs, [0, padding, padding, 0]) output = te.compute( (batch_size, out_h, out_w, out_channel), lambda n, h, w, co: te.sum( (tir.Cast( value=padded[n, h * stride + rh * dilation, w * stride + rw * dilation, co // out_channel_per_group * channel_per_group + rc, ], dtype="float32", ) * tir.Cast(value=weight[rh, rw, rc, co], dtype="float32")), axis=[rh, rw, rc], ), name="conv2d_nhwc", ) return (inputs, weight, output)
def _calc_adjacent_diff_ir(data, output, binop=tir.Sub): """Low level IR to calculate adjacent difference in an 1-D array. Parameters ---------- data : Buffer Input 1-D Buffer. output: Buffer A buffer to store adjacent difference, of the same shape as data. The adjacent difference is defined as: output[0] = 0, output[i] = binop(data[i], data[i-1]) where i > 0 and i < len(data). binop: function, optional A binary associative op to use for calculating adjacent difference. The function takes two TIR expressions and produce a new TIR expression. By default it uses tvm.tir.Sub to compute the adjacent difference. """ ib = tir.ir_builder.create() data_ptr = ib.buffer_ptr(data) output_ptr = ib.buffer_ptr(output) batch_size = data.shape[0] max_threads = tir.min( batch_size, tvm.target.Target.current(allow_none=False).max_num_threads) with ib.new_scope(): nthread_tx = max_threads nthread_bx = ceil_div(batch_size, max_threads) tx = te.thread_axis("threadIdx.x") bx = te.thread_axis("blockIdx.x") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx with ib.if_scope(tid < batch_size): with ib.if_scope(tid == 0): output_ptr[tid] = 0 with ib.else_scope(): output_ptr[tid] = tir.Cast( output.dtype, binop(data_ptr[tid], data_ptr[tid - 1])) return ib.get()
def f_compute(i, j): v_a = tir.Cast(dtype="float32", value=a[i, k]) v_b = tir.Cast(dtype="float32", value=b[k, j]) return te.sum(v_a * v_b, axis=[k])
def apply(*args): bool_args = list( map(lambda x: tir.Cast('bool', _clamp_tvm(x, 0, 1)), args)) return tir.all(*bool_args)
def apply(condition, true_expr, false_expr): return tir.if_then_else(tir.Cast('bool', _clamp_tvm(condition, 0, 1)), true_expr, false_expr)
def apply(condition, true_expr, false_expr): return tir.Select(tir.Cast('bool', _clamp_tvm(condition, 0, 1)), _force_int(true_expr), _force_int(false_expr))