def CaffePReLU(device="llvm", lib_path="./", ndim=None, dtype=None, channel_shared=None):
    '''Caffe PReLU.

    Args:
        device: target device for the generated library.
        lib_path: directory in which the generated library is stored.
        ndim: number of input dimensions.
        dtype: input/output data type.
        channel_shared: whether the slope is shared across channels.

    Returns:
        None. The compiled library is written to lib_path.
    '''
    shape = [tvm.var("n" + str(i)) for i in range(ndim)]
    channel = 1 if channel_shared else shape[1]
    opname = "CaffePReLU_ndim%d_%s_%s" % (
        ndim, dtype, "channelShared" if channel_shared else "channelNotShared")
    print(opname)

    in_tensor = tvm.placeholder(shape, dtype=dtype, name='in_tensor')
    slope = tvm.placeholder((channel, ), dtype=dtype, name='slope')
    if channel_shared:
        out_tensor = tvm.compute(
            shape,
            lambda *idx: tvm.if_then_else(in_tensor[idx] >= 0, in_tensor[idx],
                                          in_tensor[idx] * slope[0]))
    else:
        out_tensor = tvm.compute(
            shape,
            lambda *idx: tvm.if_then_else(in_tensor[idx] >= 0, in_tensor[idx],
                                          in_tensor[idx] * slope[idx[1]]))

    tensor_list = [in_tensor, slope, out_tensor]
    s = tvm.create_schedule(out_tensor.op)
    Genlib(s, tensor_list, device, opname, lib_path)

def test_if_then_else():
    cases = [[(tvm.var('cond', dtype='bool'), 'bool', 'int32'), 'int32'],
             [(True, 'int32', 'float32'), 'float32'],
             [(False, 'int32', 'int64'), 'int64'],
             [(tvm.var('cond', dtype='bool'), 'uint32', 'int32'), 'int32'],
             [(tvm.var('cond', dtype='int32'), 'uint32', 'int32'), 'int32']]
    for (cond, lhs_dtype, rhs_dtype), out_dtype in cases:
        lhs = tvm.var('lhs', dtype=lhs_dtype)
        rhs = tvm.var('rhs', dtype=rhs_dtype)
        if cond is True or cond is False:
            out = tvm.if_then_else(cond, lhs, rhs)
            out2 = tvm.if_then_else(not cond, rhs, lhs)
            out3 = tvm.if_then_else(not cond, lhs, rhs)
            assert tvm.ir_pass.Equal(out, out2) == 1
            if cond:
                assert tvm.ir_pass.Equal(out, lhs.astype(out_dtype)) == 1
                assert tvm.ir_pass.Equal(out3, rhs.astype(out_dtype)) == 1
            else:
                assert tvm.ir_pass.Equal(out, rhs.astype(out_dtype)) == 1
                assert tvm.ir_pass.Equal(out3, lhs.astype(out_dtype)) == 1
        elif cond.dtype == 'bool':
            out = tvm.if_then_else(cond, lhs, rhs)
            assert out.dtype == out_dtype
            assert out.args[1].dtype == out_dtype
            assert out.args[2].dtype == out_dtype
        elif cond.dtype != 'bool':
            check_throws(lambda: tvm.if_then_else(cond, lhs, rhs))
        else:
            raise ValueError('Unknown combinations')

def multibox_prior_ir(data, out, sizes, ratios, steps, offsets):
    """Low level IR routine for multibox_prior operator.

    Parameters
    ----------
    data : Buffer
        Input data buffer.

    out : Buffer
        Output buffer.

    sizes : tuple of float
        Tuple of sizes for anchor boxes.

    ratios : tuple of float
        Tuple of ratios for anchor boxes.

    steps : tuple of float
        Priorbox step across y and x, -1 for auto calculation.

    offsets : tuple of int
        Priorbox center offsets, y and x respectively.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    ib = tvm.ir_builder.create()
    p_out = ib.buffer_ptr(out)
    in_height = data.shape[2]
    in_width = data.shape[3]
    num_sizes = len(sizes)
    num_ratios = len(ratios)
    size_ratio_concat = sizes + ratios
    steps_h = steps[0] if steps[0] > 0 else 1.0 / in_height
    steps_w = steps[1] if steps[1] > 0 else 1.0 / in_width
    offset_h = offsets[0]
    offset_w = offsets[1]

    with ib.for_range(0, in_height, for_type="parallel", name="i") as i:
        center_h = (i + offset_h) * steps_h
        with ib.for_range(0, in_width, name="j") as j:
            center_w = (j + offset_w) * steps_w
            for k in range(num_sizes + num_ratios - 1):
                w = tvm.if_then_else(k < num_sizes,
                                     size_ratio_concat[k] * in_height / in_width / 2.0,
                                     size_ratio_concat[0] * in_height / in_width *
                                     math.sqrt(size_ratio_concat[k + 1]) / 2.0)
                h = tvm.if_then_else(
                    k < num_sizes, size_ratio_concat[k] / 2.0,
                    size_ratio_concat[0] / math.sqrt(size_ratio_concat[k + 1]) / 2.0)
                count = (i * in_width * (num_sizes + num_ratios - 1) +
                         j * (num_sizes + num_ratios - 1) + k) * 4
                p_out[count] = center_w - w
                p_out[count + 1] = center_h - h
                p_out[count + 2] = center_w + w
                p_out[count + 3] = center_h + h

    return ib.get()

def test_simplify_if_then_else():
    ck = CanonicalChecker()
    x = tvm.var("x")
    y = tvm.var("y")
    # simplification that takes condition into account.
    res = tvm.if_then_else(
        (x * 4 + y) >= 466036,
        tvm.if_then_else(24512 <= ((((x * 4) + y) - 466036) % 24528),
                         (((((x * 4) + y) - 466036) % 24528) - 24512) % 16,
                         x),
        y)
    res2 = tvm.if_then_else(
        (x * 4) >= 466036 - y,
        tvm.if_then_else(24512 <= ((((x * 4) + y) - 466036) % 24528),
                         (((((x * 4) + y) - 466036) % 24528) - 24512) % 16,
                         x),
        y)
    expected = tvm.if_then_else(
        tvm.expr.LE(466036, (x * 4 + y)),
        tvm.if_then_else(tvm.expr.LE(24512, ((((x * 4) + y) - 4) % 24528)),
                         (((x * 4) + y) - 4) % 16,
                         x),
        y)
    ck.verify(res, expected)
    ck.verify(res2, expected)
    # can only simplify if condition
    res = tvm.expr.Select(tvm.all(x >= -1, y >= 0), (x + y + 100) % 3, (x + 100) % 3)
    expected = tvm.expr.Select(tvm.all(x >= -1, y >= 0), (x + y + 1) % 3, (x + 100) % 3)
    ck.verify(res, ck.analyzer.canonical_simplify(expected))

    res = tvm.expr.Select(x >= 10,
                          tvm.if_then_else(x / 3 > 2, x, 0), 0)
    expected = tvm.expr.Select(x >= 10, x, 0)
    ck.verify(res, ck.analyzer.canonical_simplify(expected))

    res = tvm.expr.Select(x >= 10,
                          tvm.if_then_else(x / 3 < 2, x, 0), 0)
    ck.verify(res, 0)

def within_index(b, e, s, i):
    """Return a boolean value that indicates if i is within the given index.

    Parameters
    ----------
    b : Expr
        beginning of the index

    e : Expr
        end of the index

    s : Expr
        strides of index

    i : Expr
        array position

    Returns
    -------
    selected : Expr
        bool expression that is True if the array position would be selected
        by the index and False otherwise
    """
    bc = tvm.expr.Select(s < 0, i <= e, i < b)
    ec = tvm.expr.Select(s < 0, i > b, i >= e)
    ss = tvm.if_then_else(s < 0,
                          ((i - e) + (e % tvm.abs(s)) + 1) % tvm.abs(s),
                          (i - b) % s)
    return tvm.expr.Select(tvm.expr.Or(bc, ec), tvm.const(False), ss.equal(0))

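# A hedged, plain-Python restatement of within_index for the positive-stride
# case, useful for sanity-checking the TVM expression above (assumptions:
# b, e, s, i are plain Python ints and s > 0; the name within_index_py is
# ours for illustration, not part of any library).
def within_index_py(b, e, s, i):
    return b <= i < e and (i - b) % s == 0

# Example: the slice [1:7:2] selects positions 1, 3 and 5, so position 3 is
# inside the selection and position 4 is not.
assert within_index_py(1, 7, 2, 3) is True
assert within_index_py(1, 7, 2, 4) is False
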
def conv3d_channel_batch(B, N, M, P, C, K, L, Q, O, stride=1, padding=0, dtype="float32"):
    A = tvm.placeholder((B, N, M, P, C), dtype=dtype, name="A")
    W = tvm.placeholder((K, L, Q, C, O), dtype=dtype, name="W")
    N_out = max(0, (N + padding * 2 - K) // stride) + 1
    M_out = max(0, (M + padding * 2 - L) // stride) + 1
    P_out = max(0, (P + padding * 2 - Q) // stride) + 1
    Apad = tvm.compute(
        (B, N + 2 * padding, M + 2 * padding, P + 2 * padding, C),
        lambda b, i, j, k, c: tvm.if_then_else(
            tvm.all(i >= padding, j >= padding, k >= padding,
                    i < N + padding, j < M + padding, k < P + padding),
            A[b, i - padding, j - padding, k - padding, c], 0.0),
        name="Apad")
    rx, ry, rz, rc = tvm.reduce_axis((0, K), name="rx"), tvm.reduce_axis((0, L), name="ry"), \
                     tvm.reduce_axis((0, Q), name="rz"), tvm.reduce_axis((0, C), name="rc")
    Output = tvm.compute(
        (B, N_out, M_out, P_out, O),
        lambda b, i, j, k, o: tvm.sum(
            Apad[b, i * stride + rx, j * stride + ry, k * stride + rz, rc] *
            W[rx, ry, rz, rc, o],
            axis=[rx, ry, rz, rc]),
        name="Output")
    return Output.op, [A, W, Output]

def zero_pad2d(inputs, padding=0):
    """Zero padding for 2d tensor

    Args:
    -----------------------------
    inputs : tvm.tensor.Tensor
        shape [batch, channel, height, width]
    padding: (optional:0) int or tuple
        expected: (h_pad_up, h_pad_down, w_pad_up, w_pad_down)
    -----------------------------

    Returns:
    -----------------------------
    tvm.tensor.Tensor
        shape [batch, channel, padded_height, padded_width]
    -----------------------------
    """
    padding = (padding, padding, padding, padding) \
        if isinstance(padding, (int, tvm.expr.IntImm)) else padding
    assert_print(isinstance(padding, tuple), "type(padding)={}".format(type(padding)))
    if len(padding) == 2:
        padding = (padding[0], padding[0], padding[1], padding[1])
    assert_print(len(padding) == 4)

    padding_zero = 0.0 if "float" in inputs.dtype else 0

    batch_size, in_channel, height, width = inputs.shape
    return tvm.compute(
        (batch_size, in_channel,
         height + padding[0] + padding[1], width + padding[2] + padding[3]),
        lambda b, c, h, w: tvm.if_then_else(
            tvm.all(h >= padding[0], h < height + padding[0],
                    w >= padding[2], w < width + padding[2]),
            inputs[b, c, h - padding[0], w - padding[2]],
            padding_zero))

def test_copy_pad_split():
    m = 4 * 3
    A = tvm.placeholder((m, ), name="A")
    Apad = tvm.compute(
        (m + 2, ),
        lambda i: tvm.if_then_else(tvm.all(i >= 1, i <= m), A[i - 1], 0.0),
        "Apad")
    B = tvm.compute((m, ), lambda i: Apad[i] + Apad[i + 1] + Apad[i + 2])
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=4)
    s[Apad].compute_at(s[B], xo)
    s[Apad].pragma(s[Apad].op.axis[0], "memcpy")
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)

    def cb(src, dst, pad_before, pad_after, pad_value):
        assert dst.elem_offset.value == 0
        assert_expr_equal(src.elem_offset, tvm.max(xo * 4, 1) - 1)

        rpad_before = tvm.max(1 - xo * 4, 0)
        rpad_after = tvm.max(xo * 4 - 7, 0)
        assert_expr_equal(pad_before[0], rpad_before)
        assert_expr_equal(pad_after[0], rpad_after)
        assert_expr_equal(src.shape[0], 6 - rpad_before - rpad_after)
        return tvm.make.Evaluate(0)

    stmt = tvm.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb)

def test_copy_pad():
    m = tvm.var('m')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')
    B = tvm.compute((m + 2, l),
                    lambda i, j: tvm.if_then_else(tvm.all(i >= 1, i < m + 1),
                                                  A[i - 1, j], 1.0),
                    name='B')
    s = tvm.create_schedule(B.op)
    s[B].pragma(B.op.axis[0], "memcpy")
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64)

    def cb(src, dst, pad_before, pad_after, pad_value):
        assert tvm.ir_pass.Simplify(src.elem_offset).value == 0
        assert pad_before[0].value == 1
        assert pad_before[1].value == 0
        assert pad_after[0].value == 1
        assert pad_after[1].value == 0
        assert pad_value.value == 1.0
        return tvm.make.Evaluate(0)

    stmt = tvm.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb)

def compute_sinc(dtype, ndim):
    A = tvm.placeholder([tvm.var() for _ in range(ndim)], name='input', dtype=dtype)
    if dtype in ['float16', 'float32', 'float64']:
        var = tvm.const(np.pi, dtype)
        B = tvm.compute([tvm.var() for _ in range(ndim)],
                        lambda *index: tvm.if_then_else(
                            A[index] == 0,
                            tvm.const(1, dtype),
                            tvm.sin(var * A[index]) / (A[index] * var)),
                        name='output')
    else:
        var = tvm.const(np.pi, "float64")
        B = tvm.compute([tvm.var() for _ in range(ndim)],
                        lambda *index: tvm.if_then_else(
                            A[index] == 0,
                            tvm.const(1, 'float64'),
                            tvm.sin(var * A[index].astype('float64')) /
                            (A[index].astype("float64") * var)),
                        name='output')
    s = tvm.create_schedule(B.op)
    return s, A, B

def padding(X, ph, pw):
    assert len(X.shape) >= 2
    nh, nw = X.shape[-2:]
    return tvm.compute(
        (*X.shape[:-2], nh + ph * 2, nw + pw * 2),
        lambda *i: tvm.if_then_else(
            tvm.any(i[-2] < ph, i[-2] >= nh + ph, i[-1] < pw, i[-1] >= nw + pw),
            0, X[i[:-2] + (i[-2] - ph, i[-1] - pw)]),
        name='PaddedX')

def _gaussian_map_sum(i, j):
    # i is row, j is col
    x, y = data[ni, 0], data[ni, 1]
    sigma = data[ni, 2]
    sigma2 = sigma * sigma
    v = tvm.if_then_else(
        tvm.all(x >= 0, x < cols, y >= 0, y < rows),
        tvm.exp(-(topi.power((x - j), 2) + topi.power((y - i), 2)) /
                (2 * sigma2)) / (2 * pi * sigma2),
        0)
    return tvm.sum(v, axis=ni)

def _select(*indices):
    from_val = []
    index_tuple = []
    for i in range(n):
        from_val.append(
            within_index(begin[i], end[i], strides[i], indices[i]))
        index_tuple.append(
            make_idx(begin[i], end[i], strides[i], a.shape[i], indices[i]))
    return tvm.if_then_else(tvm.all(*from_val),
                            v(*index_tuple), a(*indices))

def conv2d_channel_batch(B, N, M, C, K, L, O, stride=1, padding=0, dtype="float32"):
    A = tvm.placeholder((B, N, M, C), dtype=dtype, name="A")
    W = tvm.placeholder((K, L, C, O), dtype=dtype, name="W")
    N_out = max(0, (N + padding * 2 - K) // stride) + 1
    M_out = max(0, (M + padding * 2 - L) // stride) + 1
    Apad = tvm.compute(
        (B, N + 2 * padding, M + 2 * padding, C),
        lambda b, i, j, k: tvm.if_then_else(
            tvm.all(i >= padding, j >= padding, i < N + padding, j < M + padding),
            A[b, i - padding, j - padding, k], 0.0),
        name="Apad")
    rx, ry = tvm.reduce_axis((0, K), name="rx"), tvm.reduce_axis((0, L), name="ry")
    rc = tvm.reduce_axis((0, C), name="rc")
    Output = tvm.compute(
        (B, N_out, M_out, O),
        lambda b, i, j, k: tvm.sum(
            Apad[b, i * stride + rx, j * stride + ry, rc] * W[rx, ry, rc, k],
            axis=[rx, ry, rc]),
        name="Output")

    s = tvm.create_schedule(Output.op)
    s[Apad].compute_inline()
    CL = s.cache_write(Output, "local")

    n, h, w, c = s[Output].op.axis
    out = s[Output].fuse(h, w)
    cfg = autotvm.get_config()
    cfg.define_split("split_n", n, num_outputs=2)
    cfg.define_split("split_c", c, num_outputs=2)
    no, ni = cfg["split_n"].apply(s, Output, n)
    co, ci = cfg["split_c"].apply(s, Output, c)
    s[Output].reorder(no, out, co, ni, ci)
    s[Output].parallel(out)

    # schedule CL
    s[CL].compute_at(s[Output], co)
    ni, hi, wi, ci = s[CL].op.axis
    xi, yi, ki = s[CL].op.reduce_axis
    cfg.define_split("split_k", ki, num_outputs=2)
    ko, ki = cfg["split_k"].apply(s, CL, ki)
    s[CL].reorder(ko, xi, yi, ni, ki, ci)
    s[CL].unroll(ki)
    s[CL].vectorize(ci)

    return s, [A, W, Output]

def conv1d_transpose_ncw(cfg, data, kernel, stride, padding, out_dtype):
    """Transposed 1D convolution ncw forward operator.

    Parameters
    ----------
    cfg: ConfigEntity
        The config for this template
    Input : tvm.Tensor
        3-D with shape [batch, in_channel, inp_width]
    Filter : tvm.Tensor
        3-D with shape [in_channel, num_filter, kernel_size]
    stride : tuple of one int
        The spatial stride along width
    padding : int, tuple, or string
        int: padding size
        tuple of 2 ints: (pad_left, pad_right) for left and right padding
        string: ['VALID', 'SAME']
    out_dtype: str
        The output type. This is used in mixed precision

    Returns
    -------
    Output : tvm.Tensor
        3-D with shape [batch, out_channel, out_width]
    """
    if isinstance(stride, (tuple, list)):
        stride = stride[0]
    cfg.stride = stride
    batch, inp_channels, inp_width = get_const_tuple(data.shape)
    _, out_channels, kernel_size = get_const_tuple(kernel.shape)
    pad_left, pad_right = nn.get_pad_tuple1d(padding, kernel_size)
    out_width = (inp_width - 1) * stride + kernel_size - pad_left - pad_right
    pad_left = kernel_size - 1 - pad_left
    pad_right = kernel_size - 1 - pad_right
    dilated_width = stride * (inp_width - 1) + 1
    data = tvm.compute(
        (batch, inp_channels, pad_left + dilated_width + pad_right),
        lambda n, c, x: tvm.if_then_else(
            tvm.all(x >= pad_left,
                    x < pad_left + dilated_width,
                    tvm.indexmod(x - pad_left, stride).equal(0)),
            data[n, c, tvm.indexdiv(x - pad_left, stride)],
            tvm.const(0., "float32")),
        name='data_pad')

    dc = tvm.reduce_axis((0, inp_channels), name='dc')
    dw = tvm.reduce_axis((0, kernel_size), name='dw')
    data_out = tvm.compute(
        (batch, out_channels, out_width),
        lambda b, c, w: tvm.sum(
            data[b, dc, w + dw].astype(out_dtype) *
            kernel[dc, c, kernel_size - 1 - dw].astype(out_dtype),
            axis=[dc, dw]),
        tag="conv1d_transpose_ncw")

    return data_out

def make_relu_gradient(shape, tgt, tgt_host, func_name, dtype="float32"):
    """Hint: use tvm.select"""
    A = tvm.placeholder(shape, dtype=dtype, name="A")
    B = tvm.placeholder(shape, dtype=dtype, name="B")
    C = tvm.compute(
        A.shape,
        lambda *i: tvm.if_then_else(
            A(*i) > tvm.const(0, A.dtype), B(*i), tvm.const(0, A.dtype)))

    s = tvm.create_schedule(C.op)
    f = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name=func_name)
    return f

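# A minimal usage sketch for make_relu_gradient above, assuming an
# LLVM-enabled TVM build: the gradient of ReLU passes B through where A is
# positive and is zero elsewhere. The shape and function name below are
# chosen purely for illustration.
import numpy as np
import tvm

shape = (8,)
f = make_relu_gradient(shape, "llvm", "llvm", "relu_gradient")
ctx = tvm.cpu(0)
a_np = np.random.uniform(-1, 1, size=shape).astype("float32")
b_np = np.random.uniform(-1, 1, size=shape).astype("float32")
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.array(b_np, ctx)
c = tvm.nd.empty(shape, "float32", ctx)
f(a, b, c)
np.testing.assert_allclose(c.asnumpy(), np.where(a_np > 0, b_np, 0.0))
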
def test_schedule_bound_condition():
    A = tvm.placeholder((64,), name='A', dtype="float32")
    Apad = tvm.compute((66,),
                       lambda i: tvm.if_then_else(tvm.all(i > 0, i < 65),
                                                  A[i - 1],
                                                  tvm.const(0., "float32")),
                       name='Apad')
    Apad2 = tvm.compute((66,), lambda i: Apad[i] * 2, name='Apad2')
    s = tvm.create_schedule(Apad2.op)
    AL1 = s.cache_read(A, "local", [Apad])
    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    stmt = tvm.ir_pass.Simplify(stmt)
    assert isinstance(stmt.body.body.first.body.body.then_case, tvm.stmt.IfThenElse)

def _pad(*indices):
    index_tuple = []
    above = []
    below = []
    for i in range(n):
        if equal_const_int(pad_before[i], 0) and equal_const_int(pad_after[i], 0):
            index_tuple.append(indices[i])
            above.append(False)
            below.append(False)
        else:
            index_tuple.append(indices[i] - pad_before[i])
            above.append(indices[i] >= data.shape[i] + pad_before[i])
            below.append(indices[i] < pad_before[i])
    mapped_tuple = []
    for i, axis in enumerate(index_tuple):
        mapped_axis = tvm.if_then_else(below[i], -axis - mode, axis)
        mapped_axis = tvm.if_then_else(
            above[i], (2 * (data.shape[i] - 1)) - axis + mode, mapped_axis)
        mapped_tuple.append(mapped_axis)
    return data(*mapped_tuple)

def _decl_winograd_kernel_transform(kernel, tile_size, G):
    """Declare a Winograd kernel transform

    This exists separately to allow for precomputation
    The precomputation will most often happen on CPU

    Parameters
    ----------
    kernel : tvm.Tensor
        The kernel to transform

    tile_size : int
        The size of the tile to use for the Winograd filter

    Returns
    -------
    U : tvm.Tensor
        Transformed kernel
    """
    CO, CI, KH, KW = [get_const_int(x) for x in kernel.shape]
    # Only support 32 bit floats
    out_dtype = 'float32'

    alpha = G.shape[0]
    K = CO
    C = CI

    def upround(x, align):
        return (x + align - 1) // align * align

    ALIGN = 16
    K_round = upround(K, ALIGN)

    # Padded Kernel [K_round, C, KH, KW]
    # Pad the number of kernels to multiple of ALIGN
    padded_kernel = tvm.compute((K_round, C, KH, KW),
                                lambda k, c, h, w: tvm.if_then_else(
                                    k < K,
                                    kernel[k][c][h][w],
                                    tvm.const(0, out_dtype)),
                                name='padded_kernel')

    # U [alpha, alpha, K_round, C]
    # Perform the kernel transform
    r_kh = tvm.reduce_axis((0, KH), 'r_kh')
    r_kw = tvm.reduce_axis((0, KW), 'r_kw')
    U = tvm.compute((alpha, alpha, K_round, C),
                    lambda eps, nu, k, c: tvm.sum(padded_kernel[k][c][r_kh][r_kw] *
                                                  G[eps][r_kh] * G[nu][r_kw],
                                                  axis=[r_kh, r_kw]),
                    name='U')

    return U

def test_simplify_if_then_else():
    ck = CanonicalChecker()
    x = tvm.var("x")
    y = tvm.var("y")
    # simplification that takes condition into account.
    res = tvm.if_then_else(
        (x * 4 + y) >= 466036,
        tvm.if_then_else(24512 <= ((((x * 4) + y) - 466036) % 24528),
                         (((((x * 4) + y) - 466036) % 24528) - 24512) % 16,
                         x),
        y)
    expected = tvm.if_then_else(
        tvm.expr.LE(466036, (x * 4 + y)),
        tvm.if_then_else(tvm.expr.LE(24512, ((((x * 4) + y) - 4) % 24528)),
                         (((x * 4) + y) - 4) % 16,
                         x),
        y)
    ck.verify(res, expected)
    # can only simplify if condition
    res = tvm.expr.Select(tvm.all(x >= -1, y >= 0), (x + y + 100) % 3, (x + 100) % 3)
    expected = tvm.expr.Select(tvm.all(x >= -1, y >= 0), (x + y + 1) % 3, (x + 100) % 3)
    ck.verify(res, ck.analyzer.canonical_simplify(expected))

    res = tvm.expr.Select(x >= 10,
                          tvm.if_then_else(x / 3 > 2, x, 0), 0)
    expected = tvm.expr.Select(x >= 10, x, 0)
    ck.verify(res, ck.analyzer.canonical_simplify(expected))

    res = tvm.expr.Select(x >= 10,
                          tvm.if_then_else(x / 3 < 2, x, 0), 0)
    ck.verify(res, 0)

def make_idx(b, e, s, z, i):
    """Return the array position in the selection that corresponds to an
    array position in the full array.

    The returned value is only meaningful if within_index() returns True
    for the same set of parameters.

    Parameters
    ----------
    b : Expr
        beginning of the index

    e : Expr
        end of the index

    s : Expr
        strides of index

    z : Expr
        size of the indexed dimension

    i : Expr
        array position

    Returns
    -------
    position : Expr
        int expression that corresponds to an array position in the selection.
    """
    bc = tvm.expr.Select(s < 0, i <= e, i < b)
    ec = tvm.expr.Select(s < 0, i > b, i >= e)

    # Clamp to array size
    b = tvm.expr.Select(z < b, z - 1, b)

    ss = tvm.if_then_else(s < 0,
                          (b - i) // tvm.abs(s),
                          (i - b) // s)
    return tvm.if_then_else(tvm.expr.Or(bc, ec), 88, ss)

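# Companion plain-Python sketch for make_idx, again restricted to positive
# strides (make_idx_py and within_index_py are illustrative names of ours,
# not library functions; compare the within_index sketch earlier in this
# collection).
def within_index_py(b, e, s, i):
    return b <= i < e and (i - b) % s == 0

def make_idx_py(b, e, s, z, i):
    # Only meaningful when within_index_py is True, mirroring the docstring.
    return (i - b) // s if within_index_py(b, e, s, i) else None

# Example: for the slice [1:7:2] of a length-10 axis, full-array position 3
# is the second selected element, i.e. position 1 in the selection.
assert make_idx_py(1, 7, 2, 10, 3) == 1
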
def compute_backward_sinc(dtype, ndim, req):
    A = tvm.placeholder([tvm.var() for _ in range(ndim)], name='A', dtype=dtype)
    B = tvm.placeholder([tvm.var() for _ in range(ndim)], name='B', dtype=dtype)
    C = tvm.placeholder([tvm.var() for _ in range(ndim)], name='C', dtype=dtype)
    var = tvm.const(np.pi, dtype)
    D = tvm.compute([tvm.var() for _ in range(ndim)],
                    lambda *index: tvm.if_then_else(
                        B[index] == 0,
                        tvm.const(0, dtype),
                        (tvm.cos(var * B[index]) / B[index] -
                         C[index] / B[index]) * A[index]),
                    name='in_grad')
    in_grad_a, in_grad = assign_by_req(D, req)
    s = tvm.create_schedule(in_grad.op)
    s[D].compute_inline()
    return s, A, B, C, in_grad_a, in_grad

def padding(X, ph, pw):
    """Pad X with 0s in 2-D

    ph, pw : height and width padding
    """
    assert len(X.shape) >= 2
    nh, nw = X.shape[-2], X.shape[-1]
    return tvm.compute(
        (*X.shape[0:-2], nh + ph * 2, nw + pw * 2),
        lambda *i: tvm.if_then_else(
            tvm.any(i[-2] < ph, i[-2] >= nh + ph, i[-1] < pw, i[-1] >= nw + pw),
            0, X[i[:-2] + (i[-2] - ph, i[-1] - pw)]),
        name='PaddedX')

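# A minimal usage sketch for padding above, assuming an LLVM-enabled TVM
# build: pad a 1x1x4x4 float32 tensor by one pixel on each side and compare
# against np.pad with zero fill.
import numpy as np
import tvm

X = tvm.placeholder((1, 1, 4, 4), name="X", dtype="float32")
PaddedX = padding(X, 1, 1)
s = tvm.create_schedule(PaddedX.op)
f = tvm.build(s, [X, PaddedX], "llvm")

ctx = tvm.cpu(0)
x_np = np.random.uniform(size=(1, 1, 4, 4)).astype("float32")
x = tvm.nd.array(x_np, ctx)
y = tvm.nd.empty((1, 1, 6, 6), "float32", ctx)
f(x, y)
np.testing.assert_allclose(
    y.asnumpy(),
    np.pad(x_np, ((0, 0), (0, 0), (1, 1), (1, 1)), mode="constant"))
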
def _dilate(*indices):
    not_zero = []
    index_tuple = []
    for i in range(n):
        if not equal_const_int(strides[i], 1):
            index_tuple.append(indices[i] // strides[i])
            not_zero.append((indices[i] % strides[i]).equal(0))
        else:
            index_tuple.append(indices[i])
    if not_zero:
        not_zero = tvm.all(*not_zero)
        return tvm.if_then_else(not_zero, data(*index_tuple),
                                tvm.const(0.0, data.dtype))
    return data(*index_tuple)

def _dilate(*indices):
    not_zero = []
    index_tuple = []
    for i in range(n):
        if not equal_const_int(strides[i], 1):
            index_tuple.append(idxdiv(indices[i], strides[i]))
            not_zero.append(idxmod(indices[i], strides[i]).equal(0))
        else:
            index_tuple.append(indices[i])
    if not_zero:
        not_zero = tvm.all(*not_zero)
        return tvm.if_then_else(not_zero, data(*index_tuple),
                                tvm.const(0.0, data.dtype))
    return data(*index_tuple)

def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, vh):
    """Transform prior anchor box to output box through location predictions.
    """
    al = anchor[anchor_base_idx]
    at = anchor[anchor_base_idx + 1]
    ar = anchor[anchor_base_idx + 2]
    ab = anchor[anchor_base_idx + 3]
    aw = ar - al
    ah = ab - at
    ax = (al + ar) / 2.0
    ay = (at + ab) / 2.0
    px = loc[loc_base_idx]
    py = loc[loc_base_idx + 1]
    pw = loc[loc_base_idx + 2]
    ph = loc[loc_base_idx + 3]
    ox = px * vx * aw + ax
    oy = py * vy * ah + ay
    ow = tvm.exp(pw * vw) * aw / 2.0
    oh = tvm.exp(ph * vh) * ah / 2.0
    return tvm.if_then_else(clip, tvm.max(0, tvm.min(1, ox - ow)), ox - ow), \
           tvm.if_then_else(clip, tvm.max(0, tvm.min(1, oy - oh)), oy - oh), \
           tvm.if_then_else(clip, tvm.max(0, tvm.min(1, ox + ow)), ox + ow), \
           tvm.if_then_else(clip, tvm.max(0, tvm.min(1, oy + oh)), oy + oh)

def _pad(*indices):
    not_zero = []
    index_tuple = []
    for i in range(n):
        if equal_const_int(pad_before[i], 0) and equal_const_int(pad_after[i], 0):
            index_tuple.append(indices[i])
        else:
            index_tuple.append(indices[i] - pad_before[i])
            not_zero.append(indices[i] >= pad_before[i])
            not_zero.append(indices[i] < data.shape[i] + pad_before[i])
    if not_zero:
        not_zero = tvm.all(*not_zero)
        return tvm.if_then_else(not_zero, data(*index_tuple), pad_value)
    return data(*index_tuple)

def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, vh):
    """Transform prior anchor box to output box through location predictions.
    """
    al = anchor[anchor_base_idx]
    at = anchor[anchor_base_idx + 1]
    ar = anchor[anchor_base_idx + 2]
    ab = anchor[anchor_base_idx + 3]
    aw = ar - al
    ah = ab - at
    ax = (al + ar) / 2.0
    ay = (at + ab) / 2.0
    px = loc[loc_base_idx]
    py = loc[loc_base_idx + 1]
    pw = loc[loc_base_idx + 2]
    ph = loc[loc_base_idx + 3]
    ox = px * vx * aw + ax
    oy = py * vy * ah + ay
    ow = exp(pw * vw) * aw / 2.0
    oh = exp(ph * vh) * ah / 2.0
    return tvm.if_then_else(clip, tvm.max(0.0, tvm.min(1.0, ox - ow)), ox - ow), \
           tvm.if_then_else(clip, tvm.max(0.0, tvm.min(1.0, oy - oh)), oy - oh), \
           tvm.if_then_else(clip, tvm.max(0.0, tvm.min(1.0, ox + ow)), ox + ow), \
           tvm.if_then_else(clip, tvm.max(0.0, tvm.min(1.0, oy + oh)), oy + oh)

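# A NumPy re-statement of the same box transform for readability (the name
# transform_loc_np and the single-box calling convention are ours): anchor is
# one prior box (left, top, right, bottom), loc holds its four location
# predictions, and vx, vy, vw, vh are the variances.
import numpy as np

def transform_loc_np(loc, anchor, clip, vx, vy, vw, vh):
    al, at, ar, ab = anchor
    aw, ah = ar - al, ab - at
    ax, ay = (al + ar) / 2.0, (at + ab) / 2.0
    px, py, pw, ph = loc
    ox = px * vx * aw + ax
    oy = py * vy * ah + ay
    ow = np.exp(pw * vw) * aw / 2.0
    oh = np.exp(ph * vh) * ah / 2.0
    box = np.array([ox - ow, oy - oh, ox + ow, oy + oh])
    return np.clip(box, 0.0, 1.0) if clip else box
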
def check_if_then_else(ctx, n, dtype):
    A = tvm.placeholder((n,), name='A', dtype=dtype)
    true_value = tvm.const(1, dtype=dtype)
    false_value = tvm.const(3, dtype=dtype)
    max_lhs = tvm.const(2, dtype=dtype)
    max_rhs = tvm.if_then_else(A[0] > 0, true_value, false_value)
    C = tvm.compute((n,), lambda i: tvm.max(max_lhs, max_rhs), name='C')
    s = tvm.create_schedule(C.op)
    s[C].bind(s[C].op.axis[0], tvm.thread_axis("threadIdx.x"))
    fun = tvm.build(s, [A, C], target)

    a = tvm.nd.empty((n,), A.dtype, ctx)
    c = tvm.nd.empty((n,), A.dtype, ctx)
    # Only need to test compiling here
    fun(a, c)

def check_llvm(n, offset):
    if not tvm.runtime.enabled("llvm"):
        return
    A = tvm.placeholder((n, ), name='A')
    C = tvm.compute((n,), lambda i: tvm.if_then_else(i >= offset, A[i], 0.0), name='C')
    s = tvm.create_schedule(C.op)
    # build and invoke the kernel.
    f = tvm.build(s, [A, C], "llvm")
    ctx = tvm.cpu(0)
    # launch the kernel.
    a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)
    c = tvm.nd.empty((n,), A.dtype, ctx)
    f(a, c)
    c_np = a.asnumpy()
    c_np[:offset] = 0
    tvm.testing.assert_allclose(c.asnumpy(), c_np)

def gaussian_blur2d(M, N, k, dtype="float32"):
    A = tvm.placeholder((M, N), dtype=dtype, name="A")
    pad = k // 2
    number = k * k
    Apad = tvm.compute((M + 2 * pad, N + 2 * pad),
                       lambda i, j: tvm.if_then_else(
                           tvm.all(i >= pad, i < M + pad, j >= pad, j < N + pad),
                           A[i - pad, j - pad], 0.0),
                       name="Apad")
    rx = tvm.reduce_axis((0, k), name="rx")
    ry = tvm.reduce_axis((0, k), name="ry")
    B = tvm.compute(
        (M, N),
        lambda i, j: tvm.sum(Apad[i + rx, j + ry] / number, axis=[rx, ry]),
        name="B")
    return B.op, [A, B]

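# A hedged usage sketch for gaussian_blur2d above, assuming an LLVM-enabled
# TVM build. Note that as written the kernel is a uniform k x k box filter
# (every tap is weighted 1 / k**2) over a zero-padded input, so the reference
# below just averages shifted slices of the padded array.
import numpy as np
import tvm

op, (A, B) = gaussian_blur2d(16, 16, 3)
s = tvm.create_schedule(op)
f = tvm.build(s, [A, B], "llvm")

ctx = tvm.cpu(0)
x_np = np.random.uniform(size=(16, 16)).astype("float32")
x_pad = np.pad(x_np, 1, mode="constant")
ref = sum(x_pad[i:i + 16, j:j + 16] for i in range(3) for j in range(3)) / 9.0
x = tvm.nd.array(x_np, ctx)
y = tvm.nd.empty((16, 16), "float32", ctx)
f(x, y)
np.testing.assert_allclose(y.asnumpy(), ref, rtol=1e-5)
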
def check_llvm(n, offset):
    if not tvm.module.enabled("llvm"):
        return
    A = tvm.placeholder((n, ), name='A')
    C = tvm.compute((n,), lambda i: tvm.if_then_else(i >= offset, A[i], 0.0), name='C')
    s = tvm.create_schedule(C.op)
    # build and invoke the kernel.
    f = tvm.build(s, [A, C], "llvm")
    ctx = tvm.cpu(0)
    # launch the kernel.
    a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)
    c = tvm.nd.empty((n,), A.dtype, ctx)
    f(a, c)
    c_np = a.asnumpy()
    c_np[:offset] = 0
    tvm.testing.assert_allclose(c.asnumpy(), c_np)

def poolingb(Image, Index, POutput):
    """
    reverse 2*2 max pooling revised

    Parameters
    ----------
    Image : tvm.tensor.Tensor
        4-D with shape [batch_size, image_height, image_width, in_channels]
    Index : tvm.tensor.Tensor, specify where Output[i,j,k,l] is from,
        this follows the convention of Numpy and PyTorch.
        You will need this tensor to compute the gradient.
        ------------------------------------------
        For example, if Image is of shape [1, 4, 4, 1] (batch 1 and channel 1),
        then the slice Image[0, :, :, 0] is

        [[0.7243, 0.3236, 0.0124, 0.4314],
         [0.4104, 0.3997, 0.4534, 0.1791],
         [0.0973, 0.2673, 0.6907, 0.9207],
         [0.9268, 0.6590, 0.0312, 0.2364]]

        and Index is of shape [1, 2, 2, 1] and the slice Index[0, :, :, 0] is

        [[ 0,  6],
         [12, 11]]

        because
        0  = 0 * 4 + 0  (0, 0)
        6  = 1 * 4 + 2  (1, 2)
        12 = 3 * 4 + 0  (3, 0)
        11 = 2 * 4 + 3  (2, 3)
        --------------------------------------------
        4-D with shape [batch_size, out_height, out_width, in_channels]
    POutput : tvm.tensor.Tensor, gradient of Output
        4-D with shape [batch_size, out_height, out_width, in_channels]

    Returns
    -------
    PImage : tvm.tensor.Tensor, gradient of Image
        4-D with shape (Image.shape)
    """
    _, _, W, _ = Image.shape
    PImage = tvm.compute(
        Image.shape,
        lambda n, i, j, c: tvm.if_then_else(
            tvm.all(i == Index[n, i // 2, j // 2, c] // W,
                    j == Index[n, i // 2, j // 2, c] % W),
            POutput[n, i // 2, j // 2, c], 0.0))
    return PImage

def conv2db(Image, Filter, POutput):
    """
    convolution with NHWC layout backward

    Parameters
    ----------
    Image : tvm.tensor.Tensor
        4-D with shape [batch_size, image_height, image_width, in_channels]
    Filter : tvm.tensor.Tensor
        4-D with shape [out_channels, in_channels, kernel_height, kernel_width]
    POutput : tvm.tensor.Tensor, gradient of Output
        4-D with shape [batch_size, out_height, out_width, out_channels]

    Returns
    -------
    PImage : tvm.tensor.Tensor, gradient of Image
        4-D with shape (Image.shape)
    PFilter : tvm.tensor.Tensor, gradient of Filter
        4-D with shape (Filter.shape)
    """
    N, H, W, C = Image.shape
    K, _, Hk, Wk = Filter.shape
    rx = tvm.reduce_axis((0, H - (Hk - 1)), name='rx')
    ry = tvm.reduce_axis((0, W - (Wk - 1)), name='ry')
    rn = tvm.reduce_axis((0, N), name='rn')
    PFilter = tvm.compute(
        Filter.shape,
        lambda o, c, h, w: tvm.sum(
            Image[rn, h + rx, w + ry, c] * POutput[rn, rx, ry, o],
            axis=[rn, rx, ry]))
    rx_k = tvm.reduce_axis((0, Hk), name='rx_k')
    ry_k = tvm.reduce_axis((0, Wk), name='ry_k')
    ro = tvm.reduce_axis((0, K), name='ro')
    PImage = tvm.compute(
        Image.shape,
        lambda n, h, w, c: tvm.sum(
            Filter[ro, c, Hk - rx_k - 1, Wk - ry_k - 1] * tvm.if_then_else(
                tvm.all(h + rx_k >= Hk - 1, h + rx_k < H,
                        w + ry_k >= Wk - 1, w + ry_k < W),
                POutput[n, h + rx_k - (Hk - 1), w + ry_k - (Wk - 1), ro], 0.0),
            axis=[rx_k, ry_k, ro]))
    return (PImage, PFilter)

def _bilinear(n, c, h, w):
    outside = tvm.any(h < 0, w < 0, h >= in_height, w >= in_width)
    val = bilinear_sample_nchw(data, (n, c, h, w), in_height - 1, in_width - 1)
    return tvm.if_then_else(outside, zero, val)

def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size):
    N, CI, IH, IW = get_const_tuple(data.shape)

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    if len(kernel.shape) == 4:
        if dilation_h != 1 or dilation_w != 1:
            kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))
        pre_computed = False
        CO, _, KH, KW = get_const_tuple(kernel.shape)
    else:
        assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
        pre_computed = True
        H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
        CO *= VC
        KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)

    assert layout == 'NCHW'
    assert KH == 3 and KW == 3 and HPAD == 1 and WPAD == 1 and HSTR == 1 and WSTR == 1
    data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")

    if tile_size == 4:
        G_data = np.array([
            [1 / 4.0, 0, 0],
            [-1 / 6.0, -1 / 6.0, -1 / 6.0],
            [-1 / 6.0, 1 / 6.0, -1 / 6.0],
            [1 / 24.0, 1 / 12.0, 1 / 6.0],
            [1 / 24.0, -1 / 12.0, 1 / 6.0],
            [0, 0, 1]], out_dtype)

        B_data = np.array([
            [4, 0, 0, 0, 0, 0],
            [0, -4, 4, -2, 2, 4],
            [-5, -4, -4, -1, -1, 0],
            [0, 1, -1, 2, -2, -5],
            [1, 1, 1, 1, 1, 0],
            [0, 0, 0, 0, 0, 1]], out_dtype)

        A_data = np.array([
            [1, 0, 0, 0],
            [1, 1, 1, 1],
            [1, -1, 1, -1],
            [1, 2, 4, 8],
            [1, -2, 4, -8],
            [0, 0, 0, 1]], out_dtype)
    elif tile_size == 2:
        G_data = np.array([
            [1, 0, 0],
            [1.0 / 2, 1.0 / 2, 1.0 / 2],
            [1.0 / 2, -1.0 / 2, 1.0 / 2],
            [0, 0, 1]], out_dtype)

        B_data = np.array([
            [1, 0, 0, 0],
            [0, 1, -1, 1],
            [-1, 1, 1, 0],
            [0, 0, 0, -1]], out_dtype)

        A_data = np.array([
            [1, 0],
            [1, 1],
            [1, -1],
            [0, -1]], out_dtype)
    else:
        raise ValueError("Unsupported tile size for winograd: " + str(tile_size))

    m = A_data.shape[1]
    r = 3
    alpha = m + r - 1

    H = (IH + 2 * HPAD - 3) // HSTR + 1
    W = (IW + 2 * WPAD - 3) // WSTR + 1
    nH, nW = (H + m - 1) // m, (W + m - 1) // m
    P = N * nH * nW

    ##### space definition begin #####
    tile_bna_candidates = [1, 2, 4, 8, 16]
    factors = get_factors(CO)
    cfg.define_knob('tile_bna', [x for x in tile_bna_candidates if x in factors])
    cfg.define_knob('tile_bnb', [1, 2, 4, 8, 16])
    cfg.define_split('tile_t1', CI, num_outputs=2, max_factor=128)
    cfg.define_split('tile_t2', CO, num_outputs=2, max_factor=128)
    cfg.define_split('c_unroll', CI, num_outputs=2, max_factor=8)
    cfg.define_knob('yt', [1, 2, 4, 8, 16, 32])
    ##### space definition end #####

    if cfg.is_fallback:
        cfg['tile_bnb'].val = 4
        cfg['tile_bna'].val = 4
        while CO % cfg['tile_bna'].val != 0:
            cfg['tile_bna'].val //= 2
        cfg['yt'].val = 8
        cfg.fallback_split('tile_t1', [-1, 128])
        cfg.fallback_split('tile_t2', [-1, 128])
        cfg.fallback_split('c_unroll', [-1, 8])

    bna = cfg['tile_bna'].val
    bnb = cfg['tile_bnb'].val

    P_round = (P + bnb - 1) // bnb * bnb
    assert CO % bna == 0 and P_round % bnb == 0

    # pack input tile
    input_tile = tvm.compute(
        (CI, P_round // bnb, alpha, alpha, bnb),
        lambda ci, b, eps, nu, bb: tvm.if_then_else(
            b * bnb + bb < P,
            data_pad[(b * bnb + bb) // (nH * nW)][ci]
            [(b * bnb + bb) // nW % nH * m + eps]
            [(b * bnb + bb) % nW * m + nu],
            tvm.const(0, data_pad.dtype)),
        name='d')

    # transform kernel
    if pre_computed:
        U = kernel
    else:
        G = const_matrix(G_data, 'G')
        r_kh = tvm.reduce_axis((0, KH), 'r_kh')
        r_kw = tvm.reduce_axis((0, KW), 'r_kw')
        U = tvm.compute(
            (alpha, alpha, CO // bna, CI, bna),
            lambda eps, nu, co, ci, vco: tvm.sum(
                kernel[co * bna + vco][ci][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw],
                axis=[r_kh, r_kw]),
            name='U')

    # transform image
    B = const_matrix(B_data, 'B')
    r_a = tvm.reduce_axis((0, alpha), 'r_a')
    r_b = tvm.reduce_axis((0, alpha), 'r_b')
    V = tvm.compute(
        (alpha, alpha, P_round // bnb, CI, bnb),
        lambda eps, nu, p, ci, vp: tvm.sum(
            input_tile[ci][p][r_a][r_b][vp] * B[r_a][eps] * B[r_b][nu],
            axis=[r_a, r_b]),
        name='V')

    # batch gemm
    ci = tvm.reduce_axis((0, CI), name='c')
    M = tvm.compute(
        (alpha, alpha, CO, P_round),
        lambda eps, nu, co, p: tvm.sum(
            U[eps][nu][co // bna][ci][co % bna] * V[eps][nu][p // bnb][ci][p % bnb],
            axis=ci),
        name='M')

    A = const_matrix(A_data, 'A')
    r_a = tvm.reduce_axis((0, alpha), 'r_a')
    r_b = tvm.reduce_axis((0, alpha), 'r_b')
    Y = tvm.compute(
        (CO, P, m, m),
        lambda co, p, vh, vw: tvm.sum(
            M[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw],
            axis=[r_a, r_b]),
        name='Y')

    # unpack output
    output = tvm.compute(
        (N, CO, H, W),
        lambda n, co, h, w:
        Y[co][n * nH * nW + (h // m) * nW + w // m][h % m][w % m]
        # The following hack term is used to make the padding in batch gemm ("M")
        # effective, otherwise the padding will be eliminated by bound inference.
        # Use `tvm.expr.Mul` instead of `*` to avoid issues in const folding.
        + tvm.expr.Mul(tvm.const(0, out_dtype),
                       M[alpha - 1][alpha - 1][CO - 1][P_round - 1]),
        name='output', tag='winograd_conv2d_output')

    # we have to manually assign effective GFLOP for winograd
    cfg.add_flop(2 * N * CO * H * W * KH * KW * CI)
    return output

def _bilinear(i, c, y, x):
    outside = tvm.any(y < -1.0, x < -1.0, y > height, x > width)
    y = tvm.max(y, 0.0)
    x = tvm.max(x, 0.0)
    val = bilinear_sample_nchw(data, (i, c, y, x), height - 1, width - 1)
    return tvm.if_then_else(outside, 0.0, val)