def _get_pixel_value(n, c, h, w):
    if padding_mode == "zeros":
        return te.if_then_else(
            te.all(h >= 0, w >= 0, h < in_height, w < in_width),
            data[n, c, h, w],
            tir.const(0.0, dtype=data.dtype),
        )
    if padding_mode == "border":
        h_b = te.max(te.min(h, in_height - 1), 0)
        w_b = te.max(te.min(w, in_width - 1), 0)
        return data[n, c, h_b, w_b]
    raise AssertionError("unsupported padding_mode")
def batch_matmul(x, y, oshape=None, auto_scheduler_rewritten_layout=""): """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are data in batch. Supports broadcasting for batch dimension. Parameters ---------- x : tvm.te.Tensor 3-D with shape [batch, M, K] y : tvm.te.Tensor 3-D with shape [batch, N, K] oshape : List[Optional] Explicit intended output shape of the computation. Can be useful in cases with dynamic input shapes. auto_scheduler_rewritten_layout: str = "" The layout after auto-scheduler's layout rewrite pass. Returns ------- output : tvm.te.Tensor 3-D with shape [batch, M, N] """ x_shape = get_const_tuple(x.shape) if auto_scheduler_rewritten_layout: # Infer shape for the rewritten layout y_shape = auto_scheduler.get_shape_from_rewritten_layout( auto_scheduler_rewritten_layout, ["b", "j", "k"]) auto_scheduler.remove_index_check(y) else: y_shape = get_const_tuple(y.shape) assert len(x_shape) == 3 and len( y_shape) == 3, "only support 3-dim batch_matmul" XB = x_shape[0] YB = y_shape[0] _, M, K = x.shape k = te.reduce_axis((0, K), name="k") if oshape is None: assert XB == YB or XB == 1 or YB == 1, "batch dimension doesn't match" assert x_shape[2] == y_shape[2], "shapes of x and y is inconsistant" batch = te.max(XB, YB) N = y.shape[1] oshape = (batch, M, N) output = te.compute( oshape, lambda b, i, j: te.sum(x[b if XB != 1 else 0, i, k] * y[b if YB != 1 else 0, j, k], axis=k), tag="batch_matmul", attrs={"layout_free_placeholders": [y]}, ) if auto_scheduler_rewritten_layout: output = auto_scheduler.rewrite_compute_body( output, auto_scheduler_rewritten_layout) return output
def check_target(device, m, n):
    dev = tvm.device(device, 0)
    if not tvm.testing.device_enabled(device):
        print("skip because %s is not enabled.." % device)
        return

    # compute
    placeholder_a = te.placeholder((m, n), name="A")
    axis_k = te.reduce_axis((0, n))
    placeholder_b = te.compute(
        (m,), lambda i: te.max(placeholder_a[i][axis_k], axis=axis_k), name="B"
    )
    schedule = te.create_schedule(placeholder_b.op)

    # schedule
    axis_k = schedule[placeholder_b].op.reduce_axis[0]
    axis_ko, _ = schedule[placeholder_b].split(axis_k, nparts=nthx)
    schedule[placeholder_b].bind(axis_ko, thread_x)
    axis_xo, axis_xi = schedule[placeholder_b].split(
        schedule[placeholder_b].op.axis[0], factor=nthy
    )
    schedule[placeholder_b].bind(axis_xi, thread_y)
    schedule[placeholder_b].bind(axis_xo, block_x)
    tvm.lower(schedule, [placeholder_a, placeholder_b], simple_mode=True)

    # validation
    func = tvm.build(schedule, [placeholder_a, placeholder_b], device, name="warp_reduction")
    a_np = np.random.uniform(size=(m, n)).astype(placeholder_a.dtype)
    b_np = np.zeros((m,), dtype=placeholder_a.dtype)
    buff_a = tvm.nd.array(a_np, dev)
    buff_b = tvm.nd.array(b_np, dev)
    b_np = np.max(a_np, axis=1)
    func(buff_a, buff_b)
    tvm.testing.assert_allclose(buff_b.numpy(), b_np, rtol=1e-3, atol=1e-3)
def check_target(device, m, n):
    dev = tvm.device(device, 0)
    if not tvm.testing.device_enabled(device):
        print("skip because %s is not enabled.." % device)
        return

    # compute
    A = te.placeholder((m, n), name="A")
    k = te.reduce_axis((0, n))
    B = te.compute((m,), lambda i: te.max(A[i][k], axis=k), name="B")
    s = te.create_schedule(B.op)

    # schedule
    k = s[B].op.reduce_axis[0]
    ko, _ = s[B].split(k, nparts=nthx)
    s[B].bind(ko, thread_x)
    xo, xi = s[B].split(s[B].op.axis[0], factor=nthy)
    s[B].bind(xi, thread_y)
    s[B].bind(xo, block_x)
    tvm.lower(s, [A, B], simple_mode=True)

    # validation
    func = tvm.build(s, [A, B], device, name="warp_reduction")
    a_np = np.random.uniform(size=(m, n)).astype(A.dtype)
    b_np = np.zeros((m,), dtype=A.dtype)
    a = tvm.nd.array(a_np, dev)
    b = tvm.nd.array(b_np, dev)
    b_np = np.max(a_np, axis=1)
    func(a, b)
    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3, atol=1e-3)
def batch_matmul(cfg, x, y, out_shape=None, out_dtype=None):
    """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are
    data in batch. Supports broadcasting in the batch dimension.

    Parameters
    ----------
    cfg : ConfigSpace
        Autotvm tuning space config file

    x : tvm.te.Tensor
        3-D with shape [batch, M, K]

    y : tvm.te.Tensor
        3-D with shape [batch, N, K]

    out_shape : tuple or None
        Shape of the output

    out_dtype : Optional[str]
        Specifies the output data type.

    Returns
    -------
    output : tvm.te.Tensor
        3-D with shape [batch, M, N]
    """
    assert len(x.shape) == 3 and len(y.shape) == 3, "only support 3-dim batch_matmul"
    XB, M, XK = get_const_tuple(x.shape)
    YB, N, YK = get_const_tuple(y.shape)
    assert (XB == YB) or (YB == 1) or (XB == 1), "batch dimension doesn't match"
    assert XK == YK, "shapes of x and y are inconsistent"
    B = te.max(XB, YB)
    K = XK
    if out_shape is not None:
        assert out_shape[0] == B, "got invalid output shape"
        assert out_shape[1] == M, "got invalid output shape"
        assert out_shape[2] == N, "got invalid output shape"
    if cfg.is_fallback:
        _default_batch_matmul_config(cfg, M, N, K)

    k = te.reduce_axis((0, K), name="k")
    if out_dtype is None or out_dtype == x.dtype:
        C = te.compute(
            (B, M, N),
            lambda b, i, j: te.sum(
                x[b if XB != 1 else 0, i, k] * y[b if YB != 1 else 0, j, k], axis=k
            ),
            tag="batch_matmul",
        )
    else:
        C = te.compute(
            (B, M, N),
            lambda b, i, j: te.sum(
                x[b if XB != 1 else 0, i, k].astype(out_dtype)
                * y[b if YB != 1 else 0, j, k].astype(out_dtype),
                axis=k,
            ),
            tag="batch_matmul",
        )
    return C
def _sample(i, c, ph, pw):
    roi = rois[i]
    batch_index = roi[0].astype("int32")
    roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[3], roi[4]
    roi_start_h *= spatial_scale
    roi_end_h *= spatial_scale
    roi_start_w *= spatial_scale
    roi_end_w *= spatial_scale

    # force malformed ROIs to be 1x1
    roi_h = tvm.te.max(roi_end_h - roi_start_h, tvm.tir.const(1.0, dtype))
    roi_w = tvm.te.max(roi_end_w - roi_start_w, tvm.tir.const(1.0, dtype))

    bin_h = roi_h / pooled_size_h
    bin_w = roi_w / pooled_size_w

    if sample_ratio > 0:
        roi_bin_grid_h = roi_bin_grid_w = tvm.tir.const(sample_ratio, "int32")
    else:
        roi_bin_grid_h = te.ceil(roi_h / pooled_size_h).astype("int32")
        roi_bin_grid_w = te.ceil(roi_w / pooled_size_w).astype("int32")

    count = roi_bin_grid_h * roi_bin_grid_w
    rh = te.reduce_axis((0, roi_bin_grid_h))
    rw = te.reduce_axis((0, roi_bin_grid_w))
    roi_start_h += ph * bin_h
    roi_start_w += pw * bin_w
    if avg_mode:
        return te.sum(
            _bilinear(
                batch_index,
                c,
                roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h,
                roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w,
            )
            / count,
            axis=[rh, rw],
        )
    # max mode
    return te.max(
        _bilinear(
            batch_index,
            c,
            roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h,
            roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w,
        ),
        axis=[rh, rw],
    )
def _compute_intn(dtype, value, *indices):
    assert output_scale is not None and output_zero_point is not None
    const_min = tvm.tir.min_value(dtype)
    const_max = tvm.tir.max_value(dtype)
    # Use indexmod to handle both scalar and per-channel QNN parameters.
    scale_idx = tir.indexmod(indices[axis], topi.shape(output_scale)[0])
    zp_idx = tir.indexmod(indices[axis], topi.shape(output_zero_point)[0])
    return te.max(
        te.min(
            te.round(value[indices] / output_scale[scale_idx]) + output_zero_point[zp_idx],
            const_max,
        ),
        const_min,
    )
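The nested te.max/te.min above implements the usual requantization clamp, q = clip(round(x / scale) + zero_point, int_min, int_max). A standalone NumPy illustration, not part of the source; the helper name and values are made up.

import numpy as np

def quantize_int8(x, scale, zero_point):
    # round(x / scale) + zero_point, clamped to the int8 range,
    # mirroring the te.max(te.min(...)) expression above
    q = np.round(x / scale) + zero_point
    return np.clip(q, -128, 127).astype("int8")

print(quantize_int8(np.array([-1.0, 0.05, 10.0]), scale=0.05, zero_point=10))
# -> [-10  11 127]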
def stmt_calc(t, n, c, h, w, i, j):
    if trace_mode.mode == 'tvm':
        if pool_type == 'max':
            t['out'][n, c, h, w] = te.max(
                t['out'][n, c, h, w],
                t['x'][n, c, h * stride_height + i, w * stride_width + j])
        else:
            t['out'][n, c, h, w] = t['out'][n, c, h, w] \
                + t['x'][n, c, h * stride_height + i, w * stride_width + j]
    elif trace_mode.mode == 'tensor_access':
        t['out'][n, c, h, w] = t['x'][n, c, h * stride_height + i,
                                      w * stride_width + j]
    else:
        if pool_type == 'max':
            t['out'][n, c, h, w] = max(
                t['x'][n, c, h, w],
                t['x'][n, c, h * stride_height + i, w * stride_width + j])
        else:
            t['out'][n, c, h, w] = t['out'][n, c, h, w] \
                + t['x'][n, c, h * stride_height + i, w * stride_width + j]
def compute(n, ho, wo, co, hi, wi, ci):
    # Construct blockized strided maxpool height indices
    h = ho * block_H + hi
    h_contig = h * stride[0] + rh
    h_block_id = h_contig // block_H
    h_block_offset = h_contig % block_H

    # Construct blockized strided maxpool width indices
    w = wo * block_W + wi
    w_contig = w * stride[1] + rw
    w_block_id = w_contig // block_W
    w_block_offset = w_contig % block_W

    return te.max(
        X_packed[n, h_block_id, w_block_id, co, h_block_offset, w_block_offset, ci],
        axis=[rh, rw],
    )
def compute(batch, h_outer, w_outer, c_outer, h_inner, w_inner, c_inner):
    # Construct blockized strided maxpool height indices
    h = h_outer * block_h + h_inner
    h_contig = h * stride[0] + reduce_h
    h_block_id = h_contig // block_h
    h_block_offset = h_contig % block_h

    # Construct blockized strided maxpool width indices
    w_idx = w_outer * block_w + w_inner
    w_contig = w_idx * stride[1] + reduce_w
    w_block_id = w_contig // block_w
    w_block_offset = w_contig % block_w

    return te.max(
        x_packed[batch, h_block_id, w_block_id, c_outer,
                 h_block_offset, w_block_offset, c_inner],
        axis=[reduce_h, reduce_w],
    )
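A worked arithmetic example of the blockized index math above, assuming block_h = 8 and a stride of 2 in the height dimension; the concrete numbers are illustrative only.

block_h, stride_h = 8, 2
h, reduce_h = 5, 3                      # output row and reduction offset
h_contig = h * stride_h + reduce_h      # 13: row index in the un-blocked input
h_block_id = h_contig // block_h        # 1: which height block that row falls into
h_block_offset = h_contig % block_h     # 5: offset within that block
assert (h_block_id, h_block_offset) == (1, 5)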
def batch_matmul(lhs, rhs, transa=False, transb=False, iterative=False, **kwargs):
    """Create an extern op that computes batched matrix multiplication of
    `lhs` and `rhs` with CBLAS. This function serves as an example on how
    to call external libraries.

    Parameters
    ----------
    lhs: Tensor
        The left matrix operand
    rhs: Tensor
        The right matrix operand
    transa: bool
        Whether to transpose `lhs`
    transb: bool
        Whether to transpose `rhs`
    iterative: bool
        Whether to use the iterative batch_matmul kernel

    Returns
    -------
    C: Tensor
        The result tensor.
    """
    b = te.max(lhs.shape[0], rhs.shape[0])
    n = lhs.shape[2] if transa else lhs.shape[1]
    m = rhs.shape[1] if transb else rhs.shape[2]
    return te.extern(
        (b, n, m),
        [lhs, rhs],
        lambda ins, outs: tvm.tir.call_packed(
            "tvm.contrib.cblas.batch_matmul"
            if not iterative
            else "tvm.contrib.cblas.batch_matmul_iterative",
            ins[0],
            ins[1],
            outs[0],
            transa,
            transb,
        ),
        name="C",
        **kwargs,
    )
def max_pool2d_compute(A, out_shape, kernel, stride, dilation):
    """max_pool2d compute"""
    kh, kw = kernel
    rh = te.reduce_axis((0, kh), name="rh")
    rw = te.reduce_axis((0, kw), name="rw")
    ob, oh, ow, oc = out_shape
    if isinstance(ob, int):
        validate_out_shape(out_shape, A.shape, kernel, stride, dilation)

    sh, sw = stride
    dh, dw = dilation

    Max = te.compute(
        out_shape,
        lambda b, h, w, c: te.max(
            A[b, h * sh + dh * rh, w * sw + dw * rw, c].astype(A.dtype), axis=[rh, rw]
        ),
        name="max",
    )
    return Max
def pool(pool_type, c, nh, nw, kh, kw, ph=0, pw=0, sh=1, sw=1):
    """2D pooling

    pool_type: pooling type, 'max' or 'avg'
    c : channels
    nh, nw : input height and width
    kh, kw : kernel height and width
    ph, pw : height and width padding sizes, default 0
    sh, sw : height and width strides, default 1
    """
    # reduction axes
    rkh = te.reduce_axis((0, kh), name='rkh')
    rkw = te.reduce_axis((0, kw), name='rkw')
    # output height and width
    oh = d2ltvm.conv_out_size(nh, kh, ph, sh)
    ow = d2ltvm.conv_out_size(nw, kw, pw, sw)
    # pad X and then compute Y
    X = te.placeholder((c, nh, nw), name='X')

    if pool_type == 'max':
        PaddedX = d2ltvm.padding(X, ph, pw, val=te.min_value(X.dtype)) \
            if ph * pw != 0 else X
        Y = te.compute((c, oh, ow),
                       lambda c, h, w: te.max(PaddedX[c, h*sh+rkh, w*sw+rkw],
                                              axis=[rkh, rkw]),
                       tag="pool_max", name='PoolMax')
    elif pool_type == 'avg':
        PaddedX = d2ltvm.padding(X, ph, pw) if ph * pw != 0 else X
        tsum = te.compute((c, oh, ow),
                          lambda c, h, w: te.sum(PaddedX[c, h*sh+rkh, w*sw+rkw],
                                                 axis=[rkh, rkw]),
                          tag="pool_avg1", name='PoolSum')
        Y = te.compute((c, oh, ow),
                       lambda c, h, w: tsum[c, h, w] / (kh*kw),
                       tag='pool_avg2', name='PoolAvg')
    else:
        raise ValueError("Pool type should be 'avg' or 'max'.")
    return X, Y, PaddedX
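A minimal usage sketch, not from the source: it schedules the max-pool compute above for CPU and checks it against a NumPy reference, assuming this `pool` helper and the `d2ltvm` package it relies on are importable; the shapes are illustrative only.

import numpy as np
import tvm
from tvm import te

c, n, k = 4, 12, 3                          # channels, input size, kernel size
X, Y, _ = pool('max', c, n, n, k, k)        # ph = pw = 0, sh = sw = 1
s = te.create_schedule(Y.op)
mod = tvm.build(s, [X, Y])                  # default llvm target

x_np = np.random.uniform(size=(c, n, n)).astype('float32')
y_np = np.empty((c, n - k + 1, n - k + 1), dtype='float32')
for i in range(y_np.shape[1]):
    for j in range(y_np.shape[2]):
        # per-channel max over the k x k window
        y_np[:, i, j] = x_np[:, i:i + k, j:j + k].max(axis=(1, 2))

x_nd = tvm.nd.array(x_np)
y_nd = tvm.nd.array(np.zeros_like(y_np))
mod(x_nd, y_nd)
np.testing.assert_allclose(y_nd.numpy(), y_np, rtol=1e-5)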
def batch_matmul(x, y, oshape=None):
    """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are
    data in batch. Supports broadcasting for the batch dimension.

    Parameters
    ----------
    x : tvm.te.Tensor
        3-D with shape [batch, M, K]

    y : tvm.te.Tensor
        3-D with shape [batch, N, K]

    oshape : List[Optional]
        Explicit intended output shape of the computation. Can be useful
        in cases with dynamic input shapes.

    Returns
    -------
    output : tvm.te.Tensor
        3-D with shape [batch, M, N]
    """
    assert len(x.shape) == 3 and len(y.shape) == 3, "only support 3-dim batch_matmul"
    x_shape = get_const_tuple(x.shape)
    y_shape = get_const_tuple(y.shape)
    XB = x_shape[0]
    YB = y_shape[0]
    _, M, K = x.shape
    k = te.reduce_axis((0, K), name="k")
    if oshape is None:
        assert XB == YB or XB == 1 or YB == 1, "batch dimension doesn't match"
        assert x_shape[2] == y_shape[2], "shapes of x and y are inconsistent"
        batch = te.max(XB, YB)
        N = y.shape[1]
        oshape = (batch, M, N)
    return te.compute(
        oshape,
        lambda b, i, j: te.sum(
            x[b if XB != 1 else 0, i, k] * y[b if YB != 1 else 0, j, k], axis=k
        ),
        tag="batch_matmul",
    )
def process_post_ops(layer_idx, Input, Bias, post_op, pack=False, out_dtype="float32"): if pack: _, _, _, _, OC_vec = Input.shape BiasAdd = te.compute( Input.shape, lambda n, c_chunk, h, w, c_vec: Input[ n, c_chunk, h, w, c_vec] + Bias[c_chunk * OC_vec + c_vec], name='FusedConv2D_BiasAdd_{}'.format(layer_idx), tag='biasadd') else: BiasAdd = te.compute(Input.shape, lambda n, h, w, c: Input[n, h, w, c] + Bias[c], name='FusedConv2D_BiasAdd_{}'.format(layer_idx), tag='biasadd') # TODO: Recover this # if block_input is not None: # inputs = block_input if isinstance(block_input, list) else [block_input] # First = inputs[0] # TODO: Support multiple branches addition later # Last = self.stages[-1][-1] # Output if post_op is None, BiasAdd if it's not None # assert sorted(get_const_tuple(First.shape)) == sorted(get_const_tuple(Last.shape)), '{} is not the same as {}'.format(First.shape, Last.shape) # if self.pack: # Output = te.compute(self.output_shape, # lambda n, c_chunk, h, w, c_vec: (First[n, c_chunk, h, w, c_vec] + (Last[n, c_chunk, h, w, c_vec])), # name='ElementwiseAddOutput_{}'.format(self.layer_idx), # tag='elem_{}'.format(tag_suffix)) # else: # Output = te.compute(self.output_shape, # lambda n, h, w, c: (First[n, h, w, c] + (Last[n, h, w, c])), # name='ElementwiseAddOutput_{}'.format(self.layer_idx), # tag='elem_{}'.format(tag_suffix)) # self.stages[-1].append(Output) # Last = self.stages[-1][-1] # BiasAdd if it's not a block, Output if it's a block # Else: only bias_add Last = BiasAdd if post_op == 'relu': Last = te.compute( Last.shape, lambda *i: te.max(Last(*i), tvm.runtime.const(0, Last.dtype)), name='FusedConv2D_ReLU_{}'.format(layer_idx), tag='relu') elif post_op == 'sigmoid': Last = te.compute(Last.shape, lambda *i: te.sigmoid(Last(*i)), name='FusedConv2D_Sigmoid_{}'.format(layer_idx), tag='sigmoid') elif post_op == 'relu6': Last = te.compute( Last.shape, lambda *i: te.min( te.max(Last(*i), tvm.runtime.const(0, Last.dtype)), tvm.runtime.const(6, Last.dtype)), name='FusedConv2D_ReLU6_{}'.format(layer_idx), tag='relu6') return Last
def pooling_compute(
    ifm: te.Tensor,
    lut: te.Tensor,
    pooling_type: str,
    ifm_scale: float,
    ifm_zero_point: int,
    ofm_scale: float,
    ofm_zero_point: int,
    pool_shape: Tuple[int, int],
    ofm_channels: int,
    strides: Tuple[int, int],
    padding: Tuple[int, int, int, int],
    activation: str,
    clip_min: int,
    clip_max: int,
    rounding_mode: str,
    upscale: str,
    ifm_layout: str,
    ofm_layout: str,
) -> te.Tensor:
    """A compute operator representing the capabilities of pooling for the NPU.

    Parameters
    ----------
    ifm : te.Tensor
        The Input Feature Map tensor (IFM).
    lut : te.Tensor
        The look-up table of values to use if activation = "LUT".
    pooling_type: str
        The type of the pooling. "AVG" - average pool, "MAX" - max pool.
    ifm_scale : float
        The quantization scale for the Input Feature Map tensor.
    ifm_zero_point : int
        The quantization zero point for the Input Feature Map tensor.
    ofm_scale : float
        The quantization scale for the Output Feature Map tensor.
    ofm_zero_point : int
        The quantization zero point for the Output Feature Map tensor.
    pool_shape : Tuple[int, int]
        The 2 dimensional pool shape as (pool_shape_height, pool_shape_width).
    ofm_channels : int
        The number of the Output Feature Map channels.
    strides : Tuple[int, int]
        The 2 dimensional strides as (stride_height, stride_width).
    padding : Tuple[int, int, int, int]
        The 4 dimensional padding as (pad_top, pad_left, pad_bottom, pad_right).
    activation : str
        The activation function to use.
            "NONE" - no activation function.
            "CLIP" - clip the output between clip_min and clip_max.
            "TANH" - tanh activation function.
            "SIGMOID" - sigmoid activation function.
            "LUT" - use a look-up table to perform the activation function.
    clip_min : int
        The minimum clipping value if activation = "CLIP".
    clip_max : int
        The maximum clipping value if activation = "CLIP".
    rounding_mode : str
        The rounding mode to apply to the Output Feature Map tensor.
            "TFL" - Tensorflow Lite rounding scheme.
            "TRUNCATE" - Truncate towards zero.
            "NATURAL" - Round to nearest value, with x.5 rounded up towards +infinity.
    upscale : str
        The 2x2 upscaling mode to apply to the Input Feature Map tensor.
            "NONE" - no upscaling.
            "NEAREST" - upscale using nearest neighbour.
            "ZEROS" - upscale using zeros.
    ifm_layout : str
        The layout of the Input Feature Map tensor. Can be "NHWC" or "NHCWB16".
    ofm_layout : str
        The layout of the Output Feature Map tensor. Can be "NHWC" or "NHCWB16".

    Returns
    -------
    te.Tensor
        The OFM tensor.
""" stride_h, stride_w = strides pool_shape_h, pool_shape_w = pool_shape # Compute operation for the IFM DMA pipeline dmaed_ifm = dma_ifm_compute(ifm, ifm_layout, ifm_zero_point, ifm_scale, ofm_channels, padding) # Pooling compute operation ofm_height = (dmaed_ifm.shape[1] - pool_shape_h) // stride_h + 1 ofm_width = (dmaed_ifm.shape[2] - pool_shape_w) // stride_w + 1 rh = te.reduce_axis((0, pool_shape_h), name="ry") rw = te.reduce_axis((0, pool_shape_w), name="rx") pooling_attrs = { "op": "ethosu_pooling", "pooling_type": pooling_type, "stride_h": stride_h, "stride_w": stride_w, "activation": activation, "clip_min": clip_min, "clip_max": clip_max, "rounding_mode": rounding_mode, "upscale": upscale, } # This is a trick to insert the LUT tensor into the TE graph if LUT is present lut_expr = (lut[0] + lut[255]).astype(ifm.dtype) if activation in ("TANH", "LUT") else 0 # Add the LUT tensor to the attributes to be able to later tell which tensor is the LUT if activation in ("TANH", "LUT"): pooling_attrs["lut"] = lut pooling = te.compute( (1, ofm_height, ofm_width, ofm_channels), lambda nn, hh, ww, cc: te.max( (dmaed_ifm(nn, hh * stride_h + rh, ww * stride_w + rw, cc) + lut_expr).astype(ifm.dtype), axis=[rh, rw], ), name="ethosu_pooling", attrs=pooling_attrs, ) # Compute operation for the OFM DMA pipeline return dma_ofm_compute(pooling, ofm_layout, ofm_zero_point, ofm_scale, ofm_channels)
def dilation2d_nhwc(input, filter, stride, padding, dilations, out_dtype=None):
    """Morphological 2d dilation NHWC layout.

    Parameters
    ----------
    input : tvm.Tensor
        4-D with shape [batch, in_height, in_width, in_channel]

    filter : tvm.Tensor
        3-D with shape [filter_height, filter_width, in_channel]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int
        Padding size

    dilations: int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    out_dtype : Optional[str]
        Specifies the output data type.

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, out_height, out_width, in_channel]
    """
    if out_dtype is None:
        out_dtype = input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilations, int) or len(dilations) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilations, int):
        dilation_h = dilation_w = dilations
    else:
        dilation_h, dilation_w = dilations

    batch, in_height, in_width, in_channel = input.shape
    kernel_h, kernel_w, channel = filter.shape
    assert in_channel.value == channel.value, \
        "For Dilation2D input and filter channels should be same."

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))

    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
    pad_before = [0, pad_top, pad_left, 0]
    pad_after = [0, pad_down, pad_right, 0]
    padded_input = pad(input, pad_before, pad_after, name="padded_input")
    ry = te.reduce_axis((0, kernel_h), name='ry')
    rx = te.reduce_axis((0, kernel_w), name='rx')

    return te.compute(
        (batch, out_height, out_width, in_channel),
        lambda nn, yy, xx, ff: te.max(
            padded_input[nn, yy * stride_h + ry * dilation_h,
                         xx * stride_w + rx * dilation_w, ff].astype(out_dtype)
            + filter[ry, rx, ff].astype(out_dtype),
            axis=[ry, rx]),
        tag="dilation2d_nhcw")
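A minimal usage and validation sketch, not from the source: it assumes `dilation2d_nhwc` is in scope and the LLVM backend is available, and it mirrors the definition above, out[n, y, x, c] = max over (ry, rx) of input[n, y*s + ry*d, x*s + rx*d, c] + filter[ry, rx, c]; the shapes, stride=1, dilations=1 and padding=0 are illustrative only.

import numpy as np
import tvm
from tvm import te

data = te.placeholder((1, 8, 8, 3), name="data")
kern = te.placeholder((3, 3, 3), name="kern")
out = dilation2d_nhwc(data, kern, stride=1, padding=0, dilations=1)

s = te.create_schedule(out.op)
f = tvm.build(s, [data, kern, out], "llvm")

d_np = np.random.uniform(size=(1, 8, 8, 3)).astype("float32")
k_np = np.random.uniform(size=(3, 3, 3)).astype("float32")
ref = np.empty((1, 6, 6, 3), dtype="float32")
for y in range(6):
    for x in range(6):
        # max over the 3x3 window of (data + kernel), per channel
        ref[0, y, x] = (d_np[0, y:y + 3, x:x + 3] + k_np).max(axis=(0, 1))

o_nd = tvm.nd.array(np.zeros_like(ref))
f(tvm.nd.array(d_np), tvm.nd.array(k_np), o_nd)
np.testing.assert_allclose(o_nd.numpy(), ref, rtol=1e-5)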
def test_basic_operation():
    np.random.seed(0)
    shape = (10, 10)
    x = te.var("x", dtype='float32')
    k = te.reduce_axis((0, 10), name="k")
    l = te.reduce_axis((0, 10), name="l")
    A0 = te.placeholder(shape, name='A0')
    A1 = te.placeholder(shape, name='A1')
    zeros = np.zeros(shape)

    B = te.compute(shape, lambda i, j: A0[i, j], name='B')
    check_grad(B, [A0])

    B = te.compute(shape, lambda i, j: A0[i, j] + A1[i, j], name='B')
    check_grad(B, [A0, A1])

    B = te.compute(shape, lambda i, j: A0[i, j] + A0[j, i], name='B')
    check_grad(B, A0)

    B = te.compute(shape, lambda i, j: te.floor(A0[i, j]), name='B')
    check_grad(B, A0, desired_grads=[zeros])

    B = te.compute(shape, lambda i, j: te.ceil(A0[i, j]), name='B')
    check_grad(B, A0, desired_grads=[zeros])

    B = te.compute(shape, lambda i, j: te.trunc(A0[i, j]), name='B')
    check_grad(B, A0, desired_grads=[zeros])

    B = te.compute(shape, lambda i, j: te.round(A0[i, j]), name='B')
    check_grad(B, A0, desired_grads=[zeros])

    B = te.compute(shape, lambda i, j: A0[i, j] + te.exp(A0[j, i]), name='B')
    check_grad(B, A0)

    B = te.compute(shape, lambda i, j: te.log(0.1 + te.abs(A0[i, j] + te.exp(A0[j, i]))), name='B')
    check_grad(B, A0)

    B = te.compute(shape, lambda i, j: te.sigmoid(A0[i, j] * A0[i, j] * A0[j, i]), name='B')
    check_grad(B, A0)

    B = te.compute(shape, lambda i, j: te.tanh(A0[i, j] * A0[i, j] * A0[j, i]), name='B')
    check_grad(B, A0)

    B = te.compute(shape, lambda i, j: te.sqrt(A0[i, j] * A0[i, j] * A0[j, i]), name='B')
    check_grad(B, A0, data_range=(0.1, 10))

    B = te.compute(shape, lambda i, j: te.power(te.abs(A0[i, j]), A0[j, i]), name='B')
    check_grad(B, A0, data_range=(-4, 4))

    B = te.compute(shape, lambda i, j: A0[i, j] * A0[j, i], name='B')
    check_grad(B, A0)

    B = te.compute((10,), lambda i: te.sum(A0[i, k] * A0[k, i], axis=k), name='B')
    check_grad(B, A0)

    B = te.compute(shape, lambda i, j: te.sum(A0[i, k] * A0[k, i] + 5, axis=k), name='B')
    check_grad(B, A0)

    B = te.compute(shape, lambda i, j: te.max(A0[i, k] * A0[k, j] + 5, axis=k), name='B')
    check_grad(B, A0)

    B = te.compute(shape, lambda i, j: A0[i, j] * (A1[j, i] + A0[j, i]), name='B')
    check_grad(B, [A0, A1])

    B = te.compute(shape,
                   lambda i, j: te.sum(A0[k, k] - A0[te.min(j + k, 9), j] * A0[i, k], axis=k),
                   name='B')
    check_grad(B, A0)

    def fcombine(x, y):
        return x * y

    def fidentity(t0):
        return tvm.tir.const(1, t0)

    prod = te.comm_reducer(fcombine, fidentity, name='prod')
    B = te.compute((10, 10), lambda i, j: prod(A0[i, k] + A0[k, i], axis=k), name='B')
    check_grad(B, A0)

    X = te.placeholder((10,), name='X')
    A = te.compute((10,), lambda i: X[i] + X[9 - i])
    B = te.compute((10,), lambda i: X[i] * X[9 - i])
    Y = topi.tensordot(A, B, 1)
    check_grad(Y, X)
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name, dtype="float32"): """TODO: Your code here""" """Hint: output shape should be (1,)""" A_=te.placeholder(shape,dtype=dtype,name="A_") A=te.placeholder(shape,dtype=dtype,name="A") #desined by myself k = te.reduce_axis((0, A.shape[1]), name="k") A_max = te.compute((A.shape[0],), lambda i: te.max(A[i, k], axis=k)) A_ex = te.compute(shape, lambda i, j: te.exp(A[i, j] - A_max[i])) k1 = te.reduce_axis((0, A.shape[1]), name="k1") A_ex_sum = te.compute((A.shape[0],), lambda i: te.sum(A_ex[i, k1], axis=k1)) A_logsoftmax = te.compute(shape, lambda i, j: te.log(A_ex[i, j] / A_ex_sum[i])) k2=te.reduce_axis((0,shape[1]),name="k2") A_logsoftmax_sum=te.compute((shape[0],0),lambda i:te.sum(A_logsoftmax[i,k2]*A_[i,k2],axis=k2)) k3=te.reduce_axis((0,shape[0]),name="k3") B=te.compute((1,),lambda i: te.sum(-A_logsoftmax_sum[k3],axis = k3)) B1=te.compute((1,), lambda i: B[i] / shape[0]) s=te.create_schedule(B1.op) if tgt=="cuda": #I'dont know why it can't work? s = te.create_schedule(B1.op) num_thread = 64 block_x = te.thread_axis("blockIdx.x") thread_x = te.thread_axis((0, num_thread), "threadIdx.x") s[A_ex].bind(A_ex.op.axis[0], block_x) s[A_max].bind(A_max.op.axis[0], block_x) k_ex_sum = A_ex_sum.op.reduce_axis[0] ko, ki = s[A_ex_sum].split(k_ex_sum, factor=num_thread) EF = s.rfactor(A_ex_sum, ki) s[A_ex_sum].bind(s[A_ex_sum].op.axis[0], block_x) s[A_ex_sum].bind(s[A_ex_sum].op.reduce_axis[0], thread_x) s[EF].compute_at(s[A_ex_sum], s[A_ex_sum].op.reduce_axis[0]) s[A_ex_sum].set_store_predicate(thread_x.var.equal(0)) tx, xi = s[A_logsoftmax].split(A_logsoftmax.op.axis[1], nparts=num_thread) s[A_logsoftmax].bind(A_logsoftmax.op.axis[0], block_x) s[A_logsoftmax].bind(tx, thread_x) k_logsoftmax_sum = A_logsoftmax_sum.op.reduce_axis[0] klso, klsi = s[A_logsoftmax_sum].split(k_logsoftmax_sum, factor=num_thread) lsEF = s.rfactor(A_logsoftmax_sum, klsi) s[A_logsoftmax_sum].bind(s[A_logsoftmax_sum].op.axis[0], block_x) s[A_logsoftmax_sum].bind(s[A_logsoftmax_sum].op.reduce_axis[0], thread_x) s[lsEF].compute_at(s[A_logsoftmax_sum], s[A_logsoftmax_sum].op.reduce_axis[0]) s[A_logsoftmax_sum].set_store_predicate(thread_x.var.equal(0)) k_B=B.op.reduce_axis[0] kbo,kbi=s[B].split(k_B,factor=num_thread) bEF=s.rfactor(B,kbi) s[B].bind(s[B].op.reduce_axis[0],thread_x) s[bEF].compute_at(s[B],s[B].op.reduce_axis[0]) s[B].set_store_predicate(block_x.var.equal(0)) s[B1].set_store_predicate(block_x.var.equal(0)) print(tvm.lower(s, [A, A_,B1], simple_mode=True)) f=tvm.build(s,[A,A_,B1],tgt,tgt_host,name=func_name) return f
def dilation2d_nchw(input, filter, stride, padding, dilations, out_dtype=None):
    """Morphological dilation operator in NCHW layout.

    Parameters
    ----------
    input : tvm.te.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]

    filter : tvm.te.Tensor
        3-D with shape [in_channel, filter_height, filter_width]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int or str
        Padding size

    dilations: int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    out_dtype : Optional[str]
        Specifies the output data type.

    Returns
    -------
    Output : tvm.te.Tensor
        4-D with shape [batch, in_channel, out_height, out_width]
    """
    if out_dtype is None:
        out_dtype = input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilations, int) or len(dilations) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilations, int):
        dilation_h = dilation_w = dilations
    else:
        dilation_h, dilation_w = dilations

    batch, in_channel, in_height, in_width = input.shape
    channel, kernel_h, kernel_w = filter.shape
    assert in_channel.value == channel.value, \
        "For Dilation2D input and filter channels should be same."

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))

    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    # compute graph
    pad_before = [0, 0, pad_top, pad_left]
    pad_after = [0, 0, pad_down, pad_right]
    temp = pad(input, pad_before, pad_after, name="pad_temp")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    return te.compute(
        (batch, in_channel, out_height, out_width),
        lambda nn, ff, yy, xx: te.max(
            temp[nn, ff, yy * stride_h + ry * dilation_h,
                 xx * stride_w + rx * dilation_w].astype(out_dtype)
            + filter[ff, ry, rx].astype(out_dtype),
            axis=[ry, rx],
        ),
        tag="dilation2d_nchw",
    )
def batch_matmul(
    tensor_a,
    tensor_b,
    oshape=None,
    out_dtype=None,
    transpose_a=False,
    transpose_b=True,
    auto_scheduler_rewritten_layout="",
    meta_schedule_original_shape=None,
):
    """Compute batch matrix multiplication of `tensor_a` and `tensor_b`.

    Both `tensor_a` and `tensor_b` can be transposed. For legacy reason, we use NT format
    (transpose_a=False, transpose_b=True) by default.

    Parameters
    ----------
    tensor_a : tvm.te.Tensor
        3-D with shape [batch, M, K] or [batch, K, M].

    tensor_b : tvm.te.Tensor
        3-D with shape [batch, K, N] or [batch, N, K].

    oshape : List[Optional]
        Explicit intended output shape of the computation. Can be useful in cases
        with dynamic input shapes.

    out_dtype : Optional[str]
        Specifies the output data type for mixed precision batch matmul.

    transpose_a : Optional[bool] = False
        Whether the first tensor is in transposed format.

    transpose_b : Optional[bool] = True
        Whether the second tensor is in transposed format.

    auto_scheduler_rewritten_layout: Optional[str] = ""
        The layout after auto-scheduler's layout rewrite pass.

    meta_schedule_original_shape: Optional[List[PrimExpr]] = None
        The original shape of the tensor

    Returns
    -------
    output : tvm.te.Tensor
        3-D with shape [batch, M, N]
    """
    assert len(tensor_a.shape) == 3, "tensor_a only support 3-dim"
    if transpose_a:
        XB, XK, XI = get_const_tuple(tensor_a.shape)
    else:
        XB, XI, XK = get_const_tuple(tensor_a.shape)
    if auto_scheduler_rewritten_layout:
        # Infer shape for the rewritten layout
        YB, YK, YJ = auto_scheduler.get_shape_from_rewritten_layout(
            auto_scheduler_rewritten_layout, ["b", "k", "j"]
        )
        auto_scheduler.remove_index_check(tensor_b)
    elif meta_schedule_original_shape:
        auto_scheduler.rewrite_tensor_shape(tensor_b, meta_schedule_original_shape)
        if transpose_b:
            YB, YJ, YK = get_const_tuple(tensor_b.shape)
        else:
            YB, YK, YJ = get_const_tuple(tensor_b.shape)
    else:
        assert len(tensor_b.shape) == 3, "tensor_b only support 3-dim"
        if transpose_b:
            YB, YJ, YK = get_const_tuple(tensor_b.shape)
        else:
            YB, YK, YJ = get_const_tuple(tensor_b.shape)
    assert XK == YK or isinstance(YK, tvm.tir.expr.Var), "shapes of x and y are inconsistent"
    k = te.reduce_axis((0, XK), name="k")
    if oshape is None:
        assert XB == YB or XB == 1 or YB == 1, "batch dimension doesn't match"
        batch = (
            tvm.tir.expr.SizeVar("batch", "int32")
            if isinstance(XB, tvm.tir.expr.Var) or isinstance(YB, tvm.tir.expr.Var)
            else te.max(XB, YB)
        )
        oshape = (batch, XI, YJ)
    if out_dtype is None:
        out_dtype = tensor_a.dtype
        if tensor_a.dtype != tensor_b.dtype:
            logger.warning(
                "tensor_a has different data type with tensor_b: %s, %s",
                tensor_a.dtype,
                tensor_b.dtype,
            )

    if (transpose_a, transpose_b) == (True, True):
        compute_lambda = lambda b, i, j: te.sum(
            tensor_a[b if XB != 1 else 0, k, i].astype(out_dtype)
            * tensor_b[b if YB != 1 else 0, j, k].astype(out_dtype),
            axis=k,
        )
        compute_name = "T_batch_matmul_TT"
    elif (transpose_a, transpose_b) == (True, False):
        compute_lambda = lambda b, i, j: te.sum(
            tensor_a[b if XB != 1 else 0, k, i].astype(out_dtype)
            * tensor_b[b if YB != 1 else 0, k, j].astype(out_dtype),
            axis=k,
        )
        compute_name = "T_batch_matmul_TN"
    elif (transpose_a, transpose_b) == (False, True):
        compute_lambda = lambda b, i, j: te.sum(
            tensor_a[b if XB != 1 else 0, i, k].astype(out_dtype)
            * tensor_b[b if YB != 1 else 0, j, k].astype(out_dtype),
            axis=k,
        )
        compute_name = "T_batch_matmul_NT"
    else:  # (transpose_a, transpose_b) == (False, False)
        compute_lambda = lambda b, i, j: te.sum(
            tensor_a[b if XB != 1 else 0, i, k].astype(out_dtype)
            * tensor_b[b if YB != 1 else 0, k, j].astype(out_dtype),
            axis=k,
        )
        compute_name = "T_batch_matmul_NN"

    output = te.compute(
        oshape,
        compute_lambda,
        name=compute_name,
        tag="batch_matmul",
        attrs={"layout_free_placeholders": [tensor_b]},
    )
    if auto_scheduler_rewritten_layout:
        output = auto_scheduler.rewrite_compute_body(output, auto_scheduler_rewritten_layout)
    return output
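A minimal usage sketch, not from the source: it assumes this generic batch_matmul is exposed as `topi.nn.batch_matmul` and only lowers the NN-format variant to inspect the generated schedule; the names and shapes are illustrative only.

import tvm
from tvm import te, topi

A = te.placeholder((2, 8, 16), name="A")        # [batch, M, K]
B_nt = te.placeholder((2, 4, 16), name="B_nt")  # [batch, N, K], default NT format
B_nn = te.placeholder((2, 16, 4), name="B_nn")  # [batch, K, N]

C_nt = topi.nn.batch_matmul(A, B_nt)            # transpose_b=True by default -> (2, 8, 4)
C_nn = topi.nn.batch_matmul(A, B_nn, transpose_a=False, transpose_b=False)

s = te.create_schedule(C_nn.op)
print(tvm.lower(s, [A, B_nn, C_nn], simple_mode=True))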
def _clip_coordinates(x, size):
    return te.min(te.max(x, 0), size - 1)