def compute_depthwise_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dtype=None, args={}):
    """Depthwise convolution operator in NCHWc layout."""
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, channel_chunk, in_height, in_width, channel_block = Input.shape
    _, channel_multiplier, kernel_h, kernel_w, _ = Filter.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_channel_chunk = simplify(channel_chunk * channel_multiplier)
    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
    # compute graph
    pad_before = [0, 0, pad_top, pad_left, 0]
    pad_after = [0, 0, pad_down, pad_right, 0]
    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")

    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    if autotvm.GLOBAL_SCOPE.in_tuning:
        # NCHWc x CMRSc = [N,(C//4)M,OH,OW, 4c]
        # NCHWc x CMRS
        # texture: NCH|W|c
        # texture: C|MRS|c
        Filter_tx = te.compute(
            (channel_chunk, channel_multiplier * kernel_h * kernel_w, channel_block),
            lambda ffc, mrs, ffb: Filter[ffc, mrs // (kernel_h * kernel_w),
                                         (mrs // kernel_w) % kernel_h, mrs % kernel_w, ffb],
            name="packed_filter")

        conv = te.compute(
            (batch, out_channel_chunk, out_height, out_width, channel_block),
            lambda nn, ffc, yy, xx, ffb: te.sum(
                (temp[nn, ffc // channel_multiplier, yy * stride_h + ry * dilation_h,
                      xx * stride_w + rx * dilation_w, ffb]
                 * Filter_tx[ffc // channel_multiplier,
                             ((ffc % channel_multiplier) * kernel_h + ry) * kernel_w + rx,
                             ffb]).astype(args["accumulator"]),
                axis=[ry, rx],
            ),
            tag="depthwise_conv2d_nchwc_kcrsk_texture",
        )
    else:
        conv = te.compute(
            (batch, out_channel_chunk, out_height, out_width, channel_block),
            lambda nn, ffc, yy, xx, ffb: te.sum(
                (temp[nn, ffc // channel_multiplier, yy * stride_h + ry * dilation_h,
                      xx * stride_w + rx * dilation_w, ffb]
                 * Filter[ffc // channel_multiplier, ffc % channel_multiplier, ry, rx, ffb]
                 ).astype(args["accumulator"]),
                axis=[ry, rx],
            ),
            tag="depthwise_conv2d_nchwc_kcrsk",
        )
    return te.compute(
        conv.shape,
        lambda n, ffc, y, x, ffb: conv[n, ffc, y, x, ffb].astype(out_dtype),
        tag="cast_from_acc" + args["accumulator"][-2:])
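
# Illustrative sanity sketch (not used by the operator above; the bounds are
# hypothetical examples): verifies that the flattened "mrs" filter index used in the
# conv body round-trips to the original (multiplier, ry, rx) triple, mirroring the
# unpacking lambda in Filter_tx.
def _demo_depthwise_mrs_roundtrip(channel_multiplier=2, kernel_h=3, kernel_w=3):
    for m in range(channel_multiplier):
        for ry in range(kernel_h):
            for rx in range(kernel_w):
                mrs = (m * kernel_h + ry) * kernel_w + rx  # flattening used in the conv body
                assert mrs // (kernel_h * kernel_w) == m
                assert (mrs // kernel_w) % kernel_h == ry
                assert mrs % kernel_w == rx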
def compute_depthwise_conv2d_NHWC_HWOI(Input, Filter, stride, padding, dilation, out_dtype=None, args={}):
    """Depthwise convolution operator in NHWC layout."""
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_height, in_width, channels = Input.shape
    kernel_h, kernel_w, _, _ = Filter.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))

    out_height_orig = out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width_orig = out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    channel_block = 4
    channel_chunk = channels // channel_block
    num_filter_chunk = 1

    # compute:
    Input = te.compute(
        [batch, in_height, in_width, channel_chunk, channel_block],
        lambda nn, yy, xx, icc, icb: Input[nn, yy, xx, icc * 4 + icb],
        name="input_pack",
        tag="input_pack",
    )
    Filter = te.compute(
        [kernel_h, kernel_w, channel_chunk, num_filter_chunk, channel_block],
        lambda kh, kw, ifc, nfc, cb: Filter[kh, kw, ifc * 4 + cb, nfc],
        name="filter_pack",
        tag="filter_pack",
    )

    # Can the output shape be divided by 2 or even 4?
    # If it cannot, extend it to make the later splits possible.
    # Theoretically the input should get additional padding for this, but that is
    # optimized away by cache_read InferBound. We must pad here to produce exactly the
    # tensor required for the original output size, and no more. Otherwise the
    # intermediate tensor might be allocated with smaller sizes while the compute tries
    # to fill the expanded one, resulting in a data discrepancy.
    # With textures it is not a problem to provide a texture of smaller size because:
    # 1. It does not matter which values feed the extra computations - they exist only
    #    to better fit the work onto the GPU work groups.
    # 2. When we request a pixel out of bounds, the texture handles it correctly; as
    #    noted above, the value itself does not matter.
    if out_height % 2 != 0:
        out_height += 1
    if out_width % 2 != 0:
        out_width += 1

    if out_height % 4 != 0:
        out_height += 2
    if out_width % 4 != 0:
        out_width += 2

    # compute graph
    pad_before = [0, pad_top, pad_left, 0, 0]
    pad_after = [0, pad_down, pad_right, 0, 0]

    # calculation of the real used input size
    # (NHWC layout: height is axis 1, width is axis 2):
    input_latest_w = (out_width_orig - 1) * stride_w + (kernel_w - 1) * dilation_w + 1
    input_latest_h = (out_height_orig - 1) * stride_h + (kernel_h - 1) * dilation_h + 1
    if input_latest_w < in_width + pad_before[2] + pad_after[2]:
        pad_after[2] -= in_width + pad_before[2] + pad_after[2] - input_latest_w
    if input_latest_h < in_height + pad_before[1] + pad_after[1]:
        pad_after[1] -= in_height + pad_before[1] + pad_after[1] - input_latest_h
    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")

    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    conv = te.compute(
        (batch, out_height, out_width, channel_chunk, channel_block),
        lambda nn, yy, xx, ffc, ffb: te.sum(
            (temp[nn, yy * stride_h + ry * dilation_h,
                  xx * stride_w + rx * dilation_w, ffc, ffb]
             * Filter[ry, rx, ffc, 0, ffb]).astype(args["accumulator"]),
            axis=[ry, rx],
        ),
        tag="depthwise_conv2d_nhwc",
    )

    dummy_cast = te.compute(
        (batch, out_height_orig, out_width_orig, channel_chunk, channel_block),
        lambda n, y, x, fc, fb: conv[n, y, x, fc, fb].astype(out_dtype),
        tag="dummy_cast")

    return te.compute((batch, out_height_orig, out_width_orig, channels),
                      lambda n, y, x, c: dummy_cast[n, y, x, c // 4, c % 4],
                      tag="cast_from_acc" + args["accumulator"][-2:])
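
# Minimal pure-Python sketch of the pad_after trimming above (the numbers are assumed
# examples, not taken from the operator): with stride 2 the receptive field of the last
# original output column can end before the full right pad, so the effective pad is
# reduced to cover exactly out_*_orig and no more.
def _demo_pad_trim(in_size=10, kernel=3, stride=2, dilation=1, pad_lo=1, pad_hi=1):
    dilated_kernel = (kernel - 1) * dilation + 1
    out_orig = (in_size - dilated_kernel + pad_lo + pad_hi) // stride + 1
    input_latest = (out_orig - 1) * stride + dilated_kernel
    if input_latest < in_size + pad_lo + pad_hi:
        pad_hi -= in_size + pad_lo + pad_hi - input_latest
    return out_orig, pad_hi  # e.g. the defaults give (5, 0): the right pad is dropped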
def compute_conv2d_NHWC_HWIO(Input, Filter, stride, padding, dilation, out_dtype=None, args={}):
    """Convolution operator in NHWC layout."""
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_height, in_width, in_channel = Input.shape
    kernel_h, kernel_w, _, out_channels = Filter.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))

    out_height_orig = out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width_orig = out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    in_channel_block = 4
    in_channel_tail = in_channel % in_channel_block
    in_channel_chunk = in_channel // in_channel_block
    num_filter_block = 4
    num_filter_tail = out_channels % num_filter_block
    num_filter_chunk = out_channels // num_filter_block

    pad_value = tvm.tir.const(0, Input.dtype)

    # compute:
    if in_channel_tail == 0:
        Input = te.compute(
            [batch, in_height, in_width, in_channel_chunk, in_channel_block],
            lambda nn, yy, xx, icc, icb: Input[nn, yy, xx, icc * in_channel_block + icb],
            name="input_pack",
            tag="input_pack",
        )
    else:
        in_channel_chunk += 1

        def _reorder_data(*indices):
            condition = []
            condition.append(indices[3] == in_channel_chunk - 1)
            condition.append(indices[4] >= in_channel_tail)
            condition = tvm.tir.all(*condition)
            return tvm.tir.if_then_else(
                condition, pad_value,
                Input[indices[0], indices[1], indices[2],
                      indices[3] * in_channel_block + indices[4]])

        Input = te.compute(
            [batch, in_height, in_width, in_channel_chunk, in_channel_block],
            _reorder_data,
            name="input_pack",
            tag="input_pack_expanded",
        )

    if num_filter_tail == 0 and in_channel_tail == 0:
        Filter = te.compute(
            [kernel_h, kernel_w, in_channel, num_filter_chunk, num_filter_block],
            lambda kh, kw, ic, nfc, nfb: Filter[kh, kw, ic, nfc * num_filter_block + nfb],
            name="filter_pack",
            tag="filter_pack",
        )
    else:
        num_filter_chunk += 1

        # HWIO
        def _reorder_weights(*indices):
            conditionA = []
            conditionA.append(indices[3] == num_filter_chunk - 1)
            # zero-fill the tail positions of the last output-channel chunk
            conditionA.append(indices[4] >= num_filter_tail)
            conditionAT = tvm.tir.all(*conditionA)

            conditionO = []
            conditionO.append(conditionAT)
            conditionO.append(
                indices[2] >= in_channel_chunk * in_channel_block + in_channel_tail)
            conditionOT = tvm.tir.any(*conditionO)
            return tvm.tir.if_then_else(
                conditionOT, pad_value,
                Filter[indices[0], indices[1], indices[2],
                       indices[3] * num_filter_block + indices[4]])

        Filter = te.compute(
            [kernel_h, kernel_w, in_channel, num_filter_chunk, num_filter_block],
            _reorder_weights,
            name="filter_pack",
            tag="filter_pack_expanded",
        )

    # Can the output shape be divided by 2 or even 4?
    # If it cannot, extend it to make the later splits possible.
    # Theoretically the input should get additional padding for this, but that is
    # optimized away by cache_read InferBound. We must pad here to produce exactly the
    # tensor required for the original output size, and no more. Otherwise the
    # intermediate tensor might be allocated with smaller sizes while the compute tries
    # to fill the expanded one, resulting in a data discrepancy.
    # With textures it is not a problem to provide a texture of smaller size because:
    # 1. It does not matter which values feed the extra computations - they exist only
    #    to better fit the work onto the GPU work groups.
    # 2. When we request a pixel out of bounds, the texture handles it correctly; as
    #    noted above, the value itself does not matter.
    if out_height % 2 != 0:
        out_height += 1
    if out_width % 2 != 0:
        out_width += 1

    if out_height % 4 != 0:
        out_height += 2
    if out_width % 4 != 0:
        out_width += 2

    # compute graph
    pad_before = [0, pad_top, pad_left, 0, 0]
    pad_after = [0, pad_down, pad_right, 0, 0]

    # calculation of the real used input size
    # (NHWC layout: height is axis 1, width is axis 2):
    input_latest_w = (out_width_orig - 1) * stride_w + (kernel_w - 1) * dilation_w + 1
    input_latest_h = (out_height_orig - 1) * stride_h + (kernel_h - 1) * dilation_h + 1
    if input_latest_w < in_width + pad_before[2] + pad_after[2]:
        pad_after[2] -= in_width + pad_before[2] + pad_after[2] - input_latest_w
    if input_latest_h < in_height + pad_before[1] + pad_after[1]:
        pad_after[1] -= in_height + pad_before[1] + pad_after[1] - input_latest_h
    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")

    rcc = te.reduce_axis((0, in_channel_chunk), name="rcc")
    rcb = te.reduce_axis((0, in_channel_block), name="rcb")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    conv = te.compute(
        (batch, out_height, out_width, num_filter_chunk, num_filter_block),
        lambda nn, yy, xx, fc, fb: te.sum(
            (temp[nn, yy * stride_h + ry * dilation_h,
                  xx * stride_w + rx * dilation_w, rcc, rcb]
             * Filter[ry, rx, rcc * in_channel_block + rcb, fc, fb]
             ).astype(args["accumulator"]),
            axis=[ry, rx, rcc, rcb],
        ),
        tag="conv2d_nhwc",
    )

    dummy_cast = te.compute(
        (batch, out_height_orig, out_width_orig, num_filter_chunk, num_filter_block),
        lambda n, y, x, fc, fb: conv[n, y, x, fc, fb].astype(out_dtype),
        tag="dummy_cast")

    return te.compute((batch, out_height_orig, out_width_orig, out_channels),
                      lambda n, y, x, c: dummy_cast[n, y, x, c // 4, c % 4],
                      tag="cast_from_acc" + args["accumulator"][-2:])
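
# Small NumPy sketch (assumed toy sizes, illustrative only) of the channel packing in
# _reorder_data above: channels are split into chunks of 4 and the tail positions of
# the last chunk are zero-filled, which is the complement of the if_then_else condition.
def _demo_pack_channel_tail(channels=6, block=4):
    import numpy as np
    chunks = (channels + block - 1) // block
    x = np.arange(1, channels + 1, dtype="float32")  # stands in for one NHWC pixel
    packed = np.zeros((chunks, block), dtype="float32")
    for cc in range(chunks):
        for cb in range(block):
            c = cc * block + cb
            if c < channels:  # tail positions of the last chunk stay zero
                packed[cc, cb] = x[c]
    return packed  # -> [[1, 2, 3, 4], [5, 6, 0, 0]]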
def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dtype=None, args={}):
    """Convolution operator in NCHWc layout."""
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_channel_chunk, in_height, in_width, in_channel_block = Input.shape
    num_filter_chunk, channel, kernel_h, kernel_w, num_filter_block = Filter.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))

    out_height_orig = out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width_orig = out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    # Can the output shape be divided by 2 or even 4?
    # If it cannot, extend it to make the later splits possible.
    # Theoretically the input should get additional padding for this, but that is
    # optimized away by cache_read InferBound. We must pad here to produce exactly the
    # tensor required for the original output size, and no more. Otherwise the
    # intermediate tensor might be allocated with smaller sizes while the compute tries
    # to fill the expanded one, resulting in a data discrepancy.
    # With textures it is not a problem to provide a texture of smaller size because:
    # 1. It does not matter which values feed the extra computations - they exist only
    #    to better fit the work onto the GPU work groups.
    # 2. When we request a pixel out of bounds, the texture handles it correctly; as
    #    noted above, the value itself does not matter.
    if out_height % 2 != 0:
        out_height += 1
    if out_width % 2 != 0:
        out_width += 1

    if out_height % 4 != 0:
        out_height += 2
    if out_width % 4 != 0:
        out_width += 2

    # compute graph
    pad_before = [0, 0, pad_top, pad_left, 0]
    pad_after = [0, 0, pad_down, pad_right, 0]

    # calculation of the real used input size:
    input_latest_w = (out_width_orig - 1) * stride_w + (kernel_w - 1) * dilation_w + 1
    input_latest_h = (out_height_orig - 1) * stride_h + (kernel_h - 1) * dilation_h + 1
    if input_latest_w < in_width + pad_before[3] + pad_after[3]:
        pad_after[3] -= in_width + pad_before[3] + pad_after[3] - input_latest_w
    if input_latest_h < in_height + pad_before[2] + pad_after[2]:
        pad_after[2] -= in_height + pad_before[2] + pad_after[2] - input_latest_h
    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")

    rcc = te.reduce_axis((0, in_channel_chunk), name="rcc")
    rcb = te.reduce_axis((0, in_channel_block), name="rcb")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    # When tuning, insert a cache_read("texture") stage to properly test
    # performance of kernels that utilize texture inputs. The cache_read
    # is not needed when using the graph_runtime which supports passing
    # in external texture buffers. This can be removed once AutoTVM tuning
    # supports capturing this runtime information during task extraction
    # or once texture lowering in tir.TextureFlatten supports cache_read
    # cancellation when padding is utilized.
    if autotvm.GLOBAL_SCOPE.in_tuning:
        # NCHWc x KCRSk
        # texture: NCH|W|c
        # texture: K|CRS|k
        Filter_tx = te.compute(
            (num_filter_chunk, channel * kernel_h * kernel_w, num_filter_block),
            lambda ffc, crs, ffb: Filter[ffc, crs // (kernel_h * kernel_w),
                                         (crs // kernel_w) % kernel_h, crs % kernel_w, ffb],
            name="packed_filter")
        conv = te.compute(
            (batch, num_filter_chunk, out_height, out_width, num_filter_block),
            lambda nn, ffc, yy, xx, ffb: te.sum(
                (temp[nn, rcc, yy * stride_h + ry * dilation_h,
                      xx * stride_w + rx * dilation_w, rcb]
                 * Filter_tx[ffc,
                             ((rcc * in_channel_block + rcb) * kernel_h + ry) * kernel_w + rx,
                             ffb]).astype(args["accumulator"]),
                axis=[rcc, rcb, ry, rx],
            ),
            tag="conv2d_nchwc",
        )
    else:
        conv = te.compute(
            (batch, num_filter_chunk, out_height, out_width, num_filter_block),
            lambda nn, ffc, yy, xx, ffb: te.sum(
                (temp[nn, rcc, yy * stride_h + ry * dilation_h,
                      xx * stride_w + rx * dilation_w, rcb]
                 * Filter[ffc, rcc * in_channel_block + rcb, ry, rx, ffb]
                 ).astype(args["accumulator"]),
                axis=[rcc, rcb, ry, rx],
            ),
            tag="conv2d_nchwc",
        )
    return te.compute(
        (batch, num_filter_chunk, out_height_orig, out_width_orig, num_filter_block),
        lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype(out_dtype),
        tag="cast_from_acc" + args["accumulator"][-2:])
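
# Illustrative sanity sketch for the flattened "crs" index used by Filter_tx above
# (hypothetical bounds): the flattening in the conv body and the unpacking in the
# packed_filter lambda are inverses of each other.
def _demo_kcrsk_crs_roundtrip(in_channels=8, kernel_h=3, kernel_w=3):
    for c in range(in_channels):
        for r in range(kernel_h):
            for s in range(kernel_w):
                crs = (c * kernel_h + r) * kernel_w + s  # flattening used in the conv body
                assert crs // (kernel_h * kernel_w) == c
                assert (crs // kernel_w) % kernel_h == r
                assert crs % kernel_w == s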
def add_pad(
    data,
    layout,
    out_height,
    out_width,
    kernel_h,
    kernel_w,
    dilation_h,
    dilation_w,
    padding,
    stride_h,
    stride_w,
):
    """Computes the padding required by the conv2d parameters and adds a compute stage
    extending the original tensor.

    Parameters
    ----------
    data: tvm.te.Tensor
        5-D tensor; the layout of the spatial dimensions is defined by a separate argument

    layout: string
        Layout of the original 4-D tensor

    out_height: int
        Height of the output feature map

    out_width: int
        Width of the output feature map

    kernel_h: int
        Height of the conv2d kernel

    kernel_w: int
        Width of the conv2d kernel

    dilation_h: int
        Height dilation value from conv2d attributes

    dilation_w: int
        Width dilation value from conv2d attributes

    padding: list / tuple of n ints
        Padding values from conv2d attributes

    stride_h: int
        Height stride value from conv2d attributes

    stride_w: int
        Width stride value from conv2d attributes

    Returns
    -------
    Output : tvm.te.Tensor
        n-D, the same layout as Input.
    """
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))

    # compute graph
    if layout == "NCHW":
        y_axis = 2
        x_axis = 3
        if len(data.shape) == 4:
            _, _, in_height, in_width = data.shape
        else:
            _, _, in_height, in_width, _ = data.shape
    elif layout == "NHWC":
        y_axis = 1
        x_axis = 2
        if len(data.shape) == 4:
            _, in_height, in_width, _ = data.shape
        else:
            _, in_height, in_width, _, _ = data.shape
    else:
        assert False, "not supported layout in adreno util add_pad"
    pad_before = [0, 0, 0, 0, 0]
    pad_after = [0, 0, 0, 0, 0]
    pad_before[y_axis] = pad_top
    pad_before[x_axis] = pad_left
    pad_after[y_axis] = pad_down
    pad_after[x_axis] = pad_right

    # calculation of the real used input size:
    input_latest_w = (out_width - 1) * stride_w + (kernel_w - 1) * dilation_w + 1
    input_latest_h = (out_height - 1) * stride_h + (kernel_h - 1) * dilation_h + 1
    if input_latest_w < in_width + pad_before[x_axis] + pad_after[x_axis]:
        pad_after[x_axis] -= in_width + pad_before[x_axis] + pad_after[x_axis] - input_latest_w
    if input_latest_h < in_height + pad_before[y_axis] + pad_after[y_axis]:
        pad_after[y_axis] -= in_height + pad_before[y_axis] + pad_after[y_axis] - input_latest_h
    return nn.pad(data, pad_before, pad_after, name="pad_temp")
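
# Hypothetical usage sketch for add_pad (shapes and attributes are made-up examples):
# for a 3x3 stride-1 kernel with padding 1 the full pad is kept, so a 14x14 NCHW4c
# input becomes 16x16.
def _demo_add_pad_usage():
    from tvm import te
    data = te.placeholder((1, 4, 14, 14, 4), name="data")  # NCHW4c
    out_height = out_width = 14                            # (14 - 3 + 2) // 1 + 1
    padded = add_pad(data, "NCHW", out_height, out_width,
                     3, 3, 1, 1, (1, 1), 1, 1)
    return padded.shape                                    # -> (1, 4, 16, 16, 4)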
def expand_spatial_dimensions(in_height, in_width, kernel_h, kernel_w,
                              dilation_h, dilation_w, padding, stride_h, stride_w):
    """
    Expands the spatial dimensions to be divisible by 4, which enables much better
    parallel computation on the GPU. The drawback of this solution is a number of
    useless computations, but in practice the speed-up from the added parallelism
    significantly outweighs the slowdown of the extra compute, so this is a useful
    approach, at least for GPUs.

    Parameters
    ----------
    in_height: int
        Height of the feature map

    in_width: int
        Width of the feature map

    kernel_h: int
        Height of the conv2d kernel

    kernel_w: int
        Width of the conv2d kernel

    dilation_h: int
        Vertical dilation of the conv2d kernel

    dilation_w: int
        Horizontal dilation of the conv2d kernel

    padding: tuple or list
        Conv2d paddings

    stride_h: int
        Vertical stride of the conv2d kernel

    stride_w: int
        Horizontal stride of the conv2d kernel
    """
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))

    out_height_orig = out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width_orig = out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    # Can the output shape be divided by 2 or even 4?
    # If it cannot, extend it to make the later splits possible.
    # Theoretically the input should get additional padding for this, but that is
    # optimized away by cache_read InferBound. We must pad here to produce exactly the
    # tensor required for the original output size, and no more. Otherwise the
    # intermediate tensor might be allocated with smaller sizes while the compute tries
    # to fill the expanded one, resulting in a data discrepancy.
    # With textures it is not a problem to provide a texture of smaller size because:
    # 1. It does not matter which values feed the extra computations - they exist only
    #    to better fit the work onto the GPU work groups.
    # 2. When we request a pixel out of bounds, the texture handles it correctly; as
    #    noted above, the value itself does not matter.
    if out_height % 2 != 0:
        out_height += 1
    if out_width % 2 != 0:
        out_width += 1

    if out_height % 4 != 0:
        out_height += 2
    if out_width % 4 != 0:
        out_width += 2
    return out_height_orig, out_height, out_width_orig, out_width
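
# Pure-Python replica of the rounding above (illustrative only): any output size is
# rounded up to the next multiple of 4, e.g. 5 -> 8, 7 -> 8, while 8 stays 8.
def _demo_round_out(out):
    if out % 2 != 0:
        out += 1
    if out % 4 != 0:
        out += 2
    return out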
def compute_conv2d_NCHWc_tpack(Input, Filter, stride, padding, dilation, out_dtype=None, args={}):
    """Convolution operator in NCHWc layout."""
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_channels, in_height, in_width = Input.shape
    out_channels, _, kernel_h, kernel_w = Filter.shape

    in_channel_tail = in_channels % 4
    in_channel_chunk = in_channels // 4
    if in_channel_tail == 0:
        in_channel_tail = 4
    else:
        in_channel_chunk += 1

    num_filter_block = out_channels % 4
    num_filter_chunk = out_channels // 4
    if num_filter_block == 0:
        num_filter_block = 4
    else:
        num_filter_chunk += 1

    pad_value = tvm.tir.const(0, Input.dtype)

    def _reorder_data(*indices):
        condition = []
        condition.append(indices[1] == in_channel_chunk - 1)
        condition.append(indices[4] >= in_channel_tail)
        condition = tvm.tir.all(*condition)
        return tvm.tir.if_then_else(
            condition, pad_value,
            Input[indices[0], indices[1] * 4 + indices[4], indices[2], indices[3]])

    # compute:
    reordered_data = te.compute(
        [batch, in_channel_chunk, in_height, in_width, 4],
        _reorder_data,
        name="input_pack",
        tag="input_pack",
    )

    def _reorder_weights(*indices):
        conditionA = []
        conditionA.append(indices[0] == num_filter_chunk - 1)
        conditionA.append(indices[4] >= num_filter_block)
        conditionAT = tvm.tir.all(*conditionA)

        conditionO = []
        conditionO.append(conditionAT)
        conditionO.append(indices[1] >= in_channel_chunk * 4 + in_channel_tail)
        conditionOT = tvm.tir.any(*conditionO)
        return tvm.tir.if_then_else(
            conditionOT, pad_value,
            Filter[indices[0] * 4 + indices[4], indices[1], indices[2], indices[3]])

    reordered_filter = te.compute(
        [num_filter_chunk, in_channel_chunk * 4, kernel_h, kernel_w, 4],
        _reorder_weights,
        name="filter_pack",
        tag="filter_pack",
    )

    # batch, in_channel_chunk, in_height, in_width, in_channel_block = Input.shape
    # num_filter_chunk, channel, kernel_h, kernel_w, num_filter_block = Filter.shape
    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w)
    )
    out_height_orig = out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width_orig = out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    # Can the output shape be divided by 2 or even 4?
    # If it cannot, extend it to make the later splits possible.
    # Theoretically the input should get additional padding for this, but that is
    # optimized away by cache_read InferBound. We must pad here to produce exactly the
    # tensor required for the original output size, and no more. Otherwise the
    # intermediate tensor might be allocated with smaller sizes while the compute tries
    # to fill the expanded one, resulting in a data discrepancy.
    # With textures it is not a problem to provide a texture of smaller size because:
    # 1. It does not matter which values feed the extra computations - they exist only
    #    to better fit the work onto the GPU work groups.
    # 2. When we request a pixel out of bounds, the texture handles it correctly; as
    #    noted above, the value itself does not matter.
    if out_height % 2 != 0:
        out_height += 1
    if out_width % 2 != 0:
        out_width += 1

    if out_height % 4 != 0:
        out_height += 2
    if out_width % 4 != 0:
        out_width += 2

    # compute graph
    pad_before = [0, 0, pad_top, pad_left, 0]
    pad_after = [0, 0, pad_down, pad_right, 0]

    # calculation of the real used input size:
    input_latest_w = (out_width_orig - 1) * stride_w + (kernel_w - 1) * dilation_w + 1
    input_latest_h = (out_height_orig - 1) * stride_h + (kernel_h - 1) * dilation_h + 1
    if input_latest_w < in_width + pad_before[3] + pad_after[3]:
        pad_after[3] -= in_width + pad_before[3] + pad_after[3] - input_latest_w
    if input_latest_h < in_height + pad_before[2] + pad_after[2]:
        pad_after[2] -= in_height + pad_before[2] + pad_after[2] - input_latest_h

    temp = nn.pad(reordered_data, pad_before, pad_after, name="pad_temp")

    rcc = te.reduce_axis((0, in_channel_chunk), name="rcc")
    rcb = te.reduce_axis((0, 4), name="rcb")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    conv = te.compute(
        (batch, num_filter_chunk, out_height, out_width, 4),
        lambda nn, ffc, yy, xx, ffb: te.sum(
            (temp[nn, rcc, yy * stride_h + ry * dilation_h,
                  xx * stride_w + rx * dilation_w, rcb]
             * reordered_filter[ffc, rcc * 4 + rcb, ry, rx, ffb]
             ).astype(args["accumulator"]),
            axis=[rcc, rcb, ry, rx],
        ),
        tag="conv2d_nchwc_tpack",
    )

    # conv = s.cache_write(conv, "local") does not work properly: it does not create an
    # intermediate buffer, keeps reading from / writing to the global tensor as the
    # accumulator, and leads to a crash at runtime.
    # For this reason we have to use such a dummy cast plus compute_at to create the
    # intermediate accumulator with local scope.
    dummy_cast = te.compute(
        (batch, num_filter_chunk, out_height_orig, out_width_orig, 4),
        lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype(out_dtype),
        tag="dummy_cast")

    return te.compute((batch, out_channels, out_height_orig, out_width_orig),
                      lambda n, c, y, x: dummy_cast[n, c // 4, y, x, c % 4],
                      tag="cast_from_acc" + args["accumulator"][-2:])
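
# NumPy sketch of the final NCHW4c -> NCHW unpacking above (toy sizes, illustrative
# only): reading channel c from chunk c // 4 at block offset c % 4 recovers the
# original channel order and drops the zero-filled tail of the last chunk.
def _demo_unpack_nchw4c(channels=6, block=4):
    import numpy as np
    chunks = (channels + block - 1) // block
    packed = np.arange(chunks * block, dtype="float32").reshape(chunks, block)
    # -> [0, 1, 2, 3, 4, 5]; entries 6 and 7 of the padded layout are dropped
    return np.array([packed[c // block, c % block] for c in range(channels)])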
def conv2d_winograd_comp(
    cfg, data, kernel, strides, padding, dilation, out_dtype, args, pre_computed, layout
):
    """Compute declaration for winograd

    Parameters
    ----------
    cfg: ConfigEntity
        The config for this template

    data: tvm.te.Tensor
        4-D or 5-D Data tensor with shape NCHW or NCHW4c

    kernel: tvm.te.Tensor
        4-D or 5-D tensor with shape OIHW or OIHW4o

    strides: int or a list/tuple of two ints
        stride size, or [stride_height, stride_width]

    padding: int or a list/tuple of 2 or 4 ints
        padding size, or
        [pad_height, pad_width] for 2 ints, or
        [pad_top, pad_left, pad_bottom, pad_right] for 4 ints

    dilation: int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    out_dtype: str
        The output type. This is used for mixed precision.

    args: dict
        Dictionary with additional arguments, e.g. accumulator type

    pre_computed: bool
        Flag indicating whether the weights are already transformed; if False,
        the weight transform is computed here

    layout: str
        NHWC or NCHW values are accepted

    Returns
    -------
    output: tvm.te.Tensor
        4-D or 5-D with shape NCHW or NCHW4c
    """
    assert layout in ("NCHW", "NHWC")
    tile_size = infer_tile_size(data, layout)
    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation
    HSTR, WSTR = (strides, strides) if isinstance(strides, int) else strides

    convert_from4d = False
    if len(data.shape) == 4:
        convert_from4d = True
        if layout == "NCHW":
            N, DCI, H, W = get_const_tuple(data.shape)
        else:
            N, H, W, DCI = get_const_tuple(data.shape)
        if not pre_computed:
            if layout == "NCHW":
                out_channels, CI, KH, KW = get_const_tuple(kernel.shape)
            else:
                KH, KW, CI, out_channels = get_const_tuple(kernel.shape)
        else:
            alpha, _, CI, out_channels = get_const_tuple(kernel.shape)
            KH = KW = alpha + 1 - tile_size

        in_channel_chunks, in_channel_block, in_channel_tail = split_to_chunks(CI, 4)
        out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channels, 4)
        if autotvm.GLOBAL_SCOPE.in_tuning is True:
            if layout == "NCHW":
                dshape = (N, in_channel_chunks, H, W, in_channel_block)
            else:
                dshape = (N, H, W, in_channel_chunks, in_channel_block)
            if not pre_computed:  # kernel tensor is raw tensor, do strict check
                if layout == "NCHW":
                    kshape = (out_channel_chunks, CI, KH, KW, out_channel_block)
                else:
                    kshape = (KH, KW, CI, out_channel_chunks, out_channel_block)
            else:
                kshape = (alpha, alpha, CI, out_channel_chunks, out_channel_block)
            data = tvm.te.placeholder(dshape, data.dtype, name="data_placeholder")
            kernel = tvm.te.placeholder(kshape, kernel.dtype, name="kernel_placeholder")
        else:
            data = pack_input(
                data, layout, N, in_channel_chunks, in_channel_block, in_channel_tail, H, W
            )
            kernel_layout = "OIHW" if layout == "NCHW" else "HWIO"
            if not pre_computed:  # kernel tensor is raw tensor, do strict check
                kernel = pack_filter(
                    kernel,
                    kernel_layout,
                    out_channel_chunks,
                    out_channel_block,
                    out_channel_tail,
                    CI,
                    in_channel_chunks,
                    in_channel_block,
                    in_channel_tail,
                    KH,
                    KW,
                )
            else:
                kernel = pack_filter(
                    kernel,
                    "HWIO",
                    out_channel_chunks,
                    out_channel_block,
                    out_channel_tail,
                    CI,
                    in_channel_chunks,
                    in_channel_block,
                    in_channel_tail,
                    alpha,
                    alpha,
                )
    if layout == "NCHW":
        N, DCI, H, W, CB = get_const_tuple(data.shape)
    else:
        N, H, W, DCI, CB = get_const_tuple(data.shape)
    if not pre_computed:  # kernel tensor is raw tensor, do strict check
        if layout == "NCHW":
            CO, CI, KH, KW, COB = get_const_tuple(kernel.shape)
        else:
            KH, KW, CI, CO, COB = get_const_tuple(kernel.shape)
        alpha = KW + tile_size - 1
        assert HSTR == 1 and WSTR == 1 and KH == KW
    else:
        alpha, _, CI, CO, COB = get_const_tuple(kernel.shape)
        KH = KW = alpha + 1 - tile_size
        assert HSTR == 1 and WSTR == 1 and dilation_h == 1 and dilation_w == 1

    if isinstance(N, tvm.tir.Any):
        N = tvm.te.size_var("n")

    if not isinstance(H, int) or not isinstance(W, int):
        raise RuntimeError(
            "adreno winograd conv2d doesn't support dynamic input height or width."
        )

    pt, pl, pb, pr = nn.get_pad_tuple(padding, (KH, KW))
    if layout == "NCHW":
        data_pad = nn.pad(data, (0, 0, pt, pl, 0), (0, 0, pb, pr, 0), name="data_pad")
    else:
        data_pad = nn.pad(data, (0, pt, pl, 0, 0), (0, pb, pr, 0, 0), name="data_pad")

    r = KW
    m = tile_size
    A, B, G = winograd_transform_matrices(m, r, out_dtype)

    H = (H + pt + pb - KH) // HSTR + 1
    W = (W + pl + pr - KW) // WSTR + 1
    nH, nW = (H + m - 1) // m, (W + m - 1) // m
    P = N * nH * nW if isinstance(N, int) else nH * nW

    # transform kernel
    if not pre_computed:
        r_kh = te.reduce_axis((0, KH), name="r_kh")
        r_kw = te.reduce_axis((0, KW), name="r_kw")
        if layout == "NCHW":
            kernel_pack = te.compute(
                (alpha, alpha, CI, CO, COB),
                lambda eps, nu, ci, co, cob: te.sum(
                    kernel[co][ci][r_kh][r_kw][cob] * G[eps][r_kh] * G[nu][r_kw],
                    axis=[r_kh, r_kw]
                ),
                name="kernel_pack",
            )
        else:
            kernel_pack = te.compute(
                (alpha, alpha, CI, CO, COB),
                lambda eps, nu, ci, co, cob: te.sum(
                    kernel[r_kh][r_kw][ci][co][cob] * G[eps][r_kh] * G[nu][r_kw],
                    axis=[r_kh, r_kw]
                ),
                name="kernel_pack",
            )
    else:
        kernel_pack = kernel

    idxdiv = tvm.tir.indexdiv
    idxmod = tvm.tir.indexmod

    if layout == "NCHW":
        N, CI, _, _, CB = get_const_tuple(data.shape)
    else:
        N, _, _, CI, CB = get_const_tuple(data.shape)

    # pack input tile
    if layout == "NCHW":
        input_tile = te.compute(
            (alpha, alpha, CI, P, CB),
            lambda eps, nu, c, p, cb: data_pad[idxdiv(p, (nH * nW))][c][
                idxmod(idxdiv(p, nW), nH) * m + eps
            ][idxmod(p, nW) * m + nu][cb],
            name="d",
        )
    else:
        input_tile = te.compute(
            (alpha, alpha, CI, P, CB),
            lambda eps, nu, c, p, cb: data_pad[idxdiv(p, (nH * nW))][
                idxmod(idxdiv(p, nW), nH) * m + eps
            ][idxmod(p, nW) * m + nu][c][cb],
            name="d",
        )

    # transform data
    r_a = te.reduce_axis((0, alpha), "r_a")
    r_b = te.reduce_axis((0, alpha), "r_b")
    data_pack = te.compute(
        (P, CI, alpha, alpha, CB),
        lambda p, ci, eps, nu, cb: te.sum(
            input_tile[r_a][r_b][ci][p][cb] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b]
        ),
        name="data_pack",
    )

    # repack transformed data
    data_pack_trans = te.compute(
        (alpha, alpha, CI, P, CB),
        lambda eps, nu, c, p, cb: data_pack[p][c][eps][nu][cb],
        name="data_pack_trans",
    )

    # do batch gemm
    ci = te.reduce_axis((0, CI), name="ci")
    cb = te.reduce_axis((0, CB), name="cb")
    bgemm = te.compute(
        (alpha, alpha, CO, P, COB),
        lambda eps, nu, co, p, cob: te.sum(
            (
                kernel_pack[eps][nu][ci * CB + cb][co][cob]
                * data_pack_trans[eps][nu][ci][p][cb]
            ).astype(args["accumulator"]),
            axis=[ci, cb],
        ),
        name="bgemm",
    )

    # inverse transform
    r_a = te.reduce_axis((0, alpha), "r_a")
    r_b = te.reduce_axis((0, alpha), "r_b")
    inverse = te.compute(
        (CO, P, m, m, COB),
        lambda co, p, vh, vw, cob: te.sum(
            bgemm[r_a][r_b][co][p][cob] * (A[r_a][vh] * A[r_b][vw]).astype(args["accumulator"]),
            axis=[r_a, r_b],
        ),
        name="inverse",
    )

    # output
    if layout == "NCHW":
        if convert_from4d and autotvm.GLOBAL_SCOPE.in_tuning is False:
            output = te.compute(
                (N, out_channels, H, W),
                lambda n, c, h, w: inverse[c // CB][n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m)][
                    idxmod(h, m)
                ][idxmod(w, m)][c % CB].astype(out_dtype),
                name="output",
                tag="cast_from_acc" + args["accumulator"][-2:],
            )
        else:
            output = te.compute(
                (N, CO, H, W, COB),
                lambda n, co, h, w, cob: inverse[co][
                    n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m)
                ][idxmod(h, m)][idxmod(w, m)][cob].astype(out_dtype),
                name="output",
                tag="cast_from_acc" + args["accumulator"][-2:],
            )
    else:
        if convert_from4d and autotvm.GLOBAL_SCOPE.in_tuning is False:
            output = te.compute(
                (N, H, W, out_channels),
                lambda n, h, w, c: inverse[c // CB][n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m)][
                    idxmod(h, m)
                ][idxmod(w, m)][c % CB].astype(out_dtype),
                name="output",
                tag="cast_from_acc" + args["accumulator"][-2:],
            )
        else:
            output = te.compute(
                (N, H, W, CO, COB),
                lambda n, h, w, co, cob: inverse[co][
                    n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m)
                ][idxmod(h, m)][idxmod(w, m)][cob].astype(out_dtype),
                name="output",
                tag="cast_from_acc" + args["accumulator"][-2:],
            )

    if isinstance(N, int):
        cfg.add_flop(2 * N * CO * COB * H * W * CI * CB * KH * KW)

    return output
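
# Pure-Python sketch of the winograd tiling arithmetic above (hypothetical sizes):
# for F(m x m, r x r) the transformed tile is alpha x alpha with alpha = m + r - 1,
# and the batched GEMM runs over P = N * nH * nW input tiles.
def _demo_winograd_tiles(N=1, H=14, W=14, m=4, r=3):
    alpha = m + r - 1                            # 6x6 transformed tile
    nH, nW = (H + m - 1) // m, (W + m - 1) // m  # output tiles per spatial axis
    P = N * nH * nW                              # tiles in the batched GEMM
    return alpha, nH, nW, P                      # -> (6, 4, 4, 16)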
def compute_conv2d_NCHWc_KCRSk_acc32(Input, Filter, stride, padding, dilation, out_dtype=None):
    """Convolution operator in NCHWc layout."""
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_channel_chunk, in_height, in_width, in_channel_block = Input.shape
    num_filter_chunk, channel, kernel_h, kernel_w, num_filter_block = Filter.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
    # compute graph
    pad_before = [0, 0, pad_top, pad_left, 0]
    pad_after = [0, 0, pad_down, pad_right, 0]
    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")

    rcc = te.reduce_axis((0, in_channel_chunk), name="rcc")
    rcb = te.reduce_axis((0, in_channel_block), name="rcb")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    # NCHWc x KCRSk
    # texture: NCH|W|c
    # texture: K|CRS|k
    # c = crs // RS
    # rs = crs % RS
    # r = rs // S == (crs // S) % R
    # s = rs % S == crs % S
    Filter = te.compute(
        (num_filter_chunk, channel * kernel_h * kernel_w, num_filter_block),
        lambda ffc, crs, ffb: Filter[ffc, crs // (kernel_h * kernel_w),
                                     (crs // kernel_w) % kernel_h, crs % kernel_w, ffb],
        name="packed_filter",
    )
    conv = te.compute(
        (batch, num_filter_chunk, out_height, out_width, num_filter_block),
        lambda nn, ffc, yy, xx, ffb: te.sum(
            (temp[nn, rcc, yy * stride_h + ry * dilation_h,
                  xx * stride_w + rx * dilation_w, rcb]
             * Filter[ffc,
                      ((rcc * in_channel_block + rcb) * kernel_h + ry) * kernel_w + rx,
                      ffb]).astype(out_dtype),
            axis=[rcc, rcb, ry, rx],
        ),
        tag="conv2d_nchwc_kcrsk_texture",
    )
    output = te.compute(
        conv.shape,
        lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype("float32"))
    return output
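
# Hypothetical construction sketch for the acc32 operator above (placeholder shapes are
# made up): 32 input channels packed as NCHW4c with a 3x3 KCRS4k filter, stride 1,
# padding 1; the resulting stages are packed_filter -> conv -> float32 cast.
def _demo_build_acc32():
    from tvm import te
    data = te.placeholder((1, 8, 16, 16, 4), name="data", dtype="float16")  # NCHW4c
    filt = te.placeholder((8, 32, 3, 3, 4), name="filt", dtype="float16")   # KCRS4k
    out = compute_conv2d_NCHWc_KCRSk_acc32(data, filt, 1, 1, 1)
    return out.shape                                                        # -> (1, 8, 16, 16, 4)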