# Assumed imports (not in the original snippets). Module paths follow the
# pre-0.7 standalone `topi` package; note the snippets below mix the pre-0.7
# top-level API (`tvm.compute`) with the 0.7-era `te` API (`te.compute`), so
# individual functions target different TVM versions.
import tvm
from tvm import te
import topi
from topi.nn.pad import pad
from topi.nn.dilate import dilate
from topi.nn.util import get_pad_tuple, get_pad_tuple3d
from topi.util import simplify, get_const_tuple


def group_conv3d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtype=None):
    """Group 3D convolution forward operator in NCDHW layout."""
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 3
    assert isinstance(dilation, int) or len(dilation) == 3

    if isinstance(stride, int):
        stride_z = stride_h = stride_w = stride
    else:
        stride_z, stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_z = dilation_h = dilation_w = dilation
    else:
        dilation_z, dilation_h, dilation_w = dilation

    batch, in_channel, in_z, in_height, in_width = get_const_tuple(Input.shape)
    num_filter, _, kernel_z, kernel_h, kernel_w = get_const_tuple(Filter.shape)

    assert in_channel % groups == 0, "input channels must be divisible by groups"
    assert num_filter % groups == 0, "output channels must be divisible by groups"

    pad_front, pad_top, pad_left, pad_back, pad_down, pad_right = get_pad_tuple3d(
        padding, (kernel_z, kernel_h, kernel_w))

    # compute the output shape
    out_channel = num_filter
    out_z = simplify(
        (in_z - (kernel_z - 1) * dilation_z - 1 + pad_front + pad_back) // stride_z + 1)
    out_height = simplify(
        (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1)

    # compute graph
    pad_before = [0, 0, pad_front, pad_top, pad_left]
    pad_after = [0, 0, pad_back, pad_down, pad_right]
    temp = pad(Input, pad_before, pad_after, name="pad_temp")
    rc = tvm.reduce_axis((0, in_channel // groups), name='rc')
    rz = tvm.reduce_axis((0, kernel_z), name='rz')
    ry = tvm.reduce_axis((0, kernel_h), name='ry')
    rx = tvm.reduce_axis((0, kernel_w), name='rx')
    return tvm.compute(
        (batch, out_channel, out_z, out_height, out_width),
        lambda nn, ff, zz, yy, xx: tvm.sum(
            temp[nn,
                 ff // (num_filter // groups) * (in_channel // groups) + rc,
                 zz * stride_z + rz * dilation_z,
                 yy * stride_h + ry * dilation_h,
                 xx * stride_w + rx * dilation_w].astype(out_dtype) *
            Filter[ff, rc, rz, ry, rx].astype(out_dtype),
            axis=[rc, rz, ry, rx]),
        tag='group_conv3d_nchw')
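# Usage sketch for group_conv3d_nchw (hedged: shapes are illustrative and the
# pre-0.7 `tvm.placeholder`/`tvm.create_schedule` API is assumed, matching the
# `tvm.compute` calls above). Two groups over 8 input / 16 output channels, so
# the filter's second dim is in_channel // groups = 4.
def _example_group_conv3d_nchw():
    data = tvm.placeholder((1, 8, 16, 16, 16), name="data")    # NCDHW
    weight = tvm.placeholder((16, 4, 3, 3, 3), name="weight")  # [out, in/groups, D, H, W]
    out = group_conv3d_nchw(data, weight, stride=1, padding=1, dilation=1, groups=2)
    s = tvm.create_schedule(out.op)
    return tvm.lower(s, [data, weight, out], simple_mode=True)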
def dilation2d_nhwc(input, filter, stride, padding, dilations, out_dtype=None):
    """Morphological 2d dilation NHWC layout.

    Parameters
    ----------
    input : tvm.Tensor
        4-D with shape [batch, in_height, in_width, in_channel]

    filter : tvm.Tensor
        3-D with shape [filter_height, filter_width, in_channel]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    dilations : int or a list/tuple of two ints
        Dilation size, or [dilation_height, dilation_width]

    out_dtype : Optional[str]
        Specifies the output data type.

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, out_height, out_width, in_channel]
    """
    if out_dtype is None:
        out_dtype = input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilations, int) or len(dilations) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilations, int):
        dilation_h = dilation_w = dilations
    else:
        dilation_h, dilation_w = dilations

    batch, in_height, in_width, in_channel = input.shape
    kernel_h, kernel_w, channel = filter.shape
    assert in_channel.value == channel.value, \
        "For Dilation2D input and filter channels should be same."

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))

    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
    pad_before = [0, pad_top, pad_left, 0]
    pad_after = [0, pad_down, pad_right, 0]
    padded_input = pad(input, pad_before, pad_after, name="padded_input")

    ry = te.reduce_axis((0, kernel_h), name='ry')
    rx = te.reduce_axis((0, kernel_w), name='rx')

    return te.compute(
        (batch, out_height, out_width, in_channel),
        lambda nn, yy, xx, ff: te.max(
            padded_input[nn,
                         yy * stride_h + ry * dilation_h,
                         xx * stride_w + rx * dilation_w,
                         ff].astype(out_dtype) +
            filter[ry, rx, ff].astype(out_dtype),
            axis=[ry, rx]),
        tag="dilation2d_nhwc")
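# Usage sketch for dilation2d_nhwc (hedged: illustrative shapes, TVM 0.7-era
# `te` API assumed). Note the 3-D filter: one [H, W] window per channel, and
# the reduction is a max over window offsets rather than a sum.
def _example_dilation2d_nhwc():
    data = te.placeholder((1, 32, 32, 3), name="data")  # NHWC
    filt = te.placeholder((3, 3, 3), name="filt")       # [filter_h, filter_w, channel]
    out = dilation2d_nhwc(data, filt, stride=1, padding=1, dilations=1)
    s = te.create_schedule(out.op)
    return tvm.lower(s, [data, filt, out], simple_mode=True)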
def _depthwise_conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None):
    """Depthwise convolution nchw forward operator.

    Parameters
    ----------
    Input : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]

    Filter : tvm.Tensor
        4-D with shape [in_channel, channel_multiplier, filter_height, filter_width]

    stride : tuple of two ints
        The spatial stride along height and width

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    dilation : int or a list/tuple of two ints
        Dilation size, or [dilation_height, dilation_width]

    out_dtype : str, optional
        Output data type

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    out_dtype = Input.dtype if out_dtype is None else out_dtype

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_channel, in_height, in_width = Input.shape
    # shape of dilated kernel
    filter_channel, channel_multiplier, filter_height, filter_width = Filter.shape

    dilated_kernel_h = (filter_height - 1) * dilation_h + 1
    dilated_kernel_w = (filter_width - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))

    out_channel = simplify(in_channel * channel_multiplier)
    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    # padding stage
    pad_before = [0, 0, pad_top, pad_left]
    pad_after = [0, 0, pad_down, pad_right]
    PaddedInput = topi.nn.pad(Input, pad_before, pad_after, name="PaddedInput")

    # depthconv stage: use index div/mod rather than plain `/`, which would
    # produce a float (or deprecated) division on index expressions
    idxdiv = tvm.te.indexdiv
    idxmod = tvm.te.indexmod
    di = tvm.te.reduce_axis((0, filter_height), name='di')
    dj = tvm.te.reduce_axis((0, filter_width), name='dj')
    Output = tvm.te.compute(
        (batch, out_channel, out_height, out_width),
        lambda b, c, i, j: tvm.te.sum(
            (PaddedInput[b, idxdiv(c, channel_multiplier),
                         i * stride_h + di * dilation_h,
                         j * stride_w + dj * dilation_w].astype(out_dtype) *
             Filter[idxdiv(c, channel_multiplier),
                    idxmod(c, channel_multiplier), di, dj].astype(out_dtype)),
            axis=[di, dj]),
        name='DepthwiseConv2d', tag="depthwise_conv2d_nchw")
    return Output
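# Usage sketch for _depthwise_conv2d_nchw (hedged: illustrative shapes). With
# channel_multiplier = 2 the output has in_channel * 2 = 8 channels.
def _example_depthwise_conv2d_nchw():
    data = tvm.te.placeholder((1, 4, 16, 16), name="data")  # NCHW
    filt = tvm.te.placeholder((4, 2, 3, 3), name="filt")    # [in_c, multiplier, H, W]
    out = _depthwise_conv2d_nchw(data, filt, stride=(1, 1), padding='SAME', dilation=1)
    s = tvm.te.create_schedule(out.op)
    return tvm.lower(s, [data, filt, out], simple_mode=True)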
def conv2d_direct_simd_compute(cfg, data, kernel, strides, padding, dilation, out_dtype):
    """Compute function for Cortex-M7 SIMD implementation of conv2d."""
    assert isinstance(strides, int) or len(strides) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(strides, int):
        stride_h = stride_w = strides
    else:
        stride_h, stride_w = strides

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch_size, in_height, in_width, in_channels = data.shape
    kernel_h, kernel_w, out_channels, _ = kernel.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    pad_before = [0, pad_top, pad_left, 0]
    pad_after = [0, pad_down, pad_right, 0]
    padded_data = pad(data, pad_before, pad_after, name='padded_data')

    rc = te.reduce_axis((0, in_channels), name='rc')
    ry = te.reduce_axis((0, kernel_h), name='ry')
    rx = te.reduce_axis((0, kernel_w), name='rx')

    conv = te.compute(
        (batch_size, out_height, out_width, out_channels),
        lambda nn, yy, xx, ff: te.sum(
            padded_data[nn,
                        yy * stride_h + ry * dilation_h,
                        xx * stride_w + rx * dilation_w,
                        rc].astype(out_dtype) *
            kernel[ry, rx, ff, rc].astype(out_dtype),
            axis=[ry, rx, rc]),
        name='conv2d', tag='conv2d_nhwc')

    ###########################
    # Config Space Definition #
    ###########################
    n, oh, ow, co = (cfg.axis(batch_size.value),
                     cfg.axis(out_height.value),
                     cfg.axis(out_width.value),
                     cfg.axis(out_channels.value))
    kh, kw, ci = (cfg.reduce_axis(kernel_h.value),
                  cfg.reduce_axis(kernel_w.value),
                  cfg.reduce_axis(in_channels.value))

    assert in_channels.value % 4 == 0
    owo, owi = cfg.define_split('tile_ow', ow, policy='factors', num_outputs=2)
    cio, cii = cfg.define_split('tile_ci', ci, policy='factors', num_outputs=2,
                                filter=lambda x: x.size[-1] % 4 == 0)
    coo, coi = cfg.define_split('tile_co', co, policy='factors', num_outputs=2)

    cfg.define_reorder('reorder_0_simd',
                       [n, oh, owo, owi, coo, coi, kh, kw, cio, cii],
                       policy='candidate', candidate=[
                           [n, oh, kh, kw, owo, coo, cio, owi, coi, cii],
                           [n, oh, kh, kw, coo, owo, cio, owi, coi, cii],
                           [n, kh, kw, oh, owo, coo, cio, owi, coi, cii],
                           [n, kh, kw, oh, coo, owo, cio, owi, coi, cii]])

    cfg.define_knob('auto_unroll_max_step', [0, 2, 4, 8, 16, 32])
    cfg.define_knob('unroll_explicit', [0, 1])

    return conv
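# Usage sketch for conv2d_direct_simd_compute (hedged: outside a real AutoTVM
# task, a FallbackConfigEntity stands in for `cfg`; shapes are illustrative
# and keep in_channels a multiple of 4 as the assert above requires).
def _example_conv2d_direct_simd():
    from tvm.autotvm.task.space import FallbackConfigEntity
    cfg = FallbackConfigEntity()
    data = te.placeholder((1, 16, 16, 8), name="data", dtype="int8")     # NHWC
    kernel = te.placeholder((3, 3, 16, 8), name="kernel", dtype="int8")  # HWOI
    return conv2d_direct_simd_compute(cfg, data, kernel, strides=1,
                                      padding=1, dilation=1, out_dtype="int32")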
def depth_1by1_fused(Input, Filter_d, Filter_1, stride_d, padding_d='SAME',
                     dilation_d=1, out_dtype=None, layout="NCHW"):
    """Fused depthwise convolution + 1x1 convolution forward operator (NCHW & NHWC).

    Parameters
    ----------
    Input : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width] (NCHW)
        or [batch, in_height, in_width, in_channel] (NHWC)

    Filter_d : tvm.Tensor
        4-D with shape [in_channel, channel_multiplier, filter_height, filter_width]
        or [filter_height, filter_width, in_channel, channel_multiplier]

    Filter_1 : tvm.Tensor
        4-D with shape [out_channel, in_channel * channel_multiplier, 1, 1]
        or [1, 1, in_channel * channel_multiplier, out_channel]

    stride_d : tuple of two ints
        The spatial stride along height and width

    padding_d : int or str
        Padding size, or ['VALID', 'SAME']

    dilation_d : int or a list/tuple of two ints
        Dilation size, or [dilation_height, dilation_width]

    out_dtype : str, optional
        Output data type

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width] (NCHW)
        or [batch, out_height, out_width, out_channel] (NHWC)
    """
    assert layout in ["NCHW", "NHWC"]
    out_dtype = Input.dtype if out_dtype is None else out_dtype

    if isinstance(stride_d, int):
        stride_h_d = stride_w_d = stride_d
    else:
        stride_h_d, stride_w_d = stride_d

    if isinstance(dilation_d, int):
        dilation_h_d = dilation_w_d = dilation_d
    else:
        dilation_h_d, dilation_w_d = dilation_d

    if layout == "NCHW":
        if dilation_h_d != 1 or dilation_w_d != 1:
            Filter_d = dilate(Filter_d, (1, 1, dilation_h_d, dilation_w_d))
        batch, in_channel_d, in_height_d, in_width_d = Input.shape
        filter_channel, _, filter_height, filter_width = Filter_d.shape
        num_filter, channel, _, _ = Filter_1.shape
    else:  # NHWC
        if dilation_h_d != 1 or dilation_w_d != 1:
            Filter_d = dilate(Filter_d, (dilation_h_d, dilation_w_d, 1, 1))
        batch, in_height_d, in_width_d, in_channel_d = Input.shape
        filter_height, filter_width, filter_channel, _ = Filter_d.shape
        _, _, channel, num_filter = Filter_1.shape  # HWIO: reduce dim comes third

    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding_d, (filter_height, filter_width))

    out_height = simplify((in_height_d - filter_height + pad_top + pad_down) // stride_h_d + 1)
    out_width = simplify((in_width_d - filter_width + pad_left + pad_right) // stride_w_d + 1)
    out_channel = num_filter

    # padding stage
    if layout == "NCHW":
        pad_before = [0, 0, pad_top, pad_left]
        pad_after = [0, 0, pad_down, pad_right]
    else:  # NHWC
        pad_before = [0, pad_top, pad_left, 0]
        pad_after = [0, pad_down, pad_right, 0]
    PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")

    # depthconv stage
    di = tvm.reduce_axis((0, filter_height), name='di')
    dj = tvm.reduce_axis((0, filter_width), name='dj')
    # 1by1 stage: reduce over the depthwise output channels, not out_channel
    c = tvm.reduce_axis((0, channel), name='c')

    if layout == "NCHW":
        Output = tvm.compute(
            (batch, out_channel, out_height, out_width),
            lambda b, f, i, j: tvm.sum(
                (PaddedInput[b, c, i * stride_h_d + di,
                             j * stride_w_d + dj].astype(out_dtype) *
                 Filter_d[c, 0, di, dj].astype(out_dtype) *
                 Filter_1[f, c, 0, 0].astype(out_dtype)),
                axis=[di, dj, c]),
            name='Depthwise1by1Fused', tag="depthwise_1by1_fused_nchw")
    else:  # NHWC
        Output = tvm.compute(
            (batch, out_height, out_width, out_channel),
            lambda b, i, j, f: tvm.sum(
                (PaddedInput[b, i * stride_h_d + di,
                             j * stride_w_d + dj, c].astype(out_dtype) *
                 Filter_d[di, dj, c, 0].astype(out_dtype) *
                 Filter_1[0, 0, c, f].astype(out_dtype)),
                axis=[di, dj, c]),
            name='Depthwise1by1Fused', tag="depthwise_1by1_fused_nhwc")
    return Output
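# Usage sketch for depth_1by1_fused (hedged: illustrative shapes; the
# depthwise filter uses channel_multiplier 1, as the Filter_d[c, 0, di, dj]
# access above expects, and the 1x1 filter is OIHW with unit spatial dims).
def _example_depth_1by1_fused():
    data = tvm.placeholder((1, 8, 16, 16), name="data")    # NCHW
    filt_d = tvm.placeholder((8, 1, 3, 3), name="filt_d")  # depthwise, multiplier 1
    filt_1 = tvm.placeholder((16, 8, 1, 1), name="filt_1") # 1x1, OIHW
    out = depth_1by1_fused(data, filt_d, filt_1, stride_d=1,
                           padding_d='SAME', layout="NCHW")
    s = tvm.create_schedule(out.op)
    return tvm.lower(s, [data, filt_d, filt_1, out], simple_mode=True)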
def fused_convs(input_data, filters, resnet_block=False):
    """Chain the NHWC depthwise/normal convolutions described by `filters`
    into one compute graph, optionally closing with a ResNet-style add."""
    out_dtype = input_data.dtype

    nodes = [input_data]
    params = [input_data]

    # Counters live outside the loop so every generated stage gets a unique name
    padded_count = 0
    conv_count = 0
    depthwise_count = 0

    for f in filters:
        Input = nodes[-1]
        Filter = f.placeholder
        layout = f.layout
        depthwise = f.depthwise
        kernel = f.kernel
        stride = f.stride
        padding = f.padding
        dilation = f.dilation

        assert not (depthwise and kernel == 1)  # Don't consider 1by1 depthwise

        if isinstance(stride, int):
            stride_h = stride_w = stride
        else:
            stride_h, stride_w = stride

        if isinstance(dilation, int):
            dilation_h = dilation_w = dilation
        else:
            dilation_h, dilation_w = dilation

        batch, in_height, in_width, in_channel = Input.shape
        if f.NHWC_transpose:  # HWOI
            kernel_h, kernel_w, tmp, kernel_channel = Filter.shape
        else:  # HWIO
            kernel_h, kernel_w, kernel_channel, tmp = Filter.shape
        if depthwise:
            channel_multiplier = tmp
        else:
            num_filter = tmp

        # compute the output shape
        dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
        dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
        pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
            padding, (dilated_kernel_h, dilated_kernel_w))

        out_channel = simplify(in_channel * channel_multiplier) if depthwise else num_filter
        out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
        out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

        if kernel > 1:
            print("Padding is needed!")
            pad_before = [0, pad_top, pad_left, 0]
            pad_after = [0, pad_down, pad_right, 0]

            PaddedInput = pad(Input, pad_before, pad_after,
                              name="PaddedInput_{}".format(padded_count))
            padded_count += 1
            nodes.append(PaddedInput)

            # Update Input
            Input = PaddedInput
            batch, in_height, in_width, in_channel = Input.shape

        if not depthwise:
            rc = tvm.reduce_axis((0, in_channel), name='rc')
        if kernel > 1:
            ry = tvm.reduce_axis((0, kernel_h), name='ry')
            rx = tvm.reduce_axis((0, kernel_w), name='rx')

        if not depthwise:  # Normal convolution
            if kernel > 1:
                Output = tvm.compute(
                    (batch, out_height, out_width, out_channel),
                    lambda nn, yy, xx, ff: tvm.sum(
                        Input[nn, yy * stride_h + ry * dilation_h,
                              xx * stride_w + rx * dilation_w, rc].astype(out_dtype) *
                        (Filter[ry, rx, ff, rc] if f.NHWC_transpose
                         else Filter[ry, rx, rc, ff]).astype(out_dtype),
                        axis=[ry, rx, rc]),
                    name="Conv2dOutput_{}".format(conv_count),
                    tag="conv2d_nhwc")
            else:  # Only reduce rc axis
                Output = tvm.compute(
                    (batch, out_height, out_width, out_channel),
                    lambda nn, yy, xx, ff: tvm.sum(
                        Input[nn, yy * stride_h, xx * stride_w, rc].astype(out_dtype) *
                        (Filter[0, 0, ff, rc] if f.NHWC_transpose
                         else Filter[0, 0, rc, ff]).astype(out_dtype),
                        axis=[rc]),
                    name="Conv2dOutput_{}".format(conv_count),
                    tag="conv2d_nhwc")
            conv_count += 1
        else:  # Depthwise convolution (kernel > 1)
            Output = tvm.compute(
                (batch, out_height, out_width, out_channel),
                lambda b, i, j, c: tvm.sum(
                    (Input[b, i * stride_h + ry * dilation_h,
                           j * stride_w + rx * dilation_w,
                           tvm.indexdiv(c, channel_multiplier)].astype(out_dtype) *
                     (Filter[ry, rx, tvm.indexmod(c, channel_multiplier),
                             tvm.indexdiv(c, channel_multiplier)]
                      if f.NHWC_transpose else
                      Filter[ry, rx, tvm.indexdiv(c, channel_multiplier),
                             tvm.indexmod(c, channel_multiplier)]).astype(out_dtype)),
                    axis=[ry, rx]),
                name='DepthwiseConv2dOutput_{}'.format(depthwise_count),
                tag="depthwise_nhwc")
            depthwise_count += 1

        nodes.append(Output)
        params.append(Filter)

    if resnet_block:
        First = nodes[0]
        Last = nodes[-1]
        assert get_const_tuple(First.shape) == get_const_tuple(Last.shape)

        # Elementwise add has no reduction axis, so no tvm.sum here
        Output = tvm.compute(
            (batch, out_height, out_width, out_channel),
            lambda b, i, j, c: (First[b, i, j, c].astype(out_dtype) +
                                Last[b, i, j, c].astype(out_dtype)),
            name='ElementwiseAddOutput_{}'.format(depthwise_count),
            tag="elem_nhwc")
        nodes.append(Output)

    params.append(nodes[-1])  # Final output
    return nodes, params
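# Usage sketch for fused_convs (hedged: `FilterParams` is a hypothetical
# descriptor invented here to carry the attributes fused_convs reads; the real
# caller may use any object with the same fields). A depthwise 3x3 stage
# followed by a 1x1 projection, both in NHWC/HWIO layout.
from collections import namedtuple

FilterParams = namedtuple(
    'FilterParams',
    ['placeholder', 'layout', 'depthwise', 'kernel',
     'stride', 'padding', 'dilation', 'NHWC_transpose'])

def _example_fused_convs():
    data = tvm.placeholder((1, 32, 32, 16), name="data")  # NHWC
    f_d = FilterParams(tvm.placeholder((3, 3, 16, 1), name="f_d"),   # HWIO, multiplier 1
                       'NHWC', True, 3, 1, 1, 1, False)
    f_1 = FilterParams(tvm.placeholder((1, 1, 16, 32), name="f_1"),  # HWIO 1x1
                       'NHWC', False, 1, 1, 0, 1, False)
    nodes, params = fused_convs(data, [f_d, f_1])
    s = tvm.create_schedule(nodes[-1].op)
    return tvm.lower(s, params, simple_mode=True)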