def group_conv3d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtype=None): if out_dtype is None: out_dtype = Input.dtype assert isinstance(stride, int) or len(stride) == 3 assert isinstance(dilation, int) or len(dilation) == 3 if isinstance(stride, int): stride_z = stride_h = stride_w = stride else: stride_z, stride_h, stride_w = stride if isinstance(dilation, int): dilation_z = dilation_h = dilation_w = dilation else: dilation_z, dilation_h, dilation_w = dilation batch, in_channel, in_z, in_height, in_width = get_const_tuple(Input.shape) num_filter, _, kernel_z, kernel_h, kernel_w = get_const_tuple(Filter.shape) assert in_channel % groups == 0, "input channels must divide group size" assert num_filter % groups == 0, "output channels must divide group size" pad_front, pad_top, pad_left, pad_back, pad_down, pad_right = get_pad_tuple3d( padding, (kernel_z, kernel_h, kernel_w)) # compute the output shape out_channel = num_filter out_z = simplify( (in_z - (kernel_z - 1) * dilation_z - 1 + pad_front + pad_back) // stride_z + 1) out_height = simplify( (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1) out_width = simplify( (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1) # compute graph pad_before = [0, 0, pad_front, pad_top, pad_left] pad_after = [0, 0, pad_back, pad_down, pad_right] temp = pad(Input, pad_before, pad_after, name="pad_temp") rc = tvm.reduce_axis((0, in_channel // groups), name='rc') rz = tvm.reduce_axis((0, kernel_z), name='rz') ry = tvm.reduce_axis((0, kernel_h), name='ry') rx = tvm.reduce_axis((0, kernel_w), name='rx') return tvm.compute( (batch, out_channel, out_z, out_height, out_width), lambda nn, ff, zz, yy, xx: tvm.sum(temp[ nn, ff // (num_filter // groups) * (in_channel // groups) + rc, zz * stride_z + rz * dilation_z, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w ].astype(out_dtype) * Filter[ff, rc, rz, ry, rx].astype(out_dtype), axis=[rc, rz, ry, rx]), tag='group_conv3d_nchw')
def _declaration_conv(data, kernel, stride, padding, layout, out_dtype): print("Run in pure nChwc common decl") assert layout == 'NCHW', "only support NCHW convolution for AVX" wkl = get_workload(data, kernel, stride, padding, out_dtype) sch = _get_schedule(wkl) HPAD, WPAD = wkl.hpad, wkl.wpad HSTR, WSTR = wkl.hstride, wkl.wstride batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) num_filter, _, kernel_height, kernel_width, _, co = get_const_tuple( kernel.shape) num_filter *= co pad_height = in_height + 2 * HPAD pad_width = in_width + 2 * WPAD out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1 out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1 # pack data DOPAD = (HPAD != 0 and WPAD != 0) if DOPAD: data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") else: data_pad = data shape = (batch_size, in_channel // sch.ic_bn, pad_height, pad_width, sch.ic_bn) data_vec = tvm.compute( shape, lambda n, C, h, w, c: data_pad[n, C * sch.ic_bn + c, h, w], name='data_vec') kernel_vec = kernel # convolution oshape = (batch_size, num_filter // sch.oc_bn, out_height, out_width, sch.oc_bn) unpack_shape = (batch_size, num_filter, out_height, out_width) ic = tvm.reduce_axis((0, in_channel), name='ic') kh = tvm.reduce_axis((0, kernel_height), name='kh') kw = tvm.reduce_axis((0, kernel_width), name='kw') conv = tvm.compute( oshape, lambda n, oc_chunk, oh, ow, oc_block: tvm.sum( data_vec[n, ic // sch.ic_bn, oh * HSTR + kh, ow * WSTR + kw, ic % sch.ic_bn] * kernel_vec[oc_chunk, ic // sch.ic_bn, kh, kw, ic % sch.ic_bn, oc_block], axis=[ic, kh, kw]), name='conv') unpack = tvm.compute( unpack_shape, lambda n, c, h, w: conv[n, c // sch.oc_bn, h, w, c % sch.oc_bn], name='output_unpack', tag='conv2d_nchw') return unpack
def _declaration_conv(wkl, data, kernel): sch = _get_schedule(wkl) out_dtype = wkl.out_dtype HPAD, WPAD = wkl.hpad, wkl.wpad HSTR, WSTR = wkl.hstride, wkl.wstride batch_size = data.shape[0] out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1 out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 DOPAD = (HPAD != 0 and WPAD != 0) if DOPAD: data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad") else: data_pad = data oshape = (batch_size, wkl.out_filter // sch.oc_bn, out_height, out_width, sch.oc_bn) ic = tvm.reduce_axis((0, wkl.in_filter), name='ic') conv = tvm.compute( oshape, lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(data_pad[ n, ic // sch.ic_bn, oh * HSTR, ow * WSTR, ic % sch.ic_bn].astype( out_dtype) * kernel[oc_chunk, ic // sch.ic_bn, ic % sch.ic_bn, oc_block, 0, 0], axis=[ic]), name='conv2d_NCHWc', tag='conv2d_NCHWc') return conv
def _declaration_conv(data, kernel, stride, padding, layout, out_dtype): assert layout == 'NCHW', "only support NCHW convolution on rasp" assert data.shape[ 0].value == 1, "only support batch size=1 convolution on rasp" wkl = _get_workload(data, kernel, stride, padding, out_dtype) sch = _get_schedule(wkl) HPAD, WPAD = wkl.hpad, wkl.wpad HSTR, WSTR = wkl.hstride, wkl.wstride batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape) pad_height = in_height + 2 * HPAD pad_width = in_width + 2 * WPAD out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1 out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1 # input: c, h, w DOPAD = (HPAD != 0 and WPAD != 0) if DOPAD: data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") else: data_pad = data shape = (batch_size, in_channel // sch.ic_bn, pad_height, pad_width, sch.ic_bn) data_vec = tvm.compute( shape, lambda n, C, h, w, c: data_pad[n, C * sch.ic_bn + c, h, w]) shape = (num_filter // sch.oc_bn, in_channel // sch.ic_bn, sch.ic_bn, sch.oc_bn, 1, 1) kernel_pack = tvm.compute( shape, lambda CO, CI, ci, co, h, w: kernel[CO * sch.oc_bn + co, CI * sch.ic_bn + ci, h, w]) oshape = (batch_size, num_filter // sch.oc_bn, out_height, out_width, sch.oc_bn) ic = tvm.reduce_axis((0, in_channel), name='ic') conv = tvm.compute( oshape, lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(data_vec[ n, ic // sch.ic_bn, oh * HSTR, ow * WSTR, ic % sch.ic_bn].astype( out_dtype) * kernel_pack[oc_chunk, ic // sch.ic_bn, ic % sch. ic_bn, oc_block, 0, 0], axis=[ic]), name='conv') oshape = (batch_size, num_filter, out_height, out_width) unpack = tvm.compute( oshape, lambda n, oc, oh, ow: conv[n, oc // sch.oc_bn, oh, ow, oc % sch.oc_bn], tag='conv2d_nchw') return unpack
def _declaration_conv(data, kernel, stride, padding, layout, out_dtype): assert layout == 'NCHWc', "only support NCHW convolution on rasp" assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp" wkl = get_workload(data, kernel, stride, padding, out_dtype) sch = _get_schedule(wkl) HPAD, WPAD = wkl.hpad, wkl.wpad HSTR, WSTR = wkl.hstride, wkl.wstride batch_size, in_channel_chunk, in_height, in_width, in_channel_block = get_const_tuple(data.shape) num_filter, _, _, co, kernel_height, kernel_width = get_const_tuple(kernel.shape) num_filter *= co pad_height = in_height + 2 * HPAD pad_width = in_width + 2 * WPAD out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1 out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1 # input: c, h, w DOPAD = (HPAD != 0 and WPAD != 0) if DOPAD: data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad") else: data_pad = data in_channel = in_channel_block * in_channel_chunk if in_channel_block != sch.ic_bn: print('WARNING!!! (1x1) in_channel_block=%d vs sch.ic_bn=%d' % (in_channel_block, sch.ic_bn)) shape = (batch_size, in_channel // sch.ic_bn, pad_height, pad_width, sch.ic_bn) data_vec = tvm.compute(shape, lambda n, C, h, w, c: data_pad[n, (C * sch.ic_bn + c) // in_channel_block, h, w, (C * sch.ic_bn + c) % in_channel_block], tag='conv2d_data_pack') else: data_vec = data_pad kernel_pack = kernel oshape = (batch_size, num_filter // sch.oc_bn, out_height, out_width, sch.oc_bn) ic = tvm.reduce_axis((0, in_channel), name='ic') conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(data_vec[n, ic // sch.ic_bn, oh * HSTR, ow * WSTR, ic % sch.ic_bn].astype(out_dtype) * kernel_pack[oc_chunk, ic // sch.ic_bn, ic % sch.ic_bn, oc_block, 0, 0], axis=[ic]), name='conv2d_nChwc', tag='conv2d_nChwc') return conv
def conv2d_nhwc(Input, Filter, stride, padding, out_dtype='float32'): """Convolution operator in NHWC layout. Parameters ---------- Input : tvm.Tensor 4-D with shape [batch, in_height, in_width, in_channel] Filter : tvm.Tensor 4-D with shape [filter_height, filter_width, in_channel, num_filter] stride : int or a list/tuple of two ints Stride size, or [stride_height, stride_width] padding : int or str Padding size, or ['VALID', 'SAME'] Returns ------- output : tvm.Tensor 4-D with shape [batch, out_height, out_width, out_channel] """ assert isinstance(stride, int) or len(stride) == 2 batch, in_height, in_width, in_channel = Input.shape kernel_h, kernel_w, channel, num_filter = Filter.shape if isinstance(stride, int): stride_h = stride_w = stride else: stride_h, stride_w = stride # compute the output shape out_channel = num_filter pad_before = [0, 0, 0, 0] pad_after = [0, 0, 0, 0] PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput") _, a, b, _ = PaddedInput.shape out_height = a out_width = b rc = tvm.reduce_axis((0, in_channel), name='rc') Output = tvm.compute( (batch, out_height, out_width, out_channel), lambda nn, yy, xx, ff: tvm.sum(PaddedInput[nn, yy, xx, rc].astype( out_dtype) * Filter[0, 0, rc, ff].astype(out_dtype), axis=[rc]), name="Conv2dOutput", tag="conv2d_nhwc") return Output
def conv2d(N, H, W, CI, CO, KH, KW, strides, padding, scaling_factor): dilation = 2 cfg = autotvm.get_config() data = tvm.placeholder((N, CI / BI, H, W, BI), name='data', dtype='int8') kernel = tvm.placeholder((CO / BO, CI / BI, KH, KW, BO, BI), name='kernel', dtype='int8') pad_h, pad_w = (padding, padding) if isinstance(padding, int) else padding stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides pad_height = H + 2 * pad_h pad_width = W + 2 * pad_w out_height = (pad_height - ((KH - 1) * dilation + 1)) // stride_h + 1 out_width = (pad_width - ((KW - 1) * dilation + 1)) // stride_w + 1 DOPAD = (stride_h != 0 or stride_w != 0) if DOPAD: pad_data = pad(data, (0, 0, pad_h, pad_w, 0), name='pad_data') else: pad_data = data oshape = (N, CO / BO, out_height, out_width, BO) ic_chunk = tvm.reduce_axis((0, CI / BI), name='ic_chunk') ic_block = tvm.reduce_axis((0, BI), name='ic_block') kh = tvm.reduce_axis((0, KH), name='kh') kw = tvm.reduce_axis((0, KW), name='kw') conv = tvm.compute( oshape, lambda n, oc_chunk, oh, ow, oc_block: tvm. sum(pad_data[n, ic_chunk, oh * stride_h + kh * dilation, ow * stride_w + kw * dilation, ic_block].astype('int32') * kernel[oc_chunk, ic_chunk, kh, kw, oc_block, ic_block].astype( 'int32'), axis=[ic_chunk, kh, kw, ic_block])) output = tvm.compute( oshape, lambda n, oc_chunk, oh, ow, oc_block: (conv[n, oc_chunk, oh, ow, oc_block] * scaling_factor).astype('int8'), name='conv') s = tvm.create_schedule([output.op]) s[conv].set_scope('local') # inline padding if DOPAD: s[pad_data].compute_inline() data, raw_data = pad_data, data # create cache stage AA = s.cache_read(data, 'shared', [conv]) WW = s.cache_read(kernel, 'shared', [conv]) # tile and bind spatial axes n, f, y, x, c = s[output].op.axis cfg.define_split("tile_f", cfg.axis(f), num_outputs=4) cfg.define_split("tile_y", cfg.axis(y), num_outputs=4) cfg.define_split("tile_x", cfg.axis(x), num_outputs=4) bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) # this is the scope to attach global config inside this kernel kernel_scope, n = s[output].split(n, nparts=1) s[output].bind(n, tvm.thread_axis("blockIdx.z")) s[output].bind(bf, tvm.thread_axis("blockIdx.y")) s[output].bind(bx, tvm.thread_axis("blockIdx.x")) s[output].bind(vf, tvm.thread_axis("vthread")) s[output].bind(vy, tvm.thread_axis("vthread")) s[output].bind(vx, tvm.thread_axis("vthread")) s[output].bind(tf, tvm.thread_axis("threadIdx.z")) s[output].bind(ty, tvm.thread_axis("threadIdx.y")) s[output].bind(tx, tvm.thread_axis("threadIdx.x")) s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) _, c = s[output].split(c, factor=4) #s[output].vectorize(c) s[conv].compute_at(s[output], tx) # tile and bind reduction axes n, f, y, x, c = s[conv].op.axis rc, ry, rx, rc_block = s[conv].op.reduce_axis cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=2) cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=2) cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=2) rco, rci = cfg['tile_rc'].apply(s, conv, rc) ryo, ryi = cfg['tile_ry'].apply(s, conv, ry) rxo, rxi = cfg['tile_rx'].apply(s, conv, rx) s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x, c, rc_block) _, rc_block = s[conv].split(rc_block, factor=4) s[conv].tensorize(rc_block, dot) s[AA].compute_at(s[conv], n) s[WW].compute_at(s[conv], rxo) # cooperative fetching for load in [AA, WW]: if load == AA: n, f, y, x, c = s[load].op.axis if not DOPAD: s[load].vectorize(c) fused = s[load].fuse(n, f, y, x) else: c, _ = s[load].split(c, factor=4) fused = s[load].fuse(n, f, y, x, c) else: n, f, y, x, oc_chunk, c = s[load].op.axis fused = s[load].fuse(n, f, y, x, oc_chunk) s[load].vectorize(c) fused, tx = s[load].split(fused, factor=cfg["tile_x"].size[2]) fused, ty = s[load].split(fused, factor=cfg["tile_y"].size[2]) fused, tz = s[load].split(fused, factor=cfg["tile_f"].size[2]) s[load].bind(tz, tvm.thread_axis("threadIdx.z")) s[load].bind(ty, tvm.thread_axis("threadIdx.y")) s[load].bind(tx, tvm.thread_axis("threadIdx.x")) for load in [AA, WW]: name = load.op.name + '_double_buffer' cfg.define_knob(name, [0, 1]) if cfg[name].val: s[load].double_buffer # tune unroll cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) s[output].pragma(kernel_scope, 'unroll_explicit', False) # num flop NH, NW = [e.value for e in output.shape[2:4]] cfg.add_flop(N * CO * NH * NW * (CI * KH * KW * 2)) return s, [raw_data, kernel, output]
def _declaration_conv(data, kernel, stride, padding, layout, out_dtype): # print('Run in avx512_conv_common decl') assert layout == 'NCHW', "only support NCHW convolution on rasp" assert data.shape[ 0].value == 1, "only support batch size=1 convolution on rasp" wkl = _get_workload(data, kernel, stride, padding, out_dtype) sch = _get_schedule(wkl) HPAD, WPAD = wkl.hpad, wkl.wpad HSTR, WSTR = wkl.hstride, wkl.wstride batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) if len(kernel.shape) == 4: num_filter, _, kernel_height, kernel_width = get_const_tuple( kernel.shape) else: num_filter, _, kernel_height, kernel_width, ic, oc = get_const_tuple( kernel.shape) num_filter *= oc pad_height = in_height + 2 * HPAD pad_width = in_width + 2 * WPAD out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1 out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1 # pack data # input: c, h, w shape = (batch_size, in_channel, pad_height, pad_width) DOPAD = (HPAD != 0 and WPAD != 0) if DOPAD: data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") else: data_pad = data # data_pad = tvm.compute(shape, lambda n, c, h, w: tvm.select( # tvm.all(h >= HPAD, h < pad_height - HPAD, w >= WPAD, w < pad_width - WPAD), # data[n, c, h - HPAD, w - WPAD], 0.0 # ), name='data_pad') shape = (batch_size, in_channel // sch.ic_bn, pad_height, sch.ic_bn, pad_width) data_vec = tvm.compute( shape, lambda n, C, h, c, w: data_pad[n, C * sch.ic_bn + c, h, w], name='data_vec') # pack kernel # input: co, ci, h, w # output: gOIhw16i16o if False: shape = (num_filter // sch.oc_bn, in_channel // sch.ic_bn, kernel_height, kernel_width, sch.ic_bn, sch.oc_bn) kernel_pack = tvm.compute( shape, lambda CO, CI, h, w, ci, co: kernel[CO * sch.oc_bn + co, CI * sch. ic_bn + ci, h, w], name='kernel_pack') else: kernel_pack = kernel # convolution oshape = (batch_size, num_filter // sch.oc_bn, out_height, out_width, sch.oc_bn) ovshape = (batch_size, num_filter // sch.oc_bn, out_height, sch.oc_bn, out_width) unpack_shape = (batch_size, num_filter, out_height, out_width) ic = tvm.reduce_axis((0, in_channel), name='ic') kh = tvm.reduce_axis((0, kernel_height), name='kh') kw = tvm.reduce_axis((0, kernel_width), name='kw') conv = tvm.compute( oshape, lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(data_vec[ n, ic // sch.ic_bn, oh * HSTR + kh, ic % sch.ic_bn, ow * WSTR + kw ].astype(out_dtype) * kernel_pack[oc_chunk, ic // sch.ic_bn, kh, kw, ic % sch.ic_bn, oc_block], axis=[ic, kh, kw]), name='conv') unpack = tvm.compute( unpack_shape, lambda n, c, h, w: conv[n, c // sch.oc_bn, h, w, c % sch.oc_bn], name='output_unpack', tag='conv2d_nchw') return unpack
def _declaration_conv(wkl, data, kernel): sch = _get_schedule(wkl) HPAD, WPAD = wkl.hpad, wkl.wpad HSTR, WSTR = wkl.hstride, wkl.wstride ndim_input = len(data.shape) if ndim_input == 5: batch_size, in_channel_chunk, in_height, in_width, in_channel_block = get_const_tuple( data.shape) in_channel = in_channel_block * in_channel_chunk else: assert ndim_input == 4 in_channel_block = 0 batch_size, in_channel, in_height, in_width = get_const_tuple( data.shape) num_filter, _, kernel_height, kernel_width, _, co = get_const_tuple( kernel.shape) num_filter *= co pad_height = in_height + 2 * HPAD pad_width = in_width + 2 * WPAD out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1 out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1 # pack data DOPAD = (HPAD != 0 and WPAD != 0) if DOPAD: if ndim_input == 5: data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad") else: assert ndim_input == 4 data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad") else: data_pad = data if in_channel_block != sch.ic_bn: print('WARNING!!! (common) in_channel_block=%d vs sch.ic_bn=%d' % (in_channel_block, sch.ic_bn)) shape = (batch_size, in_channel // sch.ic_bn, pad_height, pad_width, sch.ic_bn) if ndim_input == 5: data_vec = tvm.compute( shape, lambda n, C, h, w, c: data_pad[ n, (C * sch.ic_bn + c) // in_channel_block, h, w, (C * sch.ic_bn + c) % in_channel_block], name='data_vec', tag="conv2d_data_pack") else: assert ndim_input == 4 data_vec = tvm.compute( shape, lambda n, C, h, w, c: data_pad[n, (C * sch.ic_bn + c), h, w], name='data_vec', tag="conv2d_data_pack") # data_pad = data_vec else: data_vec = data_pad kernel_vec = kernel # convolution oshape = (batch_size, num_filter // sch.oc_bn, out_height, out_width, sch.oc_bn) ic = tvm.reduce_axis((0, in_channel), name='ic') kh = tvm.reduce_axis((0, kernel_height), name='kh') kw = tvm.reduce_axis((0, kernel_width), name='kw') import re unpack_channel_block = re.findall(r'\d+', sch.layout_out) if len(unpack_channel_block) == 0: conv = tvm.compute( oshape, lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(data_vec[ n, ic // sch.ic_bn, oh * HSTR + kh, ow * WSTR + kw, ic % sch. ic_bn] * kernel_vec[oc_chunk, ic // sch.ic_bn, kh, kw, ic % sch .ic_bn, oc_block], axis=[ic, kh, kw]), name='conv2d') # , tag="conv2d_nChwc") unpack_shape = (batch_size, num_filter, out_height, out_width) unpack = tvm.compute( unpack_shape, lambda n, c, h, w: conv[n, c // sch.oc_bn, h, w, c % sch.oc_bn], name='output_unpack', tag='conv2d_nChwc_unpack') else: assert len(unpack_channel_block) == 1 unpack_channel_block = int(unpack_channel_block[0]) if unpack_channel_block == sch.oc_bn: return tvm.compute( oshape, lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(data_vec[ n, ic // sch.ic_bn, oh * HSTR + kh, ow * WSTR + kw, ic % sch.ic_bn] * kernel_vec[oc_chunk, ic // sch.ic_bn, kh, kw, ic % sch.ic_bn, oc_block], axis= [ic, kh, kw]), name='conv2d', tag="conv2d_nChwc") else: conv = tvm.compute( oshape, lambda n, oc_chunk, oh, ow, oc_block: tvm.sum(data_vec[ n, ic // sch.ic_bn, oh * HSTR + kh, ow * WSTR + kw, ic % sch.ic_bn] * kernel_vec[oc_chunk, ic // sch.ic_bn, kh, kw, ic % sch.ic_bn, oc_block], axis= [ic, kh, kw]), name='conv2d') unpack_shape = (batch_size, num_filter // unpack_channel_block, out_height, out_width, unpack_channel_block) unpack = tvm.compute( unpack_shape, lambda n, C, h, w, c: conv[ n, (C * unpack_channel_block + c) // sch.oc_bn, h, w, (C * unpack_channel_block + c) % sch.oc_bn], name='output_unpack', tag='conv2d_nChwc_unpack') return unpack
def conv2d_direct_simd_compute(cfg, data, kernel, strides, padding, dilation, out_dtype): """Compute function for Cortex-M7 SIMD implementation of conv2d.""" assert isinstance(strides, int) or len(strides) == 2 assert isinstance(dilation, int) or len(dilation) == 2 if isinstance(strides, int): stride_h = stride_w = strides else: stride_h, stride_w = strides if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation batch_size, in_height, in_width, in_channels = data.shape kernel_h, kernel_w, out_channels, _ = kernel.shape # compute the output shape dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (dilated_kernel_h, dilated_kernel_w)) out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) pad_before = [0, pad_top, pad_left, 0] pad_after = [0, pad_down, pad_right, 0] padded_data = pad(data, pad_before, pad_after, name='padded_data') rc = te.reduce_axis((0, in_channels), name='rc') ry = te.reduce_axis((0, kernel_h), name='ry') rx = te.reduce_axis((0, kernel_w), name='rx') conv = te.compute( (batch_size, out_height, out_width, out_channels), lambda nn, yy, xx, ff: te.sum( padded_data[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rc].astype(out_dtype) * kernel[ry, rx, ff, rc].astype(out_dtype), axis=[ry, rx, rc]), name='conv2d', tag='conv2d_nhwc') ########################### # Config Space Definition # ########################### n, oh, ow, co = (cfg.axis(batch_size.value), cfg.axis(out_height.value), cfg.axis(out_width.value), cfg.axis(out_channels.value)) kh, kw, ci = (cfg.reduce_axis(kernel_h.value), cfg.reduce_axis(kernel_w.value), cfg.reduce_axis(in_channels.value)) assert in_channels.value % 4 == 0 owo, owi = cfg.define_split('tile_ow', ow, policy='factors', num_outputs=2) cio, cii = cfg.define_split('tile_ci', ci, policy='factors', num_outputs=2, filter=lambda x: x.size[-1] % 4 == 0) coo, coi = cfg.define_split('tile_co', co, policy='factors', num_outputs=2) cfg.define_reorder('reorder_0_simd', [n, oh, owo, owi, coo, coi, kh, kw, cio, cii], policy='candidate', candidate=[ [n, oh, kh, kw, owo, coo, cio, owi, coi, cii], [n, oh, kh, kw, coo, owo, cio, owi, coi, cii], [n, kh, kw, oh, owo, coo, cio, owi, coi, cii], [n, kh, kw, oh, coo, owo, cio, owi, coi, cii]]) cfg.define_knob('auto_unroll_max_step', [0, 2, 4, 8, 16, 32]) cfg.define_knob('unroll_explicit', [0, 1]) return conv
def depth_1by1_fused(Input, Filter_d, Filter_1, stride_d, padding_d='SAME', dilation_d=1, out_dtype=None, layout="NCHW"): """Fused depthwise convolution + 1x1 convolution forward operator (NCHW & NHWC). Parameters ---------- Input : tvm.Tensor 4-D with shape [batch, in_channel, in_height, in_width] (NCHW) or [batch, in_height, in_width, in_channel] (NHWC) Filter_d : tvm.Tensor 4-D with shape [in_channel, in_channel * channel_multiplier, filter_height, filter_width] or [filter_height, filter_width, in_channel, in_channel * channel_multiplier] Filter_1 : tvm.Tensor 4-D with shape [out_channel, in_channel * channel_multiplier, 0, 0] or [0, 0, out_channel, in_channel * channel_multiplier] stride_d : tuple of two ints The spatial stride along height and width padding_d : int or str Padding size, or ['VALID', 'SAME'] dilation_d: int or a list/tuple of two ints dilation size, or [dilation_height, dilation_width] out_dtype: str, optional Output data type Returns ------- output : tvm.Tensor 4-D with shape [batch, out_height, out_width, out_channel] """ assert layout in ["NCHW", "NHWC"] out_dtype = Input.dtype if out_dtype is None else out_dtype if isinstance(stride_d, int): stride_h_d = stride_w_d = stride_d else: stride_h_d, stride_w_d = stride_d if isinstance(dilation_d, int): dilation_h_d = dilation_w_d = dilation_d else: dilation_h_d, dilation_w_d = dilation_d if layout == "NCHW": if dilation_h_d != 1 or dilation_w_d != 1: Filter_d = dilate(Filter_d, (1, 1, dilation_h_d, dilation_w_d)) batch, in_channel_d, in_height_d, in_width_d = Input.shape filter_channel, _, filter_height, filter_width = Filter_d.shape num_filter, channel, _, _ = Filter_1.shape else: # NHWC if dilation_h_d != 1 or dilation_w_d != 1: Filter_d = dilate(Filter_d, (dilation_h_d, dilation_w_d, 1, 1)) batch, in_height_d, in_width_d, in_channel_d = Input.shape filter_height, filter_width, filter_channel, _ = Filter_d.shape _, _, num_filter, channel = Filter_1.shape pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding_d, (filter_height, filter_width)) out_channel = simplify(in_channel_d) out_height = simplify((in_height_d - filter_height + pad_top + pad_down) // stride_h_d + 1) out_width = simplify((in_width_d - filter_width + pad_left + pad_right) // stride_w_d + 1) out_channel = num_filter # padding stage if layout == "NCHW": pad_before = [0, 0, pad_top, pad_left] pad_after = [0, 0, pad_down, pad_right] else: # NHWC pad_before = [0, pad_top, pad_left, 0] pad_after = [0, pad_down, pad_right, 0] PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput") # depthconv stage di = tvm.reduce_axis((0, filter_height), name='di') dj = tvm.reduce_axis((0, filter_width), name='dj') # 1by1 stage c = tvm.reduce_axis((0, out_channel), name='c') if layout == "NCHW": Output = tvm.compute( (batch, out_channel, out_height, out_width), lambda b, f, i, j: tvm.sum( (PaddedInput[b, c, i * stride_h_d + di, j * stride_w_d + dj]. astype(out_dtype) * Filter_d[c, 0, di, dj].astype( out_dtype) * Filter_1[f, c, 0, 0].astype(out_dtype)), axis=[di, dj, c]), name='Depthwise1by1Fused', tag="depthwise_1by1_fused_nchw") else: # NHWC Output = tvm.compute( (batch, out_height, out_width, out_channel), lambda b, i, j, f: tvm.sum( (PaddedInput[b, i * stride_h_d + di, j * stride_w_d + dj, c]. astype(out_dtype) * Filter_d[di, dj, c, 0].astype( out_dtype) * Filter_1[0, 0, c, f].astype(out_dtype)), axis=[di, dj, c]), name='Depthwise1by1Fused', tag="depthwise_1by1_fused_nhwc") return Output
def fused_convs(input_data, filters, resnet_block=False): out_dtype = input_data.dtype Input = None nodes = [input_data] params = [input_data] for f in filters: Input = nodes[-1] Filter = f.placeholder layout = f.layout depthwise = f.depthwise kernel = f.kernel stride = f.stride padding = f.padding dilation = f.dilation assert not (depthwise and kernel == 1) # Don't consider 1by1 depthwise padded_count = 0 conv_count = 0 depthwise_count = 0 if isinstance(stride, int): stride_h = stride_w = stride else: stride_h, stride_w = stride if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation batch, in_height, in_width, in_channel = Input.shape if f.NHWC_transpose: # HWOI kernel_h, kernel_w, tmp, kernel_channel = Filter.shape else: # HWIO kernel_h, kernel_w, kernel_channel, tmp = Filter.shape if depthwise: channel_multiplier = tmp else: num_filter = tmp # compute the output shape dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (dilated_kernel_h, dilated_kernel_w)) out_channel = simplify(in_channel * channel_multiplier) if depthwise else num_filter out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) if f.kernel > 1: print("Padding is needed!") pad_before = [0, pad_top, pad_left, 0] pad_after = [0, pad_down, pad_right, 0] PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput_{}".format(padded_count)) padded_count += 1 nodes.append(PaddedInput) # Update Input Input = PaddedInput batch, in_height, in_width, in_channel = Input.shape if not depthwise: rc = tvm.reduce_axis((0, in_channel), name='rc') if kernel > 1: ry = tvm.reduce_axis((0, kernel_h), name='ry') rx = tvm.reduce_axis((0, kernel_w), name='rx') if not depthwise: # Normal convolution if kernel > 1: Output = tvm.compute( (batch, out_height, out_width, out_channel), lambda nn, yy, xx, ff: tvm.sum( Input[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rc].astype(out_dtype) * (Filter[ry, rx, ff, rc] if f.NHWC_transpose else Filter[ry, rx, rc, ff]).astype(out_dtype), axis=[ry, rx, rc]), name="Conv2dOutput_{}".format(conv_count), tag="conv2d_nhwc") else: # Only reduce rc axis Output = tvm.compute( (batch, out_height, out_width, out_channel), lambda nn, yy, xx, ff: tvm.sum( Input[nn, yy * stride_h, xx * stride_w, rc].astype(out_dtype) * (Filter[0, 0, ff, rc] if f.NHWC_transpose else Filter[0, 0, rc, ff]).astype(out_dtype), axis=[rc]), name="Conv2dOutput_{}".format(conv_count), tag="conv2d_nhwc") conv_count += 1 else: # Depthwise convolution (kernel > 1) Output = tvm.compute( (batch, out_height, out_width, out_channel), lambda b, i, j, c: tvm.sum( (Input[b, i*stride_h + ry*dilation_h, j*stride_w + rx*dilation_w, tvm.indexdiv(c, channel_multiplier)].astype(out_dtype) * (Filter[ry, rx, tvm.indexmod(c, channel_multiplier), tvm.indexdiv(c, channel_multiplier)] if f.NHWC_transpose else Filter[ry, rx, tvm.indexdiv(c, channel_multiplier), tvm.indexmod(c, channel_multiplier)]).astype(out_dtype)), axis=[ry, rx]), name='DepthwiseConv2dOutput_{}'.format(depthwise_count), tag="depthwise_nhwc") depthwise_count += 1 nodes.append(Output) params.append(Filter) if resnet_block: First = nodes[0] Last = nodes[-1] assert (first.shape == last.shape) Output = tvm.compute( (batch, out_height, out_width, out_channel), lambda b, i, j, c: tvm.sum( (First[b, i, j, c].astype(out_dtype) + (Last[b, i, j, c]).astype(out_dtype))), name='ElementwiseAddOutput_{}'.format(depthwise_count), tag="elem_nhwc") nodes.append(Output) params.append(nodes[-1]) # Final output return nodes, params
def _declaration_conv(data, kernel, stride, padding, layout, out_dtype): assert layout == 'NCHWc', "only support NCHWc convolution for AVX" wkl = get_workload(data, kernel, stride, padding, out_dtype) sch = _get_schedule(wkl) HPAD, WPAD = wkl.hpad, wkl.wpad HSTR, WSTR = wkl.hstride, wkl.wstride batch_size, in_channel_chunk, in_height, in_width, in_channel_block = get_const_tuple( data.shape) num_filter, _, kernel_height, kernel_width, _, co = get_const_tuple( kernel.shape) num_filter *= co pad_height = in_height + 2 * HPAD pad_width = in_width + 2 * WPAD out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1 out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1 # pack data DOPAD = (HPAD != 0 and WPAD != 0) if DOPAD: data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad") else: data_pad = data in_channel = in_channel_block * in_channel_chunk if in_channel_block != sch.ic_bn: print('WARNING!!! (common) in_channel_block=%d vs sch.ic_bn=%d' % (in_channel_block, sch.ic_bn)) shape = (batch_size, in_channel // sch.ic_bn, pad_height, pad_width, sch.ic_bn) data_vec = tvm.compute( shape, lambda n, C, h, w, c: data_pad[ n, (C * sch.ic_bn + c) // in_channel_block, h, w, (C * sch.ic_bn + c) % in_channel_block], name='data_vec', tag="conv2d_data_pack") else: data_vec = data_pad kernel_vec = kernel # convolution oshape = (batch_size, num_filter // sch.oc_bn, out_height, out_width, sch.oc_bn) ic = tvm.reduce_axis((0, in_channel), name='ic') kh = tvm.reduce_axis((0, kernel_height), name='kh') kw = tvm.reduce_axis((0, kernel_width), name='kw') conv = tvm.compute( oshape, lambda n, oc_chunk, oh, ow, oc_block: tvm.sum( data_vec[n, ic // sch.ic_bn, oh * HSTR + kh, ow * WSTR + kw, ic % sch.ic_bn] * kernel_vec[oc_chunk, ic // sch.ic_bn, kh, kw, ic % sch.ic_bn, oc_block], axis=[ic, kh, kw]), name='conv2d_nChwc', tag="conv2d_nChwc") return conv